Source code for zoo.models.textmatching.knrm

#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys

import zoo.pipeline.api.autograd as A
from zoo.models.common import ZooModel
from zoo.models.textmatching import TextMatcher
from zoo.pipeline.api.keras.layers import Input, Embedding, Dense, Squeeze, prepare_embedding
from zoo.pipeline.api.keras.models import Model
from bigdl.util.common import JTensor
from zoo.common.utils import callZooFunc

# Python 3 has no separate ``long`` / ``unicode`` builtins; alias them so that
# code written against the Python 2 names keeps working.
# The version *tuple* is compared rather than the version *string*: string
# comparison is lexicographic and would misorder a major version such as "10".
if sys.version_info >= (3,):
    long = int
    unicode = str


class KNRM(TextMatcher):
    """
    Kernel-pooling Neural Ranking Model with RBF kernel.
    https://arxiv.org/abs/1706.06613

    # Arguments:
    text1_length: Sequence length of text1 (query).
    text2_length: Sequence length of text2 (doc).
    embedding_file: The path to the word embedding file.
                    Currently only the following GloVe files are supported:
                    "glove.6B.50d.txt", "glove.6B.100d.txt", "glove.6B.200d.txt"
                    "glove.6B.300d.txt", "glove.42B.300d.txt", "glove.840B.300d.txt".
                    You can download from: https://nlp.stanford.edu/projects/glove/.
    word_index: Dictionary of word (string) and its corresponding index (int).
                The index is supposed to start from 1 with 0 reserved for unknown words.
                During the prediction, if you have words that are not in the word_index
                for the training, you can map them to index 0.
                Default is None. In this case, all the words in the embedding_file will
                be taken into account and you can call
                WordEmbedding.get_word_index(embedding_file) to retrieve the dictionary.
    train_embed: Boolean. Whether to train the embedding layer or not. Default is True.
    kernel_num: Int > 1. The number of kernels to use. Default is 21.
    sigma: Float. Defines the kernel width, or the range of its softTF count. Default is 0.1.
    exact_sigma: Float. The sigma used for the kernel that harvests exact matches
                 in the case where RBF mu=1.0. Default is 0.001.
    target_mode: String. The target mode of the model. Either 'ranking' or 'classification'.
                 For ranking, the output will be the relevance score between text1 and text2 and
                 you are recommended to use 'rank_hinge' as loss for pairwise training.
                 For classification, the last layer will be sigmoid and the output will be the
                 probability between 0 and 1 indicating whether text1 is related to text2 and
                 you are recommended to use 'binary_crossentropy' as loss for binary classification.
                 Default mode is 'ranking'.
    bigdl_type: String forwarded unchanged to the parent initializers. Default is "float".

    # Raises:
    AssertionError: If kernel_num is not larger than 1.
    """

    def __init__(self, text1_length, text2_length, embedding_file, word_index=None,
                 train_embed=True, kernel_num=21, sigma=0.1, exact_sigma=0.001,
                 target_mode="ranking", bigdl_type="float"):
        # Validate up front so that a bad argument fails before the embedding file is read.
        # The error is raised explicitly (not via an `assert` statement) so the check still
        # runs under `python -O`; the exception type and message are unchanged for callers.
        if not kernel_num > 1:
            raise AssertionError("kernel_num must be an int larger than 1")

        embed_weights = prepare_embedding(embedding_file, word_index,
                                          randomize_unknown=True, normalize=True)
        # The embedding matrix determines both the vocabulary size and the embedding width.
        vocab_size, embed_size = embed_weights.shape
        super(KNRM, self).__init__(text1_length, vocab_size, embed_size,
                                   embed_weights, train_embed, target_mode, bigdl_type)
        self.text2_length = text2_length
        self.kernel_num = kernel_num
        self.sigma = float(sigma)
        self.exact_sigma = float(exact_sigma)
        self.model = self.build_model()
        # Deliberately starts the MRO lookup *after* TextMatcher, i.e. this invokes the
        # initializer of TextMatcher's own parent with the full KNRM configuration.
        # NOTE(review): presumably that parent (ZooModel) creates the backend model from
        # these arguments -- confirm against zoo.models.common.
        super(TextMatcher, self).__init__(None, self.bigdl_type,
                                          self.text1_length,
                                          self.text2_length,
                                          self.vocab_size,
                                          self.embed_size,
                                          JTensor.from_ndarray(embed_weights),
                                          self.train_embed,
                                          self.kernel_num,
                                          self.sigma,
                                          self.exact_sigma,
                                          self.target_mode,
                                          self.model)

    def build_model(self):
        """
        Build the KNRM graph from the configuration stored on this instance.

        The model takes a single input holding text1 followed by text2
        (text1_length + text2_length indices), embeds it, computes the query/doc
        translation matrix and applies kernel_num RBF kernels to it. The pooled kernel
        features are fed to a final Dense(1) layer, which uses a sigmoid activation
        unless target_mode is "ranking".

        # Returns
        The constructed Model.
        """
        # Remark: Share weights for embedding is not supported.
        # Thus here the model takes concatenated input and slice to split the input.
        concat_input = Input(name='input', shape=(self.text1_length + self.text2_length,))
        embedding = Embedding(self.vocab_size, self.embed_size,
                              weights=self.embed_weights, trainable=self.train_embed)(concat_input)
        # Along dim 1: the first text1_length positions are the query, the next
        # text2_length positions are the doc.
        query_embed = embedding.slice(1, 0, self.text1_length)
        doc_embed = embedding.slice(1, self.text1_length, self.text2_length)
        mm = A.batch_dot(query_embed, doc_embed, axes=[2, 2])  # Translation Matrix.

        kernel_features = []
        for i in range(self.kernel_num):
            # Kernel centres are evenly spaced with step 2 / (kernel_num - 1), starting at
            # 1 / (kernel_num - 1) - 1. The last centre therefore lands above 1.0 and is
            # turned into the exact-match kernel below.
            mu = 1. / (self.kernel_num - 1) + (2. * i) / (self.kernel_num - 1) - 1.0
            sigma = self.sigma
            if mu > 1.0:  # Exact match.
                sigma = self.exact_sigma
                mu = 1.0
            # RBF kernel: exp(-(mm - mu)^2 / (2 * sigma^2)).
            mm_exp = A.exp((-0.5) * (mm - mu) * (mm - mu) / sigma / sigma)
            mm_doc_sum = A.sum(mm_exp, axis=2)
            # The +1.0 keeps the argument of log strictly positive.
            mm_log = A.log(mm_doc_sum + 1.0)
            # Remark: Keep the reduced dimension for the last sum and squeeze after stack.
            # Otherwise, when batch=1, the output will become a Scalar not compatible for stack.
            mm_sum = A.sum(mm_log, axis=1, keepDims=True)
            kernel_features.append(mm_sum)
        phi = Squeeze(2)(A.stack(kernel_features))

        if self.target_mode == "ranking":
            output = Dense(1, init="uniform")(phi)
        else:
            output = Dense(1, init="uniform", activation="sigmoid")(phi)
        return Model(input=concat_input, output=output)

    @staticmethod
    def load_model(path, weight_path=None, bigdl_type="float"):
        """
        Load an existing KNRM model (with weights).

        # Arguments
        path: The path for the pre-defined model.
              Local file system, HDFS and Amazon S3 are supported.
              HDFS path should be like 'hdfs://[host]:[port]/xxx'.
              Amazon S3 path should be like 's3a://bucket/xxx'.
        weight_path: The path for pre-trained weights if any. Default is None.
        bigdl_type: String forwarded to the backend call and to the loader.
                    Default is "float".

        # Returns
        The loaded model, with its class set to KNRM.
        """
        jmodel = callZooFunc(bigdl_type, "loadKNRM", path, weight_path)
        model = ZooModel._do_load(jmodel, bigdl_type)
        # Rebind the class so that the returned object exposes the KNRM methods.
        model.__class__ = KNRM
        return model