Source code for zoo.models.textmatching.knrm

#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys

import zoo.pipeline.api.autograd as A
from zoo.models.common import ZooModel
from zoo.models.textmatching import TextMatcher
from zoo.pipeline.api.keras.layers import Input, Embedding, Dense, Squeeze, prepare_embedding
from zoo.pipeline.api.keras.models import Model
from bigdl.util.common import JTensor
from zoo.common.utils import callZooFunc

# Python 2/3 compatibility aliases: on Python 3 bind the Python 2 names
# ``long`` and ``unicode`` to ``int`` and ``str``.
# Compare the numeric major version rather than the ``sys.version`` string,
# because string comparison is lexicographic (e.g. "10.0" < "3").
if sys.version_info[0] >= 3:
    long = int
    unicode = str


class KNRM(TextMatcher):
    """
    Kernel-pooling Neural Ranking Model with RBF kernel.
    https://arxiv.org/abs/1706.06613

    # Arguments:
    text1_length: Sequence length of text1 (query).
    text2_length: Sequence length of text2 (doc).
    embedding_file: The path to the word embedding file.
                    Currently only the following GloVe files are supported:
                    "glove.6B.50d.txt", "glove.6B.100d.txt", "glove.6B.200d.txt",
                    "glove.6B.300d.txt", "glove.42B.300d.txt", "glove.840B.300d.txt".
                    You can download from: https://nlp.stanford.edu/projects/glove/.
    word_index: Dictionary of word (string) and its corresponding index (int).
                The index is supposed to start from 1 with 0 reserved for unknown words.
                During the prediction, if you have words that are not in the word_index
                for the training, you can map them to index 0.
                Default is None. In this case, all the words in the embedding_file will
                be taken into account and you can call
                WordEmbedding.get_word_index(embedding_file) to retrieve the dictionary.
    train_embed: Boolean. Whether to train the embedding layer or not. Default is True.
    kernel_num: Int > 1. The number of kernels to use. Default is 21.
    sigma: Float. Defines the kernel width, or the range of its softTF count.
           Default is 0.1.
    exact_sigma: Float. The sigma used for the kernel that harvests exact matches
                 in the case where RBF mu=1.0. Default is 0.001.
    target_mode: String. The target mode of the model. Either 'ranking' or 'classification'.
                 For ranking, the output will be the relevance score between text1 and text2
                 and you are recommended to use 'rank_hinge' as loss for pairwise training.
                 For classification, the last layer will be sigmoid and the output will be
                 the probability between 0 and 1 indicating whether text1 is related to
                 text2 and you are recommended to use 'binary_crossentropy' as loss for
                 binary classification.
                 Default mode is 'ranking'.
    bigdl_type: String. Forwarded unchanged to the parent class constructors.
                Default is 'float'.
    """

    def __init__(self, text1_length, text2_length, embedding_file, word_index=None,
                 train_embed=True, kernel_num=21, sigma=0.1, exact_sigma=0.001,
                 target_mode="ranking", bigdl_type="float"):
        # Check kernel_num first so an invalid value is reported before
        # prepare_embedding is called on the embedding file.
        assert kernel_num > 1, "kernel_num must be an int larger than 1"
        embed_weights = prepare_embedding(embedding_file, word_index,
                                          randomize_unknown=True, normalize=True)
        vocab_size, embed_size = embed_weights.shape
        super(KNRM, self).__init__(text1_length, vocab_size, embed_size,
                                   embed_weights, train_embed, target_mode, bigdl_type)
        self.text2_length = text2_length
        self.kernel_num = kernel_num
        self.sigma = float(sigma)
        self.exact_sigma = float(exact_sigma)
        # build_model reads the attributes assigned above, so it must run after them.
        self.model = self.build_model()
        # super(TextMatcher, self) resolves to the class after TextMatcher in the MRO,
        # so this second call deliberately bypasses TextMatcher.__init__.
        # NOTE(review): presumably this reaches ZooModel.__init__ - confirm.
        super(TextMatcher, self).__init__(None, self.bigdl_type,
                                          self.text1_length,
                                          self.text2_length,
                                          self.vocab_size,
                                          self.embed_size,
                                          JTensor.from_ndarray(embed_weights),
                                          self.train_embed,
                                          self.kernel_num,
                                          self.sigma,
                                          self.exact_sigma,
                                          self.target_mode,
                                          self.model)

    def build_model(self):
        """
        Build the KNRM graph and return it as a Model.

        The model takes a single input of length text1_length + text2_length
        (query followed by doc), embeds it, and applies kernel_num RBF kernels
        to the query/doc translation matrix. The pooled kernel features feed a
        Dense(1) layer, which uses a sigmoid activation unless target_mode is
        "ranking".
        """
        # Remark: Share weights for embedding is not supported.
        # Thus here the model takes concatenated input and slice to split the input.
        # The local is not called `input`, to avoid shadowing the builtin.
        model_input = Input(name='input', shape=(self.text1_length + self.text2_length,))
        embedding = Embedding(self.vocab_size, self.embed_size, weights=self.embed_weights,
                              trainable=self.train_embed)(model_input)
        query_embed = embedding.slice(1, 0, self.text1_length)
        doc_embed = embedding.slice(1, self.text1_length, self.text2_length)
        mm = A.batch_dot(query_embed, doc_embed, axes=[2, 2])  # Translation Matrix.
        # Loop-invariant denominator of the kernel-centre formula.
        mu_denom = self.kernel_num - 1
        KM = []
        for i in range(self.kernel_num):
            # Kernel centres are spaced 2 / (kernel_num - 1) apart; the last one
            # exceeds 1.0 and is turned into the exact-match kernel below.
            mu = 1. / mu_denom + (2. * i) / mu_denom - 1.0
            sigma = self.sigma
            if mu > 1.0:  # Exact match.
                sigma = self.exact_sigma
                mu = 1.0
            mm_exp = A.exp((-0.5) * (mm - mu) * (mm - mu) / sigma / sigma)
            mm_doc_sum = A.sum(mm_exp, axis=2)
            mm_log = A.log(mm_doc_sum + 1.0)
            # Remark: Keep the reduced dimension for the last sum and squeeze after stack.
            # Otherwise, when batch=1, the output will become a Scalar not compatible for stack.
            mm_sum = A.sum(mm_log, axis=1, keepDims=True)
            KM.append(mm_sum)
        Phi = Squeeze(2)(A.stack(KM))
        if self.target_mode == "ranking":
            output = Dense(1, init="uniform")(Phi)
        else:
            output = Dense(1, init="uniform", activation="sigmoid")(Phi)
        model = Model(input=model_input, output=output)
        return model

    @staticmethod
    def load_model(path, weight_path=None, bigdl_type="float"):
        """
        Load an existing KNRM model (with weights).

        # Arguments
        path: The path for the pre-defined model.
              Local file system, HDFS and Amazon S3 are supported.
              HDFS path should be like 'hdfs://[host]:[port]/xxx'.
              Amazon S3 path should be like 's3a://bucket/xxx'.
        weight_path: The path for pre-trained weights if any. Default is None.
        bigdl_type: String. Passed to callZooFunc and ZooModel._do_load.
                    Default is 'float'.

        # Returns
        The loaded model, with its class set to KNRM.
        """
        jmodel = callZooFunc(bigdl_type, "loadKNRM", path, weight_path)
        model = ZooModel._do_load(jmodel, bigdl_type)
        # _do_load's result is re-tagged so callers get KNRM's methods on it.
        model.__class__ = KNRM
        return model