# Source code for zoo.pipeline.api.keras.layers.embeddings

#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import sys

from bigdl.util.common import JTensor
from zoo.common.utils import callZooFunc
from ..engine.topology import ZooKerasLayer

# Python 3 has no ``long`` or ``unicode`` builtins; alias them to ``int`` and
# ``str`` so that code written against the Python 2 names keeps working.
# The check uses ``sys.version_info`` (integers) instead of the ``sys.version``
# string: string comparison is lexicographic, so a two-digit major version
# such as "10.0" would wrongly compare as smaller than "3".
if sys.version_info[0] >= 3:
    long = int
    unicode = str


class Embedding(ZooKerasLayer):
    """
    Turn integer indices into dense vectors of fixed size.
    The input of this layer should be 2D.

    This layer can only be used as the first layer in a model, you need to provide the argument
    input_length (an integer) or input_shape (a shape tuple, does not include the batch dimension).
    If both are provided, input_length takes precedence over input_shape.

    # Arguments
    input_dim: Size of the vocabulary. Int > 0.
    output_dim: Dimension of the dense embedding. Int > 0.
    init: String representation of the initialization method for the weights of the layer.
          Default is 'uniform'.
    weights: Initial weights set to this layer, which should be a numpy array of
             size (input_dim, output_dim). Default is None and in this case weights are
             initialized by the initialization method specified by 'init'.
             Otherwise, 'weights' will override 'init' to take effect.
    trainable: Whether this layer is trainable or not. Default is True.
    input_length: Positive int. The sequence length of each input.
    W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization),
                   applied to the embedding matrix. Default is None.
    input_shape: A shape tuple, does not include the batch dimension.
                 Ignored when input_length is provided.
    mask_zero: Boolean. If True, the output for every input whose value equals
               padding_value will be masked to a zero vector. Default is False.
    padding_value: The padding value used together with mask_zero. Default is 0.
    zero_based_id: Boolean. Default is True and in this case the input should be 0-based.
                   Otherwise the input needs to be 1-based.
    name: String to set the name of the layer.
          If not specified, its name will by default be a generated string.

    >>> embedding = Embedding(1000, 32, input_length=10, name="embedding1")
    creating: createZooKerasEmbedding

    >>> import numpy as np
    >>> embedding = Embedding(10, 200, weights=np.random.random([10, 200]), input_length=10)
    creating: createZooKerasEmbedding
    """

    def __init__(self, input_dim, output_dim, init="uniform", weights=None, trainable=True,
                 input_length=None, W_regularizer=None, input_shape=None, mask_zero=False,
                 padding_value=0, zero_based_id=True, **kwargs):
        # A truthy input_length replaces whatever input_shape the caller supplied.
        if input_length:
            input_shape = (input_length,)
        # NOTE(review): weights is converted even when it is None; this relies on
        # JTensor.from_ndarray accepting None -- confirm against bigdl.util.common.
        initial_weights = JTensor.from_ndarray(weights)
        shape = list(input_shape) if input_shape else None
        # The positional order below is the one handed to ZooKerasLayer and differs
        # from the order of this constructor's own parameters. The leading None is
        # presumably the placeholder for an existing backend object -- TODO confirm.
        super(Embedding, self).__init__(None,
                                        input_dim,
                                        output_dim,
                                        init,
                                        initial_weights,
                                        trainable,
                                        W_regularizer,
                                        shape,
                                        mask_zero,
                                        padding_value,
                                        zero_based_id,
                                        **kwargs)


class WordEmbedding(ZooKerasLayer):
    """
    Embedding layer that directly loads pre-trained word vectors as weights.
    Turn non-negative integers (indices) into dense vectors of fixed size.
    Currently only GloVe embedding is supported.
    The input of this layer should be 2D.

    This layer can only be used as the first layer in a model, you need to provide the argument
    input_length (an integer) or input_shape (a shape tuple, does not include the batch dimension).
    If both are provided, input_length takes precedence over input_shape.

    # Arguments
    embedding_file: The path to the embedding file.
                    Currently the following GloVe files are supported:
                    "glove.6B.50d.txt", "glove.6B.100d.txt", "glove.6B.200d.txt"
                    "glove.6B.300d.txt", "glove.42B.300d.txt", "glove.840B.300d.txt".
                    You can download them from: https://nlp.stanford.edu/projects/glove/.
    word_index: Dictionary of word (string) and its corresponding index (int).
                The index is supposed to start from 1 with 0 reserved for unknown words.
                During the prediction, if you have words that are not in the word_index
                for the training, you can map them to index 0.
                Default is None. In this case, all the words in the embedding_file will
                be taken into account and you can call
                WordEmbedding.get_word_index(embedding_file) to retrieve the dictionary.
    trainable: To configure whether the weights of this layer will be updated or not.
               Only False is supported for now.
    input_length: Positive int. The sequence length of each input.
    input_shape: A shape tuple, does not include the batch dimension.
                 Ignored when input_length is provided.
    name: String to set the name of the layer.
          If not specified, its name will by default be a generated string.
    """

    def __init__(self, embedding_file, word_index=None, trainable=False, input_length=None,
                 input_shape=None, **kwargs):
        # A truthy input_length replaces whatever input_shape the caller supplied.
        if input_length:
            input_shape = (input_length,)
        shape = list(input_shape) if input_shape else None
        # The leading None is presumably the placeholder for an existing backend
        # object -- TODO confirm against ZooKerasLayer.
        super(WordEmbedding, self).__init__(None,
                                            embedding_file,
                                            word_index,
                                            trainable,
                                            shape,
                                            **kwargs)

    @staticmethod
    def get_word_index(embedding_file, bigdl_type="float"):
        """
        Get the full wordIndex map from the given embedding_file.

        # Arguments
        embedding_file: The path to the embedding file.
                        Currently only the following GloVe files are supported:
                        "glove.6B.50d.txt", "glove.6B.100d.txt", "glove.6B.200d.txt"
                        "glove.6B.300d.txt", "glove.42B.300d.txt", "glove.840B.300d.txt".
                        You can download them from: https://nlp.stanford.edu/projects/glove/.
        bigdl_type: String forwarded as the first argument of callZooFunc.
                    Default is "float".

        # Return
        Dictionary of word (string) and its corresponding index (int) obtained from
        the given embedding file.
        """
        word_index = callZooFunc(bigdl_type, "wordEmbeddingGetWordIndex", embedding_file)
        return word_index


def prepare_embedding(embedding_file, word_index=None,
                      randomize_unknown=False, normalize=False):
    """
    Prepare embedding weights from embedding_file given word_index.

    The work is delegated to the "prepareEmbedding" backend function through
    callZooFunc (always with the "float" type), and the returned tensor is
    converted with its to_ndarray() method.

    # Arguments
    embedding_file: The path to the embedding file. See WordEmbedding.
    word_index: Dictionary of word (string) and its corresponding index (int).
                Default is None. See WordEmbedding.
    randomize_unknown: Boolean. Whether to randomly initialize words that don't exist in
                       embedding_file. Default is False and in this case corresponding entries
                       to unknown words will be zero vectors.
    normalize: Boolean. Whether to normalize word vectors. Default is False.

    # Return
    Pretrained embedding weights as a numpy array.
    """
    weights = callZooFunc("float", "prepareEmbedding",
                          embedding_file,
                          word_index,
                          randomize_unknown,
                          normalize)
    return weights.to_ndarray()


class SparseEmbedding(ZooKerasLayer):
    """
    SparseEmbedding is the sparse version of layer Embedding.

    The input of SparseEmbedding should be a 2D SparseTensor or two 2D SparseTensors.
    If the input is a SparseTensor, the values are positive integer ids,
    values in each row of this SparseTensor will be turned into a dense vector.
    If the input is two SparseTensors, the first tensor should be the integer ids, just
    like the SparseTensor input. And the second tensor is the corresponding
    weights of the integer ids.

    This layer can only be used as the first layer in a model, you need to provide the argument
    input_shape (a Single Shape, does not include the batch dimension).

    # Arguments
    input_dim: Size of the vocabulary. Int > 0.
    output_dim: Dimension of the dense embedding. Int >= 0.
    combiner: A string specifying the reduce type.
              Currently "mean", "sum", "sqrtn" is supported. Default is "sum".
    max_norm: If provided, each embedding is normalized to have l2 norm equal to
              max_norm before combining. Default is -1.0.
    init: String representation of the initialization method for the weights of the layer.
          Default is 'uniform'.
    W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization),
                   applied to the embedding matrix. Default is None.
    input_shape: A Single Shape, does not include the batch dimension.
    name: String to set the name of the layer.
          If not specified, its name will by default be a generated string.

    >>> sparse_embedding = SparseEmbedding(input_dim=10, output_dim=4, input_shape=(10, ))
    creating: createZooKerasSparseEmbedding
    """

    def __init__(self, input_dim, output_dim, combiner="sum", max_norm=-1.0, init="uniform",
                 W_regularizer=None, input_shape=None, **kwargs):
        # An empty or missing input_shape is forwarded as None; anything else as a list.
        shape = list(input_shape) if input_shape else None
        # The leading None is presumably the placeholder for an existing backend
        # object -- TODO confirm against ZooKerasLayer.
        super(SparseEmbedding, self).__init__(None,
                                              input_dim,
                                              output_dim,
                                              combiner,
                                              max_norm,
                                              init,
                                              W_regularizer,
                                              shape,
                                              **kwargs)