
#! /usr/bin/python
# -*- coding: utf-8 -*-

import tensorlayerx as tlx
from tensorlayerx import logging
from tensorlayerx.nn.core import Module

__all__ = ['OneHot', 'Word2vecEmbedding', 'Embedding', 'AverageEmbedding']


class OneHot(Module):
    """
    The :class:`OneHot` class is the starting layer of a neural network, see ``tf.one_hot``.
    Useful link: `https://www.tensorflow.org/api_docs/python/tf/one_hot`.

    Parameters
    ----------
    depth : None or int
        If the input indices are of rank N, the output will have rank N+1.
        The new axis is created at dimension `axis` (default: the new axis is appended at the end).
    on_value : None or number
        The value used to represent `ON`. If None, it defaults to 1.
    off_value : None or number
        The value used to represent `OFF`. If None, it defaults to 0.
    axis : None or int
        The axis along which the one-hot dimension is added.
    dtype : None or TensorFlow dtype
        The data type, None means tlx.float32.
    name : str
        A unique layer name.

    Examples
    --------
    >>> net = tlx.nn.Input([32], dtype=tlx.int32)
    >>> onehot = tlx.nn.OneHot(depth=8)
    >>> print(onehot)
    OneHot(depth=8, name='onehot')
    >>> tensor = tlx.nn.OneHot(depth=8)(net)
    >>> print(tensor)
    Tensor([...], shape=(32, 8), dtype=float32)

    """

    def __init__(self, depth=None, on_value=1.0, off_value=0.0, axis=-1, dtype=tlx.float32, name=None):
        super(OneHot, self).__init__(name)
        self.depth = depth
        self.on_value = on_value
        self.off_value = off_value
        self.axis = axis
        self.dtype = dtype
        logging.info("OneHot %s" % (self.name))

        self.build()
        self._built = True

        if self.depth is None:
            raise RuntimeError(self.__class__.__name__ + ": depth is None, so the number of output units is undefined")

    def __repr__(self):
        s = '{classname}(depth={depth}'
        if self.on_value is not None:
            s += ', on_value={on_value}'
        if self.off_value is not None:
            s += ', off_value={off_value}'
        if self.axis is not None:
            s += ', axis={axis}'
        if self.name is not None:
            s += ", name='{name}'"
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape=None):
        self.onehot = tlx.ops.OneHot(
            depth=self.depth, on_value=self.on_value, off_value=self.off_value, axis=self.axis, dtype=self.dtype
        )

    def forward(self, inputs):
        """
        Parameters
        ----------
        inputs : input tensor
            The inputs are indices. The locations represented by the indices take the value on_value,
            while all other locations take the value off_value.
        """
        outputs = self.onehot(inputs)

        if not self._nodes_fixed and self._build_graph:
            self._add_node(inputs, outputs)
            self._nodes_fixed = True
        return outputs
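
# Usage sketch (not part of the library): a minimal example of OneHot, following the
# docstring example above. The batch size and depth are illustrative assumptions.
def _example_one_hot():
    # Encode a batch of 32 integer class indices into 8-dimensional one-hot vectors.
    net = tlx.nn.Input([32], dtype=tlx.int32)
    onehot = OneHot(depth=8)
    tensor = onehot(net)  # shape: (32, 8), dtype float32
    return tensor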
class Word2vecEmbedding(Module):
    """
    The :class:`Word2vecEmbedding` class is a fully connected layer. For word embedding, words are input
    as integer indices. The output is the embedded word vector.

    The layer integrates NCE loss by default (activate_nce_loss=True).
    If the NCE loss is activated, in a dynamic model the computation of the nce loss can be turned off in
    customised forward feeding by setting use_nce_loss=False when the layer is called.
    The NCE loss can be deactivated by setting activate_nce_loss=False.

    Parameters
    ----------
    num_embeddings : int
        size of the dictionary of embeddings.
    embedding_dim : int
        the size of each embedding vector.
    num_sampled : int
        The number of negative examples for the NCE loss.
    activate_nce_loss : boolean
        Whether to activate the NCE loss or not. By default, True.
        If True, the layer will return both the embedding outputs and nce_cost in forward feeding.
        If False, the layer will only return the embedding outputs.
        In a dynamic model, the computation of the nce loss can be turned off in forward feeding
        by setting use_nce_loss=False when the layer is called.
        In a static model, once the model is constructed, the computation of the nce loss
        cannot be changed (it is always computed or never computed).
    nce_loss_args : dictionary
        The arguments for tf.ops.nce_loss().
    E_init : initializer or str
        The initializer for initializing the embedding matrix.
    nce_W_init : initializer or str
        The initializer for initializing the nce decoder weight matrix.
    nce_b_init : initializer or str
        The initializer for initializing the nce decoder bias vector.
    name : str
        A unique layer name.

    Attributes
    ----------
    outputs : Tensor
        The embedding layer outputs.
    normalized_embeddings : Tensor
        Normalized embedding matrix.
    nce_weights : Tensor
        The NCE weights, only when activate_nce_loss is True.
    nce_biases : Tensor
        The NCE biases, only when activate_nce_loss is True.

    Examples
    --------
    Word2Vec with TensorLayer (example in `examples/text_word_embedding/tutorial_word2vec_basic.py`)

    >>> import numpy as np
    >>> import tensorlayerx as tlx
    >>> batch_size = 8
    >>> embedding_dim = 50
    >>> inputs = tlx.nn.Input([batch_size], dtype=tlx.int32)
    >>> labels = tlx.nn.Input([batch_size, 1], dtype=tlx.int32)
    >>> emb_net = tlx.nn.Word2vecEmbedding(
    >>>     num_embeddings=10000,
    >>>     embedding_dim=embedding_dim,
    >>>     num_sampled=100,
    >>>     activate_nce_loss=True,  # the nce loss is activated
    >>>     nce_loss_args={},
    >>>     E_init=tlx.initializers.random_uniform(minval=-1.0, maxval=1.0),
    >>>     nce_W_init=tlx.initializers.truncated_normal(stddev=float(1.0 / np.sqrt(embedding_dim))),
    >>>     nce_b_init=tlx.initializers.constant(value=0.0),
    >>>     name='word2vec_layer',
    >>> )
    >>> print(emb_net)
    Word2vecEmbedding(num_embeddings=10000, embedding_dim=50, num_sampled=100, activate_nce_loss=True, nce_loss_args={})
    >>> embed_tensor = emb_net(inputs, use_nce_loss=False)  # the nce loss is turned off and there is no need to provide labels
    >>> embed_tensor = emb_net([inputs, labels], use_nce_loss=False)  # the nce loss is turned off and the labels will be ignored
    >>> embed_tensor, embed_nce_loss = emb_net([inputs, labels])  # the nce loss is calculated
    >>> outputs = tlx.nn.Linear(out_features=10, name="linear")(embed_tensor)
    >>> model = tlx.model.Model(inputs=[inputs, labels], outputs=[outputs, embed_nce_loss], name="word2vec_model")  # a static model
    >>> out = model([data_x, data_y], is_train=True)  # where data_x is inputs and data_y is labels

    References
    ----------
    `https://www.tensorflow.org/tutorials/representation/word2vec`

    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        num_sampled=64,
        activate_nce_loss=True,
        nce_loss_args=None,
        E_init='random_uniform',
        nce_W_init='truncated_normal',
        nce_b_init='constant',
        name=None,  # 'word2vec',
    ):
        super(Word2vecEmbedding, self).__init__(name)
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.num_sampled = num_sampled
        self.E_init = self.str_to_init(E_init)
        self.activate_nce_loss = activate_nce_loss

        if self.activate_nce_loss:
            # default to an empty dict so the **-expansion in build() works
            self.nce_loss_args = nce_loss_args if nce_loss_args is not None else {}
            self.nce_W_init = self.str_to_init(nce_W_init)
            self.nce_b_init = self.str_to_init(nce_b_init)

        if not self._built:
            self.build(tuple())
            self._built = True

        logging.info("Word2vecEmbedding %s: (%d, %d)" % (self.name, self.num_embeddings, self.embedding_dim))

    def __repr__(self):
        s = '{classname}('
        s += 'num_embeddings={num_embeddings}'
        s += ', embedding_dim={embedding_dim}'
        s += ', num_sampled={num_sampled}'
        s += ', activate_nce_loss={activate_nce_loss}'
        if self.activate_nce_loss:
            s += ', nce_loss_args={nce_loss_args}'
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        """
        Parameters
        ----------
        inputs_shape : tuple
            the shape of inputs tensor.
        """
        # Look up embeddings for inputs.
        # Note: a row of 'embeddings' is the vector representation of a word.
        # For the sake of speed, it is better to slice the embedding matrix
        # instead of converting a word id to a one-hot vector and then
        # multiplying by the embedding matrix.
        # embed is the output of the hidden (embedding) layer: a row vector
        # with 'embedding_dim' values.
        self.embeddings = self._get_weights(
            "embeddings",
            shape=(self.num_embeddings, self.embedding_dim),
            init=self.E_init,
        )

        self.normalized_embeddings = tlx.L2Normalize(axis=1)(self.embeddings)

        if self.activate_nce_loss:
            # Construct the variables for the NCE loss (i.e. negative sampling)
            self.nce_weights = self._get_weights(
                "nce_weights",
                shape=(self.num_embeddings, self.embedding_dim),
                init=self.nce_W_init,
            )

            self.nce_biases = self._get_weights(
                "nce_biases",
                shape=(self.num_embeddings, ),
                init=self.nce_b_init,
            )

        self.embedding_lookup = tlx.EmbeddingLookup()

        if self.activate_nce_loss:
            self.nce_loss = tlx.NCELoss(**self.nce_loss_args)

    def forward(self, inputs, use_nce_loss=None):
        """
        Parameters
        ----------
        inputs : tensor or list
            If the nce loss is activated and used, the argument should be a list of two tensors [inputs, labels].
            Otherwise, the argument should be a single tensor, which is the inputs.
        use_nce_loss : boolean
            Whether to use the NCE loss in this run.
            If the nce loss is used, activate_nce_loss must be True when the layer is initialized.
            By default, same as activate_nce_loss.

        Returns
        -------
        outputs : tensor
            The embedding outputs.
        nce_cost : tensor
            The nce_cost is returned only if the nce loss is used.
        """
        if isinstance(inputs, list):
            outputs = self.embedding_lookup(params=self.embeddings, ids=inputs[0])
        else:
            outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)

        if use_nce_loss is True and not self.activate_nce_loss:
            raise AttributeError(
                "The nce loss is not activated when the %s is initialized. Please set activate_nce_loss=True." %
                self.__class__.__name__
            )

        if self.activate_nce_loss and (use_nce_loss is True or use_nce_loss is None):
            if not isinstance(inputs, list):
                raise ValueError("If the nce loss is used, the labels must be provided along with the inputs.")

            nce_cost = tlx.ops.reduce_mean(
                input_tensor=self.nce_loss(
                    weights=self.nce_weights, biases=self.nce_biases, inputs=outputs, labels=inputs[1],
                    num_sampled=self.num_sampled, num_classes=self.num_embeddings
                )
            )

            if not self._nodes_fixed and self._build_graph:
                self._add_node(inputs, [outputs, nce_cost])
                self._nodes_fixed = True
            return outputs, nce_cost

        if not self._nodes_fixed and self._build_graph:
            self._add_node(inputs, outputs)
            self._nodes_fixed = True
        return outputs
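
# Usage sketch (not part of the library): the two forward modes of Word2vecEmbedding,
# based on the docstring example above. The shapes and hyper-parameters are illustrative
# assumptions, and nce_loss_args is left at its default.
def _example_word2vec_embedding():
    inputs = tlx.nn.Input([8], dtype=tlx.int32)      # centre-word indices
    labels = tlx.nn.Input([8, 1], dtype=tlx.int32)   # target-word indices for NCE
    emb_net = Word2vecEmbedding(num_embeddings=10000, embedding_dim=50, num_sampled=100)
    # With NCE activated (the default), passing [inputs, labels] returns both the
    # embeddings and the NCE cost.
    embed, nce_cost = emb_net([inputs, labels])
    # Passing a single tensor with use_nce_loss=False returns only the embeddings.
    embed_only = emb_net(inputs, use_nce_loss=False)
    return embed, nce_cost, embed_only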
class Embedding(Module):
    """
    A simple lookup table that stores embeddings of a fixed dictionary and size.

    This module is often used to store word embeddings and retrieve them using indices.
    The input to the module is a list of indices, and the output is the corresponding word embeddings.

    Parameters
    ----------
    num_embeddings : int
        size of the dictionary of embeddings.
    embedding_dim : int
        the size of each embedding vector.
    E_init : initializer or str
        The initializer for the embedding matrix.
    name : str
        A unique layer name.

    Attributes
    ----------
    outputs : tensor
        The embedding layer output is a 3D tensor in the shape: (batch_size, num_steps(num_words), embedding_dim).

    Examples
    --------
    >>> import tensorlayerx as tlx
    >>> input = tlx.nn.Input([8, 100], dtype=tlx.int32)
    >>> embed = tlx.nn.Embedding(num_embeddings=1000, embedding_dim=50, name='embed')
    >>> print(embed)
    Embedding(num_embeddings=1000, embedding_dim=50)
    >>> tensor = embed(input)
    >>> print(tensor)
    Tensor([...], shape=(8, 100, 50), dtype=float32)

    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        E_init='random_uniform',
        name=None,  # 'embedding',
    ):
        super(Embedding, self).__init__(name)
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.E_init = self.str_to_init(E_init)

        if not self._built:
            self.build(tuple())
            self._built = True

        logging.info("Embedding %s: (%d, %d)" % (self.name, self.num_embeddings, self.embedding_dim))

    def __repr__(self):
        s = '{classname}('
        s += 'num_embeddings={num_embeddings}'
        s += ', embedding_dim={embedding_dim}'
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        """
        Parameters
        ----------
        inputs_shape : tuple
            the shape of inputs tensor.
        """
        self.embeddings = self._get_weights(
            "embeddings",
            shape=(self.num_embeddings, self.embedding_dim),
            init=self.E_init,
        )
        self.embedding_lookup = tlx.EmbeddingLookup()

    def forward(self, inputs):
        """
        Parameters
        ----------
        inputs : Tensor
            The input of a network.
        """
        outputs = self.embedding_lookup(params=self.embeddings, ids=inputs)

        if not self._nodes_fixed and self._build_graph:
            self._add_node(inputs, outputs)
            self._nodes_fixed = True
        return outputs
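
# Usage sketch (not part of the library): index lookup with Embedding, mirroring the
# docstring example above. The vocabulary size and sequence length are assumptions.
def _example_embedding():
    ids = tlx.nn.Input([8, 100], dtype=tlx.int32)             # (batch_size, num_steps)
    embed = Embedding(num_embeddings=1000, embedding_dim=50)
    return embed(ids)                                          # (8, 100, 50)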
class AverageEmbedding(Module):
    """The :class:`AverageEmbedding` averages over the embeddings of its inputs.
    This is often used as the input layer for models like DAN[1] and FastText[2].

    Parameters
    ----------
    num_embeddings : int
        size of the dictionary of embeddings.
    embedding_dim : int
        the size of each embedding vector.
    pad_value : int
        The scalar padding value used in inputs, 0 as default.
    E_init : initializer or str
        The initializer of the embedding matrix.
    name : str
        A unique layer name.

    Attributes
    ----------
    outputs : tensor
        The embedding layer output is a 2D tensor in the shape: (batch_size, embedding_dim).

    References
    ----------
    - [1] Iyyer, M., Manjunatha, V., Boyd-Graber, J., & Daumé III, H. (2015). Deep Unordered Composition Rivals
      Syntactic Methods for Text Classification. In Association for Computational Linguistics.
    - [2] Joulin, A., Grave, E., Bojanowski, P., & Mikolov, T. (2016). `Bag of Tricks for Efficient Text
      Classification. <http://arxiv.org/abs/1607.01759>`__

    Examples
    --------
    >>> import tensorlayerx as tlx
    >>> batch_size = 8
    >>> length = 5
    >>> input = tlx.nn.Input([batch_size, length], dtype=tlx.int32)
    >>> avgembed = tlx.nn.AverageEmbedding(num_embeddings=1000, embedding_dim=50, name='avg')
    >>> print(avgembed)
    AverageEmbedding(num_embeddings=1000, embedding_dim=50, pad_value=0)
    >>> tensor = avgembed(input)
    >>> print(tensor)
    Tensor([...], shape=(8, 50), dtype=float32)

    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        pad_value=0,
        E_init='random_uniform',
        name=None,  # 'average_embedding',
    ):
        super(AverageEmbedding, self).__init__(name)
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.pad_value = pad_value
        self.E_init = self.str_to_init(E_init)

        if not self._built:
            self.build(tuple())
            self._built = True

        logging.info("AverageEmbedding %s: (%d, %d)" % (self.name, self.num_embeddings, self.embedding_dim))

    def __repr__(self):
        s = '{classname}('
        s += 'num_embeddings={num_embeddings}'
        s += ', embedding_dim={embedding_dim}'
        s += ', pad_value={pad_value}'
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        """
        Parameters
        ----------
        inputs_shape : tuple
            the shape of inputs tensor.
        """
        # if len(inputs_shape) != 2:
        #     raise ValueError('inputs must be of size (batch_size, sentence_length)')

        self.embeddings = self._get_weights(
            "embeddings",
            shape=(self.num_embeddings, self.embedding_dim),
            init=self.E_init,
        )
        self.embedding_lookup = tlx.EmbeddingLookup()
        self.not_equal = tlx.NotEqual()
        self.cast = tlx.Cast(tlx.float32)
        self.expand_dims = tlx.ExpandDims(axis=-1)
        self.reduce_sum = tlx.ReduceSum(axis=1)
        self.count_nonzero = tlx.CountNonzero(keepdims=True, dtype=tlx.float32)

    def forward(self, inputs):
        """
        Parameters
        ----------
        inputs : tensor
            The network input.
            For word inputs, please use the integer index format, 2D tensor: (batch_size, sentence_length).
        """
        word_embeddings = self.embedding_lookup(params=self.embeddings, ids=inputs)

        # Zero out embeddings of the pad value
        masks = self.not_equal(inputs, self.pad_value)
        word_embeddings *= self.cast(self.expand_dims(masks))
        sum_word_embeddings = self.reduce_sum(input=word_embeddings)

        # Count the number of non-padding words in each sentence
        sentence_lengths = self.count_nonzero(masks, axis=1)

        sentence_embeddings = tlx.ops.divide(
            sum_word_embeddings,
            sentence_lengths + 1e-8,  # add epsilon to avoid dividing by 0
        )

        outputs = sentence_embeddings

        if not self._nodes_fixed and self._build_graph:
            self._add_node(inputs, outputs)
            self._nodes_fixed = True
        return outputs
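
# Usage sketch (not part of the library): averaging word embeddings with padding masked
# out, mirroring the docstring example above. The pad value 0 and the shapes are
# assumptions for illustration.
def _example_average_embedding():
    ids = tlx.nn.Input([8, 5], dtype=tlx.int32)                # (batch_size, sentence_length)
    avg = AverageEmbedding(num_embeddings=1000, embedding_dim=50, pad_value=0)
    # Embeddings at positions equal to pad_value are zeroed before averaging,
    # so shorter sentences are not diluted by padding tokens.
    return avg(ids)                                            # (8, 50)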