Source code for tensorlayerx.nn.layers.Transformer

#! /usr/bin/python
# -*- coding: utf-8 -*-

import tensorlayerx as tlx
from tensorlayerx import logging
from tensorlayerx.nn.core import Module
from tensorlayerx.nn.core import ModuleList
import numpy as np

__all__ = [
    'MultiheadAttention',
    'Transformer',
    'TransformerEncoder',
    'TransformerDecoder',
    'TransformerEncoderLayer',
    'TransformerDecoderLayer',
]


class MultiheadAttention(Module):
    """Allows the model to jointly attend to information from different representation subspaces.

    Parameters
    ----------
    embed_dim : int
        total dimension of the model.
    num_heads : int
        The number of heads in multi-head attention.
    dropout : float
        a Dropout layer on attn_output_weights. Default: 0.0.
    kdim : int
        total number of features in key. Default: None.
    vdim : int
        total number of features in value. Default: None.
    bias : bool
        add bias as module parameter. Default: True.
    batch_first : bool
        If ``True``, then the input and output tensors are provided as `[batch, seq, feature]`.
        Default: ``False``, i.e. `[seq, batch, feature]`.
    need_weights : bool
        Indicate whether to return the attention weights. Default: ``True``.
    name : None or str
        A unique layer name.

    Examples
    --------
    With TensorLayerX

    >>> q = tlx.nn.Input(shape=(4, 2, 128))
    >>> attn_mask = tlx.convert_to_tensor(np.zeros((4, 4)), dtype='bool')
    >>> layer = MultiheadAttention(embed_dim=128, num_heads=4)
    >>> output = layer(q, attn_mask=attn_mask)

    References
    ----------
    - `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`__

    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        dropout=0.0,
        kdim=None,
        vdim=None,
        bias=True,
        batch_first=False,
        need_weights=True,
        name=None,
    ):
        super(MultiheadAttention, self).__init__(name)
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.need_weights = need_weights
        self.head_dim = embed_dim // num_heads
        self.bias = bias
        self.batch_first = batch_first
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        self.build(None)
        logging.info(
            "MultiheadAttention %s: embed_dim: %d num_heads: %d kdim: %d vdim: %d dropout: %f" %
            (self.name, embed_dim, num_heads, self.kdim, self.vdim, dropout)
        )

    def __repr__(self):
        s = (
            '{classname}(embed_dim={embed_dim}, num_heads={num_heads}, dropout={dropout}'
            ', kdim={kdim}, vdim={vdim}, bias={bias}, batch_first={batch_first}, '
            'need_weights={need_weights}'
        )
        if self.name is not None:
            s += ', name = \'{name}\''
        s += ')'
        return s.format(classname=self.__class__.__name__, **self.__dict__)

    def build(self, inputs_shape):
        bias_init = tlx.nn.initializers.zeros()
        weight_init = tlx.nn.initializers.XavierNormal()
        self.q_weight = self._get_weights(
            'q_weight', shape=(self.embed_dim, self.embed_dim), init=weight_init, order=True
        )
        self.k_weight = self._get_weights(
            'k_weight', shape=(self.embed_dim, self.kdim), init=weight_init, order=True
        )
        self.v_weight = self._get_weights(
            'v_weight', shape=(self.embed_dim, self.vdim), init=weight_init, order=True
        )
        self.out_weight = self._get_weights(
            'out_weight', shape=(self.embed_dim, self.embed_dim), init=weight_init, order=True
        )
        self.q_bias = None
        self.k_bias = None
        self.v_bias = None
        self.out_bias = None
        if self.bias:
            self.q_bias = self._get_weights('q_bias', shape=(self.embed_dim, ), init=bias_init, order=True)
            self.k_bias = self._get_weights('k_bias', shape=(self.embed_dim, ), init=bias_init, order=True)
            self.v_bias = self._get_weights('v_bias', shape=(self.embed_dim, ), init=bias_init, order=True)
            self.out_bias = self._get_weights('out_bias', shape=(self.embed_dim, ), init=bias_init, order=True)

        self.multiheadattention = tlx.ops.multiheadattention(
            embed_dim=self.embed_dim, num_heads=self.num_heads, dropout=self.dropout, batch_first=self.batch_first,
            need_weights=self.need_weights, q_weight=self.q_weight, k_weight=self.k_weight,
            v_weight=self.v_weight, out_weight=self.out_weight, q_bias=self.q_bias, k_bias=self.k_bias,
            v_bias=self.v_bias, out_bias=self.out_bias, train=self.is_train
        )

    def forward(self, q, k=None, v=None, attn_mask=None, key_padding_mask=None):
        """
        Parameters
        ----------
        q : Tensor
            The queries for multi-head attention. If `batch_first` is ``True``, it is a tensor with shape
            `[batch_size, query_length, embed_dim]`. If `batch_first` is ``False``, it is a tensor with shape
            `[query_length, batch_size, embed_dim]`. The data type should be float32 or float64.
        k : Tensor
            The keys for multi-head attention. If `batch_first` is ``True``, it is a tensor with shape
            `[batch_size, key_length, kdim]`. If `batch_first` is ``False``, it is a tensor with shape
            `[key_length, batch_size, kdim]`. The data type should be float32 or float64.
            If None, use `query` as `key`. Default is `None`.
        v : Tensor
            The values for multi-head attention. If `batch_first` is ``True``, it is a tensor with shape
            `[batch_size, value_length, vdim]`. If `batch_first` is ``False``, it is a tensor with shape
            `[value_length, batch_size, vdim]`. The data type should be float32 or float64.
            If None, use `query` as `value`. Default is `None`.
        attn_mask : Tensor
            2D or 3D mask that prevents attention to certain positions. A 2D mask is broadcast over all
            the batches, while a 3D mask allows a different mask for each entry of the batch.
            A 2D mask has shape :math:`(L, S)`, where L is the target sequence length and S is the source
            sequence length. A 3D mask has shape :math:`(N\cdot\text{num\_heads}, L, S)`, where N is the
            batch size. ``attn_mask`` ensures that position i is only allowed to attend to the unmasked
            positions. If a ByteTensor is provided, the non-zero positions are not allowed to attend while
            the zero positions will be unchanged. If a BoolTensor is provided, positions with ``True`` are
            not allowed to attend while ``False`` values will be unchanged. If a FloatTensor is provided,
            it will be added to the attention weight.
        key_padding_mask : Tensor
            If provided, specified padding elements in the key will be ignored by the attention.
            Its shape is :math:`(N, S)`, where N is the batch size and S is the source sequence length.
            If a ByteTensor is provided, the non-zero positions will be ignored while the zero positions
            will be unchanged. If a BoolTensor is provided, the positions with the value of ``True`` will be
            ignored while the positions with the value of ``False`` will be unchanged.

        Returns
        -------
        attn_output : Tensor
            :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
            E is the embedding dimension. :math:`(N, L, E)` if ``batch_first`` is ``True``.
        attn_output_weights : Tensor
            :math:`(N, L, S)` where N is the batch size, L is the target sequence length,
            S is the source sequence length.

        """
        attn_output, attn_output_weights = self.multiheadattention(q, k, v, attn_mask, key_padding_mask)

        if not self._nodes_fixed and self._build_graph:
            self._add_node([q, k, v, attn_mask, key_padding_mask], [attn_output, attn_output_weights])
            self._nodes_fixed = True
        return attn_output, attn_output_weights
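
# Usage sketch (not part of the original module): plain self-attention over a
# `[seq, batch, feature]` input with a boolean attention mask, mirroring the
# docstring example above. `_demo_multihead_attention` is an illustrative
# helper name, not a public API.
def _demo_multihead_attention():
    q = tlx.nn.Input(shape=(4, 2, 128))                                # [seq_len=4, batch=2, embed_dim=128]
    attn_mask = tlx.convert_to_tensor(np.zeros((4, 4)), dtype='bool')  # all False: nothing is masked
    layer = MultiheadAttention(embed_dim=128, num_heads=4)
    # k and v default to q, so this is self-attention.
    attn_output, attn_weights = layer(q, attn_mask=attn_mask)
    return attn_output, attn_weights                                   # shapes: [4, 2, 128] and [2, 4, 4]
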
class Transformer(Module):
    """A transformer model. User is able to modify the attributes as needed.

    Parameters
    ----------
    d_model : int
        the number of expected features in the encoder/decoder inputs.
    nhead : int
        the number of heads in the multiheadattention model.
    num_encoder_layers : int
        the number of sub-encoder-layers in the encoder.
    num_decoder_layers : int
        the number of sub-decoder-layers in the decoder.
    dim_feedforward : int
        the dimension of the feedforward network model.
    dropout : float
        a Dropout layer on attn_output_weights. Default: 0.1.
    act : str
        the activation function of encoder/decoder intermediate layer, 'relu' or 'gelu'. Default: 'relu'.
    custom_encoder : Module or None
        custom encoder.
    custom_decoder : Module or None
        custom decoder.
    layer_norm_eps : float
        the eps value in layer normalization components. Default: 1e-5.
    batch_first : bool
        If ``True``, then the input and output tensors are provided as `[batch, seq, feature]`.
        Default: ``False``, i.e. `[seq, batch, feature]`.

    Examples
    --------
    With TensorLayerX

    >>> src = tlx.nn.Input(shape=(4, 2, 128))
    >>> tgt = tlx.nn.Input(shape=(4, 2, 128))
    >>> layer = Transformer(d_model=128, nhead=4)
    >>> output = layer(src, tgt)

    References
    ----------
    - `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`__
    - `BERT <https://arxiv.org/abs/1810.04805>`__

    """

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        num_decoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        act='relu',
        custom_encoder=None,
        custom_decoder=None,
        layer_norm_eps=1e-5,
        batch_first=False,
    ):
        super(Transformer, self).__init__()
        if custom_encoder is not None:
            self.encoder = custom_encoder
        else:
            encoder_layer = TransformerEncoderLayer(
                d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, act=act,
                layer_norm_eps=layer_norm_eps, batch_first=batch_first
            )
            encoder_norm = tlx.nn.layers.LayerNorm(d_model, epsilon=layer_norm_eps)
            self.encoder = TransformerEncoder(
                encoder_layer=encoder_layer, num_layers=num_encoder_layers, norm=encoder_norm
            )

        if custom_decoder is not None:
            self.decoder = custom_decoder
        else:
            decoder_layer = TransformerDecoderLayer(
                d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout, act=act,
                layer_norm_eps=layer_norm_eps, batch_first=batch_first
            )
            decoder_norm = tlx.nn.layers.LayerNorm(d_model, epsilon=layer_norm_eps)
            self.decoder = TransformerDecoder(
                decoder_layer=decoder_layer, num_layers=num_decoder_layers, norm=decoder_norm
            )

        self.d_model = d_model
        self.nhead = nhead
        self.batch_first = batch_first

    def forward(
        self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None, src_key_padding_mask=None,
        tgt_key_padding_mask=None, memory_key_padding_mask=None
    ):
        """
        Parameters
        ----------
        src : Tensor
            the sequence to the encoder.
        tgt : Tensor
            the sequence to the decoder.
        src_mask : Tensor
            the additive mask for the src sequence.
        tgt_mask : Tensor
            the additive mask for the tgt sequence.
        memory_mask : Tensor
            the additive mask for the encoder output.
        src_key_padding_mask : Tensor
            mask for src keys per batch.
        tgt_key_padding_mask : Tensor
            mask for tgt keys per batch.
        memory_key_padding_mask : Tensor
            mask for memory keys per batch.

        """
        if not self.batch_first and src.shape[1] != tgt.shape[1]:
            raise ValueError("the batch number of src and tgt must be equal")
        elif self.batch_first and src.shape[0] != tgt.shape[0]:
            raise ValueError("the batch number of src and tgt must be equal")

        if src.shape[2] != self.d_model or tgt.shape[2] != self.d_model:
            raise ValueError("the feature number of src and tgt must be equal to d_model")

        memory = self.encoder(src, mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        output = self.decoder(
            tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask, tgt_key_padding_mask=tgt_key_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask
        )

        if not self._nodes_fixed and self._build_graph:
            self._add_node(
                [
                    src, tgt, src_mask, tgt_mask, memory_mask, src_key_padding_mask, tgt_key_padding_mask,
                    memory_key_padding_mask
                ], output
            )
            self._nodes_fixed = True
        return output

    def generate_square_subsequent_mask(self, length):
        """Generate a square mask for the sequence. The masked positions are filled with float('-inf').
        Unmasked positions are filled with float(0.0).

        Parameters
        ----------
        length : int
            The length of sequence.

        Examples
        --------
        With TensorLayerX

        >>> length = 5
        >>> mask = transformer.generate_square_subsequent_mask(length)
        >>> print(mask)
        [[ 0. -inf -inf -inf -inf]
         [ 0.   0. -inf -inf -inf]
         [ 0.   0.   0. -inf -inf]
         [ 0.   0.   0.   0. -inf]
         [ 0.   0.   0.   0.   0.]]

        """
        return tlx.triu(tlx.ones(shape=(length, length)) * -np.inf, 1)
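
# Usage sketch (not part of the original module): a full encoder-decoder pass
# with a causal (additive, float) mask on the target sequence produced by
# generate_square_subsequent_mask. `_demo_transformer` is an illustrative
# helper name only.
def _demo_transformer():
    src = tlx.nn.Input(shape=(4, 2, 128))  # [src_len=4, batch=2, d_model=128]
    tgt = tlx.nn.Input(shape=(4, 2, 128))  # [tgt_len=4, batch=2, d_model=128]
    model = Transformer(d_model=128, nhead=4)
    # -inf above the diagonal keeps position i from attending to positions > i.
    tgt_mask = model.generate_square_subsequent_mask(4)
    return model(src, tgt, tgt_mask=tgt_mask)  # [tgt_len=4, batch=2, d_model=128]
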
class TransformerEncoder(Module):
    """TransformerEncoder is a stack of N encoder layers.

    Parameters
    ----------
    encoder_layer : Module
        an instance of the TransformerEncoderLayer() class.
    num_layers : int
        the number of sub-encoder-layers in the encoder.
    norm : Module or None
        the layer normalization component.

    Examples
    --------
    With TensorLayerX

    >>> q = tlx.nn.Input(shape=(4, 2, 128))
    >>> attn_mask = tlx.convert_to_tensor(np.zeros((4, 4)), dtype='bool')
    >>> encoder = TransformerEncoderLayer(128, 2, 256)
    >>> encoder = TransformerEncoder(encoder, num_layers=3)
    >>> output = encoder(q, mask=attn_mask)

    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        # self.encoder_layers = ModuleList([copy.deepcopy(encoder_layer) for i in range(num_layers)])
        self.encoder_layers = ModuleList(
            [(encoder_layer if i == 0 else type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]
        )
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, mask=None, src_key_padding_mask=None):
        """
        Parameters
        ----------
        src : Tensor
            the sequence to the encoder.
        mask : Tensor
            the mask for the src sequence.
        src_key_padding_mask : Tensor
            the mask for the src keys per batch.

        """
        output = src
        for module in self.encoder_layers:
            output = module(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask)
        if self.norm is not None:
            output = self.norm(output)

        if not self._nodes_fixed and self._build_graph:
            self._add_node([src, mask, src_key_padding_mask], output)
            self._nodes_fixed = True
        return output
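
# Usage sketch (not part of the original module): stacking three encoder layers.
# Layers after the first are fresh instances rebuilt from the first layer's
# `_config`, so they do not share weights. `_demo_transformer_encoder` is an
# illustrative helper name only.
def _demo_transformer_encoder():
    q = tlx.nn.Input(shape=(4, 2, 128))
    attn_mask = tlx.convert_to_tensor(np.zeros((4, 4)), dtype='bool')
    encoder_layer = TransformerEncoderLayer(128, 2, 256)
    encoder = TransformerEncoder(encoder_layer, num_layers=3)
    return encoder(q, mask=attn_mask)  # [4, 2, 128]
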
class TransformerDecoder(Module):
    """TransformerDecoder is a stack of N decoder layers.

    Parameters
    ----------
    decoder_layer : Module
        an instance of the TransformerDecoderLayer() class.
    num_layers : int
        the number of sub-decoder-layers in the decoder.
    norm : Module or None
        the layer normalization component.

    Examples
    --------
    With TensorLayerX

    >>> q = tlx.nn.Input(shape=(4, 2, 128))
    >>> decoder = TransformerDecoderLayer(128, 2, 256)
    >>> decoder = TransformerDecoder(decoder, num_layers=3)
    >>> output = decoder(q, q)

    """

    def __init__(self, decoder_layer, num_layers, norm=None):
        super(TransformerDecoder, self).__init__()
        self.decoder_layers = ModuleList(
            [(decoder_layer if i == 0 else type(decoder_layer)(**decoder_layer._config)) for i in range(num_layers)]
        )
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None
    ):
        """
        Parameters
        ----------
        tgt : Tensor
            the sequence to the decoder.
        memory : Tensor
            the sequence from the last layer of the encoder.
        tgt_mask : Tensor
            the mask for the tgt sequence.
        memory_mask : Tensor
            the mask for the memory sequence.
        tgt_key_padding_mask : Tensor
            the mask for the tgt keys per batch.
        memory_key_padding_mask : Tensor
            the mask for the memory keys per batch.

        """
        output = tgt
        for module in self.decoder_layers:
            output = module(
                output, memory, tgt_mask=tgt_mask, memory_mask=memory_mask,
                tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=memory_key_padding_mask,
            )
        if self.norm is not None:
            output = self.norm(output)

        if not self._nodes_fixed and self._build_graph:
            self._add_node([tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask], output)
            self._nodes_fixed = True
        return output
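
# Usage sketch (not part of the original module): a three-layer decoder stack
# attending over an encoder memory of the same width. `_demo_transformer_decoder`
# is an illustrative helper name only.
def _demo_transformer_decoder():
    tgt = tlx.nn.Input(shape=(4, 2, 128))     # decoder input
    memory = tlx.nn.Input(shape=(4, 2, 128))  # encoder output
    decoder_layer = TransformerDecoderLayer(128, 2, 256)
    decoder = TransformerDecoder(decoder_layer, num_layers=3)
    return decoder(tgt, memory)  # [4, 2, 128]
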
class TransformerEncoderLayer(Module):
    """TransformerEncoderLayer is made up of self-attn and feedforward network.
    This standard encoder layer is based on the paper "Attention Is All You Need".

    Parameters
    ----------
    d_model : int
        total dimension of the model.
    nhead : int
        The number of heads in multi-head attention.
    dim_feedforward : int
        the dimension of the feedforward network model.
    dropout : float
        a Dropout layer on attn_output_weights. Default: 0.1.
    act : str
        The activation function in the feedforward network, 'relu' or 'gelu'. Default: 'relu'.
    layer_norm_eps : float
        the eps value in layer normalization components. Default: 1e-5.
    batch_first : bool
        If ``True``, then the input and output tensors are provided as `[batch, seq, feature]`.
        Default: ``False``, i.e. `[seq, batch, feature]`.

    Examples
    --------
    With TensorLayerX

    >>> q = tlx.nn.Input(shape=(4, 2, 128))
    >>> attn_mask = tlx.convert_to_tensor(np.zeros((4, 4)), dtype='bool')
    >>> encoder = TransformerEncoderLayer(128, 2, 256)
    >>> output = encoder(q, src_mask=attn_mask)

    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward,
        dropout=0.1,
        act='relu',
        layer_norm_eps=1e-5,
        batch_first=False,
    ):
        super(TransformerEncoderLayer, self).__init__()
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)

        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first)
        self.linear1 = tlx.nn.layers.Linear(in_features=d_model, out_features=dim_feedforward)
        self.dropout1 = tlx.nn.layers.Dropout(float(dropout))
        self.linear2 = tlx.nn.layers.Linear(in_features=dim_feedforward, out_features=d_model)
        self.norm1 = tlx.nn.layers.LayerNorm(d_model, epsilon=layer_norm_eps)
        self.norm2 = tlx.nn.layers.LayerNorm(d_model, epsilon=layer_norm_eps)
        self.dropout2 = tlx.nn.layers.Dropout(float(dropout))
        self.dropout3 = tlx.nn.layers.Dropout(float(dropout))
        if act == 'relu':
            self.act = tlx.relu
        elif act == 'gelu':
            self.act = tlx.gelu
        else:
            raise ValueError("activation should be relu or gelu, but got {}".format(act))

        logging.info(
            "TransformerEncoderLayer %s: d_model: %d nhead: %d dim_feedforward: %d dropout: %f act: %s" % (
                self.name, d_model, nhead, dim_feedforward, dropout,
                self.act.__class__.__name__ if self.act is not None else 'No Activation'
            )
        )

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """
        Parameters
        ----------
        src : Tensor
            the sequence to the encoder layer.
        src_mask : Tensor or None
            the mask for the src sequence.
        src_key_padding_mask : Tensor or None
            the mask for the src keys per batch.

        """
        inputs = [src, src_mask, src_key_padding_mask]
        src1 = self.self_attn(src, src, src, src_mask, src_key_padding_mask)[0]
        src = src + self.dropout1(src1)
        src = self.norm1(src)
        src1 = self.linear2(self.dropout2(self.act(self.linear1(src))))
        src = src + self.dropout3(src1)
        src = self.norm2(src)

        if not self._nodes_fixed and self._build_graph:
            self._add_node(inputs, src)
            self._nodes_fixed = True
        return src
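
# Usage sketch (not part of the original module): a single encoder layer, i.e.
# self-attention followed by the position-wise feed-forward block, each with a
# residual connection and layer normalization. `_demo_encoder_layer` is an
# illustrative helper name only.
def _demo_encoder_layer():
    q = tlx.nn.Input(shape=(4, 2, 128))
    attn_mask = tlx.convert_to_tensor(np.zeros((4, 4)), dtype='bool')
    layer = TransformerEncoderLayer(d_model=128, nhead=2, dim_feedforward=256)
    return layer(q, src_mask=attn_mask)  # [4, 2, 128]
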
class TransformerDecoderLayer(Module):
    """TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
    This standard decoder layer is based on the paper "Attention Is All You Need".

    Parameters
    ----------
    d_model : int
        total dimension of the model.
    nhead : int
        The number of heads in multi-head attention.
    dim_feedforward : int
        the dimension of the feedforward network model.
    dropout : float
        a Dropout layer on attn_output_weights. Default: 0.1.
    act : str
        The activation function in the feedforward network, 'relu' or 'gelu'. Default: 'relu'.
    layer_norm_eps : float
        the eps value in layer normalization components. Default: 1e-5.
    batch_first : bool
        If ``True``, then the input and output tensors are provided as `[batch, seq, feature]`.
        Default: ``False``, i.e. `[seq, batch, feature]`.

    Examples
    --------
    With TensorLayerX

    >>> q = tlx.nn.Input(shape=(4, 2, 128))
    >>> decoder = TransformerDecoderLayer(128, 2, 256)
    >>> output = decoder(q, q)

    """

    def __init__(
        self,
        d_model,
        nhead,
        dim_feedforward,
        dropout=0.1,
        act='relu',
        layer_norm_eps=1e-5,
        batch_first=False,
    ):
        super(TransformerDecoderLayer, self).__init__()
        self._config = locals()
        self._config.pop("self")
        self._config.pop("__class__", None)  # py3

        self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first)
        self.cross_attn = MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=batch_first)
        self.dropout1 = tlx.nn.layers.Dropout(float(dropout))
        self.dropout2 = tlx.nn.layers.Dropout(float(dropout))
        self.dropout3 = tlx.nn.layers.Dropout(float(dropout))
        self.norm1 = tlx.nn.layers.LayerNorm(d_model, epsilon=layer_norm_eps)
        self.norm2 = tlx.nn.layers.LayerNorm(d_model, epsilon=layer_norm_eps)
        self.norm3 = tlx.nn.layers.LayerNorm(d_model, epsilon=layer_norm_eps)
        self.linear1 = tlx.nn.layers.Linear(in_features=d_model, out_features=dim_feedforward)
        self.linear2 = tlx.nn.layers.Linear(in_features=dim_feedforward, out_features=d_model)
        if act == 'relu':
            self.act = tlx.relu
        elif act == 'gelu':
            self.act = tlx.gelu
        else:
            raise ValueError("activation should be relu or gelu, but got {}".format(act))

        logging.info(
            "TransformerDecoderLayer %s: d_model: %d nhead: %d dim_feedforward: %d dropout: %f act: %s" % (
                self.name, d_model, nhead, dim_feedforward, dropout,
                self.act.__class__.__name__ if self.act is not None else 'No Activation'
            )
        )

    def forward(
        self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None
    ):
        """
        Parameters
        ----------
        tgt : Tensor
            the sequence to the decoder layer.
        memory : Tensor
            the sequence from the last layer of the encoder.
        tgt_mask : Tensor
            the mask for the tgt sequence.
        memory_mask : Tensor
            the mask for the memory sequence.
        tgt_key_padding_mask : Tensor
            the mask for the tgt keys per batch.
        memory_key_padding_mask : Tensor
            the mask for the memory keys per batch.

        """
        inputs = [tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask]
        tgt1 = self.self_attn(tgt, tgt, tgt, tgt_mask, tgt_key_padding_mask)[0]
        tgt = tgt + self.dropout1(tgt1)
        tgt = self.norm1(tgt)
        tgt1 = self.cross_attn(tgt, memory, memory, memory_mask, memory_key_padding_mask)[0]
        tgt = tgt + self.dropout2(tgt1)
        tgt = self.norm2(tgt)
        tgt1 = self.linear2(self.dropout3(self.act(self.linear1(tgt))))
        tgt = tgt + self.dropout3(tgt1)
        tgt = self.norm3(tgt)

        if not self._nodes_fixed and self._build_graph:
            self._add_node(inputs, tgt)
            self._nodes_fixed = True
        return tgt
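
# Usage sketch (not part of the original module): a single decoder layer, i.e.
# self-attention over the target, cross-attention over the encoder memory, then
# the feed-forward block, each followed by a residual connection and layer
# normalization. `_demo_decoder_layer` is an illustrative helper name only.
def _demo_decoder_layer():
    tgt = tlx.nn.Input(shape=(4, 2, 128))     # decoder input
    memory = tlx.nn.Input(shape=(4, 2, 128))  # encoder output
    layer = TransformerDecoderLayer(d_model=128, nhead=2, dim_feedforward=256)
    return layer(tgt, memory)  # [4, 2, 128]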