# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The main BERT model and related functions."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import copy
import json
import math
import re

import six
import tensorflow as tf


class BertConfig(object):
  """Configuration for `BertModel`."""

  def __init__(self,
               vocab_size,
               hidden_size=768,
               num_hidden_layers=12,
               num_attention_heads=12,
               intermediate_size=3072,
               hidden_act="gelu",
               hidden_dropout_prob=0.1,
               attention_probs_dropout_prob=0.1,
               max_position_embeddings=512,
               type_vocab_size=16,
               initializer_range=0.02):
    """Constructs BertConfig.

    Args:
      vocab_size: Vocabulary size of `input_ids` in `BertModel`.
      hidden_size: Size of the encoder layers and the pooler layer.
      num_hidden_layers: Number of hidden layers in the Transformer encoder.
      num_attention_heads: Number of attention heads for each attention layer
        in the Transformer encoder.
      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
        layer in the Transformer encoder.
      hidden_act: The non-linear activation function (function or string) in
        the encoder and pooler.
      hidden_dropout_prob: The dropout probability for all fully connected
        layers in the embeddings, encoder, and pooler.
      attention_probs_dropout_prob: The dropout ratio for the attention
        probabilities.
      max_position_embeddings: The maximum sequence length that this model
        might ever be used with. Typically set this to something large just
        in case (e.g., 512 or 1024 or 2048).
      type_vocab_size: The vocabulary size of the `token_type_ids` passed into
        `BertModel`.
      initializer_range: The stddev of the truncated_normal_initializer for
        initializing all weight matrices.
    """
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.num_hidden_layers = num_hidden_layers
    self.num_attention_heads = num_attention_heads
    self.hidden_act = hidden_act
    self.intermediate_size = intermediate_size
    self.hidden_dropout_prob = hidden_dropout_prob
    self.attention_probs_dropout_prob = attention_probs_dropout_prob
    self.max_position_embeddings = max_position_embeddings
    self.type_vocab_size = type_vocab_size
    self.initializer_range = initializer_range

  @classmethod
  def from_dict(cls, json_object):
    """Constructs a `BertConfig` from a Python dictionary of parameters."""
    config = BertConfig(vocab_size=None)
    for (key, value) in six.iteritems(json_object):
      config.__dict__[key] = value
    return config

  @classmethod
  def from_json_file(cls, json_file):
    """Constructs a `BertConfig` from a json file of parameters."""
    with tf.gfile.GFile(json_file, "r") as reader:
      text = reader.read()
    return cls.from_dict(json.loads(text))

  def to_dict(self):
    """Serializes this instance to a Python dictionary."""
    output = copy.deepcopy(self.__dict__)
    return output

  def to_json_string(self):
    """Serializes this instance to a JSON string."""
    return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
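

# Illustrative usage (added for this edit, not part of the original file):
# a config can be round-tripped through its dict/JSON representations. The
# hyperparameter values below are arbitrary.
#
#   config = BertConfig(vocab_size=32000, hidden_size=512)
#   same_config = BertConfig.from_dict(config.to_dict())
#   assert same_config.hidden_size == 512
#   json_text = config.to_json_string()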


class BertModel(object):
  """BERT model ("Bidirectional Encoder Representations from Transformers").

  Example usage:

  ```python
  # Already been converted into WordPiece token ids
  input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
  input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
  token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

  config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
    num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024)

  model = modeling.BertModel(config=config, is_training=True,
    input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids)

  label_embeddings = tf.get_variable(...)
  pooled_output = model.get_pooled_output()
  logits = tf.matmul(pooled_output, label_embeddings)
  ...
  ```
  """

  def __init__(self,
               config,
               is_training,
               input_ids,
               input_mask=None,
               token_type_ids=None,
               use_one_hot_embeddings=True,
               scope=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. True for training model, False for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size,
        seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings. On
        the TPU, it is much faster if this is True; on the CPU or GPU, it is
        faster if this is False.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
      config.hidden_dropout_prob = 0.0
      config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
      input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
      token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
      with tf.variable_scope("embeddings"):
        # Perform embedding lookup on the word ids.
        (self.embedding_output, self.embedding_table) = embedding_lookup(
            input_ids=input_ids,
            vocab_size=config.vocab_size,
            embedding_size=config.hidden_size,
            initializer_range=config.initializer_range,
            word_embedding_name="word_embeddings",
            use_one_hot_embeddings=use_one_hot_embeddings)

        # Add positional embeddings and token type embeddings, then layer
        # normalize and perform dropout.
        self.embedding_output = embedding_postprocessor(
            input_tensor=self.embedding_output,
            use_token_type=True,
            token_type_ids=token_type_ids,
            token_type_vocab_size=config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=config.initializer_range,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

      with tf.variable_scope("encoder"):
        # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
        # mask of shape [batch_size, seq_length, seq_length] which is used
        # for the attention scores.
        attention_mask = create_attention_mask_from_input_mask(
            input_ids, input_mask)

        # Run the stacked transformer.
        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
        self.all_encoder_layers = transformer_model(
            input_tensor=self.embedding_output,
            attention_mask=attention_mask,
            hidden_size=config.hidden_size,
            num_hidden_layers=config.num_hidden_layers,
            num_attention_heads=config.num_attention_heads,
            intermediate_size=config.intermediate_size,
            intermediate_act_fn=get_activation(config.hidden_act),
            hidden_dropout_prob=config.hidden_dropout_prob,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            initializer_range=config.initializer_range,
            do_return_all_layers=True)

      self.sequence_output = self.all_encoder_layers[-1]
      # The "pooler" converts the encoded sequence tensor of shape
      # [batch_size, seq_length, hidden_size] to a tensor of shape
      # [batch_size, hidden_size]. This is necessary for segment-level
      # (or segment-pair-level) classification tasks where we need a fixed
      # dimensional representation of the segment.
      with tf.variable_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained.
        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
        self.pooled_output = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=create_initializer(config.initializer_range))

  def get_pooled_output(self):
    return self.pooled_output

  def get_sequence_output(self):
    """Gets final hidden layer of encoder.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size]
      corresponding to the final hidden layer of the transformer encoder.
    """
    return self.sequence_output

  def get_all_encoder_layers(self):
    return self.all_encoder_layers

  def get_embedding_output(self):
    """Gets output of the embedding lookup (i.e., input to the transformer).

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size]
      corresponding to the output of the embedding layer, after summing the
      word embeddings with the positional embeddings and the token type
      embeddings, then performing layer normalization. This is the input to
      the transformer.
    """
    return self.embedding_output

  def get_embedding_table(self):
    return self.embedding_table


def gelu(input_tensor):
  """Gaussian Error Linear Unit.

  This is a smoother version of the RELU.
  Original paper: https://arxiv.org/abs/1606.08415

  Args:
    input_tensor: float Tensor to perform activation.

  Returns:
    `input_tensor` with the GELU activation applied.
  """
  cdf = 0.5 * (1.0 + tf.erf(input_tensor / tf.sqrt(2.0)))
  return input_tensor * cdf
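

# Note (added for clarity, not part of the original file): this is the exact,
# erf-based GELU, x * Phi(x), where Phi is the standard normal CDF. The tanh
# approximation often seen elsewhere is
#
#   0.5 * x * (1.0 + tanh(sqrt(2.0 / pi) * (x + 0.044715 * x**3)))
#
# Both forms satisfy gelu(0.0) == 0.0 and approach the identity for large
# positive inputs.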


def get_activation(activation_string):
  """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`.

  Args:
    activation_string: String name of the activation function.

  Returns:
    A Python function corresponding to the activation function. If
    `activation_string` is None, empty, or "linear", this will return None.
    If `activation_string` is not a string, it will return
    `activation_string`.

  Raises:
    ValueError: The `activation_string` does not correspond to a known
      activation.
  """

  # We assume that anything that's not a string is already an activation
  # function, so we just return it.
  if not isinstance(activation_string, six.string_types):
    return activation_string

  if not activation_string:
    return None

  act = activation_string.lower()
  if act == "linear":
    return None
  elif act == "relu":
    return tf.nn.relu
  elif act == "gelu":
    return gelu
  elif act == "tanh":
    return tf.tanh
  else:
    raise ValueError("Unsupported activation: %s" % act)


def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
  """Compute the union of the current variables and checkpoint variables."""
  assignment_map = {}
  initialized_variable_names = {}

  name_to_variable = collections.OrderedDict()
  for var in tvars:
    name = var.name
    m = re.match("^(.*):\\d+$", name)
    if m is not None:
      name = m.group(1)
    name_to_variable[name] = var

  init_vars = tf.train.list_variables(init_checkpoint)

  assignment_map = collections.OrderedDict()
  for x in init_vars:
    (name, var) = (x[0], x[1])
    if name not in name_to_variable:
      continue
    assignment_map[name] = name
    initialized_variable_names[name] = 1
    initialized_variable_names[name + ":0"] = 1

  return (assignment_map, initialized_variable_names)
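

# Typical usage sketch (added for this edit, not part of the original file),
# assuming `init_checkpoint` holds the path to a TF 1.x BERT checkpoint:
#
#   tvars = tf.trainable_variables()
#   (assignment_map, initialized_names) = get_assignment_map_from_checkpoint(
#       tvars, init_checkpoint)
#   tf.train.init_from_checkpoint(init_checkpoint, assignment_map)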


def dropout(input_tensor, dropout_prob):
  """Perform dropout.

  Args:
    input_tensor: float Tensor.
    dropout_prob: Python float. The probability of dropping out a value (NOT
      of *keeping* a dimension as in `tf.nn.dropout`).

  Returns:
    A version of `input_tensor` with dropout applied.
  """
  if dropout_prob is None or dropout_prob == 0.0:
    return input_tensor

  output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob)
  return output
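

# Note (added for clarity, not part of the original file): `tf.nn.dropout`
# takes a *keep* probability, so dropout(x, 0.1) is equivalent to
# tf.nn.dropout(x, keep_prob=0.9).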


def layer_norm(input_tensor, name=None):
  """Run layer normalization on the last dimension of the tensor."""
  return tf.contrib.layers.layer_norm(
      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1,
      scope=name)


def layer_norm_and_dropout(input_tensor, dropout_prob, name=None):
  """Runs layer normalization followed by dropout."""
  output_tensor = layer_norm(input_tensor, name)
  output_tensor = dropout(output_tensor, dropout_prob)
  return output_tensor


def create_initializer(initializer_range=0.02):
  """Creates a `truncated_normal_initializer` with the given range."""
  return tf.truncated_normal_initializer(stddev=initializer_range)


def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
  """Looks up word embeddings for an id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word
      ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use one-hot method for word
      embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
      for TPUs.

  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  # This function assumes that the input is of shape [batch_size, seq_length,
  # num_inputs].
  #
  # If the input is a 2D tensor of shape [batch_size, seq_length], we
  # reshape to [batch_size, seq_length, 1].
  if input_ids.shape.ndims == 2:
    input_ids = tf.expand_dims(input_ids, axis=[-1])

  embedding_table = tf.get_variable(
      name=word_embedding_name,
      shape=[vocab_size, embedding_size],
      initializer=create_initializer(initializer_range))

  if use_one_hot_embeddings:
    flat_input_ids = tf.reshape(input_ids, [-1])
    one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
  else:
    output = tf.nn.embedding_lookup(embedding_table, input_ids)

  input_shape = get_shape_list(input_ids)

  output = tf.reshape(output,
                      input_shape[0:-1] + [input_shape[-1] * embedding_size])
  return (output, embedding_table)
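

# Shape walkthrough (added for clarity, not part of the original file): for
# 2D `input_ids` of shape [batch_size, seq_length], the ids are first
# expanded to [batch_size, seq_length, 1]; the lookup (or the one-hot matmul)
# then produces an embedding per id, and the final reshape folds the trailing
# dimensions back to [batch_size, seq_length, embedding_size].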


def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table
      variable for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output
      tensor.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  if seq_length > max_position_embeddings:
    raise ValueError("The seq length (%d) cannot be greater than "
                     "`max_position_embeddings` (%d)" %
                     (seq_length, max_position_embeddings))

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if "
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is
    # always faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    full_position_embeddings = tf.get_variable(
        name=position_embedding_name,
        shape=[max_position_embeddings, width],
        initializer=create_initializer(initializer_range))
    # Since the position embedding table is a learned variable, we create it
    # using a (long) sequence length `max_position_embeddings`. The actual
    # sequence length might be shorter than this, for faster training of
    # tasks that do not have long sequences.
    #
    # So `full_position_embeddings` is effectively an embedding table
    # for positions [0, 1, 2, ..., max_position_embeddings-1], and the
    # current sequence has positions [0, 1, 2, ..., seq_length-1], so we can
    # just perform a slice.
    if seq_length < max_position_embeddings:
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
    else:
      position_embeddings = full_position_embeddings

    num_dims = len(output.shape.as_list())

    # Only the last two dimensions are relevant (`seq_length` and `width`),
    # so we broadcast among the first dimensions, which is typically just
    # the batch size.
    position_broadcast_shape = []
    for _ in range(num_dims - 2):
      position_broadcast_shape.append(1)
    position_broadcast_shape.extend([seq_length, width])
    position_embeddings = tf.reshape(position_embeddings,
                                     position_broadcast_shape)
    output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
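

# Summary (added for clarity, not part of the original file): with both
# `use_token_type` and `use_position_embeddings` enabled, each position
# receives
#
#   output = dropout(layer_norm(word_emb + token_type_emb + position_emb))
#
# which is the standard BERT input representation.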


def create_attention_mask_from_input_mask(from_tensor, to_mask):
  """Create 3D attention mask from a 2D tensor mask.

  Args:
    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
    to_mask: int32 Tensor of shape [batch_size, to_seq_length].

  Returns:
    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
  """
  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  batch_size = from_shape[0]
  from_seq_length = from_shape[1]

  to_shape = get_shape_list(to_mask, expected_rank=2)
  to_seq_length = to_shape[1]

  to_mask = tf.cast(
      tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32)

  # We don't assume that `from_tensor` is a mask (although it could be). We
  # don't actually care if we attend *from* padding tokens (only *to* padding
  # tokens) so we create a tensor of all ones.
  #
  # `broadcast_ones` = [batch_size, from_seq_length, 1]
  broadcast_ones = tf.ones(
      shape=[batch_size, from_seq_length, 1], dtype=tf.float32)

  # Here we broadcast along two dimensions to create the mask.
  mask = broadcast_ones * to_mask

  return mask
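

# Worked example (added for this edit, not part of the original file): for a
# single sequence with input_mask = [[1, 1, 0]], the result is
#
#   [[[1., 1., 0.],
#     [1., 1., 0.],
#     [1., 1., 0.]]]
#
# i.e., every query position may attend to the two real tokens but never to
# the padding position.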


def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  Is All You Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-width vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    from_tensor: float Tensor of shape [batch_size, from_seq_length,
      from_width].
    to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
    attention_mask: (optional) int32 Tensor of shape [batch_size,
      from_seq_length, to_seq_length]. The values should be 1 or 0. The
      attention scores will effectively be set to -infinity for any positions
      in the mask that are 0, and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_probs_dropout_prob: (optional) float. Dropout probability of
      the attention probabilities.
    initializer_range: float. Range of the weight initializer.
    do_return_2d_tensor: bool. If True, the output will be of shape
      [batch_size * from_seq_length, num_attention_heads * size_per_head].
      If False, the output will be of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head].
    batch_size: (Optional) int. If the input is 2D, this might be the batch
      size of the 3D version of the `from_tensor` and `to_tensor`.
    from_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `from_tensor`.
    to_seq_length: (Optional) If the input is 2D, this might be the seq
      length of the 3D version of the `to_tensor`.

  Returns:
    float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
      true, this will be of shape [batch_size * from_seq_length,
      num_attention_heads * size_per_head]).

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """

  def transpose_for_scores(input_tensor, batch_size, num_attention_heads,
                           seq_length, width):
    output_tensor = tf.reshape(
        input_tensor, [batch_size, seq_length, num_attention_heads, width])

    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
  to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

  if len(from_shape) != len(to_shape):
    raise ValueError(
        "The rank of `from_tensor` must match the rank of `to_tensor`.")

  if len(from_shape) == 3:
    batch_size = from_shape[0]
    from_seq_length = from_shape[1]
    to_seq_length = to_shape[1]
  elif len(from_shape) == 2:
    if (batch_size is None or from_seq_length is None or
        to_seq_length is None):
      raise ValueError(
          "When passing in rank 2 tensors to attention_layer, the values "
          "for `batch_size`, `from_seq_length`, and `to_seq_length` "
          "must all be specified.")

  # Scalar dimensions referenced here:
  #   B = batch size (number of sequences)
  #   F = `from_tensor` sequence length
  #   T = `to_tensor` sequence length
  #   N = `num_attention_heads`
  #   H = `size_per_head`

  from_tensor_2d = reshape_to_matrix(from_tensor)
  to_tensor_2d = reshape_to_matrix(to_tensor)

  # `query_layer` = [B*F, N*H]
  query_layer = tf.layers.dense(
      from_tensor_2d,
      num_attention_heads * size_per_head,
      activation=query_act,
      name="query",
      kernel_initializer=create_initializer(initializer_range))

  # `key_layer` = [B*T, N*H]
  key_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=key_act,
      name="key",
      kernel_initializer=create_initializer(initializer_range))

  # `value_layer` = [B*T, N*H]
  value_layer = tf.layers.dense(
      to_tensor_2d,
      num_attention_heads * size_per_head,
      activation=value_act,
      name="value",
      kernel_initializer=create_initializer(initializer_range))

  # `query_layer` = [B, N, F, H]
  query_layer = transpose_for_scores(query_layer, batch_size,
                                     num_attention_heads, from_seq_length,
                                     size_per_head)

  # `key_layer` = [B, N, T, H]
  key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads,
                                   to_seq_length, size_per_head)

  # Take the dot product between "query" and "key" to get the raw
  # attention scores.
  # `attention_scores` = [B, N, F, T]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores,
                                 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # `attention_mask` = [B, 1, F, T]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])

    # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
    # masked positions, this operation will create a tensor which is 0.0 for
    # positions we want to attend and -10000.0 for masked positions.
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0

    # Since we are adding it to the raw scores before the softmax, this is
    # effectively the same as removing these entirely.
    attention_scores += adder

  # Normalize the attention scores to probabilities.
  # `attention_probs` = [B, N, F, T]
  attention_probs = tf.nn.softmax(attention_scores)

  # This is actually dropping out entire tokens to attend to, which might
  # seem a bit unusual, but is taken from the original Transformer paper.
  attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

  # `value_layer` = [B, T, N, H]
  value_layer = tf.reshape(
      value_layer,
      [batch_size, to_seq_length, num_attention_heads, size_per_head])

  # `value_layer` = [B, N, T, H]
  value_layer = tf.transpose(value_layer, [0, 2, 1, 3])

  # `context_layer` = [B, N, F, H]
  context_layer = tf.matmul(attention_probs, value_layer)

  # `context_layer` = [B, F, N, H]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])

  if do_return_2d_tensor:
    # `context_layer` = [B*F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size * from_seq_length, num_attention_heads * size_per_head])
  else:
    # `context_layer` = [B, F, N*H]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, from_seq_length, num_attention_heads * size_per_head])

  return context_layer
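

# Shape walkthrough (added for clarity, not part of the original file), for
# self-attention with B=batch, F=T=seq_length, N=heads, H=size_per_head:
# queries are projected to [B*F, N*H] (keys and values to [B*T, N*H]), then
# reshaped and transposed to [B, N, F, H]; scores = Q K^T / sqrt(H) gives
# [B, N, F, T]; the mask adds -10000.0 at disallowed positions before the
# softmax; the probabilities then weight V to give [B, N, F, H], and a final
# transpose/reshape yields [B, F, N*H] (or [B*F, N*H] in 2D mode).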


def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention Is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to
      apply to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # We keep the representation as a 2D tensor to avoid re-shaping it back and
  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
  # help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)

  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
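

# Note (added for clarity, not part of the original file): each block creates
# its variables under per-layer scopes such as "layer_0/attention/self/query",
# and get_assignment_map_from_checkpoint matches checkpoint variables to these
# exact names when restoring pre-trained weights.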


def get_shape_list(tensor, expected_rank=None, name=None):
  """Returns a list of the shape of tensor, preferring static dimensions.

  Args:
    tensor: A tf.Tensor object to find the shape of.
    expected_rank: (optional) int. The expected rank of `tensor`. If this is
      specified and the `tensor` has a different rank, an exception will be
      thrown.
    name: Optional name of the tensor for the error message.

  Returns:
    A list of dimensions of the shape of tensor. All static dimensions will
    be returned as python integers, and dynamic dimensions will be returned
    as tf.Tensor scalars.
  """
  if name is None:
    name = tensor.name

  if expected_rank is not None:
    assert_rank(tensor, expected_rank, name)

  shape = tensor.shape.as_list()

  non_static_indexes = []
  for (index, dim) in enumerate(shape):
    if dim is None:
      non_static_indexes.append(index)

  if not non_static_indexes:
    return shape

  dyn_shape = tf.shape(tensor)
  for index in non_static_indexes:
    shape[index] = dyn_shape[index]
  return shape
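

# Illustrative example (added for this edit, not part of the original file),
# in TF 1.x graph mode:
#
#   x = tf.placeholder(tf.int32, shape=[None, 128])
#   get_shape_list(x)  # -> [<dynamic batch-size Tensor>, 128]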


def reshape_to_matrix(input_tensor):
  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
  ndims = input_tensor.shape.ndims
  if ndims < 2:
    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
                     (input_tensor.shape))
  if ndims == 2:
    return input_tensor

  width = input_tensor.shape[-1]
  output_tensor = tf.reshape(input_tensor, [-1, width])
  return output_tensor


def reshape_from_matrix(output_tensor, orig_shape_list):
  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
  if len(orig_shape_list) == 2:
    return output_tensor

  output_shape = get_shape_list(output_tensor)

  orig_dims = orig_shape_list[0:-1]
  width = output_shape[-1]

  return tf.reshape(output_tensor, orig_dims + [width])


def assert_rank(tensor, expected_rank, name=None):
  """Raises an exception if the tensor rank is not of the expected rank.

  Args:
    tensor: A tf.Tensor to check the rank of.
    expected_rank: Python integer or list of integers, expected rank.
    name: Optional name of the tensor for the error message.

  Raises:
    ValueError: If the actual rank of `tensor` doesn't match `expected_rank`.
  """
  if name is None:
    name = tensor.name

  expected_rank_dict = {}
  if isinstance(expected_rank, six.integer_types):
    expected_rank_dict[expected_rank] = True
  else:
    for x in expected_rank:
      expected_rank_dict[x] = True

  actual_rank = tensor.shape.ndims
  if actual_rank not in expected_rank_dict:
    scope_name = tf.get_variable_scope().name
    raise ValueError(
        "For the tensor `%s` in scope `%s`, the actual rank "
        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
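

# A minimal smoke test (a hedged sketch added for this edit, not part of the
# original file). It builds a tiny BertModel on constant ids and prints the
# pooled output shape. The hyperparameters below are arbitrary small values
# chosen for speed; the code assumes a TF 1.x runtime, matching the
# tf.variable_scope / tf.layers APIs used above.
if __name__ == "__main__":
  demo_input_ids = tf.constant([[31, 51, 99], [15, 5, 0]], dtype=tf.int32)
  demo_config = BertConfig(
      vocab_size=128,
      hidden_size=32,
      num_hidden_layers=2,
      num_attention_heads=4,
      intermediate_size=64)
  demo_model = BertModel(
      config=demo_config,
      is_training=False,
      input_ids=demo_input_ids,
      use_one_hot_embeddings=False)
  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    pooled = sess.run(demo_model.get_pooled_output())
    print("pooled_output shape:", pooled.shape)  # Expected: (2, 32)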