
Commit 72f0874

Lukasz Kaiser and Ryan Sepassi authored and committed
internal merge
PiperOrigin-RevId: 161608262
1 parent 912daf7 commit 72f0874

File tree

4 files changed: +361 -1 lines changed

    tensor2tensor/models/common_layers.py
    tensor2tensor/models/common_layers_test.py
    tensor2tensor/models/models.py
    tensor2tensor/models/transformer_alternative.py

tensor2tensor/models/common_layers.py

Lines changed: 126 additions & 1 deletion
@@ -292,7 +292,8 @@ def conv_internal(conv_fn, inputs, filters, kernel_size, **kwargs):
   """Conditional conv_fn making kernel 1d or 2d depending on inputs shape."""
   static_shape = inputs.get_shape()
   if not static_shape or len(static_shape) != 4:
-    raise ValueError("Inputs to conv must have statically known rank 4.")
+    raise ValueError("Inputs to conv must have statically known rank 4. "
+                     "Shape: " + str(static_shape))
   # Add support for left padding.
   if "padding" in kwargs and kwargs["padding"] == "LEFT":
     dilation_rate = (1, 1)
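
As a quick illustration (not part of the commit), the added shape makes the failure self-describing; a sketch assuming TF 1.x, with a deliberately rank-3 input:

    import tensorflow as tf
    from tensor2tensor.models import common_layers

    try:
      # conv expects a statically known rank-4 input; this one is rank 3.
      common_layers.conv(tf.zeros([5, 7, 11]), 13, (3, 3))
    except ValueError as e:
      print(e)
      # -> Inputs to conv must have statically known rank 4. Shape: (5, 7, 11)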
@@ -1402,3 +1403,127 @@ def smoothing_cross_entropy(logits, labels, vocab_size, confidence):
   xentropy = tf.nn.softmax_cross_entropy_with_logits(
       logits=logits, labels=soft_targets)
   return xentropy - normalizing
+
+
+def global_pool_1d(inputs, pooling_type="MAX", mask=None):
+  """Pool elements across the sequence dimension.
+
+  Useful to convert a list of vectors into a single vector so as
+  to get a representation of a set.
+
+  Args:
+    inputs: A tensor of dimensions batch_size x sequence_length x input_dims
+      containing the sequences of input vectors.
+    pooling_type: the pooling type to use, MAX or AVR.
+    mask: A tensor of dimensions batch_size x sequence_length containing a
+      mask for the inputs with 1's for existing elements, and 0's elsewhere.
+
+  Returns:
+    output: A tensor of dimensions batch_size x input_dims containing the
+      pooled representation of each sequence.
+  """
+  with tf.name_scope("global_pool", [inputs]):
+    if mask is not None:
+      mask = tf.expand_dims(mask, axis=2)
+      inputs = tf.multiply(inputs, mask)
+
+    if pooling_type == "MAX":
+      # A tf.pool can be used here, but reduce is cleaner.
+      output = tf.reduce_max(inputs, axis=1)
+    elif pooling_type == "AVR":
+      if mask is not None:
+        # Some elements are dummy elements, so we can't just take the mean.
+        output = tf.reduce_sum(inputs, axis=1)
+        num_elems = tf.reduce_sum(mask, axis=1, keep_dims=True)
+        output = tf.div(output, tf.maximum(num_elems, 1))
+      else:
+        output = tf.reduce_mean(inputs, axis=1)
+
+  return output
+
+
+def linear_set_layer(layer_size,
+                     inputs,
+                     context=None,
+                     activation_fn=tf.nn.relu,
+                     dropout=0.0,
+                     name=None):
+  """Basic layer type for doing funky things with sets.
+
+  Applies a linear transformation to each element in the input set.
+  If a context is supplied, it is transformed to layer_size and added to
+  each element, which has the same effect as concatenating it with every
+  input before the linear transformation.
+  E.g. one can use global_pool_1d to get a representation of the set,
+  which can then be used as the context for the next layer.
+
+  TODO: Add bias add (or control the biases used).
+
+  Args:
+    layer_size: Dimension to transform the input vectors to.
+    inputs: A tensor of dimensions batch_size x sequence_length x input_dims
+      containing the sequences of input vectors.
+    context: A tensor of dimensions batch_size x context_dims
+      containing a global statistic about the set.
+    activation_fn: The activation function to use.
+    dropout: Dropout probability.
+    name: name.
+
+  Returns:
+    output: A tensor of dimensions batch_size x sequence_length x layer_size
+      containing the sequences of transformed vectors.
+  """
+  with tf.variable_scope(name, "linear_set_layer", [inputs]):
+    # Apply a 1D convolution with kernel size 1, i.e. the same linear filter
+    # applied to each element along the sequence dimension.
+    outputs = conv1d(inputs, layer_size, 1, activation=None, name="set_conv")
+
+    # Apply the context if it exists.
+    if context is not None:
+      # Unfortunately tf doesn't support broadcasting via concat, but we can
+      # simply add the transformed context to get the same effect.
+      context = tf.expand_dims(context, axis=1)
+      cont_tfm = conv1d(context, layer_size, 1,
+                        activation=None, name="cont_conv")
+      outputs += cont_tfm
+
+    if activation_fn is not None:
+      outputs = activation_fn(outputs)
+
+    if dropout != 0.0:
+      outputs = tf.nn.dropout(outputs, 1.0 - dropout)
+
+    return outputs
+
+
+def ravanbakhsh_set_layer(layer_size,
+                          inputs,
+                          mask=None,
+                          activation_fn=tf.nn.tanh,
+                          dropout=0.0,
+                          name=None):
+  """Layer from Ravanbakhsh et al. (https://arxiv.org/abs/1611.04500).
+
+  More parameter-efficient version of a linear-set-layer with context.
+
+  Args:
+    layer_size: Dimension to transform the input vectors to.
+    inputs: A tensor of dimensions batch_size x sequence_length x input_dims
+      containing the sequences of input vectors.
+    mask: A tensor of dimensions batch_size x sequence_length containing a
+      mask for the inputs with 1's for existing elements, and 0's elsewhere.
+    activation_fn: The activation function to use.
+    dropout: Dropout probability.
+    name: name.
+
+  Returns:
+    output: A tensor of dimensions batch_size x sequence_length x layer_size
+      containing the sequences of transformed vectors.
+  """
+  with tf.variable_scope(name, "ravanbakhsh_set_layer", [inputs]):
+    output = linear_set_layer(
+        layer_size,
+        inputs - tf.expand_dims(global_pool_1d(inputs, mask=mask), axis=1),
+        activation_fn=activation_fn,
+        dropout=dropout,
+        name=name)

  return output
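
Taken together, the three new functions support a simple set-encoding pattern: embed each element, pool the set into a context vector, then re-embed each element conditioned on that context. A minimal sketch of the intended composition (shapes and variable names are illustrative; assumes TF 1.x with these functions in common_layers):

    import tensorflow as tf
    from tensor2tensor.models import common_layers

    x = tf.random_normal([8, 10, 64])  # batch_size x sequence_length x dims
    mask = tf.ones([8, 10])            # 1's for real elements, 0's for padding

    # Embed each element, then pool the set into a single context vector.
    h = common_layers.linear_set_layer(128, x, name="embed")
    context = common_layers.global_pool_1d(h, pooling_type="AVR", mask=mask)

    # Re-embed each element conditioned on the pooled set representation.
    out = common_layers.linear_set_layer(128, h, context=context,
                                         name="reembed")

    # The equivariant alternative: each element i becomes
    # activation_fn(W * (x_i - maxpool(x))), with a single weight matrix.
    out2 = common_layers.ravanbakhsh_set_layer(128, x, mask=mask)

Because the pooled term is subtracted before a single shared linear map, ravanbakhsh_set_layer uses one weight matrix where linear_set_layer with a context uses two (set_conv plus cont_conv), which is the parameter saving its docstring refers to.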

tensor2tensor/models/common_layers_test.py

Lines changed: 60 additions & 0 deletions
@@ -82,6 +82,14 @@ def testConv(self):
       res = session.run(y)
     self.assertEqual(res.shape, (5, 5, 1, 13))
 
+  def testConv1d(self):
+    x = np.random.rand(5, 7, 11)
+    with self.test_session() as session:
+      y = common_layers.conv1d(tf.constant(x, dtype=tf.float32), 13, 1)
+      session.run(tf.global_variables_initializer())
+      res = session.run(y)
+    self.assertEqual(res.shape, (5, 7, 13))
+
   def testSeparableConv(self):
     x = np.random.rand(5, 7, 1, 11)
     with self.test_session() as session:
@@ -361,6 +369,58 @@ def testResidualFnWithLayerNorm(self):
       actual = session.run(x3)
     self.assertEqual(actual.shape, (5, 2, 1, 11))
 
+  def testGlobalPool1d(self):
+    x1 = np.random.rand(5, 4, 11)
+    no_mask = np.ones((5, 4))
+    full_mask = np.zeros((5, 4))
+
+    with self.test_session() as session:
+      x1_ = tf.Variable(x1, dtype=tf.float32)
+      no_mask_ = tf.Variable(no_mask, dtype=tf.float32)
+      full_mask_ = tf.Variable(full_mask, dtype=tf.float32)
+
+      none_mask_max = common_layers.global_pool_1d(x1_)
+      no_mask_max = common_layers.global_pool_1d(x1_, mask=no_mask_)
+      result1 = tf.reduce_sum(none_mask_max - no_mask_max)
+
+      full_mask_max = common_layers.global_pool_1d(x1_, mask=full_mask_)
+      result2 = tf.reduce_sum(full_mask_max)
+
+      none_mask_avr = common_layers.global_pool_1d(x1_, "AVR")
+      no_mask_avr = common_layers.global_pool_1d(x1_, "AVR", no_mask_)
+      result3 = tf.reduce_sum(none_mask_avr - no_mask_avr)
+
+      full_mask_avr = common_layers.global_pool_1d(x1_, "AVR", full_mask_)
+      result4 = tf.reduce_sum(full_mask_avr)
+
+      session.run(tf.global_variables_initializer())
+      actual = session.run([result1, result2, result3, result4])
+    self.assertAllEqual(actual[:3], [0.0, 0.0, 0.0])
+
+  def testLinearSetLayer(self):
+    x1 = np.random.rand(5, 4, 11)
+    cont = np.random.rand(5, 13)
+    with self.test_session() as session:
+      x1_ = tf.Variable(x1, dtype=tf.float32)
+      cont_ = tf.Variable(cont, dtype=tf.float32)
+
+      simple_ff = common_layers.linear_set_layer(32, x1_)
+      cont_ff = common_layers.linear_set_layer(32, x1_, context=cont_)
+
+      session.run(tf.global_variables_initializer())
+      actual = session.run([simple_ff, cont_ff])
+    self.assertEqual(actual[0].shape, (5, 4, 32))
+    self.assertEqual(actual[1].shape, (5, 4, 32))
+
+  def testRavanbakhshSetLayer(self):
+    x1 = np.random.rand(5, 4, 11)
+    with self.test_session() as session:
+      x1_ = tf.Variable(x1, dtype=tf.float32)
+      layer = common_layers.ravanbakhsh_set_layer(32, x1_)
+      session.run(tf.global_variables_initializer())
+      actual = session.run(layer)
+    self.assertEqual(actual.shape, (5, 4, 32))
+
 
 if __name__ == "__main__":
   tf.test.main()

tensor2tensor/models/models.py

Lines changed: 1 addition & 0 deletions
@@ -32,5 +32,6 @@
 from tensor2tensor.models import neural_gpu
 from tensor2tensor.models import slicenet
 from tensor2tensor.models import transformer
+from tensor2tensor.models import transformer_alternative
 from tensor2tensor.models import xception
 # pylint: enable=unused-import
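
This one-line import is what exposes the new model: importing transformer_alternative executes its @registry.register_model and @registry.register_hparams decorators. A sketch of the lookup this enables (assuming the registry snake-cases class names, so TransformerAlt registers as "transformer_alt"):

    # Importing the models module triggers registration of every model file.
    from tensor2tensor.models import models  # pylint: disable=unused-import
    from tensor2tensor.utils import registry

    model_cls = registry.model("transformer_alt")
    hparams_fn = registry.hparams("transformer_alt")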
tensor2tensor/models/transformer_alternative.py

Lines changed: 174 additions & 0 deletions (new file)
# Copyright 2017 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Alternative transformer network.

Uses different layer types to demonstrate alternatives to self-attention.

Code is mostly copied from the original Transformer source.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# Dependency imports

from six.moves import xrange  # pylint: disable=redefined-builtin

from tensor2tensor.models import common_attention
from tensor2tensor.models import common_layers
from tensor2tensor.models import transformer
from tensor2tensor.utils import registry
from tensor2tensor.utils import t2t_model

import tensorflow as tf


@registry.register_model
class TransformerAlt(t2t_model.T2TModel):

  def model_fn_body(self, features):
    hparams = self._hparams
    targets = features["targets"]
    inputs = features.get("inputs")
    target_space = features.get("target_space_id")

    inputs = common_layers.flatten4d3d(inputs)
    targets = common_layers.flatten4d3d(targets)

    (encoder_input, encoder_attention_bias,
     _) = transformer.transformer_prepare_encoder(inputs, target_space, hparams)
    (decoder_input,
     decoder_self_attention_bias) = transformer.transformer_prepare_decoder(
         targets, hparams)

    # We need masks of the form batch_size x input_sequence_length.
    # Biases seem to be of the form batch_size x 1 x input_sequence_length
    # x vec_dim, so squeeze out dimension one and take the first element
    # of each vector.
    encoder_mask = tf.squeeze(encoder_attention_bias, [1])[:, :, 0]
    decoder_mask = tf.squeeze(decoder_self_attention_bias, [1])[:, :, 0]

    def residual_fn(x, y):
      return common_layers.layer_norm(x + tf.nn.dropout(
          y, 1.0 - hparams.residual_dropout))

    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout)
    decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout)
    encoder_output = alt_transformer_encoder(
        encoder_input, residual_fn, encoder_mask, hparams)

    decoder_output = alt_transformer_decoder(
        decoder_input, encoder_output, residual_fn, decoder_mask,
        encoder_attention_bias, hparams)

    decoder_output = tf.expand_dims(decoder_output, 2)

    return decoder_output


def composite_layer(inputs, mask, hparams):
  """Composite layer."""
  x = inputs

  # Stack ravanbakhsh set layers on top of each other.
  if hparams.composite_layer_type == "ravanbakhsh":
    for layer in xrange(hparams.layers_per_layer):
      with tf.variable_scope(".%d" % layer):
        x = common_layers.ravanbakhsh_set_layer(
            hparams.hidden_size,
            x,
            mask=mask,
            dropout=0.0)

  # Transform elements to get a context, then use this context in a final
  # layer.
  elif hparams.composite_layer_type == "reembedding":
    # Transform elements n times and then pool.
    for layer in xrange(hparams.layers_per_layer):
      with tf.variable_scope(".%d" % layer):
        x = common_layers.linear_set_layer(
            hparams.hidden_size,
            x,
            dropout=0.0)
    context = common_layers.global_pool_1d(x, mask=mask)

    # Final layer.
    x = common_layers.linear_set_layer(
        hparams.hidden_size,
        x,
        context=context,
        dropout=0.0)

  return x


def alt_transformer_encoder(encoder_input,
                            residual_fn,
                            mask,
                            hparams,
                            name="encoder"):
  """Alternative encoder."""
  x = encoder_input

  with tf.variable_scope(name):
    for layer in xrange(hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):
        x = residual_fn(x, composite_layer(x, mask, hparams))

  return x


def alt_transformer_decoder(decoder_input,
                            encoder_output,
                            residual_fn,
                            mask,
                            encoder_decoder_attention_bias,
                            hparams,
                            name="decoder"):
  """Alternative decoder."""
  x = decoder_input

  # Summaries don't work in the multi-problem setting yet.
  summaries = "problems" not in hparams.values() or len(hparams.problems) == 1
  with tf.variable_scope(name):
    for layer in xrange(hparams.num_hidden_layers):
      with tf.variable_scope("layer_%d" % layer):

        x_ = common_attention.multihead_attention(
            x,
            encoder_output,
            encoder_decoder_attention_bias,
            hparams.attention_key_channels or hparams.hidden_size,
            hparams.attention_value_channels or hparams.hidden_size,
            hparams.hidden_size,
            hparams.num_heads,
            hparams.attention_dropout,
            summaries=summaries,
            name="encdec_attention")

        x_ = residual_fn(x_, composite_layer(x_, mask, hparams))
        x = residual_fn(x, x_)

  return x


@registry.register_hparams
def transformer_alt():
  """Set of hyperparameters."""
  hparams = transformer.transformer_base()
  hparams.batch_size = 64
  hparams.add_hparam("layers_per_layer", 4)
  hparams.add_hparam("composite_layer_type", "reembedding")
  return hparams
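
A sketch of how the two new hyperparameters drive composite_layer above (the trainer flag names are given for orientation and should be treated as assumptions):

    # Select the model and hparams set by their registered names, e.g. with
    # --model=transformer_alt --hparams_set=transformer_alt on the t2t trainer.
    hparams = transformer_alt()
    hparams.composite_layer_type = "ravanbakhsh"  # default is "reembedding"

    # Every encoder/decoder block now calls composite_layer, which stacks
    # hparams.layers_per_layer (here 4) ravanbakhsh_set_layers inside a
    # single residual connection, in place of self-attention.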
