This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 86703a2

T2T Team authored and Ryan Sepassi committed
Fast beam search decoding.
PiperOrigin-RevId: 173594699
1 parent 2245033 commit 86703a2

5 files changed: +225 additions, -38 deletions

tensor2tensor/models/transformer.py

Lines changed: 107 additions & 26 deletions
@@ -30,12 +30,15 @@
 from tensor2tensor.layers import common_attention
 from tensor2tensor.layers import common_hparams
 from tensor2tensor.layers import common_layers
+from tensor2tensor.utils import beam_search
 from tensor2tensor.utils import expert_utils
 from tensor2tensor.utils import registry
 from tensor2tensor.utils import t2t_model
 
 import tensorflow as tf
 
+from tensorflow.python.util import nest
+
 
 @registry.register_model
 class Transformer(t2t_model.T2TModel):
@@ -159,6 +162,58 @@ def _greedy_infer(
       logits: Not returned
       losses: Not returned
 
+    Raises:
+      ValueError: If last_position_only is False
+      NotImplementedError: If there are multiple data shards.
+    """
+    decoded_ids = self._fast_decode(features, decode_length, last_position_only)
+    return decoded_ids, None, None
+
+  def _beam_decode(self, features, decode_length, beam_size, top_beams,
+                   last_position_only, alpha):
+    """Beam search decoding.
+
+    Args:
+      features: a map of string to `Tensor`
+      decode_length: an integer. How many additional timesteps to decode.
+      beam_size: number of beams.
+      top_beams: an integer. How many of the beams to return.
+      last_position_only: MUST be true for fast decoding!
+      alpha: Float that controls the length penalty. larger the alpha, stronger
+        the preference for longer translations.
+
+    Returns:
+      samples: an integer `Tensor`. Top samples from the beam search
+    """
+    return self._fast_decode(
+        features, decode_length, last_position_only, beam_size, top_beams,
+        alpha)
+
+  def _fast_decode(
+      self,
+      features,
+      decode_length,
+      last_position_only=True,
+      beam_size=1,
+      top_beams=1,
+      alpha=1.0):
+    """Fast decoding.
+
+    Implements both greedy and beam search decoding, uses beam search iff
+    beam_size > 1, otherwise beam search related arguments are ignored.
+
+    Args:
+      features: a map of string to model features.
+      decode_length: an integer. How many additional timesteps to decode.
+      last_position_only: MUST be true for fast decoding!
+      beam_size: number of beams.
+      top_beams: an integer. How many of the beams to return.
+      alpha: Float that controls the length penalty. larger the alpha, stronger
+        the preference for longer translations.
+
+    Returns:
+      samples: an integer `Tensor`. Top samples from the beam search
+
     Raises:
       ValueError: If last_position_only is False
       NotImplementedError: If there are multiple data shards.
@@ -192,6 +247,8 @@ def _greedy_infer(
     with tf.variable_scope("body"):
       encoder_output, encoder_decoder_attention_bias = dp(
           self.encode, inputs, features["target_space_id"], hparams)
+    encoder_output = encoder_output[0]
+    encoder_decoder_attention_bias = encoder_decoder_attention_bias[0]
 
     if hparams.pos == "timing":
       timing_signal = common_attention.get_timing_signal_1d(
@@ -236,6 +293,7 @@ def preprocess_targets(targets, i):
 
     def symbols_to_logits_fn(ids, i, cache):
       """Go from ids to logits for next symbol."""
+      ids = ids[:, -1:]
       targets = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3)
       targets = preprocess_targets(targets, i)
 
@@ -245,22 +303,16 @@ def symbols_to_logits_fn(ids, i, cache):
       body_outputs = dp(
           self.decode,
           targets,
-          encoder_output[0],
-          encoder_decoder_attention_bias[0],
+          cache["encoder_output"],
+          cache["encoder_decoder_attention_bias"],
           bias,
           hparams,
           cache)
 
       with tf.variable_scope(target_modality.name):
         logits = target_modality.top_sharded(body_outputs, None, dp)[0]
 
-      return tf.squeeze(logits, axis=[1, 2, 3])
-
-    def inner_loop(i, next_id, decoded_ids, cache):
-      logits = symbols_to_logits_fn(next_id, i, cache)
-      next_id = tf.expand_dims(tf.argmax(logits, axis=-1), axis=1)
-      decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
-      return i+1, next_id, decoded_ids, cache
+      return tf.squeeze(logits, axis=[1, 2, 3]), cache
 
     key_channels = hparams.attention_key_channels or hparams.hidden_size
     value_channels = hparams.attention_value_channels or hparams.hidden_size
@@ -272,24 +324,53 @@ def inner_loop(i, next_id, decoded_ids, cache):
             "v": tf.zeros([batch_size, 0, value_channels]),
         } for layer in range(num_layers)
     }
-    decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64)
-    next_id = tf.zeros([batch_size, 1], dtype=tf.int64)
-    _, _, decoded_ids, _ = tf.while_loop(
-        # TODO(llion): Early stopping.
-        lambda i, *_: tf.less(i, decode_length),
-        inner_loop,
-        [tf.constant(0), next_id, decoded_ids, cache],
-        shape_invariants=[
-            tf.TensorShape([]),
-            tf.TensorShape([None, None]),
-            tf.TensorShape([None, None]),
-            {"layer_%d" % layer: {
-                "k": tf.TensorShape([None, None, key_channels]),
-                "v": tf.TensorShape([None, None, value_channels]),
-            } for layer in range(num_layers)}
-        ])
 
-    return decoded_ids, None, None
+    # Set 2nd dim to None since it's not invariant in the tf.while_loop
+    # Note: Tensor.set_shape() does not work here since it merges shape info.
+    # TODO(llion): Find a more robust solution.
+    # pylint: disable=protected-access
+    for layer in cache:
+      cache[layer]["k"]._shape = tf.TensorShape([None, None, key_channels])
+      cache[layer]["v"]._shape = tf.TensorShape([None, None, value_channels])
+    # pylint: enable=protected-access
+    cache["encoder_output"] = encoder_output
+    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias
+
+    if beam_size > 1:  # Beam Search
+      target_modality = (
+          self._hparams.problems[self._problem_idx].target_modality)
+      vocab_size = target_modality.top_dimensionality
+      initial_ids = tf.zeros([batch_size], dtype=tf.int32)
+      decoded_ids, _ = beam_search.beam_search(
+          symbols_to_logits_fn, initial_ids, beam_size, decode_length,
+          vocab_size, alpha, states=cache)
+
+      if top_beams == 1:
+        decoded_ids = decoded_ids[:, 0, 1:]
+      else:
+        decoded_ids = decoded_ids[:, :top_beams, 1:]
+    else:  # Greedy
+      def inner_loop(i, next_id, decoded_ids, cache):
+        logits, cache = symbols_to_logits_fn(next_id, i, cache)
+        next_id = tf.expand_dims(tf.argmax(logits, axis=-1), axis=1)
+        decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
+        return i+1, next_id, decoded_ids, cache
+
+      decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64)
+      next_id = tf.zeros([batch_size, 1], dtype=tf.int64)
+      _, _, decoded_ids, _ = tf.while_loop(
+          # TODO(llion): Early stopping.
+          lambda i, *_: tf.less(i, decode_length),
+          inner_loop,
+          [tf.constant(0), next_id, decoded_ids, cache],
+          shape_invariants=[
+              tf.TensorShape([]),
+              tf.TensorShape([None, None]),
+              tf.TensorShape([None, None]),
+              nest.map_structure(lambda t: tf.TensorShape(t.shape), cache),
+          ])
+
+    return decoded_ids
 
 
 @registry.register_model
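
The rewritten decoder above serves both modes through one code path: symbols_to_logits_fn(ids, i, cache) looks only at the most recently generated id plus a cache of per-layer keys and values, and is driven either by a greedy tf.while_loop or by beam_search.beam_search. Below is a minimal, self-contained sketch of that pattern (not code from this commit); the one-matrix toy "model" and all names in it are illustrative assumptions.

import tensorflow as tf

batch_size = 2
vocab_size = 8
hidden_size = 4
decode_length = 5

embedding = tf.get_variable("embedding", [vocab_size, hidden_size])
proj = tf.get_variable("proj", [hidden_size, vocab_size])


def symbols_to_logits_fn(ids, i, cache):
  """Toy next-symbol logits; the cache keeps a running sum of embeddings."""
  del i  # The toy model ignores the position.
  last_ids = ids[:, -1]                              # only the newest symbol
  emb = tf.nn.embedding_lookup(embedding, last_ids)  # [batch, hidden]
  cache["state"] = cache["state"] + emb              # fixed-shape state update
  logits = tf.matmul(cache["state"], proj)           # [batch, vocab]
  return logits, cache


def inner_loop(i, next_id, decoded_ids, cache):
  logits, cache = symbols_to_logits_fn(next_id, i, cache)
  next_id = tf.expand_dims(tf.argmax(logits, axis=-1), axis=1)
  decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
  return i + 1, next_id, decoded_ids, cache


cache = {"state": tf.zeros([batch_size, hidden_size])}
decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int64)
next_id = tf.zeros([batch_size, 1], dtype=tf.int64)
_, _, decoded_ids, _ = tf.while_loop(
    lambda i, *_: tf.less(i, decode_length),
    inner_loop,
    [tf.constant(0), next_id, decoded_ids, cache],
    shape_invariants=[
        tf.TensorShape([]),
        tf.TensorShape([None, None]),
        tf.TensorShape([None, None]),
        {"state": tf.TensorShape([None, hidden_size])},
    ])

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  print(sess.run(decoded_ids))  # shape (batch_size, decode_length)

Because the cache is threaded through the loop, each step consumes a single new symbol instead of re-running the decoder over the whole prefix; the beam path in the commit drives the same symbols_to_logits_fn through beam_search.beam_search(..., states=cache) instead of this loop.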

tensor2tensor/models/transformer_test.py

Lines changed: 46 additions & 0 deletions
@@ -112,5 +112,51 @@ def testGreedyVsFast(self):
     self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
     self.assertAllClose(greedy_res, fast_res)
 
+  def testBeamVsFast(self):
+    model, features = self.getModel(transformer.transformer_small())
+
+    decode_length = 2
+
+    out_logits, _ = model.model_fn(features)
+    out_logits = tf.squeeze(out_logits[0], axis=[2, 3])
+    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
+        logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
+        labels=tf.reshape(features["targets"], [-1]))
+    loss = tf.reduce_mean(loss)
+    apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)
+
+    with self.test_session():
+      tf.global_variables_initializer().run()
+      for _ in range(100):
+        apply_grad.run()
+
+    model, _ = self.getModel(transformer.transformer_small(),
+                             mode=tf.estimator.ModeKeys.PREDICT)
+
+    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
+      beam_result = model._beam_decode_slow(
+          features,
+          decode_length,
+          beam_size=4,
+          top_beams=1,
+          last_position_only=True,
+          alpha=1.0)
+
+      fast_result = model._beam_decode(
+          features,
+          decode_length,
+          beam_size=4,
+          top_beams=1,
+          last_position_only=True,
+          alpha=1.0)
+
+    with self.test_session():
+      beam_res = beam_result.eval()
+      fast_res = fast_result.eval()
+
+    self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
+    self.assertAllClose(beam_res, fast_res)
+
+
 if __name__ == "__main__":
   tf.test.main()

tensor2tensor/utils/beam_search.py

Lines changed: 45 additions & 10 deletions
@@ -30,7 +30,45 @@
 INF = 1. * 1e7
 
 
-def expand_to_beam_size(tensor, beam_size):
+def _get_shape(tensor):
+  """Returns static shape if available and dynamic shape otherwise."""
+  static = tensor.shape.as_list()
+  dynamic = tf.unstack(tf.shape(tensor))
+  return [s[1] if s[0] is None else s[0] for s in zip(static, dynamic)]
+
+
+def _merge_beam_dim(tensor):
+  """Reshapes first two dimensions into a single dimension.
+
+  Args:
+    tensor: Tensor to reshape of shape [A, B, ...]
+
+  Returns:
+    Reshaped tensor of shape [A*B, ...]
+  """
+  shape = _get_shape(tensor)
+  shape[0] *= shape[1]  # batch -> batch * beam_size
+  shape.pop(1)  # Remove beam dim
+  return tf.reshape(tensor, shape)
+
+
+def _unmerge_beam_dim(tensor, batch_size, beam_size):
+  """Reshapes first dimension back to [batch_size, beam_size].
+
+  Args:
+    tensor: Tensor to reshape of shape [batch_size*beam_size, ...]
+    batch_size: Tensor, original batch size.
+    beam_size: int, original beam size.
+
+  Returns:
+    Reshaped tensor of shape [batch_size, beam_size, ...]
+  """
+  shape = _get_shape(tensor)
+  new_shape = [batch_size] + [beam_size] + shape[1:]
+  return tf.reshape(tensor, new_shape)
+
+
+def _expand_to_beam_size(tensor, beam_size):
   """Tiles a given tensor by beam_size.
 
   Args:
@@ -191,11 +229,11 @@ def beam_search(symbols_to_logits_fn,
   alive_log_probs = tf.tile(initial_log_probs, [batch_size, 1])
 
   # Expand each batch and state to beam_size
-  alive_seq = expand_to_beam_size(initial_ids, beam_size)
+  alive_seq = _expand_to_beam_size(initial_ids, beam_size)
   alive_seq = tf.expand_dims(alive_seq, axis=2)  # (batch_size, beam_size, 1)
   if states:
     states = nest.map_structure(
-        lambda state: expand_to_beam_size(state, beam_size), states)
+        lambda state: _expand_to_beam_size(state, beam_size), states)
   else:
     states = {}
 
@@ -302,12 +340,10 @@ def grow_topk(i, alive_seq, alive_log_probs, states):
 
     # (batch_size * beam_size, decoded_length)
     if states:
-      flat_states = nest.map_structure(
-          lambda state: tf.reshape(state, [batch_size * beam_size, -1]), states)
-      flat_logits, flat_states = symbols_to_logits_fn(flat_ids, flat_states)
+      flat_states = nest.map_structure(_merge_beam_dim, states)
+      flat_logits, flat_states = symbols_to_logits_fn(flat_ids, i, flat_states)
       states = nest.map_structure(
-          lambda state: tf.reshape(state, [batch_size, beam_size, -1]),
-          flat_states)
+          lambda t: _unmerge_beam_dim(t, batch_size, beam_size), flat_states)
     else:
       flat_logits = symbols_to_logits_fn(flat_ids)
     logits = tf.reshape(flat_logits, [batch_size, beam_size, -1])
@@ -478,8 +514,7 @@ def _is_finished(i, unused_alive_seq, alive_log_probs, unused_finished_seq,
           finished_scores.get_shape(),
           finished_flags.get_shape(),
           nest.map_structure(
-              lambda tensor: tf.TensorShape([None] * tensor.shape.ndims),
-              states),
+              lambda tensor: tf.TensorShape(tensor.shape), states),
       ],
       parallel_iterations=1,
       back_prop=False)
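
The new _merge_beam_dim / _unmerge_beam_dim helpers are what let arbitrary-rank decoder states ride along with the beam: states of shape [batch, beam, ...] are flattened to [batch * beam, ...] before symbols_to_logits_fn is called and restored afterwards, instead of the old rank-losing tf.reshape(state, [..., -1]). Below is a small round-trip check against the helpers added above; it is illustrative only (the helpers are module-private) and not part of the commit.

import tensorflow as tf
from tensor2tensor.utils import beam_search

batch_size, beam_size = 2, 3
# A fake per-beam state with a trailing dimension of size 4.
state = tf.reshape(
    tf.range(batch_size * beam_size * 4, dtype=tf.float32),
    [batch_size, beam_size, 4])

# pylint: disable=protected-access
flat = beam_search._merge_beam_dim(state)  # [batch * beam, 4]
restored = beam_search._unmerge_beam_dim(flat, batch_size, beam_size)
# pylint: enable=protected-access

with tf.Session() as sess:
  print(sess.run(tf.reduce_all(tf.equal(state, restored))))  # True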

tensor2tensor/utils/beam_search_test.py

Lines changed: 4 additions & 2 deletions
@@ -289,7 +289,7 @@ def testStates(self):
 
     expected_states = tf.constant([[[0.]], [[1.]]])
 
-    def symbols_to_logits(ids, states):
+    def symbols_to_logits(ids, _, states):
       pos = tf.shape(ids)[1] - 1
       # We have to assert the values of state inline here since we can't fetch
       # them out of the loop!
@@ -303,6 +303,7 @@ def symbols_to_logits(ids, states):
     states = {
         "state": tf.zeros((batch_size, 1)),
     }
+    states["state"]._shape = tf.TensorShape((None, 1))
 
     final_ids, _ = beam_search.beam_search(
         symbols_to_logits,
@@ -336,7 +337,7 @@ def testStateBeamTwo(self):
     # at each position, which is the one that's getting 3 added to it each step.
     expected_states = tf.constant([[[0.], [0.]], [[3.], [3.]], [[6.], [6.]]])
 
-    def symbols_to_logits(ids, states):
+    def symbols_to_logits(ids, _, states):
       pos = tf.shape(ids)[1] - 1
 
       # We have to assert the values of state inline here since we can't fetch
@@ -351,6 +352,7 @@ def symbols_to_logits(ids, states):
     states = {
         "state": tf.zeros((batch_size, 1)),
     }
+    states["state"]._shape = tf.TensorShape((None, 1))
 
     final_ids, _ = beam_search.beam_search(
         symbols_to_logits,
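
Taken together with the beam_search.py change, a stateful caller now passes states= and supplies a symbols_to_logits_fn with the new (ids, i, states) signature that returns (logits, states). The sketch below mirrors the updated tests; the toy logits, the "step" state, and the sizes are illustrative assumptions, not part of the commit.

import tensorflow as tf
from tensor2tensor.utils import beam_search

batch_size, beam_size, vocab_size, decode_length = 2, 3, 4, 5


def symbols_to_logits(ids, i, states):
  del i  # Unused in this toy example.
  # Always favor token 2 and carry a running step counter as the state.
  logits = tf.one_hot(tf.fill([tf.shape(ids)[0]], 2), vocab_size) * 10.0
  states["step"] = states["step"] + 1.0
  return logits, states


states = {"step": tf.zeros((batch_size, 1))}
# Same trick as the tests above: relax the static batch dimension so the
# while_loop shape invariant accepts the tiled and merged state.
states["step"]._shape = tf.TensorShape((None, 1))  # pylint: disable=protected-access

initial_ids = tf.zeros([batch_size], dtype=tf.int32)
final_ids, final_scores = beam_search.beam_search(
    symbols_to_logits, initial_ids, beam_size, decode_length,
    vocab_size, 0.0, states=states)

with tf.Session() as sess:
  ids = sess.run(final_ids)
  print(ids.shape)  # (batch_size, beam_size, decode_length + 1), initial id included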

tensor2tensor/utils/t2t_model.py

Lines changed: 23 additions & 0 deletions
@@ -217,6 +217,29 @@ def _beam_decode(self, features, decode_length, beam_size, top_beams,
                    last_position_only, alpha):
     """Beam search decoding.
 
+    Models should ideally implement a more efficient version of this function.
+
+    Args:
+      features: a map of string to `Tensor`
+      decode_length: an integer. How many additional timesteps to decode.
+      beam_size: number of beams.
+      top_beams: an integer. How many of the beams to return.
+      last_position_only: a boolean, speed-up by computing last position only.
+      alpha: Float that controls the length penalty. larger the alpha, stronger
+        the preference for longer translations.
+
+    Returns:
+      samples: an integer `Tensor`. Top samples from the beam search
+    """
+    return self._beam_decode_slow(features, decode_length, beam_size, top_beams,
+                                  last_position_only, alpha)
+
+  def _beam_decode_slow(self, features, decode_length, beam_size, top_beams,
+                        last_position_only, alpha):
+    """Slow version of Beam search decoding.
+
+    Quadratic time in decode_length.
+
     Args:
       features: a map of string to `Tensor`
       decode_length: an integer. How many additional timesteps to decode.
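
For models other than Transformer, the split above means the generic _beam_decode is now a thin wrapper around _beam_decode_slow, so a model with a faster decoder only needs to override _beam_decode (exactly what transformer.py does in this commit). Below is a hypothetical subclass showing the hook; the class name and its trivial body are assumptions, not part of the change.

from tensor2tensor.utils import t2t_model


class MyModel(t2t_model.T2TModel):
  """Hypothetical model showing where a fast beam decoder would plug in."""

  def _beam_decode(self, features, decode_length, beam_size, top_beams,
                   last_position_only, alpha):
    # A real model would run its cached, linear-time decoder here, the way
    # Transformer._fast_decode does. This sketch keeps the generic
    # quadratic-time fallback so the example stays self-contained.
    return self._beam_decode_slow(features, decode_length, beam_size,
                                  top_beams, last_position_only, alpha)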
