
Commit 09a6084

Niki Parmar authored and Ryan Sepassi committed
Remove redundant copies of res_fn. Use from common_layers
PiperOrigin-RevId: 161451356
1 parent afd1565 commit 09a6084

4 files changed: +107 -29 lines changed


tensor2tensor/models/common_hparams.py

Lines changed: 2 additions & 0 deletions
@@ -65,6 +65,8 @@ def basic_params1():
       sampling_method="argmax",  # "argmax" or "random"
       problem_choice="adaptive",  # "uniform", "adaptive", "distributed"
       multiply_embedding_mode="sqrt_depth",
+      norm_type="none",  # "batch", "layer", "noam", "none".
+      layer_norm_epsilon=1e-6,
       symbol_modality_num_shards=16,
       # setting the max length in a minibatch. 0 means default behavior,
       # max_length = hparams.batch_size * length_multiplier
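Because both new fields land in basic_params1, every model inherits them and switching normalizers becomes a plain attribute override rather than an add_hparam call. A minimal sketch of such an override, assuming the usual t2t hparams flow (the my_params name is hypothetical, not part of this commit):

# Hypothetical hparams set layered on basic_params1; illustration only.
from tensor2tensor.models import common_hparams

def my_params():
  hparams = common_hparams.basic_params1()
  hparams.norm_type = "layer"        # one of "batch", "layer", "noam", "none"
  hparams.layer_norm_epsilon = 1e-6  # only consulted by layer norm
  return hparams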

tensor2tensor/models/common_layers.py

Lines changed: 34 additions & 10 deletions
@@ -433,24 +433,48 @@ def noam_norm(x, name=None):
             tf.sqrt(tf.to_float(shape[-1])))
 
 
-def residual_function(hparams):
+def get_norm(norm_type):
+  """Get the normalizer function."""
+  if norm_type == "layer":
+    return lambda x, name, filters=None, epsilon=1e-6: layer_norm(  # pylint: disable=g-long-lambda
+        x, filters=filters, epsilon=epsilon, name=name)
+  if norm_type == "batch":
+    return tf.layers.batch_normalization
+  if norm_type == "noam":
+    return noam_norm
+  if norm_type == "none":
+    return lambda x, name: x
+  raise ValueError("Parameter normalizer_fn must be one of: 'layer', 'batch',"
+                   "'noam', 'none'.")
+
+
+def residual_fn(x, y, norm_type, residual_dropout,
+                filters=None,
+                epsilon=1e-16,
+                name="residual"):
   """Returns a function for combining layer input and layer output.
 
   The returned function on x (layer input) and y (layer output) computes:
-    norm_function(x + t
+    norm_function(x + dropout(y))
 
   Args:
-    hparams: model hyperparameters
+    x: tensor, input layer
+    y: tensor, output layer
+    norm_type: string, type of normalizer function
+    residual_dropout: integer, dropout value for residual connection
+    filters: integer, dimension for layer norm, optional
+    epsilon: integer, value of layer norm epsilon
+    name: string, name
 
   Returns:
-    a function from x=<layer input> and y=<layer output> to computed output
+    residual layer output with applied norm_fn.
   """
-
-  def residual_fn(x, y):
-    return hparams.norm_function(x + tf.nn.dropout(
-        y, 1.0 - hparams.residual_dropout))
-
-  return residual_fn
+  norm_fn = get_norm(norm_type)
+  res = x + tf.nn.dropout(y, 1.0 - residual_dropout)
+  if norm_type == "layer":
+    return norm_fn(res, name=name, filters=filters, epsilon=epsilon)
+  else:
+    return norm_fn(res, name=name)
 
 
 def conv_block_internal(conv_fn,
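After the refactor, a caller passes the layer input, the layer output, and the normalization settings straight to common_layers.residual_fn instead of building a closure from hparams. A rough usage sketch under the new signature; the shapes, the 0.1 dropout value, and the scope name are illustrative, not from the commit:

import tensorflow as tf
from tensor2tensor.models import common_layers

x = tf.random_normal([8, 20, 1, 512])  # layer input
y = tf.random_normal([8, 20, 1, 512])  # layer output, e.g. from a conv block

# "layer" norm takes filters (and epsilon); pass the channel dimension.
out = common_layers.residual_fn(x, y, "layer", 0.1, filters=512, name="res")

# "batch", "noam" and "none" only take a name; filters/epsilon are not used.
out_noam = common_layers.residual_fn(x, y, "noam", 0.1)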

tensor2tensor/models/common_layers_test.py

Lines changed: 67 additions & 0 deletions
@@ -294,6 +294,73 @@ def testDeconvStride2MultiStep(self):
       actual = session.run(a)
     self.assertEqual(actual.shape, (5, 32, 1, 16))
 
+  def testGetNormLayerFn(self):
+    norm_type = "layer"
+    with self.test_session() as session:
+      a = common_layers.get_norm(norm_type)
+      x1 = np.random.rand(5, 2, 1, 11)
+      x2 = a(tf.constant(x1, dtype=tf.float32), name="layer", filters=11)
+      session.run(tf.global_variables_initializer())
+      actual = session.run(x2)
+    self.assertEqual(actual.shape, (5, 2, 1, 11))
+
+  def testGetNormNoamFn(self):
+    norm_type = "noam"
+    with self.test_session() as session:
+      a = common_layers.get_norm(norm_type)
+      x1 = np.random.rand(5, 2, 1, 11)
+      x2 = a(tf.constant(x1, dtype=tf.float32), name="noam")
+      session.run(tf.global_variables_initializer())
+      actual = session.run(x2)
+    self.assertEqual(actual.shape, (5, 2, 1, 11))
+
+  def testGetNormBatchFn(self):
+    norm_type = "batch"
+    with self.test_session() as session:
+      a = common_layers.get_norm(norm_type)
+      x1 = np.random.rand(5, 2, 1, 11)
+      x2 = a(tf.constant(x1, dtype=tf.float32), name="batch")
+      session.run(tf.global_variables_initializer())
+      actual = session.run(x2)
+    self.assertEqual(actual.shape, (5, 2, 1, 11))
+
+  def testGetNormNoneFn(self):
+    norm_type = "none"
+    with self.test_session() as session:
+      a = common_layers.get_norm(norm_type)
+      x1 = np.random.rand(5, 2, 1, 11)
+      x2 = a(tf.constant(x1, dtype=tf.float32), name="none")
+      session.run(tf.global_variables_initializer())
+      actual = session.run(x2)
+    self.assertEqual(actual.shape, (5, 2, 1, 11))
+    self.assertAllClose(actual, x1, atol=1e-03)
+
+  def testResidualFn(self):
+    norm_type = "batch"
+    with self.test_session() as session:
+      x1 = np.random.rand(5, 2, 1, 11)
+      x2 = np.random.rand(5, 2, 1, 11)
+      x3 = common_layers.residual_fn(
+          tf.constant(x1, dtype=tf.float32),
+          tf.constant(x2, dtype=tf.float32),
+          norm_type, 0.1)
+      session.run(tf.global_variables_initializer())
+      actual = session.run(x3)
+    self.assertEqual(actual.shape, (5, 2, 1, 11))
+
+  def testResidualFnWithLayerNorm(self):
+    norm_type = "layer"
+    with self.test_session() as session:
+      x1 = np.random.rand(5, 2, 1, 11)
+      x2 = np.random.rand(5, 2, 1, 11)
+      x3 = common_layers.residual_fn(
+          tf.constant(x1, dtype=tf.float32),
+          tf.constant(x2, dtype=tf.float32),
+          norm_type, 0.1, epsilon=0.1)
+      session.run(tf.global_variables_initializer())
+      actual = session.run(x3)
+    self.assertEqual(actual.shape, (5, 2, 1, 11))
+
 
 if __name__ == "__main__":
   tf.test.main()
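The new tests cover each supported normalizer plus both residual_fn paths; an unrecognized norm_type falls through to the ValueError in get_norm (whose message still refers to the old normalizer_fn name). A hypothetical extra test for that error path, not part of this commit, might look like:

  def testGetNormUnknownFnRaises(self):
    # "group" is not one of "batch", "layer", "noam", "none".
    with self.assertRaises(ValueError):
      common_layers.get_norm("group")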

tensor2tensor/models/slicenet.py

Lines changed: 4 additions & 19 deletions
@@ -31,21 +31,6 @@
 import tensorflow as tf
 
 
-def get_norm(hparams):
-  """Get the normalizer function."""
-  if hparams.normalizer_fn == "layer":
-    return lambda x, name: common_layers.layer_norm(  # pylint: disable=g-long-lambda
-        x, hparams.hidden_size, name=name)
-  if hparams.normalizer_fn == "batch":
-    return tf.layers.batch_normalization
-  if hparams.normalizer_fn == "noam":
-    return common_layers.noam_norm
-  if hparams.normalizer_fn == "none":
-    return lambda x, name: x
-  raise ValueError("Parameter normalizer_fn must be one of: 'layer', 'batch',"
-                   "'noam', 'none'.")
-
-
 def attention(targets_shifted, inputs_encoded, norm_fn, hparams, bias=None):
   """Complete attention layer with preprocessing."""
   separabilities = [hparams.separability, hparams.separability]
@@ -128,7 +113,7 @@ def multi_conv_res(x, padding, name, layers, hparams,
         hparams.separability - i
         for i in reversed(range(len(dilations_and_kernels2)))
     ]
-    norm_fn = get_norm(hparams)
+    norm_fn = common_layers.get_norm(hparams.norm_type)
     for layer in xrange(layers):
       with tf.variable_scope("layer_%d" % layer):
         y = common_layers.subseparable_conv_block(
@@ -188,7 +173,7 @@ def similarity_cost(inputs_encoded, targets_encoded):
 
 def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams):
   """Middle part of slicenet, connecting encoder and decoder."""
-  norm_fn = get_norm(hparams)
+  norm_fn = common_layers.get_norm(hparams.norm_type)
 
   # Flatten targets and embed target_space_id.
   targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2)
@@ -311,7 +296,7 @@ def slicenet_params1():
   hparams.num_hidden_layers = 4
   hparams.kernel_height = 3
   hparams.kernel_width = 1
-  hparams.add_hparam("normalizer_fn", "layer")  # New ones are added like this.
+  hparams.norm_type = "layer"
   hparams.learning_rate_decay_scheme = "exp50k"
   hparams.learning_rate = 0.05
   hparams.learning_rate_warmup_steps = 3000
@@ -322,7 +307,7 @@ def slicenet_params1():
   hparams.optimizer_adam_epsilon = 1e-6
   hparams.optimizer_adam_beta1 = 0.85
   hparams.optimizer_adam_beta2 = 0.997
-  hparams.add_hparam("large_kernel_size", 15)
+  hparams.add_hparam("large_kernel_size", 15)  # New ones are added like this.
   hparams.add_hparam("separability", -2)
   # A dilation scheme, one of _DILATION_SCHEMES.
   hparams.add_hparam("dilation_scheme", "1.1.1.1")
