
Commit b4de995

Moved ice_parsing to data_generators; updated to 1.1.7
1 parent e38ab25 commit b4de995

5 files changed: +27 additions, −59 deletions

tensor2tensor/data_generators/all_problems.py

File mode changed: 100644 → 100755
Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@
 from tensor2tensor.data_generators import wiki
 from tensor2tensor.data_generators import wmt
 from tensor2tensor.data_generators import wsj_parsing
+from tensor2tensor.data_generators import ice_parsing
 
 
 # Problem modules that require optional dependencies
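Note: this import is needed purely for its side effect. Problem classes register themselves with the central registry when their defining module is imported, so the problems in ice_parsing.py only become discoverable after all_problems.py imports the module. A minimal sketch of the registration pattern (the class name is illustrative, not from this commit):

    from tensor2tensor.data_generators import problem
    from tensor2tensor.utils import registry

    @registry.register_problem
    class MyIceParsingProblem(problem.Problem):  # illustrative name
      """Registered under its snake_cased name as soon as this module is imported."""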

tensor2tensor/ice_parsing/ice_parsing.py renamed to tensor2tensor/data_generators/ice_parsing.py

Lines changed: 17 additions & 27 deletions
@@ -28,7 +28,6 @@
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators.wmt import tabbed_generator
 from tensor2tensor.utils import registry
-from tensor2tensor.models import transformer
 
 import tensorflow as tf
 
@@ -69,9 +68,21 @@ def source_vocab_size(self):
     return 2**14  # 16384
 
   @property
-  def target_vocab_size(self):
+  def targeted_vocab_size(self):
     return 2**8  # 256
 
+  @property
+  def input_space_id(self):
+    return problem.SpaceID.ICE_TOK
+
+  @property
+  def target_space_id(self):
+    return problem.SpaceID.ICE_PARSE_TOK
+
+  @property
+  def num_shards(self):
+    return 10
+
   def feature_encoders(self, data_dir):
     source_vocab_filename = os.path.join(
         data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
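(With source_vocab_size = 2**14, the vocabulary filename above resolves to ice_source.tokens.vocab.16384.)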
@@ -89,7 +100,7 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1):
         tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice",
                                        self.source_vocab_size,
                                        self.target_vocab_size),
-        self.training_filepaths(data_dir, 1, shuffled=False),
+        self.training_filepaths(data_dir, self.num_shards, shuffled=False),
         tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice",
                                        self.source_vocab_size,
                                        self.target_vocab_size),
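The changed line writes the training data into 10 shards (via the new num_shards property) rather than a single file. A minimal sketch of the shard paths this implies, assuming T2T's usual %05d-of-%05d shard naming (the basename is illustrative):

    import os

    def train_shard_paths(data_dir, basename, num_shards=10):
      # e.g. ice_parsing-train-00000-of-00010 ... ice_parsing-train-00009-of-00010
      return [os.path.join(data_dir, "%s-train-%05d-of-%05d" % (basename, i, num_shards))
              for i in range(num_shards)]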
@@ -99,29 +110,8 @@ def hparams(self, defaults, model_hparams):
     p = defaults
     source_vocab_size = self._encoders["inputs"].vocab_size
     p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)}
-    p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
-    p.input_space_id = problem.SpaceID.ICE_TOK
-    p.target_space_id = problem.SpaceID.ICE_PARSE_TOK
+    p.target_modality = (registry.Modalities.SYMBOL, self.targeted_vocab_size)
+    p.input_space_id = self.input_space_id
+    p.target_space_id = self.target_space_id
     p.loss_multiplier = 2.5  # Rough estimate of avg number of tokens per word
 
-
-@registry.register_hparams
-def transformer_parsing_ice():
-  """Hparams for parsing Icelandic text."""
-  hparams = transformer.transformer_base_single_gpu()
-  hparams.batch_size = 4096
-  hparams.shared_embedding_and_softmax_weights = int(False)
-  return hparams
-
-
-@registry.register_hparams
-def transformer_parsing_ice_big():
-  """Hparams for parsing Icelandic text, bigger model."""
-  hparams = transformer_parsing_ice()
-  hparams.batch_size = 2048  # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU
-  hparams.attention_dropout = 0.05
-  hparams.residual_dropout = 0.05
-  hparams.max_length = 512
-  hparams.hidden_size = 1024
-  return hparams
-
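After this change, the space IDs and shard count are exposed as properties on the problem object instead of being hard-coded inside hparams(), so callers can query them directly. A minimal sketch, assuming the problem class defined earlier in this file (the class name is illustrative, not taken from this commit):

    from tensor2tensor.data_generators import problem

    prob = ParsingIcelandic16k()  # illustrative class name
    assert prob.num_shards == 10
    assert prob.input_space_id == problem.SpaceID.ICE_TOK
    assert prob.target_space_id == problem.SpaceID.ICE_PARSE_TOK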

tensor2tensor/data_generators/wmt.py

Lines changed: 0 additions & 30 deletions
@@ -187,36 +187,6 @@ def bi_vocabs_token_generator(source_path,
     source, target = source_file.readline(), target_file.readline()
 
 
-def tabbed_generator(source_path, source_vocab, target_vocab, eos=None):
-  r"""Generator for sequence-to-sequence tasks using tabbed files.
-
-  Tokens are derived from text files where each line contains both
-  a source and a target string. The two strings are separated by a tab
-  character ('\t'). It yields dictionaries of "inputs" and "targets" where
-  inputs are characters from the source lines converted to integers, and
-  targets are characters from the target lines, also converted to integers.
-
-  Args:
-    source_path: path to the file with source and target sentences.
-    source_vocab: a SubwordTextEncoder to encode the source string.
-    target_vocab: a SubwordTextEncoder to encode the target string.
-    eos: integer to append at the end of each sequence (default: None).
-
-  Yields:
-    A dictionary {"inputs": source-line, "targets": target-line} where
-    the lines are integer lists converted from characters in the file lines.
-  """
-  eos_list = [] if eos is None else [eos]
-  with tf.gfile.GFile(source_path, mode="r") as source_file:
-    for line in source_file:
-      if line and "\t" in line:
-        parts = line.split("\t", maxsplit=1)
-        source, target = parts[0].strip(), parts[1].strip()
-        source_ints = source_vocab.encode(source) + eos_list
-        target_ints = target_vocab.encode(target) + eos_list
-        yield {"inputs": source_ints, "targets": target_ints}
-
-
 # Data-set URLs.
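For context, the generator deleted above consumes one tab-separated sentence pair per line. A minimal usage sketch; the data file and vocabulary file names are illustrative, and the vocabularies are standard text_encoder.SubwordTextEncoder objects loaded from vocab files:

    from tensor2tensor.data_generators import text_encoder
    from tensor2tensor.data_generators.wmt import tabbed_generator

    # pairs.tsv holds lines of the form: <source sentence>\t<target sentence>
    source_vocab = text_encoder.SubwordTextEncoder("ice_source.tokens.vocab.16384")
    target_vocab = text_encoder.SubwordTextEncoder("ice_target.tokens.vocab.256")
    for example in tabbed_generator("pairs.tsv", source_vocab, target_vocab, eos=1):
      # each value is a list of vocabulary ids, ending with the EOS id 1
      print(example["inputs"], example["targets"])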

tensor2tensor/ice_parsing/__init__.py

Lines changed: 0 additions & 2 deletions
This file was deleted.

tensor2tensor/models/transformer.py

Lines changed: 9 additions & 0 deletions
@@ -391,6 +391,15 @@ def transformer_parsing_big():
   return hparams
 
 
+@registry.register_hparams
+def transformer_parsing_ice():
+  """Hparams for parsing and tagging Icelandic text."""
+  hparams = transformer.transformer_base_single_gpu()
+  hparams.batch_size = 4096
+  hparams.shared_embedding_and_softmax_weights = int(False)
+  return hparams
+
+
 @registry.register_hparams
 def transformer_tiny():
   hparams = transformer_base()
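This hparams function was moved verbatim from ice_parsing.py, which imported the transformer module; inside transformer.py itself the transformer. prefix has nothing to bind to, so the module-local call is presumably the intended form. A hedged sketch of that cleanup (same body, prefix dropped):

    @registry.register_hparams
    def transformer_parsing_ice():
      """Hparams for parsing and tagging Icelandic text."""
      hparams = transformer_base_single_gpu()  # module-local call; no self-import needed
      hparams.batch_size = 4096
      hparams.shared_embedding_and_softmax_weights = int(False)
      return hparams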
