This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 7566c4d

Merge pull request #151 from cshanbo/wmt_zhen_translate
add wmt_zhen_token_32k
2 parents: 9a1f888 + 235392c


3 files changed: +100 −0 lines

tensor2tensor/bin/t2t-datagen

Lines changed: 6 additions & 0 deletions
@@ -140,6 +140,12 @@ _SUPPORTED_PROBLEM_GENERATORS = {
         lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, True, 2**15),
         lambda: wmt.ende_wordpiece_token_generator(FLAGS.tmp_dir, False, 2**15)
     ),
+    "wmt_zhen_tokens_32k": (
+        lambda: wmt.zhen_wordpiece_token_generator(FLAGS.tmp_dir, True,
+                                                   2**15, 2**15),
+        lambda: wmt.zhen_wordpiece_token_generator(FLAGS.tmp_dir, False,
+                                                   2**15, 2**15)
+    ),
     "lm1b_32k": (
         lambda: lm1b.generator(FLAGS.tmp_dir, True),
         lambda: lm1b.generator(FLAGS.tmp_dir, False)
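
Each registry value above is a (training generator, dev generator) pair of zero-argument callables; t2t-datagen picks a pair by problem name and serializes the examples it yields. A minimal sketch of exercising the new entry directly, assuming it runs in a context where _SUPPORTED_PROBLEM_GENERATORS and FLAGS.tmp_dir are defined, as in t2t-datagen itself:

    # Look up the pair added in this commit and pull one example from the
    # training generator; each example is a dict of token-id lists.
    train_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS["wmt_zhen_tokens_32k"]
    first_example = next(train_gen())
    # first_example looks like {"inputs": [...], "targets": [...]}, with ids
    # drawn from the separate zh/en subword vocabularies built in wmt.py below.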

tensor2tensor/data_generators/problem_hparams.py

Lines changed: 28 additions & 0 deletions
@@ -177,6 +177,7 @@ def default_problem_hparams():
       # 13: Audio spectral domain
       # 14: Parse characters
       # 15: Parse tokens
+      # 16: Chinese tokens
       # Add more above if needed.
       input_space_id=0,
       target_space_id=0,
@@ -472,6 +473,32 @@ def wmt_ende_tokens(model_hparams, wrong_vocab_size):
   return p


+def wmt_zhen_tokens(model_hparams, wrong_vocab_size):
+  """Chinese to English translation benchmark."""
+  p = default_problem_hparams()
+  # This vocab file must be present within the data directory.
+  if model_hparams.shared_embedding_and_softmax_weights == 1:
+    model_hparams.shared_embedding_and_softmax_weights = 0
+  source_vocab_filename = os.path.join(model_hparams.data_dir,
+                                       "tokens.vocab.zh.%d" % wrong_vocab_size)
+  target_vocab_filename = os.path.join(model_hparams.data_dir,
+                                       "tokens.vocab.en.%d" % wrong_vocab_size)
+  source_token = text_encoder.SubwordTextEncoder(source_vocab_filename)
+  target_token = text_encoder.SubwordTextEncoder(target_vocab_filename)
+  p.input_modality = {
+      "inputs": (registry.Modalities.SYMBOL, source_token.vocab_size)
+  }
+  p.target_modality = (registry.Modalities.SYMBOL, target_token.vocab_size)
+  p.vocabulary = {
+      "inputs": source_token,
+      "targets": target_token,
+  }
+  p.loss_multiplier = 1.4
+  p.input_space_id = 16
+  p.target_space_id = 4
+  return p
+
+
 def wmt_ende_v2(model_hparams, vocab_size):
   """English to German translation benchmark with separate vocabularies."""
   p = default_problem_hparams()
@@ -730,6 +757,7 @@ def img2img_imagenet(unused_model_hparams):
     "wmt_ende_bpe32k_160": wmt_ende_bpe32k,
     "wmt_ende_v2_32k_combined": lambda p: wmt_ende_v2(p, 2**15),
     "wmt_ende_v2_16k_combined": lambda p: wmt_ende_v2(p, 2**14),
+    "wmt_zhen_tokens_32k": lambda p: wmt_zhen_tokens(p, 2**15),
     "image_cifar10_tune": image_cifar10,
     "image_cifar10_test": image_cifar10,
     "image_mnist_tune": image_mnist,

tensor2tensor/data_generators/wmt.py

Lines changed: 66 additions & 0 deletions
@@ -101,6 +101,38 @@ def token_generator(source_path, target_path, token_vocab, eos=None):
         source, target = source_file.readline(), target_file.readline()


+def bi_vocabs_token_generator(source_path, target_path,
+                              source_token_vocab,
+                              target_token_vocab,
+                              eos=None):
+  """Generator for sequence-to-sequence tasks that uses tokens.
+
+  This generator assumes the files at source_path and target_path have
+  the same number of lines and yields dictionaries of "inputs" and "targets"
+  where inputs are token ids from the " "-split source (and target, resp.) lines
+  converted to integers using the token_map.
+
+  Args:
+    source_path: path to the file with source sentences.
+    target_path: path to the file with target sentences.
+    source_token_vocab: text_encoder.TextEncoder object.
+    target_token_vocab: text_encoder.TextEncoder object.
+    eos: integer to append at the end of each sequence (default: None).
+
+  Yields:
+    A dictionary {"inputs": source-line, "targets": target-line} where
+    the lines are integer lists converted from tokens in the file lines.
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(source_path, mode="r") as source_file:
+    with tf.gfile.GFile(target_path, mode="r") as target_file:
+      source, target = source_file.readline(), target_file.readline()
+      while source and target:
+        source_ints = source_token_vocab.encode(source.strip()) + eos_list
+        target_ints = target_token_vocab.encode(target.strip()) + eos_list
+        yield {"inputs": source_ints, "targets": target_ints}
+        source, target = source_file.readline(), target_file.readline()
+
 def _get_wmt_ende_dataset(directory, filename):
   """Extract the WMT en-de corpus `filename` to directory unless it's there."""
   train_path = os.path.join(directory, filename)
@@ -177,6 +209,21 @@ def ende_bpe_token_generator(tmp_dir, train):
     ],
 ]

+_ZHEN_TRAIN_DATASETS = [
+    [
+        "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz",
+        ("training/news-commentary-v12.zh-en.zh",
+         "training/news-commentary-v12.zh-en.en")
+    ]
+]
+
+_ZHEN_TEST_DATASETS = [
+    [
+        "http://data.statmt.org/wmt17/translation-task/dev.tgz",
+        ("dev/newsdev2017-zhen-src.zh",
+         "dev/newsdev2017-zhen-ref.en")
+    ]
+]

 def _compile_data(tmp_dir, datasets, filename):
   """Concatenate all `datasets` and save to `filename`."""
@@ -253,6 +300,25 @@ def ende_character_generator(tmp_dir, train):
                              character_vocab, EOS)


+def zhen_wordpiece_token_generator(tmp_dir, train,
+                                   source_vocab_size,
+                                   target_vocab_size):
+  datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
+  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
+  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
+  source_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.zh.%d" % source_vocab_size,
+      source_vocab_size, source_datasets)
+  target_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.en.%d" % target_vocab_size,
+      target_vocab_size, target_datasets)
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "wmt_zhen_tok_%s" % tag)
+  return bi_vocabs_token_generator(data_path + ".lang1",
+                                   data_path + ".lang2",
+                                   source_vocab, target_vocab, EOS)
+
+
 def enfr_wordpiece_token_generator(tmp_dir, train, vocab_size):
   """Instance of token generator for the WMT en->fr task."""
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
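
A minimal sketch of driving the new generator end to end, with an assumed temporary directory; the call mirrors the lambdas registered in t2t-datagen above and downloads and compiles the news-commentary zh-en data on first use:

    from tensor2tensor.data_generators import wmt

    # Build (or reuse) the two 2**15-entry subword vocabularies, compile the
    # corpus into wmt_zhen_tok_train.lang1/.lang2, and iterate examples.
    gen = wmt.zhen_wordpiece_token_generator("/tmp/t2t_tmp",  # assumed tmp_dir
                                             True,            # training split
                                             2**15, 2**15)
    sample = next(gen)
    print(len(sample["inputs"]), len(sample["targets"]))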
