Support launching an ML Engine job (which only supports TF 1.4) from TF 1.5

Ryan Sepassi · Ryan Sepassi · commit 8548dab0eb69 · 2018-02-16T13:09:33.000-08:00
PiperOrigin-RevId: 185954947
diff --git a/docs/new_problem.md b/docs/new_problem.md
@@ -59,6 +59,7 @@ class PoetryLines(text_problems.Text2TextProblem):
     # generate_data will shard the data into TRAIN and EVAL for us.
     return False
 
+  @property
   def dataset_splits(self):
     """Splits of data to produce and number of output shards for each."""
     # 10% evaluation data
@@ -141,6 +142,7 @@ training data will be generated into 90 files and the evaluation data into 10.
     # generate_data will shard the data into TRAIN and EVAL for us.
     return False
 
+  @property
   def dataset_splits(self):
     """Splits of data to produce and number of output shards for each."""
     # 10% evaluation data
diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
@@ -322,8 +322,8 @@ def main(argv):
   if FLAGS.generate_data:
     generate_data()
 
-  if hasattr(FLAGS, "job_dir") and FLAGS.job_dir:
-    FLAGS.output_dir = FLAGS.job_dir
+  if cloud_mlengine.job_dir():
+    FLAGS.output_dir = cloud_mlengine.job_dir()
 
   if argv:
     set_hparams_from_args(argv[1:])
diff --git a/tensor2tensor/test_data/example_usr_dir/my_submodule.py b/tensor2tensor/test_data/example_usr_dir/my_submodule.py
@@ -14,9 +14,17 @@
 # limitations under the License.
 
 """Example registrations for T2T."""
+import re
+
+from tensor2tensor.data_generators import problem
+from tensor2tensor.data_generators import text_problems
 from tensor2tensor.layers import common_hparams
 from tensor2tensor.utils import registry
 
+# Use register_model for a new T2TModel
+# Use register_problem for a new Problem
+# Use register_hparams for a new hyperparameter set
+
 
 @registry.register_hparams
 def my_very_own_hparams():
@@ -28,5 +36,64 @@ def my_very_own_hparams():
   hp.add_hparam("filter_size", 2048)
   return hp
 
-# Use register_model for a new T2TModel
-# Use register_problem for a new Problem
+
+@registry.register_problem
+class PoetryLines(text_problems.Text2TextProblem):
+  """Predict next line of poetry from the last line. From Gutenberg texts."""
+
+  @property
+  def approx_vocab_size(self):
+    return 2**13  # ~8k
+
+  @property
+  def is_generate_per_split(self):
+    # generate_data will shard the data into TRAIN and EVAL for us.
+    return False
+
+  @property
+  def dataset_splits(self):
+    """Splits of data to produce and number of output shards for each."""
+    # 10% evaluation data
+    return [{
+        "split": problem.DatasetSplit.TRAIN,
+        "shards": 90,
+    }, {
+        "split": problem.DatasetSplit.EVAL,
+        "shards": 10,
+    }]
+
+  def generate_samples(self, data_dir, tmp_dir, dataset_split):
+    """Yields {"inputs": line, "targets": next_line} for adjacent poem lines.
+
+    Lines are lowercased and each run of non a-z characters becomes one space.
+    """
+    del data_dir, tmp_dir, dataset_split  # Unused.
+
+    # pylint: disable=g-import-not-at-top
+    from gutenberg import acquire
+    from gutenberg import cleanup
+    # pylint: enable=g-import-not-at-top
+
+    books = [
+        # bookid, skip N lines
+        (19221, 223),
+        (15553, 522),
+    ]
+
+    for (book_id, toskip) in books:
+      text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
+      lines = text.split("\n")[toskip:]
+      prev_line = None
+      for line in lines:
+        # Blank or all-caps lines (titles, author names) reset the pairing.
+        if not line or line.upper() == line:
+          prev_line = None
+          continue
+
+        line = re.sub("[^a-z]+", " ", line.strip().lower())
+        if prev_line and line:
+          yield {
+              "inputs": prev_line,
+              "targets": line,
+          }
+        prev_line = line
diff --git a/tensor2tensor/test_data/example_usr_dir/setup.py b/tensor2tensor/test_data/example_usr_dir/setup.py
@@ -0,0 +1,34 @@
+# coding=utf-8
+# Copyright 2018 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Example setup.py for a t2t_usr_dir launching on Cloud ML Engine.
+
+This is only necessary if you have additional required pip packages for the
+import of your usr_dir, and only if you're launching t2t-trainer on Cloud ML
+Engine with the --cloud_mlengine flag.
+
+Note that the call to setup uses find_packages() and that the location of this
+file is alongside the __init__.py file that imports my_submodule.
+"""
+from setuptools import find_packages
+from setuptools import setup
+setup(
+    name='DummyUsrDirPackage',
+    version='0.1',
+    packages=find_packages(),  # Finds the package beside this file.
+    install_requires=[
+        'gutenberg',  # Imported by my_submodule.PoetryLines.generate_samples.
+    ],
+)
diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py
@@ -47,12 +47,24 @@
 """
 
 
+def job_dir():
+  """Returns the --job-dir flag value, trying its pre- and post-absl names."""
+  return getattr(FLAGS, 'job-dir', '') or getattr(FLAGS, 'job_dir', '')
+
+
 def flags_as_args():
   """Convert FLAGS to list of args suitable for passing on cmd line."""
-  args_dict = dict(FLAGS.__dict__['__flags'])
+  if hasattr(FLAGS, 'flag_values_dict'):
+    args_dict = FLAGS.flag_values_dict()
+  else:
+    args_dict = dict(FLAGS.__dict__['__flags'])
   del args_dict['cloud_mlengine']
   # Configured later
   del args_dict['t2t_usr_dir']
+  args_dict.pop('h', None)
+  args_dict.pop('helpfull', None)
+  args_dict.pop('helpshort', None)
+  args_dict.pop('help', None)
   args = []
   for name, val in args_dict.items():
     if val is None:
@@ -223,7 +235,7 @@ def configure_usr_dir(job_spec, usr_tar):
 def launch():
   """Launch t2t_trainer on Cloud ML Engine."""
   assert not FLAGS.cloud_tpu
-  assert not FLAGS.job_dir
+  assert not job_dir()
   assert FLAGS.output_dir.startswith('gs://')
   assert FLAGS.data_dir.startswith('gs://')
   assert FLAGS.worker_replicas <= 1