
Commit fa92cbe

Adaptation to upstream changes
1 parent ed695f4

2 files changed, 47 insertions(+), 51 deletions(-)


tensor2tensor/bin/t2t-trainer

File mode changed: 100644 → 100755 (now executable)

tensor2tensor/data_generators/wmt.py

Lines changed: 47 additions & 51 deletions
@@ -75,57 +75,6 @@ def train_generator(self):
     """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size."""
     raise NotImplementedError()
 
-
-@registry.register_problem("ice_parsing_tokens")
-class IceParsingTokens(problem.Problem):
-  """Problem spec for parsing tokenized Icelandic text to
-  constituency trees, also tokenized but to a smaller vocabulary."""
-
-  @property
-  def source_vocab_size(self):
-    return 2**13  # 8192
-
-  @property
-  def target_vocab_size(self):
-    return 2**8  # 256
-
-  def feature_encoders(self, data_dir):
-    source_vocab_filename = os.path.join(
-        data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
-    target_vocab_filename = os.path.join(
-        data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size)
-    source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename)
-    target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename)
-    return {
-        "inputs": source_subtokenizer,
-        "targets": target_subtokenizer,
-    }
-
-  def generate_data(self, data_dir, tmp_dir, num_shards=100):
-    generator_utils.generate_dataset_and_shuffle(
-        tabbed_parsing_token_generator(tmp_dir, True, "ice",
-                                       self.source_vocab_size,
-                                       self.target_vocab_size),
-        self.training_filepaths(data_dir, num_shards, shuffled=False),
-        tabbed_parsing_token_generator(tmp_dir, False, "ice",
-                                       self.source_vocab_size,
-                                       self.target_vocab_size),
-        self.dev_filepaths(data_dir, 1, shuffled=False))
-
-  def hparams(self, defaults, unused_model_hparams):
-    p = defaults
-    source_vocab_size = self._encoders["inputs"].vocab_size
-    p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)}
-    p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
-    p.input_space_id = problem.SpaceID.ICE_TOK
-    p.target_space_id = problem.SpaceID.ICE_PARSE_TOK
-    p.loss_multiplier = 2.5  # Rough estimate of avg number of tokens per word
-
-
-@registry.register_problem("setimes_mken_tokens_32k")
-class SETimesMkEnTokens32k(problem.Problem):
-  """Problem spec for SETimes Mk-En translation."""
-
   @property
   def dev_generator(self):
     return self.train_generator
@@ -734,3 +683,50 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
   tree_filepath = os.path.join(tmp_dir, filename)
   return wsj_parsing.token_generator(tree_filepath, symbolizer_vocab,
                                      symbolizer_vocab, EOS)
+
+
+@registry.register_problem("ice_parsing_tokens")
+class IceParsingTokens(problem.Problem):
+  """Problem spec for parsing tokenized Icelandic text to
+  constituency trees, also tokenized but to a smaller vocabulary."""
+
+  @property
+  def source_vocab_size(self):
+    return 2**13  # 8192
+
+  @property
+  def target_vocab_size(self):
+    return 2**8  # 256
+
+  def feature_encoders(self, data_dir):
+    source_vocab_filename = os.path.join(
+        data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
+    target_vocab_filename = os.path.join(
+        data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size)
+    source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename)
+    target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename)
+    return {
+        "inputs": source_subtokenizer,
+        "targets": target_subtokenizer,
+    }
+
+  def generate_data(self, data_dir, tmp_dir, num_shards=100):
+    generator_utils.generate_dataset_and_shuffle(
+        tabbed_parsing_token_generator(tmp_dir, True, "ice",
+                                       self.source_vocab_size,
+                                       self.target_vocab_size),
+        self.training_filepaths(data_dir, num_shards, shuffled=False),
+        tabbed_parsing_token_generator(tmp_dir, False, "ice",
+                                       self.source_vocab_size,
+                                       self.target_vocab_size),
+        self.dev_filepaths(data_dir, 1, shuffled=False))
+
+  def hparams(self, defaults, unused_model_hparams):
+    p = defaults
+    source_vocab_size = self._encoders["inputs"].vocab_size
+    p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)}
+    p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
+    p.input_space_id = problem.SpaceID.ICE_TOK
+    p.target_space_id = problem.SpaceID.ICE_PARSE_TOK
+    p.loss_multiplier = 2.5  # Rough estimate of avg number of tokens per word
+

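For context, a minimal sketch (not part of this commit) of how the re-registered problem could be exercised once the diff is applied. It assumes a tensor2tensor checkout of this era, where registry.problem() is the lookup counterpart of the @registry.register_problem decorator used above; the data and tmp paths are hypothetical placeholders.

from tensor2tensor.utils import registry

# Resolve the problem that this commit moves to the bottom of wmt.py.
ice_problem = registry.problem("ice_parsing_tokens")
print(ice_problem.source_vocab_size)  # 2**13 = 8192 source subtokens
print(ice_problem.target_vocab_size)  # 2**8 = 256 target subtokens

# generate_data writes shuffled, sharded train/dev files into data_dir,
# reading the tab-separated Icelandic parse corpus from tmp_dir; both
# paths below are placeholders.
ice_problem.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp")

Once the data files exist, the same problem name would be passed to the now-executable t2t-trainer script to train on them.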