@@ -162,36 +162,6 @@ def character_generator(source_path, target_path, character_vocab, eos=None):
         source, target = source_file.readline(), target_file.readline()


-def tabbed_generator(source_path, source_vocab, target_vocab, eos=None):
-  r"""Generator for sequence-to-sequence tasks using tabbed files.
-
-  Tokens are derived from text files where each line contains both
-  a source and a target string. The two strings are separated by a tab
-  character ('\t'). It yields dictionaries of "inputs" and "targets" where
-  inputs are characters from the source lines converted to integers, and
-  targets are characters from the target lines, also converted to integers.
-
-  Args:
-    source_path: path to the file with source and target sentences.
-    source_vocab: a SubwordTextEncoder to encode the source string.
-    target_vocab: a SubwordTextEncoder to encode the target string.
-    eos: integer to append at the end of each sequence (default: None).
-
-  Yields:
-    A dictionary {"inputs": source-line, "targets": target-line} where
-    the lines are integer lists converted from characters in the file lines.
-  """
-  eos_list = [] if eos is None else [eos]
-  with tf.gfile.GFile(source_path, mode="r") as source_file:
-    for line in source_file:
-      if line and "\t" in line:
-        parts = line.split("\t", maxsplit=1)
-        source, target = parts[0].strip(), parts[1].strip()
-        source_ints = source_vocab.encode(source) + eos_list
-        target_ints = target_vocab.encode(target) + eos_list
-        yield {"inputs": source_ints, "targets": target_ints}
-
-
 def token_generator(source_path, target_path, token_vocab, eos=None):
   """Generator for sequence-to-sequence tasks that uses tokens.

@@ -255,6 +225,36 @@ def bi_vocabs_token_generator(source_path,
         source, target = source_file.readline(), target_file.readline()


+def tabbed_generator(source_path, source_vocab, target_vocab, eos=None):
+  r"""Generator for sequence-to-sequence tasks using tabbed files.
+
+  Tokens are derived from text files where each line contains both
+  a source and a target string. The two strings are separated by a tab
+  character ('\t'). It yields dictionaries of "inputs" and "targets" where
+  inputs are characters from the source lines converted to integers, and
+  targets are characters from the target lines, also converted to integers.
+
+  Args:
+    source_path: path to the file with source and target sentences.
+    source_vocab: a SubwordTextEncoder to encode the source string.
+    target_vocab: a SubwordTextEncoder to encode the target string.
+    eos: integer to append at the end of each sequence (default: None).
+
+  Yields:
+    A dictionary {"inputs": source-line, "targets": target-line} where
+    the lines are integer lists converted from characters in the file lines.
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(source_path, mode="r") as source_file:
+    for line in source_file:
+      if line and "\t" in line:
+        parts = line.split("\t", maxsplit=1)
+        source, target = parts[0].strip(), parts[1].strip()
+        source_ints = source_vocab.encode(source) + eos_list
+        target_ints = target_vocab.encode(target) + eos_list
+        yield {"inputs": source_ints, "targets": target_ints}
+
+
 # Data-set URLs.


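For orientation, this is how the relocated `tabbed_generator` might be driven; a minimal sketch assuming it is called from within this module, with the pairs-file path, the byte-level vocabulary, and the EOS id chosen purely for illustration:

# Illustrative sketch only: the pairs file path is a placeholder, and a
# ByteTextEncoder stands in for a trained SubwordTextEncoder on both sides.
from tensor2tensor.data_generators import text_encoder

char_vocab = text_encoder.ByteTextEncoder()  # byte-level encoder for source and target

# Each line of the file is expected to look like: "source text<TAB>target text".
for example in tabbed_generator("/tmp/parsing_train.pairs",
                                char_vocab, char_vocab,
                                eos=text_encoder.EOS_ID):
  print(example["inputs"][:10], example["targets"][:10])
  break  # inspect just the first pair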
@@ -654,28 +654,6 @@ def parsing_character_generator(tmp_dir, train):
   return character_generator(text_filepath, tags_filepath, character_vocab, EOS)


-def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
-                                   source_vocab_size, target_vocab_size):
-  """Generate source and target data from a single file."""
-  source_vocab = generator_utils.get_or_generate_tabbed_vocab(
-      data_dir, tmp_dir, "parsing_train.pairs", 0,
-      prefix + "_source.vocab.%d" % source_vocab_size, source_vocab_size)
-  target_vocab = generator_utils.get_or_generate_tabbed_vocab(
-      data_dir, tmp_dir, "parsing_train.pairs", 1,
-      prefix + "_target.vocab.%d" % target_vocab_size, target_vocab_size)
-  filename = "parsing_%s" % ("train" if train else "dev")
-  pair_filepath = os.path.join(tmp_dir, filename + ".pairs")
-  return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS)
-
-
-def tabbed_parsing_character_generator(tmp_dir, train):
-  """Generate source and target data from a single file."""
-  character_vocab = text_encoder.ByteTextEncoder()
-  filename = "parsing_%s" % ("train" if train else "dev")
-  pair_filepath = os.path.join(tmp_dir, filename + ".pairs")
-  return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS)
-
-
 def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size)
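The deleted `tabbed_parsing_*` helpers above are thin wrappers that build (or reuse) vocabularies and point `tabbed_generator` at a `parsing_train.pairs` / `parsing_dev.pairs` file in `tmp_dir`. A hedged sketch of how they would be called, with directories and vocabulary sizes as placeholder assumptions:

# Hypothetical call sites; "/tmp/t2t_data" and "/tmp/t2t_tmp" are placeholders.
train_gen = tabbed_parsing_token_generator("/tmp/t2t_data", "/tmp/t2t_tmp",
                                           train=True, prefix="ice",
                                           source_vocab_size=2**13,
                                           target_vocab_size=2**8)
dev_gen = tabbed_parsing_character_generator("/tmp/t2t_tmp", train=False)

for example in train_gen:
  print(len(example["inputs"]), len(example["targets"]))
  break  # look at one encoded pair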
@@ -685,48 +663,3 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
                                      symbolizer_vocab, EOS)


-@registry.register_problem("ice_parsing_tokens")
-class IceParsingTokens(problem.Problem):
-  """Problem spec for parsing tokenized Icelandic text to
-  constituency trees, also tokenized but to a smaller vocabulary."""
-
-  @property
-  def source_vocab_size(self):
-    return 2**13  # 8192
-
-  @property
-  def target_vocab_size(self):
-    return 2**8  # 256
-
-  def feature_encoders(self, data_dir):
-    source_vocab_filename = os.path.join(
-        data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
-    target_vocab_filename = os.path.join(
-        data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size)
-    source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename)
-    target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename)
-    return {
-        "inputs": source_subtokenizer,
-        "targets": target_subtokenizer,
-    }
-
-  def generate_data(self, data_dir, tmp_dir, num_shards=100):
-    generator_utils.generate_dataset_and_shuffle(
-        tabbed_parsing_token_generator(tmp_dir, True, "ice",
-                                       self.source_vocab_size,
-                                       self.target_vocab_size),
-        self.training_filepaths(data_dir, num_shards, shuffled=False),
-        tabbed_parsing_token_generator(tmp_dir, False, "ice",
-                                       self.source_vocab_size,
-                                       self.target_vocab_size),
-        self.dev_filepaths(data_dir, 1, shuffled=False))
-
-  def hparams(self, defaults, unused_model_hparams):
-    p = defaults
-    source_vocab_size = self._encoders["inputs"].vocab_size
-    p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)}
-    p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
-    p.input_space_id = problem.SpaceID.ICE_TOK
-    p.target_space_id = problem.SpaceID.ICE_PARSE_TOK
-    p.loss_multiplier = 2.5  # Rough estimate of avg number of tokens per word
-
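A registered problem such as the removed `IceParsingTokens` is normally resolved by name through the registry and then asked to generate and encode its data. A rough sketch, under the assumption that the problem is still registered and that the placeholder directories exist:

# Sketch only: directory paths are placeholders, not part of this change.
from tensor2tensor.utils import registry

ice_problem = registry.problem("ice_parsing_tokens")  # resolves the registered class
ice_problem.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp", num_shards=10)

# Encoders are backed by the vocab files written alongside the generated data.
encoders = ice_problem.feature_encoders("/tmp/t2t_data")
print(encoders["inputs"].vocab_size, encoders["targets"].vocab_size)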