@@ -75,57 +75,6 @@ def train_generator(self):
7575 """Generator; takes data_dir, tmp_dir, is_training, targeted_vocab_size."""
7676 raise NotImplementedError ()
7777
78-
@registry.register_problem("ice_parsing_tokens")
class IceParsingTokens(problem.Problem):
  """Parsing of tokenized Icelandic text into constituency trees.

  The source side is subword-tokenized Icelandic text (8k vocabulary);
  the target side is a tokenized constituency tree drawn from a much
  smaller (256-entry) vocabulary.
  """

  @property
  def source_vocab_size(self):
    # 2**13 == 8192 subword units for the Icelandic input text.
    return 2 ** 13

  @property
  def target_vocab_size(self):
    # 2**8 == 256 subword units for the tree-token vocabulary.
    return 2 ** 8

  def feature_encoders(self, data_dir):
    """Build subword encoders from the vocab files stored in `data_dir`."""
    src_path = os.path.join(
        data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
    tgt_path = os.path.join(
        data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size)
    return {
        "inputs": text_encoder.SubwordTextEncoder(src_path),
        "targets": text_encoder.SubwordTextEncoder(tgt_path),
    }

  def generate_data(self, data_dir, tmp_dir, num_shards=100):
    """Generate training and dev shards and shuffle them on disk."""
    # Keep the original evaluation order: train generator, train paths,
    # dev generator, dev paths (the generator calls may have side effects
    # such as vocab construction).
    train_gen = tabbed_parsing_token_generator(
        tmp_dir, True, "ice", self.source_vocab_size, self.target_vocab_size)
    train_paths = self.training_filepaths(data_dir, num_shards, shuffled=False)
    dev_gen = tabbed_parsing_token_generator(
        tmp_dir, False, "ice", self.source_vocab_size, self.target_vocab_size)
    dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
    generator_utils.generate_dataset_and_shuffle(
        train_gen, train_paths, dev_gen, dev_paths)

  def hparams(self, defaults, unused_model_hparams):
    """Configure modalities, space ids and loss weighting on `defaults`."""
    hp = defaults
    in_vocab_size = self._encoders["inputs"].vocab_size
    hp.input_modality = {
        "inputs": (registry.Modalities.SYMBOL, in_vocab_size)}
    hp.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
    hp.input_space_id = problem.SpaceID.ICE_TOK
    hp.target_space_id = problem.SpaceID.ICE_PARSE_TOK
    # Rough estimate of avg number of tokens per word.
    hp.loss_multiplier = 2.5
124-
125- @registry .register_problem ("setimes_mken_tokens_32k" )
126- class SETimesMkEnTokens32k (problem .Problem ):
127- """Problem spec for SETimes Mk-En translation."""
128-
12978 @property
13079 def dev_generator (self ):
13180 return self .train_generator
@@ -734,3 +683,50 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
734683 tree_filepath = os .path .join (tmp_dir , filename )
735684 return wsj_parsing .token_generator (tree_filepath , symbolizer_vocab ,
736685 symbolizer_vocab , EOS )
686+
687+
@registry.register_problem("ice_parsing_tokens")
class IceParsingTokens(problem.Problem):
  """Tokenized Icelandic sentences -> tokenized constituency trees.

  Inputs are subword-encoded Icelandic text; targets are constituency
  trees encoded with a much smaller subword vocabulary.
  """

  @property
  def source_vocab_size(self):
    return 2 ** 13  # 8192 subwords for the source text.

  @property
  def target_vocab_size(self):
    return 2 ** 8  # 256 subwords suffice for the tree tokens.

  def feature_encoders(self, data_dir):
    """Return subword text encoders keyed by feature name."""
    encoders = {}
    for feature, vocab_filename in [
        ("inputs", "ice_source.tokens.vocab.%d" % self.source_vocab_size),
        ("targets", "ice_target.tokens.vocab.%d" % self.target_vocab_size)]:
      encoders[feature] = text_encoder.SubwordTextEncoder(
          os.path.join(data_dir, vocab_filename))
    return encoders

  def generate_data(self, data_dir, tmp_dir, num_shards=100):
    """Build train/dev examples, then shard and shuffle them on disk."""
    def examples(is_training):
      # Source/target vocab sizes are fixed by the properties above.
      return tabbed_parsing_token_generator(
          tmp_dir, is_training, "ice",
          self.source_vocab_size, self.target_vocab_size)

    generator_utils.generate_dataset_and_shuffle(
        examples(True),
        self.training_filepaths(data_dir, num_shards, shuffled=False),
        examples(False),
        self.dev_filepaths(data_dir, 1, shuffled=False))

  def hparams(self, defaults, unused_model_hparams):
    """Set modalities, space ids and loss weighting on `defaults` in place."""
    p = defaults
    p.input_modality = {
        "inputs": (registry.Modalities.SYMBOL,
                   self._encoders["inputs"].vocab_size),
    }
    p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
    p.input_space_id = problem.SpaceID.ICE_TOK
    p.target_space_id = problem.SpaceID.ICE_PARSE_TOK
    # Rough estimate of avg number of tokens per word.
    p.loss_multiplier = 2.5
732+
0 commit comments