@@ -81,6 +81,31 @@ def _default_wmt_feature_encoders(data_dir, target_vocab_size):
8181 "targets" : subtokenizer ,
8282 }
8383
84+ @registry .register_problem ("setimes_mken_tokens_32k" )
85+ class SETimesMkEnTokens32k (problem .Problem ):
86+ """Problem spec for SETimes Mk-En translation."""
87+
88+ @property
89+ def target_vocab_size (self ):
90+ return 2 ** 15 # 32768
91+
92+ def feature_encoders (self , data_dir ):
93+ return _default_wmt_feature_encoders (data_dir , self .target_vocab_size )
94+
95+ def generate_data (self , data_dir , tmp_dir ):
96+ generator_utils .generate_dataset_and_shuffle (
97+ mken_wordpiece_token_generator (tmp_dir , True , self .target_vocab_size ),
98+ self .training_filepaths (data_dir , 100 , shuffled = False ),
99+ mken_wordpiece_token_generator (tmp_dir , False , self .target_vocab_size ),
100+ self .dev_filepaths (data_dir , 1 , shuffled = False ))
101+
102+ def hparams (self , defaults , unused_model_hparams ):
103+ p = defaults
104+ vocab_size = self ._encoders ["inputs" ].vocab_size
105+ p .input_modality = {"inputs" : (registry .Modalities .SYMBOL , vocab_size )}
106+ p .target_modality = (registry .Modalities .SYMBOL , vocab_size )
107+ p .input_space_id = problem .SpaceID .MK_TOK
108+ p .target_space_id = problem .SpaceID .EN_TOK
84109
85110# End-of-sentence marker.
86111EOS = text_encoder .EOS_TOKEN
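
Note: once this module is imported, the decorator above makes the problem retrievable by its snake_case name, which is also the name the t2t-datagen/t2t-trainer entry points take. A minimal sketch of the lookup, assuming the `registry.problem` accessor of this era's `tensor2tensor.utils.registry`:

    from tensor2tensor.utils import registry

    # Illustrative only; "setimes_mken_tokens_32k" is the name registered above.
    prob = registry.problem("setimes_mken_tokens_32k")
    print(type(prob).__name__)  # SETimesMkEnTokens32k
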
@@ -295,6 +320,21 @@ def ende_bpe_token_generator(tmp_dir, train):
295320 ("dev/newsdev2017-zhen-src.zh" , "dev/newsdev2017-zhen-ref.en" )
296321]]
297322
+# For Macedonian-English, the SETimes corpus
+# from http://nlp.ffzg.hr/resources/corpora/setimes/ is used.
+# The original dataset has 207,777 parallel sentences.
+# The first 205,777 sentences are used for training.
+_MKEN_TRAIN_DATASETS = [[
+    "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",  # pylint: disable=line-too-long
+    ("train.mk", "train.en")
+]]
+
+# For development, 1,000 parallel sentences are used.
+_MKEN_TEST_DATASETS = [[
+    "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.dev.tgz",  # pylint: disable=line-too-long
+    ("dev.mk", "dev.en")
+]]
+
 
 def _compile_data(tmp_dir, datasets, filename):
   """Concatenate all `datasets` and save to `filename`."""
@@ -393,6 +433,19 @@ def enfr_character_generator(tmp_dir, train):
   return character_generator(data_path + ".lang1", data_path + ".lang2",
                              character_vocab, EOS)
 
+def mken_wordpiece_token_generator(tmp_dir, train, vocab_size):
+  """Wordpiece generator for the SETimes Mk-En dataset."""
+  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
+  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
+  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size,
+      source_datasets + target_datasets)
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag)
+  return token_generator(data_path + ".lang1", data_path + ".lang2",
+                         symbolizer_vocab, EOS)
+
 
 def parsing_character_generator(tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
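
As a quick sanity check of the new generator (illustrative; the output shape is assumed from the neighboring `token_generator` in wmt.py, which yields one dict of token ids per sentence pair with a trailing EOS id):

    tmp_dir = "/tmp/t2t_datagen"  # hypothetical scratch directory for downloads
    gen = mken_wordpiece_token_generator(tmp_dir, train=True, vocab_size=2**15)
    example = next(gen)
    # Expected shape: {"inputs": [..., EOS], "targets": [..., EOS]}
    print(sorted(example))  # ['inputs', 'targets']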