
Commit 1368b00

Merge pull request #158 from stefan-it/mken-problem
Add new problem: Macedonian to English (SETimes corpus)
2 parents: 0c66117 + 48997b5

3 files changed: +60 −0 lines changed


tensor2tensor/data_generators/generator_utils.py

Lines changed: 5 additions & 0 deletions
@@ -244,6 +244,11 @@ def gunzip_file(gz_path, new_path):
         "http://www.statmt.org/wmt13/training-parallel-un.tgz",
         ["un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr"]
     ],
+    # Macedonian-English
+    [
+        "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",  # pylint: disable=line-too-long
+        ["train.mk", "train.en"]
+    ],
 ]
tensor2tensor/data_generators/problem.py

Lines changed: 2 additions & 0 deletions
@@ -67,6 +67,8 @@ class SpaceID(object):
   ICE_TOK = 18
   # Icelandic parse tokens
   ICE_PARSE_TOK = 19
+  # Macedonian tokens
+  MK_TOK = 20


 class Problem(object):

tensor2tensor/data_generators/wmt.py

Lines changed: 53 additions & 0 deletions
@@ -81,6 +81,31 @@ def _default_wmt_feature_encoders(data_dir, target_vocab_size):
       "targets": subtokenizer,
   }

+@registry.register_problem("setimes_mken_tokens_32k")
+class SETimesMkEnTokens32k(problem.Problem):
+  """Problem spec for SETimes Mk-En translation."""
+
+  @property
+  def target_vocab_size(self):
+    return 2**15  # 32768
+
+  def feature_encoders(self, data_dir):
+    return _default_wmt_feature_encoders(data_dir, self.target_vocab_size)
+
+  def generate_data(self, data_dir, tmp_dir):
+    generator_utils.generate_dataset_and_shuffle(
+        mken_wordpiece_token_generator(tmp_dir, True, self.target_vocab_size),
+        self.training_filepaths(data_dir, 100, shuffled=False),
+        mken_wordpiece_token_generator(tmp_dir, False, self.target_vocab_size),
+        self.dev_filepaths(data_dir, 1, shuffled=False))
+
+  def hparams(self, defaults, unused_model_hparams):
+    p = defaults
+    vocab_size = self._encoders["inputs"].vocab_size
+    p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)}
+    p.target_modality = (registry.Modalities.SYMBOL, vocab_size)
+    p.input_space_id = problem.SpaceID.MK_TOK
+    p.target_space_id = problem.SpaceID.EN_TOK

 # End-of-sentence marker.
 EOS = text_encoder.EOS_TOKEN
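
For context, a minimal sketch (not part of this commit) of how the newly registered problem could be exercised from Python. The directory paths are placeholders, and it assumes `registry.problem()` is the lookup counterpart of the `@registry.register_problem` decorator used above.

# Hedged sketch: look up the registered problem and generate its data files.
# Paths are placeholders; importing wmt is what triggers the registration.
from tensor2tensor.data_generators import wmt  # noqa: F401
from tensor2tensor.utils import registry

mken_problem = registry.problem("setimes_mken_tokens_32k")
mken_problem.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp")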
@@ -295,6 +320,21 @@ def ende_bpe_token_generator(tmp_dir, train):
     ("dev/newsdev2017-zhen-src.zh", "dev/newsdev2017-zhen-ref.en")
 ]]

+# For Macedonian-English the SETimes corpus
+# from http://nlp.ffzg.hr/resources/corpora/setimes/ is used.
+# The original dataset has 207,777 parallel sentences.
+# For training the first 205,777 sentences are used.
+_MKEN_TRAIN_DATASETS = [[
+    "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",  # pylint: disable=line-too-long
+    ("train.mk", "train.en")
+]]
+
+# For development 1000 parallel sentences are used.
+_MKEN_TEST_DATASETS = [[
+    "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.dev.tgz",  # pylint: disable=line-too-long
+    ("dev.mk", "dev.en")
+]]
+

 def _compile_data(tmp_dir, datasets, filename):
   """Concatenate all `datasets` and save to `filename`."""
@@ -393,6 +433,19 @@ def enfr_character_generator(tmp_dir, train):
   return character_generator(data_path + ".lang1", data_path + ".lang2",
                              character_vocab, EOS)

+def mken_wordpiece_token_generator(tmp_dir, train, vocab_size):
+  """Wordpiece generator for the SETimes Mk-En dataset."""
+  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
+  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
+  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
+  symbolizer_vocab = generator_utils.get_or_generate_vocab(
+      tmp_dir, "tokens.vocab.%d" % vocab_size, vocab_size,
+      source_datasets + target_datasets)
+  tag = "train" if train else "dev"
+  data_path = _compile_data(tmp_dir, datasets, "setimes_mken_tok_%s" % tag)
+  return token_generator(data_path + ".lang1", data_path + ".lang2",
+                         symbolizer_vocab, EOS)
+

 def parsing_character_generator(tmp_dir, train):
   character_vocab = text_encoder.ByteTextEncoder()
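
As a rough illustration (also not part of the commit), the two list comprehensions in `mken_wordpiece_token_generator` split each `(url, (source_file, target_file))` entry into single-file specs so that `get_or_generate_vocab` can build one shared wordpiece vocabulary over both languages:

# Hedged illustration of the source/target split, applied to the training
# entry defined above; the expected results appear in the trailing comments.
datasets = [[
    "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",
    ("train.mk", "train.en")
]]
source_datasets = [[item[0], [item[1][0]]] for item in datasets]
target_datasets = [[item[0], [item[1][1]]] for item in datasets]
# source_datasets -> [[".../setimes.mk-en.train.tgz", ["train.mk"]]]
# target_datasets -> [[".../setimes.mk-en.train.tgz", ["train.en"]]]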
