|
33 | 33 | # End-of-sentence marker. |
34 | 34 | EOS = text_encoder.EOS_ID |
35 | 35 |
|
36 | | -# For Macedonian-English the SETimes corpus |
| 36 | +# For English-Macedonian the SETimes corpus |
37 | 37 | # from http://nlp.ffzg.hr/resources/corpora/setimes/ is used. |
38 | 38 | # The original dataset has 207,777 parallel sentences. |
39 | 39 | # For training the first 205,777 sentences are used. |
40 | | -_MKEN_TRAIN_DATASETS = [[ |
| 40 | +_ENMK_TRAIN_DATASETS = [[ |
41 | 41 | "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz", # pylint: disable=line-too-long |
42 | | - ("train.mk", "train.en") |
| 42 | + ("train.en", "train.mk") |
43 | 43 | ]] |
44 | 44 |
|
45 | 45 | # For development 1000 parallel sentences are used. |
46 | | -_MKEN_TEST_DATASETS = [[ |
| 46 | +_ENMK_TEST_DATASETS = [[ |
47 | 47 | "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.dev.tgz", # pylint: disable=line-too-long |
48 | | - ("dev.mk", "dev.en") |
| 48 | + ("dev.en", "dev.mk") |
49 | 49 | ]] |
50 | 50 |
|
51 | 51 |
|
52 | 52 | @registry.register_problem |
53 | 53 | class TranslateEnmkSetimes32k(translate.TranslateProblem): |
54 | | - """Problem spec for SETimes Mk-En translation.""" |
| 54 | + """Problem spec for SETimes En-Mk translation.""" |
55 | 55 |
|
56 | 56 | @property |
57 | 57 | def approx_vocab_size(self): |
58 | 58 | return 2**15 # 32768 |
59 | 59 |
|
60 | 60 | @property |
61 | 61 | def vocab_filename(self): |
62 | | - return "vocab.mken.%d" % self.approx_vocab_size |
| 62 | + return "vocab.enmk.%d" % self.approx_vocab_size |
63 | 63 |
|
64 | 64 | def source_data_files(self, dataset_split): |
65 | 65 | train = dataset_split == problem.DatasetSplit.TRAIN |
66 | | - return _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS |
| 66 | + return _ENMK_TRAIN_DATASETS if train else _ENMK_TEST_DATASETS |
0 commit comments