
Commit d69ad4d

Moved Icelandic parsing to separate module
1 parent fa92cbe commit d69ad4d

4 files changed: +159 additions, −120 deletions

tensor2tensor/data_generators/wmt.py

Lines changed: 30 additions & 97 deletions
@@ -162,36 +162,6 @@ def character_generator(source_path, target_path, character_vocab, eos=None):
         source, target = source_file.readline(), target_file.readline()
 
 
-def tabbed_generator(source_path, source_vocab, target_vocab, eos=None):
-  r"""Generator for sequence-to-sequence tasks using tabbed files.
-
-  Tokens are derived from text files where each line contains both
-  a source and a target string. The two strings are separated by a tab
-  character ('\t'). It yields dictionaries of "inputs" and "targets" where
-  inputs are characters from the source lines converted to integers, and
-  targets are characters from the target lines, also converted to integers.
-
-  Args:
-    source_path: path to the file with source and target sentences.
-    source_vocab: a SubwordTextEncoder to encode the source string.
-    target_vocab: a SubwordTextEncoder to encode the target string.
-    eos: integer to append at the end of each sequence (default: None).
-
-  Yields:
-    A dictionary {"inputs": source-line, "targets": target-line} where
-    the lines are integer lists converted from characters in the file lines.
-  """
-  eos_list = [] if eos is None else [eos]
-  with tf.gfile.GFile(source_path, mode="r") as source_file:
-    for line in source_file:
-      if line and "\t" in line:
-        parts = line.split("\t", maxsplit=1)
-        source, target = parts[0].strip(), parts[1].strip()
-        source_ints = source_vocab.encode(source) + eos_list
-        target_ints = target_vocab.encode(target) + eos_list
-        yield {"inputs": source_ints, "targets": target_ints}
-
-
 def token_generator(source_path, target_path, token_vocab, eos=None):
   """Generator for sequence-to-sequence tasks that uses tokens.
@@ -255,6 +225,36 @@ def bi_vocabs_token_generator(source_path,
         source, target = source_file.readline(), target_file.readline()
 
 
+def tabbed_generator(source_path, source_vocab, target_vocab, eos=None):
+  r"""Generator for sequence-to-sequence tasks using tabbed files.
+
+  Tokens are derived from text files where each line contains both
+  a source and a target string. The two strings are separated by a tab
+  character ('\t'). It yields dictionaries of "inputs" and "targets" where
+  inputs are characters from the source lines converted to integers, and
+  targets are characters from the target lines, also converted to integers.
+
+  Args:
+    source_path: path to the file with source and target sentences.
+    source_vocab: a SubwordTextEncoder to encode the source string.
+    target_vocab: a SubwordTextEncoder to encode the target string.
+    eos: integer to append at the end of each sequence (default: None).
+
+  Yields:
+    A dictionary {"inputs": source-line, "targets": target-line} where
+    the lines are integer lists converted from characters in the file lines.
+  """
+  eos_list = [] if eos is None else [eos]
+  with tf.gfile.GFile(source_path, mode="r") as source_file:
+    for line in source_file:
+      if line and "\t" in line:
+        parts = line.split("\t", maxsplit=1)
+        source, target = parts[0].strip(), parts[1].strip()
+        source_ints = source_vocab.encode(source) + eos_list
+        target_ints = target_vocab.encode(target) + eos_list
+        yield {"inputs": source_ints, "targets": target_ints}
+
+
 # Data-set URLs.
@@ -654,28 +654,6 @@ def parsing_character_generator(tmp_dir, train):
   return character_generator(text_filepath, tags_filepath, character_vocab, EOS)
 
 
-def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
-                                   source_vocab_size, target_vocab_size):
-  """Generate source and target data from a single file."""
-  source_vocab = generator_utils.get_or_generate_tabbed_vocab(
-      data_dir, tmp_dir, "parsing_train.pairs", 0,
-      prefix + "_source.vocab.%d" % source_vocab_size, source_vocab_size)
-  target_vocab = generator_utils.get_or_generate_tabbed_vocab(
-      data_dir, tmp_dir, "parsing_train.pairs", 1,
-      prefix + "_target.vocab.%d" % target_vocab_size, target_vocab_size)
-  filename = "parsing_%s" % ("train" if train else "dev")
-  pair_filepath = os.path.join(tmp_dir, filename + ".pairs")
-  return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS)
-
-
-def tabbed_parsing_character_generator(tmp_dir, train):
-  """Generate source and target data from a single file."""
-  character_vocab = text_encoder.ByteTextEncoder()
-  filename = "parsing_%s" % ("train" if train else "dev")
-  pair_filepath = os.path.join(tmp_dir, filename + ".pairs")
-  return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS)
-
-
 def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, "vocab.endefr.%d" % vocab_size, vocab_size)
@@ -685,48 +663,3 @@ def parsing_token_generator(data_dir, tmp_dir, train, vocab_size):
                          symbolizer_vocab, EOS)
 
 
-@registry.register_problem("ice_parsing_tokens")
-class IceParsingTokens(problem.Problem):
-  """Problem spec for parsing tokenized Icelandic text to
-  constituency trees, also tokenized but to a smaller vocabulary."""
-
-  @property
-  def source_vocab_size(self):
-    return 2**13  # 8192
-
-  @property
-  def target_vocab_size(self):
-    return 2**8  # 256
-
-  def feature_encoders(self, data_dir):
-    source_vocab_filename = os.path.join(
-        data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
-    target_vocab_filename = os.path.join(
-        data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size)
-    source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename)
-    target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename)
-    return {
-        "inputs": source_subtokenizer,
-        "targets": target_subtokenizer,
-    }
-
-  def generate_data(self, data_dir, tmp_dir, num_shards=100):
-    generator_utils.generate_dataset_and_shuffle(
-        tabbed_parsing_token_generator(tmp_dir, True, "ice",
-                                       self.source_vocab_size,
-                                       self.target_vocab_size),
-        self.training_filepaths(data_dir, num_shards, shuffled=False),
-        tabbed_parsing_token_generator(tmp_dir, False, "ice",
-                                       self.source_vocab_size,
-                                       self.target_vocab_size),
-        self.dev_filepaths(data_dir, 1, shuffled=False))
-
-  def hparams(self, defaults, unused_model_hparams):
-    p = defaults
-    source_vocab_size = self._encoders["inputs"].vocab_size
-    p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)}
-    p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
-    p.input_space_id = problem.SpaceID.ICE_TOK
-    p.target_space_id = problem.SpaceID.ICE_PARSE_TOK
-    p.loss_multiplier = 2.5  # Rough estimate of avg number of tokens per word
-
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+
+from .ice_parsing import IceParsingTokens, transformer_parsing_ice, transformer_parsing_ice_big
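
The two added lines above re-export IceParsingTokens and the two hparams sets. In Tensor2Tensor, problems and hyperparameter sets are looked up by name in a central registry, and the @registry.register_* decorators in the new ice_parsing module only execute once that module has been imported. A minimal sketch of resolving the newly registered names follows; it is not part of the commit, it assumes the import above has already run, and exact return conventions vary between T2T versions:

# Editor's sketch, not part of the commit: once ice_parsing has been imported
# so that its @registry.register_* decorators have run, the new problem and
# hparams set can be resolved by name from the central registry.
from tensor2tensor.utils import registry

ice_problem = registry.problem("ice_parsing_tokens")        # problem spec registered below
ice_hparams = registry.hparams("transformer_parsing_ice")   # hparams set registered below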
Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This module implements the ice_parsing_* problems, which
+# parse plain text into flattened parse trees and POS tags.
+# The training data is stored in files named `parsing_train.pairs`
+# and `parsing_dev.pairs`. These files are UTF-8 text files where
+# each line contains an input sentence and a target parse tree,
+# separated by a tab character.
+
+import os
+
+# Dependency imports
+
+from tensor2tensor.data_generators import generator_utils
+from tensor2tensor.data_generators import problem
+from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.data_generators.wmt import tabbed_generator
+from tensor2tensor.utils import registry
+from tensor2tensor.models import transformer
+
+import tensorflow as tf
+
+
+# End-of-sentence marker.
+EOS = text_encoder.EOS_ID
+
+
+def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
+                                   source_vocab_size, target_vocab_size):
+  """Generate source and target data from a single file."""
+  filename = "parsing_{0}.pairs".format("train" if train else "dev")
+  source_vocab = generator_utils.get_or_generate_tabbed_vocab(
+      data_dir, tmp_dir, filename, 0,
+      prefix + "_source.tokens.vocab.%d" % source_vocab_size, source_vocab_size)
+  target_vocab = generator_utils.get_or_generate_tabbed_vocab(
+      data_dir, tmp_dir, filename, 1,
+      prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size)
+  pair_filepath = os.path.join(tmp_dir, filename)
+  return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS)
+
+
+def tabbed_parsing_character_generator(tmp_dir, train):
+  """Generate source and target data from a single file."""
+  character_vocab = text_encoder.ByteTextEncoder()
+  filename = "parsing_{0}.pairs".format("train" if train else "dev")
+  pair_filepath = os.path.join(tmp_dir, filename)
+  return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS)
+
+
+@registry.register_problem("ice_parsing_tokens")
+class IceParsingTokens(problem.Problem):
+  """Problem spec for parsing tokenized Icelandic text to
+  constituency trees, also tokenized but to a smaller vocabulary."""
+
+  @property
+  def source_vocab_size(self):
+    return 2**13  # 8192
+
+  @property
+  def target_vocab_size(self):
+    return 2**8  # 256
+
+  def feature_encoders(self, data_dir):
+    source_vocab_filename = os.path.join(
+        data_dir, "ice_source.tokens.vocab.%d" % self.source_vocab_size)
+    target_vocab_filename = os.path.join(
+        data_dir, "ice_target.tokens.vocab.%d" % self.target_vocab_size)
+    source_subtokenizer = text_encoder.SubwordTextEncoder(source_vocab_filename)
+    target_subtokenizer = text_encoder.SubwordTextEncoder(target_vocab_filename)
+    return {
+        "inputs": source_subtokenizer,
+        "targets": target_subtokenizer,
+    }
+
+  def generate_data(self, data_dir, tmp_dir, num_shards=100):
+    generator_utils.generate_dataset_and_shuffle(
+        tabbed_parsing_token_generator(data_dir, tmp_dir, True, "ice",
+                                       self.source_vocab_size,
+                                       self.target_vocab_size),
+        self.training_filepaths(data_dir, num_shards, shuffled=False),
+        tabbed_parsing_token_generator(data_dir, tmp_dir, False, "ice",
+                                       self.source_vocab_size,
+                                       self.target_vocab_size),
+        self.dev_filepaths(data_dir, 1, shuffled=False))
+
+  def hparams(self, defaults, unused_model_hparams):
+    p = defaults
+    source_vocab_size = self._encoders["inputs"].vocab_size
+    p.input_modality = {"inputs": (registry.Modalities.SYMBOL, source_vocab_size)}
+    p.target_modality = (registry.Modalities.SYMBOL, self.target_vocab_size)
+    p.input_space_id = problem.SpaceID.ICE_TOK
+    p.target_space_id = problem.SpaceID.ICE_PARSE_TOK
+    p.loss_multiplier = 2.5  # Rough estimate of avg number of tokens per word
+
+
+@registry.register_hparams
+def transformer_parsing_ice():
+  """Hparams for parsing Icelandic text."""
+  hparams = transformer.transformer_base_single_gpu()
+  hparams.batch_size = 4096
+  hparams.shared_embedding_and_softmax_weights = int(False)
+  return hparams
+
+
+@registry.register_hparams
+def transformer_parsing_ice_big():
+  """Hparams for parsing Icelandic text, bigger model."""
+  hparams = transformer_parsing_ice()
+  hparams.batch_size = 2048  # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU
+  hparams.attention_dropout = 0.05
+  hparams.residual_dropout = 0.05
+  hparams.max_length = 512
+  hparams.hidden_size = 1024
+  return hparams
+
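
As the module comment above notes, the ice_parsing_* problems read UTF-8 parsing_train.pairs / parsing_dev.pairs files in which each line holds an input sentence and a flattened parse tree separated by a tab. A small, hedged illustration of that format and of how tabbed_generator consumes it follows; the file path, the Icelandic sentence, and the bracketed parse are invented for the example, and ByteTextEncoder stands in for the trained subword vocabularies:

# Editor's sketch, not part of the commit: write and read one tab-separated pair.
import tensorflow as tf

from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators.wmt import tabbed_generator

pairs_path = "/tmp/parsing_dev.pairs"  # assumed location, one tab-separated example per line
with tf.gfile.GFile(pairs_path, mode="w") as f:
  # Invented example: sentence on the left, flattened parse tree on the right.
  f.write("Hundurinn eltir köttinn\t(S (NP Hundurinn) (VP eltir (NP köttinn)))\n")

vocab = text_encoder.ByteTextEncoder()  # character-level, as in tabbed_parsing_character_generator
for example in tabbed_generator(pairs_path, vocab, vocab, text_encoder.EOS_ID):
  print(len(example["inputs"]), len(example["targets"]))  # integer id lists with EOS appended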

tensor2tensor/models/transformer.py

Lines changed: 0 additions & 23 deletions
@@ -357,29 +357,6 @@ def transformer_parsing_big():
   return hparams
 
 
-@registry.register_hparams
-def transformer_parsing_ice():
-  """Hparams for parsing Icelandic text."""
-  hparams = transformer_base_single_gpu()
-  hparams.batch_size = 4096
-  hparams.shared_embedding_and_softmax_weights = int(False)
-  return hparams
-
-
-@registry.register_hparams
-def transformer_parsing_ice_big():
-  """Hparams for parsing Icelandic text, bigger model."""
-  hparams = transformer_parsing_ice()
-  hparams.batch_size = 2048  # 4096 gives Out-of-memory on 8 GB 1080 GTX GPU
-  hparams.attention_dropout = 0.2
-  hparams.residual_dropout = 0.2
-  hparams.max_length = 512
-  hparams.learning_rate_warmup_steps = 16000
-  hparams.hidden_size = 1024
-  hparams.learning_rate = 0.05
-  return hparams
-
-
 @registry.register_hparams
 def transformer_tiny():
   hparams = transformer_base()
