@@ -264,41 +264,6 @@ def gunzip_file(gz_path, new_path):
         new_file.write(line)
 
 
-# TODO(aidangomez): en-fr tasks are significantly over-represented below
-_DATA_FILE_URLS = [
-    # German-English
-    [
-        "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz",  # pylint: disable=line-too-long
-        [
-            "training-parallel-nc-v11/news-commentary-v11.de-en.en",
-            "training-parallel-nc-v11/news-commentary-v11.de-en.de"
-        ]
-    ],
-    # German-English & French-English
-    [
-        "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz", [
-            "commoncrawl.de-en.en", "commoncrawl.de-en.de",
-            "commoncrawl.fr-en.en", "commoncrawl.fr-en.fr"
-        ]
-    ],
-    [
-        "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", [
-            "training/europarl-v7.de-en.en", "training/europarl-v7.de-en.de",
-            "training/europarl-v7.fr-en.en", "training/europarl-v7.fr-en.fr"
-        ]
-    ],
-    # French-English
-    [
-        "http://www.statmt.org/wmt10/training-giga-fren.tar",
-        ["giga-fren.release2.fixed.en.gz", "giga-fren.release2.fixed.fr.gz"]
-    ],
-    [
-        "http://www.statmt.org/wmt13/training-parallel-un.tgz",
-        ["un/undoc.2000.fr-en.en", "un/undoc.2000.fr-en.fr"]
-    ],
-]
-
-
 def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                 generator):
   """Inner implementation for vocab generators.
@@ -337,13 +302,9 @@ def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
   return vocab
 
 
-def get_or_generate_vocab(data_dir,
-                          tmp_dir,
-                          vocab_filename,
-                          vocab_size,
-                          sources=None):
-  """Generate a vocabulary from the datasets in sources (_DATA_FILE_URLS)."""
-  sources = sources or _DATA_FILE_URLS
+def get_or_generate_vocab(data_dir, tmp_dir, vocab_filename, vocab_size,
+                          sources):
+  """Generate a vocabulary from the datasets in sources."""
 
   def generate():
     tf.logging.info("Generating vocab from: %s", str(sources))
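With this change callers must pass sources explicitly; each entry keeps the [url, [filenames]] shape the removed _DATA_FILE_URLS constant used. A minimal sketch of the new call, reusing one Europarl entry from the deleted constant (the directory paths, vocab filename, and vocab size here are hypothetical):

    sources = [
        ["http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
         ["training/europarl-v7.de-en.en", "training/europarl-v7.de-en.de"]],
    ]
    vocab = get_or_generate_vocab("/tmp/t2t_data", "/tmp/t2t_tmp",
                                  "vocab.ende.32768", 32768, sources)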
@@ -375,13 +336,19 @@ def generate():
 
       # Use Tokenizer to count the word occurrences.
       with tf.gfile.GFile(filepath, mode="r") as source_file:
-        file_byte_budget = 3.5e5 if filepath.endswith("en") else 7e5
+        file_byte_budget = 1e6
+        counter = 0
+        countermax = int(source_file.size() / file_byte_budget / 2)
         for line in source_file:
-          if file_byte_budget <= 0:
-            break
-          line = line.strip()
-          file_byte_budget -= len(line)
-          yield line
+          if counter < countermax:
+            counter += 1
+          else:
+            if file_byte_budget <= 0:
+              break
+            line = line.strip()
+            file_byte_budget -= len(line)
+            counter = 0
+            yield line
 
   return get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size,
                                      generate())
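The old logic consumed only the first 350 KB (English files) or 700 KB (other languages) of each file, so the vocabulary only ever saw the head of each corpus. The new logic spreads a 1 MB budget across the whole file: countermax estimates how many lines to skip between samples, and every (countermax + 1)-th line is yielded until the budget is spent. A minimal standalone sketch of the same scheme, assuming plain Python file I/O in place of tf.gfile (sample_lines is an illustrative helper, not part of the patch):

    import os

    def sample_lines(filepath, file_byte_budget=1e6):
      """Yields lines spaced evenly through the file until the budget runs out."""
      countermax = int(os.path.getsize(filepath) / file_byte_budget / 2)
      counter = 0
      with open(filepath) as source_file:
        for line in source_file:
          if counter < countermax:
            counter += 1  # still skipping lines between samples
          else:
            if file_byte_budget <= 0:
              return
            line = line.strip()
            file_byte_budget -= len(line)
            counter = 0
            yield line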