Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 9a1f888

Browse files
authored
Merge pull request #139 from deasuke/unicode_fix
replace unicode in python3
2 parents 006ecb5 + 89ddffe commit 9a1f888

File tree

2 files changed

+3
-2
lines changed

2 files changed

+3
-2
lines changed

tensor2tensor/data_generators/text_encoder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,7 +473,7 @@ def _escape_token(self, token):
     Returns:
       escaped_token: a unicode string
     """
-    assert isinstance(token, unicode)
+    assert isinstance(token, six.text_type)
     token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + u"_"
     ret = u""
     for c in token:

tensor2tensor/data_generators/wiki.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
 # Dependency imports
 
 import six
+from six import PY2
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators import tokenizer
@@ -60,7 +61,7 @@ def page_generator(tmp_dir, max_docs=None):
   count = 0
   corpus_filepath = _maybe_download_corpus(tmp_dir)
   for line in bz2.BZ2File(corpus_filepath, "r"):
-    line = unicode(line, "utf-8")
+    line = unicode(line, "utf-8") if PY2 else line.decode("utf-8")
     if not doc and line != u" <page>\n":
       continue
     doc += line

0 commit comments

Comments
 (0)