@@ -103,9 +103,6 @@ class WordTokenizer(object):
 
     Some issues:
 
-    >>> WordTokenizer().segment_words("Phone:855-349-1914")
-    [TextToken(chars='Phone:855-349-1914', position=0, length=18)]
-
     >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
     [TextToken(chars='Copyright', position=0, length=9),
      TextToken(chars=u'\xa9', position=10, length=1),
@@ -120,21 +117,6 @@ class WordTokenizer(object):
      TextToken(chars='Reserved', position=51, length=8),
      TextToken(chars='.', position=59, length=1)]
 
-    >>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077")
-    [TextToken(chars='Powai', position=0, length=5),
-     TextToken(chars='Campus', position=6, length=6),
-     TextToken(chars=',', position=12, length=1),
-     TextToken(chars='Mumbai-400077', position=14, length=13)]
-
-    >>> WordTokenizer().segment_words("1 5858/ 1800")
-    [TextToken(chars='1', position=0, length=1),
-     TextToken(chars='5858/', position=2, length=5),
-     TextToken(chars='1800', position=8, length=4)]
-
-    >>> WordTokenizer().segment_words("Saudi Arabia-")
-    [TextToken(chars='Saudi', position=0, length=5),
-     TextToken(chars='Arabia-', position=6, length=7)]
-
     """
 
     # regex, token
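
To verify that the doctests kept by this change still pass, the module can be run through `doctest`. A minimal sketch, assuming the tokenizer lives in `webstruct.text_tokenizers` (the actual module path is not shown in this hunk):

    # Minimal sketch: re-run the docstring examples that remain after this
    # change. The module path below is an assumption; point it at wherever
    # WordTokenizer is actually defined in this repository.
    import doctest

    import webstruct.text_tokenizers as tokenizers_module  # assumed path

    # testmod() collects and runs every doctest in the module, including
    # the WordTokenizer examples kept above; failed=0 means they all pass.
    print(doctest.testmod(tokenizers_module))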