@@ -100,6 +100,41 @@ class WordTokenizer(object):
     >>> WordTokenizer().segment_words('" a')
     [TextToken(chars='``', position=0, length=1),
      TextToken(chars='a', position=2, length=1)]
+
+    Some issues:
+
+    >>> WordTokenizer().segment_words("Phone:855-349-1914")
+    [TextToken(chars='Phone:855-349-1914', position=0, length=18)]
+
+    >>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
+    [TextToken(chars='Copyright', position=0, length=9),
+     TextToken(chars=u'\xa9', position=10, length=1),
+     TextToken(chars='2014', position=12, length=4),
+     TextToken(chars='Foo', position=17, length=3),
+     TextToken(chars='Bar', position=21, length=3),
+     TextToken(chars='and', position=25, length=3),
+     TextToken(chars='Buzz', position=29, length=4),
+     TextToken(chars='Spam.', position=34, length=5),
+     TextToken(chars='All', position=40, length=3),
+     TextToken(chars='Rights', position=44, length=6),
+     TextToken(chars='Reserved', position=51, length=8),
+     TextToken(chars='.', position=59, length=1)]
+
+    >>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077")
+    [TextToken(chars='Powai', position=0, length=5),
+     TextToken(chars='Campus', position=6, length=6),
+     TextToken(chars=',', position=12, length=1),
+     TextToken(chars='Mumbai-400077', position=14, length=13)]
+
+    >>> WordTokenizer().segment_words("1 5858/ 1800")
+    [TextToken(chars='1', position=0, length=1),
+     TextToken(chars='5858/', position=2, length=5),
+     TextToken(chars='1800', position=8, length=4)]
+
+    >>> WordTokenizer().segment_words("Saudi Arabia-")
+    [TextToken(chars='Saudi', position=0, length=5),
+     TextToken(chars='Arabia-', position=6, length=7)]
+
     """

     # regex, token
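The doctests added above show that trailing punctuation and hyphens stay attached to the preceding word ('Spam.', 'Arabia-'). If a caller needs them split off, one option is to post-process the returned tokens. The sketch below is a hypothetical illustration, not part of this patch: the TextToken namedtuple is a stand-in that only mirrors the chars/position/length fields visible in the reprs above, and the punctuation set is an assumption.

    import re
    from collections import namedtuple

    # Stand-in for the TextToken type whose repr appears in the doctests above;
    # the real class in the library may differ.
    TextToken = namedtuple('TextToken', ['chars', 'position', 'length'])

    def split_trailing_punct(tokens, punct=re.compile(r'[.,;:!?-]+$')):
        """Split trailing punctuation off a token (e.g. 'Spam.' or 'Arabia-'),
        keeping character positions consistent with the original text."""
        result = []
        for tok in tokens:
            m = punct.search(tok.chars)
            if m and m.start() > 0:
                head = tok.chars[:m.start()]
                tail = tok.chars[m.start():]
                result.append(TextToken(head, tok.position, len(head)))
                result.append(TextToken(tail, tok.position + m.start(), len(tail)))
            else:
                result.append(tok)
        return result

    # Example: tokens taken from the 'Spam.' and 'Arabia-' cases above.
    print(split_trailing_punct([TextToken('Spam.', 34, 5), TextToken('Arabia-', 6, 7)]))
    # [TextToken(chars='Spam', position=34, length=4),
    #  TextToken(chars='.', position=38, length=1),
    #  TextToken(chars='Arabia', position=6, length=6),
    #  TextToken(chars='-', position=12, length=1)]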