Skip to content

Commit 943a44e

Browse files
Revert "remove brittle tests"
This reverts commit 75a9698.
1 parent 4729323 commit 943a44e

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed

webstruct/text_tokenizers.py

Lines changed: 35 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,41 @@ class WordTokenizer(object):
100100
>>> WordTokenizer().segment_words('" a')
101101
[TextToken(chars='``', position=0, length=1),
102102
TextToken(chars='a', position=2, length=1)]
103+
104+
Some issues:
105+
106+
>>> WordTokenizer().segment_words("Phone:855-349-1914")
107+
[TextToken(chars='Phone:855-349-1914', position=0, length=18)]
108+
109+
>>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
110+
[TextToken(chars='Copyright', position=0, length=9),
111+
TextToken(chars=u'\xa9', position=10, length=1),
112+
TextToken(chars='2014', position=12, length=4),
113+
TextToken(chars='Foo', position=17, length=3),
114+
TextToken(chars='Bar', position=21, length=3),
115+
TextToken(chars='and', position=25, length=3),
116+
TextToken(chars='Buzz', position=29, length=4),
117+
TextToken(chars='Spam.', position=34, length=5),
118+
TextToken(chars='All', position=40, length=3),
119+
TextToken(chars='Rights', position=44, length=6),
120+
TextToken(chars='Reserved', position=51, length=8),
121+
TextToken(chars='.', position=59, length=1)]
122+
123+
>>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077")
124+
[TextToken(chars='Powai', position=0, length=5),
125+
TextToken(chars='Campus', position=6, length=6),
126+
TextToken(chars=',', position=12, length=1),
127+
TextToken(chars='Mumbai-400077', position=14, length=13)]
128+
129+
>>> WordTokenizer().segment_words("1 5858/ 1800")
130+
[TextToken(chars='1', position=0, length=1),
131+
TextToken(chars='5858/', position=2, length=5),
132+
TextToken(chars='1800', position=8, length=4)]
133+
134+
>>> WordTokenizer().segment_words("Saudi Arabia-")
135+
[TextToken(chars='Saudi', position=0, length=5),
136+
TextToken(chars='Arabia-', position=6, length=7)]
137+
103138
"""
104139

105140
# regex, token

0 commit comments

Comments
 (0)