Skip to content

Commit b72bcc1

Browse files
expect behaviour of nltk tokenizer
1 parent ba7d6fe commit b72bcc1

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

webstruct/tests/test_text_tokenizer.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,9 @@ def do_tokenize(self, text, result):
1111
def test_phone(self):
1212
return self.do_tokenize(
1313
"Phone:855-349-1914",
14-
[TextToken(chars='Phone:855-349-1914', position=0, length=18)]
14+
[TextToken(chars='Phone', position=0, length=5)]
15+
[TextToken(chars=':', position=5, length=1)]
16+
[TextToken(chars='855-349-1914', position=6, length=12)]
1517
)
1618

1719
@pytest.mark.xfail
@@ -21,22 +23,26 @@ def test_hyphen_mid(self):
2123
[TextToken(chars='Powai', position=0, length=5),
2224
TextToken(chars='Campus', position=6, length=6),
2325
TextToken(chars=',', position=12, length=1),
24-
TextToken(chars='Mumbai-400077', position=14, length=13)]
26+
TextToken(chars='Mumbai', position=14, length=6),
27+
TextToken(chars='-', position=20, length=1),
28+
TextToken(chars='400077', position=21, length=6)]
2529
)
2630

2731
@pytest.mark.xfail
2832
def test_hyphen_end(self):
2933
return self.do_tokenize(
3034
"Saudi Arabia-",
3135
[TextToken(chars='Saudi', position=0, length=5),
32-
TextToken(chars='Arabia-', position=6, length=7)]
36+
TextToken(chars='Arabia', position=6, length=6),
37+
TextToken(chars='-', position=12, length=1)]
3338
)
3439

3540
@pytest.mark.xfail
3641
def test_hyphen_end(self):
3742
return self.do_tokenize(
3843
"1 5858/ 1800",
3944
[TextToken(chars='1', position=0, length=1),
40-
TextToken(chars='5858/', position=2, length=5),
45+
TextToken(chars='5858', position=2, length=4),
46+
TextToken(chars='/', position=6, length=1),
4147
TextToken(chars='1800', position=8, length=4)]
4248
)

0 commit comments

Comments
 (0)