Skip to content

Commit ba7d6fe

Browse files
move brittle tests to pytest xfail
1 parent 943a44e commit ba7d6fe

File tree

2 files changed

+42
-18
lines changed

2 files changed

+42
-18
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
import unittest

import pytest

from webstruct.text_tokenizers import TextToken, WordTokenizer


class TestTokenizerTest(unittest.TestCase):
    """Known-broken tokenizer cases, kept as expected failures.

    Each test documents a segmentation the current ``WordTokenizer``
    gets wrong; ``@pytest.mark.xfail`` keeps the cases visible in the
    suite without failing the build until the tokenizer is fixed.
    """

    def do_tokenize(self, text, result):
        # Helper: assert segment_words(text) yields exactly `result`.
        self.assertEqual(result, WordTokenizer().segment_words(text))

    @pytest.mark.xfail
    def test_phone(self):
        # A phone number glued to its label should stay one token.
        return self.do_tokenize(
            "Phone:855-349-1914",
            [TextToken(chars='Phone:855-349-1914', position=0, length=18)]
        )

    @pytest.mark.xfail
    def test_hyphen_mid(self):
        # A hyphenated word-number compound should not be split.
        return self.do_tokenize(
            "Powai Campus, Mumbai-400077",
            [TextToken(chars='Powai', position=0, length=5),
             TextToken(chars='Campus', position=6, length=6),
             TextToken(chars=',', position=12, length=1),
             TextToken(chars='Mumbai-400077', position=14, length=13)]
        )

    @pytest.mark.xfail
    def test_hyphen_end(self):
        # A trailing hyphen should remain attached to the word.
        return self.do_tokenize(
            "Saudi Arabia-",
            [TextToken(chars='Saudi', position=0, length=5),
             TextToken(chars='Arabia-', position=6, length=7)]
        )

    @pytest.mark.xfail
    def test_digits_slash(self):
        # FIX: this method was also named ``test_hyphen_end``, which
        # silently shadowed the trailing-hyphen test above so it was
        # never collected or run. Renamed to match what it actually
        # exercises (a digit run with a trailing slash), so both
        # cases are now discovered by the test runner.
        return self.do_tokenize(
            "1 5858/ 1800",
            [TextToken(chars='1', position=0, length=1),
             TextToken(chars='5858/', position=2, length=5),
             TextToken(chars='1800', position=8, length=4)]
        )

webstruct/text_tokenizers.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,6 @@ class WordTokenizer(object):
103103
104104
Some issues:
105105
106-
>>> WordTokenizer().segment_words("Phone:855-349-1914")
107-
[TextToken(chars='Phone:855-349-1914', position=0, length=18)]
108-
109106
>>> WordTokenizer().segment_words("Copyright © 2014 Foo Bar and Buzz Spam. All Rights Reserved.")
110107
[TextToken(chars='Copyright', position=0, length=9),
111108
TextToken(chars=u'\xa9', position=10, length=1),
@@ -120,21 +117,6 @@ class WordTokenizer(object):
120117
TextToken(chars='Reserved', position=51, length=8),
121118
TextToken(chars='.', position=59, length=1)]
122119
123-
>>> WordTokenizer().segment_words("Powai Campus, Mumbai-400077")
124-
[TextToken(chars='Powai', position=0, length=5),
125-
TextToken(chars='Campus', position=6, length=6),
126-
TextToken(chars=',', position=12, length=1),
127-
TextToken(chars='Mumbai-400077', position=14, length=13)]
128-
129-
>>> WordTokenizer().segment_words("1 5858/ 1800")
130-
[TextToken(chars='1', position=0, length=1),
131-
TextToken(chars='5858/', position=2, length=5),
132-
TextToken(chars='1800', position=8, length=4)]
133-
134-
>>> WordTokenizer().segment_words("Saudi Arabia-")
135-
[TextToken(chars='Saudi', position=0, length=5),
136-
TextToken(chars='Arabia-', position=6, length=7)]
137-
138120
"""
139121

140122
# regex, token

0 commit comments

Comments
 (0)