Skip to content

Commit 217efeb

Browse files
authored
Merge pull request #27 from dataiku/release/1.0/spacy3
Spacy 3 migration
2 parents dad2ad7 + 8f588d2 commit 217efeb

File tree

11 files changed

+277
-78
lines changed

11 files changed

+277
-78
lines changed

code-env/python/spec/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ pymorphy2==0.9.1
44
jieba==0.42.1
55
pyvi==0.1
66
regex==2020.11.13
7-
spacy[ja,lookups,th]==2.3.5
7+
spacy[lookups,ja,th]==3.0.7
88
emoji==1.2.0
99
tqdm==4.50.2
1010
matplotlib==3.3.1
1111
wordcloud==1.8.0
1212
fonttools==4.14.0
1313
pathvalidate==2.3.0
14-
fastcore==1.3.1
14+
fastcore==1.3.19

python-lib/language_support.py

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,67 @@
6262
"yo": "Yoruba",
6363
"zh": "Chinese (simplified)",
6464
}
65-
"""dict: Languages supported by spaCy: https://spacy.io/usage/models#languages
6665

66+
"""dict: Languages supported by spaCy: https://spacy.io/usage/models#languages
6767
Dictionary with ISO 639-1 language code (key) and language name (value)
6868
Korean is excluded for now because of system installation issues
6969
"""
7070

7171
SPACY_LANGUAGE_MODELS = {
72+
"de": "de_core_news_sm", # OntoNotes
7273
"en": "en_core_web_sm", # OntoNotes
7374
"es": "es_core_news_sm", # Wikipedia
74-
"zh": "zh_core_web_sm", # OntoNotes
75-
"xx": "xx_ent_wiki_sm", # Wikipedia
76-
"pl": "nb_core_news_sm", # NorNE
7775
"fr": "fr_core_news_sm", # Wikipedia
78-
"de": "de_core_news_sm", # OntoNotes
76+
"nb": "nb_core_news_sm", # NorNE
77+
"pl": "pl_core_news_sm", # NKJP
78+
"ru": "ru_core_news_sm", # Nerus
79+
"zh": "zh_core_web_sm", # OntoNotes
7980
}
80-
"""dict: Mapping between ISO 639-1 language code and spaCy model identifiers
8181

82+
"""dict: Mapping between ISO 639-1 language code and spaCy model identifiers
8283
Models with Creative Commons licenses are not included because this plugin is licensed under Apache-2
8384
"""
85+
86+
SPACY_LANGUAGE_MODELS_LEMMATIZATION = ["de", "en", "es", "fr", "nb", "pl", "ru"]
87+
"""list: Languages that have a spaCy pre-trained model with a Lemmatizer component.
88+
When using a pre-trained pipeline to lemmatize, you need to have in your spaCy Language pipeline:
89+
- either spaCy 'morphologizer' + 'lemmatizer'
90+
- or spaCy 'tagger' + 'attribute ruler' + 'lemmatizer'
91+
depending on the pre-trained pipeline's built-in components"""
92+
93+
SPACY_LANGUAGE_MODELS_MORPHOLOGIZER = ["de", "es", "fr", "nb", "pl", "ru"]
94+
"""list: Languages that have a spaCy pre-trained model with a Morphologizer component."""
95+
96+
SPACY_LANGUAGE_LOOKUP = [
97+
"ca",
98+
"cs",
99+
"da",
100+
"de",
101+
"en",
102+
"es",
103+
"fr",
104+
"hr",
105+
"hu",
106+
"id",
107+
"it",
108+
"lb",
109+
"lt",
110+
"mk",
111+
"nb",
112+
"nl",
113+
"pt",
114+
"ro",
115+
"sr",
116+
"sv",
117+
"tl",
118+
"tr",
119+
"ur",
120+
]
121+
"""list: Languages that have spaCy lookup tables available for lemmatization.
122+
The lookup tables are available at https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data
123+
"""
124+
125+
SPACY_LANGUAGE_RULES = ["bn", "el", "fa"]
126+
"""list: Languages that have spaCy rule tables available for lemmatization.
127+
The rule tables are available at https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data
128+
"""

python-lib/spacy_tokenizer.py

Lines changed: 155 additions & 59 deletions
Large diffs are not rendered by default.

python-lib/wordcloud_visualizer.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,20 @@ def _normalize_case_token_counts(self, counts: Counter) -> Counter:
235235
normalized_counts = Counter(dict(zip(df_counts_agg.token_majority_case, df_counts_agg["sum"])))
236236
return normalized_counts
237237

238+
def _save_chart(self, fig: plt.figure) -> BytesIO:
239+
"""Private method to save chart as a bytes stream
240+
241+
Args:
242+
fig (plt.figure): matplotlib figure to save
243+
244+
Returns:
245+
BytesIO: bytes stream containing the chart's data
246+
"""
247+
temp = BytesIO()
248+
fig.savefig(temp, bbox_inches=self.bbox_inches, pad_inches=self.pad_inches, dpi=fig.dpi)
249+
plt.close()
250+
return temp
251+
238252
@time_logging(log_message="Counting tokens")
239253
def _count_tokens(self, docs: List[Doc]) -> List[Tuple[AnyStr, Dict]]:
240254
"""Private method to count tokens for each document in corpus
@@ -303,17 +317,15 @@ def generate_wordclouds(self, counts: List[Tuple[AnyStr, Dict]]) -> Generator[Tu
303317
else:
304318
fig = self._generate_wordcloud(frequencies=count, language=self.language, title=wordcloud_title)
305319
# Return chart
306-
temp = BytesIO()
307-
fig.savefig(temp, bbox_inches=self.bbox_inches, pad_inches=self.pad_inches, dpi=fig.dpi)
320+
temp = self._save_chart(fig)
308321
yield (temp, output_file_name)
309322

310323
else:
311324
# Generate chart
312325
count = counts[0][1]
313326
fig = self._generate_wordcloud(frequencies=count, language=self.language)
314327
# Return chart
315-
temp = BytesIO()
316-
fig.savefig(temp, bbox_inches=self.bbox_inches, pad_inches=self.pad_inches, dpi=fig.dpi)
328+
temp = self._save_chart(fig)
317329
yield (temp, "wordcloud.png")
318330

319331
def tokenize_and_count(self, df: pd.DataFrame) -> List[Tuple[AnyStr, Dict]]:

resource/stopwords/de.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,4 @@ wie
7676
wir
7777
würde
7878
würden
79-
zwar
79+
zwar

resource/stopwords/en.txt

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ about
2424
above
2525
across
2626
after
27+
afterwards
2728
against
2829
ain
2930
all
@@ -50,19 +51,27 @@ aren
5051
around
5152
as
5253
at
54+
back
5355
be
56+
became
5457
because
58+
become
59+
becomes
60+
becoming
5561
been
5662
before
5763
beforehand
64+
behind
5865
being
66+
beside
5967
besides
6068
between
6169
beyond
6270
both
6371
but
6472
by
6573
ca
74+
call
6675
can
6776
could
6877
couldn
@@ -74,6 +83,7 @@ does
7483
doesn
7584
doing
7685
don
86+
done
7787
down
7888
due
7989
during
@@ -83,13 +93,19 @@ else
8393
elsewhere
8494
ever
8595
every
96+
everyone
97+
everything
98+
everywhere
8699
few
87100
for
88101
former
89102
formerly
90103
from
104+
front
91105
further
92106
get
107+
give
108+
go
93109
gon
94110
had
95111
hadn
@@ -108,6 +124,7 @@ herein
108124
hereupon
109125
hers
110126
herself
127+
hi
111128
him
112129
himself
113130
his
@@ -129,6 +146,8 @@ let
129146
ll
130147
m
131148
ma
149+
made
150+
make
132151
many
133152
may
134153
me
@@ -139,22 +158,28 @@ mine
139158
more
140159
moreover
141160
most
161+
move
142162
must
143163
mustn
144164
my
145165
myself
146166
n‘t
147167
n’t
148168
na
169+
name
170+
namely
149171
needn
150172
neither
173+
next
151174
nor
175+
now
152176
o
153177
of
154178
off
155179
often
156180
on
157181
once
182+
one
158183
only
159184
onto
160185
or
@@ -168,6 +193,7 @@ ourselves
168193
out
169194
over
170195
own
196+
part
171197
per
172198
put
173199
quite
@@ -188,6 +214,8 @@ shan
188214
she
189215
should
190216
shouldn
217+
show
218+
side
191219
since
192220
so
193221
some
@@ -229,10 +257,15 @@ thus
229257
to
230258
together
231259
too
260+
toward
261+
towards
232262
under
233263
until
234264
up
235265
upon
266+
us
267+
used
268+
using
236269
various
237270
ve
238271
via
@@ -263,7 +296,9 @@ whole
263296
whom
264297
whose
265298
why
299+
will
266300
with
301+
within
267302
wo
268303
won
269304
would
@@ -274,4 +309,4 @@ you
274309
your
275310
yours
276311
yourself
277-
yourselves
312+
yourselves

resource/stopwords/es.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,4 +305,4 @@ vuestro
305305
vuestros
306306
y
307307
ya
308-
yo
308+
yo

resource/stopwords/fr.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ aie
77
aient
88
aies
99
ait
10+
allaient
11+
allons
1012
alors
1113
as
1214
au
@@ -119,6 +121,7 @@ eut
119121
eût
120122
eûtes
121123
eux
124+
façon
122125
fais
123126
faisaient
124127
faisant
@@ -136,6 +139,7 @@ fussions
136139
fut
137140
fût
138141
fûtes
142+
gens
139143
ici
140144
il
141145
ils
@@ -214,6 +218,8 @@ s'
214218
s‘
215219
s’
216220
sa
221+
sais
222+
sait
217223
sans
218224
se
219225
sera
@@ -262,12 +268,17 @@ toutes
262268
tu
263269
un
264270
une
271+
va
272+
vais
273+
vas
265274
vers
266275
voici
267276
voilà
277+
vont
268278
vos
269279
votre
270280
vôtre
271281
vôtres
272282
vous
273-
y
283+
vu
284+
y

resource/stopwords/it.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,4 +265,4 @@ voi
265265
vostra
266266
vostre
267267
vostri
268-
vostro
268+
vostro

resource/stopwords/nl.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,4 @@ zich
5555
zij
5656
zijn
5757
zo
58-
zou
58+
zou

0 commit comments

Comments
 (0)