Skip to content

Commit 217efeb

Browse files
authored
Merge pull request #27 from dataiku/release/1.0/spacy3
Spacy 3 migration
2 parents dad2ad7 + 8f588d2 commit 217efeb

File tree

11 files changed

+277
-78
lines changed

11 files changed

+277
-78
lines changed

code-env/python/spec/requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ pymorphy2==0.9.1
44
jieba==0.42.1
55
pyvi==0.1
66
regex==2020.11.13
7-
spacy[ja,lookups,th]==2.3.5
7+
spacy[lookups,ja,th]==3.0.7
88
emoji==1.2.0
99
tqdm==4.50.2
1010
matplotlib==3.3.1
1111
wordcloud==1.8.0
1212
fonttools==4.14.0
1313
pathvalidate==2.3.0
14-
fastcore==1.3.1
14+
fastcore==1.3.19

python-lib/language_support.py

Lines changed: 51 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,67 @@
6262
"yo": "Yoruba",
6363
"zh": "Chinese (simplified)",
6464
}
65-
"""dict: Languages supported by spaCy: https://spacy.io/usage/models#languages
6665

66+
"""dict: Languages supported by spaCy: https://spacy.io/usage/models#languages
6767
Dictionary with ISO 639-1 language code (key) and language name (value)
6868
Korean is excluded for now because of system installation issues
6969
"""
7070

7171
SPACY_LANGUAGE_MODELS = {
72+
"de": "de_core_news_sm", # OntoNotes
7273
"en": "en_core_web_sm", # OntoNotes
7374
"es": "es_core_news_sm", # Wikipedia
74-
"zh": "zh_core_web_sm", # OntoNotes
75-
"xx": "xx_ent_wiki_sm", # Wikipedia
76-
"pl": "nb_core_news_sm", # NorNE
7775
"fr": "fr_core_news_sm", # Wikipedia
78-
"de": "de_core_news_sm", # OntoNotes
76+
"nb": "nb_core_news_sm", # NorNE
77+
"pl": "pl_core_news_sm", # NKJP
78+
"ru": "ru_core_news_sm", # Nerus
79+
"zh": "zh_core_web_sm", # OntoNotes
7980
}
80-
"""dict: Mapping between ISO 639-1 language code and spaCy model identifiers
8181

82+
"""dict: Mapping between ISO 639-1 language code and spaCy model identifiers
8283
Models with Creative Commons licenses are not included because this plugin is licensed under Apache-2
8384
"""
85+
86+
SPACY_LANGUAGE_MODELS_LEMMATIZATION = ["de", "en", "es", "fr", "nb", "pl", "ru"]
87+
"""list: Languages that have a spaCy pre-trained model with a Lemmatizer component.
88+
When using a pre-trained pipeline to lemmatize, you need to have in your spaCy Language pipeline:
89+
- either spaCy 'morphologizer' + 'lemmatizer'
90+
- or spaCy 'tagger' + 'attribute ruler' + 'lemmatizer'
91+
depending on the pre-trained pipeline's built-in components"""
92+
93+
SPACY_LANGUAGE_MODELS_MORPHOLOGIZER = ["de", "es", "fr", "nb", "pl", "ru"]
94+
"""list: Languages that have a spaCy pre-trained model with a Morphologizer component."""
95+
96+
SPACY_LANGUAGE_LOOKUP = [
97+
"ca",
98+
"cs",
99+
"da",
100+
"de",
101+
"en",
102+
"es",
103+
"fr",
104+
"hr",
105+
"hu",
106+
"id",
107+
"it",
108+
"lb",
109+
"lt",
110+
"mk",
111+
"nb",
112+
"nl",
113+
"pt",
114+
"ro",
115+
"sr",
116+
"sv",
117+
"tl",
118+
"tr",
119+
"ur",
120+
]
121+
"""list: Languages that have spaCy lookup tables available for lemmatization.
122+
The lookup tables are available at https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data
123+
"""
124+
125+
SPACY_LANGUAGE_RULES = ["bn", "el", "fa"]
126+
"""list: Languages that have spaCy rule tables available for lemmatization.
127+
The rule tables are available at https://github.com/explosion/spacy-lookups-data/tree/master/spacy_lookups_data/data
128+
"""

python-lib/spacy_tokenizer.py

Lines changed: 155 additions & 59 deletions
Large diffs are not rendered by default.

python-lib/wordcloud_visualizer.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,20 @@ def _normalize_case_token_counts(self, counts: Counter) -> Counter:
235235
normalized_counts = Counter(dict(zip(df_counts_agg.token_majority_case, df_counts_agg["sum"])))
236236
return normalized_counts
237237

238+
def _save_chart(self, fig: plt.figure) -> BytesIO:
239+
"""Private method to save chart as a bytes stream
240+
241+
Args:
242+
fig (plt.figure): matplotlib figure to save
243+
244+
Returns:
245+
BytesIO: bytes stream containing the chart's data
246+
"""
247+
temp = BytesIO()
248+
fig.savefig(temp, bbox_inches=self.bbox_inches, pad_inches=self.pad_inches, dpi=fig.dpi)
249+
plt.close()
250+
return temp
251+
238252
@time_logging(log_message="Counting tokens")
239253
def _count_tokens(self, docs: List[Doc]) -> List[Tuple[AnyStr, Dict]]:
240254
"""Private method to count tokens for each document in corpus
@@ -303,17 +317,15 @@ def generate_wordclouds(self, counts: List[Tuple[AnyStr, Dict]]) -> Generator[Tu
303317
else:
304318
fig = self._generate_wordcloud(frequencies=count, language=self.language, title=wordcloud_title)
305319
# Return chart
306-
temp = BytesIO()
307-
fig.savefig(temp, bbox_inches=self.bbox_inches, pad_inches=self.pad_inches, dpi=fig.dpi)
320+
temp = self._save_chart(fig)
308321
yield (temp, output_file_name)
309322

310323
else:
311324
# Generate chart
312325
count = counts[0][1]
313326
fig = self._generate_wordcloud(frequencies=count, language=self.language)
314327
# Return chart
315-
temp = BytesIO()
316-
fig.savefig(temp, bbox_inches=self.bbox_inches, pad_inches=self.pad_inches, dpi=fig.dpi)
328+
temp = self._save_chart(fig)
317329
yield (temp, "wordcloud.png")
318330

319331
def tokenize_and_count(self, df: pd.DataFrame) -> List[Tuple[AnyStr, Dict]]:

resource/stopwords/de.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,4 @@ wie
7676
wir
7777
würde
7878
würden
79-
zwar
79+
zwar

resource/stopwords/en.txt

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ about
2424
above
2525
across
2626
after
27+
afterwards
2728
against
2829
ain
2930
all
@@ -50,19 +51,27 @@ aren
5051
around
5152
as
5253
at
54+
back
5355
be
56+
became
5457
because
58+
become
59+
becomes
60+
becoming
5561
been
5662
before
5763
beforehand
64+
behind
5865
being
66+
beside
5967
besides
6068
between
6169
beyond
6270
both
6371
but
6472
by
6573
ca
74+
call
6675
can
6776
could
6877
couldn
@@ -74,6 +83,7 @@ does
7483
doesn
7584
doing
7685
don
86+
done
7787
down
7888
due
7989
during
@@ -83,13 +93,19 @@ else
8393
elsewhere
8494
ever
8595
every
96+
everyone
97+
everything
98+
everywhere
8699
few
87100
for
88101
former
89102
formerly
90103
from
104+
front
91105
further
92106
get
107+
give
108+
go
93109
gon
94110
had
95111
hadn
@@ -108,6 +124,7 @@ herein
108124
hereupon
109125
hers
110126
herself
127+
hi
111128
him
112129
himself
113130
his
@@ -129,6 +146,8 @@ let
129146
ll
130147
m
131148
ma
149+
made
150+
make
132151
many
133152
may
134153
me
@@ -139,22 +158,28 @@ mine
139158
more
140159
moreover
141160
most
161+
move
142162
must
143163
mustn
144164
my
145165
myself
146166
n‘t
147167
n’t
148168
na
169+
name
170+
namely
149171
needn
150172
neither
173+
next
151174
nor
175+
now
152176
o
153177
of
154178
off
155179
often
156180
on
157181
once
182+
one
158183
only
159184
onto
160185
or
@@ -168,6 +193,7 @@ ourselves
168193
out
169194
over
170195
own
196+
part
171197
per
172198
put
173199
quite
@@ -188,6 +214,8 @@ shan
188214
she
189215
should
190216
shouldn
217+
show
218+
side
191219
since
192220
so
193221
some
@@ -229,10 +257,15 @@ thus
229257
to
230258
together
231259
too
260+
toward
261+
towards
232262
under
233263
until
234264
up
235265
upon
266+
us
267+
used
268+
using
236269
various
237270
ve
238271
via
@@ -263,7 +296,9 @@ whole
263296
whom
264297
whose
265298
why
299+
will
266300
with
301+
within
267302
wo
268303
won
269304
would
@@ -274,4 +309,4 @@ you
274309
your
275310
yours
276311
yourself
277-
yourselves
312+
yourselves

resource/stopwords/es.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,4 +305,4 @@ vuestro
305305
vuestros
306306
y
307307
ya
308-
yo
308+
yo

resource/stopwords/fr.txt

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ aie
77
aient
88
aies
99
ait
10+
allaient
11+
allons
1012
alors
1113
as
1214
au
@@ -119,6 +121,7 @@ eut
119121
eût
120122
eûtes
121123
eux
124+
façon
122125
fais
123126
faisaient
124127
faisant
@@ -136,6 +139,7 @@ fussions
136139
fut
137140
fût
138141
fûtes
142+
gens
139143
ici
140144
il
141145
ils
@@ -214,6 +218,8 @@ s'
214218
s‘
215219
s’
216220
sa
221+
sais
222+
sait
217223
sans
218224
se
219225
sera
@@ -262,12 +268,17 @@ toutes
262268
tu
263269
un
264270
une
271+
va
272+
vais
273+
vas
265274
vers
266275
voici
267276
voilà
277+
vont
268278
vos
269279
votre
270280
vôtre
271281
vôtres
272282
vous
273-
y
283+
vu
284+
y

resource/stopwords/it.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -265,4 +265,4 @@ voi
265265
vostra
266266
vostre
267267
vostri
268-
vostro
268+
vostro

resource/stopwords/nl.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,4 @@ zich
5555
zij
5656
zijn
5757
zo
58-
zou
58+
zou

0 commit comments

Comments
 (0)