
Commit 537e353

Merge pull request #28 from dataiku/release/1.0/fix_japanese_tokenization_perf
tokenizing document by document instead of aggregating docs before to…
2 parents: 432e31e + e7242ea

File tree

1 file changed (+5, −3 lines)

python-lib/wordcloud_visualizer.py

Lines changed: 5 additions & 3 deletions
@@ -193,7 +193,7 @@ def _tokenize_texts(self, df_grouped: List) -> List:
         texts = []
         group_names = []
         for name, group in df_grouped:
-            texts.append([group[self.text_column].str.cat(sep=" ")])
+            texts.append(list(group[self.text_column]))
             group_names.append(name)

         # Get tokenization languages differently depending on language/subchart settings combinations
@@ -210,8 +210,10 @@ def _tokenize_texts(self, df_grouped: List) -> List:
            languages = group_names

         # Tokenize
-        docs = [self.tokenizer.tokenize_list(text, language)[0] for text, language in zip(texts, languages)]
-
+        docs = [
+            Doc.from_docs(self.tokenizer.tokenize_list(text_list, language))
+            for text_list, language in zip(texts, languages)
+        ]
         return docs

     def _normalize_case_token_counts(self, counts: Counter) -> Counter:
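The change replaces the old approach of concatenating every text in a group into one string (group[self.text_column].str.cat(sep=" ")) with per-document tokenization, merging the resulting Docs afterwards with spaCy's Doc.from_docs. Per the branch name, this is a performance fix for Japanese tokenization: each document is tokenized on its own instead of one very long aggregated string. Below is a minimal sketch of the same pattern using a plain spaCy pipeline; the blank "ja" pipeline and the tokenize_group helper are illustrative only and are not the plugin's MultilingualTokenizer.tokenize_list.

    # Sketch of per-document tokenization followed by a merge, assuming spaCy >= 3.0
    # and sudachipy installed (required by the blank Japanese pipeline).
    from typing import List

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("ja")  # blank Japanese pipeline (tokenizer only)

    def tokenize_group(texts: List[str]) -> Doc:
        """Tokenize each document separately, then combine into a single Doc."""
        per_document_docs = list(nlp.pipe(texts))  # one Doc per input text
        return Doc.from_docs(per_document_docs)    # merge into one Doc for downstream counting

    combined = tokenize_group(["今日は良い天気です。", "明日は雨が降るでしょう。"])
    print([token.text for token in combined])

Because Doc.from_docs preserves the tokens of each input Doc, the merged result can still be fed to the word-count step exactly as the previously concatenated text was, while the tokenizer only ever sees document-sized inputs.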
