
Commit fbce431

feat(medcat): CU-869b9n4mq Allow faster spacy tokenization (#244)
* CU-869b9n4mq: Add config option for faster spacy tokenization
* CU-869b9n4mq: Add implementation for faster spacy tokenization
* CU-869b9n4mq: Add a few tests for faster tokenization
1 parent e21da73 commit fbce431

File tree

3 files changed: +99 −3 lines

medcat-v2/medcat/config/config.py

Lines changed: 33 additions & 0 deletions
@@ -168,6 +168,39 @@ class NLPConfig(SerialisableBaseModel):
 
     Currently only regex and spacy are natively supported.
 
+    NB! For these changes to take effect, the pipe would need to be recreated.
+    """
+    faster_spacy_tokenization: bool = False
+    """Allow skipping the spacy pipeline.
+
+    If True, uses basic tokenization only (spacy.make_doc) for a ~3-4x overall
+    speedup.
+    If False, uses the full linguistic pipeline, including POS tagging,
+    lemmatization, and stopword detection.
+
+    **Impact of faster_spacy_tokenization=True:**
+    - No part-of-speech tags: all tokens are treated uniformly during normalization
+    - No lemmatization: words are used in their surface form (e.g. "running" vs "run")
+    - No stopword detection: all tokens in multi-token spans are considered;
+      all tokens are used in context vector calculation
+    - Real-world performance (in terms of precision and recall) is likely to
+      be lower
+
+    **When to use fast mode:**
+    - Processing very large datasets where speed is critical
+    - Text is already clean/normalized
+    - Minor drops in precision/recall (typically 1-3%) are acceptable
+
+    **When to use full mode (default):**
+    - Maximum accuracy is required
+    - Working with noisy or varied text
+    - Proper linguistic analysis improves your specific use case
+
+    Benchmark on your data to determine whether the speedup justifies the
+    accuracy tradeoff.
+
+    PS: Only applicable to the spacy-based tokenizer.
+
     NB! For these changes to take effect, the pipe would need to be recreated.
     """
     modelname: str = 'en_core_web_md'
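The flag plugs into the existing config hierarchy, so enabling it is a one-line config change followed by a pipe rebuild. Below is a minimal usage sketch (not part of the commit): `cat` is a placeholder for a loaded CAT instance (the `model` object used in the tests below), and the sample text is made up. The `_recreate_pipe()` call mirrors what the tests do after toggling the option.

# Hypothetical usage sketch -- `cat` is assumed to be a loaded CAT instance.
cat.config.general.nlp.faster_spacy_tokenization = True

# Per the docstring, the change only takes effect once the pipe is recreated;
# the tests below use this internal helper for exactly that.
cat._recreate_pipe()

# Entity extraction now goes through spacy's make_doc instead of the full pipe.
entities = cat.get_entities("Patient admitted with kidney failure.")['entities']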

medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py

Lines changed: 10 additions & 3 deletions
@@ -47,7 +47,8 @@ def __init__(self, spacy_model_name: str,
                  max_document_length: int,
                  tokenizer_getter: Callable[[Language, bool], Tokenizer
                                             ] = spacy_split_all,
-                 stopwords: Optional[set[str]] = None,):
+                 stopwords: Optional[set[str]] = None,
+                 avoid_pipe: bool = False):
         self._spacy_model_name = os.path.basename(
             spacy_model_name).removeprefix(TOKENIZER_PREFIX)
         if self.load_internals_from(spacy_model_name):
@@ -62,6 +63,7 @@ def __init__(self, spacy_model_name: str,
                 TOKENIZER_PREFIX).split('_', 1)[0]
             cls = spacy.util.get_lang_class(lang_str)
             cls.Defaults.stop_words = set(stopwords)
+        self._avoid_pipe = avoid_pipe
         self._nlp = spacy.load(spacy_model_name,
                                disable=spacy_disabled_components)
         self._nlp.tokenizer = tokenizer_getter(self._nlp, use_diacritics)
@@ -83,7 +85,11 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         return Entity(span)
 
     def __call__(self, text: str) -> MutableDocument:
-        return Document(self._nlp(text))
+        if self._avoid_pipe:
+            doc = Document(self._nlp.make_doc(text))
+        else:
+            doc = Document(self._nlp(text))
+        return doc
 
     @classmethod
     def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
@@ -93,7 +99,8 @@ def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
                    nlp_cnf.disabled_components,
                    config.general.diacritics,
                    config.preprocessing.max_document_length,
-                   stopwords=config.preprocessing.stopwords)
+                   stopwords=config.preprocessing.stopwords,
+                   avoid_pipe=config.general.nlp.faster_spacy_tokenization)
 
     def get_doc_class(self) -> Type[MutableDocument]:
         return Document
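For context on what the `avoid_pipe` branch above actually skips, here is a small standalone spaCy sketch (plain spaCy, not MedCAT code): `make_doc` runs only the tokenizer, so linguistic attributes such as POS tags and lemmas stay unset, whereas calling the `Language` object runs the full pipeline. Assumes `en_core_web_md` (the config default) is installed; the example sentence is made up.

import spacy

nlp = spacy.load("en_core_web_md")
text = "He was running daily"

fast_doc = nlp.make_doc(text)  # tokenization only -- what avoid_pipe=True uses
full_doc = nlp(text)           # tokenizer + tagger, lemmatizer, etc.

# Token 2 is "running" in both docs.
print(fast_doc[2].pos_, fast_doc[2].lemma_)  # typically '' '' -- no tagger/lemmatizer ran
print(full_doc[2].pos_, full_doc[2].lemma_)  # e.g. 'VERB' 'run'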

medcat-v2/tests/test_cat.py

Lines changed: 56 additions & 0 deletions
@@ -4,6 +4,7 @@
 import json
 from typing import Optional, Any
 from collections import Counter
+from contextlib import contextmanager
 
 from medcat import cat
 from medcat.data.model_card import ModelCard
@@ -18,6 +19,7 @@
 from medcat.components.addons.meta_cat import MetaCATAddon
 from medcat.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON
 from medcat.utils.defaults import LegacyConversionDisabledError
+from medcat.utils.config_utils import temp_changed_config
 
 import unittest
 import tempfile
@@ -222,6 +224,60 @@ def test_inference_works(self):
             with self.subTest(f"{nr}"):
                 ConvertedFunctionalityTests.assert_has_ent(ent)
 
+    @classmethod
+    @contextmanager
+    def _faster_spacy_inference(cls):
+        with temp_changed_config(
+                cls.model.config.general.nlp,
+                "faster_spacy_tokenization",
+                True
+        ):
+            with temp_changed_config(
+                    cls.model.config.general.nlp,
+                    "modelname",
+                    "en_core_web_md"
+            ):
+                cls.model._recreate_pipe()
+                yield
+        cls.model._recreate_pipe()
+
+    def _is_spacy_model(self):
+        if self.model.config.general.nlp.provider != "spacy":
+            raise unittest.SkipTest("Only applicable for spacy models")
+
+    def test_default_spacy_runs_pipe(self):
+        self._is_spacy_model()
+        self.assertFalse(self.model.pipe._tokenizer._avoid_pipe)
+
+    def test_faster_spacy_inference_is_set(self):
+        self._is_spacy_model()
+        with self._faster_spacy_inference():
+            self.assertTrue(self.model.pipe._tokenizer._avoid_pipe)
+
+    def test_faster_spacy_inference_works(self):
+        self._is_spacy_model()
+        with self._faster_spacy_inference():
+            ents = self.model.get_entities(
+                ConvertedFunctionalityTests.TEXT)['entities']
+            self.assertTrue(ents)
+            for nr, ent in enumerate(ents.values()):
+                with self.subTest(f"{nr}"):
+                    ConvertedFunctionalityTests.assert_has_ent(ent)
+
+    def test_faster_spacy_inference_is_used(self):
+        self._is_spacy_model()
+        with self._faster_spacy_inference():
+            with unittest.mock.patch.object(
+                    self.model.pipe._tokenizer._nlp,
+                    '__call__') as dunder_call_mock:
+                with unittest.mock.patch.object(
+                        self.model.pipe._tokenizer._nlp,
+                        'make_doc') as make_doc_mock:
+                    self.model.get_entities(
+                        ConvertedFunctionalityTests.TEXT)
+                    dunder_call_mock.assert_not_called()
+                    make_doc_mock.assert_called()
+
     def test_entities_in_correct_order(self):
         # NOTE: the issue wouldn't show up with smaller amount of text
         doc = self.model(ConvertedFunctionalityTests.TEXT * 3)
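The config docstring's advice to benchmark on your own data can be followed with the same pattern as the `_faster_spacy_inference` helper above. A rough sketch, under the assumptions that `cat` is a loaded CAT instance backed by the spacy tokenizer and `texts` is a list of representative documents; timings and accuracy differences will vary with your data.

import time
from medcat.utils.config_utils import temp_changed_config

def timed_run(cat, texts):
    # Run entity extraction over all texts and report the wall-clock time.
    start = time.perf_counter()
    results = [cat.get_entities(t)['entities'] for t in texts]
    return time.perf_counter() - start, results

full_time, full_results = timed_run(cat, texts)

# Temporarily enable fast tokenization, recreating the pipe on the way in
# and again after the config is restored (same pattern as the test helper).
with temp_changed_config(cat.config.general.nlp,
                         "faster_spacy_tokenization", True):
    cat._recreate_pipe()
    fast_time, fast_results = timed_run(cat, texts)
cat._recreate_pipe()

print(f"full pipeline: {full_time:.1f}s, fast tokenization: {fast_time:.1f}s")
# Compare full_results and fast_results (ideally against annotated data) to
# judge whether the precision/recall drop is acceptable for your use case.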
