
Commit fbce431

feat(medcat): CU-869b9n4mq Allow faster spacy tokenization (#244)
* CU-869b9n4mq: Add config option for faster spacy tokenization
* CU-869b9n4mq: Add implementation for faster spacy tokenization
* CU-869b9n4mq: Add a few tests for faster tokenization
1 parent e21da73 commit fbce431

File tree

3 files changed: +99 −3 lines

medcat-v2/medcat/config/config.py

Lines changed: 33 additions & 0 deletions
@@ -168,6 +168,39 @@ class NLPConfig(SerialisableBaseModel):
 
     Currently only regex and spacy are natively supported.
 
+    NB! For these changes to take effect, the pipe would need to be recreated.
+    """
+    faster_spacy_tokenization: bool = False
+    """Allow skipping the spacy pipeline.
+
+    If True, uses basic tokenization only (spacy.make_doc) for a ~3-4x overall
+    speedup.
+    If False, uses the full linguistic pipeline, including POS tagging,
+    lemmatization, and stopword detection.
+
+    **Impact of faster_spacy_tokenization=True:**
+    - No part-of-speech tags: all tokens are treated uniformly during normalization
+    - No lemmatization: words are used in their surface form (e.g. "running" vs "run")
+    - No stopword detection: all tokens in multi-token spans are considered;
+      all tokens are used in context vector calculation
+    - Real-world performance (in terms of precision and recall) is likely to
+      be lower
+
+    **When to use fast mode:**
+    - Processing very large datasets where speed is critical
+    - Text is already clean/normalized
+    - Minor drops in precision/recall (typically 1-3%) are acceptable
+
+    **When to use full mode (default):**
+    - Maximum accuracy is required
+    - Working with noisy or varied text
+    - Proper linguistic analysis improves your specific use case
+
+    Benchmark on your data to determine whether the speedup justifies the
+    accuracy tradeoff.
+
+    PS: Only applicable to the spacy-based tokenizer.
+
     NB! For these changes to take effect, the pipe would need to be recreated.
     """
     modelname: str = 'en_core_web_md'
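The flag plugs into the existing config hierarchy, so enabling it is a one-line config change followed by a pipe rebuild. Below is a minimal usage sketch (not part of the commit): `cat` is a placeholder for a loaded CAT instance (the `model` object used in the tests below), and the sample text is made up. The `_recreate_pipe()` call mirrors what the tests do after toggling the option.

# Hypothetical usage sketch -- `cat` is assumed to be a loaded CAT instance.
cat.config.general.nlp.faster_spacy_tokenization = True

# Per the docstring, the change only takes effect once the pipe is recreated;
# the tests below use this internal helper for exactly that.
cat._recreate_pipe()

# Entity extraction now goes through spacy's make_doc instead of the full pipe.
entities = cat.get_entities("Patient admitted with kidney failure.")['entities']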

medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py

Lines changed: 10 additions & 3 deletions
@@ -47,7 +47,8 @@ def __init__(self, spacy_model_name: str,
                  max_document_length: int,
                  tokenizer_getter: Callable[[Language, bool], Tokenizer
                                             ] = spacy_split_all,
-                 stopwords: Optional[set[str]] = None,):
+                 stopwords: Optional[set[str]] = None,
+                 avoid_pipe: bool = False):
         self._spacy_model_name = os.path.basename(
             spacy_model_name).removeprefix(TOKENIZER_PREFIX)
         if self.load_internals_from(spacy_model_name):
@@ -62,6 +63,7 @@ def __init__(self, spacy_model_name: str,
                 TOKENIZER_PREFIX).split('_', 1)[0]
             cls = spacy.util.get_lang_class(lang_str)
             cls.Defaults.stop_words = set(stopwords)
+        self._avoid_pipe = avoid_pipe
         self._nlp = spacy.load(spacy_model_name,
                                disable=spacy_disabled_components)
         self._nlp.tokenizer = tokenizer_getter(self._nlp, use_diacritics)
@@ -83,7 +85,11 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         return Entity(span)
 
     def __call__(self, text: str) -> MutableDocument:
-        return Document(self._nlp(text))
+        if self._avoid_pipe:
+            doc = Document(self._nlp.make_doc(text))
+        else:
+            doc = Document(self._nlp(text))
+        return doc
 
     @classmethod
     def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
@@ -93,7 +99,8 @@ def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
                    nlp_cnf.disabled_components,
                    config.general.diacritics,
                    config.preprocessing.max_document_length,
-                   stopwords=config.preprocessing.stopwords)
+                   stopwords=config.preprocessing.stopwords,
+                   avoid_pipe=config.general.nlp.faster_spacy_tokenization)
 
     def get_doc_class(self) -> Type[MutableDocument]:
         return Document
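For context on what the `avoid_pipe` branch above actually skips, here is a small standalone spaCy sketch (plain spaCy, not MedCAT code): `make_doc` runs only the tokenizer, so linguistic attributes such as POS tags and lemmas stay unset, whereas calling the `Language` object runs the full pipeline. Assumes `en_core_web_md` (the config default) is installed; the example sentence is made up.

import spacy

nlp = spacy.load("en_core_web_md")
text = "He was running daily"

fast_doc = nlp.make_doc(text)  # tokenization only -- what avoid_pipe=True uses
full_doc = nlp(text)           # tokenizer + tagger, lemmatizer, etc.

# Token 2 is "running" in both docs.
print(fast_doc[2].pos_, fast_doc[2].lemma_)  # typically '' '' -- no tagger/lemmatizer ran
print(full_doc[2].pos_, full_doc[2].lemma_)  # e.g. 'VERB' 'run'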

medcat-v2/tests/test_cat.py

Lines changed: 56 additions & 0 deletions
@@ -4,6 +4,7 @@
 import json
 from typing import Optional, Any
 from collections import Counter
+from contextlib import contextmanager
 
 from medcat import cat
 from medcat.data.model_card import ModelCard
@@ -18,6 +19,7 @@
 from medcat.components.addons.meta_cat import MetaCATAddon
 from medcat.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON
 from medcat.utils.defaults import LegacyConversionDisabledError
+from medcat.utils.config_utils import temp_changed_config
 
 import unittest
 import tempfile
@@ -222,6 +224,60 @@ def test_inference_works(self):
             with self.subTest(f"{nr}"):
                 ConvertedFunctionalityTests.assert_has_ent(ent)
 
+    @classmethod
+    @contextmanager
+    def _faster_spacy_inference(cls):
+        with temp_changed_config(
+                cls.model.config.general.nlp,
+                "faster_spacy_tokenization",
+                True
+        ):
+            with temp_changed_config(
+                    cls.model.config.general.nlp,
+                    "modelname",
+                    "en_core_web_md"
+            ):
+                cls.model._recreate_pipe()
+                yield
+        cls.model._recreate_pipe()
+
+    def _is_spacy_model(self):
+        if self.model.config.general.nlp.provider != "spacy":
+            raise unittest.SkipTest("Only applicable for spacy models")
+
+    def test_default_spacy_runs_pipe(self):
+        self._is_spacy_model()
+        self.assertFalse(self.model.pipe._tokenizer._avoid_pipe)
+
+    def test_faster_spacy_inference_is_set(self):
+        self._is_spacy_model()
+        with self._faster_spacy_inference():
+            self.assertTrue(self.model.pipe._tokenizer._avoid_pipe)
+
+    def test_faster_spacy_inference_works(self):
+        self._is_spacy_model()
+        with self._faster_spacy_inference():
+            ents = self.model.get_entities(
+                ConvertedFunctionalityTests.TEXT)['entities']
+            self.assertTrue(ents)
+            for nr, ent in enumerate(ents.values()):
+                with self.subTest(f"{nr}"):
+                    ConvertedFunctionalityTests.assert_has_ent(ent)
+
+    def test_faster_spacy_inference_is_used(self):
+        self._is_spacy_model()
+        with self._faster_spacy_inference():
+            with unittest.mock.patch.object(
+                    self.model.pipe._tokenizer._nlp,
+                    '__call__') as dunder_call_mock:
+                with unittest.mock.patch.object(
+                        self.model.pipe._tokenizer._nlp,
+                        'make_doc') as make_doc_mock:
+                    self.model.get_entities(
+                        ConvertedFunctionalityTests.TEXT)
+                    dunder_call_mock.assert_not_called()
+                    make_doc_mock.assert_called()
+
     def test_entities_in_correct_order(self):
         # NOTE: the issue wouldn't show up with smaller amount of text
         doc = self.model(ConvertedFunctionalityTests.TEXT * 3)
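The config docstring's advice to benchmark on your own data can be followed with the same pattern as the `_faster_spacy_inference` helper above. A rough sketch, under the assumptions that `cat` is a loaded CAT instance backed by the spacy tokenizer and `texts` is a list of representative documents; timings and accuracy differences will vary with your data.

import time
from medcat.utils.config_utils import temp_changed_config

def timed_run(cat, texts):
    # Run entity extraction over all texts and report the wall-clock time.
    start = time.perf_counter()
    results = [cat.get_entities(t)['entities'] for t in texts]
    return time.perf_counter() - start, results

full_time, full_results = timed_run(cat, texts)

# Temporarily enable fast tokenization, recreating the pipe on the way in
# and again after the config is restored (same pattern as the test helper).
with temp_changed_config(cat.config.general.nlp,
                         "faster_spacy_tokenization", True):
    cat._recreate_pipe()
    fast_time, fast_results = timed_run(cat, texts)
cat._recreate_pipe()

print(f"full pipeline: {full_time:.1f}s, fast tokenization: {fast_time:.1f}s")
# Compare full_results and fast_results (ideally against annotated data) to
# judge whether the precision/recall drop is acceptable for your use case.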
