diff --git a/medcat-v2/medcat/config/config.py b/medcat-v2/medcat/config/config.py index f10f84d73..d45c4966b 100644 --- a/medcat-v2/medcat/config/config.py +++ b/medcat-v2/medcat/config/config.py @@ -168,6 +168,39 @@ class NLPConfig(SerialisableBaseModel): Currently only regex and spacy are natively supported. + NB! For these changes to take effect, the pipe would need to be recreated. + """ + faster_spacy_tokenization: bool = False + """Allow skipping the spacy pipeline. + + If True, uses basic tokenization only (spacy.make_doc) for ~3-4x overall + speedup. + If False, uses full linguistic pipeline including POS tagging, + lemmatization, and stopword detection. + + **Impact of faster_spacy_tokenization=True:** + - No part-of-speech tags: All tokens treated uniformly during normalization + - No lemmatization: Words used in surface form (e.g., "running" vs "run") + - No stopword detection: All tokens in multi-token spans considered; + all tokens used in context vector calculation + - Real world performance (in terms of precision and recall) is likely to + be lower + + **When to use fast mode:** + - Processing very large datasets where speed is critical + - Text is already clean/normalized + - Minor drops in precision/recall (typically 1-3%) are acceptable + + **When to use full mode (default):** + - Maximum accuracy is required + - Working with noisy or varied text + - Proper linguistic analysis improves your specific use case + + Benchmark on your data to determine if the speedup justifies the + accuracy tradeoff. + + PS: Only applicable for spacy based tokenizer. + NB! For these changes to take effect, the pipe would need to be recreated. 
""" modelname: str = 'en_core_web_md' diff --git a/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py b/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py index 8b8e11540..0d18ed4f5 100644 --- a/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py +++ b/medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py @@ -47,7 +47,8 @@ def __init__(self, spacy_model_name: str, max_document_length: int, tokenizer_getter: Callable[[Language, bool], Tokenizer ] = spacy_split_all, - stopwords: Optional[set[str]] = None,): + stopwords: Optional[set[str]] = None, + avoid_pipe: bool = False): self._spacy_model_name = os.path.basename( spacy_model_name).removeprefix(TOKENIZER_PREFIX) if self.load_internals_from(spacy_model_name): @@ -62,6 +63,7 @@ def __init__(self, spacy_model_name: str, TOKENIZER_PREFIX).split('_', 1)[0] cls = spacy.util.get_lang_class(lang_str) cls.Defaults.stop_words = set(stopwords) + self._avoid_pipe = avoid_pipe self._nlp = spacy.load(spacy_model_name, disable=spacy_disabled_components) self._nlp.tokenizer = tokenizer_getter(self._nlp, use_diacritics) @@ -83,7 +85,11 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity: return Entity(span) def __call__(self, text: str) -> MutableDocument: - return Document(self._nlp(text)) + if self._avoid_pipe: + doc = Document(self._nlp.make_doc(text)) + else: + doc = Document(self._nlp(text)) + return doc @classmethod def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer': @@ -93,7 +99,8 @@ def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer': nlp_cnf.disabled_components, config.general.diacritics, config.preprocessing.max_document_length, - stopwords=config.preprocessing.stopwords) + stopwords=config.preprocessing.stopwords, + avoid_pipe=config.general.nlp.faster_spacy_tokenization) def get_doc_class(self) -> Type[MutableDocument]: return Document diff --git a/medcat-v2/tests/test_cat.py b/medcat-v2/tests/test_cat.py index cc4e11ada..b3a081312 100644 --- 
a/medcat-v2/tests/test_cat.py +++ b/medcat-v2/tests/test_cat.py @@ -4,6 +4,7 @@ import json from typing import Optional, Any from collections import Counter +from contextlib import contextmanager from medcat import cat from medcat.data.model_card import ModelCard @@ -18,6 +19,7 @@ from medcat.components.addons.meta_cat import MetaCATAddon from medcat.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON from medcat.utils.defaults import LegacyConversionDisabledError +from medcat.utils.config_utils import temp_changed_config import unittest import tempfile @@ -222,6 +224,60 @@ def test_inference_works(self): with self.subTest(f"{nr}"): ConvertedFunctionalityTests.assert_has_ent(ent) + @classmethod + @contextmanager + def _faster_spacy_inference(cls): + with temp_changed_config( + cls.model.config.general.nlp, + "faster_spacy_tokenization", + True + ): + with temp_changed_config( + cls.model.config.general.nlp, + "modelname", + "en_core_web_md" + ): + cls.model._recreate_pipe() + yield + cls.model._recreate_pipe() + + def _is_spacy_model(self): + if self.model.config.general.nlp.provider != "spacy": + raise unittest.SkipTest("Only applicable for spacy models") + + def test_default_spacy_runs_pipe(self): + self._is_spacy_model() + self.assertFalse(self.model.pipe._tokenizer._avoid_pipe) + + def test_faster_spacy_inference_is_set(self): + self._is_spacy_model() + with self._faster_spacy_inference(): + self.assertTrue(self.model.pipe._tokenizer._avoid_pipe) + + def test_faster_spacy_inference_works(self): + self._is_spacy_model() + with self._faster_spacy_inference(): + ents = self.model.get_entities( + ConvertedFunctionalityTests.TEXT)['entities'] + self.assertTrue(ents) + for nr, ent in enumerate(ents.values()): + with self.subTest(f"{nr}"): + ConvertedFunctionalityTests.assert_has_ent(ent) + + def test_faster_spacy_inference_is_used(self): + self._is_spacy_model() + with self._faster_spacy_inference(): + with unittest.mock.patch.object( + 
self.model.pipe._tokenizer._nlp, + '__call__') as dunder_call_mock: + with unittest.mock.patch.object( + self.model.pipe._tokenizer._nlp, + 'make_doc') as make_doc_mock: + self.model.get_entities( + ConvertedFunctionalityTests.TEXT) + dunder_call_mock.assert_not_called() + make_doc_mock.assert_called() + def test_entities_in_correct_order(self): # NOTE: the issue wouldn't show up with smaller amount of text doc = self.model(ConvertedFunctionalityTests.TEXT * 3)