Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions medcat-v2/medcat/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,39 @@ class NLPConfig(SerialisableBaseModel):

Currently only regex and spacy are natively supported.

NB! For these changes to take effect, the pipe would need to be recreated.
"""
faster_spacy_tokenization: bool = False
"""Allow skipping the spacy pipeline.

If True, uses basic tokenization only (spacy.make_doc) for ~3-4x overall
speedup.
If False, uses full linguistic pipeline including POS tagging,
lemmatization, and stopword detection.

**Impact of faster_spacy_tokenization=True:**
- No part-of-speech tags: All tokens treated uniformly during normalization
- No lemmatization: Words used in surface form (e.g., "running" vs "run")
- No stopword detection: All tokens in multi-token spans considered;
all tokens used in context vector calculation
- Real world performance (in terms of precision and recall) is likely to
be lower

**When to use fast mode:**
- Processing very large datasets where speed is critical
- Text is already clean/normalized
- Minor drops in precision/recall (typically 1-3%) are acceptable

**When to use full mode (default):**
- Maximum accuracy is required
- Working with noisy or varied text
- Proper linguistic analysis improves your specific use case

Benchmark on your data to determine if the speedup justifies the
accuracy tradeoff.

PS: Only applicable for spacy based tokenizer.

NB! For these changes to take effect, the pipe would need to be recreated.
"""
modelname: str = 'en_core_web_md'
Expand Down
13 changes: 10 additions & 3 deletions medcat-v2/medcat/tokenizing/spacy_impl/tokenizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def __init__(self, spacy_model_name: str,
max_document_length: int,
tokenizer_getter: Callable[[Language, bool], Tokenizer
] = spacy_split_all,
stopwords: Optional[set[str]] = None,):
stopwords: Optional[set[str]] = None,
avoid_pipe: bool = False):
self._spacy_model_name = os.path.basename(
spacy_model_name).removeprefix(TOKENIZER_PREFIX)
if self.load_internals_from(spacy_model_name):
Expand All @@ -62,6 +63,7 @@ def __init__(self, spacy_model_name: str,
TOKENIZER_PREFIX).split('_', 1)[0]
cls = spacy.util.get_lang_class(lang_str)
cls.Defaults.stop_words = set(stopwords)
self._avoid_pipe = avoid_pipe
self._nlp = spacy.load(spacy_model_name,
disable=spacy_disabled_components)
self._nlp.tokenizer = tokenizer_getter(self._nlp, use_diacritics)
Expand All @@ -83,7 +85,11 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
return Entity(span)

def __call__(self, text: str) -> MutableDocument:
    """Tokenize the given text into a mutable document.

    When this tokenizer was created with ``avoid_pipe=True``, only the
    basic spacy tokenization (``Language.make_doc``) is performed;
    otherwise the text is run through the full spacy pipeline.

    Args:
        text (str): The raw text to process.

    Returns:
        MutableDocument: The wrapped spacy document.
    """
    spacy_doc = (self._nlp.make_doc(text)
                 if self._avoid_pipe
                 else self._nlp(text))
    return Document(spacy_doc)

@classmethod
def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
Expand All @@ -93,7 +99,8 @@ def create_new_tokenizer(cls, config: Config) -> 'SpacyTokenizer':
nlp_cnf.disabled_components,
config.general.diacritics,
config.preprocessing.max_document_length,
stopwords=config.preprocessing.stopwords)
stopwords=config.preprocessing.stopwords,
avoid_pipe=config.general.nlp.faster_spacy_tokenization)

def get_doc_class(self) -> Type[MutableDocument]:
    """Return the concrete document class produced by this tokenizer."""
    return Document
Expand Down
56 changes: 56 additions & 0 deletions medcat-v2/tests/test_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import json
from typing import Optional, Any
from collections import Counter
from contextlib import contextmanager

from medcat import cat
from medcat.data.model_card import ModelCard
Expand All @@ -18,6 +19,7 @@
from medcat.components.addons.meta_cat import MetaCATAddon
from medcat.utils.defaults import AVOID_LEGACY_CONVERSION_ENVIRON
from medcat.utils.defaults import LegacyConversionDisabledError
from medcat.utils.config_utils import temp_changed_config

import unittest
import tempfile
Expand Down Expand Up @@ -222,6 +224,60 @@ def test_inference_works(self):
with self.subTest(f"{nr}"):
ConvertedFunctionalityTests.assert_has_ent(ent)

@classmethod
@contextmanager
def _faster_spacy_inference(cls):
    """Temporarily enable fast spacy tokenization on the class model.

    Sets ``faster_spacy_tokenization=True`` (and pins the spacy model
    name) on the model config and recreates the pipe so the setting
    takes effect. On exit the config changes are reverted and the pipe
    is recreated once more against the restored config — this now
    happens in a ``finally`` so the pipe is restored even when the
    managed body raises (previously an exception in the body left the
    pipe in fast mode permanently).
    """
    nlp_cnf = cls.model.config.general.nlp
    try:
        with temp_changed_config(
                nlp_cnf, "faster_spacy_tokenization", True), \
                temp_changed_config(
                nlp_cnf, "modelname", "en_core_web_md"):
            cls.model._recreate_pipe()
            yield
    finally:
        # Config has been restored by the context managers at this
        # point; rebuild the pipe so it matches the original config.
        cls.model._recreate_pipe()

def _is_spacy_model(self):
    """Skip the calling test unless the model uses the spacy provider."""
    provider = self.model.config.general.nlp.provider
    if provider != "spacy":
        raise unittest.SkipTest("Only applicable for spacy models")

def test_default_spacy_runs_pipe(self):
    """By default the full spacy pipeline is used (fast mode disabled)."""
    self._is_spacy_model()
    self.assertFalse(self.model.pipe._tokenizer._avoid_pipe)

def test_faster_spacy_inference_is_set(self):
    """Enabling fast mode propagates to the recreated tokenizer."""
    self._is_spacy_model()
    with self._faster_spacy_inference():
        self.assertTrue(self.model.pipe._tokenizer._avoid_pipe)

def test_faster_spacy_inference_works(self):
    """Entities are still found when fast tokenization is enabled."""
    self._is_spacy_model()
    with self._faster_spacy_inference():
        found = self.model.get_entities(
            ConvertedFunctionalityTests.TEXT)['entities']
    # The entities were computed under fast mode; asserting after the
    # context exits does not change what is being verified.
    self.assertTrue(found)
    for nr, ent in enumerate(found.values()):
        with self.subTest(f"{nr}"):
            ConvertedFunctionalityTests.assert_has_ent(ent)

def test_faster_spacy_inference_is_used(self):
    """In fast mode, make_doc is called and the full pipeline is not.

    Patches both ``__call__`` and ``make_doc`` on the tokenizer's spacy
    Language object and runs inference, then checks only the fast path
    was taken.
    """
    # NOTE(review): explicit submodule import — a plain `import unittest`
    # does not guarantee `unittest.mock` is loaded; the visible imports
    # in this file only show `import unittest`.
    from unittest import mock
    self._is_spacy_model()
    with self._faster_spacy_inference():
        tokenizer_nlp = self.model.pipe._tokenizer._nlp
        with mock.patch.object(
                tokenizer_nlp, '__call__') as dunder_call_mock, \
                mock.patch.object(
                tokenizer_nlp, 'make_doc') as make_doc_mock:
            self.model.get_entities(
                ConvertedFunctionalityTests.TEXT)
            dunder_call_mock.assert_not_called()
            make_doc_mock.assert_called()

def test_entities_in_correct_order(self):
# NOTE: the issue wouldn't show up with smaller amount of text
doc = self.model(ConvertedFunctionalityTests.TEXT * 3)
Expand Down
Loading