From 98e64e10ac8f9e0b6ed9aea6feccf4a5bd18aeb3 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 25 Nov 2025 13:18:23 +0000
Subject: [PATCH 1/4] CU-869b9h7y6: Add faster linker that only links to primary names

---
Review notes (text between the three-dash line and the diffstat is not part
of the commit):
 * The line structure of this series was restored from a whitespace-collapsed
   copy. Indentation is reconstructed, and the blob ids on the "index" lines
   are the ones from the original submission, so they may not match the
   content below byte-for-byte.
 * Log messages: "pimary" is corrected to "primary" here and in the matching
   context line of patch 3/4.
 * train() keeps the mutable default `names=[]`. The method raises before
   touching it, so it is harmless here; presumably it mirrors the base
   Linker signature - confirm before changing it.
 * __init__ deletes self.context_model after the base initialiser has run;
   any inherited method that still reads that attribute would fail with
   AttributeError - confirm none is reachable for this linker.

 .../linking/only_primary_name_linker.py       | 70 +++++++++++++++++++
 medcat-v2/medcat/components/types.py          |  4 ++
 2 files changed, 74 insertions(+)
 create mode 100644 medcat-v2/medcat/components/linking/only_primary_name_linker.py

diff --git a/medcat-v2/medcat/components/linking/only_primary_name_linker.py b/medcat-v2/medcat/components/linking/only_primary_name_linker.py
new file mode 100644
index 000000000..97bfb8c9b
--- /dev/null
+++ b/medcat-v2/medcat/components/linking/only_primary_name_linker.py
@@ -0,0 +1,70 @@
+from typing import Iterator, Optional, Union
+import logging
+
+from medcat.tokenizing.tokens import MutableDocument, MutableEntity
+from medcat.components.linking.context_based_linker import Linker
+from medcat.components.linking.vector_context_model import (
+    PerDocumentTokenCache)
+from medcat.utils.defaults import StatusTypes
+from medcat.cdb import CDB
+from medcat.vocab import Vocab
+from medcat.config import Config
+
+
+logger = logging.getLogger(__name__)
+
+
+class OnlyPrimaryNamesLinker(Linker):
+    name = 'primary_name_only_linker'
+
+    def __init__(self, cdb: CDB, vocab: Vocab, config: Config) -> None:
+        super().__init__(cdb, vocab, config)
+        print("==INIT== primary_name_only_linker")
+        # don't need / use the context model
+        del self.context_model
+
+    def _process_entity_inference(
+            self, doc: MutableDocument,
+            entity: MutableEntity,
+            per_doc_valid_token_cache: PerDocumentTokenCache
+            ) -> Iterator[MutableEntity]:
+        cuis = entity.link_candidates
+        if not cuis:
+            return
+        # Check does it have a detected name
+        name = entity.detected_name
+        if name is None:
+            logger.info("No name detected for entity %s", entity)
+            return
+        primary_cuis = [cui for cui, status in
+                        self.cdb.name2info[name]["per_cui_status"].items()
+                        if status in StatusTypes.PRIMARY_STATUS]
+        if not primary_cuis:
+            logger.info("No primary CUIs for name %s", name)
+            return
+        if len(primary_cuis) > 1:
+            logger.info(
+                "Ambiguous primary CUIs for name %s: %s", name, primary_cuis)
+            return
+        cui = primary_cuis[0]
+        entity.cui = cui
+        entity.context_similarity = 1.0
+        yield entity
+
+    def train(self, cui: str,
+              entity: MutableEntity,
+              doc: MutableDocument,
+              negative: bool = False,
+              names: Union[list[str], dict] = [],
+              per_doc_valid_token_cache: Optional[PerDocumentTokenCache] = None
+              ) -> None:
+        raise NoTrainingException("Training is not supported for this linker")
+
+    def _train_on_doc(self, doc: MutableDocument,
+                      ner_ents: list[MutableEntity]
+                      ) -> Iterator[MutableEntity]:
+        raise NoTrainingException("Training is not supported for this linker")
+
+
+class NoTrainingException(ValueError):
+    pass
diff --git a/medcat-v2/medcat/components/types.py b/medcat-v2/medcat/components/types.py
index a3aa549eb..a95b82ee3 100644
--- a/medcat-v2/medcat/components/types.py
+++ b/medcat-v2/medcat/components/types.py
@@ -213,6 +213,10 @@ def train(self, cui: str,
     "medcat2_embedding_linker": (
         "medcat.components.linking.embedding_linker",
         "Linker.create_new_component"),
+    # primary name only
+    "primary_name_only_linker": (
+        "medcat.components.linking.only_primary_name_linker",
+        "OnlyPrimaryNamesLinker.create_new_component"),
 }
 
 

From d72b4f940c62f889238ad02b6fcd71a2f4e0c1b3 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 25 Nov 2025 14:05:47 +0000
Subject: [PATCH 2/4] CU-869b9h7y6: Remove debug output

---
 medcat-v2/medcat/components/linking/only_primary_name_linker.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/medcat-v2/medcat/components/linking/only_primary_name_linker.py b/medcat-v2/medcat/components/linking/only_primary_name_linker.py
index 97bfb8c9b..d472eabf6 100644
--- a/medcat-v2/medcat/components/linking/only_primary_name_linker.py
+++ b/medcat-v2/medcat/components/linking/only_primary_name_linker.py
@@ -19,7 +19,6 @@ class OnlyPrimaryNamesLinker(Linker):
 
     def __init__(self, cdb: CDB, vocab: Vocab, config: Config) -> None:
         super().__init__(cdb, vocab, config)
-        print("==INIT== primary_name_only_linker")
         # don't need / use the context model
         del self.context_model
 

From 0839a24ee077d16cdd1de85a029bbe27e4ed8a77 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 25 Nov 2025 14:25:38 +0000
Subject: [PATCH 3/4] CU-869b9h7y6: Add proper filtering as well as usage of single-possible CUI options

---
Review notes (not part of the commit):
 * The single-candidate branch is laid out so that its `return` sits at the
   `if len(cuis) == 1:` level: the entity is yielded at most once and the
   primary-name check below is only reached with two or more candidates.
   The original indentation could not be recovered - confirm this matches
   the author's intent.
 * check_filters() is repeated in the single-candidate branch and in the
   primary_cuis comprehension because the up-front filtering only runs when
   filter_before_disamb is set.
 * name2info[name]['per_cui_status'][cui] assumes every link candidate is
   registered under the detected name; a candidate that is not would raise
   KeyError - verify against the component that fills link_candidates.

 .../linking/only_primary_name_linker.py       | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/medcat-v2/medcat/components/linking/only_primary_name_linker.py b/medcat-v2/medcat/components/linking/only_primary_name_linker.py
index d472eabf6..c9cca440d 100644
--- a/medcat-v2/medcat/components/linking/only_primary_name_linker.py
+++ b/medcat-v2/medcat/components/linking/only_primary_name_linker.py
@@ -35,9 +35,28 @@ def _process_entity_inference(
         if name is None:
             logger.info("No name detected for entity %s", entity)
             return
-        primary_cuis = [cui for cui, status in
-                        self.cdb.name2info[name]["per_cui_status"].items()
-                        if status in StatusTypes.PRIMARY_STATUS]
+        cnf_l = self.config.components.linking
+        if cnf_l.filter_before_disamb:
+            cuis = [cui for cui in cuis if cnf_l.filters.check_filters(cui)]
+        if not cuis:
+            logger.debug("No CUIs that fit filter for %s", entity)
+            return
+        if len(cuis) == 1:
+            if cnf_l.filters.check_filters(cuis[0]):
+                logger.info("Choosing only possible CUI %s for %s",
+                            cuis[0], entity)
+                entity.cui = cuis[0]
+                entity.context_similarity = 1.0
+                yield entity
+            else:
+                logger.info(
+                    "A single CUI (%s) was mapped to for %s but not in filter",
+                    cuis[0], entity)
+            return
+        primary_cuis = [cui for cui in cuis
+                        if (self.cdb.name2info[name]['per_cui_status'][cui]
+                            in StatusTypes.PRIMARY_STATUS and
+                            cnf_l.filters.check_filters(cui))]
         if not primary_cuis:
             logger.info("No primary CUIs for name %s", name)
             return

From 48396af99ff09f910eda82cbd7a416bc9bc24602 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Tue, 25 Nov 2025 16:15:29 +0000
Subject: [PATCH 4/4] CU-869b9h7y6: Add a simple test for the new linker

---
Review notes (not part of the commit):
 * The test only asserts that at least one entity is returned for TEXT; it
   does not pin which CUIs are linked or exercise the filter and
   single-candidate branches added in patch 3/4.

 .../linking/test_primary_name_only_linker.py  | 33 +++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 medcat-v2/tests/components/linking/test_primary_name_only_linker.py

diff --git a/medcat-v2/tests/components/linking/test_primary_name_only_linker.py b/medcat-v2/tests/components/linking/test_primary_name_only_linker.py
new file mode 100644
index 000000000..f2036eb7e
--- /dev/null
+++ b/medcat-v2/tests/components/linking/test_primary_name_only_linker.py
@@ -0,0 +1,33 @@
+import os
+
+from medcat.cdb import CDB
+from medcat.cat import CAT
+from medcat.vocab import Vocab
+from medcat.components.linking.only_primary_name_linker import (
+    OnlyPrimaryNamesLinker)
+
+import unittest
+
+from ... import UNPACKED_EXAMPLE_MODEL_PACK_PATH
+
+
+EXAMPLE_CDB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, "cdb")
+EXAMPLE_VOCAB_PATH = os.path.join(UNPACKED_EXAMPLE_MODEL_PACK_PATH, "vocab")
+
+
+class PrimaryNamesLinkerTests(unittest.TestCase):
+    TEXT = (
+        "Man was diagnosed with severe kidney failure and acute diabetes "
+        "and presented with a light fever")
+
+    @classmethod
+    def setUpClass(cls):
+        vocab = Vocab.load(EXAMPLE_VOCAB_PATH)
+        cdb = CDB.load(EXAMPLE_CDB_PATH)
+        cdb.config.components.linking.comp_name = OnlyPrimaryNamesLinker.name
+        cls.cat = CAT(cdb, vocab)
+
+    def test_gets_entities(self):
+        ents = self.cat.get_entities(self.TEXT)
+        self.assertTrue(ents)
+        self.assertTrue(len(ents["entities"]))