From 235cce7df1caa0086e29ca6b5151974c67a43238 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Fri, 12 Jun 2026 13:02:20 -0700 Subject: [PATCH] Match the generic "reference" term as a token, not a bare substring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ReferenceURIs.is_reference_uri matched the bare substring "reference", so any URI that merely contained those letters was classified as an authoritative reference field — e.g. `user_preference`, `dereference`, `preferenceOrder`. A mis-detected reference slot can then trigger spurious reference fetches and "could not fetch reference" errors on unrelated fields. Match the generic word "reference"/"references" only as a whole token, split on camelCase boundaries and separators, while keeping the specific Dublin Core / legacy URIs (dcterms:source, authoritative_reference, ...) as substring matches. This preserves real detections — including the camelCase `myReferenceField` case the existing tests pin — while rejecting the false positives. Also complete two under-specified mocks in test_title_validation that omitted `slot_uri` (their sibling title mock already set it to None); real slots always carry slot_uri as str|None, never an auto-MagicMock. Co-Authored-By: Claude Fable 5 --- .../field_detection.py | 45 +++++++++++++++++- tests/test_field_detection.py | 47 +++++++++++++++++++ tests/test_title_validation.py | 2 + 3 files changed, 92 insertions(+), 2 deletions(-) diff --git a/src/linkml_reference_validator/field_detection.py b/src/linkml_reference_validator/field_detection.py index 6d30718..42534f1 100644 --- a/src/linkml_reference_validator/field_detection.py +++ b/src/linkml_reference_validator/field_detection.py @@ -29,6 +29,7 @@ full URIs before matching. """ +import re from dataclasses import dataclass from typing import TYPE_CHECKING, Callable, Optional, Protocol @@ -36,6 +37,26 @@ from curies import Converter +def _uri_tokens(uri: str) -> set[str]: + """Split a URI/CURIE into lowercased word tokens. + + Splits on camelCase boundaries and any run of non-alphanumeric characters, + so a generic term can be matched as a whole word rather than as a bare + substring (which would flag ``user_preference`` or ``dereference`` as + containing "reference"). + + Examples: + >>> sorted(_uri_tokens("http://example.org/myReferenceField")) + ['example', 'field', 'http', 'my', 'org', 'reference'] + >>> sorted(_uri_tokens("test:user_preference")) + ['preference', 'test', 'user'] + >>> sorted(_uri_tokens("dcterms:references")) + ['dcterms', 'references'] + """ + spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", " ", uri) + return {tok.lower() for tok in re.split(r"[^A-Za-z0-9]+", spaced) if tok} + + # ============================================================================= # URI Constants # ============================================================================= @@ -126,20 +147,32 @@ class ReferenceURIs: LEGACY_LINKML: str = "https://w3id.org/linkml/authoritative_reference" LEGACY_LINKML_PREFIXED: str = "linkml:authoritative_reference" - # Additional patterns to match (substrings) + # Unambiguous patterns matched as substrings (e.g. dcterms:source has no + # generic single-word form to confuse). MATCH_PATTERNS: tuple[str, ...] = ( "dcterms:references", "dc/terms/references", "dcterms:source", "dc/terms/source", "authoritative_reference", + ) + + # Generic terms matched as whole word tokens, NOT bare substrings, so that + # "user_preference" / "dereference" are not mistaken for references. + TOKEN_TERMS: tuple[str, ...] = ( "reference", + "references", ) @classmethod def is_reference_uri(cls, uri: str) -> bool: """Check if a URI identifies a reference field. + Specific Dublin Core / legacy URIs are matched as substrings. The + generic word "reference(s)" is matched only as a whole token, so URIs + that merely contain those letters (``user_preference``, ``dereference``) + are not misclassified. + Args: uri: URI string to check (can be full or prefixed) @@ -155,11 +188,19 @@ def is_reference_uri(cls, uri: str) -> bool: True >>> ReferenceURIs.is_reference_uri("https://w3id.org/linkml/authoritative_reference") True + >>> ReferenceURIs.is_reference_uri("http://example.org/myReferenceField") + True >>> ReferenceURIs.is_reference_uri("oa:exact") False + >>> ReferenceURIs.is_reference_uri("test:user_preference") + False + >>> ReferenceURIs.is_reference_uri("ex:dereference") + False """ uri_lower = uri.lower() - return any(pattern in uri_lower for pattern in cls.MATCH_PATTERNS) + if any(pattern in uri_lower for pattern in cls.MATCH_PATTERNS): + return True + return bool(_uri_tokens(uri) & set(cls.TOKEN_TERMS)) @dataclass(frozen=True) diff --git a/tests/test_field_detection.py b/tests/test_field_detection.py index 93e43e8..895c080 100644 --- a/tests/test_field_detection.py +++ b/tests/test_field_detection.py @@ -166,6 +166,53 @@ def test_multiple_implements(self): assert is_excerpt_slot(slot) is True +class TestReferenceURITokenBoundaries: + """The generic 'reference' term must match as a token, not a bare substring. + + A bare substring match flags unrelated URIs like ``user_preference`` or + ``dereference`` (both contain the letters 'reference'), which would cause + the plugin to treat those slots as authoritative references. Matching the + word as a camelCase / separator-delimited token avoids the false positives + while still recognising real reference fields. + """ + + @pytest.mark.parametrize( + "uri", + [ + "test:user_preference", + "ex:dereference", + "http://example.org/userPreference", + "schema:preferenceOrder", + ], + ) + def test_non_reference_terms_rejected(self, uri: str): + """URIs where 'reference' is only a substring of another word.""" + assert ReferenceURIs.is_reference_uri(uri) is False + + @pytest.mark.parametrize( + "uri", + [ + # Specific canonical / legacy forms must keep matching + "dcterms:references", + "http://purl.org/dc/terms/references", + "dcterms:source", + "linkml:authoritative_reference", + # Generic 'reference' token in various shapes + "http://example.org/myReferenceField", + "test:reference", + "ex:cross_reference", + ], + ) + def test_real_reference_terms_accepted(self, uri: str): + """Genuine reference fields must still be detected.""" + assert ReferenceURIs.is_reference_uri(uri) is True + + def test_slot_with_preference_uri_not_a_reference(self): + """A slot whose URI merely contains 'preference' is not a reference.""" + slot = Mock(implements=["test:user_preference"], slot_uri=None) + assert is_reference_slot(slot) is False + + class TestIsReferenceSlot: """Tests for is_reference_slot function.""" diff --git a/tests/test_title_validation.py b/tests/test_title_validation.py index 3d56f97..217d45f 100644 --- a/tests/test_title_validation.py +++ b/tests/test_title_validation.py @@ -318,10 +318,12 @@ def test_validate_with_title_field(self, plugin, mocker): # Setup schema view mock_slot_ref = mocker.MagicMock() mock_slot_ref.implements = ["linkml:authoritative_reference"] + mock_slot_ref.slot_uri = None mock_slot_ref.range = None mock_slot_excerpt = mocker.MagicMock() mock_slot_excerpt.implements = ["linkml:excerpt"] + mock_slot_excerpt.slot_uri = None mock_slot_excerpt.range = None mock_slot_title = mocker.MagicMock()