From 235cce7df1caa0086e29ca6b5151974c67a43238 Mon Sep 17 00:00:00 2001
From: Chris Mungall <cjm@berkeleybop.org>
Date: Fri, 12 Jun 2026 13:02:20 -0700
Subject: [PATCH] Match the generic "reference" term as a token, not a bare
 substring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ReferenceURIs.is_reference_uri matched the bare substring "reference", so any
URI that merely contained those letters was classified as an authoritative
reference field — e.g. `user_preference`, `dereference`, `preferenceOrder`.
A mis-detected reference slot can then trigger spurious reference fetches and
"could not fetch reference" errors on unrelated fields.

Tokenize the URI on camelCase boundaries and separators, and match the generic
word "reference"/"references" only as a whole token. The specific Dublin Core /
legacy URIs (dcterms:source, authoritative_reference, ...) are still matched
as substrings. This preserves real detections — including the camelCase
`myReferenceField` case the existing tests pin — while rejecting the false
positives.

Also complete two under-specified mocks in test_title_validation that omitted
`slot_uri` (their sibling title mock already set it to None); real slots always
carry slot_uri as str|None, never an auto-MagicMock.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../field_detection.py                        | 45 +++++++++++++++++-
 tests/test_field_detection.py                 | 47 +++++++++++++++++++
 tests/test_title_validation.py                |  2 +
 3 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/src/linkml_reference_validator/field_detection.py b/src/linkml_reference_validator/field_detection.py
index 6d30718..42534f1 100644
--- a/src/linkml_reference_validator/field_detection.py
+++ b/src/linkml_reference_validator/field_detection.py
@@ -29,6 +29,7 @@
 full URIs before matching.
 """
 
+import re
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Callable, Optional, Protocol
 
@@ -36,6 +37,26 @@
     from curies import Converter
 
 
+def _uri_tokens(uri: str) -> set[str]:
+    """Split a URI/CURIE into lowercased word tokens.
+
+    Splits on camelCase boundaries (including the end of an acronym run, as in
+    ``PMIDReference``) and on any run of non-alphanumeric characters, so a
+    generic term can be matched as a whole word rather than as a bare substring
+    (which would flag ``user_preference`` or ``dereference``).
+
+    Examples:
+        >>> sorted(_uri_tokens("http://example.org/myReferenceField"))
+        ['example', 'field', 'http', 'my', 'org', 'reference']
+        >>> sorted(_uri_tokens("test:user_preference"))
+        ['preference', 'test', 'user']
+        >>> sorted(_uri_tokens("ex:PMIDReference"))
+        ['ex', 'pmid', 'reference']
+    """
+    spaced = re.sub(r"(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", " ", uri)
+    return {tok.lower() for tok in re.split(r"[^A-Za-z0-9]+", spaced) if tok}
+
+
 # =============================================================================
 # URI Constants
 # =============================================================================
@@ -126,20 +147,32 @@ class ReferenceURIs:
     LEGACY_LINKML: str = "https://w3id.org/linkml/authoritative_reference"
     LEGACY_LINKML_PREFIXED: str = "linkml:authoritative_reference"
 
-    # Additional patterns to match (substrings)
+    # Unambiguous patterns matched as substrings (e.g. dcterms:source has no
+    # generic single-word form to confuse).
     MATCH_PATTERNS: tuple[str, ...] = (
         "dcterms:references",
         "dc/terms/references",
         "dcterms:source",
         "dc/terms/source",
         "authoritative_reference",
+    )
+
+    # Generic terms matched as whole word tokens, NOT bare substrings, so that
+    # "user_preference" / "dereference" are not mistaken for references.
+    TOKEN_TERMS: tuple[str, ...] = (
         "reference",
+        "references",
     )
 
     @classmethod
     def is_reference_uri(cls, uri: str) -> bool:
         """Check if a URI identifies a reference field.
 
+        Specific Dublin Core / legacy URIs are matched as substrings. The
+        generic word "reference(s)" is matched only as a whole token, so URIs
+        that merely contain those letters (``user_preference``, ``dereference``)
+        are not misclassified.
+
         Args:
             uri: URI string to check (can be full or prefixed)
 
@@ -155,11 +188,19 @@ def is_reference_uri(cls, uri: str) -> bool:
             True
             >>> ReferenceURIs.is_reference_uri("https://w3id.org/linkml/authoritative_reference")
             True
+            >>> ReferenceURIs.is_reference_uri("http://example.org/myReferenceField")
+            True
             >>> ReferenceURIs.is_reference_uri("oa:exact")
             False
+            >>> ReferenceURIs.is_reference_uri("test:user_preference")
+            False
+            >>> ReferenceURIs.is_reference_uri("ex:dereference")
+            False
         """
         uri_lower = uri.lower()
-        return any(pattern in uri_lower for pattern in cls.MATCH_PATTERNS)
+        if any(pattern in uri_lower for pattern in cls.MATCH_PATTERNS):
+            return True
+        return bool(_uri_tokens(uri) & set(cls.TOKEN_TERMS))
 
 
 @dataclass(frozen=True)
diff --git a/tests/test_field_detection.py b/tests/test_field_detection.py
index 93e43e8..895c080 100644
--- a/tests/test_field_detection.py
+++ b/tests/test_field_detection.py
@@ -166,6 +166,53 @@ def test_multiple_implements(self):
         assert is_excerpt_slot(slot) is True
 
 
+class TestReferenceURITokenBoundaries:
+    """The generic 'reference' term must match as a token, not a bare substring.
+
+    A bare substring match flags unrelated URIs like ``user_preference`` or
+    ``dereference`` (both contain the letters 'reference'), which would cause
+    the plugin to treat those slots as authoritative references. Matching the
+    word as a camelCase / separator-delimited token avoids the false positives
+    while still recognising real reference fields.
+    """
+
+    @pytest.mark.parametrize(
+        "uri",
+        [
+            "test:user_preference",
+            "ex:dereference",
+            "http://example.org/userPreference",
+            "schema:preferenceOrder",
+        ],
+    )
+    def test_non_reference_terms_rejected(self, uri: str):
+        """URIs where 'reference' is only part of a longer word are rejected."""
+        assert ReferenceURIs.is_reference_uri(uri) is False
+
+    @pytest.mark.parametrize(
+        "uri",
+        [
+            # Canonical / legacy forms (substring patterns) must keep matching
+            "dcterms:references",
+            "http://purl.org/dc/terms/references",
+            "dcterms:source",
+            "linkml:authoritative_reference",
+            # Generic 'reference' as a whole token: camelCase, bare, snake_case
+            "http://example.org/myReferenceField",
+            "test:reference",
+            "ex:cross_reference",
+        ],
+    )
+    def test_real_reference_terms_accepted(self, uri: str):
+        """Genuine reference fields must still be detected."""
+        assert ReferenceURIs.is_reference_uri(uri) is True
+
+    def test_slot_with_preference_uri_not_a_reference(self):
+        """A slot implementing a 'user_preference' URI is not a reference slot."""
+        slot = Mock(implements=["test:user_preference"], slot_uri=None)
+        assert is_reference_slot(slot) is False
+
+
 class TestIsReferenceSlot:
     """Tests for is_reference_slot function."""
 
diff --git a/tests/test_title_validation.py b/tests/test_title_validation.py
index 3d56f97..217d45f 100644
--- a/tests/test_title_validation.py
+++ b/tests/test_title_validation.py
@@ -318,10 +318,12 @@ def test_validate_with_title_field(self, plugin, mocker):
         # Setup schema view
         mock_slot_ref = mocker.MagicMock()
         mock_slot_ref.implements = ["linkml:authoritative_reference"]
+        mock_slot_ref.slot_uri = None
         mock_slot_ref.range = None
 
         mock_slot_excerpt = mocker.MagicMock()
         mock_slot_excerpt.implements = ["linkml:excerpt"]
+        mock_slot_excerpt.slot_uri = None
         mock_slot_excerpt.range = None
 
         mock_slot_title = mocker.MagicMock()