From 84af070982fc81fa14a5419b81112b3153b15797 Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Tue, 19 May 2026 18:16:30 -0500 Subject: [PATCH] fix: Disable semantic_check on populate antijoin (parallels #1383) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The same fix that #1383 applied to the Job table's antijoin in refresh() is now applied to AutoPopulate._populate_direct's antijoin and the progress() fallback path. The two-arg subtract `key_source - self.proj()` triggers QueryExpression.__sub__ which calls .restrict(Not(...)) with semantic_check=True by default. The semantic-check requirement is wrong here: this antijoin is a plain set-difference, not a join — we ask "which key_source rows aren't yet in self." Whether the same-named PK attribute carries the same source-table lineage tag on both sides is irrelevant. Where it bites: dj.Imported / dj.Computed tables whose primary key is fully inherited from a single FK, with no own-table PK attributes. On those, self.proj() returns the PK attribute with lineage=None (or pointing to self rather than the FK parent), while key_source's matching attribute carries the parent's lineage tag. The semantic-check fails with: Cannot join on attribute 'X': different lineages (schema.parent.X vs None). Use .proj() to rename one of the attributes. This pattern is legitimate ("one row downstream per parent row, no intermediate ID") but rare in typical Elements / SciOps pipelines, which extend the inherited PK with own-table attributes (trial_id, experiment_id, etc.) that anchor proj()'s lineage. That's why the existing #1405 test suite didn't surface it. Changes: - src/datajoint/autopopulate.py - Import Not from .condition at module top. - _populate_direct: replace `(LHS - self.proj())` with `LHS.restrict(Not(self.proj()), semantic_check=False)`. - progress(): same swap on the no-common-attrs fallback branch. 
- tests/integration/test_autopopulate.py - New test_populate_antijoin_fk_inherited_pk regression test: Spec(Manual) -> Item(Imported with only -> Spec) — the minimal shape that triggers the bug. Without the fix Item.populate() raises DataJointError; with the fix it populates correctly, progress() reports correct counts, and partial-then-full populate works. Stacked on top of #1452 (the secrets-loading + dead-code fix); rebase to master after that lands. --- src/datajoint/autopopulate.py | 11 ++++- tests/integration/test_autopopulate.py | 58 ++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/src/datajoint/autopopulate.py b/src/datajoint/autopopulate.py index 24d6b17aa..2c57e2fa5 100644 --- a/src/datajoint/autopopulate.py +++ b/src/datajoint/autopopulate.py @@ -11,6 +11,7 @@ import traceback from typing import TYPE_CHECKING, Any, Generator +from .condition import Not from .errors import DataJointError, LostConnectionError from .expression import AndList, QueryExpression @@ -401,7 +402,12 @@ def _populate_direct( """ from tqdm import tqdm - keys = (self._jobs_to_do(restrictions) - self.proj()).keys() + # Disable semantic_check on the antijoin: when self has FK-inherited + # PK attributes, self.proj() may carry attribute lineages that don't + # match key_source's (same attribute, different source-table tag). + # The set-difference itself doesn't care about lineage — we just want + # rows in key_source that aren't yet in self. + keys = self._jobs_to_do(restrictions).restrict(Not(self.proj()), semantic_check=False).keys() logger.debug("Found %d keys to populate" % len(keys)) @@ -702,7 +708,8 @@ def progress(self, *restrictions: Any, display: bool = False) -> tuple[int, int] if not common_attrs: # No common attributes - fall back to two-query method total = len(todo) - remaining = len(todo - self.proj()) + # Same lineage caveat as in _populate_direct — disable semantic_check. 
+ remaining = len(todo.restrict(Not(self.proj()), semantic_check=False)) else: # Build a single query that computes both total and remaining # Using LEFT JOIN with COUNT(DISTINCT) to handle 1:many relationships diff --git a/tests/integration/test_autopopulate.py b/tests/integration/test_autopopulate.py index 02ba69d6b..e3913e120 100644 --- a/tests/integration/test_autopopulate.py +++ b/tests/integration/test_autopopulate.py @@ -236,6 +236,64 @@ def make(self, key): test_schema.drop(prompt=False) +def test_populate_antijoin_fk_inherited_pk(prefix, connection_test): + """Regression test: populate antijoin on a table whose PK is fully FK-inherited. + + Reproduces the lineage-mismatch failure that hits ``Imported`` or + ``Computed`` tables whose primary key consists entirely of attributes + inherited via a foreign key, with no own-table PK attributes. + + Without the ``semantic_check=False`` on the populate antijoin, the + subtraction ``key_source - self.proj()`` raises:: + + DataJointError: Cannot join on attribute 'spec_id': different lineages + (schema.spec.spec_id vs None). Use .proj() to rename one of the attributes. + + The set-difference doesn't actually need lineage matching — it just + asks which key_source rows aren't yet in ``self``. + """ + test_schema = dj.Schema(f"{prefix}_antijoin_fk_pk", connection=connection_test) + + @test_schema + class Spec(dj.Manual): + definition = """ + spec_id : int32 + --- + label : varchar(30) + """ + + @test_schema + class Item(dj.Imported): + definition = """ + -> Spec + --- + payload : varchar(60) + """ + + def make(self, key): + label = (Spec & key).fetch1("label") + self.insert1(dict(key, payload=f"made:{label}")) + + try: + Spec.insert([(1, "alpha"), (2, "beta"), (3, "gamma")]) + + # Before the fix this raised DataJointError on the antijoin. 
+ Item.populate(max_calls=2) + assert len(Item) == 2 + + remaining, total = Item.progress() + assert total == 3 + assert remaining == 1 + + Item.populate() + assert len(Item) == 3 + remaining, total = Item.progress() + assert remaining == 0 + assert total == 3 + finally: + test_schema.drop(prompt=False) + + def test_load_dependencies(prefix, connection_test): schema = dj.Schema(f"{prefix}_load_dependencies_populate", connection=connection_test)