Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20260428025412857613.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Fix entity deduplication to use (title, type) compound key instead of title only, preserving entities with the same name but different types"
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ async def finalize_entities(
) -> list[dict[str, Any]]:
"""Read entity rows, enrich with degree, and write back.

Streams through the entities table, deduplicates by title,
Streams through the entities table, deduplicates by (title, type),
assigns degree from the pre-computed degree map, and writes
each finalized row back to the same table (safe when using
truncate=True, which reads from the original and writes to
Expand All @@ -36,14 +36,15 @@ async def finalize_entities(
Sample of up to 5 entity rows for logging.
"""
sample_rows: list[dict[str, Any]] = []
seen_titles: set[str] = set()
seen_entities: set[tuple[str, str]] = set()
human_readable_id = 0

async for row in entities_table:
title = row.get("title")
if not title or title in seen_titles:
entity_type = row.get("type", "")
if not title or (title, entity_type) in seen_entities:
continue
seen_titles.add(title)
seen_entities.add((title, entity_type))
row["degree"] = degree_map.get(title, 0)
row["human_readable_id"] = human_readable_id
row["id"] = str(uuid4())
Expand Down
18 changes: 17 additions & 1 deletion tests/unit/indexing/test_finalize_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ async def test_missing_degree_defaults_to_zero(self):
assert table.written[0]["degree"] == 0

async def test_deduplicates_by_title(self):
"""Duplicate titles should be skipped."""
"""Duplicate (title, type) pairs should be skipped."""
table = FakeTable([
_make_entity_row("A"),
_make_entity_row("A"),
Expand All @@ -205,6 +205,22 @@ async def test_deduplicates_by_title(self):
titles = [r["title"] for r in table.written]
assert titles == ["A", "B"]

async def test_preserves_same_title_different_type(self):
    """Keep rows that share a title when their types differ.

    Two "Python" rows (LANGUAGE and CONCEPT) and one "Java" row are run
    through ``finalize_entities``; all three must be written back.
    """
    expected_pairs = [
        ("Python", "LANGUAGE"),
        ("Python", "CONCEPT"),
        ("Java", "LANGUAGE"),
    ]
    rows = [
        _make_entity_row(title, entity_type=kind)
        for title, kind in expected_pairs
    ]
    table = FakeTable(rows)

    await finalize_entities(table, {"Python": 2, "Java": 1})

    # Nothing may be dropped: the count must match the input exactly.
    assert len(table.written) == 3
    written_pairs = [(row["title"], row["type"]) for row in table.written]
    for pair in expected_pairs:
        assert pair in written_pairs

async def test_skips_empty_title(self):
"""Rows with empty or missing title should be skipped."""
table = FakeTable([
Expand Down