From 90506759d7221b8127ee4ad146590b302f302c05 Mon Sep 17 00:00:00 2001 From: octo-patch Date: Tue, 28 Apr 2026 10:54:29 +0800 Subject: [PATCH] fix: deduplicate entities by (title, type) to preserve same-name different-type entities (fixes #1718) Previously, finalize_entities only tracked seen titles, causing entities with the same title but different type to be incorrectly deduplicated. Change the dedup key to the (title, type) compound tuple so that e.g. "Python" as a LANGUAGE and "Python" as a CONCEPT are both preserved. Co-Authored-By: Octopus --- .../patch-20260428025412857613.json | 4 ++++ .../index/operations/finalize_entities.py | 9 +++++---- tests/unit/indexing/test_finalize_graph.py | 18 +++++++++++++++++- 3 files changed, 26 insertions(+), 5 deletions(-) create mode 100644 .semversioner/next-release/patch-20260428025412857613.json diff --git a/.semversioner/next-release/patch-20260428025412857613.json b/.semversioner/next-release/patch-20260428025412857613.json new file mode 100644 index 0000000000..fe28d95a5c --- /dev/null +++ b/.semversioner/next-release/patch-20260428025412857613.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix entity deduplication to use (title, type) compound key instead of title only, preserving entities with the same name but different types" +} diff --git a/packages/graphrag/graphrag/index/operations/finalize_entities.py b/packages/graphrag/graphrag/index/operations/finalize_entities.py index a0121a5b72..f4c096cd08 100644 --- a/packages/graphrag/graphrag/index/operations/finalize_entities.py +++ b/packages/graphrag/graphrag/index/operations/finalize_entities.py @@ -17,7 +17,7 @@ async def finalize_entities( ) -> list[dict[str, Any]]: """Read entity rows, enrich with degree, and write back. - Streams through the entities table, deduplicates by title, + Streams through the entities table, deduplicates by (title, type), assigns degree from the pre-computed degree map, and writes each finalized row back to the same table (safe when using truncate=True, which reads from the original and writes to @@ -36,14 +36,15 @@ async def finalize_entities( Sample of up to 5 entity rows for logging. """ sample_rows: list[dict[str, Any]] = [] - seen_titles: set[str] = set() + seen_entities: set[tuple[str, str]] = set() human_readable_id = 0 async for row in entities_table: title = row.get("title") - if not title or title in seen_titles: + entity_type = row.get("type", "") + if not title or (title, entity_type) in seen_entities: continue - seen_titles.add(title) + seen_entities.add((title, entity_type)) row["degree"] = degree_map.get(title, 0) row["human_readable_id"] = human_readable_id row["id"] = str(uuid4()) diff --git a/tests/unit/indexing/test_finalize_graph.py b/tests/unit/indexing/test_finalize_graph.py index 20daa49666..033b076fa2 100644 --- a/tests/unit/indexing/test_finalize_graph.py +++ b/tests/unit/indexing/test_finalize_graph.py @@ -192,7 +192,7 @@ async def test_missing_degree_defaults_to_zero(self): assert table.written[0]["degree"] == 0 async def test_deduplicates_by_title(self): - """Duplicate titles should be skipped.""" + """Duplicate (title, type) pairs should be skipped.""" table = FakeTable([ _make_entity_row("A"), _make_entity_row("A"), @@ -205,6 +205,22 @@ async def test_deduplicates_by_title(self): titles = [r["title"] for r in table.written] assert titles == ["A", "B"] + async def test_preserves_same_title_different_type(self): + """Entities with the same title but different types must not be merged.""" + table = FakeTable([ + _make_entity_row("Python", entity_type="LANGUAGE"), + _make_entity_row("Python", entity_type="CONCEPT"), + _make_entity_row("Java", entity_type="LANGUAGE"), + ]) + degree_map = {"Python": 2, "Java": 1} + await finalize_entities(table, degree_map) + + assert len(table.written) == 3 + pairs = [(r["title"], r["type"]) for r in table.written] + assert ("Python", "LANGUAGE") in pairs + assert ("Python", "CONCEPT") in pairs + assert ("Java", "LANGUAGE") in pairs + async def test_skips_empty_title(self): """Rows with empty or missing title should be skipped.""" table = FakeTable([