diff --git a/.semversioner/next-release/patch-20260428025412857613.json b/.semversioner/next-release/patch-20260428025412857613.json new file mode 100644 index 0000000000..fe28d95a5c --- /dev/null +++ b/.semversioner/next-release/patch-20260428025412857613.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Fix entity deduplication to use (title, type) compound key instead of title only, preserving entities with the same name but different types" +} diff --git a/packages/graphrag/graphrag/index/operations/finalize_entities.py b/packages/graphrag/graphrag/index/operations/finalize_entities.py index a0121a5b72..f4c096cd08 100644 --- a/packages/graphrag/graphrag/index/operations/finalize_entities.py +++ b/packages/graphrag/graphrag/index/operations/finalize_entities.py @@ -17,7 +17,7 @@ async def finalize_entities( ) -> list[dict[str, Any]]: """Read entity rows, enrich with degree, and write back. - Streams through the entities table, deduplicates by title, + Streams through the entities table, deduplicates by (title, type), assigns degree from the pre-computed degree map, and writes each finalized row back to the same table (safe when using truncate=True, which reads from the original and writes to @@ -36,14 +36,15 @@ async def finalize_entities( Sample of up to 5 entity rows for logging. """ sample_rows: list[dict[str, Any]] = [] - seen_titles: set[str] = set() + seen_entities: set[tuple[str, str]] = set() human_readable_id = 0 async for row in entities_table: title = row.get("title") - if not title or title in seen_titles: + entity_type = row.get("type", "") + if not title or (title, entity_type) in seen_entities: continue - seen_titles.add(title) + seen_entities.add((title, entity_type)) row["degree"] = degree_map.get(title, 0) row["human_readable_id"] = human_readable_id row["id"] = str(uuid4()) diff --git a/tests/unit/indexing/test_finalize_graph.py b/tests/unit/indexing/test_finalize_graph.py index 20daa49666..033b076fa2 100644 --- a/tests/unit/indexing/test_finalize_graph.py +++ b/tests/unit/indexing/test_finalize_graph.py @@ -192,7 +192,7 @@ async def test_missing_degree_defaults_to_zero(self): assert table.written[0]["degree"] == 0 async def test_deduplicates_by_title(self): - """Duplicate titles should be skipped.""" + """Duplicate (title, type) pairs should be skipped.""" table = FakeTable([ _make_entity_row("A"), _make_entity_row("A"), @@ -205,6 +205,22 @@ async def test_deduplicates_by_title(self): titles = [r["title"] for r in table.written] assert titles == ["A", "B"] + async def test_preserves_same_title_different_type(self): + """Entities with the same title but different types must not be merged.""" + table = FakeTable([ + _make_entity_row("Python", entity_type="LANGUAGE"), + _make_entity_row("Python", entity_type="CONCEPT"), + _make_entity_row("Java", entity_type="LANGUAGE"), + ]) + degree_map = {"Python": 2, "Java": 1} + await finalize_entities(table, degree_map) + + assert len(table.written) == 3 + pairs = [(r["title"], r["type"]) for r in table.written] + assert ("Python", "LANGUAGE") in pairs + assert ("Python", "CONCEPT") in pairs + assert ("Java", "LANGUAGE") in pairs + async def test_skips_empty_title(self): """Rows with empty or missing title should be skipped.""" table = FakeTable([