From 2ba6eb126322e39825e51a065ab4f5f2cfb51655 Mon Sep 17 00:00:00 2001 From: Johnson K C Date: Sun, 7 Jun 2026 16:23:58 -0400 Subject: [PATCH 1/2] Don't extract NULL values into a lookup row (#186) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `table.extract()` built its lookup table with `INSERT OR IGNORE ... SELECT DISTINCT FROM `, which included the all-NULL combination. That created a spurious lookup row for NULL and pointed every NULL source row at it, instead of leaving those rows with a NULL foreign key. Before: db["creatures"].extract("type") # type lookup: [{"id": 1, "type": None}, {"id": 2, "type": "dog"}] # creatures: Simon -> type_id=1, Natalie -> type_id=1, Cleo -> type_id=2 After: # type lookup: [{"id": 1, "type": "dog"}] # creatures: Simon -> type_id=None, Natalie -> type_id=None, Cleo -> type_id=1 A row whose extracted columns are entirely NULL represents "no value", so it now keeps a NULL foreign key and no lookup row is created for it. The fix adds a `WHERE NOT (IS NULL AND ...)` guard to the lookup INSERT; the existing `IS`-based foreign-key UPDATE then leaves those rows NULL automatically (the subquery finds no matching lookup row). For multi-column extracts, only the fully-NULL combination is skipped — a partial-NULL combination (some extracted columns set, others NULL) is a genuine distinct value and is still extracted and shared between matching rows. Updates test_extract_works_with_null_values to assert the corrected behaviour and adds regression tests for the single-column and multi-column cases. Co-Authored-By: Claude Opus 4.8 (1M context) --- sqlite_utils/db.py | 10 ++++++++- tests/test_extract.py | 48 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/sqlite_utils/db.py b/sqlite_utils/db.py index ed3fc7af..920a5959 100644 --- a/sqlite_utils/db.py +++ b/sqlite_utils/db.py @@ -2190,12 +2190,20 @@ def extract( ) lookup_columns = [(rename.get(col) or col) for col in columns] lookup_table.create_index(lookup_columns, unique=True, if_not_exists=True) + # Don't create a lookup row for the all-NULL combination: a row whose + # extracted columns are entirely NULL represents "no value", so it + # should keep a NULL foreign key rather than point at a NULL lookup row + # (#186). Rows with a partial NULL (some extracted columns set, others + # NULL) are a genuine distinct value and are still extracted. self.db.execute( - "INSERT OR IGNORE INTO {} ({lookup_columns}) SELECT DISTINCT {table_cols} FROM {}".format( + "INSERT OR IGNORE INTO {} ({lookup_columns}) SELECT DISTINCT {table_cols} FROM {} WHERE NOT ({all_null})".format( quote_identifier(table), quote_identifier(self.name), lookup_columns=", ".join(quote_identifier(c) for c in lookup_columns), table_cols=", ".join(quote_identifier(c) for c in columns), + all_null=" AND ".join( + "{} IS NULL".format(quote_identifier(c)) for c in columns + ), ) ) diff --git a/tests/test_extract.py b/tests/test_extract.py index d24c5974..a0622bac 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -177,6 +177,8 @@ def test_extract_error_on_incompatible_existing_lookup_table(fresh_db): def test_extract_works_with_null_values(fresh_db): + # A NULL extracted value represents "no value", so it should keep a NULL + # foreign key rather than be turned into a lookup row of its own (#186). fresh_db["listens"].insert_all( [ {"id": 1, "track_title": "foo", "album_title": "bar"}, @@ -189,9 +191,51 @@ def test_extract_works_with_null_values(fresh_db): ) assert list(fresh_db["listens"].rows) == [ {"id": 1, "track_title": "foo", "album_id": 1}, - {"id": 2, "track_title": "baz", "album_id": 2}, + {"id": 2, "track_title": "baz", "album_id": None}, ] assert list(fresh_db["albums"].rows) == [ {"id": 1, "album_title": "bar"}, - {"id": 2, "album_title": None}, ] + + +def test_extract_does_not_create_lookup_row_for_all_null(fresh_db): + # Single-column: every NULL keeps a NULL fk and no NULL lookup row is made. + fresh_db["creatures"].insert_all( + [ + {"id": 1, "name": "Simon", "type": None}, + {"id": 2, "name": "Natalie", "type": None}, + {"id": 3, "name": "Cleo", "type": "dog"}, + ], + pk="id", + ) + fresh_db["creatures"].extract("type") + assert list(fresh_db["creatures"].rows) == [ + {"id": 1, "name": "Simon", "type_id": None}, + {"id": 2, "name": "Natalie", "type_id": None}, + {"id": 3, "name": "Cleo", "type_id": 1}, + ] + assert list(fresh_db["type"].rows) == [{"id": 1, "type": "dog"}] + + +def test_extract_multi_column_keeps_partial_null_but_not_all_null(fresh_db): + # Multi-column: a row whose extracted columns are *all* NULL keeps a NULL + # fk, but a partial-NULL combination is a genuine distinct value and is + # still extracted (and shared between matching rows) (#186). + fresh_db["t"].insert_all( + [ + {"id": 1, "a": "x", "b": None}, + {"id": 2, "a": "x", "b": None}, + {"id": 3, "a": None, "b": None}, + {"id": 4, "a": "y", "b": "z"}, + ], + pk="id", + ) + fresh_db["t"].extract(["a", "b"], table="ab", fk_column="ab_id") + rows = list(fresh_db["t"].rows) + assert rows[2]["ab_id"] is None # all-NULL row -> NULL fk + assert rows[0]["ab_id"] == rows[1]["ab_id"] is not None # partial NULL shared + assert rows[3]["ab_id"] not in (None, rows[0]["ab_id"]) + # The lookup table must not contain an all-NULL row. + assert not any( + row["a"] is None and row["b"] is None for row in fresh_db["ab"].rows + ) From 89fe8bdcd111b5967ecb18389cca47ee06eab8b4 Mon Sep 17 00:00:00 2001 From: Johnson K C Date: Mon, 8 Jun 2026 14:59:25 -0400 Subject: [PATCH 2/2] extract(): keep all-NULL rows unlinked even with a pre-existing NULL lookup row Follow-up to the #186 fix: skipping the all-NULL combination in the lookup INSERT is not enough on its own. When extract() reuses a lookup table that already contains an all-NULL row (e.g. one written by an older sqlite-utils version or created manually), the IS-based foreign-key UPDATE would still match that row and link all-NULL source rows to it. The UPDATE now carries a trailing `WHERE NOT (IS NULL AND ...)` so all-NULL source rows are never assigned a foreign key and keep the NULL that the freshly-added column starts with, regardless of the lookup table's existing contents. Adds test_extract_all_null_stays_null_with_preexisting_null_lookup_row. Co-Authored-By: Claude Opus 4.8 (1M context) --- sqlite_utils/db.py | 19 ++++++++++++++----- tests/test_extract.py | 23 +++++++++++++++++++++-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/sqlite_utils/db.py b/sqlite_utils/db.py index 920a5959..3c497d07 100644 --- a/sqlite_utils/db.py +++ b/sqlite_utils/db.py @@ -2210,12 +2210,15 @@ def extract( # Now add the new fk_column self.add_column(magic_lookup_column, int) - # And populate it + # And populate it. The trailing ``WHERE NOT ()`` leaves + # all-NULL source rows with a NULL foreign key even when the lookup table + # already contains an all-NULL row (e.g. a reused or legacy lookup table) + # — the IS-based join would otherwise match that row and link to it (#186). self.db.execute( - "UPDATE {} SET {} = (SELECT id FROM {} WHERE {where})".format( - quote_identifier(self.name), - quote_identifier(magic_lookup_column), - quote_identifier(table), + "UPDATE {table_name} SET {fk} = (SELECT id FROM {lookup} WHERE {where}) WHERE NOT ({all_null})".format( + table_name=quote_identifier(self.name), + fk=quote_identifier(magic_lookup_column), + lookup=quote_identifier(table), where=" AND ".join( "{}.{} IS {}.{}".format( quote_identifier(self.name), @@ -2225,6 +2228,12 @@ def extract( ) for column in columns ), + all_null=" AND ".join( + "{}.{} IS NULL".format( + quote_identifier(self.name), quote_identifier(column) + ) + for column in columns + ), ) ) # Figure out the right column order diff --git a/tests/test_extract.py b/tests/test_extract.py index a0622bac..130dee2d 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -236,6 +236,25 @@ def test_extract_multi_column_keeps_partial_null_but_not_all_null(fresh_db): assert rows[0]["ab_id"] == rows[1]["ab_id"] is not None # partial NULL shared assert rows[3]["ab_id"] not in (None, rows[0]["ab_id"]) # The lookup table must not contain an all-NULL row. - assert not any( - row["a"] is None and row["b"] is None for row in fresh_db["ab"].rows + assert not any(row["a"] is None and row["b"] is None for row in fresh_db["ab"].rows) + + +def test_extract_all_null_stays_null_with_preexisting_null_lookup_row(fresh_db): + # Reusing a lookup table that already contains an all-NULL row (e.g. created + # by an older sqlite-utils version) must still leave all-NULL source rows with + # a NULL foreign key — the IS-based join must not link them to that row (#186). + fresh_db["type"].insert_all( + [{"id": 1, "type": None}, {"id": 2, "type": "dog"}], pk="id" + ) + fresh_db["creatures"].insert_all( + [ + {"id": 1, "name": "Simon", "type": None}, + {"id": 2, "name": "Cleo", "type": "dog"}, + ], + pk="id", ) + fresh_db["creatures"].extract("type") + assert list(fresh_db["creatures"].rows) == [ + {"id": 1, "name": "Simon", "type_id": None}, + {"id": 2, "name": "Cleo", "type_id": 2}, + ]