diff --git a/sqlite_utils/db.py b/sqlite_utils/db.py index ed3fc7af..3c497d07 100644 --- a/sqlite_utils/db.py +++ b/sqlite_utils/db.py @@ -2190,24 +2190,35 @@ def extract( ) lookup_columns = [(rename.get(col) or col) for col in columns] lookup_table.create_index(lookup_columns, unique=True, if_not_exists=True) + # Don't create a lookup row for the all-NULL combination: a row whose + # extracted columns are entirely NULL represents "no value", so it + # should keep a NULL foreign key rather than point at a NULL lookup row + # (#186). Rows with a partial NULL (some extracted columns set, others + # NULL) are a genuine distinct value and are still extracted. self.db.execute( - "INSERT OR IGNORE INTO {} ({lookup_columns}) SELECT DISTINCT {table_cols} FROM {}".format( + "INSERT OR IGNORE INTO {} ({lookup_columns}) SELECT DISTINCT {table_cols} FROM {} WHERE NOT ({all_null})".format( quote_identifier(table), quote_identifier(self.name), lookup_columns=", ".join(quote_identifier(c) for c in lookup_columns), table_cols=", ".join(quote_identifier(c) for c in columns), + all_null=" AND ".join( + "{} IS NULL".format(quote_identifier(c)) for c in columns + ), ) ) # Now add the new fk_column self.add_column(magic_lookup_column, int) - # And populate it + # And populate it. The trailing ``WHERE NOT ()`` leaves + # all-NULL source rows with a NULL foreign key even when the lookup table + # already contains an all-NULL row (e.g. a reused or legacy lookup table) + # — the IS-based join would otherwise match that row and link to it (#186). self.db.execute( - "UPDATE {} SET {} = (SELECT id FROM {} WHERE {where})".format( - quote_identifier(self.name), - quote_identifier(magic_lookup_column), - quote_identifier(table), + "UPDATE {table_name} SET {fk} = (SELECT id FROM {lookup} WHERE {where}) WHERE NOT ({all_null})".format( + table_name=quote_identifier(self.name), + fk=quote_identifier(magic_lookup_column), + lookup=quote_identifier(table), where=" AND ".join( "{}.{} IS {}.{}".format( quote_identifier(self.name), @@ -2217,6 +2228,12 @@ def extract( ) for column in columns ), + all_null=" AND ".join( + "{}.{} IS NULL".format( + quote_identifier(self.name), quote_identifier(column) + ) + for column in columns + ), ) ) # Figure out the right column order diff --git a/tests/test_extract.py b/tests/test_extract.py index d24c5974..130dee2d 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -177,6 +177,8 @@ def test_extract_error_on_incompatible_existing_lookup_table(fresh_db): def test_extract_works_with_null_values(fresh_db): + # A NULL extracted value represents "no value", so it should keep a NULL + # foreign key rather than be turned into a lookup row of its own (#186). fresh_db["listens"].insert_all( [ {"id": 1, "track_title": "foo", "album_title": "bar"}, @@ -189,9 +191,70 @@ def test_extract_works_with_null_values(fresh_db): ) assert list(fresh_db["listens"].rows) == [ {"id": 1, "track_title": "foo", "album_id": 1}, - {"id": 2, "track_title": "baz", "album_id": 2}, + {"id": 2, "track_title": "baz", "album_id": None}, ] assert list(fresh_db["albums"].rows) == [ {"id": 1, "album_title": "bar"}, - {"id": 2, "album_title": None}, + ] + + +def test_extract_does_not_create_lookup_row_for_all_null(fresh_db): + # Single-column: every NULL keeps a NULL fk and no NULL lookup row is made. + fresh_db["creatures"].insert_all( + [ + {"id": 1, "name": "Simon", "type": None}, + {"id": 2, "name": "Natalie", "type": None}, + {"id": 3, "name": "Cleo", "type": "dog"}, + ], + pk="id", + ) + fresh_db["creatures"].extract("type") + assert list(fresh_db["creatures"].rows) == [ + {"id": 1, "name": "Simon", "type_id": None}, + {"id": 2, "name": "Natalie", "type_id": None}, + {"id": 3, "name": "Cleo", "type_id": 1}, + ] + assert list(fresh_db["type"].rows) == [{"id": 1, "type": "dog"}] + + +def test_extract_multi_column_keeps_partial_null_but_not_all_null(fresh_db): + # Multi-column: a row whose extracted columns are *all* NULL keeps a NULL + # fk, but a partial-NULL combination is a genuine distinct value and is + # still extracted (and shared between matching rows) (#186). + fresh_db["t"].insert_all( + [ + {"id": 1, "a": "x", "b": None}, + {"id": 2, "a": "x", "b": None}, + {"id": 3, "a": None, "b": None}, + {"id": 4, "a": "y", "b": "z"}, + ], + pk="id", + ) + fresh_db["t"].extract(["a", "b"], table="ab", fk_column="ab_id") + rows = list(fresh_db["t"].rows) + assert rows[2]["ab_id"] is None # all-NULL row -> NULL fk + assert rows[0]["ab_id"] == rows[1]["ab_id"] is not None # partial NULL shared + assert rows[3]["ab_id"] not in (None, rows[0]["ab_id"]) + # The lookup table must not contain an all-NULL row. + assert not any(row["a"] is None and row["b"] is None for row in fresh_db["ab"].rows) + + +def test_extract_all_null_stays_null_with_preexisting_null_lookup_row(fresh_db): + # Reusing a lookup table that already contains an all-NULL row (e.g. created + # by an older sqlite-utils version) must still leave all-NULL source rows with + # a NULL foreign key — the IS-based join must not link them to that row (#186). + fresh_db["type"].insert_all( + [{"id": 1, "type": None}, {"id": 2, "type": "dog"}], pk="id" + ) + fresh_db["creatures"].insert_all( + [ + {"id": 1, "name": "Simon", "type": None}, + {"id": 2, "name": "Cleo", "type": "dog"}, + ], + pk="id", + ) + fresh_db["creatures"].extract("type") + assert list(fresh_db["creatures"].rows) == [ + {"id": 1, "name": "Simon", "type_id": None}, + {"id": 2, "name": "Cleo", "type_id": 2}, ]