Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 23 additions & 6 deletions sqlite_utils/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -2190,24 +2190,35 @@ def extract(
)
lookup_columns = [(rename.get(col) or col) for col in columns]
lookup_table.create_index(lookup_columns, unique=True, if_not_exists=True)
# Don't create a lookup row for the all-NULL combination: a row whose
# extracted columns are entirely NULL represents "no value", so it
# should keep a NULL foreign key rather than point at a NULL lookup row
# (#186). Rows with a partial NULL (some extracted columns set, others
# NULL) are a genuine distinct value and are still extracted.
self.db.execute(
"INSERT OR IGNORE INTO {} ({lookup_columns}) SELECT DISTINCT {table_cols} FROM {}".format(
"INSERT OR IGNORE INTO {} ({lookup_columns}) SELECT DISTINCT {table_cols} FROM {} WHERE NOT ({all_null})".format(

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Leave all-null rows unlinked with existing lookups

When extract() is pointed at an existing lookup table that already contains an all-NULL row (for example one created by a previous sqlite-utils version or manually), this WHERE only prevents inserting another all-NULL lookup row; the later UPDATE ... WHERE source.col IS lookup.col still matches the existing NULL row and sets the new foreign key to its id. Since reusing lookup tables is a supported path in this method, all-NULL source rows still fail to keep a NULL foreign key in that scenario.

Useful? React with 👍 / 👎.

quote_identifier(table),
quote_identifier(self.name),
lookup_columns=", ".join(quote_identifier(c) for c in lookup_columns),
table_cols=", ".join(quote_identifier(c) for c in columns),
all_null=" AND ".join(
"{} IS NULL".format(quote_identifier(c)) for c in columns
),
)
)

# Now add the new fk_column
self.add_column(magic_lookup_column, int)

# And populate it
# And populate it. The trailing ``WHERE NOT (<all columns NULL>)`` leaves
# all-NULL source rows with a NULL foreign key even when the lookup table
# already contains an all-NULL row (e.g. a reused or legacy lookup table)
# — the IS-based join would otherwise match that row and link to it (#186).
self.db.execute(
"UPDATE {} SET {} = (SELECT id FROM {} WHERE {where})".format(
quote_identifier(self.name),
quote_identifier(magic_lookup_column),
quote_identifier(table),
"UPDATE {table_name} SET {fk} = (SELECT id FROM {lookup} WHERE {where}) WHERE NOT ({all_null})".format(
table_name=quote_identifier(self.name),
fk=quote_identifier(magic_lookup_column),
lookup=quote_identifier(table),
where=" AND ".join(
"{}.{} IS {}.{}".format(
quote_identifier(self.name),
Expand All @@ -2217,6 +2228,12 @@ def extract(
)
for column in columns
),
all_null=" AND ".join(
"{}.{} IS NULL".format(
quote_identifier(self.name), quote_identifier(column)
)
for column in columns
),
)
)
# Figure out the right column order
Expand Down
67 changes: 65 additions & 2 deletions tests/test_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ def test_extract_error_on_incompatible_existing_lookup_table(fresh_db):


def test_extract_works_with_null_values(fresh_db):
# A NULL extracted value represents "no value", so it should keep a NULL
# foreign key rather than be turned into a lookup row of its own (#186).
fresh_db["listens"].insert_all(
[
{"id": 1, "track_title": "foo", "album_title": "bar"},
Expand All @@ -189,9 +191,70 @@ def test_extract_works_with_null_values(fresh_db):
)
assert list(fresh_db["listens"].rows) == [
{"id": 1, "track_title": "foo", "album_id": 1},
{"id": 2, "track_title": "baz", "album_id": 2},
{"id": 2, "track_title": "baz", "album_id": None},
]
assert list(fresh_db["albums"].rows) == [
{"id": 1, "album_title": "bar"},
{"id": 2, "album_title": None},
]


def test_extract_does_not_create_lookup_row_for_all_null(fresh_db):
# Single-column: every NULL keeps a NULL fk and no NULL lookup row is made.
fresh_db["creatures"].insert_all(
[
{"id": 1, "name": "Simon", "type": None},
{"id": 2, "name": "Natalie", "type": None},
{"id": 3, "name": "Cleo", "type": "dog"},
],
pk="id",
)
fresh_db["creatures"].extract("type")
assert list(fresh_db["creatures"].rows) == [
{"id": 1, "name": "Simon", "type_id": None},
{"id": 2, "name": "Natalie", "type_id": None},
{"id": 3, "name": "Cleo", "type_id": 1},
]
assert list(fresh_db["type"].rows) == [{"id": 1, "type": "dog"}]


def test_extract_multi_column_keeps_partial_null_but_not_all_null(fresh_db):
# Multi-column: a row whose extracted columns are *all* NULL keeps a NULL
# fk, but a partial-NULL combination is a genuine distinct value and is
# still extracted (and shared between matching rows) (#186).
fresh_db["t"].insert_all(
[
{"id": 1, "a": "x", "b": None},
{"id": 2, "a": "x", "b": None},
{"id": 3, "a": None, "b": None},
{"id": 4, "a": "y", "b": "z"},
],
pk="id",
)
fresh_db["t"].extract(["a", "b"], table="ab", fk_column="ab_id")
rows = list(fresh_db["t"].rows)
assert rows[2]["ab_id"] is None # all-NULL row -> NULL fk
assert rows[0]["ab_id"] == rows[1]["ab_id"] is not None # partial NULL shared
assert rows[3]["ab_id"] not in (None, rows[0]["ab_id"])
# The lookup table must not contain an all-NULL row.
assert not any(row["a"] is None and row["b"] is None for row in fresh_db["ab"].rows)


def test_extract_all_null_stays_null_with_preexisting_null_lookup_row(fresh_db):
# Reusing a lookup table that already contains an all-NULL row (e.g. created
# by an older sqlite-utils version) must still leave all-NULL source rows with
# a NULL foreign key — the IS-based join must not link them to that row (#186).
fresh_db["type"].insert_all(
[{"id": 1, "type": None}, {"id": 2, "type": "dog"}], pk="id"
)
fresh_db["creatures"].insert_all(
[
{"id": 1, "name": "Simon", "type": None},
{"id": 2, "name": "Cleo", "type": "dog"},
],
pk="id",
)
fresh_db["creatures"].extract("type")
assert list(fresh_db["creatures"].rows) == [
{"id": 1, "name": "Simon", "type_id": None},
{"id": 2, "name": "Cleo", "type_id": 2},
]