Skip to content

Commit 249e9d6

Browse files
authored
feat(duckdb): Bake in spatial and excel extensions (#36)
* feat(duckdb): Bake in `spatial` and `excel` extensions * fix: Use `load_extension` directly * chore: Address code review suggestions * chore: Test cases where extension loading or importing fails * fix: NumPy boolean comparison to be flake8 compliant
1 parent 9dadd3d commit 249e9d6

File tree

4 files changed

+212
-3
lines changed

4 files changed

+212
-3
lines changed

deepnote_toolkit/sql/duckdb_sql.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import sys
22

33
import duckdb
4+
from duckdb_extensions import import_extension
45
from packaging.version import Version
56

7+
from deepnote_toolkit.logging import LoggerManager
8+
69
_DEEPNOTE_DUCKDB_CONNECTION = None
710
_DEFAULT_DUCKDB_SAMPLE_SIZE = 20_000
811

@@ -40,16 +43,32 @@ def _get_duckdb_connection():
4043
duckdb.Connection: A connection to the DuckDB database.
4144
"""
4245
global _DEEPNOTE_DUCKDB_CONNECTION
46+
logger = LoggerManager().get_logger()
4347

4448
if not _DEEPNOTE_DUCKDB_CONNECTION:
4549
_DEEPNOTE_DUCKDB_CONNECTION = duckdb.connect(
4650
database=":memory:", read_only=False
4751
)
4852

53+
# DuckDB extensions are loaded from included wheels to prevent loading them
54+
# from the internet on every notebook start
55+
#
4956
# Install and load the spatial extension. Primary use case: reading xlsx files
5057
# e.g. SELECT * FROM st_read('excel.xlsx')
51-
_DEEPNOTE_DUCKDB_CONNECTION.execute("install spatial;")
52-
_DEEPNOTE_DUCKDB_CONNECTION.execute("load spatial;")
58+
# There is also an official excel extension, whose documentation notes that Excel support in the spatial extension
59+
# may be removed in the future (see: https://duckdb.org/docs/stable/core_extensions/excel)
60+
for extension_name in ["spatial", "excel"]:
61+
try:
62+
import_extension(
63+
name=extension_name,
64+
force_install=True,
65+
con=_DEEPNOTE_DUCKDB_CONNECTION,
66+
)
67+
_DEEPNOTE_DUCKDB_CONNECTION.load_extension(extension_name)
68+
except Exception as e:
69+
# Extensions are optional and the connection still works without them; users are able to load
70+
# them manually if needed (pulling them from internet in this case as fallback)
71+
logger.warning(f"Failed to load DuckDB {extension_name} extension: {e}")
5372

5473
_set_sample_size(_DEEPNOTE_DUCKDB_CONNECTION, _DEFAULT_DUCKDB_SAMPLE_SIZE)
5574

poetry.lock

Lines changed: 51 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ dependencies = [
5959
"duckdb>=1.1.0,<2.0.0; python_version < '3.12'",
6060
"duckdb>=1.1.0,<2.0.0; python_version >= '3.12'",
6161
"duckdb>=1.4.1,<2.0.0; python_version >= '3.13'",
62+
"duckdb-extensions>=1.1.0,<2.0.0", # bake in as dependency to not pull extensions from the internet on every notebook start
63+
"duckdb-extension-spatial>=1.1.0,<2.0.0",
64+
"duckdb-extension-excel>=1.1.0,<2.0.0",
6265
"google-cloud-bigquery-storage==2.16.2; python_version < '3.13'",
6366
"google-cloud-bigquery-storage>=2.33.1,<3; python_version>='3.13'",
6467

tests/unit/test_duckdb_sql.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
from contextlib import contextmanager
2+
from unittest import mock
3+
4+
import pandas as pd
5+
import pytest
6+
7+
from deepnote_toolkit.sql.duckdb_sql import (
8+
_get_duckdb_connection,
9+
_set_sample_size,
10+
_set_scan_all_frames,
11+
)
12+
13+
14+
@contextmanager
def fresh_duckdb_connection():
    """Yield a newly created DuckDB connection instead of the cached one.

    The module-level connection cache in ``deepnote_toolkit.sql.duckdb_sql``
    is cleared before ``_get_duckdb_connection`` is called. On exit the
    connection is closed and the cache is cleared again.
    """
    import deepnote_toolkit.sql.duckdb_sql as sql_module

    # Drop the cached connection so the getter has to build a new one.
    sql_module._DEEPNOTE_DUCKDB_CONNECTION = None
    connection = _get_duckdb_connection()

    try:
        yield connection
    finally:
        connection.close()
        sql_module._DEEPNOTE_DUCKDB_CONNECTION = None
26+
27+
28+
@pytest.fixture(scope="function")
def duckdb_connection():
    """Give each test its own connection obtained via ``fresh_duckdb_connection``."""
    with fresh_duckdb_connection() as connection:
        yield connection
32+
33+
34+
@pytest.mark.parametrize("extension_name", ["spatial", "excel"])
def test_extension_installed_and_loadable(duckdb_connection, extension_name):
    """Check that the extension is reported as both installed and loaded."""
    # A single parameterized query returns both flags; the extension name is
    # bound as a parameter rather than interpolated into the SQL text.
    row = duckdb_connection.execute(
        "SELECT installed, loaded FROM duckdb_extensions() WHERE extension_name = ?",
        [extension_name],
    ).fetchone()

    assert (
        row is not None
    ), f"{extension_name} extension should be found in duckdb_extensions()"

    installed, loaded = row
    assert installed is True, f"{extension_name} extension should be installed"
    assert loaded is True, f"{extension_name} extension should be loaded"
49+
50+
51+
def test_connection_singleton_pattern():
    """Check that repeated calls return the same cached connection object."""
    # Run against a fresh connection so this test neither depends on a
    # connection cached by an earlier test nor leaves an unclosed one behind
    # in the module-level global.
    with fresh_duckdb_connection() as conn1:
        conn2 = _get_duckdb_connection()

        assert conn1 is conn2, "Connection should be a singleton"
56+
57+
58+
def test_set_sample_size(duckdb_connection):
    """Check that ``_set_sample_size`` is reflected in ``pandas_analyze_sample``."""
    _set_sample_size(duckdb_connection, 50000)

    setting_query = (
        "SELECT value FROM duckdb_settings() WHERE name = 'pandas_analyze_sample'"
    )
    row = duckdb_connection.execute(setting_query).fetchone()

    # The setting value is converted with int() before comparing.
    assert int(row[0]) == 50000
64+
65+
66+
def test_set_scan_all_frames(duckdb_connection):
    """Check that ``_set_scan_all_frames`` toggles ``python_scan_all_frames``."""
    setting_query = (
        "SELECT value FROM duckdb_settings() WHERE name = 'python_scan_all_frames'"
    )

    # Disable first, then enable, checking the reported value after each call.
    for enabled, expected_value in ((False, "false"), (True, "true")):
        _set_scan_all_frames(duckdb_connection, enabled)
        row = duckdb_connection.execute(setting_query).fetchone()
        assert row[0] == expected_value
78+
79+
80+
@mock.patch("deepnote_toolkit.sql.duckdb_sql.import_extension")
def test_connection_returns_successfully_when_import_extension_fails(
    mock_import_extension,
):
    """Check that a failing ``import_extension`` still yields a usable connection."""
    mock_import_extension.side_effect = Exception("Failed to import extension")

    with fresh_duckdb_connection() as conn:
        assert conn is not None
        result = conn.execute(
            "SELECT extension_name, loaded FROM duckdb_extensions()"
        ).df()
        assert result is not None
        # Neither spatial nor excel may be loaded, because import_extension failed.
        # `all(...) is False` would also pass when only one of them is unloaded
        # and would fail on an empty selection, so assert that none is loaded.
        result = result[result["extension_name"].isin(["spatial", "excel"])]
        assert not any(result["loaded"])
95+
96+
97+
@mock.patch("duckdb.DuckDBPyConnection.load_extension")
def test_connection_returns_successfully_when_load_extension_fails(mock_load_extension):
    """Check that a failing ``load_extension`` still yields a usable connection."""
    mock_load_extension.side_effect = Exception("Failed to load extension")

    with fresh_duckdb_connection() as conn:
        assert conn is not None
        result = conn.execute(
            "SELECT extension_name, loaded FROM duckdb_extensions()"
        ).df()
        assert result is not None
        # Neither spatial nor excel may be loaded, because load_extension failed.
        # `all(...) is False` would also pass when only one of them is unloaded
        # and would fail on an empty selection, so assert that none is loaded.
        result = result[result["extension_name"].isin(["spatial", "excel"])]
        assert not any(result["loaded"])
110+
111+
112+
def test_excel_extension_roundtrip(duckdb_connection, tmp_path):
    """Write a DataFrame to xlsx with COPY and read it back with both readers."""
    expected = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "name": ["Alice", "Bob", "Charlie"],
            "score": [95.5, 87.3, 91.2],
        }
    )
    excel_path = tmp_path / "test_data.xlsx"

    # Expose the frame as a table only for the duration of the export.
    duckdb_connection.register("test_table", expected)
    export_sql = f"COPY test_table TO '{excel_path}' WITH (FORMAT xlsx, HEADER true)"
    duckdb_connection.execute(export_sql)
    duckdb_connection.unregister("test_table")

    assert excel_path.exists(), "Excel file should be created"

    read_queries = (
        f"SELECT * FROM st_read('{excel_path}')",  # read with spatial extension
        f"SELECT * FROM read_xlsx('{excel_path}')",  # read with excel extension
    )
    for read_query in read_queries:
        actual = duckdb_connection.execute(read_query).df()
        differences = expected.compare(actual)
        assert differences.empty, "Data should be the same"

0 commit comments

Comments
 (0)