Skip to content

Commit e136775

Browse files
committed
feat(duckdb): Bake in spatial and excel extensions
1 parent 9dadd3d commit e136775

File tree

4 files changed

+162
-3
lines changed

4 files changed

+162
-3
lines changed

deepnote_toolkit/sql/duckdb_sql.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import sys
22

33
import duckdb
4+
from duckdb_extensions import import_extension
45
from packaging.version import Version
56

67
_DEEPNOTE_DUCKDB_CONNECTION = None
@@ -46,10 +47,18 @@ def _get_duckdb_connection():
4647
database=":memory:", read_only=False
4748
)
4849

50+
# DuckDB extensions are loaded from included wheels to prevent loading them
51+
# from the internet on every notebook start
52+
#
4953
# Install and load the spatial extension. Primary use case: reading xlsx files
5054
# e.g. SELECT * FROM st_read('excel.xlsx')
51-
_DEEPNOTE_DUCKDB_CONNECTION.execute("install spatial;")
52-
_DEEPNOTE_DUCKDB_CONNECTION.execute("load spatial;")
55+
# There is also an official excel extension; its docs mention that Excel support in the spatial extension
56+
# may be removed in the future (see: https://duckdb.org/docs/stable/core_extensions/excel)
57+
for extension_name in ["spatial", "excel"]:
58+
import_extension(
59+
name=extension_name, force_install=True, con=_DEEPNOTE_DUCKDB_CONNECTION
60+
)
61+
_DEEPNOTE_DUCKDB_CONNECTION.execute(f"LOAD '{extension_name}'")
5362

5463
_set_sample_size(_DEEPNOTE_DUCKDB_CONNECTION, _DEFAULT_DUCKDB_SAMPLE_SIZE)
5564

poetry.lock

Lines changed: 51 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ dependencies = [
5959
"duckdb>=1.1.0,<2.0.0; python_version < '3.12'",
6060
"duckdb>=1.1.0,<2.0.0; python_version >= '3.12'",
6161
"duckdb>=1.4.1,<2.0.0; python_version >= '3.13'",
62+
"duckdb-extensions>=1.1.0,<2.0.0", # bake in as dependency to not pull extensions from the internet on every notebook start
63+
"duckdb-extension-spatial>=1.1.0,<2.0.0",
64+
"duckdb-extension-excel>=1.1.0,<2.0.0",
6265
"google-cloud-bigquery-storage==2.16.2; python_version < '3.13'",
6366
"google-cloud-bigquery-storage>=2.33.1,<3; python_version>='3.13'",
6467

tests/unit/test_duckdb_sql.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import pandas as pd
2+
import pytest
3+
4+
from deepnote_toolkit.sql.duckdb_sql import (
5+
_get_duckdb_connection,
6+
_set_sample_size,
7+
_set_scan_all_frames,
8+
)
9+
10+
11+
@pytest.fixture(scope="function")
def duckdb_connection():
    """Provide each test with its own DuckDB connection.

    The module-level cached connection is cleared before the connection is
    requested and again once the test has finished; the connection handed to
    the test is closed during teardown.
    """
    import deepnote_toolkit.sql.duckdb_sql as duckdb_sql_module

    # Clear the cached connection so the getter below starts from scratch.
    duckdb_sql_module._DEEPNOTE_DUCKDB_CONNECTION = None
    fresh_connection = _get_duckdb_connection()

    try:
        yield fresh_connection
    finally:
        fresh_connection.close()
        duckdb_sql_module._DEEPNOTE_DUCKDB_CONNECTION = None
24+
25+
26+
@pytest.mark.parametrize("extension_name", ["spatial", "excel"])
def test_extension_installed_and_loadable(duckdb_connection, extension_name):
    """Check that an extension is reported as both installed and loaded.

    Args:
        duckdb_connection: Connection supplied by the ``duckdb_connection``
            fixture.
        extension_name: Name to look up in ``duckdb_extensions()``.
    """
    # Fetch both flags in a single query and bind the name as a parameter
    # instead of interpolating it into the SQL text.
    row = duckdb_connection.execute(
        "SELECT installed, loaded FROM duckdb_extensions() WHERE extension_name = ?",
        [extension_name],
    ).fetchone()

    # Guard before unpacking so a missing extension fails with a clear message
    # rather than a TypeError.
    assert (
        row is not None
    ), f"{extension_name} extension should be found in duckdb_extensions()"

    installed, loaded = row
    assert installed is True, f"{extension_name} extension should be installed"
    assert loaded is True, f"{extension_name} extension should be loaded"
41+
42+
43+
def test_connection_singleton_pattern():
    """Two consecutive calls to the getter must hand back the same object."""
    first_connection = _get_duckdb_connection()
    second_connection = _get_duckdb_connection()

    assert first_connection is second_connection, "Connection should be a singleton"
48+
49+
50+
def test_set_sample_size(duckdb_connection):
    """Setting the sample size is reflected in ``duckdb_settings()``."""
    expected_sample_size = 50000
    _set_sample_size(duckdb_connection, expected_sample_size)

    setting_query = (
        "SELECT value FROM duckdb_settings() WHERE name = 'pandas_analyze_sample'"
    )
    row = duckdb_connection.execute(setting_query).fetchone()

    # The setting value is converted to int before comparing.
    assert int(row[0]) == expected_sample_size
56+
57+
58+
def test_set_scan_all_frames(duckdb_connection):
    """Toggling scan-all-frames off and then on updates ``duckdb_settings()``."""
    setting_query = (
        "SELECT value FROM duckdb_settings() WHERE name = 'python_scan_all_frames'"
    )

    # Disable first, then enable, checking the reported value after each step.
    for enabled, expected_value in ((False, "false"), (True, "true")):
        _set_scan_all_frames(duckdb_connection, enabled)
        row = duckdb_connection.execute(setting_query).fetchone()
        assert row[0] == expected_value
70+
71+
72+
def test_excel_extension_roundtrip(duckdb_connection, tmp_path):
    """Export a frame to xlsx with COPY and read it back via both readers.

    Args:
        duckdb_connection: Connection supplied by the ``duckdb_connection``
            fixture.
        tmp_path: pytest-provided temporary directory for the xlsx file.
    """
    expected = pd.DataFrame(
        {
            "id": [1, 2, 3],
            "name": ["Alice", "Bob", "Charlie"],
            "score": [95.5, 87.3, 91.2],
        }
    )
    excel_path = tmp_path / "test_data.xlsx"

    # Expose the frame under a table name only for the duration of the export.
    duckdb_connection.register("test_table", expected)
    export_sql = f"COPY test_table TO '{excel_path}' WITH (FORMAT xlsx, HEADER true)"
    duckdb_connection.execute(export_sql)
    duckdb_connection.unregister("test_table")

    assert excel_path.exists(), "Excel file should be created"

    # Read back with the spatial extension.
    via_spatial = duckdb_connection.execute(
        f"SELECT * FROM st_read('{excel_path}')"
    ).df()
    assert expected.compare(via_spatial).empty, "Data should be the same"

    # Read back with the excel extension.
    via_excel = duckdb_connection.execute(
        f"SELECT * FROM read_xlsx('{excel_path}')"
    ).df()
    assert expected.compare(via_excel).empty, "Data should be the same"

0 commit comments

Comments
 (0)