Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions deepnote_toolkit/chart/deepnote_chart.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from deepnote_toolkit.chart.spec_utils import (
attach_config_to_vega_lite_spec,
attach_selection_parameters_to_vega_lite_spec,
get_temporal_fields_from_vega_lite_spec,
verify_used_fields,
)
from deepnote_toolkit.chart.types import CHART_ROW_LIMIT, VEGA_5_MIME_TYPE, ChartError
Expand Down Expand Up @@ -136,10 +137,14 @@ def __init__(
if attach_selection:
attach_selection_parameters_to_vega_lite_spec(spec_dict)

temporal_fields = get_temporal_fields_from_vega_lite_spec(spec_dict)

oc_df = oc.DataFrame.from_native(dataframe)
filtered_df = oc_df.filter(*self.filters).prepare_for_serialization()
if filtered_df.native_type == "pandas":
sanitized_pandas = sanitize_dataframe_for_chart(filtered_df.to_native())
sanitized_pandas = sanitize_dataframe_for_chart(
filtered_df.to_native(), temporal_fields
)
oc_sanitized_df = oc.DataFrame.from_native(sanitized_pandas)
elif filtered_df.native_type in ("pyspark", "polars-eager"):
# We don't need to sanitize Spark DFs because they will processed by Spark itself and it can handle
Expand Down Expand Up @@ -178,13 +183,14 @@ def __init__(
row_limit=CHART_ROW_LIMIT,
)
)
except pa.ArrowNotImplementedError as err:
except (pa.ArrowNotImplementedError, ValueError) as err:
error_msg = (
f"DataFrame contains data types that cannot be serialized into Arrow format for charting: {err}. "
f"Common causes include:\n"
f"• Mixed data types in columns (e.g., numbers and strings)\n"
f"• Complex objects like dictionaries or custom classes\n"
f"• Nested data structures\n"
f"• Datetime string columns that cannot be parsed\n"
f"Try converting problematic columns to consistent types or removing them."
)
raise ChartError(error_msg) from err
Expand Down
19 changes: 19 additions & 0 deletions deepnote_toolkit/chart/spec_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,25 @@ def _get_used_fields_from_vega_lite_spec(vega_lite_spec):
)


def get_temporal_fields_from_vega_lite_spec(vega_lite_spec: Dict[str, Any]) -> set:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win

Use explicit type hint for return value.

Return type should be Set[str] instead of set per coding guidelines.

📝 Proposed fix
-def get_temporal_fields_from_vega_lite_spec(vega_lite_spec: Dict[str, Any]) -> set:
+def get_temporal_fields_from_vega_lite_spec(vega_lite_spec: Dict[str, Any]) -> Set[str]:

Add Set to imports at top of file:

+from typing import Set
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deepnote_toolkit/chart/spec_utils.py` at line 332, The function
get_temporal_fields_from_vega_lite_spec has a generic return type hint of set
instead of the more explicit Set[str]. Import Set from the typing module at the
top of the file, then update the return type annotation of
get_temporal_fields_from_vega_lite_spec to use Set[str] instead of set to follow
the coding guidelines for explicit type hints.

Source: Coding guidelines

"""
Returns the set of field names the spec encodes with ``"type": "temporal"``.

Only these fields should be coerced to datetime for charting; coercing every
ISO-parseable string column would wrongly turn nominal axes (years, months,
numeric codes) into time scales.
"""
encodings = _extract_encodings_from_vega_lite_spec_recursive(vega_lite_spec)

return set(
_unescape_field_name(encoding["field"])
for encoding in encodings
if encoding.get("type") == "temporal"
and "field" in encoding
and isinstance(encoding["field"], str)
)


def verify_used_fields(oc_df: oc.DataFrame, vega_lite_spec: Any) -> None:
allowed_fields = set(oc_df.column_names) | set([COUNT_FIELD_NAME])

Expand Down
45 changes: 43 additions & 2 deletions deepnote_toolkit/chart/utils.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,61 @@
from typing import Any, List, Optional
from typing import Any, List, Optional, Set

import pandas as pd

import deepnote_toolkit.ocelots as oc


def sanitize_dataframe_for_chart(pd_df: pd.DataFrame):
def sanitize_dataframe_for_chart(
pd_df: pd.DataFrame, temporal_fields: Optional[Set[str]] = None
) -> pd.DataFrame:
sanitized_dataframe = pd_df.copy()

oc.pandas.utils.deduplicate_columns(sanitized_dataframe)
_convert_timedelta_columns_to_seconds(sanitized_dataframe)
_convert_column_names_to_string(sanitized_dataframe)
_convert_datetime_string_columns(sanitized_dataframe, temporal_fields)

return sanitized_dataframe


def _convert_datetime_string_columns(
pd_df: pd.DataFrame, temporal_fields: Optional[Set[str]] = None
) -> None:
"""
Converts object columns that contain ISO datetime strings to datetime64.

VegaFusion treats datetime64 columns as temporal natively. When columns are
left as object (string), VegaFusion attempts to parse them using the axis
display format from the Vega-Lite spec (e.g. ``'%B %d, %Y %H:%M'``), which
fails for ISO 8601 strings and raises a DataFusion ValueError.

Only columns named in ``temporal_fields`` (fields the spec encodes as
``"type": "temporal"``) are converted. This avoids turning nominal string
axes such as years (``"2020"``), months (``"2024-01"``) or numeric codes
into time scales just because they happen to parse as ISO 8601. When
``temporal_fields`` is ``None`` or empty, no columns are converted.

WARNING: This function modifies the DataFrame in-place.
"""
if not temporal_fields:
return

for col in pd_df.columns:
if col not in temporal_fields:
continue
if pd_df[col].dtype != object:
continue
non_null = pd_df[col].dropna()
if len(non_null) == 0:
continue
try:
converted = pd.to_datetime(non_null, format="ISO8601", utc=True, errors="coerce")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check if minimum pandas version is specified and compatible with ISO8601 format

echo "Checking pandas version constraints..."

# Check pyproject.toml or setup.py for pandas version requirements
fd -t f -e toml -e py -e txt | rg -l "pandas" | head -5 | while read -r file; do
    echo "=== $file ==="
    rg -C2 "pandas" "$file"
done

# Check if there's a minimum pandas version that guarantees format="ISO8601" support (2.0.0+)
echo -e "\n=== Searching for pandas version constraints ==="
rg -n "pandas\s*[><=]" --type toml --type py --type txt

Repository: deepnote/deepnote-toolkit

Length of output: 1602


🏁 Script executed:

cd deepnote_toolkit/chart && wc -l utils.py

Repository: deepnote/deepnote-toolkit

Length of output: 82


🏁 Script executed:

cd deepnote_toolkit/chart && sed -n '45,60p' utils.py

Repository: deepnote/deepnote-toolkit

Length of output: 600


🏁 Script executed:

cd deepnote_toolkit/chart && sed -n '35,75p' utils.py

Repository: deepnote/deepnote-toolkit

Length of output: 1332


🏁 Script executed:

rg -n "pandas|version|ISO8601" deepnote_toolkit/chart/utils.py

Repository: deepnote/deepnote-toolkit

Length of output: 356


Handle pandas < 2.0.0 incompatibility with format="ISO8601".

The format="ISO8601" parameter (lines 52, 54) requires pandas 2.0.0+, but the dependencies allow pandas ≥1.2.5 for Python 3.9–3.11. On older pandas versions, this raises an exception that the broad except: pass silently swallows, preventing temporal field conversion. Add version-specific logic or raise the minimum pandas version requirement.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@deepnote_toolkit/chart/utils.py` at line 52, The pd.to_datetime call with
format="ISO8601" is incompatible with pandas versions below 2.0.0, but the
project supports pandas >= 1.2.5, causing silent failures when the exception is
caught by the broad except: pass block. Either add version-specific logic to
check the pandas version and conditionally use format="ISO8601" only for pandas
2.0.0+, removing the parameter for older versions, or update the project's
minimum pandas version requirement to 2.0.0 or higher. This applies to both
instances of this parameter in the converted and converted_utc variable
assignments.

if converted.notna().all():
pd_df[col] = pd.to_datetime(pd_df[col], format="ISO8601", utc=True, errors="coerce")
except Exception:
pass


def _convert_column_names_to_string(pd_df: pd.DataFrame):
"""
Converts dataframe column names to strings.
Expand Down
152 changes: 151 additions & 1 deletion tests/unit/test_chart.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,14 @@
from deepnote_toolkit.chart import ChartError, DeepnoteChart
from deepnote_toolkit.chart.spec_utils import (
_get_used_fields_from_vega_lite_spec,
get_temporal_fields_from_vega_lite_spec,
verify_used_fields,
)
from deepnote_toolkit.chart.types import VEGA_5_MIME_TYPE
from deepnote_toolkit.chart.utils import sanitize_dataframe_for_chart
from deepnote_toolkit.chart.utils import (
_convert_datetime_string_columns,
sanitize_dataframe_for_chart,
)

from .helpers.testing_dataframes import testing_dataframes

Expand Down Expand Up @@ -538,3 +542,149 @@ def test_multilayer_spec_v2_invalid_fields(self):
}
with self.assertRaises(ChartError):
verify_used_fields(self.oc_df, spec)


class TestGetTemporalFields(unittest.TestCase):
def test_single_temporal_field(self):
spec = {
"mark": "line",
"encoding": {
"x": {"field": "timestamp", "type": "temporal"},
"y": {"field": "value", "type": "quantitative"},
},
}
fields = get_temporal_fields_from_vega_lite_spec(spec)
self.assertSetEqual(fields, {"timestamp"})

def test_no_temporal_fields(self):
spec = {
"mark": "bar",
"encoding": {
"x": {"field": "year", "type": "nominal"},
"y": {"field": "count", "type": "quantitative"},
},
}
fields = get_temporal_fields_from_vega_lite_spec(spec)
self.assertSetEqual(fields, set())

def test_multiple_temporal_fields(self):
spec = {
"mark": "point",
"encoding": {
"x": {"field": "start_time", "type": "temporal"},
"x2": {"field": "end_time", "type": "temporal"},
"y": {"field": "value", "type": "quantitative"},
},
}
fields = get_temporal_fields_from_vega_lite_spec(spec)
self.assertSetEqual(fields, {"start_time", "end_time"})

def test_mixed_types_only_returns_temporal(self):
"""Only 'type: temporal' encodings are returned; nominal/quantitative are excluded."""
spec = {
"mark": "bar",
"encoding": {
"x": {"field": "date", "type": "temporal"},
"y": {"field": "amount", "type": "quantitative"},
"color": {"field": "category", "type": "nominal"},
},
}
fields = get_temporal_fields_from_vega_lite_spec(spec)
self.assertSetEqual(fields, {"date"})

def test_layered_spec_collects_temporal_from_all_layers(self):
spec = {
"layer": [
{
"mark": "line",
"encoding": {
"x": {"field": "ts", "type": "temporal"},
"y": {"field": "v1", "type": "quantitative"},
},
},
{
"mark": "point",
"encoding": {
"x": {"field": "ts", "type": "temporal"},
"y": {"field": "v2", "type": "quantitative"},
},
},
]
}
fields = get_temporal_fields_from_vega_lite_spec(spec)
self.assertSetEqual(fields, {"ts"})

def test_encoding_without_field_is_skipped(self):
"""datum-based encodings have no 'field' key and must not crash."""
spec = {
"mark": "rule",
"encoding": {
"x": {"datum": "2024-01-01", "type": "temporal"},
"y": {"field": "value", "type": "quantitative"},
},
}
fields = get_temporal_fields_from_vega_lite_spec(spec)
self.assertSetEqual(fields, set())

def test_escaped_field_name_is_unescaped(self):
spec = {
"mark": "line",
"encoding": {
"x": {"field": "date\\.time", "type": "temporal"},
},
}
fields = get_temporal_fields_from_vega_lite_spec(spec)
self.assertSetEqual(fields, {"date.time"})


class TestConvertDatetimeStringColumns(unittest.TestCase):
def test_iso_string_column_is_converted_when_in_temporal_fields(self):
df = pd.DataFrame({"ts": ["2024-04-17T23:18:06.527738", "2024-04-18T00:00:00"]})
_convert_datetime_string_columns(df, temporal_fields={"ts"})
self.assertTrue(pd.api.types.is_datetime64_any_dtype(df["ts"]))

def test_column_not_in_temporal_fields_is_not_converted(self):
"""A column that looks like ISO dates but is nominal must stay as object."""
df = pd.DataFrame({"year": ["2019", "2020", "2021"]})
_convert_datetime_string_columns(df, temporal_fields=set())
self.assertEqual(df["year"].dtype, object)

def test_no_temporal_fields_is_a_no_op(self):
df = pd.DataFrame({"ts": ["2024-04-17T23:18:06", "2024-04-18T00:00:00"]})
original_dtype = df["ts"].dtype
_convert_datetime_string_columns(df, temporal_fields=None)
self.assertEqual(df["ts"].dtype, original_dtype)

def test_already_datetime_column_is_left_alone(self):
df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", "2024-01-02"])})
_convert_datetime_string_columns(df, temporal_fields={"ts"})
self.assertTrue(pd.api.types.is_datetime64_any_dtype(df["ts"]))

def test_mixed_valid_invalid_strings_are_not_converted(self):
"""If any value fails ISO 8601 parsing the column is left as-is."""
df = pd.DataFrame({"ts": ["2024-04-17T23:18:06", "not-a-date"]})
_convert_datetime_string_columns(df, temporal_fields={"ts"})
self.assertEqual(df["ts"].dtype, object)

def test_all_null_column_is_skipped(self):
df = pd.DataFrame({"ts": pd.array([None, None], dtype=object)})
_convert_datetime_string_columns(df, temporal_fields={"ts"})
self.assertEqual(df["ts"].dtype, object)

def test_sanitize_dataframe_converts_temporal_strings_end_to_end(self):
"""Full sanitize_dataframe_for_chart path converts ISO strings for temporal fields."""
df = pd.DataFrame(
{
"timestamp": ["2024-04-17T23:18:06.527738", "2024-04-18T00:00:00"],
"value": [1.0, 2.0],
}
)
result = sanitize_dataframe_for_chart(df, temporal_fields={"timestamp"})
self.assertTrue(pd.api.types.is_datetime64_any_dtype(result["timestamp"]))
self.assertEqual(result["value"].dtype, float)

def test_sanitize_dataframe_nominal_year_strings_unchanged(self):
"""Nominal year strings must NOT be converted even if they look like ISO dates."""
df = pd.DataFrame({"year": ["2019", "2020", "2021"], "count": [10, 20, 30]})
result = sanitize_dataframe_for_chart(df, temporal_fields=set())
self.assertEqual(result["year"].dtype, object)
Loading