From de0dbab5df14446c1e540c4524823f48b96192ad Mon Sep 17 00:00:00 2001 From: Marko P Date: Mon, 22 Jun 2026 17:11:47 +0200 Subject: [PATCH] fix: convert ISO datetime string columns before VegaFusion pre-transform Charts exported from Deepnote cloud store temporal columns as object (ISO 8601 string) dtype. VegaFusion/DataFusion then tries to parse those strings using the Vega-Lite axis display format (e.g. '%B %d, %Y %H:%M'), which raises a ValueError and crashes the chart. Fix by converting only the columns that the Vega-Lite spec declares as "type": "temporal" to datetime64 before the VegaFusion pre-transform. Scoping the conversion to temporal fields avoids falsely converting nominal string axes (year strings, month codes, numeric IDs). Also broadens the VegaFusion except clause to catch ValueError alongside the existing ArrowNotImplementedError so any remaining edge cases surface a friendly ChartError instead of a raw traceback. --- deepnote_toolkit/chart/deepnote_chart.py | 10 +- deepnote_toolkit/chart/spec_utils.py | 19 +++ deepnote_toolkit/chart/utils.py | 45 ++++++- tests/unit/test_chart.py | 152 ++++++++++++++++++++++- 4 files changed, 221 insertions(+), 5 deletions(-) diff --git a/deepnote_toolkit/chart/deepnote_chart.py b/deepnote_toolkit/chart/deepnote_chart.py index 787fe1fc..9da9b144 100644 --- a/deepnote_toolkit/chart/deepnote_chart.py +++ b/deepnote_toolkit/chart/deepnote_chart.py @@ -12,6 +12,7 @@ from deepnote_toolkit.chart.spec_utils import ( attach_config_to_vega_lite_spec, attach_selection_parameters_to_vega_lite_spec, + get_temporal_fields_from_vega_lite_spec, verify_used_fields, ) from deepnote_toolkit.chart.types import CHART_ROW_LIMIT, VEGA_5_MIME_TYPE, ChartError @@ -136,10 +137,14 @@ def __init__( if attach_selection: attach_selection_parameters_to_vega_lite_spec(spec_dict) + temporal_fields = get_temporal_fields_from_vega_lite_spec(spec_dict) + oc_df = oc.DataFrame.from_native(dataframe) filtered_df = oc_df.filter(*self.filters).prepare_for_serialization() if filtered_df.native_type == "pandas": - sanitized_pandas = sanitize_dataframe_for_chart(filtered_df.to_native()) + sanitized_pandas = sanitize_dataframe_for_chart( + filtered_df.to_native(), temporal_fields + ) oc_sanitized_df = oc.DataFrame.from_native(sanitized_pandas) elif filtered_df.native_type in ("pyspark", "polars-eager"): # We don't need to sanitize Spark DFs because they will processed by Spark itself and it can handle @@ -178,13 +183,14 @@ def __init__( row_limit=CHART_ROW_LIMIT, ) ) - except pa.ArrowNotImplementedError as err: + except (pa.ArrowNotImplementedError, ValueError) as err: error_msg = ( f"DataFrame contains data types that cannot be serialized into Arrow format for charting: {err}. " f"Common causes include:\n" f"• Mixed data types in columns (e.g., numbers and strings)\n" f"• Complex objects like dictionaries or custom classes\n" f"• Nested data structures\n" + f"• Datetime string columns that cannot be parsed\n" f"Try converting problematic columns to consistent types or removing them." ) raise ChartError(error_msg) from err diff --git a/deepnote_toolkit/chart/spec_utils.py b/deepnote_toolkit/chart/spec_utils.py index 2cb2afac..a56aa2b5 100644 --- a/deepnote_toolkit/chart/spec_utils.py +++ b/deepnote_toolkit/chart/spec_utils.py @@ -329,6 +329,25 @@ def _get_used_fields_from_vega_lite_spec(vega_lite_spec): ) +def get_temporal_fields_from_vega_lite_spec(vega_lite_spec: Dict[str, Any]) -> set: + """ + Returns the set of field names the spec encodes with ``"type": "temporal"``. + + Only these fields should be coerced to datetime for charting; coercing every + ISO-parseable string column would wrongly turn nominal axes (years, months, + numeric codes) into time scales. + """ + encodings = _extract_encodings_from_vega_lite_spec_recursive(vega_lite_spec) + + return set( + _unescape_field_name(encoding["field"]) + for encoding in encodings + if encoding.get("type") == "temporal" + and "field" in encoding + and isinstance(encoding["field"], str) + ) + + def verify_used_fields(oc_df: oc.DataFrame, vega_lite_spec: Any) -> None: allowed_fields = set(oc_df.column_names) | set([COUNT_FIELD_NAME]) diff --git a/deepnote_toolkit/chart/utils.py b/deepnote_toolkit/chart/utils.py index e507bb25..4df5fb56 100644 --- a/deepnote_toolkit/chart/utils.py +++ b/deepnote_toolkit/chart/utils.py @@ -1,20 +1,61 @@ -from typing import Any, List, Optional +from typing import Any, List, Optional, Set import pandas as pd import deepnote_toolkit.ocelots as oc -def sanitize_dataframe_for_chart(pd_df: pd.DataFrame): +def sanitize_dataframe_for_chart( + pd_df: pd.DataFrame, temporal_fields: Optional[Set[str]] = None +) -> pd.DataFrame: sanitized_dataframe = pd_df.copy() oc.pandas.utils.deduplicate_columns(sanitized_dataframe) _convert_timedelta_columns_to_seconds(sanitized_dataframe) _convert_column_names_to_string(sanitized_dataframe) + _convert_datetime_string_columns(sanitized_dataframe, temporal_fields) return sanitized_dataframe +def _convert_datetime_string_columns( + pd_df: pd.DataFrame, temporal_fields: Optional[Set[str]] = None +) -> None: + """ + Converts object columns that contain ISO datetime strings to datetime64. + + VegaFusion treats datetime64 columns as temporal natively. When columns are + left as object (string), VegaFusion attempts to parse them using the axis + display format from the Vega-Lite spec (e.g. ``'%B %d, %Y %H:%M'``), which + fails for ISO 8601 strings and raises a DataFusion ValueError. + + Only columns named in ``temporal_fields`` (fields the spec encodes as + ``"type": "temporal"``) are converted. This avoids turning nominal string + axes such as years (``"2020"``), months (``"2024-01"``) or numeric codes + into time scales just because they happen to parse as ISO 8601. When + ``temporal_fields`` is ``None`` or empty, no columns are converted. + + WARNING: This function modifies the DataFrame in-place. + """ + if not temporal_fields: + return + + for col in pd_df.columns: + if col not in temporal_fields: + continue + if pd_df[col].dtype != object: + continue + non_null = pd_df[col].dropna() + if len(non_null) == 0: + continue + try: + converted = pd.to_datetime(non_null, format="ISO8601", utc=True, errors="coerce") + if converted.notna().all(): + pd_df[col] = pd.to_datetime(pd_df[col], format="ISO8601", utc=True, errors="coerce") + except Exception: + pass + + def _convert_column_names_to_string(pd_df: pd.DataFrame): """ Converts dataframe column names to strings. diff --git a/tests/unit/test_chart.py b/tests/unit/test_chart.py index 47af5ea1..9d6fde93 100644 --- a/tests/unit/test_chart.py +++ b/tests/unit/test_chart.py @@ -13,10 +13,14 @@ from deepnote_toolkit.chart import ChartError, DeepnoteChart from deepnote_toolkit.chart.spec_utils import ( _get_used_fields_from_vega_lite_spec, + get_temporal_fields_from_vega_lite_spec, verify_used_fields, ) from deepnote_toolkit.chart.types import VEGA_5_MIME_TYPE -from deepnote_toolkit.chart.utils import sanitize_dataframe_for_chart +from deepnote_toolkit.chart.utils import ( + _convert_datetime_string_columns, + sanitize_dataframe_for_chart, +) from .helpers.testing_dataframes import testing_dataframes @@ -538,3 +542,149 @@ def test_multilayer_spec_v2_invalid_fields(self): } with self.assertRaises(ChartError): verify_used_fields(self.oc_df, spec) + + +class TestGetTemporalFields(unittest.TestCase): + def test_single_temporal_field(self): + spec = { + "mark": "line", + "encoding": { + "x": {"field": "timestamp", "type": "temporal"}, + "y": {"field": "value", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"timestamp"}) + + def test_no_temporal_fields(self): + spec = { + "mark": "bar", + "encoding": { + "x": {"field": "year", "type": "nominal"}, + "y": {"field": "count", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, set()) + + def test_multiple_temporal_fields(self): + spec = { + "mark": "point", + "encoding": { + "x": {"field": "start_time", "type": "temporal"}, + "x2": {"field": "end_time", "type": "temporal"}, + "y": {"field": "value", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"start_time", "end_time"}) + + def test_mixed_types_only_returns_temporal(self): + """Only 'type: temporal' encodings are returned; nominal/quantitative are excluded.""" + spec = { + "mark": "bar", + "encoding": { + "x": {"field": "date", "type": "temporal"}, + "y": {"field": "amount", "type": "quantitative"}, + "color": {"field": "category", "type": "nominal"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"date"}) + + def test_layered_spec_collects_temporal_from_all_layers(self): + spec = { + "layer": [ + { + "mark": "line", + "encoding": { + "x": {"field": "ts", "type": "temporal"}, + "y": {"field": "v1", "type": "quantitative"}, + }, + }, + { + "mark": "point", + "encoding": { + "x": {"field": "ts", "type": "temporal"}, + "y": {"field": "v2", "type": "quantitative"}, + }, + }, + ] + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"ts"}) + + def test_encoding_without_field_is_skipped(self): + """datum-based encodings have no 'field' key and must not crash.""" + spec = { + "mark": "rule", + "encoding": { + "x": {"datum": "2024-01-01", "type": "temporal"}, + "y": {"field": "value", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, set()) + + def test_escaped_field_name_is_unescaped(self): + spec = { + "mark": "line", + "encoding": { + "x": {"field": "date\\.time", "type": "temporal"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"date.time"}) + + +class TestConvertDatetimeStringColumns(unittest.TestCase): + def test_iso_string_column_is_converted_when_in_temporal_fields(self): + df = pd.DataFrame({"ts": ["2024-04-17T23:18:06.527738", "2024-04-18T00:00:00"]}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertTrue(pd.api.types.is_datetime64_any_dtype(df["ts"])) + + def test_column_not_in_temporal_fields_is_not_converted(self): + """A column that looks like ISO dates but is nominal must stay as object.""" + df = pd.DataFrame({"year": ["2019", "2020", "2021"]}) + _convert_datetime_string_columns(df, temporal_fields=set()) + self.assertEqual(df["year"].dtype, object) + + def test_no_temporal_fields_is_a_no_op(self): + df = pd.DataFrame({"ts": ["2024-04-17T23:18:06", "2024-04-18T00:00:00"]}) + original_dtype = df["ts"].dtype + _convert_datetime_string_columns(df, temporal_fields=None) + self.assertEqual(df["ts"].dtype, original_dtype) + + def test_already_datetime_column_is_left_alone(self): + df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", "2024-01-02"])}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertTrue(pd.api.types.is_datetime64_any_dtype(df["ts"])) + + def test_mixed_valid_invalid_strings_are_not_converted(self): + """If any value fails ISO 8601 parsing the column is left as-is.""" + df = pd.DataFrame({"ts": ["2024-04-17T23:18:06", "not-a-date"]}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertEqual(df["ts"].dtype, object) + + def test_all_null_column_is_skipped(self): + df = pd.DataFrame({"ts": pd.array([None, None], dtype=object)}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertEqual(df["ts"].dtype, object) + + def test_sanitize_dataframe_converts_temporal_strings_end_to_end(self): + """Full sanitize_dataframe_for_chart path converts ISO strings for temporal fields.""" + df = pd.DataFrame( + { + "timestamp": ["2024-04-17T23:18:06.527738", "2024-04-18T00:00:00"], + "value": [1.0, 2.0], + } + ) + result = sanitize_dataframe_for_chart(df, temporal_fields={"timestamp"}) + self.assertTrue(pd.api.types.is_datetime64_any_dtype(result["timestamp"])) + self.assertEqual(result["value"].dtype, float) + + def test_sanitize_dataframe_nominal_year_strings_unchanged(self): + """Nominal year strings must NOT be converted even if they look like ISO dates.""" + df = pd.DataFrame({"year": ["2019", "2020", "2021"], "count": [10, 20, 30]}) + result = sanitize_dataframe_for_chart(df, temporal_fields=set()) + self.assertEqual(result["year"].dtype, object)