diff --git a/deepnote_toolkit/chart/deepnote_chart.py b/deepnote_toolkit/chart/deepnote_chart.py index 787fe1f..9da9b14 100644 --- a/deepnote_toolkit/chart/deepnote_chart.py +++ b/deepnote_toolkit/chart/deepnote_chart.py @@ -12,6 +12,7 @@ from deepnote_toolkit.chart.spec_utils import ( attach_config_to_vega_lite_spec, attach_selection_parameters_to_vega_lite_spec, + get_temporal_fields_from_vega_lite_spec, verify_used_fields, ) from deepnote_toolkit.chart.types import CHART_ROW_LIMIT, VEGA_5_MIME_TYPE, ChartError @@ -136,10 +137,14 @@ def __init__( if attach_selection: attach_selection_parameters_to_vega_lite_spec(spec_dict) + temporal_fields = get_temporal_fields_from_vega_lite_spec(spec_dict) + oc_df = oc.DataFrame.from_native(dataframe) filtered_df = oc_df.filter(*self.filters).prepare_for_serialization() if filtered_df.native_type == "pandas": - sanitized_pandas = sanitize_dataframe_for_chart(filtered_df.to_native()) + sanitized_pandas = sanitize_dataframe_for_chart( + filtered_df.to_native(), temporal_fields + ) oc_sanitized_df = oc.DataFrame.from_native(sanitized_pandas) elif filtered_df.native_type in ("pyspark", "polars-eager"): # We don't need to sanitize Spark DFs because they will processed by Spark itself and it can handle @@ -178,13 +183,14 @@ def __init__( row_limit=CHART_ROW_LIMIT, ) ) - except pa.ArrowNotImplementedError as err: + except (pa.ArrowNotImplementedError, ValueError) as err: error_msg = ( f"DataFrame contains data types that cannot be serialized into Arrow format for charting: {err}. " f"Common causes include:\n" f"• Mixed data types in columns (e.g., numbers and strings)\n" f"• Complex objects like dictionaries or custom classes\n" f"• Nested data structures\n" + f"• Datetime string columns that cannot be parsed\n" f"Try converting problematic columns to consistent types or removing them." ) raise ChartError(error_msg) from err diff --git a/deepnote_toolkit/chart/spec_utils.py b/deepnote_toolkit/chart/spec_utils.py index 2cb2afa..a56aa2b 100644 --- a/deepnote_toolkit/chart/spec_utils.py +++ b/deepnote_toolkit/chart/spec_utils.py @@ -329,6 +329,25 @@ def _get_used_fields_from_vega_lite_spec(vega_lite_spec): ) +def get_temporal_fields_from_vega_lite_spec(vega_lite_spec: Dict[str, Any]) -> set: + """ + Returns the set of field names the spec encodes with ``"type": "temporal"``. + + Only these fields should be coerced to datetime for charting; coercing every + ISO-parseable string column would wrongly turn nominal axes (years, months, + numeric codes) into time scales. + """ + encodings = _extract_encodings_from_vega_lite_spec_recursive(vega_lite_spec) + + return set( + _unescape_field_name(encoding["field"]) + for encoding in encodings + if encoding.get("type") == "temporal" + and "field" in encoding + and isinstance(encoding["field"], str) + ) + + def verify_used_fields(oc_df: oc.DataFrame, vega_lite_spec: Any) -> None: allowed_fields = set(oc_df.column_names) | set([COUNT_FIELD_NAME]) diff --git a/deepnote_toolkit/chart/utils.py b/deepnote_toolkit/chart/utils.py index e507bb2..4df5fb5 100644 --- a/deepnote_toolkit/chart/utils.py +++ b/deepnote_toolkit/chart/utils.py @@ -1,20 +1,61 @@ -from typing import Any, List, Optional +from typing import Any, List, Optional, Set import pandas as pd import deepnote_toolkit.ocelots as oc -def sanitize_dataframe_for_chart(pd_df: pd.DataFrame): +def sanitize_dataframe_for_chart( + pd_df: pd.DataFrame, temporal_fields: Optional[Set[str]] = None +) -> pd.DataFrame: sanitized_dataframe = pd_df.copy() oc.pandas.utils.deduplicate_columns(sanitized_dataframe) _convert_timedelta_columns_to_seconds(sanitized_dataframe) _convert_column_names_to_string(sanitized_dataframe) + _convert_datetime_string_columns(sanitized_dataframe, temporal_fields) return sanitized_dataframe +def _convert_datetime_string_columns( + pd_df: pd.DataFrame, temporal_fields: Optional[Set[str]] = None +) -> None: + """ + Converts object columns that contain ISO datetime strings to datetime64. + + VegaFusion treats datetime64 columns as temporal natively. When columns are + left as object (string), VegaFusion attempts to parse them using the axis + display format from the Vega-Lite spec (e.g. ``'%B %d, %Y %H:%M'``), which + fails for ISO 8601 strings and raises a DataFusion ValueError. + + Only columns named in ``temporal_fields`` (fields the spec encodes as + ``"type": "temporal"``) are converted. This avoids turning nominal string + axes such as years (``"2020"``), months (``"2024-01"``) or numeric codes + into time scales just because they happen to parse as ISO 8601. When + ``temporal_fields`` is ``None`` or empty, no columns are converted. + + WARNING: This function modifies the DataFrame in-place. + """ + if not temporal_fields: + return + + for col in pd_df.columns: + if col not in temporal_fields: + continue + if pd_df[col].dtype != object: + continue + non_null = pd_df[col].dropna() + if len(non_null) == 0: + continue + try: + converted = pd.to_datetime(non_null, format="ISO8601", utc=True, errors="coerce") + if converted.notna().all(): + pd_df[col] = pd.to_datetime(pd_df[col], format="ISO8601", utc=True, errors="coerce") + except Exception: + pass + + def _convert_column_names_to_string(pd_df: pd.DataFrame): """ Converts dataframe column names to strings. diff --git a/tests/unit/test_chart.py b/tests/unit/test_chart.py index 47af5ea..9d6fde9 100644 --- a/tests/unit/test_chart.py +++ b/tests/unit/test_chart.py @@ -13,10 +13,14 @@ from deepnote_toolkit.chart import ChartError, DeepnoteChart from deepnote_toolkit.chart.spec_utils import ( _get_used_fields_from_vega_lite_spec, + get_temporal_fields_from_vega_lite_spec, verify_used_fields, ) from deepnote_toolkit.chart.types import VEGA_5_MIME_TYPE -from deepnote_toolkit.chart.utils import sanitize_dataframe_for_chart +from deepnote_toolkit.chart.utils import ( + _convert_datetime_string_columns, + sanitize_dataframe_for_chart, +) from .helpers.testing_dataframes import testing_dataframes @@ -538,3 +542,149 @@ def test_multilayer_spec_v2_invalid_fields(self): } with self.assertRaises(ChartError): verify_used_fields(self.oc_df, spec) + + +class TestGetTemporalFields(unittest.TestCase): + def test_single_temporal_field(self): + spec = { + "mark": "line", + "encoding": { + "x": {"field": "timestamp", "type": "temporal"}, + "y": {"field": "value", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"timestamp"}) + + def test_no_temporal_fields(self): + spec = { + "mark": "bar", + "encoding": { + "x": {"field": "year", "type": "nominal"}, + "y": {"field": "count", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, set()) + + def test_multiple_temporal_fields(self): + spec = { + "mark": "point", + "encoding": { + "x": {"field": "start_time", "type": "temporal"}, + "x2": {"field": "end_time", "type": "temporal"}, + "y": {"field": "value", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"start_time", "end_time"}) + + def test_mixed_types_only_returns_temporal(self): + """Only 'type: temporal' encodings are returned; nominal/quantitative are excluded.""" + spec = { + "mark": "bar", + "encoding": { + "x": {"field": "date", "type": "temporal"}, + "y": {"field": "amount", "type": "quantitative"}, + "color": {"field": "category", "type": "nominal"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"date"}) + + def test_layered_spec_collects_temporal_from_all_layers(self): + spec = { + "layer": [ + { + "mark": "line", + "encoding": { + "x": {"field": "ts", "type": "temporal"}, + "y": {"field": "v1", "type": "quantitative"}, + }, + }, + { + "mark": "point", + "encoding": { + "x": {"field": "ts", "type": "temporal"}, + "y": {"field": "v2", "type": "quantitative"}, + }, + }, + ] + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"ts"}) + + def test_encoding_without_field_is_skipped(self): + """datum-based encodings have no 'field' key and must not crash.""" + spec = { + "mark": "rule", + "encoding": { + "x": {"datum": "2024-01-01", "type": "temporal"}, + "y": {"field": "value", "type": "quantitative"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, set()) + + def test_escaped_field_name_is_unescaped(self): + spec = { + "mark": "line", + "encoding": { + "x": {"field": "date\\.time", "type": "temporal"}, + }, + } + fields = get_temporal_fields_from_vega_lite_spec(spec) + self.assertSetEqual(fields, {"date.time"}) + + +class TestConvertDatetimeStringColumns(unittest.TestCase): + def test_iso_string_column_is_converted_when_in_temporal_fields(self): + df = pd.DataFrame({"ts": ["2024-04-17T23:18:06.527738", "2024-04-18T00:00:00"]}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertTrue(pd.api.types.is_datetime64_any_dtype(df["ts"])) + + def test_column_not_in_temporal_fields_is_not_converted(self): + """A column that looks like ISO dates but is nominal must stay as object.""" + df = pd.DataFrame({"year": ["2019", "2020", "2021"]}) + _convert_datetime_string_columns(df, temporal_fields=set()) + self.assertEqual(df["year"].dtype, object) + + def test_no_temporal_fields_is_a_no_op(self): + df = pd.DataFrame({"ts": ["2024-04-17T23:18:06", "2024-04-18T00:00:00"]}) + original_dtype = df["ts"].dtype + _convert_datetime_string_columns(df, temporal_fields=None) + self.assertEqual(df["ts"].dtype, original_dtype) + + def test_already_datetime_column_is_left_alone(self): + df = pd.DataFrame({"ts": pd.to_datetime(["2024-01-01", "2024-01-02"])}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertTrue(pd.api.types.is_datetime64_any_dtype(df["ts"])) + + def test_mixed_valid_invalid_strings_are_not_converted(self): + """If any value fails ISO 8601 parsing the column is left as-is.""" + df = pd.DataFrame({"ts": ["2024-04-17T23:18:06", "not-a-date"]}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertEqual(df["ts"].dtype, object) + + def test_all_null_column_is_skipped(self): + df = pd.DataFrame({"ts": pd.array([None, None], dtype=object)}) + _convert_datetime_string_columns(df, temporal_fields={"ts"}) + self.assertEqual(df["ts"].dtype, object) + + def test_sanitize_dataframe_converts_temporal_strings_end_to_end(self): + """Full sanitize_dataframe_for_chart path converts ISO strings for temporal fields.""" + df = pd.DataFrame( + { + "timestamp": ["2024-04-17T23:18:06.527738", "2024-04-18T00:00:00"], + "value": [1.0, 2.0], + } + ) + result = sanitize_dataframe_for_chart(df, temporal_fields={"timestamp"}) + self.assertTrue(pd.api.types.is_datetime64_any_dtype(result["timestamp"])) + self.assertEqual(result["value"].dtype, float) + + def test_sanitize_dataframe_nominal_year_strings_unchanged(self): + """Nominal year strings must NOT be converted even if they look like ISO dates.""" + df = pd.DataFrame({"year": ["2019", "2020", "2021"], "count": [10, 20, 30]}) + result = sanitize_dataframe_for_chart(df, temporal_fields=set()) + self.assertEqual(result["year"].dtype, object)