from decimal import Decimal
from typing import Any

import numpy as np
import pandas as pd

# Largest magnitude up to which float64 represents every integer exactly.
MAX_SAFE_FLOAT64_INTEGER = 2**53


def is_large_number(x: Any) -> bool:
    """Return True if *x* is a number whose magnitude exceeds 2**53.

    float64 can represent integers exactly only up to 2**53, so any
    numeric value strictly beyond +/-2**53 is considered "large" and
    should be converted to a string by callers that need exact values.

    Args:
        x: Any object. Only ``int``, ``float``, ``Decimal``, ``np.integer``
            and ``np.floating`` instances can be reported as large.

    Returns:
        A plain ``bool``. Infinities are large; NaN (float or Decimal) and
        non-numeric values are not.
    """
    if not isinstance(x, (int, float, Decimal, np.integer, np.floating)):
        return False
    try:
        # Compare against both bounds instead of using abs(): abs() of the
        # most negative fixed-width NumPy integer (e.g. np.int64(-2**63))
        # overflows back to a negative value and would be reported as small.
        return bool(
            x > MAX_SAFE_FLOAT64_INTEGER or x < -MAX_SAFE_FLOAT64_INTEGER
        )
    except (TypeError, OverflowError, ArithmeticError):
        # Decimal NaN/sNaN raise InvalidOperation (an ArithmeticError) on
        # ordering comparisons; treat anything incomparable as "not large".
        return False


def cast_large_numbers_to_string(df: pd.DataFrame) -> pd.DataFrame:
    """Convert numeric columns holding values beyond +/-2**53 to strings.

    JavaScript's JSON.parse() reads all numbers as float64, which can only
    represent integers exactly up to 2**53. Values above that threshold lose
    precision, so the entire column is converted to strings to preserve the
    exact values.

    Args:
        df: DataFrame to sanitize. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for column in df:
        series = df[column]
        # is_pure_numeric and safe_convert_to_string are helpers defined
        # elsewhere in this module.
        if is_pure_numeric(series.dtype) and series.apply(is_large_number).any():
            df[column] = series.apply(safe_convert_to_string)
    return df
columns with large numbers to strings to preserve precision. + # float64 can only represent integers exactly up to 2**53; values + # above that threshold are converted to strings. for column in dataframe.columns: - if dataframe[column].apply(_is_large_number).any(): + if dataframe[column].apply(is_large_number).any(): dataframe[column] = dataframe[column].astype(str) diff --git a/tests/unit/helpers/testing_dataframes.py b/tests/unit/helpers/testing_dataframes.py index f3744a3..2fcc6c3 100644 --- a/tests/unit/helpers/testing_dataframes.py +++ b/tests/unit/helpers/testing_dataframes.py @@ -113,6 +113,17 @@ def create_dataframe_with_duplicate_column_names(): "col1": [2**53], } ), + "large_numbers_above_threshold": pd.DataFrame( + data={ + "col1": [2**53 + 1, 42, 2**53 + 100], + } + ), + "large_numbers_mixed_columns": pd.DataFrame( + data={ + "safe_col": [1, 2, 3], + "large_col": [2**53 + 1, 2**53 + 2, 2**53 + 3], + } + ), "infinity": pd.DataFrame( data={ "col1": [0, np.inf, -np.inf], diff --git a/tests/unit/test_dataframe_utils.py b/tests/unit/test_dataframe_utils.py index 38e8222..a7d441d 100644 --- a/tests/unit/test_dataframe_utils.py +++ b/tests/unit/test_dataframe_utils.py @@ -2,6 +2,7 @@ import unittest from unittest.mock import MagicMock +import numpy as np from ipykernel.jsonutil import json_clean from deepnote_toolkit.dataframe_utils import _describe_dataframe, add_formatters @@ -188,6 +189,34 @@ def test_large_numbers(self): self.assertEqual(result["columns"][0]["stats"]["min"], str(2**53)) self.assertEqual(result["columns"][0]["stats"]["max"], str(2**53)) + def test_large_numbers_above_threshold_are_strings_in_rows(self): + """Integers above 2**53 must be converted to strings in rows to preserve precision.""" + df = testing_dataframes["large_numbers_above_threshold"] + result = describe_and_json_clean(df) + self.assertEqual(result["row_count"], 3) + for row in result["rows"]: + self.assertIsInstance(row["col1"], str) + 
self.assertEqual(result["rows"][0]["col1"], str(2**53 + 1)) + self.assertEqual(result["rows"][1]["col1"], "42") + self.assertEqual(result["rows"][2]["col1"], str(2**53 + 100)) + + def test_large_numbers_mixed_columns_only_affects_large_column(self): + """Only columns containing values above 2**53 should be converted; safe columns stay numeric.""" + df = testing_dataframes["large_numbers_mixed_columns"] + result = describe_and_json_clean(df) + self.assertEqual(result["row_count"], 3) + for row in result["rows"]: + self.assertIsInstance(row["safe_col"], int) + self.assertIsInstance(row["large_col"], str) + self.assertEqual(result["rows"][0]["large_col"], str(2**53 + 1)) + + def test_large_numbers_at_boundary_stay_numeric(self): + """Integers exactly at 2**53 should remain as numbers (still exact in float64).""" + df = testing_dataframes["large_numbers"] + result = describe_and_json_clean(df) + for row in result["rows"]: + self.assertIsInstance(row["col1"], (int, np.integer)) + def test_infinity(self): df = testing_dataframes["infinity"] result = describe_and_json_clean(df) diff --git a/tests/unit/test_sql_execution_internal.py b/tests/unit/test_sql_execution_internal.py index fe44145..ad4ec00 100644 --- a/tests/unit/test_sql_execution_internal.py +++ b/tests/unit/test_sql_execution_internal.py @@ -218,7 +218,7 @@ def test_sanitize_dataframe_for_parquet_decimal_large_numbers(): def test_sanitize_dataframe_for_parquet_decimal_small_numbers(): - """Decimal values within int64 range should not be converted.""" + """Decimal values within float64 exact range should not be converted.""" from decimal import Decimal data = pd.DataFrame( @@ -246,19 +246,130 @@ def test_sanitize_dataframe_for_parquet_decimal_nan(): def test_is_large_number(): from decimal import Decimal - assert se._is_large_number(2**63) is True - assert se._is_large_number(-(2**63) - 1) is True - assert se._is_large_number(2**63 - 1) is False - assert se._is_large_number(42) is False - assert 
se._is_large_number(float("inf")) is True - assert se._is_large_number(float("nan")) is False - assert se._is_large_number(Decimal("1e40")) is True - assert se._is_large_number(Decimal("100")) is False - assert se._is_large_number(Decimal("NaN")) is False - assert se._is_large_number(Decimal("sNaN")) is False - assert se._is_large_number(Decimal("Infinity")) is True - assert se._is_large_number("not a number") is False - assert se._is_large_number(None) is False + from deepnote_toolkit.ocelots.pandas.utils import is_large_number + + # 2**53 boundary: float64 can represent integers exactly up to 2**53 + assert is_large_number(2**53) is False + assert is_large_number(2**53 + 1) is True + assert is_large_number(-(2**53)) is False + assert is_large_number(-(2**53) - 1) is True + + # Small integers should not trigger + assert is_large_number(0) is False + assert is_large_number(1) is False + assert is_large_number(-1) is False + assert is_large_number(42) is False + + # Large ints well beyond 2**53 should trigger + assert is_large_number(2**63 - 1) is True + assert is_large_number(2**63) is True + assert is_large_number(10**18) is True + + # Floats + assert is_large_number(float("inf")) is True + assert is_large_number(float("nan")) is False + assert is_large_number(1.0) is False + + # Decimals + assert is_large_number(Decimal("1e40")) is True + assert is_large_number(Decimal("9007199254740994")) is True + assert is_large_number(Decimal("100")) is False + assert is_large_number(Decimal("NaN")) is False + assert is_large_number(Decimal("sNaN")) is False + assert is_large_number(Decimal("Infinity")) is True + + # Non-numeric types should not trigger + assert is_large_number("not a number") is False + assert is_large_number(None) is False + + +def test_sanitize_dataframe_for_parquet_large_int_precision_loss(): + """Integers above 2**53 must be converted to strings to preserve precision.""" + val_above = 2**53 + 1 # 9007199254740993 + val_exact = 2**53 # 9007199254740992 + 
+ data = pd.DataFrame( + { + "lossy": [val_above, val_exact], + "safe": [42, 100], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["lossy"].dtype == object + assert data["lossy"].iloc[0] == str(val_above) + assert data["lossy"].iloc[1] == str(val_exact) + assert pd.api.types.is_integer_dtype(data["safe"]) + + +def test_sanitize_dataframe_for_parquet_large_int_negative(): + """Negative integers beyond -2**53 must also be converted.""" + data = pd.DataFrame( + { + "neg": [-(2**53) - 1, 0], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["neg"].dtype == object + assert data["neg"].iloc[0] == str(-(2**53) - 1) + + +def test_sanitize_dataframe_for_parquet_int_at_boundary(): + """Integers exactly at 2**53 should not be converted (still exact in float64).""" + data = pd.DataFrame( + { + "boundary": [2**53, -(2**53)], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert pd.api.types.is_integer_dtype(data["boundary"]) + + +def test_sanitize_dataframe_for_parquet_mixed_int_with_none(): + """Mixed object column with None and large int should convert to strings.""" + data = pd.DataFrame( + { + "mixed": pd.array([2**53 + 1, None, 42], dtype=object), + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["mixed"].dtype == object + assert data["mixed"].iloc[0] == str(2**53 + 1) + + +def test_sanitize_dataframe_for_parquet_decimal_int_precision_loss(): + """Integer-valued Decimals above 2**53 should be converted to strings.""" + from decimal import Decimal + + data = pd.DataFrame( + { + "d": [Decimal("9007199254740993"), Decimal("42")], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["d"].dtype == object + assert data["d"].iloc[0] == str(Decimal("9007199254740993")) + + +def test_sanitize_dataframe_for_parquet_precision_loss_preserves_value(): + """Verify the string conversion preserves the exact integer value.""" + val = 9007199254740993 + assert float(val) == float(9007199254740992) # proves precision loss in 
float64 + + data = pd.DataFrame({"x": [val]}) + se._sanitize_dataframe_for_parquet(data) + assert data["x"].iloc[0] == "9007199254740993" # exact value preserved + + +def test_sanitize_dataframe_for_parquet_very_large_int(): + """Integers far beyond 2**53 (e.g. 2**64) must also be converted.""" + data = pd.DataFrame( + { + "huge": [2**64, 42], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["huge"].dtype == object + assert data["huge"].iloc[0] == str(2**64) def test_create_sql_ssh_uri_no_ssh():