From b12a99b53345408b728de13b8f185b9692d878dd Mon Sep 17 00:00:00 2001 From: tomas Date: Tue, 10 Mar 2026 16:07:17 +0000 Subject: [PATCH 1/6] fix(sql_execution): Fix is large number check to use 2**53 as cutoff --- deepnote_toolkit/sql/sql_execution.py | 13 ++- tests/unit/test_sql_execution_internal.py | 117 +++++++++++++++++++++- 2 files changed, 123 insertions(+), 7 deletions(-) diff --git a/deepnote_toolkit/sql/sql_execution.py b/deepnote_toolkit/sql/sql_execution.py index be52aa1..e05fb20 100644 --- a/deepnote_toolkit/sql/sql_execution.py +++ b/deepnote_toolkit/sql/sql_execution.py @@ -688,9 +688,14 @@ class BigQueryCredentialsError(Exception): def _is_large_number(x: Any) -> bool: - """Return True if *x* is a numeric value that exceeds the int64 range""" + """Return True if *x* is a numeric value that would lose precision as float64. + + float64 can represent integers exactly only up to 2**53, so any + int, float, or Decimal whose absolute value exceeds that threshold + is considered "large" and will be converted to a string. + """ try: - return isinstance(x, (int, float, Decimal)) and abs(x) > 2**63 - 1 + return isinstance(x, (int, float, Decimal)) and abs(x) > 2**53 except (TypeError, OverflowError, ArithmeticError): return False @@ -714,7 +719,9 @@ def _sanitize_dataframe_for_parquet(dataframe): ): dataframe[column] = dataframe[column].astype(str) - # Convert columns with large numbers to strings + # Convert columns with large numbers to strings to preserve precision. + # float64 can only represent integers exactly up to 2**53; values + # above that threshold are converted to strings. 
for column in dataframe.columns: if dataframe[column].apply(_is_large_number).any(): dataframe[column] = dataframe[column].astype(str) diff --git a/tests/unit/test_sql_execution_internal.py b/tests/unit/test_sql_execution_internal.py index fe44145..52d25c3 100644 --- a/tests/unit/test_sql_execution_internal.py +++ b/tests/unit/test_sql_execution_internal.py @@ -218,7 +218,7 @@ def test_sanitize_dataframe_for_parquet_decimal_large_numbers(): def test_sanitize_dataframe_for_parquet_decimal_small_numbers(): - """Decimal values within int64 range should not be converted.""" + """Decimal values within float64 exact range should not be converted.""" from decimal import Decimal data = pd.DataFrame( @@ -246,21 +246,130 @@ def test_sanitize_dataframe_for_parquet_decimal_nan(): def test_is_large_number(): from decimal import Decimal - assert se._is_large_number(2**63) is True - assert se._is_large_number(-(2**63) - 1) is True - assert se._is_large_number(2**63 - 1) is False + # 2**53 boundary: float64 can represent integers exactly up to 2**53 + assert se._is_large_number(2**53) is False + assert se._is_large_number(2**53 + 1) is True + assert se._is_large_number(-(2**53)) is False + assert se._is_large_number(-(2**53) - 1) is True + + # Small integers should not trigger + assert se._is_large_number(0) is False + assert se._is_large_number(1) is False + assert se._is_large_number(-1) is False assert se._is_large_number(42) is False + + # Large ints well beyond 2**53 should trigger + assert se._is_large_number(2**63 - 1) is True + assert se._is_large_number(2**63) is True + assert se._is_large_number(10**18) is True + + # Floats assert se._is_large_number(float("inf")) is True assert se._is_large_number(float("nan")) is False + assert se._is_large_number(1.0) is False + + # Decimals assert se._is_large_number(Decimal("1e40")) is True + assert se._is_large_number(Decimal("9007199254740994")) is True assert se._is_large_number(Decimal("100")) is False assert 
se._is_large_number(Decimal("NaN")) is False assert se._is_large_number(Decimal("sNaN")) is False assert se._is_large_number(Decimal("Infinity")) is True + + # Non-numeric types should not trigger assert se._is_large_number("not a number") is False assert se._is_large_number(None) is False +def test_sanitize_dataframe_for_parquet_large_int_precision_loss(): + """Integers above 2**53 must be converted to strings to preserve precision.""" + val_above = 2**53 + 1 # 9007199254740993 + val_exact = 2**53 # 9007199254740992 + + data = pd.DataFrame( + { + "lossy": [val_above, val_exact], + "safe": [42, 100], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["lossy"].dtype == object + assert data["lossy"].iloc[0] == str(val_above) + assert data["lossy"].iloc[1] == str(val_exact) + assert pd.api.types.is_integer_dtype(data["safe"]) + + +def test_sanitize_dataframe_for_parquet_large_int_negative(): + """Negative integers beyond -2**53 must also be converted.""" + data = pd.DataFrame( + { + "neg": [-(2**53) - 1, 0], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["neg"].dtype == object + assert data["neg"].iloc[0] == str(-(2**53) - 1) + + +def test_sanitize_dataframe_for_parquet_int_at_boundary(): + """Integers exactly at 2**53 should not be converted (still exact in float64).""" + data = pd.DataFrame( + { + "boundary": [2**53, -(2**53)], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert pd.api.types.is_integer_dtype(data["boundary"]) + + +def test_sanitize_dataframe_for_parquet_mixed_int_with_none(): + """Mixed object column with None and large int should convert to strings.""" + data = pd.DataFrame( + { + "mixed": pd.array([2**53 + 1, None, 42], dtype=object), + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["mixed"].dtype == object + assert data["mixed"].iloc[0] == str(2**53 + 1) + + +def test_sanitize_dataframe_for_parquet_decimal_int_precision_loss(): + """Integer-valued Decimals above 2**53 should be converted to 
strings.""" + from decimal import Decimal + + data = pd.DataFrame( + { + "d": [Decimal("9007199254740993"), Decimal("42")], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["d"].dtype == object + assert data["d"].iloc[0] == str(Decimal("9007199254740993")) + + +def test_sanitize_dataframe_for_parquet_precision_loss_preserves_value(): + """Verify the string conversion preserves the exact integer value.""" + val = 9007199254740993 + assert float(val) == float(9007199254740992) # proves precision loss in float64 + + data = pd.DataFrame({"x": [val]}) + se._sanitize_dataframe_for_parquet(data) + assert data["x"].iloc[0] == "9007199254740993" # exact value preserved + + +def test_sanitize_dataframe_for_parquet_very_large_int(): + """Integers far beyond 2**53 (e.g. 2**64) must also be converted.""" + data = pd.DataFrame( + { + "huge": [2**64, 42], + } + ) + se._sanitize_dataframe_for_parquet(data) + assert data["huge"].dtype == object + assert data["huge"].iloc[0] == str(2**64) + + def test_create_sql_ssh_uri_no_ssh(): with se._create_sql_ssh_uri(False, {}) as url: assert url is None From 731e71b9624c6c396de89b546ab1409bf6a24a31 Mon Sep 17 00:00:00 2001 From: tomas Date: Tue, 10 Mar 2026 18:34:40 +0000 Subject: [PATCH 2/6] feat(pandas): Add functionality to cast large numbers to strings for JSON compatibility - Introduced `cast_large_numbers_to_string` function to convert numeric values exceeding the float64 safe integer range (2**53) to strings, preserving precision for JSON serialization. - Updated `PandasImplementation.to_records` method (JSON mode) to utilize the new function. - Added unit tests to ensure correct behavior for large numbers in dataframes. 
--- .../ocelots/pandas/implementation.py | 2 ++ deepnote_toolkit/ocelots/pandas/utils.py | 32 +++++++++++++++++++ deepnote_toolkit/sql/sql_execution.py | 17 ++-------- tests/unit/helpers/testing_dataframes.py | 11 +++++++ tests/unit/test_dataframe_utils.py | 29 +++++++++++++++++ 5 files changed, 76 insertions(+), 15 deletions(-) diff --git a/deepnote_toolkit/ocelots/pandas/implementation.py b/deepnote_toolkit/ocelots/pandas/implementation.py index fb52d0c..24a6306 100644 --- a/deepnote_toolkit/ocelots/pandas/implementation.py +++ b/deepnote_toolkit/ocelots/pandas/implementation.py @@ -13,6 +13,7 @@ from .analyze import analyze_columns from .utils import ( + cast_large_numbers_to_string, cast_objects_to_string, deduplicate_columns, fill_nat, @@ -303,6 +304,7 @@ def to_records(self, mode: Literal["json", "python"]) -> List[Dict[str, Any]]: if mode == "json": fill_nat(df_copy, "NaT") cast_objects_to_string(df_copy) + cast_large_numbers_to_string(df_copy) return df_copy.to_dict("records") def to_csv(self, path_or_buf: Union[str, TextIO]) -> None: diff --git a/deepnote_toolkit/ocelots/pandas/utils.py b/deepnote_toolkit/ocelots/pandas/utils.py index 2514628..5755cc0 100644 --- a/deepnote_toolkit/ocelots/pandas/utils.py +++ b/deepnote_toolkit/ocelots/pandas/utils.py @@ -1,3 +1,5 @@ +from decimal import Decimal + import numpy as np import pandas as pd from packaging.requirements import Requirement @@ -104,6 +106,36 @@ def to_string_truncated(elem): return df +MAX_SAFE_FLOAT64_INTEGER = 2**53 + + +def is_large_number(x) -> bool: + """Return True if *x* is a numeric value that would lose precision as float64. + + float64 can represent integers exactly only up to 2**53, so any + numeric value whose absolute value exceeds that threshold is + considered "large" and should be converted to a string. 
+ """ + try: + return isinstance(x, (int, float, Decimal, np.integer, np.floating)) and abs(x) > MAX_SAFE_FLOAT64_INTEGER + except (TypeError, OverflowError, ArithmeticError): + return False + + +def cast_large_numbers_to_string(df): + """Convert columns containing numbers beyond float64 safe integer range to strings. + + JavaScript's JSON.parse() reads all numbers as float64, which can only + represent integers exactly up to 2**53. Values above that threshold lose + precision, so we convert the entire column to strings to preserve the + exact value. + """ + for column in df: + if is_pure_numeric(df[column].dtype) and df[column].apply(is_large_number).any(): + df[column] = df[column].apply(safe_convert_to_string) + return df + + def is_type_datetime_or_timedelta(series_or_dtype): """ Returns True if the series or dtype is datetime or timedelta, False otherwise. diff --git a/deepnote_toolkit/sql/sql_execution.py b/deepnote_toolkit/sql/sql_execution.py index e05fb20..c93fe90 100644 --- a/deepnote_toolkit/sql/sql_execution.py +++ b/deepnote_toolkit/sql/sql_execution.py @@ -31,7 +31,7 @@ ) from deepnote_toolkit.ipython_utils import output_sql_metadata from deepnote_toolkit.logging import LoggerManager -from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns +from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns, is_large_number from deepnote_toolkit.sql.duckdb_sql import execute_duckdb_sql from deepnote_toolkit.sql.jinjasql_utils import render_jinja_sql_template from deepnote_toolkit.sql.query_preview import DeepnoteQueryPreview @@ -687,19 +687,6 @@ class BigQueryCredentialsError(Exception): return {"connect_args": {"client": client}} -def _is_large_number(x: Any) -> bool: - """Return True if *x* is a numeric value that would lose precision as float64. 
- - float64 can represent integers exactly only up to 2**53, so any - int, float, or Decimal whose absolute value exceeds that threshold - is considered "large" and will be converted to a string. - """ - try: - return isinstance(x, (int, float, Decimal)) and abs(x) > 2**53 - except (TypeError, OverflowError, ArithmeticError): - return False - - def _sanitize_dataframe_for_parquet(dataframe): """Sanitizes the dataframe so that we can safely call .to_parquet on it""" @@ -723,7 +710,7 @@ def _sanitize_dataframe_for_parquet(dataframe): # float64 can only represent integers exactly up to 2**53; values # above that threshold are converted to strings. for column in dataframe.columns: - if dataframe[column].apply(_is_large_number).any(): + if dataframe[column].apply(is_large_number).any(): dataframe[column] = dataframe[column].astype(str) diff --git a/tests/unit/helpers/testing_dataframes.py b/tests/unit/helpers/testing_dataframes.py index f3744a3..2fcc6c3 100644 --- a/tests/unit/helpers/testing_dataframes.py +++ b/tests/unit/helpers/testing_dataframes.py @@ -113,6 +113,17 @@ def create_dataframe_with_duplicate_column_names(): "col1": [2**53], } ), + "large_numbers_above_threshold": pd.DataFrame( + data={ + "col1": [2**53 + 1, 42, 2**53 + 100], + } + ), + "large_numbers_mixed_columns": pd.DataFrame( + data={ + "safe_col": [1, 2, 3], + "large_col": [2**53 + 1, 2**53 + 2, 2**53 + 3], + } + ), "infinity": pd.DataFrame( data={ "col1": [0, np.inf, -np.inf], diff --git a/tests/unit/test_dataframe_utils.py b/tests/unit/test_dataframe_utils.py index 38e8222..a7d441d 100644 --- a/tests/unit/test_dataframe_utils.py +++ b/tests/unit/test_dataframe_utils.py @@ -2,6 +2,7 @@ import unittest from unittest.mock import MagicMock +import numpy as np from ipykernel.jsonutil import json_clean from deepnote_toolkit.dataframe_utils import _describe_dataframe, add_formatters @@ -188,6 +189,34 @@ def test_large_numbers(self): self.assertEqual(result["columns"][0]["stats"]["min"], str(2**53)) 
self.assertEqual(result["columns"][0]["stats"]["max"], str(2**53)) + def test_large_numbers_above_threshold_are_strings_in_rows(self): + """Integers above 2**53 must be converted to strings in rows to preserve precision.""" + df = testing_dataframes["large_numbers_above_threshold"] + result = describe_and_json_clean(df) + self.assertEqual(result["row_count"], 3) + for row in result["rows"]: + self.assertIsInstance(row["col1"], str) + self.assertEqual(result["rows"][0]["col1"], str(2**53 + 1)) + self.assertEqual(result["rows"][1]["col1"], "42") + self.assertEqual(result["rows"][2]["col1"], str(2**53 + 100)) + + def test_large_numbers_mixed_columns_only_affects_large_column(self): + """Only columns containing values above 2**53 should be converted; safe columns stay numeric.""" + df = testing_dataframes["large_numbers_mixed_columns"] + result = describe_and_json_clean(df) + self.assertEqual(result["row_count"], 3) + for row in result["rows"]: + self.assertIsInstance(row["safe_col"], int) + self.assertIsInstance(row["large_col"], str) + self.assertEqual(result["rows"][0]["large_col"], str(2**53 + 1)) + + def test_large_numbers_at_boundary_stay_numeric(self): + """Integers exactly at 2**53 should remain as numbers (still exact in float64).""" + df = testing_dataframes["large_numbers"] + result = describe_and_json_clean(df) + for row in result["rows"]: + self.assertIsInstance(row["col1"], (int, np.integer)) + def test_infinity(self): df = testing_dataframes["infinity"] result = describe_and_json_clean(df) From 7784a6b37bc4a719f328619e52b7c1bf8356627e Mon Sep 17 00:00:00 2001 From: tomas Date: Tue, 10 Mar 2026 19:11:51 +0000 Subject: [PATCH 3/6] refactor(pandas): Improve readability of large number checks in utils.py - Reformatted the `is_large_number` and `cast_large_numbers_to_string` functions for better readability by using multi-line expressions. 
- Updated unit tests to call the `is_large_number` function directly instead of through a different module, ensuring consistency and clarity in test cases. --- deepnote_toolkit/ocelots/pandas/utils.py | 10 ++++- tests/unit/test_sql_execution_internal.py | 46 ++++++++++++----------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/deepnote_toolkit/ocelots/pandas/utils.py b/deepnote_toolkit/ocelots/pandas/utils.py index 5755cc0..a7881b8 100644 --- a/deepnote_toolkit/ocelots/pandas/utils.py +++ b/deepnote_toolkit/ocelots/pandas/utils.py @@ -117,7 +117,10 @@ def is_large_number(x) -> bool: considered "large" and should be converted to a string. """ try: - return isinstance(x, (int, float, Decimal, np.integer, np.floating)) and abs(x) > MAX_SAFE_FLOAT64_INTEGER + return ( + isinstance(x, (int, float, Decimal, np.integer, np.floating)) + and abs(x) > MAX_SAFE_FLOAT64_INTEGER + ) except (TypeError, OverflowError, ArithmeticError): return False @@ -131,7 +134,10 @@ def cast_large_numbers_to_string(df): exact value. 
""" for column in df: - if is_pure_numeric(df[column].dtype) and df[column].apply(is_large_number).any(): + if ( + is_pure_numeric(df[column].dtype) + and df[column].apply(is_large_number).any() + ): df[column] = df[column].apply(safe_convert_to_string) return df diff --git a/tests/unit/test_sql_execution_internal.py b/tests/unit/test_sql_execution_internal.py index 52d25c3..ad4ec00 100644 --- a/tests/unit/test_sql_execution_internal.py +++ b/tests/unit/test_sql_execution_internal.py @@ -246,39 +246,41 @@ def test_sanitize_dataframe_for_parquet_decimal_nan(): def test_is_large_number(): from decimal import Decimal + from deepnote_toolkit.ocelots.pandas.utils import is_large_number + # 2**53 boundary: float64 can represent integers exactly up to 2**53 - assert se._is_large_number(2**53) is False - assert se._is_large_number(2**53 + 1) is True - assert se._is_large_number(-(2**53)) is False - assert se._is_large_number(-(2**53) - 1) is True + assert is_large_number(2**53) is False + assert is_large_number(2**53 + 1) is True + assert is_large_number(-(2**53)) is False + assert is_large_number(-(2**53) - 1) is True # Small integers should not trigger - assert se._is_large_number(0) is False - assert se._is_large_number(1) is False - assert se._is_large_number(-1) is False - assert se._is_large_number(42) is False + assert is_large_number(0) is False + assert is_large_number(1) is False + assert is_large_number(-1) is False + assert is_large_number(42) is False # Large ints well beyond 2**53 should trigger - assert se._is_large_number(2**63 - 1) is True - assert se._is_large_number(2**63) is True - assert se._is_large_number(10**18) is True + assert is_large_number(2**63 - 1) is True + assert is_large_number(2**63) is True + assert is_large_number(10**18) is True # Floats - assert se._is_large_number(float("inf")) is True - assert se._is_large_number(float("nan")) is False - assert se._is_large_number(1.0) is False + assert is_large_number(float("inf")) is True + assert 
is_large_number(float("nan")) is False + assert is_large_number(1.0) is False # Decimals - assert se._is_large_number(Decimal("1e40")) is True - assert se._is_large_number(Decimal("9007199254740994")) is True - assert se._is_large_number(Decimal("100")) is False - assert se._is_large_number(Decimal("NaN")) is False - assert se._is_large_number(Decimal("sNaN")) is False - assert se._is_large_number(Decimal("Infinity")) is True + assert is_large_number(Decimal("1e40")) is True + assert is_large_number(Decimal("9007199254740994")) is True + assert is_large_number(Decimal("100")) is False + assert is_large_number(Decimal("NaN")) is False + assert is_large_number(Decimal("sNaN")) is False + assert is_large_number(Decimal("Infinity")) is True # Non-numeric types should not trigger - assert se._is_large_number("not a number") is False - assert se._is_large_number(None) is False + assert is_large_number("not a number") is False + assert is_large_number(None) is False def test_sanitize_dataframe_for_parquet_large_int_precision_loss(): From 7fbfcf1f665d372fb4a777922757bac1a13b0d28 Mon Sep 17 00:00:00 2001 From: tomas Date: Tue, 10 Mar 2026 19:16:10 +0000 Subject: [PATCH 4/6] Remove unused import --- deepnote_toolkit/sql/sql_execution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepnote_toolkit/sql/sql_execution.py b/deepnote_toolkit/sql/sql_execution.py index c93fe90..07b61fe 100644 --- a/deepnote_toolkit/sql/sql_execution.py +++ b/deepnote_toolkit/sql/sql_execution.py @@ -5,7 +5,6 @@ import uuid import warnings import weakref -from decimal import Decimal from typing import TYPE_CHECKING, Any, Optional from urllib.parse import quote From ed33db8b29879411ac45796384f88c667a7c4f14 Mon Sep 17 00:00:00 2001 From: tomas Date: Tue, 10 Mar 2026 20:18:32 +0000 Subject: [PATCH 5/6] Add type hints to cast_large_numbers_to_string Add explicit pd.DataFrame input and return type annotations to cast_large_numbers_to_string so callers and static type checkers (mypy) recognise the 
typed signature. --- deepnote_toolkit/ocelots/pandas/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepnote_toolkit/ocelots/pandas/utils.py b/deepnote_toolkit/ocelots/pandas/utils.py index a7881b8..55b8640 100644 --- a/deepnote_toolkit/ocelots/pandas/utils.py +++ b/deepnote_toolkit/ocelots/pandas/utils.py @@ -125,7 +125,7 @@ def is_large_number(x) -> bool: return False -def cast_large_numbers_to_string(df): +def cast_large_numbers_to_string(df: pd.DataFrame) -> pd.DataFrame: """Convert columns containing numbers beyond float64 safe integer range to strings. JavaScript's JSON.parse() reads all numbers as float64, which can only From ed6dea6e9198a217fc6fc2f5927ca6302e9e1cba Mon Sep 17 00:00:00 2001 From: tomas Date: Tue, 10 Mar 2026 20:35:15 +0000 Subject: [PATCH 6/6] Add type hint Any to is_large_number parameter Import typing.Any and annotate the x parameter of is_large_number so static type checkers (mypy) accept the function signature. --- deepnote_toolkit/ocelots/pandas/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deepnote_toolkit/ocelots/pandas/utils.py b/deepnote_toolkit/ocelots/pandas/utils.py index 55b8640..d786492 100644 --- a/deepnote_toolkit/ocelots/pandas/utils.py +++ b/deepnote_toolkit/ocelots/pandas/utils.py @@ -1,4 +1,5 @@ from decimal import Decimal +from typing import Any import numpy as np import pandas as pd @@ -109,7 +110,7 @@ def to_string_truncated(elem): MAX_SAFE_FLOAT64_INTEGER = 2**53 -def is_large_number(x) -> bool: +def is_large_number(x: Any) -> bool: """Return True if *x* is a numeric value that would lose precision as float64. float64 can represent integers exactly only up to 2**53, so any