Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions deepnote_toolkit/ocelots/pandas/implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from .analyze import analyze_columns
from .utils import (
cast_large_numbers_to_string,
cast_objects_to_string,
deduplicate_columns,
fill_nat,
Expand Down Expand Up @@ -303,6 +304,7 @@ def to_records(self, mode: Literal["json", "python"]) -> List[Dict[str, Any]]:
if mode == "json":
fill_nat(df_copy, "NaT")
cast_objects_to_string(df_copy)
cast_large_numbers_to_string(df_copy)
return df_copy.to_dict("records")

def to_csv(self, path_or_buf: Union[str, TextIO]) -> None:
Expand Down
39 changes: 39 additions & 0 deletions deepnote_toolkit/ocelots/pandas/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from decimal import Decimal
from typing import Any

import numpy as np
import pandas as pd
from packaging.requirements import Requirement
Expand Down Expand Up @@ -104,6 +107,42 @@ def to_string_truncated(elem):
return df


MAX_SAFE_FLOAT64_INTEGER = 2**53


def is_large_number(x: Any) -> bool:
    """Return True if *x* is a numeric value too large to survive a float64 round-trip.

    float64 represents integers exactly only up to 2**53, so any
    int/float/Decimal (or numpy scalar) whose magnitude is strictly greater
    than that limit is considered "large" and should be stringified.
    Non-numeric inputs and NaNs yield False.
    """
    if not isinstance(x, (int, float, Decimal, np.integer, np.floating)):
        return False
    try:
        # abs() may raise for special Decimal values such as sNaN.
        return abs(x) > MAX_SAFE_FLOAT64_INTEGER
    except (TypeError, OverflowError, ArithmeticError):
        return False


def cast_large_numbers_to_string(df: pd.DataFrame) -> pd.DataFrame:
    """Stringify numeric columns holding values beyond the float64-exact range.

    JavaScript's JSON.parse() decodes every number as a float64, which is
    exact only for integers up to 2**53.  Any purely numeric column that
    contains at least one value past that threshold is converted wholesale
    to strings so the exact values survive serialization.  The frame is
    mutated in place and returned.
    """
    for name in df.columns:
        series = df[name]
        if not is_pure_numeric(series.dtype):
            continue
        if series.apply(is_large_number).any():
            df[name] = series.apply(safe_convert_to_string)
    return df


def is_type_datetime_or_timedelta(series_or_dtype):
"""
Returns True if the series or dtype is datetime or timedelta, False otherwise.
Expand Down
17 changes: 5 additions & 12 deletions deepnote_toolkit/sql/sql_execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import uuid
import warnings
import weakref
from decimal import Decimal
from typing import TYPE_CHECKING, Any, Optional
from urllib.parse import quote

Expand All @@ -31,7 +30,7 @@
)
from deepnote_toolkit.ipython_utils import output_sql_metadata
from deepnote_toolkit.logging import LoggerManager
from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns
from deepnote_toolkit.ocelots.pandas.utils import deduplicate_columns, is_large_number
from deepnote_toolkit.sql.duckdb_sql import execute_duckdb_sql
from deepnote_toolkit.sql.jinjasql_utils import render_jinja_sql_template
from deepnote_toolkit.sql.query_preview import DeepnoteQueryPreview
Expand Down Expand Up @@ -687,14 +686,6 @@ class BigQueryCredentialsError(Exception):
return {"connect_args": {"client": client}}


def _is_large_number(x: Any) -> bool:
"""Return True if *x* is a numeric value that exceeds the int64 range"""
try:
return isinstance(x, (int, float, Decimal)) and abs(x) > 2**63 - 1
except (TypeError, OverflowError, ArithmeticError):
return False


def _sanitize_dataframe_for_parquet(dataframe):
"""Sanitizes the dataframe so that we can safely call .to_parquet on it"""

Expand All @@ -714,9 +705,11 @@ def _sanitize_dataframe_for_parquet(dataframe):
):
dataframe[column] = dataframe[column].astype(str)

# Convert columns with large numbers to strings
# Convert columns with large numbers to strings to preserve precision.
# float64 can only represent integers exactly up to 2**53; values
# above that threshold are converted to strings.
for column in dataframe.columns:
if dataframe[column].apply(_is_large_number).any():
if dataframe[column].apply(is_large_number).any():
dataframe[column] = dataframe[column].astype(str)


Expand Down
11 changes: 11 additions & 0 deletions tests/unit/helpers/testing_dataframes.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,17 @@ def create_dataframe_with_duplicate_column_names():
"col1": [2**53],
}
),
"large_numbers_above_threshold": pd.DataFrame(
data={
"col1": [2**53 + 1, 42, 2**53 + 100],
}
),
"large_numbers_mixed_columns": pd.DataFrame(
data={
"safe_col": [1, 2, 3],
"large_col": [2**53 + 1, 2**53 + 2, 2**53 + 3],
}
),
"infinity": pd.DataFrame(
data={
"col1": [0, np.inf, -np.inf],
Expand Down
29 changes: 29 additions & 0 deletions tests/unit/test_dataframe_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import unittest
from unittest.mock import MagicMock

import numpy as np
from ipykernel.jsonutil import json_clean

from deepnote_toolkit.dataframe_utils import _describe_dataframe, add_formatters
Expand Down Expand Up @@ -188,6 +189,34 @@ def test_large_numbers(self):
self.assertEqual(result["columns"][0]["stats"]["min"], str(2**53))
self.assertEqual(result["columns"][0]["stats"]["max"], str(2**53))

def test_large_numbers_above_threshold_are_strings_in_rows(self):
    """Integers above 2**53 must be converted to strings in rows to preserve precision."""
    frame = testing_dataframes["large_numbers_above_threshold"]
    result = describe_and_json_clean(frame)
    self.assertEqual(result["row_count"], 3)
    values = [row["col1"] for row in result["rows"]]
    for value in values:
        self.assertIsInstance(value, str)
    self.assertEqual(values, [str(2**53 + 1), "42", str(2**53 + 100)])

def test_large_numbers_mixed_columns_only_affects_large_column(self):
    """Only columns containing values above 2**53 should be converted; safe columns stay numeric."""
    result = describe_and_json_clean(testing_dataframes["large_numbers_mixed_columns"])
    self.assertEqual(result["row_count"], 3)
    rows = result["rows"]
    for row in rows:
        # The safe column must remain numeric; the large one becomes strings.
        self.assertIsInstance(row["safe_col"], int)
        self.assertIsInstance(row["large_col"], str)
    self.assertEqual(rows[0]["large_col"], str(2**53 + 1))

def test_large_numbers_at_boundary_stay_numeric(self):
    """Integers exactly at 2**53 should remain as numbers (still exact in float64)."""
    result = describe_and_json_clean(testing_dataframes["large_numbers"])
    for row in result["rows"]:
        self.assertIsInstance(row["col1"], (int, np.integer))

def test_infinity(self):
df = testing_dataframes["infinity"]
result = describe_and_json_clean(df)
Expand Down
139 changes: 125 additions & 14 deletions tests/unit/test_sql_execution_internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ def test_sanitize_dataframe_for_parquet_decimal_large_numbers():


def test_sanitize_dataframe_for_parquet_decimal_small_numbers():
"""Decimal values within int64 range should not be converted."""
"""Decimal values within float64 exact range should not be converted."""
from decimal import Decimal

data = pd.DataFrame(
Expand Down Expand Up @@ -246,19 +246,130 @@ def test_sanitize_dataframe_for_parquet_decimal_nan():
def test_is_large_number():
from decimal import Decimal

assert se._is_large_number(2**63) is True
assert se._is_large_number(-(2**63) - 1) is True
assert se._is_large_number(2**63 - 1) is False
assert se._is_large_number(42) is False
assert se._is_large_number(float("inf")) is True
assert se._is_large_number(float("nan")) is False
assert se._is_large_number(Decimal("1e40")) is True
assert se._is_large_number(Decimal("100")) is False
assert se._is_large_number(Decimal("NaN")) is False
assert se._is_large_number(Decimal("sNaN")) is False
assert se._is_large_number(Decimal("Infinity")) is True
assert se._is_large_number("not a number") is False
assert se._is_large_number(None) is False
from deepnote_toolkit.ocelots.pandas.utils import is_large_number

# 2**53 boundary: float64 can represent integers exactly up to 2**53
assert is_large_number(2**53) is False
assert is_large_number(2**53 + 1) is True
assert is_large_number(-(2**53)) is False
assert is_large_number(-(2**53) - 1) is True

# Small integers should not trigger
assert is_large_number(0) is False
assert is_large_number(1) is False
assert is_large_number(-1) is False
assert is_large_number(42) is False

# Large ints well beyond 2**53 should trigger
assert is_large_number(2**63 - 1) is True
assert is_large_number(2**63) is True
assert is_large_number(10**18) is True

# Floats
assert is_large_number(float("inf")) is True
assert is_large_number(float("nan")) is False
assert is_large_number(1.0) is False

# Decimals
assert is_large_number(Decimal("1e40")) is True
assert is_large_number(Decimal("9007199254740994")) is True
assert is_large_number(Decimal("100")) is False
assert is_large_number(Decimal("NaN")) is False
assert is_large_number(Decimal("sNaN")) is False
assert is_large_number(Decimal("Infinity")) is True

# Non-numeric types should not trigger
assert is_large_number("not a number") is False
assert is_large_number(None) is False


def test_sanitize_dataframe_for_parquet_large_int_precision_loss():
    """Integers above 2**53 must be converted to strings to preserve precision."""
    above = 2**53 + 1  # 9007199254740993 — not representable in float64
    exact = 2**53  # 9007199254740992 — representable, but the column converts as a whole
    frame = pd.DataFrame({"lossy": [above, exact], "safe": [42, 100]})
    se._sanitize_dataframe_for_parquet(frame)
    assert frame["lossy"].dtype == object
    assert list(frame["lossy"]) == [str(above), str(exact)]
    assert pd.api.types.is_integer_dtype(frame["safe"])


def test_sanitize_dataframe_for_parquet_large_int_negative():
    """Negative integers beyond -2**53 must also be converted."""
    big_negative = -(2**53) - 1
    frame = pd.DataFrame({"neg": [big_negative, 0]})
    se._sanitize_dataframe_for_parquet(frame)
    assert frame["neg"].dtype == object
    assert frame["neg"].iloc[0] == str(big_negative)


def test_sanitize_dataframe_for_parquet_int_at_boundary():
    """Integers exactly at 2**53 should not be converted (still exact in float64)."""
    frame = pd.DataFrame({"boundary": [2**53, -(2**53)]})
    se._sanitize_dataframe_for_parquet(frame)
    assert pd.api.types.is_integer_dtype(frame["boundary"])


def test_sanitize_dataframe_for_parquet_mixed_int_with_none():
    """Mixed object column with None and large int should convert to strings."""
    column = pd.array([2**53 + 1, None, 42], dtype=object)
    frame = pd.DataFrame({"mixed": column})
    se._sanitize_dataframe_for_parquet(frame)
    assert frame["mixed"].dtype == object
    assert frame["mixed"].iloc[0] == str(2**53 + 1)


def test_sanitize_dataframe_for_parquet_decimal_int_precision_loss():
    """Integer-valued Decimals above 2**53 should be converted to strings."""
    from decimal import Decimal

    big = Decimal("9007199254740993")  # 2**53 + 1
    frame = pd.DataFrame({"d": [big, Decimal("42")]})
    se._sanitize_dataframe_for_parquet(frame)
    assert frame["d"].dtype == object
    assert frame["d"].iloc[0] == str(big)


def test_sanitize_dataframe_for_parquet_precision_loss_preserves_value():
    """Verify the string conversion preserves the exact integer value."""
    value = 9007199254740993  # 2**53 + 1
    # Sanity check: round-tripping through float64 collapses the value.
    assert float(value) == float(9007199254740992)

    frame = pd.DataFrame({"x": [value]})
    se._sanitize_dataframe_for_parquet(frame)
    assert frame["x"].iloc[0] == "9007199254740993"  # exact value preserved


def test_sanitize_dataframe_for_parquet_very_large_int():
    """Integers far beyond 2**53 (e.g. 2**64) must also be converted."""
    huge = 2**64
    frame = pd.DataFrame({"huge": [huge, 42]})
    se._sanitize_dataframe_for_parquet(frame)
    assert frame["huge"].dtype == object
    assert frame["huge"].iloc[0] == str(huge)


def test_create_sql_ssh_uri_no_ssh():
Expand Down
Loading