diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
index 8d8691ada1d38..7a25c519ea14e 100644
--- a/pandas/_libs/src/parser/tokenizer.c
+++ b/pandas/_libs/src/parser/tokenizer.c
@@ -1510,8 +1510,17 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
     // Process string of digits.
     num_digits = 0;
     int n = 0;
+    // Saturate on the value, not the digit count, so leading zeros such as
+    // "1e00005" still parse correctly and `n * 10` can never overflow int.
+    // Exponents this large are far outside the range of a double anyway.
+    const int MAX_EXPONENT = 9999;
     while (isdigit_ascii(*p)) {
-      n = n * 10 + (*p - '0');
+      if (n <= (MAX_EXPONENT - 9) / 10) {
+        n = n * 10 + (*p - '0');
+      } else {
+        n = MAX_EXPONENT;
+      }
+      // Keep consuming digits once saturated so the parse position is right.
       num_digits++;
       p++;
     }
diff --git a/pandas/tests/io/parser/test_issue_63089.py b/pandas/tests/io/parser/test_issue_63089.py
new file mode 100644
index 0000000000000..b14f72b5b91ca
--- /dev/null
+++ b/pandas/tests/io/parser/test_issue_63089.py
@@ -0,0 +1,55 @@
+"""
+Test for issue #63089 - read_csv segfault with large exponent
+"""
+
+import io
+
+import numpy as np
+import pytest
+
+import pandas as pd
+
+
+class TestIssue63089:
+    def test_large_exponent_no_segfault(self):
+        """Test that extremely large exponents don't cause segfault."""
+        # This previously caused SIGSEGV due to integer overflow
+        # when parsing the exponent
+        result = pd.read_csv(io.StringIO("h\n4e492493924924"))
+
+        # Should parse as infinity or large float, not crash
+        assert len(result) == 1
+        assert "h" in result.columns
+        # The value should be infinity since the exponent is way too large
+        value = result["h"].iloc[0]
+        assert np.isinf(value) or value > 1e308
+
+    @pytest.mark.parametrize(
+        "test_val",
+        [
+            "1e999999999",  # Very large positive exponent
+            "1e-999999999",  # Very large negative exponent
+            "2.5e123456789",  # Large exponent with decimal
+        ],
+    )
+    def test_various_large_exponents(self, test_val):
+        """Test various edge cases with large exponents."""
+        result = pd.read_csv(io.StringIO(f"col\n{test_val}"))
+
+        assert len(result) == 1
+        value = result["col"].iloc[0]
+        if isinstance(value, str):
+            # Out-of-range input may be left unparsed, but must be intact
+            assert value == test_val
+        else:
+            # Otherwise it must saturate like Python's float(): inf or 0.0
+            assert value == float(test_val)
+
+    @pytest.mark.parametrize("float_precision", ["high", "legacy", "round_trip"])
+    def test_exponent_with_leading_zeros(self, float_precision):
+        """Leading zeros must not count towards the exponent overflow cap."""
+        result = pd.read_csv(
+            io.StringIO("col\n1e0000000005"), float_precision=float_precision
+        )
+
+        assert result["col"].iloc[0] == 100000.0
diff --git a/reproduce_issue_63089.py b/reproduce_issue_63089.py
new file mode 100644
index 0000000000000..8d98926eefdef
--- /dev/null
+++ b/reproduce_issue_63089.py
@@ -0,0 +1,14 @@
+import io
+
+import pandas as pd
+
+print("Testing issue #63089...")
+try:
+    result = pd.read_csv(
+        io.StringIO("""h
+4e492493924924""")
+    )
+    print("Success! Result:")
+    print(result)
+except Exception as e:
+    print(f"Exception occurred: {type(e).__name__}: {e}")