From 59bb18013d63c8b19891ca3f7d03578716885ab5 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 15:59:01 +0200
Subject: [PATCH 1/6] support large lists with pa.int64 types

---
 src/datasets/arrow_writer.py      | 10 +++++++++-
 src/datasets/features/features.py | 12 ++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 3174f5cf206..37a1fb1a639 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -206,6 +206,7 @@ def __init__(
         optimized_int_type: Optional[FeatureType] = None,
     ):
         # assert type is None or try_type is None,
+        # print("[TypedSequence]: Init with type", type, "try_type", try_type, "optimized_int_type", optimized_int_type)
         if type is not None and try_type is not None:
             raise ValueError("You cannot specify both type and try_type")
         # set attributes
@@ -524,7 +525,14 @@ def _build_schema(self, inferred_schema: pa.Schema):
 
     def _build_writer(self, inferred_schema: pa.Schema):
         self._schema, self._features = self._build_schema(inferred_schema)
-        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
+        allow_64bit = False
+        for field_type in self._schema.types:
+            if pa.types.is_large_list(field_type):
+                allow_64bit = True
+                break
+        self.pa_writer = pa.RecordBatchStreamWriter(
+            self.stream, self._schema, options=pa.ipc.IpcWriteOptions(allow_64bit=allow_64bit)
+        )
 
     @property
     def schema(self):
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index 54d84ef33e2..da582d73369 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -1525,6 +1525,8 @@ def numpy_to_pyarrow_listarray(arr: np.ndarray, type: pa.DataType = None) -> pa.
 
 
 def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> pa.ListArray:
+    # import pdb; pdb.set_trace()
+    # import traceback as tb; tb.print_stack()
     null_mask = np.array([arr is None for arr in l_arr])
     null_indices = np.arange(len(null_mask))[null_mask] - np.arange(np.sum(null_mask))
     l_arr = [arr for arr in l_arr if arr is not None]
@@ -1532,9 +1534,15 @@ def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> p
         [0] + [len(arr) for arr in l_arr], dtype=object
     )  # convert to dtype object to allow None insertion
     offsets = np.insert(offsets, null_indices, None)
-    offsets = pa.array(offsets, type=pa.int32())
     values = pa.concat_arrays(l_arr)
-    return pa.ListArray.from_arrays(offsets, values)
+    # offsets = pa.array(offsets, type=pa.int32())
+    # return pa.ListArray.from_arrays(offsets, values)
+    try:
+        offsets = pa.array(offsets, type=pa.int32())
+        return pa.ListArray.from_arrays(offsets, values)
+    except pa.lib.ArrowInvalid:
+        offsets = pa.array(offsets, type=pa.int64())
+        return pa.LargeListArray.from_arrays(offsets, values)
 
 
 def list_of_np_array_to_pyarrow_listarray(l_arr: list[np.ndarray], type: pa.DataType = None) -> pa.ListArray:
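A note on the PyArrow behavior that [PATCH 1/6] relies on: pa.ListArray stores its offsets as int32, so converting an offset beyond 2**31 - 1 with pa.array(offsets, type=pa.int32()) raises ArrowInvalid, which is exactly what the new except branch catches before retrying with int64 offsets and pa.LargeListArray. A minimal standalone sketch (toy sizes here are illustrative; a real overflow needs a child array of 2**31 or more elements):

    import numpy as np
    import pyarrow as pa

    values = pa.array(np.zeros(10, dtype=np.uint16))

    # int32 offsets -> list<item: uint16>
    offsets32 = pa.array([0, len(values)], type=pa.int32())
    print(pa.ListArray.from_arrays(offsets32, values).type)

    # int64 offsets -> large_list<item: uint16>; this is the fallback path,
    # taken once casting an offset > 2**31 - 1 to int32 raises ArrowInvalid
    offsets64 = pa.array([0, len(values)], type=pa.int64())
    print(pa.LargeListArray.from_arrays(offsets64, values).type)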
From 68c764dc4b77b1ed09744a86ebfbd6bf1ba4cdf2 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 16:38:58 +0200
Subject: [PATCH 2/6] add test for int32 overflow

---
 tests/test_arrow_dataset.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 4661e8c6dd7..add816029b1 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -4773,3 +4773,23 @@ def test_from_polars_save_to_disk_and_load_from_disk_round_trip_with_large_list(
 def test_polars_round_trip():
     ds = Dataset.from_dict({"x": [[1, 2], [3, 4, 5]], "y": ["a", "b"]})
     assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)
+
+
+def test_map_int32_overflow():
+    # GH: 7821
+    def process_batch(batch):
+        res = []
+        for _ in batch["id"]:
+            res.append(np.zeros((2**31)).astype(np.uint16))
+
+        return {"audio": res}
+
+    ds = Dataset.from_dict({"id": [0]})
+    mapped_ds = ds.map(
+        process_batch,
+        batched=True,
+        batch_size=1,
+        num_proc=0,
+        remove_columns=ds.column_names,
+    )
+    assert isinstance(mapped_ds, Dataset)

From 1b1bf7dfab3c677974df4e929f931fb023594886 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 16:41:53 +0200
Subject: [PATCH 3/6] remove debug statements

---
 src/datasets/arrow_writer.py      | 1 -
 src/datasets/features/features.py | 4 ----
 2 files changed, 5 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 37a1fb1a639..ea065e2ef7a 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -206,7 +206,6 @@ def __init__(
         optimized_int_type: Optional[FeatureType] = None,
     ):
         # assert type is None or try_type is None,
-        # print("[TypedSequence]: Init with type", type, "try_type", try_type, "optimized_int_type", optimized_int_type)
         if type is not None and try_type is not None:
             raise ValueError("You cannot specify both type and try_type")
         # set attributes
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index da582d73369..4515978d55d 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -1525,8 +1525,6 @@ def numpy_to_pyarrow_listarray(arr: np.ndarray, type: pa.DataType = None) -> pa.
 
 
 def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> pa.ListArray:
-    # import pdb; pdb.set_trace()
-    # import traceback as tb; tb.print_stack()
     null_mask = np.array([arr is None for arr in l_arr])
     null_indices = np.arange(len(null_mask))[null_mask] - np.arange(np.sum(null_mask))
     l_arr = [arr for arr in l_arr if arr is not None]
@@ -1535,8 +1533,6 @@ def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> p
     )  # convert to dtype object to allow None insertion
     offsets = np.insert(offsets, null_indices, None)
     values = pa.concat_arrays(l_arr)
-    # offsets = pa.array(offsets, type=pa.int32())
-    # return pa.ListArray.from_arrays(offsets, values)
     try:
         offsets = pa.array(offsets, type=pa.int32())
         return pa.ListArray.from_arrays(offsets, values)

From 901abfb37646888e700e16386016a3bfb4191ab7 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Mon, 27 Oct 2025 16:25:24 +0100
Subject: [PATCH 4/6] allow 64bit on arrow writer

---
 src/datasets/arrow_writer.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index ea065e2ef7a..606a007af90 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -524,13 +524,8 @@ def _build_schema(self, inferred_schema: pa.Schema):
 
     def _build_writer(self, inferred_schema: pa.Schema):
         self._schema, self._features = self._build_schema(inferred_schema)
-        allow_64bit = False
-        for field_type in self._schema.types:
-            if pa.types.is_large_list(field_type):
-                allow_64bit = True
-                break
         self.pa_writer = pa.RecordBatchStreamWriter(
-            self.stream, self._schema, options=pa.ipc.IpcWriteOptions(allow_64bit=allow_64bit)
+            self.stream, self._schema, options=pa.ipc.IpcWriteOptions(allow_64bit=True)
         )
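On why [PATCH 4/6] drops the per-field scan: the removed loop only inspected the schema's top-level types, so a large list nested deeper (for example inside a struct) would have left allow_64bit at False and the stream writer would still have rejected 64-bit buffer lengths; passing allow_64bit=True unconditionally covers that case and is a no-op for data that fits in 32 bits. A standalone sketch of the IPC option in isolation (the schema and field name here are illustrative, not taken from the patch):

    import pyarrow as pa

    schema = pa.schema({"audio": pa.large_list(pa.uint16())})
    table = pa.table({"audio": [[0, 1, 2], [3, 4]]}, schema=schema)

    sink = pa.BufferOutputStream()
    # allow_64bit defaults to False, which makes the IPC writer reject
    # buffer lengths that do not fit in a signed 32-bit integer
    options = pa.ipc.IpcWriteOptions(allow_64bit=True)
    with pa.ipc.new_stream(sink, schema, options=options) as writer:
        writer.write_table(table)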
From 8b0ad7a0af5bba15e4f2be7ef801546a5fa1c74c Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Tue, 28 Oct 2025 16:53:07 +0100
Subject: [PATCH 5/6] mark test as memory intensive

---
 pyproject.toml              | 1 +
 tests/test_arrow_dataset.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 511a9e0d744..abac4a9d90a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,4 +24,5 @@ filterwarnings = [
 markers = [
     "unit: unit test",
     "integration: integration test",
+    "high_memory: tests requiring significant RAM (>=16GB)",
 ]
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index e6cfe97ed41..35593b7d22b 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -4785,8 +4785,12 @@ def test_polars_round_trip():
     assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)
 
 
+@pytest.mark.integration
+@pytest.mark.high_memory
 def test_map_int32_overflow():
     # GH: 7821
+    # This test requires ~4GB RAM to create a large array that triggers int32 overflow
+    # Marked as high_memory (>=16GB) to prevent CI failures
     def process_batch(batch):
         res = []
         for _ in batch["id"]:

From 1b890b331c23d2a4434268c673426c46ac3ccc4a Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 29 Oct 2025 06:39:32 +0100
Subject: [PATCH 6/6] update tests

---
 tests/test_arrow_dataset.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 35593b7d22b..e40bce33296 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -4785,7 +4785,6 @@ def test_polars_round_trip():
     assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)
 
 
-@pytest.mark.integration
 @pytest.mark.high_memory
 def test_map_int32_overflow():
     # GH: 7821
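For context on the memory figures in the test comment of [PATCH 5/6]: np.zeros((2**31)) materializes a float64 buffer first, and .astype(np.uint16) then copies it into a second, smaller one, so peak usage is dominated by the temporary — which lines up with the marker's ">=16GB" wording rather than the ~4GB of the final array. The arithmetic, as a small sketch (not part of the patch):

    import numpy as np

    n = 2**31
    print(n * np.dtype(np.float64).itemsize / 2**30)  # 16.0 GiB (np.zeros default dtype)
    print(n * np.dtype(np.uint16).itemsize / 2**30)   # 4.0 GiB (after the astype copy)

With the marker registered in pyproject.toml, constrained machines and CI can deselect the test with a standard pytest marker expression, e.g. pytest tests/test_arrow_dataset.py -m "not high_memory".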