From 59bb18013d63c8b19891ca3f7d03578716885ab5 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 15:59:01 +0200
Subject: [PATCH 1/6] support large lists with pa.int64 types

---
 src/datasets/arrow_writer.py      | 10 +++++++++-
 src/datasets/features/features.py | 12 ++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 3174f5cf206..37a1fb1a639 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -206,6 +206,7 @@ def __init__(
         optimized_int_type: Optional[FeatureType] = None,
     ):
         # assert type is None or try_type is None,
+        # print("[TypedSequence]: Init with type", type, "try_type", try_type, "optimized_int_type", optimized_int_type)
         if type is not None and try_type is not None:
             raise ValueError("You cannot specify both type and try_type")
         # set attributes
@@ -524,7 +525,14 @@ def _build_schema(self, inferred_schema: pa.Schema):
 
     def _build_writer(self, inferred_schema: pa.Schema):
         self._schema, self._features = self._build_schema(inferred_schema)
-        self.pa_writer = pa.RecordBatchStreamWriter(self.stream, self._schema)
+        allow_64bit = False
+        for field_type in self._schema.types:
+            if pa.types.is_large_list(field_type):
+                allow_64bit = True
+                break
+        self.pa_writer = pa.RecordBatchStreamWriter(
+            self.stream, self._schema, options=pa.ipc.IpcWriteOptions(allow_64bit=allow_64bit)
+        )
 
     @property
     def schema(self):
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index 54d84ef33e2..da582d73369 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -1525,6 +1525,8 @@ def numpy_to_pyarrow_listarray(arr: np.ndarray, type: pa.DataType = None) -> pa.
 
 
 def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> pa.ListArray:
+    # import pdb; pdb.set_trace()
+    # import traceback as tb; tb.print_stack()
     null_mask = np.array([arr is None for arr in l_arr])
     null_indices = np.arange(len(null_mask))[null_mask] - np.arange(np.sum(null_mask))
     l_arr = [arr for arr in l_arr if arr is not None]
@@ -1532,9 +1534,15 @@ def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> p
         [0] + [len(arr) for arr in l_arr], dtype=object
     )  # convert to dtype object to allow None insertion
     offsets = np.insert(offsets, null_indices, None)
-    offsets = pa.array(offsets, type=pa.int32())
     values = pa.concat_arrays(l_arr)
-    return pa.ListArray.from_arrays(offsets, values)
+    # offsets = pa.array(offsets, type=pa.int32())
+    # return pa.ListArray.from_arrays(offsets, values)
+    try:
+        offsets = pa.array(offsets, type=pa.int32())
+        return pa.ListArray.from_arrays(offsets, values)
+    except pa.lib.ArrowInvalid:
+        offsets = pa.array(offsets, type=pa.int64())
+        return pa.LargeListArray.from_arrays(offsets, values)
 
 
 def list_of_np_array_to_pyarrow_listarray(l_arr: list[np.ndarray], type: pa.DataType = None) -> pa.ListArray:
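A note on the PyArrow behavior that [PATCH 1/6] relies on: pa.ListArray stores its offsets as int32, so converting an offset beyond 2**31 - 1 with pa.array(offsets, type=pa.int32()) raises ArrowInvalid, which is exactly what the new except branch catches before retrying with int64 offsets and pa.LargeListArray. A minimal standalone sketch (toy sizes here are illustrative; a real overflow needs a child array of 2**31 or more elements):

    import numpy as np
    import pyarrow as pa

    values = pa.array(np.zeros(10, dtype=np.uint16))

    # int32 offsets -> list<item: uint16>
    offsets32 = pa.array([0, len(values)], type=pa.int32())
    print(pa.ListArray.from_arrays(offsets32, values).type)

    # int64 offsets -> large_list<item: uint16>; this is the fallback path,
    # taken once casting an offset > 2**31 - 1 to int32 raises ArrowInvalid
    offsets64 = pa.array([0, len(values)], type=pa.int64())
    print(pa.LargeListArray.from_arrays(offsets64, values).type)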
From 68c764dc4b77b1ed09744a86ebfbd6bf1ba4cdf2 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 16:38:58 +0200
Subject: [PATCH 2/6] add test for int32 overflow

---
 tests/test_arrow_dataset.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 4661e8c6dd7..add816029b1 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -4773,3 +4773,23 @@ def test_from_polars_save_to_disk_and_load_from_disk_round_trip_with_large_list(
 def test_polars_round_trip():
     ds = Dataset.from_dict({"x": [[1, 2], [3, 4, 5]], "y": ["a", "b"]})
     assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)
+
+
+def test_map_int32_overflow():
+    # GH: 7821
+    def process_batch(batch):
+        res = []
+        for _ in batch["id"]:
+            res.append(np.zeros((2**31)).astype(np.uint16))
+
+        return {"audio": res}
+
+    ds = Dataset.from_dict({"id": [0]})
+    mapped_ds = ds.map(
+        process_batch,
+        batched=True,
+        batch_size=1,
+        num_proc=0,
+        remove_columns=ds.column_names,
+    )
+    assert isinstance(mapped_ds, Dataset)

From 1b1bf7dfab3c677974df4e929f931fb023594886 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 22 Oct 2025 16:41:53 +0200
Subject: [PATCH 3/6] remove debug statements

---
 src/datasets/arrow_writer.py      | 1 -
 src/datasets/features/features.py | 4 ----
 2 files changed, 5 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index 37a1fb1a639..ea065e2ef7a 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -206,7 +206,6 @@ def __init__(
         optimized_int_type: Optional[FeatureType] = None,
     ):
         # assert type is None or try_type is None,
-        # print("[TypedSequence]: Init with type", type, "try_type", try_type, "optimized_int_type", optimized_int_type)
         if type is not None and try_type is not None:
             raise ValueError("You cannot specify both type and try_type")
         # set attributes
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index da582d73369..4515978d55d 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -1525,8 +1525,6 @@ def numpy_to_pyarrow_listarray(arr: np.ndarray, type: pa.DataType = None) -> pa.
 
 
 def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> pa.ListArray:
-    # import pdb; pdb.set_trace()
-    # import traceback as tb; tb.print_stack()
     null_mask = np.array([arr is None for arr in l_arr])
     null_indices = np.arange(len(null_mask))[null_mask] - np.arange(np.sum(null_mask))
     l_arr = [arr for arr in l_arr if arr is not None]
@@ -1535,8 +1533,6 @@ def list_of_pa_arrays_to_pyarrow_listarray(l_arr: list[Optional[pa.Array]]) -> p
     )  # convert to dtype object to allow None insertion
     offsets = np.insert(offsets, null_indices, None)
     values = pa.concat_arrays(l_arr)
-    # offsets = pa.array(offsets, type=pa.int32())
-    # return pa.ListArray.from_arrays(offsets, values)
     try:
         offsets = pa.array(offsets, type=pa.int32())
         return pa.ListArray.from_arrays(offsets, values)

From 901abfb37646888e700e16386016a3bfb4191ab7 Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Mon, 27 Oct 2025 16:25:24 +0100
Subject: [PATCH 4/6] allow 64bit on arrow writer

---
 src/datasets/arrow_writer.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/datasets/arrow_writer.py b/src/datasets/arrow_writer.py
index ea065e2ef7a..606a007af90 100644
--- a/src/datasets/arrow_writer.py
+++ b/src/datasets/arrow_writer.py
@@ -524,13 +524,8 @@ def _build_schema(self, inferred_schema: pa.Schema):
 
     def _build_writer(self, inferred_schema: pa.Schema):
         self._schema, self._features = self._build_schema(inferred_schema)
-        allow_64bit = False
-        for field_type in self._schema.types:
-            if pa.types.is_large_list(field_type):
-                allow_64bit = True
-                break
         self.pa_writer = pa.RecordBatchStreamWriter(
-            self.stream, self._schema, options=pa.ipc.IpcWriteOptions(allow_64bit=allow_64bit)
+            self.stream, self._schema, options=pa.ipc.IpcWriteOptions(allow_64bit=True)
         )
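On why [PATCH 4/6] drops the per-field scan: the removed loop only inspected the schema's top-level types, so a large list nested deeper (for example inside a struct) would have left allow_64bit at False and the stream writer would still have rejected 64-bit buffer lengths; passing allow_64bit=True unconditionally covers that case and is a no-op for data that fits in 32 bits. A standalone sketch of the IPC option in isolation (the schema and field name here are illustrative, not taken from the patch):

    import pyarrow as pa

    schema = pa.schema({"audio": pa.large_list(pa.uint16())})
    table = pa.table({"audio": [[0, 1, 2], [3, 4]]}, schema=schema)

    sink = pa.BufferOutputStream()
    # allow_64bit defaults to False, which makes the IPC writer reject
    # buffer lengths that do not fit in a signed 32-bit integer
    options = pa.ipc.IpcWriteOptions(allow_64bit=True)
    with pa.ipc.new_stream(sink, schema, options=options) as writer:
        writer.write_table(table)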
From 8b0ad7a0af5bba15e4f2be7ef801546a5fa1c74c Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Tue, 28 Oct 2025 16:53:07 +0100
Subject: [PATCH 5/6] mark test as memory intensive

---
 pyproject.toml              | 1 +
 tests/test_arrow_dataset.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 511a9e0d744..abac4a9d90a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,4 +24,5 @@ filterwarnings = [
 markers = [
     "unit: unit test",
     "integration: integration test",
+    "high_memory: tests requiring significant RAM (>=16GB)",
 ]
diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index e6cfe97ed41..35593b7d22b 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -4785,8 +4785,12 @@ def test_polars_round_trip():
     assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)
 
 
+@pytest.mark.integration
+@pytest.mark.high_memory
 def test_map_int32_overflow():
     # GH: 7821
+    # This test requires ~4GB RAM to create a large array that triggers int32 overflow
+    # Marked as high_memory (>=16GB) to prevent CI failures
     def process_batch(batch):
         res = []
         for _ in batch["id"]:

From 1b890b331c23d2a4434268c673426c46ac3ccc4a Mon Sep 17 00:00:00 2001
From: Tobias Pitters
Date: Wed, 29 Oct 2025 06:39:32 +0100
Subject: [PATCH 6/6] update tests

---
 tests/test_arrow_dataset.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_arrow_dataset.py b/tests/test_arrow_dataset.py
index 35593b7d22b..e40bce33296 100644
--- a/tests/test_arrow_dataset.py
+++ b/tests/test_arrow_dataset.py
@@ -4785,7 +4785,6 @@ def test_polars_round_trip():
     assert isinstance(Dataset.from_polars(ds.to_polars()), Dataset)
 
 
-@pytest.mark.integration
 @pytest.mark.high_memory
 def test_map_int32_overflow():
     # GH: 7821
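For context on the memory figures in the test comment of [PATCH 5/6]: np.zeros((2**31)) materializes a float64 buffer first, and .astype(np.uint16) then copies it into a second, smaller one, so peak usage is dominated by the temporary — which lines up with the marker's ">=16GB" wording rather than the ~4GB of the final array. The arithmetic, as a small sketch (not part of the patch):

    import numpy as np

    n = 2**31
    print(n * np.dtype(np.float64).itemsize / 2**30)  # 16.0 GiB (np.zeros default dtype)
    print(n * np.dtype(np.uint16).itemsize / 2**30)   # 4.0 GiB (after the astype copy)

With the marker registered in pyproject.toml, constrained machines and CI can deselect the test with a standard pytest marker expression, e.g. pytest tests/test_arrow_dataset.py -m "not high_memory".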