Changes from all commits — 38 commits
ae0741b
feat: add standalone shuffle benchmark binary for profiling
andygrove Mar 21, 2026
9b5b305
feat: add --limit option to shuffle benchmark (default 1M rows)
andygrove Mar 21, 2026
e1ab490
perf: apply limit during parquet read to avoid scanning all files
andygrove Mar 21, 2026
b7682f4
feat: move shuffle_bench binary into shuffle crate
andygrove Mar 23, 2026
ca36cbd
chore: add comment explaining parquet/rand deps in shuffle crate
andygrove Mar 23, 2026
7225afd
Merge remote-tracking branch 'apache/main' into shuffle-bench-binary
andygrove Mar 26, 2026
6e8bed2
perf: add max_buffered_batches config and stream shuffle bench from p…
andygrove Mar 26, 2026
16ce30f
merge apache/main, remove max_buffered_batches changes
andygrove Mar 27, 2026
2ef57e7
cargo fmt
andygrove Mar 27, 2026
9136e10
prettier
andygrove Mar 27, 2026
7e16819
machete
andygrove Mar 27, 2026
22fe804
feat: add --concurrent-tasks flag to shuffle benchmark
andygrove Mar 28, 2026
58ab927
show metrics
andygrove Mar 30, 2026
c469077
improve metrics
andygrove Mar 30, 2026
4ed08c5
feat: add PartitionOutputStream with IPC serialization and compression
andygrove Mar 30, 2026
b0d4ab2
feat: implement ImmediateModePartitioner with partition eval, row rou…
andygrove Mar 30, 2026
56574d5
feat: export ImmediateModePartitioner and document factory wiring
andygrove Mar 30, 2026
cdbbdee
test: verify ImmediateModePartitioner block format compatibility with…
andygrove Mar 30, 2026
4bcb014
feat: add spark.comet.exec.shuffle.partitionerMode config
andygrove Mar 30, 2026
f2e459e
feat: add batch coalescing to PartitionOutputStream
andygrove Mar 30, 2026
9ccdda7
feat: add --mode flag to shuffle benchmark for immediate vs buffered
andygrove Mar 30, 2026
cf56e39
feat: add upfront memory reservation for coalescer buffers
andygrove Mar 30, 2026
f9139d7
format
andygrove Mar 30, 2026
76815a9
refactor: remove unnecessary batch slicing and BufReader in immediate…
andygrove Mar 30, 2026
42289ee
refactor: remove BufWriter from shuffle output and unused batch_size …
andygrove Mar 31, 2026
27484ce
fix: improve memory tracking and cleanup in immediate mode partitioner
andygrove Mar 31, 2026
033cda9
merge: integrate latest changes from apache/main
andygrove Mar 31, 2026
aa49a72
feat: scatter-write partitioner for immediate mode shuffle
andygrove Mar 31, 2026
4509985
refactor: column-first loop in scatter-write partitioner
andygrove Mar 31, 2026
e45d733
fix: update test for column-first scatter-write API
andygrove Mar 31, 2026
32194bf
fix: support complex types (List, Map, Struct) in scatter-write
andygrove Mar 31, 2026
7ea12ac
fix: skip spill test under miri (copy_file_range unsupported)
andygrove Mar 31, 2026
d06bfdb
fix: ignore miri for test_partition_output_stream_write_and_read
andygrove Apr 1, 2026
e66a6f3
Merge remote-tracking branch 'apache/main' into immediate-mode-partit…
andygrove Apr 1, 2026
17264d6
docs: document buffered vs immediate native shuffle partitioner modes
andygrove Apr 1, 2026
2b6f774
refactor: revert metrics changes and fix imprecise docs
andygrove Apr 1, 2026
cbd75cf
chore: format markdown table alignment in native_shuffle docs
andygrove Apr 1, 2026
51aed0e
change default to buffered
andygrove Apr 2, 2026
12 changes: 12 additions & 0 deletions common/src/main/scala/org/apache/comet/CometConf.scala
@@ -523,6 +523,18 @@ object CometConf extends ShimCometConf {
"Should not be larger than batch size `spark.comet.batchSize`")
.createWithDefault(8192)

val COMET_SHUFFLE_PARTITIONER_MODE: ConfigEntry[String] =
conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.partitionerMode")
.category(CATEGORY_SHUFFLE)
.doc(
"The partitioner mode used by the native shuffle writer. " +
"'immediate' writes partitioned IPC blocks immediately as batches arrive, " +
"reducing memory usage. 'buffered' buffers all rows before writing, which may " +
"improve performance for small datasets but uses more memory.")
.stringConf
.checkValues(Set("immediate", "buffered"))
.createWithDefault("buffered")

val COMET_SHUFFLE_WRITE_BUFFER_SIZE: ConfigEntry[Long] =
conf(s"$COMET_EXEC_CONFIG_PREFIX.shuffle.writeBufferSize")
.category(CATEGORY_SHUFFLE)
98 changes: 70 additions & 28 deletions docs/source/contributor-guide/native_shuffle.md
@@ -81,10 +81,18 @@ Native shuffle (`CometExchange`) is selected when all of the following condition
└─────────────────────────────────────────────────────────────────────────────┘
│ │
▼ ▼
┌───────────────────────────────────┐ ┌───────────────────────────────────┐
│ MultiPartitionShuffleRepartitioner │ │ SinglePartitionShufflePartitioner │
│ (hash/range partitioning) │ │ (single partition case) │
└───────────────────────────────────┘ └───────────────────────────────────┘
┌───────────────────────────────────────────────────────────────────────┐
│ Partitioner Selection │
│ Controlled by spark.comet.exec.shuffle.partitionerMode │
├───────────────────────────┬───────────────────────────────────────────┤
│ immediate                 │ buffered (default)                        │
│ ImmediateModePartitioner │ MultiPartitionShuffleRepartitioner │
│ (hash/range/round-robin) │ (hash/range/round-robin) │
│ Writes IPC blocks as │ Buffers all rows in memory │
│ batches arrive │ before writing │
├───────────────────────────┴───────────────────────────────────────────┤
│ SinglePartitionShufflePartitioner (single partition case) │
└───────────────────────────────────────────────────────────────────────┘
┌───────────────────────────────────┐
@@ -113,11 +121,13 @@ Native shuffle (`CometExchange`) is selected when all of the following condition

### Rust Side

| File | Location | Description |
| ----------------------- | ------------------------------------ | ------------------------------------------------------------------------------------ |
| `shuffle_writer.rs` | `native/core/src/execution/shuffle/` | `ShuffleWriterExec` plan and partitioners. Main shuffle logic. |
| `codec.rs` | `native/core/src/execution/shuffle/` | `ShuffleBlockWriter` for Arrow IPC encoding with compression. Also handles decoding. |
| `comet_partitioning.rs` | `native/core/src/execution/shuffle/` | `CometPartitioning` enum defining partition schemes (Hash, Range, Single). |
| File | Location | Description |
| ----------------------- | ---------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
| `shuffle_writer.rs` | `native/shuffle/src/` | `ShuffleWriterExec` plan. Selects partitioner based on `immediate_mode` flag. |
| `immediate_mode.rs` | `native/shuffle/src/partitioners/` | `ImmediateModePartitioner`. Scatter-writes rows into per-partition Arrow builders and flushes IPC blocks to in-memory buffers eagerly. |
| `multi_partition.rs` | `native/shuffle/src/partitioners/` | `MultiPartitionShuffleRepartitioner`. Buffers all rows in memory, then writes partitions. |
| `codec.rs` | `native/shuffle/src/` | `ShuffleBlockWriter` for Arrow IPC encoding with compression. Also handles decoding. |
| `comet_partitioning.rs` | `native/shuffle/src/` | `CometPartitioning` enum defining partition schemes (Hash, Range, Single). |

## Data Flow

@@ -129,23 +139,33 @@ Native shuffle (`CometExchange`) is selected when all of the following condition

2. **Native execution**: `CometExec.getCometIterator()` executes the plan in Rust.

3. **Partitioning**: `ShuffleWriterExec` receives batches and routes to the appropriate partitioner:
- `MultiPartitionShuffleRepartitioner`: For hash/range/round-robin partitioning
- `SinglePartitionShufflePartitioner`: For single partition (simpler path)
3. **Partitioning**: `ShuffleWriterExec` receives batches and routes to the appropriate partitioner
based on the `partitionerMode` configuration:
- **Immediate mode** (`ImmediateModePartitioner`): For hash/range/round-robin partitioning.
As each batch arrives, rows are scattered into per-partition Arrow array builders. When a
partition's builder reaches the target batch size, it is flushed as a compressed Arrow IPC
block to an in-memory buffer. Under memory pressure, these buffers are spilled to
per-partition temporary files. This keeps memory usage much lower than buffered mode since
data is encoded into compact IPC format eagerly rather than held as raw Arrow arrays.

> **Review comment (Member):** The current IPC writer uses block compression (compressing
> each batch), which may lead to poor compression ratios. In Gluten, the shuffle writer first
> serializes and buffers the batches, then performs streaming compression during eviction,
> achieving better compression ratios. I'm not entirely sure which is better.

4. **Buffering and spilling**: The partitioner buffers rows per partition. When memory pressure
exceeds the threshold, partitions spill to temporary files.
- **Buffered mode** (`MultiPartitionShuffleRepartitioner`): For hash/range/round-robin
partitioning. Buffers all input `RecordBatch`es in memory, then partitions and writes
them in a single pass. When memory pressure exceeds the threshold, partitions spill to
temporary files.

5. **Encoding**: `ShuffleBlockWriter` encodes each partition's data as compressed Arrow IPC:
- `SinglePartitionShufflePartitioner`: For single partition (simpler path, used regardless
of partitioner mode).
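
The immediate-mode routing described above can be sketched with simplified stand-in types. This is a hypothetical `PartitionBuilder`, with plain `i64` keys and modulo in place of Comet's real hash partitioning and per-column Arrow builders:

```rust
// Sketch of immediate-mode row routing: each row is assigned a partition,
// appended to that partition's builder, and a partition is flushed as soon
// as it reaches the target batch size.
const TARGET_BATCH_SIZE: usize = 4;

struct PartitionBuilder {
    rows: Vec<i64>,        // stand-in for per-column Arrow array builders
    flushed_blocks: usize, // stand-in for encoded IPC blocks in the buffer
}

fn route_rows(keys: &[i64], num_partitions: usize) -> Vec<PartitionBuilder> {
    let mut parts: Vec<PartitionBuilder> = (0..num_partitions)
        .map(|_| PartitionBuilder { rows: Vec::new(), flushed_blocks: 0 })
        .collect();
    for &k in keys {
        // Real code hashes the partition key columns; modulo is the sketch.
        let p = k.rem_euclid(num_partitions as i64) as usize;
        parts[p].rows.push(k);
        if parts[p].rows.len() == TARGET_BATCH_SIZE {
            // In Comet this would encode a compressed Arrow IPC block.
            parts[p].rows.clear();
            parts[p].flushed_blocks += 1;
        }
    }
    parts
}

fn main() {
    let parts = route_rows(&[0, 1, 2, 0, 2, 0, 0, 1], 3);
    assert_eq!(parts[0].flushed_blocks, 1); // partition 0 saw 4 rows -> one flush
    assert_eq!(parts[0].rows.len(), 0);
}
```

The point of the column-first / scatter-write refactor in the commit history is that the inner loop in real code iterates columns before rows; the sketch above only shows the routing and flush-on-full behavior.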

4. **Encoding**: `ShuffleBlockWriter` encodes each partition's data as compressed Arrow IPC:
- Writes compression type header
- Writes field count header
- Writes compressed IPC stream
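
A minimal sketch of the three-part block layout listed above. The byte widths here are assumptions for illustration; the authoritative encoding lives in `codec.rs`:

```rust
// Sketch (assumed layout, not the exact Comet wire format): a shuffle block
// with a one-byte compression-type header, a little-endian field-count
// header, and a placeholder for the compressed Arrow IPC payload.
fn write_block(codec: u8, num_fields: u32, payload: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(5 + payload.len());
    out.push(codec);                                  // compression type header
    out.extend_from_slice(&num_fields.to_le_bytes()); // field count header
    out.extend_from_slice(payload);                   // compressed IPC stream
    out
}

fn main() {
    let block = write_block(1, 3, b"ipc-bytes");
    assert_eq!(block[0], 1);
    assert_eq!(u32::from_le_bytes(block[1..5].try_into().unwrap()), 3);
    assert_eq!(&block[5..], b"ipc-bytes");
}
```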

6. **Output files**: Two files are produced:
5. **Output files**: Two files are produced:
- **Data file**: Concatenated partition data
- **Index file**: Array of 8-byte little-endian offsets marking partition boundaries
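
The index-file convention can be illustrated with a small decoder: partition `i` occupies bytes `[offsets[i], offsets[i+1])` of the data file, so partition lengths are adjacent-offset differences. A sketch, assuming a well-formed index containing `num_partitions + 1` offsets:

```rust
// Decode a shuffle index file (8-byte little-endian offsets) into
// per-partition byte lengths.
fn partition_lengths(index_bytes: &[u8]) -> Vec<u64> {
    let offsets: Vec<u64> = index_bytes
        .chunks_exact(8)
        .map(|c| u64::from_le_bytes(c.try_into().unwrap()))
        .collect();
    offsets.windows(2).map(|w| w[1] - w[0]).collect()
}

fn main() {
    // Offsets 0, 120, 120, 300 -> three partitions (the second is empty).
    let mut index = Vec::new();
    for off in [0u64, 120, 120, 300] {
        index.extend_from_slice(&off.to_le_bytes());
    }
    assert_eq!(partition_lengths(&index), vec![120, 0, 180]);
}
```

This mirrors what `CometNativeShuffleWriter` does on the JVM side when it reads the index file to recover partition lengths for Spark's `IndexShuffleBlockResolver`.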

7. **Commit**: Back in JVM, `CometNativeShuffleWriter` reads the index file to get partition
6. **Commit**: Back in JVM, `CometNativeShuffleWriter` reads the index file to get partition
lengths and commits via Spark's `IndexShuffleBlockResolver`.

### Read Path
@@ -201,10 +221,31 @@ sizes.

## Memory Management

Native shuffle uses DataFusion's memory management with spilling support:
Native shuffle uses DataFusion's memory management. The memory characteristics differ
between the two partitioner modes:

### Immediate Mode

Immediate mode keeps memory usage low by partitioning and encoding data eagerly as it arrives,
rather than buffering all input rows before writing:

- **Per-partition builders**: Each partition has a set of Arrow array builders sized to the
target batch size. When a builder fills up, it is flushed as a compressed IPC block to an
in-memory buffer.
- **Memory footprint**: Proportional to `num_partitions × batch_size` for the builders, plus
the accumulated IPC buffers. This is typically much smaller than buffered mode since IPC
encoding is more compact than raw Arrow arrays.
- **Spilling**: When memory pressure is detected via DataFusion's `MemoryConsumer` trait,
partition builders are flushed and all IPC buffers are drained to per-partition temporary
files on disk.
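
As a back-of-envelope check of the `num_partitions × batch_size` claim above (illustrative numbers, not measurements):

```rust
// Rough builder-footprint estimate: every partition holds builders with
// capacity for one target batch, so memory scales with
// num_partitions x batch_size x average row width.
fn builder_bytes(num_partitions: u64, batch_size: u64, avg_row_bytes: u64) -> u64 {
    num_partitions * batch_size * avg_row_bytes
}

fn main() {
    // 200 shuffle partitions, 8192-row target batches, ~64 bytes per row.
    let bytes = builder_bytes(200, 8192, 64);
    assert_eq!(bytes, 104_857_600); // exactly 100 MiB of builder capacity
}
```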

### Buffered Mode

Buffered mode holds all input data in memory before writing:

- **Memory pool**: Tracks memory usage across the shuffle operation.
- **Spill threshold**: When buffered data exceeds the threshold, partitions spill to disk.
- **Buffered batches**: All incoming `RecordBatch`es are accumulated in a `Vec`.
- **Spill threshold**: When buffered data exceeds the memory threshold, partitions spill to
temporary files on disk.
- **Per-partition spilling**: Each partition has its own spill file. Multiple spills for a
partition are concatenated when writing the final output.
- **Scratch space**: Reusable buffers for partition ID computation to reduce allocations.
@@ -232,14 +273,15 @@ independently compressed, allowing parallel decompression during reads.

## Configuration

| Config | Default | Description |
| ------------------------------------------------- | ------- | ---------------------------------------- |
| `spark.comet.exec.shuffle.enabled` | `true` | Enable Comet shuffle |
| `spark.comet.exec.shuffle.mode` | `auto` | Shuffle mode: `native`, `jvm`, or `auto` |
| `spark.comet.exec.shuffle.compression.codec` | `zstd` | Compression codec |
| `spark.comet.exec.shuffle.compression.zstd.level` | `1` | Zstd compression level |
| `spark.comet.shuffle.write.buffer.size` | `1MB` | Write buffer size |
| `spark.comet.columnar.shuffle.batch.size` | `8192` | Target rows per batch |
| Config | Default | Description |
| ------------------------------------------------- | ----------- | ------------------------------------------- |
| `spark.comet.exec.shuffle.enabled` | `true` | Enable Comet shuffle |
| `spark.comet.exec.shuffle.mode` | `auto` | Shuffle mode: `native`, `jvm`, or `auto` |
| `spark.comet.exec.shuffle.partitionerMode`        | `buffered`  | Partitioner mode: `immediate` or `buffered` |
| `spark.comet.exec.shuffle.compression.codec` | `zstd` | Compression codec |
| `spark.comet.exec.shuffle.compression.zstd.level` | `1` | Zstd compression level |
| `spark.comet.shuffle.write.buffer.size` | `1MB` | Write buffer size |
| `spark.comet.columnar.shuffle.batch.size` | `8192` | Target rows per batch |

## Comparison with JVM Shuffle

11 changes: 11 additions & 0 deletions docs/source/user-guide/latest/tuning.md
@@ -144,6 +144,17 @@ Comet provides a fully native shuffle implementation, which generally provides t
supports `HashPartitioning`, `RangePartitioning` and `SinglePartitioning` but currently only supports primitive type
partitioning keys. Columns that are not partitioning keys may contain complex types like maps, structs, and arrays.

Native shuffle has two partitioner modes, configured via
`spark.comet.exec.shuffle.partitionerMode`:

- **`immediate`**: Writes partitioned Arrow IPC blocks immediately as each batch arrives,
  spilling to disk under memory pressure. This mode uses less memory because it does not need
  to buffer the entire input before writing, which helps with large datasets.

- **`buffered`** (default): Buffers all input rows in memory before partitioning and writing
  to disk. This may improve performance for small datasets that fit in memory, but uses
  significantly more memory.
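
As a usage sketch, the mode is set like any other Spark conf. The config key comes from this PR; the surrounding options and values are illustrative only:

```shell
# Hypothetical job submission forcing the immediate partitioner
spark-submit \
  --conf spark.comet.exec.shuffle.enabled=true \
  --conf spark.comet.exec.shuffle.mode=native \
  --conf spark.comet.exec.shuffle.partitionerMode=immediate \
  ...
```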

#### Columnar (JVM) Shuffle

Comet Columnar shuffle is JVM-based and supports `HashPartitioning`, `RoundRobinPartitioning`, `RangePartitioning`, and
85 changes: 85 additions & 0 deletions native/Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions native/core/src/execution/planner.rs
@@ -1374,6 +1374,7 @@ impl PhysicalPlanner {
writer.output_index_file.clone(),
writer.tracing_enabled,
write_buffer_size,
writer.immediate_mode,
)?);

Ok((
4 changes: 4 additions & 0 deletions native/proto/src/proto/operator.proto
@@ -294,6 +294,10 @@ message ShuffleWriter {
// Size of the write buffer in bytes used when writing shuffle data to disk.
// Larger values may improve write performance but use more memory.
int32 write_buffer_size = 8;
// Whether to use immediate mode partitioner. When true, partitioned IPC blocks
// are written immediately as batches arrive. When false, rows are buffered
// before writing (the original behavior).
bool immediate_mode = 9;
}

message ParquetWriter {
11 changes: 11 additions & 0 deletions native/shuffle/Cargo.toml
@@ -32,6 +32,7 @@ publish = false
arrow = { workspace = true }
async-trait = { workspace = true }
bytes = { workspace = true }
clap = { version = "4", features = ["derive"], optional = true }
crc32c = "0.6.8"
crc32fast = "1.3.2"
datafusion = { workspace = true }
@@ -43,6 +44,8 @@ itertools = "0.14.0"
jni = "0.21"
log = "0.4"
lz4_flex = { version = "0.13.0", default-features = false, features = ["frame"] }
# parquet is only used by the shuffle_bench binary (shuffle-bench feature)
parquet = { workspace = true, optional = true }
simd-adler32 = "0.3.9"
snap = "1.1"
tokio = { version = "1", features = ["rt-multi-thread"] }
Expand All @@ -54,10 +57,18 @@ datafusion = { workspace = true, features = ["parquet_encryption", "sql"] }
itertools = "0.14.0"
tempfile = "3.26.0"

[features]
shuffle-bench = ["clap", "parquet"]

[lib]
name = "datafusion_comet_shuffle"
path = "src/lib.rs"

[[bin]]
name = "shuffle_bench"
path = "src/bin/shuffle_bench.rs"
required-features = ["shuffle-bench"]

[[bench]]
name = "shuffle_writer"
harness = false
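
With this feature gating, the benchmark binary would be invoked roughly as follows. The flag names (`--limit`, `--mode`, `--concurrent-tasks`) are taken from the commit messages above; treat this as a sketch, not authoritative usage:

```shell
# Run the standalone shuffle benchmark from the shuffle crate
cargo run --release --features shuffle-bench --bin shuffle_bench -- \
  --limit 1000000 --mode immediate --concurrent-tasks 4
```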