From e5d6ea02b150bc29ebda9cf753a88f2a707b1d8c Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Fri, 13 Feb 2026 16:15:10 +0000 Subject: [PATCH 1/6] replace builder with bulk take in naive_rebuild Signed-off-by: Baris Palaska --- vortex-array/Cargo.toml | 4 ++ vortex-array/benches/listview_rebuild.rs | 79 +++++++++++++++++++++ vortex-array/src/arrays/listview/rebuild.rs | 28 ++------ 3 files changed, 90 insertions(+), 21 deletions(-) create mode 100644 vortex-array/benches/listview_rebuild.rs diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index c310b18eeed..4b1ebd5b332 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -169,5 +169,9 @@ harness = false name = "take_fsl" harness = false +[[bench]] +name = "listview_rebuild" +harness = false + [package.metadata.cargo-machete] ignored = ["getrandom_v03"] diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs new file mode 100644 index 00000000000..c502122f6a9 --- /dev/null +++ b/vortex-array/benches/listview_rebuild.rs @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Benchmarks for ListView rebuild with primitive and VarBinView elements. + +#![allow(clippy::unwrap_used)] +#![allow(clippy::cast_possible_truncation)] + +use divan::Bencher; +use vortex_array::IntoArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::ListViewRebuildMode; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; + +fn main() { + divan::main(); +} + +const NUM_LISTS: &[usize] = &[100, 1_000, 10_000]; + +/// Build a ListViewArray of primitives with overlapping views. +fn make_primitive_listview(num_lists: usize) -> ListViewArray { + let element_count = num_lists * 4; + let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array(); + + let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +/// Build a ListViewArray of strings with overlapping views. 
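+/// Both constructors lay the views out with `offsets[i] = i * 2` and a fixed size of 4, so
+/// list `i` covers `elements[2*i..2*i + 4]` and adjacent lists share two elements: list 0
+/// covers elements 0..4, list 1 covers 2..6, list 2 covers 4..8, and so on (a rough sketch of
+/// the overlap pattern; the element values themselves differ between the two constructors).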
+fn make_varbinview_listview(num_lists: usize) -> ListViewArray { + let element_count = num_lists * 4; + let strings: Vec = (0..element_count) + .map(|i| { + if i % 3 == 0 { + format!("long-string-value-{i:06}") + } else { + format!("s{i}") + } + }) + .collect(); + let elements = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array(); + + let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +#[divan::bench(args = NUM_LISTS)] +fn rebuild_primitive(bencher: Bencher, num_lists: usize) { + let listview = make_primitive_listview(num_lists); + bencher + .with_inputs(|| &listview) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench(args = NUM_LISTS)] +fn rebuild_varbinview(bencher: Bencher, num_lists: usize) { + let listview = make_varbinview_listview(num_lists); + bencher + .with_inputs(|| &listview) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index d2714b671e6..7907296ef07 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -14,7 +14,6 @@ use crate::Array; use crate::IntoArray; use crate::ToCanonical; use crate::arrays::ListViewArray; -use crate::builders::builder_with_capacity; use crate::compute; use crate::vtable::ValidityHelper; @@ -103,18 +102,14 @@ impl ListViewArray { }) } - // TODO(connor)[ListView]: We should benchmark if it is faster to use `take` on the elements - // instead of using a builder. /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece /// by piece. + /// + /// Collects all element indices into a flat `BufferMut` and performs a single bulk + /// `take` to produce the new elements array. fn naive_rebuild( &self, ) -> VortexResult { - let element_dtype = self - .dtype() - .as_list_element_opt() - .vortex_expect("somehow had a canonical list that was not a list"); - let offsets_canonical = self.offsets().to_primitive(); let offsets_slice = offsets_canonical.as_slice::(); let sizes_canonical = self.sizes().to_primitive(); @@ -129,17 +124,8 @@ impl ListViewArray { // null lists to 0. let mut new_sizes = BufferMut::::with_capacity(len); - // Canonicalize the elements up front as we will be slicing the elements quite a lot. - let elements_canonical = self - .elements() - .to_canonical() - .vortex_expect("canonicalize elements for rebuild") - .into_array(); - - // Note that we do not know what the exact capacity should be of the new elements since - // there could be overlaps in the existing `ListViewArray`. - let mut new_elements_builder = - builder_with_capacity(element_dtype.as_ref(), self.elements().len()); + // Collect all element indices for a single bulk take. 
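+ // `self.elements().len()` is only an estimate of how many indices we will collect: overlapping
+ // views can push the true total above it, while unreferenced elements can leave it below. The
+ // buffer simply grows if the estimate is exceeded.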
+ let mut take_indices = BufferMut::::with_capacity(self.elements().len()); let mut n_elements = NewOffset::zero(); for index in 0..len { @@ -159,14 +145,14 @@ impl ListViewArray { new_offsets.push(n_elements); new_sizes.push(size); - new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?); + take_indices.extend(start as u64..stop as u64); n_elements += num_traits::cast(size).vortex_expect("Cast failed"); } let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - let elements = new_elements_builder.finish(); + let elements = self.elements().take(take_indices.into_array())?; debug_assert_eq!( n_elements.as_(), From dd4950771b099ce4b303ce282847d4fee67cfc9c Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Fri, 13 Feb 2026 16:40:24 +0000 Subject: [PATCH 2/6] to_canonical Signed-off-by: Baris Palaska --- vortex-array/benches/listview_rebuild.rs | 6 ------ vortex-array/src/arrays/listview/rebuild.rs | 6 +++++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index c502122f6a9..4fc4e15970d 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -21,14 +21,11 @@ fn main() { const NUM_LISTS: &[usize] = &[100, 1_000, 10_000]; -/// Build a ListViewArray of primitives with overlapping views. fn make_primitive_listview(num_lists: usize) -> ListViewArray { let element_count = num_lists * 4; let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); - ListViewArray::new( elements, offsets.into_array(), @@ -37,7 +34,6 @@ fn make_primitive_listview(num_lists: usize) -> ListViewArray { ) } -/// Build a ListViewArray of strings with overlapping views. fn make_varbinview_listview(num_lists: usize) -> ListViewArray { let element_count = num_lists * 4; let strings: Vec = (0..element_count) @@ -50,10 +46,8 @@ fn make_varbinview_listview(num_lists: usize) -> ListViewArray { }) .collect(); let elements = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); - ListViewArray::new( elements, offsets.into_array(), diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 7907296ef07..b9b619c496b 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -152,7 +152,11 @@ impl ListViewArray { let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - let elements = self.elements().take(take_indices.into_array())?; + let elements = self + .elements() + .take(take_indices.into_array())? + .to_canonical()? 
+ .into_array(); debug_assert_eq!( n_elements.as_(), From 66b9f69c67cd6f12192b55d2b4efd350ab75a98e Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 10:28:11 +0000 Subject: [PATCH 3/6] use both methods Signed-off-by: Baris Palaska --- vortex-array/benches/listview_rebuild.rs | 107 +++++++-- vortex-array/src/arrays/listview/rebuild.rs | 253 +++++++++++++++++++- 2 files changed, 328 insertions(+), 32 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index 4fc4e15970d..52451e9e591 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -1,7 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Benchmarks for ListView rebuild with primitive and VarBinView elements. +//! Benchmarks for ListView rebuild across different element types and scenarios. +//! +//! The heuristic internally picks between bulk-take (single `take`) and per-list-copy (per-list +//! `slice` + builder copy) strategies. These scenarios exercise both paths: +//! - `crossover` straddles the decision boundary +//! - `large_i8` and `small_overlap` exercise the extremes +//! - `varbinview` and `struct_elements` exercise special-case types #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] @@ -11,21 +17,34 @@ use vortex_array::IntoArray; use vortex_array::arrays::ListViewArray; use vortex_array::arrays::ListViewRebuildMode; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; +use vortex_dtype::FieldNames; fn main() { divan::main(); } -const NUM_LISTS: &[usize] = &[100, 1_000, 10_000]; - -fn make_primitive_listview(num_lists: usize) -> ListViewArray { - let element_count = num_lists * 4; +fn make_primitive_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); - let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +fn make_i8_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; + let elements = PrimitiveArray::from_iter((0..element_count).map(|i| i as i8)).into_array(); + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); ListViewArray::new( elements, offsets.into_array(), @@ -34,8 +53,8 @@ fn make_primitive_listview(num_lists: usize) -> ListViewArray { ) } -fn make_varbinview_listview(num_lists: usize) -> ListViewArray { - let element_count = num_lists * 4; +fn make_varbinview_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; let strings: Vec = (0..element_count) .map(|i| { if i % 3 == 0 { @@ -46,8 +65,8 @@ fn make_varbinview_listview(num_lists: usize) -> ListViewArray { }) .collect(); let elements = 
VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); - let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); ListViewArray::new( elements, offsets.into_array(), @@ -56,18 +75,68 @@ fn make_varbinview_listview(num_lists: usize) -> ListViewArray { ) } -#[divan::bench(args = NUM_LISTS)] -fn rebuild_primitive(bencher: Bencher, num_lists: usize) { - let listview = make_primitive_listview(num_lists); +fn make_struct_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; + let field_a = PrimitiveArray::from_iter(0..element_count as i32).into_array(); + let field_b = PrimitiveArray::from_iter((0..element_count).map(|i| i as f64)).into_array(); + let elements = StructArray::try_new( + FieldNames::from(["a", "b"]), + vec![field_a, field_b], + element_count, + Validity::NonNullable, + ) + .unwrap() + .into_array(); + + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +// ── i32 around threshold (8+4)*64 = 768: exercises both strategies ────────── +const CROSSOVER_SIZES: &[usize] = &[512, 768, 1024]; + +#[divan::bench(args = CROSSOVER_SIZES)] +fn rebuild_crossover(bencher: Bencher, list_size: usize) { + let lv = make_primitive_lv(1_000, list_size, list_size); + bencher + .with_inputs(|| &lv) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench] +fn rebuild_large_i8(bencher: Bencher) { + let lv = make_i8_lv(1_000, 65_536, 65_536); + bencher + .with_inputs(|| &lv) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench] +fn rebuild_small_overlap(bencher: Bencher) { + let lv = make_primitive_lv(1_000, 8, 1); + bencher + .with_inputs(|| &lv) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench] +fn rebuild_varbinview(bencher: Bencher) { + let lv = make_varbinview_lv(1_000, 1_024, 1_024); bencher - .with_inputs(|| &listview) + .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -#[divan::bench(args = NUM_LISTS)] -fn rebuild_varbinview(bencher: Bencher, num_lists: usize) { - let listview = make_varbinview_listview(num_lists); +#[divan::bench] +fn rebuild_struct(bencher: Bencher) { + let lv = make_struct_lv(1_000, 1_024, 1_024); bencher - .with_inputs(|| &listview) + .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index b9b619c496b..0b29e9ad509 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -3,6 +3,7 @@ use num_traits::FromPrimitive; use vortex_buffer::BufferMut; +use vortex_dtype::DType; use vortex_dtype::IntegerPType; use vortex_dtype::Nullability; use vortex_dtype::match_each_integer_ptype; @@ -14,6 +15,7 @@ use crate::Array; use crate::IntoArray; use crate::ToCanonical; use crate::arrays::ListViewArray; +use 
crate::builders::builder_with_capacity; use crate::compute; use crate::vtable::ValidityHelper; @@ -102,14 +104,153 @@ impl ListViewArray { }) } - /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece - /// by piece. - /// - /// Collects all element indices into a flat `BufferMut` and performs a single bulk - /// `take` to produce the new elements array. + /// Picks between [`naive_rebuild_bulk_take`](Self::naive_rebuild_bulk_take) and + /// [`naive_rebuild_per_list_copy`](Self::naive_rebuild_per_list_copy) via heuristic. fn naive_rebuild( &self, ) -> VortexResult { + let element_dtype = self + .dtype() + .as_list_element_opt() + .vortex_expect("somehow had a canonical list that was not a list"); + let sizes_canonical = self.sizes().to_primitive(); + let total: u64 = sizes_canonical + .as_slice::() + .iter() + .map(|s| (*s).as_() as u64) + .sum(); + let use_per_list_copy = Self::should_use_per_list_copy(element_dtype, total, self.len()); + + if use_per_list_copy { + self.naive_rebuild_per_list_copy::() + } else { + self.naive_rebuild_bulk_take::() + } + } + + /// Decides whether the per-list-copy strategy should be used over bulk-take. + /// + /// Empirical heuristic: use per-list-copy when `avg_list_size >= (8 + E) * 64 / fsl_divisor`. + /// - `8` — extra per-element cost of bulk-take (writing a u64 index that per-list-copy avoids) + /// - `E` — element byte width; larger elements make the builder's per-call overhead + /// (slice dispatch, `to_primitive()`, validity mask) relatively more expensive, + /// raising the threshold before per-list-copy's sequential memcpy wins + /// - `64` — base number of elements per list needed to amortize per-list-copy's ~200ns + /// per-list overhead (upfront canonicalize + per-list slice dispatch) + /// - `fsl_divisor` — for `FixedSizeList<_, N>`, `clamp(N, 1, 2)`: bulk-take expands each + /// outer index into N inner sub-indices, so the threshold is lowered + fn should_use_per_list_copy( + element_dtype: &DType, + total_output_elements: u64, + num_lists: usize, + ) -> bool { + if num_lists == 0 { + return false; + } + let avg = total_output_elements / num_lists as u64; + match element_dtype { + // Struct: per-list-copy creates separate per-field builders and reconstructs + // the structure element-by-element, making it 10-40x slower than bulk-take at + // all tested sizes (2-field and 4-field). This overhead is fundamental to the + // builder architecture and does not amortize with larger lists. + DType::Struct(..) => false, + // FixedSizeList: bulk-take expands each outer index into N inner + // sub-indices, making it ~2x more expensive per element than flat types. + // We account for this by dividing the base threshold by clamp(N, 1, 2): + // - N >= 2: threshold halved (e.g. FSL → (8+4)*32 = 384) + // - N == 1: threshold unchanged (same as flat types) + // Uses the *inner* element size, not the total FSL size, because per-list-copy + // extends the inner flat buffer directly. + DType::FixedSizeList(inner, n, ..) => inner + .element_size() + .is_some_and(|e| avg >= (8 + e as u64) * 64 / (*n as u64).clamp(1, 2)), + _ => { + if let Some(e) = element_dtype.element_size() { + // Flat fixed-width types (primitives, bools): the base formula. + // Both strategies copy E bytes per element, but bulk-take additionally + // writes an 8-byte u64 index and per-list-copy has per-list overhead + // (~200ns for slice dispatch + builder calls). 
The (8 + E) factor + // also captures the builder's higher per-element overhead vs SIMD + // bulk-take. Validated across i8/i16/i32/f64/bool. + avg >= (8 + e as u64) * 64 + } else { + // List: per-list-copy does cheap offset slicing + bulk + // inner element copy, crossing over at ~1024. We use a fixed threshold + // because inner list sizes aren't cheaply observable here. + // Utf8/Binary: always bulk-take — their builder does expensive Arc + // ref-count bumps and view metadata copying (crossover >50K). + // List: always bulk-take (same reason as Utf8/Binary). + element_dtype + .as_list_element_opt() + .and_then(|d| d.element_size()) + .is_some_and(|_| avg >= 1024) + } + } + } + } + + /// Rebuilds elements using the **bulk-take** strategy: collect all element indices into a flat + /// `BufferMut`, perform a single bulk `take`, then canonicalize. + fn naive_rebuild_bulk_take( + &self, + ) -> VortexResult { + let offsets_canonical = self.offsets().to_primitive(); + let offsets_slice = offsets_canonical.as_slice::(); + let sizes_canonical = self.sizes().to_primitive(); + let sizes_slice = sizes_canonical.as_slice::(); + + let len = offsets_slice.len(); + + let mut new_offsets = BufferMut::::with_capacity(len); + let mut new_sizes = BufferMut::::with_capacity(len); + let mut take_indices = BufferMut::::with_capacity(self.elements().len()); + + let mut n_elements = NewOffset::zero(); + for index in 0..len { + if !self.is_valid(index)? { + new_offsets.push(n_elements); + new_sizes.push(S::zero()); + continue; + } + + let offset = offsets_slice[index]; + let size = sizes_slice[index]; + let start = offset.as_(); + let stop = start + size.as_(); + + new_offsets.push(n_elements); + new_sizes.push(size); + take_indices.extend(start as u64..stop as u64); + n_elements += num_traits::cast(size).vortex_expect("Cast failed"); + } + + let elements = self + .elements() + .take(take_indices.into_array())? + .to_canonical()? + .into_array(); + let offsets = new_offsets.into_array(); + let sizes = new_sizes.into_array(); + + // SAFETY: same invariants as `naive_rebuild_per_list_copy` — offsets are sequential and + // non-overlapping, all (offset, size) pairs reference valid elements, and the validity + // array is preserved from the original. + Ok(unsafe { + ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone()) + .with_zero_copy_to_list(true) + }) + } + + /// Rebuilds elements using the **per-list-copy** strategy: canonicalize elements upfront, then + /// per-list `slice` + `extend_from_array` into a typed builder. + fn naive_rebuild_per_list_copy( + &self, + ) -> VortexResult { + let element_dtype = self + .dtype() + .as_list_element_opt() + .vortex_expect("somehow had a canonical list that was not a list"); + let offsets_canonical = self.offsets().to_primitive(); let offsets_slice = offsets_canonical.as_slice::(); let sizes_canonical = self.sizes().to_primitive(); @@ -124,8 +265,17 @@ impl ListViewArray { // null lists to 0. let mut new_sizes = BufferMut::::with_capacity(len); - // Collect all element indices for a single bulk take. - let mut take_indices = BufferMut::::with_capacity(self.elements().len()); + // Canonicalize the elements up front as we will be slicing the elements quite a lot. + let elements_canonical = self + .elements() + .to_canonical() + .vortex_expect("canonicalize elements for rebuild") + .into_array(); + + // Note that we do not know what the exact capacity should be of the new elements since + // there could be overlaps in the existing `ListViewArray`. 
+ let mut new_elements_builder = + builder_with_capacity(element_dtype.as_ref(), self.elements().len()); let mut n_elements = NewOffset::zero(); for index in 0..len { @@ -145,18 +295,14 @@ impl ListViewArray { new_offsets.push(n_elements); new_sizes.push(size); - take_indices.extend(start as u64..stop as u64); + new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?); n_elements += num_traits::cast(size).vortex_expect("Cast failed"); } let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - let elements = self - .elements() - .take(take_indices.into_array())? - .to_canonical()? - .into_array(); + let elements = new_elements_builder.finish(); debug_assert_eq!( n_elements.as_(), @@ -252,9 +398,13 @@ impl ListViewArray { } #[cfg(test)] +#[allow(clippy::cast_possible_truncation)] mod tests { + use rstest::rstest; use vortex_buffer::BitBuffer; + use vortex_dtype::DType; use vortex_dtype::Nullability; + use vortex_dtype::PType; use vortex_error::VortexResult; use super::ListViewRebuildMode; @@ -438,4 +588,81 @@ mod tests { ); Ok(()) } + + // ── should_use_per_list_copy heuristic tests ─────────────────────────── + + #[test] + fn heuristic_rejects_zero_lists() { + let prim = DType::Primitive(PType::I32, Nullability::NonNullable); + assert!(!ListViewArray::should_use_per_list_copy(&prim, 0, 0)); + } + + #[test] + fn heuristic_rejects_struct_always() { + let struct_dtype = DType::struct_( + [ + ("a", DType::Primitive(PType::I32, Nullability::NonNullable)), + ("b", DType::Primitive(PType::F64, Nullability::NonNullable)), + ], + Nullability::NonNullable, + ); + assert!(!ListViewArray::should_use_per_list_copy( + &struct_dtype, + 100_000, + 100 + )); + } + + #[test] + fn heuristic_accepts_fsl() { + use std::sync::Arc; + // FixedSizeList: threshold = (8+4)*64/2 = 384. + let fsl = DType::FixedSizeList( + Arc::new(DType::Primitive(PType::I32, Nullability::NonNullable)), + 4, + Nullability::NonNullable, + ); + assert!(!ListViewArray::should_use_per_list_copy( + &fsl, 256_000, 1_000 + )); + assert!(ListViewArray::should_use_per_list_copy( + &fsl, 384_000, 1_000 + )); + } + + #[test] + fn heuristic_accepts_list_of_fixed_width() { + let list_i32 = DType::list( + DType::Primitive(PType::I32, Nullability::NonNullable), + Nullability::NonNullable, + ); + // List: threshold = 1024. 
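+ // 512_000 total output elements over 1_000 lists is an average of 512, below the threshold;
+ // 1_024_000 over 1_000 averages exactly 1_024, which meets it.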
+ assert!(!ListViewArray::should_use_per_list_copy( + &list_i32, 512_000, 1_000 + )); + assert!(ListViewArray::should_use_per_list_copy( + &list_i32, 1_024_000, 1_000 + )); + } + + #[rstest] + #[case(PType::I32, 512, false)] // i32: threshold=(8+4)*64=768, 512<768 + #[case(PType::I32, 768, true)] // i32: 768>=768 + #[case(PType::I8, 512, false)] // i8: threshold=(8+1)*64=576, 512<576 + #[case(PType::I8, 576, true)] // i8: 576>=576 + #[case(PType::F64, 512, false)] // f64: threshold=(8+8)*64=1024, 512<1024 + #[case(PType::F64, 1024, true)] // f64: 1024>=1024 + fn heuristic_threshold( + #[case] ptype: PType, + #[case] avg: u64, + #[case] expect_per_list_copy: bool, + ) { + let dtype = DType::Primitive(ptype, Nullability::NonNullable); + let num_lists = 1000_usize; + let total = avg * num_lists as u64; + assert_eq!( + ListViewArray::should_use_per_list_copy(&dtype, total, num_lists), + expect_per_list_copy, + ); + } } From 793d4f7e0797074bb7c3244a7c2223eb537e8de5 Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 10:52:00 +0000 Subject: [PATCH 4/6] clearer benchmark function naming Signed-off-by: Baris Palaska --- vortex-array/benches/listview_rebuild.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index 52451e9e591..dec1f18fa41 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -5,9 +5,9 @@ //! //! The heuristic internally picks between bulk-take (single `take`) and per-list-copy (per-list //! `slice` + builder copy) strategies. These scenarios exercise both paths: -//! - `crossover` straddles the decision boundary -//! - `large_i8` and `small_overlap` exercise the extremes -//! - `varbinview` and `struct_elements` exercise special-case types +//! - `i32_at_heuristic_threshold` straddles the decision boundary +//! - `i8_large_lists_per_list_copy` and `i32_small_overlapping_bulk_take` exercise the extremes +//! 
- `varbinview_always_bulk_take` and `struct_always_bulk_take` exercise special-case types #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] @@ -99,42 +99,46 @@ fn make_struct_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewAr } // ── i32 around threshold (8+4)*64 = 768: exercises both strategies ────────── -const CROSSOVER_SIZES: &[usize] = &[512, 768, 1024]; +const HEURISTIC_THRESHOLD_SIZES: &[usize] = &[512, 768, 1024]; -#[divan::bench(args = CROSSOVER_SIZES)] -fn rebuild_crossover(bencher: Bencher, list_size: usize) { +#[divan::bench(args = HEURISTIC_THRESHOLD_SIZES)] +fn i32_at_heuristic_threshold(bencher: Bencher, list_size: usize) { let lv = make_primitive_lv(1_000, list_size, list_size); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── i8 with 65K-element lists: deep into per-list-copy territory ───────────── #[divan::bench] -fn rebuild_large_i8(bencher: Bencher) { +fn i8_large_lists_per_list_copy(bencher: Bencher) { let lv = make_i8_lv(1_000, 65_536, 65_536); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── i32 with 8-element overlapping lists: deep into bulk-take territory ────── #[divan::bench] -fn rebuild_small_overlap(bencher: Bencher) { +fn i32_small_overlapping_bulk_take(bencher: Bencher) { let lv = make_primitive_lv(1_000, 8, 1); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── VarBinView: variable-width always uses bulk-take ───────────────────────── #[divan::bench] -fn rebuild_varbinview(bencher: Bencher) { +fn varbinview_always_bulk_take(bencher: Bencher) { let lv = make_varbinview_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── Struct{i32, f64}: struct always uses bulk-take ─────────────────────────── #[divan::bench] -fn rebuild_struct(bencher: Bencher) { +fn struct_always_bulk_take(bencher: Bencher) { let lv = make_struct_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) From 8534d19f9551b3da5b55c7d8f8e9203df241962b Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 14:42:44 +0000 Subject: [PATCH 5/6] clarify comments, better FSL heuristic Signed-off-by: Baris Palaska Co-Authored-By: Claude Opus 4.6 --- vortex-array/benches/listview_rebuild.rs | 24 ++--- vortex-array/src/arrays/listview/rebuild.rs | 110 +++++++++++--------- 2 files changed, 73 insertions(+), 61 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index dec1f18fa41..46a8458cdc2 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -3,11 +3,11 @@ //! Benchmarks for ListView rebuild across different element types and scenarios. //! -//! The heuristic internally picks between bulk-take (single `take`) and per-list-copy (per-list -//! `slice` + builder copy) strategies. These scenarios exercise both paths: +//! The heuristic internally picks between `rebuild_with_take` (single bulk `take`) and +//! `rebuild_list_by_list` (per-list `slice` + builder copy). These scenarios exercise both: //! - `i32_at_heuristic_threshold` straddles the decision boundary -//! - `i8_large_lists_per_list_copy` and `i32_small_overlapping_bulk_take` exercise the extremes -//! - `varbinview_always_bulk_take` and `struct_always_bulk_take` exercise special-case types +//! 
- `i8_large_lists_list_by_list` and `i32_small_overlapping_take` exercise the extremes +//! - `varbinview_always_take` and `struct_always_take` exercise special-case types #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] @@ -109,36 +109,36 @@ fn i32_at_heuristic_threshold(bencher: Bencher, list_size: usize) { .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── i8 with 65K-element lists: deep into per-list-copy territory ───────────── +// ── i8 with 65K-element lists: deep into list-by-list territory ────────────── #[divan::bench] -fn i8_large_lists_per_list_copy(bencher: Bencher) { +fn i8_large_lists_list_by_list(bencher: Bencher) { let lv = make_i8_lv(1_000, 65_536, 65_536); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── i32 with 8-element overlapping lists: deep into bulk-take territory ────── +// ── i32 with 8-element overlapping lists: deep into take territory ─────────── #[divan::bench] -fn i32_small_overlapping_bulk_take(bencher: Bencher) { +fn i32_small_overlapping_take(bencher: Bencher) { let lv = make_primitive_lv(1_000, 8, 1); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── VarBinView: variable-width always uses bulk-take ───────────────────────── +// ── VarBinView: variable-width always uses take ────────────────────────────── #[divan::bench] -fn varbinview_always_bulk_take(bencher: Bencher) { +fn varbinview_always_take(bencher: Bencher) { let lv = make_varbinview_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── Struct{i32, f64}: struct always uses bulk-take ─────────────────────────── +// ── Struct{i32, f64}: struct always uses take ──────────────────────────────── #[divan::bench] -fn struct_always_bulk_take(bencher: Bencher) { +fn struct_always_take(bencher: Bencher) { let lv = make_struct_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 0b29e9ad509..a31af5304f0 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -104,8 +104,13 @@ impl ListViewArray { }) } - /// Picks between [`naive_rebuild_bulk_take`](Self::naive_rebuild_bulk_take) and - /// [`naive_rebuild_per_list_copy`](Self::naive_rebuild_per_list_copy) via heuristic. + /// Picks between [`rebuild_with_take`](Self::rebuild_with_take) and + /// [`rebuild_list_by_list`](Self::rebuild_list_by_list) via heuristic. + /// + /// Rebuilding list-by-list introduces per-list overhead (slice dispatch, builder calls), so + /// for many small lists it is faster to model the entire rebuild as a single bulk `take` over + /// a flat index array. Conversely, for large lists the sequential `memcpy` of list-by-list + /// rebuilding outperforms the index construction + SIMD gather of the take path. 
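+ ///
+ /// As a rough worked example of the crossover computed by `should_use_list_by_list`: for
+ /// `i32` elements (4 bytes) the threshold is `(8 + 4) * 64 = 768`, so lists averaging under
+ /// 768 elements go through the take path and lists at or above it are copied list-by-list.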
fn naive_rebuild( &self, ) -> VortexResult { @@ -119,27 +124,28 @@ impl ListViewArray { .iter() .map(|s| (*s).as_() as u64) .sum(); - let use_per_list_copy = Self::should_use_per_list_copy(element_dtype, total, self.len()); + let use_list_by_list = Self::should_use_list_by_list(element_dtype, total, self.len()); - if use_per_list_copy { - self.naive_rebuild_per_list_copy::() + if use_list_by_list { + self.rebuild_list_by_list::() } else { - self.naive_rebuild_bulk_take::() + self.rebuild_with_take::() } } - /// Decides whether the per-list-copy strategy should be used over bulk-take. + /// Decides whether [`rebuild_list_by_list`](Self::rebuild_list_by_list) should be used over + /// [`rebuild_with_take`](Self::rebuild_with_take). /// - /// Empirical heuristic: use per-list-copy when `avg_list_size >= (8 + E) * 64 / fsl_divisor`. - /// - `8` — extra per-element cost of bulk-take (writing a u64 index that per-list-copy avoids) + /// Empirical heuristic: use list-by-list when `avg_list_size >= (8 + E) * 64 / fsl_divisor`. + /// - `8` — extra per-element cost of take (writing a u64 index that list-by-list avoids) /// - `E` — element byte width; larger elements make the builder's per-call overhead /// (slice dispatch, `to_primitive()`, validity mask) relatively more expensive, - /// raising the threshold before per-list-copy's sequential memcpy wins - /// - `64` — base number of elements per list needed to amortize per-list-copy's ~200ns + /// raising the threshold before list-by-list's sequential memcpy wins + /// - `64` — base number of elements per list needed to amortize list-by-list's ~200ns /// per-list overhead (upfront canonicalize + per-list slice dispatch) - /// - `fsl_divisor` — for `FixedSizeList<_, N>`, `clamp(N, 1, 2)`: bulk-take expands each + /// - `fsl_divisor` — for `FixedSizeList<_, N>`, `clamp(N, 1, 2)`: take expands each /// outer index into N inner sub-indices, so the threshold is lowered - fn should_use_per_list_copy( + fn should_use_list_by_list( element_dtype: &DType, total_output_elements: u64, num_lists: usize, @@ -149,37 +155,43 @@ impl ListViewArray { } let avg = total_output_elements / num_lists as u64; match element_dtype { - // Struct: per-list-copy creates separate per-field builders and reconstructs - // the structure element-by-element, making it 10-40x slower than bulk-take at + // Struct: list-by-list creates separate per-field builders and reconstructs + // the structure element-by-element, making it 10-40x slower than take at // all tested sizes (2-field and 4-field). This overhead is fundamental to the // builder architecture and does not amortize with larger lists. DType::Struct(..) => false, - // FixedSizeList: bulk-take expands each outer index into N inner - // sub-indices, making it ~2x more expensive per element than flat types. - // We account for this by dividing the base threshold by clamp(N, 1, 2): - // - N >= 2: threshold halved (e.g. FSL → (8+4)*32 = 384) - // - N == 1: threshold unchanged (same as flat types) - // Uses the *inner* element size, not the total FSL size, because per-list-copy - // extends the inner flat buffer directly. - DType::FixedSizeList(inner, n, ..) => inner - .element_size() - .is_some_and(|e| avg >= (8 + e as u64) * 64 / (*n as u64).clamp(1, 2)), + // FixedSizeList: take expands each outer index into N inner + // sub-indices, so its cost grows with N. We lower the threshold by a + // divisor that grows as ~log₄(N): `min(N, max(2, ilog2(N) / 2))`. + // - N=1: divisor=1 (same as flat types) + // - N=2..63: divisor=2 (e.g. 
FSL → (8+4)*32 = 384) + // - N=64..255: divisor=3 (e.g. FSL → (8+4)*~21 = 256) + // - N=256+: divisor=4 + // `e` is the *inner* element size (e.g. 4 for i32), not the total FSL + // size, because list-by-list extends the inner flat buffer directly. + DType::FixedSizeList(inner, n, ..) => { + let n = (*n as u64).max(1); + let fsl_divisor = n.min((n.ilog2() as u64 / 2).max(2)); + inner + .element_size() + .is_some_and(|e| avg >= (8 + e as u64) * 64 / fsl_divisor) + } _ => { if let Some(e) = element_dtype.element_size() { // Flat fixed-width types (primitives, bools): the base formula. - // Both strategies copy E bytes per element, but bulk-take additionally - // writes an 8-byte u64 index and per-list-copy has per-list overhead + // Both strategies copy E bytes per element, but take additionally + // writes an 8-byte u64 index and list-by-list has per-list overhead // (~200ns for slice dispatch + builder calls). The (8 + E) factor // also captures the builder's higher per-element overhead vs SIMD - // bulk-take. Validated across i8/i16/i32/f64/bool. + // take. Validated across i8/i16/i32/f64/bool. avg >= (8 + e as u64) * 64 } else { - // List: per-list-copy does cheap offset slicing + bulk + // List: list-by-list does cheap offset slicing + bulk // inner element copy, crossing over at ~1024. We use a fixed threshold // because inner list sizes aren't cheaply observable here. - // Utf8/Binary: always bulk-take — their builder does expensive Arc + // Utf8/Binary: always take — their builder does expensive Arc // ref-count bumps and view metadata copying (crossover >50K). - // List: always bulk-take (same reason as Utf8/Binary). + // List: always take (same reason as Utf8/Binary). element_dtype .as_list_element_opt() .and_then(|d| d.element_size()) @@ -189,9 +201,9 @@ impl ListViewArray { } } - /// Rebuilds elements using the **bulk-take** strategy: collect all element indices into a flat - /// `BufferMut`, perform a single bulk `take`, then canonicalize. - fn naive_rebuild_bulk_take( + /// Rebuilds elements using a single bulk `take`: collect all element indices into a flat + /// `BufferMut`, perform a single `take`, then canonicalize. + fn rebuild_with_take( &self, ) -> VortexResult { let offsets_canonical = self.offsets().to_primitive(); @@ -232,7 +244,7 @@ impl ListViewArray { let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - // SAFETY: same invariants as `naive_rebuild_per_list_copy` — offsets are sequential and + // SAFETY: same invariants as `rebuild_list_by_list` — offsets are sequential and // non-overlapping, all (offset, size) pairs reference valid elements, and the validity // array is preserved from the original. Ok(unsafe { @@ -241,9 +253,9 @@ impl ListViewArray { }) } - /// Rebuilds elements using the **per-list-copy** strategy: canonicalize elements upfront, then - /// per-list `slice` + `extend_from_array` into a typed builder. - fn naive_rebuild_per_list_copy( + /// Rebuilds elements list-by-list: canonicalize elements upfront, then for each list `slice` + /// the relevant range and `extend_from_array` into a typed builder. 
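+ ///
+ /// For example, the `i8_large_lists_list_by_list` benchmark rebuilds lists of 65,536 `i8`
+ /// elements each, far above the `(8 + 1) * 64 = 576` threshold for `i8`, so rebuilds of that
+ /// shape land on this path.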
+ fn rebuild_list_by_list( &self, ) -> VortexResult { let element_dtype = self @@ -589,12 +601,12 @@ mod tests { Ok(()) } - // ── should_use_per_list_copy heuristic tests ─────────────────────────── + // ── should_use_list_by_list heuristic tests ─────────────────────────── #[test] fn heuristic_rejects_zero_lists() { let prim = DType::Primitive(PType::I32, Nullability::NonNullable); - assert!(!ListViewArray::should_use_per_list_copy(&prim, 0, 0)); + assert!(!ListViewArray::should_use_list_by_list(&prim, 0, 0)); } #[test] @@ -606,13 +618,13 @@ mod tests { ], Nullability::NonNullable, ); - assert!(!ListViewArray::should_use_per_list_copy( + assert!(!ListViewArray::should_use_list_by_list( &struct_dtype, 100_000, 100 )); } - + #[test] fn heuristic_accepts_fsl() { use std::sync::Arc; @@ -622,10 +634,10 @@ mod tests { 4, Nullability::NonNullable, ); - assert!(!ListViewArray::should_use_per_list_copy( + assert!(!ListViewArray::should_use_list_by_list( &fsl, 256_000, 1_000 )); - assert!(ListViewArray::should_use_per_list_copy( + assert!(ListViewArray::should_use_list_by_list( &fsl, 384_000, 1_000 )); } @@ -637,10 +649,10 @@ mod tests { Nullability::NonNullable, ); // List: threshold = 1024. - assert!(!ListViewArray::should_use_per_list_copy( + assert!(!ListViewArray::should_use_list_by_list( &list_i32, 512_000, 1_000 )); - assert!(ListViewArray::should_use_per_list_copy( + assert!(ListViewArray::should_use_list_by_list( &list_i32, 1_024_000, 1_000 )); } @@ -655,14 +667,14 @@ mod tests { fn heuristic_threshold( #[case] ptype: PType, #[case] avg: u64, - #[case] expect_per_list_copy: bool, + #[case] expect_list_by_list: bool, ) { let dtype = DType::Primitive(ptype, Nullability::NonNullable); let num_lists = 1000_usize; let total = avg * num_lists as u64; assert_eq!( - ListViewArray::should_use_per_list_copy(&dtype, total, num_lists), - expect_per_list_copy, + ListViewArray::should_use_list_by_list(&dtype, total, num_lists), + expect_list_by_list, ); } } From cfb5fe6c3ed151f4803bc20fd3b4b91615426910 Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 17:04:02 +0000 Subject: [PATCH 6/6] lazy Signed-off-by: Baris Palaska --- vortex-array/src/arrays/listview/rebuild.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index a31af5304f0..ab3db9ddf0b 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -202,7 +202,7 @@ impl ListViewArray { } /// Rebuilds elements using a single bulk `take`: collect all element indices into a flat - /// `BufferMut`, perform a single `take`, then canonicalize. + /// `BufferMut` and perform a single `take`. fn rebuild_with_take( &self, ) -> VortexResult { @@ -236,11 +236,7 @@ impl ListViewArray { n_elements += num_traits::cast(size).vortex_expect("Cast failed"); } - let elements = self - .elements() - .take(take_indices.into_array())? - .to_canonical()? - .into_array(); + let elements = self.elements().take(take_indices.into_array())?; let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); @@ -624,7 +620,7 @@ mod tests { 100 )); } - + #[test] fn heuristic_accepts_fsl() { use std::sync::Arc; @@ -637,9 +633,7 @@ mod tests { assert!(!ListViewArray::should_use_list_by_list( &fsl, 256_000, 1_000 )); - assert!(ListViewArray::should_use_list_by_list( - &fsl, 384_000, 1_000 - )); + assert!(ListViewArray::should_use_list_by_list(&fsl, 384_000, 1_000)); } #[test]
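A minimal usage sketch of the path these patches change, mirroring the benchmark setup. It
assumes the `make_primitive_lv` helper defined in benches/listview_rebuild.rs (a bench-local
helper, not public API), and the concrete sizes are illustrative only:

    use vortex_array::arrays::ListViewRebuildMode;

    // 512 i32 elements per list sits below the (8 + 4) * 64 = 768 heuristic threshold,
    // so this rebuild goes through the bulk-take path introduced in PATCH 1/6.
    let lv = make_primitive_lv(1_000, 512, 512);
    let rebuilt = lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap();

    // The rebuild preserves the list count; only the elements/offsets/sizes are rewritten.
    assert_eq!(rebuilt.len(), lv.len());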