From e5d6ea02b150bc29ebda9cf753a88f2a707b1d8c Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Fri, 13 Feb 2026 16:15:10 +0000 Subject: [PATCH 1/6] replace builder with bulk take in naive_rebuild Signed-off-by: Baris Palaska --- vortex-array/Cargo.toml | 4 ++ vortex-array/benches/listview_rebuild.rs | 79 +++++++++++++++++++++ vortex-array/src/arrays/listview/rebuild.rs | 28 ++------ 3 files changed, 90 insertions(+), 21 deletions(-) create mode 100644 vortex-array/benches/listview_rebuild.rs diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index c310b18eeed..4b1ebd5b332 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -169,5 +169,9 @@ harness = false name = "take_fsl" harness = false +[[bench]] +name = "listview_rebuild" +harness = false + [package.metadata.cargo-machete] ignored = ["getrandom_v03"] diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs new file mode 100644 index 00000000000..c502122f6a9 --- /dev/null +++ b/vortex-array/benches/listview_rebuild.rs @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Benchmarks for ListView rebuild with primitive and VarBinView elements. + +#![allow(clippy::unwrap_used)] +#![allow(clippy::cast_possible_truncation)] + +use divan::Bencher; +use vortex_array::IntoArray; +use vortex_array::arrays::ListViewArray; +use vortex_array::arrays::ListViewRebuildMode; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; + +fn main() { + divan::main(); +} + +const NUM_LISTS: &[usize] = &[100, 1_000, 10_000]; + +/// Build a ListViewArray of primitives with overlapping views. +fn make_primitive_listview(num_lists: usize) -> ListViewArray { + let element_count = num_lists * 4; + let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array(); + + let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +/// Build a ListViewArray of strings with overlapping views. 
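+/// Both constructors lay the views out with `offsets[i] = i * 2` and a fixed size of 4, so
+/// list `i` covers `elements[2*i..2*i + 4]` and adjacent lists share two elements: list 0
+/// covers elements 0..4, list 1 covers 2..6, list 2 covers 4..8, and so on (a rough sketch of
+/// the overlap pattern; the element values themselves differ between the two constructors).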
+fn make_varbinview_listview(num_lists: usize) -> ListViewArray { + let element_count = num_lists * 4; + let strings: Vec = (0..element_count) + .map(|i| { + if i % 3 == 0 { + format!("long-string-value-{i:06}") + } else { + format!("s{i}") + } + }) + .collect(); + let elements = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array(); + + let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +#[divan::bench(args = NUM_LISTS)] +fn rebuild_primitive(bencher: Bencher, num_lists: usize) { + let listview = make_primitive_listview(num_lists); + bencher + .with_inputs(|| &listview) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench(args = NUM_LISTS)] +fn rebuild_varbinview(bencher: Bencher, num_lists: usize) { + let listview = make_varbinview_listview(num_lists); + bencher + .with_inputs(|| &listview) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index d2714b671e6..7907296ef07 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -14,7 +14,6 @@ use crate::Array; use crate::IntoArray; use crate::ToCanonical; use crate::arrays::ListViewArray; -use crate::builders::builder_with_capacity; use crate::compute; use crate::vtable::ValidityHelper; @@ -103,18 +102,14 @@ impl ListViewArray { }) } - // TODO(connor)[ListView]: We should benchmark if it is faster to use `take` on the elements - // instead of using a builder. /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece /// by piece. + /// + /// Collects all element indices into a flat `BufferMut` and performs a single bulk + /// `take` to produce the new elements array. fn naive_rebuild( &self, ) -> VortexResult { - let element_dtype = self - .dtype() - .as_list_element_opt() - .vortex_expect("somehow had a canonical list that was not a list"); - let offsets_canonical = self.offsets().to_primitive(); let offsets_slice = offsets_canonical.as_slice::(); let sizes_canonical = self.sizes().to_primitive(); @@ -129,17 +124,8 @@ impl ListViewArray { // null lists to 0. let mut new_sizes = BufferMut::::with_capacity(len); - // Canonicalize the elements up front as we will be slicing the elements quite a lot. - let elements_canonical = self - .elements() - .to_canonical() - .vortex_expect("canonicalize elements for rebuild") - .into_array(); - - // Note that we do not know what the exact capacity should be of the new elements since - // there could be overlaps in the existing `ListViewArray`. - let mut new_elements_builder = - builder_with_capacity(element_dtype.as_ref(), self.elements().len()); + // Collect all element indices for a single bulk take. 
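+ // `self.elements().len()` is only an estimate of how many indices we will collect: overlapping
+ // views can push the true total above it, while unreferenced elements can leave it below. The
+ // buffer simply grows if the estimate is exceeded.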
+ let mut take_indices = BufferMut::::with_capacity(self.elements().len()); let mut n_elements = NewOffset::zero(); for index in 0..len { @@ -159,14 +145,14 @@ impl ListViewArray { new_offsets.push(n_elements); new_sizes.push(size); - new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?); + take_indices.extend(start as u64..stop as u64); n_elements += num_traits::cast(size).vortex_expect("Cast failed"); } let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - let elements = new_elements_builder.finish(); + let elements = self.elements().take(take_indices.into_array())?; debug_assert_eq!( n_elements.as_(), From dd4950771b099ce4b303ce282847d4fee67cfc9c Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Fri, 13 Feb 2026 16:40:24 +0000 Subject: [PATCH 2/6] to_canonical Signed-off-by: Baris Palaska --- vortex-array/benches/listview_rebuild.rs | 6 ------ vortex-array/src/arrays/listview/rebuild.rs | 6 +++++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index c502122f6a9..4fc4e15970d 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -21,14 +21,11 @@ fn main() { const NUM_LISTS: &[usize] = &[100, 1_000, 10_000]; -/// Build a ListViewArray of primitives with overlapping views. fn make_primitive_listview(num_lists: usize) -> ListViewArray { let element_count = num_lists * 4; let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); - ListViewArray::new( elements, offsets.into_array(), @@ -37,7 +34,6 @@ fn make_primitive_listview(num_lists: usize) -> ListViewArray { ) } -/// Build a ListViewArray of strings with overlapping views. fn make_varbinview_listview(num_lists: usize) -> ListViewArray { let element_count = num_lists * 4; let strings: Vec = (0..element_count) @@ -50,10 +46,8 @@ fn make_varbinview_listview(num_lists: usize) -> ListViewArray { }) .collect(); let elements = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); - ListViewArray::new( elements, offsets.into_array(), diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 7907296ef07..b9b619c496b 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -152,7 +152,11 @@ impl ListViewArray { let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - let elements = self.elements().take(take_indices.into_array())?; + let elements = self + .elements() + .take(take_indices.into_array())? + .to_canonical()? 
+ .into_array(); debug_assert_eq!( n_elements.as_(), From 66b9f69c67cd6f12192b55d2b4efd350ab75a98e Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 10:28:11 +0000 Subject: [PATCH 3/6] use both methods Signed-off-by: Baris Palaska --- vortex-array/benches/listview_rebuild.rs | 107 +++++++-- vortex-array/src/arrays/listview/rebuild.rs | 253 +++++++++++++++++++- 2 files changed, 328 insertions(+), 32 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index 4fc4e15970d..52451e9e591 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -1,7 +1,13 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Benchmarks for ListView rebuild with primitive and VarBinView elements. +//! Benchmarks for ListView rebuild across different element types and scenarios. +//! +//! The heuristic internally picks between bulk-take (single `take`) and per-list-copy (per-list +//! `slice` + builder copy) strategies. These scenarios exercise both paths: +//! - `crossover` straddles the decision boundary +//! - `large_i8` and `small_overlap` exercise the extremes +//! - `varbinview` and `struct_elements` exercise special-case types #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] @@ -11,21 +17,34 @@ use vortex_array::IntoArray; use vortex_array::arrays::ListViewArray; use vortex_array::arrays::ListViewRebuildMode; use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; use vortex_array::arrays::VarBinViewArray; use vortex_array::validity::Validity; use vortex_buffer::Buffer; +use vortex_dtype::FieldNames; fn main() { divan::main(); } -const NUM_LISTS: &[usize] = &[100, 1_000, 10_000]; - -fn make_primitive_listview(num_lists: usize) -> ListViewArray { - let element_count = num_lists * 4; +fn make_primitive_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); - let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +fn make_i8_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; + let elements = PrimitiveArray::from_iter((0..element_count).map(|i| i as i8)).into_array(); + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); ListViewArray::new( elements, offsets.into_array(), @@ -34,8 +53,8 @@ fn make_primitive_listview(num_lists: usize) -> ListViewArray { ) } -fn make_varbinview_listview(num_lists: usize) -> ListViewArray { - let element_count = num_lists * 4; +fn make_varbinview_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; let strings: Vec = (0..element_count) .map(|i| { if i % 3 == 0 { @@ -46,8 +65,8 @@ fn make_varbinview_listview(num_lists: usize) -> ListViewArray { }) .collect(); let elements = 
VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array(); - let offsets: Buffer = (0..num_lists).map(|i| (i * 2) as u32).collect(); - let sizes: Buffer = std::iter::repeat_n(4u32, num_lists).collect(); + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); ListViewArray::new( elements, offsets.into_array(), @@ -56,18 +75,68 @@ fn make_varbinview_listview(num_lists: usize) -> ListViewArray { ) } -#[divan::bench(args = NUM_LISTS)] -fn rebuild_primitive(bencher: Bencher, num_lists: usize) { - let listview = make_primitive_listview(num_lists); +fn make_struct_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray { + let element_count = step * num_lists + list_size; + let field_a = PrimitiveArray::from_iter(0..element_count as i32).into_array(); + let field_b = PrimitiveArray::from_iter((0..element_count).map(|i| i as f64)).into_array(); + let elements = StructArray::try_new( + FieldNames::from(["a", "b"]), + vec![field_a, field_b], + element_count, + Validity::NonNullable, + ) + .unwrap() + .into_array(); + + let offsets: Buffer = (0..num_lists).map(|i| (i * step) as u32).collect(); + let sizes: Buffer = std::iter::repeat_n(list_size as u32, num_lists).collect(); + ListViewArray::new( + elements, + offsets.into_array(), + sizes.into_array(), + Validity::NonNullable, + ) +} + +// ── i32 around threshold (8+4)*64 = 768: exercises both strategies ────────── +const CROSSOVER_SIZES: &[usize] = &[512, 768, 1024]; + +#[divan::bench(args = CROSSOVER_SIZES)] +fn rebuild_crossover(bencher: Bencher, list_size: usize) { + let lv = make_primitive_lv(1_000, list_size, list_size); + bencher + .with_inputs(|| &lv) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench] +fn rebuild_large_i8(bencher: Bencher) { + let lv = make_i8_lv(1_000, 65_536, 65_536); + bencher + .with_inputs(|| &lv) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench] +fn rebuild_small_overlap(bencher: Bencher) { + let lv = make_primitive_lv(1_000, 8, 1); + bencher + .with_inputs(|| &lv) + .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); +} + +#[divan::bench] +fn rebuild_varbinview(bencher: Bencher) { + let lv = make_varbinview_lv(1_000, 1_024, 1_024); bencher - .with_inputs(|| &listview) + .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -#[divan::bench(args = NUM_LISTS)] -fn rebuild_varbinview(bencher: Bencher, num_lists: usize) { - let listview = make_varbinview_listview(num_lists); +#[divan::bench] +fn rebuild_struct(bencher: Bencher) { + let lv = make_struct_lv(1_000, 1_024, 1_024); bencher - .with_inputs(|| &listview) + .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index b9b619c496b..0b29e9ad509 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -3,6 +3,7 @@ use num_traits::FromPrimitive; use vortex_buffer::BufferMut; +use vortex_dtype::DType; use vortex_dtype::IntegerPType; use vortex_dtype::Nullability; use vortex_dtype::match_each_integer_ptype; @@ -14,6 +15,7 @@ use crate::Array; use crate::IntoArray; use crate::ToCanonical; use crate::arrays::ListViewArray; +use 
crate::builders::builder_with_capacity; use crate::compute; use crate::vtable::ValidityHelper; @@ -102,14 +104,153 @@ impl ListViewArray { }) } - /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece - /// by piece. - /// - /// Collects all element indices into a flat `BufferMut` and performs a single bulk - /// `take` to produce the new elements array. + /// Picks between [`naive_rebuild_bulk_take`](Self::naive_rebuild_bulk_take) and + /// [`naive_rebuild_per_list_copy`](Self::naive_rebuild_per_list_copy) via heuristic. fn naive_rebuild( &self, ) -> VortexResult { + let element_dtype = self + .dtype() + .as_list_element_opt() + .vortex_expect("somehow had a canonical list that was not a list"); + let sizes_canonical = self.sizes().to_primitive(); + let total: u64 = sizes_canonical + .as_slice::() + .iter() + .map(|s| (*s).as_() as u64) + .sum(); + let use_per_list_copy = Self::should_use_per_list_copy(element_dtype, total, self.len()); + + if use_per_list_copy { + self.naive_rebuild_per_list_copy::() + } else { + self.naive_rebuild_bulk_take::() + } + } + + /// Decides whether the per-list-copy strategy should be used over bulk-take. + /// + /// Empirical heuristic: use per-list-copy when `avg_list_size >= (8 + E) * 64 / fsl_divisor`. + /// - `8` — extra per-element cost of bulk-take (writing a u64 index that per-list-copy avoids) + /// - `E` — element byte width; larger elements make the builder's per-call overhead + /// (slice dispatch, `to_primitive()`, validity mask) relatively more expensive, + /// raising the threshold before per-list-copy's sequential memcpy wins + /// - `64` — base number of elements per list needed to amortize per-list-copy's ~200ns + /// per-list overhead (upfront canonicalize + per-list slice dispatch) + /// - `fsl_divisor` — for `FixedSizeList<_, N>`, `clamp(N, 1, 2)`: bulk-take expands each + /// outer index into N inner sub-indices, so the threshold is lowered + fn should_use_per_list_copy( + element_dtype: &DType, + total_output_elements: u64, + num_lists: usize, + ) -> bool { + if num_lists == 0 { + return false; + } + let avg = total_output_elements / num_lists as u64; + match element_dtype { + // Struct: per-list-copy creates separate per-field builders and reconstructs + // the structure element-by-element, making it 10-40x slower than bulk-take at + // all tested sizes (2-field and 4-field). This overhead is fundamental to the + // builder architecture and does not amortize with larger lists. + DType::Struct(..) => false, + // FixedSizeList: bulk-take expands each outer index into N inner + // sub-indices, making it ~2x more expensive per element than flat types. + // We account for this by dividing the base threshold by clamp(N, 1, 2): + // - N >= 2: threshold halved (e.g. FSL → (8+4)*32 = 384) + // - N == 1: threshold unchanged (same as flat types) + // Uses the *inner* element size, not the total FSL size, because per-list-copy + // extends the inner flat buffer directly. + DType::FixedSizeList(inner, n, ..) => inner + .element_size() + .is_some_and(|e| avg >= (8 + e as u64) * 64 / (*n as u64).clamp(1, 2)), + _ => { + if let Some(e) = element_dtype.element_size() { + // Flat fixed-width types (primitives, bools): the base formula. + // Both strategies copy E bytes per element, but bulk-take additionally + // writes an 8-byte u64 index and per-list-copy has per-list overhead + // (~200ns for slice dispatch + builder calls). 
The (8 + E) factor + // also captures the builder's higher per-element overhead vs SIMD + // bulk-take. Validated across i8/i16/i32/f64/bool. + avg >= (8 + e as u64) * 64 + } else { + // List: per-list-copy does cheap offset slicing + bulk + // inner element copy, crossing over at ~1024. We use a fixed threshold + // because inner list sizes aren't cheaply observable here. + // Utf8/Binary: always bulk-take — their builder does expensive Arc + // ref-count bumps and view metadata copying (crossover >50K). + // List: always bulk-take (same reason as Utf8/Binary). + element_dtype + .as_list_element_opt() + .and_then(|d| d.element_size()) + .is_some_and(|_| avg >= 1024) + } + } + } + } + + /// Rebuilds elements using the **bulk-take** strategy: collect all element indices into a flat + /// `BufferMut`, perform a single bulk `take`, then canonicalize. + fn naive_rebuild_bulk_take( + &self, + ) -> VortexResult { + let offsets_canonical = self.offsets().to_primitive(); + let offsets_slice = offsets_canonical.as_slice::(); + let sizes_canonical = self.sizes().to_primitive(); + let sizes_slice = sizes_canonical.as_slice::(); + + let len = offsets_slice.len(); + + let mut new_offsets = BufferMut::::with_capacity(len); + let mut new_sizes = BufferMut::::with_capacity(len); + let mut take_indices = BufferMut::::with_capacity(self.elements().len()); + + let mut n_elements = NewOffset::zero(); + for index in 0..len { + if !self.is_valid(index)? { + new_offsets.push(n_elements); + new_sizes.push(S::zero()); + continue; + } + + let offset = offsets_slice[index]; + let size = sizes_slice[index]; + let start = offset.as_(); + let stop = start + size.as_(); + + new_offsets.push(n_elements); + new_sizes.push(size); + take_indices.extend(start as u64..stop as u64); + n_elements += num_traits::cast(size).vortex_expect("Cast failed"); + } + + let elements = self + .elements() + .take(take_indices.into_array())? + .to_canonical()? + .into_array(); + let offsets = new_offsets.into_array(); + let sizes = new_sizes.into_array(); + + // SAFETY: same invariants as `naive_rebuild_per_list_copy` — offsets are sequential and + // non-overlapping, all (offset, size) pairs reference valid elements, and the validity + // array is preserved from the original. + Ok(unsafe { + ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone()) + .with_zero_copy_to_list(true) + }) + } + + /// Rebuilds elements using the **per-list-copy** strategy: canonicalize elements upfront, then + /// per-list `slice` + `extend_from_array` into a typed builder. + fn naive_rebuild_per_list_copy( + &self, + ) -> VortexResult { + let element_dtype = self + .dtype() + .as_list_element_opt() + .vortex_expect("somehow had a canonical list that was not a list"); + let offsets_canonical = self.offsets().to_primitive(); let offsets_slice = offsets_canonical.as_slice::(); let sizes_canonical = self.sizes().to_primitive(); @@ -124,8 +265,17 @@ impl ListViewArray { // null lists to 0. let mut new_sizes = BufferMut::::with_capacity(len); - // Collect all element indices for a single bulk take. - let mut take_indices = BufferMut::::with_capacity(self.elements().len()); + // Canonicalize the elements up front as we will be slicing the elements quite a lot. + let elements_canonical = self + .elements() + .to_canonical() + .vortex_expect("canonicalize elements for rebuild") + .into_array(); + + // Note that we do not know what the exact capacity should be of the new elements since + // there could be overlaps in the existing `ListViewArray`. 
+ let mut new_elements_builder = + builder_with_capacity(element_dtype.as_ref(), self.elements().len()); let mut n_elements = NewOffset::zero(); for index in 0..len { @@ -145,18 +295,14 @@ impl ListViewArray { new_offsets.push(n_elements); new_sizes.push(size); - take_indices.extend(start as u64..stop as u64); + new_elements_builder.extend_from_array(&elements_canonical.slice(start..stop)?); n_elements += num_traits::cast(size).vortex_expect("Cast failed"); } let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - let elements = self - .elements() - .take(take_indices.into_array())? - .to_canonical()? - .into_array(); + let elements = new_elements_builder.finish(); debug_assert_eq!( n_elements.as_(), @@ -252,9 +398,13 @@ impl ListViewArray { } #[cfg(test)] +#[allow(clippy::cast_possible_truncation)] mod tests { + use rstest::rstest; use vortex_buffer::BitBuffer; + use vortex_dtype::DType; use vortex_dtype::Nullability; + use vortex_dtype::PType; use vortex_error::VortexResult; use super::ListViewRebuildMode; @@ -438,4 +588,81 @@ mod tests { ); Ok(()) } + + // ── should_use_per_list_copy heuristic tests ─────────────────────────── + + #[test] + fn heuristic_rejects_zero_lists() { + let prim = DType::Primitive(PType::I32, Nullability::NonNullable); + assert!(!ListViewArray::should_use_per_list_copy(&prim, 0, 0)); + } + + #[test] + fn heuristic_rejects_struct_always() { + let struct_dtype = DType::struct_( + [ + ("a", DType::Primitive(PType::I32, Nullability::NonNullable)), + ("b", DType::Primitive(PType::F64, Nullability::NonNullable)), + ], + Nullability::NonNullable, + ); + assert!(!ListViewArray::should_use_per_list_copy( + &struct_dtype, + 100_000, + 100 + )); + } + + #[test] + fn heuristic_accepts_fsl() { + use std::sync::Arc; + // FixedSizeList: threshold = (8+4)*64/2 = 384. + let fsl = DType::FixedSizeList( + Arc::new(DType::Primitive(PType::I32, Nullability::NonNullable)), + 4, + Nullability::NonNullable, + ); + assert!(!ListViewArray::should_use_per_list_copy( + &fsl, 256_000, 1_000 + )); + assert!(ListViewArray::should_use_per_list_copy( + &fsl, 384_000, 1_000 + )); + } + + #[test] + fn heuristic_accepts_list_of_fixed_width() { + let list_i32 = DType::list( + DType::Primitive(PType::I32, Nullability::NonNullable), + Nullability::NonNullable, + ); + // List: threshold = 1024. 
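+ // 512_000 total output elements over 1_000 lists is an average of 512, below the threshold;
+ // 1_024_000 over 1_000 averages exactly 1_024, which meets it.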
+ assert!(!ListViewArray::should_use_per_list_copy( + &list_i32, 512_000, 1_000 + )); + assert!(ListViewArray::should_use_per_list_copy( + &list_i32, 1_024_000, 1_000 + )); + } + + #[rstest] + #[case(PType::I32, 512, false)] // i32: threshold=(8+4)*64=768, 512<768 + #[case(PType::I32, 768, true)] // i32: 768>=768 + #[case(PType::I8, 512, false)] // i8: threshold=(8+1)*64=576, 512<576 + #[case(PType::I8, 576, true)] // i8: 576>=576 + #[case(PType::F64, 512, false)] // f64: threshold=(8+8)*64=1024, 512<1024 + #[case(PType::F64, 1024, true)] // f64: 1024>=1024 + fn heuristic_threshold( + #[case] ptype: PType, + #[case] avg: u64, + #[case] expect_per_list_copy: bool, + ) { + let dtype = DType::Primitive(ptype, Nullability::NonNullable); + let num_lists = 1000_usize; + let total = avg * num_lists as u64; + assert_eq!( + ListViewArray::should_use_per_list_copy(&dtype, total, num_lists), + expect_per_list_copy, + ); + } } From 793d4f7e0797074bb7c3244a7c2223eb537e8de5 Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 10:52:00 +0000 Subject: [PATCH 4/6] clearer benchmark function naming Signed-off-by: Baris Palaska --- vortex-array/benches/listview_rebuild.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index 52451e9e591..dec1f18fa41 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -5,9 +5,9 @@ //! //! The heuristic internally picks between bulk-take (single `take`) and per-list-copy (per-list //! `slice` + builder copy) strategies. These scenarios exercise both paths: -//! - `crossover` straddles the decision boundary -//! - `large_i8` and `small_overlap` exercise the extremes -//! - `varbinview` and `struct_elements` exercise special-case types +//! - `i32_at_heuristic_threshold` straddles the decision boundary +//! - `i8_large_lists_per_list_copy` and `i32_small_overlapping_bulk_take` exercise the extremes +//! 
- `varbinview_always_bulk_take` and `struct_always_bulk_take` exercise special-case types #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] @@ -99,42 +99,46 @@ fn make_struct_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewAr } // ── i32 around threshold (8+4)*64 = 768: exercises both strategies ────────── -const CROSSOVER_SIZES: &[usize] = &[512, 768, 1024]; +const HEURISTIC_THRESHOLD_SIZES: &[usize] = &[512, 768, 1024]; -#[divan::bench(args = CROSSOVER_SIZES)] -fn rebuild_crossover(bencher: Bencher, list_size: usize) { +#[divan::bench(args = HEURISTIC_THRESHOLD_SIZES)] +fn i32_at_heuristic_threshold(bencher: Bencher, list_size: usize) { let lv = make_primitive_lv(1_000, list_size, list_size); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── i8 with 65K-element lists: deep into per-list-copy territory ───────────── #[divan::bench] -fn rebuild_large_i8(bencher: Bencher) { +fn i8_large_lists_per_list_copy(bencher: Bencher) { let lv = make_i8_lv(1_000, 65_536, 65_536); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── i32 with 8-element overlapping lists: deep into bulk-take territory ────── #[divan::bench] -fn rebuild_small_overlap(bencher: Bencher) { +fn i32_small_overlapping_bulk_take(bencher: Bencher) { let lv = make_primitive_lv(1_000, 8, 1); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── VarBinView: variable-width always uses bulk-take ───────────────────────── #[divan::bench] -fn rebuild_varbinview(bencher: Bencher) { +fn varbinview_always_bulk_take(bencher: Bencher) { let lv = make_varbinview_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } +// ── Struct{i32, f64}: struct always uses bulk-take ─────────────────────────── #[divan::bench] -fn rebuild_struct(bencher: Bencher) { +fn struct_always_bulk_take(bencher: Bencher) { let lv = make_struct_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) From 8534d19f9551b3da5b55c7d8f8e9203df241962b Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 14:42:44 +0000 Subject: [PATCH 5/6] clarify comments, better FSL heuristic Signed-off-by: Baris Palaska Co-Authored-By: Claude Opus 4.6 --- vortex-array/benches/listview_rebuild.rs | 24 ++--- vortex-array/src/arrays/listview/rebuild.rs | 110 +++++++++++--------- 2 files changed, 73 insertions(+), 61 deletions(-) diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs index dec1f18fa41..46a8458cdc2 100644 --- a/vortex-array/benches/listview_rebuild.rs +++ b/vortex-array/benches/listview_rebuild.rs @@ -3,11 +3,11 @@ //! Benchmarks for ListView rebuild across different element types and scenarios. //! -//! The heuristic internally picks between bulk-take (single `take`) and per-list-copy (per-list -//! `slice` + builder copy) strategies. These scenarios exercise both paths: +//! The heuristic internally picks between `rebuild_with_take` (single bulk `take`) and +//! `rebuild_list_by_list` (per-list `slice` + builder copy). These scenarios exercise both: //! - `i32_at_heuristic_threshold` straddles the decision boundary -//! - `i8_large_lists_per_list_copy` and `i32_small_overlapping_bulk_take` exercise the extremes -//! - `varbinview_always_bulk_take` and `struct_always_bulk_take` exercise special-case types +//! 
- `i8_large_lists_list_by_list` and `i32_small_overlapping_take` exercise the extremes +//! - `varbinview_always_take` and `struct_always_take` exercise special-case types #![allow(clippy::unwrap_used)] #![allow(clippy::cast_possible_truncation)] @@ -109,36 +109,36 @@ fn i32_at_heuristic_threshold(bencher: Bencher, list_size: usize) { .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── i8 with 65K-element lists: deep into per-list-copy territory ───────────── +// ── i8 with 65K-element lists: deep into list-by-list territory ────────────── #[divan::bench] -fn i8_large_lists_per_list_copy(bencher: Bencher) { +fn i8_large_lists_list_by_list(bencher: Bencher) { let lv = make_i8_lv(1_000, 65_536, 65_536); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── i32 with 8-element overlapping lists: deep into bulk-take territory ────── +// ── i32 with 8-element overlapping lists: deep into take territory ─────────── #[divan::bench] -fn i32_small_overlapping_bulk_take(bencher: Bencher) { +fn i32_small_overlapping_take(bencher: Bencher) { let lv = make_primitive_lv(1_000, 8, 1); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── VarBinView: variable-width always uses bulk-take ───────────────────────── +// ── VarBinView: variable-width always uses take ────────────────────────────── #[divan::bench] -fn varbinview_always_bulk_take(bencher: Bencher) { +fn varbinview_always_take(bencher: Bencher) { let lv = make_varbinview_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap()); } -// ── Struct{i32, f64}: struct always uses bulk-take ─────────────────────────── +// ── Struct{i32, f64}: struct always uses take ──────────────────────────────── #[divan::bench] -fn struct_always_bulk_take(bencher: Bencher) { +fn struct_always_take(bencher: Bencher) { let lv = make_struct_lv(1_000, 1_024, 1_024); bencher .with_inputs(|| &lv) diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index 0b29e9ad509..a31af5304f0 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -104,8 +104,13 @@ impl ListViewArray { }) } - /// Picks between [`naive_rebuild_bulk_take`](Self::naive_rebuild_bulk_take) and - /// [`naive_rebuild_per_list_copy`](Self::naive_rebuild_per_list_copy) via heuristic. + /// Picks between [`rebuild_with_take`](Self::rebuild_with_take) and + /// [`rebuild_list_by_list`](Self::rebuild_list_by_list) via heuristic. + /// + /// Rebuilding list-by-list introduces per-list overhead (slice dispatch, builder calls), so + /// for many small lists it is faster to model the entire rebuild as a single bulk `take` over + /// a flat index array. Conversely, for large lists the sequential `memcpy` of list-by-list + /// rebuilding outperforms the index construction + SIMD gather of the take path. 
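+ ///
+ /// As a rough worked example of the crossover computed by `should_use_list_by_list`: for
+ /// `i32` elements (4 bytes) the threshold is `(8 + 4) * 64 = 768`, so lists averaging under
+ /// 768 elements go through the take path and lists at or above it are copied list-by-list.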
fn naive_rebuild( &self, ) -> VortexResult { @@ -119,27 +124,28 @@ impl ListViewArray { .iter() .map(|s| (*s).as_() as u64) .sum(); - let use_per_list_copy = Self::should_use_per_list_copy(element_dtype, total, self.len()); + let use_list_by_list = Self::should_use_list_by_list(element_dtype, total, self.len()); - if use_per_list_copy { - self.naive_rebuild_per_list_copy::() + if use_list_by_list { + self.rebuild_list_by_list::() } else { - self.naive_rebuild_bulk_take::() + self.rebuild_with_take::() } } - /// Decides whether the per-list-copy strategy should be used over bulk-take. + /// Decides whether [`rebuild_list_by_list`](Self::rebuild_list_by_list) should be used over + /// [`rebuild_with_take`](Self::rebuild_with_take). /// - /// Empirical heuristic: use per-list-copy when `avg_list_size >= (8 + E) * 64 / fsl_divisor`. - /// - `8` — extra per-element cost of bulk-take (writing a u64 index that per-list-copy avoids) + /// Empirical heuristic: use list-by-list when `avg_list_size >= (8 + E) * 64 / fsl_divisor`. + /// - `8` — extra per-element cost of take (writing a u64 index that list-by-list avoids) /// - `E` — element byte width; larger elements make the builder's per-call overhead /// (slice dispatch, `to_primitive()`, validity mask) relatively more expensive, - /// raising the threshold before per-list-copy's sequential memcpy wins - /// - `64` — base number of elements per list needed to amortize per-list-copy's ~200ns + /// raising the threshold before list-by-list's sequential memcpy wins + /// - `64` — base number of elements per list needed to amortize list-by-list's ~200ns /// per-list overhead (upfront canonicalize + per-list slice dispatch) - /// - `fsl_divisor` — for `FixedSizeList<_, N>`, `clamp(N, 1, 2)`: bulk-take expands each + /// - `fsl_divisor` — for `FixedSizeList<_, N>`, `clamp(N, 1, 2)`: take expands each /// outer index into N inner sub-indices, so the threshold is lowered - fn should_use_per_list_copy( + fn should_use_list_by_list( element_dtype: &DType, total_output_elements: u64, num_lists: usize, @@ -149,37 +155,43 @@ impl ListViewArray { } let avg = total_output_elements / num_lists as u64; match element_dtype { - // Struct: per-list-copy creates separate per-field builders and reconstructs - // the structure element-by-element, making it 10-40x slower than bulk-take at + // Struct: list-by-list creates separate per-field builders and reconstructs + // the structure element-by-element, making it 10-40x slower than take at // all tested sizes (2-field and 4-field). This overhead is fundamental to the // builder architecture and does not amortize with larger lists. DType::Struct(..) => false, - // FixedSizeList: bulk-take expands each outer index into N inner - // sub-indices, making it ~2x more expensive per element than flat types. - // We account for this by dividing the base threshold by clamp(N, 1, 2): - // - N >= 2: threshold halved (e.g. FSL → (8+4)*32 = 384) - // - N == 1: threshold unchanged (same as flat types) - // Uses the *inner* element size, not the total FSL size, because per-list-copy - // extends the inner flat buffer directly. - DType::FixedSizeList(inner, n, ..) => inner - .element_size() - .is_some_and(|e| avg >= (8 + e as u64) * 64 / (*n as u64).clamp(1, 2)), + // FixedSizeList: take expands each outer index into N inner + // sub-indices, so its cost grows with N. We lower the threshold by a + // divisor that grows as ~log₄(N): `min(N, max(2, ilog2(N) / 2))`. + // - N=1: divisor=1 (same as flat types) + // - N=2..63: divisor=2 (e.g. 
FSL → (8+4)*32 = 384) + // - N=64..255: divisor=3 (e.g. FSL → (8+4)*~21 = 256) + // - N=256+: divisor=4 + // `e` is the *inner* element size (e.g. 4 for i32), not the total FSL + // size, because list-by-list extends the inner flat buffer directly. + DType::FixedSizeList(inner, n, ..) => { + let n = (*n as u64).max(1); + let fsl_divisor = n.min((n.ilog2() as u64 / 2).max(2)); + inner + .element_size() + .is_some_and(|e| avg >= (8 + e as u64) * 64 / fsl_divisor) + } _ => { if let Some(e) = element_dtype.element_size() { // Flat fixed-width types (primitives, bools): the base formula. - // Both strategies copy E bytes per element, but bulk-take additionally - // writes an 8-byte u64 index and per-list-copy has per-list overhead + // Both strategies copy E bytes per element, but take additionally + // writes an 8-byte u64 index and list-by-list has per-list overhead // (~200ns for slice dispatch + builder calls). The (8 + E) factor // also captures the builder's higher per-element overhead vs SIMD - // bulk-take. Validated across i8/i16/i32/f64/bool. + // take. Validated across i8/i16/i32/f64/bool. avg >= (8 + e as u64) * 64 } else { - // List: per-list-copy does cheap offset slicing + bulk + // List: list-by-list does cheap offset slicing + bulk // inner element copy, crossing over at ~1024. We use a fixed threshold // because inner list sizes aren't cheaply observable here. - // Utf8/Binary: always bulk-take — their builder does expensive Arc + // Utf8/Binary: always take — their builder does expensive Arc // ref-count bumps and view metadata copying (crossover >50K). - // List: always bulk-take (same reason as Utf8/Binary). + // List: always take (same reason as Utf8/Binary). element_dtype .as_list_element_opt() .and_then(|d| d.element_size()) @@ -189,9 +201,9 @@ impl ListViewArray { } } - /// Rebuilds elements using the **bulk-take** strategy: collect all element indices into a flat - /// `BufferMut`, perform a single bulk `take`, then canonicalize. - fn naive_rebuild_bulk_take( + /// Rebuilds elements using a single bulk `take`: collect all element indices into a flat + /// `BufferMut`, perform a single `take`, then canonicalize. + fn rebuild_with_take( &self, ) -> VortexResult { let offsets_canonical = self.offsets().to_primitive(); @@ -232,7 +244,7 @@ impl ListViewArray { let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); - // SAFETY: same invariants as `naive_rebuild_per_list_copy` — offsets are sequential and + // SAFETY: same invariants as `rebuild_list_by_list` — offsets are sequential and // non-overlapping, all (offset, size) pairs reference valid elements, and the validity // array is preserved from the original. Ok(unsafe { @@ -241,9 +253,9 @@ impl ListViewArray { }) } - /// Rebuilds elements using the **per-list-copy** strategy: canonicalize elements upfront, then - /// per-list `slice` + `extend_from_array` into a typed builder. - fn naive_rebuild_per_list_copy( + /// Rebuilds elements list-by-list: canonicalize elements upfront, then for each list `slice` + /// the relevant range and `extend_from_array` into a typed builder. 
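+ ///
+ /// For example, the `i8_large_lists_list_by_list` benchmark rebuilds lists of 65,536 `i8`
+ /// elements each, far above the `(8 + 1) * 64 = 576` threshold for `i8`, so rebuilds of that
+ /// shape land on this path.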
+ fn rebuild_list_by_list( &self, ) -> VortexResult { let element_dtype = self @@ -589,12 +601,12 @@ mod tests { Ok(()) } - // ── should_use_per_list_copy heuristic tests ─────────────────────────── + // ── should_use_list_by_list heuristic tests ─────────────────────────── #[test] fn heuristic_rejects_zero_lists() { let prim = DType::Primitive(PType::I32, Nullability::NonNullable); - assert!(!ListViewArray::should_use_per_list_copy(&prim, 0, 0)); + assert!(!ListViewArray::should_use_list_by_list(&prim, 0, 0)); } #[test] @@ -606,13 +618,13 @@ mod tests { ], Nullability::NonNullable, ); - assert!(!ListViewArray::should_use_per_list_copy( + assert!(!ListViewArray::should_use_list_by_list( &struct_dtype, 100_000, 100 )); } - + #[test] fn heuristic_accepts_fsl() { use std::sync::Arc; @@ -622,10 +634,10 @@ mod tests { 4, Nullability::NonNullable, ); - assert!(!ListViewArray::should_use_per_list_copy( + assert!(!ListViewArray::should_use_list_by_list( &fsl, 256_000, 1_000 )); - assert!(ListViewArray::should_use_per_list_copy( + assert!(ListViewArray::should_use_list_by_list( &fsl, 384_000, 1_000 )); } @@ -637,10 +649,10 @@ mod tests { Nullability::NonNullable, ); // List: threshold = 1024. - assert!(!ListViewArray::should_use_per_list_copy( + assert!(!ListViewArray::should_use_list_by_list( &list_i32, 512_000, 1_000 )); - assert!(ListViewArray::should_use_per_list_copy( + assert!(ListViewArray::should_use_list_by_list( &list_i32, 1_024_000, 1_000 )); } @@ -655,14 +667,14 @@ mod tests { fn heuristic_threshold( #[case] ptype: PType, #[case] avg: u64, - #[case] expect_per_list_copy: bool, + #[case] expect_list_by_list: bool, ) { let dtype = DType::Primitive(ptype, Nullability::NonNullable); let num_lists = 1000_usize; let total = avg * num_lists as u64; assert_eq!( - ListViewArray::should_use_per_list_copy(&dtype, total, num_lists), - expect_per_list_copy, + ListViewArray::should_use_list_by_list(&dtype, total, num_lists), + expect_list_by_list, ); } } From cfb5fe6c3ed151f4803bc20fd3b4b91615426910 Mon Sep 17 00:00:00 2001 From: Baris Palaska Date: Mon, 16 Feb 2026 17:04:02 +0000 Subject: [PATCH 6/6] lazy Signed-off-by: Baris Palaska --- vortex-array/src/arrays/listview/rebuild.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs index a31af5304f0..ab3db9ddf0b 100644 --- a/vortex-array/src/arrays/listview/rebuild.rs +++ b/vortex-array/src/arrays/listview/rebuild.rs @@ -202,7 +202,7 @@ impl ListViewArray { } /// Rebuilds elements using a single bulk `take`: collect all element indices into a flat - /// `BufferMut`, perform a single `take`, then canonicalize. + /// `BufferMut` and perform a single `take`. fn rebuild_with_take( &self, ) -> VortexResult { @@ -236,11 +236,7 @@ impl ListViewArray { n_elements += num_traits::cast(size).vortex_expect("Cast failed"); } - let elements = self - .elements() - .take(take_indices.into_array())? - .to_canonical()? - .into_array(); + let elements = self.elements().take(take_indices.into_array())?; let offsets = new_offsets.into_array(); let sizes = new_sizes.into_array(); @@ -624,7 +620,7 @@ mod tests { 100 )); } - + #[test] fn heuristic_accepts_fsl() { use std::sync::Arc; @@ -637,9 +633,7 @@ mod tests { assert!(!ListViewArray::should_use_list_by_list( &fsl, 256_000, 1_000 )); - assert!(ListViewArray::should_use_list_by_list( - &fsl, 384_000, 1_000 - )); + assert!(ListViewArray::should_use_list_by_list(&fsl, 384_000, 1_000)); } #[test]
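A minimal usage sketch of the path these patches change, mirroring the benchmark setup. It
assumes the `make_primitive_lv` helper defined in benches/listview_rebuild.rs (a bench-local
helper, not public API), and the concrete sizes are illustrative only:

    use vortex_array::arrays::ListViewRebuildMode;

    // 512 i32 elements per list sits below the (8 + 4) * 64 = 768 heuristic threshold,
    // so this rebuild goes through the bulk-take path introduced in PATCH 1/6.
    let lv = make_primitive_lv(1_000, 512, 512);
    let rebuilt = lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap();

    // The rebuild preserves the list count; only the elements/offsets/sizes are rewritten.
    assert_eq!(rebuilt.len(), lv.len());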