diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
index c310b18eeed..4b1ebd5b332 100644
--- a/vortex-array/Cargo.toml
+++ b/vortex-array/Cargo.toml
@@ -169,5 +169,9 @@ harness = false
 name = "take_fsl"
 harness = false
 
+[[bench]]
+name = "listview_rebuild"
+harness = false
+
 [package.metadata.cargo-machete]
 ignored = ["getrandom_v03"]
diff --git a/vortex-array/benches/listview_rebuild.rs b/vortex-array/benches/listview_rebuild.rs
new file mode 100644
index 00000000000..dec1f18fa41
--- /dev/null
+++ b/vortex-array/benches/listview_rebuild.rs
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Benchmarks for ListView rebuild across different element types and scenarios.
+//!
+//! The heuristic internally picks between bulk-take (single `take`) and per-list-copy (per-list
+//! `slice` + builder copy) strategies. These scenarios exercise both paths:
+//! - `i32_at_heuristic_threshold` straddles the decision boundary
+//! - `i8_large_lists_per_list_copy` and `i32_small_overlapping_bulk_take` exercise the extremes
+//! - `varbinview_always_bulk_take` and `struct_always_bulk_take` exercise special-case types
+
+#![allow(clippy::unwrap_used)]
+#![allow(clippy::cast_possible_truncation)]
+
+use divan::Bencher;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ListViewArray;
+use vortex_array::arrays::ListViewRebuildMode;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::arrays::StructArray;
+use vortex_array::arrays::VarBinViewArray;
+use vortex_array::validity::Validity;
+use vortex_buffer::Buffer;
+use vortex_dtype::FieldNames;
+
+fn main() {
+    divan::main();
+}
+
+fn make_primitive_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray {
+    let element_count = step * num_lists + list_size;
+    let elements = PrimitiveArray::from_iter(0..element_count as i32).into_array();
+    let offsets: Buffer<u32> = (0..num_lists).map(|i| (i * step) as u32).collect();
+    let sizes: Buffer<u32> = std::iter::repeat_n(list_size as u32, num_lists).collect();
+    ListViewArray::new(
+        elements,
+        offsets.into_array(),
+        sizes.into_array(),
+        Validity::NonNullable,
+    )
+}
+
+fn make_i8_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray {
+    let element_count = step * num_lists + list_size;
+    let elements = PrimitiveArray::from_iter((0..element_count).map(|i| i as i8)).into_array();
+    let offsets: Buffer<u32> = (0..num_lists).map(|i| (i * step) as u32).collect();
+    let sizes: Buffer<u32> = std::iter::repeat_n(list_size as u32, num_lists).collect();
+    ListViewArray::new(
+        elements,
+        offsets.into_array(),
+        sizes.into_array(),
+        Validity::NonNullable,
+    )
+}
+
+fn make_varbinview_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray {
+    let element_count = step * num_lists + list_size;
+    let strings: Vec<String> = (0..element_count)
+        .map(|i| {
+            if i % 3 == 0 {
+                format!("long-string-value-{i:06}")
+            } else {
+                format!("s{i}")
+            }
+        })
+        .collect();
+    let elements = VarBinViewArray::from_iter_str(strings.iter().map(|s| s.as_str())).into_array();
+    let offsets: Buffer<u32> = (0..num_lists).map(|i| (i * step) as u32).collect();
+    let sizes: Buffer<u32> = std::iter::repeat_n(list_size as u32, num_lists).collect();
+    ListViewArray::new(
+        elements,
+        offsets.into_array(),
+        sizes.into_array(),
+        Validity::NonNullable,
+    )
+}
+
+fn make_struct_lv(num_lists: usize, list_size: usize, step: usize) -> ListViewArray {
+    let element_count = step * num_lists + list_size;
+    let field_a = PrimitiveArray::from_iter(0..element_count as i32).into_array();
+    let field_b = PrimitiveArray::from_iter((0..element_count).map(|i| i as f64)).into_array();
+    let elements = StructArray::try_new(
+        FieldNames::from(["a", "b"]),
+        vec![field_a, field_b],
+        element_count,
+        Validity::NonNullable,
+    )
+    .unwrap()
+    .into_array();
+
+    let offsets: Buffer<u32> = (0..num_lists).map(|i| (i * step) as u32).collect();
+    let sizes: Buffer<u32> = std::iter::repeat_n(list_size as u32, num_lists).collect();
+    ListViewArray::new(
+        elements,
+        offsets.into_array(),
+        sizes.into_array(),
+        Validity::NonNullable,
+    )
+}
+
+// ── i32 around threshold (8+4)*64 = 768: exercises both strategies ──────────
+const HEURISTIC_THRESHOLD_SIZES: &[usize] = &[512, 768, 1024];
+
+#[divan::bench(args = HEURISTIC_THRESHOLD_SIZES)]
+fn i32_at_heuristic_threshold(bencher: Bencher, list_size: usize) {
+    let lv = make_primitive_lv(1_000, list_size, list_size);
+    bencher
+        .with_inputs(|| &lv)
+        .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap());
+}
+
+// ── i8 with 65K-element lists: deep into per-list-copy territory ─────────────
+#[divan::bench]
+fn i8_large_lists_per_list_copy(bencher: Bencher) {
+    let lv = make_i8_lv(1_000, 65_536, 65_536);
+    bencher
+        .with_inputs(|| &lv)
+        .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap());
+}
+
+// ── i32 with 8-element overlapping lists: deep into bulk-take territory ──────
+#[divan::bench]
+fn i32_small_overlapping_bulk_take(bencher: Bencher) {
+    let lv = make_primitive_lv(1_000, 8, 1);
+    bencher
+        .with_inputs(|| &lv)
+        .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap());
+}
+
+// ── VarBinView: variable-width always uses bulk-take ─────────────────────────
+#[divan::bench]
+fn varbinview_always_bulk_take(bencher: Bencher) {
+    let lv = make_varbinview_lv(1_000, 1_024, 1_024);
+    bencher
+        .with_inputs(|| &lv)
+        .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap());
+}
+
+// ── Struct{i32, f64}: struct always uses bulk-take ───────────────────────────
+#[divan::bench]
+fn struct_always_bulk_take(bencher: Bencher) {
+    let lv = make_struct_lv(1_000, 1_024, 1_024);
+    bencher
+        .with_inputs(|| &lv)
+        .bench_refs(|lv| lv.rebuild(ListViewRebuildMode::MakeZeroCopyToList).unwrap());
+}
diff --git a/vortex-array/src/arrays/listview/rebuild.rs b/vortex-array/src/arrays/listview/rebuild.rs
index d2714b671e6..0b29e9ad509 100644
--- a/vortex-array/src/arrays/listview/rebuild.rs
+++ b/vortex-array/src/arrays/listview/rebuild.rs
@@ -3,6 +3,7 @@
 use num_traits::FromPrimitive;
 use vortex_buffer::BufferMut;
+use vortex_dtype::DType;
 use vortex_dtype::IntegerPType;
 use vortex_dtype::Nullability;
 use vortex_dtype::match_each_integer_ptype;
@@ -103,12 +104,147 @@ impl ListViewArray {
         })
     }
 
-    // TODO(connor)[ListView]: We should benchmark if it is faster to use `take` on the elements
-    // instead of using a builder.
-    /// The inner function for `rebuild_zero_copy_to_list`, which rebuilds a `ListViewArray` piece
-    /// by piece.
+    /// Picks between [`naive_rebuild_bulk_take`](Self::naive_rebuild_bulk_take) and
+    /// [`naive_rebuild_per_list_copy`](Self::naive_rebuild_per_list_copy) via heuristic.
     fn naive_rebuild(
         &self,
     ) -> VortexResult {
+        let element_dtype = self
+            .dtype()
+            .as_list_element_opt()
+            .vortex_expect("somehow had a canonical list that was not a list");
+        let sizes_canonical = self.sizes().to_primitive();
+        let total: u64 = sizes_canonical
+            .as_slice::()
+            .iter()
+            .map(|s| (*s).as_() as u64)
+            .sum();
+        let use_per_list_copy = Self::should_use_per_list_copy(element_dtype, total, self.len());
+
+        if use_per_list_copy {
+            self.naive_rebuild_per_list_copy::()
+        } else {
+            self.naive_rebuild_bulk_take::()
+        }
+    }
+
+    /// Decides whether the per-list-copy strategy should be used over bulk-take.
+    ///
+    /// Empirical heuristic: use per-list-copy when `avg_list_size >= (8 + E) * 64 / fsl_divisor`.
+    /// - `8` — extra per-element cost of bulk-take (writing a u64 index that per-list-copy avoids)
+    /// - `E` — element byte width; larger elements make the builder's per-call overhead
+    ///   (slice dispatch, `to_primitive()`, validity mask) relatively more expensive,
+    ///   raising the threshold before per-list-copy's sequential memcpy wins
+    /// - `64` — base number of elements per list needed to amortize per-list-copy's ~200ns
+    ///   per-list overhead (upfront canonicalize + per-list slice dispatch)
+    /// - `fsl_divisor` — for `FixedSizeList<_, N>`, `clamp(N, 1, 2)`: bulk-take expands each
+    ///   outer index into N inner sub-indices, so the threshold is lowered
+    fn should_use_per_list_copy(
+        element_dtype: &DType,
+        total_output_elements: u64,
+        num_lists: usize,
+    ) -> bool {
+        if num_lists == 0 {
+            return false;
+        }
+        let avg = total_output_elements / num_lists as u64;
+        match element_dtype {
+            // Struct: per-list-copy creates separate per-field builders and reconstructs
+            // the structure element-by-element, making it 10-40x slower than bulk-take at
+            // all tested sizes (2-field and 4-field). This overhead is fundamental to the
+            // builder architecture and does not amortize with larger lists.
+            DType::Struct(..) => false,
+            // FixedSizeList: bulk-take expands each outer index into N inner
+            // sub-indices, making it ~2x more expensive per element than flat types.
+            // We account for this by dividing the base threshold by clamp(N, 1, 2):
+            // - N >= 2: threshold halved (e.g. FSL of i32 → (8+4)*32 = 384)
+            // - N == 1: threshold unchanged (same as flat types)
+            // Uses the *inner* element size, not the total FSL size, because per-list-copy
+            // extends the inner flat buffer directly.
+            DType::FixedSizeList(inner, n, ..) => inner
+                .element_size()
+                .is_some_and(|e| avg >= (8 + e as u64) * 64 / (*n as u64).clamp(1, 2)),
+            _ => {
+                if let Some(e) = element_dtype.element_size() {
+                    // Flat fixed-width types (primitives, bools): the base formula.
+                    // Both strategies copy E bytes per element, but bulk-take additionally
+                    // writes an 8-byte u64 index and per-list-copy has per-list overhead
+                    // (~200ns for slice dispatch + builder calls). The (8 + E) factor
+                    // also captures the builder's higher per-element overhead vs SIMD
+                    // bulk-take. Validated across i8/i16/i32/f64/bool.
+                    avg >= (8 + e as u64) * 64
+                } else {
+                    // List of fixed-width elements: per-list-copy does cheap offset slicing + bulk
+                    // inner element copy, crossing over at ~1024. We use a fixed threshold
+                    // because inner list sizes aren't cheaply observable here.
+                    // Utf8/Binary: always bulk-take — their builder does expensive Arc
+                    // ref-count bumps and view metadata copying (crossover >50K).
+                    // List of variable-width elements: always bulk-take (same reason as Utf8/Binary).
+                    element_dtype
+                        .as_list_element_opt()
+                        .and_then(|d| d.element_size())
+                        .is_some_and(|_| avg >= 1024)
+                }
+            }
+        }
+    }
+
+    /// Rebuilds elements using the **bulk-take** strategy: collect all element indices into a flat
+    /// `BufferMut`, perform a single bulk `take`, then canonicalize.
+    fn naive_rebuild_bulk_take(
+        &self,
+    ) -> VortexResult {
+        let offsets_canonical = self.offsets().to_primitive();
+        let offsets_slice = offsets_canonical.as_slice::();
+        let sizes_canonical = self.sizes().to_primitive();
+        let sizes_slice = sizes_canonical.as_slice::();
+
+        let len = offsets_slice.len();
+
+        let mut new_offsets = BufferMut::::with_capacity(len);
+        let mut new_sizes = BufferMut::::with_capacity(len);
+        let mut take_indices = BufferMut::::with_capacity(self.elements().len());
+
+        let mut n_elements = NewOffset::zero();
+        for index in 0..len {
+            if !self.is_valid(index)? {
+                new_offsets.push(n_elements);
+                new_sizes.push(S::zero());
+                continue;
+            }
+
+            let offset = offsets_slice[index];
+            let size = sizes_slice[index];
+            let start = offset.as_();
+            let stop = start + size.as_();
+
+            new_offsets.push(n_elements);
+            new_sizes.push(size);
+            take_indices.extend(start as u64..stop as u64);
+            n_elements += num_traits::cast(size).vortex_expect("Cast failed");
+        }
+
+        let elements = self
+            .elements()
+            .take(take_indices.into_array())?
+            .to_canonical()?
+            .into_array();
+        let offsets = new_offsets.into_array();
+        let sizes = new_sizes.into_array();
+
+        // SAFETY: same invariants as `naive_rebuild_per_list_copy` — offsets are sequential and
+        // non-overlapping, all (offset, size) pairs reference valid elements, and the validity
+        // array is preserved from the original.
+        Ok(unsafe {
+            ListViewArray::new_unchecked(elements, offsets, sizes, self.validity.clone())
+                .with_zero_copy_to_list(true)
+        })
+    }
+
+    /// Rebuilds elements using the **per-list-copy** strategy: canonicalize elements upfront, then
+    /// per-list `slice` + `extend_from_array` into a typed builder.
+    fn naive_rebuild_per_list_copy(
+        &self,
     ) -> VortexResult {
         let element_dtype = self
             .dtype()
@@ -262,9 +398,13 @@ impl ListViewArray {
 }
 
 #[cfg(test)]
+#[allow(clippy::cast_possible_truncation)]
 mod tests {
+    use rstest::rstest;
     use vortex_buffer::BitBuffer;
+    use vortex_dtype::DType;
     use vortex_dtype::Nullability;
+    use vortex_dtype::PType;
     use vortex_error::VortexResult;
 
     use super::ListViewRebuildMode;
@@ -448,4 +588,81 @@ mod tests {
         );
         Ok(())
     }
+
+    // ── should_use_per_list_copy heuristic tests ───────────────────────────
+
+    #[test]
+    fn heuristic_rejects_zero_lists() {
+        let prim = DType::Primitive(PType::I32, Nullability::NonNullable);
+        assert!(!ListViewArray::should_use_per_list_copy(&prim, 0, 0));
+    }
+
+    #[test]
+    fn heuristic_rejects_struct_always() {
+        let struct_dtype = DType::struct_(
+            [
+                ("a", DType::Primitive(PType::I32, Nullability::NonNullable)),
+                ("b", DType::Primitive(PType::F64, Nullability::NonNullable)),
+            ],
+            Nullability::NonNullable,
+        );
+        assert!(!ListViewArray::should_use_per_list_copy(
+            &struct_dtype,
+            100_000,
+            100
+        ));
+    }
+
+    #[test]
+    fn heuristic_accepts_fsl() {
+        use std::sync::Arc;
+        // FixedSizeList: threshold = (8+4)*64/2 = 384.
+        let fsl = DType::FixedSizeList(
+            Arc::new(DType::Primitive(PType::I32, Nullability::NonNullable)),
+            4,
+            Nullability::NonNullable,
+        );
+        assert!(!ListViewArray::should_use_per_list_copy(
+            &fsl, 256_000, 1_000
+        ));
+        assert!(ListViewArray::should_use_per_list_copy(
+            &fsl, 384_000, 1_000
+        ));
+    }
+
+    #[test]
+    fn heuristic_accepts_list_of_fixed_width() {
+        let list_i32 = DType::list(
+            DType::Primitive(PType::I32, Nullability::NonNullable),
+            Nullability::NonNullable,
+        );
+        // List: threshold = 1024.
+        assert!(!ListViewArray::should_use_per_list_copy(
+            &list_i32, 512_000, 1_000
+        ));
+        assert!(ListViewArray::should_use_per_list_copy(
+            &list_i32, 1_024_000, 1_000
+        ));
+    }
+
+    #[rstest]
+    #[case(PType::I32, 512, false)] // i32: threshold=(8+4)*64=768, 512<768
+    #[case(PType::I32, 768, true)] // i32: 768>=768
+    #[case(PType::I8, 512, false)] // i8: threshold=(8+1)*64=576, 512<576
+    #[case(PType::I8, 576, true)] // i8: 576>=576
+    #[case(PType::F64, 512, false)] // f64: threshold=(8+8)*64=1024, 512<1024
+    #[case(PType::F64, 1024, true)] // f64: 1024>=1024
+    fn heuristic_threshold(
+        #[case] ptype: PType,
+        #[case] avg: u64,
+        #[case] expect_per_list_copy: bool,
+    ) {
+        let dtype = DType::Primitive(ptype, Nullability::NonNullable);
+        let num_lists = 1000_usize;
+        let total = avg * num_lists as u64;
+        assert_eq!(
+            ListViewArray::should_use_per_list_copy(&dtype, total, num_lists),
+            expect_per_list_copy,
+        );
+    }
 }
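
Note: as a rough cross-check of the heuristic documented on `should_use_per_list_copy`, the standalone sketch below recomputes the thresholds exercised by the benchmarks and unit tests above. It is illustrative only and not part of the patch; `per_list_copy_threshold`, `element_bytes`, and `fsl_width` are hypothetical names.

    // Sketch of the documented decision rule: per-list-copy wins once
    // avg_list_size >= (8 + E) * 64 / fsl_divisor, where E is the element byte
    // width and fsl_divisor is clamp(N, 1, 2) for FixedSizeList (1 otherwise).
    fn per_list_copy_threshold(element_bytes: u64, fsl_width: Option<u64>) -> u64 {
        let divisor = fsl_width.map_or(1, |n| n.clamp(1, 2));
        (8 + element_bytes) * 64 / divisor
    }

    fn main() {
        // These match the values used in the benchmarks and rstest cases:
        assert_eq!(per_list_copy_threshold(1, None), 576); // i8
        assert_eq!(per_list_copy_threshold(4, None), 768); // i32
        assert_eq!(per_list_copy_threshold(8, None), 1024); // f64
        assert_eq!(per_list_copy_threshold(4, Some(4)), 384); // FixedSizeList of i32
        println!("thresholds check out");
    }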
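Assuming the usual Cargo workspace layout, the suite registered in vortex-array/Cargo.toml should run with `cargo bench -p vortex-array --bench listview_rebuild`; divan benchmarks can typically be narrowed to one case by passing its name (e.g. `i32_at_heuristic_threshold`) as a trailing filter argument.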