diff --git a/encodings/runend/src/array.rs b/encodings/runend/src/array.rs index 90f271cdc7d..8449431dac3 100644 --- a/encodings/runend/src/array.rs +++ b/encodings/runend/src/array.rs @@ -215,6 +215,21 @@ impl RunEndArray { return Ok(()); } + debug_assert!({ + // Run ends must be strictly sorted for binary search to work correctly. + let pre_validation = ends.statistics().to_owned(); + + let is_sorted = ends + .statistics() + .compute_is_strict_sorted() + .unwrap_or(false); + + // Preserve the original statistics since compute_is_strict_sorted may have mutated them. + // We don't want to run with different stats in debug mode and outside. + ends.statistics().inherit(pre_validation.iter()); + is_sorted + }); + // Skip host-only validation when ends are not host-resident. if !ends.is_host() { return Ok(()); diff --git a/encodings/sequence/src/array.rs b/encodings/sequence/src/array.rs index 074c7f3b99b..efa8a2c12b1 100644 --- a/encodings/sequence/src/array.rs +++ b/encodings/sequence/src/array.rs @@ -15,8 +15,11 @@ use vortex_array::ProstMetadata; use vortex_array::SerializeMetadata; use vortex_array::arrays::PrimitiveArray; use vortex_array::buffer::BufferHandle; +use vortex_array::expr::stats::Precision as StatPrecision; +use vortex_array::expr::stats::Stat; use vortex_array::serde::ArrayChildren; use vortex_array::stats::ArrayStats; +use vortex_array::stats::StatsSet; use vortex_array::stats::StatsSetRef; use vortex_array::validity::Validity; use vortex_array::vtable; @@ -127,13 +130,30 @@ impl SequenceArray { length: usize, ) -> Self { let dtype = DType::Primitive(ptype, nullability); + + // A sequence A[i] = base + i * multiplier is sorted iff multiplier >= 0, + // and strictly sorted iff multiplier > 0. + let m_int = multiplier.cast::(); + let is_sorted = m_int >= 0; + let is_strict_sorted = m_int > 0; + + // SAFETY: we don't have duplicate stats + let stats_set = unsafe { + StatsSet::new_unchecked(vec![ + (Stat::IsSorted, StatPrecision::Exact(is_sorted.into())), + ( + Stat::IsStrictSorted, + StatPrecision::Exact(is_strict_sorted.into()), + ), + ]) + }; + Self { base, multiplier, dtype, len: length, - // TODO(joe): add stats, on construct or on use? - stats_set: Default::default(), + stats_set: ArrayStats::from(stats_set), } } @@ -393,7 +413,11 @@ impl SequenceVTable { mod tests { use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; + use vortex_array::expr::stats::Precision as StatPrecision; + use vortex_array::expr::stats::Stat; + use vortex_array::expr::stats::StatsProviderExt; use vortex_dtype::Nullability; + use vortex_error::VortexResult; use vortex_scalar::Scalar; use vortex_scalar::ScalarValue; @@ -444,4 +468,52 @@ mod tests { assert!(SequenceArray::typed_new(127i8, 1i8, Nullability::NonNullable, 2).is_err()); assert!(SequenceArray::typed_new(-128i8, -1i8, Nullability::NonNullable, 2).is_err()); } + + #[test] + fn positive_multiplier_is_strict_sorted() -> VortexResult<()> { + let arr = SequenceArray::typed_new(0i64, 3, Nullability::NonNullable, 4)?; + + let is_sorted = arr + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsSorted)); + assert_eq!(is_sorted, Some(StatPrecision::Exact(true))); + + let is_strict_sorted = arr + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsStrictSorted)); + assert_eq!(is_strict_sorted, Some(StatPrecision::Exact(true))); + Ok(()) + } + + #[test] + fn zero_multiplier_is_sorted_not_strict() -> VortexResult<()> { + let arr = SequenceArray::typed_new(5i64, 0, Nullability::NonNullable, 4)?; + + let is_sorted = arr + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsSorted)); + assert_eq!(is_sorted, Some(StatPrecision::Exact(true))); + + let is_strict_sorted = arr + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsStrictSorted)); + assert_eq!(is_strict_sorted, Some(StatPrecision::Exact(false))); + Ok(()) + } + + #[test] + fn negative_multiplier_not_sorted() -> VortexResult<()> { + let arr = SequenceArray::typed_new(10i64, -1, Nullability::NonNullable, 4)?; + + let is_sorted = arr + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsSorted)); + assert_eq!(is_sorted, Some(StatPrecision::Exact(false))); + + let is_strict_sorted = arr + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsStrictSorted)); + assert_eq!(is_strict_sorted, Some(StatPrecision::Exact(false))); + Ok(()) + } } diff --git a/vortex-array/src/arrays/validation_tests.rs b/vortex-array/src/arrays/validation_tests.rs index 5d539c8e6d3..31906dd27f2 100644 --- a/vortex-array/src/arrays/validation_tests.rs +++ b/vortex-array/src/arrays/validation_tests.rs @@ -98,8 +98,9 @@ mod tests { } #[test] - fn test_varbin_array_validation_failure_offsets_not_monotonic() { - // Invalid case: offsets are not monotonically increasing. + fn test_varbin_array_validation_non_monotonic_offsets_accepted() { + // VarBin does not validate monotonicity of offsets at construction time. + // Sortedness is enforced at the builder level instead. let offsets = buffer![0i32, 3, 2, 5].into_array(); // 3 -> 2 is decreasing. let bytes = ByteBuffer::from(vec![0u8, 1, 2, 3, 4]); let result = VarBinArray::try_new( @@ -109,8 +110,7 @@ mod tests { Validity::NonNullable, ); - assert!(matches!(result, Err(VortexError::InvalidArgument(_, _)))); - assert!(result.is_err()); + assert!(result.is_ok()); } #[test] diff --git a/vortex-array/src/arrays/varbin/array.rs b/vortex-array/src/arrays/varbin/array.rs index c28e59db115..23e6442d59c 100644 --- a/vortex-array/src/arrays/varbin/array.rs +++ b/vortex-array/src/arrays/varbin/array.rs @@ -200,11 +200,6 @@ impl VarBinArray { "Offsets must have at least one element" ); - // Check offsets are sorted - if let Some(is_sorted) = offsets.statistics().compute_is_sorted() { - vortex_ensure!(is_sorted, "offsets must be sorted"); - } - // Skip host-only validation when offsets/bytes are not host-resident. if offsets.is_host() && bytes.is_on_host() { let last_offset = offsets diff --git a/vortex-array/src/arrays/varbin/builder.rs b/vortex-array/src/arrays/varbin/builder.rs index 2c81676be78..f57f815fa77 100644 --- a/vortex-array/src/arrays/varbin/builder.rs +++ b/vortex-array/src/arrays/varbin/builder.rs @@ -11,6 +11,8 @@ use vortex_error::vortex_panic; use crate::IntoArray; use crate::arrays::primitive::PrimitiveArray; use crate::arrays::varbin::VarBinArray; +use crate::expr::stats::Precision; +use crate::expr::stats::Stat; use crate::validity::Validity; pub struct VarBinBuilder { @@ -94,6 +96,17 @@ impl VarBinBuilder { let validity = Validity::from_bit_buffer(nulls, dtype.nullability()); + // The builder guarantees offsets are monotonically increasing, so we can set + // this stat eagerly. This avoids an O(n) recomputation when the array is + // deserialized and VarBinArray::validate checks sortedness. + debug_assert!( + offsets.statistics().compute_is_sorted().unwrap_or(false), + "VarBinBuilder offsets must be sorted" + ); + offsets + .statistics() + .set(Stat::IsSorted, Precision::Exact(true.into())); + // SAFETY: The builder maintains all invariants: // - Offsets are monotonically increasing starting from 0 (guaranteed by builder logic). // - Bytes buffer contains exactly the data referenced by offsets. @@ -109,9 +122,13 @@ impl VarBinBuilder { mod tests { use vortex_dtype::DType; use vortex_dtype::Nullability::Nullable; + use vortex_error::VortexResult; use vortex_scalar::Scalar; use crate::arrays::varbin::builder::VarBinBuilder; + use crate::expr::stats::Precision; + use crate::expr::stats::Stat; + use crate::expr::stats::StatsProviderExt; #[test] fn test_builder() { @@ -129,4 +146,33 @@ mod tests { ); assert!(array.scalar_at(1).unwrap().is_null()); } + + #[test] + fn offsets_have_is_sorted_stat() -> VortexResult<()> { + let mut builder = VarBinBuilder::::with_capacity(0); + builder.append_value(b"aaa"); + builder.append_null(); + builder.append_value(b"bbb"); + let array = builder.finish(DType::Utf8(Nullable)); + + let is_sorted = array + .offsets() + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsSorted)); + assert_eq!(is_sorted, Some(Precision::Exact(true))); + Ok(()) + } + + #[test] + fn empty_builder_offsets_have_is_sorted_stat() -> VortexResult<()> { + let builder = VarBinBuilder::::new(); + let array = builder.finish(DType::Utf8(Nullable)); + + let is_sorted = array + .offsets() + .statistics() + .with_typed_stats_set(|s| s.get_as::(Stat::IsSorted)); + assert_eq!(is_sorted, Some(Precision::Exact(true))); + Ok(()) + } } diff --git a/vortex-array/src/expr/stats/mod.rs b/vortex-array/src/expr/stats/mod.rs index cba33e2743c..1eb8414feeb 100644 --- a/vortex-array/src/expr/stats/mod.rs +++ b/vortex-array/src/expr/stats/mod.rs @@ -44,9 +44,11 @@ pub enum Stat { /// Whether all values are the same (nulls are not equal to other non-null values, /// so this is true iff all values are null or all values are the same non-null value) IsConstant = 0, - /// Whether the non-null values in the array are sorted (i.e., we skip nulls) + /// Whether the non-null values in the array are sorted in ascending order (i.e., we skip nulls) + /// This may later be extended to support descending order, but for now we only support ascending order. IsSorted = 1, - /// Whether the non-null values in the array are strictly sorted (i.e., sorted with no duplicates) + /// Whether the non-null values in the array are strictly sorted in ascending order (i.e., sorted with no duplicates) + /// This may later be extended to support descending order, but for now we only support ascending order. IsStrictSorted = 2, /// The maximum value in the array (ignoring nulls, unless all values are null) Max = 3,