From 944986f08a9e6dd3354752318e2fb07c6378c4fd Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 3 Mar 2026 09:14:42 -0500 Subject: [PATCH 1/3] fix: Fix bug in array_has scalar path with sliced arrays --- datafusion/functions-nested/src/array_has.rs | 74 ++++++++++++++++-- datafusion/functions-nested/src/position.rs | 81 ++++++++++++++++++-- 2 files changed, 140 insertions(+), 15 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index ace69de66f5c3..d7a0f4458617b 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -352,8 +352,6 @@ fn array_has_dispatch_for_scalar( haystack: ArrayWrapper<'_>, needle: &dyn Datum, ) -> Result { - let values = haystack.values(); - let is_nested = values.data_type().is_nested(); // If first argument is empty list (second argument is non-null), return false // i.e. array_has([], non-null element) -> false if haystack.len() == 0 { @@ -362,7 +360,17 @@ fn array_has_dispatch_for_scalar( None, ))); } - let eq_array = compare_with_eq(values, needle, is_nested)?; + + // Only compare the visible portion of the values buffer, which avoids + // wasted work for sliced ListArrays. + let offsets: Vec = haystack.offsets().collect(); + let first_offset = offsets[0]; + let visible_values = haystack + .values() + .slice(first_offset, offsets[offsets.len() - 1] - first_offset); + + let is_nested = visible_values.data_type().is_nested(); + let eq_array = compare_with_eq(&visible_values, needle, is_nested)?; // When a haystack element is null, `eq()` returns null (not false). // In Arrow, a null BooleanArray entry has validity=0 but an @@ -382,10 +390,14 @@ fn array_has_dispatch_for_scalar( ArrayWrapper::LargeList(arr) => arr.nulls(), }; let mut matches = eq_bits.set_indices().peekable(); - let mut values = BooleanBufferBuilder::new(haystack.len()); - values.append_n(haystack.len(), false); + let mut result = BooleanBufferBuilder::new(haystack.len()); + result.append_n(haystack.len(), false); + + // Match positions are relative to visible_values (0-based), so + // subtract first_offset from each offset when comparing. + for (i, window) in offsets.windows(2).enumerate() { + let end = window[1] - first_offset; - for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() { let has_match = matches.peek().is_some_and(|&p| p < end); // Advance past all match positions in this row's range. @@ -394,14 +406,14 @@ fn array_has_dispatch_for_scalar( } if has_match && validity.is_none_or(|v| v.is_valid(i)) { - values.set_bit(i, true); + result.set_bit(i, true); } } // A null haystack row always produces a null output, so we can // reuse the haystack's null buffer directly. Ok(Arc::new(BooleanArray::new( - values.finish(), + result.finish(), validity.cloned(), ))) } @@ -1066,6 +1078,52 @@ mod tests { Ok(()) } + #[test] + fn test_array_has_sliced_list() -> Result<(), DataFusionError> { + // [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]] + let list = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(10), Some(20)]), + Some(vec![Some(30), Some(40)]), + Some(vec![Some(50), Some(60)]), + Some(vec![Some(70), Some(80)]), + ]); + let sliced = list.slice(1, 2); + let haystack_field = + Arc::new(Field::new("haystack", sliced.data_type().clone(), true)); + let needle_field = Arc::new(Field::new("needle", DataType::Int32, true)); + let return_field = Arc::new(Field::new("return", DataType::Boolean, true)); + + // Search for elements that exist only in sliced-away rows: + // 10 is in the prefix row, 70 is in the suffix row. + let invoke = |needle: i32| -> Result { + ArrayHas::new() + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(sliced.clone())), + ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))), + ], + arg_fields: vec![ + Arc::clone(&haystack_field), + Arc::clone(&needle_field), + ], + number_rows: 2, + return_field: Arc::clone(&return_field), + config_options: Arc::new(ConfigOptions::default()), + })? + .into_array(2) + }; + + let output = invoke(10)?.as_boolean().clone(); + assert_eq!(output.value(0), false); + assert_eq!(output.value(1), false); + + let output = invoke(70)?.as_boolean().clone(); + assert_eq!(output.value(0), false); + assert_eq!(output.value(1), false); + + Ok(()) + } + #[test] fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> { let haystack_field = Arc::new(Field::new("haystack", DataType::Null, true)); diff --git a/datafusion/functions-nested/src/position.rs b/datafusion/functions-nested/src/position.rs index ba16d08538c6d..0214b1552bc9c 100644 --- a/datafusion/functions-nested/src/position.rs +++ b/datafusion/functions-nested/src/position.rs @@ -230,26 +230,36 @@ fn array_position_scalar( "array_position", &[list_array.values(), element_array], )?; - let element_datum = Scalar::new(Arc::clone(element_array)); - - let offsets = list_array.offsets(); - let validity = list_array.nulls(); if list_array.len() == 0 { return Ok(Arc::new(UInt64Array::new_null(0))); } + let element_datum = Scalar::new(Arc::clone(element_array)); + let validity = list_array.nulls(); + + // Only compare the visible portion of the values buffer, which avoids + // wasted work for sliced ListArrays. + let offsets = list_array.offsets(); + let first_offset = offsets[0].as_usize(); + let last_offset = offsets[list_array.len()].as_usize(); + let visible_values = list_array + .values() + .slice(first_offset, last_offset - first_offset); + // `not_distinct` treats NULL=NULL as true, matching the semantics of // `array_position` - let eq_array = arrow_ord::cmp::not_distinct(list_array.values(), &element_datum)?; + let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &element_datum)?; let eq_bits = eq_array.values(); let mut result: Vec> = Vec::with_capacity(list_array.len()); let mut matches = eq_bits.set_indices().peekable(); + // Match positions are relative to visible_values (0-based), so + // subtract first_offset from each offset when comparing. for i in 0..list_array.len() { - let start = offsets[i].as_usize(); - let end = offsets[i + 1].as_usize(); + let start = offsets[i].as_usize() - first_offset; + let end = offsets[i + 1].as_usize() - first_offset; if validity.is_some_and(|v| v.is_null(i)) { // Null row -> null output; advance past matches in range @@ -474,3 +484,60 @@ fn general_positions( ListArray::from_iter_primitive::(data), )) } + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::AsArray; + use arrow::datatypes::Int32Type; + use datafusion_common::config::ConfigOptions; + use datafusion_expr::ScalarFunctionArgs; + + #[test] + fn test_array_position_sliced_list() -> Result<()> { + // [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]] + let list = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(10), Some(20)]), + Some(vec![Some(30), Some(40)]), + Some(vec![Some(50), Some(60)]), + Some(vec![Some(70), Some(80)]), + ]); + let sliced = list.slice(1, 2); + let haystack_field = + Arc::new(Field::new("haystack", sliced.data_type().clone(), true)); + let needle_field = Arc::new(Field::new("needle", DataType::Int32, true)); + let return_field = Arc::new(Field::new("return", UInt64, true)); + + // Search for elements that exist only in sliced-away rows: + // 10 is in the prefix row, 70 is in the suffix row. + let invoke = |needle: i32| -> Result { + ArrayPosition::new() + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(sliced.clone())), + ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))), + ], + arg_fields: vec![ + Arc::clone(&haystack_field), + Arc::clone(&needle_field), + ], + number_rows: 2, + return_field: Arc::clone(&return_field), + config_options: Arc::new(ConfigOptions::default()), + })? + .into_array(2) + }; + + let output = invoke(10)?; + let output = output.as_primitive::(); + assert!(output.is_null(0)); + assert!(output.is_null(1)); + + let output = invoke(70)?; + let output = output.as_primitive::(); + assert!(output.is_null(0)); + assert!(output.is_null(1)); + + Ok(()) + } +} From d15c2eb2ddabe55aea16cf5fb619312ef9825b67 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 3 Mar 2026 13:08:25 -0500 Subject: [PATCH 2/3] Fix clippy --- datafusion/functions-nested/src/array_has.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index d7a0f4458617b..1f6c01e747e7a 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -1114,12 +1114,12 @@ mod tests { }; let output = invoke(10)?.as_boolean().clone(); - assert_eq!(output.value(0), false); - assert_eq!(output.value(1), false); + assert!(!output.value(0)); + assert!(!output.value(1)); let output = invoke(70)?.as_boolean().clone(); - assert_eq!(output.value(0), false); - assert_eq!(output.value(1), false); + assert!(!output.value(0)); + assert!(!output.value(1)); Ok(()) } From 0b20330d8c2d6287d79658acb36d2a38732b1058 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 4 Mar 2026 08:30:31 -0500 Subject: [PATCH 3/3] Tweak comment, per review --- datafusion/functions-nested/src/array_has.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs index 1f6c01e747e7a..76cf786c954dc 100644 --- a/datafusion/functions-nested/src/array_has.rs +++ b/datafusion/functions-nested/src/array_has.rs @@ -361,8 +361,8 @@ fn array_has_dispatch_for_scalar( ))); } - // Only compare the visible portion of the values buffer, which avoids - // wasted work for sliced ListArrays. + // For sliced ListArrays, values() returns the full underlying array but + // only elements between the first and last offset are visible. let offsets: Vec = haystack.offsets().collect(); let first_offset = offsets[0]; let visible_values = haystack