From 98e4325e60ba9e64921f8c7bab91f8326cfb742d Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 5 Jun 2026 11:30:26 +0800 Subject: [PATCH 1/5] feat(regex): optimize regexpcount implementation and improve input handling - Replaced 8-arm scalar/array match with a single row loop. - Introduced private input sources: StringValueSource and StartValueSource. - Centralized length checks through validate_array_len function. - Preserved scalar NULL regex zero short-circuit behavior. - Maintained compile-once path for scalar regex and flags. - Retained cache mechanism for row-varying regex/flags. - Removed dependency on itertools::izip. --- datafusion/functions/src/regex/regexpcount.rs | 373 ++++++------------ 1 file changed, 119 insertions(+), 254 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index d970eccc43a54..d498d69348588 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -28,7 +28,6 @@ use datafusion_expr::{ TypeSignature::Exact, TypeSignature::Uniform, Volatility, }; use datafusion_macros::user_doc; -use itertools::izip; use regex::Regex; use std::collections::HashMap; use std::sync::Arc; @@ -256,296 +255,162 @@ fn regexp_count( } fn regexp_count_inner<'a, S>( - values: &S, - regex_array: &S, + values: &'a S, + regex_array: &'a S, is_regex_scalar: bool, - start_array: Option<&Int64Array>, + start_array: Option<&'a Int64Array>, is_start_scalar: bool, - flags_array: Option<&S>, + flags_array: Option<&'a S>, is_flags_scalar: bool, ) -> Result where S: StringArrayType<'a>, { - let (regex_scalar, is_regex_scalar) = if is_regex_scalar || regex_array.len() == 1 { - ( - (!regex_array.is_null(0)).then(|| regex_array.value(0)), - true, - ) - } else { - (None, false) - }; - - let (start_array, start_scalar, is_start_scalar) = - if let Some(start_array) = start_array { - if is_start_scalar || start_array.len() == 1 { - (None, Some(start_array.value(0)), true) - } else { - (Some(start_array), None, false) - } - } else { - (None, Some(1), true) - }; + let regex = StringValueSource::regex(regex_array, is_regex_scalar); - let (flags_array, flags_scalar, is_flags_scalar) = - if let Some(flags_array) = flags_array { - if is_flags_scalar || flags_array.len() == 1 { - (None, Some(flags_array.value(0)), true) - } else { - (Some(flags_array), None, false) - } - } else { - (None, None, true) - }; + // Preserve existing short-circuit behavior: a scalar NULL regex produces zeros + // for every row without validating start or flags. + if regex.is_null_scalar() { + return Ok(Arc::new(Int64Array::from(vec![0; values.len()]))); + } - let mut regex_cache = HashMap::new(); + let start = StartValueSource::new(start_array, is_start_scalar); + let flags = StringValueSource::flags(flags_array, is_flags_scalar); - match (is_regex_scalar, is_start_scalar, is_flags_scalar) { - (true, true, true) => { - let regex = match regex_scalar { - None => { - return Ok(Arc::new(Int64Array::from(vec![0; values.len()]))); - } - Some(regex) => regex, - }; + regex.validate_len("regex_array", values.len())?; + start.validate_len(values.len())?; + flags.validate_len("flags_array", values.len())?; - let pattern = compile_regex(regex, flags_scalar)?; + let scalar_pattern = match (regex.scalar(), flags.scalar()) { + (Some(Some(regex)), Some(flags)) => Some(compile_regex(regex, flags)?), + _ => None, + }; - Ok(Arc::new( - values - .iter() - .map(|value| count_matches(value, &pattern, start_scalar)) - .collect::>()?, - )) - } - (true, true, false) => { - let regex = match regex_scalar { - None => { - return Ok(Arc::new(Int64Array::from(vec![0; values.len()]))); - } + let mut regex_cache = HashMap::new(); + let counts = (0..values.len()) + .map(|row| { + let regex = match regex.value(row) { + None => return Ok(0), Some(regex) => regex, }; + let start = start.value(row); + let flags = flags.value(row); - let flags_array = flags_array.unwrap(); - if values.len() != flags_array.len() { - return Err(ArrowError::ComputeError(format!( - "flags_array must be the same length as values array; got {} and {}", - flags_array.len(), - values.len(), - ))); + if let Some(pattern) = &scalar_pattern { + count_matches(values.value_opt(row), pattern, start) + } else { + let pattern = compile_and_cache_regex(regex, flags, &mut regex_cache)?; + count_matches(values.value_opt(row), pattern, start) } + }) + .collect::>()?; - Ok(Arc::new( - values - .iter() - .zip(flags_array.iter()) - .map(|(value, flags)| { - let pattern = - compile_and_cache_regex(regex, flags, &mut regex_cache)?; - count_matches(value, pattern, start_scalar) - }) - .collect::>()?, - )) - } - (true, false, true) => { - let regex = match regex_scalar { - None => { - return Ok(Arc::new(Int64Array::from(vec![0; values.len()]))); - } - Some(regex) => regex, - }; + Ok(Arc::new(counts)) +} - let pattern = compile_regex(regex, flags_scalar)?; +trait StringArrayValue<'a>: StringArrayType<'a> { + fn value_opt(&self, row: usize) -> Option<&'a str> { + (!self.is_null(row)).then(|| self.value(row)) + } +} + +impl<'a, S> StringArrayValue<'a> for S where S: StringArrayType<'a> {} - let start_array = start_array.unwrap(); +enum StringValueSource<'a, S> { + Scalar(Option<&'a str>), + Array(&'a S), +} - Ok(Arc::new( - values - .iter() - .zip(start_array.iter()) - .map(|(value, start)| count_matches(value, &pattern, start)) - .collect::>()?, - )) +impl<'a, S> StringValueSource<'a, S> +where + S: StringArrayValue<'a>, +{ + fn regex(array: &'a S, is_scalar: bool) -> Self { + if is_scalar || array.len() == 1 { + Self::Scalar(array.value_opt(0)) + } else { + Self::Array(array) } - (true, false, false) => { - let regex = match regex_scalar { - None => { - return Ok(Arc::new(Int64Array::from(vec![0; values.len()]))); - } - Some(regex) => regex, - }; + } - let flags_array = flags_array.unwrap(); - if values.len() != flags_array.len() { - return Err(ArrowError::ComputeError(format!( - "flags_array must be the same length as values array; got {} and {}", - flags_array.len(), - values.len(), - ))); + fn flags(array: Option<&'a S>, is_scalar: bool) -> Self { + match array { + Some(array) if is_scalar || array.len() == 1 => { + Self::Scalar(Some(array.value(0))) } - - Ok(Arc::new( - izip!( - values.iter(), - start_array.unwrap().iter(), - flags_array.iter() - ) - .map(|(value, start, flags)| { - let pattern = - compile_and_cache_regex(regex, flags, &mut regex_cache)?; - - count_matches(value, pattern, start) - }) - .collect::>()?, - )) + Some(array) => Self::Array(array), + None => Self::Scalar(None), } - (false, true, true) => { - if values.len() != regex_array.len() { - return Err(ArrowError::ComputeError(format!( - "regex_array must be the same length as values array; got {} and {}", - regex_array.len(), - values.len(), - ))); - } + } - Ok(Arc::new( - values - .iter() - .zip(regex_array.iter()) - .map(|(value, regex)| { - let regex = match regex { - None => return Ok(0), - Some(regex) => regex, - }; - - let pattern = compile_and_cache_regex( - regex, - flags_scalar, - &mut regex_cache, - )?; - count_matches(value, pattern, start_scalar) - }) - .collect::>()?, - )) + fn scalar(&self) -> Option> { + match self { + Self::Scalar(value) => Some(*value), + Self::Array(_) => None, } - (false, true, false) => { - if values.len() != regex_array.len() { - return Err(ArrowError::ComputeError(format!( - "regex_array must be the same length as values array; got {} and {}", - regex_array.len(), - values.len(), - ))); - } + } - let flags_array = flags_array.unwrap(); - if values.len() != flags_array.len() { - return Err(ArrowError::ComputeError(format!( - "flags_array must be the same length as values array; got {} and {}", - flags_array.len(), - values.len(), - ))); - } + fn is_null_scalar(&self) -> bool { + matches!(self, Self::Scalar(None)) + } - Ok(Arc::new( - izip!(values.iter(), regex_array.iter(), flags_array.iter()) - .map(|(value, regex, flags)| { - let regex = match regex { - None => return Ok(0), - Some(regex) => regex, - }; - - let pattern = - compile_and_cache_regex(regex, flags, &mut regex_cache)?; - - count_matches(value, pattern, start_scalar) - }) - .collect::>()?, - )) + fn value(&self, row: usize) -> Option<&'a str> { + match self { + Self::Scalar(value) => *value, + Self::Array(array) => array.value_opt(row), } - (false, false, true) => { - if values.len() != regex_array.len() { - return Err(ArrowError::ComputeError(format!( - "regex_array must be the same length as values array; got {} and {}", - regex_array.len(), - values.len(), - ))); - } - - let start_array = start_array.unwrap(); - if values.len() != start_array.len() { - return Err(ArrowError::ComputeError(format!( - "start_array must be the same length as values array; got {} and {}", - start_array.len(), - values.len(), - ))); - } + } - Ok(Arc::new( - izip!(values.iter(), regex_array.iter(), start_array.iter()) - .map(|(value, regex, start)| { - let regex = match regex { - None => return Ok(0), - Some(regex) => regex, - }; - - let pattern = compile_and_cache_regex( - regex, - flags_scalar, - &mut regex_cache, - )?; - count_matches(value, pattern, start) - }) - .collect::>()?, - )) + fn validate_len(&self, name: &str, values_len: usize) -> Result<(), ArrowError> { + if let Self::Array(array) = self { + validate_array_len(name, array.len(), values_len)?; } - (false, false, false) => { - if values.len() != regex_array.len() { - return Err(ArrowError::ComputeError(format!( - "regex_array must be the same length as values array; got {} and {}", - regex_array.len(), - values.len(), - ))); - } + Ok(()) + } +} - let start_array = start_array.unwrap(); - if values.len() != start_array.len() { - return Err(ArrowError::ComputeError(format!( - "start_array must be the same length as values array; got {} and {}", - start_array.len(), - values.len(), - ))); - } +enum StartValueSource<'a> { + Scalar(Option), + Array(&'a Int64Array), +} - let flags_array = flags_array.unwrap(); - if values.len() != flags_array.len() { - return Err(ArrowError::ComputeError(format!( - "flags_array must be the same length as values array; got {} and {}", - flags_array.len(), - values.len(), - ))); +impl<'a> StartValueSource<'a> { + fn new(array: Option<&'a Int64Array>, is_scalar: bool) -> Self { + match array { + Some(array) if is_scalar || array.len() == 1 => { + Self::Scalar(Some(array.value(0))) } + Some(array) => Self::Array(array), + None => Self::Scalar(Some(1)), + } + } + + fn value(&self, row: usize) -> Option { + match self { + Self::Scalar(value) => *value, + Self::Array(array) => (!array.is_null(row)).then(|| array.value(row)), + } + } - Ok(Arc::new( - izip!( - values.iter(), - regex_array.iter(), - start_array.iter(), - flags_array.iter() - ) - .map(|(value, regex, start, flags)| { - let regex = match regex { - None => return Ok(0), - Some(regex) => regex, - }; - - let pattern = - compile_and_cache_regex(regex, flags, &mut regex_cache)?; - count_matches(value, pattern, start) - }) - .collect::>()?, - )) + fn validate_len(&self, values_len: usize) -> Result<(), ArrowError> { + if let Self::Array(array) = self { + validate_array_len("start_array", array.len(), values_len)?; } + Ok(()) + } +} + +fn validate_array_len( + array_name: &str, + array_len: usize, + values_len: usize, +) -> Result<(), ArrowError> { + if values_len != array_len { + return Err(ArrowError::ComputeError(format!( + "{array_name} must be the same length as values array; got {array_len} and {values_len}", + ))); } + Ok(()) } fn count_matches( From ed2975cc590ede592923307c5e80c9703b3156c3 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 5 Jun 2026 11:52:55 +0800 Subject: [PATCH 2/5] feat(regex): enhance regexpcount with new functionalities and improvements - Added values_len - Replaced extension trait with string_value_opt - Introduced compile_scalar_pattern - Bound input value once per row for efficiency - Renamed private constructors: regex_arg to improve clarity, flags_arg for better understanding - Added comments to clarify scalar flags and start value (0) behavior --- datafusion/functions/src/regex/regexpcount.rs | 69 ++++++++++--------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index d498d69348588..71280f169a86f 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -266,28 +266,26 @@ fn regexp_count_inner<'a, S>( where S: StringArrayType<'a>, { - let regex = StringValueSource::regex(regex_array, is_regex_scalar); + let values_len = values.len(); + let regex = StringValueSource::regex_arg(regex_array, is_regex_scalar); // Preserve existing short-circuit behavior: a scalar NULL regex produces zeros // for every row without validating start or flags. if regex.is_null_scalar() { - return Ok(Arc::new(Int64Array::from(vec![0; values.len()]))); + return Ok(Arc::new(Int64Array::from(vec![0; values_len]))); } let start = StartValueSource::new(start_array, is_start_scalar); - let flags = StringValueSource::flags(flags_array, is_flags_scalar); + let flags = StringValueSource::flags_arg(flags_array, is_flags_scalar); - regex.validate_len("regex_array", values.len())?; - start.validate_len(values.len())?; - flags.validate_len("flags_array", values.len())?; + regex.validate_len("regex_array", values_len)?; + start.validate_len(values_len)?; + flags.validate_len("flags_array", values_len)?; - let scalar_pattern = match (regex.scalar(), flags.scalar()) { - (Some(Some(regex)), Some(flags)) => Some(compile_regex(regex, flags)?), - _ => None, - }; + let scalar_pattern = compile_scalar_pattern(®ex, &flags)?; let mut regex_cache = HashMap::new(); - let counts = (0..values.len()) + let counts = (0..values_len) .map(|row| { let regex = match regex.value(row) { None => return Ok(0), @@ -295,12 +293,13 @@ where }; let start = start.value(row); let flags = flags.value(row); + let value = string_value_opt(values, row); if let Some(pattern) = &scalar_pattern { - count_matches(values.value_opt(row), pattern, start) + count_matches(value, pattern, start) } else { let pattern = compile_and_cache_regex(regex, flags, &mut regex_cache)?; - count_matches(values.value_opt(row), pattern, start) + count_matches(value, pattern, start) } }) .collect::>()?; @@ -308,14 +307,13 @@ where Ok(Arc::new(counts)) } -trait StringArrayValue<'a>: StringArrayType<'a> { - fn value_opt(&self, row: usize) -> Option<&'a str> { - (!self.is_null(row)).then(|| self.value(row)) - } +fn string_value_opt<'a, S>(array: &'a S, row: usize) -> Option<&'a str> +where + S: StringArrayType<'a>, +{ + (!array.is_null(row)).then(|| array.value(row)) } -impl<'a, S> StringArrayValue<'a> for S where S: StringArrayType<'a> {} - enum StringValueSource<'a, S> { Scalar(Option<&'a str>), Array(&'a S), @@ -323,18 +321,20 @@ enum StringValueSource<'a, S> { impl<'a, S> StringValueSource<'a, S> where - S: StringArrayValue<'a>, + S: StringArrayType<'a>, { - fn regex(array: &'a S, is_scalar: bool) -> Self { + fn regex_arg(array: &'a S, is_scalar: bool) -> Self { if is_scalar || array.len() == 1 { - Self::Scalar(array.value_opt(0)) + Self::Scalar(string_value_opt(array, 0)) } else { Self::Array(array) } } - fn flags(array: Option<&'a S>, is_scalar: bool) -> Self { + fn flags_arg(array: Option<&'a S>, is_scalar: bool) -> Self { match array { + // Preserve prior behavior: scalar flags use value(0), not a null-aware + // lookup, before compile_regex handles the resulting flag value. Some(array) if is_scalar || array.len() == 1 => { Self::Scalar(Some(array.value(0))) } @@ -343,13 +343,6 @@ where } } - fn scalar(&self) -> Option> { - match self { - Self::Scalar(value) => Some(*value), - Self::Array(_) => None, - } - } - fn is_null_scalar(&self) -> bool { matches!(self, Self::Scalar(None)) } @@ -357,7 +350,7 @@ where fn value(&self, row: usize) -> Option<&'a str> { match self { Self::Scalar(value) => *value, - Self::Array(array) => array.value_opt(row), + Self::Array(array) => string_value_opt(*array, row), } } @@ -369,6 +362,18 @@ where } } +fn compile_scalar_pattern<'a, S>( + regex: &StringValueSource<'a, S>, + flags: &StringValueSource<'a, S>, +) -> Result, ArrowError> { + match (regex, flags) { + (StringValueSource::Scalar(Some(regex)), StringValueSource::Scalar(flags)) => { + compile_regex(regex, *flags).map(Some) + } + _ => Ok(None), + } +} + enum StartValueSource<'a> { Scalar(Option), Array(&'a Int64Array), @@ -377,6 +382,8 @@ enum StartValueSource<'a> { impl<'a> StartValueSource<'a> { fn new(array: Option<&'a Int64Array>, is_scalar: bool) -> Self { match array { + // Preserve prior behavior: scalar start uses value(0), not a null-aware + // lookup, before count_matches validates the resulting start value. Some(array) if is_scalar || array.len() == 1 => { Self::Scalar(Some(array.value(0))) } From fb71f16b12c910a366d21bdcb29c00a93e23c3e9 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 5 Jun 2026 12:14:00 +0800 Subject: [PATCH 3/5] feat(regex): preserve error ordering and improve flag validation - Maintained old error ordering: - Scalar regex with scalar flags compiles before start length validation. - Scalar regex with array flags validates flags before start. - Regex array retains the order of regex/start/flags. - Implemented suggestion to use StartValueSource::Scalar(i64) instead of Option. - Added regression tests to verify error ordering. --- datafusion/functions/src/regex/regexpcount.rs | 63 ++++++++++++++++--- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index 71280f169a86f..3a4b40ef848fa 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -279,10 +279,20 @@ where let flags = StringValueSource::flags_arg(flags_array, is_flags_scalar); regex.validate_len("regex_array", values_len)?; - start.validate_len(values_len)?; - flags.validate_len("flags_array", values_len)?; - let scalar_pattern = compile_scalar_pattern(®ex, &flags)?; + let scalar_pattern = if regex.is_array() { + start.validate_len(values_len)?; + flags.validate_len("flags_array", values_len)?; + None + } else if flags.is_array() { + flags.validate_len("flags_array", values_len)?; + start.validate_len(values_len)?; + None + } else { + let scalar_pattern = compile_scalar_pattern(®ex, &flags)?; + start.validate_len(values_len)?; + scalar_pattern + }; let mut regex_cache = HashMap::new(); let counts = (0..values_len) @@ -347,6 +357,10 @@ where matches!(self, Self::Scalar(None)) } + fn is_array(&self) -> bool { + matches!(self, Self::Array(_)) + } + fn value(&self, row: usize) -> Option<&'a str> { match self { Self::Scalar(value) => *value, @@ -375,7 +389,7 @@ fn compile_scalar_pattern<'a, S>( } enum StartValueSource<'a> { - Scalar(Option), + Scalar(i64), Array(&'a Int64Array), } @@ -384,17 +398,15 @@ impl<'a> StartValueSource<'a> { match array { // Preserve prior behavior: scalar start uses value(0), not a null-aware // lookup, before count_matches validates the resulting start value. - Some(array) if is_scalar || array.len() == 1 => { - Self::Scalar(Some(array.value(0))) - } + Some(array) if is_scalar || array.len() == 1 => Self::Scalar(array.value(0)), Some(array) => Self::Array(array), - None => Self::Scalar(Some(1)), + None => Self::Scalar(1), } } fn value(&self, row: usize) -> Option { match self { - Self::Scalar(value) => *value, + Self::Scalar(value) => Some(*value), Self::Array(array) => (!array.is_null(row)).then(|| array.value(row)), } } @@ -497,6 +509,8 @@ mod tests { test_case_sensitive_regexp_count_array_complex::(); test_case_regexp_count_cache_check::>(); + test_regexp_count_error_order_invalid_scalar_regex_before_start_len(); + test_regexp_count_error_order_flags_len_before_start_len(); } fn regexp_count_with_scalar_values(args: &[ScalarValue]) -> Result { @@ -723,6 +737,37 @@ mod tests { }); } + fn assert_regexp_count_error_contains(args: &[ArrayRef], expected: &str) { + let err = regexp_count_func(args).unwrap_err().to_string(); + assert!( + err.contains(expected), + "expected error to contain {expected:?}, got {err:?}" + ); + } + + fn test_regexp_count_error_order_invalid_scalar_regex_before_start_len() { + let values = Arc::new(GenericStringArray::::from(vec!["a", "b"])); + let regex = Arc::new(GenericStringArray::::from(vec!["["])); + let start = Arc::new(Int64Array::from(vec![1, 1, 1])); + + assert_regexp_count_error_contains( + &[values, regex, start], + "Regular expression did not compile", + ); + } + + fn test_regexp_count_error_order_flags_len_before_start_len() { + let values = Arc::new(GenericStringArray::::from(vec!["a", "b"])); + let regex = Arc::new(GenericStringArray::::from(vec!["a"])); + let start = Arc::new(Int64Array::from(vec![1, 1, 1])); + let flags = Arc::new(GenericStringArray::::from(vec!["", "", ""])); + + assert_regexp_count_error_contains( + &[values, regex, start, flags], + "flags_array must be the same length as values array; got 3 and 2", + ); + } + fn test_case_sensitive_regexp_count_array() where A: From> + Array + 'static, From 4a2b88f25c972e4faa58da650cccfceb1e02cfde Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 5 Jun 2026 13:35:16 +0800 Subject: [PATCH 4/5] feat: simplify regexp_count_inner by accepting S directly - Updated regexp_count_inner to take S directly instead of &S. - Adjusted call sites to pass values.as_string() directly without an additional reference. - Changed flags_array type to Option from Option<&S>. - Modified StringValueSource::Array to use S instead of &S. - Retained lifetime 'a solely for Arrow StringArrayType<'a> and for returning &str. - Confirmed that there are no behavioral changes. --- datafusion/functions/src/regex/regexpcount.rs | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index 3a4b40ef848fa..37dbc8a992c9e 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -195,8 +195,8 @@ fn regexp_count( match (values.data_type(), regex_array.data_type(), flags_array) { (Utf8, Utf8, None) => regexp_count_inner( - &values.as_string::(), - ®ex_array.as_string::(), + values.as_string::(), + regex_array.as_string::(), is_regex_scalar, start_array.map(|start| start.as_primitive::()), is_start_scalar, @@ -204,17 +204,17 @@ fn regexp_count( is_flags_scalar, ), (Utf8, Utf8, Some(flags_array)) if *flags_array.data_type() == Utf8 => regexp_count_inner( - &values.as_string::(), - ®ex_array.as_string::(), + values.as_string::(), + regex_array.as_string::(), is_regex_scalar, start_array.map(|start| start.as_primitive::()), is_start_scalar, - Some(&flags_array.as_string::()), + Some(flags_array.as_string::()), is_flags_scalar, ), (LargeUtf8, LargeUtf8, None) => regexp_count_inner( - &values.as_string::(), - ®ex_array.as_string::(), + values.as_string::(), + regex_array.as_string::(), is_regex_scalar, start_array.map(|start| start.as_primitive::()), is_start_scalar, @@ -222,17 +222,17 @@ fn regexp_count( is_flags_scalar, ), (LargeUtf8, LargeUtf8, Some(flags_array)) if *flags_array.data_type() == LargeUtf8 => regexp_count_inner( - &values.as_string::(), - ®ex_array.as_string::(), + values.as_string::(), + regex_array.as_string::(), is_regex_scalar, start_array.map(|start| start.as_primitive::()), is_start_scalar, - Some(&flags_array.as_string::()), + Some(flags_array.as_string::()), is_flags_scalar, ), (Utf8View, Utf8View, None) => regexp_count_inner( - &values.as_string_view(), - ®ex_array.as_string_view(), + values.as_string_view(), + regex_array.as_string_view(), is_regex_scalar, start_array.map(|start| start.as_primitive::()), is_start_scalar, @@ -240,12 +240,12 @@ fn regexp_count( is_flags_scalar, ), (Utf8View, Utf8View, Some(flags_array)) if *flags_array.data_type() == Utf8View => regexp_count_inner( - &values.as_string_view(), - ®ex_array.as_string_view(), + values.as_string_view(), + regex_array.as_string_view(), is_regex_scalar, start_array.map(|start| start.as_primitive::()), is_start_scalar, - Some(&flags_array.as_string_view()), + Some(flags_array.as_string_view()), is_flags_scalar, ), _ => Err(ArrowError::ComputeError( @@ -255,16 +255,16 @@ fn regexp_count( } fn regexp_count_inner<'a, S>( - values: &'a S, - regex_array: &'a S, + values: S, + regex_array: S, is_regex_scalar: bool, start_array: Option<&'a Int64Array>, is_start_scalar: bool, - flags_array: Option<&'a S>, + flags_array: Option, is_flags_scalar: bool, ) -> Result where - S: StringArrayType<'a>, + S: StringArrayType<'a> + Copy, { let values_len = values.len(); let regex = StringValueSource::regex_arg(regex_array, is_regex_scalar); @@ -317,7 +317,7 @@ where Ok(Arc::new(counts)) } -fn string_value_opt<'a, S>(array: &'a S, row: usize) -> Option<&'a str> +fn string_value_opt<'a, S>(array: S, row: usize) -> Option<&'a str> where S: StringArrayType<'a>, { @@ -326,14 +326,14 @@ where enum StringValueSource<'a, S> { Scalar(Option<&'a str>), - Array(&'a S), + Array(S), } impl<'a, S> StringValueSource<'a, S> where - S: StringArrayType<'a>, + S: StringArrayType<'a> + Copy, { - fn regex_arg(array: &'a S, is_scalar: bool) -> Self { + fn regex_arg(array: S, is_scalar: bool) -> Self { if is_scalar || array.len() == 1 { Self::Scalar(string_value_opt(array, 0)) } else { @@ -341,7 +341,7 @@ where } } - fn flags_arg(array: Option<&'a S>, is_scalar: bool) -> Self { + fn flags_arg(array: Option, is_scalar: bool) -> Self { match array { // Preserve prior behavior: scalar flags use value(0), not a null-aware // lookup, before compile_regex handles the resulting flag value. From c68d5d553b7ed3cb640d905a779fc9362ab272e0 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Fri, 5 Jun 2026 13:49:42 +0800 Subject: [PATCH 5/5] fix: update string_value_opt to accept &S and adjust call sites to borrow &values and &array - Updated string_value_opt to now take &S. - Modified call sites to borrow &values and &array. - Retained simplified regexp_count_inner(values: S, flags_array: Option). --- datafusion/functions/src/regex/regexpcount.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/functions/src/regex/regexpcount.rs b/datafusion/functions/src/regex/regexpcount.rs index 37dbc8a992c9e..80b701516c470 100644 --- a/datafusion/functions/src/regex/regexpcount.rs +++ b/datafusion/functions/src/regex/regexpcount.rs @@ -303,7 +303,7 @@ where }; let start = start.value(row); let flags = flags.value(row); - let value = string_value_opt(values, row); + let value = string_value_opt(&values, row); if let Some(pattern) = &scalar_pattern { count_matches(value, pattern, start) @@ -317,7 +317,7 @@ where Ok(Arc::new(counts)) } -fn string_value_opt<'a, S>(array: S, row: usize) -> Option<&'a str> +fn string_value_opt<'a, S>(array: &S, row: usize) -> Option<&'a str> where S: StringArrayType<'a>, { @@ -335,7 +335,7 @@ where { fn regex_arg(array: S, is_scalar: bool) -> Self { if is_scalar || array.len() == 1 { - Self::Scalar(string_value_opt(array, 0)) + Self::Scalar(string_value_opt(&array, 0)) } else { Self::Array(array) } @@ -364,7 +364,7 @@ where fn value(&self, row: usize) -> Option<&'a str> { match self { Self::Scalar(value) => *value, - Self::Array(array) => string_value_opt(*array, row), + Self::Array(array) => string_value_opt(array, row), } }