From 0ad664ace8e92f20269d1b6af1a3a23a37c19127 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 26 Dec 2025 10:10:03 -0700 Subject: [PATCH 1/2] perf: improve performance of string repeat --- datafusion/functions/src/string/repeat.rs | 36 ++++++++++++++++------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 0656a32c246b0..53d05e50ad364 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -153,6 +153,7 @@ where S: StringArrayType<'a>, { let mut total_capacity = 0; + let mut max_item_capacity = 0; string_array.iter().zip(number_array.iter()).try_for_each( |(string, number)| -> Result<(), DataFusionError> { match (string, number) { @@ -166,6 +167,7 @@ where ); } total_capacity += item_capacity; + max_item_capacity = max_item_capacity.max(item_capacity); } _ => (), } @@ -176,18 +178,32 @@ where let mut builder = GenericStringBuilder::::with_capacity(string_array.len(), total_capacity); - string_array.iter().zip(number_array.iter()).try_for_each( - |(string, number)| -> Result<(), DataFusionError> { - match (string, number) { - (Some(string), Some(number)) if number >= 0 => { - builder.append_value(string.repeat(number as usize)); + // Reusable buffer to avoid allocations in string.repeat() + let mut buffer = Vec::<u8>::with_capacity(max_item_capacity); + + string_array.iter().zip(number_array.iter()).for_each(|(string, number)| { + match (string, number) { + (Some(string), Some(number)) if number >= 0 => { + buffer.clear(); + let count = number as usize; + if count > 0 && !string.is_empty() { + let src = string.as_bytes(); + // Initial copy + buffer.extend_from_slice(src); + // Doubling strategy: copy what we have so far until we reach the target + while buffer.len() < src.len() * count { + let copy_len = buffer.len().min(src.len() * count - buffer.len()); + // SAFETY: we're copying valid UTF-8 bytes that we 
already verified + buffer.extend_from_within(..copy_len); + } } - (Some(_), Some(_)) => builder.append_value(""), - _ => builder.append_null(), + // SAFETY: buffer contains valid UTF-8 since we only ever copy from a valid &str + builder.append_value(unsafe { std::str::from_utf8_unchecked(&buffer) }); } - Ok(()) - }, - )?; + (Some(_), Some(_)) => builder.append_value(""), + _ => builder.append_null(), + } + }); let array = builder.finish(); Ok(Arc::new(array) as ArrayRef) From e0d95337fb1a4b58e96784990410d1716a6e663b Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 26 Dec 2025 10:32:57 -0700 Subject: [PATCH 2/2] cargo fmt --- datafusion/functions/src/string/repeat.rs | 45 +++++++++++++---------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 53d05e50ad364..2ca5e190c6e02 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -181,29 +181,34 @@ where // Reusable buffer to avoid allocations in string.repeat() let mut buffer = Vec::<u8>::with_capacity(max_item_capacity); - string_array.iter().zip(number_array.iter()).for_each(|(string, number)| { - match (string, number) { - (Some(string), Some(number)) if number >= 0 => { - buffer.clear(); - let count = number as usize; - if count > 0 && !string.is_empty() { - let src = string.as_bytes(); - // Initial copy - buffer.extend_from_slice(src); - // Doubling strategy: copy what we have so far until we reach the target - while buffer.len() < src.len() * count { - let copy_len = buffer.len().min(src.len() * count - buffer.len()); - // SAFETY: we're copying valid UTF-8 bytes that we already verified - buffer.extend_from_within(..copy_len); + string_array + .iter() + .zip(number_array.iter()) + .for_each(|(string, number)| { + match (string, number) { + (Some(string), Some(number)) if number >= 0 => { + buffer.clear(); + let count = number as usize; + if count > 0 && 
!string.is_empty() { + let src = string.as_bytes(); + // Initial copy + buffer.extend_from_slice(src); + // Doubling strategy: copy what we have so far until we reach the target + while buffer.len() < src.len() * count { + let copy_len = + buffer.len().min(src.len() * count - buffer.len()); + // SAFETY: we're copying valid UTF-8 bytes that we already verified + buffer.extend_from_within(..copy_len); + } } + // SAFETY: buffer contains valid UTF-8 since we only ever copy from a valid &str + builder + .append_value(unsafe { std::str::from_utf8_unchecked(&buffer) }); } - // SAFETY: buffer contains valid UTF-8 since we only ever copy from a valid &str - builder.append_value(unsafe { std::str::from_utf8_unchecked(&buffer) }); + (Some(_), Some(_)) => builder.append_value(""), + _ => builder.append_null(), } - (Some(_), Some(_)) => builder.append_value(""), - _ => builder.append_null(), - } - }); + }); let array = builder.finish(); Ok(Arc::new(array) as ArrayRef)