diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 765f5d865a60..c8584612874b 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -210,6 +210,11 @@ harness = false name = "repeat" required-features = ["string_expressions"] +[[bench]] +harness = false +name = "replace" +required-features = ["string_expressions"] + [[bench]] harness = false name = "random" diff --git a/datafusion/functions/benches/replace.rs b/datafusion/functions/benches/replace.rs new file mode 100644 index 000000000000..deadbfeb99a8 --- /dev/null +++ b/datafusion/functions/benches/replace.rs @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +extern crate criterion; + +use arrow::array::OffsetSizeTrait; +use arrow::datatypes::{DataType, Field}; +use arrow::util::bench_util::{ + create_string_array_with_len, create_string_view_array_with_len, +}; +use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; +use datafusion_common::DataFusionError; +use datafusion_common::config::ConfigOptions; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; +use datafusion_functions::string; +use std::hint::black_box; +use std::sync::Arc; +use std::time::Duration; + +fn create_args( + size: usize, + str_len: usize, + force_view_types: bool, + from_len: usize, + to_len: usize, +) -> Vec { + if force_view_types { + let string_array = + Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false)); + let from_array = Arc::new(create_string_view_array_with_len( + size, 0.1, from_len, false, + )); + let to_array = + Arc::new(create_string_view_array_with_len(size, 0.1, to_len, false)); + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(from_array), + ColumnarValue::Array(to_array), + ] + } else { + let string_array = + Arc::new(create_string_array_with_len::(size, 0.1, str_len)); + let from_array = Arc::new(create_string_array_with_len::(size, 0.1, from_len)); + let to_array = Arc::new(create_string_array_with_len::(size, 0.1, to_len)); + + vec![ + ColumnarValue::Array(string_array), + ColumnarValue::Array(from_array), + ColumnarValue::Array(to_array), + ] + } +} + +fn invoke_replace_with_args( + args: Vec, + number_rows: usize, +) -> Result { + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into()) + .collect::>(); + let config_options = Arc::new(ConfigOptions::default()); + + string::replace().invoke_with_args(ScalarFunctionArgs { + args, + arg_fields, + number_rows, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) +} + +fn criterion_benchmark(c: &mut Criterion) { + for size in [1024, 4096] { + let mut group = c.benchmark_group(format!("replace size={size}")); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + // ASCII single character replacement (fast path) + let str_len = 32; + let args = create_args::(size, str_len, false, 1, 1); + group.bench_function( + format!("replace_string_ascii_single [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + // Multi-character strings (general path) + let args = create_args::(size, str_len, true, 3, 5); + group.bench_function( + format!("replace_string_view [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + let args = create_args::(size, str_len, false, 3, 5); + group.bench_function( + format!("replace_string [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + let args = create_args::(size, str_len, false, 3, 5); + group.bench_function( + format!("replace_large_string [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + // Larger strings + let str_len = 128; + let args = create_args::(size, str_len, false, 1, 1); + group.bench_function( + format!("replace_string_ascii_single [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + let args = create_args::(size, str_len, true, 3, 5); + group.bench_function( + format!("replace_string_view [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + let args = create_args::(size, str_len, false, 3, 5); + group.bench_function( + format!("replace_string [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + let args = create_args::(size, str_len, false, 3, 5); + group.bench_function( + format!("replace_large_string [size={size}, str_len={str_len}]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(invoke_replace_with_args(args_cloned, size)) + }) + }, + ); + + group.finish(); + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/functions/src/string/replace.rs b/datafusion/functions/src/string/replace.rs index a976ca7b9139..165e0634a6b8 100644 --- a/datafusion/functions/src/string/replace.rs +++ b/datafusion/functions/src/string/replace.rs @@ -18,7 +18,7 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; +use arrow::array::{ArrayRef, GenericStringBuilder, OffsetSizeTrait}; use arrow::datatypes::DataType; use crate::utils::{make_scalar_function, utf8_to_str_type}; @@ -165,17 +165,25 @@ fn replace_view(args: &[ArrayRef]) -> Result { let from_array = as_string_view_array(&args[1])?; let to_array = as_string_view_array(&args[2])?; - let result = string_array + let mut builder = GenericStringBuilder::::new(); + let mut buffer = String::new(); + + for ((string, from), to) in string_array .iter() .zip(from_array.iter()) .zip(to_array.iter()) - .map(|((string, from), to)| match (string, from, to) { - (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), - _ => None, - }) - .collect::(); + { + match (string, from, to) { + (Some(string), Some(from), Some(to)) => { + buffer.clear(); + replace_into_string(&mut buffer, string, from, to); + builder.append_value(&buffer); + } + _ => builder.append_null(), + } + } - Ok(Arc::new(result) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) } /// Replaces all occurrences in string of substring from with substring to. @@ -185,17 +193,64 @@ fn replace(args: &[ArrayRef]) -> Result { let from_array = as_generic_string_array::(&args[1])?; let to_array = as_generic_string_array::(&args[2])?; - let result = string_array + let mut builder = GenericStringBuilder::::new(); + let mut buffer = String::new(); + + for ((string, from), to) in string_array .iter() .zip(from_array.iter()) .zip(to_array.iter()) - .map(|((string, from), to)| match (string, from, to) { - (Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), - _ => None, - }) - .collect::>(); + { + match (string, from, to) { + (Some(string), Some(from), Some(to)) => { + buffer.clear(); + replace_into_string(&mut buffer, string, from, to); + builder.append_value(&buffer); + } + _ => builder.append_null(), + } + } - Ok(Arc::new(result) as ArrayRef) + Ok(Arc::new(builder.finish()) as ArrayRef) +} + +/// Helper function to perform string replacement into a reusable String buffer +#[inline] +fn replace_into_string(buffer: &mut String, string: &str, from: &str, to: &str) { + if from.is_empty() { + // When from is empty, insert 'to' at the beginning, between each character, and at the end + // This matches the behavior of str::replace() + buffer.push_str(to); + for ch in string.chars() { + buffer.push(ch); + buffer.push_str(to); + } + return; + } + + // Fast path for replacing a single ASCII character with another single ASCII character + // This matches Rust's str::replace() optimization and enables vectorization + if let ([from_byte], [to_byte]) = (from.as_bytes(), to.as_bytes()) + && from_byte.is_ascii() + && to_byte.is_ascii() + { + // SAFETY: We're replacing ASCII with ASCII, which preserves UTF-8 validity + let replaced: Vec = string + .as_bytes() + .iter() + .map(|b| if *b == *from_byte { *to_byte } else { *b }) + .collect(); + buffer.push_str(unsafe { std::str::from_utf8_unchecked(&replaced) }); + return; + } + + let mut last_end = 0; + for (start, _part) in string.match_indices(from) { + buffer.push_str(&string[last_end..start]); + buffer.push_str(to); + last_end = start + from.len(); + } + buffer.push_str(&string[last_end..]); } #[cfg(test)]