diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 4041af600815a..d85a269c7fa71 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -270,6 +270,11 @@ harness = false
 name = "ends_with"
 required-features = ["string_expressions"]
 
+[[bench]]
+harness = false
+name = "translate"
+required-features = ["unicode_expressions"]
+
 [[bench]]
 harness = false
 name = "levenshtein"
diff --git a/datafusion/functions/benches/translate.rs b/datafusion/functions/benches/translate.rs
new file mode 100644
index 0000000000000..601bdec7cd364
--- /dev/null
+++ b/datafusion/functions/benches/translate.rs
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::unicode;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+/// Builds the three arguments of `translate` (`string`, `from`, `to`) as arrays
+/// of `size` rows each.
+///
+/// The `string` values have length `str_len`, while the `from` and `to` values
+/// have lengths 3 and 2. `0.1` is passed as the null-density argument of
+/// `create_string_array_with_len` for every array.
+fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarValue> {
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+    // Create simple from/to strings for translation
+    let from_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 3));
+    let to_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 2));
+
+    vec![
+        ColumnarValue::Array(string_array),
+        ColumnarValue::Array(from_array),
+        ColumnarValue::Array(to_array),
+    ]
+}
+
+/// Calls the `translate` UDF once on `args`, reporting `number_rows` rows.
+///
+/// One nullable `Field` named `arg_{idx}` is derived from each argument's data
+/// type, the return field is declared as nullable `Utf8`, and the default
+/// `ConfigOptions` are used.
+fn invoke_translate_with_args(
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    unicode::translate().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Utf8, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+/// Registers the `translate` benchmarks.
+///
+/// One group is created per batch size (1024 and 4096 rows) with flat
+/// sampling, 10 samples and a 10 second measurement time; each group runs
+/// `translate` over `i32`-offset string arrays with string lengths 8 and 32.
+fn criterion_benchmark(c: &mut Criterion) {
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("translate size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        for str_len in [8, 32] {
+            let args = create_args::<i32>(size, str_len);
+            group.bench_function(
+                format!("translate_string [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        // `invoke_translate_with_args` consumes its arguments,
+                        // so every iteration gets its own copy of the `Vec`.
+                        let args_cloned = args.clone();
+                        black_box(invoke_translate_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs
index db785f4f8836a..376311d9d3f46 100644
--- a/datafusion/functions/src/unicode/translate.rs
+++ b/datafusion/functions/src/unicode/translate.rs
@@ -148,34 +148,48 @@ where
     let from_array_iter = ArrayIter::new(from_array);
     let to_array_iter = ArrayIter::new(to_array);
 
+    // Reusable buffers to avoid allocating for each row
+    let mut from_map: HashMap<&str, usize> = HashMap::new();
+    let mut from_graphemes: Vec<&str> = Vec::new();
+    let mut to_graphemes: Vec<&str> = Vec::new();
+    let mut string_graphemes: Vec<&str> = Vec::new();
+    let mut result_graphemes: Vec<&str> = Vec::new();
+
     let result = string_array_iter
         .zip(from_array_iter)
         .zip(to_array_iter)
         .map(|((string, from), to)| match (string, from, to) {
             (Some(string), Some(from), Some(to)) => {
-                // create a hashmap of [char, index] to change from O(n) to O(1) for from list
-                let from_map: HashMap<&str, usize> = from
-                    .graphemes(true)
-                    .collect::<Vec<&str>>()
-                    .iter()
-                    .enumerate()
-                    .map(|(index, c)| (c.to_owned(), index))
-                    .collect();
+                // Clear and reuse buffers
+                from_map.clear();
+                from_graphemes.clear();
+                to_graphemes.clear();
+                string_graphemes.clear();
+                result_graphemes.clear();
+
+                // Build from_map using reusable buffer
+                from_graphemes.extend(from.graphemes(true));
+                for (index, c) in from_graphemes.iter().enumerate() {
+                    from_map.insert(*c, index);
+                }
+
+                // Build to_graphemes
+                to_graphemes.extend(to.graphemes(true));
 
-                let to = to.graphemes(true).collect::<Vec<&str>>();
+                // Process string and build result
+                string_graphemes.extend(string.graphemes(true));
+                for c in &string_graphemes {
+                    match from_map.get(*c) {
+                        Some(n) => {
+                            if let Some(replacement) = to_graphemes.get(*n) {
+                                result_graphemes.push(*replacement);
+                            }
+                        }
+                        None => result_graphemes.push(*c),
+                    }
+                }
 
-                Some(
-                    string
-                        .graphemes(true)
-                        .collect::<Vec<&str>>()
-                        .iter()
-                        .flat_map(|c| match from_map.get(*c) {
-                            Some(n) => to.get(*n).copied(),
-                            None => Some(*c),
-                        })
-                        .collect::<Vec<&str>>()
-                        .concat(),
-                )
+                Some(result_graphemes.concat())
             }
             _ => None,
         })