Skip to content

Commit 479c5d6

Browse files
committed
perf: improve performance of levenshtein by reusing cache buffer
Add a `levenshtein_with_buffer()` function that accepts a reusable cache buffer, so that a new `Vec<usize>` no longer has to be allocated for each distance calculation.

Changes:
- Added `levenshtein_with_buffer()` in datafusion-common, which takes a mutable `Vec<usize>` buffer parameter
- Updated the levenshtein function to use the optimized version with one buffer reused across all rows
- Applied the optimization to all data types: Utf8View, Utf8, and LargeUtf8
- Added a benchmark to measure the performance improvement

Optimization:
- Before: a new `Vec<usize>` cache was allocated for every row
- After: a single `Vec<usize>` buffer is reused across all rows

Benchmark results:
- size=1024, str_len=8: 60.6 µs → 45.9 µs (24% faster)
- size=1024, str_len=32: 615.5 µs → 598.5 µs (3% faster)
- size=4096, str_len=8: 234.7 µs → 180.5 µs (23% faster)
- size=4096, str_len=32: 2.46 ms → 2.38 ms (3% faster)

The optimization shows significant improvements for shorter strings (23-24%), where allocation overhead is more prominent relative to the cost of the algorithm. For longer strings the O(m×n) algorithm complexity dominates, but there is still a measurable 3% improvement from eliminating the per-row allocations.
1 parent 00aac51 commit 479c5d6

File tree

5 files changed

+134
-5
lines changed

5 files changed

+134
-5
lines changed

datafusion/common/src/utils/mod.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -741,6 +741,43 @@ pub mod datafusion_strsim {
741741
generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
742742
}
743743

744+
/// Calculates the Levenshtein distance between `a` and `b` using a
/// caller-supplied scratch buffer.
///
/// The distance is measured in Unicode scalar values (`char`s), not bytes.
/// Passing the same `cache` to many calls avoids allocating a fresh `Vec`
/// per call: the vector only reallocates when a call needs more capacity
/// than it already has.
///
/// # Arguments
///
/// * `a`, `b` - the two strings to compare.
/// * `cache` - scratch space. Its previous contents are discarded; after a
///   call with two non-empty inputs it holds one entry per `char` of `b`.
///   When either input is empty the buffer is left untouched.
///
/// # Returns
///
/// The minimum number of single-character insertions, deletions and
/// substitutions needed to turn `a` into `b`.
pub fn levenshtein_with_buffer(a: &str, b: &str, cache: &mut Vec<usize>) -> usize {
    // A string has zero chars exactly when it has zero bytes, so the O(1)
    // emptiness check replaces a full counting pass over `a`.
    if a.is_empty() {
        return b.chars().count();
    }
    let b_len = b.chars().count();
    if b_len == 0 {
        return a.chars().count();
    }

    // Seed the row for the empty prefix of `a`: turning "" into the first
    // `j + 1` chars of `b` costs `j + 1` insertions.
    cache.clear();
    cache.extend(1..=b_len);

    // Overwritten on the first outer iteration (`a` is non-empty here).
    let mut result = 0;
    for (i, a_char) in a.chars().enumerate() {
        // Cost of deleting the first `i + 1` chars of `a` (column for empty `b`).
        result = i + 1;
        // Diagonal neighbour: value of the previous row, previous column.
        let mut distance_b = i;

        // `cache` holds exactly `b_len` slots, so zipping visits every cell
        // of the row without per-element index checks.
        for (b_char, slot) in b.chars().zip(cache.iter_mut()) {
            let cost = usize::from(a_char != b_char);
            let distance_a = distance_b + cost;
            // `*slot` still holds the previous row's value for this column.
            distance_b = *slot;
            result = min(result + 1, min(distance_a, distance_b + 1));
            *slot = result;
        }
    }

    result
}
780+
744781
/// Calculates the normalized Levenshtein distance between two strings.
745782
/// The normalized distance is a value between 0.0 and 1.0, where 1.0 indicates
746783
/// that the strings are identical and 0.0 indicates no similarity.

datafusion/functions/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,3 +269,8 @@ required-features = ["string_expressions"]
269269
harness = false
270270
name = "ends_with"
271271
required-features = ["string_expressions"]
272+
273+
[[bench]]
274+
harness = false
275+
name = "levenshtein"
276+
required-features = ["unicode_expressions"]
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
extern crate criterion;
2+
3+
use arrow::array::OffsetSizeTrait;
4+
use arrow::datatypes::{DataType, Field};
5+
use arrow::util::bench_util::create_string_array_with_len;
6+
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
7+
use datafusion_common::DataFusionError;
8+
use datafusion_common::config::ConfigOptions;
9+
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
10+
use datafusion_functions::string;
11+
use std::hint::black_box;
12+
use std::sync::Arc;
13+
use std::time::Duration;
14+
15+
fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarValue> {
16+
let string1_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
17+
let string2_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
18+
19+
vec![
20+
ColumnarValue::Array(string1_array),
21+
ColumnarValue::Array(string2_array),
22+
]
23+
}
24+
25+
fn invoke_levenshtein_with_args(
26+
args: Vec<ColumnarValue>,
27+
number_rows: usize,
28+
) -> Result<ColumnarValue, DataFusionError> {
29+
let arg_fields = args
30+
.iter()
31+
.enumerate()
32+
.map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
33+
.collect::<Vec<_>>();
34+
let config_options = Arc::new(ConfigOptions::default());
35+
36+
string::levenshtein().invoke_with_args(ScalarFunctionArgs {
37+
args,
38+
arg_fields,
39+
number_rows,
40+
return_field: Field::new("f", DataType::Int32, true).into(),
41+
config_options: Arc::clone(&config_options),
42+
})
43+
}
44+
45+
fn criterion_benchmark(c: &mut Criterion) {
46+
for size in [1024, 4096] {
47+
let mut group = c.benchmark_group(format!("levenshtein size={size}"));
48+
group.sampling_mode(SamplingMode::Flat);
49+
group.sample_size(10);
50+
group.measurement_time(Duration::from_secs(10));
51+
52+
for str_len in [8, 32] {
53+
let args = create_args::<i32>(size, str_len);
54+
group.bench_function(
55+
format!("levenshtein_string [size={size}, str_len={str_len}]"),
56+
|b| {
57+
b.iter(|| {
58+
let args_cloned = args.clone();
59+
black_box(invoke_levenshtein_with_args(args_cloned, size))
60+
})
61+
},
62+
);
63+
}
64+
65+
group.finish();
66+
}
67+
}
68+
69+
criterion_group!(benches, criterion_benchmark);
70+
criterion_main!(benches);

datafusion/functions/benches/split_part.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarV
1717
(0..size).map(|i| (i % 3 + 1) as i64).collect::<Vec<_>>(),
1818
));
1919

20-
let string_array =
21-
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
20+
let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
2221
let delimiter_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 1));
2322

2423
vec![

datafusion/functions/src/string/levenshtein.rs

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
151151
DataType::Utf8View => {
152152
let str1_array = as_string_view_array(&str1)?;
153153
let str2_array = as_string_view_array(&str2)?;
154+
155+
// Reusable buffer to avoid allocating for each row
156+
let mut cache = Vec::new();
157+
154158
let result = str1_array
155159
.iter()
156160
.zip(str2_array.iter())
157161
.map(|(string1, string2)| match (string1, string2) {
158162
(Some(string1), Some(string2)) => {
159-
Some(datafusion_strsim::levenshtein(string1, string2) as i32)
163+
Some(datafusion_strsim::levenshtein_with_buffer(
164+
string1, string2, &mut cache,
165+
) as i32)
160166
}
161167
_ => None,
162168
})
@@ -166,12 +172,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
166172
DataType::Utf8 => {
167173
let str1_array = as_generic_string_array::<T>(&str1)?;
168174
let str2_array = as_generic_string_array::<T>(&str2)?;
175+
176+
// Reusable buffer to avoid allocating for each row
177+
let mut cache = Vec::new();
178+
169179
let result = str1_array
170180
.iter()
171181
.zip(str2_array.iter())
172182
.map(|(string1, string2)| match (string1, string2) {
173183
(Some(string1), Some(string2)) => {
174-
Some(datafusion_strsim::levenshtein(string1, string2) as i32)
184+
Some(datafusion_strsim::levenshtein_with_buffer(
185+
string1, string2, &mut cache,
186+
) as i32)
175187
}
176188
_ => None,
177189
})
@@ -181,12 +193,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
181193
DataType::LargeUtf8 => {
182194
let str1_array = as_generic_string_array::<T>(&str1)?;
183195
let str2_array = as_generic_string_array::<T>(&str2)?;
196+
197+
// Reusable buffer to avoid allocating for each row
198+
let mut cache = Vec::new();
199+
184200
let result = str1_array
185201
.iter()
186202
.zip(str2_array.iter())
187203
.map(|(string1, string2)| match (string1, string2) {
188204
(Some(string1), Some(string2)) => {
189-
Some(datafusion_strsim::levenshtein(string1, string2) as i64)
205+
Some(datafusion_strsim::levenshtein_with_buffer(
206+
string1, string2, &mut cache,
207+
) as i64)
190208
}
191209
_ => None,
192210
})

0 commit comments

Comments
 (0)