perf: improve performance of levenshtein by reusing cache buffer

viirya · viirya · commit 479c5d61ede1 · 2025-12-28T13:51:37.000-08:00
Add levenshtein_with_buffer() function that accepts a reusable cache buffer
to avoid allocating a new Vec<usize> for each distance calculation.

Changes:
- Added levenshtein_with_buffer() in datafusion-common that takes a mutable
  Vec<usize> buffer parameter
- Updated levenshtein function to use the optimized version with a reusable
  buffer across all rows
- Applied optimization to all data types: Utf8View, Utf8, and LargeUtf8
- Added benchmark to measure performance improvements

Optimization:
- Before: Allocated new Vec<usize> cache for every row
- After: Single Vec<usize> buffer reused across all rows

Benchmark Results:
- size=1024, str_len=8:  60.6 µs → 45.9 µs (24% faster)
- size=1024, str_len=32: 615.5 µs → 598.5 µs (3% faster)
- size=4096, str_len=8:  234.7 µs → 180.5 µs (23% faster)
- size=4096, str_len=32: 2.46 ms → 2.38 ms (3% faster)

The optimization shows significant improvements for shorter strings (23-24%)
where allocation overhead is more prominent relative to algorithm cost.
For longer strings, the O(m×n) algorithm cost dominates, but eliminating
per-row allocations still yields a measurable 3% improvement.
diff --git a/datafusion/common/src/utils/mod.rs b/datafusion/common/src/utils/mod.rs
@@ -741,6 +741,43 @@ pub mod datafusion_strsim {
         generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
     }
 
+    /// Computes the Levenshtein (edit) distance between `a` and `b`, counted in
+    /// `char`s, using the caller-supplied `cache` as the working row so that the
+    /// function does not allocate its own `Vec`.
+    ///
+    /// `cache` is cleared and refilled on every call unless `a` or `b` is empty.
+    pub fn levenshtein_with_buffer(a: &str, b: &str, cache: &mut Vec<usize>) -> usize {
+        let b_len = b.chars().count(); // length in `char`s, not bytes
+        let a_len = a.chars().count();
+
+        if a_len == 0 {
+            return b_len;
+        }
+        if b_len == 0 {
+            return a_len;
+        }
+
+        // Seed the row for the empty prefix of `a`: cache[j] = j + 1.
+        cache.clear();
+        cache.extend(1..=b_len);
+
+        let mut result = 0;
+        for (i, a_char) in a.chars().enumerate() {
+            result = i + 1; // first i + 1 chars of `a` vs. the empty prefix of `b`
+            let mut distance_b = i; // diagonal cell: previous row, previous column
+
+            for (j, b_char) in b.chars().enumerate() {
+                let cost = if a_char == b_char { 0 } else { 1 };
+                let distance_a = distance_b + cost; // substitution (or match)
+                distance_b = cache[j]; // previous row, same column
+                result = min(result + 1, min(distance_a, distance_b + 1));
+                cache[j] = result; // overwrite in place with the current row
+            }
+        }
+
+        result
+    }
+
     /// Calculates the normalized Levenshtein distance between two strings.
     /// The normalized distance is a value between 0.0 and 1.0, where 1.0 indicates
     /// that the strings are identical and 0.0 indicates no similarity.
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
@@ -269,3 +269,8 @@ required-features = ["string_expressions"]
 harness = false
 name = "ends_with"
 required-features = ["string_expressions"]
+
+[[bench]]
+harness = false
+name = "levenshtein"
+required-features = ["unicode_expressions"]
diff --git a/datafusion/functions/benches/levenshtein.rs b/datafusion/functions/benches/levenshtein.rs
@@ -0,0 +1,70 @@
+extern crate criterion;
+
+use arrow::array::OffsetSizeTrait;
+use arrow::datatypes::{DataType, Field};
+use arrow::util::bench_util::create_string_array_with_len;
+use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
+use datafusion_common::DataFusionError;
+use datafusion_common::config::ConfigOptions;
+use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
+use datafusion_functions::string;
+use std::hint::black_box;
+use std::sync::Arc;
+use std::time::Duration;
+
+fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarValue> {
+    let string1_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+    let string2_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+    // NOTE(review): both arrays use identical generator arguments; confirm the contents differ.
+    vec![ // the two array arguments for levenshtein, in call order
+        ColumnarValue::Array(string1_array),
+        ColumnarValue::Array(string2_array),
+    ]
+}
+
+fn invoke_levenshtein_with_args( // calls the levenshtein UDF once on `args`
+    args: Vec<ColumnarValue>,
+    number_rows: usize,
+) -> Result<ColumnarValue, DataFusionError> {
+    let arg_fields = args // one Field per argument, named arg_0, arg_1, ...
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+    // NOTE(review): return_field is fixed to Int32 — presumably valid only for i32-offset input.
+    string::levenshtein().invoke_with_args(ScalarFunctionArgs {
+        args,
+        arg_fields,
+        number_rows,
+        return_field: Field::new("f", DataType::Int32, true).into(),
+        config_options: Arc::clone(&config_options),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) { // one group per `size`, one bench per `str_len`
+    for size in [1024, 4096] {
+        let mut group = c.benchmark_group(format!("levenshtein size={size}"));
+        group.sampling_mode(SamplingMode::Flat);
+        group.sample_size(10);
+        group.measurement_time(Duration::from_secs(10));
+
+        for str_len in [8, 32] {
+            let args = create_args::<i32>(size, str_len); // built once, outside the timed closure
+            group.bench_function(
+                format!("levenshtein_string [size={size}, str_len={str_len}]"),
+                |b| {
+                    b.iter(|| {
+                        let args_cloned = args.clone(); // clone happens inside the timed closure
+                        black_box(invoke_levenshtein_with_args(args_cloned, size))
+                    })
+                },
+            );
+        }
+
+        group.finish();
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/benches/split_part.rs b/datafusion/functions/benches/split_part.rs
@@ -17,8 +17,7 @@ fn create_args<O: OffsetSizeTrait>(size: usize, str_len: usize) -> Vec<ColumnarV
         (0..size).map(|i| (i % 3 + 1) as i64).collect::<Vec<_>>(),
     ));
 
-    let string_array =
-        Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
+    let string_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
     let delimiter_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, 1));
 
     vec![
diff --git a/datafusion/functions/src/string/levenshtein.rs b/datafusion/functions/src/string/levenshtein.rs
@@ -151,12 +151,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             DataType::Utf8View => {
                 let str1_array = as_string_view_array(&str1)?;
                 let str2_array = as_string_view_array(&str2)?;
+
+                // Reusable buffer to avoid allocating for each row
+                let mut cache = Vec::new();
+
                 let result = str1_array
                     .iter()
                     .zip(str2_array.iter())
                     .map(|(string1, string2)| match (string1, string2) {
                         (Some(string1), Some(string2)) => {
-                            Some(datafusion_strsim::levenshtein(string1, string2) as i32)
+                            Some(datafusion_strsim::levenshtein_with_buffer(
+                                string1, string2, &mut cache,
+                            ) as i32)
                         }
                         _ => None,
                     })
@@ -166,12 +172,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             DataType::Utf8 => {
                 let str1_array = as_generic_string_array::<T>(&str1)?;
                 let str2_array = as_generic_string_array::<T>(&str2)?;
+
+                // Reusable buffer to avoid allocating for each row
+                let mut cache = Vec::new();
+
                 let result = str1_array
                     .iter()
                     .zip(str2_array.iter())
                     .map(|(string1, string2)| match (string1, string2) {
                         (Some(string1), Some(string2)) => {
-                            Some(datafusion_strsim::levenshtein(string1, string2) as i32)
+                            Some(datafusion_strsim::levenshtein_with_buffer(
+                                string1, string2, &mut cache,
+                            ) as i32)
                         }
                         _ => None,
                     })
@@ -181,12 +193,18 @@ fn levenshtein<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
             DataType::LargeUtf8 => {
                 let str1_array = as_generic_string_array::<T>(&str1)?;
                 let str2_array = as_generic_string_array::<T>(&str2)?;
+
+                // Reusable buffer to avoid allocating for each row
+                let mut cache = Vec::new();
+
                 let result = str1_array
                     .iter()
                     .zip(str2_array.iter())
                     .map(|(string1, string2)| match (string1, string2) {
                         (Some(string1), Some(string2)) => {
-                            Some(datafusion_strsim::levenshtein(string1, string2) as i64)
+                            Some(datafusion_strsim::levenshtein_with_buffer(
+                                string1, string2, &mut cache,
+                            ) as i64)
                         }
                         _ => None,
                     })