From d29a185622c155e4a41fc47ae6593c3b8fae9ed8 Mon Sep 17 00:00:00 2001 From: Tim-53 <82676248+Tim-53@users.noreply.github.com> Date: Tue, 3 Mar 2026 04:15:16 +0100 Subject: [PATCH 1/2] fix: preserve Utf8View return type in lpad --- datafusion/functions/src/unicode/common.rs | 76 +++++++++- datafusion/functions/src/unicode/lpad.rs | 137 ++++++++++-------- .../test_files/string/string_view.slt | 32 ++++ 3 files changed, 177 insertions(+), 68 deletions(-) diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs index 93f0c7900961e..9486785b5d58e 100644 --- a/datafusion/functions/src/unicode/common.rs +++ b/datafusion/functions/src/unicode/common.rs @@ -18,16 +18,18 @@ //! Common utilities for implementing unicode functions use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, ByteView, GenericStringArray, Int64Array, - OffsetSizeTrait, StringViewArray, make_view, + Array, ArrayAccessor, ArrayBuilder, ArrayIter, ArrayRef, ByteView, + GenericStringArray, GenericStringBuilder, Int64Array, OffsetSizeTrait, + StringViewArray, StringViewBuilder, make_view, }; use arrow::datatypes::DataType; use arrow_buffer::{NullBuffer, ScalarBuffer}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; -use datafusion_common::exec_err; +use datafusion_common::{Result, exec_err}; use std::cmp::Ordering; +use std::fmt::Write as FmtWrite; use std::ops::Range; use std::sync::Arc; @@ -81,7 +83,7 @@ fn left_right_byte_length(string: &str, n: i64) -> usize { /// General implementation for `left` and `right` functions pub(crate) fn general_left_right( args: &[ArrayRef], -) -> datafusion_common::Result { +) -> Result { let n_array = as_int64_array(&args[1])?; match args[0].data_type() { @@ -110,7 +112,7 @@ fn general_left_right_array< >( string_array: V, n_array: &Int64Array, -) -> datafusion_common::Result { +) -> Result { let iter = ArrayIter::new(string_array); let result = iter .zip(n_array.iter()) @@ -131,7 +133,7 @@ fn general_left_right_array< fn general_left_right_view( string_view_array: &StringViewArray, n_array: &Int64Array, -) -> datafusion_common::Result { +) -> Result { let len = n_array.len(); let views = string_view_array.views(); @@ -181,3 +183,65 @@ fn general_left_right_view( )?; Ok(Arc::new(result) as ArrayRef) } + +/// A uniform write interface over `GenericStringBuilder` and `StringViewBuilder`. +pub(crate) trait StringArrayWriter { + fn write_str(&mut self, s: &str) -> Result<()>; + fn write_char(&mut self, c: char) -> Result<()>; + fn finalize(&mut self); + fn append_null(&mut self); + fn finish(self) -> ArrayRef; +} + +impl StringArrayWriter for GenericStringBuilder { + fn write_str(&mut self, s: &str) -> Result<()> { + Ok(FmtWrite::write_str(self, s)?) + } + fn write_char(&mut self, c: char) -> Result<()> { + Ok(FmtWrite::write_char(self, c)?) + } + fn finalize(&mut self) { + self.append_value(""); + } + fn append_null(&mut self) { + self.append_null(); + } + fn finish(mut self) -> ArrayRef { + ArrayBuilder::finish(&mut self) + } +} + +pub(crate) struct StringViewWriter { + builder: StringViewBuilder, + value_buffer: String, +} + +impl StringViewWriter { + pub(crate) fn new(capacity: usize) -> Self { + Self { + builder: StringViewBuilder::with_capacity(capacity), + value_buffer: String::new(), + } + } +} + +impl StringArrayWriter for StringViewWriter { + fn write_str(&mut self, s: &str) -> Result<()> { + self.value_buffer.push_str(s); + Ok(()) + } + fn write_char(&mut self, c: char) -> Result<()> { + self.value_buffer.push(c); + Ok(()) + } + fn finalize(&mut self) { + self.builder.append_value(&self.value_buffer); + self.value_buffer.clear(); + } + fn append_null(&mut self) { + self.builder.append_null(); + } + fn finish(mut self) -> ArrayRef { + Arc::new(self.builder.finish()) as ArrayRef + } +} diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index 50d15c7d62a65..879367307d5ad 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -16,17 +16,16 @@ // under the License. use std::any::Any; -use std::fmt::Write; -use std::sync::Arc; use DataType::{LargeUtf8, Utf8, Utf8View}; use arrow::array::{ - Array, ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, + ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, OffsetSizeTrait, StringArrayType, StringViewArray, }; use arrow::datatypes::DataType; use unicode_segmentation::UnicodeSegmentation; +use crate::unicode::common::{StringArrayWriter, StringViewWriter}; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; use datafusion_common::{Result, exec_err}; @@ -109,7 +108,11 @@ impl ScalarUDFImpl for LPadFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "lpad") + if arg_types[0] == Utf8View { + Ok(Utf8View) + } else { + utf8_to_str_type(&arg_types[0], "lpad") + } } fn invoke_with_args( @@ -141,74 +144,84 @@ fn lpad(args: &[ArrayRef]) -> Result { } let length_array = as_int64_array(&args[1])?; + let len = args[0].len(); + // 32 bytes per value is a rough estimate for the output data capacity. + let data_capacity = 32 * len; match (args.len(), args[0].data_type()) { - (2, Utf8View) => lpad_impl::<&StringViewArray, &GenericStringArray, T>( + (2, Utf8View) => lpad_impl( &args[0].as_string_view(), length_array, - None, + None::<&StringViewArray>, + StringViewWriter::new(len), + ), + (2, Utf8 | LargeUtf8) => lpad_impl( + &args[0].as_string::(), + length_array, + None::<&GenericStringArray>, + GenericStringBuilder::::with_capacity(len, data_capacity), ), - (2, Utf8 | LargeUtf8) => lpad_impl::< - &GenericStringArray, - &GenericStringArray, - T, - >(&args[0].as_string::(), length_array, None), - (3, Utf8View) => lpad_with_replace::<&StringViewArray, T>( + (3, Utf8View) => lpad_with_replace( &args[0].as_string_view(), length_array, &args[2], + StringViewWriter::new(len), ), - (3, Utf8 | LargeUtf8) => lpad_with_replace::<&GenericStringArray, T>( + (3, Utf8 | LargeUtf8) => lpad_with_replace( &args[0].as_string::(), length_array, &args[2], + GenericStringBuilder::::with_capacity(len, data_capacity), ), (_, _) => unreachable!("lpad"), } } -fn lpad_with_replace<'a, V, T: OffsetSizeTrait>( +fn lpad_with_replace<'a, V, W>( string_array: &V, length_array: &Int64Array, fill_array: &'a ArrayRef, + writer: W, ) -> Result where V: StringArrayType<'a>, + W: StringArrayWriter, { match fill_array.data_type() { - Utf8View => lpad_impl::( + Utf8View => lpad_impl( string_array, length_array, Some(fill_array.as_string_view()), + writer, ), - LargeUtf8 => lpad_impl::, T>( + LargeUtf8 => lpad_impl( string_array, length_array, Some(fill_array.as_string::()), + writer, ), - Utf8 => lpad_impl::, T>( + Utf8 => lpad_impl( string_array, length_array, Some(fill_array.as_string::()), + writer, ), - other => { - exec_err!("Unsupported data type {other:?} for function lpad") - } + other => exec_err!("Unsupported data type {other:?} for function lpad"), } } -fn lpad_impl<'a, V, V2, T>( +fn lpad_impl<'a, V, V2, W>( string_array: &V, length_array: &Int64Array, fill_array: Option, + mut writer: W, ) -> Result where V: StringArrayType<'a>, V2: StringArrayType<'a>, - T: OffsetSizeTrait, + W: StringArrayWriter, { - let array = if let Some(fill_array) = fill_array { - let mut builder: GenericStringBuilder = GenericStringBuilder::new(); + if let Some(fill_array) = fill_array { let mut graphemes_buf = Vec::new(); let mut fill_chars_buf = Vec::new(); @@ -224,7 +237,8 @@ where let length = if length < 0 { 0 } else { length as usize }; if length == 0 { - builder.append_value(""); + // append empty string + writer.finalize(); continue; } @@ -233,22 +247,23 @@ where // so we skip expensive grapheme segmentation. let str_len = string.len(); if length < str_len { - builder.append_value(&string[..length]); + writer.write_str(&string[..length])?; } else if fill.is_empty() { - builder.append_value(string); + writer.write_str(string)?; } else { let pad_len = length - str_len; let fill_len = fill.len(); let full_reps = pad_len / fill_len; let remainder = pad_len % fill_len; for _ in 0..full_reps { - builder.write_str(fill)?; + writer.write_str(fill)?; } if remainder > 0 { - builder.write_str(&fill[..remainder])?; + writer.write_str(&fill[..remainder])?; } - builder.append_value(string); + writer.write_str(string)?; } + writer.finalize(); } else { // Reuse buffers by clearing and refilling graphemes_buf.clear(); @@ -258,26 +273,23 @@ where fill_chars_buf.extend(fill.chars()); if length < graphemes_buf.len() { - builder.append_value(graphemes_buf[..length].concat()); + writer.write_str(&graphemes_buf[..length].concat())?; } else if fill_chars_buf.is_empty() { - builder.append_value(string); + writer.write_str(string)?; } else { for l in 0..length - graphemes_buf.len() { - let c = - *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); - builder.write_char(c)?; + writer + .write_char(fill_chars_buf[l % fill_chars_buf.len()])?; } - builder.append_value(string); + writer.write_str(string)?; } + writer.finalize(); } } else { - builder.append_null(); + writer.append_null(); } } - - builder.finish() } else { - let mut builder: GenericStringBuilder = GenericStringBuilder::new(); let mut graphemes_buf = Vec::new(); for (string, length) in string_array.iter().zip(length_array.iter()) { @@ -288,7 +300,8 @@ where let length = if length < 0 { 0 } else { length as usize }; if length == 0 { - builder.append_value(""); + // append empty string + writer.finalize(); continue; } @@ -296,36 +309,36 @@ where // ASCII fast path: byte length == character length let str_len = string.len(); if length < str_len { - builder.append_value(&string[..length]); + writer.write_str(&string[..length])?; } else { for _ in 0..(length - str_len) { - builder.write_str(" ")?; + writer.write_str(" ")?; } - builder.append_value(string); + writer.write_str(string)?; } + writer.finalize(); } else { // Reuse buffer by clearing and refilling graphemes_buf.clear(); graphemes_buf.extend(string.graphemes(true)); if length < graphemes_buf.len() { - builder.append_value(graphemes_buf[..length].concat()); + writer.write_str(&graphemes_buf[..length].concat())?; } else { for _ in 0..(length - graphemes_buf.len()) { - builder.write_str(" ")?; + writer.write_str(" ")?; } - builder.append_value(string); + writer.write_str(string)?; } + writer.finalize(); } } else { - builder.append_null(); + writer.append_null(); } } + } - builder.finish() - }; - - Ok(Arc::new(array) as ArrayRef) + Ok(writer.finish()) } #[cfg(test)] @@ -333,8 +346,8 @@ mod tests { use crate::unicode::lpad::LPadFunc; use crate::utils::test::test_function; - use arrow::array::{Array, LargeStringArray, StringArray}; - use arrow::datatypes::DataType::{LargeUtf8, Utf8}; + use arrow::array::{Array, LargeStringArray, StringArray, StringViewArray}; + use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; use datafusion_common::{Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarUDFImpl}; @@ -373,8 +386,8 @@ mod tests { ], $EXPECTED, &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); }; @@ -469,8 +482,8 @@ mod tests { ], $EXPECTED, &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); // utf8view, largeutf8 test_function!( @@ -482,8 +495,8 @@ mod tests { ], $EXPECTED, &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); // utf8view, utf8view test_function!( @@ -495,8 +508,8 @@ mod tests { ], $EXPECTED, &str, - Utf8, - StringArray + Utf8View, + StringViewArray ); }; } diff --git a/datafusion/sqllogictest/test_files/string/string_view.slt b/datafusion/sqllogictest/test_files/string/string_view.slt index 13b0aba653efb..1acdd397c9786 100644 --- a/datafusion/sqllogictest/test_files/string/string_view.slt +++ b/datafusion/sqllogictest/test_files/string/string_view.slt @@ -733,6 +733,38 @@ logical_plan 01)Projection: lpad(test.column1_utf8view, Int64(12), test.column2_utf8view) AS c1 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] +# lpad return type should match input type +query T +SELECT arrow_typeof(lpad('Dolly', 10, 'hello')) +---- +Utf8 + +query T +SELECT arrow_typeof(lpad(arrow_cast('Dolly', 'LargeUtf8'), 10, arrow_cast('hello', 'LargeUtf8'))) +---- +LargeUtf8 + +query T +SELECT arrow_typeof(lpad(arrow_cast('Dolly', 'Utf8View'), 10, arrow_cast('hello', 'Utf8View'))) +---- +Utf8View + +# lpad with Utf8View input should produce correct output +query T +SELECT lpad(arrow_cast('hi', 'Utf8View'), 5, arrow_cast('xy', 'Utf8View')) +---- +xyxhi + +query T +SELECT lpad(arrow_cast('josé', 'Utf8View'), 10, arrow_cast('éñ', 'Utf8View')) +---- +éñéñéñjosé + +query T +SELECT lpad(arrow_cast('hi', 'Utf8View'), 5) +---- + hi + ## Ensure no casts for OCTET_LENGTH query TT EXPLAIN SELECT From 6ca17c84fff889c1e4d777cbd21375fa9d66325a Mon Sep 17 00:00:00 2001 From: Tim-53 <82676248+Tim-53@users.noreply.github.com> Date: Tue, 3 Mar 2026 04:46:17 +0100 Subject: [PATCH 2/2] refactor: explicitly write empty string before finalize --- datafusion/functions/src/unicode/lpad.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index 879367307d5ad..7b77b3a5ca209 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -237,7 +237,7 @@ where let length = if length < 0 { 0 } else { length as usize }; if length == 0 { - // append empty string + writer.write_str("")?; writer.finalize(); continue; } @@ -300,7 +300,7 @@ where let length = if length < 0 { 0 } else { length as usize }; if length == 0 { - // append empty string + writer.write_str("")?; writer.finalize(); continue; }