diff --git a/datafusion/functions/benches/pad.rs b/datafusion/functions/benches/pad.rs index 99f177c035597..0f856f0fef384 100644 --- a/datafusion/functions/benches/pad.rs +++ b/datafusion/functions/benches/pad.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. -use arrow::array::{ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray}; +use arrow::array::{ + ArrowPrimitiveType, GenericStringBuilder, OffsetSizeTrait, PrimitiveArray, + StringViewBuilder, +}; use arrow::datatypes::{DataType, Field, Int64Type}; use arrow::util::bench_util::{ create_string_array_with_len, create_string_view_array_with_len, @@ -30,6 +33,51 @@ use std::hint::black_box; use std::sync::Arc; use std::time::Duration; +const UNICODE_STRINGS: &[&str] = &[ + "Ñandú", + "Íslensku", + "Þjóðarinnar", + "Ελληνική", + "Иванович", + "データフュージョン", + "José García", + "Ölçü bïrïmï", + "Ÿéšṱëṟḏàÿ", + "Ährenstraße", +]; + +fn create_unicode_string_array( + size: usize, + null_density: f32, +) -> arrow::array::GenericStringArray { + let mut rng = rand::rng(); + let mut builder = GenericStringBuilder::::new(); + for i in 0..size { + if rng.random::() < null_density { + builder.append_null(); + } else { + builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]); + } + } + builder.finish() +} + +fn create_unicode_string_view_array( + size: usize, + null_density: f32, +) -> arrow::array::StringViewArray { + let mut rng = rand::rng(); + let mut builder = StringViewBuilder::with_capacity(size); + for i in 0..size { + if rng.random::() < null_density { + builder.append_null(); + } else { + builder.append_value(UNICODE_STRINGS[i % UNICODE_STRINGS.len()]); + } + } + builder.finish() +} + struct Filter { dist: Dist, } @@ -67,6 +115,34 @@ where .collect() } +/// Create args for pad benchmark with Unicode strings +fn create_unicode_pad_args( + size: usize, + target_len: usize, + use_string_view: bool, +) -> Vec { + let length_array = + Arc::new(create_primitive_array::(size, 0.0, target_len)); + + if use_string_view { + let string_array = create_unicode_string_view_array(size, 0.1); + let fill_array = create_unicode_string_view_array(size, 0.1); + vec![ + ColumnarValue::Array(Arc::new(string_array)), + ColumnarValue::Array(length_array), + ColumnarValue::Array(Arc::new(fill_array)), + ] + } else { + let string_array = create_unicode_string_array::(size, 0.1); + let fill_array = create_unicode_string_array::(size, 0.1); + vec![ + ColumnarValue::Array(Arc::new(string_array)), + ColumnarValue::Array(length_array), + ColumnarValue::Array(Arc::new(fill_array)), + ] + } +} + /// Create args for pad benchmark fn create_pad_args( size: usize, @@ -208,6 +284,58 @@ fn criterion_benchmark(c: &mut Criterion) { }, ); + // Utf8 type with Unicode strings + let args = create_unicode_pad_args(size, 20, false); + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + group.bench_function( + format!("lpad utf8 unicode [size={size}, target=20]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + })) + }) + }, + ); + + // StringView type with Unicode strings + let args = create_unicode_pad_args(size, 20, true); + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + group.bench_function( + format!("lpad stringview unicode [size={size}, target=20]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8View, true).into(), + config_options: Arc::clone(&config_options), + })) + }) + }, + ); + group.finish(); } @@ -322,6 +450,58 @@ fn criterion_benchmark(c: &mut Criterion) { }, ); + // Utf8 type with Unicode strings + let args = create_unicode_pad_args(size, 20, false); + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + group.bench_function( + format!("rpad utf8 unicode [size={size}, target=20]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + })) + }) + }, + ); + + // StringView type with Unicode strings + let args = create_unicode_pad_args(size, 20, true); + let arg_fields = args + .iter() + .enumerate() + .map(|(idx, arg)| { + Field::new(format!("arg_{idx}"), arg.data_type(), true).into() + }) + .collect::>(); + + group.bench_function( + format!("rpad stringview unicode [size={size}, target=20]"), + |b| { + b.iter(|| { + let args_cloned = args.clone(); + black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + arg_fields: arg_fields.clone(), + number_rows: size, + return_field: Field::new("f", DataType::Utf8View, true).into(), + config_options: Arc::clone(&config_options), + })) + }) + }, + ); + group.finish(); } } diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index a892c0adf58de..2e650832749dd 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -49,7 +49,10 @@ use datafusion_macros::user_doc; +---------------------------------------------+ ```"#, standard_argument(name = "str", prefix = "String"), - argument(name = "n", description = "String length to pad to."), + argument( + name = "n", + description = "String length to pad to. If the input string is longer than this length, it is truncated (on the right)." + ), argument( name = "padding_str", description = "Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._" @@ -225,24 +228,47 @@ where continue; } - // Reuse buffers by clearing and refilling - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); - - fill_chars_buf.clear(); - fill_chars_buf.extend(fill.chars()); - - if length < graphemes_buf.len() { - builder.append_value(graphemes_buf[..length].concat()); - } else if fill_chars_buf.is_empty() { - builder.append_value(string); + if string.is_ascii() && fill.is_ascii() { + // ASCII fast path: byte length == character length, + // so we skip expensive grapheme segmentation. + let str_len = string.len(); + if length < str_len { + builder.append_value(&string[..length]); + } else if fill.is_empty() { + builder.append_value(string); + } else { + let pad_len = length - str_len; + let fill_len = fill.len(); + let full_reps = pad_len / fill_len; + let remainder = pad_len % fill_len; + for _ in 0..full_reps { + builder.write_str(fill)?; + } + if remainder > 0 { + builder.write_str(&fill[..remainder])?; + } + builder.append_value(string); + } } else { - for l in 0..length - graphemes_buf.len() { - let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); - builder.write_char(c)?; + // Reuse buffers by clearing and refilling + graphemes_buf.clear(); + graphemes_buf.extend(string.graphemes(true)); + + fill_chars_buf.clear(); + fill_chars_buf.extend(fill.chars()); + + if length < graphemes_buf.len() { + builder.append_value(graphemes_buf[..length].concat()); + } else if fill_chars_buf.is_empty() { + builder.append_value(string); + } else { + for l in 0..length - graphemes_buf.len() { + let c = + *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); + builder.write_char(c)?; + } + builder.append_value(string); } - builder.write_str(string)?; - builder.append_value(""); } } else { builder.append_null(); @@ -266,17 +292,28 @@ where continue; } - // Reuse buffer by clearing and refilling - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); - - if length < graphemes_buf.len() { - builder.append_value(graphemes_buf[..length].concat()); + if string.is_ascii() { + // ASCII fast path: byte length == character length + let str_len = string.len(); + if length < str_len { + builder.append_value(&string[..length]); + } else { + builder.write_str(" ".repeat(length - str_len).as_str())?; + builder.append_value(string); + } } else { - builder - .write_str(" ".repeat(length - graphemes_buf.len()).as_str())?; - builder.write_str(string)?; - builder.append_value(""); + // Reuse buffer by clearing and refilling + graphemes_buf.clear(); + graphemes_buf.extend(string.graphemes(true)); + + if length < graphemes_buf.len() { + builder.append_value(graphemes_buf[..length].concat()); + } else { + builder.write_str( + " ".repeat(length - graphemes_buf.len()).as_str(), + )?; + builder.append_value(string); + } } } else { builder.append_null(); @@ -523,6 +560,11 @@ mod tests { None, Ok(None) ); + test_lpad!( + Some("hello".into()), + ScalarValue::Int64(Some(2i64)), + Ok(Some("he")) + ); test_lpad!( Some("josé".into()), ScalarValue::Int64(Some(10i64)), diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 14f517faf8cf1..e8ea83cd9e532 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -48,7 +48,10 @@ use unicode_segmentation::UnicodeSegmentation; +-----------------------------------------------+ ```"#, standard_argument(name = "str", prefix = "String"), - argument(name = "n", description = "String length to pad to."), + argument( + name = "n", + description = "String length to pad to. If the input string is longer than this length, it is truncated." + ), argument( name = "padding_str", description = "String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._" @@ -203,7 +206,8 @@ fn rpad( } } -/// Extends the string to length 'length' by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. +/// Extends the string to length 'length' by appending the characters fill (a space by default). +/// If the string is already longer than length then it is truncated (on the right). /// rpad('hi', 5, 'xy') = 'hixyx' fn rpad_impl<'a, StringArrType, FillArrType, StringArrayLen>( string_array: &StringArrType, @@ -234,6 +238,17 @@ where let length = if length < 0 { 0 } else { length as usize }; if length == 0 { builder.append_value(""); + } else if string.is_ascii() { + // ASCII fast path: byte length == character length + let str_len = string.len(); + if length < str_len { + builder.append_value(&string[..length]); + } else { + builder.write_str(string)?; + builder.append_value( + " ".repeat(length - str_len).as_str(), + ); + } } else { // Reuse buffer by clearing and refilling graphemes_buf.clear(); @@ -244,10 +259,9 @@ where .append_value(graphemes_buf[..length].concat()); } else { builder.write_str(string)?; - builder.write_str( + builder.append_value( &" ".repeat(length - graphemes_buf.len()), - )?; - builder.append_value(""); + ); } } } @@ -273,27 +287,49 @@ where ); } let length = if length < 0 { 0 } else { length as usize }; - // Reuse buffer by clearing and refilling - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); - - if length < graphemes_buf.len() { - builder - .append_value(graphemes_buf[..length].concat()); - } else if fill.is_empty() { - builder.append_value(string); + if string.is_ascii() && fill.is_ascii() { + // ASCII fast path: byte length == character length, + // so we skip expensive grapheme segmentation. + let str_len = string.len(); + if length < str_len { + builder.append_value(&string[..length]); + } else if fill.is_empty() { + builder.append_value(string); + } else { + let pad_len = length - str_len; + let fill_len = fill.len(); + let full_reps = pad_len / fill_len; + let remainder = pad_len % fill_len; + builder.write_str(string)?; + for _ in 0..full_reps { + builder.write_str(fill)?; + } + builder.append_value(&fill[..remainder]); + } } else { - builder.write_str(string)?; - // Reuse fill_chars_buf by clearing and refilling - fill_chars_buf.clear(); - fill_chars_buf.extend(fill.chars()); - for l in 0..length - graphemes_buf.len() { - let c = *fill_chars_buf - .get(l % fill_chars_buf.len()) - .unwrap(); - builder.write_char(c)?; + // Reuse buffer by clearing and refilling + graphemes_buf.clear(); + graphemes_buf.extend(string.graphemes(true)); + + if length < graphemes_buf.len() { + builder.append_value( + graphemes_buf[..length].concat(), + ); + } else if fill.is_empty() { + builder.append_value(string); + } else { + builder.write_str(string)?; + // Reuse fill_chars_buf by clearing and refilling + fill_chars_buf.clear(); + fill_chars_buf.extend(fill.chars()); + for l in 0..length - graphemes_buf.len() { + let c = *fill_chars_buf + .get(l % fill_chars_buf.len()) + .unwrap(); + builder.write_char(c)?; + } + builder.append_value(""); } - builder.append_value(""); } } _ => builder.append_null(), @@ -459,6 +495,17 @@ mod tests { Utf8, StringArray ); + test_function!( + RPadFunc::new(), + vec![ + ColumnarValue::Scalar(ScalarValue::from("hello")), + ColumnarValue::Scalar(ScalarValue::from(2i64)), + ], + Ok(Some("he")), + &str, + Utf8, + StringArray + ); test_function!( RPadFunc::new(), vec![ diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index e09c4cb7cbc32..3c426a39fb9e0 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1592,7 +1592,7 @@ lpad(str, n[, padding_str]) #### Arguments - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. -- **n**: String length to pad to. +- **n**: String length to pad to. If the input string is longer than this length, it is truncated (on the right). - **padding_str**: Optional string expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ #### Example @@ -1820,7 +1820,7 @@ rpad(str, n[, padding_str]) #### Arguments - **str**: String expression to operate on. Can be a constant, column, or function, and any combination of operators. -- **n**: String length to pad to. +- **n**: String length to pad to. If the input string is longer than this length, it is truncated. - **padding_str**: String expression to pad with. Can be a constant, column, or function, and any combination of string operators. _Default is a space._ #### Example