Skip to content

Commit a9b4221

Browse files
grtlr and alamb authored
Implement ArrayBuilder for UnionBuilder (apache#8169)
# Which issue does this PR close?

We generally require a GitHub issue to be filed for all bug fixes and enhancements, and this helps us generate change logs for our releases. You can link an issue to this PR using the GitHub syntax.

- Closes apache#8033.

# What changes are included in this PR?

* Make `FieldDataValues: Send + Sync`
* Derive `Default` for `UnionBuilder`
* Implement `build_cloned` for `UnionBuilder`

# Are these changes tested?

Yes.

# Are there any user-facing changes?

These changes should be backwards compatible and don't change the existing public API.

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent be0ede7 commit a9b4221

File tree

1 file changed

+179
-5
lines changed

1 file changed

+179
-5
lines changed

arrow-array/src/builder/union_builder.rs

Lines changed: 179 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@
1616
// under the License.
1717

1818
use crate::builder::buffer_builder::{Int32BufferBuilder, Int8BufferBuilder};
19-
use crate::builder::BufferBuilder;
20-
use crate::{make_array, ArrowPrimitiveType, UnionArray};
19+
use crate::builder::{ArrayBuilder, BufferBuilder};
20+
use crate::{make_array, ArrayRef, ArrowPrimitiveType, UnionArray};
2121
use arrow_buffer::NullBufferBuilder;
22-
use arrow_buffer::{ArrowNativeType, Buffer};
22+
use arrow_buffer::{ArrowNativeType, Buffer, ScalarBuffer};
2323
use arrow_data::ArrayDataBuilder;
2424
use arrow_schema::{ArrowError, DataType, Field};
2525
use std::any::Any;
@@ -42,12 +42,14 @@ struct FieldData {
4242
}
4343

4444
/// A type-erased [`BufferBuilder`] used by [`FieldData`]
45-
trait FieldDataValues: std::fmt::Debug {
45+
trait FieldDataValues: std::fmt::Debug + Send + Sync {
4646
fn as_mut_any(&mut self) -> &mut dyn Any;
4747

4848
fn append_null(&mut self);
4949

5050
fn finish(&mut self) -> Buffer;
51+
52+
fn finish_cloned(&self) -> Buffer;
5153
}
5254

5355
impl<T: ArrowNativeType> FieldDataValues for BufferBuilder<T> {
@@ -62,6 +64,10 @@ impl<T: ArrowNativeType> FieldDataValues for BufferBuilder<T> {
6264
fn finish(&mut self) -> Buffer {
6365
self.finish()
6466
}
67+
68+
fn finish_cloned(&self) -> Buffer {
69+
Buffer::from_slice_ref(self.as_slice())
70+
}
6571
}
6672

6773
impl FieldData {
@@ -138,7 +144,7 @@ impl FieldData {
138144
/// assert_eq!(union.value_offset(1), 1);
139145
/// assert_eq!(union.value_offset(2), 2);
140146
/// ```
141-
#[derive(Debug)]
147+
#[derive(Debug, Default)]
142148
pub struct UnionBuilder {
143149
/// The current number of slots in the array
144150
len: usize,
@@ -310,4 +316,172 @@ impl UnionBuilder {
310316
children,
311317
)
312318
}
319+
320+
/// Builds this builder creating a new `UnionArray` without consuming the builder.
321+
///
322+
/// This is used for the `finish_cloned` implementation in `ArrayBuilder`.
323+
fn build_cloned(&self) -> Result<UnionArray, ArrowError> {
324+
let mut children = Vec::with_capacity(self.fields.len());
325+
let union_fields: Vec<_> = self
326+
.fields
327+
.iter()
328+
.map(|(name, field_data)| {
329+
let FieldData {
330+
type_id,
331+
data_type,
332+
values_buffer,
333+
slots,
334+
null_buffer_builder,
335+
} = field_data;
336+
337+
let array_ref = make_array(unsafe {
338+
ArrayDataBuilder::new(data_type.clone())
339+
.add_buffer(values_buffer.finish_cloned())
340+
.len(*slots)
341+
.nulls(null_buffer_builder.finish_cloned())
342+
.build_unchecked()
343+
});
344+
children.push(array_ref);
345+
(
346+
*type_id,
347+
Arc::new(Field::new(name.clone(), data_type.clone(), false)),
348+
)
349+
})
350+
.collect();
351+
UnionArray::try_new(
352+
union_fields.into_iter().collect(),
353+
ScalarBuffer::from(self.type_id_builder.as_slice().to_vec()),
354+
self.value_offset_builder
355+
.as_ref()
356+
.map(|builder| ScalarBuffer::from(builder.as_slice().to_vec())),
357+
children,
358+
)
359+
}
360+
}
361+
362+
impl ArrayBuilder for UnionBuilder {
363+
/// Returns the number of array slots in the builder
364+
fn len(&self) -> usize {
365+
self.len
366+
}
367+
368+
/// Builds the array
369+
fn finish(&mut self) -> ArrayRef {
370+
// Even simpler - just move the builder using mem::take and replace with default
371+
let builder = std::mem::take(self);
372+
373+
// Since UnionBuilder controls all invariants, this should never fail
374+
Arc::new(builder.build().unwrap())
375+
}
376+
377+
/// Builds the array without resetting the underlying builder
378+
fn finish_cloned(&self) -> ArrayRef {
379+
// We construct the UnionArray carefully to ensure try_new cannot fail.
380+
// Since UnionBuilder controls all the invariants, this should never panic.
381+
Arc::new(self.build_cloned().unwrap_or_else(|err| {
382+
panic!("UnionBuilder::build_cloned failed unexpectedly: {}", err)
383+
}))
384+
}
385+
386+
/// Returns the builder as a non-mutable `Any` reference
387+
fn as_any(&self) -> &dyn Any {
388+
self
389+
}
390+
391+
/// Returns the builder as a mutable `Any` reference
392+
fn as_any_mut(&mut self) -> &mut dyn Any {
393+
self
394+
}
395+
396+
/// Returns the boxed builder as a box of `Any`
397+
fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
398+
self
399+
}
400+
}
401+
402+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::Array;
    use crate::cast::AsArray;
    use crate::types::{Float64Type, Int32Type};

    /// Drives a dense `UnionBuilder` through `len`, `finish_cloned` and
    /// `finish`, checking the produced arrays after each step.
    #[test]
    fn test_union_builder_array_builder_trait() {
        let mut dense = UnionBuilder::new_dense();

        dense.append::<Int32Type>("a", 1).unwrap();
        dense.append::<Float64Type>("b", 3.0).unwrap();
        dense.append::<Int32Type>("a", 4).unwrap();
        assert_eq!(dense.len(), 3);

        // Non-destructive snapshot of the three slots appended so far.
        let snapshot = dense.finish_cloned();
        assert_eq!(snapshot.len(), 3);
        {
            let union = snapshot.as_any().downcast_ref::<UnionArray>().unwrap();
            assert_eq!(union.type_ids(), &[0, 1, 0]);
            assert_eq!(union.offsets().unwrap().as_ref(), &[0, 0, 1]);

            let ints = union.child(0).as_primitive::<Int32Type>();
            let floats = union.child(1).as_primitive::<Float64Type>();
            for (slot, expected) in [1_i32, 4].iter().enumerate() {
                assert_eq!(ints.value(slot), *expected);
            }
            assert_eq!(floats.value(0), 3.0);
        }

        // The snapshot must not have disturbed the builder.
        dense.append::<Float64Type>("b", 5.0).unwrap();
        assert_eq!(dense.len(), 4);

        // Destructive finish now sees all four slots.
        let finished = dense.finish();
        assert_eq!(finished.len(), 4);
        {
            let union = finished.as_any().downcast_ref::<UnionArray>().unwrap();
            assert_eq!(union.type_ids(), &[0, 1, 0, 1]);
            assert_eq!(union.offsets().unwrap().as_ref(), &[0, 0, 1, 1]);

            let ints = union.child(0).as_primitive::<Int32Type>();
            let floats = union.child(1).as_primitive::<Float64Type>();
            for (slot, expected) in [1_i32, 4].iter().enumerate() {
                assert_eq!(ints.value(slot), *expected);
            }
            for (slot, expected) in [3.0_f64, 5.0].iter().enumerate() {
                assert_eq!(floats.value(slot), *expected);
            }
        }
    }

    /// Uses a sparse `UnionBuilder` behind `Box<dyn ArrayBuilder>` and checks
    /// the array it produces.
    #[test]
    fn test_union_builder_type_erased() {
        let mut boxed: Vec<Box<dyn ArrayBuilder>> = vec![Box::new(UnionBuilder::new_sparse())];

        // Recover the concrete builder just long enough to append values.
        {
            let sparse = boxed[0]
                .as_any_mut()
                .downcast_mut::<UnionBuilder>()
                .unwrap();
            sparse.append::<Int32Type>("x", 10).unwrap();
            sparse.append::<Float64Type>("y", 20.0).unwrap();
        }
        assert_eq!(boxed[0].len(), 2);

        let mut finished = Vec::with_capacity(boxed.len());
        for mut erased in boxed {
            finished.push(erased.finish());
        }
        assert_eq!(finished[0].len(), 2);

        let union = finished[0]
            .as_any()
            .downcast_ref::<UnionArray>()
            .unwrap();
        assert_eq!(union.type_ids(), &[0, 1]);
        // A sparse union carries no offsets buffer.
        assert!(union.offsets().is_none());

        let ints = union.child(0).as_primitive::<Int32Type>();
        let floats = union.child(1).as_primitive::<Float64Type>();
        assert_eq!(ints.value(0), 10);
        // In the sparse layout the slot owned by the other field is null.
        assert!(ints.is_null(1));
        assert!(floats.is_null(0));
        assert_eq!(floats.value(1), 20.0);
    }
}

0 commit comments

Comments
 (0)