Skip to content

Commit c71edce

Browse files
authored
[Variant] VariantArrayBuilder tracks only offsets (apache#8193)
# Which issue does this PR close? - Closes apache#8192 # Rationale for this change Tracking lengths is redundant because that's just the difference between adjacent offsets. It becomes even easier if we eventually start adding the views eagerly instead of storing up offsets, because we only need to remember the difference between starting and final offset. # What changes are included in this PR? Update the algorithm that creates the binary view to compute the offsets. # Are these changes tested? Existing tests cover this change. # Are there any user-facing changes? No.
1 parent a9b4221 commit c71edce

File tree

1 file changed

+44
-53
lines changed

1 file changed

+44
-53
lines changed

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 44 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -74,12 +74,12 @@ pub struct VariantArrayBuilder {
7474
nulls: NullBufferBuilder,
7575
/// buffer for all the metadata
7676
metadata_buffer: Vec<u8>,
77-
/// (offset, len) pairs for locations of metadata in the buffer
78-
metadata_locations: Vec<(usize, usize)>,
77+
/// ending offset for each serialized metadata dictionary in the buffer
78+
metadata_offsets: Vec<usize>,
7979
/// buffer for values
8080
value_buffer: Vec<u8>,
81-
/// (offset, len) pairs for locations of values in the buffer
82-
value_locations: Vec<(usize, usize)>,
81+
/// ending offset for each serialized variant value in the buffer
82+
value_offsets: Vec<usize>,
8383
/// The fields of the final `StructArray`
8484
///
8585
/// TODO: 1) Add extension type metadata
@@ -96,9 +96,9 @@ impl VariantArrayBuilder {
9696
Self {
9797
nulls: NullBufferBuilder::new(row_capacity),
9898
metadata_buffer: Vec::new(), // todo allocation capacity
99-
metadata_locations: Vec::with_capacity(row_capacity),
99+
metadata_offsets: Vec::with_capacity(row_capacity),
100100
value_buffer: Vec::new(),
101-
value_locations: Vec::with_capacity(row_capacity),
101+
value_offsets: Vec::with_capacity(row_capacity),
102102
fields: Fields::from(vec![metadata_field, value_field]),
103103
}
104104
}
@@ -108,15 +108,15 @@ impl VariantArrayBuilder {
108108
let Self {
109109
mut nulls,
110110
metadata_buffer,
111-
metadata_locations,
111+
metadata_offsets,
112112
value_buffer,
113-
value_locations,
113+
value_offsets,
114114
fields,
115115
} = self;
116116

117-
let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_locations);
117+
let metadata_array = binary_view_array_from_buffers(metadata_buffer, metadata_offsets);
118118

119-
let value_array = binary_view_array_from_buffers(value_buffer, value_locations);
119+
let value_array = binary_view_array_from_buffers(value_buffer, value_offsets);
120120

121121
// The build the final struct array
122122
let inner = StructArray::new(
@@ -136,13 +136,8 @@ impl VariantArrayBuilder {
136136
pub fn append_null(&mut self) {
137137
self.nulls.append_null();
138138
// The subfields are expected to be non-nullable according to the parquet variant spec.
139-
let metadata_offset = self.metadata_buffer.len();
140-
let metadata_length = 0;
141-
self.metadata_locations
142-
.push((metadata_offset, metadata_length));
143-
let value_offset = self.value_buffer.len();
144-
let value_length = 0;
145-
self.value_locations.push((value_offset, value_length));
139+
self.metadata_offsets.push(self.metadata_buffer.len());
140+
self.value_offsets.push(self.value_buffer.len());
146141
}
147142

148143
/// Append the [`Variant`] to the builder as the next row
@@ -186,10 +181,7 @@ impl VariantArrayBuilder {
186181
/// assert!(variant_array.value(1).as_object().is_some());
187182
/// ```
188183
pub fn variant_builder(&mut self) -> VariantArrayVariantBuilder<'_> {
189-
// append directly into the metadata and value buffers
190-
let metadata_buffer = std::mem::take(&mut self.metadata_buffer);
191-
let value_buffer = std::mem::take(&mut self.value_buffer);
192-
VariantArrayVariantBuilder::new(self, metadata_buffer, value_buffer)
184+
VariantArrayVariantBuilder::new(self)
193185
}
194186
}
195187

@@ -236,11 +228,10 @@ impl<'a> VariantArrayVariantBuilder<'a> {
236228
///
237229
/// Note this is not public as this is a structure that is logically
238230
/// part of the [`VariantArrayBuilder`] and relies on its internal structure
239-
fn new(
240-
array_builder: &'a mut VariantArrayBuilder,
241-
metadata_buffer: Vec<u8>,
242-
value_buffer: Vec<u8>,
243-
) -> Self {
231+
fn new(array_builder: &'a mut VariantArrayBuilder) -> Self {
232+
// append directly into the metadata and value buffers
233+
let metadata_buffer = std::mem::take(&mut array_builder.metadata_buffer);
234+
let value_buffer = std::mem::take(&mut array_builder.value_buffer);
244235
let metadata_offset = metadata_buffer.len();
245236
let value_offset = value_buffer.len();
246237
VariantArrayVariantBuilder {
@@ -276,27 +267,25 @@ impl<'a> VariantArrayVariantBuilder<'a> {
276267
let (metadata_buffer, value_buffer) = std::mem::take(&mut self.variant_builder).finish();
277268

278269
// Sanity Check: if the buffers got smaller, something went wrong (previous data was lost)
279-
let metadata_len = metadata_buffer
280-
.len()
281-
.checked_sub(metadata_offset)
282-
.expect("metadata length decreased unexpectedly");
283-
let value_len = value_buffer
284-
.len()
285-
.checked_sub(value_offset)
286-
.expect("value length decreased unexpectedly");
270+
assert!(
271+
metadata_offset <= metadata_buffer.len(),
272+
"metadata length decreased unexpectedly"
273+
);
274+
assert!(
275+
value_offset <= value_buffer.len(),
276+
"value length decreased unexpectedly"
277+
);
287278

288279
// commit the changes by putting the
289-
// offsets and lengths into the parent array builder.
290-
self.array_builder
291-
.metadata_locations
292-
.push((metadata_offset, metadata_len));
293-
self.array_builder
294-
.value_locations
295-
.push((value_offset, value_len));
296-
self.array_builder.nulls.append_non_null();
280+
// ending offsets into the parent array builder.
281+
let builder = &mut self.array_builder;
282+
builder.metadata_offsets.push(metadata_buffer.len());
283+
builder.value_offsets.push(value_buffer.len());
284+
builder.nulls.append_non_null();
285+
297286
// put the buffers back into the array builder
298-
self.array_builder.metadata_buffer = metadata_buffer;
299-
self.array_builder.value_buffer = value_buffer;
287+
builder.metadata_buffer = metadata_buffer;
288+
builder.value_buffer = value_buffer;
300289
}
301290
}
302291

@@ -338,19 +327,21 @@ impl Drop for VariantArrayVariantBuilder<'_> {
338327
}
339328
}
340329

341-
fn binary_view_array_from_buffers(
342-
buffer: Vec<u8>,
343-
locations: Vec<(usize, usize)>,
344-
) -> BinaryViewArray {
345-
let mut builder = BinaryViewBuilder::with_capacity(locations.len());
330+
fn binary_view_array_from_buffers(buffer: Vec<u8>, offsets: Vec<usize>) -> BinaryViewArray {
331+
// All offsets are less than or equal to the buffer length, so we can safely cast all offsets
332+
// inside the loop below, as long as the buffer length fits in u32.
333+
u32::try_from(buffer.len()).expect("buffer length should fit in u32");
334+
335+
let mut builder = BinaryViewBuilder::with_capacity(offsets.len());
346336
let block = builder.append_block(buffer.into());
347337
// TODO this can be much faster if it creates the views directly during append
348-
for (offset, length) in locations {
349-
let offset = offset.try_into().expect("offset should fit in u32");
350-
let length = length.try_into().expect("length should fit in u32");
338+
let mut start = 0;
339+
for end in offsets {
340+
let end = end as u32; // Safe cast: validated max offset fits in u32 above
351341
builder
352-
.try_append_view(block, offset, length)
342+
.try_append_view(block, start, end - start)
353343
.expect("Failed to append view");
344+
start = end;
354345
}
355346
builder.finish()
356347
}

0 commit comments

Comments
 (0)