Skip to content

Commit c76dbbb

Browse files
author
Yicong Huang
committed
fix: write offset buffer for empty variable-width vectors per Arrow spec
1 parent 4a7fb4e commit c76dbbb

File tree

3 files changed

+66
-4
lines changed

3 files changed

+66
-4
lines changed

vector/src/main/java/org/apache/arrow/vector/BaseLargeVariableWidthVector.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -373,14 +373,26 @@ private void setReaderAndWriterIndex() {
373373
valueBuffer.readerIndex(0);
374374
if (valueCount == 0) {
375375
validityBuffer.writerIndex(0);
376-
offsetBuffer.writerIndex(0);
377376
valueBuffer.writerIndex(0);
378377
} else {
379378
final long lastDataOffset = getStartOffset(valueCount);
380379
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
381-
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
382380
valueBuffer.writerIndex(lastDataOffset);
383381
}
382+
// IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
383+
// Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
384+
// in other libraries. According to Arrow spec, we should still output the offset buffer which
385+
// is [0].
386+
final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
387+
if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
388+
ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
389+
if (offsetBuffer.capacity() > 0) {
390+
newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
391+
}
392+
offsetBuffer.getReferenceManager().release();
393+
offsetBuffer = newOffsetBuffer;
394+
}
395+
offsetBuffer.writerIndex(requiredOffsetBufferSize);
384396
}
385397

386398
/** Same as {@link #allocateNewSafe()}. */

vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -389,14 +389,26 @@ private void setReaderAndWriterIndex() {
389389
valueBuffer.readerIndex(0);
390390
if (valueCount == 0) {
391391
validityBuffer.writerIndex(0);
392-
offsetBuffer.writerIndex(0);
393392
valueBuffer.writerIndex(0);
394393
} else {
395394
final int lastDataOffset = getStartOffset(valueCount);
396395
validityBuffer.writerIndex(BitVectorHelper.getValidityBufferSizeFromCount(valueCount));
397-
offsetBuffer.writerIndex((long) (valueCount + 1) * OFFSET_WIDTH);
398396
valueBuffer.writerIndex(lastDataOffset);
399397
}
398+
// IPC serializer will determine readable bytes based on `readerIndex` and `writerIndex`.
399+
// Both are set to 0 means 0 bytes are written to the IPC stream which will crash IPC readers
400+
// in other libraries. According to Arrow spec, we should still output the offset buffer which
401+
// is [0].
402+
final long requiredOffsetBufferSize = (long) (valueCount + 1) * OFFSET_WIDTH;
403+
if (offsetBuffer.capacity() < requiredOffsetBufferSize) {
404+
ArrowBuf newOffsetBuffer = allocateOffsetBuffer(requiredOffsetBufferSize);
405+
if (offsetBuffer.capacity() > 0) {
406+
newOffsetBuffer.setBytes(0, offsetBuffer, 0, offsetBuffer.capacity());
407+
}
408+
offsetBuffer.getReferenceManager().release();
409+
offsetBuffer = newOffsetBuffer;
410+
}
411+
offsetBuffer.writerIndex(requiredOffsetBufferSize);
400412
}
401413

402414
/** Same as {@link #allocateNewSafe()}. */

vector/src/test/java/org/apache/arrow/vector/TestValueVector.java

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3940,4 +3940,42 @@ public void testVectorLoadUnloadOnNonVariadicVectors() {
39403940
}
39413941
}
39423942
}
3943+
3944+
@Test
3945+
public void testEmptyVarCharOffsetBuffer() {
3946+
// Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0)
3947+
// even when valueCount is 0, per Arrow specification.
3948+
try (VarCharVector vector = newVarCharVector("varchar", allocator)) {
3949+
vector.allocateNew();
3950+
vector.setValueCount(0);
3951+
3952+
List<ArrowBuf> buffers = vector.getFieldBuffers();
3953+
// buffers: [validity, offset, data]
3954+
assertTrue(
3955+
buffers.get(1).readableBytes() >= BaseVariableWidthVector.OFFSET_WIDTH,
3956+
"Offset buffer should have at least "
3957+
+ BaseVariableWidthVector.OFFSET_WIDTH
3958+
+ " bytes for offset[0]");
3959+
assertEquals(0, vector.getOffsetBuffer().getInt(0));
3960+
}
3961+
}
3962+
3963+
@Test
3964+
public void testEmptyLargeVarCharOffsetBuffer() {
3965+
// Validates that offset buffer has at least OFFSET_WIDTH bytes (for offset[0]=0)
3966+
// even when valueCount is 0, per Arrow specification.
3967+
try (LargeVarCharVector vector = new LargeVarCharVector("largevarchar", allocator)) {
3968+
vector.allocateNew();
3969+
vector.setValueCount(0);
3970+
3971+
List<ArrowBuf> buffers = vector.getFieldBuffers();
3972+
// buffers: [validity, offset, data]
3973+
assertTrue(
3974+
buffers.get(1).readableBytes() >= BaseLargeVariableWidthVector.OFFSET_WIDTH,
3975+
"Offset buffer should have at least "
3976+
+ BaseLargeVariableWidthVector.OFFSET_WIDTH
3977+
+ " bytes for offset[0]");
3978+
assertEquals(0, vector.getOffsetBuffer().getLong(0));
3979+
}
3980+
}
39433981
}

0 commit comments

Comments
 (0)