diff --git a/docs/layouts/shortcodes/generated/core_configuration.html b/docs/layouts/shortcodes/generated/core_configuration.html index 0071c264376f..1ad6b008ef98 100644 --- a/docs/layouts/shortcodes/generated/core_configuration.html +++ b/docs/layouts/shortcodes/generated/core_configuration.html @@ -86,6 +86,12 @@ String Specifies column names that should be stored as blob type. This is used when you want to treat a BYTES column as a BLOB. + +
blob-ref-field
+ (none) + String + Specifies column names that should be stored as blob reference type. This is used when you want to treat a BYTES column as a BLOB_REF. +
blob.split-by-file-size
(none) diff --git a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java index 897bebcf686c..8b38b2d8ee0f 100644 --- a/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java +++ b/paimon-api/src/main/java/org/apache/paimon/CoreOptions.java @@ -2254,6 +2254,14 @@ public InlineElement getDescription() { "Specifies column names that should be stored as blob type. " + "This is used when you want to treat a BYTES column as a BLOB."); + public static final ConfigOption BLOB_REF_FIELD = + key("blob-ref-field") + .stringType() + .noDefaultValue() + .withDescription( + "Specifies column names that should be stored as blob reference type. " + + "This is used when you want to treat a BYTES column as a BLOB_REF."); + @Immutable public static final ConfigOption BLOB_DESCRIPTOR_FIELD = key("blob-descriptor-field") @@ -2935,7 +2943,13 @@ public Set blobExternalStorageField() { * subset of descriptor fields and therefore are also updatable. 
*/ public Set updatableBlobFields() { - return blobDescriptorField(); + Set fields = new HashSet<>(blobDescriptorField()); + fields.addAll(blobRefField()); + return fields; + } + + public Set blobRefField() { + return parseCommaSeparatedSet(BLOB_REF_FIELD); } /** @@ -3274,6 +3288,15 @@ public static List blobField(Map options) { return Arrays.stream(string.split(",")).map(String::trim).collect(Collectors.toList()); } + public static List blobRefField(Map options) { + String string = options.get(BLOB_REF_FIELD.key()); + if (string == null) { + return Collections.emptyList(); + } + + return Arrays.stream(string.split(",")).map(String::trim).collect(Collectors.toList()); + } + public boolean sequenceFieldSortOrderIsAscending() { return options.get(SEQUENCE_FIELD_SORT_ORDER) == SortOrder.ASCENDING; } diff --git a/paimon-api/src/main/java/org/apache/paimon/types/BlobRefType.java b/paimon-api/src/main/java/org/apache/paimon/types/BlobRefType.java new file mode 100644 index 000000000000..e5d53ead4e4a --- /dev/null +++ b/paimon-api/src/main/java/org/apache/paimon/types/BlobRefType.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.types; + +import org.apache.paimon.annotation.Public; + +/** + * Data type of blob reference. + * + *

{@link BlobRefType} stores reference bytes inline in data files instead of writing payloads to + * Paimon-managed {@code .blob} files. + * + * @since 1.5.0 + */ +@Public +public final class BlobRefType extends DataType { + + private static final long serialVersionUID = 1L; + + private static final String FORMAT = "BLOB_REF"; + + public BlobRefType(boolean isNullable) { + super(isNullable, DataTypeRoot.BLOB_REF); + } + + public BlobRefType() { + this(true); + } + + @Override + public int defaultSize() { + return BlobType.DEFAULT_SIZE; + } + + @Override + public DataType copy(boolean isNullable) { + return new BlobRefType(isNullable); + } + + @Override + public String asSQLString() { + return withNullability(FORMAT); + } + + @Override + public R accept(DataTypeVisitor visitor) { + return visitor.visit(this); + } +} diff --git a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeDefaultVisitor.java b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeDefaultVisitor.java index af680ede62e2..4a819d42ae2c 100644 --- a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeDefaultVisitor.java +++ b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeDefaultVisitor.java @@ -119,6 +119,11 @@ public R visit(BlobType blobType) { return defaultMethod(blobType); } + @Override + public R visit(BlobRefType blobRefType) { + return defaultMethod(blobRefType); + } + @Override public R visit(ArrayType arrayType) { return defaultMethod(arrayType); diff --git a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeJsonParser.java b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeJsonParser.java index 4079dd8c47c0..5e2a39a29fcd 100644 --- a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeJsonParser.java +++ b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeJsonParser.java @@ -331,6 +331,7 @@ private enum Keyword { LEGACY, VARIANT, BLOB, + BLOB_REF, NOT } @@ -549,6 +550,8 @@ private DataType parseTypeByKeyword() { return new VariantType(); 
case BLOB: return new BlobType(); + case BLOB_REF: + return new BlobRefType(); case VECTOR: return parseVectorType(); default: diff --git a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeRoot.java b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeRoot.java index f55da9c4706f..27f8d65a40bf 100644 --- a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeRoot.java +++ b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeRoot.java @@ -104,6 +104,8 @@ public enum DataTypeRoot { BLOB(DataTypeFamily.PREDEFINED), + BLOB_REF(DataTypeFamily.PREDEFINED), + ARRAY(DataTypeFamily.CONSTRUCTED, DataTypeFamily.COLLECTION), VECTOR(DataTypeFamily.CONSTRUCTED, DataTypeFamily.COLLECTION), diff --git a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeVisitor.java b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeVisitor.java index 6e377309f237..074a1d82ec70 100644 --- a/paimon-api/src/main/java/org/apache/paimon/types/DataTypeVisitor.java +++ b/paimon-api/src/main/java/org/apache/paimon/types/DataTypeVisitor.java @@ -66,6 +66,8 @@ public interface DataTypeVisitor { R visit(BlobType blobType); + R visit(BlobRefType blobRefType); + R visit(ArrayType arrayType); R visit(VectorType vectorType); diff --git a/paimon-api/src/main/java/org/apache/paimon/types/DataTypes.java b/paimon-api/src/main/java/org/apache/paimon/types/DataTypes.java index 39b180651ef5..0033984bc6cc 100644 --- a/paimon-api/src/main/java/org/apache/paimon/types/DataTypes.java +++ b/paimon-api/src/main/java/org/apache/paimon/types/DataTypes.java @@ -163,6 +163,10 @@ public static BlobType BLOB() { return new BlobType(); } + public static BlobRefType BLOB_REF() { + return new BlobRefType(); + } + public static OptionalInt getPrecision(DataType dataType) { return dataType.accept(PRECISION_EXTRACTOR); } diff --git a/paimon-arrow/src/main/java/org/apache/paimon/arrow/ArrowFieldTypeConversion.java b/paimon-arrow/src/main/java/org/apache/paimon/arrow/ArrowFieldTypeConversion.java 
index 33defc8f9a01..aca278588b7d 100644 --- a/paimon-arrow/src/main/java/org/apache/paimon/arrow/ArrowFieldTypeConversion.java +++ b/paimon-arrow/src/main/java/org/apache/paimon/arrow/ArrowFieldTypeConversion.java @@ -21,6 +21,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -163,6 +164,12 @@ public FieldType visit(BlobType blobType) { throw new UnsupportedOperationException(); } + @Override + public FieldType visit(BlobRefType blobRefType) { + return new FieldType( + blobRefType.isNullable(), Types.MinorType.VARBINARY.getType(), null); + } + private TimeUnit getTimeUnit(int precision) { if (precision == 0) { return TimeUnit.SECOND; diff --git a/paimon-arrow/src/main/java/org/apache/paimon/arrow/converter/Arrow2PaimonVectorConverter.java b/paimon-arrow/src/main/java/org/apache/paimon/arrow/converter/Arrow2PaimonVectorConverter.java index e1fe66883a84..6c2008452276 100644 --- a/paimon-arrow/src/main/java/org/apache/paimon/arrow/converter/Arrow2PaimonVectorConverter.java +++ b/paimon-arrow/src/main/java/org/apache/paimon/arrow/converter/Arrow2PaimonVectorConverter.java @@ -47,6 +47,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -447,6 +448,11 @@ public Arrow2PaimonVectorConverter visit(BlobType blobType) { throw new UnsupportedOperationException(); } + @Override + public Arrow2PaimonVectorConverter visit(BlobRefType blobRefType) { + return visit(new VarBinaryType()); + } + @Override public Arrow2PaimonVectorConverter visit(ArrayType arrayType) { final 
Arrow2PaimonVectorConverter arrowVectorConvertor = diff --git a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriterFactoryVisitor.java b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriterFactoryVisitor.java index ccff6d6a24f6..287a33f0382f 100644 --- a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriterFactoryVisitor.java +++ b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriterFactoryVisitor.java @@ -21,6 +21,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -156,6 +157,11 @@ public ArrowFieldWriterFactory visit(BlobType blobType) { throw new UnsupportedOperationException("Doesn't support BlobType."); } + @Override + public ArrowFieldWriterFactory visit(BlobRefType blobRefType) { + return ArrowFieldWriters.BlobRefWriter::new; + } + @Override public ArrowFieldWriterFactory visit(ArrayType arrayType) { ArrowFieldWriterFactory elementWriterFactory = arrayType.getElementType().accept(this); diff --git a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java index 2999acdaf651..6a977e65f655 100644 --- a/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java +++ b/paimon-arrow/src/main/java/org/apache/paimon/arrow/writer/ArrowFieldWriters.java @@ -199,6 +199,42 @@ protected void doWrite(int rowIndex, DataGetters getters, int pos) { } } + /** + * Writer for BLOB_REF. The batch path is identical to {@link BinaryWriter} (columnar data is + * already serialized bytes). The row-by-row path serializes the {@link + * org.apache.paimon.data.Blob} via {@link BlobUtils#serializeBlobReference}. 
+ */ + public static class BlobRefWriter extends ArrowFieldWriter { + + public BlobRefWriter(FieldVector fieldVector, boolean isNullable) { + super(fieldVector, isNullable); + } + + @Override + protected void doWrite( + ColumnVector columnVector, + @Nullable int[] pickedInColumn, + int startIndex, + int batchRows) { + VarBinaryVector varBinaryVector = (VarBinaryVector) fieldVector; + for (int i = 0; i < batchRows; i++) { + int row = getRowNumber(startIndex, i, pickedInColumn); + if (columnVector.isNullAt(row)) { + varBinaryVector.setNull(i); + } else { + byte[] value = ((BytesColumnVector) columnVector).getBytes(row).getBytes(); + varBinaryVector.setSafe(i, value); + } + } + } + + @Override + protected void doWrite(int rowIndex, DataGetters getters, int pos) { + byte[] bytes = getters.getBlobRef(pos).reference().serialize(); + ((VarBinaryVector) fieldVector).setSafe(rowIndex, bytes); + } + } + /** Writer for DECIMAL. */ public static class DecimalWriter extends ArrowFieldWriter { diff --git a/paimon-arrow/src/test/java/org/apache/paimon/arrow/converter/ArrowVectorizedBatchConverterTest.java b/paimon-arrow/src/test/java/org/apache/paimon/arrow/converter/ArrowVectorizedBatchConverterTest.java index ec484de64bd8..06d0f6dc8cb1 100644 --- a/paimon-arrow/src/test/java/org/apache/paimon/arrow/converter/ArrowVectorizedBatchConverterTest.java +++ b/paimon-arrow/src/test/java/org/apache/paimon/arrow/converter/ArrowVectorizedBatchConverterTest.java @@ -20,18 +20,21 @@ import org.apache.paimon.arrow.ArrowUtils; import org.apache.paimon.arrow.writer.ArrowFieldWriter; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.InternalRow; import org.apache.paimon.data.InternalVector; import org.apache.paimon.data.columnar.ColumnVector; import org.apache.paimon.data.columnar.ColumnarVec; import org.apache.paimon.data.columnar.VecColumnVector; import org.apache.paimon.data.columnar.VectorizedColumnBatch; +import 
org.apache.paimon.data.columnar.heap.HeapBytesVector; import org.apache.paimon.data.columnar.heap.HeapFloatVector; import org.apache.paimon.reader.VectorizedRecordIterator; import org.apache.paimon.types.DataTypes; import org.apache.paimon.types.RowType; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VectorSchemaRoot; import org.apache.arrow.vector.complex.FixedSizeListVector; import org.junit.jupiter.api.Test; @@ -129,6 +132,78 @@ public void testVectorColumnWriteWithPickedInColumn() { } } + @Test + public void testBlobRefColumnWriteAndReadBack() { + RowType rowType = RowType.of(DataTypes.BLOB_REF()); + try (RootAllocator allocator = new RootAllocator()) { + VectorSchemaRoot vsr = ArrowUtils.createVectorSchemaRoot(rowType, allocator); + ArrowFieldWriter[] fieldWriters = ArrowUtils.createArrowFieldWriters(vsr, rowType); + + // Prepare serialized BlobReference bytes + BlobReference ref0 = new BlobReference("default.upstream", 7, 100L); + BlobReference ref1 = new BlobReference("default.upstream", 8, 200L); + byte[] bytes0 = ref0.serialize(); + byte[] bytes1 = ref1.serialize(); + + int rows = 3; // row 0 = ref0, row 1 = null, row 2 = ref1 + HeapBytesVector bytesVector = new HeapBytesVector(rows); + bytesVector.appendByteArray(bytes0, 0, bytes0.length); + bytesVector.appendByteArray(new byte[0], 0, 0); // placeholder for null + bytesVector.appendByteArray(bytes1, 0, bytes1.length); + bytesVector.setNullAt(1); + + VectorizedColumnBatch batch = + new VectorizedColumnBatch(new ColumnVector[] {bytesVector}); + batch.setNumRows(rows); + + ArrowVectorizedBatchConverter converter = + new ArrowVectorizedBatchConverter(vsr, fieldWriters); + converter.reset( + new VectorizedRecordIterator() { + @Override + public VectorizedColumnBatch batch() { + return batch; + } + + @Override + public InternalRow next() { + return null; + } + + @Override + public void releaseBatch() {} + }); + converter.next(rows); 
+ + // Verify the Arrow vector contains the correct binary data + VarBinaryVector arrowVector = (VarBinaryVector) vsr.getVector(0); + assertThat(arrowVector.isNull(0)).isFalse(); + assertThat(arrowVector.isNull(1)).isTrue(); + assertThat(arrowVector.isNull(2)).isFalse(); + + // Read back and verify the bytes can be deserialized to BlobReference + BlobReference readRef0 = BlobReference.deserialize(arrowVector.getObject(0)); + assertThat(readRef0.tableName()).isEqualTo("default.upstream"); + assertThat(readRef0.fieldId()).isEqualTo(7); + assertThat(readRef0.rowId()).isEqualTo(100L); + + BlobReference readRef2 = BlobReference.deserialize(arrowVector.getObject(2)); + assertThat(readRef2.tableName()).isEqualTo("default.upstream"); + assertThat(readRef2.fieldId()).isEqualTo(8); + assertThat(readRef2.rowId()).isEqualTo(200L); + + // Also verify the Arrow2Paimon round-trip + Arrow2PaimonVectorConverter paimonConverter = + Arrow2PaimonVectorConverter.construct(DataTypes.BLOB_REF()); + ColumnVector paimonVector = paimonConverter.convertVector(arrowVector); + assertThat(paimonVector.isNullAt(0)).isFalse(); + assertThat(paimonVector.isNullAt(1)).isTrue(); + assertThat(paimonVector.isNullAt(2)).isFalse(); + + converter.close(); + } + } + private static class TestVecColumnVectorWithNulls implements VecColumnVector { private final ColumnVector data; diff --git a/paimon-arrow/src/test/java/org/apache/paimon/arrow/vector/OneElementFieldVectorGeneratorTest.java b/paimon-arrow/src/test/java/org/apache/paimon/arrow/vector/OneElementFieldVectorGeneratorTest.java index a417133da1a1..ff4ea1e0ed67 100644 --- a/paimon-arrow/src/test/java/org/apache/paimon/arrow/vector/OneElementFieldVectorGeneratorTest.java +++ b/paimon-arrow/src/test/java/org/apache/paimon/arrow/vector/OneElementFieldVectorGeneratorTest.java @@ -20,6 +20,9 @@ import org.apache.paimon.arrow.reader.ArrowBatchReader; import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Blob; +import 
org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.data.Timestamp; @@ -95,4 +98,34 @@ public void testFunction() { } } } + + @Test + public void testBlobRef() { + try (RootAllocator rootAllocator = new RootAllocator()) { + DataField dataField = new DataField(0, "ref", DataTypes.BLOB_REF()); + BlobReference reference = new BlobReference("default.upstream", 7, 42L); + BlobRef value = Blob.fromReference(reference); + + OneElementFieldVectorGenerator generator = + new OneElementFieldVectorGenerator(rootAllocator, dataField, value); + try (FieldVector fieldVector = generator.get(3)) { + Assertions.assertThat(fieldVector.getValueCount()).isEqualTo(3); + + // Read back via ArrowBatchReader — BLOB_REF comes back as binary bytes + ArrowBatchReader reader = + new ArrowBatchReader(new RowType(Arrays.asList(dataField)), true); + Iterable it = + reader.readBatch(new VectorSchemaRoot(Arrays.asList(fieldVector))); + it.forEach( + row -> { + byte[] bytes = row.getBinary(0); + BlobReference readRef = BlobReference.deserialize(bytes); + Assertions.assertThat(readRef.tableName()) + .isEqualTo("default.upstream"); + Assertions.assertThat(readRef.fieldId()).isEqualTo(7); + Assertions.assertThat(readRef.rowId()).isEqualTo(42L); + }); + } + } + } } diff --git a/paimon-codegen/src/main/scala/org/apache/paimon/codegen/GenerateUtils.scala b/paimon-codegen/src/main/scala/org/apache/paimon/codegen/GenerateUtils.scala index 967d58ad30db..e4a3fb6f9551 100644 --- a/paimon-codegen/src/main/scala/org/apache/paimon/codegen/GenerateUtils.scala +++ b/paimon-codegen/src/main/scala/org/apache/paimon/codegen/GenerateUtils.scala @@ -386,6 +386,7 @@ object GenerateUtils { case ROW => className[InternalRow] case VARIANT => className[Variant] case BLOB => className[Blob] + case BLOB_REF => className[BlobRef] case _ => throw new IllegalArgumentException("Illegal type: " + 
t) } @@ -430,6 +431,8 @@ object GenerateUtils { s"$rowTerm.getVariant($indexTerm)" case BLOB => s"$rowTerm.getBlob($indexTerm)" + case BLOB_REF => + s"$rowTerm.getBlobRef($indexTerm)" case _ => throw new IllegalArgumentException("Illegal type: " + t) } diff --git a/paimon-codegen/src/test/java/org/apache/paimon/codegen/EqualiserCodeGeneratorTest.java b/paimon-codegen/src/test/java/org/apache/paimon/codegen/EqualiserCodeGeneratorTest.java index 7e977291e81f..cbc0cbde9c52 100644 --- a/paimon-codegen/src/test/java/org/apache/paimon/codegen/EqualiserCodeGeneratorTest.java +++ b/paimon-codegen/src/test/java/org/apache/paimon/codegen/EqualiserCodeGeneratorTest.java @@ -22,7 +22,9 @@ import org.apache.paimon.data.BinaryRowWriter; import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.BinaryWriter; +import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.GenericMap; import org.apache.paimon.data.GenericRow; @@ -206,6 +208,13 @@ public class EqualiserCodeGeneratorTest { Pair.of( new BlobData(new byte[] {1, 2, 3}), new BlobData(new byte[] {4, 5, 6})))); + TEST_DATA.put( + DataTypeRoot.BLOB_REF, + new GeneratedData( + DataTypes.BLOB_REF(), + Pair.of( + Blob.fromReference(new BlobReference("default.t1", 1, 0L)), + Blob.fromReference(new BlobReference("default.t2", 2, 1L))))); } @ParameterizedTest diff --git a/paimon-common/src/main/java/org/apache/paimon/PartitionSettedRow.java b/paimon-common/src/main/java/org/apache/paimon/PartitionSettedRow.java index a62464d1981f..f37cc30958d1 100644 --- a/paimon-common/src/main/java/org/apache/paimon/PartitionSettedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/PartitionSettedRow.java @@ -21,6 +21,7 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import 
org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -170,6 +171,13 @@ public Blob getBlob(int pos) { : row.getBlob(partitionInfo.getRealIndex(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + return partitionInfo.inPartitionRow(pos) + ? partition.getBlobRef(partitionInfo.getRealIndex(pos)) + : row.getBlobRef(partitionInfo.getRealIndex(pos)); + } + @Override public InternalArray getArray(int pos) { return partitionInfo.inPartitionRow(pos) diff --git a/paimon-common/src/main/java/org/apache/paimon/casting/CastedArray.java b/paimon-common/src/main/java/org/apache/paimon/casting/CastedArray.java index 4e95c9db8dcf..b000887c70d1 100644 --- a/paimon-common/src/main/java/org/apache/paimon/casting/CastedArray.java +++ b/paimon-common/src/main/java/org/apache/paimon/casting/CastedArray.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -197,6 +198,11 @@ public Blob getBlob(int pos) { return castElementGetter.getElementOrNull(array, pos); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("CastedArray does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { return castElementGetter.getElementOrNull(array, pos); diff --git a/paimon-common/src/main/java/org/apache/paimon/casting/CastedRow.java b/paimon-common/src/main/java/org/apache/paimon/casting/CastedRow.java index 76a1366d4784..26806ac76959 100644 --- a/paimon-common/src/main/java/org/apache/paimon/casting/CastedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/casting/CastedRow.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import 
org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -144,6 +145,11 @@ public Blob getBlob(int pos) { return castMapping[pos].getFieldOrNull(row); } + @Override + public BlobRef getBlobRef(int pos) { + return castMapping[pos].getFieldOrNull(row); + } + @Override public InternalArray getArray(int pos) { return castMapping[pos].getFieldOrNull(row); diff --git a/paimon-common/src/main/java/org/apache/paimon/casting/CastedVector.java b/paimon-common/src/main/java/org/apache/paimon/casting/CastedVector.java index c7bf0303467a..cade191598da 100644 --- a/paimon-common/src/main/java/org/apache/paimon/casting/CastedVector.java +++ b/paimon-common/src/main/java/org/apache/paimon/casting/CastedVector.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -91,6 +92,11 @@ public Blob getBlob(int pos) { throw new UnsupportedOperationException("CastedVector does not support Blob."); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("CastedVector does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { throw new UnsupportedOperationException("CastedVector does not support nested Array."); diff --git a/paimon-common/src/main/java/org/apache/paimon/casting/DefaultValueRow.java b/paimon-common/src/main/java/org/apache/paimon/casting/DefaultValueRow.java index 555f065c7031..dc32a7af2aa0 100644 --- a/paimon-common/src/main/java/org/apache/paimon/casting/DefaultValueRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/casting/DefaultValueRow.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; 
import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalArray; @@ -217,6 +218,14 @@ public Blob getBlob(int pos) { return defaultValueRow.getBlob(pos); } + @Override + public BlobRef getBlobRef(int pos) { + if (!row.isNullAt(pos)) { + return row.getBlobRef(pos); + } + return defaultValueRow.getBlobRef(pos); + } + public static DefaultValueRow from(InternalRow defaultValueRow) { return new DefaultValueRow(defaultValueRow); } diff --git a/paimon-common/src/main/java/org/apache/paimon/casting/FallbackMappingRow.java b/paimon-common/src/main/java/org/apache/paimon/casting/FallbackMappingRow.java index b981d552876d..7086459791bd 100644 --- a/paimon-common/src/main/java/org/apache/paimon/casting/FallbackMappingRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/casting/FallbackMappingRow.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -167,6 +168,14 @@ public Blob getBlob(int pos) { return main.getBlob(pos); } + @Override + public BlobRef getBlobRef(int pos) { + if (mappings[pos] != -1 && main.isNullAt(pos)) { + return fallbackRow.getBlobRef(mappings[pos]); + } + return main.getBlobRef(pos); + } + @Override public InternalArray getArray(int pos) { if (mappings[pos] != -1 && main.isNullAt(pos)) { diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BinaryArray.java b/paimon-common/src/main/java/org/apache/paimon/data/BinaryArray.java index afac3f8d3d35..5435edf5c5e6 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/BinaryArray.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/BinaryArray.java @@ -250,6 +250,11 @@ public Blob getBlob(int pos) { return new BlobData(getBinary(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + throw new 
UnsupportedOperationException("BinaryArray does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { assertIndexIsValid(pos); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BinaryRow.java b/paimon-common/src/main/java/org/apache/paimon/data/BinaryRow.java index ff5406f7b326..3c2aecc3dc6d 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/BinaryRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/BinaryRow.java @@ -346,7 +346,13 @@ public Variant getVariant(int pos) { @Override public Blob getBlob(int pos) { - return new BlobData(getBinary(pos)); + return BlobUtils.fromBytes(getBinary(pos), null, null); + } + + @Override + public BlobRef getBlobRef(int pos) { + byte[] bytes = getBinary(pos); + return new BlobRef(BlobReference.deserialize(bytes)); } @Override diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BinaryVector.java b/paimon-common/src/main/java/org/apache/paimon/data/BinaryVector.java index 9487aefd8d68..475ddea8ecf3 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/BinaryVector.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/BinaryVector.java @@ -163,6 +163,11 @@ public Blob getBlob(int pos) { throw new UnsupportedOperationException("BinaryVector does not support Blob."); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("BinaryVector does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { throw new UnsupportedOperationException("BinaryVector does not support nested Array."); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BinaryWriter.java b/paimon-common/src/main/java/org/apache/paimon/data/BinaryWriter.java index 2e0cd5701b71..fbb1570d3517 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/BinaryWriter.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/BinaryWriter.java @@ -157,6 +157,10 @@ static void write( case BLOB: 
writer.writeBlob(pos, (Blob) o); break; + case BLOB_REF: + byte[] refBytes = ((BlobRef) o).reference().serialize(); + writer.writeBinary(pos, refBytes, 0, refBytes.length); + break; default: throw new UnsupportedOperationException("Not support type: " + type); } @@ -241,6 +245,11 @@ static ValueSetter createValueSetter(DataType elementType, Serializer seriali return (writer, pos, value) -> writer.writeVariant(pos, (Variant) value); case BLOB: return (writer, pos, value) -> writer.writeBlob(pos, (Blob) value); + case BLOB_REF: + return (writer, pos, value) -> { + byte[] bytes = ((BlobRef) value).reference().serialize(); + writer.writeBinary(pos, bytes, 0, bytes.length); + }; default: String msg = String.format( diff --git a/paimon-common/src/main/java/org/apache/paimon/data/Blob.java b/paimon-common/src/main/java/org/apache/paimon/data/Blob.java index 6586124e466b..8eed2733c671 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/Blob.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/Blob.java @@ -65,6 +65,10 @@ static Blob fromDescriptor(UriReader reader, BlobDescriptor descriptor) { return new BlobRef(reader, descriptor); } + static BlobRef fromReference(BlobReference reference) { + return new BlobRef(reference); + } + static Blob fromInputStream(Supplier supplier) { return new BlobStream(supplier); } diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BlobRef.java b/paimon-common/src/main/java/org/apache/paimon/data/BlobRef.java index 0248454ee90e..aeefb2bebac2 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/BlobRef.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/BlobRef.java @@ -24,25 +24,61 @@ import org.apache.paimon.utils.IOUtils; import org.apache.paimon.utils.UriReader; +import javax.annotation.Nullable; + import java.io.IOException; +import java.io.Serializable; import java.util.Objects; /** - * A {@link Blob} refers blob in {@link BlobDescriptor}. 
+ * A {@link Blob} that can represent both descriptor-backed blobs (for BLOB type) and + * reference-based blobs (for BLOB_REF type). + * + *

For BLOB type: created via {@link #BlobRef(UriReader, BlobDescriptor)}, always resolved. + * + *

For BLOB_REF type: created via {@link #BlobRef(BlobReference)}, initially unresolved. Call + * {@link #resolve(UriReader, BlobDescriptor)} to make it readable. * * @since 1.4.0 */ @Public -public class BlobRef implements Blob { +public class BlobRef implements Blob, Serializable { + + private static final long serialVersionUID = 1L; - private final UriReader uriReader; - private final BlobDescriptor descriptor; + @Nullable private final BlobReference reference; + @Nullable private UriReader uriReader; + @Nullable private BlobDescriptor descriptor; + /** Creates a resolved descriptor-backed blob (for BLOB type). */ public BlobRef(UriReader uriReader, BlobDescriptor descriptor) { + this.reference = null; this.uriReader = uriReader; this.descriptor = descriptor; } + /** Creates an unresolved blob ref (for BLOB_REF type). */ + public BlobRef(BlobReference reference) { + this.reference = reference; + this.uriReader = null; + this.descriptor = null; + } + + @Nullable + public BlobReference reference() { + return reference; + } + + public boolean isResolved() { + return uriReader != null && descriptor != null; + } + + /** Resolves this blob ref in place by setting the reader and descriptor. 
*/ + public void resolve(UriReader reader, BlobDescriptor desc) { + this.uriReader = reader; + this.descriptor = desc; + } + @Override public byte[] toData() { try { @@ -54,15 +90,21 @@ public byte[] toData() { @Override public BlobDescriptor toDescriptor() { - return descriptor; + if (descriptor != null) { + return descriptor; + } + throw new IllegalStateException("BlobRef is not resolved."); } @Override public SeekableInputStream newInputStream() throws IOException { - return new OffsetSeekableInputStream( - uriReader.newInputStream(descriptor.uri()), - descriptor.offset(), - descriptor.length()); + if (uriReader != null && descriptor != null) { + return new OffsetSeekableInputStream( + uriReader.newInputStream(descriptor.uri()), + descriptor.offset(), + descriptor.length()); + } + throw new IllegalStateException("BlobRef is not resolved."); } @Override @@ -70,12 +112,15 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) { return false; } - BlobRef blobRef = (BlobRef) o; - return Objects.deepEquals(descriptor, blobRef.descriptor); + BlobRef that = (BlobRef) o; + if (reference != null) { + return Objects.equals(reference, that.reference); + } + return Objects.equals(descriptor, that.descriptor); } @Override public int hashCode() { - return descriptor.hashCode(); + return reference != null ? Objects.hash(reference) : Objects.hash(descriptor); } } diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BlobReference.java b/paimon-common/src/main/java/org/apache/paimon/data/BlobReference.java new file mode 100644 index 000000000000..028db0dad514 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/data/BlobReference.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.data; + +import java.io.Serializable; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.Objects; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Serialized metadata for a {@code BLOB_REF} field. + * + *

A blob reference only stores the coordinates needed to locate the original blob value in the + * upstream table: {@code tableName}, {@code fieldId} and {@code rowId}. The actual blob data is + * resolved at read time by scanning the upstream table. + * + *

Serialization layout (Little Endian): + * + *

+ * | Offset       | Field         | Type    | Size |
+ * |--------------|---------------|---------|------|
+ * | 0            | version       | byte    | 1    |
+ * | 1            | magicNumber   | long    | 8    |
+ * | 9            | tableNameLen  | int     | 4    |
+ * | 13           | tableNameBytes| byte[N] | N    |   (N = tableNameLen, the UTF-8 byte length of tableName)
+ * | 13 + N       | fieldId       | int     | 4    |
+ * | 17 + N       | rowId         | long    | 8    |
+ * 
+ */ +public class BlobReference implements Serializable { + + private static final long serialVersionUID = 1L; + + private static final long MAGIC = 0x424C4F4252454631L; // "BLOBREF1" + private static final byte CURRENT_VERSION = 1; + + private final String tableName; + private final int fieldId; + private final long rowId; + + public BlobReference(String tableName, int fieldId, long rowId) { + this.tableName = tableName; + this.fieldId = fieldId; + this.rowId = rowId; + } + + public String tableName() { + return tableName; + } + + public int fieldId() { + return fieldId; + } + + public long rowId() { + return rowId; + } + + public byte[] serialize() { + byte[] tableBytes = tableName.getBytes(UTF_8); + + int totalSize = 1 + 8 + 4 + tableBytes.length + 4 + 8; + ByteBuffer buffer = ByteBuffer.allocate(totalSize).order(ByteOrder.LITTLE_ENDIAN); + buffer.put(CURRENT_VERSION); + buffer.putLong(MAGIC); + buffer.putInt(tableBytes.length); + buffer.put(tableBytes); + buffer.putInt(fieldId); + buffer.putLong(rowId); + return buffer.array(); + } + + public static BlobReference deserialize(byte[] bytes) { + ByteBuffer buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN); + byte version = buffer.get(); + + if (version != CURRENT_VERSION) { + throw new UnsupportedOperationException( + "Expecting BlobReference version to be " + + CURRENT_VERSION + + ", but found " + + version + + "."); + } + + long magic = buffer.getLong(); + if (magic != MAGIC) { + throw new IllegalArgumentException( + "Invalid BlobReference: missing magic header. 
Expected magic: " + + MAGIC + + ", but found: " + + magic); + } + + byte[] tableBytes = new byte[buffer.getInt()]; + buffer.get(tableBytes); + + int fieldId = buffer.getInt(); + long rowId = buffer.getLong(); + return new BlobReference(new String(tableBytes, UTF_8), fieldId, rowId); + } + + public static boolean isBlobReference(byte[] bytes) { + if (bytes == null || bytes.length < 9) { + return false; + } + ByteBuffer buffer = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN); + byte version = buffer.get(); + return version == CURRENT_VERSION && MAGIC == buffer.getLong(); + } + + @Override + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) { + return false; + } + BlobReference that = (BlobReference) o; + return fieldId == that.fieldId + && rowId == that.rowId + && Objects.equals(tableName, that.tableName); + } + + @Override + public int hashCode() { + return Objects.hash(tableName, fieldId, rowId); + } + + @Override + public String toString() { + return "BlobReference{table=" + + tableName + + ", fieldId=" + + fieldId + + ", rowId=" + + rowId + + "}"; + } +} diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BlobReferenceResolver.java b/paimon-common/src/main/java/org/apache/paimon/data/BlobReferenceResolver.java new file mode 100644 index 000000000000..dd263edef718 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/data/BlobReferenceResolver.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.data; + +import java.io.Serializable; + +/** Resolves a {@link BlobRef} by setting its reader and descriptor in place. */ +@FunctionalInterface +public interface BlobReferenceResolver extends Serializable { + + void resolve(BlobRef blobRef); +} diff --git a/paimon-common/src/main/java/org/apache/paimon/data/BlobUtils.java b/paimon-common/src/main/java/org/apache/paimon/data/BlobUtils.java new file mode 100644 index 000000000000..02c02d07e4ec --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/data/BlobUtils.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.paimon.data; + +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.utils.UriReader; +import org.apache.paimon.utils.UriReaderFactory; + +import javax.annotation.Nullable; + +/** Utilities for decoding and encoding blob-related bytes. */ +public class BlobUtils { + + /** + * Decodes blob bytes for BLOB type fields. For BLOB_REF type, use {@link + * DataGetters#getBlobRef(int)} instead. + */ + public static Blob fromBytes( + byte[] bytes, @Nullable UriReaderFactory uriReaderFactory, @Nullable FileIO fileIO) { + if (bytes == null) { + return null; + } + + if (BlobDescriptor.isBlobDescriptor(bytes)) { + BlobDescriptor descriptor = BlobDescriptor.deserialize(bytes); + UriReader reader = + uriReaderFactory != null + ? uriReaderFactory.create(descriptor.uri()) + : UriReader.fromFile(fileIO); + return Blob.fromDescriptor(reader, descriptor); + } + + return new BlobData(bytes); + } + + public static byte[] serializeBlobReference(BlobRef blobRef) { + return blobRef.reference().serialize(); + } + + private BlobUtils() {} +} diff --git a/paimon-common/src/main/java/org/apache/paimon/data/DataGetters.java b/paimon-common/src/main/java/org/apache/paimon/data/DataGetters.java index 1043b7e3ba4f..26bd95d6f26a 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/DataGetters.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/DataGetters.java @@ -81,6 +81,9 @@ public interface DataGetters { /** Returns the blob value at the given position. */ Blob getBlob(int pos); + /** Returns the blob ref value at the given position. For BLOB_REF type fields only. */ + BlobRef getBlobRef(int pos); + /** Returns the array value at the given position. 
*/ InternalArray getArray(int pos); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/GenericArray.java b/paimon-common/src/main/java/org/apache/paimon/data/GenericArray.java index 8c1ba4e28ac9..da4a94732f20 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/GenericArray.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/GenericArray.java @@ -215,6 +215,11 @@ public Blob getBlob(int pos) { return (Blob) getObject(pos); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("GenericArray does not support BlobRef."); + } + @Override public BinaryString getString(int pos) { return (BinaryString) getObject(pos); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/GenericRow.java b/paimon-common/src/main/java/org/apache/paimon/data/GenericRow.java index 10aefbafdd07..b3d5541d13ac 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/GenericRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/GenericRow.java @@ -197,6 +197,15 @@ public Blob getBlob(int pos) { return (Blob) this.fields[pos]; } + @Override + public BlobRef getBlobRef(int pos) { + Object value = this.fields[pos]; + if (value instanceof BlobRef) { + return (BlobRef) value; + } + throw new ClassCastException("Cannot cast " + value.getClass().getName() + " to BlobRef"); + } + @Override public InternalArray getArray(int pos) { return (InternalArray) this.fields[pos]; diff --git a/paimon-common/src/main/java/org/apache/paimon/data/InternalRow.java b/paimon-common/src/main/java/org/apache/paimon/data/InternalRow.java index 3bbb85f49963..e466111e0072 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/InternalRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/InternalRow.java @@ -146,6 +146,10 @@ static Class getDataClass(DataType type) { case TIMESTAMP_WITHOUT_TIME_ZONE: case TIMESTAMP_WITH_LOCAL_TIME_ZONE: return Timestamp.class; + case BLOB: + return Blob.class; + case 
BLOB_REF: + return BlobRef.class; case ARRAY: return InternalArray.class; case MULTISET: @@ -230,6 +234,9 @@ static FieldGetter createFieldGetter(DataType fieldType, int fieldPos) { case BLOB: fieldGetter = row -> row.getBlob(fieldPos); break; + case BLOB_REF: + fieldGetter = row -> row.getBlobRef(fieldPos); + break; default: String msg = String.format( diff --git a/paimon-common/src/main/java/org/apache/paimon/data/JoinedRow.java b/paimon-common/src/main/java/org/apache/paimon/data/JoinedRow.java index fee5552f8c5c..f2ba1f9fc689 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/JoinedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/JoinedRow.java @@ -243,6 +243,15 @@ public Blob getBlob(int pos) { } } + @Override + public BlobRef getBlobRef(int pos) { + if (pos < row1.getFieldCount()) { + return row1.getBlobRef(pos); + } else { + return row2.getBlobRef(pos - row1.getFieldCount()); + } + } + @Override public InternalArray getArray(int pos) { if (pos < row1.getFieldCount()) { diff --git a/paimon-common/src/main/java/org/apache/paimon/data/LazyGenericRow.java b/paimon-common/src/main/java/org/apache/paimon/data/LazyGenericRow.java index 6d2e8b141f57..bbcce1429391 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/LazyGenericRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/LazyGenericRow.java @@ -153,6 +153,11 @@ public Blob getBlob(int pos) { return (Blob) getField(pos); } + @Override + public BlobRef getBlobRef(int pos) { + return (BlobRef) getField(pos); + } + @Override public InternalArray getArray(int pos) { return (InternalArray) getField(pos); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/NestedRow.java b/paimon-common/src/main/java/org/apache/paimon/data/NestedRow.java index afc4f0c47fb0..531e193409a3 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/NestedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/NestedRow.java @@ -291,7 +291,13 @@ public 
Variant getVariant(int pos) { @Override public Blob getBlob(int pos) { - return new BlobData(getBinary(pos)); + return BlobUtils.fromBytes(getBinary(pos), null, null); + } + + @Override + public BlobRef getBlobRef(int pos) { + byte[] bytes = getBinary(pos); + return new BlobRef(BlobReference.deserialize(bytes)); } @Override diff --git a/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarArray.java b/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarArray.java index 28221cec0db1..7b2a1c2ae8b6 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarArray.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarArray.java @@ -21,6 +21,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.DataSetters; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; @@ -139,6 +140,11 @@ public Blob getBlob(int pos) { return new BlobData(getBinary(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("ColumnarArray does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { return ((ArrayColumnVector) data).getArray(offset + pos); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarRow.java b/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarRow.java index 13d345b1f03f..34d59d1f6a7a 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarRow.java @@ -20,7 +20,9 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; -import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; +import 
org.apache.paimon.data.BlobUtils; import org.apache.paimon.data.DataSetters; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; @@ -31,7 +33,6 @@ import org.apache.paimon.data.variant.Variant; import org.apache.paimon.fs.FileIO; import org.apache.paimon.types.RowKind; -import org.apache.paimon.utils.UriReader; import java.io.Serializable; @@ -162,14 +163,16 @@ public Blob getBlob(int pos) { if (bytes == null) { return null; } - if (fileIO == null) { - throw new IllegalStateException("FileIO is null, cannot read blob data from uri!"); - } + return BlobUtils.fromBytes(bytes, null, fileIO); + } - // Only blob descriptor could be able to stored in columnar format. - BlobDescriptor blobDescriptor = BlobDescriptor.deserialize(bytes); - UriReader uriReader = UriReader.fromFile(fileIO); - return Blob.fromDescriptor(uriReader, blobDescriptor); + @Override + public BlobRef getBlobRef(int pos) { + byte[] bytes = getBinary(pos); + if (bytes == null) { + return null; + } + return new BlobRef(BlobReference.deserialize(bytes)); } @Override diff --git a/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarVec.java b/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarVec.java index 01512dea786a..c5c40230958f 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarVec.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/columnar/ColumnarVec.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.DataSetters; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; @@ -152,6 +153,11 @@ public Blob getBlob(int pos) { throw new UnsupportedOperationException("Not support the operation!"); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("ColumnarVec does not support BlobRef."); + } + @Override public 
InternalArray getArray(int pos) { throw new UnsupportedOperationException("Not support the operation!"); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/columnar/RowToColumnConverter.java b/paimon-common/src/main/java/org/apache/paimon/data/columnar/RowToColumnConverter.java index de962ad86a39..12b7a567ec65 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/columnar/RowToColumnConverter.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/columnar/RowToColumnConverter.java @@ -41,6 +41,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -262,6 +263,11 @@ public TypeConverter visit(BlobType blobType) { throw new UnsupportedOperationException(); } + @Override + public TypeConverter visit(BlobRefType blobRefType) { + throw new UnsupportedOperationException(); + } + @Override public TypeConverter visit(ArrayType arrayType) { return createConverter( diff --git a/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryArray.java b/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryArray.java index 78d717ee1b0b..e8e060bbf722 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryArray.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryArray.java @@ -22,6 +22,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -160,6 +161,11 @@ public Blob getBlob(int pos) { return new BlobData(getBinary(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + throw new 
UnsupportedOperationException("SafeBinaryArray does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { throw new UnsupportedOperationException(); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryRow.java b/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryRow.java index 2c285c30a6b4..2a0ba7fd6080 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/safe/SafeBinaryRow.java @@ -22,6 +22,8 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -166,6 +168,11 @@ public Blob getBlob(int pos) { return new BlobData(getBinary(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + return new BlobRef(BlobReference.deserialize(getBinary(pos))); + } + @Override public InternalArray getArray(int pos) { return readArrayData(bytes, offset, getLong(pos)); diff --git a/paimon-common/src/main/java/org/apache/paimon/data/serializer/BlobRefSerializer.java b/paimon-common/src/main/java/org/apache/paimon/data/serializer/BlobRefSerializer.java new file mode 100644 index 000000000000..9d40468fdb11 --- /dev/null +++ b/paimon-common/src/main/java/org/apache/paimon/data/serializer/BlobRefSerializer.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.data.serializer; + +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; +import org.apache.paimon.io.DataInputView; +import org.apache.paimon.io.DataOutputView; + +import java.io.IOException; + +/** Type serializer for {@code BLOB_REF}. */ +public class BlobRefSerializer extends SerializerSingleton { + + private static final long serialVersionUID = 1L; + + public static final BlobRefSerializer INSTANCE = new BlobRefSerializer(); + + @Override + public BlobRef copy(BlobRef from) { + return from; + } + + @Override + public void serialize(BlobRef blobRef, DataOutputView target) throws IOException { + BinarySerializer.INSTANCE.serialize(blobRef.reference().serialize(), target); + } + + @Override + public BlobRef deserialize(DataInputView source) throws IOException { + byte[] bytes = BinarySerializer.INSTANCE.deserialize(source); + return new BlobRef(BlobReference.deserialize(bytes)); + } +} diff --git a/paimon-common/src/main/java/org/apache/paimon/data/serializer/InternalSerializers.java b/paimon-common/src/main/java/org/apache/paimon/data/serializer/InternalSerializers.java index 6669f347ff27..9d4c9dba1798 100644 --- a/paimon-common/src/main/java/org/apache/paimon/data/serializer/InternalSerializers.java +++ b/paimon-common/src/main/java/org/apache/paimon/data/serializer/InternalSerializers.java @@ -92,6 +92,8 @@ private static Serializer createInternal(DataType type) { return VariantSerializer.INSTANCE; case BLOB: return BlobSerializer.INSTANCE; + case BLOB_REF: + return 
BlobRefSerializer.INSTANCE; default: throw new UnsupportedOperationException( "Unsupported type '" + type + "' to get internal serializer"); diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapTypeVisitor.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapTypeVisitor.java index 4183bfbb2bf8..57fcc8665b97 100644 --- a/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapTypeVisitor.java +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/bitmap/BitmapTypeVisitor.java @@ -21,6 +21,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -176,4 +177,9 @@ public final R visit(VariantType rowType) { public final R visit(BlobType blobType) { throw new UnsupportedOperationException("Does not support type blob"); } + + @Override + public final R visit(BlobRefType blobRefType) { + throw new UnsupportedOperationException("Does not support type blob ref"); + } } diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/bloomfilter/FastHash.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/bloomfilter/FastHash.java index 322847f849ab..722ab63bc4f0 100644 --- a/paimon-common/src/main/java/org/apache/paimon/fileindex/bloomfilter/FastHash.java +++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/bloomfilter/FastHash.java @@ -23,6 +23,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -172,6 +173,11 @@ public FastHash visit(BlobType blobType) { throw new 
UnsupportedOperationException("Does not support type blob"); } + @Override + public FastHash visit(BlobRefType blobRefType) { + throw new UnsupportedOperationException("Does not support type blob_ref"); + } + @Override public FastHash visit(ArrayType arrayType) { throw new UnsupportedOperationException("Does not support type array"); diff --git a/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionArray.java b/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionArray.java index 8b89c1ce7f8a..3d726f719340 100644 --- a/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionArray.java +++ b/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionArray.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -144,6 +145,11 @@ public Blob getBlob(int pos) { return chooseArray(pos).getBlob(offsetInRow(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("DataEvolutionArray does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { return chooseArray(pos).getArray(offsetInRow(pos)); diff --git a/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionRow.java b/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionRow.java index 08c6d24d2b79..29a54d408ac6 100644 --- a/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/reader/DataEvolutionRow.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -168,6 +169,11 @@ public Blob 
getBlob(int pos) { return chooseRow(pos).getBlob(offsetInRow(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + return chooseRow(pos).getBlobRef(offsetInRow(pos)); + } + @Override public InternalArray getArray(int pos) { return chooseRow(pos).getArray(offsetInRow(pos)); diff --git a/paimon-common/src/main/java/org/apache/paimon/sort/hilbert/HilbertIndexer.java b/paimon-common/src/main/java/org/apache/paimon/sort/hilbert/HilbertIndexer.java index 241dc6100379..254204dc2511 100644 --- a/paimon-common/src/main/java/org/apache/paimon/sort/hilbert/HilbertIndexer.java +++ b/paimon-common/src/main/java/org/apache/paimon/sort/hilbert/HilbertIndexer.java @@ -25,6 +25,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -272,6 +273,11 @@ public HProcessFunction visit(BlobType blobType) { throw new RuntimeException("Unsupported type"); } + @Override + public HProcessFunction visit(BlobRefType blobRefType) { + throw new RuntimeException("Unsupported type"); + } + @Override public HProcessFunction visit(ArrayType arrayType) { throw new RuntimeException("Unsupported type"); diff --git a/paimon-common/src/main/java/org/apache/paimon/sort/zorder/ZIndexer.java b/paimon-common/src/main/java/org/apache/paimon/sort/zorder/ZIndexer.java index 1d40fe75e776..f95e767cb5ae 100644 --- a/paimon-common/src/main/java/org/apache/paimon/sort/zorder/ZIndexer.java +++ b/paimon-common/src/main/java/org/apache/paimon/sort/zorder/ZIndexer.java @@ -26,6 +26,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import 
org.apache.paimon.types.CharType; @@ -360,6 +361,11 @@ public ZProcessFunction visit(BlobType blobType) { throw new UnsupportedOperationException("Does not support type blob"); } + @Override + public ZProcessFunction visit(BlobRefType blobRefType) { + throw new UnsupportedOperationException("Does not support type blob_ref"); + } + @Override public ZProcessFunction visit(ArrayType arrayType) { throw new RuntimeException("Unsupported type"); diff --git a/paimon-common/src/main/java/org/apache/paimon/types/InternalRowToSizeVisitor.java b/paimon-common/src/main/java/org/apache/paimon/types/InternalRowToSizeVisitor.java index dbac55a07dde..e3dab7d7517d 100644 --- a/paimon-common/src/main/java/org/apache/paimon/types/InternalRowToSizeVisitor.java +++ b/paimon-common/src/main/java/org/apache/paimon/types/InternalRowToSizeVisitor.java @@ -229,8 +229,18 @@ public BiFunction visit(BlobType blobType) { if (row.isNullAt(index)) { return NULL_SIZE; } else { - return Math.toIntExact(row.getVariant(index).sizeInBytes()); + return row.getBlob(index).toData().length; + } + }; + } + + @Override + public BiFunction visit(BlobRefType blobRefType) { + return (row, index) -> { + if (row.isNullAt(index)) { + return NULL_SIZE; } + return row.getBlobRef(index).reference().serialize().length; }; } diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/InternalRowUtils.java b/paimon-common/src/main/java/org/apache/paimon/utils/InternalRowUtils.java index 4cfe35e39851..407df59555f2 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/InternalRowUtils.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/InternalRowUtils.java @@ -332,6 +332,8 @@ public static Object get(DataGetters dataGetters, int pos, DataType fieldType) { return dataGetters.getVariant(pos); case BLOB: return dataGetters.getBlob(pos); + case BLOB_REF: + return dataGetters.getBlobRef(pos); default: throw new UnsupportedOperationException("Unsupported type: " + fieldType); } diff --git 
a/paimon-common/src/main/java/org/apache/paimon/utils/KeyProjectedRow.java b/paimon-common/src/main/java/org/apache/paimon/utils/KeyProjectedRow.java index d4999dcf57dc..59bfc0846a1f 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/KeyProjectedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/KeyProjectedRow.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -136,6 +137,11 @@ public Blob getBlob(int pos) { return row.getBlob(indexMapping[pos]); } + @Override + public BlobRef getBlobRef(int pos) { + return row.getBlobRef(indexMapping[pos]); + } + @Override public InternalArray getArray(int pos) { return row.getArray(indexMapping[pos]); diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedArray.java b/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedArray.java index 015fb022edbb..374f515767ff 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedArray.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedArray.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -138,6 +139,11 @@ public Blob getBlob(int pos) { return array.getBlob(indexMapping[pos]); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("ProjectedArray does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { return array.getArray(indexMapping[pos]); diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedRow.java 
b/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedRow.java index a9dc3e9253d7..2d03f5bc4f52 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedRow.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/ProjectedRow.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -153,6 +154,11 @@ public Blob getBlob(int pos) { return row.getBlob(indexMapping[pos]); } + @Override + public BlobRef getBlobRef(int pos) { + return row.getBlobRef(indexMapping[pos]); + } + @Override public InternalArray getArray(int pos) { return row.getArray(indexMapping[pos]); diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/TypeCheckUtils.java b/paimon-common/src/main/java/org/apache/paimon/utils/TypeCheckUtils.java index c1520be34107..b4b22517865a 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/TypeCheckUtils.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/TypeCheckUtils.java @@ -24,6 +24,7 @@ import static org.apache.paimon.types.DataTypeRoot.ARRAY; import static org.apache.paimon.types.DataTypeRoot.BIGINT; import static org.apache.paimon.types.DataTypeRoot.BLOB; +import static org.apache.paimon.types.DataTypeRoot.BLOB_REF; import static org.apache.paimon.types.DataTypeRoot.BOOLEAN; import static org.apache.paimon.types.DataTypeRoot.DECIMAL; import static org.apache.paimon.types.DataTypeRoot.INTEGER; @@ -107,7 +108,7 @@ public static boolean isVariant(DataType type) { } public static boolean isBlob(DataType type) { - return type.getTypeRoot() == BLOB; + return type.getTypeRoot() == BLOB || type.getTypeRoot() == BLOB_REF; } public static boolean isComparable(DataType type) { diff --git a/paimon-common/src/main/java/org/apache/paimon/utils/VectorMappingUtils.java 
b/paimon-common/src/main/java/org/apache/paimon/utils/VectorMappingUtils.java index 99e8fd455c41..6ea9a0a7b52f 100644 --- a/paimon-common/src/main/java/org/apache/paimon/utils/VectorMappingUtils.java +++ b/paimon-common/src/main/java/org/apache/paimon/utils/VectorMappingUtils.java @@ -45,6 +45,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -336,6 +337,11 @@ public ColumnVector visit(BlobType blobType) { throw new UnsupportedOperationException("BlobType is not supported."); } + @Override + public ColumnVector visit(BlobRefType blobRefType) { + throw new UnsupportedOperationException("BlobRefType is not supported."); + } + @Override public ColumnVector visit(ArrayType arrayType) { return new ArrayColumnVector() { diff --git a/paimon-common/src/test/java/org/apache/paimon/data/BlobReferenceTest.java b/paimon-common/src/test/java/org/apache/paimon/data/BlobReferenceTest.java new file mode 100644 index 000000000000..7bbb7965ba95 --- /dev/null +++ b/paimon-common/src/test/java/org/apache/paimon/data/BlobReferenceTest.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.data; + +import org.junit.jupiter.api.Test; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Test for {@link BlobReference}. */ +public class BlobReferenceTest { + + @Test + public void testSerializeAndDeserialize() { + BlobReference reference = new BlobReference("default.source", 7, 5L); + + BlobReference deserialized = BlobReference.deserialize(reference.serialize()); + + assertThat(deserialized.tableName()).isEqualTo("default.source"); + assertThat(deserialized.fieldId()).isEqualTo(7); + assertThat(deserialized.rowId()).isEqualTo(5L); + } + + @Test + public void testRejectUnexpectedVersion() { + BlobReference reference = new BlobReference("default.source", 7, 5L); + byte[] bytes = reference.serialize(); + bytes[0] = 3; + + assertThatThrownBy(() -> BlobReference.deserialize(bytes)) + .isInstanceOf(UnsupportedOperationException.class) + .hasMessageContaining("Expecting BlobReference version to be 1"); + } + + @Test + public void testEquality() { + BlobReference a = new BlobReference("default.source", 7, 5L); + BlobReference b = new BlobReference("default.source", 7, 5L); + BlobReference c = new BlobReference("default.source", 8, 5L); + + assertThat(a).isEqualTo(b); + assertThat(a.hashCode()).isEqualTo(b.hashCode()); + assertThat(a).isNotEqualTo(c); + } + + @Test + public void testIsBlobReference() { + BlobReference reference = new BlobReference("default.source", 7, 5L); + byte[] bytes = reference.serialize(); + + 
assertThat(BlobReference.isBlobReference(bytes)).isTrue(); + assertThat(BlobReference.isBlobReference(null)).isFalse(); + assertThat(BlobReference.isBlobReference(new byte[] {1, 2, 3})).isFalse(); + } +} diff --git a/paimon-common/src/test/java/org/apache/paimon/types/InternalRowToSizeVisitorTest.java b/paimon-common/src/test/java/org/apache/paimon/types/InternalRowToSizeVisitorTest.java index cfdae649c190..3357b54bdc3f 100644 --- a/paimon-common/src/test/java/org/apache/paimon/types/InternalRowToSizeVisitorTest.java +++ b/paimon-common/src/test/java/org/apache/paimon/types/InternalRowToSizeVisitorTest.java @@ -19,6 +19,9 @@ package org.apache.paimon.types; import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.DataGetters; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.GenericArray; @@ -36,6 +39,11 @@ import java.util.function.BiFunction; import java.util.stream.Collectors; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + /** Test for InternalRowToSizeVisitor. 
*/ public class InternalRowToSizeVisitorTest { @@ -192,4 +200,19 @@ void testCalculatorSize() { Assertions.assertThat(feildSizeCalculator.get(23).apply(row, 23)).isEqualTo(0); } + + @Test + void testBlobRefSizeUsesSerializedReferenceBytes() { + BlobReference reference = new BlobReference("default.t", 1, 0L); + BlobRef blobRef = Blob.fromReference(reference); + DataGetters row = mock(DataGetters.class); + when(row.isNullAt(0)).thenReturn(false); + when(row.getBlobRef(0)).thenReturn(blobRef); + + int size = new InternalRowToSizeVisitor().visit(DataTypes.BLOB_REF()).apply(row, 0); + + Assertions.assertThat(size).isEqualTo(reference.serialize().length); + verify(row).getBlobRef(0); + verify(row, never()).getBinary(0); + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java b/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java index 7f8715ab0846..652017a85e2a 100644 --- a/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java +++ b/paimon-core/src/main/java/org/apache/paimon/io/RowDataFileWriter.java @@ -32,7 +32,7 @@ import javax.annotation.Nullable; import java.io.IOException; -import java.util.Collections; +import java.util.ArrayList; import java.util.List; import java.util.function.Function; import java.util.function.Supplier; @@ -106,6 +106,10 @@ public DataFileMeta result() throws IOException { ? DataFileIndexWriter.EMPTY_RESULT : dataFileIndexWriter.result(); String externalPath = isExternalPath ? path.toString() : null; + List extraFiles = new ArrayList<>(); + if (indexResult.independentIndexFile() != null) { + extraFiles.add(indexResult.independentIndexFile()); + } return DataFileMeta.forAppend( path.getName(), fileSize, @@ -114,9 +118,7 @@ public DataFileMeta result() throws IOException { seqNumCounter.getValue() - super.recordCount(), seqNumCounter.getValue() - 1, schemaId, - indexResult.independentIndexFile() == null - ? 
Collections.emptyList() - : Collections.singletonList(indexResult.independentIndexFile()), + extraFiles, indexResult.embeddedIndexBytes(), fileSource, statsPair.getKey(), diff --git a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java index 271709c47ef5..cab6a28d7fac 100644 --- a/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java +++ b/paimon-core/src/main/java/org/apache/paimon/schema/SchemaValidation.java @@ -165,6 +165,7 @@ public static void validateTableSchema(TableSchema schema) { FileFormat fileFormat = FileFormat.fromIdentifier(options.formatType(), new Options(schema.options())); RowType tableRowType = new RowType(schema.fields()); + validateNestedBlobRefFields(tableRowType); Set blobDescriptorFields = validateBlobDescriptorFields(tableRowType, options); validateBlobExternalStorageFields(tableRowType, options, blobDescriptorFields); @@ -672,19 +673,22 @@ private static void validateRowTracking(TableSchema schema, CoreOptions options) List fields = schema.fields(); List blobNames = fields.stream() - .filter(field -> field.type().is(DataTypeRoot.BLOB)) + .filter( + field -> + field.type().is(DataTypeRoot.BLOB) + || field.type().is(DataTypeRoot.BLOB_REF)) .map(DataField::name) .collect(Collectors.toList()); if (!blobNames.isEmpty()) { checkArgument( options.dataEvolutionEnabled(), - "Data evolution config must enabled for table with BLOB type column."); + "Data evolution config must enabled for table with BLOB or BLOB_REF type column."); checkArgument( fields.size() > blobNames.size(), - "Table with BLOB type column must have other normal columns."); + "Table with BLOB or BLOB_REF type column must have other normal columns."); checkArgument( blobNames.stream().noneMatch(schema.partitionKeys()::contains), - "The BLOB type column can not be part of partition keys."); + "The BLOB or BLOB_REF type column can not be part of partition keys."); } 
FileFormat vectorFileFormat = vectorFileFormat(options); @@ -702,6 +706,49 @@ private static void validateRowTracking(TableSchema schema, CoreOptions options) } } + private static void validateNestedBlobRefFields(RowType rowType) { + for (DataField field : rowType.getFields()) { + checkArgument( + !containsNestedBlobRef(field.type()), + "Nested BLOB_REF type is not supported. Field '%s' contains a nested BLOB_REF.", + field.name()); + } + } + + private static boolean containsNestedBlobRef(DataType dataType) { + switch (dataType.getTypeRoot()) { + case ARRAY: + DataType arrayElementType = ((ArrayType) dataType).getElementType(); + return arrayElementType.is(DataTypeRoot.BLOB_REF) + || containsNestedBlobRef(arrayElementType); + case MULTISET: + DataType multisetElementType = ((MultisetType) dataType).getElementType(); + return multisetElementType.is(DataTypeRoot.BLOB_REF) + || containsNestedBlobRef(multisetElementType); + case MAP: + MapType mapType = (MapType) dataType; + return mapType.getKeyType().is(DataTypeRoot.BLOB_REF) + || containsNestedBlobRef(mapType.getKeyType()) + || mapType.getValueType().is(DataTypeRoot.BLOB_REF) + || containsNestedBlobRef(mapType.getValueType()); + case ROW: + for (DataField field : ((RowType) dataType).getFields()) { + if (field.type().is(DataTypeRoot.BLOB_REF) + || containsNestedBlobRef(field.type())) { + return true; + } + } + return false; + case VECTOR: + DataType vectorElementType = + ((org.apache.paimon.types.VectorType) dataType).getElementType(); + return vectorElementType.is(DataTypeRoot.BLOB_REF) + || containsNestedBlobRef(vectorElementType); + default: + return false; + } + } + private static Set validateBlobDescriptorFields(RowType rowType, CoreOptions options) { Set blobFieldNames = rowType.getFields().stream() diff --git a/paimon-core/src/main/java/org/apache/paimon/stats/SimpleStatsEvolution.java b/paimon-core/src/main/java/org/apache/paimon/stats/SimpleStatsEvolution.java index 906d97501f4b..d2dc8fcbb69d 100644 --- 
a/paimon-core/src/main/java/org/apache/paimon/stats/SimpleStatsEvolution.java +++ b/paimon-core/src/main/java/org/apache/paimon/stats/SimpleStatsEvolution.java @@ -22,6 +22,7 @@ import org.apache.paimon.casting.CastedRow; import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.GenericArray; import org.apache.paimon.data.GenericRow; @@ -277,6 +278,11 @@ public Blob getBlob(int pos) { throw new UnsupportedOperationException(); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException(); + } + @Override public InternalArray getArray(int pos) { throw new UnsupportedOperationException(); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/AppendOnlyFileStoreTable.java b/paimon-core/src/main/java/org/apache/paimon/table/AppendOnlyFileStoreTable.java index 327810b881bc..5b3c2600f397 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/AppendOnlyFileStoreTable.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/AppendOnlyFileStoreTable.java @@ -123,7 +123,11 @@ public InnerTableRead newRead() { new AppendTableRawFileSplitReadProvider( () -> store().newRead(), config)); } - return new AppendTableRead(providerFactories, schema()); + return new AppendTableRead( + providerFactories, + schema(), + catalogEnvironment().catalogContext(), + () -> new AppendTableRead(providerFactories, schema(), null, null)); } @Override diff --git a/paimon-core/src/main/java/org/apache/paimon/table/PrimaryKeyFileStoreTable.java b/paimon-core/src/main/java/org/apache/paimon/table/PrimaryKeyFileStoreTable.java index a2fee49bfb88..52b6faa74acf 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/PrimaryKeyFileStoreTable.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/PrimaryKeyFileStoreTable.java @@ -147,7 +147,17 @@ protected BiConsumer nonPartitionFilterConsumer() { @Override 
public InnerTableRead newRead() { return new KeyValueTableRead( - () -> store().newRead(), () -> store().newBatchRawFileRead(), schema()); + () -> store().newRead(), + () -> store().newBatchRawFileRead(), + schema(), + catalogEnvironment().catalogContext(), + () -> + new KeyValueTableRead( + () -> store().newRead(), + () -> store().newBatchRawFileRead(), + schema(), + null, + null)); } @Override diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractDataTableRead.java b/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractDataTableRead.java index ca5af88f40bc..4492ed5d4518 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractDataTableRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/AbstractDataTableRead.java @@ -18,23 +18,33 @@ package org.apache.paimon.table.source; +import org.apache.paimon.catalog.CatalogContext; import org.apache.paimon.catalog.TableQueryAuthResult; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; +import org.apache.paimon.data.BlobReferenceResolver; import org.apache.paimon.data.InternalRow; import org.apache.paimon.disk.IOManager; import org.apache.paimon.predicate.Predicate; import org.apache.paimon.predicate.PredicateProjectionConverter; import org.apache.paimon.reader.RecordReader; import org.apache.paimon.schema.TableSchema; +import org.apache.paimon.types.DataTypeRoot; import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.BlobReferenceLookup; import org.apache.paimon.utils.ListUtils; import org.apache.paimon.utils.ProjectedRow; +import javax.annotation.Nullable; + import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; +import java.util.LinkedHashSet; import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.function.Supplier; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; @@ -45,9 +55,20 @@ public abstract class 
AbstractDataTableRead implements InnerTableRead { private boolean executeFilter = false; private Predicate predicate; private final TableSchema schema; + @Nullable private final CatalogContext catalogContext; + @Nullable private final Supplier readFactory; + + public AbstractDataTableRead(TableSchema schema, @Nullable CatalogContext catalogContext) { + this(schema, catalogContext, null); + } - public AbstractDataTableRead(TableSchema schema) { + public AbstractDataTableRead( + TableSchema schema, + @Nullable CatalogContext catalogContext, + @Nullable Supplier readFactory) { this.schema = schema; + this.catalogContext = catalogContext; + this.readFactory = readFactory; } public abstract void applyReadType(RowType readType); @@ -96,6 +117,25 @@ public final RecordReader createReader(Split split) throws IOExcept split = authSplit.split(); authResult = authSplit.authResult(); } + + // Check if this split has BLOB_REF fields that need resolving + if (catalogContext != null) { + RowType rowType = this.readType == null ? schema.logicalRowType() : this.readType; + int[] blobRefFields = + rowType.getFields().stream() + .filter(field -> field.type().is(DataTypeRoot.BLOB_REF)) + .mapToInt(field -> rowType.getFieldIndex(field.name())) + .toArray(); + if (blobRefFields.length > 0) { + if (readFactory == null) { + throw new IllegalStateException( + "Cannot read BLOB_REF fields without a readFactory. 
" + + "The table must provide a readFactory to support BLOB_REF resolution."); + } + return createBlobRefReader(split, authResult, blobRefFields); + } + } + RecordReader reader; if (authResult == null) { reader = reader(split); @@ -105,10 +145,58 @@ public final RecordReader createReader(Split split) throws IOExcept if (executeFilter) { reader = executeFilter(reader); } - return reader; } + private RecordReader createBlobRefReader( + Split split, @Nullable TableQueryAuthResult authResult, int[] blobRefFields) + throws IOException { + // Pre-scan: use an independent read instance to read only BLOB_REF columns. + // Transfer predicate to narrow the scan range, but NOT limit/topN since the + // pre-scan must cover all rows that the second pass might return. + RowType rowType = this.readType == null ? schema.logicalRowType() : this.readType; + RowType blobRefOnlyType = rowType.project(blobRefFields); + InnerTableRead prescanRead = readFactory.get(); + prescanRead.withReadType(blobRefOnlyType); + if (predicate != null) { + prescanRead.withFilter(predicate); + } + Split prescanSplit = authResult != null ? new QueryAuthSplit(split, authResult) : split; + LinkedHashSet references = new LinkedHashSet<>(); + RecordReader prescanReader = prescanRead.createReader(prescanSplit); + try { + prescanReader.forEachRemaining( + row -> { + for (int i = 0; i < blobRefFields.length; i++) { + if (row.isNullAt(i)) { + continue; + } + BlobRef blobRef = row.getBlobRef(i); + references.add(blobRef.reference()); + } + }); + } finally { + prescanReader.close(); + } + + // Build the resolver from collected references + List refList = new ArrayList<>(references); + BlobReferenceResolver resolver = + BlobReferenceLookup.createResolver(catalogContext, refList); + + // Second pass: read all columns, wrap each row to resolve UnresolvedBlob + RecordReader reader = + authResult == null ? 
reader(split) : authedReader(split, authResult); + if (executeFilter) { + reader = executeFilter(reader); + } + Set blobRefFieldSet = new HashSet<>(); + for (int f : blobRefFields) { + blobRefFieldSet.add(f); + } + return reader.transform(row -> new BlobRefResolvingRow(row, blobRefFieldSet, resolver)); + } + private RecordReader authedReader(Split split, TableQueryAuthResult authResult) throws IOException { RecordReader reader; diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/AppendTableRead.java b/paimon-core/src/main/java/org/apache/paimon/table/source/AppendTableRead.java index 1a9ed9b4bee2..5501e37d9e7b 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/AppendTableRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/AppendTableRead.java @@ -18,6 +18,7 @@ package org.apache.paimon.table.source; +import org.apache.paimon.catalog.CatalogContext; import org.apache.paimon.data.InternalRow; import org.apache.paimon.operation.MergeFileSplitRead; import org.apache.paimon.operation.SplitRead; @@ -35,6 +36,7 @@ import java.util.ArrayList; import java.util.List; import java.util.function.Function; +import java.util.function.Supplier; import java.util.stream.Collectors; /** @@ -51,8 +53,10 @@ public final class AppendTableRead extends AbstractDataTableRead { public AppendTableRead( List> providerFactories, - TableSchema schema) { - super(schema); + TableSchema schema, + CatalogContext catalogContext, + @Nullable Supplier readFactory) { + super(schema, catalogContext, readFactory); this.readProviders = providerFactories.stream() .map(factory -> factory.apply(this::config)) diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/BlobRefResolvingRow.java b/paimon-core/src/main/java/org/apache/paimon/table/source/BlobRefResolvingRow.java new file mode 100644 index 000000000000..b6aa4d2f0405 --- /dev/null +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/BlobRefResolvingRow.java @@ -0,0 
+1,166 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.table.source; + +import org.apache.paimon.data.BinaryString; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReferenceResolver; +import org.apache.paimon.data.Decimal; +import org.apache.paimon.data.InternalArray; +import org.apache.paimon.data.InternalMap; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.data.InternalVector; +import org.apache.paimon.data.Timestamp; +import org.apache.paimon.data.variant.Variant; +import org.apache.paimon.types.RowKind; + +import java.util.Set; + +/** + * {@link InternalRow} wrapper that resolves UnresolvedBlob to real {@link Blob} via a {@link + * BlobReferenceResolver} when {@link #getBlob(int)} is called. 
+ */ +class BlobRefResolvingRow implements InternalRow { + + private final InternalRow wrapped; + private final Set blobRefFields; + private final BlobReferenceResolver resolver; + + BlobRefResolvingRow( + InternalRow wrapped, Set blobRefFields, BlobReferenceResolver resolver) { + this.wrapped = wrapped; + this.blobRefFields = blobRefFields; + this.resolver = resolver; + } + + @Override + public int getFieldCount() { + return wrapped.getFieldCount(); + } + + @Override + public RowKind getRowKind() { + return wrapped.getRowKind(); + } + + @Override + public void setRowKind(RowKind kind) { + wrapped.setRowKind(kind); + } + + @Override + public boolean isNullAt(int pos) { + return wrapped.isNullAt(pos); + } + + @Override + public boolean getBoolean(int pos) { + return wrapped.getBoolean(pos); + } + + @Override + public byte getByte(int pos) { + return wrapped.getByte(pos); + } + + @Override + public short getShort(int pos) { + return wrapped.getShort(pos); + } + + @Override + public int getInt(int pos) { + return wrapped.getInt(pos); + } + + @Override + public long getLong(int pos) { + return wrapped.getLong(pos); + } + + @Override + public float getFloat(int pos) { + return wrapped.getFloat(pos); + } + + @Override + public double getDouble(int pos) { + return wrapped.getDouble(pos); + } + + @Override + public BinaryString getString(int pos) { + return wrapped.getString(pos); + } + + @Override + public Decimal getDecimal(int pos, int precision, int scale) { + return wrapped.getDecimal(pos, precision, scale); + } + + @Override + public Timestamp getTimestamp(int pos, int precision) { + return wrapped.getTimestamp(pos, precision); + } + + @Override + public byte[] getBinary(int pos) { + return wrapped.getBinary(pos); + } + + @Override + public Variant getVariant(int pos) { + return wrapped.getVariant(pos); + } + + @Override + public Blob getBlob(int pos) { + return wrapped.getBlob(pos); + } + + @Override + public BlobRef getBlobRef(int pos) { + BlobRef blobRef = 
wrapped.getBlobRef(pos); + if (blobRefFields.contains(pos) && !blobRef.isResolved()) { + resolver.resolve(blobRef); + } + return blobRef; + } + + @Override + public InternalRow getRow(int pos, int numFields) { + return wrapped.getRow(pos, numFields); + } + + @Override + public InternalArray getArray(int pos) { + return wrapped.getArray(pos); + } + + @Override + public InternalVector getVector(int pos) { + return wrapped.getVector(pos); + } + + @Override + public InternalMap getMap(int pos) { + return wrapped.getMap(pos); + } +} diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/KeyValueTableRead.java b/paimon-core/src/main/java/org/apache/paimon/table/source/KeyValueTableRead.java index fda7d70ffdf6..1d9a4d9b51b5 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/KeyValueTableRead.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/KeyValueTableRead.java @@ -21,6 +21,7 @@ import org.apache.paimon.CoreOptions; import org.apache.paimon.KeyValue; import org.apache.paimon.annotation.VisibleForTesting; +import org.apache.paimon.catalog.CatalogContext; import org.apache.paimon.data.InternalRow; import org.apache.paimon.disk.IOManager; import org.apache.paimon.operation.MergeFileSplitRead; @@ -63,8 +64,10 @@ public final class KeyValueTableRead extends AbstractDataTableRead { public KeyValueTableRead( Supplier mergeReadSupplier, Supplier batchRawReadSupplier, - TableSchema schema) { - super(schema); + TableSchema schema, + CatalogContext catalogContext, + @Nullable Supplier readFactory) { + super(schema, catalogContext, readFactory); this.readProviders = Arrays.asList( new PrimaryKeyTableRawFileSplitReadProvider( diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/BlobReferenceLookup.java b/paimon-core/src/main/java/org/apache/paimon/utils/BlobReferenceLookup.java new file mode 100644 index 000000000000..2ec94009743b --- /dev/null +++ 
b/paimon-core/src/main/java/org/apache/paimon/utils/BlobReferenceLookup.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.utils; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.catalog.Catalog; +import org.apache.paimon.catalog.CatalogContext; +import org.apache.paimon.catalog.CatalogFactory; +import org.apache.paimon.catalog.Identifier; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; +import org.apache.paimon.data.BlobReferenceResolver; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.table.SpecialFields; +import org.apache.paimon.table.Table; +import org.apache.paimon.table.source.ReadBuilder; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowType; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.TreeSet; + +/** + * Batch-preloads {@link BlobDescriptor}s for a set of 
{@link BlobReference}s by scanning the + * upstream tables once. The preloaded descriptors are lightweight (uri + offset + length) so memory + * stays small even for large numbers of references. + */ +public class BlobReferenceLookup { + + /** + * Creates a resolver that resolves {@link BlobRef}s using a preloaded descriptor cache. All + * given references are batch-scanned from the upstream tables upfront. + */ + public static BlobReferenceResolver createResolver( + CatalogContext catalogContext, List references) { + Map cached = preloadDescriptors(catalogContext, references); + Catalog catalog = CatalogFactory.createCatalog(catalogContext); + Map cache = new HashMap<>(); + return blobRef -> { + BlobDescriptor descriptor = cached.get(blobRef.reference()); + if (descriptor == null) { + throw new IllegalStateException( + "BlobReference not found in preloaded cache: " + + blobRef.reference() + + ". Cache keys: " + + cached.keySet()); + } + UriReader uriReader = + cache.computeIfAbsent( + blobRef.reference().tableName(), + tableName -> { + try { + return UriReader.fromFile( + catalog.getTable(Identifier.fromString(tableName)) + .fileIO()); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + blobRef.resolve(uriReader, descriptor); + }; + } + + private static Map preloadDescriptors( + CatalogContext catalogContext, List references) { + if (references.isEmpty()) { + return Collections.emptyMap(); + } + try (Catalog catalog = CatalogFactory.createCatalog(catalogContext)) { + return loadReferencedDescriptors(catalog, references); + } catch (Exception e) { + throw new RuntimeException("Failed to preload blob descriptors.", e); + } + } + + private static Map loadReferencedDescriptors( + Catalog catalog, Collection references) throws Exception { + Map grouped = new HashMap<>(); + for (BlobReference reference : references) { + grouped.computeIfAbsent(reference.tableName(), TableReferences::new).add(reference); + } + Map resolved = new HashMap<>(); + for 
(TableReferences tableReferences : grouped.values()) { + loadTableDescriptors(catalog, tableReferences, resolved); + } + return resolved; + } + + private static void loadTableDescriptors( + Catalog catalog, + TableReferences tableReferences, + Map resolved) + throws Exception { + Table table = + catalog.getTable(Identifier.fromString(tableReferences.tableName)) + .copy( + Collections.singletonMap( + CoreOptions.BLOB_AS_DESCRIPTOR.key(), "true")); + + List fields = new ArrayList<>(tableReferences.referencesByField.size()); + TreeSet rowIds = new TreeSet<>(); + for (Map.Entry> entry : + tableReferences.referencesByField.entrySet()) { + int fieldId = entry.getKey(); + if (!table.rowType().containsField(fieldId)) { + throw new IllegalArgumentException( + "Cannot find blob fieldId " + + fieldId + + " in upstream table " + + tableReferences.tableName + + "."); + } + int fieldPos = table.rowType().getFieldIndexByFieldId(fieldId); + fields.add(new FieldRead(fieldId, fieldPos, table.rowType().getFields().get(fieldPos))); + for (BlobReference reference : entry.getValue()) { + rowIds.add(reference.rowId()); + } + } + + Collections.sort(fields, Comparator.comparingInt(left -> left.fieldPos)); + + List readFields = new ArrayList<>(fields.size()); + for (FieldRead field : fields) { + readFields.add(field.field); + } + + ReadBuilder readBuilder = + table.newReadBuilder() + .withReadType(SpecialFields.rowTypeWithRowId(new RowType(readFields))) + .withRowRanges(Range.toRanges(rowIds)); + + try (RecordReader reader = + readBuilder.newRead().createReader(readBuilder.newScan().plan())) { + RecordReader.RecordIterator batch; + while ((batch = reader.readBatch()) != null) { + try { + InternalRow row; + while ((row = batch.next()) != null) { + long rowId = row.getLong(fields.size()); + for (int i = 0; i < fields.size(); i++) { + Blob blob = row.getBlob(i); + if (blob != null) { + resolved.put( + new BlobReference( + tableReferences.tableName, + fields.get(i).fieldId, + rowId), + 
blob.toDescriptor()); + } + } + } + } finally { + batch.releaseBatch(); + } + } + } + } + + private static class TableReferences { + private final String tableName; + private final Map> referencesByField = new HashMap<>(); + + private TableReferences(String tableName) { + this.tableName = tableName; + } + + private void add(BlobReference reference) { + referencesByField + .computeIfAbsent(reference.fieldId(), unused -> new ArrayList<>()) + .add(reference); + } + } + + private static class FieldRead { + private final int fieldId; + private final int fieldPos; + private final DataField field; + + private FieldRead(int fieldId, int fieldPos, DataField field) { + this.fieldId = fieldId; + this.fieldPos = fieldPos; + this.field = field; + } + } + + private BlobReferenceLookup() {} +} diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/OffsetRow.java b/paimon-core/src/main/java/org/apache/paimon/utils/OffsetRow.java index 70b7e24514db..01a9e70e1eb1 100644 --- a/paimon-core/src/main/java/org/apache/paimon/utils/OffsetRow.java +++ b/paimon-core/src/main/java/org/apache/paimon/utils/OffsetRow.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -137,6 +138,11 @@ public Blob getBlob(int pos) { return row.getBlob(offset + pos); } + @Override + public BlobRef getBlobRef(int pos) { + return row.getBlobRef(offset + pos); + } + @Override public InternalArray getArray(int pos) { return row.getArray(offset + pos); diff --git a/paimon-core/src/main/java/org/apache/paimon/utils/PartialRow.java b/paimon-core/src/main/java/org/apache/paimon/utils/PartialRow.java index 2e172dadf38a..6bb9428ebc71 100644 --- a/paimon-core/src/main/java/org/apache/paimon/utils/PartialRow.java +++ b/paimon-core/src/main/java/org/apache/paimon/utils/PartialRow.java @@ -20,6 +20,7 
@@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -135,6 +136,11 @@ public Blob getBlob(int pos) { return row.getBlob(pos); } + @Override + public BlobRef getBlobRef(int pos) { + return row.getBlobRef(pos); + } + @Override public InternalArray getArray(int pos) { return row.getArray(pos); diff --git a/paimon-core/src/test/java/org/apache/paimon/append/BlobTableTest.java b/paimon-core/src/test/java/org/apache/paimon/append/BlobTableTest.java index e3019485bb2d..fa1cafe8c451 100644 --- a/paimon-core/src/test/java/org/apache/paimon/append/BlobTableTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/append/BlobTableTest.java @@ -25,6 +25,8 @@ import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.fs.FileIO; @@ -39,7 +41,9 @@ import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.table.Table; import org.apache.paimon.table.TableTestBase; +import org.apache.paimon.table.sink.BatchTableCommit; import org.apache.paimon.table.sink.BatchTableWrite; +import org.apache.paimon.table.sink.BatchWriteBuilder; import org.apache.paimon.table.sink.CommitMessage; import org.apache.paimon.table.sink.StreamTableWrite; import org.apache.paimon.table.sink.StreamWriteBuilder; @@ -751,6 +755,108 @@ void testRenameBlobColumnShouldFail() throws Exception { .hasMessageContaining("Cannot rename BLOB column"); } + @Test + public void testBlobRefE2E() throws Exception { + // 1. 
Create upstream table with BLOB field and write data + String upstreamTableName = "UpstreamBlob"; + Schema.Builder upstreamSchema = Schema.newBuilder(); + upstreamSchema.column("id", DataTypes.INT()); + upstreamSchema.column("name", DataTypes.STRING()); + upstreamSchema.column("image", DataTypes.BLOB()); + upstreamSchema.option(CoreOptions.TARGET_FILE_SIZE.key(), "25 MB"); + upstreamSchema.option(CoreOptions.ROW_TRACKING_ENABLED.key(), "true"); + upstreamSchema.option(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true"); + catalog.createTable(identifier(upstreamTableName), upstreamSchema.build(), true); + + FileStoreTable upstreamTable = + (FileStoreTable) catalog.getTable(identifier(upstreamTableName)); + + byte[] imageBytes1 = randomBytes(); + byte[] imageBytes2 = randomBytes(); + + BatchWriteBuilder upstreamWriteBuilder = upstreamTable.newBatchWriteBuilder(); + try (BatchTableWrite write = upstreamWriteBuilder.newWrite(); + BatchTableCommit commit = upstreamWriteBuilder.newCommit()) { + write.write( + GenericRow.of(1, BinaryString.fromString("row1"), new BlobData(imageBytes1))); + write.write( + GenericRow.of(2, BinaryString.fromString("row2"), new BlobData(imageBytes2))); + commit.commit(write.prepareCommit()); + } + + // 2. 
Get field ID for the "image" blob column + int imageFieldId = + upstreamTable.rowType().getFields().stream() + .filter(f -> f.name().equals("image")) + .findFirst() + .orElseThrow(() -> new RuntimeException("image field not found")) + .id(); + + // Read upstream with _ROW_ID to get actual row IDs + RowTrackingTable upstreamRowTracking = new RowTrackingTable(upstreamTable); + // schema: 0=id, 1=name, 2=image, 3=_ROW_ID, 4=_SEQUENCE_NUMBER + ReadBuilder rowIdReader = + upstreamRowTracking.newReadBuilder().withProjection(new int[] {0, 2, 3}); + // maps: upstream id -> (rowId, blobData) + java.util.Map idToRowId = new java.util.HashMap<>(); + java.util.Map idToBlob = new java.util.HashMap<>(); + rowIdReader + .newRead() + .createReader(rowIdReader.newScan().plan()) + .forEachRemaining( + row -> { + int id = row.getInt(0); + byte[] blobData = row.getBlob(1).toData(); + long rowId = row.getLong(2); + idToRowId.put(id, rowId); + idToBlob.put(id, blobData); + }); + assertThat(idToRowId.size()).isEqualTo(2); + + // 3. Create downstream table with BLOB_REF field + String downstreamTableName = "DownstreamRef"; + Schema.Builder downstreamSchema = Schema.newBuilder(); + downstreamSchema.column("id", DataTypes.INT()); + downstreamSchema.column("label", DataTypes.STRING()); + downstreamSchema.column("image_ref", DataTypes.BLOB_REF()); + downstreamSchema.option(CoreOptions.TARGET_FILE_SIZE.key(), "25 MB"); + downstreamSchema.option(CoreOptions.ROW_TRACKING_ENABLED.key(), "true"); + downstreamSchema.option(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true"); + catalog.createTable(identifier(downstreamTableName), downstreamSchema.build(), true); + + FileStoreTable downstreamTable = + (FileStoreTable) catalog.getTable(identifier(downstreamTableName)); + + // 4. Write blob references using actual row IDs from upstream + String upstreamFullName = database + "." 
+ upstreamTableName; + BlobReference ref1 = new BlobReference(upstreamFullName, imageFieldId, idToRowId.get(1)); + BlobReference ref2 = new BlobReference(upstreamFullName, imageFieldId, idToRowId.get(2)); + + BatchWriteBuilder downstreamWriteBuilder = downstreamTable.newBatchWriteBuilder(); + try (BatchTableWrite write = downstreamWriteBuilder.newWrite(); + BatchTableCommit commit = downstreamWriteBuilder.newCommit()) { + write.write( + GenericRow.of(1, BinaryString.fromString("label1"), Blob.fromReference(ref1))); + write.write( + GenericRow.of(2, BinaryString.fromString("label2"), Blob.fromReference(ref2))); + commit.commit(write.prepareCommit()); + } + + // 5. Read downstream table — blob references should resolve from upstream + ReadBuilder downstreamReadBuilder = downstreamTable.newReadBuilder(); + downstreamReadBuilder + .newRead() + .createReader(downstreamReadBuilder.newScan().plan()) + .forEachRemaining( + row -> { + int id = row.getInt(0); + BlobRef blobRef = row.getBlobRef(2); + assertThat(blobRef).isNotNull(); + assertThat(blobRef.isResolved()).isTrue(); + assertThat(blobRef.toData()).isEqualTo(idToBlob.get(id)); + }); + } + private void createExternalStorageTable() throws Exception { Schema.Builder schemaBuilder = Schema.newBuilder(); schemaBuilder.column("f0", DataTypes.INT()); diff --git a/paimon-core/src/test/java/org/apache/paimon/io/RowDataFileWriterTest.java b/paimon-core/src/test/java/org/apache/paimon/io/RowDataFileWriterTest.java new file mode 100644 index 000000000000..0e9e10aa2147 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/io/RowDataFileWriterTest.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.io; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobReference; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.fileindex.FileIndexOptions; +import org.apache.paimon.format.FileFormat; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.manifest.FileSource; +import org.apache.paimon.options.Options; +import org.apache.paimon.statistics.NoneSimpleColStatsCollector; +import org.apache.paimon.statistics.SimpleColStatsCollector; +import org.apache.paimon.types.BlobRefType; +import org.apache.paimon.types.DataType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.RowType; +import org.apache.paimon.utils.LongCounter; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link RowDataFileWriter}. 
*/ +public class RowDataFileWriterTest { + + private static final RowType SCHEMA = + RowType.of( + new DataType[] {new IntType(), new BlobRefType()}, new String[] {"id", "ref"}); + + @TempDir java.nio.file.Path tempDir; + + @Test + public void testWriteBlobRefFile() throws Exception { + FileFormat fileFormat = FileFormat.fromIdentifier("parquet", new Options()); + Path dataPath = new Path(tempDir.toUri().toString(), "data.parquet"); + BlobReference reference = new BlobReference("default.upstream", 7, 11L); + + RowDataFileWriter writer = + new RowDataFileWriter( + LocalFileIO.create(), + RollingFileWriter.createFileWriterContext( + fileFormat, + SCHEMA, + new SimpleColStatsCollector.Factory[] { + NoneSimpleColStatsCollector::new, + NoneSimpleColStatsCollector::new + }, + CoreOptions.FILE_COMPRESSION.defaultValue()), + dataPath, + SCHEMA, + 0L, + () -> new LongCounter(0), + new FileIndexOptions(), + FileSource.APPEND, + false, + false, + false, + SCHEMA.getFieldNames()); + + writer.write(GenericRow.of(1, Blob.fromReference(reference))); + writer.close(); + + DataFileMeta meta = writer.result(); + + // No .blobref extra files should be produced + assertThat(meta.extraFiles().stream().noneMatch(f -> f.endsWith(".blobref"))).isTrue(); + assertThat(meta.rowCount()).isEqualTo(1); + } +} diff --git a/paimon-core/src/test/java/org/apache/paimon/schema/BlobRefSchemaValidationTest.java b/paimon-core/src/test/java/org/apache/paimon/schema/BlobRefSchemaValidationTest.java new file mode 100644 index 000000000000..fcc345d719a9 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/schema/BlobRefSchemaValidationTest.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.schema; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataTypes; + +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static java.util.Collections.emptyList; +import static org.apache.paimon.CoreOptions.BUCKET; +import static org.apache.paimon.schema.SchemaValidation.validateTableSchema; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +/** Tests for BLOB_REF-specific schema validation. */ +public class BlobRefSchemaValidationTest { + + @Test + public void testNestedBlobRefTableSchema() { + Map options = new HashMap<>(); + options.put(CoreOptions.ROW_TRACKING_ENABLED.key(), "true"); + options.put(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true"); + options.put(BUCKET.key(), String.valueOf(-1)); + + List fields = + Arrays.asList( + new DataField(0, "f0", DataTypes.INT()), + new DataField( + 1, + "f1", + DataTypes.ROW(DataTypes.FIELD(2, "nested", DataTypes.BLOB_REF())))); + + assertThatThrownBy( + () -> + validateTableSchema( + new TableSchema( + 1, + fields, + 10, + emptyList(), + emptyList(), + options, + ""))) + .hasMessage( + "Nested BLOB_REF type is not supported. 
Field 'f1' contains a nested BLOB_REF."); + } +} diff --git a/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java b/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java index 2f7ffbb6d937..9b931181f27f 100644 --- a/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/schema/SchemaValidationTest.java @@ -144,13 +144,14 @@ public void testBlobTableSchema() { options.put(CoreOptions.ROW_TRACKING_ENABLED.key(), "true"); assertThatThrownBy(() -> validateBlobSchema(options, emptyList())) - .hasMessage("Data evolution config must enabled for table with BLOB type column."); + .hasMessage( + "Data evolution config must enabled for table with BLOB or BLOB_REF type column."); options.clear(); options.put(CoreOptions.ROW_TRACKING_ENABLED.key(), "true"); options.put(CoreOptions.DATA_EVOLUTION_ENABLED.key(), "true"); assertThatThrownBy(() -> validateBlobSchema(options, singletonList("f2"))) - .hasMessage("The BLOB type column can not be part of partition keys."); + .hasMessage("The BLOB or BLOB_REF type column can not be part of partition keys."); assertThatThrownBy( () -> { @@ -164,7 +165,8 @@ public void testBlobTableSchema() { options, "")); }) - .hasMessage("Table with BLOB type column must have other normal columns."); + .hasMessage( + "Table with BLOB or BLOB_REF type column must have other normal columns."); } @Test diff --git a/paimon-core/src/test/java/org/apache/paimon/utils/BlobReferenceLookupTest.java b/paimon-core/src/test/java/org/apache/paimon/utils/BlobReferenceLookupTest.java new file mode 100644 index 000000000000..a40825756df3 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/utils/BlobReferenceLookupTest.java @@ -0,0 +1,233 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.utils; + +import org.apache.paimon.catalog.Catalog; +import org.apache.paimon.catalog.CatalogContext; +import org.apache.paimon.catalog.CatalogFactory; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; +import org.apache.paimon.data.BlobReferenceResolver; +import org.apache.paimon.data.GenericRow; +import org.apache.paimon.data.InternalRow; +import org.apache.paimon.disk.IOManager; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.metrics.MetricRegistry; +import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.table.Table; +import org.apache.paimon.table.source.ReadBuilder; +import org.apache.paimon.table.source.Split; +import org.apache.paimon.table.source.TableRead; +import org.apache.paimon.table.source.TableScan; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.DataTypes; +import org.apache.paimon.types.RowType; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; +import org.mockito.MockedStatic; +import org.mockito.Mockito; + +import javax.annotation.Nullable; + +import java.io.IOException; 
+import java.util.Collections; +import java.util.List; +import java.util.OptionalLong; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyList; +import static org.mockito.ArgumentMatchers.anyMap; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +/** Tests for {@link BlobReferenceLookup}. */ +public class BlobReferenceLookupTest { + + @TempDir java.nio.file.Path tempDir; + + @Test + public void testCreateResolverPreloadsDescriptors() throws Exception { + CatalogContext context = CatalogContext.create(new Path(tempDir.toUri().toString())); + Catalog catalog = mock(Catalog.class); + Table table = mock(Table.class); + ReadBuilder readBuilder = mock(ReadBuilder.class); + TableScan scan = mock(TableScan.class); + TableScan.Plan plan = mock(TableScan.Plan.class); + + // Write real blob files + byte[] leftPayload = new byte[] {1, 2, 3}; + byte[] rightPayload = new byte[] {4, 5, 6}; + Path leftBlobPath = new Path(tempDir.toUri().toString(), "left.blob"); + Path rightBlobPath = new Path(tempDir.toUri().toString(), "right.blob"); + LocalFileIO fileIO = LocalFileIO.create(); + try (org.apache.paimon.fs.PositionOutputStream out = + fileIO.newOutputStream(leftBlobPath, false)) { + out.write(leftPayload); + } + try (org.apache.paimon.fs.PositionOutputStream out = + fileIO.newOutputStream(rightBlobPath, false)) { + out.write(rightPayload); + } + + BlobDescriptor leftDescriptor = + new BlobDescriptor(leftBlobPath.toString(), 0L, leftPayload.length); + BlobDescriptor rightDescriptor = + new BlobDescriptor(rightBlobPath.toString(), 0L, rightPayload.length); + + Blob leftBlob = + Blob.fromDescriptor(UriReader.fromFile(LocalFileIO.create()), leftDescriptor); + Blob rightBlob = + Blob.fromDescriptor(UriReader.fromFile(LocalFileIO.create()), rightDescriptor); + + 
BlobReference leftReference = new BlobReference("default.source", 7, 12L); + BlobReference rightReference = new BlobReference("default.source", 8, 12L); + + Split readerSplit = new TestSplit(); + InternalRow preloadRow = GenericRow.of(leftBlob, rightBlob, 12L); + + when(catalog.getTable(any())).thenReturn(table); + when(table.copy(anyMap())).thenReturn(table); + when(table.rowType()) + .thenReturn( + new RowType( + java.util.Arrays.asList( + new DataField(7, "blob_left", DataTypes.BLOB()), + new DataField(8, "blob_right", DataTypes.BLOB())))); + when(table.newReadBuilder()).thenReturn(readBuilder); + when(readBuilder.withReadType(any(RowType.class))).thenReturn(readBuilder); + when(readBuilder.withRowRanges(anyList())).thenReturn(readBuilder); + when(readBuilder.newRead()) + .thenAnswer( + invocation -> + new ListRowTableRead( + readerSplit, Collections.singletonList(preloadRow))); + when(readBuilder.newScan()).thenReturn(scan); + when(scan.plan()).thenReturn(plan); + when(plan.splits()).thenReturn(Collections.singletonList(readerSplit)); + + try (MockedStatic mockedCatalogFactory = + Mockito.mockStatic(CatalogFactory.class)) { + mockedCatalogFactory + .when(() -> CatalogFactory.createCatalog(context)) + .thenReturn(catalog); + + when(table.fileIO()).thenReturn(LocalFileIO.create()); + + BlobReferenceResolver resolver = + BlobReferenceLookup.createResolver( + context, java.util.Arrays.asList(leftReference, rightReference)); + + BlobRef leftBlobRef = new BlobRef(leftReference); + BlobRef rightBlobRef = new BlobRef(rightReference); + resolver.resolve(leftBlobRef); + resolver.resolve(rightBlobRef); + assertThat(leftBlobRef.isResolved()).isTrue(); + assertThat(rightBlobRef.isResolved()).isTrue(); + assertThat(leftBlobRef.toData()).isEqualTo(leftPayload); + assertThat(rightBlobRef.toData()).isEqualTo(rightPayload); + + // Same coordinates → same result + BlobRef anotherLeft = new BlobRef(new BlobReference("default.source", 7, 12L)); + resolver.resolve(anotherLeft); + 
assertThat(anotherLeft.toData()).isEqualTo(leftPayload); + + // Only one readBuilder should have been created (batch preload) + verify(table, times(1)).newReadBuilder(); + } + } + + private static class ListRowTableRead implements TableRead { + + private final Split split; + private final List rows; + + private ListRowTableRead(Split split, List rows) { + this.split = split; + this.rows = rows; + } + + @Override + public TableRead withMetricRegistry(MetricRegistry registry) { + return this; + } + + @Override + public TableRead executeFilter() { + return this; + } + + @Override + public TableRead withIOManager(IOManager ioManager) { + return this; + } + + @Override + public RecordReader createReader(Split split) { + return new RecordReader() { + + private boolean emitted = false; + + @Nullable + @Override + public RecordIterator readBatch() { + if (emitted) { + return null; + } + emitted = true; + return new RecordIterator() { + + private int next = 0; + + @Nullable + @Override + public InternalRow next() { + return next < rows.size() ? 
rows.get(next++) : null; + } + + @Override + public void releaseBatch() {} + }; + } + + @Override + public void close() throws IOException {} + }; + } + } + + private static class TestSplit implements Split { + + @Override + public long rowCount() { + return 1L; + } + + @Override + public OptionalLong mergedRowCount() { + return OptionalLong.of(1L); + } + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/DataTypeToLogicalType.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/DataTypeToLogicalType.java index 92ae714ca577..21cb45cf0d93 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/DataTypeToLogicalType.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/DataTypeToLogicalType.java @@ -21,6 +21,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -155,6 +156,11 @@ public LogicalType visit(BlobType blobType) { org.apache.flink.table.types.logical.VarBinaryType.MAX_LENGTH); } + @Override + public LogicalType visit(BlobRefType blobRefType) { + return new org.apache.flink.table.types.logical.VarBinaryType(BlobType.DEFAULT_SIZE); + } + @Override public LogicalType visit(ArrayType arrayType) { return new org.apache.flink.table.types.logical.ArrayType( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkCatalog.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkCatalog.java index 5f59063668a5..80d1d14d2d2f 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkCatalog.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkCatalog.java @@ -153,6 +153,7 @@ import 
static org.apache.paimon.catalog.Catalog.SYSTEM_DATABASE_NAME; import static org.apache.paimon.catalog.Catalog.TOTAL_SIZE_PROP; import static org.apache.paimon.flink.FlinkCatalogOptions.DISABLE_CREATE_TABLE_IN_DEFAULT_DB; +import static org.apache.paimon.flink.LogicalTypeConversion.toBlobRefType; import static org.apache.paimon.flink.LogicalTypeConversion.toBlobType; import static org.apache.paimon.flink.LogicalTypeConversion.toDataType; import static org.apache.paimon.flink.LogicalTypeConversion.toLogicalType; @@ -1038,6 +1039,7 @@ public static Schema fromCatalogTable(CatalogBaseTable catalogTable) { Map options = new HashMap<>(catalogTable.getOptions()); List blobFields = CoreOptions.blobField(options); + List blobRefFields = CoreOptions.blobRefField(options); if (!blobFields.isEmpty()) { checkArgument( options.containsKey(CoreOptions.DATA_EVOLUTION_ENABLED.key()), @@ -1047,6 +1049,15 @@ public static Schema fromCatalogTable(CatalogBaseTable catalogTable) { + CoreOptions.DATA_EVOLUTION_ENABLED.key() + "'"); } + if (!blobRefFields.isEmpty()) { + checkArgument( + options.containsKey(CoreOptions.DATA_EVOLUTION_ENABLED.key()), + "When setting '" + + CoreOptions.BLOB_REF_FIELD.key() + + "', you must also set '" + + CoreOptions.DATA_EVOLUTION_ENABLED.key() + + "'"); + } // Serialize virtual columns and watermark to the options // This is what Flink SQL needs, the storage itself does not need them options.putAll(columnOptions(schema)); @@ -1077,9 +1088,13 @@ private static org.apache.paimon.types.DataType resolveDataType( org.apache.flink.table.types.logical.LogicalType logicalType, Map options) { List blobFields = CoreOptions.blobField(options); + List blobRefFields = CoreOptions.blobRefField(options); if (blobFields.contains(fieldName)) { return toBlobType(logicalType); } + if (blobRefFields.contains(fieldName)) { + return toBlobRefType(logicalType); + } Set vectorFields = CoreOptions.vectorField(options); if (vectorFields.contains(fieldName)) { return 
toVectorType(fieldName, logicalType, options); diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowDataWithBlob.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowDataWithBlob.java index dc38588a7ce4..5b70fe6b8f93 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowDataWithBlob.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowDataWithBlob.java @@ -19,6 +19,7 @@ package org.apache.paimon.flink; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.InternalRow; import java.util.Set; @@ -27,18 +28,26 @@ public class FlinkRowDataWithBlob extends FlinkRowData { private final Set blobFields; + private final Set blobRefFields; private final boolean blobAsDescriptor; public FlinkRowDataWithBlob( - InternalRow row, Set blobFields, boolean blobAsDescriptor) { + InternalRow row, + Set blobFields, + Set blobRefFields, + boolean blobAsDescriptor) { super(row); this.blobFields = blobFields; + this.blobRefFields = blobRefFields; this.blobAsDescriptor = blobAsDescriptor; } @Override public byte[] getBinary(int pos) { - if (blobFields.contains(pos)) { + if (blobRefFields.contains(pos)) { + BlobRef blobRef = row.getBlobRef(pos); + return blobAsDescriptor ? blobRef.toDescriptor().serialize() : blobRef.toData(); + } else if (blobFields.contains(pos)) { Blob blob = row.getBlob(pos); return blobAsDescriptor ? 
blob.toDescriptor().serialize() : blob.toData(); } else { diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowWrapper.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowWrapper.java index ad2132e8c1eb..a4afb630e207 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowWrapper.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/FlinkRowWrapper.java @@ -22,7 +22,9 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; -import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; +import org.apache.paimon.data.BlobUtils; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -32,7 +34,6 @@ import org.apache.paimon.data.variant.GenericVariant; import org.apache.paimon.data.variant.Variant; import org.apache.paimon.types.RowKind; -import org.apache.paimon.utils.UriReader; import org.apache.paimon.utils.UriReaderFactory; import org.apache.flink.table.data.DecimalData; @@ -55,7 +56,8 @@ public FlinkRowWrapper(org.apache.flink.table.data.RowData row) { public FlinkRowWrapper(org.apache.flink.table.data.RowData row, CatalogContext catalogContext) { this.row = row; - this.uriReaderFactory = new UriReaderFactory(catalogContext); + this.uriReaderFactory = + catalogContext == null ? 
null : new UriReaderFactory(catalogContext); } @Override @@ -142,15 +144,13 @@ public Variant getVariant(int pos) { @Override public Blob getBlob(int pos) { + return BlobUtils.fromBytes(row.getBinary(pos), uriReaderFactory, null); + } + + @Override + public BlobRef getBlobRef(int pos) { byte[] bytes = row.getBinary(pos); - boolean blobDes = BlobDescriptor.isBlobDescriptor(bytes); - if (blobDes) { - BlobDescriptor blobDescriptor = BlobDescriptor.deserialize(bytes); - UriReader uriReader = uriReaderFactory.create(blobDescriptor.uri()); - return Blob.fromDescriptor(uriReader, blobDescriptor); - } else { - return new BlobData(bytes); - } + return new BlobRef(BlobReference.deserialize(bytes)); } @Override @@ -258,6 +258,11 @@ public Blob getBlob(int pos) { return new BlobData(array.getBinary(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("FlinkArrayWrapper does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { return new FlinkArrayWrapper(array.getArray(pos)); diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/LogicalTypeConversion.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/LogicalTypeConversion.java index 556dbd95ff31..1cd0168c332d 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/LogicalTypeConversion.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/LogicalTypeConversion.java @@ -19,6 +19,7 @@ package org.apache.paimon.flink; import org.apache.paimon.CoreOptions; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.DataType; import org.apache.paimon.types.DataTypes; @@ -53,6 +54,13 @@ public static BlobType toBlobType(LogicalType logicalType) { return new BlobType(); } + public static BlobRefType toBlobRefType(LogicalType logicalType) { + checkArgument( + logicalType instanceof BinaryType 
|| logicalType instanceof VarBinaryType, + "Expected BinaryType or VarBinaryType, but got: " + logicalType); + return new BlobRefType(); + } + public static VectorType toVectorType( String fieldName, org.apache.flink.table.types.logical.LogicalType logicalType, diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/function/BlobReferenceFunction.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/function/BlobReferenceFunction.java new file mode 100644 index 000000000000..d2a54cd3146d --- /dev/null +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/function/BlobReferenceFunction.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.flink.function; + +import org.apache.paimon.data.BlobReference; + +import org.apache.flink.table.functions.ScalarFunction; + +/** + * Flink scalar function that constructs a serialized {@link BlobReference}. + * + *

Usage: {@code sys.blob_reference('db.table', 7, 11)} + */ +public class BlobReferenceFunction extends ScalarFunction { + + public byte[] eval(String tableName, int fieldId, long rowId) { + if (tableName == null) { + return null; + } + return new BlobReference(tableName, fieldId, rowId).serialize(); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/function/BuiltInFunctions.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/function/BuiltInFunctions.java index a6a94faf6141..4b9965117974 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/function/BuiltInFunctions.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/function/BuiltInFunctions.java @@ -29,6 +29,7 @@ public class BuiltInFunctions { { put("path_to_descriptor", PathToDescriptor.class.getName()); put("descriptor_to_string", DescriptorToString.class.getName()); + put("blob_reference", BlobReferenceFunction.class.getName()); } }; } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupCompactDiffRead.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupCompactDiffRead.java index e4870de58336..76d83393b3d4 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupCompactDiffRead.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupCompactDiffRead.java @@ -18,6 +18,7 @@ package org.apache.paimon.flink.lookup; +import org.apache.paimon.catalog.CatalogContext; import org.apache.paimon.data.InternalRow; import org.apache.paimon.disk.IOManager; import org.apache.paimon.operation.MergeFileSplitRead; @@ -41,8 +42,9 @@ public class LookupCompactDiffRead extends AbstractDataTableRead { private final SplitRead fullPhaseMergeRead; private final SplitRead incrementalDiffRead; - public LookupCompactDiffRead(MergeFileSplitRead mergeRead, 
TableSchema schema) { - super(schema); + public LookupCompactDiffRead( + MergeFileSplitRead mergeRead, TableSchema schema, CatalogContext catalogContext) { + super(schema, catalogContext); this.incrementalDiffRead = new IncrementalCompactDiffSplitRead(mergeRead); this.fullPhaseMergeRead = SplitRead.convert( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupFileStoreTable.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupFileStoreTable.java index 353c99d2b1f1..4e355db1e8f2 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupFileStoreTable.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/lookup/LookupFileStoreTable.java @@ -64,7 +64,9 @@ public InnerTableRead newRead() { return wrapped.newRead(); case COMPACT_DELTA_MONITOR: return new LookupCompactDiffRead( - ((KeyValueFileStore) wrapped.store()).newRead(), wrapped.schema()); + ((KeyValueFileStore) wrapped.store()).newRead(), + wrapped.schema(), + wrapped.catalogEnvironment().catalogContext()); default: throw new UnsupportedOperationException( "Unknown lookup stream scan mode: " + lookupScanMode.name()); diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/source/FileStoreSourceSplitReader.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/source/FileStoreSourceSplitReader.java index b49b9adb9476..0938184a04ec 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/source/FileStoreSourceSplitReader.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/source/FileStoreSourceSplitReader.java @@ -269,15 +269,22 @@ private class FileStoreRecordIterator implements BulkFormat.RecordIterator recordAndPosition = new MutableRecordAndPosition<>(); private final Set blobFields; + private final Set blobRefFields; private FileStoreRecordIterator(@Nullable 
RowType rowType) { - this.blobFields = rowType == null ? Collections.emptySet() : blobFieldIndex(rowType); + if (rowType == null) { + this.blobFields = Collections.emptySet(); + this.blobRefFields = Collections.emptySet(); + } else { + this.blobFields = fieldIndexByType(rowType, DataTypeRoot.BLOB); + this.blobRefFields = fieldIndexByType(rowType, DataTypeRoot.BLOB_REF); + } } - private Set blobFieldIndex(RowType rowType) { + private Set fieldIndexByType(RowType rowType, DataTypeRoot typeRoot) { Set result = new HashSet<>(); for (int i = 0; i < rowType.getFieldCount(); i++) { - if (rowType.getTypeAt(i).getTypeRoot() == DataTypeRoot.BLOB) { + if (rowType.getTypeAt(i).getTypeRoot() == typeRoot) { result.add(i); } } @@ -307,9 +314,10 @@ public RecordAndPosition next() { } recordAndPosition.setNext( - blobFields.isEmpty() + blobFields.isEmpty() && blobRefFields.isEmpty() ? new FlinkRowData(row) - : new FlinkRowDataWithBlob(row, blobFields, blobAsDescriptor)); + : new FlinkRowDataWithBlob( + row, blobFields, blobRefFields, blobAsDescriptor)); currentNumRead++; if (limiter != null) { limiter.increment(); diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BlobTableITCase.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BlobTableITCase.java index e4900207095f..ac282f48bfed 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BlobTableITCase.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/BlobTableITCase.java @@ -374,6 +374,54 @@ public void testBlobTypeSchemaEquals() throws Exception { assertThat(AbstractFlinkTableFactory.schemaEquals(convertedRowType, flinkRowType)).isTrue(); } + @Test + public void testWriteBlobRefWithBuiltInFunction() throws Exception { + // 1. 
Create upstream blob table and write data + tEnv.executeSql( + "CREATE TABLE upstream_blob (id INT, name STRING, picture BYTES)" + + " WITH ('row-tracking.enabled'='true'," + + " 'data-evolution.enabled'='true'," + + " 'blob-field'='picture')"); + batchSql("INSERT INTO upstream_blob VALUES (1, 'row1', X'48656C6C6F')"); + batchSql("INSERT INTO upstream_blob VALUES (2, 'row2', X'5945')"); + + // 2. Get fieldId for the "picture" column from Paimon schema + FileStoreTable upstreamTable = paimonTable("upstream_blob"); + int pictureFieldId = + upstreamTable.rowType().getFields().stream() + .filter(f -> f.name().equals("picture")) + .findFirst() + .orElseThrow(() -> new RuntimeException("picture field not found")) + .id(); + + // 3. Create downstream blob_ref table + String fullTableName = tEnv.getCurrentDatabase() + ".upstream_blob"; + tEnv.executeSql( + "CREATE TABLE downstream_ref (id INT, label STRING, image_ref BYTES)" + + " WITH ('row-tracking.enabled'='true'," + + " 'data-evolution.enabled'='true'," + + " 'blob-ref-field'='image_ref')"); + + // 4. Insert by reading _ROW_ID from $row_tracking directly in the INSERT + batchSql( + String.format( + "INSERT INTO downstream_ref" + + " SELECT id, name, sys.blob_reference('%s', %d, _ROW_ID)" + + " FROM `upstream_blob$row_tracking`", + fullTableName, pictureFieldId)); + + // 5. 
Read back — blob references should resolve to upstream blob data + List result = batchSql("SELECT * FROM downstream_ref ORDER BY id"); + assertThat(result).hasSize(2); + assertThat(result.get(0).getField(0)).isEqualTo(1); + assertThat(result.get(0).getField(1)).isEqualTo("row1"); + assertThat((byte[]) result.get(0).getField(2)) + .isEqualTo(new byte[] {72, 101, 108, 108, 111}); + assertThat(result.get(1).getField(0)).isEqualTo(2); + assertThat(result.get(1).getField(1)).isEqualTo("row2"); + assertThat((byte[]) result.get(1).getField(2)).isEqualTo(new byte[] {89, 69}); + } + private static final char[] HEX_ARRAY = "0123456789ABCDEF".toCharArray(); public static String bytesToHex(byte[] bytes) { diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/source/TestChangelogDataReadWrite.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/source/TestChangelogDataReadWrite.java index 8114ac17eb38..f110c7e31488 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/source/TestChangelogDataReadWrite.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/source/TestChangelogDataReadWrite.java @@ -152,7 +152,7 @@ public KeyValueTableRead createReadWithKey() { FileFormatDiscover.of(options), pathFactory, options); - return new KeyValueTableRead(() -> read, () -> rawFileRead, null); + return new KeyValueTableRead(() -> read, () -> rawFileRead, schema, null, null); } public List writeFiles( diff --git a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSchemaConverter.java b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSchemaConverter.java index eb55a86c5b66..251a973b3e0c 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSchemaConverter.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/avro/AvroSchemaConverter.java @@ -106,6 +106,7 @@ public static Schema convertToSchema( case BINARY: case VARBINARY: 
case BLOB: + case BLOB_REF: Schema binary = SchemaBuilder.builder().bytesType(); return nullable ? nullableSchema(binary) : binary; case TIMESTAMP_WITHOUT_TIME_ZONE: diff --git a/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldReaderFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldReaderFactory.java index 9aa663df8946..0f63e7c7339c 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldReaderFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldReaderFactory.java @@ -22,6 +22,8 @@ import org.apache.paimon.data.BinaryVector; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.GenericArray; import org.apache.paimon.data.GenericMap; @@ -69,6 +71,8 @@ public FieldReaderFactory(@Nullable UriReader uriReader) { private static final FieldReader BYTES_READER = new BytesReader(); + private static final FieldReader BLOB_REFERENCE_READER = new BlobReferenceBytesReader(); + private static final FieldReader BOOLEAN_READER = new BooleanReader(); private static final FieldReader TINYINT_READER = new TinyIntReader(); @@ -89,10 +93,13 @@ public FieldReaderFactory(@Nullable UriReader uriReader) { @Override public FieldReader primitive(Schema primitive, DataType type) { - if (primitive.getType() == Schema.Type.BYTES - && type != null - && type.getTypeRoot() == DataTypeRoot.BLOB) { - return new BlobDescriptorBytesReader(uriReader); + if (primitive.getType() == Schema.Type.BYTES && type != null) { + if (type.getTypeRoot() == DataTypeRoot.BLOB) { + return new BlobDescriptorBytesReader(uriReader); + } + if (type.getTypeRoot() == DataTypeRoot.BLOB_REF) { + return BLOB_REFERENCE_READER; + } } return AvroSchemaVisitor.super.primitive(primitive, type); } @@ -285,6 +292,20 @@ public void skip(Decoder decoder) throws 
IOException { } } + private static class BlobReferenceBytesReader implements FieldReader { + + @Override + public Object read(Decoder decoder, Object reuse) throws IOException { + byte[] bytes = decoder.readBytes(null).array(); + return new BlobRef(BlobReference.deserialize(bytes)); + } + + @Override + public void skip(Decoder decoder) throws IOException { + decoder.skipBytes(); + } + } + private static class BooleanReader implements FieldReader { @Override diff --git a/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldWriterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldWriterFactory.java index 6eb81cb7f5d1..07856e6ca421 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldWriterFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/avro/FieldWriterFactory.java @@ -21,6 +21,7 @@ import org.apache.paimon.CoreOptions; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.DataGetters; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.GenericRow; @@ -93,12 +94,24 @@ public class FieldWriterFactory implements AvroSchemaVisitor { } }; + private static final FieldWriter BLOB_REFERENCE_BYTES_WRITER = + (container, i, encoder) -> { + BlobRef blobRef = container.getBlobRef(i); + if (blobRef == null) { + throw new IllegalArgumentException("Null blob_ref is not allowed."); + } + encoder.writeBytes(blobRef.reference().serialize()); + }; + @Override public FieldWriter primitive(Schema primitive, DataType type) { - if (primitive.getType() == Schema.Type.BYTES - && type != null - && type.getTypeRoot() == DataTypeRoot.BLOB) { - return BLOB_DESCRIPTOR_BYTES_WRITER; + if (primitive.getType() == Schema.Type.BYTES && type != null) { + if (type.getTypeRoot() == DataTypeRoot.BLOB) { + return BLOB_DESCRIPTOR_BYTES_WRITER; + } + if (type.getTypeRoot() == DataTypeRoot.BLOB_REF) { + return 
BLOB_REFERENCE_BYTES_WRITER; + } } return AvroSchemaVisitor.super.primitive(primitive, type); } diff --git a/paimon-format/src/main/java/org/apache/paimon/format/blob/BlobFileMeta.java b/paimon-format/src/main/java/org/apache/paimon/format/blob/BlobFileMeta.java index 02579b959793..27a1ed56170a 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/blob/BlobFileMeta.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/blob/BlobFileMeta.java @@ -98,7 +98,7 @@ public long blobOffset(int i) { } public int returnedPosition(int i) { - return returnedPositions == null ? i : returnedPositions[i - 1]; + return returnedPositions == null ? i - 1 : returnedPositions[i - 1]; } public int recordNumber() { diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java index 4b80827d1bb3..4fe8b5999b3f 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/OrcTypeUtil.java @@ -68,6 +68,7 @@ static TypeDescription convertToOrcType(DataType type, int fieldId, int depth) { return TypeDescription.createBoolean() .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); case BLOB: + case BLOB_REF: return TypeDescription.createBinary() .setAttribute(PAIMON_ORC_FIELD_ID_KEY, String.valueOf(fieldId)); case VARBINARY: diff --git a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/FieldWriterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/FieldWriterFactory.java index 443c2410cbd2..c77f1339c5c3 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/FieldWriterFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/orc/writer/FieldWriterFactory.java @@ -28,6 +28,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import 
org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -264,6 +265,16 @@ public FieldWriter visit(BlobType blobType) { }; } + @Override + public FieldWriter visit(BlobRefType blobRefType) { + return (rowId, column, getters, columnId) -> { + BytesColumnVector vector = (BytesColumnVector) column; + byte[] bytes = getters.getBlobRef(columnId).reference().serialize(); + vector.setVal(rowId, bytes, 0, bytes.length); + return bytes.length; + }; + } + @Override public FieldWriter visit(DecimalType decimalType) { return (rowId, column, getters, columnId) -> { diff --git a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSchemaConverter.java b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSchemaConverter.java index 37a69fe9aebd..102aa0b2b709 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSchemaConverter.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/parquet/ParquetSchemaConverter.java @@ -91,6 +91,7 @@ public static Type convertToParquetType(String name, DataType type, int fieldId, case BINARY: case VARBINARY: case BLOB: + case BLOB_REF: return Types.primitive(PrimitiveType.PrimitiveTypeName.BINARY, repetition) .named(name) .withId(fieldId); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetReaderUtil.java b/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetReaderUtil.java index a2741f869ab6..a285491219a6 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetReaderUtil.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetReaderUtil.java @@ -101,6 +101,7 @@ public static WritableColumnVector createWritableColumnVector( case VARCHAR: case VARBINARY: case BLOB: + case BLOB_REF: return new 
HeapBytesVector(batchSize); case BINARY: return new HeapBytesVector(batchSize); @@ -178,7 +179,8 @@ public static ColumnVector createReadableColumnVector( case TIMESTAMP_WITH_LOCAL_TIME_ZONE: return new ParquetTimestampVector(writableVector); case BLOB: - // Physical representation is bytes; higher-level Row#getBlob() handles descriptor. + case BLOB_REF: + // Physical representation is bytes; higher-level Row#getBlob() materializes them. return writableVector; case ARRAY: return new CastedArrayColumnVector( diff --git a/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetVectorUpdaterFactory.java b/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetVectorUpdaterFactory.java index 0abf78fd2747..2f2582b401e6 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetVectorUpdaterFactory.java +++ b/paimon-format/src/main/java/org/apache/paimon/format/parquet/reader/ParquetVectorUpdaterFactory.java @@ -36,6 +36,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -230,6 +231,11 @@ public UpdaterFactory visit(BlobType blobType) { }; } + @Override + public UpdaterFactory visit(BlobRefType blobRefType) { + return visit(new BlobType(blobRefType.isNullable())); + } + @Override public UpdaterFactory visit(ArrayType arrayType) { throw new RuntimeException("Array type is not supported"); diff --git a/paimon-format/src/main/java/org/apache/paimon/format/parquet/writer/ParquetRowDataWriter.java b/paimon-format/src/main/java/org/apache/paimon/format/parquet/writer/ParquetRowDataWriter.java index a7241147e68a..43c6be2e2df5 100644 --- a/paimon-format/src/main/java/org/apache/paimon/format/parquet/writer/ParquetRowDataWriter.java +++ 
b/paimon-format/src/main/java/org/apache/paimon/format/parquet/writer/ParquetRowDataWriter.java @@ -109,6 +109,8 @@ private FieldWriter createWriter(DataType t, Type type) { return new BinaryWriter(); case BLOB: return new BlobDescriptorWriter(); + case BLOB_REF: + return new BlobReferenceWriter(); case DECIMAL: DecimalType decimalType = (DecimalType) t; return createDecimalWriter(decimalType.getPrecision(), decimalType.getScale()); @@ -344,6 +346,21 @@ private void writeBlob(Blob blob) { } } + /** Writes BLOB_REF as serialized {@link org.apache.paimon.data.BlobReference} bytes. */ + private class BlobReferenceWriter implements FieldWriter { + + @Override + public void write(InternalRow row, int ordinal) { + byte[] bytes = row.getBlobRef(ordinal).reference().serialize(); + recordConsumer.addBinary(Binary.fromReusedByteArray(bytes)); + } + + @Override + public void write(InternalArray arrayData, int ordinal) { + throw new UnsupportedOperationException("BLOB_REF in array is not supported."); + } + } + private class IntWriter implements FieldWriter { @Override diff --git a/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java b/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java index dacd12f492c1..70c865f0b5f9 100644 --- a/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java +++ b/paimon-format/src/main/java/org/apache/parquet/filter2/predicate/ParquetFilters.java @@ -29,6 +29,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -414,6 +415,11 @@ public Operators.Column visit(BlobType blobType) { throw new UnsupportedOperationException(); } + @Override + public Operators.Column visit(BlobRefType blobRefType) { + throw new 
UnsupportedOperationException(); + } + // ===================== can not support ========================= @Override diff --git a/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java b/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java index cb3d7de27da5..ecfb2240d358 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/avro/AvroFileFormatTest.java @@ -18,6 +18,9 @@ package org.apache.paimon.format.avro; +import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; import org.apache.paimon.data.GenericRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.format.FileFormat; @@ -76,6 +79,7 @@ public void testSupportedDataTypes() { dataFields.add(new DataField(index++, "varchar_type", DataTypes.VARCHAR(20))); dataFields.add(new DataField(index++, "binary_type", DataTypes.BINARY(20))); dataFields.add(new DataField(index++, "varbinary_type", DataTypes.VARBINARY(20))); + dataFields.add(new DataField(index++, "blob_ref_type", DataTypes.BLOB_REF())); dataFields.add(new DataField(index++, "timestamp_type", DataTypes.TIMESTAMP(3))); dataFields.add(new DataField(index++, "date_type", DataTypes.DATE())); dataFields.add(new DataField(index++, "decimal_type", DataTypes.DECIMAL(10, 3))); @@ -210,4 +214,29 @@ void testCompression() throws IOException { .hasMessageContaining("Unrecognized codec: unsupported"); } } + + @Test + void testBlobRefRoundTrip() throws IOException { + RowType rowType = DataTypes.ROW(DataTypes.FIELD(0, "blob_ref", DataTypes.BLOB_REF())); + BlobReference reference = new BlobReference("default.t", 7, 11L); + BlobRef blob = Blob.fromReference(reference); + + FileFormat format = new AvroFileFormat(new FormatContext(new Options(), 1024, 1024)); + LocalFileIO fileIO = LocalFileIO.create(); + Path file = new Path(new 
Path(tempPath.toUri()), UUID.randomUUID().toString()); + + try (PositionOutputStream out = fileIO.newOutputStream(file, false)) { + FormatWriter writer = format.createWriterFactory(rowType).create(out, "zstd"); + writer.addElement(GenericRow.of(blob)); + writer.close(); + } + + try (RecordReader reader = + format.createReaderFactory(rowType, rowType, new ArrayList<>()) + .createReader( + new FormatReaderContext(fileIO, file, fileIO.getFileSize(file)))) { + InternalRow row = reader.readBatch().next(); + assertThat(row.getBlobRef(0).reference()).isEqualTo(reference); + } + } } diff --git a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java index 5669ac33d443..5c36e14cfd1a 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/orc/OrcTypeUtilTest.java @@ -60,6 +60,7 @@ void testDataTypeToOrcType() { test("varchar(123)", DataTypes.VARCHAR(123)); test("string", DataTypes.STRING()); test("binary", DataTypes.BYTES()); + test("binary", DataTypes.BLOB_REF()); test("tinyint", DataTypes.TINYINT()); test("smallint", DataTypes.SMALLINT()); test("int", DataTypes.INT()); diff --git a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetSchemaConverterTest.java b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetSchemaConverterTest.java index bfbdaed7c4a3..a312d8867b53 100644 --- a/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetSchemaConverterTest.java +++ b/paimon-format/src/test/java/org/apache/paimon/format/parquet/ParquetSchemaConverterTest.java @@ -25,6 +25,7 @@ import org.apache.paimon.types.RowType; import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.PrimitiveType; import org.junit.jupiter.api.Test; import java.util.Arrays; @@ -106,4 +107,15 @@ public void testPaimonParquetSchemaConvert() { 
RowType rowType = convertToPaimonRowType(messageType); assertThat(ALL_TYPES).isEqualTo(rowType); } + + @Test + public void testBlobRefSchemaConvertToBinary() { + MessageType messageType = + convertToParquetMessageType( + new RowType( + Arrays.asList(new DataField(0, "blob_ref", DataTypes.BLOB_REF())))); + + assertThat(messageType.getType("blob_ref").asPrimitiveType().getPrimitiveTypeName()) + .isEqualTo(PrimitiveType.PrimitiveTypeName.BINARY); + } } diff --git a/paimon-hive/paimon-hive-common/src/main/java/org/apache/paimon/hive/HiveTypeUtils.java b/paimon-hive/paimon-hive-common/src/main/java/org/apache/paimon/hive/HiveTypeUtils.java index e4799341d1dc..bcb48dffb485 100644 --- a/paimon-hive/paimon-hive-common/src/main/java/org/apache/paimon/hive/HiveTypeUtils.java +++ b/paimon-hive/paimon-hive-common/src/main/java/org/apache/paimon/hive/HiveTypeUtils.java @@ -22,6 +22,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -235,6 +236,11 @@ public TypeInfo visit(BlobType blobType) { return TypeInfoFactory.binaryTypeInfo; } + @Override + public TypeInfo visit(BlobRefType blobRefType) { + return TypeInfoFactory.binaryTypeInfo; + } + @Override protected TypeInfo defaultMethod(org.apache.paimon.types.DataType dataType) { throw new UnsupportedOperationException("Unsupported type: " + dataType); diff --git a/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/objectinspector/HivePaimonArray.java b/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/objectinspector/HivePaimonArray.java index 02d14d360625..4c73abe49488 100644 --- a/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/objectinspector/HivePaimonArray.java +++ 
b/paimon-hive/paimon-hive-connector-common/src/main/java/org/apache/paimon/hive/objectinspector/HivePaimonArray.java @@ -20,6 +20,7 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; +import org.apache.paimon.data.BlobRef; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -127,6 +128,11 @@ public Blob getBlob(int pos) { return getAs(pos); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("HivePaimonArray does not support BlobRef."); + } + @Override public InternalArray getArray(int i) { return new HivePaimonArray( diff --git a/paimon-lance/src/main/java/org/apache/paimon/format/lance/LanceFileFormat.java b/paimon-lance/src/main/java/org/apache/paimon/format/lance/LanceFileFormat.java index 64b4e2887f82..9421de6b60a3 100644 --- a/paimon-lance/src/main/java/org/apache/paimon/format/lance/LanceFileFormat.java +++ b/paimon-lance/src/main/java/org/apache/paimon/format/lance/LanceFileFormat.java @@ -27,6 +27,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -188,6 +189,11 @@ public Void visit(BlobType blobType) { return null; } + @Override + public Void visit(BlobRefType blobRefType) { + return null; + } + @Override public Void visit(ArrayType arrayType) { return null; diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkCatalog.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkCatalog.java index 6ef853eda870..7a07f4fb5ef8 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkCatalog.java +++ 
b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkCatalog.java @@ -41,6 +41,7 @@ import org.apache.paimon.table.iceberg.IcebergTable; import org.apache.paimon.table.lance.LanceTable; import org.apache.paimon.table.object.ObjectTable; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.DataField; import org.apache.paimon.types.DataType; @@ -457,6 +458,7 @@ private Schema toInitialSchema( StructType schema, Transform[] partitions, Map properties) { Map normalizedProperties = new HashMap<>(properties); List blobFields = CoreOptions.blobField(properties); + List blobRefFields = CoreOptions.blobRefField(properties); String provider = properties.get(TableCatalog.PROP_PROVIDER); if (!usePaimon(provider)) { if (isFormatTable(provider)) { @@ -495,6 +497,11 @@ private Schema toInitialSchema( field.dataType() instanceof org.apache.spark.sql.types.BinaryType, "The type of blob field must be binary"); type = new BlobType(); + } else if (blobRefFields.contains(name)) { + checkArgument( + field.dataType() instanceof org.apache.spark.sql.types.BinaryType, + "The type of blob ref field must be binary"); + type = new BlobRefType(); } else { type = toPaimonType(field.dataType()).copy(field.nullable()); } diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkInternalRowWrapper.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkInternalRowWrapper.java index ffd077741c9f..8e0602d19828 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkInternalRowWrapper.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkInternalRowWrapper.java @@ -22,7 +22,9 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; -import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import 
org.apache.paimon.data.BlobReference; +import org.apache.paimon.data.BlobUtils; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -32,7 +34,6 @@ import org.apache.paimon.data.variant.Variant; import org.apache.paimon.spark.util.shim.TypeUtils$; import org.apache.paimon.types.RowKind; -import org.apache.paimon.utils.UriReader; import org.apache.paimon.utils.UriReaderFactory; import org.apache.spark.sql.catalyst.util.ArrayData; @@ -246,15 +247,17 @@ public Blob getBlob(int pos) { if (actualPos == -1 || internalRow.isNullAt(actualPos)) { return null; } - byte[] bytes = internalRow.getBinary(actualPos); - boolean blobDes = BlobDescriptor.isBlobDescriptor(bytes); - if (blobDes) { - BlobDescriptor blobDescriptor = BlobDescriptor.deserialize(bytes); - UriReader uriReader = uriReaderFactory.create(blobDescriptor.uri()); - return Blob.fromDescriptor(uriReader, blobDescriptor); - } else { - return new BlobData(bytes); + return BlobUtils.fromBytes(internalRow.getBinary(actualPos), uriReaderFactory, null); + } + + @Override + public BlobRef getBlobRef(int pos) { + int actualPos = getActualFieldPosition(pos); + if (actualPos == -1 || internalRow.isNullAt(actualPos)) { + return null; } + byte[] bytes = internalRow.getBinary(actualPos); + return new BlobRef(BlobReference.deserialize(bytes)); } @Override @@ -438,6 +441,11 @@ public Blob getBlob(int pos) { return new BlobData(arrayData.getBinary(pos)); } + @Override + public BlobRef getBlobRef(int pos) { + throw new UnsupportedOperationException("SparkInternalArray does not support BlobRef."); + } + @Override public InternalArray getArray(int pos) { return new SparkInternalArray( diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkRow.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkRow.java index 36b5624ff52f..92fcdf820a52 100644 --- 
a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkRow.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/SparkRow.java @@ -22,7 +22,9 @@ import org.apache.paimon.data.BinaryString; import org.apache.paimon.data.Blob; import org.apache.paimon.data.BlobData; -import org.apache.paimon.data.BlobDescriptor; +import org.apache.paimon.data.BlobRef; +import org.apache.paimon.data.BlobReference; +import org.apache.paimon.data.BlobUtils; import org.apache.paimon.data.Decimal; import org.apache.paimon.data.InternalArray; import org.apache.paimon.data.InternalMap; @@ -38,7 +40,6 @@ import org.apache.paimon.types.RowKind; import org.apache.paimon.types.RowType; import org.apache.paimon.utils.DateTimeUtils; -import org.apache.paimon.utils.UriReader; import org.apache.paimon.utils.UriReaderFactory; import org.apache.spark.sql.Row; @@ -72,7 +73,8 @@ public SparkRow(RowType type, Row row, RowKind rowkind, CatalogContext catalogCo this.type = type; this.row = row; this.rowKind = rowkind; - this.uriReaderFactory = new UriReaderFactory(catalogContext); + this.uriReaderFactory = + catalogContext == null ? 
null : new UriReaderFactory(catalogContext); } @Override @@ -161,15 +163,13 @@ public Variant getVariant(int i) { @Override public Blob getBlob(int i) { + return BlobUtils.fromBytes(row.getAs(i), uriReaderFactory, null); + } + + @Override + public BlobRef getBlobRef(int i) { byte[] bytes = row.getAs(i); - boolean blobDes = BlobDescriptor.isBlobDescriptor(bytes); - if (blobDes) { - BlobDescriptor blobDescriptor = BlobDescriptor.deserialize(bytes); - UriReader uriReader = uriReaderFactory.create(blobDescriptor.uri()); - return Blob.fromDescriptor(uriReader, blobDescriptor); - } else { - return new BlobData(bytes); - } + return new BlobRef(BlobReference.deserialize(bytes)); } @Override @@ -344,6 +344,11 @@ public Blob getBlob(int i) { return new BlobData(getAs(i)); } + @Override + public BlobRef getBlobRef(int i) { + throw new UnsupportedOperationException("PaimonArray does not support BlobRef."); + } + @Override public InternalArray getArray(int i) { Object o = getAs(i); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/function/BlobReferenceSparkFunction.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/function/BlobReferenceSparkFunction.java new file mode 100644 index 000000000000..ea9ebe131edb --- /dev/null +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/function/BlobReferenceSparkFunction.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.function; + +import org.apache.paimon.data.BlobReference; + +import org.apache.spark.sql.connector.catalog.functions.ScalarFunction; +import org.apache.spark.sql.types.DataType; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.unsafe.types.UTF8String; + +import java.io.Serializable; + +/** Spark scalar function that constructs a serialized {@link BlobReference}. */ +public class BlobReferenceSparkFunction implements ScalarFunction, Serializable { + + @Override + public DataType[] inputTypes() { + return new DataType[] {DataTypes.StringType, DataTypes.IntegerType, DataTypes.LongType}; + } + + @Override + public DataType resultType() { + return DataTypes.BinaryType; + } + + public byte[] invoke(UTF8String tableName, int fieldId, long rowId) { + if (tableName == null) { + return null; + } + return new BlobReference(tableName.toString(), fieldId, rowId).serialize(); + } + + @Override + public String name() { + return "blob_reference"; + } +} diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/function/BlobReferenceUnbound.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/function/BlobReferenceUnbound.java new file mode 100644 index 000000000000..a130783be902 --- /dev/null +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/function/BlobReferenceUnbound.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.spark.function; + +import org.apache.spark.sql.connector.catalog.functions.BoundFunction; +import org.apache.spark.sql.connector.catalog.functions.UnboundFunction; +import org.apache.spark.sql.types.IntegerType; +import org.apache.spark.sql.types.LongType; +import org.apache.spark.sql.types.StringType; +import org.apache.spark.sql.types.StructType; + +/** Function unbound to {@link BlobReferenceSparkFunction}. 
*/ +public class BlobReferenceUnbound implements UnboundFunction { + + @Override + public BoundFunction bind(StructType inputType) { + if (inputType.fields().length != 3) { + throw new UnsupportedOperationException( + "Function 'blob_reference' requires 3 arguments (tableName STRING, fieldId INT, rowId BIGINT), but found " + + inputType.fields().length); + } + if (!(inputType.fields()[0].dataType() instanceof StringType)) { + throw new UnsupportedOperationException( + "The first argument of 'blob_reference' must be STRING type."); + } + if (!(inputType.fields()[1].dataType() instanceof IntegerType)) { + throw new UnsupportedOperationException( + "The second argument of 'blob_reference' must be INT type."); + } + if (!(inputType.fields()[2].dataType() instanceof LongType)) { + throw new UnsupportedOperationException( + "The third argument of 'blob_reference' must be BIGINT type."); + } + return new BlobReferenceSparkFunction(); + } + + @Override + public String description() { + return "Construct a serialized BlobReference from tableName, fieldId and rowId"; + } + + @Override + public String name() { + return "blob_reference"; + } +} diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/SparkTypeUtils.java b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/SparkTypeUtils.java index dc2f8b30acab..823534deea7c 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/SparkTypeUtils.java +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/SparkTypeUtils.java @@ -24,6 +24,7 @@ import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; import org.apache.paimon.types.BlobType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; import org.apache.paimon.types.DataField; @@ -167,6 +168,11 @@ public DataType visit(BlobType blobType) { return DataTypes.BinaryType; } + 
@Override + public DataType visit(BlobRefType blobRefType) { + return DataTypes.BinaryType; + } + @Override public DataType visit(VarBinaryType varBinaryType) { return DataTypes.BinaryType; diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalog/functions/PaimonFunctions.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalog/functions/PaimonFunctions.scala index dd039de6cc2d..3f3380fee78e 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalog/functions/PaimonFunctions.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/catalog/functions/PaimonFunctions.scala @@ -25,7 +25,7 @@ import org.apache.paimon.shade.guava30.com.google.common.collect.{ImmutableMap, import org.apache.paimon.spark.SparkInternalRowWrapper import org.apache.paimon.spark.SparkTypeUtils.toPaimonRowType import org.apache.paimon.spark.catalog.functions.PaimonFunctions._ -import org.apache.paimon.spark.function.{DescriptorToStringUnbound, PathToDescriptorUnbound} +import org.apache.paimon.spark.function.{BlobReferenceUnbound, DescriptorToStringUnbound, PathToDescriptorUnbound} import org.apache.paimon.table.{BucketMode, FileStoreTable} import org.apache.paimon.types.{ArrayType, DataType => PaimonDataType, LocalZonedTimestampType, MapType, RowType, TimestampType} import org.apache.paimon.utils.ProjectedRow @@ -47,6 +47,7 @@ object PaimonFunctions { val MAX_PT: String = "max_pt" val PATH_TO_DESCRIPTOR: String = "path_to_descriptor" val DESCRIPTOR_TO_STRING: String = "descriptor_to_string" + val BLOB_REFERENCE: String = "blob_reference" private val FUNCTIONS = ImmutableMap .builder[String, UnboundFunction]() @@ -56,6 +57,7 @@ object PaimonFunctions { .put(MAX_PT, new MaxPtFunction) .put(PATH_TO_DESCRIPTOR, new PathToDescriptorUnbound) .put(DESCRIPTOR_TO_STRING, new DescriptorToStringUnbound) + .put(BLOB_REFERENCE, new BlobReferenceUnbound) .build() /** The bucket 
function type to the function name mapping */ diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/data/SparkInternalRow.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/data/SparkInternalRow.scala index ae504b24120f..4c34814df4f2 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/data/SparkInternalRow.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/data/SparkInternalRow.scala @@ -48,7 +48,10 @@ object SparkInternalRow { var i: Int = 0 val blobFields = new mutable.HashSet[Int]() while (i < rowType.getFieldCount) { - if (rowType.getTypeAt(i).getTypeRoot.equals(DataTypeRoot.BLOB)) { + if ( + rowType.getTypeAt(i).getTypeRoot.equals(DataTypeRoot.BLOB) || + rowType.getTypeAt(i).getTypeRoot.equals(DataTypeRoot.BLOB_REF) + ) { blobFields.add(i) } i += 1 diff --git a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/BlobTestBase.scala b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/BlobTestBase.scala index 86e0a0dce2f9..d96684330e03 100644 --- a/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/BlobTestBase.scala +++ b/paimon-spark/paimon-spark-ut/src/test/scala/org/apache/paimon/spark/sql/BlobTestBase.scala @@ -32,6 +32,8 @@ import org.apache.spark.sql.Row import java.util import java.util.Random +import scala.collection.JavaConverters._ + class BlobTestBase extends PaimonSparkTestBase { private val RANDOM = new Random @@ -314,6 +316,45 @@ class BlobTestBase extends PaimonSparkTestBase { } } + test("BlobRef: test write and read blob reference") { + withTable("upstream", "downstream") { + // 1. 
Create upstream blob table and write data + sql("CREATE TABLE upstream (id INT, name STRING, picture BINARY) TBLPROPERTIES " + + "('row-tracking.enabled'='true', 'data-evolution.enabled'='true', 'blob-field'='picture')") + sql("INSERT INTO upstream VALUES (1, 'row1', X'48656C6C6F')") + sql("INSERT INTO upstream VALUES (2, 'row2', X'5945')") + + // 2. Get fieldId for the "picture" column + val upstreamTable = loadTable("upstream") + val pictureFieldId = upstreamTable + .rowType() + .getFields + .asScala + .find(_.name() == "picture") + .map(_.id()) + .getOrElse(throw new RuntimeException("picture field not found")) + + // 3. Create downstream blob_ref table + val fullTableName = s"$dbName0.upstream" + sql("CREATE TABLE downstream (id INT, label STRING, image_ref BINARY) TBLPROPERTIES " + + "('row-tracking.enabled'='true', 'data-evolution.enabled'='true', 'blob-ref-field'='image_ref')") + + // 4. Insert by reading _ROW_ID from $row_tracking directly + sql( + s"INSERT INTO downstream " + + s"SELECT id, name, sys.blob_reference('$fullTableName', $pictureFieldId, _ROW_ID) " + + s"FROM `upstream`") + + // 5. 
Read back — blob references should resolve to upstream blob data + checkAnswer( + sql("SELECT * FROM downstream ORDER BY id"), + Seq( + Row(1, "row1", Array[Byte](72, 101, 108, 108, 111)), + Row(2, "row2", Array[Byte](89, 69))) + ) + } + } + private val HEX_ARRAY = "0123456789ABCDEF".toCharArray def bytesToHex(bytes: Array[Byte]): String = { diff --git a/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/paimon/spark/data/Spark3InternalRowWithBlob.scala b/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/paimon/spark/data/Spark3InternalRowWithBlob.scala index 6c1dbd9d2120..3a43ffefb737 100644 --- a/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/paimon/spark/data/Spark3InternalRowWithBlob.scala +++ b/paimon-spark/paimon-spark3-common/src/main/scala/org/apache/paimon/spark/data/Spark3InternalRowWithBlob.scala @@ -18,7 +18,7 @@ package org.apache.paimon.spark.data -import org.apache.paimon.types.RowType +import org.apache.paimon.types.{DataTypeRoot, RowType} import org.apache.paimon.utils.InternalRowUtils.copyInternalRow import org.apache.spark.sql.catalyst.InternalRow @@ -26,13 +26,17 @@ import org.apache.spark.sql.catalyst.InternalRow class Spark3InternalRowWithBlob(rowType: RowType, blobFields: Set[Int], blobAsDescriptor: Boolean) extends Spark3InternalRow(rowType) { + private val blobRefFields: Set[Int] = + blobFields.filter(i => rowType.getTypeAt(i).getTypeRoot.equals(DataTypeRoot.BLOB_REF)) + private val pureBlobFields: Set[Int] = blobFields -- blobRefFields + override def getBinary(ordinal: Int): Array[Byte] = { - if (blobFields.contains(ordinal)) { - if (blobAsDescriptor) { - row.getBlob(ordinal).toDescriptor.serialize() - } else { - row.getBlob(ordinal).toData - } + if (blobRefFields.contains(ordinal)) { + val blobRef = row.getBlobRef(ordinal) + if (blobAsDescriptor) blobRef.toDescriptor.serialize() else blobRef.toData + } else if (pureBlobFields.contains(ordinal)) { + val blob = row.getBlob(ordinal) + if (blobAsDescriptor) 
blob.toDescriptor.serialize() else blob.toData } else { super.getBinary(ordinal) } diff --git a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/paimon/spark/data/Spark4InternalRowWithBlob.scala b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/paimon/spark/data/Spark4InternalRowWithBlob.scala index 2a120e5b4c2a..4edca805d1cc 100644 --- a/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/paimon/spark/data/Spark4InternalRowWithBlob.scala +++ b/paimon-spark/paimon-spark4-common/src/main/scala/org/apache/paimon/spark/data/Spark4InternalRowWithBlob.scala @@ -19,7 +19,7 @@ package org.apache.paimon.spark.data import org.apache.paimon.spark.AbstractSparkInternalRow -import org.apache.paimon.types.RowType +import org.apache.paimon.types.{DataTypeRoot, RowType} import org.apache.paimon.utils.InternalRowUtils.copyInternalRow import org.apache.spark.sql.catalyst.InternalRow @@ -28,13 +28,17 @@ import org.apache.spark.unsafe.types.VariantVal class Spark4InternalRowWithBlob(rowType: RowType, blobFields: Set[Int], blobAsDescriptor: Boolean) extends Spark4InternalRow(rowType) { + private val blobRefFields: Set[Int] = + blobFields.filter(i => rowType.getTypeAt(i).getTypeRoot.equals(DataTypeRoot.BLOB_REF)) + private val pureBlobFields: Set[Int] = blobFields -- blobRefFields + override def getBinary(ordinal: Int): Array[Byte] = { - if (blobFields.contains(ordinal)) { - if (blobAsDescriptor) { - row.getBlob(ordinal).toDescriptor.serialize() - } else { - row.getBlob(ordinal).toData - } + if (blobRefFields.contains(ordinal)) { + val blobRef = row.getBlobRef(ordinal) + if (blobAsDescriptor) blobRef.toDescriptor.serialize() else blobRef.toData + } else if (pureBlobFields.contains(ordinal)) { + val blob = row.getBlob(ordinal) + if (blobAsDescriptor) blob.toDescriptor.serialize() else blob.toData } else { super.getBinary(ordinal) } diff --git a/paimon-vortex/paimon-vortex-format/src/main/java/org/apache/paimon/format/vortex/VortexFileFormat.java 
b/paimon-vortex/paimon-vortex-format/src/main/java/org/apache/paimon/format/vortex/VortexFileFormat.java index eda8a3944ca9..d6191af2204d 100644 --- a/paimon-vortex/paimon-vortex-format/src/main/java/org/apache/paimon/format/vortex/VortexFileFormat.java +++ b/paimon-vortex/paimon-vortex-format/src/main/java/org/apache/paimon/format/vortex/VortexFileFormat.java @@ -27,6 +27,7 @@ import org.apache.paimon.types.ArrayType; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.BinaryType; +import org.apache.paimon.types.BlobRefType; import org.apache.paimon.types.BlobType; import org.apache.paimon.types.BooleanType; import org.apache.paimon.types.CharType; @@ -187,6 +188,12 @@ public Void visit(BlobType blobType) { "Vortex file format does not support type BLOB"); } + @Override + public Void visit(BlobRefType blobRefType) { + throw new UnsupportedOperationException( + "Vortex file format does not support type BLOB_REF"); + } + @Override public Void visit(ArrayType arrayType) { return null;