From 560eb1e920dbfb024d5197cd7e0427e940e815ea Mon Sep 17 00:00:00 2001
From: xuzifu666 <1206332514@qq.com>
Date: Thu, 30 Apr 2026 15:51:14 +0800
Subject: [PATCH 1/3] [core] Introduce PrefixFileIndex for prefix query
optimization
---
.../fileindex/prefix/PrefixFileIndex.java | 341 ++++++++++++++++++
.../prefix/PrefixFileIndexFactory.java | 40 ++
...apache.paimon.fileindex.FileIndexerFactory | 1 +
3 files changed, 382 insertions(+)
create mode 100644 paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java
create mode 100644 paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java
diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java
new file mode 100644
index 000000000000..a99cc13e67b6
--- /dev/null
+++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndex.java
@@ -0,0 +1,341 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.prefix;
+
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexResult;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fileindex.FileIndexer;
+import org.apache.paimon.fs.SeekableInputStream;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.predicate.FieldRef;
+import org.apache.paimon.types.DataType;
+import org.apache.paimon.utils.IOUtils;
+import org.apache.paimon.utils.RoaringBitmap32;
+
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+/**
+ * Prefix file index for accelerating prefix match queries (e.g. LIKE 'prefix%', STARTS_WITH).
+ *
+ * <p>For each text value, extracts a fixed-length prefix (default 3 characters) and builds an
+ * inverted index: prefix -> RoaringBitmap of row numbers. Queries with a prefix literal can
+ * quickly determine whether the data file needs to be read.
+ */
+public class PrefixFileIndex implements FileIndexer {
+
+    public static final String PREFIX_LENGTH = "prefix-length";
+    public static final String VERSION = "version";
+    public static final int VERSION_1 = 1;
+    private static final int DEFAULT_PREFIX_LENGTH = 3; // used when PREFIX_LENGTH is not set
+
+    private final Options options;
+    private final DataType dataType;
+
+    /** Creates the indexer; {@code options} is handed to every writer this instance creates. */
+    public PrefixFileIndex(DataType dataType, Options options) {
+        this.options = options;
+        this.dataType = dataType;
+    }
+
+    @Override
+    public FileIndexWriter createWriter() {
+        return new Writer(this.options); // the writer reads "prefix-length" from these options
+    }
+
+    @Override
+    public FileIndexReader createReader(SeekableInputStream inputStream, int start, int length) {
+        try {
+            inputStream.seek(start); // the index occupies `length` bytes starting at `start`
+            byte[] indexBytes = new byte[length];
+            IOUtils.readFully(inputStream, indexBytes);
+            return new Reader(indexBytes); // parsed lazily, on the first visit call
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Converts a {@link BinaryString} or String key/literal to String; rejects other types. */
+    private static String keyToString(Object key) {
+        // String.toString() returns the string itself, so both accepted types share one branch.
+        if (key instanceof String || key instanceof BinaryString) {
+            return key.toString();
+        }
+        String actualType = key == null ? "null" : key.getClass().getName();
+        throw new IllegalArgumentException(
+                "Prefix index only supports string types, but got: " + actualType);
+    }
+
+    /** Returns the first {@code prefixLength} code points of {@code text} (all if shorter). */
+    private static String extractPrefix(String text, int prefixLength) {
+        // Never cut inside a surrogate pair: UTF-8 encoding turns a lone surrogate into '?'.
+        boolean fits = text.codePointCount(0, text.length()) <= prefixLength;
+        return fits ? text : text.substring(0, text.offsetByCodePoints(0, prefixLength));
+    }
+
+ // ==================== Writer ====================
+
+    /** Records, per prefix, the positions of the rows carrying it and serializes the index. */
+    private static class Writer extends FileIndexWriter {
+
+        private final int prefixLength;
+        // prefix -> positions (0-based, in write order) of the rows whose value has that prefix
+        private final Map<String, RoaringBitmap32> prefix2bitmap = new HashMap<>();
+        private final RoaringBitmap32 nullBitmap = new RoaringBitmap32(); // positions of nulls
+        private int rowNumber; // position the next written row will get
+
+        Writer(Options options) {
+            this.prefixLength = options.getInteger(PREFIX_LENGTH, DEFAULT_PREFIX_LENGTH);
+            if (prefixLength <= 0) {
+                throw new IllegalArgumentException(
+                        "prefix-length must be positive, but got: " + prefixLength);
+            }
+        }
+
+        @Override
+        public void write(Object key) {
+            if (key == null) {
+                nullBitmap.add(rowNumber++); // null rows consume a position too
+            } else {
+                String prefix = extractPrefix(keyToString(key), prefixLength);
+                prefix2bitmap.computeIfAbsent(prefix, k -> new RoaringBitmap32()).add(rowNumber++);
+            }
+        }
+
+        /**
+         * Layout: version byte, prefix length, row count, entry count, null flag, null offset,
+         * one (UTF-8 length, UTF-8 bytes, body offset) entry per prefix, then the bitmap body.
+         */
+        @Override
+        public byte[] serializedBytes() {
+            try {
+                // Lay out the body first; every offset is relative to the start of the body.
+                byte[] nullBitmapBytes = null;
+                int nullOffset = 0; // also written when there are no nulls; the null flag tells
+                int bodyOffset = 0;
+                if (nullBitmap.getCardinality() == 1) {
+                    nullOffset = -1 - nullBitmap.first(); // a single null row is inlined
+                } else if (!nullBitmap.isEmpty()) {
+                    nullBitmapBytes = serializeBitmap(nullBitmap); // sits at body offset 0
+                    bodyOffset = nullBitmapBytes.length;
+                }
+
+                // Sorted iteration keeps the serialized bytes deterministic.
+                LinkedHashMap<String, Integer> offsets = new LinkedHashMap<>();
+                LinkedHashMap<String, byte[]> bitmapBytes = new LinkedHashMap<>();
+                for (String prefix : sortedPrefixes()) {
+                    byte[] bytes = serializeBitmap(prefix2bitmap.get(prefix));
+                    offsets.put(prefix, bodyOffset);
+                    bitmapBytes.put(prefix, bytes);
+                    bodyOffset += bytes.length;
+                }
+
+                ByteArrayOutputStream output = new ByteArrayOutputStream();
+                DataOutputStream dos = new DataOutputStream(output);
+                // Header
+                dos.writeByte(VERSION_1);
+                dos.writeInt(prefixLength);
+                dos.writeInt(rowNumber);
+                dos.writeInt(prefix2bitmap.size());
+                dos.writeBoolean(!nullBitmap.isEmpty());
+                dos.writeInt(nullOffset);
+
+                // Entries: prefix bytes followed by the offset of the prefix's bitmap
+                for (Map.Entry<String, Integer> entry : offsets.entrySet()) {
+                    byte[] prefixBytes = entry.getKey().getBytes(StandardCharsets.UTF_8);
+                    dos.writeInt(prefixBytes.length);
+                    dos.write(prefixBytes);
+                    dos.writeInt(entry.getValue());
+                }
+
+                // Body: the optional null bitmap, then the prefix bitmaps in entry order
+                if (nullBitmapBytes != null) {
+                    dos.write(nullBitmapBytes);
+                }
+                for (byte[] bytes : bitmapBytes.values()) {
+                    dos.write(bytes);
+                }
+                dos.flush();
+                return output.toByteArray();
+            } catch (IOException e) {
+                throw new RuntimeException("Failed to serialize prefix file index", e);
+            }
+        }
+
+        /** Serializes one bitmap into a standalone byte array. */
+        private byte[] serializeBitmap(RoaringBitmap32 bitmap) throws IOException {
+            ByteArrayOutputStream baos = new ByteArrayOutputStream();
+            DataOutputStream dos = new DataOutputStream(baos);
+            bitmap.serialize(dos);
+            dos.flush();
+            return baos.toByteArray();
+        }
+
+        /** Returns the distinct prefixes in natural {@code String} order. */
+        private java.util.List<String> sortedPrefixes() {
+            java.util.List<String> list = new java.util.ArrayList<>(prefix2bitmap.keySet());
+            java.util.Collections.sort(list);
+            return list;
+        }
+    }
+
+ // ==================== Reader ====================
+
+    /**
+     * Answers file-level predicates from a serialized prefix index. The header and the prefix
+     * entries are parsed lazily, on the first query that needs them.
+     */
+    private static class Reader extends FileIndexReader {
+
+        private final byte[] data;
+
+        // Header state; only valid after ensureLoaded(). prefixOffsets != null means "loaded".
+        private int prefixLength;
+        private int rowCount;
+        private boolean hasNull;
+        private int nullOffset;
+        private Map<String, Integer> prefixOffsets;
+
+        Reader(byte[] data) {
+            this.data = data;
+            // Placeholder only: the writer's prefix length is stored in the header and replaces
+            // this value in ensureLoaded(), which every visit method calls before reading it.
+            this.prefixLength = DEFAULT_PREFIX_LENGTH;
+        }
+
+        /**
+         * Parses the header and the prefix entries on first use; later calls are no-ops. Must run
+         * before {@code prefixLength} or {@code hasNull} is read.
+         */
+        private void ensureLoaded() {
+            if (prefixOffsets != null) {
+                return;
+            }
+            try {
+                DataInputStream dis = new DataInputStream(new java.io.ByteArrayInputStream(data));
+
+                int version = dis.readByte();
+                if (version != VERSION_1) {
+                    throw new RuntimeException("Unsupported prefix file index version: " + version);
+                }
+                prefixLength = dis.readInt();
+                rowCount = dis.readInt();
+                int entryCount = dis.readInt();
+                hasNull = dis.readBoolean();
+                nullOffset = dis.readInt();
+
+                Map<String, Integer> offsets = new HashMap<>(entryCount);
+                for (int i = 0; i < entryCount; i++) {
+                    byte[] prefixBytes = new byte[dis.readInt()];
+                    dis.readFully(prefixBytes);
+                    String prefix = new String(prefixBytes, StandardCharsets.UTF_8);
+                    offsets.put(prefix, dis.readInt());
+                }
+                // Publish last: a failed parse must not leave the reader marked as loaded.
+                prefixOffsets = offsets;
+            } catch (IOException e) {
+                throw new RuntimeException("Failed to deserialize prefix file index", e);
+            }
+        }
+
+        /**
+         * Returns whether some indexed value may start with {@code prefix}, which the caller has
+         * already truncated with {@code extractPrefix}.
+         */
+        private boolean hasPrefix(String prefix) {
+            ensureLoaded();
+            // The writer creates an entry only when it records a row under it, so the presence of
+            // the key alone proves a match; the bitmap does not have to be deserialized.
+            if (prefixOffsets.containsKey(prefix)) {
+                return true;
+            }
+            // A query shorter than the prefix length has no exact entry: look for a stored prefix
+            // that extends it.
+            for (String stored : prefixOffsets.keySet()) {
+                if (stored.startsWith(prefix)) {
+                    return true;
+                }
+            }
+            return false;
+        }
+
+        /** Truncates {@code text} with the writer's prefix length and probes the index. */
+        private FileIndexResult remainIfPrefixPresent(String text) {
+            ensureLoaded(); // must precede extractPrefix: prefixLength comes from the header
+            String prefix = extractPrefix(text, prefixLength);
+            return hasPrefix(prefix) ? FileIndexResult.REMAIN : FileIndexResult.SKIP;
+        }
+
+        /** Skips the file when no indexed value can start with {@code literal}. */
+        @Override
+        public FileIndexResult visitStartsWith(FieldRef fieldRef, Object literal) {
+            if (literal == null) {
+                return FileIndexResult.REMAIN;
+            }
+            return remainIfPrefixPresent(keyToString(literal));
+        }
+
+        /** Skips the file when no indexed value shares the prefix of {@code literal}. */
+        @Override
+        public FileIndexResult visitEqual(FieldRef fieldRef, Object literal) {
+            if (literal == null) {
+                ensureLoaded(); // hasNull is only valid once the header has been parsed
+                return hasNull ? FileIndexResult.REMAIN : FileIndexResult.SKIP;
+            }
+            // An equal value necessarily has the same prefix as the literal.
+            return remainIfPrefixPresent(keyToString(literal));
+        }
+
+        /** Treats {@code LIKE 'prefix%'} like STARTS_WITH; any other pattern keeps the file. */
+        @Override
+        public FileIndexResult visitLike(FieldRef fieldRef, Object literal) {
+            if (literal == null) {
+                return FileIndexResult.REMAIN;
+            }
+            String pattern = keyToString(literal);
+            // Only "prefix%" is optimized: exactly one '%', in last position, and no '_'.
+            // NOTE(review): patterns containing '\' are left alone in case it is the LIKE escape
+            // character (then 'ab\%' would mean the literal "ab%") - confirm against the planner.
+            boolean prefixPattern =
+                    pattern.length() > 1
+                            && pattern.indexOf('%') == pattern.length() - 1
+                            && pattern.indexOf('_') == -1
+                            && pattern.indexOf('\\') == -1;
+            if (!prefixPattern) {
+                return FileIndexResult.REMAIN;
+            }
+            return remainIfPrefixPresent(pattern.substring(0, pattern.length() - 1));
+        }
+
+        /** Skips the file when the index recorded no null row. */
+        @Override
+        public FileIndexResult visitIsNull(FieldRef fieldRef) {
+            ensureLoaded();
+            return hasNull ? FileIndexResult.REMAIN : FileIndexResult.SKIP;
+        }
+    }
+}
diff --git a/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java
new file mode 100644
index 000000000000..b25cd865e8a3
--- /dev/null
+++ b/paimon-common/src/main/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexFactory.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.prefix;
+
+import org.apache.paimon.fileindex.FileIndexer;
+import org.apache.paimon.fileindex.FileIndexerFactory;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataType;
+
+/** Index factory to construct {@link PrefixFileIndex}. */
+public class PrefixFileIndexFactory implements FileIndexerFactory {
+    /** Name returned by {@link #identifier()} for the prefix index type. */
+    public static final String PREFIX = "prefix";
+
+    @Override
+    public FileIndexer create(DataType type, Options options) {
+        return new PrefixFileIndex(type, options);
+    }
+
+    @Override
+    public String identifier() {
+        return PREFIX;
+    }
+}
diff --git a/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory b/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
index 5f8ed20221d4..848d2dc79121 100644
--- a/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
+++ b/paimon-common/src/main/resources/META-INF/services/org.apache.paimon.fileindex.FileIndexerFactory
@@ -17,3 +17,4 @@ org.apache.paimon.fileindex.bloomfilter.BloomFilterFileIndexFactory
org.apache.paimon.fileindex.bitmap.BitmapFileIndexFactory
org.apache.paimon.fileindex.bsi.BitSliceIndexBitmapFileIndexFactory
org.apache.paimon.fileindex.rangebitmap.RangeBitmapFileIndexFactory
+org.apache.paimon.fileindex.prefix.PrefixFileIndexFactory
From 7ce0e7d9f9918945b9f248f8a6f375e71d07a107 Mon Sep 17 00:00:00 2001
From: xuzifu666 <1206332514@qq.com>
Date: Thu, 30 Apr 2026 15:52:21 +0800
Subject: [PATCH 2/3] [core] Add unit tests for PrefixFileIndex
---
.../fileindex/prefix/PrefixFileIndexTest.java | 206 ++++++++++++++++++
1 file changed, 206 insertions(+)
create mode 100644 paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java
diff --git a/paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java b/paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java
new file mode 100644
index 000000000000..ba1a6d8ebbd4
--- /dev/null
+++ b/paimon-common/src/test/java/org/apache/paimon/fileindex/prefix/PrefixFileIndexTest.java
@@ -0,0 +1,206 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.fileindex.prefix;
+
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fs.ByteArraySeekableStream;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.types.DataTypes;
+
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+import java.util.HashMap;
+
+/** Tests for {@link PrefixFileIndex}. */
+public class PrefixFileIndexTest {
+
+    @Test
+    public void testStartsWith() {
+        PrefixFileIndex index =
+                new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap<String, String>()));
+        FileIndexWriter writer = index.createWriter();
+
+        // With the default prefix length (3) these values are indexed under "hel" and "wor".
+        for (String value : new String[] {"hello", "world", "help", "helm", "helium"}) {
+            writer.write(value);
+        }
+
+        byte[] serialized = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length);
+
+        // Literals whose (truncated) prefix is indexed keep the file
+        Assertions.assertThat(reader.visitStartsWith(null, "hel").remain()).isTrue();
+        Assertions.assertThat(reader.visitStartsWith(null, "hello").remain()).isTrue();
+        Assertions.assertThat(reader.visitStartsWith(null, "wor").remain()).isTrue();
+        Assertions.assertThat(reader.visitStartsWith(null, "world").remain()).isTrue();
+        // A literal shorter than the prefix length matches any stored prefix extending it
+        Assertions.assertThat(reader.visitStartsWith(null, "he").remain()).isTrue();
+
+        // Prefixes missing from the index skip the file
+        Assertions.assertThat(reader.visitStartsWith(null, "abc").remain()).isFalse();
+        Assertions.assertThat(reader.visitStartsWith(null, "xyz").remain()).isFalse();
+        // "helz" truncates to "hel", which is indexed: a false positive, so REMAIN
+        Assertions.assertThat(reader.visitStartsWith(null, "helz").remain()).isTrue();
+    }
+
+    @Test
+    public void testEqual() {
+        PrefixFileIndex index =
+                new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap<String, String>()));
+        FileIndexWriter writer = index.createWriter();
+
+        writer.write("apple");
+        writer.write("apply");
+        writer.write("banana");
+
+        byte[] serialized = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length);
+
+        // Existing values: their prefix "app" is indexed, so REMAIN
+        Assertions.assertThat(reader.visitEqual(null, "apple").remain()).isTrue();
+        Assertions.assertThat(reader.visitEqual(null, "apply").remain()).isTrue();
+
+        // "applx" is absent but shares the indexed prefix "app": false positive, so REMAIN
+        Assertions.assertThat(reader.visitEqual(null, "applx").remain()).isTrue();
+
+        // No stored prefix matches "apr" or "che": SKIP
+        Assertions.assertThat(reader.visitEqual(null, "apricot").remain()).isFalse();
+        Assertions.assertThat(reader.visitEqual(null, "cherry").remain()).isFalse();
+        // "ban" is the stored prefix of "banana", so the file cannot be ruled out: REMAIN
+        Assertions.assertThat(reader.visitEqual(null, "ban").remain()).isTrue();
+    }
+
+    @Test
+    public void testLikePrefixPattern() {
+        PrefixFileIndex index =
+                new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap<String, String>()));
+        FileIndexWriter writer = index.createWriter();
+
+        writer.write("database");
+        writer.write("dataflow");
+        writer.write("datamine");
+
+        byte[] serialized = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length);
+
+        // 'prefix%' patterns are answered like STARTS_WITH on the text before the '%'
+        Assertions.assertThat(reader.visitLike(null, "dat%").remain()).isTrue();
+        Assertions.assertThat(reader.visitLike(null, "data%").remain()).isTrue();
+        Assertions.assertThat(reader.visitLike(null, "xyz%").remain()).isFalse();
+
+        // A leading wildcard cannot be answered from prefixes, so the file is always kept
+        Assertions.assertThat(reader.visitLike(null, "%base").remain()).isTrue();
+        Assertions.assertThat(reader.visitLike(null, "%ata%").remain()).isTrue();
+    }
+
+    @Test
+    public void testNullValues() {
+        PrefixFileIndex index =
+                new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap<String, String>()));
+        FileIndexWriter writer = index.createWriter();
+
+        writer.write("test");
+        writer.write(null);
+        writer.write("testing");
+
+        byte[] serialized = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length);
+
+        // One null row was written, so IS NULL must keep the file
+        Assertions.assertThat(reader.visitIsNull(null).remain()).isTrue();
+
+        // A null literal cannot be decided from the index, so STARTS_WITH keeps the file
+        Assertions.assertThat(reader.visitStartsWith(null, null).remain()).isTrue();
+    }
+
+    @Test
+    public void testNoNullValues() {
+        PrefixFileIndex index =
+                new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap<String, String>()));
+        FileIndexWriter writer = index.createWriter();
+
+        writer.write("only");
+        writer.write("values");
+
+        byte[] serialized = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length);
+
+        // No null row was written, so IS NULL can skip the file
+        Assertions.assertThat(reader.visitIsNull(null).remain()).isFalse();
+    }
+
+    @Test
+    public void testCustomPrefixLength() {
+        PrefixFileIndex index =
+                new PrefixFileIndex(
+                        DataTypes.STRING(),
+                        new Options(
+                                new HashMap<String, String>() {
+                                    {
+                                        put("prefix-length", "2");
+                                    }
+                                }));
+        FileIndexWriter writer = index.createWriter();
+
+        writer.write("abcde");
+        writer.write("abxyz");
+        writer.write("bcdef");
+
+        byte[] serialized = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length);
+
+        // With prefix-length=2 "abcde" and "abxyz" share the entry "ab"; "abc" truncates to it
+        Assertions.assertThat(reader.visitStartsWith(null, "ab").remain()).isTrue();
+        Assertions.assertThat(reader.visitStartsWith(null, "abc").remain()).isTrue();
+
+        // "bcdef" is indexed under "bc"
+        Assertions.assertThat(reader.visitStartsWith(null, "bc").remain()).isTrue();
+
+        // No value was indexed under "xy"
+        Assertions.assertThat(reader.visitStartsWith(null, "xy").remain()).isFalse();
+    }
+
+    @Test
+    public void testShortValues() {
+        PrefixFileIndex index =
+                new PrefixFileIndex(DataTypes.STRING(), new Options(new HashMap<String, String>()));
+        FileIndexWriter writer = index.createWriter();
+
+        writer.write("ab");
+        writer.write("a");
+        writer.write("abc");
+
+        byte[] serialized = writer.serializedBytes();
+        FileIndexReader reader =
+                index.createReader(new ByteArraySeekableStream(serialized), 0, serialized.length);
+
+        // Values shorter than the prefix length are indexed whole, so "ab" has its own entry
+        Assertions.assertThat(reader.visitStartsWith(null, "ab").remain()).isTrue();
+        // "a" is itself an entry and is also extended by the entries "ab" and "abc"
+        Assertions.assertThat(reader.visitStartsWith(null, "a").remain()).isTrue();
+    }
+}
From c346975bdc54ac34a845f8b85d7c1e63a4e4b212 Mon Sep 17 00:00:00 2001
From: xuzifu666 <1206332514@qq.com>
Date: Thu, 30 Apr 2026 15:52:58 +0800
Subject: [PATCH 3/3] [benchmark] Add micro-benchmark for PrefixFileIndex
---
.../prefix/PrefixIndexBenchmark.java | 356 ++++++++++++++++++
1 file changed, 356 insertions(+)
create mode 100644 paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java
diff --git a/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java
new file mode 100644
index 000000000000..02b3229da8cb
--- /dev/null
+++ b/paimon-benchmark/paimon-micro-benchmarks/src/test/java/org/apache/paimon/benchmark/prefix/PrefixIndexBenchmark.java
@@ -0,0 +1,356 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.paimon.benchmark.prefix;
+
+import org.apache.paimon.benchmark.Benchmark;
+import org.apache.paimon.data.BinaryString;
+import org.apache.paimon.fileindex.FileIndexReader;
+import org.apache.paimon.fileindex.FileIndexWriter;
+import org.apache.paimon.fileindex.bitmap.BitmapFileIndex;
+import org.apache.paimon.fileindex.bitmap.BitmapIndexResult;
+import org.apache.paimon.fileindex.prefix.PrefixFileIndex;
+import org.apache.paimon.fs.local.LocalFileIO;
+import org.apache.paimon.options.Options;
+import org.apache.paimon.predicate.FieldRef;
+import org.apache.paimon.types.DataTypes;
+
+import org.apache.commons.io.FileUtils;
+import org.junit.Rule;
+import org.junit.jupiter.api.Test;
+import org.junit.rules.TemporaryFolder;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Random;
+
+/** Benchmark for {@link PrefixFileIndex}. */
+public class PrefixIndexBenchmark {
+
+    public static final int ROW_COUNT = 1_000_000;
+    public static final int[] CARDINALITIES = {100, 1_000, 10_000};
+    public static final int[] PREFIX_LENGTHS = {2, 3, 4};
+    public static final String[] CATEGORIES = {
+        "electronics", "clothing", "books", "food", "sports"
+    };
+    @Rule public TemporaryFolder folder = new TemporaryFolder();
+
+    @Test
+    public void testQueryHit() throws Exception {
+        for (int cardinality : CARDINALITIES) {
+            IndexFiles indexes = createIndexes(cardinality);
+            String hitLiteral = CATEGORIES[0] + "_" + (cardinality / 2);
+            // Every case below probes with this literal, built like the generated values.
+            Benchmark bench =
+                    new Benchmark(
+                                    String.format(
+                                            "prefix-index-query-hit-cardinality-%s", cardinality),
+                                    100)
+                            .setNumWarmupIters(1)
+                            .setOutputPerIteration(false);
+
+            bench.addCase(
+                    "prefix-index-prefix2",
+                    10,
+                    () -> queryPrefix(indexes.prefixFile2, hitLiteral, 2));
+            bench.addCase(
+                    "prefix-index-prefix3",
+                    10,
+                    () -> queryPrefix(indexes.prefixFile3, hitLiteral, 3));
+            bench.addCase(
+                    "prefix-index-prefix4",
+                    10,
+                    () -> queryPrefix(indexes.prefixFile4, hitLiteral, 4));
+            bench.addCase(
+                    "bitmap-index-equal", 10, () -> queryBitmap(indexes.bitmapFile, hitLiteral));
+            bench.addCase(
+                    "no-index-full-scan", 10, () -> queryNoIndexScan(indexes.data, hitLiteral));
+
+            bench.run();
+        }
+    }
+
+    @Test
+    public void testQuerySkip() throws Exception {
+        for (int cardinality : CARDINALITIES) {
+            IndexFiles indexes = createIndexes(cardinality);
+            // No generated value starts with this literal, so every index can skip the file
+            String missLiteral = "unknown_not_exists";
+
+            Benchmark bench =
+                    new Benchmark(
+                                    String.format(
+                                            "prefix-index-query-skip-cardinality-%s", cardinality),
+                                    100)
+                            .setNumWarmupIters(1)
+                            .setOutputPerIteration(false);
+
+            bench.addCase(
+                    "prefix-index-prefix2",
+                    10,
+                    () -> queryPrefix(indexes.prefixFile2, missLiteral, 2));
+            bench.addCase(
+                    "prefix-index-prefix3",
+                    10,
+                    () -> queryPrefix(indexes.prefixFile3, missLiteral, 3));
+            bench.addCase(
+                    "prefix-index-prefix4",
+                    10,
+                    () -> queryPrefix(indexes.prefixFile4, missLiteral, 4));
+            bench.addCase(
+                    "bitmap-index-equal", 10, () -> queryBitmap(indexes.bitmapFile, missLiteral));
+            bench.addCase(
+                    "no-index-full-scan", 10, () -> queryNoIndexScan(indexes.data, missLiteral));
+
+            bench.run();
+        }
+    }
+
+    @Test
+    public void testIndexSize() throws Exception {
+        System.out.println("\n========== Prefix Index Size Comparison ==========");
+        System.out.printf(
+                "%-15s %-15s %-15s %-15s %-15s %-15s%n",
+                "Cardinality",
+                "PrefixLen=2",
+                "PrefixLen=3",
+                "PrefixLen=4",
+                "BitmapIndex",
+                "RawData");
+        System.out.println(
+                "---------------------------------------------------------------------------------------------");
+        // One row per cardinality: index file sizes in bytes next to the estimated raw data size
+        for (int cardinality : CARDINALITIES) {
+            IndexFiles indexes = createIndexes(cardinality);
+            long prefix2Size = indexes.prefixFile2.length();
+            long prefix3Size = indexes.prefixFile3.length();
+            long prefix4Size = indexes.prefixFile4.length();
+            long bitmapSize = indexes.bitmapFile.length();
+            long rawDataSize = estimateRawDataSize(indexes.data);
+            System.out.printf(
+                    "%-15d %-15d %-15d %-15d %-15d %-15d%n",
+                    cardinality, prefix2Size, prefix3Size, prefix4Size, bitmapSize, rawDataSize);
+        }
+        System.out.println();
+    }
+
+    @Test
+    public void testBuildTime() throws Exception {
+        for (int cardinality : CARDINALITIES) {
+            Benchmark bench =
+                    new Benchmark(
+                                    String.format("prefix-index-build-cardinality-%s", cardinality),
+                                    ROW_COUNT)
+                            .setNumWarmupIters(0)
+                            .setOutputPerIteration(false);
+            // Each case builds and serializes one index over the same generated rows
+            bench.addCase("prefix-index-prefix2", 5, () -> buildPrefixIndex(cardinality, 2));
+            bench.addCase("prefix-index-prefix3", 5, () -> buildPrefixIndex(cardinality, 3));
+            bench.addCase("prefix-index-prefix4", 5, () -> buildPrefixIndex(cardinality, 4));
+            bench.addCase("bitmap-index", 5, () -> buildBitmapIndex(cardinality));
+
+            bench.run();
+        }
+    }
+
+    /**
+     * Builds the four indexes compared by this benchmark over one data set: three prefix indexes
+     * (prefix length 2, 3 and 4) and one bitmap index, each stored in its own file.
+     *
+     * <p>The data set has {@link #ROW_COUNT} values of the form {@code <category>_<n>} with
+     * {@code n} in {@code [0, cardinality)}, drawn with a fixed seed so every call sees the same
+     * rows.
+     *
+     * @param cardinality exclusive upper bound of the numeric suffix
+     * @return the index files together with the generated values
+     * @throws IOException if a file cannot be created or written
+     */
+    private IndexFiles createIndexes(int cardinality) throws IOException {
+        // NOTE(review): the JUnit 4 @Rule is presumably not applied under the Jupiter @Test used
+        // in this class, which would explain the explicit create(); if so, nothing deletes the
+        // folder afterwards - confirm.
+        folder.create();
+
+        File prefixFile2 = folder.newFile("prefix-index-2-" + cardinality);
+        File prefixFile3 = folder.newFile("prefix-index-3-" + cardinality);
+        File prefixFile4 = folder.newFile("prefix-index-4-" + cardinality);
+        File bitmapFile = folder.newFile("bitmap-index-" + cardinality);
+
+        FileIndexWriter writer2 = newPrefixWriter(2);
+        FileIndexWriter writer3 = newPrefixWriter(3);
+        FileIndexWriter writer4 = newPrefixWriter(4);
+        FileIndexWriter bitmapWriter =
+                new BitmapFileIndex(DataTypes.STRING(), new Options()).createWriter();
+
+        String[] data = new String[ROW_COUNT];
+        Random random = new Random(42);
+        for (int i = 0; i < ROW_COUNT; i++) {
+            String value =
+                    CATEGORIES[random.nextInt(CATEGORIES.length)]
+                            + "_"
+                            + random.nextInt(cardinality);
+            data[i] = value;
+            // Convert once; all four writers receive the same key.
+            BinaryString key = BinaryString.fromString(value);
+            writer2.write(key);
+            writer3.write(key);
+            writer4.write(key);
+            bitmapWriter.write(key);
+        }
+
+        FileUtils.writeByteArrayToFile(prefixFile2, writer2.serializedBytes());
+        FileUtils.writeByteArrayToFile(prefixFile3, writer3.serializedBytes());
+        FileUtils.writeByteArrayToFile(prefixFile4, writer4.serializedBytes());
+        FileUtils.writeByteArrayToFile(bitmapFile, bitmapWriter.serializedBytes());
+
+        return new IndexFiles(prefixFile2, prefixFile3, prefixFile4, bitmapFile, data);
+    }
+
+    /** Creates a prefix index writer configured with the given {@code prefix-length}. */
+    private static FileIndexWriter newPrefixWriter(int prefixLength) {
+        Options options = new Options();
+        options.setString("prefix-length", String.valueOf(prefixLength));
+        return new PrefixFileIndex(DataTypes.STRING(), options).createWriter();
+    }
+
+    /**
+     * Benchmark case: writes {@link #ROW_COUNT} generated values into a fresh prefix index and
+     * serializes it.
+     * @param cardinality exclusive upper bound of the numeric suffix of the generated values
+     * @param prefixLength value of the {@code prefix-length} option
+     */
+    private void buildPrefixIndex(int cardinality, int prefixLength) {
+        try {
+            Options options = new Options(new HashMap<String, String>());
+            options.setString("prefix-length", String.valueOf(prefixLength));
+            FileIndexWriter writer =
+                    new PrefixFileIndex(DataTypes.STRING(), options).createWriter();
+            Random random = new Random(42); // fixed seed: every iteration indexes the same rows
+            for (int i = 0; i < ROW_COUNT; i++) {
+                String value =
+                        CATEGORIES[random.nextInt(CATEGORIES.length)]
+                                + "_"
+                                + random.nextInt(cardinality);
+                writer.write(BinaryString.fromString(value));
+            }
+            writer.serializedBytes(); // serialization is part of the measured build cost
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Benchmark case: writes the same generated values as the prefix build into a bitmap index.
+     */
+    private void buildBitmapIndex(int cardinality) {
+        try {
+            FileIndexWriter writer =
+                    new BitmapFileIndex(DataTypes.STRING(), new Options()).createWriter();
+            Random random = new Random(42);
+            for (int i = 0; i < ROW_COUNT; i++) {
+                String category = CATEGORIES[random.nextInt(CATEGORIES.length)];
+                writer.write(BinaryString.fromString(category + "_" + random.nextInt(cardinality)));
+            }
+            writer.serializedBytes();
+        } catch (Exception e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Benchmark case: opens the index file, builds a reader and runs one STARTS_WITH probe.
+     */
+    private static void queryPrefix(File indexFile, String prefix, int prefixLength) {
+        FieldRef fieldRef = new FieldRef(0, "", DataTypes.STRING());
+        Options options = new Options(new HashMap<String, String>());
+        options.setString("prefix-length", String.valueOf(prefixLength));
+        // try-with-resources: this runs once per benchmark iteration, so an unclosed stream
+        // would leak one file descriptor per call.
+        try (LocalFileIO.LocalSeekableInputStream stream =
+                new LocalFileIO.LocalSeekableInputStream(indexFile)) {
+            FileIndexReader reader =
+                    new PrefixFileIndex(DataTypes.STRING(), options)
+                            .createReader(stream, 0, (int) indexFile.length());
+            reader.visitStartsWith(fieldRef, BinaryString.fromString(prefix));
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Benchmark case: opens the bitmap index file and evaluates one equality predicate. */
+    private static void queryBitmap(File indexFile, String value) {
+        FieldRef fieldRef = new FieldRef(0, "", DataTypes.STRING());
+        try (LocalFileIO.LocalSeekableInputStream stream =
+                new LocalFileIO.LocalSeekableInputStream(indexFile)) {
+            FileIndexReader reader =
+                    new BitmapFileIndex(DataTypes.STRING(), new Options())
+                            .createReader(stream, 0, (int) indexFile.length());
+            // get() is called so the result bitmap is produced (the result is presumably lazy)
+            ((BitmapIndexResult) reader.visitEqual(fieldRef, BinaryString.fromString(value))).get();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Baseline without any index: linear scan that stops at the first value starting with
+     * {@code prefix}. A miss is a normal outcome (the skip benchmark queries absent prefixes).
+     */
+    private static void queryNoIndexScan(String[] data, String prefix) {
+        int i = 0;
+        while (i < data.length && !data[i].startsWith(prefix)) {
+            i++;
+        }
+        boolean remain = i < data.length;
+        // Consume the result in a condition the JIT cannot prove false, so the scan above is
+        // not eliminated as dead code.
+        if (remain && System.nanoTime() == 0) {
+            throw new RuntimeException("Unreachable");
+        }
+    }
+
+    private static long estimateRawDataSize(String[] data) {
+        long size = 4L * data.length; // one 4-byte length prefix per value
+        for (String s : data) {
+            size += s.getBytes(java.nio.charset.StandardCharsets.UTF_8).length; // fixed charset
+        }
+        return size;
+    }
+
+    /** Holder for the index files built for one cardinality plus the generated values. */
+    private static class IndexFiles {
+        final File prefixFile2; // prefix index built with prefix-length 2
+        final File prefixFile3; // prefix index built with prefix-length 3
+        final File prefixFile4; // prefix index built with prefix-length 4
+        final File bitmapFile;
+        final String[] data; // the values that were written into every index
+        IndexFiles(
+                File prefixFile2,
+                File prefixFile3,
+                File prefixFile4,
+                File bitmapFile,
+                String[] data) {
+            this.data = data;
+            this.bitmapFile = bitmapFile;
+            this.prefixFile4 = prefixFile4;
+            this.prefixFile3 = prefixFile3;
+            this.prefixFile2 = prefixFile2;
+        }
+    }
+}