diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index 55eb91e7..473416d5 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -382,6 +382,7 @@ if(PAIMON_BUILD_TESTS) common/file_index/bsi/bit_slice_index_roaring_bitmap_test.cpp common/file_index/rangebitmap/bit_slice_index_bitmap_test.cpp common/file_index/rangebitmap/dictionary/chunked_dictionary_test.cpp + common/file_index/rangebitmap/range_bitmap_file_index_test.cpp common/file_index/bloomfilter/bloom_filter_file_index_test.cpp common/file_index/bloomfilter/fast_hash_test.cpp common/global_index/complete_index_score_batch_reader_test.cpp diff --git a/src/paimon/common/file_index/CMakeLists.txt b/src/paimon/common/file_index/CMakeLists.txt index cff27cc9..9ac0804a 100644 --- a/src/paimon/common/file_index/CMakeLists.txt +++ b/src/paimon/common/file_index/CMakeLists.txt @@ -28,7 +28,10 @@ set(PAIMON_FILE_INDEX_SRC rangebitmap/dictionary/fixed_length_chunk.cpp rangebitmap/dictionary/key_factory.cpp rangebitmap/utils/literal_serialization_utils.cpp - rangebitmap/bit_slice_index_bitmap.cpp) + rangebitmap/bit_slice_index_bitmap.cpp + rangebitmap/range_bitmap.cpp + rangebitmap/range_bitmap_file_index.cpp + rangebitmap/range_bitmap_file_index_factory.cpp) add_paimon_lib(paimon_file_index SOURCES diff --git a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h index 73e18cda..1b1167af 100644 --- a/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h +++ b/src/paimon/common/file_index/rangebitmap/dictionary/key_factory.h @@ -54,7 +54,7 @@ class KeyFactory : public std::enable_shared_from_this { static Result> Create(FieldType field_type); public: - static constexpr char DEFAULT_CHUNK_SIZE[] = "16kb"; + static constexpr char kDefaultChunkSize[] = "16kb"; }; class FixedLengthKeyFactory : public KeyFactory { diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp 
b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp new file mode 100644 index 00000000..0c487a1c --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.cpp @@ -0,0 +1,310 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" + +#include + +#include "fmt/format.h" +#include "paimon/common/file_index/rangebitmap/dictionary/chunked_dictionary.h" +#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/common/file_index/rangebitmap/utils/literal_serialization_utils.h" +#include "paimon/common/io/data_output_stream.h" +#include "paimon/common/io/memory_segment_output_stream.h" +#include "paimon/common/memory/memory_segment_utils.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/io/data_input_stream.h" +#include "paimon/memory/bytes.h" + +namespace paimon { + +Result> RangeBitmap::Create( + const std::shared_ptr& input_stream, const int64_t offset, + const FieldType field_type, const std::shared_ptr& pool) { + PAIMON_RETURN_NOT_OK(input_stream->Seek(offset, SeekOrigin::FS_SEEK_SET)); + const auto data_in = std::make_shared(input_stream); + PAIMON_ASSIGN_OR_RAISE(int32_t header_length, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int8_t version, data_in->ReadValue()); + if (version != kCurrentVersion) { + return Status::Invalid(fmt::format("RangeBitmap unsupported 
version: {}", version)); + } + PAIMON_ASSIGN_OR_RAISE(int32_t rid, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(int32_t cardinality, data_in->ReadValue()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr shared_key_factory, + KeyFactory::Create(field_type)); + PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Deserializer key_deserializer, + LiteralSerDeUtils::CreateValueReader(field_type)); + auto min = Literal{field_type}; + auto max = Literal{field_type}; + if (cardinality > 0) { + PAIMON_ASSIGN_OR_RAISE(min, key_deserializer(data_in, pool.get())); + PAIMON_ASSIGN_OR_RAISE(max, key_deserializer(data_in, pool.get())); + } + PAIMON_ASSIGN_OR_RAISE(int32_t dictionary_length, data_in->ReadValue()); + auto dictionary_offset = static_cast(offset + sizeof(int32_t) + header_length); + int32_t bsi_offset = dictionary_offset + dictionary_length; + return std::unique_ptr(new RangeBitmap(rid, cardinality, dictionary_offset, + bsi_offset, min, max, shared_key_factory, + input_stream, pool)); +} + +Status RangeBitmap::Not(RoaringBitmap32* out) { + out->Flip(0, rid_); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 is_not_null, this->IsNotNull()); + *out &= is_not_null; + return Status::OK(); +} + +Result RangeBitmap::Eq(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_)); + PAIMON_ASSIGN_OR_RAISE(auto bit_slice_ptr, this->GetBitSliceIndex()); + if (min_compare == 0 && max_compare == 0) { + return bit_slice_ptr->IsNotNull({}); + } + if (min_compare < 0 || max_compare > 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(auto dictionary, this->GetDictionary()); + PAIMON_ASSIGN_OR_RAISE(int32_t code, dictionary->Find(key)); + if (code < 0) { + return RoaringBitmap32(); + } + return bit_slice_ptr->Eq(code); +} + +Result RangeBitmap::Neq(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + 
PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 result, Eq(key)); + PAIMON_RETURN_NOT_OK(Not(&result)); + return result; +} + +Result RangeBitmap::Lt(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_)); + if (max_compare > 0) { + return IsNotNull(); + } + if (min_compare <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 result, Gte(key)); + PAIMON_RETURN_NOT_OK(Not(&result)); + return result; +} + +Result RangeBitmap::Lte(const Literal& key) { + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 lt_result, Lt(key)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 eq_result, Eq(key)); + lt_result |= eq_result; + return lt_result; +} + +Result RangeBitmap::Gt(const Literal& key) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(int32_t max_compare, key.CompareTo(max_)); + if (max_compare >= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(int32_t min_compare, key.CompareTo(min_)); + if (min_compare < 0) { + return IsNotNull(); + } + PAIMON_ASSIGN_OR_RAISE(auto dictionary, this->GetDictionary()); + PAIMON_ASSIGN_OR_RAISE(int32_t code, dictionary->Find(key)); + PAIMON_ASSIGN_OR_RAISE(auto bit_slice_ptr, this->GetBitSliceIndex()); + if (code >= 0) { + return bit_slice_ptr->Gt(code); + } + return bit_slice_ptr->Gte(-code - 1); +} + +Result RangeBitmap::Gte(const Literal& key) { + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 gt_result, Gt(key)); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 eq_result, Eq(key)); + gt_result |= eq_result; + return gt_result; +} + +Result RangeBitmap::In(const std::vector& keys) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + RoaringBitmap32 result{}; + for (const auto& key : keys) { + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 bitmap, Eq(key)); + result |= bitmap; + } + return result; +} + +Result RangeBitmap::NotIn(const 
std::vector& keys) { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 result, In(keys)); + PAIMON_RETURN_NOT_OK(Not(&result)); + return result; +} + +Result RangeBitmap::IsNull() { + if (cardinality_ <= 0) { + if (rid_ > 0) { + RoaringBitmap32 result; + result.AddRange(0, rid_); + return result; + } + return RoaringBitmap32(); + } + + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 non_null_bitmap, IsNotNull()); + non_null_bitmap.Flip(0, rid_); + return non_null_bitmap; +} + +Result RangeBitmap::IsNotNull() { + if (cardinality_ <= 0) { + return RoaringBitmap32(); + } + PAIMON_ASSIGN_OR_RAISE(auto bit_slice_ptr, this->GetBitSliceIndex()); + PAIMON_ASSIGN_OR_RAISE(RoaringBitmap32 result, bit_slice_ptr->IsNotNull({})); + return result; +} + +RangeBitmap::RangeBitmap(const int32_t rid, const int32_t cardinality, + const int32_t dictionary_offset, const int32_t bsi_offset, + const Literal& min, const Literal& max, + const std::shared_ptr& key_factory, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool) + : pool_(pool), + rid_(rid), + cardinality_(cardinality), + bsi_offset_(bsi_offset), + dictionary_offset_(dictionary_offset), + min_(min), + max_(max), + key_factory_(key_factory), + input_stream_(input_stream), + bsi_(nullptr), + dictionary_(nullptr) {} + +Result RangeBitmap::GetBitSliceIndex() { + if (bsi_ == nullptr) { + PAIMON_ASSIGN_OR_RAISE(bsi_, + BitSliceIndexBitmap::Create(input_stream_, bsi_offset_, pool_)); + } + return bsi_.get(); +} + +Result RangeBitmap::GetDictionary() { + if (dictionary_ == nullptr) { + PAIMON_ASSIGN_OR_RAISE( + dictionary_, ChunkedDictionary::Create(key_factory_->GetFieldType(), input_stream_, + dictionary_offset_, pool_)); + } + return dictionary_.get(); +} + +Result> RangeBitmap::Appender::Create( + const std::shared_ptr& factory, int64_t limited_serialized_size_in_bytes, + const std::shared_ptr& pool) { + return std::unique_ptr(new Appender(factory, 
limited_serialized_size_in_bytes, pool)); +} + +RangeBitmap::Appender::Appender(const std::shared_ptr& factory, + const int64_t limited_serialized_size_in_bytes, + const std::shared_ptr& pool) + : pool_(pool), + rid_(0), + bitmaps_(LiteralComparator(factory)), + factory_(factory), + chunk_size_bytes_limit_(limited_serialized_size_in_bytes) {} + +void RangeBitmap::Appender::Append(const Literal& key) { + if (!key.IsNull()) { + bitmaps_[key].Add(rid_); + } + rid_++; +} + +Result> RangeBitmap::Appender::Serialize() const { + int32_t code = 0; + PAIMON_ASSIGN_OR_RAISE(auto bsi, + BitSliceIndexBitmap::Appender::Create( + 0, std::max(static_cast(bitmaps_.size() - 1), 0), pool_)); + PAIMON_ASSIGN_OR_RAISE(auto dictionary, + ChunkedDictionary::Appender::Create( + factory_, static_cast(chunk_size_bytes_limit_), pool_)); + for (const auto& [key, bitmap] : bitmaps_) { + PAIMON_RETURN_NOT_OK(dictionary->AppendSorted(key, code)); + for (auto it = bitmap.Begin(); it != bitmap.End(); ++it) { + PAIMON_RETURN_NOT_OK(bsi->Append(*it, code)); + } + code++; + } + PAIMON_ASSIGN_OR_RAISE(LiteralSerDeUtils::Serializer serializer, + LiteralSerDeUtils::CreateValueWriter(factory_->GetFieldType())); + Literal min{factory_->GetFieldType()}; + Literal max{factory_->GetFieldType()}; + if (!bitmaps_.empty()) { + min = bitmaps_.begin()->first; + max = bitmaps_.rbegin()->first; + } + PAIMON_ASSIGN_OR_RAISE(int32_t min_size, LiteralSerDeUtils::GetSerializedSizeInBytes(min)); + PAIMON_ASSIGN_OR_RAISE(int32_t max_size, LiteralSerDeUtils::GetSerializedSizeInBytes(max)); + int32_t header_size = 0; + header_size += sizeof(int8_t); // version + header_size += sizeof(int32_t); // rid + header_size += sizeof(int32_t); // cardinality + header_size += min.IsNull() ? 0 : min_size; // min literal size + header_size += max.IsNull() ? 
0 : max_size; // max literal size + header_size += sizeof(int32_t); // dictionary length + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR dictionary_bytes, dictionary->Serialize()); + auto dictionary_length = static_cast(dictionary_bytes->size()); + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR bsi_bytes, bsi->Serialize()); + size_t bsi_length = bsi_bytes->size(); + const auto data_output_stream = std::make_shared( + MemorySegmentOutputStream::DEFAULT_SEGMENT_SIZE, pool_); + data_output_stream->WriteValue(header_size); + data_output_stream->WriteValue(kCurrentVersion); + data_output_stream->WriteValue(rid_); + data_output_stream->WriteValue(static_cast(bitmaps_.size())); + if (!min.IsNull()) { + PAIMON_RETURN_NOT_OK(serializer(data_output_stream, min)); + } + if (!max.IsNull()) { + PAIMON_RETURN_NOT_OK(serializer(data_output_stream, max)); + } + data_output_stream->WriteValue(dictionary_length); + data_output_stream->Write(dictionary_bytes->data(), dictionary_length); + data_output_stream->Write(bsi_bytes->data(), bsi_length); + return MemorySegmentUtils::CopyToBytes(data_output_stream->Segments(), 0, + static_cast(data_output_stream->CurrentSize()), + pool_.get()); +} +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap.h b/src/paimon/common/file_index/rangebitmap/range_bitmap.h new file mode 100644 index 00000000..311f84b7 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap.h @@ -0,0 +1,111 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/common/file_index/rangebitmap/bit_slice_index_bitmap.h" +#include "paimon/common/file_index/rangebitmap/dictionary/dictionary.h" +#include "paimon/common/file_index/rangebitmap/dictionary/key_factory.h" +#include "paimon/fs/file_system.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/utils/roaring_bitmap32.h" + +namespace paimon { + +class InputStream; +class MemoryPool; + +class RangeBitmap { + public: + static Result> Create( + const std::shared_ptr& input_stream, int64_t offset, FieldType field_type, + const std::shared_ptr& pool); + + Result Eq(const Literal& key); + Result Neq(const Literal& key); + Result Lt(const Literal& key); + Result Lte(const Literal& key); + Result Gt(const Literal& key); + Result Gte(const Literal& key); + Result In(const std::vector& keys); + Result NotIn(const std::vector& keys); + Result IsNull(); + Result IsNotNull(); + + public: + static constexpr int8_t kCurrentVersion = 1; + + private: + Status Not(RoaringBitmap32* out); + RangeBitmap(int32_t rid, int32_t cardinality, int32_t dictionary_offset, int32_t bsi_offset, + const Literal& min, const Literal& max, + const std::shared_ptr& key_factory, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool); + Result GetBitSliceIndex(); + Result GetDictionary(); + + private: + std::shared_ptr pool_; + int32_t rid_; + int32_t cardinality_; + int32_t bsi_offset_; + int32_t dictionary_offset_; + Literal min_; + Literal max_; + std::shared_ptr key_factory_; + std::shared_ptr input_stream_; + + // For lazy loading + std::unique_ptr bsi_; + std::unique_ptr dictionary_; + + public: + class Appender { + public: + static Result> Create(const std::shared_ptr& factory, + int64_t limited_serialized_size_in_bytes, + const std::shared_ptr& pool); + void Append(const 
Literal& key); + Result> Serialize() const; + + private: + Appender(const std::shared_ptr& factory, + int64_t limited_serialized_size_in_bytes, const std::shared_ptr& pool); + struct LiteralComparator { + std::shared_ptr factory; + + explicit LiteralComparator(std::shared_ptr f) : factory(std::move(f)) {} + + bool operator()(const Literal& lhs, const Literal& rhs) const { + const auto result = factory->CompareLiteral(lhs, rhs); + return result.ok() && result.value() < 0; + } + }; + std::shared_ptr pool_; + int32_t rid_; + std::map bitmaps_; + std::shared_ptr factory_; + int64_t chunk_size_bytes_limit_; + }; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp new file mode 100644 index 00000000..345d7b4f --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.cpp @@ -0,0 +1,214 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +#include +#include + +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" +#include "paimon/common/options/memory_size.h" +#include "paimon/common/predicate/literal_converter.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/common/utils/field_type_utils.h" +#include "paimon/file_index/bitmap_index_result.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +RangeBitmapFileIndex::RangeBitmapFileIndex(const std::map& options) + : options_(options) {} + +Result> RangeBitmapFileIndex::CreateReader( + ::ArrowSchema* const arrow_schema, const int32_t start, const int32_t length, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema_ptr, + arrow::ImportSchema(arrow_schema)); + if (arrow_schema_ptr->num_fields() != 1) { + return Status::Invalid( + "invalid schema for RangeBitmapFileIndexReader, supposed to have single field."); + } + const auto arrow_type = arrow_schema_ptr->field(0)->type(); + return RangeBitmapFileIndexReader::Create(arrow_type, start, length, input_stream, pool); +} + +Result> RangeBitmapFileIndex::CreateWriter( + ::ArrowSchema* arrow_schema, const std::shared_ptr& pool) const { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr arrow_schema_ptr, + arrow::ImportSchema(arrow_schema)); + if (arrow_schema_ptr->num_fields() != 1) { + return Status::Invalid( + "invalid schema for RangeBitmapFileIndexWriter, supposed to have single field."); + } + const auto arrow_field = arrow_schema_ptr->field(0); + return RangeBitmapFileIndexWriter::Create(arrow_schema_ptr, arrow_field->name(), options_, + pool); +} + +Result> RangeBitmapFileIndexWriter::Create( + const std::shared_ptr& arrow_schema, const std::string& field_name, + const std::map& options, const std::shared_ptr& pool) { + const 
auto field = arrow_schema->GetFieldByName(field_name); + if (!field) { + return Status::Invalid(fmt::format("Field not found in schema: {}", field_name)); + } + PAIMON_ASSIGN_OR_RAISE(FieldType field_type, + FieldTypeUtils::ConvertToFieldType(field->type()->id())); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr shared_key_factory, + KeyFactory::Create(field_type)); + PAIMON_ASSIGN_OR_RAISE(int64_t parsed_chunk_size, + MemorySize::ParseBytes(KeyFactory::kDefaultChunkSize)); + if (const auto chunk_size_it = options.find(RangeBitmapFileIndex::kChunkSize); + chunk_size_it != options.end()) { + PAIMON_ASSIGN_OR_RAISE(parsed_chunk_size, MemorySize::ParseBytes(chunk_size_it->second)); + } + auto struct_type = arrow::struct_({field}); + PAIMON_ASSIGN_OR_RAISE(auto appender_ptr, RangeBitmap::Appender::Create( + shared_key_factory, parsed_chunk_size, pool)); + return std::make_shared( + struct_type, field->type(), options, pool, shared_key_factory, std::move(appender_ptr)); +} + +Status RangeBitmapFileIndexWriter::AddBatch(::ArrowArray* batch) { + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr array, + arrow::ImportArray(batch, struct_type_)); + auto struct_array = std::dynamic_pointer_cast(array); + if (!struct_array || struct_array->num_fields() != 1) { + return Status::Invalid( + "invalid batch for RangeBitmapFileIndexWriter, supposed to be struct array with single " + "field."); + } + PAIMON_ASSIGN_OR_RAISE(std::vector array_values, + LiteralConverter::ConvertLiteralsFromArray(*(struct_array->field(0)), + /*own_data=*/true)); + for (const auto& literal : array_values) { + appender_->Append(literal); + } + return Status::OK(); +} + +Result> RangeBitmapFileIndexWriter::SerializedBytes() const { + return appender_->Serialize(); +} + +RangeBitmapFileIndexWriter::RangeBitmapFileIndexWriter( + const std::shared_ptr& struct_type, + const std::shared_ptr& arrow_type, + const std::map& options, const std::shared_ptr& pool, + const std::shared_ptr& key_factory, std::unique_ptr 
appender) + : struct_type_(struct_type), + arrow_type_(arrow_type), + options_(options), + pool_(pool), + key_factory_(key_factory), + appender_(std::move(appender)) {} + +Result> RangeBitmapFileIndexReader::Create( + const std::shared_ptr& arrow_type, const int32_t start, const int32_t length, + const std::shared_ptr& input_stream, const std::shared_ptr& pool) { + PAIMON_ASSIGN_OR_RAISE(FieldType field_type, + FieldTypeUtils::ConvertToFieldType(arrow_type->id())); + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr range_bitmap, + RangeBitmap::Create(input_stream, start, field_type, pool)); + return std::shared_ptr( + new RangeBitmapFileIndexReader(std::move(range_bitmap))); +} + +RangeBitmapFileIndexReader::RangeBitmapFileIndexReader(std::unique_ptr range_bitmap) + : range_bitmap_(std::move(range_bitmap)) {} + +Result> RangeBitmapFileIndexReader::VisitEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + return self->range_bitmap_->Eq(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitNotEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + return self->range_bitmap_->Neq(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIn( + const std::vector& literals) { + return std::make_shared( + [self = shared_from_this(), literals]() -> Result { + return self->range_bitmap_->In(literals); + }); +} + +Result> RangeBitmapFileIndexReader::VisitNotIn( + const std::vector& literals) { + return std::make_shared( + [self = shared_from_this(), literals]() -> Result { + return self->range_bitmap_->NotIn(literals); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIsNull() { + return std::make_shared( + [self = shared_from_this()]() -> Result { + return self->range_bitmap_->IsNull(); + }); +} + +Result> RangeBitmapFileIndexReader::VisitIsNotNull() { + return std::make_shared( + [self = shared_from_this()]() -> Result { + return 
self->range_bitmap_->IsNotNull(); + }); +} + +Result> RangeBitmapFileIndexReader::VisitGreaterThan( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + return self->range_bitmap_->Gt(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitLessThan( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + return self->range_bitmap_->Lt(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitGreaterOrEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + return self->range_bitmap_->Gte(literal); + }); +} + +Result> RangeBitmapFileIndexReader::VisitLessOrEqual( + const Literal& literal) { + return std::make_shared( + [self = shared_from_this(), literal]() -> Result { + return self->range_bitmap_->Lte(literal); + }); +} + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h new file mode 100644 index 00000000..c15e6ce6 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index.h @@ -0,0 +1,111 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include "paimon/common/file_index/bitmap/bitmap_file_index.h" +#include "paimon/common/file_index/rangebitmap/range_bitmap.h" +#include "paimon/file_index/file_index_reader.h" +#include "paimon/file_index/file_index_writer.h" +#include "paimon/file_index/file_indexer.h" +#include "paimon/predicate/literal.h" +#include "paimon/result.h" +#include "paimon/status.h" + +namespace paimon { + +class RangeBitmapFileIndexWriter; +class RangeBitmapFileIndexReader; + +class RangeBitmapFileIndex final : public FileIndexer { + public: + explicit RangeBitmapFileIndex(const std::map& options); + + ~RangeBitmapFileIndex() override = default; + + Result> CreateReader( + ::ArrowSchema* arrow_schema, int32_t start, int32_t length, + const std::shared_ptr& input_stream, + const std::shared_ptr& pool) const override; + + Result> CreateWriter( + ::ArrowSchema* arrow_schema, const std::shared_ptr& pool) const override; + + public: + static constexpr char kChunkSize[] = "chunk-size"; + + private: + std::map options_; +}; + +class RangeBitmapFileIndexWriter final : public FileIndexWriter { + public: + static Result> Create( + const std::shared_ptr& arrow_schema, const std::string& field_name, + const std::map& options, const std::shared_ptr& pool); + + Status AddBatch(::ArrowArray* batch) override; + Result> SerializedBytes() const override; + + RangeBitmapFileIndexWriter(const std::shared_ptr& struct_type, + const std::shared_ptr& arrow_type, + const std::map& options, + const std::shared_ptr& pool, + const std::shared_ptr& key_factory, + std::unique_ptr appender); + + private: + /// @note struct_type_ contains only one field with arrow_type_, used for import from C + /// interface. 
+ std::shared_ptr struct_type_; + std::shared_ptr arrow_type_; + std::map options_; + std::shared_ptr pool_; + std::shared_ptr key_factory_; + std::unique_ptr appender_; +}; + +class RangeBitmapFileIndexReader final + : public FileIndexReader, + public std::enable_shared_from_this { + public: + static Result> Create( + const std::shared_ptr& arrow_type, int32_t start, int32_t length, + const std::shared_ptr& input_stream, const std::shared_ptr& pool); + + private: + explicit RangeBitmapFileIndexReader(std::unique_ptr range_bitmap); + + Result> VisitEqual(const Literal& literal) override; + Result> VisitNotEqual(const Literal& literal) override; + Result> VisitIn(const std::vector& literals) override; + Result> VisitNotIn( + const std::vector& literals) override; + Result> VisitIsNull() override; + Result> VisitIsNotNull() override; + Result> VisitGreaterThan(const Literal& literal) override; + Result> VisitLessThan(const Literal& literal) override; + Result> VisitGreaterOrEqual(const Literal& literal) override; + Result> VisitLessOrEqual(const Literal& literal) override; + + std::unique_ptr range_bitmap_; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp new file mode 100644 index 00000000..c27736fd --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.cpp @@ -0,0 +1,32 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h" + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +namespace paimon { + +RangeBitmapFileIndexFactory::~RangeBitmapFileIndexFactory() = default; + +Result> RangeBitmapFileIndexFactory::Create( + const std::map& options) const { + return std::make_unique(options); +} + +REGISTER_PAIMON_FACTORY(RangeBitmapFileIndexFactory); + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h new file mode 100644 index 00000000..844bd5d2 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_factory.h @@ -0,0 +1,41 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include + +#include "paimon/file_index/file_indexer.h" +#include "paimon/file_index/file_indexer_factory.h" +#include "paimon/result.h" + +namespace paimon { + +class PAIMON_EXPORT RangeBitmapFileIndexFactory final : public FileIndexerFactory { + public: + const char* Identifier() const override { + return "range-bitmap"; + } + + ~RangeBitmapFileIndexFactory() override; + + Result> Create( + const std::map& options) const override; +}; + +} // namespace paimon diff --git a/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp new file mode 100644 index 00000000..7a78fcc0 --- /dev/null +++ b/src/paimon/common/file_index/rangebitmap/range_bitmap_file_index_test.cpp @@ -0,0 +1,567 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "paimon/common/file_index/rangebitmap/range_bitmap_file_index.h" + +#include + +#include +#include +#include + +#include "arrow/api.h" +#include "arrow/c/bridge.h" +#include "paimon/common/utils/arrow/status_utils.h" +#include "paimon/file_index/bitmap_index_result.h" +#include "paimon/file_index/file_index_format.h" +#include "paimon/file_index/file_indexer_factory.h" +#include "paimon/fs/file_system.h" +#include "paimon/fs/local/local_file_system.h" +#include "paimon/io/byte_array_input_stream.h" +#include "paimon/memory/memory_pool.h" +#include "paimon/predicate/literal.h" +#include "paimon/testing/utils/testharness.h" + +namespace paimon::test { + +class RangeBitmapFileIndexTest : public ::testing::Test { + public: + void SetUp() override { + pool_ = GetDefaultPool(); + fs_ = std::make_shared(); + } + + void TearDown() override { + index_buffer_.reset(); + pool_.reset(); + fs_.reset(); + } + + static void CheckResult(const std::shared_ptr& result, + const std::vector& expected) { + const auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap32* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + const RoaringBitmap32 expected_bitmap = RoaringBitmap32::From(expected); + ASSERT_EQ(*bitmap, expected_bitmap) + << "result=" << bitmap->ToString() << ", expected=" << expected_bitmap.ToString(); + } + + // Helper function to create writer, serialize, and create reader + template + Result> CreateReaderForTest( + const std::shared_ptr& arrow_type, const std::vector& test_data, + PAIMON_UNIQUE_PTR* serialized_bytes_out) { + return CreateReaderForTest(arrow_type, test_data, {}, {}, + serialized_bytes_out); + } + + // Overload with NULL support - null_indices specifies which positions are NULL + template + Result> CreateReaderForTest( + const std::shared_ptr& arrow_type, const std::vector& test_data, + const std::set& null_indices, PAIMON_UNIQUE_PTR* serialized_bytes_out) { + 
return CreateReaderForTest(arrow_type, test_data, null_indices, {}, + serialized_bytes_out); + } + + // Overload with options to exercise writer configuration such as chunk size. + template + Result> CreateReaderForTest( + const std::shared_ptr& arrow_type, const std::vector& test_data, + const std::map& options, + PAIMON_UNIQUE_PTR* serialized_bytes_out) { + return CreateReaderForTest(arrow_type, test_data, {}, options, + serialized_bytes_out); + } + + // Full overload with NULL support and options + template + Result> CreateReaderForTest( + const std::shared_ptr& arrow_type, const std::vector& test_data, + const std::set& null_indices, const std::map& options, + PAIMON_UNIQUE_PTR* serialized_bytes_out); + + protected: + std::shared_ptr pool_; + + private: + std::shared_ptr fs_; + std::shared_ptr index_buffer_; +}; + +template +Result> RangeBitmapFileIndexTest::CreateReaderForTest( + const std::shared_ptr& arrow_type, const std::vector& test_data, + const std::set& null_indices, const std::map& options, + PAIMON_UNIQUE_PTR* serialized_bytes_out) { + // Create Arrow array from test data, replacing values at null_indices with NULL + auto builder = std::make_shared(); + for (size_t i = 0; i < test_data.size(); ++i) { + if (null_indices.count(static_cast(i)) > 0) { + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->AppendNull()); + } else { + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Append(test_data[i])); + } + } + std::shared_ptr arrow_array; + PAIMON_RETURN_NOT_OK_FROM_ARROW(builder->Finish(&arrow_array)); + // Wrap in StructArray (single field) as required by RangeBitmapFileIndexWriter + arrow::FieldVector fields = {arrow::field("test_field", arrow_type)}; + PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr struct_array, + arrow::StructArray::Make({arrow_array}, fields)); + auto c_array = std::make_unique<::ArrowArray>(); + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*struct_array, c_array.get())); + // Create schema for the field + const auto schema = 
arrow::schema({arrow::field("test_field", arrow_type)}); + // Create writer + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr writer, + RangeBitmapFileIndexWriter::Create(schema, "test_field", options, pool_)); + // Add the batch + PAIMON_RETURN_NOT_OK(writer->AddBatch(c_array.get())); + // Get serialized payload + PAIMON_ASSIGN_OR_RAISE(PAIMON_UNIQUE_PTR serialized_bytes, writer->SerializedBytes()); + if (!serialized_bytes || serialized_bytes->size() == 0) { + return Status::Invalid("Serialized bytes is empty"); + } + *serialized_bytes_out = std::move(serialized_bytes); + const auto input_stream = std::make_shared( + (*serialized_bytes_out)->data(), (*serialized_bytes_out)->size()); + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, + RangeBitmapFileIndexReader::Create( + arrow_type, 0, static_cast((*serialized_bytes_out)->size()), + input_stream, pool_)); + return reader; +} + +// Test with all NULL values +TEST_F(RangeBitmapFileIndexTest, TestAllNullValues) { + constexpr int num_rows = 10; + std::vector test_data(num_rows, 0); // placeholders, all will be NULL + std::set null_indices; + for (int32_t i = 0; i < num_rows; ++i) { + null_indices.insert(i); + } + const auto& arrow_type = arrow::int32(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, null_indices, &serialized_bytes))); + + // Test IsNull - should return all positions + std::vector all_positions(num_rows); + std::iota(all_positions.begin(), all_positions.end(), 0); + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, all_positions); + + // Test IsNotNull - should return empty + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, {}); + + // Test other queries with all NULL data - should return empty + ASSERT_OK_AND_ASSIGN(auto eq_result, reader->VisitEqual(Literal(FieldType::INT, 42))); + CheckResult(eq_result, {}); + + 
ASSERT_OK_AND_ASSIGN(auto gt_result, reader->VisitGreaterThan(Literal(FieldType::INT, 0))); + CheckResult(gt_result, {}); + + ASSERT_OK_AND_ASSIGN(auto lt_result, reader->VisitLessThan(Literal(FieldType::INT, 100))); + CheckResult(lt_result, {}); +} + +// Test with empty data - no rows at all +TEST_F(RangeBitmapFileIndexTest, TestEmptyRangeBitmap) { + std::vector test_data; // empty + const auto& arrow_type = arrow::int32(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + + // All queries should return empty results for empty bitmap + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, {}); + + ASSERT_OK_AND_ASSIGN(auto eq_result, reader->VisitEqual(Literal(FieldType::INT, 42))); + CheckResult(eq_result, {}); + + ASSERT_OK_AND_ASSIGN(auto gt_result, reader->VisitGreaterThan(Literal(FieldType::INT, 0))); + CheckResult(gt_result, {}); + + ASSERT_OK_AND_ASSIGN(auto lt_result, reader->VisitLessThan(Literal(FieldType::INT, 100))); + CheckResult(lt_result, {}); + + ASSERT_OK_AND_ASSIGN(auto gte_result, reader->VisitGreaterOrEqual(Literal(FieldType::INT, 0))); + CheckResult(gte_result, {}); + + ASSERT_OK_AND_ASSIGN(auto lte_result, reader->VisitLessOrEqual(Literal(FieldType::INT, 100))); + CheckResult(lte_result, {}); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexMultiChunk) { + // Use many distinct values and a very small chunk size to force multiple + // dictionary chunks when writing the range bitmap index. + std::vector test_data(100); + std::iota(test_data.begin(), test_data.end(), 0); + + const auto& arrow_type = arrow::int32(); + std::map options; + // Configure a very small chunk size in bytes so that the dictionary must + // be split into multiple chunks. 
+ options[RangeBitmapFileIndex::kChunkSize] = "86b"; + + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, options, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_0_result, reader->VisitEqual(Literal(static_cast(0)))); + CheckResult(eq_0_result, {0}); + + ASSERT_OK_AND_ASSIGN(auto eq_50_result, reader->VisitEqual(Literal(static_cast(50)))); + CheckResult(eq_50_result, {50}); + ASSERT_OK_AND_ASSIGN(auto eq_51_result, reader->VisitEqual(Literal(static_cast(51)))); + CheckResult(eq_51_result, {51}); + ASSERT_OK_AND_ASSIGN(auto eq_99_result, reader->VisitEqual(Literal(static_cast(99)))); + CheckResult(eq_99_result, {99}); + + ASSERT_OK_AND_ASSIGN(auto gt_49_result, + reader->VisitGreaterThan(Literal(static_cast(49)))); + // Positions 50..99 + std::vector expected_gt_49(50); + std::iota(expected_gt_49.begin(), expected_gt_49.end(), 50); + CheckResult(gt_49_result, expected_gt_49); + + ASSERT_OK_AND_ASSIGN(auto lt_10_result, + reader->VisitLessThan(Literal(static_cast(10)))); + // Positions 0..9 + std::vector expected_lt_10(10); + std::iota(expected_lt_10.begin(), expected_lt_10.end(), 0); + CheckResult(lt_10_result, expected_lt_10); + + // is_not_null should cover all rows. 
+ std::vector all_positions(100); + std::iota(all_positions.begin(), all_positions.end(), 0); + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +// test data mixed with NULLs +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexBigInt) { + // Data: 10, NULL, 10, 30, NULL, 40, 50 (NULLs at positions 1 and 4) + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; // 20 at pos 1,4 will be NULL + std::set null_indices = {1, 4}; + const auto& arrow_type = arrow::int64(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, null_indices, &serialized_bytes))); + + // Test equality queries (NULL positions excluded) + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); // positions 0 and 2 have value 10 + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {}); // no value 20 (positions 1 and 4 are NULL) + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); // position 3 has value 30 + ASSERT_OK_AND_ASSIGN(auto eq_40_result, reader->VisitEqual(Literal(static_cast(40)))); + CheckResult(eq_40_result, {5}); // position 5 has value 40 + ASSERT_OK_AND_ASSIGN(auto eq_50_result, reader->VisitEqual(Literal(static_cast(50)))); + CheckResult(eq_50_result, {6}); // position 6 has value 50 + + // Test range queries (NULL positions excluded) + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); // values > 25: 30, 40, 50 + + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 2, 3}); // values < 35: 10, 10, 30 (NULLs excluded) + ASSERT_OK_AND_ASSIGN(auto gte_20_result, + 
reader->VisitGreaterOrEqual(Literal(static_cast(20)))); + CheckResult(gte_20_result, {3, 5, 6}); // values >= 20: 30, 40, 50 (NULLs excluded) + ASSERT_OK_AND_ASSIGN(auto lte_40_result, + reader->VisitLessOrEqual(Literal(static_cast(40)))); + CheckResult(lte_40_result, {0, 2, 3, 5}); // values <= 40: 10, 10, 30, 40 (NULLs excluded) + + // Test IN queries (NULL positions excluded) + std::vector in_values = {Literal(static_cast(10)), + Literal(static_cast(30))}; + ASSERT_OK_AND_ASSIGN(auto in_result, reader->VisitIn(in_values)); + CheckResult(in_result, {0, 2, 3}); // positions with values 10 or 30 + ASSERT_OK_AND_ASSIGN(auto not_in_result, reader->VisitNotIn(in_values)); + CheckResult(not_in_result, {5, 6}); // positions with values NOT 10 or 30 (NULLs excluded) + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {1, 4}); // positions 1 and 4 are NULL + std::vector not_null_positions = {0, 2, 3, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, not_null_positions); // non-NULL positions +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexInt) { + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; + const auto& arrow_type = arrow::int32(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + + // Test equality queries + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); + + // Test range queries + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); 
// values > 25: 30, 40, 50 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); + ASSERT_OK_AND_ASSIGN(auto gte_20_result, + reader->VisitGreaterOrEqual(Literal(static_cast(20)))); + CheckResult(gte_20_result, {1, 3, 4, 5, 6}); + ASSERT_OK_AND_ASSIGN(auto lte_40_result, + reader->VisitLessOrEqual(Literal(static_cast(40)))); + CheckResult(lte_40_result, {0, 1, 2, 3, 4, 5}); + + // Test empty result cases for INT values that don't exist + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_int_result, + reader->VisitEqual(Literal(static_cast(25)))); + CheckResult(eq_nonexistent_int_result, {}); // 25 doesn't exist in data {10,20,30,40,50} + + ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_high_int_result, + reader->VisitEqual(Literal(static_cast(100)))); + CheckResult(eq_out_of_range_high_int_result, {}); // Value above maximum (50) + + ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_low_int_result, + reader->VisitEqual(Literal(static_cast(5)))); + CheckResult(eq_out_of_range_low_int_result, {}); // Value below minimum (10) + + // Test NotEqual operations + ASSERT_OK_AND_ASSIGN(auto ne_10_result, + reader->VisitNotEqual(Literal(static_cast(10)))); + CheckResult(ne_10_result, {1, 3, 4, 5, 6}); // All positions except {0, 2} where 10 appears + + ASSERT_OK_AND_ASSIGN(auto ne_nonexistent_result, + reader->VisitNotEqual(Literal(static_cast(99)))); + CheckResult(ne_nonexistent_result, {0, 1, 2, 3, 4, 5, 6}); // All positions (non-empty result) + + // Test NotIn operations + ASSERT_OK_AND_ASSIGN(auto not_in_single_result, + reader->VisitNotIn({Literal(static_cast(10))})); + CheckResult(not_in_single_result, {1, 3, 4, 5, 
6}); // All positions except where 10 appears + + ASSERT_OK_AND_ASSIGN( + auto not_in_multiple_result, + reader->VisitNotIn({Literal(static_cast(10)), Literal(static_cast(20))})); + CheckResult(not_in_multiple_result, {3, 5, 6}); // Positions not containing 10 or 20 + + ASSERT_OK_AND_ASSIGN(auto not_in_nonexistent_result, + reader->VisitNotIn({Literal(static_cast(99))})); + CheckResult(not_in_nonexistent_result, + {0, 1, 2, 3, 4, 5, 6}); // All positions (non-empty result) + + // Test NotIn with empty result - all values are NOT IN the complete set + std::vector all_values = { + Literal(static_cast(10)), Literal(static_cast(20)), + Literal(static_cast(30)), Literal(static_cast(40)), + Literal(static_cast(50))}; + ASSERT_OK_AND_ASSIGN(auto not_in_all_result, reader->VisitNotIn(all_values)); + CheckResult(not_in_all_result, + {}); // Empty result - no positions left when excluding all existing values +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexSmallInt) { + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; + const auto& arrow_type = arrow::int16(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); // values > 25: 30, 40, 50 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); 
+ CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexTinyInt) { + std::vector test_data = {10, 20, 10, 30, 20, 40, 50}; + const auto& arrow_type = arrow::int8(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_result, reader->VisitEqual(Literal(static_cast(10)))); + CheckResult(eq_10_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_20_result, reader->VisitEqual(Literal(static_cast(20)))); + CheckResult(eq_20_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_30_result, reader->VisitEqual(Literal(static_cast(30)))); + CheckResult(eq_30_result, {3}); + ASSERT_OK_AND_ASSIGN(auto gt_25_result, + reader->VisitGreaterThan(Literal(static_cast(25)))); + CheckResult(gt_25_result, {3, 5, 6}); // values > 25: 30, 40, 50 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, + reader->VisitLessThan(Literal(static_cast(35)))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexBoolean) { + std::vector test_data = {true, false, true, true, false, true, false}; + const auto& arrow_type = arrow::boolean(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_true_result, reader->VisitEqual(Literal(true))); + CheckResult(eq_true_result, {0, 2, 3, 5}); // positions with 
value true + ASSERT_OK_AND_ASSIGN(auto eq_false_result, reader->VisitEqual(Literal(false))); + CheckResult(eq_false_result, {1, 4, 6}); // positions with value false + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexFloat) { + std::vector test_data = {10.5f, 20.3f, 10.5f, 30.7f, 20.3f, 40.1f, 50.9f}; + const auto& arrow_type = arrow::float32(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_5_result, reader->VisitEqual(Literal(10.5f))); + CheckResult(eq_10_5_result, {0, 2}); // positions with value 10.5 + ASSERT_OK_AND_ASSIGN(auto eq_20_3_result, reader->VisitEqual(Literal(20.3f))); + CheckResult(eq_20_3_result, {1, 4}); // positions with value 20.3 + ASSERT_OK_AND_ASSIGN(auto eq_30_7_result, reader->VisitEqual(Literal(30.7f))); + CheckResult(eq_30_7_result, {3}); // position with value 30.7 + ASSERT_OK_AND_ASSIGN(auto gt_24_9_result, reader->VisitGreaterThan(Literal(24.9f))); + CheckResult(gt_24_9_result, {3, 5, 6}); // values > 24.9: 30.7, 40.1, 50.9 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, reader->VisitLessThan(Literal(35.0f))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35.0 + + // Test empty result cases for float values that don't exist + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_float_result, reader->VisitEqual(Literal(25.0f))); + CheckResult(eq_nonexistent_float_result, {}); // 25.0 doesn't exist in data + + ASSERT_OK_AND_ASSIGN(auto eq_out_of_range_high_result, reader->VisitEqual(Literal(100.0f))); + CheckResult(eq_out_of_range_high_result, {}); // Value above maximum + + ASSERT_OK_AND_ASSIGN(auto 
eq_out_of_range_low_result, reader->VisitEqual(Literal(5.0f))); + CheckResult(eq_out_of_range_low_result, {}); // Value below minimum + + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexDouble) { + std::vector test_data = {10.5, 20.3, 10.5, 30.7, 20.3, 40.1, 50.9}; + const auto& arrow_type = arrow::float64(); + PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_10_5_result, reader->VisitEqual(Literal(10.5))); + CheckResult(eq_10_5_result, {0, 2}); // positions with value 10.5 + ASSERT_OK_AND_ASSIGN(auto eq_20_3_result, reader->VisitEqual(Literal(20.3))); + CheckResult(eq_20_3_result, {1, 4}); // positions with value 20.3 + ASSERT_OK_AND_ASSIGN(auto eq_30_7_result, reader->VisitEqual(Literal(30.7))); + CheckResult(eq_30_7_result, {3}); // position with value 30.7 + ASSERT_OK_AND_ASSIGN(auto gt_24_9_result, reader->VisitGreaterThan(Literal(24.9))); + CheckResult(gt_24_9_result, {3, 5, 6}); // values > 24.9: 30.7, 40.1, 50.9 + ASSERT_OK_AND_ASSIGN(auto lt_35_result, reader->VisitLessThan(Literal(35.0))); + CheckResult(lt_35_result, {0, 1, 2, 3, 4}); // values < 35.0 + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +TEST_F(RangeBitmapFileIndexTest, TestWriteAndReadRangeBitmapIndexDate) { + std::vector test_data = {42432, 24649, 42432, 38001, 24649, 50000, 12000}; + const auto& arrow_type = arrow::date32(); + 
PAIMON_UNIQUE_PTR serialized_bytes; + ASSERT_OK_AND_ASSIGN(auto reader, (CreateReaderForTest( + arrow_type, test_data, &serialized_bytes))); + ASSERT_OK_AND_ASSIGN(auto eq_42432_result, reader->VisitEqual(Literal(FieldType::DATE, 42432))); + CheckResult(eq_42432_result, {0, 2}); + ASSERT_OK_AND_ASSIGN(auto eq_24649_result, reader->VisitEqual(Literal(FieldType::DATE, 24649))); + CheckResult(eq_24649_result, {1, 4}); + ASSERT_OK_AND_ASSIGN(auto eq_38001_result, reader->VisitEqual(Literal(FieldType::DATE, 38001))); + CheckResult(eq_38001_result, {3}); + ASSERT_OK_AND_ASSIGN(auto gt_result, + reader->VisitGreaterOrEqual(Literal(FieldType::DATE, 30000))); + CheckResult(gt_result, {0, 2, 3, 5}); // 42432, 38001, 50000 + + ASSERT_OK_AND_ASSIGN(auto lt_result, reader->VisitLessThan(Literal(FieldType::DATE, 40000))); + CheckResult(lt_result, {1, 3, 4, 6}); // 24649, 38001, 12000 + + // Test empty result cases - values that don't exist in the data + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_low_result, + reader->VisitEqual(Literal(FieldType::DATE, 47432))); + CheckResult(eq_nonexistent_low_result, {}); + + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_mid_result, + reader->VisitEqual(Literal(FieldType::DATE, 30000))); + CheckResult(eq_nonexistent_mid_result, {}); // Value in middle range but doesn't exist + + ASSERT_OK_AND_ASSIGN(auto eq_nonexistent_high_result, + reader->VisitEqual(Literal(FieldType::DATE, 60000))); + CheckResult(eq_nonexistent_high_result, {}); // Value above maximum (50000) + + // Test range queries that should return empty results + ASSERT_OK_AND_ASSIGN(auto gt_all_result, + reader->VisitGreaterOrEqual(Literal(FieldType::DATE, 60000))); + CheckResult(gt_all_result, {}); // Greater than maximum should return empty + + ASSERT_OK_AND_ASSIGN(auto lt_all_result, + reader->VisitLessThan(Literal(FieldType::DATE, 10000))); + CheckResult(lt_all_result, {}); // Less than minimum should return empty + + ASSERT_OK_AND_ASSIGN(auto is_null_result, reader->VisitIsNull()); + 
CheckResult(is_null_result, {}); + std::vector all_positions = {0, 1, 2, 3, 4, 5, 6}; + ASSERT_OK_AND_ASSIGN(auto is_not_null_result, reader->VisitIsNotNull()); + CheckResult(is_not_null_result, all_positions); +} + +} // namespace paimon::test diff --git a/test/inte/read_inte_with_index_test.cpp b/test/inte/read_inte_with_index_test.cpp index bd26f187..957d62b9 100644 --- a/test/inte/read_inte_with_index_test.cpp +++ b/test/inte/read_inte_with_index_test.cpp @@ -45,6 +45,7 @@ #include "paimon/data/timestamp.h" #include "paimon/defs.h" #include "paimon/factories/factory_creator.h" +#include "paimon/fs/local/local_file_system.h" #include "paimon/memory/bytes.h" #include "paimon/memory/memory_pool.h" #include "paimon/metrics.h" @@ -372,6 +373,295 @@ class ReadInteWithIndexTest : public testing::Test, } } + void CheckResultForRangeBitmap(const std::string& path, + const std::shared_ptr& arrow_data_type, + const std::shared_ptr& split) const { + { + // test with no predicate - return all 8 rows + std::shared_ptr expected_array; + auto array_status = arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, + { + R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"] +])", + R"([ +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"] +])", + R"([ +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, null, null, null, null, null, null] +])", + R"([ +[0, null, null, null, null, null, "null_row"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + std::cout << array_status.message() << std::endl; + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, /*predicate=*/nullptr, expected_array); + } + { + // Test equal predicate: f0 = 17 -> row 0 + auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(17)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 
100, 1.1, 11.11, 19739, "row0"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test less than predicate: f0 < 10 -> rows 1,2,3,4 (values 3,5,7,9) + auto predicate = PredicateBuilder::LessThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(10)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test greater than predicate: f0 > 5 -> rows 0,3,4,7 (values 17,7,9,10) + auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(5)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test is null predicate on f0 -> rows 5, 6 + auto predicate = + PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::INT); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, null, null, null, null, null, null], +[0, null, null, null, null, null, "null_row"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test is not null predicate on f0 -> rows 0,1,2,3,4,7 + auto predicate = + 
PredicateBuilder::IsNotNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::INT); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test in predicate: f0 in (3, 7) -> rows 1, 3 + auto predicate = PredicateBuilder::In( + /*field_index=*/0, /*field_name=*/"f0", FieldType::INT, {Literal(3), Literal(7)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test not in predicate: f0 not in (3, 7) -> rows 0,2,4,7 (excluding null rows 5,6) + auto predicate = PredicateBuilder::NotIn( + /*field_index=*/0, /*field_name=*/"f0", FieldType::INT, {Literal(3), Literal(7)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] + ])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f1 (BIGINT) predicates + { + // Test greater than predicate: f1 > 300 -> rows 3,4,7 (values 400,500,600) + auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(300L)); + std::shared_ptr 
expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f2 (FLOAT) predicates + { + // Test less than predicate: f2 < 4.0 -> rows 0,1,2 (values 1.1,2.2,3.3) + auto predicate = PredicateBuilder::LessThan(/*field_index=*/2, /*field_name=*/"f2", + FieldType::FLOAT, Literal(4.0f)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + // Test date type + { + // Test less or equal predicate: f4 <= 19725 -> row 1 (value 19725) + auto predicate = + PredicateBuilder::LessOrEqual(/*field_index=*/4, /*field_name=*/"f4", + FieldType::DATE, Literal(FieldType::DATE, 19725)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test f3 (DOUBLE) predicates + { + // Test greater or equal predicate: f3 >= 44.44 -> rows 3,4,7 (values 44.44,55.55,66.66) + auto predicate = PredicateBuilder::GreaterOrEqual( + /*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(44.44)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 
66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test BETWEEN predicate on f1 (BIGINT) + { + // Test f1 BETWEEN 200 AND 500 -> rows 1,2,3,4 (values 200,300,400,500) + auto predicate = + PredicateBuilder::Between(/*field_index=*/1, /*field_name=*/"f1", FieldType::BIGINT, + Literal(200L), Literal(500L)); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 3, 200, 2.2, 22.22, 19725, "row1"], +[0, 5, 300, 3.3, 33.33, 19727, "row2"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + + // Test IN predicate on f2 (FLOAT) + { + // Test f2 IN (1.1, 4.4, 6.6) -> rows 0,3,7 (values 1.1,4.4,6.6) + auto predicate = + PredicateBuilder::In(/*field_index=*/2, /*field_name=*/"f2", FieldType::FLOAT, + {Literal(1.1f), Literal(4.4f), Literal(6.6f)}); + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"], +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, predicate, expected_array); + } + { + // Test nested composite: (f0 = 3 OR f0 = 17) AND f1 < 200 + // (f0 = 3 OR f0 = 17): matches rows 0,1 + // f1 < 200: matches rows 0 (f1=100) + // Combined AND: matches rows 0 + auto predicate1 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(3)); + auto predicate2 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(17)); + ASSERT_OK_AND_ASSIGN(auto or_predicate, PredicateBuilder::Or({predicate1, predicate2})); + + auto predicate3 = 
PredicateBuilder::LessThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(200L)); + ASSERT_OK_AND_ASSIGN(auto and_predicate, + PredicateBuilder::And({or_predicate, predicate3})); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 17, 100, 1.1, 11.11, 19739, "row0"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, and_predicate, expected_array); + } + { + // Test AND predicate with mixed types: f0 > 5 AND f1 > 100 + // f0 > 5: matches rows 0,3,4,7 + // f1 > 100: matches rows 1,2,3,4,7 + // Combined AND: matches rows 3,4,7 + auto predicate1 = PredicateBuilder::GreaterThan(/*field_index=*/0, /*field_name=*/"f0", + FieldType::INT, Literal(5)); + auto predicate2 = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", + FieldType::BIGINT, Literal(100L)); + ASSERT_OK_AND_ASSIGN(auto and_predicate, + PredicateBuilder::And({predicate1, predicate2})); + + std::shared_ptr expected_array; + auto array_status = + arrow::ipc::internal::json::ChunkedArrayFromJSON(arrow_data_type, {R"([ +[0, 7, 400, 4.4, 44.44, 19729, "row3"], +[0, 9, 500, 5.5, 55.55, 19731, "row4"], +[0, 10, 600, 6.6, 66.66, 19732, "row7"] +])"}, + &expected_array); + ASSERT_TRUE(array_status.ok()); + CheckResult(path, {split}, and_predicate, expected_array); + } + } + void CheckResultForBsi(const std::string& path, const std::shared_ptr& arrow_data_type, const std::shared_ptr split) const { @@ -2072,6 +2362,95 @@ TEST_P(ReadInteWithIndexTest, TestWithIndexWithoutRegistered) { } } +TEST_P(ReadInteWithIndexTest, TestRangeBitmapIndex) { + auto [file_format, enable_prefetch] = GetParam(); + std::string path = + GetDataDir() + file_format + "/append_with_rangebitmap.db/append_with_rangebitmap/"; + std::string file_name; + if (file_format == "orc") { + file_name = "data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc"; + } else if (file_format == "parquet") { + file_name = 
file_name = "data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet"; + } + + std::vector read_fields = {SpecialFields::ValueKind(), + DataField(0, arrow::field("f0", arrow::int32())), + DataField(1, arrow::field("f1", arrow::int64())), + DataField(2, arrow::field("f2", arrow::float32())), + DataField(3, arrow::field("f3", arrow::float64())), + DataField(4, arrow::field("f4", arrow::date32())), + DataField(5, arrow::field("f5", arrow::utf8()))}; + std::shared_ptr arrow_data_type = + DataField::ConvertDataFieldsToArrowStructType(read_fields); + + auto data_file_meta = std::make_shared( + file_name, /*file_size=*/1288, + /*row_count=*/8, /*min_key=*/BinaryRow::EmptyRow(), + /*max_key=*/BinaryRow::EmptyRow(), /*key_stats=*/SimpleStats::EmptyStats(), + /*value_stats=*/SimpleStats::EmptyStats(), /*min_sequence_number=*/0, + /*max_sequence_number=*/7, /*schema_id=*/0, + /*level=*/0, + /*extra_files=*/ + std::vector>({file_name + ".index"}), + /*creation_time=*/Timestamp(0ll, 0), /*delete_row_count=*/0, + /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + + DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, + /*bucket_path=*/path + "bucket-0/", {data_file_meta}); + ASSERT_OK_AND_ASSIGN(auto split, + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); + + // Run comprehensive range bitmap index tests + CheckResultForRangeBitmap(path, arrow_data_type, split); +} + +TEST_P(ReadInteWithIndexTest, TestRangeBitmapIndexMultiChunk) { + auto [file_format, enable_prefetch] = GetParam(); + std::string path = + GetDataDir() + file_format + + "/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/"; + std::string file_name; + if (file_format == "orc") { + file_name = "data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc"; + } else if (file_format == "parquet") { + file_name = 
"data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet"; + } + + std::vector read_fields = {SpecialFields::ValueKind(), + DataField(0, arrow::field("f0", arrow::int32())), + DataField(1, arrow::field("f1", arrow::int64())), + DataField(2, arrow::field("f2", arrow::float32())), + DataField(3, arrow::field("f3", arrow::float64())), + DataField(4, arrow::field("f4", arrow::date32())), + DataField(5, arrow::field("f5", arrow::utf8()))}; + std::shared_ptr arrow_data_type = + DataField::ConvertDataFieldsToArrowStructType(read_fields); + + auto data_file_meta = std::make_shared( + file_name, /*file_size=*/1413, + /*row_count=*/8, /*min_key=*/BinaryRow::EmptyRow(), + /*max_key=*/BinaryRow::EmptyRow(), /*key_stats=*/SimpleStats::EmptyStats(), + /*value_stats=*/SimpleStats::EmptyStats(), /*min_sequence_number=*/0, + /*max_sequence_number=*/7, /*schema_id=*/0, + /*level=*/0, + /*extra_files=*/ + std::vector>({file_name + ".index"}), + /*creation_time=*/Timestamp(0ll, 0), /*delete_row_count=*/0, + /*embedded_index=*/nullptr, FileSource::Append(), + /*value_stats_cols=*/std::nullopt, + /*external_path=*/std::nullopt, /*first_row_id=*/std::nullopt, /*write_cols=*/std::nullopt); + + DataSplitImpl::Builder builder(BinaryRow::EmptyRow(), /*bucket=*/0, + /*bucket_path=*/path + "bucket-0/", {data_file_meta}); + ASSERT_OK_AND_ASSIGN(auto split, + builder.WithSnapshot(1).IsStreaming(false).RawConvertible(true).Build()); + + // Run range bitmap index tests with multi-chunk test data + CheckResultForRangeBitmap(path, arrow_data_type, split); +} + TEST_P(ReadInteWithIndexTest, TestWithIOException) { auto [file_format, enable_prefetch] = GetParam(); std::string path = GetDataDir() + "/" + file_format + diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README new file mode 100644 index 00000000..52eb2755 --- /dev/null +++ 
b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/README @@ -0,0 +1,16 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc new file mode 100644 index 00000000..fe2d4db1 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index new file mode 100644 index 00000000..18ea5e60 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-5759403d-17fc-4031-b5bb-5e22b02fdb3b-0.orc.index differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 
b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 new file mode 100644 index 00000000..854ee21b Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-642b5c6e-a8d6-46d0-bd3b-686478c89f6b-0 differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 new file mode 100644 index 00000000..4786f9ea Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0 differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1 new file mode 100644 index 00000000..a8d36f53 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1 differ diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 new file mode 100644 index 00000000..0d24cba7 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 @@ -0,0 +1,39 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + 
}, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file.format" : "orc", + "file-index.in-manifest-threshold" : "1B" + }, + "timeMillis" : 1772177550729 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 new file mode 100644 index 00000000..e30127af --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-64353b80-fb7c-470e-972a-07d0717af717-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-64353b80-fb7c-470e-972a-07d0717af717-1", + "deltaManifestListSize" : 1113, + "commitUser" : "67cff790-9276-4301-aa3c-469a17418ac9", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772177555995, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + 
"nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README new file mode 100644 index 00000000..f308976e --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README @@ -0,0 +1,17 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 +range-bitmap index chunk-size: 16B + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc new file mode 100644 index 00000000..fe2d4db1 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index 
b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index new file mode 100644 index 00000000..343734e7 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-b6b64e1e-e3d2-4f08-9a36-726a96cde1be-0.orc.index differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-71437504-f8ad-4b7d-be04-28203480227d-0 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-71437504-f8ad-4b7d-be04-28203480227d-0 new file mode 100644 index 00000000..a78f32bf Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-71437504-f8ad-4b7d-be04-28203480227d-0 differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 new file mode 100644 index 00000000..6e7d09e9 Binary files /dev/null and b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0 differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 new file mode 100644 index 00000000..156d6840 Binary files /dev/null and 
b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1 differ diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 new file mode 100644 index 00000000..096f0300 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 @@ -0,0 +1,44 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.range-bitmap.f1.chunk-size" : "16B", + "file-index.range-bitmap.f2.chunk-size" : "16B", + "file-index.range-bitmap.f3.chunk-size" : "16B", + "file-index.in-manifest-threshold" : "1B", + "file-index.range-bitmap.f4.chunk-size" : "16B", + "file.format" : "orc", + "file-index.range-bitmap.f0.chunk-size" : "16B" + }, + "timeMillis" : 1772188734852 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff 
--git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 new file mode 100644 index 00000000..1e8a9f72 --- /dev/null +++ b/test/test_data/orc/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-72b69947-13fc-4b20-8db2-ffa7d607b38f-1", + "deltaManifestListSize" : 1106, + "commitUser" : "162120aa-5242-438d-bb0e-96ee933b3313", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772188737678, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README new file mode 100644 index 00000000..52eb2755 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/README @@ -0,0 +1,16 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 + +Rows 
(snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet new file mode 100644 index 00000000..19fabdca Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index new file mode 100644 index 00000000..18ea5e60 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/bucket-0/data-2fb852e2-e4b5-4807-bf92-04401ed10560-0.parquet.index differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 new file mode 100644 index 00000000..c9299ed9 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-75f07296-b729-48db-aadd-17826f0aadf9-0 
differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 new file mode 100644 index 00000000..a22d455b Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0 differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 new file mode 100644 index 00000000..49e0cd75 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/manifest/manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1 differ diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 new file mode 100644 index 00000000..cdeb25d1 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/schema/schema-0 @@ -0,0 +1,38 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.in-manifest-threshold" : "1B" + }, + "timeMillis" : 
1772163669686 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 new file mode 100644 index 00000000..fee41027 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap.db/append_with_rangebitmap/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-28f50ecd-5d0e-4366-a13b-ca1f39bfac83-1", + "deltaManifestListSize" : 1108, + "commitUser" : "95859ce1-495d-4176-8f68-f7fbd595554c", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772163672630, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README new file mode 
100644 index 00000000..f308976e --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/README @@ -0,0 +1,17 @@ +f0:int (nullable), f1:long (nullable), f2:float (nullable), f3:double (nullable), f4:date (nullable), f5:string (nullable) +no partition key +no bucket key +bucket count: -1 +range-bitmap index: f0,f1,f2,f3,f4 +range-bitmap index chunk-size: 16B + +Rows (snapshot-1): +Add: [17, 100L, 1.1f, 11.11, date('2024-01-17'), 'row0'] +Add: [3, 200L, 2.2f, 22.22, date('2024-01-03'), 'row1'] +Add: [5, 300L, 3.3f, 33.33, date('2024-01-05'), 'row2'] +Add: [7, 400L, 4.4f, 44.44, date('2024-01-07'), 'row3'] +Add: [9, 500L, 5.5f, 55.55, date('2024-01-09'), 'row4'] +Add: [null, null, null, null, null, null] +Add: [null, null, null, null, null, 'null_row'] +Add: [10, 600L, 6.6f, 66.66, date('2024-01-10'), 'row7'] +NoCompact \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet new file mode 100644 index 00000000..67d9aea9 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index new file mode 100644 index 00000000..343734e7 Binary files /dev/null and 
b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/bucket-0/data-e4673af1-afcb-4b84-b69a-ae472ba517f2-0.parquet.index differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-90122746-0b4c-4328-8a04-576ee6b4cb83-0 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-90122746-0b4c-4328-8a04-576ee6b4cb83-0 new file mode 100644 index 00000000..37270a28 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-90122746-0b4c-4328-8a04-576ee6b4cb83-0 differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 new file mode 100644 index 00000000..808abfa5 Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0 differ diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 new file mode 100644 index 00000000..d367884c Binary files /dev/null and b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/manifest/manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1 differ diff --git 
a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 new file mode 100644 index 00000000..d4ca2df4 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/schema/schema-0 @@ -0,0 +1,43 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "f0", + "type" : "INT" + }, { + "id" : 1, + "name" : "f1", + "type" : "BIGINT" + }, { + "id" : 2, + "name" : "f2", + "type" : "FLOAT" + }, { + "id" : 3, + "name" : "f3", + "type" : "DOUBLE" + }, { + "id" : 4, + "name" : "f4", + "type" : "DATE" + }, { + "id" : 5, + "name" : "f5", + "type" : "STRING" + } ], + "highestFieldId" : 5, + "partitionKeys" : [ ], + "primaryKeys" : [ ], + "options" : { + "file-index.range-bitmap.columns" : "f0,f1,f2,f3,f4", + "owner" : "xiaoheng", + "file-index.range-bitmap.f1.chunk-size" : "16B", + "file-index.range-bitmap.f2.chunk-size" : "16B", + "file-index.range-bitmap.f3.chunk-size" : "16B", + "file-index.in-manifest-threshold" : "1B", + "file-index.range-bitmap.f4.chunk-size" : "16B", + "file-index.range-bitmap.f0.chunk-size" : "16B" + }, + "timeMillis" : 1772188209180 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST 
b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST new file mode 100644 index 00000000..56a6051c --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/LATEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 new file mode 100644 index 00000000..4e78d6b5 --- /dev/null +++ b/test/test_data/parquet/append_with_rangebitmap_multi_chunk.db/append_with_rangebitmap_multi_chunk/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-a63d920d-a1ef-444b-a1cf-97b5df41f6e8-1", + "deltaManifestListSize" : 1108, + "commitUser" : "9385bcac-276c-4639-b825-52623beb2a6d", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1772188213862, + "totalRecordCount" : 8, + "deltaRecordCount" : 8, + "nextRowId" : 0 +} \ No newline at end of file