diff --git a/include/paimon/defs.h b/include/paimon/defs.h index 73fd43d1e..7f223a786 100644 --- a/include/paimon/defs.h +++ b/include/paimon/defs.h @@ -365,6 +365,9 @@ struct PAIMON_EXPORT Options { static const char BLOB_AS_DESCRIPTOR[]; /// "global-index.enabled" - Whether to enable global index for scan. Default value is "true". static const char GLOBAL_INDEX_ENABLED[]; + /// "global-index.thread-num" - The maximum number of concurrent scanners for global index. No + /// default value; if unset, the number of processors available to the machine is used. + static const char GLOBAL_INDEX_THREAD_NUM[]; /// "global-index.external-path" - Global index root directory, if not set, the global index /// files will be stored under the index directory. static const char GLOBAL_INDEX_EXTERNAL_PATH[]; diff --git a/include/paimon/global_index/global_index_reader.h b/include/paimon/global_index/global_index_reader.h index 9325735d2..8338823c0 100644 --- a/include/paimon/global_index/global_index_reader.h +++ b/include/paimon/global_index/global_index_reader.h @@ -31,11 +31,7 @@ namespace paimon { /// /// Derived classes are expected to implement the visitor methods (e.g., `VisitEqual`, /// `VisitIsNull`, etc.) to return index-based results that indicate which -/// row satisfy the given predicate. -/// -/// @note All `GlobalIndexResult` objects returned by implementations of this class use **local row -/// ids** that start from 0 — not global row ids in the entire table. -/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`. +/// rows satisfy the given predicate. class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor> { public: /// VisitVectorSearch performs approximate vector similarity search. 
diff --git a/include/paimon/global_index/global_index_result.h b/include/paimon/global_index/global_index_result.h index bbde2aa0e..13dbc0a13 100644 --- a/include/paimon/global_index/global_index_result.h +++ b/include/paimon/global_index/global_index_result.h @@ -27,12 +27,12 @@ #include "paimon/visibility.h" namespace paimon { -/// Global index result to get selected global row ids. +/// Global index result that holds the row ids. class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this { public: virtual ~GlobalIndexResult() = default; - /// Iterator interface for traversing selected global row ids. + /// Iterator interface for traversing selected row ids. class Iterator { public: virtual ~Iterator() = default; @@ -40,7 +40,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this IsEmpty() const = 0; - /// Creates a new iterator over the selected global row ids. + /// Creates a new iterator over the selected row ids. virtual Result> CreateIterator() const = 0; /// Returns non-overlapping, sorted ranges covering all row ids in `GlobalIndexResult`. @@ -125,7 +125,7 @@ class PAIMON_EXPORT ScoredGlobalIndexResult : public GlobalIndexResult { /// Retrieves the next (row_id, score) pair and advances the iterator. /// /// @return A pair where: - /// - first: the global row id (returned in ascending order), + /// - first: the row id (returned in ascending order). /// - second: the associated score computed by the index. /// /// @note The sequence is ordered by **row_id**, not by score. 
diff --git a/include/paimon/global_index/global_index_scan.h b/include/paimon/global_index/global_index_scan.h index b24c697b2..ff2c976fc 100644 --- a/include/paimon/global_index/global_index_scan.h +++ b/include/paimon/global_index/global_index_scan.h @@ -23,70 +23,92 @@ #include #include -#include "paimon/global_index/row_range_global_index_scanner.h" +#include "paimon/global_index/global_index_reader.h" +#include "paimon/global_index/global_index_result.h" +#include "paimon/predicate/predicate.h" #include "paimon/utils/range.h" +#include "paimon/utils/row_range_index.h" #include "paimon/visibility.h" - namespace paimon { class MemoryPool; class FileSystem; + /// Represents a logical scan over a global index for a table. class PAIMON_EXPORT GlobalIndexScan { public: /// Creates a `GlobalIndexScan` instance for the specified table and context. - /// /// @param table_path Root directory of the table. /// @param snapshot_id Optional snapshot id to read from; if not provided, uses the latest. /// @param partitions Optional list of specific partitions to restrict the scan scope. /// Each map represents one partition (e.g., {"dt": "2024-06-01"}). - /// If omitted, scans all partitions. - /// @param options Index-specific configuration. + /// If omitted (`std::nullopt`), scans all partitions of the table. + /// @param options User defined configuration. /// @param file_system File system for accessing index files. /// If not provided (nullptr), it is inferred from the `FILE_SYSTEM` /// key in the `options` parameter. + /// @param executor The executor to be used for asynchronous operations during global + /// index scan. /// @param pool Memory pool for temporary allocations; if nullptr, uses default. /// @return A `Result` containing a unique pointer to the created scanner, - /// or an error if initialization fails (e.g., I/O error). + /// or an error if initialization fails (e.g., I/O error, invalid snapshot id, + /// unknown partition). 
static Result> Create( const std::string& table_path, const std::optional& snapshot_id, const std::optional>>& partitions, const std::map& options, - const std::shared_ptr& file_system, const std::shared_ptr& pool); + const std::shared_ptr& file_system, const std::shared_ptr& executor, + const std::shared_ptr& pool); - /// Creates a `GlobalIndexScan` instance for the specified table and context. - /// - /// @param partition_filters Optional specific partition predicates. + /// Creates a `GlobalIndexScan` instance for the specified table and context, with a + /// predicate-based partition filter. + /// @param root_path Root directory of the table. + /// @param snapshot_id Optional snapshot id to read from; if not provided, uses the + /// latest snapshot. + /// @param partition_filters Optional partition-level predicate used for partition pruning. + /// If nullptr, all partitions are scanned. + /// @param options User defined configuration. + /// @param file_system File system for accessing index files. If nullptr, it is + /// inferred from the `FILE_SYSTEM` key in `options`. + /// @param executor The executor to be used for asynchronous operations during global + /// index scan. + /// @param pool Memory pool for temporary allocations; if nullptr, uses default. + /// @return A `Result` containing a unique pointer to the created scanner, + /// or an error if initialization fails. static Result> Create( const std::string& root_path, const std::optional& snapshot_id, const std::shared_ptr& partition_filters, const std::map& options, - const std::shared_ptr& file_system, - const std::shared_ptr& memory_pool); + const std::shared_ptr& file_system, const std::shared_ptr& executor, + const std::shared_ptr& pool); virtual ~GlobalIndexScan() = default; - /// Creates a scanner for the global index over the specified row id range. 
- /// - /// This method instantiates a low-level scanner that can evaluate predicates and - /// retrieve matching row ids from the global index data corresponding to the given - /// row id range. - /// - /// @param range The inclusive row id range [start, end] for which to create the scanner. - /// The range must be fully covered by existing global index data (from - /// `GetRowRangeList()`). - /// @return A `Result` containing a range-level scanner, or an error if parse index meta fails. - virtual Result> CreateRangeScan( - const Range& range) = 0; + /// Creates several `GlobalIndexReader`s for a specific field. + /// @param field_name Name of the indexed column. + /// @param row_range_index Optional row range that limits the scan to a sub-range of row ids. + /// If not provided, the entire row range is considered. + /// @return A `Result` that is: + /// - Successful with several readers (with global row ids) if the indexes exist and + /// load correctly; + /// - Successful with an empty vector if no index was built for the given field; + /// - An error if loading fails (e.g., file corruption, I/O error, + /// unsupported format). + virtual Result>> CreateReaders( + const std::string& field_name, + const std::optional& row_range_index) const = 0; - /// Returns row id ranges covered by this global index (sorted and non-overlapping - /// ranges). - /// - /// Each `Range` represents a contiguous segment of row ids for which global index - /// data exists. This allows the query engine to parallelize scanning and be aware - /// of ranges that are not covered by any global index. - /// - /// @return A `Result` containing sorted and non-overlapping `Range` objects. - virtual Result> GetRowRangeList() = 0; + /// Creates several `GlobalIndexReader`s for a specific field (looked up by id). + /// @param field_id Field id of the indexed column. + /// @param row_range_index Optional row range that limits the scan to a sub-range of row ids. 
+ /// If not provided, the entire row range is considered. + /// @return A `Result` that is: + /// - Successful with several readers (with global row ids) if the indexes exist and + /// load correctly; + /// - Successful with an empty vector if no index was built for the given field; + /// - An error if loading fails (e.g., file corruption, I/O error, + /// unsupported format). + virtual Result>> CreateReaders( + int32_t field_id, const std::optional& row_range_index) const = 0; }; } // namespace paimon diff --git a/include/paimon/global_index/row_range_global_index_scanner.h b/include/paimon/global_index/row_range_global_index_scanner.h deleted file mode 100644 index 996b2c2e7..000000000 --- a/include/paimon/global_index/row_range_global_index_scanner.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include "paimon/global_index/global_index_reader.h" -#include "paimon/visibility.h" - -namespace paimon { -/// Interface for scanning global index data at the range level. -class PAIMON_EXPORT RowRangeGlobalIndexScanner { - public: - virtual ~RowRangeGlobalIndexScanner() = default; - - /// Creates a `GlobalIndexReader` for a specific field and index type within this range. 
- /// - /// This reader provides low-level access to the serialized index data - /// for the given column (`field_name`) and index kind (`index_type`, such as "bitmap"). - /// - /// @param field_name Name of the indexed column. - /// @param index_type Type of the global index (e.g., "bitmap", "lumina"). - /// @return A `Result` that is: - /// - Successful with a non-null reader if the index exists and loads correctly; - /// - Successful with a null pointer if no index was built for the given field and type; - /// - An error only if loading fails (e.g., file corruption, I/O error, unsupported - /// format). - /// @note All `GlobalIndexResult` objects returned by `GlobalIndexReader` use **local row - /// ids** that start from 0 — not global row ids in the entire table. - virtual Result> CreateReader( - const std::string& field_name, const std::string& index_type) const = 0; - - /// Creates several `GlobalIndexReader`s for a specific field within this range. - /// - /// @param field_name Name of the indexed column. - /// @return A `Result` that is: - /// - Successful with several readers if the indexes exist and load correctly; - /// - Successful with an empty vector if no index was built for the given field; - /// - Error returns when loading fails (e.g., file corruption, I/O error, unsupported - /// format). 
- virtual Result>> CreateReaders( - const std::string& field_name) const = 0; -}; - -} // namespace paimon diff --git a/include/paimon/scan_context.h b/include/paimon/scan_context.h index c40fe62de..ee6b18515 100644 --- a/include/paimon/scan_context.h +++ b/include/paimon/scan_context.h @@ -25,7 +25,6 @@ #include "paimon/global_index/global_index_result.h" #include "paimon/predicate/predicate.h" -#include "paimon/predicate/vector_search.h" #include "paimon/result.h" #include "paimon/type_fwd.h" #include "paimon/visibility.h" @@ -104,19 +103,14 @@ class PAIMON_EXPORT ScanFilter { public: ScanFilter(const std::shared_ptr& predicate, const std::vector>& partition_filters, - const std::optional& bucket_filter, - const std::shared_ptr& vector_search) + const std::optional& bucket_filter) : predicates_(predicate), - vector_search_(vector_search), bucket_filter_(bucket_filter), partition_filters_(partition_filters) {} std::shared_ptr GetPredicate() const { return predicates_; } - std::shared_ptr GetVectorSearch() const { - return vector_search_; - } std::optional GetBucketFilter() const { return bucket_filter_; } @@ -126,7 +120,6 @@ class PAIMON_EXPORT ScanFilter { private: std::shared_ptr predicates_; - std::shared_ptr vector_search_; std::optional bucket_filter_; std::vector> partition_filters_; }; @@ -155,8 +148,6 @@ class PAIMON_EXPORT ScanContextBuilder { ScanContextBuilder& SetGlobalIndexResult( const std::shared_ptr& global_index_result); - /// Set vector search for similarity search. - ScanContextBuilder& SetVectorSearch(const std::shared_ptr& vector_search); /// The options added or set in `ScanContextBuilder` have high priority and will be merged with /// the options in table schema. 
ScanContextBuilder& AddOption(const std::string& key, const std::string& value); diff --git a/src/paimon/CMakeLists.txt b/src/paimon/CMakeLists.txt index 1301169b5..e314ef3d7 100644 --- a/src/paimon/CMakeLists.txt +++ b/src/paimon/CMakeLists.txt @@ -53,6 +53,8 @@ set(PAIMON_COMMON_SRCS common/fs/resolving_file_system.cpp common/fs/file_system_factory.cpp common/global_config.cpp + common/global_index/union_global_index_reader.cpp + common/global_index/offset_global_index_reader.cpp common/global_index/complete_index_score_batch_reader.cpp common/global_index/bitmap_scored_global_index_result.cpp common/global_index/bitmap_global_index_result.cpp @@ -195,7 +197,6 @@ set(PAIMON_CORE_SRCS core/global_index/global_index_evaluator_impl.cpp core/global_index/global_index_scan.cpp core/global_index/global_index_scan_impl.cpp - core/global_index/row_range_global_index_scanner_impl.cpp core/global_index/global_index_write_task.cpp core/index/index_file_handler.cpp core/index/global_index_meta.cpp @@ -413,6 +414,8 @@ if(PAIMON_BUILD_TESTS) common/global_index/complete_index_score_batch_reader_test.cpp common/global_index/global_index_result_test.cpp common/global_index/global_index_utils_test.cpp + common/global_index/offset_global_index_reader_test.cpp + common/global_index/union_global_index_reader_test.cpp common/global_index/global_indexer_factory_test.cpp common/global_index/bitmap_global_index_result_test.cpp common/global_index/bitmap_scored_global_index_result_test.cpp diff --git a/src/paimon/common/defs.cpp b/src/paimon/common/defs.cpp index 11fba2a17..fe7fceb3e 100644 --- a/src/paimon/common/defs.cpp +++ b/src/paimon/common/defs.cpp @@ -91,6 +91,7 @@ const char Options::DATA_EVOLUTION_ENABLED[] = "data-evolution.enabled"; const char Options::PARTITION_GENERATE_LEGACY_NAME[] = "partition.legacy-name"; const char Options::BLOB_AS_DESCRIPTOR[] = "blob-as-descriptor"; const char Options::GLOBAL_INDEX_ENABLED[] = "global-index.enabled"; +const char 
Options::GLOBAL_INDEX_THREAD_NUM[] = "global-index.thread-num"; const char Options::GLOBAL_INDEX_EXTERNAL_PATH[] = "global-index.external-path"; const char Options::AGGREGATION_REMOVE_RECORD_ON_DELETE[] = "aggregation.remove-record-on-delete"; const char Options::SCAN_TIMESTAMP_MILLIS[] = "scan.timestamp-millis"; diff --git a/src/paimon/common/global_index/CMakeLists.txt b/src/paimon/common/global_index/CMakeLists.txt index c4e7d9548..b86ce0353 100644 --- a/src/paimon/common/global_index/CMakeLists.txt +++ b/src/paimon/common/global_index/CMakeLists.txt @@ -25,7 +25,9 @@ set(PAIMON_GLOBAL_INDEX_SRC btree/lazy_filtered_btree_reader.cpp btree/key_serializer.cpp rangebitmap/range_bitmap_global_index.cpp - rangebitmap/range_bitmap_global_index_factory.cpp) + rangebitmap/range_bitmap_global_index_factory.cpp + offset_global_index_reader.cpp + union_global_index_reader.cpp) add_paimon_lib(paimon_global_index SOURCES diff --git a/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp b/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp index c24c3c61b..31e45f642 100644 --- a/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp +++ b/src/paimon/common/global_index/btree/btree_file_meta_selector_test.cpp @@ -182,7 +182,7 @@ TEST_F(BTreeFileMetaSelectorTest, TestVisitIn) { // 1 in [1,10]=file1, [1,5]=file4 // 2 in [1,10]=file1, [1,5]=file4 // 3 in [1,10]=file1, [1,5]=file4 - // 26 in [21,30]=file3, [19,25]=file5 + // 26 in [21,30]=file3 // 27 in [21,30]=file3 // 28 in [21,30]=file3 ASSERT_OK_AND_ASSIGN(auto result, selector.VisitIn({Literal(1), Literal(2), Literal(3), diff --git a/src/paimon/common/global_index/btree/btree_global_index_writer.cpp b/src/paimon/common/global_index/btree/btree_global_index_writer.cpp index 5eabae3fb..bfb99e8b6 100644 --- a/src/paimon/common/global_index/btree/btree_global_index_writer.cpp +++ b/src/paimon/common/global_index/btree/btree_global_index_writer.cpp @@ -119,7 +119,10 @@ Status 
BTreeGlobalIndexWriter::Flush() { return Status::OK(); } MemorySliceOutput output(current_row_ids_.size() * 9 + 5, pool_.get()); - PAIMON_RETURN_NOT_OK(output.WriteVarLenInt(current_row_ids_.size())); + if (current_row_ids_.size() > INT32_MAX) { + return Status::Invalid("invalid row id numbers, exceed INT32_MAX"); + } + PAIMON_RETURN_NOT_OK(output.WriteVarLenInt(static_cast(current_row_ids_.size()))); for (int64_t row_id : current_row_ids_) { PAIMON_RETURN_NOT_OK(output.WriteVarLenLong(row_id)); } diff --git a/src/paimon/common/global_index/btree/btree_global_indexer.cpp b/src/paimon/common/global_index/btree/btree_global_indexer.cpp index 0d5ccd3c2..a21edf166 100644 --- a/src/paimon/common/global_index/btree/btree_global_indexer.cpp +++ b/src/paimon/common/global_index/btree/btree_global_indexer.cpp @@ -70,7 +70,10 @@ Result> BTreeGlobalIndexer::CreateWriter( std::string block_size_str, OptionsUtils::GetValueFromMap(options_, BtreeDefs::kBtreeIndexBlockSize, BtreeDefs::kDefaultBtreeIndexBlockSize)); - PAIMON_ASSIGN_OR_RAISE(int32_t block_size, MemorySize::ParseBytes(block_size_str)); + PAIMON_ASSIGN_OR_RAISE(int64_t block_size, MemorySize::ParseBytes(block_size_str)); + if (block_size > INT32_MAX) { + return Status::Invalid("invalid block size, exceed INT32_MAX"); + } PAIMON_ASSIGN_OR_RAISE( std::string compress_str, OptionsUtils::GetValueFromMap(options_, BtreeDefs::kBtreeIndexCompression, @@ -82,8 +85,9 @@ Result> BTreeGlobalIndexer::CreateWriter( CompressOptions compress_options{compress_str, compress_level}; PAIMON_ASSIGN_OR_RAISE(std::shared_ptr compression_factory, BlockCompressionFactory::Create(compress_options)); - return BTreeGlobalIndexWriter::Create(field_name, struct_type, file_writer, block_size, - compression_factory, pool); + return BTreeGlobalIndexWriter::Create(field_name, struct_type, file_writer, + static_cast(block_size), compression_factory, + pool); } Result> BTreeGlobalIndexer::CreateReader( diff --git 
a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp index de054607e..f2ae0338e 100644 --- a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp +++ b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.cpp @@ -24,6 +24,7 @@ #include "paimon/common/global_index/btree/btree_global_index_reader.h" #include "paimon/common/global_index/btree/btree_index_meta.h" #include "paimon/common/global_index/btree/key_serializer.h" +#include "paimon/common/global_index/union_global_index_reader.h" #include "paimon/common/memory/memory_slice.h" #include "paimon/common/memory/memory_slice_input.h" #include "paimon/common/sst/block_cache.h" @@ -170,55 +171,23 @@ Result> LazyFilteredBTreeReader::DispatchVisi return std::make_shared([]() { return RoaringBitmap64(); }); } - // Prepare all readers sequentially (reader_cache_ is not thread-safe) + // Create a UnionGlobalIndexReader from cached readers for the selected files + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr union_reader, + CreateUnionReader(selected_files)); + + // Delegate the action to the union reader + return action(union_reader); +} + +Result> LazyFilteredBTreeReader::CreateUnionReader( + const std::vector& files) { std::vector> readers; - readers.reserve(selected_files.size()); - for (const auto& meta : selected_files) { + readers.reserve(files.size()); + for (const auto& meta : files) { PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, GetOrCreateReader(meta)); readers.push_back(std::move(reader)); } - - // Execute actions: parallel if executor is available, sequential otherwise - std::vector>> collected_results; - if (executor_ != nullptr) { - // Parallel: submit all tasks to executor, then collect results in order - std::vector>>> futures; - futures.reserve(readers.size()); - for (const auto& reader : readers) { - futures.push_back( - Via(executor_.get(), - [&action, &reader]() -> Result> { - return 
action(reader); - })); - } - collected_results = CollectAll(futures); - } else { - // Sequential fallback: execute actions one by one - collected_results.reserve(readers.size()); - for (const auto& reader : readers) { - collected_results.push_back(action(reader)); - } - } - - // Merge results in submission order - std::shared_ptr merged_result = nullptr; - for (auto& result_or_status : collected_results) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, - std::move(result_or_status)); - if (result == nullptr) { - continue; - } - if (merged_result == nullptr) { - merged_result = std::move(result); - } else { - PAIMON_ASSIGN_OR_RAISE(merged_result, merged_result->Or(result)); - } - } - - if (merged_result == nullptr) { - return Status::Invalid("DispatchVisit cannot return empty result"); - } - return merged_result; + return std::make_shared(std::move(readers), executor_); } Result> LazyFilteredBTreeReader::GetOrCreateReader( diff --git a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h index 372ba7691..4889b1e4e 100644 --- a/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h +++ b/src/paimon/common/global_index/btree/lazy_filtered_btree_reader.h @@ -82,6 +82,8 @@ class LazyFilteredBTreeReader : public GlobalIndexReader { Result> DispatchVisit(SelectAction select_files, ReaderAction action); + Result> CreateUnionReader( + const std::vector& files); Result> GetOrCreateReader(const GlobalIndexIOMeta& meta); Result> CreateSingleReader(const GlobalIndexIOMeta& meta); Result ReadNullBitmap(const std::shared_ptr& cache, diff --git a/src/paimon/common/global_index/offset_global_index_reader.cpp b/src/paimon/common/global_index/offset_global_index_reader.cpp new file mode 100644 index 000000000..2e3130f6f --- /dev/null +++ b/src/paimon/common/global_index/offset_global_index_reader.cpp @@ -0,0 +1,151 @@ +/* + * Copyright 2026-present Alibaba Inc. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/offset_global_index_reader.h" + +#include + +namespace paimon { + +OffsetGlobalIndexReader::OffsetGlobalIndexReader(std::shared_ptr&& wrapped, + int64_t offset) + : wrapped_(std::move(wrapped)), offset_(offset) {} + +Result> OffsetGlobalIndexReader::VisitIsNotNull() { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitIsNotNull()); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitIsNull() { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitIsNull()); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitNotEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitNotEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitLessThan( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitLessThan(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitLessOrEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitLessOrEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitGreaterThan( + 
const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitGreaterThan(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitGreaterOrEqual( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitGreaterOrEqual(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitIn( + const std::vector& literals) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitIn(literals)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitNotIn( + const std::vector& literals) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitNotIn(literals)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitStartsWith( + const Literal& prefix) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitStartsWith(prefix)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitEndsWith( + const Literal& suffix) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitEndsWith(suffix)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitContains( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitContains(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitLike( + const Literal& literal) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, wrapped_->VisitLike(literal)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::VisitVectorSearch( + const std::shared_ptr& vector_search) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitVectorSearch(vector_search)); + if (result == nullptr) { + return std::shared_ptr(); + } + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr offset_result, + result->AddOffset(offset_)); + auto scored_result = std::dynamic_pointer_cast(offset_result); + if (!scored_result) { + return Status::Invalid( + "AddOffset on 
ScoredGlobalIndexResult did not return ScoredGlobalIndexResult"); + } + return scored_result; +} + +Result> OffsetGlobalIndexReader::VisitFullTextSearch( + const std::shared_ptr& full_text_search) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + wrapped_->VisitFullTextSearch(full_text_search)); + return ApplyOffset(result); +} + +Result> OffsetGlobalIndexReader::ApplyOffset( + const std::shared_ptr& result) { + if (result == nullptr) { + return result; + } + return result->AddOffset(offset_); +} + +} // namespace paimon diff --git a/src/paimon/common/global_index/offset_global_index_reader.h b/src/paimon/common/global_index/offset_global_index_reader.h new file mode 100644 index 000000000..8b2d6034c --- /dev/null +++ b/src/paimon/common/global_index/offset_global_index_reader.h @@ -0,0 +1,77 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include "paimon/global_index/global_index_reader.h" + +namespace paimon { + +/// A GlobalIndexReader that wraps another reader and applies an offset to all row ids in the +/// results. This is used to convert local row IDs into global row IDs. +class OffsetGlobalIndexReader : public GlobalIndexReader { + public: + /// Constructs an OffsetGlobalIndexReader. + /// @param wrapped The inner reader to delegate queries to. Its results are expected to + /// contain local row ids. 
+ /// @param offset The offset to add to each row id in the results. + OffsetGlobalIndexReader(std::shared_ptr&& wrapped, int64_t offset); + + Result> VisitIsNotNull() override; + Result> VisitIsNull() override; + Result> VisitEqual(const Literal& literal) override; + Result> VisitNotEqual(const Literal& literal) override; + Result> VisitLessThan(const Literal& literal) override; + Result> VisitLessOrEqual(const Literal& literal) override; + Result> VisitGreaterThan(const Literal& literal) override; + Result> VisitGreaterOrEqual(const Literal& literal) override; + Result> VisitIn( + const std::vector& literals) override; + Result> VisitNotIn( + const std::vector& literals) override; + Result> VisitStartsWith(const Literal& prefix) override; + Result> VisitEndsWith(const Literal& suffix) override; + Result> VisitContains(const Literal& literal) override; + Result> VisitLike(const Literal& literal) override; + + Result> VisitVectorSearch( + const std::shared_ptr& vector_search) override; + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override; + + bool IsThreadSafe() const override { + return wrapped_->IsThreadSafe(); + } + + std::string GetIndexType() const override { + return wrapped_->GetIndexType(); + } + + private: + Result> ApplyOffset( + const std::shared_ptr& result); + + private: + std::shared_ptr wrapped_; + int64_t offset_; +}; + +} // namespace paimon diff --git a/src/paimon/common/global_index/offset_global_index_reader_test.cpp b/src/paimon/common/global_index/offset_global_index_reader_test.cpp new file mode 100644 index 000000000..6eb3499f1 --- /dev/null +++ b/src/paimon/common/global_index/offset_global_index_reader_test.cpp @@ -0,0 +1,353 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/offset_global_index_reader.h" + +#include +#include + +#include "gtest/gtest.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/predicate/literal.h" +#include "paimon/testing/utils/testharness.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::test { +class FakeGlobalIndexReader : public GlobalIndexReader { + public: + void SetDefaultResult(const std::vector& row_ids) { + default_result_ = row_ids; + } + + void SetVectorSearchResult(const std::vector& row_ids, + const std::vector& scores) { + vector_search_row_ids_ = row_ids; + vector_search_scores_ = scores; + has_vector_search_result_ = true; + } + + Result> VisitIsNotNull() override { + return MakeResult(default_result_); + } + + Result> VisitIsNull() override { + return MakeResult(default_result_); + } + + Result> VisitEqual(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitNotEqual(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitLessThan(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitLessOrEqual(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitGreaterThan(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitGreaterOrEqual( + const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> 
VisitIn( + const std::vector& literals) override { + return MakeResult(default_result_); + } + + Result> VisitNotIn( + const std::vector& literals) override { + return MakeResult(default_result_); + } + + Result> VisitStartsWith(const Literal& prefix) override { + return MakeResult(default_result_); + } + + Result> VisitEndsWith(const Literal& suffix) override { + return MakeResult(default_result_); + } + + Result> VisitContains(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitLike(const Literal& literal) override { + return MakeResult(default_result_); + } + + Result> VisitVectorSearch( + const std::shared_ptr& vector_search) override { + if (!has_vector_search_result_) { + return Status::Invalid("FakeGlobalIndexReader does not support vector search"); + } + auto bitmap = RoaringBitmap64::From(vector_search_row_ids_); + auto scores = vector_search_scores_; + return std::make_shared(std::move(bitmap), + std::move(scores)); + } + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override { + return MakeResult(default_result_); + } + + bool IsThreadSafe() const override { + return true; + } + + std::string GetIndexType() const override { + return "fake"; + } + + private: + static Result> MakeResult( + const std::vector& row_ids) { + auto ids = row_ids; + return std::make_shared( + [ids]() { return RoaringBitmap64::From(ids); }); + } + + private: + std::vector default_result_; + std::vector vector_search_row_ids_; + std::vector vector_search_scores_; + bool has_vector_search_result_ = false; +}; + +class OffsetGlobalIndexReaderTest : public ::testing::Test { + public: + void CheckResult(const std::shared_ptr& result, + const std::vector& expected) const { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, 
RoaringBitmap64::From(expected)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected).ToString(); + } + + static void CheckScoredResult(const std::shared_ptr& result, + const std::vector& expected_row_ids, + const std::vector& expected_scores) { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_row_ids)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected_row_ids).ToString(); + ASSERT_EQ(typed_result->GetScores(), expected_scores); + } +}; + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 1, 3}); + + auto offset_reader = std::make_shared(fake_reader, 100); + + Literal literal_5(5); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitEqual(literal_5)); + // row ids {0, 1, 3} + offset 100 -> {100, 101, 103} + CheckResult(result, {100, 101, 103}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitIsNotNullWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 2, 4, 6}); + + auto offset_reader = std::make_shared(fake_reader, 50); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNotNull()); + // row ids {0, 2, 4, 6} + offset 50 -> {50, 52, 54, 56} + CheckResult(result, {50, 52, 54, 56}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitIsNullWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({1, 3}); + + auto offset_reader = std::make_shared(fake_reader, 200); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNull()); + // row ids {1, 3} + offset 200 -> {201, 203} + CheckResult(result, {201, 203}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitLessThanWithOffset) { + auto fake_reader = 
std::make_shared(); + fake_reader->SetDefaultResult({0, 1}); + + auto offset_reader = std::make_shared(fake_reader, 10); + + Literal literal_5(5); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitLessThan(literal_5)); + // row ids {0, 1} + offset 10 -> {10, 11} + CheckResult(result, {10, 11}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitGreaterOrEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({3, 4, 5}); + + auto offset_reader = std::make_shared(fake_reader, 1000); + + Literal literal_10(10); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitGreaterOrEqual(literal_10)); + // row ids {3, 4, 5} + offset 1000 -> {1003, 1004, 1005} + CheckResult(result, {1003, 1004, 1005}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitInWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 2}); + + auto offset_reader = std::make_shared(fake_reader, 5); + + std::vector literals = {Literal(1), Literal(3)}; + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIn(literals)); + // row ids {0, 2} + offset 5 -> {5, 7} + CheckResult(result, {5, 7}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitNotInWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({1, 3, 5}); + + auto offset_reader = std::make_shared(fake_reader, 20); + + std::vector literals = {Literal(2)}; + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitNotIn(literals)); + // row ids {1, 3, 5} + offset 20 -> {21, 23, 25} + CheckResult(result, {21, 23, 25}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitNotEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 2, 4}); + + auto offset_reader = std::make_shared(fake_reader, 7); + + Literal literal_1(1); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitNotEqual(literal_1)); + // row ids {0, 2, 4} + offset 7 -> {7, 9, 11} + CheckResult(result, {7, 9, 11}); +} + 
+TEST_F(OffsetGlobalIndexReaderTest, TestVisitLessOrEqualWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 1, 2}); + + auto offset_reader = std::make_shared(fake_reader, 30); + + Literal literal_3(3); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitLessOrEqual(literal_3)); + // row ids {0, 1, 2} + offset 30 -> {30, 31, 32} + CheckResult(result, {30, 31, 32}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitGreaterThanWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({4, 5}); + + auto offset_reader = std::make_shared(fake_reader, 15); + + Literal literal_3(3); + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitGreaterThan(literal_3)); + // row ids {4, 5} + offset 15 -> {19, 20} + CheckResult(result, {19, 20}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestZeroOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 1, 2}); + + auto offset_reader = std::make_shared(fake_reader, 0); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNotNull()); + // row ids {0, 1, 2} + offset 0 -> {0, 1, 2} + CheckResult(result, {0, 1, 2}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestEmptyResultWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({}); + + auto offset_reader = std::make_shared(fake_reader, 100); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitIsNotNull()); + // empty result + offset 100 -> still empty + CheckResult(result, {}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestIsThreadSafeDelegated) { + auto fake_reader = std::make_shared(); + auto offset_reader = std::make_shared(fake_reader, 100); + // FakeGlobalIndexReader returns true for IsThreadSafe + ASSERT_TRUE(offset_reader->IsThreadSafe()); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestGetIndexTypeDelegated) { + auto fake_reader = std::make_shared(); + auto offset_reader = std::make_shared(fake_reader, 100); + 
ASSERT_EQ(offset_reader->GetIndexType(), "fake"); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitFullTextSearchWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetDefaultResult({0, 3, 5}); + + auto offset_reader = std::make_shared(fake_reader, 10); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitFullTextSearch(nullptr)); + // row ids {0, 3, 5} + offset 10 -> {10, 13, 15} + CheckResult(result, {10, 13, 15}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitVectorSearchWithOffset) { + auto fake_reader = std::make_shared(); + fake_reader->SetVectorSearchResult({0, 2, 5}, {0.9f, 0.7f, 0.3f}); + + auto offset_reader = std::make_shared(fake_reader, 100); + + ASSERT_OK_AND_ASSIGN(auto result, offset_reader->VisitVectorSearch(nullptr)); + // row ids {0, 2, 5} + offset 100 -> {100, 102, 105}, scores unchanged + CheckScoredResult(result, {100, 102, 105}, {0.9f, 0.7f, 0.3f}); +} + +TEST_F(OffsetGlobalIndexReaderTest, TestVisitVectorSearchNotSupported) { + auto fake_reader = std::make_shared(); + auto offset_reader = std::make_shared(fake_reader, 10); + // FakeGlobalIndexReader without SetVectorSearchResult returns error for VectorSearch + ASSERT_NOK_WITH_MSG(offset_reader->VisitVectorSearch(nullptr), + "FakeGlobalIndexReader does not support vector search"); +} + +} // namespace paimon::test diff --git a/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp b/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp index 1547ee95f..14854f189 100644 --- a/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp +++ b/src/paimon/common/global_index/rangebitmap/range_bitmap_global_index_test.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/src/paimon/common/global_index/union_global_index_reader.cpp b/src/paimon/common/global_index/union_global_index_reader.cpp new file mode 100644 index 000000000..d5133e945 --- /dev/null +++ 
b/src/paimon/common/global_index/union_global_index_reader.cpp @@ -0,0 +1,207 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/union_global_index_reader.h" + +#include +#include + +#include "paimon/common/executor/future.h" + +namespace paimon { +UnionGlobalIndexReader::UnionGlobalIndexReader( + std::vector>&& readers, + const std::shared_ptr& executor) + : readers_(std::move(readers)), executor_(executor) {} + +Result> UnionGlobalIndexReader::VisitIsNotNull() { + return Union( + [](const std::shared_ptr& reader) { return reader->VisitIsNotNull(); }); +} + +Result> UnionGlobalIndexReader::VisitIsNull() { + return Union( + [](const std::shared_ptr& reader) { return reader->VisitIsNull(); }); +} + +Result> UnionGlobalIndexReader::VisitEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitNotEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitNotEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitLessThan( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitLessThan(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitLessOrEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& 
reader) { + return reader->VisitLessOrEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitGreaterThan( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitGreaterThan(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitGreaterOrEqual( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitGreaterOrEqual(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitIn( + const std::vector& literals) { + return Union([&literals](const std::shared_ptr& reader) { + return reader->VisitIn(literals); + }); +} + +Result> UnionGlobalIndexReader::VisitNotIn( + const std::vector& literals) { + return Union([&literals](const std::shared_ptr& reader) { + return reader->VisitNotIn(literals); + }); +} + +Result> UnionGlobalIndexReader::VisitStartsWith( + const Literal& prefix) { + return Union([&prefix](const std::shared_ptr& reader) { + return reader->VisitStartsWith(prefix); + }); +} + +Result> UnionGlobalIndexReader::VisitEndsWith( + const Literal& suffix) { + return Union([&suffix](const std::shared_ptr& reader) { + return reader->VisitEndsWith(suffix); + }); +} + +Result> UnionGlobalIndexReader::VisitContains( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitContains(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitLike( + const Literal& literal) { + return Union([&literal](const std::shared_ptr& reader) { + return reader->VisitLike(literal); + }); +} + +Result> UnionGlobalIndexReader::VisitVectorSearch( + const std::shared_ptr& vector_search) { + auto results = ExecuteAllReaders>>( + [&vector_search](const std::shared_ptr& reader) + -> Result> { + return reader->VisitVectorSearch(vector_search); + }); + + std::shared_ptr merged_result = nullptr; + for (auto& result_or_status : results) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + std::move(result_or_status)); + if 
(result == nullptr) { + continue; + } + if (merged_result == nullptr) { + merged_result = std::move(result); + } else { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr merged_as_base, + merged_result->Or(result)); + merged_result = std::dynamic_pointer_cast(merged_as_base); + if (!merged_result) { + return Status::Invalid( + "Or of ScoredGlobalIndexResult did not return ScoredGlobalIndexResult in " + "UnionGlobalIndexReader"); + } + } + } + + return merged_result; +} + +Result> UnionGlobalIndexReader::VisitFullTextSearch( + const std::shared_ptr& full_text_search) { + return Union([&full_text_search](const std::shared_ptr& reader) { + return reader->VisitFullTextSearch(full_text_search); + }); +} + +Result> UnionGlobalIndexReader::Union(ReaderAction action) { + auto results = ExecuteAllReaders>>( + [&action](const std::shared_ptr& reader) + -> Result> { return action(reader); }); + + std::shared_ptr merged_result = nullptr; + for (auto& result_or_status : results) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result, + std::move(result_or_status)); + if (result == nullptr) { + continue; + } + if (merged_result == nullptr) { + merged_result = std::move(result); + } else { + PAIMON_ASSIGN_OR_RAISE(merged_result, merged_result->Or(result)); + } + } + + return merged_result; +} + +template +std::vector UnionGlobalIndexReader::ExecuteAllReaders( + const std::function&)>& action) { + if (executor_ == nullptr || readers_.size() == 1) { + std::vector results; + results.reserve(readers_.size()); + for (const auto& reader : readers_) { + results.push_back(action(reader)); + } + return results; + } + + // Parallel: submit all tasks to executor, then collect results in submission order + std::vector> futures; + futures.reserve(readers_.size()); + for (const auto& reader : readers_) { + futures.push_back( + Via(executor_.get(), [&action, &reader]() -> R { return action(reader); })); + } + return CollectAll(futures); +} + +} // namespace paimon diff --git 
a/src/paimon/common/global_index/union_global_index_reader.h b/src/paimon/common/global_index/union_global_index_reader.h new file mode 100644 index 000000000..1f253ca47 --- /dev/null +++ b/src/paimon/common/global_index/union_global_index_reader.h @@ -0,0 +1,85 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paimon/executor.h" +#include "paimon/global_index/global_index_reader.h" + +namespace paimon { +/// A GlobalIndexReader that combines results from multiple readers by performing a union +/// operation on their results. +/// +/// When an executor is provided, all sub-reader actions are submitted in parallel and results are +/// collected in submission order. Otherwise, sub-readers are evaluated sequentially. 
+class UnionGlobalIndexReader : public GlobalIndexReader { + public: + UnionGlobalIndexReader(std::vector>&& readers, + const std::shared_ptr& executor); + + Result> VisitIsNotNull() override; + Result> VisitIsNull() override; + Result> VisitEqual(const Literal& literal) override; + Result> VisitNotEqual(const Literal& literal) override; + Result> VisitLessThan(const Literal& literal) override; + Result> VisitLessOrEqual(const Literal& literal) override; + Result> VisitGreaterThan(const Literal& literal) override; + Result> VisitGreaterOrEqual(const Literal& literal) override; + Result> VisitIn( + const std::vector& literals) override; + Result> VisitNotIn( + const std::vector& literals) override; + Result> VisitStartsWith(const Literal& prefix) override; + Result> VisitEndsWith(const Literal& suffix) override; + Result> VisitContains(const Literal& literal) override; + Result> VisitLike(const Literal& literal) override; + + Result> VisitVectorSearch( + const std::shared_ptr& vector_search) override; + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override; + + bool IsThreadSafe() const override { + return false; + } + + std::string GetIndexType() const override { + return "union"; + } + + private: + using ReaderAction = std::function>( + const std::shared_ptr&)>; + + /// Executes the given action on all readers and merges results with Union. + Result> Union(ReaderAction action); + + /// Executes the given action on all readers (parallel or sequential) and collects results. 
+ template + std::vector ExecuteAllReaders( + const std::function&)>& action); + + std::vector> readers_; + std::shared_ptr executor_; +}; + +} // namespace paimon diff --git a/src/paimon/common/global_index/union_global_index_reader_test.cpp b/src/paimon/common/global_index/union_global_index_reader_test.cpp new file mode 100644 index 000000000..98ecfdc76 --- /dev/null +++ b/src/paimon/common/global_index/union_global_index_reader_test.cpp @@ -0,0 +1,534 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "paimon/common/global_index/union_global_index_reader.h" + +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "paimon/executor.h" +#include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/predicate/literal.h" +#include "paimon/testing/utils/testharness.h" +#include "paimon/utils/roaring_bitmap64.h" + +namespace paimon::test { +class FakeReader : public GlobalIndexReader { + public: + /// Sets the result returned by all Visit* methods (default behavior). + /// Pass an empty vector for an empty bitmap. + void SetDefaultResult(const std::vector& row_ids) { + default_result_ = row_ids; + return_nullptr_ = false; + return_error_ = false; + } + + /// Configures this reader to return nullptr for all Visit* methods. 
+ void SetReturnNullptr() { + return_nullptr_ = true; + return_error_ = false; + } + + /// Configures this reader to return an error Status for all Visit* methods. + void SetReturnError(const std::string& message) { + return_error_ = true; + return_nullptr_ = false; + error_message_ = message; + } + + /// Sets a scored result returned by VisitVectorSearch. + void SetScoredResult(const std::vector& row_ids, const std::vector& scores) { + scored_row_ids_ = row_ids; + scored_scores_ = scores; + has_scored_result_ = true; + } + + /// Counts how many times any Visit* method was invoked. Useful to assert all readers + /// are exercised by UnionGlobalIndexReader. + int InvocationCount() const { + return invocation_count_.load(); + } + + Result> VisitIsNotNull() override { + return MakeResult(); + } + + Result> VisitIsNull() override { + return MakeResult(); + } + + Result> VisitEqual(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitNotEqual(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitLessThan(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitLessOrEqual(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitGreaterThan(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitGreaterOrEqual( + const Literal& literal) override { + return MakeResult(); + } + + Result> VisitIn( + const std::vector& literals) override { + return MakeResult(); + } + + Result> VisitNotIn( + const std::vector& literals) override { + return MakeResult(); + } + + Result> VisitStartsWith(const Literal& prefix) override { + return MakeResult(); + } + + Result> VisitEndsWith(const Literal& suffix) override { + return MakeResult(); + } + + Result> VisitContains(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitLike(const Literal& literal) override { + return MakeResult(); + } + + Result> VisitVectorSearch( + const std::shared_ptr& 
vector_search) override { + invocation_count_++; + if (return_error_) { + return Status::Invalid(error_message_); + } + if (!has_scored_result_) { + return std::shared_ptr(nullptr); + } + auto bitmap = RoaringBitmap64::From(scored_row_ids_); + auto scores = scored_scores_; + return std::make_shared(std::move(bitmap), + std::move(scores)); + } + + Result> VisitFullTextSearch( + const std::shared_ptr& full_text_search) override { + return MakeResult(); + } + + bool IsThreadSafe() const override { + return true; + } + + std::string GetIndexType() const override { + return "fake"; + } + + private: + Result> MakeResult() { + invocation_count_++; + if (return_error_) { + return Status::Invalid(error_message_); + } + if (return_nullptr_) { + return std::shared_ptr(nullptr); + } + auto ids = default_result_; + return std::make_shared( + [ids]() { return RoaringBitmap64::From(ids); }); + } + + private: + std::vector default_result_; + bool return_nullptr_ = false; + bool return_error_ = false; + std::string error_message_; + std::vector scored_row_ids_; + std::vector scored_scores_; + bool has_scored_result_ = false; + std::atomic invocation_count_{0}; +}; + +class UnionGlobalIndexReaderTest : public ::testing::Test { + public: + static void CheckResult(const std::shared_ptr& result, + const std::vector& expected) { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected).ToString(); + } + + static void CheckScoredResult(const std::shared_ptr& result, + const std::vector& expected_row_ids, + const std::vector& expected_scores) { + ASSERT_TRUE(result); + auto typed_result = std::dynamic_pointer_cast(result); + ASSERT_TRUE(typed_result); + ASSERT_OK_AND_ASSIGN(const 
RoaringBitmap64* bitmap, typed_result->GetBitmap()); + ASSERT_TRUE(bitmap); + ASSERT_EQ(*bitmap, RoaringBitmap64::From(expected_row_ids)) + << "result=" << bitmap->ToString() + << ", expected=" << RoaringBitmap64::From(expected_row_ids).ToString(); + ASSERT_EQ(typed_result->GetScores(), expected_scores); + } +}; + +TEST_F(UnionGlobalIndexReaderTest, TestSingleReaderUnion) { + auto reader = std::make_shared(); + reader->SetDefaultResult({1, 2, 3}); + + std::vector> readers = {reader}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + CheckResult(result, {1, 2, 3}); + ASSERT_EQ(reader->InvocationCount(), 1); +} + +TEST_F(UnionGlobalIndexReaderTest, TestMultipleReadersUnionSequential) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetDefaultResult({1, 2}); + reader2->SetDefaultResult({3, 4}); + reader3->SetDefaultResult({5}); + + std::vector> readers = {reader1, reader2, reader3}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // {1,2} OR {3,4} OR {5} -> {1,2,3,4,5} + CheckResult(result, {1, 2, 3, 4, 5}); + ASSERT_EQ(reader1->InvocationCount(), 1); + ASSERT_EQ(reader2->InvocationCount(), 1); + ASSERT_EQ(reader3->InvocationCount(), 1); +} + +TEST_F(UnionGlobalIndexReaderTest, TestMultipleReadersUnionOverlappingIds) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 2, 3}); + reader2->SetDefaultResult({2, 3, 4}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // {1,2,3} OR {2,3,4} -> {1,2,3,4} + CheckResult(result, {1, 2, 3, 4}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestMultipleReadersUnionWithExecutor) { + auto 
reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetDefaultResult({10}); + reader2->SetDefaultResult({20}); + reader3->SetDefaultResult({30}); + + std::vector> readers = {reader1, reader2, reader3}; + std::shared_ptr executor = CreateDefaultExecutor(); + UnionGlobalIndexReader union_reader(std::move(readers), executor); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + CheckResult(result, {10, 20, 30}); + ASSERT_EQ(reader1->InvocationCount(), 1); + ASSERT_EQ(reader2->InvocationCount(), 1); + ASSERT_EQ(reader3->InvocationCount(), 1); +} + +TEST_F(UnionGlobalIndexReaderTest, TestEmptyReaderList) { + std::vector> readers; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + // No readers means no results to merge -> nullptr + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + ASSERT_FALSE(result); +} + +TEST_F(UnionGlobalIndexReaderTest, TestAllReadersReturnNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetReturnNullptr(); + reader2->SetReturnNullptr(); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // All readers return nullptr -> merged result is nullptr + ASSERT_FALSE(result); +} + +TEST_F(UnionGlobalIndexReaderTest, TestPartialReadersReturnNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetReturnNullptr(); + reader2->SetDefaultResult({1, 2}); + reader3->SetReturnNullptr(); + + std::vector> readers = {reader1, reader2, reader3}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNotNull()); + // Nullptrs are skipped, only reader2's result is used + CheckResult(result, {1, 2}); +} + 
+TEST_F(UnionGlobalIndexReaderTest, TestErrorPropagationSequential) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 2}); + reader2->SetReturnError("Unknown error for reader2"); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_NOK_WITH_MSG(union_reader.VisitIsNotNull(), "Unknown error for reader2"); +} + +TEST_F(UnionGlobalIndexReaderTest, TestErrorPropagationWithExecutor) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetReturnError("Unknown error for reader2"); + + std::vector> readers = {reader1, reader2}; + std::shared_ptr executor = CreateDefaultExecutor(); + UnionGlobalIndexReader union_reader(std::move(readers), executor); + + ASSERT_NOK_WITH_MSG(union_reader.VisitIsNotNull(), "Unknown error for reader2"); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitEqualUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_42(42); + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitEqual(literal_42)); + CheckResult(result, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitNotEqualUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_42(42); + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitNotEqual(literal_42)); + CheckResult(result, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitRangeQueriesUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + 
reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_5(5); + ASSERT_OK_AND_ASSIGN(auto lt, union_reader.VisitLessThan(literal_5)); + CheckResult(lt, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto le, union_reader.VisitLessOrEqual(literal_5)); + CheckResult(le, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto gt, union_reader.VisitGreaterThan(literal_5)); + CheckResult(gt, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto ge, union_reader.VisitGreaterOrEqual(literal_5)); + CheckResult(ge, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitInUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 3}); + reader2->SetDefaultResult({2, 4}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + std::vector literals = {Literal(10), Literal(20)}; + ASSERT_OK_AND_ASSIGN(auto in_result, union_reader.VisitIn(literals)); + CheckResult(in_result, {1, 2, 3, 4}); + + ASSERT_OK_AND_ASSIGN(auto not_in_result, union_reader.VisitNotIn(literals)); + CheckResult(not_in_result, {1, 2, 3, 4}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitStringQueriesUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + Literal literal_str(FieldType::STRING, "abc", 3); + ASSERT_OK_AND_ASSIGN(auto starts, union_reader.VisitStartsWith(literal_str)); + CheckResult(starts, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto ends, union_reader.VisitEndsWith(literal_str)); + CheckResult(ends, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto contains, union_reader.VisitContains(literal_str)); + CheckResult(contains, {1, 2}); + ASSERT_OK_AND_ASSIGN(auto like, 
union_reader.VisitLike(literal_str)); + CheckResult(like, {1, 2}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitIsNullUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({100, 200}); + reader2->SetDefaultResult({300}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitIsNull()); + CheckResult(result, {100, 200, 300}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitFullTextSearchUnion) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetDefaultResult({1, 5}); + reader2->SetDefaultResult({2, 6}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitFullTextSearch(nullptr)); + CheckResult(result, {1, 2, 5, 6}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchAllNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + // Neither reader has SetScoredResult -> VisitVectorSearch returns nullptr + reader1->SetDefaultResult({1}); + reader2->SetDefaultResult({2}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + ASSERT_FALSE(result); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchSingleReader) { + auto reader = std::make_shared(); + reader->SetScoredResult({1, 3, 5}, {0.9f, 0.7f, 0.5f}); + + std::vector> readers = {reader}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + CheckScoredResult(result, {1, 3, 5}, {0.9f, 0.7f, 0.5f}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchMultipleReadersUnion) { + auto reader1 = 
std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetScoredResult({1, 3}, {0.9f, 0.7f}); + reader2->SetScoredResult({2, 4}, {0.8f, 0.6f}); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + // {1,3} OR {2,4} -> {1,2,3,4}, scores merged in row id order + CheckScoredResult(result, {1, 2, 3, 4}, {0.9f, 0.8f, 0.7f, 0.6f}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchPartialNullptr) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + auto reader3 = std::make_shared(); + reader1->SetScoredResult({1, 2}, {0.9f, 0.8f}); + // reader2 has no scored result -> returns nullptr + reader2->SetDefaultResult({10}); + reader3->SetScoredResult({5, 6}, {0.5f, 0.4f}); + + std::vector> readers = {reader1, reader2, reader3}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_OK_AND_ASSIGN(auto result, union_reader.VisitVectorSearch(nullptr)); + // reader2 nullptr is skipped, {1,2} OR {5,6} -> {1,2,5,6} + CheckScoredResult(result, {1, 2, 5, 6}, {0.9f, 0.8f, 0.5f, 0.4f}); +} + +TEST_F(UnionGlobalIndexReaderTest, TestVisitVectorSearchErrorPropagation) { + auto reader1 = std::make_shared(); + auto reader2 = std::make_shared(); + reader1->SetScoredResult({1}, {0.9f}); + reader2->SetReturnError("vector search failure"); + + std::vector> readers = {reader1, reader2}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_NOK_WITH_MSG(union_reader.VisitVectorSearch(nullptr), "vector search failure"); +} + +TEST_F(UnionGlobalIndexReaderTest, TestIsThreadSafeAlwaysFalse) { + auto reader = std::make_shared(); + std::vector> readers = {reader}; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + // UnionGlobalIndexReader is not thread-safe regardless of inner readers + ASSERT_FALSE(union_reader.IsThreadSafe()); +} + 
+TEST_F(UnionGlobalIndexReaderTest, TestGetIndexTypeReturnsUnion) { + std::vector> readers; + UnionGlobalIndexReader union_reader(std::move(readers), nullptr); + + ASSERT_EQ(union_reader.GetIndexType(), "union"); +} + +} // namespace paimon::test diff --git a/src/paimon/common/sst/sst_file_reader.cpp b/src/paimon/common/sst/sst_file_reader.cpp index 8243048b2..6b004c754 100644 --- a/src/paimon/common/sst/sst_file_reader.cpp +++ b/src/paimon/common/sst/sst_file_reader.cpp @@ -154,7 +154,7 @@ Result SstFileReader::DecompressBlock(const MemorySegment& compre crc32c_code = CRC32C::calculate(&compression_val, 1, crc32c_code); if (trailer->Crc32c() != static_cast(crc32c_code)) { return Status::Invalid(fmt::format("Expected crc32c({:#x}) but found crc32c({:#x})", - trailer->Crc32c(), crc32c_code)); + static_cast(trailer->Crc32c()), crc32c_code)); } // decompress data diff --git a/src/paimon/common/utils/row_range_index.cpp b/src/paimon/common/utils/row_range_index.cpp index f2eab9177..e78eb2a6b 100644 --- a/src/paimon/common/utils/row_range_index.cpp +++ b/src/paimon/common/utils/row_range_index.cpp @@ -17,7 +17,7 @@ #include "paimon/utils/row_range_index.h" #include -#include +#include namespace paimon { diff --git a/src/paimon/core/append/append_compact_coordinator.cpp b/src/paimon/core/append/append_compact_coordinator.cpp index 174909604..4d36436d1 100644 --- a/src/paimon/core/append/append_compact_coordinator.cpp +++ b/src/paimon/core/append/append_compact_coordinator.cpp @@ -235,8 +235,7 @@ Result>>> Sca const std::vector>& partitions, const std::shared_ptr& executor, const std::shared_ptr& pool) { auto scan_filter = std::make_shared( - /*predicate=*/nullptr, partitions, /*bucket_filter=*/std::nullopt, - /*vector_search=*/nullptr); + /*predicate=*/nullptr, partitions, /*bucket_filter=*/std::nullopt); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr scan, CreateFileStoreScan(snapshot_manager, schema_manager, table_schema, diff --git a/src/paimon/core/core_options.cpp 
b/src/paimon/core/core_options.cpp index b00f9a99f..c90a8dd9e 100644 --- a/src/paimon/core/core_options.cpp +++ b/src/paimon/core/core_options.cpp @@ -431,6 +431,7 @@ struct CoreOptions::Impl { bool data_evolution_enabled = false; bool legacy_partition_name_enabled = true; bool global_index_enabled = true; + std::optional global_index_thread_num; bool commit_force_compact = false; bool compaction_force_rewrite_all_files = false; bool compaction_force_up_level_0 = false; @@ -688,6 +689,10 @@ struct CoreOptions::Impl { // Parse global-index.enabled - whether to enable global index for scan, default true PAIMON_RETURN_NOT_OK( parser.Parse(Options::GLOBAL_INDEX_ENABLED, &global_index_enabled)); + // Parse global-index.thread-num - the maximum number of concurrent scanner for global + // index, no default value + PAIMON_RETURN_NOT_OK( + parser.Parse(Options::GLOBAL_INDEX_THREAD_NUM, &global_index_thread_num)); // Parse global-index.external-path - global index root directory PAIMON_RETURN_NOT_OK( parser.Parse(Options::GLOBAL_INDEX_EXTERNAL_PATH, &global_index_external_path)); @@ -1270,6 +1275,10 @@ bool CoreOptions::GlobalIndexEnabled() const { return impl_->global_index_enabled; } +std::optional CoreOptions::GetGlobalIndexThreadNum() const { + return impl_->global_index_thread_num; +} + std::optional CoreOptions::GetGlobalIndexExternalPath() const { return impl_->global_index_external_path; } diff --git a/src/paimon/core/core_options.h b/src/paimon/core/core_options.h index e50176a4b..1f15e3fd5 100644 --- a/src/paimon/core/core_options.h +++ b/src/paimon/core/core_options.h @@ -174,9 +174,10 @@ class PAIMON_EXPORT CoreOptions { int64_t GetLookupCacheFileRetentionMs() const; int64_t GetLookupCacheMaxDiskSize() const; - const std::map& ToMap() const; - BucketFunctionType GetBucketFunctionType() const; + std::optional GetGlobalIndexThreadNum() const; + + const std::map& ToMap() const; private: std::optional GetDataFileExternalPaths() const; diff --git 
a/src/paimon/core/core_options_test.cpp b/src/paimon/core/core_options_test.cpp index c0f63f721..5802e6ee6 100644 --- a/src/paimon/core/core_options_test.cpp +++ b/src/paimon/core/core_options_test.cpp @@ -115,6 +115,7 @@ TEST(CoreOptionsTest, TestDefaultValue) { ASSERT_TRUE(core_options.LegacyPartitionNameEnabled()); ASSERT_TRUE(core_options.GlobalIndexEnabled()); ASSERT_EQ(std::nullopt, core_options.GetGlobalIndexExternalPath()); + ASSERT_EQ(std::nullopt, core_options.GetGlobalIndexThreadNum()); ASSERT_EQ(std::nullopt, core_options.GetScanTagName()); ASSERT_EQ(std::nullopt, core_options.GetOptimizedCompactionInterval()); ASSERT_EQ(std::nullopt, core_options.GetCompactionTotalSizeThreshold()); @@ -209,6 +210,7 @@ TEST(CoreOptionsTest, TestFromMap) { {Options::DATA_EVOLUTION_ENABLED, "true"}, {Options::PARTITION_GENERATE_LEGACY_NAME, "false"}, {Options::GLOBAL_INDEX_ENABLED, "false"}, + {Options::GLOBAL_INDEX_THREAD_NUM, "4"}, {Options::GLOBAL_INDEX_EXTERNAL_PATH, "FILE:///tmp/global_index/"}, {Options::SCAN_TAG_NAME, "test-tag"}, {Options::WRITE_ONLY, "true"}, @@ -333,6 +335,7 @@ TEST(CoreOptionsTest, TestFromMap) { ASSERT_TRUE(core_options.DataEvolutionEnabled()); ASSERT_FALSE(core_options.LegacyPartitionNameEnabled()); ASSERT_FALSE(core_options.GlobalIndexEnabled()); + ASSERT_EQ(core_options.GetGlobalIndexThreadNum(), 4); ASSERT_TRUE(core_options.GetGlobalIndexExternalPath()); ASSERT_EQ(core_options.GetGlobalIndexExternalPath().value(), "FILE:///tmp/global_index/"); ASSERT_EQ("test-tag", core_options.GetScanTagName().value()); diff --git a/src/paimon/core/global_index/global_index_evaluator.h b/src/paimon/core/global_index/global_index_evaluator.h index 05342d2f6..856198782 100644 --- a/src/paimon/core/global_index/global_index_evaluator.h +++ b/src/paimon/core/global_index/global_index_evaluator.h @@ -17,10 +17,10 @@ #pragma once #include +#include #include "paimon/global_index/global_index_result.h" #include "paimon/predicate/predicate.h" -#include 
"paimon/predicate/vector_search.h" #include "paimon/visibility.h" namespace paimon { @@ -30,21 +30,14 @@ class PAIMON_EXPORT GlobalIndexEvaluator { virtual ~GlobalIndexEvaluator() = default; /// Evaluates a predicate against the global index. /// - /// @param predicate The filter predicate to evaluate. - /// @param vector_search The vector similarity search to evaluate. - /// @note When both `predicate` and `vector_search` are present, the predicate - /// is used to constrain the vector search space (for example, via a - /// pre-filter callback that may be applied during vector search), so - /// vector similarity scoring is effectively limited to rows that satisfy - /// the predicate. + /// @param predicate The filter predicate to evaluate. /// @return A `Result` containing: /// - `nullptr` if the predicate cannot be evaluated by this index (e.g., field has /// no index), /// - A `std::shared_ptr` if evaluation succeeds. /// The `GlobalIndexResult` indicates the matching rows (e.g., via row ID bitmaps). 
virtual Result> Evaluate( - const std::shared_ptr& predicate, - const std::shared_ptr& vector_search) = 0; + const std::shared_ptr& predicate) = 0; }; } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_evaluator_impl.cpp b/src/paimon/core/global_index/global_index_evaluator_impl.cpp index 12093fba1..bb123acd0 100644 --- a/src/paimon/core/global_index/global_index_evaluator_impl.cpp +++ b/src/paimon/core/global_index/global_index_evaluator_impl.cpp @@ -23,17 +23,11 @@ namespace paimon { Result> GlobalIndexEvaluatorImpl::Evaluate( - const std::shared_ptr& predicate, - const std::shared_ptr& vector_search) { + const std::shared_ptr& predicate) { std::shared_ptr compound_result; if (predicate) { PAIMON_ASSIGN_OR_RAISE(compound_result, EvaluatePredicate(predicate)); } - if (vector_search) { - PAIMON_ASSIGN_OR_RAISE( - compound_result, - EvaluateVectorSearch(vector_search, /*predicate_result=*/compound_result)); - } return compound_result; } @@ -53,42 +47,6 @@ Result>> GlobalIndexEvaluatorImpl return readers; } -Result> GlobalIndexEvaluatorImpl::EvaluateVectorSearch( - const std::shared_ptr& vector_search, - const std::shared_ptr& predicate_result) { - PAIMON_ASSIGN_OR_RAISE(std::vector> readers, - GetIndexReaders(vector_search->field_name)); - if (readers.empty()) { - return predicate_result; - } - if (readers.size() > 1) { - return Status::Invalid("Vector search cannot have multiple global indexes"); - } - const auto& vector_search_reader = readers[0]; - if (predicate_result && vector_search->pre_filter != nullptr) { - return Status::Invalid("Predicate result and pre_filter in VectorSearch conflict"); - } - auto final_vector_search = vector_search; - if (predicate_result) { - auto bitmap_global_index_result = - std::dynamic_pointer_cast(predicate_result); - if (!bitmap_global_index_result) { - return Status::Invalid( - "The pre_filter of vector search only supports BitmapGlobalIndexResult"); - } - PAIMON_ASSIGN_OR_RAISE(const RoaringBitmap64* 
bitmap, - bitmap_global_index_result->GetBitmap()); - assert(bitmap); - final_vector_search = vector_search->ReplacePreFilter( - [bitmap_global_index_result, bitmap](int64_t row_id) -> bool { - return bitmap->Contains(row_id); - }); - } - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr scored_result, - vector_search_reader->VisitVectorSearch(final_vector_search)); - return scored_result; -} - Result> GlobalIndexEvaluatorImpl::EvaluatePredicate( const std::shared_ptr& predicate) { if (predicate == nullptr) { @@ -101,6 +59,12 @@ Result> GlobalIndexEvaluatorImpl::EvaluatePre const std::string& field_name = leaf_predicate->FieldName(); PAIMON_ASSIGN_OR_RAISE(std::vector> readers, GetIndexReaders(field_name)); + if (readers.empty()) { + // No usable index for this field within the requested range. Treat as "no + // pushdown available" so the upstream falls back to a full scan instead of + // wrongly producing an empty result. + return std::shared_ptr(nullptr); + } // calculate compound result as field may has multiple indexes std::shared_ptr compound_result; for (const auto& index_reader : readers) { diff --git a/src/paimon/core/global_index/global_index_evaluator_impl.h b/src/paimon/core/global_index/global_index_evaluator_impl.h index bf4f1178c..7cae89193 100644 --- a/src/paimon/core/global_index/global_index_evaluator_impl.h +++ b/src/paimon/core/global_index/global_index_evaluator_impl.h @@ -31,6 +31,8 @@ namespace paimon { class GlobalIndexEvaluatorImpl : public GlobalIndexEvaluator { public: + /// Creates the underlying readers for the given field. Returns an empty vector when the field + /// has no usable index. 
using IndexReadersCreator = std::function>>(int32_t)>; @@ -39,14 +41,9 @@ class GlobalIndexEvaluatorImpl : public GlobalIndexEvaluator { : table_schema_(table_schema), create_index_readers_(std::move(create_index_readers)) {} Result> Evaluate( - const std::shared_ptr& predicate, - const std::shared_ptr& vector_search) override; + const std::shared_ptr& predicate) override; private: - Result> EvaluateVectorSearch( - const std::shared_ptr& vector_search, - const std::shared_ptr& predicate_result); - Result> EvaluatePredicate( const std::shared_ptr& predicate); diff --git a/src/paimon/core/global_index/global_index_scan.cpp b/src/paimon/core/global_index/global_index_scan.cpp index 311f06376..aa54a3a6e 100644 --- a/src/paimon/core/global_index/global_index_scan.cpp +++ b/src/paimon/core/global_index/global_index_scan.cpp @@ -20,7 +20,9 @@ #include "paimon/core/global_index/global_index_scan_impl.h" #include "paimon/core/operation/file_store_scan.h" #include "paimon/core/schema/schema_manager.h" +#include "paimon/core/utils/file_store_path_factory.h" #include "paimon/core/utils/snapshot_manager.h" + namespace paimon { namespace { Result> LoadSchema(const std::string& root_path, @@ -67,7 +69,7 @@ Result> GlobalIndexScan::Create( const std::string& root_path, const std::optional& snapshot_id, const std::optional>>& partitions, const std::map& options, - const std::shared_ptr& file_system, + const std::shared_ptr& file_system, const std::shared_ptr& executor, const std::shared_ptr& memory_pool) { if (partitions && partitions.value().empty()) { return Status::Invalid( @@ -87,14 +89,14 @@ Result> GlobalIndexScan::Create( arrow_schema, partitions.value())); } PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, LoadSnapshot(root_path, snapshot_id, core_options)); - return std::make_unique(root_path, table_schema, snapshot, - partition_filters, core_options, pool); + return GlobalIndexScanImpl::Create(root_path, table_schema, snapshot, partition_filters, + core_options, executor, pool); 
} Result> GlobalIndexScan::Create( const std::string& root_path, const std::optional& snapshot_id, const std::shared_ptr& partitions, const std::map& options, - const std::shared_ptr& file_system, + const std::shared_ptr& file_system, const std::shared_ptr& executor, const std::shared_ptr& memory_pool) { std::shared_ptr partition_filters; if (partitions) { @@ -109,8 +111,8 @@ Result> GlobalIndexScan::Create( PAIMON_ASSIGN_OR_RAISE(CoreOptions core_options, MergeOptions(table_schema, options, file_system)); PAIMON_ASSIGN_OR_RAISE(Snapshot snapshot, LoadSnapshot(root_path, snapshot_id, core_options)); - return std::make_unique(root_path, table_schema, snapshot, - partition_filters, core_options, pool); + return GlobalIndexScanImpl::Create(root_path, table_schema, snapshot, partition_filters, + core_options, executor, pool); } } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan_impl.cpp b/src/paimon/core/global_index/global_index_scan_impl.cpp index 89851b4ae..de7ce1b83 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.cpp +++ b/src/paimon/core/global_index/global_index_scan_impl.cpp @@ -16,121 +16,70 @@ #include "paimon/core/global_index/global_index_scan_impl.h" #include -#include +#include #include -#include "paimon/common/executor/future.h" -#include "paimon/core/global_index/row_range_global_index_scanner_impl.h" +#include "arrow/c/bridge.h" +#include "paimon/common/global_index/offset_global_index_reader.h" +#include "paimon/common/global_index/union_global_index_reader.h" +#include "paimon/common/utils/scope_guard.h" +#include "paimon/core/global_index/global_index_evaluator_impl.h" #include "paimon/core/index/index_file_handler.h" #include "paimon/global_index/bitmap_global_index_result.h" +#include "paimon/global_index/global_indexer.h" +#include "paimon/global_index/global_indexer_factory.h" + namespace paimon { -GlobalIndexScanImpl::GlobalIndexScanImpl(const std::string& root_path, - const std::shared_ptr& 
table_schema, - const Snapshot& snapshot, - const std::shared_ptr& partitions, +GlobalIndexScanImpl::GlobalIndexScanImpl(const std::shared_ptr& table_schema, const CoreOptions& options, + const std::shared_ptr& path_factory, + IndexMetaMap&& index_metas, + const std::shared_ptr& executor, const std::shared_ptr& pool) : pool_(pool), - root_path_(root_path), table_schema_(table_schema), - snapshot_(snapshot), - partitions_(partitions), - options_(options) {} + options_(options), + index_file_manager_( + std::make_shared(options.GetFileSystem(), path_factory)), + index_metas_(std::move(index_metas)), + executor_(executor) {} -Result> GlobalIndexScanImpl::CreateRangeScan( - const Range& range) { - PAIMON_RETURN_NOT_OK(Scan()); - std::optional partition; - // field id -> {index type -> entry} - std::map>> filtered_entries; - for (const auto& entry : entries_) { - const auto& global_index_meta = entry.index_file->GetGlobalIndexMeta(); - assert(global_index_meta); - const auto& meta = global_index_meta.value(); - if (Range::HasIntersection(range, Range(meta.row_range_start, meta.row_range_end))) { - if (!partition) { - partition = entry.partition; - } else if (!(partition.value() == entry.partition)) { - return Status::Invalid( - "input range contain multiple partitions, fail to create range scan"); - } - filtered_entries[meta.index_field_id][entry.index_file->IndexType()].push_back(entry); - } - } - std::shared_ptr index_file_path_factory = - path_factory_->CreateGlobalIndexFileFactory(); - return std::make_shared(table_schema_, index_file_path_factory, - filtered_entries, options_, pool_); -} - -Result> GlobalIndexScanImpl::GetRowRangeList() { - PAIMON_RETURN_NOT_OK(Scan()); - std::map> index_type_to_ranges; - std::vector index_ranges; - index_ranges.reserve(entries_.size()); - for (const auto& entry : entries_) { - const auto& global_index_meta = entry.index_file->GetGlobalIndexMeta(); - assert(global_index_meta); - const auto& index_meta = global_index_meta.value(); - 
Range range(index_meta.row_range_start, index_meta.row_range_end); - index_ranges.push_back(range); - index_type_to_ranges[entry.index_file->IndexType()].push_back(range); - } - std::string check_index_type; - std::vector check_ranges; - // check all type index have same shard ranges - // If index a has [1,10],[20,30] and index b has [1,10],[20,25], it's inconsistent, because - // it is hard to handle the [26,30] range. - for (const auto& [type, ranges] : index_type_to_ranges) { - if (check_index_type.empty()) { - check_index_type = type; - check_ranges = Range::SortAndMergeOverlap(ranges, /*adjacent=*/true); - } else { - auto merged = Range::SortAndMergeOverlap(ranges, /*adjacent=*/true); - if (merged != check_ranges) { - return Status::Invalid( - fmt::format("Inconsistent row ranges among index types: {} and {}", - check_index_type, type)); - } - } - } - return Range::SortAndMergeOverlap(index_ranges, /*adjacent=*/false); -} - -Status GlobalIndexScanImpl::Scan() { - if (initialized_) { - return Status::OK(); - } - auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema_->Fields()); - PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options_.CreateExternalPaths()); +Result> GlobalIndexScanImpl::Create( + const std::string& root_path, const std::shared_ptr& table_schema, + const Snapshot& snapshot, const std::shared_ptr& partitions, + const CoreOptions& options, const std::shared_ptr& executor, + const std::shared_ptr& pool) { + auto arrow_schema = DataField::ConvertDataFieldsToArrowSchema(table_schema->Fields()); + PAIMON_ASSIGN_OR_RAISE(std::vector external_paths, options.CreateExternalPaths()); PAIMON_ASSIGN_OR_RAISE(std::optional global_index_external_path, - options_.CreateGlobalIndexExternalPath()); + options.CreateGlobalIndexExternalPath()); PAIMON_ASSIGN_OR_RAISE( - path_factory_, + std::shared_ptr file_store_path_factory, FileStorePathFactory::Create( - root_path_, arrow_schema, table_schema_->PartitionKeys(), - 
options_.GetPartitionDefaultName(), options_.GetFileFormat()->Identifier(), - options_.DataFilePrefix(), options_.LegacyPartitionNameEnabled(), external_paths, - global_index_external_path, options_.IndexFileInDataFileDir(), pool_)); + root_path, arrow_schema, table_schema->PartitionKeys(), + options.GetPartitionDefaultName(), options.GetFileFormat()->Identifier(), + options.DataFilePrefix(), options.LegacyPartitionNameEnabled(), external_paths, + global_index_external_path, options.IndexFileInDataFileDir(), pool)); + std::shared_ptr path_factory = + file_store_path_factory->CreateGlobalIndexFileFactory(); PAIMON_ASSIGN_OR_RAISE( std::unique_ptr index_manifest_file, - IndexManifestFile::Create(options_.GetFileSystem(), options_.GetManifestFormat(), - options_.GetManifestCompression(), path_factory_, - options_.GetBucket(), pool_, options_)); - auto index_file_handler = - std::make_unique(options_.GetFileSystem(), std::move(index_manifest_file), - std::make_shared(path_factory_), - options_.DeletionVectorsBitmap64(), pool_); + IndexManifestFile::Create(options.GetFileSystem(), options.GetManifestFormat(), + options.GetManifestCompression(), file_store_path_factory, + options.GetBucket(), pool, options)); + auto index_file_handler = std::make_unique( + options.GetFileSystem(), std::move(index_manifest_file), + std::make_shared(file_store_path_factory), + options.DeletionVectorsBitmap64(), pool); PAIMON_ASSIGN_OR_RAISE(std::vector partition_fields, - table_schema_->GetFields(table_schema_->PartitionKeys())); + table_schema->GetFields(table_schema->PartitionKeys())); auto partition_schema = DataField::ConvertDataFieldsToArrowSchema(partition_fields); std::function(const IndexManifestEntry&)> filter = [&](const IndexManifestEntry& entry) -> Result { - if (partitions_) { - PAIMON_ASSIGN_OR_RAISE(bool saved, - partitions_->Test(partition_schema, entry.partition)); + if (partitions) { + PAIMON_ASSIGN_OR_RAISE(bool saved, partitions->Test(partition_schema, 
entry.partition)); if (!saved) { return false; } @@ -140,76 +89,124 @@ Status GlobalIndexScanImpl::Scan() { } return true; }; - PAIMON_ASSIGN_OR_RAISE(entries_, index_file_handler->Scan(snapshot_, filter)); - initialized_ = true; - return Status::OK(); + PAIMON_ASSIGN_OR_RAISE(std::vector entries, + index_file_handler->Scan(snapshot, filter)); + IndexMetaMap index_metas; + for (const auto& entry : entries) { + auto index_file_meta = entry.index_file; + const auto& index_meta = index_file_meta->GetGlobalIndexMeta(); + assert(index_meta); + Range range(index_meta->row_range_start, index_meta->row_range_end); + index_metas[index_meta->index_field_id][index_file_meta->IndexType()][range].push_back( + index_file_meta); + } + auto final_executor = executor; + if (!final_executor) { + std::optional thread_num = options.GetGlobalIndexThreadNum(); + if (!thread_num) { + uint32_t cpu_count = std::thread::hardware_concurrency(); + thread_num = cpu_count > 0 ? static_cast(cpu_count) : 1; + } + if (thread_num.value() <= 0) { + return Status::Invalid("global-index.thread-num must be a positive integer"); + } + final_executor = CreateDefaultExecutor(static_cast(thread_num.value())); + } + return std::unique_ptr(new GlobalIndexScanImpl( + table_schema, options, path_factory, std::move(index_metas), final_executor, pool)); } -Result> GlobalIndexScanImpl::ParallelScan( - const std::vector& ranges, const std::shared_ptr& predicate, - const std::shared_ptr& vector_search, const std::shared_ptr& executor) { - std::vector> range_scanners; - range_scanners.reserve(ranges.size()); - for (const auto& range : ranges) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr scanner, - CreateRangeScan(range)); - auto scanner_impl = std::dynamic_pointer_cast(scanner); - if (!scanner_impl) { - return Status::Invalid( - "invalid RowRangeGlobalIndexScanner, fail to cast to " - "RowRangeGlobalIndexScannerImpl"); - } - range_scanners.push_back(scanner_impl); + +Result> GlobalIndexScanImpl::GetOrCreateIndexEvaluator() { + if (evaluator_) { + return evaluator_; } + GlobalIndexEvaluatorImpl::IndexReadersCreator create_index_readers 
= + [this](int32_t field_id) -> Result>> { + return CreateReaders(field_id, /*row_range_index=*/std::nullopt); + }; + evaluator_ = std::make_shared(table_schema_, create_index_readers); + return evaluator_; +} - std::vector>>> futures; - for (size_t i = 0; i < range_scanners.size(); i++) { - const auto& scanner = range_scanners[i]; - const auto& range = ranges[i]; - auto search_index = [&scanner, &predicate, &vector_search, - &range]() -> Result> { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr evaluator, - scanner->CreateIndexEvaluator()); - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr index_result, - evaluator->Evaluate(predicate, vector_search)); - if (!index_result) { - return index_result; - } - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr result_with_offset, - index_result->AddOffset(range.from)); - return result_with_offset; - }; - futures.push_back(Via(executor.get(), search_index)); +Result>> GlobalIndexScanImpl::CreateReaders( + int32_t field_id, const std::optional& row_range_index) const { + PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_id)); + return CreateReaders(field, row_range_index); +} + +Result>> GlobalIndexScanImpl::CreateReaders( + const std::string& field_name, const std::optional& row_range_index) const { + PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_name)); + return CreateReaders(field, row_range_index); +} + +Result>> GlobalIndexScanImpl::CreateReaders( + const DataField& field, const std::optional& row_range_index) const { + auto field_iter = index_metas_.find(field.Id()); + if (field_iter == index_metas_.end()) { + return std::vector>(); } - auto collected_results = CollectAll(futures); + const auto& index_type_to_metas = field_iter->second; + std::vector> readers; + readers.reserve(index_type_to_metas.size()); + for (const auto& [index_type, range_to_metas] : index_type_to_metas) { + PAIMON_ASSIGN_OR_RAISE(std::unique_ptr indexer, + GlobalIndexerFactory::Get(index_type, options_.ToMap())); + if (!indexer) 
{ + continue; + } + std::vector> union_readers; + union_readers.reserve(range_to_metas.size()); + for (const auto& [range, metas] : range_to_metas) { + if (row_range_index && !row_range_index->Intersects(range.from, range.to)) { + continue; + } + // TODO(xinyu.lxy): c_arrow_schema may contains additional associated fields. + auto arrow_field = DataField::ConvertDataFieldToArrowField(field); + auto arrow_schema = arrow::schema({arrow_field}); - // collect inner result and check all null - bool all_null = true; - std::vector> results; - for (auto& result : collected_results) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr inner_result, result); - if (inner_result) { - all_null = false; + ArrowSchema c_arrow_schema; + PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*arrow_schema, &c_arrow_schema)); + auto index_io_metas = ToGlobalIndexIOMetas(metas); + ScopeGuard guard([&]() { ArrowSchemaRelease(&c_arrow_schema); }); + PAIMON_ASSIGN_OR_RAISE( + std::shared_ptr index_reader, + indexer->CreateReader(&c_arrow_schema, index_file_manager_, index_io_metas, pool_)); + union_readers.push_back( + std::make_shared(std::move(index_reader), range.from)); } - results.push_back(std::move(inner_result)); + if (union_readers.empty()) { + continue; + } + readers.push_back( + std::make_shared(std::move(union_readers), executor_)); } - if (all_null) { - return std::shared_ptr(nullptr); + return readers; +} + +std::vector GlobalIndexScanImpl::ToGlobalIndexIOMetas( + const std::vector>& metas) const { + std::vector index_io_metas; + index_io_metas.reserve(metas.size()); + for (const auto& meta : metas) { + index_io_metas.push_back(ToGlobalIndexIOMeta(meta)); } + return index_io_metas; +} - // union result from multiple ranges - std::shared_ptr final_global_index_result; +GlobalIndexIOMeta GlobalIndexScanImpl::ToGlobalIndexIOMeta( + const std::shared_ptr& index_meta) const { + assert(index_meta->GetGlobalIndexMeta()); + const auto& global_index_meta = 
index_meta->GetGlobalIndexMeta().value(); + return {index_file_manager_->ToPath(index_meta), index_meta->FileSize(), + global_index_meta.index_meta}; +} - for (size_t i = 0; i < results.size(); ++i) { - std::shared_ptr result = - results[i] ? results[i] : BitmapGlobalIndexResult::FromRanges({ranges[i]}); - if (!final_global_index_result) { - final_global_index_result = result; - } else { - PAIMON_ASSIGN_OR_RAISE(final_global_index_result, - final_global_index_result->Or(result)); - } - } - return final_global_index_result; +Result> GlobalIndexScanImpl::Scan( + const std::shared_ptr& predicate) { + PAIMON_ASSIGN_OR_RAISE(std::shared_ptr evaluator, + GetOrCreateIndexEvaluator()); + return evaluator->Evaluate(predicate); } + } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_scan_impl.h b/src/paimon/core/global_index/global_index_scan_impl.h index 1f7e23cd0..8988d32f4 100644 --- a/src/paimon/core/global_index/global_index_scan_impl.h +++ b/src/paimon/core/global_index/global_index_scan_impl.h @@ -23,48 +23,64 @@ #include "paimon/common/predicate/predicate_filter.h" #include "paimon/core/core_options.h" -#include "paimon/core/manifest/index_manifest_entry.h" +#include "paimon/core/global_index/global_index_evaluator.h" +#include "paimon/core/global_index/global_index_file_manager.h" +#include "paimon/core/index/index_file_meta.h" +#include "paimon/core/index/index_path_factory.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/snapshot.h" -#include "paimon/core/utils/file_store_path_factory.h" -#include "paimon/core/utils/snapshot_manager.h" +#include "paimon/global_index/global_index_io_meta.h" #include "paimon/global_index/global_index_scan.h" namespace paimon { class GlobalIndexScanImpl : public GlobalIndexScan { public: - GlobalIndexScanImpl(const std::string& root_path, - const std::shared_ptr& table_schema, const Snapshot& snapshot, - const std::shared_ptr& partitions, - const CoreOptions& options, const 
std::shared_ptr& pool); + static Result> Create( + const std::string& root_path, const std::shared_ptr& table_schema, + const Snapshot& snapshot, const std::shared_ptr& partitions, + const CoreOptions& options, const std::shared_ptr& executor, + const std::shared_ptr& pool); - Result> CreateRangeScan( - const Range& range) override; + Result> Scan(const std::shared_ptr& predicate); - Result> GetRowRangeList() override; + Result>> CreateReaders( + const std::string& field_name, + const std::optional& row_range_index) const override; - const Snapshot& GetSnapshot() const { - return snapshot_; - } - - Result> ParallelScan( - const std::vector& ranges, const std::shared_ptr& predicate, - const std::shared_ptr& vector_search, - const std::shared_ptr& executor); + Result>> CreateReaders( + int32_t field_id, const std::optional& row_range_index) const override; private: - Status Scan(); + /// (id->index_type->row_range) -> index meta list + using IndexMetaMap = + std::map>>>>; + + GlobalIndexScanImpl(const std::shared_ptr& table_schema, + const CoreOptions& options, + const std::shared_ptr& path_factory, + IndexMetaMap&& index_metas, const std::shared_ptr& executor, + const std::shared_ptr& pool); + + Result> GetOrCreateIndexEvaluator(); + + Result>> CreateReaders( + const DataField& field, const std::optional& row_range_index) const; + + std::vector ToGlobalIndexIOMetas( + const std::vector>& metas) const; + + GlobalIndexIOMeta ToGlobalIndexIOMeta(const std::shared_ptr& index_meta) const; private: - bool initialized_ = false; std::shared_ptr pool_; std::string root_path_; std::shared_ptr table_schema_; - Snapshot snapshot_; - std::shared_ptr partitions_; CoreOptions options_; - std::shared_ptr path_factory_; - std::vector entries_; + std::shared_ptr index_file_manager_; + IndexMetaMap index_metas_; + std::shared_ptr executor_; + std::shared_ptr evaluator_; }; } // namespace paimon diff --git a/src/paimon/core/global_index/global_index_write_task.cpp 
b/src/paimon/core/global_index/global_index_write_task.cpp index cd0e3c6e7..5ee425f86 100644 --- a/src/paimon/core/global_index/global_index_write_task.cpp +++ b/src/paimon/core/global_index/global_index_write_task.cpp @@ -123,7 +123,12 @@ Result> BuildIndex(const std::string& field_name, std::vector relative_row_ids; relative_row_ids.reserve(typed_row_id_array->length()); for (int64_t i = 0; i < typed_row_id_array->length(); i++) { - relative_row_ids.push_back(typed_row_id_array->Value(i) - range.from); + int64_t row_id = typed_row_id_array->Value(i); + if (row_id < range.from || row_id > range.to) { + return Status::Invalid("invalid row id {}, out of range [{}, {}]", row_id, + range.from, range.to); + } + relative_row_ids.push_back(row_id - range.from); } PAIMON_ASSIGN_OR_RAISE_FROM_ARROW(std::shared_ptr new_array, arrow::StructArray::Make({indexed_array}, {field_name})); diff --git a/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp b/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp deleted file mode 100644 index 846a3bd53..000000000 --- a/src/paimon/core/global_index/row_range_global_index_scanner_impl.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "paimon/core/global_index/row_range_global_index_scanner_impl.h" - -#include -#include -#include -#include - -#include "arrow/c/bridge.h" -#include "arrow/c/helpers.h" -#include "paimon/common/utils/scope_guard.h" -#include "paimon/core/global_index/global_index_evaluator_impl.h" -#include "paimon/global_index/global_indexer.h" -#include "paimon/global_index/global_indexer_factory.h" -namespace paimon { -RowRangeGlobalIndexScannerImpl::RowRangeGlobalIndexScannerImpl( - const std::shared_ptr& table_schema, - const std::shared_ptr& path_factory, - const RowRangeGlobalIndexScannerImpl::IndexManifestEntryGroup& grouped_entries, - const CoreOptions& options, const std::shared_ptr& pool) - : pool_(pool), - table_schema_(table_schema), - options_(options), - grouped_entries_(grouped_entries), - index_file_manager_( - std::make_shared(options.GetFileSystem(), path_factory)) {} - -Result> RowRangeGlobalIndexScannerImpl::CreateIndexEvaluator() - const { - GlobalIndexEvaluatorImpl::IndexReadersCreator create_index_readers = - [scanner = shared_from_this()]( - int32_t field_id) -> Result>> { - return scanner->CreateReaders(field_id); - }; - return std::make_shared(table_schema_, create_index_readers); -} - -Result> RowRangeGlobalIndexScannerImpl::CreateReader( - const std::string& field_name, const std::string& index_type) const { - PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_name)); - auto field_iter = grouped_entries_.find(field.Id()); - if (field_iter == grouped_entries_.end()) { - return std::shared_ptr(); - } - const auto& index_type_to_entries = field_iter->second; - auto entry_iter = index_type_to_entries.find(index_type); - if (entry_iter == index_type_to_entries.end()) { - return std::shared_ptr(); - } - const auto& entries = entry_iter->second; - return CreateReader(field, index_type, entries); -} - -Result>> -RowRangeGlobalIndexScannerImpl::CreateReaders(const std::string& field_name) const { - 
PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_name)); - return CreateReaders(field); -} - -Result>> -RowRangeGlobalIndexScannerImpl::CreateReaders(int32_t field_id) const { - PAIMON_ASSIGN_OR_RAISE(DataField field, table_schema_->GetField(field_id)); - return CreateReaders(field); -} - -Result>> -RowRangeGlobalIndexScannerImpl::CreateReaders(const DataField& field) const { - auto field_iter = grouped_entries_.find(field.Id()); - if (field_iter == grouped_entries_.end()) { - return std::vector>(); - } - const auto& index_type_to_entries = field_iter->second; - std::vector> readers; - readers.reserve(index_type_to_entries.size()); - for (const auto& [index_type, entries] : index_type_to_entries) { - PAIMON_ASSIGN_OR_RAISE(std::shared_ptr reader, - CreateReader(field, index_type, entries)); - if (reader) { - readers.push_back(std::move(reader)); - } - } - return readers; -} - -Result> RowRangeGlobalIndexScannerImpl::CreateReader( - const DataField& field, const std::string& index_type, - const std::vector& entries) const { - PAIMON_ASSIGN_OR_RAISE(std::unique_ptr indexer, - GlobalIndexerFactory::Get(index_type, options_.ToMap())); - if (!indexer) { - return std::shared_ptr(); - } - // TODO(xinyu.lxy): c_arrow_schema may contains additional associated fields. 
- auto arrow_field = DataField::ConvertDataFieldToArrowField(field); - auto arrow_schema = arrow::schema({arrow_field}); - - ArrowSchema c_arrow_schema; - PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportSchema(*arrow_schema, &c_arrow_schema)); - auto index_io_metas = ToGlobalIndexIOMetas(entries); - ScopeGuard guard([&]() { ArrowSchemaRelease(&c_arrow_schema); }); - return indexer->CreateReader(&c_arrow_schema, index_file_manager_, index_io_metas, pool_); -} - -std::vector RowRangeGlobalIndexScannerImpl::ToGlobalIndexIOMetas( - const std::vector& entries) const { - std::vector index_io_metas; - index_io_metas.reserve(entries.size()); - for (const auto& entry : entries) { - index_io_metas.push_back(ToGlobalIndexIOMeta(entry)); - } - return index_io_metas; -} - -GlobalIndexIOMeta RowRangeGlobalIndexScannerImpl::ToGlobalIndexIOMeta( - const IndexManifestEntry& entry) const { - const auto& index_file = entry.index_file; - assert(index_file->GetGlobalIndexMeta()); - const auto& global_index_meta = index_file->GetGlobalIndexMeta().value(); - return {index_file_manager_->ToPath(index_file), index_file->FileSize(), - global_index_meta.index_meta}; -} - -} // namespace paimon diff --git a/src/paimon/core/global_index/row_range_global_index_scanner_impl.h b/src/paimon/core/global_index/row_range_global_index_scanner_impl.h deleted file mode 100644 index 41b26ea05..000000000 --- a/src/paimon/core/global_index/row_range_global_index_scanner_impl.h +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include - -#include "paimon/core/core_options.h" -#include "paimon/core/global_index/global_index_evaluator.h" -#include "paimon/core/global_index/global_index_file_manager.h" -#include "paimon/core/manifest/index_manifest_entry.h" -#include "paimon/core/schema/table_schema.h" -#include "paimon/global_index/global_index_io_meta.h" -#include "paimon/global_index/row_range_global_index_scanner.h" -namespace paimon { -class RowRangeGlobalIndexScannerImpl - : public RowRangeGlobalIndexScanner, - public std::enable_shared_from_this { - public: - using IndexManifestEntryGroup = - std::map>>; - - RowRangeGlobalIndexScannerImpl(const std::shared_ptr& table_schema, - const std::shared_ptr& path_factory, - const IndexManifestEntryGroup& grouped_entries, - const CoreOptions& options, - const std::shared_ptr& pool); - - Result> CreateIndexEvaluator() const; - - /// @return nullptr if global index reader not exist or plugin mismatch - Result> CreateReader( - const std::string& field_name, const std::string& index_type) const override; - - Result>> CreateReaders( - const std::string& field_name) const override; - - private: - Result>> CreateReaders(int32_t field_id) const; - Result>> CreateReaders( - const DataField& field) const; - - Result> CreateReader( - const DataField& field, const std::string& index_type, - const std::vector& entries) const; - - std::vector ToGlobalIndexIOMetas( - const std::vector& entries) const; - - GlobalIndexIOMeta ToGlobalIndexIOMeta(const IndexManifestEntry& entry) const; - - private: - std::shared_ptr pool_; - std::shared_ptr table_schema_; - CoreOptions options_; - IndexManifestEntryGroup grouped_entries_; - std::shared_ptr index_file_manager_; -}; - -} // namespace paimon diff --git a/src/paimon/core/operation/abstract_file_store_write.cpp 
b/src/paimon/core/operation/abstract_file_store_write.cpp index 2ef4c8215..712dbfe75 100644 --- a/src/paimon/core/operation/abstract_file_store_write.cpp +++ b/src/paimon/core/operation/abstract_file_store_write.cpp @@ -286,8 +286,7 @@ Result> AbstractFileStoreWrite::ScanExistingFileMe partition_filters.push_back(part_values_map); } auto scan_filter = std::make_shared( - /*predicate=*/nullptr, partition_filters, std::optional(bucket), - /*vector_search=*/nullptr); + /*predicate=*/nullptr, partition_filters, std::optional(bucket)); PAIMON_ASSIGN_OR_RAISE(std::unique_ptr scan, CreateFileStoreScan(scan_filter)); std::shared_ptr index_file_handler; diff --git a/src/paimon/core/operation/data_evolution_file_store_scan.cpp b/src/paimon/core/operation/data_evolution_file_store_scan.cpp index 07ded98b2..e9bb68a68 100644 --- a/src/paimon/core/operation/data_evolution_file_store_scan.cpp +++ b/src/paimon/core/operation/data_evolution_file_store_scan.cpp @@ -26,9 +26,9 @@ #include "paimon/common/utils/range_helper.h" namespace paimon { Result DataEvolutionFileStoreScan::FilterEntryByRowRanges( - const ManifestEntry& entry, const std::optional>& row_ranges) { - // If row ranges is null, all entries should be kept - if (!row_ranges) { + const ManifestEntry& entry, const std::optional& row_range_index) { + // If row range index is null, all entries should be kept + if (!row_range_index) { return true; } // If firstRowId does not exist, keep the entry @@ -39,34 +39,12 @@ Result DataEvolutionFileStoreScan::FilterEntryByRowRanges( // Check if any value in indices is in the range [firstRowId, firstRowId + rowCount - 1] int64_t end_row_id = first_row_id.value() + entry.File()->row_count - 1; - Range file_range(first_row_id.value(), end_row_id); - for (const auto& row_range : row_ranges.value()) { - if (Range::HasIntersection(file_range, row_range)) { - return true; - } - } - // No matching indices found, skip this entry - return false; + return 
row_range_index->Intersects(first_row_id.value(), end_row_id); } Result DataEvolutionFileStoreScan::FilterByStats(const ManifestEntry& entry) const { - return FilterEntryByRowRanges(entry, row_ranges_); -} - -std::vector DataEvolutionFileStoreScan::PostFilterManifests( - std::vector&& manifests) const { - if (!row_ranges_) { - return std::move(manifests); - } - std::vector result_metas; - result_metas.reserve(manifests.size()); - for (auto& manifest : manifests) { - if (FilterManifestByRowRanges(manifest, row_ranges_)) { - result_metas.push_back(std::move(manifest)); - } - } - return result_metas; + return FilterEntryByRowRanges(entry, row_range_index_); } Result> DataEvolutionFileStoreScan::PostFilterManifestEntries( @@ -101,26 +79,6 @@ Result> DataEvolutionFileStoreScan::PostFilterManifes return result_entries; } -bool DataEvolutionFileStoreScan::FilterManifestByRowRanges( - const ManifestFileMeta& manifest, const std::optional>& row_ranges) { - if (!row_ranges) { - return true; - } - std::optional min = manifest.MinRowId(); - std::optional max = manifest.MaxRowId(); - if (!min || !max) { - return true; - } - - Range manifest_range(min.value(), max.value()); - for (const auto& range : row_ranges.value()) { - if (Range::HasIntersection(manifest_range, range)) { - return true; - } - } - return false; -} - Result DataEvolutionFileStoreScan::FilterByStatsWithSameRowId( const std::vector& entries) const { if (entries.empty()) { diff --git a/src/paimon/core/operation/data_evolution_file_store_scan.h b/src/paimon/core/operation/data_evolution_file_store_scan.h index 7c9fc6df9..9dbd2cddd 100644 --- a/src/paimon/core/operation/data_evolution_file_store_scan.h +++ b/src/paimon/core/operation/data_evolution_file_store_scan.h @@ -64,9 +64,6 @@ class DataEvolutionFileStoreScan : public FileStoreScan { return scan; } - std::vector PostFilterManifests( - std::vector&& manifests) const override; - Result> PostFilterManifestEntries( std::vector&& entries) const override; @@ 
-88,10 +85,8 @@ class DataEvolutionFileStoreScan : public FileStoreScan { Result FilterByStatsWithSameRowId(const std::vector& entries) const; - static bool FilterManifestByRowRanges(const ManifestFileMeta& manifest, - const std::optional>& row_ranges); static Result FilterEntryByRowRanges(const ManifestEntry& entry, - const std::optional>& row_ranges); + const std::optional& row_range_index); static Result> EvolutionStats( const std::vector& entries, const std::shared_ptr& table_schema, diff --git a/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp b/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp index 12e6681ab..49b8e8903 100644 --- a/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp +++ b/src/paimon/core/operation/data_evolution_file_store_scan_test.cpp @@ -558,15 +558,9 @@ TEST_F(DataEvolutionFileStoreScanTest, TestFilterEntryByRowRanges) { { // row_ids is null ASSERT_OK_AND_ASSIGN(bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::nullopt)); + entry, /*row_range_index=*/std::nullopt)); ASSERT_TRUE(exist); } - { - // row_ids is empty - ASSERT_OK_AND_ASSIGN(bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::vector())); - ASSERT_FALSE(exist); - } { auto file_without_first_row_id = std::make_shared( "data-0.orc", /*file_size=*/645, @@ -582,59 +576,37 @@ TEST_F(DataEvolutionFileStoreScanTest, TestFilterEntryByRowRanges) { ManifestEntry entry_without_first_row_id(FileKind::Add(), BinaryRow::EmptyRow(), /*bucket=*/0, /*total_buckets=*/1, file_without_first_row_id); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0l, 0l)}))); // first row id is null - ASSERT_OK_AND_ASSIGN( - bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry_without_first_row_id, - /*row_ranges=*/std::optional>({Range(0l, 0l)}))); + ASSERT_OK_AND_ASSIGN(bool exist, 
DataEvolutionFileStoreScan::FilterEntryByRowRanges( + entry_without_first_row_id, row_range_index)); ASSERT_TRUE(exist); } { - ASSERT_OK_AND_ASSIGN(bool exist, - DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::optional>( - {Range(0l, 0l), Range(10l, 10l)}))); + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0l, 0l), Range(10l, 10l)}))); + ASSERT_OK_AND_ASSIGN( + bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges(entry, row_range_index)); ASSERT_FALSE(exist); } { - ASSERT_OK_AND_ASSIGN(bool exist, - DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::optional>( - {Range(0l, 0l), Range(101l, 101l)}))); + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0l, 0l), Range(101l, 101l)}))); + ASSERT_OK_AND_ASSIGN( + bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges(entry, row_range_index)); ASSERT_TRUE(exist); } { - ASSERT_OK_AND_ASSIGN(bool exist, - DataEvolutionFileStoreScan::FilterEntryByRowRanges( - entry, /*row_ranges=*/std::optional>( - {Range(100l, 100l), Range(189l, 189l)}))); + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(100l, 100l), Range(189l, 189l)}))); + ASSERT_OK_AND_ASSIGN( + bool exist, DataEvolutionFileStoreScan::FilterEntryByRowRanges(entry, row_range_index)); ASSERT_TRUE(exist); } } -TEST_F(DataEvolutionFileStoreScanTest, TestFilterManifestByRowRanges) { - // row id [10, 20] - auto manifest1 = - ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, - /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), - /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, - /*min_level=*/0, /*max_level=*/0, - /*min_row_id=*/10, /*max_row_id=*/20); - ASSERT_TRUE(DataEvolutionFileStoreScan::FilterManifestByRowRanges(manifest1, std::nullopt)); - ASSERT_FALSE( - 
DataEvolutionFileStoreScan::FilterManifestByRowRanges(manifest1, std::vector())); - ASSERT_TRUE(DataEvolutionFileStoreScan::FilterManifestByRowRanges( - manifest1, std::optional>({Range(0, 15), Range(100, 200)}))); - ASSERT_FALSE(DataEvolutionFileStoreScan::FilterManifestByRowRanges( - manifest1, std::optional>({Range(0, 5), Range(100, 200)}))); - - auto manifest2 = - ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, - /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), - /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, - /*min_level=*/0, /*max_level=*/0, - /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); - ASSERT_TRUE(DataEvolutionFileStoreScan::FilterManifestByRowRanges( - manifest2, std::optional>({Range(0, 0)}))); -} } // namespace paimon::test diff --git a/src/paimon/core/operation/file_store_commit_impl.cpp b/src/paimon/core/operation/file_store_commit_impl.cpp index 87bcfbcd0..c0e545c34 100644 --- a/src/paimon/core/operation/file_store_commit_impl.cpp +++ b/src/paimon/core/operation/file_store_commit_impl.cpp @@ -313,9 +313,8 @@ Result FileStoreCommitImpl::GetLastCommitTableRequest() { Result> FileStoreCommitImpl::GetAllFiles( const Snapshot& snapshot, const std::vector>& partitions) { - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, partitions, - /*bucket_filter=*/std::nullopt, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, partitions, + /*bucket_filter=*/std::nullopt); PAIMON_ASSIGN_OR_RAISE( auto scan, AppendOnlyFileStoreScan::Create( snapshot_manager_, schema_manager_, manifest_list_, manifest_file_, @@ -519,9 +518,8 @@ Result> FileStoreCommitImpl::ReadAllEntriesFromChange const std::set>& partitions) const { std::vector> partition_filters(partitions.begin(), partitions.end()); - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, partition_filters, - /*bucket_filter=*/std::nullopt, /*vector_search=*/nullptr); + auto 
scan_filter = std::make_shared(/*predicate=*/nullptr, partition_filters, + /*bucket_filter=*/std::nullopt); PAIMON_ASSIGN_OR_RAISE( auto scan, AppendOnlyFileStoreScan::Create( snapshot_manager_, schema_manager_, manifest_list_, manifest_file_, diff --git a/src/paimon/core/operation/file_store_scan.cpp b/src/paimon/core/operation/file_store_scan.cpp index 78d639a83..925a3afce 100644 --- a/src/paimon/core/operation/file_store_scan.cpp +++ b/src/paimon/core/operation/file_store_scan.cpp @@ -130,7 +130,6 @@ Result> FileStoreScan::CreatePlan() cons std::vector filtered_manifest_file_metas; PAIMON_RETURN_NOT_OK( ReadManifests(&snapshot, &all_manifest_file_metas, &filtered_manifest_file_metas)); - filtered_manifest_file_metas = PostFilterManifests(std::move(filtered_manifest_file_metas)); std::vector manifest_entries; PAIMON_RETURN_NOT_OK(ReadManifestEntries(filtered_manifest_file_metas, &manifest_entries)); @@ -288,14 +287,31 @@ Result FileStoreScan::FilterManifestFileMeta(const ManifestFileMeta& manif } } // filter by partition filter - if (!partition_filter_) { + + if (partition_filter_) { + SimpleStats stats = manifest.PartitionStats(); + PAIMON_ASSIGN_OR_RAISE( + bool saved, partition_filter_->Test( + partition_schema_, + /*row_count=*/manifest.NumAddedFiles() + manifest.NumDeletedFiles(), + stats.MinValues(), stats.MaxValues(), stats.NullCounts())); + if (!saved) { + return false; + } + } + return FilterManifestByRowRanges(manifest); +} + +bool FileStoreScan::FilterManifestByRowRanges(const ManifestFileMeta& manifest) const { + if (!row_range_index_) { + return true; + } + std::optional min = manifest.MinRowId(); + std::optional max = manifest.MaxRowId(); + if (!min || !max) { return true; } - SimpleStats stats = manifest.PartitionStats(); - return partition_filter_->Test( - partition_schema_, - /*row_count=*/manifest.NumAddedFiles() + manifest.NumDeletedFiles(), stats.MinValues(), - stats.MaxValues(), stats.NullCounts()); + return 
row_range_index_->Intersects(min.value(), max.value()); } Status FileStoreScan::ReadManifestFileMeta(const ManifestFileMeta& manifest, diff --git a/src/paimon/core/operation/file_store_scan.h b/src/paimon/core/operation/file_store_scan.h index e55f07620..46a06e4a7 100644 --- a/src/paimon/core/operation/file_store_scan.h +++ b/src/paimon/core/operation/file_store_scan.h @@ -50,6 +50,7 @@ #include "paimon/result.h" #include "paimon/scan_context.h" #include "paimon/status.h" +#include "paimon/utils/row_range_index.h" namespace arrow { class Schema; @@ -114,8 +115,8 @@ class FileStoreScan { return this; } - FileStoreScan* WithRowRanges(const std::vector& row_ranges) { - row_ranges_ = row_ranges; + FileStoreScan* WithRowRangeIndex(const RowRangeIndex& row_range_index) { + row_range_index_ = row_range_index; return this; } @@ -194,11 +195,6 @@ class FileStoreScan { /// @note Keep this thread-safe. virtual Result FilterByStats(const ManifestEntry& entry) const = 0; - virtual std::vector PostFilterManifests( - std::vector&& manifests) const { - return std::move(manifests); - } - virtual Result> PostFilterManifestEntries( std::vector&& entries) const { return std::move(entries); @@ -252,6 +248,8 @@ class FileStoreScan { Result FilterManifestFileMeta(const ManifestFileMeta& manifest) const; + bool FilterManifestByRowRanges(const ManifestFileMeta& manifest) const; + Status ReadManifestFileMeta(const ManifestFileMeta& manifest, std::vector* entries) const; @@ -261,7 +259,7 @@ class FileStoreScan { std::shared_ptr predicates_; std::shared_ptr schema_; std::shared_ptr table_schema_; - std::optional> row_ranges_; + std::optional row_range_index_; ScanMode scan_mode_ = ScanMode::ALL; CoreOptions core_options_; diff --git a/src/paimon/core/operation/file_store_scan_test.cpp b/src/paimon/core/operation/file_store_scan_test.cpp index 888ee57ff..d2ef0db13 100644 --- a/src/paimon/core/operation/file_store_scan_test.cpp +++ b/src/paimon/core/operation/file_store_scan_test.cpp @@ -129,4 
+129,58 @@ TEST_F(FileStoreScanTest, TestCreatePartitionPredicateWithInvalidPartitionFilter "field invalid does not exist in partition keys"); } +TEST_F(FileStoreScanTest, TestFilterManifestByRowRanges) { + class FakeFileStoreScan : public FileStoreScan { + public: + FakeFileStoreScan(const std::shared_ptr& snapshot_manager, + const std::shared_ptr& schema_manager, + const std::shared_ptr& manifest_list, + const std::shared_ptr& manifest_file, + const std::shared_ptr& table_schema, + const std::shared_ptr& schema, + const CoreOptions& core_options, + const std::shared_ptr& executor, + const std::shared_ptr& pool) + : FileStoreScan(snapshot_manager, schema_manager, manifest_list, manifest_file, + table_schema, schema, core_options, executor, pool) {} + Result FilterByStats(const ManifestEntry& entry) const override { + return false; + } + }; + // row id [10, 20] + auto manifest1 = + ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, + /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), + /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, + /*min_level=*/0, /*max_level=*/0, + /*min_row_id=*/10, /*max_row_id=*/20); + + ASSERT_OK_AND_ASSIGN(CoreOptions options, CoreOptions::FromMap({{}})); + auto file_store_scan = std::make_shared( + /*snapshot_manager=*/nullptr, /*schema_manager=*/nullptr, /*manifest_list=*/nullptr, + /*manifest_file=*/nullptr, /*table_schema=*/nullptr, /*schema=*/nullptr, options, + /*executor=*/CreateDefaultExecutor(), GetDefaultPool()); + ASSERT_TRUE(file_store_scan->FilterManifestByRowRanges(manifest1)); + + ASSERT_OK_AND_ASSIGN( + RowRangeIndex row_range_index, + RowRangeIndex::Create(std::vector({Range(0, 15), Range(100, 200)}))); + file_store_scan->WithRowRangeIndex(row_range_index); + ASSERT_TRUE(file_store_scan->FilterManifestByRowRanges(manifest1)); + + ASSERT_OK_AND_ASSIGN(row_range_index, + RowRangeIndex::Create(std::vector({Range(0, 5), Range(100, 200)}))); + 
file_store_scan->WithRowRangeIndex(row_range_index); + ASSERT_FALSE(file_store_scan->FilterManifestByRowRanges(manifest1)); + + auto manifest2 = + ManifestFileMeta("manifest-65b0d403-a1bc-4157-b242-bff73c46596d-0", /*file_size=*/2779, + /*num_added_files=*/1, /*num_deleted_files=*/0, SimpleStats::EmptyStats(), + /*schema_id=*/0, /*min_bucket=*/0, /*max_bucket=*/0, + /*min_level=*/0, /*max_level=*/0, + /*min_row_id=*/std::nullopt, /*max_row_id=*/std::nullopt); + ASSERT_OK_AND_ASSIGN(row_range_index, RowRangeIndex::Create(std::vector({Range(0, 0)}))); + file_store_scan->WithRowRangeIndex(row_range_index); + ASSERT_TRUE(file_store_scan->FilterManifestByRowRanges(manifest2)); +} } // namespace paimon::test diff --git a/src/paimon/core/operation/key_value_file_store_scan_test.cpp b/src/paimon/core/operation/key_value_file_store_scan_test.cpp index 569558ee6..35bbfe078 100644 --- a/src/paimon/core/operation/key_value_file_store_scan_test.cpp +++ b/src/paimon/core/operation/key_value_file_store_scan_test.cpp @@ -138,10 +138,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { std::string table_path = paimon::test::GetDataDir() + "orc/pk_table_with_dv_cardinality.db/pk_table_with_dv_cardinality"; std::vector> partition_filters = {{{"f1", "10"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -182,10 +181,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { "orc/pk_table_with_dv_cardinality.db/" "pk_table_with_dv_cardinality"; std::vector> partition_filters = {{{"f1", "10"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - 
/*bucket_filter=*/1, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/1); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/4)); @@ -200,10 +198,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { paimon::test::GetDataDir() + "orc/pk_table_with_mor.db/pk_table_with_mor"; std::vector> partition_filters = { {{"p0", "1"}, {"p1", "0"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/1)); @@ -218,10 +215,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { paimon::test::GetDataDir() + "orc/pk_table_with_mor.db/pk_table_with_mor"; std::vector> partition_filters = { {{"p0", "0"}, {"p1", "0"}}}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -235,10 +231,9 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { std::string table_path = paimon::test::GetDataDir() + "orc/pk_table_partial_update.db/pk_table_partial_update"; std::vector> partition_filters = {}; - auto scan_filter = - std::make_shared(/*predicate=*/nullptr, - /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + auto scan_filter = 
std::make_shared(/*predicate=*/nullptr, + /*partition_filters=*/partition_filters, + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -255,7 +250,7 @@ TEST_F(KeyValueFileStoreScanTest, TestScanDurationMetric) { std::vector> partition_filters = {{{"f1", "10"}}}; auto scan_filter = std::make_shared(/*predicate=*/nullptr, /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); @@ -307,7 +302,7 @@ TEST_F(KeyValueFileStoreScanTest, TestSplitAndSetKeyValueFilter) { PredicateBuilder::And({not_equal, equal, greater_than, less_than})); auto scan_filter = std::make_shared(/*predicate=*/predicate, /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, /*vector_search=*/nullptr); + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/1)); @@ -377,8 +372,7 @@ TEST_F(KeyValueFileStoreScanTest, TestFilterByValueFilterWithValueStatsCols) { FieldType::DOUBLE, Literal(30.1)); auto scan_filter = std::make_shared(/*predicate=*/greater_than, /*partition_filters=*/partition_filters, - /*bucket_filter=*/0, - /*vector_search=*/nullptr); + /*bucket_filter=*/0); ASSERT_OK_AND_ASSIGN(std::unique_ptr scan, CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/1)); diff --git a/src/paimon/core/operation/scan_context.cpp b/src/paimon/core/operation/scan_context.cpp index 7c6cd2cfb..ec60dfa4f 100644 --- a/src/paimon/core/operation/scan_context.cpp +++ b/src/paimon/core/operation/scan_context.cpp @@ -56,7 +56,6 @@ class ScanContextBuilder::Impl { bucket_filter_ = std::nullopt; partition_filters_.clear(); predicates_.reset(); - vector_search_.reset(); 
global_index_result_.reset(); memory_pool_ = GetDefaultPool(); executor_ = CreateDefaultExecutor(); @@ -71,7 +70,6 @@ class ScanContextBuilder::Impl { std::optional bucket_filter_; std::vector> partition_filters_; std::shared_ptr predicates_; - std::shared_ptr vector_search_; std::shared_ptr global_index_result_; std::shared_ptr memory_pool_ = GetDefaultPool(); std::shared_ptr executor_ = CreateDefaultExecutor(); @@ -110,12 +108,6 @@ ScanContextBuilder& ScanContextBuilder::SetPredicate(const std::shared_ptr& vector_search) { - impl_->vector_search_ = vector_search; - return *this; -} - ScanContextBuilder& ScanContextBuilder::SetGlobalIndexResult( const std::shared_ptr& global_index_result) { impl_->global_index_result_ = global_index_result; @@ -159,7 +151,7 @@ Result> ScanContextBuilder::Finish() { auto ctx = std::make_unique( impl_->path_, impl_->is_streaming_mode_, impl_->limit_, std::make_shared(impl_->predicates_, impl_->partition_filters_, - impl_->bucket_filter_, impl_->vector_search_), + impl_->bucket_filter_), impl_->global_index_result_, impl_->memory_pool_, impl_->executor_, impl_->specific_file_system_, impl_->options_); impl_->Reset(); diff --git a/src/paimon/core/operation/scan_context_test.cpp b/src/paimon/core/operation/scan_context_test.cpp index 89b2e2ac5..ee0f77153 100644 --- a/src/paimon/core/operation/scan_context_test.cpp +++ b/src/paimon/core/operation/scan_context_test.cpp @@ -36,7 +36,6 @@ TEST(ScanContextTest, TestSimple) { ASSERT_TRUE(ctx->GetScanFilters()); ASSERT_FALSE(ctx->GetScanFilters()->GetBucketFilter()); ASSERT_FALSE(ctx->GetScanFilters()->GetPredicate()); - ASSERT_FALSE(ctx->GetScanFilters()->GetVectorSearch()); ASSERT_TRUE(ctx->GetScanFilters()->GetPartitionFilters().empty()); ASSERT_FALSE(ctx->GetGlobalIndexResult()); ASSERT_FALSE(ctx->GetSpecificFileSystem()); @@ -50,11 +49,6 @@ TEST(ScanContextTest, TestSetFilter) { auto predicate = PredicateBuilder::IsNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); 
builder.SetPredicate(predicate); - std::vector query = {1.0, 2.0}; - VectorSearch::PreFilter pre_filter = [](int64_t id) -> bool { return id % 2; }; - builder.SetVectorSearch(std::make_shared( - "f0", 10, query, pre_filter, nullptr, VectorSearch::DistanceType::INNER_PRODUCT, - std::map())); std::vector row_ranges = {Range(1, 2), Range(4, 5)}; auto global_index_result = BitmapGlobalIndexResult::FromRanges(row_ranges); builder.SetGlobalIndexResult(global_index_result); @@ -70,11 +64,6 @@ TEST(ScanContextTest, TestSetFilter) { ASSERT_TRUE(ctx->GetScanFilters()); ASSERT_EQ(10, ctx->GetScanFilters()->GetBucketFilter()); ASSERT_EQ(*predicate, *(ctx->GetScanFilters()->GetPredicate())); - auto result_vector_search = ctx->GetScanFilters()->GetVectorSearch(); - ASSERT_TRUE(result_vector_search); - ASSERT_EQ(query, result_vector_search->query); - ASSERT_EQ(VectorSearch::DistanceType::INNER_PRODUCT, - result_vector_search->distance_type.value()); ASSERT_EQ(partition_filters, ctx->GetScanFilters()->GetPartitionFilters()); ASSERT_EQ("{1,2,4,5}", ctx->GetGlobalIndexResult()->ToString()); std::map expected_options = {{"key", "value"}}; diff --git a/src/paimon/core/table/source/data_evolution_batch_scan.cpp b/src/paimon/core/table/source/data_evolution_batch_scan.cpp index 60e6b2084..94310b2e5 100644 --- a/src/paimon/core/table/source/data_evolution_batch_scan.cpp +++ b/src/paimon/core/table/source/data_evolution_batch_scan.cpp @@ -26,15 +26,13 @@ namespace paimon { DataEvolutionBatchScan::DataEvolutionBatchScan( const std::string& table_path, const std::shared_ptr& snapshot_reader, std::unique_ptr&& batch_scan, - const std::shared_ptr& global_index_result, - const std::shared_ptr& vector_search, const CoreOptions& core_options, + const std::shared_ptr& global_index_result, const CoreOptions& core_options, const std::shared_ptr& pool, const std::shared_ptr& executor) : AbstractTableScan(core_options, snapshot_reader), pool_(pool), table_path_(table_path), 
batch_scan_(std::move(batch_scan)), global_index_result_(global_index_result), - vector_search_(vector_search), executor_(executor) {} Result> DataEvolutionBatchScan::CreatePlan() { @@ -52,7 +50,12 @@ Result> DataEvolutionBatchScan::CreatePlan() { if (!row_ranges) { return batch_scan_->CreatePlan(); } - batch_scan_->WithRowRanges(row_ranges.value()); + if (row_ranges.value().empty()) { + return PlanImpl::EmptyPlan(); + } + PAIMON_ASSIGN_OR_RAISE(RowRangeIndex row_range_index, + RowRangeIndex::Create(row_ranges.value())); + batch_scan_->WithRowRangeIndex(row_range_index); PAIMON_ASSIGN_OR_RAISE(std::shared_ptr data_plan, batch_scan_->CreatePlan()); std::map id_to_score; if (auto scored_result = @@ -64,14 +67,13 @@ Result> DataEvolutionBatchScan::CreatePlan() { id_to_score[id] = score; } } - return WrapToIndexedSplits(data_plan, row_ranges.value(), id_to_score); + return WrapToIndexedSplits(data_plan, row_range_index, id_to_score); } Result> DataEvolutionBatchScan::WrapToIndexedSplits( - const std::shared_ptr& data_plan, const std::vector& row_ranges, + const std::shared_ptr& data_plan, const RowRangeIndex& row_range_index, const std::map& id_to_score) const { - std::vector sorted_row_ranges = - Range::SortAndMergeOverlap(row_ranges, /*adjacent=*/true); + // TODO(lisizhuo.lsz): add executor here auto data_splits = data_plan->Splits(); std::vector> indexed_splits; indexed_splits.reserve(data_splits.size()); @@ -80,14 +82,22 @@ Result> DataEvolutionBatchScan::WrapToIndexedSplits( if (!data_split) { return Status::Invalid("Cannot cast split to DataSplit when create IndexedSplit"); } - std::vector file_ranges; - file_ranges.reserve(data_split->DataFiles().size()); - for (const auto& meta : data_split->DataFiles()) { - PAIMON_ASSIGN_OR_RAISE(int64_t first_row_id, meta->NonNullFirstRowId()); - file_ranges.emplace_back(first_row_id, first_row_id + meta->row_count - 1); + const auto& files = data_split->DataFiles(); + if (files.empty()) { + return Status::Invalid("Empty data 
files in WrapToIndexedSplits"); + } + PAIMON_ASSIGN_OR_RAISE(int64_t min, files[0]->NonNullFirstRowId()); + PAIMON_ASSIGN_OR_RAISE(int64_t max, files[files.size() - 1]->NonNullFirstRowId()); + max += files[files.size() - 1]->row_count - 1; + + std::vector expected = row_range_index.IntersectedRanges(min, max); + if (expected.empty()) { + return Status::Invalid( + fmt::format("There should be intersected ranges for split with min row id {} and " + "max row id {}.", + min, max)); } - auto sorted_file_ranges = Range::SortAndMergeOverlap(file_ranges, /*adjacent=*/true); - std::vector expected = Range::And(sorted_file_ranges, sorted_row_ranges); + std::vector scores; if (!id_to_score.empty()) { for (const auto& range : expected) { @@ -108,7 +118,7 @@ Result> DataEvolutionBatchScan::WrapToIndexedSplits( Result> DataEvolutionBatchScan::EvalGlobalIndex() const { auto predicate = batch_scan_->GetNonPartitionPredicate(); - if (!predicate && !vector_search_) { + if (!predicate) { return std::shared_ptr(nullptr); } if (!core_options_.GlobalIndexEnabled()) { @@ -119,36 +129,14 @@ Result> DataEvolutionBatchScan::EvalGlobalInd PAIMON_ASSIGN_OR_RAISE( std::unique_ptr index_scan, GlobalIndexScan::Create(table_path_, core_options_.GetScanSnapshotId(), partition_filter, - core_options_.ToMap(), core_options_.GetFileSystem(), pool_)); + core_options_.ToMap(), core_options_.GetFileSystem(), executor_, + pool_)); auto index_scan_impl = dynamic_cast(index_scan.get()); if (!index_scan_impl) { return Status::Invalid("invalid GlobalIndexScan, cannot cast to GlobalIndexScanImpl"); } - PAIMON_ASSIGN_OR_RAISE(std::vector indexed_row_ranges, index_scan->GetRowRangeList()); - if (indexed_row_ranges.empty()) { - return std::shared_ptr(nullptr); - } - const auto& snapshot = index_scan_impl->GetSnapshot(); - const std::optional& next_row_id = snapshot.NextRowId(); - if (!next_row_id) { - return Status::Invalid("invalid snapshot, next row id is null"); - } - std::vector non_indexed_row_ranges = - 
Range(0, next_row_id.value() - 1).Exclude(indexed_row_ranges); - PAIMON_ASSIGN_OR_RAISE( - std::shared_ptr index_result, - index_scan_impl->ParallelScan(indexed_row_ranges, predicate, vector_search_, executor_)); - if (!index_result) { - return std::shared_ptr(nullptr); - } - if (!non_indexed_row_ranges.empty()) { - for (const auto& range : non_indexed_row_ranges) { - PAIMON_ASSIGN_OR_RAISE(index_result, - index_result->Or(BitmapGlobalIndexResult::FromRanges({range}))); - } - } - return index_result; + return index_scan_impl->Scan(predicate); } } // namespace paimon diff --git a/src/paimon/core/table/source/data_evolution_batch_scan.h b/src/paimon/core/table/source/data_evolution_batch_scan.h index fad108ea7..88f20f762 100644 --- a/src/paimon/core/table/source/data_evolution_batch_scan.h +++ b/src/paimon/core/table/source/data_evolution_batch_scan.h @@ -32,7 +32,6 @@ class DataEvolutionBatchScan : public AbstractTableScan { const std::shared_ptr& snapshot_reader, std::unique_ptr&& batch_scan, const std::shared_ptr& global_index_result, - const std::shared_ptr& vector_search, const CoreOptions& core_options, const std::shared_ptr& pool, const std::shared_ptr& executor); @@ -40,7 +39,7 @@ class DataEvolutionBatchScan : public AbstractTableScan { private: Result> WrapToIndexedSplits( - const std::shared_ptr& data_plan, const std::vector& row_ranges, + const std::shared_ptr& data_plan, const RowRangeIndex& row_range_index, const std::map& id_to_score) const; Result> EvalGlobalIndex() const; @@ -49,7 +48,6 @@ class DataEvolutionBatchScan : public AbstractTableScan { std::string table_path_; std::unique_ptr batch_scan_; std::shared_ptr global_index_result_; - std::shared_ptr vector_search_; std::shared_ptr executor_; }; diff --git a/src/paimon/core/table/source/data_table_batch_scan.h b/src/paimon/core/table/source/data_table_batch_scan.h index 405b784d6..d5a1d44e6 100644 --- a/src/paimon/core/table/source/data_table_batch_scan.h +++ 
b/src/paimon/core/table/source/data_table_batch_scan.h @@ -44,8 +44,8 @@ class DataTableBatchScan : public AbstractTableScan { return snapshot_reader_->GetPartitionPredicate(); } - DataTableBatchScan* WithRowRanges(const std::vector& row_ranges) { - snapshot_reader_->WithRowRanges(row_ranges); + DataTableBatchScan* WithRowRangeIndex(const RowRangeIndex& row_range_index) { + snapshot_reader_->WithRowRangeIndex(row_range_index); return this; } diff --git a/src/paimon/core/table/source/snapshot/snapshot_reader.h b/src/paimon/core/table/source/snapshot/snapshot_reader.h index b590cd077..b00a96404 100644 --- a/src/paimon/core/table/source/snapshot/snapshot_reader.h +++ b/src/paimon/core/table/source/snapshot/snapshot_reader.h @@ -80,8 +80,8 @@ class SnapshotReader { return this; } - SnapshotReader* WithRowRanges(const std::vector& row_ranges) { - scan_->WithRowRanges(row_ranges); + SnapshotReader* WithRowRangeIndex(const RowRangeIndex& row_range_index) { + scan_->WithRowRangeIndex(row_range_index); return this; } diff --git a/src/paimon/core/table/source/table_scan.cpp b/src/paimon/core/table/source/table_scan.cpp index 43ac11b4f..e28b01c8b 100644 --- a/src/paimon/core/table/source/table_scan.cpp +++ b/src/paimon/core/table/source/table_scan.cpp @@ -242,8 +242,7 @@ Result> TableScan::Create(std::unique_ptr( context->GetPath(), snapshot_reader, std::move(batch_scan), context->GetGlobalIndexResult(), - context->GetScanFilters()->GetVectorSearch(), core_options, context->GetMemoryPool(), - context->GetExecutor()); + core_options, context->GetMemoryPool(), context->GetExecutor()); } } // namespace paimon diff --git a/test/inte/global_index_test.cpp b/test/inte/global_index_test.cpp index e0ea4f896..ba43f4bbd 100644 --- a/test/inte/global_index_test.cpp +++ b/test/inte/global_index_test.cpp @@ -13,20 +13,23 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - #include "arrow/type.h" #include "gtest/gtest.h" #include "paimon/common/factories/io_hook.h" #include "paimon/common/global_index/bitmap/bitmap_global_index_factory.h" +#include "paimon/common/global_index/union_global_index_reader.h" #include "paimon/common/table/special_fields.h" #include "paimon/common/utils/scope_guard.h" +#include "paimon/core/global_index/global_index_scan_impl.h" #include "paimon/core/global_index/indexed_split_impl.h" -#include "paimon/core/global_index/row_range_global_index_scanner_impl.h" #include "paimon/core/table/source/data_split_impl.h" #include "paimon/defs.h" +#include "paimon/executor.h" #include "paimon/fs/file_system.h" #include "paimon/global_index/bitmap_global_index_result.h" #include "paimon/global_index/bitmap_scored_global_index_result.h" +#include "paimon/global_index/global_index_reader.h" +#include "paimon/global_index/global_index_result.h" #include "paimon/global_index/global_index_scan.h" #include "paimon/global_index/global_index_write_task.h" #include "paimon/predicate/literal.h" @@ -151,12 +154,10 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter Result> ScanGlobalIndexAndData( const std::string& table_path, const std::shared_ptr& predicate, - const std::shared_ptr& vector_search = nullptr, const std::map& options = {}, const std::shared_ptr& index_result = nullptr) const { ScanContextBuilder scan_context_builder(table_path); scan_context_builder.SetPredicate(predicate) - .SetVectorSearch(vector_search) .SetOptions(options) .SetGlobalIndexResult(index_result) .WithFileSystem(fs_); @@ -184,7 +185,7 @@ class GlobalIndexTest : public ::testing::Test, public ::testing::WithParamInter index_result = std::make_shared(std::move(bitmap), std::move(scores)); } - return ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, /*vector_search=*/nullptr, + return ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, /*options=*/{}, index_result); } @@ -431,27 +432,23 @@ 
TEST_P(GlobalIndexTest, TestScanIndex) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // test index reader // test f0 field - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); // test f0, f1, f2 fields - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); { // test with non predicate ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(/*predicate=*/nullptr, /*vector_search=*/nullptr)); + global_index_scan_impl->Scan(/*predicate=*/nullptr)); ASSERT_FALSE(index_result); } { @@ -459,8 +456,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - 
ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,7}"); } { @@ -468,40 +464,35 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{1,2,3,4,5,6}"); } { // test equal predicate for f1 auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{4,6,7}"); } { // test equal predicate for f2 auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,1,4,5}"); } { // test is null predicate auto predicate = PredicateBuilder::IsNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{7}"); } { // test is not null predicate auto predicate = PredicateBuilder::IsNotNull(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT); - ASSERT_OK_AND_ASSIGN(auto index_result, - 
evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,1,2,3,4,5,6}"); } { @@ -510,8 +501,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, {Literal(FieldType::STRING, "Alice", 5), Literal(FieldType::STRING, "Bob", 3), Literal(FieldType::STRING, "Lucy", 4)}); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,1,4,5,7}"); } { @@ -520,8 +510,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { /*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, {Literal(FieldType::STRING, "Alice", 5), Literal(FieldType::STRING, "Bob", 3), Literal(FieldType::STRING, "Lucy", 4)}); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{2,3,6}"); } { @@ -532,8 +521,7 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{7}"); } { @@ -544,16 +532,14 @@ TEST_P(GlobalIndexTest, TestScanIndex) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - 
evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,4,6,7}"); } { // test non-result auto predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(30)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{}"); } { @@ -568,48 +554,42 @@ TEST_P(GlobalIndexTest, TestScanIndex) { ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f1_predicate, f2_predicate, f0_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{}"); } { // test greater than predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::GreaterThan(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test greater or equal predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::GreaterOrEqual(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test less than predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::LessThan(/*field_index=*/1, /*field_name=*/"f1", 
FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test less or equal predicate which bitmap index is not support, will return all range auto predicate = PredicateBuilder::LessOrEqual(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(10)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test a predicate for field with no index auto f3_predicate = PredicateBuilder::Equal(/*field_index=*/3, /*field_name=*/"f3", FieldType::DOUBLE, Literal(1.2)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(f3_predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(f3_predicate)); ASSERT_FALSE(index_result); } } @@ -622,27 +602,23 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; // snapshot 2 has f0 index - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/2l, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/2l, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // 
test index reader // test f0 field - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); // test f1 field - ASSERT_OK_AND_ASSIGN(auto index_reader2, range_scanner->CreateReader("f1", "bitmap")); - ASSERT_FALSE(index_reader2); + ASSERT_OK_AND_ASSIGN(auto index_readers2, global_index_scan->CreateReaders("f1", std::nullopt)); + ASSERT_EQ(index_readers2.size(), 0u); + + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); { // test and predicate auto f0_predicate = @@ -651,8 +627,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(index_result->ToString(), "{0,7}"); } { @@ -663,8 +638,7 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshot) { auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", FieldType::INT, Literal(20)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({f0_predicate, f1_predicate})); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); 
ASSERT_FALSE(index_result); } } @@ -677,27 +651,20 @@ TEST_P(GlobalIndexTest, TestScanIndexWithSpecificSnapshotWithNoIndex) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; // snapshot 1 has no index - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/1l, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_TRUE(ranges.empty()); - - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/1l, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_FALSE(index_reader); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 0u); + + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } @@ -708,57 +675,32 @@ TEST_P(GlobalIndexTest, TestScanIndexWithRange) { std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; - ASSERT_OK_AND_ASSIGN( 
- auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); { - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 3))); - auto scanner_impl = - std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); - { - // test non-exist index type - ASSERT_OK_AND_ASSIGN(auto non_exist_index_reader, - range_scanner->CreateReader("f0", "non-exist")); - ASSERT_FALSE(non_exist_index_reader); - } - - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto evaluator_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto evaluator_result, global_index_scan_impl->Scan(predicate)); ASSERT_EQ(evaluator_result->ToString(), "{1,2,3,4,5,6}"); } { - ASSERT_OK_AND_ASSIGN(auto range_scanner, 
global_index_scan->CreateRangeScan(Range(10, 13))); - auto scanner_impl = - std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - - // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_FALSE(index_reader); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); - auto predicate = - PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); - ASSERT_FALSE(index_result); + // invalid range + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(10, 13)})); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(index_readers.size(), 0u); } } @@ -773,50 +715,41 @@ TEST_P(GlobalIndexTest, TestScanIndexWithPartition) { "/append_with_global_index_with_partition.db/append_with_global_index_with_partition"; auto check_result = [&](const std::optional>>& partitions) { - ASSERT_OK_AND_ASSIGN(auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - partitions, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 4)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, - global_index_scan->CreateRangeScan(Range(0, 4))); - auto scanner_impl = - std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, + /*options=*/{}, fs_, /*executor=*/nullptr, pool_)); // test index reader - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_OK_AND_ASSIGN(auto index_result, - 
index_reader->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, + RowRangeIndex::Create({Range(0, 4)})); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(index_readers.size(), 1u); + ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitEqual( + Literal(FieldType::STRING, "Bob", 3))); ASSERT_EQ(index_result->ToString(), "{1,4}"); - // test evaluator - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); + auto global_index_scan_impl = + std::dynamic_pointer_cast(global_index_scan); + { // null result as f2 does not have index auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } { // test not equal predicate for Bob - auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", - FieldType::STRING, - Literal(FieldType::STRING, "Bob", 3)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitNotEqual( + Literal(FieldType::STRING, "Bob", 3))); ASSERT_EQ(index_result->ToString(), "{0,2,3}"); } { // test equal predicate for Alice - auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", - FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitEqual(Literal( + FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0}"); } }; @@ -838,23 +771,19 @@ TEST_P(GlobalIndexTest, TestScanUnregisteredIndex) { 
std::string table_path = paimon::test::GetDataDir() + "/" + file_format_ + "/append_with_global_index.db/append_with_global_index"; - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); - ASSERT_FALSE(index_reader); + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 0u); - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); + auto global_index_scan_impl = std::dynamic_pointer_cast(global_index_scan); auto predicate = PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Bob", 3)); - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); + ASSERT_OK_AND_ASSIGN(auto index_result, global_index_scan_impl->Scan(predicate)); ASSERT_FALSE(index_result); } @@ -882,18 +811,14 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndex) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", /*options=*/{}, Range(0, 7))); - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 7)})); - 
ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 7))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "bitmap")); + ASSERT_OK_AND_ASSIGN(auto global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); ASSERT_OK_AND_ASSIGN(auto index_result, - index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); } @@ -955,44 +880,32 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { write_data_and_index(src_array2, {{"f2", "20"}}, Range(4, 8)); auto scan_and_check_result = [&](const std::map& partition, - const Range& expected_range, VectorSearch::PreFilter filter, - int32_t limit, const std::string& bitmap_result, + const std::optional& row_range_index, + VectorSearch::PreFilter filter, int32_t limit, + const std::string& bitmap_result, const std::string& lumina_result, - const std::vector& read_row_ranges, const std::shared_ptr& expected_array, const std::map& id_to_score) { std::vector> partitions = {partition}; - ASSERT_OK_AND_ASSIGN(auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - partitions, lumina_options, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({expected_range})); - - ASSERT_OK_AND_ASSIGN(auto range_scanner, - global_index_scan->CreateRangeScan(expected_range)); - auto scanner_impl = - std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - + ASSERT_OK_AND_ASSIGN( + 
std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, + lumina_options, fs_, /*executor=*/nullptr, pool_)); // check bitmap index - ASSERT_OK_AND_ASSIGN(auto evaluator, scanner_impl->CreateIndexEvaluator()); - - auto predicate1 = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto predicate2 = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Paul", 4)); - ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::Or({predicate1, predicate2})); - - ASSERT_OK_AND_ASSIGN(auto index_result, - evaluator->Evaluate(predicate, /*vector_search=*/nullptr)); - ASSERT_TRUE(index_result); + ASSERT_OK_AND_ASSIGN(auto readers, global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(readers.size(), 1u); + ASSERT_OK_AND_ASSIGN(auto result1, + readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_OK_AND_ASSIGN(auto result2, + readers[0]->VisitEqual(Literal(FieldType::STRING, "Paul", 4))); + ASSERT_OK_AND_ASSIGN(auto index_result, result1->Or(result2)); ASSERT_EQ(index_result->ToString(), bitmap_result); // check lumina index - ASSERT_OK_AND_ASSIGN(auto lumina_reader, range_scanner->CreateReader("f1", "lumina")); - + ASSERT_OK_AND_ASSIGN(auto lumina_readers, + global_index_scan->CreateReaders("f1", row_range_index)); + ASSERT_EQ(lumina_readers.size(), 1u); + auto lumina_reader = lumina_readers[0]; std::vector query = {1.0f, 1.0f, 1.0f, 1.1f}; auto vector_search = std::make_shared( "f1", limit, query, filter, @@ -1000,21 +913,11 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { ASSERT_OK_AND_ASSIGN(auto scored_result, lumina_reader->VisitVectorSearch(vector_search)); ASSERT_EQ(scored_result->ToString(), lumina_result); - // check evaluate predicate and vector search - auto vector_search_without_filter = 
vector_search->ReplacePreFilter(nullptr); - ASSERT_OK_AND_ASSIGN(auto compound_index_result, - evaluator->Evaluate(predicate, vector_search_without_filter)); - ASSERT_TRUE(compound_index_result); - ASSERT_EQ(compound_index_result->ToString(), lumina_result); - // check read array std::vector read_field_names = schema->field_names(); read_field_names.push_back("_INDEX_SCORE"); - ASSERT_OK_AND_ASSIGN(auto result_with_offset, - compound_index_result->AddOffset(expected_range.from)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, - /*vector_search=*/nullptr, - /*options=*/{}, result_with_offset)); + /*options=*/{}, scored_result)); ASSERT_OK(ReadData(table_path, read_field_names, expected_array, /*predicate=*/nullptr, plan)); }; @@ -1034,9 +937,9 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { [0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] ])") .ValueOrDie(); - scan_and_check_result({{"f2", "10"}}, Range(0, 3), filter, /*limit=*/2, "{0}", - "row ids: {0}, scores: {4.21}", {Range(0, 0)}, expected_array, - id_to_score1); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(0, 3)})); + scan_and_check_result({{"f2", "10"}}, row_range_index, filter, /*limit=*/2, "{0}", + "row ids: {0}, scores: {4.21}", expected_array, id_to_score1); } { // test scan and read for f2=20 @@ -1046,18 +949,9 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { [0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1, 322.21] ])") .ValueOrDie(); - scan_and_check_result({{"f2", "20"}}, Range(4, 8), filter, /*limit=*/1, "{3,4}", - "row ids: {4}, scores: {322.21}", {Range(4, 4)}, expected_array, - id_to_score2); - } - { - // test invalid range input - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, lumina_options, fs_, pool_)); - ASSERT_NOK_WITH_MSG(global_index_scan->CreateRangeScan(Range(0, 
8)), - "input range contain multiple partitions, fail to create range scan"); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(4, 8)})); + scan_and_check_result({{"f2", "20"}}, row_range_index, filter, /*limit=*/1, "{7,8}", + "row ids: {8}, scores: {322.21}", expected_array, id_to_score2); } { // test invalid partition input @@ -1065,7 +959,7 @@ TEST_P(GlobalIndexTest, TestWriteCommitScanReadIndexWithPartition) { GlobalIndexScan::Create( table_path, /*snapshot_id=*/std::nullopt, /*partitions=*/std::vector>(), lumina_options, - fs_, pool_), + fs_, /*executor=*/nullptr, pool_), "invalid input partition, supposed to be null or at least one partition"); } } @@ -1287,215 +1181,12 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScan) { PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN( - auto plan, ScanGlobalIndexAndData(table_path, predicate, /*vector_search=*/nullptr, - {{"global-index.enabled", "false"}})); + auto plan, + ScanGlobalIndexAndData(table_path, predicate, {{"global-index.enabled", "false"}})); ASSERT_OK(ReadData(table_path, write_cols, expected_all_array, predicate, plan)); } } -#ifdef PAIMON_ENABLE_LUMINA -TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithVectorSearch) { - arrow::FieldVector fields = { - arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), - arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; - std::map lumina_write_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}}; - std::map lumina_read_options = { - {"lumina.search.parallel_number", "10"}}; - - auto schema = arrow::schema(fields); - std::map options = {{Options::MANIFEST_FORMAT, "orc"}, - {Options::FILE_FORMAT, file_format_}, - {Options::FILE_SYSTEM, "local"}, - {Options::ROW_TRACKING_ENABLED, 
"true"}, - {Options::DATA_EVOLUTION_ENABLED, "true"}}; - CreateTable(/*partition_keys=*/{}, schema, options); - - std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - std::vector write_cols = schema->field_names(); - - auto src_array = std::dynamic_pointer_cast( - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ -["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], -["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], -["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], -["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1], -["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], -["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], -["Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1], -["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], -["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] - ])") - .ValueOrDie()); - ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); - ASSERT_OK(Commit(table_path, commit_msgs)); - - auto result_fields = fields; - result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); - { - // read when no index is built - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], -[0, "Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], -[0, "Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1], -[0, "Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], -[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], -[0, "Tony", [11.0, 10.0, 11.0, 10.0], 20, 
17.1], -[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], -[0, "Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); - } - - // write and commit bitmap global index - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", /*options=*/{}, - Range(0, 8))); - - auto read_cols = write_cols; - read_cols.push_back("_INDEX_SCORE"); - result_fields.insert(result_fields.end(), SpecialFields::IndexScore().ArrowField()); - { - // read when only bitmap index is built - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, null], -[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1, null] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); - } - - // write and commit lumina global index - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", - /*options=*/lumina_write_options, Range(0, 8))); - - // scan and read with global index - { - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, 
ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); - } - { - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/3, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21], -[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1, 398.01] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); - } - { - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Bob", 3)); - auto vector_search = std::make_shared( - "f1", /*limit=*/3, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, - lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1, 2.01], -[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1, 360.01] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); - } - { - // test only has vector search 
with pre_filter - auto vector_search = std::make_shared( - "f1", /*limit=*/3, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), - /*filter=*/[](int64_t row_id) { return row_id == 1 || row_id == 5; }, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, - vector_search, lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1, 2.01], -[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1, 360.01] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, /*predicate=*/nullptr, plan)); - } - { - // test only has vector search with no pre_filter - auto vector_search = std::make_shared( - "f1", /*limit=*/2, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), - /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, - vector_search, lumina_read_options)); - - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1, 2.01], -[0, "Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1, 0.01] - ])") - .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, /*predicate=*/nullptr, plan)); - } - { - // test invalid vector search - auto predicate = - PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, - Literal(FieldType::STRING, "Bob", 3)); - auto vector_search = std::make_shared( - "f1", /*limit=*/3, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), - /*filter=*/[](int64_t row_id) { return true; }, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_read_options); - ASSERT_NOK_WITH_MSG( - ScanGlobalIndexAndData(table_path, predicate, vector_search, 
lumina_read_options), - "Predicate result and pre_filter in VectorSearch conflict"); - } -} -#endif - TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithOnlyOnePartitionHasIndex) { CreateTable(/*partition_keys=*/{"f1"}); std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); @@ -1531,17 +1222,13 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithOnlyOnePartitionHasIndex) auto result_fields = fields_; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); { - // only f1 = 10 partition has index, f1 = 20 partition will not be filtered auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Lucy", 20, 1, 15.1], -[0, "Tony", 20, 0, 17.1], -[0, "Alice", 20, null, 18.1] +[0, "Alice", 10, 1, 11.1] ])") .ValueOrDie(); @@ -1588,34 +1275,24 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoIndexInDiffTwoPartition auto result_fields = fields_; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); { - // only f1 = 10 partition has f0 index, f1 = 20 partition will not be filtered auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Lucy", 20, 1, 15.1], -[0, "Tony", 20, 0, 17.1], -[0, "Alice", 20, null, 18.1] +[0, "Alice", 10, 1, 11.1] ])") .ValueOrDie(); ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); } { - // only f1 = 20 partition has f2 index, f1 = 10 
partition will not be filtered auto predicate = PredicateBuilder::Equal(/*field_index=*/2, /*field_name=*/"f2", FieldType::INT, Literal(1)); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Bob", 10, 1, 12.1], -[0, "Emily", 10, 0, 13.1], -[0, "Tony", 10, 0, 14.1], -[0, "Bob", 10, 1, 16.1], [0, "Lucy", 20, 1, 15.1] ])") .ValueOrDie(); @@ -1623,6 +1300,8 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoIndexInDiffTwoPartition ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); } { + // only f1 = 10 partition has f0 index, query predicate1 results in ["Alice", 10, 1, 11.1] + // only f1 = 20 partition has f2 index, query predicate2 results in ["Lucy", 20, 1, 15.1] auto predicate1 = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); @@ -1630,14 +1309,7 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoIndexInDiffTwoPartition FieldType::INT, Literal(1)); ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({predicate1, predicate2})); ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate)); - auto expected_array = - arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", 10, 1, 11.1], -[0, "Lucy", 20, 1, 15.1] - ])") - .ValueOrDie(); - - ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); + ASSERT_OK(ReadData(table_path, write_cols, /*expected_array=*/nullptr, predicate, plan)); } { // predicate2 is partition filter @@ -1764,68 +1436,6 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithTwoPartitionAllWithIndex) } } -#ifdef PAIMON_ENABLE_LUMINA -TEST_P(GlobalIndexTest, TestInvalidGetRowRangeListWithIndexRangeMismatchViaDifferentType) { - arrow::FieldVector fields = { - arrow::field("f0",
arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), - arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; - std::map lumina_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}, - {"lumina.search.parallel_number", "10"}}; - auto schema = arrow::schema(fields); - std::map options = {{Options::MANIFEST_FORMAT, "orc"}, - {Options::FILE_FORMAT, file_format_}, - {Options::FILE_SYSTEM, "local"}, - {Options::ROW_TRACKING_ENABLED, "true"}, - {Options::DATA_EVOLUTION_ENABLED, "true"}}; - CreateTable(/*partition_keys=*/{"f2"}, schema, options); - - std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); - std::vector write_cols = schema->field_names(); - // write partition f2 = 10 - auto src_array1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ -["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], -["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], -["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], -["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1] - ])") - .ValueOrDie(); - ASSERT_OK_AND_ASSIGN(auto commit_msgs1, - WriteArray(table_path, {{"f2", "10"}}, write_cols, src_array1)); - ASSERT_OK(Commit(table_path, commit_msgs1)); - - // write partition f2 = 20 - auto src_array2 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ -["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], -["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], -["Tony", [11.0, 10.0, 11.0, 10.0], 20, 17.1], -["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], -["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1] - ])") - .ValueOrDie(); - ASSERT_OK_AND_ASSIGN(auto commit_msgs2, - WriteArray(table_path, {{"f2", "20"}}, write_cols, src_array2)); - ASSERT_OK(Commit(table_path, commit_msgs2)); - - // write and commit bitmap global index for f2 = 10 - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f2", "10"}}}, "f0", "bitmap", - /*options=*/{}, Range(0, 3))); - - 
// write and commit lumina global index for f2 = 20 - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f2", "20"}}}, "f1", "lumina", - /*options=*/lumina_options, Range(4, 8))); - - ASSERT_OK_AND_ASSIGN(auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, - /*options=*/lumina_options, fs_, pool_)); - ASSERT_NOK_WITH_MSG(global_index_scan->GetRowRangeList(), - "Inconsistent row ranges among index types"); -} -#endif - TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithPartitionWithTwoFields) { CreateTable(/*partition_keys=*/{"f1", "f2"}); std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); @@ -1973,22 +1583,20 @@ TEST_P(GlobalIndexTest, TestScanIndexWithTwoIndexes) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", /*options=*/lumina_options, Range(0, 8))); - ASSERT_OK_AND_ASSIGN(auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, - /*options=*/lumina_options, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 8)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 8))); + ASSERT_OK_AND_ASSIGN( + auto global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, + /*options=*/lumina_options, fs_, /*executor=*/nullptr, pool_)); // query f0 - ASSERT_OK_AND_ASSIGN(auto index_readers, range_scanner->CreateReaders("f0")); + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); ASSERT_EQ(index_readers.size(), 1); ASSERT_OK_AND_ASSIGN(auto index_result, index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); ASSERT_EQ(index_result->ToString(), "{0,7}"); // query f1 - ASSERT_OK_AND_ASSIGN(index_readers, range_scanner->CreateReaders("f1")); + 
ASSERT_OK_AND_ASSIGN(index_readers, global_index_scan->CreateReaders("f1", std::nullopt)); ASSERT_EQ(index_readers.size(), 1); std::vector query = {11.0f, 11.0f, 11.0f, 11.0f}; ASSERT_OK_AND_ASSIGN( @@ -1999,19 +1607,15 @@ TEST_P(GlobalIndexTest, TestScanIndexWithTwoIndexes) { ASSERT_EQ(scored_result->ToString(), "row ids: {7}, scores: {0.00}"); // query f2 - ASSERT_OK_AND_ASSIGN(index_readers, range_scanner->CreateReaders("f2")); + ASSERT_OK_AND_ASSIGN(index_readers, global_index_scan->CreateReaders("f2", std::nullopt)); ASSERT_EQ(index_readers.size(), 0); } +#endif TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithExternalPath) { arrow::FieldVector fields = { arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; - std::map lumina_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}, - {"lumina.search.parallel_number", "10"}}; auto schema = arrow::schema(fields); std::map options = {{Options::MANIFEST_FORMAT, "orc"}, {Options::FILE_FORMAT, file_format_}, @@ -2041,40 +1645,28 @@ TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithExternalPath) { // write and commit bitmap global index auto external_dir1 = UniqueTestDirectory::Create("local"); - ASSERT_OK( - WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", - /*options=*/{{"global-index.external-path", "FILE://" + external_dir1->Str()}}, - Range(0, 8))); - - auto external_dir2 = UniqueTestDirectory::Create("local"); - auto lumina_options_with_external_path = lumina_options; - lumina_options_with_external_path["global-index.external-path"] = - "FILE://" + external_dir2->Str(); - ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", - /*options=*/lumina_options_with_external_path, Range(0, 8))); + std::map index_options = { + {"global-index.external-path", "FILE://" + 
external_dir1->Str()}}; + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", index_options, + Range(0, 8))); - auto read_cols = write_cols; - read_cols.push_back("_INDEX_SCORE"); auto result_fields = fields; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); - result_fields.insert(result_fields.end(), SpecialFields::IndexScore().ArrowField()); // test scan and read auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_options); - ASSERT_OK_AND_ASSIGN( - auto plan, ScanGlobalIndexAndData(table_path, predicate, vector_search, lumina_options)); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, predicate, index_options)); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1, 4.21] +[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +[0, "Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1] ])") .ValueOrDie(); - ASSERT_OK(ReadData(table_path, read_cols, expected_array, predicate, plan)); + + ASSERT_OK(ReadData(table_path, write_cols, expected_array, predicate, plan)); } TEST_P(GlobalIndexTest, TestIOException) { @@ -2100,11 +1692,6 @@ TEST_P(GlobalIndexTest, TestIOException) { {Options::FILE_SYSTEM, "local"}, {Options::ROW_TRACKING_ENABLED, "true"}, {Options::DATA_EVOLUTION_ENABLED, "true"}}; - std::map lumina_options = {{"lumina.index.dimension", "4"}, - {"lumina.index.type", "bruteforce"}, - {"lumina.distance.metric", "l2"}, - {"lumina.encoding.type", "rawf32"}, - {"lumina.search.parallel_number", "10"}}; std::string table_path; bool write_run_complete = false; auto io_hook = IOHook::GetInstance(); @@ -2123,17 +1710,12 @@ 
TEST_P(GlobalIndexTest, TestIOException) { WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", /*options=*/{}, Range(0, 3)); CHECK_HOOK_STATUS(bitmap_index_write_status, i); - // write lumina index - auto lumina_index_write_status = - WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", - /*options=*/lumina_options, Range(0, 3)); - CHECK_HOOK_STATUS_WITHOUT_MESSAGE_CHECK(lumina_index_write_status); write_run_complete = true; break; } ASSERT_TRUE(write_run_complete); - // read for bitmap and lumina + // read for bitmap bool read_run_complete = false; for (size_t i = 0; i < 2000; i += paimon::test::RandomNumber(20, 30)) { ScopeGuard guard([&io_hook]() { io_hook->Clear(); }); @@ -2141,34 +1723,27 @@ TEST_P(GlobalIndexTest, TestIOException) { auto result_fields = fields; result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); - result_fields.insert(result_fields.end(), SpecialFields::IndexScore().ArrowField()); - auto read_cols = write_cols; - read_cols.push_back("_INDEX_SCORE"); auto predicate = PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, Literal(FieldType::STRING, "Alice", 5)); - auto vector_search = std::make_shared( - "f1", /*limit=*/1, std::vector({1.0f, 1.0f, 1.0f, 1.1f}), /*filter=*/nullptr, - /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_options); auto expected_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ -[0, "Alice", [1.0, 0.0, 1.0, 0.0], 10, 13.1, 2.21] +[0, "Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +[0, "Alice", [1.0, 0.0, 1.0, 0.0], 10, 13.1] ])") .ValueOrDie(); - auto plan_result = - ScanGlobalIndexAndData(table_path, predicate, vector_search, lumina_options); + auto plan_result = ScanGlobalIndexAndData(table_path, predicate); CHECK_HOOK_STATUS_WITHOUT_MESSAGE_CHECK(plan_result.status()); auto plan = std::move(plan_result).value(); - auto read_status = ReadData(table_path, read_cols, 
expected_array, predicate, plan); + auto read_status = ReadData(table_path, write_cols, expected_array, predicate, plan); CHECK_HOOK_STATUS(read_status, i); read_run_complete = true; break; } ASSERT_TRUE(read_run_complete); } -#endif TEST_P(GlobalIndexTest, TestDataEvolutionBatchScanWithRangeBitmap) { CreateTable(); @@ -2463,18 +2038,15 @@ TEST_P(GlobalIndexTest, TestLuceneWriteCommitScanReadIndexWithScore) { ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "lucene-fts", /*options=*/lucene_options, Range(0, 3))); - ASSERT_OK_AND_ASSIGN( - auto global_index_scan, - GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, - /*partitions=*/std::nullopt, /*options=*/{}, fs_, pool_)); - ASSERT_OK_AND_ASSIGN(std::vector ranges, global_index_scan->GetRowRangeList()); - ASSERT_EQ(ranges, std::vector({Range(0, 3)})); - ASSERT_OK_AND_ASSIGN(auto range_scanner, global_index_scan->CreateRangeScan(Range(0, 3))); - auto scanner_impl = std::dynamic_pointer_cast(range_scanner); - ASSERT_TRUE(scanner_impl); - + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); // test f0 field - ASSERT_OK_AND_ASSIGN(auto index_reader, range_scanner->CreateReader("f0", "lucene-fts")); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", /*row_range_index=*/std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + auto index_reader = index_readers[0]; { ASSERT_OK_AND_ASSIGN(auto index_result, index_reader->VisitFullTextSearch(std::make_shared( @@ -2503,6 +2075,866 @@ TEST_P(GlobalIndexTest, TestLuceneWriteCommitScanReadIndexWithScore) { } #endif +TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndex) { + // BTreeGlobalIndexWriter requires keys to be written in monotonically increasing order. + // Therefore the source data must be pre-sorted by the indexed column (f0, string). 
+ CreateTable(); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + auto schema = arrow::schema(fields_); + + std::vector write_cols = schema->field_names(); + + // Data sorted by f0 (string, ascending): Alice < Bob < Bob < Emily < Lucy < Tony < Tony + // The last row has f0=null which is treated separately by the null bitmap. + auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 20, 0, 16.1], +["Emily", 10, 0, 13.1], +["Lucy", 20, 1, 15.1], +["Tony", 10, 0, 14.1], +["Tony", 20, 0, 17.1], +[null, 20, null, 18.1] + ])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Write btree-global index on f0 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "btree", + /*options=*/{}, Range(0, 7))); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", /*row_range_index=*/std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + auto index_reader = index_readers[0]; + + { + // VisitEqual: "Alice" -> row 0 + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0}"); + } + { + // VisitEqual: "Bob" -> rows 1,2 + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + { + // VisitEqual: non-existent key -> empty + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitEqual(Literal(FieldType::STRING, "Zara", 4))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{}"); + } + 
{ + // VisitNotEqual: "Bob" -> all non-null except Bob rows -> {0,3,4,5,6} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitNotEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,3,4,5,6}"); + } + { + // VisitIsNull -> row 7 (null key) + ASSERT_OK_AND_ASSIGN(auto result, index_reader->VisitIsNull()); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{7}"); + } + { + // VisitIsNotNull -> rows 0-6 + ASSERT_OK_AND_ASSIGN(auto result, index_reader->VisitIsNotNull()); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1,2,3,4,5,6}"); + } + { + // VisitIn: {"Alice", "Lucy"} -> rows {0, 4} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitIn({Literal(FieldType::STRING, "Alice", 5), + Literal(FieldType::STRING, "Lucy", 4)})); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,4}"); + } + { + // VisitNotIn: {"Alice", "Lucy"} -> all non-null except {0,4} -> {1,2,3,5,6} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitNotIn({Literal(FieldType::STRING, "Alice", 5), + Literal(FieldType::STRING, "Lucy", 4)})); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2,3,5,6}"); + } + { + // VisitLessThan: "Emily" -> keys < "Emily" -> Alice(0), Bob(1,2) -> {0,1,2} + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitLessThan(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1,2}"); + } + { + // VisitLessOrEqual: "Emily" -> keys <= "Emily" -> Alice(0), Bob(1,2), Emily(3) -> + // {0,1,2,3} + ASSERT_OK_AND_ASSIGN( + auto result, index_reader->VisitLessOrEqual(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1,2,3}"); + } + { + // VisitGreaterThan: "Emily" -> keys > "Emily" -> Lucy(4), Tony(5,6) -> {4,5,6} + ASSERT_OK_AND_ASSIGN( + auto result, index_reader->VisitGreaterThan(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), 
"{4,5,6}"); + } + { + // VisitGreaterOrEqual: "Emily" -> keys >= "Emily" -> Emily(3), Lucy(4), Tony(5,6) -> + // {3,4,5,6} + ASSERT_OK_AND_ASSIGN( + auto result, index_reader->VisitGreaterOrEqual(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{3,4,5,6}"); + } + + auto scan_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scan_impl); + { + // Equal predicate via evaluator + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Tony", 4)); + ASSERT_OK_AND_ASSIGN(auto result, scan_impl->Scan(predicate)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{5,6}"); + } + { + // AND predicate: f0 == "Bob" AND f1 == 20 + // f0 == "Bob" -> {1,2}, but f1 index does not exist -> AND yields {1,2} + // (fields without index return nullptr, AND with nullptr keeps the other side) + auto f0_predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + auto f1_predicate = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", + FieldType::INT, Literal(20)); + ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_predicate, f1_predicate})); + ASSERT_OK_AND_ASSIGN(auto result, scan_impl->Scan(predicate)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + { + // row_range_index filtering: range [0,2] should only load that range + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(0, 2)})); + ASSERT_OK_AND_ASSIGN(auto range_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(range_readers.size(), 1u); + ASSERT_OK_AND_ASSIGN(auto result, + range_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0}"); + } + { + // Invalid row_range_index: no intersection -> empty readers + 
ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, + RowRangeIndex::Create({Range(100, 200)})); + ASSERT_OK_AND_ASSIGN(auto range_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(range_readers.size(), 0u); + } + + // Test full pipeline: scan with predicate -> read data + { + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + auto scan_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_OK_AND_ASSIGN(auto index_result, scan_impl->Scan(predicate)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{1,2}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Bob", 10, 1, 12.1], +[0, "Bob", 20, 0, 16.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } +} + +TEST_P(GlobalIndexTest, TestBTreeWriteCommitScanReadIndexWithPartition) { + // BTree index with partitioned table. Each partition's data is sorted by f0 independently. + auto schema = arrow::schema(fields_); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, file_format_}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{"f1"}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + // Write partition f1=10. 
Data sorted by f0: Alice < Bob < Bob < Emily < Tony + auto src_array1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 10, 0, 13.1], +["Emily", 10, 0, 14.1], +["Tony", 10, 1, 15.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs1, + WriteArray(table_path, {{"f1", "10"}}, write_cols, src_array1)); + ASSERT_OK(Commit(table_path, commit_msgs1)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "10"}}}, "f0", "btree", + /*options=*/{}, Range(0, 4))); + + // Write partition f1=20. Data sorted by f0: Alice < Lucy < Tony + auto src_array2 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 20, null, 16.1], +["Lucy", 20, 1, 17.1], +["Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs2, + WriteArray(table_path, {{"f1", "20"}}, write_cols, src_array2)); + ASSERT_OK(Commit(table_path, commit_msgs2)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "20"}}}, "f0", "btree", + /*options=*/{}, Range(5, 7))); + + // Scan all partitions + { + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, + fs_, /*executor=*/nullptr, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + // One reader per partition range -> 2 ranges -> UnionGlobalIndexReader wraps them + ASSERT_EQ(index_readers.size(), 1u); + + // "Alice" exists in both partitions: local ids {0} in range [0,4] -> global 0, + // and local ids {0} in range [5,7] -> global 5 + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,5}"); + + // "Bob" only in partition f1=10: local ids {1,2} -> global {1,2} + ASSERT_OK_AND_ASSIGN(auto result2, + 
index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{1,2}"); + + // "Lucy" only in partition f1=20: local ids {1} -> global {6} + ASSERT_OK_AND_ASSIGN(auto result3, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Lucy", 4))); + ASSERT_TRUE(result3); + ASSERT_EQ(result3->ToString(), "{6}"); + } + + // Scan with partition filter: only f1=10 + { + std::vector> partitions = {{{"f1", "10"}}}; + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, partitions, + /*options=*/{}, fs_, /*executor=*/nullptr, pool_)); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + + // "Alice" in f1=10 only -> global {0} + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0}"); + + // "Lucy" not in f1=10 -> empty + ASSERT_OK_AND_ASSIGN(auto result2, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Lucy", 4))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{}"); + } + + // Scan with row_range_index filtering: only range [5,7] (partition f1=20) + { + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, + fs_, /*executor=*/nullptr, pool_)); + ASSERT_OK_AND_ASSIGN(RowRangeIndex row_range_index, RowRangeIndex::Create({Range(5, 7)})); + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", row_range_index)); + ASSERT_EQ(index_readers.size(), 1u); + + // "Tony" in range [5,7]: local id {2} in range [5,7] -> global {7} + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Tony", 4))); + ASSERT_TRUE(result); + 
ASSERT_EQ(result->ToString(), "{7}"); + } + + // Full pipeline with evaluator: Scan(predicate) -> read data + { + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, + fs_, /*executor=*/nullptr, pool_)); + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Tony", 4)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{4,7}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Tony", 10, 1, 15.1], +[0, "Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } +} + +TEST_P(GlobalIndexTest, TestBTreeWithPartitionAndCustomExecutor) { + // Test that UnionGlobalIndexReader uses a custom 8-thread executor to read + // btree indexes from two partitions in parallel. 
+ auto schema = arrow::schema(fields_); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, file_format_}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{"f1"}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + // Write partition f1=10 (5 rows, sorted by f0) + auto src_array1 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 10, 0, 13.1], +["Emily", 10, 0, 14.1], +["Tony", 10, 1, 15.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs1, + WriteArray(table_path, {{"f1", "10"}}, write_cols, src_array1)); + ASSERT_OK(Commit(table_path, commit_msgs1)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "10"}}}, "f0", "btree", + /*options=*/{}, Range(0, 4))); + + // Write partition f1=20 (3 rows, sorted by f0) + auto src_array2 = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 20, null, 16.1], +["Lucy", 20, 1, 17.1], +["Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto commit_msgs2, + WriteArray(table_path, {{"f1", "20"}}, write_cols, src_array2)); + ASSERT_OK(Commit(table_path, commit_msgs2)); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{{{"f1", "20"}}}, "f0", "btree", + /*options=*/{}, Range(5, 7))); + + // Create a GlobalIndexScan with an explicit 8-thread executor + std::shared_ptr executor = CreateDefaultExecutor(/*thread_count=*/8); + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, executor, pool_)); + + // CreateReaders should return 1 UnionGlobalIndexReader (2 sub-readers for 2 ranges) + ASSERT_OK_AND_ASSIGN(auto 
index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + + auto union_reader = std::dynamic_pointer_cast(index_readers[0]); + ASSERT_TRUE(union_reader); + ASSERT_EQ(union_reader->executor_, executor); + + // "Alice" in both partitions: global ids {0, 5} + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,5}"); + + // "Bob" only in f1=10: global ids {1, 2} + ASSERT_OK_AND_ASSIGN(auto result2, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{1,2}"); + + // "Lucy" only in f1=20: global id {6} + ASSERT_OK_AND_ASSIGN(auto result3, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Lucy", 4))); + ASSERT_TRUE(result3); + ASSERT_EQ(result3->ToString(), "{6}"); + + // Full pipeline: evaluator with the 8-thread executor + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Tony", 4)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{4,7}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Tony", 10, 1, 15.1], +[0, "Tony", 20, 0, 18.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); +} + +TEST_P(GlobalIndexTest, TestBTreeAndBitmapCoexist) { + // Test btree-global and bitmap index 
coexisting on the same field (f0). + // The evaluator should AND their results, producing the intersection. + CreateTable(); + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + auto schema = arrow::schema(fields_); + std::vector write_cols = schema->field_names(); + + // Data sorted by f0 for btree: Alice < Bob < Bob < Emily < Lucy < Tony < Tony + auto src_array = arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields_), R"([ +["Alice", 10, 1, 11.1], +["Bob", 10, 1, 12.1], +["Bob", 25, 1, 16.1], +["Emily", 15, 0, 13.1], +["Lucy", 20, 1, 15.1], +["Tony", 20, 0, 14.1], +["Tony", 30, 0, 17.1], +[null, 30, null, 18.1] + ])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Build both indexes on f0 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "btree", + /*options=*/{}, Range(0, 7))); + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "bitmap", + /*options=*/{}, Range(0, 7))); + + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); + + // Two index types on f0 -> 2 readers + ASSERT_OK_AND_ASSIGN(auto index_readers, global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 2u); + + // Each reader individually should return the same result for Equal("Bob") + for (const auto& index_reader : index_readers) { + ASSERT_OK_AND_ASSIGN(auto result, + index_reader->VisitEqual(Literal(FieldType::STRING, "Bob", 3))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + + // Via evaluator: the two indexes' results get AND, still {1,2} + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + ASSERT_OK_AND_ASSIGN(auto evaluator, 
scanner_impl->GetOrCreateIndexEvaluator()); + { + // Equal predicate + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + ASSERT_OK_AND_ASSIGN(auto result, evaluator->Evaluate(predicate)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{1,2}"); + } + { + // NotEqual predicate: both indexes agree on non-null, non-"Bob" rows -> {0,3,4,5,6} + auto predicate = + PredicateBuilder::NotEqual(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + ASSERT_OK_AND_ASSIGN(auto result, evaluator->Evaluate(predicate)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,3,4,5,6}"); + } + { + // IsNull: both agree on row 7 + auto predicate = + PredicateBuilder::IsNull(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING); + ASSERT_OK_AND_ASSIGN(auto result, evaluator->Evaluate(predicate)); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{7}"); + } + + // Full pipeline: f0 == "Alice" -> read data + { + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Alice", 5)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{0}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Alice", 10, 1, 11.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } + // Full pipeline with AND across btree(f0) and bitmap(f0): + // btree supports LessOrEqual, bitmap returns nullptr for LessOrEqual + // 
Case below: AND of two Equal predicates -> f0 (btree + bitmap) and f1 (no index, skipped) + { + // f0 == "Bob" AND f1 == 10 (f1 has no index -> nullptr -> keeps btree+bitmap result) + auto f0_pred = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + auto f1_pred = PredicateBuilder::Equal(/*field_index=*/1, /*field_name=*/"f1", + FieldType::INT, Literal(10)); + ASSERT_OK_AND_ASSIGN(auto predicate, PredicateBuilder::And({f0_pred, f1_pred})); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{1,2}"); + + auto result_fields = fields_; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Bob", 10, 1, 12.1], +[0, "Bob", 25, 1, 16.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } +} + +TEST_P(GlobalIndexTest, TestBTreeScanWithPartitionWithMultiMeta) { + if (file_format_ == "lance" || file_format_ == "avro") { + return; + } + std::string table_path = + paimon::test::GetDataDir() + "/" + file_format_ + + "/append_with_btree_with_partition.db/append_with_btree_with_partition"; + + ASSERT_OK_AND_ASSIGN(std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, /*options=*/{}, fs_, + /*executor=*/nullptr, pool_)); + + auto count_rows = [](const std::shared_ptr& result) -> int64_t { + EXPECT_TRUE(result); + EXPECT_OK_AND_ASSIGN(std::vector ranges, result->ToRanges()); + int64_t total = 0; + for (const auto& range : ranges) { + total += range.Count(); + } + return total; + }; + + auto get_reader = [&](const 
std::string& column) -> std::shared_ptr { + EXPECT_OK_AND_ASSIGN(auto readers, global_index_scan->CreateReaders(column, std::nullopt)); + EXPECT_EQ(readers.size(), 1u); + return readers[0]; + }; + + // ---- col_boolean ---- + { + auto reader = get_reader("col_boolean"); + ASSERT_TRUE(reader); + ASSERT_OK_AND_ASSIGN(auto eq_true, reader->VisitEqual(Literal(true))); + ASSERT_EQ(count_rows(eq_true), 20); + ASSERT_OK_AND_ASSIGN(auto eq_false, reader->VisitEqual(Literal(false))); + ASSERT_EQ(count_rows(eq_false), 20); + } + + // ---- col_int ---- + { + auto reader = get_reader("col_int"); + ASSERT_TRUE(reader); + ASSERT_OK_AND_ASSIGN(auto eq_15, reader->VisitEqual(Literal(15))); + ASSERT_EQ(count_rows(eq_15), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, reader->VisitEqual(Literal(100))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(30): i*3 > 30 -> i in [11, 19], 9 indices per partition -> 18 rows. + ASSERT_OK_AND_ASSIGN(auto gt_30, reader->VisitGreaterThan(Literal(30))); + ASSERT_EQ(count_rows(gt_30), 18); + // GreaterThan(57): nothing greater than the max value. + ASSERT_OK_AND_ASSIGN(auto gt_max, reader->VisitGreaterThan(Literal(57))); + ASSERT_EQ(count_rows(gt_max), 0); + } + + // ---- col_date (values are 18000 + i for i in [0,19]) ---- + { + auto reader = get_reader("col_date"); + ASSERT_TRUE(reader); + // 18005 is present at i=5 in both partitions. + ASSERT_OK_AND_ASSIGN(auto eq_present, reader->VisitEqual(Literal(FieldType::DATE, 18005))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, reader->VisitEqual(Literal(FieldType::DATE, 17999))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(18010): i in [11, 19] -> 9 per partition -> 18 rows. 
+ ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan(Literal(FieldType::DATE, 18010))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_double (values are i * 2.2 for i in [0,19]) ---- + { + auto reader = get_reader("col_double"); + ASSERT_TRUE(reader); + // i=5 -> 11.0 + ASSERT_OK_AND_ASSIGN(auto eq_present, reader->VisitEqual(Literal(11.0))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, reader->VisitEqual(Literal(123.456))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(10 * 2.2 = 22.0): i in [11, 19] -> 9 per partition -> 18 rows. + ASSERT_OK_AND_ASSIGN(auto gt_mid, reader->VisitGreaterThan(Literal(10 * 2.2))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_timestamp (Timestamp from epoch millis = 1700000000000 + i*1000) ---- + { + auto reader = get_reader("col_timestamp"); + ASSERT_TRUE(reader); + // i=5 -> 1700000005000 ms. + ASSERT_OK_AND_ASSIGN( + auto eq_present, + reader->VisitEqual(Literal(Timestamp::FromEpochMillis(1700000000000L + 5 * 1000L)))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, + reader->VisitEqual(Literal(Timestamp::FromEpochMillis(1L)))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(i=10 boundary): i in [11, 19] -> 18 rows globally. 
+ ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan( + Literal(Timestamp::FromEpochMillis(1700000000000L + 10 * 1000L)))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_timestamp_ltz (same physical values as col_timestamp) ---- + { + auto reader = get_reader("col_timestamp_ltz"); + ASSERT_TRUE(reader); + ASSERT_OK_AND_ASSIGN( + auto eq_present, + reader->VisitEqual(Literal(Timestamp::FromEpochMillis(1700000000000L + 7 * 1000L)))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan( + Literal(Timestamp::FromEpochMillis(1700000000000L + 10 * 1000L)))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_decimal (unscaled = i * 123456, precision=18, scale=6) ---- + { + auto reader = get_reader("col_decimal"); + ASSERT_TRUE(reader); + // i=5 -> unscaled 617280 + ASSERT_OK_AND_ASSIGN( + auto eq_present, + reader->VisitEqual(Literal(Decimal::FromUnscaledLong(5 * 123456L, 18, 6)))); + ASSERT_EQ(count_rows(eq_present), 2); + ASSERT_OK_AND_ASSIGN(auto eq_missing, + reader->VisitEqual(Literal(Decimal::FromUnscaledLong(1L, 18, 6)))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan(i=10): i in [11, 19] -> 18 rows globally. + ASSERT_OK_AND_ASSIGN( + auto gt_mid, + reader->VisitGreaterThan(Literal(Decimal::FromUnscaledLong(10 * 123456L, 18, 6)))); + ASSERT_EQ(count_rows(gt_mid), 18); + } + + // ---- col_string (values are "str_00000" .. 
"str_00019") ---- + { + auto reader = get_reader("col_string"); + ASSERT_TRUE(reader); + std::string present_value = "str_00005"; + ASSERT_OK_AND_ASSIGN(auto eq_present, + reader->VisitEqual(Literal(FieldType::STRING, present_value.data(), + present_value.size()))); + ASSERT_EQ(count_rows(eq_present), 2); + std::string missing_value = "str_99999"; + ASSERT_OK_AND_ASSIGN(auto eq_missing, + reader->VisitEqual(Literal(FieldType::STRING, missing_value.data(), + missing_value.size()))); + ASSERT_EQ(count_rows(eq_missing), 0); + // GreaterThan("str_00010"): lexicographically greater values are i in [11, 19]. + std::string mid_value = "str_00010"; + ASSERT_OK_AND_ASSIGN(auto gt_mid, + reader->VisitGreaterThan( + Literal(FieldType::STRING, mid_value.data(), mid_value.size()))); + ASSERT_EQ(count_rows(gt_mid), 18); + } +} + +#ifdef PAIMON_ENABLE_LUMINA +TEST_P(GlobalIndexTest, TestBTreeWithLumina) { + // Test btree on f0 (string) and lumina on f1 (vector) coexisting on different fields. + arrow::FieldVector fields = { + arrow::field("f0", arrow::utf8()), arrow::field("f1", arrow::list(arrow::float32())), + arrow::field("f2", arrow::int32()), arrow::field("f3", arrow::float64())}; + std::map lumina_options = {{"lumina.index.dimension", "4"}, + {"lumina.index.type", "bruteforce"}, + {"lumina.distance.metric", "l2"}, + {"lumina.encoding.type", "rawf32"}, + {"lumina.search.parallel_number", "10"}}; + auto schema = arrow::schema(fields); + std::map options = {{Options::MANIFEST_FORMAT, "orc"}, + {Options::FILE_FORMAT, file_format_}, + {Options::FILE_SYSTEM, "local"}, + {Options::ROW_TRACKING_ENABLED, "true"}, + {Options::DATA_EVOLUTION_ENABLED, "true"}}; + CreateTable(/*partition_keys=*/{}, schema, options); + + std::string table_path = PathUtil::JoinPath(dir_->Str(), "foo.db/bar"); + std::vector write_cols = schema->field_names(); + + // Data sorted by f0 for btree: Alice < Alice < Bob < Bob < Emily < Lucy < Paul < Tony + auto src_array = 
arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(fields), R"([ +["Alice", [0.0, 0.0, 0.0, 0.0], 10, 11.1], +["Alice", [11.0, 11.0, 11.0, 11.0], 20, 18.1], +["Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], +["Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1], +["Emily", [1.0, 0.0, 1.0, 0.0], 10, 13.1], +["Lucy", [10.0, 10.0, 10.0, 10.0], 20, 15.1], +["Paul", [10.0, 10.0, 10.0, 10.0], 20, 19.1], +["Tony", [1.0, 1.0, 1.0, 1.0], 10, 14.1] + ])") + .ValueOrDie(); + + ASSERT_OK_AND_ASSIGN(auto commit_msgs, WriteArray(table_path, write_cols, src_array)); + ASSERT_OK(Commit(table_path, commit_msgs)); + + // Build btree index on f0 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f0", "btree", + /*options=*/{}, Range(0, 7))); + // Build lumina index on f1 + ASSERT_OK(WriteIndex(table_path, /*partition_filters=*/{}, "f1", "lumina", + /*options=*/lumina_options, Range(0, 7))); + + ASSERT_OK_AND_ASSIGN( + std::shared_ptr global_index_scan, + GlobalIndexScan::Create(table_path, /*snapshot_id=*/std::nullopt, + /*partitions=*/std::nullopt, + /*options=*/lumina_options, fs_, /*executor=*/nullptr, pool_)); + + // Query f0 via btree + { + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + + ASSERT_OK_AND_ASSIGN(auto result, + index_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(result); + ASSERT_EQ(result->ToString(), "{0,1}"); + + ASSERT_OK_AND_ASSIGN( + auto result2, index_readers[0]->VisitLessThan(Literal(FieldType::STRING, "Emily", 5))); + ASSERT_TRUE(result2); + ASSERT_EQ(result2->ToString(), "{0,1,2,3}"); + } + + // Query f1 via lumina (vector search) + { + ASSERT_OK_AND_ASSIGN(auto index_readers, + global_index_scan->CreateReaders("f1", std::nullopt)); + ASSERT_EQ(index_readers.size(), 1u); + std::vector query = {11.0f, 11.0f, 11.0f, 11.0f}; + auto vector_search = std::make_shared( + "f1", /*limit=*/1, query, /*filter=*/nullptr, + /*predicate=*/nullptr, 
/*distance_type=*/std::nullopt, /*options=*/lumina_options); + ASSERT_OK_AND_ASSIGN(auto scored_result, + index_readers[0]->VisitVectorSearch(vector_search)); + ASSERT_TRUE(scored_result); + ASSERT_EQ(scored_result->ToString(), "row ids: {1}, scores: {0.00}"); + } + + // Evaluator: btree on f0 = "Bob" + { + auto scanner_impl = std::dynamic_pointer_cast(global_index_scan); + ASSERT_TRUE(scanner_impl); + auto predicate = + PredicateBuilder::Equal(/*field_index=*/0, /*field_name=*/"f0", FieldType::STRING, + Literal(FieldType::STRING, "Bob", 3)); + ASSERT_OK_AND_ASSIGN(auto index_result, scanner_impl->Scan(predicate)); + ASSERT_TRUE(index_result); + ASSERT_EQ(index_result->ToString(), "{2,3}"); + + // Read data for Bob + auto result_fields = fields; + result_fields.insert(result_fields.begin(), SpecialFields::ValueKind().ArrowField()); + auto expected_array = + arrow::ipc::internal::json::ArrayFromJSON(arrow::struct_(result_fields), R"([ +[0, "Bob", [0.0, 1.0, 0.0, 1.0], 10, 12.1], +[0, "Bob", [10.0, 11.0, 10.0, 11.0], 20, 16.1] + ])") + .ValueOrDie(); + ASSERT_OK_AND_ASSIGN(auto plan, ScanGlobalIndexAndData(table_path, /*predicate=*/nullptr, + /*options=*/{}, index_result)); + ASSERT_OK(ReadData(table_path, write_cols, expected_array, /*predicate=*/nullptr, plan)); + } + + // Combined: btree f0 filter + lumina vector search with pre-filter + // Use btree result as pre_filter for lumina search + { + ASSERT_OK_AND_ASSIGN(auto btree_readers, + global_index_scan->CreateReaders("f0", std::nullopt)); + ASSERT_EQ(btree_readers.size(), 1u); + // Get rows where f0 == "Alice" -> {0, 1} + ASSERT_OK_AND_ASSIGN(auto btree_result, + btree_readers[0]->VisitEqual(Literal(FieldType::STRING, "Alice", 5))); + ASSERT_TRUE(btree_result); + ASSERT_EQ(btree_result->ToString(), "{0,1}"); + + // Now vector search on f1 with pre_filter limiting to Alice's rows {0, 1} + ASSERT_OK_AND_ASSIGN(auto lumina_readers, + global_index_scan->CreateReaders("f1", std::nullopt)); + 
ASSERT_EQ(lumina_readers.size(), 1u); + std::vector query = {11.0f, 11.0f, 11.0f, 11.0f}; + auto filter = [](int64_t id) -> bool { return id == 0 || id == 1; }; + auto vector_search = std::make_shared( + "f1", /*limit=*/1, query, filter, + /*predicate=*/nullptr, /*distance_type=*/std::nullopt, /*options=*/lumina_options); + ASSERT_OK_AND_ASSIGN(auto scored_result, + lumina_readers[0]->VisitVectorSearch(vector_search)); + ASSERT_EQ(scored_result->ToString(), "row ids: {1}, scores: {0.00}"); + } +} +#endif + std::vector GetTestValuesForGlobalIndexTest() { std::vector values; values.emplace_back("parquet", false); diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/README b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/README new file mode 100644 index 000000000..7ab7a6619 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/README @@ -0,0 +1,27 @@ +pt:int +col_boolean:bool +col_int:int +col_date:int +col_double:double +col_timestamp:timestamp +col_timestamp_ltz:timestamp with local timezone +col_decimal:decimal +col_string:string + +pt:partition key +no bucket key +bucket count: -1 +global btree index: col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string + +Msgs: +snapshot-1 +Add:(0, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-2 +Add:(1, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-3: with global btree index for col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string \ No newline at end of file diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0323e05c-8903-49a2-9bc0-0a7e5a395211.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0323e05c-8903-49a2-9bc0-0a7e5a395211.index new file mode 100644 index 000000000..e1f90eab9 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0323e05c-8903-49a2-9bc0-0a7e5a395211.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-061cad82-2fc6-448e-a3ce-7ac49c836273.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-061cad82-2fc6-448e-a3ce-7ac49c836273.index new file mode 100644 index 000000000..9f8be0e9a Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-061cad82-2fc6-448e-a3ce-7ac49c836273.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1c6c0c2a-b8f0-410e-90bf-26aadf8472af.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1c6c0c2a-b8f0-410e-90bf-26aadf8472af.index new file mode 100644 index 000000000..a51d3c3dc Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1c6c0c2a-b8f0-410e-90bf-26aadf8472af.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1cf3190c-b80f-46b9-bce3-0566612a2e3c.index 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1cf3190c-b80f-46b9-bce3-0566612a2e3c.index new file mode 100644 index 000000000..129384f37 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-1cf3190c-b80f-46b9-bce3-0566612a2e3c.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2525fcef-a890-403a-a587-c9d63329adc6.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2525fcef-a890-403a-a587-c9d63329adc6.index new file mode 100644 index 000000000..f51554f65 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2525fcef-a890-403a-a587-c9d63329adc6.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2d75bcb5-1f7d-40b6-9e13-ed11ac16cbb2.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2d75bcb5-1f7d-40b6-9e13-ed11ac16cbb2.index new file mode 100644 index 000000000..b4c3ecbb0 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-2d75bcb5-1f7d-40b6-9e13-ed11ac16cbb2.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-3311dd84-93f0-4b16-b3eb-54864d44767b.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-3311dd84-93f0-4b16-b3eb-54864d44767b.index new file mode 100644 index 000000000..e1f90eab9 Binary files /dev/null and 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-3311dd84-93f0-4b16-b3eb-54864d44767b.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-476c39a6-b760-45db-9602-a1e6559cfabe.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-476c39a6-b760-45db-9602-a1e6559cfabe.index new file mode 100644 index 000000000..31b38f523 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-476c39a6-b760-45db-9602-a1e6559cfabe.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4c089e9d-f607-41dc-8edc-41e10a951eac.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4c089e9d-f607-41dc-8edc-41e10a951eac.index new file mode 100644 index 000000000..3bc7e15b4 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4c089e9d-f607-41dc-8edc-41e10a951eac.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4ed7238f-dd41-4ab0-ad3a-816a8f26aba3.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4ed7238f-dd41-4ab0-ad3a-816a8f26aba3.index new file mode 100644 index 000000000..92fb4eff3 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-4ed7238f-dd41-4ab0-ad3a-816a8f26aba3.index differ diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6eb46ccf-94b1-4ba2-8704-9ce827de9c5b.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6eb46ccf-94b1-4ba2-8704-9ce827de9c5b.index new file mode 100644 index 000000000..ad0d0df72 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6eb46ccf-94b1-4ba2-8704-9ce827de9c5b.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77eb6c57-c860-47d1-83a9-ec93886e76c9.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77eb6c57-c860-47d1-83a9-ec93886e76c9.index new file mode 100644 index 000000000..ca42db01f Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77eb6c57-c860-47d1-83a9-ec93886e76c9.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a7e9396-772a-40c2-811c-ddc1f716e1d9.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a7e9396-772a-40c2-811c-ddc1f716e1d9.index new file mode 100644 index 000000000..e8ff3b77c Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a7e9396-772a-40c2-811c-ddc1f716e1d9.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-84feda98-48d8-40e1-84a1-5c5b737ef5e5.index 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-84feda98-48d8-40e1-84a1-5c5b737ef5e5.index new file mode 100644 index 000000000..ada7273d6 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-84feda98-48d8-40e1-84a1-5c5b737ef5e5.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8a91544c-8779-40bf-a66c-4f84f576a6ec.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8a91544c-8779-40bf-a66c-4f84f576a6ec.index new file mode 100644 index 000000000..3e97caec2 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8a91544c-8779-40bf-a66c-4f84f576a6ec.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b658fde-dc52-4948-8733-7a6b9e2aa58e.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b658fde-dc52-4948-8733-7a6b9e2aa58e.index new file mode 100644 index 000000000..d38a31e03 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b658fde-dc52-4948-8733-7a6b9e2aa58e.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b964514-2199-42d4-b2c2-625b461f82f3.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b964514-2199-42d4-b2c2-625b461f82f3.index new file mode 100644 index 000000000..e587bf07d Binary files /dev/null and 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8b964514-2199-42d4-b2c2-625b461f82f3.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8dbf5e09-7023-4f97-8bbb-248d3eec6bed.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8dbf5e09-7023-4f97-8bbb-248d3eec6bed.index new file mode 100644 index 000000000..450f92ff4 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8dbf5e09-7023-4f97-8bbb-248d3eec6bed.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-97cac122-2d88-4b84-88aa-efe022506163.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-97cac122-2d88-4b84-88aa-efe022506163.index new file mode 100644 index 000000000..242c6e094 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-97cac122-2d88-4b84-88aa-efe022506163.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-9f0d6ea2-e3d9-4877-9c57-85c285a2cd9d.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-9f0d6ea2-e3d9-4877-9c57-85c285a2cd9d.index new file mode 100644 index 000000000..e87212693 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-9f0d6ea2-e3d9-4877-9c57-85c285a2cd9d.index differ diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ae2b3e23-a92a-4688-b6f9-b35117567156.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ae2b3e23-a92a-4688-b6f9-b35117567156.index new file mode 100644 index 000000000..88fe380f7 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ae2b3e23-a92a-4688-b6f9-b35117567156.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-af76d02b-7f3a-4f6b-b45b-db44c5ff24cc.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-af76d02b-7f3a-4f6b-b45b-db44c5ff24cc.index new file mode 100644 index 000000000..129384f37 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-af76d02b-7f3a-4f6b-b45b-db44c5ff24cc.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-bd1a00c8-ff45-426d-af34-f7080ed247ba.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-bd1a00c8-ff45-426d-af34-f7080ed247ba.index new file mode 100644 index 000000000..885833853 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-bd1a00c8-ff45-426d-af34-f7080ed247ba.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-befa46d1-beb2-4a58-912b-2318efd2105c.index 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-befa46d1-beb2-4a58-912b-2318efd2105c.index new file mode 100644 index 000000000..c28678531 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-befa46d1-beb2-4a58-912b-2318efd2105c.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c146d9cb-5e43-4bf8-891b-a05f92cd0563.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c146d9cb-5e43-4bf8-891b-a05f92cd0563.index new file mode 100644 index 000000000..92fb4eff3 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c146d9cb-5e43-4bf8-891b-a05f92cd0563.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d8503faa-e1d3-474e-a1f1-2e380dec91d3.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d8503faa-e1d3-474e-a1f1-2e380dec91d3.index new file mode 100644 index 000000000..4eb313ad7 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d8503faa-e1d3-474e-a1f1-2e380dec91d3.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d99a76c1-c181-49d8-8882-812700de4013.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d99a76c1-c181-49d8-8882-812700de4013.index new file mode 100644 index 000000000..3b2c1ea22 Binary files /dev/null and 
b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d99a76c1-c181-49d8-8882-812700de4013.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-da0c9c05-4f11-4e9a-8246-01582ee5e614.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-da0c9c05-4f11-4e9a-8246-01582ee5e614.index new file mode 100644 index 000000000..d50b07abc Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-da0c9c05-4f11-4e9a-8246-01582ee5e614.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e81874df-120e-4124-9517-b8b7fbefc336.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e81874df-120e-4124-9517-b8b7fbefc336.index new file mode 100644 index 000000000..d38a31e03 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e81874df-120e-4124-9517-b8b7fbefc336.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f35fe6ce-b072-4fd6-a91c-a5d940ae5998.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f35fe6ce-b072-4fd6-a91c-a5d940ae5998.index new file mode 100644 index 000000000..a9cc153e6 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f35fe6ce-b072-4fd6-a91c-a5d940ae5998.index differ diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f530db7c-32f7-4f35-b5d2-4b7f321488c4.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f530db7c-32f7-4f35-b5d2-4b7f321488c4.index new file mode 100644 index 000000000..5887cd946 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f530db7c-32f7-4f35-b5d2-4b7f321488c4.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f67895be-6638-449c-9a1b-f2ac7d0efc7d.index b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f67895be-6638-449c-9a1b-f2ac7d0efc7d.index new file mode 100644 index 000000000..21c0c6bb5 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f67895be-6638-449c-9a1b-f2ac7d0efc7d.index differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-efa3dce8-fe86-499a-b436-182337edd173-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-efa3dce8-fe86-499a-b436-182337edd173-0 new file mode 100644 index 000000000..cd74bae96 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-efa3dce8-fe86-499a-b436-182337edd173-0 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-804b07d3-0855-4246-b778-8bb12a5154dc-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-804b07d3-0855-4246-b778-8bb12a5154dc-0 new file mode 
100644 index 000000000..81af93e82 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-804b07d3-0855-4246-b778-8bb12a5154dc-0 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-d67632fc-11f7-484d-8634-5d4ae78743e5-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-d67632fc-11f7-484d-8634-5d4ae78743e5-0 new file mode 100644 index 000000000..797583ea3 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-d67632fc-11f7-484d-8634-5d4ae78743e5-0 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0 new file mode 100644 index 000000000..0214cefe1 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-1 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-1 new file mode 100644 index 000000000..94664451c Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-1 differ diff --git 
a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0 new file mode 100644 index 000000000..af2329058 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1 new file mode 100644 index 000000000..56512396f Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-0 new file mode 100644 index 000000000..45f1a6a35 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-0 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-1 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-1 new file mode 100644 index 000000000..a11c270e4 
Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-1 differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-0f1a5888-3dd7-4cc9-95f9-9a6cc88e88cf-0.parquet b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-0f1a5888-3dd7-4cc9-95f9-9a6cc88e88cf-0.parquet new file mode 100644 index 000000000..2d9ad405e Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-0f1a5888-3dd7-4cc9-95f9-9a6cc88e88cf-0.parquet differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-38231641-4057-4e0d-a2d4-91a7842d2acb-0.parquet b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-38231641-4057-4e0d-a2d4-91a7842d2acb-0.parquet new file mode 100644 index 000000000..003f1d6f5 Binary files /dev/null and b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-38231641-4057-4e0d-a2d4-91a7842d2acb-0.parquet differ diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 new file mode 100644 index 000000000..b011496f9 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 @@ -0,0 +1,50 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "pt", + "type" : "INT" + }, { + "id" : 1, + "name" : "col_boolean", + "type" : "BOOLEAN" + }, { + "id" : 2, + "name" : "col_int", + "type" : "INT" + }, { + "id" : 3, + "name" : "col_date", + "type" : "DATE" 
+ }, { + "id" : 4, + "name" : "col_double", + "type" : "DOUBLE" + }, { + "id" : 5, + "name" : "col_timestamp", + "type" : "TIMESTAMP(3)" + }, { + "id" : 6, + "name" : "col_timestamp_ltz", + "type" : "TIMESTAMP(3) WITH LOCAL TIME ZONE" + }, { + "id" : 7, + "name" : "col_decimal", + "type" : "DECIMAL(18, 6)" + }, { + "id" : 8, + "name" : "col_string", + "type" : "STRING" + } ], + "highestFieldId" : 8, + "partitionKeys" : [ "pt" ], + "primaryKeys" : [ ], + "options" : { + "btree-index.records-per-range" : "10", + "data-evolution.enabled" : "true", + "row-tracking.enabled" : "true" + }, + "timeMillis" : 1777884796113 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST new file mode 100644 index 000000000..56a6051ca --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST new file mode 100644 index 000000000..e440e5c84 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST @@ -0,0 +1 @@ +3 \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 new file mode 100644 index 000000000..d49f49ac9 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + 
"schemaId" : 0, + "baseManifestList" : "manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-531b8b9e-f8b9-49a1-b2b1-7c131df48686-1", + "deltaManifestListSize" : 1113, + "commitUser" : "1d8c13bf-d126-47d8-a0dc-35cf4e1ffb42", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777884797725, + "totalRecordCount" : 20, + "deltaRecordCount" : 20, + "nextRowId" : 20 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 new file mode 100644 index 000000000..13881bf3f --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 2, + "schemaId" : 0, + "baseManifestList" : "manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-0", + "baseManifestListSize" : 1113, + "deltaManifestList" : "manifest-list-bde5901d-912f-40ca-a9c5-fd9d679b4924-1", + "deltaManifestListSize" : 1119, + "commitUser" : "1d8c13bf-d126-47d8-a0dc-35cf4e1ffb42", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777884797831, + "totalRecordCount" : 40, + "deltaRecordCount" : 20, + "nextRowId" : 40 +} \ No newline at end of file diff --git a/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 new file mode 100644 index 000000000..af2c32912 --- /dev/null +++ b/test/test_data/orc/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 @@ -0,0 +1,17 @@ +{ + "version" : 3, + "id" : 3, + "schemaId" : 0, + "baseManifestList" : 
"manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-0", + "baseManifestListSize" : 1151, + "deltaManifestList" : "manifest-list-81c12cf5-7bf5-45a1-8414-233e9370d2d5-1", + "deltaManifestListSize" : 1006, + "indexManifest" : "index-manifest-efa3dce8-fe86-499a-b436-182337edd173-0", + "commitUser" : "cc3110f6-7052-4e18-8b4d-99e6c133a02f", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777884798405, + "totalRecordCount" : 40, + "deltaRecordCount" : 0, + "nextRowId" : 40 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/README b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/README new file mode 100644 index 000000000..7ab7a6619 --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/README @@ -0,0 +1,27 @@ +pt:int +col_boolean:bool +col_int:int +col_date:int +col_double:double +col_timestamp:timestamp +col_timestamp_ltz:timestamp with local timezone +col_decimal:decimal +col_string:string + +pt:partition key +no bucket key +bucket count: -1 +global btree index: col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string + +Msgs: +snapshot-1 +Add:(0, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-2 +Add:(1, i%2, i * 3, 18000 + i, i * 2.2, 1700000000000L + i * 1000L, 1700000000000L + i * 1000L, Decimal(i * 123456L, 18, 6), "str_0000i") +i = [0, 19] +NoCompact + +snapshot-3: with global btree index for col_boolean/col_int/col_date/col_double/col_timestamp/col_timestamp_ltz/col_decimal/col_string \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03d348f1-3a2b-47b2-ab26-1d894a5bc150.index 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03d348f1-3a2b-47b2-ab26-1d894a5bc150.index new file mode 100644 index 000000000..129384f37 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03d348f1-3a2b-47b2-ab26-1d894a5bc150.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03da8b2e-7eb5-42c7-9741-6cfa57e3e305.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03da8b2e-7eb5-42c7-9741-6cfa57e3e305.index new file mode 100644 index 000000000..ca42db01f Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-03da8b2e-7eb5-42c7-9741-6cfa57e3e305.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0b4db5e0-03a9-4fa4-ba96-0288a0eb0b19.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0b4db5e0-03a9-4fa4-ba96-0288a0eb0b19.index new file mode 100644 index 000000000..5887cd946 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-0b4db5e0-03a9-4fa4-ba96-0288a0eb0b19.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-10d1c370-e5ad-4530-9935-84e018b7cf8f.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-10d1c370-e5ad-4530-9935-84e018b7cf8f.index new file mode 100644 index 000000000..3e97caec2 Binary files /dev/null and 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-10d1c370-e5ad-4530-9935-84e018b7cf8f.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-226a2318-32a7-4569-88b1-096c4029e4de.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-226a2318-32a7-4569-88b1-096c4029e4de.index new file mode 100644 index 000000000..450f92ff4 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-226a2318-32a7-4569-88b1-096c4029e4de.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-336a671f-39fd-4922-b322-ec854712e573.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-336a671f-39fd-4922-b322-ec854712e573.index new file mode 100644 index 000000000..129384f37 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-336a671f-39fd-4922-b322-ec854712e573.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-34391e56-cb42-4e31-9002-8be9162b9d77.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-34391e56-cb42-4e31-9002-8be9162b9d77.index new file mode 100644 index 000000000..e87212693 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-34391e56-cb42-4e31-9002-8be9162b9d77.index differ diff --git 
a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-36d11008-5e75-4801-82da-f5e7c7bc62e0.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-36d11008-5e75-4801-82da-f5e7c7bc62e0.index new file mode 100644 index 000000000..88fe380f7 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-36d11008-5e75-4801-82da-f5e7c7bc62e0.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5279021b-ac4b-4418-9e56-5e8b3ede837b.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5279021b-ac4b-4418-9e56-5e8b3ede837b.index new file mode 100644 index 000000000..885833853 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5279021b-ac4b-4418-9e56-5e8b3ede837b.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5d9fd6d0-68e9-46c6-b064-04434bc36ad0.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5d9fd6d0-68e9-46c6-b064-04434bc36ad0.index new file mode 100644 index 000000000..c28678531 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-5d9fd6d0-68e9-46c6-b064-04434bc36ad0.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-66e10354-4845-43e2-9838-3ceb058fe2c7.index 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-66e10354-4845-43e2-9838-3ceb058fe2c7.index new file mode 100644 index 000000000..b4c3ecbb0 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-66e10354-4845-43e2-9838-3ceb058fe2c7.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6e579175-580a-4b14-ad1c-c3972407503c.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6e579175-580a-4b14-ad1c-c3972407503c.index new file mode 100644 index 000000000..31b38f523 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6e579175-580a-4b14-ad1c-c3972407503c.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6fc58c7a-d306-4da1-a33e-22dfee9390e9.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6fc58c7a-d306-4da1-a33e-22dfee9390e9.index new file mode 100644 index 000000000..a9cc153e6 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-6fc58c7a-d306-4da1-a33e-22dfee9390e9.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77874487-0b24-461b-93ef-46f04280466a.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77874487-0b24-461b-93ef-46f04280466a.index new file mode 100644 index 000000000..ada7273d6 Binary files /dev/null and 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-77874487-0b24-461b-93ef-46f04280466a.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a81ec36-7dad-4a49-a5c8-62fdc3eb9064.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a81ec36-7dad-4a49-a5c8-62fdc3eb9064.index new file mode 100644 index 000000000..ad0d0df72 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-7a81ec36-7dad-4a49-a5c8-62fdc3eb9064.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-86da803a-8554-4201-934c-0a8edf06bb94.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-86da803a-8554-4201-934c-0a8edf06bb94.index new file mode 100644 index 000000000..f51554f65 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-86da803a-8554-4201-934c-0a8edf06bb94.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8be14989-0b16-44d6-aedd-0556af02c4b5.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8be14989-0b16-44d6-aedd-0556af02c4b5.index new file mode 100644 index 000000000..4eb313ad7 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-8be14989-0b16-44d6-aedd-0556af02c4b5.index differ diff --git 
a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-93f35d27-0323-42d2-9a03-651f4d75b4a6.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-93f35d27-0323-42d2-9a03-651f4d75b4a6.index new file mode 100644 index 000000000..242c6e094 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-93f35d27-0323-42d2-9a03-651f4d75b4a6.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ac97eb16-ec2e-4a32-8366-939956cc32df.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ac97eb16-ec2e-4a32-8366-939956cc32df.index new file mode 100644 index 000000000..e1f90eab9 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ac97eb16-ec2e-4a32-8366-939956cc32df.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c02c2b4e-e9b1-437e-82b2-9354c4f2ecfb.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c02c2b4e-e9b1-437e-82b2-9354c4f2ecfb.index new file mode 100644 index 000000000..e8ff3b77c Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c02c2b4e-e9b1-437e-82b2-9354c4f2ecfb.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c54baf11-8caf-47b4-9f04-e6e70aa64211.index 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c54baf11-8caf-47b4-9f04-e6e70aa64211.index new file mode 100644 index 000000000..92fb4eff3 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c54baf11-8caf-47b4-9f04-e6e70aa64211.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c8749a02-24af-4188-8e28-a080b56e79f0.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c8749a02-24af-4188-8e28-a080b56e79f0.index new file mode 100644 index 000000000..d38a31e03 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-c8749a02-24af-4188-8e28-a080b56e79f0.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-cc3a2bf4-5a19-46d1-8634-e7e1256026e8.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-cc3a2bf4-5a19-46d1-8634-e7e1256026e8.index new file mode 100644 index 000000000..d38a31e03 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-cc3a2bf4-5a19-46d1-8634-e7e1256026e8.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d02fec1b-f5eb-4890-a7eb-740cbc385f84.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d02fec1b-f5eb-4890-a7eb-740cbc385f84.index new file mode 100644 index 000000000..3b2c1ea22 Binary files /dev/null and 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d02fec1b-f5eb-4890-a7eb-740cbc385f84.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d535d249-542f-4bb1-9a69-dc957176497b.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d535d249-542f-4bb1-9a69-dc957176497b.index new file mode 100644 index 000000000..3bc7e15b4 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-d535d249-542f-4bb1-9a69-dc957176497b.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dc4937c8-cdc6-4447-8cc2-40ebebf338d7.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dc4937c8-cdc6-4447-8cc2-40ebebf338d7.index new file mode 100644 index 000000000..e587bf07d Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dc4937c8-cdc6-4447-8cc2-40ebebf338d7.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dd37e0dd-c106-48bd-8ba0-4315b38b2eed.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dd37e0dd-c106-48bd-8ba0-4315b38b2eed.index new file mode 100644 index 000000000..21c0c6bb5 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-dd37e0dd-c106-48bd-8ba0-4315b38b2eed.index differ diff --git 
a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e2c03e02-aa1c-46d0-b779-4c9d203c7d73.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e2c03e02-aa1c-46d0-b779-4c9d203c7d73.index new file mode 100644 index 000000000..a51d3c3dc Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-e2c03e02-aa1c-46d0-b779-4c9d203c7d73.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f91d5405-4116-4a89-acc6-4e4c028410e5.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f91d5405-4116-4a89-acc6-4e4c028410e5.index new file mode 100644 index 000000000..9f8be0e9a Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-f91d5405-4116-4a89-acc6-4e4c028410e5.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fcfd9ca5-fa06-422e-89a6-f33ea60630eb.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fcfd9ca5-fa06-422e-89a6-f33ea60630eb.index new file mode 100644 index 000000000..92fb4eff3 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fcfd9ca5-fa06-422e-89a6-f33ea60630eb.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fead2db1-5481-4792-b20c-d978dfc00de5.index 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fead2db1-5481-4792-b20c-d978dfc00de5.index new file mode 100644 index 000000000..d50b07abc Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-fead2db1-5481-4792-b20c-d978dfc00de5.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ff65668f-e4d2-4e69-ab5a-79a777f51650.index b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ff65668f-e4d2-4e69-ab5a-79a777f51650.index new file mode 100644 index 000000000..e1f90eab9 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/index/btree-global-index-ff65668f-e4d2-4e69-ab5a-79a777f51650.index differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0 new file mode 100644 index 000000000..5fecfb9cc Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-b2c71b39-9e84-4784-8441-a367e6aebaeb-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-b2c71b39-9e84-4784-8441-a367e6aebaeb-0 new file mode 100644 index 000000000..e7e4412f1 Binary files /dev/null and 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-b2c71b39-9e84-4784-8441-a367e6aebaeb-0 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-ce023896-9468-49d9-8c55-e26271ff81da-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-ce023896-9468-49d9-8c55-e26271ff81da-0 new file mode 100644 index 000000000..f5d02b99e Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-ce023896-9468-49d9-8c55-e26271ff81da-0 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0 new file mode 100644 index 000000000..7eeaeac9c Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1 new file mode 100644 index 000000000..8b6b981cb Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0 new file mode 100644 index 000000000..dde84a775 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1 new file mode 100644 index 000000000..d249e0840 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-0 new file mode 100644 index 000000000..af8c3a746 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-0 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-1 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-1 new file mode 100644 index 000000000..9d4217abb Binary files /dev/null and 
b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/manifest/manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-1 differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-b6d91b0d-4fc5-410a-9157-2dd1cf34a15e-0.parquet b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-b6d91b0d-4fc5-410a-9157-2dd1cf34a15e-0.parquet new file mode 100644 index 000000000..2d9ad405e Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=0/bucket-0/data-b6d91b0d-4fc5-410a-9157-2dd1cf34a15e-0.parquet differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-6ab3cbef-417f-47b0-83ee-4145bc3dedb7-0.parquet b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-6ab3cbef-417f-47b0-83ee-4145bc3dedb7-0.parquet new file mode 100644 index 000000000..003f1d6f5 Binary files /dev/null and b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/pt=1/bucket-0/data-6ab3cbef-417f-47b0-83ee-4145bc3dedb7-0.parquet differ diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 new file mode 100644 index 000000000..1cc1c81de --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/schema/schema-0 @@ -0,0 +1,50 @@ +{ + "version" : 3, + "id" : 0, + "fields" : [ { + "id" : 0, + "name" : "pt", + "type" : "INT" + }, { + "id" : 1, + "name" : "col_boolean", + "type" : "BOOLEAN" + }, { + "id" : 2, + "name" : "col_int", + "type" : "INT" + }, { + "id" : 3, + "name" : "col_date", + 
"type" : "DATE" + }, { + "id" : 4, + "name" : "col_double", + "type" : "DOUBLE" + }, { + "id" : 5, + "name" : "col_timestamp", + "type" : "TIMESTAMP(3)" + }, { + "id" : 6, + "name" : "col_timestamp_ltz", + "type" : "TIMESTAMP(3) WITH LOCAL TIME ZONE" + }, { + "id" : 7, + "name" : "col_decimal", + "type" : "DECIMAL(18, 6)" + }, { + "id" : 8, + "name" : "col_string", + "type" : "STRING" + } ], + "highestFieldId" : 8, + "partitionKeys" : [ "pt" ], + "primaryKeys" : [ ], + "options" : { + "btree-index.records-per-range" : "10", + "data-evolution.enabled" : "true", + "row-tracking.enabled" : "true" + }, + "timeMillis" : 1777882015920 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST new file mode 100644 index 000000000..56a6051ca --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/EARLIEST @@ -0,0 +1 @@ +1 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST new file mode 100644 index 000000000..e440e5c84 --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/LATEST @@ -0,0 +1 @@ +3 \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 new file mode 100644 index 000000000..18b77828f --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-1 @@ 
-0,0 +1,16 @@ +{ + "version" : 3, + "id" : 1, + "schemaId" : 0, + "baseManifestList" : "manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-0", + "baseManifestListSize" : 1006, + "deltaManifestList" : "manifest-list-349b7730-d60f-4562-b106-5ef4f6a99a41-1", + "deltaManifestListSize" : 1113, + "commitUser" : "b8ba31b4-9a2c-49ac-82c2-4255ed3ef903", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777882017549, + "totalRecordCount" : 20, + "deltaRecordCount" : 20, + "nextRowId" : 20 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 new file mode 100644 index 000000000..557ade85d --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-2 @@ -0,0 +1,16 @@ +{ + "version" : 3, + "id" : 2, + "schemaId" : 0, + "baseManifestList" : "manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-0", + "baseManifestListSize" : 1113, + "deltaManifestList" : "manifest-list-28b8fffb-abfc-4f8c-b366-99f534b294eb-1", + "deltaManifestListSize" : 1119, + "commitUser" : "b8ba31b4-9a2c-49ac-82c2-4255ed3ef903", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777882017651, + "totalRecordCount" : 40, + "deltaRecordCount" : 20, + "nextRowId" : 40 +} \ No newline at end of file diff --git a/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 new file mode 100644 index 000000000..e597168e6 --- /dev/null +++ b/test/test_data/parquet/append_with_btree_with_partition.db/append_with_btree_with_partition/snapshot/snapshot-3 @@ -0,0 +1,17 @@ +{ + "version" : 3, + "id" : 3, + 
"schemaId" : 0, + "baseManifestList" : "manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-0", + "baseManifestListSize" : 1153, + "deltaManifestList" : "manifest-list-8b3083fa-2817-477f-beb5-f59f058015b1-1", + "deltaManifestListSize" : 1006, + "indexManifest" : "index-manifest-7b331606-88e1-4bdd-8426-8c6b88103b42-0", + "commitUser" : "89b4b9f1-f0e5-4b74-b853-324cd44ac62a", + "commitIdentifier" : 9223372036854775807, + "commitKind" : "APPEND", + "timeMillis" : 1777882018236, + "totalRecordCount" : 40, + "deltaRecordCount" : 0, + "nextRowId" : 40 +} \ No newline at end of file