Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions include/paimon/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,9 @@ struct PAIMON_EXPORT Options {
static const char BLOB_AS_DESCRIPTOR[];
/// "global-index.enabled" - Whether to enable global index for scan. Default value is "true".
static const char GLOBAL_INDEX_ENABLED[];
/// "global-index.thread-num" - The maximum number of concurrent scanner for global index. No
/// default value. By default is the number of processors available to the machine.
static const char GLOBAL_INDEX_THREAD_NUM[];
/// "global-index.external-path" - Global index root directory, if not set, the global index
/// files will be stored under the index directory.
static const char GLOBAL_INDEX_EXTERNAL_PATH[];
Expand Down
6 changes: 1 addition & 5 deletions include/paimon/global_index/global_index_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,7 @@ namespace paimon {
///
/// Derived classes are expected to implement the visitor methods (e.g., `VisitEqual`,
/// `VisitIsNull`, etc.) to return index-based results that indicate which
/// row satisfy the given predicate.
///
/// @note All `GlobalIndexResult` objects returned by implementations of this class use **local row
/// ids** that start from 0 — not global row ids in the entire table.
/// The `GlobalIndexResult` can be converted to global row ids by calling `AddOffset()`.
/// rows satisfy the given predicate.
class PAIMON_EXPORT GlobalIndexReader : public FunctionVisitor<std::shared_ptr<GlobalIndexResult>> {
Comment thread
lszskye marked this conversation as resolved.
public:
/// VisitVectorSearch performs approximate vector similarity search.
Expand Down
10 changes: 5 additions & 5 deletions include/paimon/global_index/global_index_result.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,20 +27,20 @@
#include "paimon/visibility.h"

namespace paimon {
/// Global index result to get selected global row ids.
/// Global index result that holds the row ids.
class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<GlobalIndexResult> {
public:
virtual ~GlobalIndexResult() = default;

/// Iterator interface for traversing selected global row ids.
/// Iterator interface for traversing selected row ids.
class Iterator {
public:
virtual ~Iterator() = default;

/// Checks whether more row ids are available.
virtual bool HasNext() const = 0;

/// @return The next global row id and advances the iterator.
/// @return The next row id and advances the iterator.
virtual int64_t Next() = 0;
};

Expand All @@ -53,7 +53,7 @@ class PAIMON_EXPORT GlobalIndexResult : public std::enable_shared_from_this<Glob
/// (e.g., during lazy loading of index data).
virtual Result<bool> IsEmpty() const = 0;

/// Creates a new iterator over the selected global row ids.
/// Creates a new iterator over the selected row ids.
virtual Result<std::unique_ptr<Iterator>> CreateIterator() const = 0;

/// Returns non-overlapping, sorted ranges covering all row ids in `GlobalIndexResult`.
Expand Down Expand Up @@ -125,7 +125,7 @@ class PAIMON_EXPORT ScoredGlobalIndexResult : public GlobalIndexResult {
/// Retrieves the next (row_id, score) pair and advances the iterator.
///
/// @return A pair where:
/// - first: the global row id (returned in ascending order),
/// - first: the row id (returned in ascending order).
/// - second: the associated score computed by the index.
///
/// @note The sequence is ordered by **row_id**, not by score.
Expand Down
88 changes: 55 additions & 33 deletions include/paimon/global_index/global_index_scan.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,70 +23,92 @@
#include <string>
#include <vector>

#include "paimon/global_index/row_range_global_index_scanner.h"
#include "paimon/global_index/global_index_reader.h"
#include "paimon/global_index/global_index_result.h"
#include "paimon/predicate/predicate.h"
#include "paimon/utils/range.h"
#include "paimon/utils/row_range_index.h"
#include "paimon/visibility.h"

namespace paimon {
class MemoryPool;
class FileSystem;

/// Represents a logical scan over a global index for a table.
class PAIMON_EXPORT GlobalIndexScan {
public:
/// Creates a `GlobalIndexScan` instance for the specified table and context.
///
/// @param table_path Root directory of the table.
/// @param snapshot_id Optional snapshot id to read from; if not provided, uses the latest.
/// @param partitions Optional list of specific partitions to restrict the scan scope.
/// Each map represents one partition (e.g., {"dt": "2024-06-01"}).
/// If omitted, scans all partitions.
/// @param options Index-specific configuration.
/// If omitted (`std::nullopt`), scans all partitions of the table.
/// @param options User defined configuration.
/// @param file_system File system for accessing index files.
/// If not provided (nullptr), it is inferred from the `FILE_SYSTEM`
/// key in the `options` parameter.
/// @param executor The executor to be used for asynchronous operations during global
/// index scan.
/// @param pool Memory pool for temporary allocations; if nullptr, uses default.
/// @return A `Result` containing a unique pointer to the created scanner,
/// or an error if initialization fails (e.g., I/O error).
/// or an error if initialization fails (e.g., I/O error, invalid snapshot id,
/// unknown partition).
static Result<std::unique_ptr<GlobalIndexScan>> Create(
const std::string& table_path, const std::optional<int64_t>& snapshot_id,
const std::optional<std::vector<std::map<std::string, std::string>>>& partitions,
const std::map<std::string, std::string>& options,
const std::shared_ptr<FileSystem>& file_system, const std::shared_ptr<MemoryPool>& pool);
const std::shared_ptr<FileSystem>& file_system, const std::shared_ptr<Executor>& executor,
const std::shared_ptr<MemoryPool>& pool);

/// Creates a `GlobalIndexScan` instance for the specified table and context.
///
/// @param partition_filters Optional specific partition predicates.
/// Creates a `GlobalIndexScan` instance for the specified table and context, with a
/// predicate-based partition filter.
/// @param root_path Root directory of the table.
/// @param snapshot_id Optional snapshot id to read from; if not provided, uses the
/// latest snapshot.
/// @param partition_filters Optional partition-level predicate used for partition pruning.
/// If nullptr, all partitions are scanned.
/// @param options User defined configuration.
/// @param file_system File system for accessing index files. If nullptr, it is
/// inferred from the `FILE_SYSTEM` key in `options`.
/// @param executor The executor to be used for asynchronous operations during global
/// index scan.
/// @param pool Memory pool for temporary allocations; if nullptr, uses default.
/// @return A `Result` containing a unique pointer to the created scanner,
/// or an error if initialization fails.
static Result<std::unique_ptr<GlobalIndexScan>> Create(
const std::string& root_path, const std::optional<int64_t>& snapshot_id,
const std::shared_ptr<Predicate>& partition_filters,
const std::map<std::string, std::string>& options,
const std::shared_ptr<FileSystem>& file_system,
const std::shared_ptr<MemoryPool>& memory_pool);
const std::shared_ptr<FileSystem>& file_system, const std::shared_ptr<Executor>& executor,
const std::shared_ptr<MemoryPool>& pool);

virtual ~GlobalIndexScan() = default;

/// Creates a scanner for the global index over the specified row id range.
///
/// This method instantiates a low-level scanner that can evaluate predicates and
/// retrieve matching row ids from the global index data corresponding to the given
/// row id range.
///
/// @param range The inclusive row id range [start, end] for which to create the scanner.
/// The range must be fully covered by existing global index data (from
/// `GetRowRangeList()`).
/// @return A `Result` containing a range-level scanner, or an error if parse index meta fails.
virtual Result<std::shared_ptr<RowRangeGlobalIndexScanner>> CreateRangeScan(
const Range& range) = 0;
/// Creates several `GlobalIndexReader`s for a specific field.
/// @param field_name Name of the indexed column.
/// @param row_range_index Optional row range that limits the scan to a sub-range of row ids.
/// If not provided, the entire row range is considered.
/// @return A `Result` that is:
/// - Successful with several readers(with global row id) if the indexes exist and load
/// correctly;
/// - Successful with an empty vector if no index was built for the given field;
/// - Error returns when loading fails (e.g., file corruption, I/O error,
/// unsupported format).
virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>> CreateReaders(
const std::string& field_name,
const std::optional<RowRangeIndex>& row_range_index) const = 0;

/// Returns row id ranges covered by this global index (sorted and non-overlapping
/// ranges).
///
/// Each `Range` represents a contiguous segment of row ids for which global index
/// data exists. This allows the query engine to parallelize scanning and be aware
/// of ranges that are not covered by any global index.
///
/// @return A `Result` containing sorted and non-overlapping `Range` objects.
virtual Result<std::vector<Range>> GetRowRangeList() = 0;
/// Creates several `GlobalIndexReader`s for a specific field (looked up by id),
/// @param field_id Field id of the indexed column.
/// @param row_range_index Optional row range that limits the scan to a sub-range of row ids.
/// If not provided, the entire row range is considered.
/// @return A `Result` that is:
/// - Successful with several readers(with global row id) if the indexes exist and load
/// correctly;
/// - Successful with an empty vector if no index was built for the given field;
/// - Error returns when loading fails (e.g., file corruption, I/O error,
/// unsupported format).
virtual Result<std::vector<std::shared_ptr<GlobalIndexReader>>> CreateReaders(
int32_t field_id, const std::optional<RowRangeIndex>& row_range_index) const = 0;
};
Comment thread
lszskye marked this conversation as resolved.

} // namespace paimon
60 changes: 0 additions & 60 deletions include/paimon/global_index/row_range_global_index_scanner.h

This file was deleted.

11 changes: 1 addition & 10 deletions include/paimon/scan_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@

#include "paimon/global_index/global_index_result.h"
#include "paimon/predicate/predicate.h"
#include "paimon/predicate/vector_search.h"
#include "paimon/result.h"
#include "paimon/type_fwd.h"
#include "paimon/visibility.h"
Expand Down Expand Up @@ -104,19 +103,14 @@ class PAIMON_EXPORT ScanFilter {
public:
ScanFilter(const std::shared_ptr<Predicate>& predicate,
const std::vector<std::map<std::string, std::string>>& partition_filters,
const std::optional<int32_t>& bucket_filter,
const std::shared_ptr<VectorSearch>& vector_search)
const std::optional<int32_t>& bucket_filter)
: predicates_(predicate),
vector_search_(vector_search),
bucket_filter_(bucket_filter),
partition_filters_(partition_filters) {}

std::shared_ptr<Predicate> GetPredicate() const {
return predicates_;
}
std::shared_ptr<VectorSearch> GetVectorSearch() const {
return vector_search_;
}
std::optional<int32_t> GetBucketFilter() const {
return bucket_filter_;
}
Expand All @@ -126,7 +120,6 @@ class PAIMON_EXPORT ScanFilter {

private:
std::shared_ptr<Predicate> predicates_;
std::shared_ptr<VectorSearch> vector_search_;
std::optional<int32_t> bucket_filter_;
std::vector<std::map<std::string, std::string>> partition_filters_;
};
Expand Down Expand Up @@ -155,8 +148,6 @@ class PAIMON_EXPORT ScanContextBuilder {
ScanContextBuilder& SetGlobalIndexResult(
const std::shared_ptr<GlobalIndexResult>& global_index_result);

/// Set vector search for similarity search.
ScanContextBuilder& SetVectorSearch(const std::shared_ptr<VectorSearch>& vector_search);
/// The options added or set in `ScanContextBuilder` have high priority and will be merged with
/// the options in table schema.
ScanContextBuilder& AddOption(const std::string& key, const std::string& value);
Expand Down
5 changes: 4 additions & 1 deletion src/paimon/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ set(PAIMON_COMMON_SRCS
common/fs/resolving_file_system.cpp
common/fs/file_system_factory.cpp
common/global_config.cpp
common/global_index/union_global_index_reader.cpp
common/global_index/offset_global_index_reader.cpp
common/global_index/complete_index_score_batch_reader.cpp
common/global_index/bitmap_scored_global_index_result.cpp
common/global_index/bitmap_global_index_result.cpp
Expand Down Expand Up @@ -195,7 +197,6 @@ set(PAIMON_CORE_SRCS
core/global_index/global_index_evaluator_impl.cpp
core/global_index/global_index_scan.cpp
core/global_index/global_index_scan_impl.cpp
core/global_index/row_range_global_index_scanner_impl.cpp
core/global_index/global_index_write_task.cpp
core/index/index_file_handler.cpp
core/index/global_index_meta.cpp
Expand Down Expand Up @@ -413,6 +414,8 @@ if(PAIMON_BUILD_TESTS)
common/global_index/complete_index_score_batch_reader_test.cpp
common/global_index/global_index_result_test.cpp
common/global_index/global_index_utils_test.cpp
common/global_index/offset_global_index_reader_test.cpp
common/global_index/union_global_index_reader_test.cpp
common/global_index/global_indexer_factory_test.cpp
common/global_index/bitmap_global_index_result_test.cpp
common/global_index/bitmap_scored_global_index_result_test.cpp
Expand Down
1 change: 1 addition & 0 deletions src/paimon/common/defs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ const char Options::DATA_EVOLUTION_ENABLED[] = "data-evolution.enabled";
const char Options::PARTITION_GENERATE_LEGACY_NAME[] = "partition.legacy-name";
const char Options::BLOB_AS_DESCRIPTOR[] = "blob-as-descriptor";
const char Options::GLOBAL_INDEX_ENABLED[] = "global-index.enabled";
const char Options::GLOBAL_INDEX_THREAD_NUM[] = "global-index.thread-num";
const char Options::GLOBAL_INDEX_EXTERNAL_PATH[] = "global-index.external-path";
const char Options::AGGREGATION_REMOVE_RECORD_ON_DELETE[] = "aggregation.remove-record-on-delete";
const char Options::SCAN_TIMESTAMP_MILLIS[] = "scan.timestamp-millis";
Expand Down
4 changes: 3 additions & 1 deletion src/paimon/common/global_index/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ set(PAIMON_GLOBAL_INDEX_SRC
btree/lazy_filtered_btree_reader.cpp
btree/key_serializer.cpp
rangebitmap/range_bitmap_global_index.cpp
rangebitmap/range_bitmap_global_index_factory.cpp)
rangebitmap/range_bitmap_global_index_factory.cpp
offset_global_index_reader.cpp
union_global_index_reader.cpp)
Comment thread
lszskye marked this conversation as resolved.

add_paimon_lib(paimon_global_index
SOURCES
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ TEST_F(BTreeFileMetaSelectorTest, TestVisitIn) {
// 1 in [1,10]=file1, [1,5]=file4
// 2 in [1,10]=file1, [1,5]=file4
// 3 in [1,10]=file1, [1,5]=file4
// 26 in [21,30]=file3, [19,25]=file5
// 26 in [21,30]=file3
// 27 in [21,30]=file3
// 28 in [21,30]=file3
ASSERT_OK_AND_ASSIGN(auto result, selector.VisitIn({Literal(1), Literal(2), Literal(3),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,10 @@ Status BTreeGlobalIndexWriter::Flush() {
return Status::OK();
}
MemorySliceOutput output(current_row_ids_.size() * 9 + 5, pool_.get());
PAIMON_RETURN_NOT_OK(output.WriteVarLenInt(current_row_ids_.size()));
if (current_row_ids_.size() > INT32_MAX) {
return Status::Invalid("invalid row id numbers, exceed INT32_MAX");
}
PAIMON_RETURN_NOT_OK(output.WriteVarLenInt(static_cast<int32_t>(current_row_ids_.size())));
for (int64_t row_id : current_row_ids_) {
PAIMON_RETURN_NOT_OK(output.WriteVarLenLong(row_id));
}
Expand Down
Loading
Loading