From 69bc2d880044b169e1527c63a95432ae0c9112d8 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 3 May 2026 13:12:35 +0800 Subject: [PATCH 01/18] [fix](inverted index) resolve variant sub-column indexes for score query --- be/src/exec/common/variant_util.cpp | 62 +++-- be/src/exec/common/variant_util.h | 7 + be/src/storage/predicate_collector.cpp | 86 ++++++- be/src/storage/predicate_collector.h | 1 + .../compaction/collection_statistics_test.cpp | 240 +++++++++++++++++- be/test/testutil/mock/mock_descriptors.h | 32 ++- 6 files changed, 400 insertions(+), 28 deletions(-) diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index 4faa43d131ea6a..a5cfc21c0e8cbd 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -1461,6 +1461,38 @@ void get_field_info(const Field& field, FieldInfo* info) { } } +namespace { + +const TabletColumn* find_matching_sub_column_template(const TabletColumn& parent_column, + const std::string& relative_path) { + for (const auto& sub_column : parent_column.get_sub_columns()) { + const std::string& pattern = sub_column->name(); + switch (sub_column->pattern_type()) { + case PatternTypePB::MATCH_NAME: + if (pattern == relative_path) { + return sub_column.get(); + } + break; + case PatternTypePB::MATCH_NAME_GLOB: + if (glob_match_re2(pattern, relative_path)) { + return sub_column.get(); + } + break; + default: + break; + } + } + return nullptr; +} + +} // namespace + +std::string find_matching_sub_column_pattern(const TabletColumn& parent_column, + const std::string& relative_path) { + const auto* sub_column = find_matching_sub_column_template(parent_column, relative_path); + return sub_column == nullptr ? "" : sub_column->name(); +} + bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, TabletSchema::SubColumnInfo* sub_column_info) { @@ -1508,31 +1540,13 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, } }; - const auto& sub_columns = parent_column.get_sub_columns(); - for (const auto& sub_column : sub_columns) { - const char* pattern = sub_column->name().c_str(); - switch (sub_column->pattern_type()) { - case PatternTypePB::MATCH_NAME: { - if (strcmp(pattern, path.c_str()) == 0) { - generate_result_column(*sub_column, &sub_column_info->column); - generate_index(sub_column->name()); - return true; - } - break; - } - case PatternTypePB::MATCH_NAME_GLOB: { - if (glob_match_re2(pattern, path)) { - generate_result_column(*sub_column, &sub_column_info->column); - generate_index(sub_column->name()); - return true; - } - break; - } - default: - break; - } + const TabletColumn* sub_column = find_matching_sub_column_template(parent_column, path); + if (sub_column == nullptr) { + return false; } - return false; + generate_result_column(*sub_column, &sub_column_info->column); + generate_index(sub_column->name()); + return true; } TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema( diff --git a/be/src/exec/common/variant_util.h b/be/src/exec/common/variant_util.h index f4302146972e2c..54e5f6d2677ed4 100644 --- a/be/src/exec/common/variant_util.h +++ b/be/src/exec/common/variant_util.h @@ -185,6 +185,13 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, TabletSchema::SubColumnInfo* sub_column_info); +// Find the matching sub-column pattern (MATCH_NAME / MATCH_NAME_GLOB) for the +// given relative path among the parent variant column's templated sub-columns. +// Returns the matched pattern string (which is also the field_pattern key used +// by inverted_index_by_field_pattern), or empty if no template matches. +std::string find_matching_sub_column_pattern(const TabletColumn& parent_column, + const std::string& relative_path); + class VariantCompactionUtil { public: // get the subpaths and sparse paths for the variant column diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index 8e319ae329fd0e..db5594cbb17c27 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -19,6 +19,9 @@ #include +#include + +#include "exec/common/variant_util.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" #include "exprs/vliteral.h" @@ -34,6 +37,36 @@ namespace doris { using namespace segment_v2; +namespace { + +std::string join_path_parts(const std::vector& parts) { + std::string path; + for (const auto& part : parts) { + if (part.empty()) { + continue; + } + if (!path.empty()) { + path += "."; + } + path += part; + } + return path; +} + +std::string build_variant_full_path(const TabletColumn& parent_column, + const std::vector& column_paths) { + const auto relative_path = join_path_parts(column_paths); + if (relative_path.empty()) { + return ""; + } + if (parent_column.name_lower_case().empty()) { + return relative_path; + } + return parent_column.name_lower_case() + "." + relative_path; +} + +} // namespace + VSlotRef* PredicateCollector::find_slot_ref(const VExprSPtr& expr) const { if (!expr) { return nullptr; @@ -91,7 +124,50 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS } const auto& column = tablet_schema->column(col_idx); - auto index_metas = tablet_schema->inverted_indexs(sd->col_unique_id(), column.suffix_path()); + // Use the column-aware overload so variant sub-column indexes from + // _path_set_info_map (typed paths / inherited sub-column indexes) are + // resolved correctly. The simple (col_unique_id, suffix_path) lookup misses + // those. + auto index_metas = tablet_schema->inverted_indexs(column); + std::vector> owned_index_metas; + std::string index_suffix_path = column.suffix_path(); + + if (index_metas.empty() && !sd->column_paths().empty()) { + // Variant sub-column with a field_pattern-based index. The index is + // registered under the pattern string (for example "user.*"), not the + // resolved relative path (for example "user.name"). Resolve the parent + // variant column and run the same MATCH_NAME / MATCH_NAME_GLOB matching + // used by variant_util::generate_sub_column_info, then look up by the + // matched pattern. + int32_t parent_unique_id = sd->col_unique_id(); + const TabletColumn* parent_column = nullptr; + if (auto direct_idx = tablet_schema->field_index(parent_unique_id); direct_idx != -1) { + parent_column = &tablet_schema->column(direct_idx); + } else if (column.is_extracted_column()) { + parent_unique_id = column.parent_unique_id(); + if (auto inherited_idx = tablet_schema->field_index(parent_unique_id); + inherited_idx != -1) { + parent_column = &tablet_schema->column(inherited_idx); + } + } + + if (parent_column != nullptr) { + const auto relative_path = join_path_parts(sd->column_paths()); + const auto matched_pattern = + variant_util::find_matching_sub_column_pattern(*parent_column, relative_path); + if (!matched_pattern.empty()) { + index_suffix_path = build_variant_full_path(*parent_column, sd->column_paths()); + const auto field_pattern_indexes = tablet_schema->inverted_index_by_field_pattern( + parent_unique_id, matched_pattern); + for (const auto& index : field_pattern_indexes) { + auto index_ptr = std::make_shared(*index); + index_ptr->set_escaped_escaped_index_suffix_path(index_suffix_path); + index_metas.push_back(index_ptr.get()); + owned_index_metas.push_back(std::move(index_ptr)); + } + } + } + } #ifndef BE_TEST if (index_metas.empty()) { @@ -117,7 +193,7 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS index_meta->properties()); std::string field_name = - build_field_name(index_meta->col_unique_ids()[0], column.suffix_path()); + build_field_name(index_meta->col_unique_ids()[0], index_suffix_path); std::wstring ws_field_name = StringHelper::to_wstring(field_name); auto iter = collect_infos->find(ws_field_name); @@ -125,6 +201,12 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS CollectInfo collect_info; collect_info.term_infos.insert(term_infos.begin(), term_infos.end()); collect_info.index_meta = index_meta; + for (const auto& owned_index_meta : owned_index_metas) { + if (owned_index_meta.get() == index_meta) { + collect_info.owned_index_meta = owned_index_meta; + break; + } + } (*collect_infos)[ws_field_name] = std::move(collect_info); } else { iter->second.term_infos.insert(term_infos.begin(), term_infos.end()); diff --git a/be/src/storage/predicate_collector.h b/be/src/storage/predicate_collector.h index aa5a49344b98c2..c96e0af9c45ed5 100644 --- a/be/src/storage/predicate_collector.h +++ b/be/src/storage/predicate_collector.h @@ -43,6 +43,7 @@ struct TermInfoComparer { struct CollectInfo { std::set term_infos; + std::shared_ptr owned_index_meta; const TabletIndex* index_meta = nullptr; }; using CollectInfoMap = std::unordered_map; diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 92b1522f76738c..1513181cada085 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -25,6 +25,7 @@ #include #include "common/exception.h" +#include "exec/common/variant_util.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" #include "exprs/vliteral.h" @@ -43,7 +44,11 @@ namespace collection_statistics { class MockVExpr : public VExpr { public: - MockVExpr(TExprNodeType::type node_type) : _mock_node_type(node_type) {} + MockVExpr(TExprNodeType::type node_type) : _mock_node_type(node_type) { + if (node_type == TExprNodeType::MATCH_PRED) { + _opcode = TExprOpcode::MATCH_PHRASE; + } + } TExprNodeType::type node_type() const override { return _mock_node_type; } @@ -100,6 +105,7 @@ class MockVLiteral : public VLiteral { MockVLiteral(const std::string& value) : _value(value) {} std::string value() const override { return _value; } + std::string value(const DataTypeSerDe::FormatOptions& options) const override { return _value; } const std::string& expr_name() const override { return _value; } std::string debug_string() const override { return "MockVLiteral: " + _value; } @@ -268,6 +274,7 @@ class CollectionStatisticsTest : public ::testing::Test { index._col_unique_ids.push_back(1); std::map properties; properties["parser"] = "standard"; + properties["support_phrase"] = "true"; index._properties = properties; tablet_schema->append_index(std::move(index)); @@ -614,6 +621,237 @@ TEST_F(CollectionStatisticsTest, CollectWithDoubleCastWrappedSlotRef) { EXPECT_TRUE(status.ok()) << status.msg(); } +// Regression for AIR-36: match score collection must resolve indexes for +// variant sub-columns whose indexes live in _path_set_info_map (typed paths or +// inherited sub-column indexes). The previous simple lookup using +// inverted_indexs(col_unique_id, suffix_path) missed those indexes. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantSubcolumnIndex) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9001; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("v"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("v.host"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("v.host"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + auto sub_index = std::make_shared(); + TabletIndexPB index_pb; + index_pb.set_index_id(2001); + index_pb.set_index_name("variant_subcolumn_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "standard"; + (*props)["support_phrase"] = "true"; + sub_index->init_from_pb(index_pb); + + TabletSchema::PathsSetInfo path_set_info; + TabletIndexes sub_indexes = {sub_index}; + path_set_info.subcolumn_indexes["host"] = sub_indexes; + std::unordered_map path_set_info_map; + path_set_info_map[kVariantUid] = std::move(path_set_info); + tablet_schema->set_path_set_info(std::move(path_set_info_map)); + + EXPECT_TRUE(tablet_schema->inverted_indexs(kVariantUid, "host").empty()); + + auto found = tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)); + ASSERT_EQ(found.size(), 1u); + EXPECT_EQ(found[0]->index_name(), "variant_subcolumn_idx"); + + constexpr int kSlotId = 42; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = + std::make_shared("v.host", SlotId(kSlotId)); + auto literal = std::make_shared("foo"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kVariantUid) + ".v.host")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_subcolumn_idx"); +} + +namespace { + +// Build a sub-column template for the parent variant column. pattern_type has no +// public setter on TabletColumn, so construct through ColumnPB. +TabletColumn make_subcolumn_template(const std::string& pattern, PatternTypePB pattern_type) { + ColumnPB column_pb; + column_pb.set_unique_id(-1); + column_pb.set_name(pattern); + column_pb.set_type("STRING"); + column_pb.set_is_nullable(true); + column_pb.set_pattern_type(pattern_type); + + TabletColumn templ; + templ.init_from_pb(column_pb); + return templ; +} + +} // namespace + +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantFieldPatternIndex) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9002; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("meta"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + TabletColumn host_template = make_subcolumn_template("host", PatternTypePB::MATCH_NAME); + variant_col.add_sub_column(host_template); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("meta.host"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("meta.host"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2002); + index_pb.set_index_name("variant_field_pattern_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "standard"; + (*props)["support_phrase"] = "true"; + (*props)["field_pattern"] = "host"; + + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); + ASSERT_EQ(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "host").size(), 1u); + + constexpr int kSlotId = 43; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid, "meta.host", + {"host"}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = + std::make_shared("meta.host", SlotId(kSlotId)); + auto literal = std::make_shared("alpha"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find( + StringHelper::to_wstring(std::to_string(kVariantUid) + ".meta.host")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + ASSERT_NE(it->second.owned_index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_field_pattern_idx"); +} + +// Regression: field_pattern="user.*" is registered under the pattern string, +// while the query slot resolves to column_paths=["user", "name"]. The fallback +// must match the parent variant's sub-column template first, then use the +// matched pattern to fetch the index, and collect under the actual Lucene field. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantFieldPatternGlobIndex) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9003; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("meta"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + TabletColumn glob_template = make_subcolumn_template("user.*", PatternTypePB::MATCH_NAME_GLOB); + variant_col.add_sub_column(glob_template); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("meta.user.name"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("meta.user.name"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2003); + index_pb.set_index_name("variant_field_pattern_glob_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "standard"; + (*props)["support_phrase"] = "true"; + (*props)["field_pattern"] = "user.*"; + + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); + ASSERT_TRUE(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "user.name").empty()); + ASSERT_EQ(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "user.*").size(), 1u); + EXPECT_EQ(variant_util::find_matching_sub_column_pattern(tablet_schema->column(/*ordinal=*/0), + "user.name"), + "user.*"); + + constexpr int kSlotId = 44; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid, + "meta.user.name", {"user", "name"}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("meta.user.name", + SlotId(kSlotId)); + auto literal = std::make_shared("alice"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find( + StringHelper::to_wstring(std::to_string(kVariantUid) + ".meta.user.name")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + ASSERT_NE(it->second.owned_index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_field_pattern_glob_idx"); +} + TEST(TermInfoComparerTest, OrdersByTermAndDedups) { using doris::TermInfoComparer; using doris::segment_v2::TermInfo; diff --git a/be/test/testutil/mock/mock_descriptors.h b/be/test/testutil/mock/mock_descriptors.h index 4fec22bf7a11c0..cb8833cf8d8f1e 100644 --- a/be/test/testutil/mock/mock_descriptors.h +++ b/be/test/testutil/mock/mock_descriptors.h @@ -20,6 +20,8 @@ #include #include +#include +#include #include #include "core/data_type/data_type.h" @@ -106,13 +108,41 @@ class MockDescriptorTbl1 : public DescriptorTbl { _slot_descriptors[slot_id] = std::move(slot_desc); } + void add_slot_descriptor(SlotId slot_id, int32_t col_unique_id, const std::string& col_name, + const std::vector& column_paths) { + TTypeNode type_node; + type_node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::STRING); + type_node.__set_scalar_type(scalar_type); + TTypeDesc type_desc; + type_desc.types.push_back(type_node); + + TSlotDescriptor slot_desc; + slot_desc.__set_id(slot_id); + slot_desc.__set_parent(0); + slot_desc.__set_slotType(type_desc); + slot_desc.__set_columnPos(0); + slot_desc.__set_byteOffset(0); + slot_desc.__set_nullIndicatorByte(0); + slot_desc.__set_nullIndicatorBit(-1); + slot_desc.__set_colName(col_name); + slot_desc.__set_slotIdx(0); + slot_desc.__set_isMaterialized(true); + slot_desc.__set_col_unique_id(col_unique_id); + slot_desc.__set_is_key(false); + slot_desc.__set_column_paths(column_paths); + slot_desc.__set_primitive_type(TPrimitiveType::STRING); + _slot_descriptors[slot_id] = std::make_unique(slot_desc); + } + SlotDescriptor* get_slot_descriptor(SlotId id) const override { auto it = _slot_descriptors.find(id); return it != _slot_descriptors.end() ? it->second.get() : nullptr; } private: - mutable std::unordered_map> _slot_descriptors; + mutable std::unordered_map> _slot_descriptors; }; } // namespace doris \ No newline at end of file From 1cc5c93f71c3f067f7852242c43ad23a8df98f14 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 3 May 2026 17:56:44 +0800 Subject: [PATCH 02/18] [fix](inverted index) address variant score review comments --- be/src/exec/common/variant_util.cpp | 6 -- be/src/exec/common/variant_util.h | 7 -- be/src/storage/predicate_collector.cpp | 79 +++---------------- .../compaction/collection_statistics_test.cpp | 11 ++- .../inverted_index_p0/test_bm25_score.groovy | 49 +++++++++++- 5 files changed, 66 insertions(+), 86 deletions(-) diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index a5cfc21c0e8cbd..fbd1804e23e8e7 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -1487,12 +1487,6 @@ const TabletColumn* find_matching_sub_column_template(const TabletColumn& parent } // namespace -std::string find_matching_sub_column_pattern(const TabletColumn& parent_column, - const std::string& relative_path) { - const auto* sub_column = find_matching_sub_column_template(parent_column, relative_path); - return sub_column == nullptr ? "" : sub_column->name(); -} - bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, TabletSchema::SubColumnInfo* sub_column_info) { diff --git a/be/src/exec/common/variant_util.h b/be/src/exec/common/variant_util.h index 54e5f6d2677ed4..f4302146972e2c 100644 --- a/be/src/exec/common/variant_util.h +++ b/be/src/exec/common/variant_util.h @@ -185,13 +185,6 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, TabletSchema::SubColumnInfo* sub_column_info); -// Find the matching sub-column pattern (MATCH_NAME / MATCH_NAME_GLOB) for the -// given relative path among the parent variant column's templated sub-columns. -// Returns the matched pattern string (which is also the field_pattern key used -// by inverted_index_by_field_pattern), or empty if no template matches. -std::string find_matching_sub_column_pattern(const TabletColumn& parent_column, - const std::string& relative_path); - class VariantCompactionUtil { public: // get the subpaths and sparse paths for the variant column diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index db5594cbb17c27..1b7b825938be52 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -37,36 +37,6 @@ namespace doris { using namespace segment_v2; -namespace { - -std::string join_path_parts(const std::vector& parts) { - std::string path; - for (const auto& part : parts) { - if (part.empty()) { - continue; - } - if (!path.empty()) { - path += "."; - } - path += part; - } - return path; -} - -std::string build_variant_full_path(const TabletColumn& parent_column, - const std::vector& column_paths) { - const auto relative_path = join_path_parts(column_paths); - if (relative_path.empty()) { - return ""; - } - if (parent_column.name_lower_case().empty()) { - return relative_path; - } - return parent_column.name_lower_case() + "." + relative_path; -} - -} // namespace - VSlotRef* PredicateCollector::find_slot_ref(const VExprSPtr& expr) const { if (!expr) { return nullptr; @@ -124,47 +94,20 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS } const auto& column = tablet_schema->column(col_idx); - // Use the column-aware overload so variant sub-column indexes from - // _path_set_info_map (typed paths / inherited sub-column indexes) are - // resolved correctly. The simple (col_unique_id, suffix_path) lookup misses - // those. auto index_metas = tablet_schema->inverted_indexs(column); std::vector> owned_index_metas; std::string index_suffix_path = column.suffix_path(); - if (index_metas.empty() && !sd->column_paths().empty()) { - // Variant sub-column with a field_pattern-based index. The index is - // registered under the pattern string (for example "user.*"), not the - // resolved relative path (for example "user.name"). Resolve the parent - // variant column and run the same MATCH_NAME / MATCH_NAME_GLOB matching - // used by variant_util::generate_sub_column_info, then look up by the - // matched pattern. - int32_t parent_unique_id = sd->col_unique_id(); - const TabletColumn* parent_column = nullptr; - if (auto direct_idx = tablet_schema->field_index(parent_unique_id); direct_idx != -1) { - parent_column = &tablet_schema->column(direct_idx); - } else if (column.is_extracted_column()) { - parent_unique_id = column.parent_unique_id(); - if (auto inherited_idx = tablet_schema->field_index(parent_unique_id); - inherited_idx != -1) { - parent_column = &tablet_schema->column(inherited_idx); - } - } - - if (parent_column != nullptr) { - const auto relative_path = join_path_parts(sd->column_paths()); - const auto matched_pattern = - variant_util::find_matching_sub_column_pattern(*parent_column, relative_path); - if (!matched_pattern.empty()) { - index_suffix_path = build_variant_full_path(*parent_column, sd->column_paths()); - const auto field_pattern_indexes = tablet_schema->inverted_index_by_field_pattern( - parent_unique_id, matched_pattern); - for (const auto& index : field_pattern_indexes) { - auto index_ptr = std::make_shared(*index); - index_ptr->set_escaped_escaped_index_suffix_path(index_suffix_path); - index_metas.push_back(index_ptr.get()); - owned_index_metas.push_back(std::move(index_ptr)); - } + if (index_metas.empty() && column.is_extracted_column()) { + TabletSchema::SubColumnInfo sub_column_info; + auto relative_path = column.path_info_ptr()->copy_pop_front(); + if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(), + relative_path.get_path(), &sub_column_info) && + !sub_column_info.indexes.empty()) { + index_suffix_path = sub_column_info.column.suffix_path(); + for (auto& index : sub_column_info.indexes) { + index_metas.push_back(index.get()); + owned_index_metas.emplace_back(std::move(index)); } } } @@ -342,4 +285,4 @@ SearchPredicateCollector::ClauseTypeCategory SearchPredicateCollector::get_claus } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 1513181cada085..6f3585a097f897 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -821,9 +821,12 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantFieldPatternGlobInd ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); ASSERT_TRUE(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "user.name").empty()); ASSERT_EQ(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "user.*").size(), 1u); - EXPECT_EQ(variant_util::find_matching_sub_column_pattern(tablet_schema->column(/*ordinal=*/0), - "user.name"), - "user.*"); + TabletSchema::SubColumnInfo sub_column_info; + ASSERT_TRUE(variant_util::generate_sub_column_info(*tablet_schema, kVariantUid, "user.name", + &sub_column_info)); + ASSERT_EQ(sub_column_info.indexes.size(), 1u); + EXPECT_EQ(sub_column_info.column.suffix_path(), "meta.user.name"); + EXPECT_EQ(sub_column_info.indexes[0]->index_name(), "variant_field_pattern_glob_idx"); constexpr int kSlotId = 44; runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid, @@ -889,4 +892,4 @@ TEST(TermInfoComparerTest, OrdersByTermAndDedups) { EXPECT_THAT(ordered, ::testing::ElementsAre("apple", "banana", "cherry")); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/regression-test/suites/inverted_index_p0/test_bm25_score.groovy b/regression-test/suites/inverted_index_p0/test_bm25_score.groovy index 2686011e89e3b2..b68f5c20b52916 100644 --- a/regression-test/suites/inverted_index_p0/test_bm25_score.groovy +++ b/regression-test/suites/inverted_index_p0/test_bm25_score.groovy @@ -226,6 +226,53 @@ suite("test_bm25_score", "p0") { } finally { } + try { + sql """ set enable_common_expr_pushdown = true; """ + sql """ set enable_match_without_inverted_index = false; """ + sql """ set default_variant_enable_typed_paths_to_sparse = false; """ + sql """ set default_variant_enable_doc_mode = false; """ + + sql "DROP TABLE IF EXISTS test_variant_field_pattern_score" + sql """ + CREATE TABLE test_variant_field_pattern_score ( + id INT, + meta VARIANT, + INDEX idx_meta_user(meta) USING INVERTED PROPERTIES( + "parser"="english", + "support_phrase"="true", + "field_pattern"="user.*" + ) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + + sql """ insert into test_variant_field_pattern_score values(3, '{"other": "alice"}'); """ + sql """ sync """ + sql """ + insert into test_variant_field_pattern_score values + (1, '{"user": {"name": "alice alpha"}}'), + (2, '{"user": {"name": "bob beta"}}'); + """ + sql """ sync """ + + def res = sql """ + select id, score() as score + from test_variant_field_pattern_score + where cast(meta["user"]["name"] as string) match_phrase "alice" + order by score() desc, id + limit 10; + """ + assertEquals(1, res.size()) + assertEquals(1, res[0][0] as int) + assertTrue(Double.parseDouble(res[0][1].toString()) > 0.0) + } finally { + } + try { sql "DROP TABLE IF EXISTS t2" sql """ create table t2(a int, b int, s text) unique key(a) DISTRIBUTED BY HASH(a) buckets 1 PROPERTIES ("replication_allocation" = "tag.location.default: 1"); """ @@ -247,4 +294,4 @@ suite("test_bm25_score", "p0") { } finally { } } -} \ No newline at end of file +} From d483432e1e0204dc44371a1ac3574cab1489fc24 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 3 May 2026 19:27:15 +0800 Subject: [PATCH 03/18] [fix](inverted index) handle variant score parent index fallback --- be/src/storage/predicate_collector.cpp | 51 +++++++++++-- .../compaction/collection_statistics_test.cpp | 72 +++++++++++++++++++ .../inverted_index_p0/test_bm25_score.groovy | 2 +- 3 files changed, 120 insertions(+), 5 deletions(-) diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index 1b7b825938be52..a6c4e51ed5447f 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -98,17 +98,60 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS std::vector> owned_index_metas; std::string index_suffix_path = column.suffix_path(); - if (index_metas.empty() && column.is_extracted_column()) { + if (index_metas.empty()) { + int32_t parent_unique_id = -1; + std::string relative_path; + std::string suffix_path; + + if (column.is_extracted_column()) { + parent_unique_id = column.parent_unique_id(); + relative_path = column.path_info_ptr()->copy_pop_front().get_path(); + suffix_path = column.suffix_path(); + } else if (sd->col_unique_id() >= 0 && !sd->column_paths().empty() && + tablet_schema->has_column_unique_id(sd->col_unique_id())) { + const auto& parent_column = tablet_schema->column_by_uid(sd->col_unique_id()); + if (parent_column.is_variant_type()) { + parent_unique_id = parent_column.unique_id(); + for (size_t i = 0; i < sd->column_paths().size(); ++i) { + if (i > 0) { + relative_path += "."; + } + relative_path += sd->column_paths()[i]; + } + suffix_path = parent_column.name_lower_case() + "." + relative_path; + } + } + TabletSchema::SubColumnInfo sub_column_info; - auto relative_path = column.path_info_ptr()->copy_pop_front(); - if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(), - relative_path.get_path(), &sub_column_info) && + if (parent_unique_id >= 0 && + variant_util::generate_sub_column_info(*tablet_schema, parent_unique_id, relative_path, + &sub_column_info) && !sub_column_info.indexes.empty()) { index_suffix_path = sub_column_info.column.suffix_path(); for (auto& index : sub_column_info.indexes) { index_metas.push_back(index.get()); owned_index_metas.emplace_back(std::move(index)); } + } else if (parent_unique_id >= 0) { + TabletIndexes inherited_indexes; + const TabletColumn* inherit_column = &column; + TabletColumn typed_column; + if (!suffix_path.empty() && expr->children()[0]->data_type() != nullptr) { + typed_column = variant_util::get_column_by_type( + expr->children()[0]->data_type(), left_slot_ref->column_name(), + {.unique_id = -1, + .parent_unique_id = parent_unique_id, + .path_info = PathInData(suffix_path)}); + inherit_column = &typed_column; + } + if (variant_util::inherit_index(tablet_schema->inverted_indexs(parent_unique_id), + inherited_indexes, *inherit_column)) { + index_suffix_path = inherit_column->suffix_path(); + for (auto& index : inherited_indexes) { + index_metas.push_back(index.get()); + owned_index_metas.emplace_back(std::move(index)); + } + } } } diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 6f3585a097f897..c33a238b17c3f6 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -25,6 +25,7 @@ #include #include "common/exception.h" +#include "core/data_type/data_type_string.h" #include "exec/common/variant_util.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" @@ -693,6 +694,77 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantSubcolumnIndex) { EXPECT_EQ(it->second.index_meta->index_name(), "variant_subcolumn_idx"); } +// Regression for score on a dynamic variant sub-column inherited from a parent +// variant inverted index. There is no sub-column template in this case, so +// generate_sub_column_info() returns false and the collector must still inherit +// the parent index for the actual Lucene field v.key. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutTemplate) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9004; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("v"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("v.key"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("v.key"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2004); + index_pb.set_index_name("variant_parent_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; + (*props)["support_phrase"] = "true"; + + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); + ASSERT_EQ(tablet_schema->inverted_indexs(kVariantUid).size(), 1u); + TabletSchema::SubColumnInfo sub_column_info; + ASSERT_FALSE(variant_util::generate_sub_column_info(*tablet_schema, kVariantUid, "key", + &sub_column_info)); + + constexpr int kSlotId = 45; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid, "v.key", + {"key"}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto cast_expr = std::make_shared(TExprNodeType::CAST_EXPR); + cast_expr->_data_type = std::make_shared(); + auto slot_ref = std::make_shared("v.key", SlotId(kSlotId)); + auto literal = std::make_shared("abc"); + cast_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(cast_expr); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kVariantUid) + ".v.key")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + ASSERT_NE(it->second.owned_index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_parent_idx"); +} + namespace { // Build a sub-column template for the parent variant column. pattern_type has no diff --git a/regression-test/suites/inverted_index_p0/test_bm25_score.groovy b/regression-test/suites/inverted_index_p0/test_bm25_score.groovy index b68f5c20b52916..3a8ad125dc5076 100644 --- a/regression-test/suites/inverted_index_p0/test_bm25_score.groovy +++ b/regression-test/suites/inverted_index_p0/test_bm25_score.groovy @@ -264,7 +264,7 @@ suite("test_bm25_score", "p0") { select id, score() as score from test_variant_field_pattern_score where cast(meta["user"]["name"] as string) match_phrase "alice" - order by score() desc, id + order by score() desc limit 10; """ assertEquals(1, res.size()) From 21c9e647f81a206d08c53b075310a5a772c4cc52 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Sun, 3 May 2026 21:42:08 +0800 Subject: [PATCH 04/18] [fix](inverted index) minimize variant util changes --- be/src/exec/common/variant_util.cpp | 56 +++++++++++++---------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index fbd1804e23e8e7..4faa43d131ea6a 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -1461,32 +1461,6 @@ void get_field_info(const Field& field, FieldInfo* info) { } } -namespace { - -const TabletColumn* find_matching_sub_column_template(const TabletColumn& parent_column, - const std::string& relative_path) { - for (const auto& sub_column : parent_column.get_sub_columns()) { - const std::string& pattern = sub_column->name(); - switch (sub_column->pattern_type()) { - case PatternTypePB::MATCH_NAME: - if (pattern == relative_path) { - return sub_column.get(); - } - break; - case PatternTypePB::MATCH_NAME_GLOB: - if (glob_match_re2(pattern, relative_path)) { - return sub_column.get(); - } - break; - default: - break; - } - } - return nullptr; -} - -} // namespace - bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, TabletSchema::SubColumnInfo* sub_column_info) { @@ -1534,13 +1508,31 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, } }; - const TabletColumn* sub_column = find_matching_sub_column_template(parent_column, path); - if (sub_column == nullptr) { - return false; + const auto& sub_columns = parent_column.get_sub_columns(); + for (const auto& sub_column : sub_columns) { + const char* pattern = sub_column->name().c_str(); + switch (sub_column->pattern_type()) { + case PatternTypePB::MATCH_NAME: { + if (strcmp(pattern, path.c_str()) == 0) { + generate_result_column(*sub_column, &sub_column_info->column); + generate_index(sub_column->name()); + return true; + } + break; + } + case PatternTypePB::MATCH_NAME_GLOB: { + if (glob_match_re2(pattern, path)) { + generate_result_column(*sub_column, &sub_column_info->column); + generate_index(sub_column->name()); + return true; + } + break; + } + default: + break; + } } - generate_result_column(*sub_column, &sub_column_info->column); - generate_index(sub_column->name()); - return true; + return false; } TabletSchemaSPtr VariantCompactionUtil::calculate_variant_extended_schema( From 69b5dc41e1ac8270ee7d69833767f93565fdf281 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 00:43:53 +0800 Subject: [PATCH 05/18] [fix](inverted index) collapse variant score fallback and drop typed_column rewrite - Collapse fallback entry into 'if (index_metas.empty() && column.is_extracted_column())', remove the unreachable branch keyed on sd->column_paths(). - Drop typed_column rewrite based on expr->children()[0]->data_type() (post-cast type can mis-route inherit_index into numeric branch and strip parser/analyzer). Pass schema TabletColumn directly to inherit_index. - Re-label ExtractCollectInfoForVariantParentIndexWithoutTemplate as a synthetic defensive fallback case; sub-column type changed to STRING so inherit_index's slice_type branch is exercised. CAST wrapper preserved to validate that the collector does not consult cast result type. --- be/src/storage/predicate_collector.cpp | 52 +++++-------------- .../compaction/collection_statistics_test.cpp | 13 +++-- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index a6c4e51ed5447f..38781c1e3c5c8e 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -98,33 +98,19 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS std::vector> owned_index_metas; std::string index_suffix_path = column.suffix_path(); - if (index_metas.empty()) { - int32_t parent_unique_id = -1; - std::string relative_path; - std::string suffix_path; - - if (column.is_extracted_column()) { - parent_unique_id = column.parent_unique_id(); - relative_path = column.path_info_ptr()->copy_pop_front().get_path(); - suffix_path = column.suffix_path(); - } else if (sd->col_unique_id() >= 0 && !sd->column_paths().empty() && - tablet_schema->has_column_unique_id(sd->col_unique_id())) { - const auto& parent_column = tablet_schema->column_by_uid(sd->col_unique_id()); - if (parent_column.is_variant_type()) { - parent_unique_id = parent_column.unique_id(); - for (size_t i = 0; i < sd->column_paths().size(); ++i) { - if (i > 0) { - relative_path += "."; - } - relative_path += sd->column_paths()[i]; - } - suffix_path = parent_column.name_lower_case() + "." + relative_path; - } - } + // Defensive fallback for variant sub-columns whose index was not registered + // in _col_id_suffix_to_index. Covers field_pattern indexes (resolved via + // generate_sub_column_info) and parent-only indexes whose inheritance has + // not been propagated by inherit_column_attributes. Uses column.type() + // directly rather than the post-cast expr type to avoid mis-routing + // inherit_index into the numeric branch (which would strip parser/analyzer + // from a text parent index). + if (index_metas.empty() && column.is_extracted_column()) { + const int32_t parent_unique_id = column.parent_unique_id(); + const std::string relative_path = column.path_info_ptr()->copy_pop_front().get_path(); TabletSchema::SubColumnInfo sub_column_info; - if (parent_unique_id >= 0 && - variant_util::generate_sub_column_info(*tablet_schema, parent_unique_id, relative_path, + if (variant_util::generate_sub_column_info(*tablet_schema, parent_unique_id, relative_path, &sub_column_info) && !sub_column_info.indexes.empty()) { index_suffix_path = sub_column_info.column.suffix_path(); @@ -132,21 +118,11 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS index_metas.push_back(index.get()); owned_index_metas.emplace_back(std::move(index)); } - } else if (parent_unique_id >= 0) { + } else { TabletIndexes inherited_indexes; - const TabletColumn* inherit_column = &column; - TabletColumn typed_column; - if (!suffix_path.empty() && expr->children()[0]->data_type() != nullptr) { - typed_column = variant_util::get_column_by_type( - expr->children()[0]->data_type(), left_slot_ref->column_name(), - {.unique_id = -1, - .parent_unique_id = parent_unique_id, - .path_info = PathInData(suffix_path)}); - inherit_column = &typed_column; - } if (variant_util::inherit_index(tablet_schema->inverted_indexs(parent_unique_id), - inherited_indexes, *inherit_column)) { - index_suffix_path = inherit_column->suffix_path(); + inherited_indexes, column)) { + index_suffix_path = column.suffix_path(); for (auto& index : inherited_indexes) { index_metas.push_back(index.get()); owned_index_metas.emplace_back(std::move(index)); diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index c33a238b17c3f6..3dbdd5a5eed76d 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -694,10 +694,13 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantSubcolumnIndex) { EXPECT_EQ(it->second.index_meta->index_name(), "variant_subcolumn_idx"); } -// Regression for score on a dynamic variant sub-column inherited from a parent -// variant inverted index. There is no sub-column template in this case, so -// generate_sub_column_info() returns false and the collector must still inherit -// the parent index for the actual Lucene field v.key. +// Synthetic defensive case for the inherit_index fallback. The schema state +// here does NOT match production: _init_variant_columns keeps extracted +// sub-columns at OLAP_FIELD_TYPE_VARIANT and inherit_column_attributes does +// not mutate column.type(). The sub-column is set to STRING here purely so +// the inherit_index slice_type branch is exercised. The CAST(slot AS STRING) +// wrapper is intentionally preserved to validate that the new collector does +// not consult the cast result type when resolving indexes. TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutTemplate) { auto tablet_schema = std::make_shared(); @@ -712,7 +715,7 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutT TabletColumn sub_col; sub_col.set_unique_id(-1); sub_col.set_name("v.key"); - sub_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); sub_col.set_parent_unique_id(kVariantUid); PathInData path("v.key"); sub_col.set_path_info(path); From 948049ca362122b759c60ca3675ab510efe98eee Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 00:49:07 +0800 Subject: [PATCH 06/18] [refactor](inverted index) extract resolve_subcolumn_indexes_inheritance helper Pull the shared 'generate_sub_column_info -> inherit_index parent fallback' logic out of MatchPredicateCollector::collect and VariantColumnReader::find_subcolumn_tablet_indexes into variant_util. Segment side keeps its sparse-column / nested-group / inferred-type handling around the helper; collector side passes the schema column directly. Addresses review comment r3177879193. --- be/src/exec/common/variant_util.cpp | 15 ++++ be/src/exec/common/variant_util.h | 13 +++ be/src/storage/predicate_collector.cpp | 36 ++------ .../segment/variant/variant_column_reader.cpp | 86 +++++++++---------- 4 files changed, 77 insertions(+), 73 deletions(-) diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index 4faa43d131ea6a..888f2a214556d3 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -2180,4 +2180,19 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t return Status::OK(); } +TabletIndexes resolve_subcolumn_indexes_inheritance(const TabletSchema& schema, + int32_t parent_unique_id, + const std::string& relative_path, + const TabletColumn& inheritance_column) { + TabletSchema::SubColumnInfo sub_column_info; + if (generate_sub_column_info(schema, parent_unique_id, relative_path, &sub_column_info) && + !sub_column_info.indexes.empty()) { + return std::move(sub_column_info.indexes); + } + + TabletIndexes inherited; + inherit_index(schema.inverted_indexs(parent_unique_id), inherited, inheritance_column); + return inherited; +} + } // namespace doris::variant_util diff --git a/be/src/exec/common/variant_util.h b/be/src/exec/common/variant_util.h index f4302146972e2c..21354054fc8590 100644 --- a/be/src/exec/common/variant_util.h +++ b/be/src/exec/common/variant_util.h @@ -185,6 +185,19 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, TabletSchema::SubColumnInfo* sub_column_info); +// Resolve inverted indexes for an extracted variant sub-column by trying: +// 1. generate_sub_column_info (field_pattern MATCH_NAME / MATCH_NAME_GLOB) +// 2. inherit_index from parent variant indexes +// inheritance_column drives inherit_index's slice/numeric branch decision: +// - collector side: pass the schema column directly. +// - segment side: pass a TabletColumn synthesized from inferred storage +// type with nested-group-aware path. +// Returns cloned TabletIndex shared_ptrs (caller keeps them alive). +TabletIndexes resolve_subcolumn_indexes_inheritance(const TabletSchema& schema, + int32_t parent_unique_id, + const std::string& relative_path, + const TabletColumn& inheritance_column); + class VariantCompactionUtil { public: // get the subpaths and sparse paths for the variant column diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index 38781c1e3c5c8e..4a28871fb51e26 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -98,35 +98,15 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS std::vector> owned_index_metas; std::string index_suffix_path = column.suffix_path(); - // Defensive fallback for variant sub-columns whose index was not registered - // in _col_id_suffix_to_index. Covers field_pattern indexes (resolved via - // generate_sub_column_info) and parent-only indexes whose inheritance has - // not been propagated by inherit_column_attributes. Uses column.type() - // directly rather than the post-cast expr type to avoid mis-routing - // inherit_index into the numeric branch (which would strip parser/analyzer - // from a text parent index). if (index_metas.empty() && column.is_extracted_column()) { - const int32_t parent_unique_id = column.parent_unique_id(); - const std::string relative_path = column.path_info_ptr()->copy_pop_front().get_path(); - - TabletSchema::SubColumnInfo sub_column_info; - if (variant_util::generate_sub_column_info(*tablet_schema, parent_unique_id, relative_path, - &sub_column_info) && - !sub_column_info.indexes.empty()) { - index_suffix_path = sub_column_info.column.suffix_path(); - for (auto& index : sub_column_info.indexes) { - index_metas.push_back(index.get()); - owned_index_metas.emplace_back(std::move(index)); - } - } else { - TabletIndexes inherited_indexes; - if (variant_util::inherit_index(tablet_schema->inverted_indexs(parent_unique_id), - inherited_indexes, column)) { - index_suffix_path = column.suffix_path(); - for (auto& index : inherited_indexes) { - index_metas.push_back(index.get()); - owned_index_metas.emplace_back(std::move(index)); - } + auto inherited = variant_util::resolve_subcolumn_indexes_inheritance( + *tablet_schema, column.parent_unique_id(), + column.path_info_ptr()->copy_pop_front().get_path(), column); + if (!inherited.empty()) { + index_suffix_path = column.suffix_path(); + for (auto& idx : inherited) { + index_metas.push_back(idx.get()); + owned_index_metas.emplace_back(std::move(idx)); } } } diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp b/be/src/storage/segment/variant/variant_column_reader.cpp index 24544984dc02e1..0f62a6bffd69ca 100644 --- a/be/src/storage/segment/variant/variant_column_reader.cpp +++ b/be/src/storage/segment/variant/variant_column_reader.cpp @@ -1325,55 +1325,51 @@ Status VariantColumnReader::load_external_meta_once() { TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletColumn& column, const DataTypePtr& data_type) { - TabletSchema::SubColumnInfo sub_column_info; - const auto& parent_index = _tablet_schema->inverted_indexs(column.parent_unique_id()); auto relative_path = column.path_info_ptr()->copy_pop_front(); - // if subcolumn has index, add index to _variant_subcolumns_indexes - if (variant_util::generate_sub_column_info(*_tablet_schema, column.parent_unique_id(), - relative_path.get_path(), &sub_column_info) && - !sub_column_info.indexes.empty()) { - return sub_column_info.indexes; - } - - // Otherwise, inherit index from the VARIANT parent column. - if (!parent_index.empty() && data_type->get_primitive_type() != PrimitiveType::TYPE_VARIANT && - data_type->get_primitive_type() != PrimitiveType::TYPE_MAP /*SPARSE COLUMN*/) { - // type in column maynot be real type, so use data_type to get the real type - PathInData index_path {*column.path_info_ptr()}; - DataTypePtr index_data_type = data_type; - if (!relative_path.empty()) { - auto [nested_reader, _] = find_nested_group_for_path(relative_path.get_path()); - const std::string root_path(kRootNestedGroupPath); - - if (nested_reader != nullptr) { - const bool is_root_ng = nested_reader->array_path == root_path; - if (!is_root_ng) { - // Named NG — use variant-relative path (consistent with write path) - index_path = relative_path; - } else { - // $root NG — prefix path with __D0_root__ - index_path = PathInData(root_path + "." + relative_path.get_path()); - } - // Unwrap Nullable(Array(...)) → element type for NG subcolumns - if (data_type->is_nullable()) { - auto base = variant_util::get_base_type_of_array(remove_nullable(data_type)); - index_data_type = base->is_nullable() ? base : make_nullable(base); - } else { - index_data_type = variant_util::get_base_type_of_array(data_type); - } + // Sparse columns (VARIANT/MAP) only resolve via field_pattern templates; + // no parent index inheritance. + if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT || + data_type->get_primitive_type() == PrimitiveType::TYPE_MAP) { + TabletSchema::SubColumnInfo sub_column_info; + if (variant_util::generate_sub_column_info(*_tablet_schema, column.parent_unique_id(), + relative_path.get_path(), &sub_column_info) && + !sub_column_info.indexes.empty()) { + return std::move(sub_column_info.indexes); + } + return {}; + } + + // Build inheritance column from inferred type + nested-group-aware path. + // type in column maynot be real type, so use data_type to get the real type. + PathInData index_path {*column.path_info_ptr()}; + DataTypePtr index_data_type = data_type; + if (!relative_path.empty()) { + auto [nested_reader, _] = find_nested_group_for_path(relative_path.get_path()); + const std::string root_path(kRootNestedGroupPath); + if (nested_reader != nullptr) { + const bool is_root_ng = nested_reader->array_path == root_path; + if (!is_root_ng) { + index_path = relative_path; + } else { + index_path = PathInData(root_path + "." + relative_path.get_path()); + } + if (data_type->is_nullable()) { + auto base = variant_util::get_base_type_of_array(remove_nullable(data_type)); + index_data_type = base->is_nullable() ? base : make_nullable(base); + } else { + index_data_type = variant_util::get_base_type_of_array(data_type); } - // else: non-NG scalar field — keep index_path and index_data_type unchanged } - TabletColumn target_column = - variant_util::get_column_by_type(index_data_type, column.name(), - {.unique_id = -1, - .parent_unique_id = column.parent_unique_id(), - .path_info = index_path}); - variant_util::inherit_index(parent_index, sub_column_info.indexes, target_column); - } - // Return shared_ptr directly to maintain object lifetime - return sub_column_info.indexes; + } + TabletColumn inheritance_column = + variant_util::get_column_by_type(index_data_type, column.name(), + {.unique_id = -1, + .parent_unique_id = column.parent_unique_id(), + .path_info = index_path}); + + return variant_util::resolve_subcolumn_indexes_inheritance( + *_tablet_schema, column.parent_unique_id(), relative_path.get_path(), inheritance_column); } void VariantColumnReader::get_subcolumns_types( From 01eb8023661d0c4efdaa1c15d94c1500ae9c72a1 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 00:55:00 +0800 Subject: [PATCH 07/18] [test](inverted index) regression for variant score with field_pattern exact and plain parent baseline --- .../test_bm25_score_variant.groovy | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 regression-test/suites/inverted_index_p0/test_bm25_score_variant.groovy diff --git a/regression-test/suites/inverted_index_p0/test_bm25_score_variant.groovy b/regression-test/suites/inverted_index_p0/test_bm25_score_variant.groovy new file mode 100644 index 00000000000000..885d311bdfc01a --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_bm25_score_variant.groovy @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_bm25_score_variant", "p0") { + if (isCloudMode()) { + return + } + + sql """ set enable_common_expr_pushdown = true """ + sql """ set enable_match_without_inverted_index = false """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + sql """ set default_variant_enable_doc_mode = false """ + + // A1: field_pattern exact name (MATCH_NAME) + try { + sql "DROP TABLE IF EXISTS test_bm25_score_variant_a1" + sql """ + CREATE TABLE test_bm25_score_variant_a1 ( + id INT, + v variant< + MATCH_NAME 'host' : text, + PROPERTIES("variant_max_subcolumns_count"="0") + >, + INDEX idx_v_host (v) USING INVERTED PROPERTIES( + "parser"="english", + "support_phrase"="true", + "field_pattern"="host" + ) + ) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + sql """ insert into test_bm25_score_variant_a1 values + (1, '{"host":"alpha database server"}'), + (2, '{"host":"beta server cluster"}'), + (3, '{"other":"alpha"}') + """ + sql " sync " + + def res = sql """ + select id, score() as score + from test_bm25_score_variant_a1 + where cast(v["host"] as string) match_phrase "alpha" + order by score() desc + limit 10 + """ + assertEquals(1, res.size()) + assertEquals(1, res[0][0] as int) + assertTrue(Double.parseDouble(res[0][1].toString()) > 0.0) + } finally { + } + + // C: plain parent inverted index (baseline; not the fallback path) + try { + sql "DROP TABLE IF EXISTS test_bm25_score_variant_c" + sql """ + CREATE TABLE test_bm25_score_variant_c ( + id INT, + v VARIANT, + INDEX idx_v_plain (v) USING INVERTED PROPERTIES( + "parser"="english", + "support_phrase"="true" + ) + ) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + sql """ insert into test_bm25_score_variant_c values + (1, '{"note":"latency spike at noon"}'), + (2, '{"note":"all green"}') + """ + sql " sync " + + def res = sql """ + select id, score() as score + from test_bm25_score_variant_c + where cast(v["note"] as string) match_phrase "latency" + order by score() desc + limit 10 + """ + assertEquals(1, res.size()) + assertEquals(1, res[0][0] as int) + assertTrue(Double.parseDouble(res[0][1].toString()) > 0.0) + } finally { + } +} From e12ab7f9ee7bd83350bcea6792d2116d969c007a Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 01:47:39 +0800 Subject: [PATCH 08/18] [fix](inverted index) resolve score stats for materialized variant paths - Keep field_pattern resolution ahead of parent inheritance and do not inherit unmatched templates as plain indexes. - Recover plain parent variant indexes for scan-time materialized VARIANT sub-column placeholders by cloning the parent index with the concrete suffix path. - Update the collection statistics regression to match the production schema shape for dynamic variant paths. --- be/src/exec/common/variant_util.cpp | 17 ++++++++++++++++- be/src/exec/common/variant_util.h | 4 ++++ .../compaction/collection_statistics_test.cpp | 14 ++++++-------- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index 888f2a214556d3..866f2da8ac48e7 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -2191,7 +2191,22 @@ TabletIndexes resolve_subcolumn_indexes_inheritance(const TabletSchema& schema, } TabletIndexes inherited; - inherit_index(schema.inverted_indexs(parent_unique_id), inherited, inheritance_column); + const auto parent_indexes = schema.inverted_indexs(parent_unique_id); + if (inherit_index(parent_indexes, inherited, inheritance_column)) { + return inherited; + } + + if (inheritance_column.is_extracted_column() && inheritance_column.is_variant_type()) { + for (const auto* index : parent_indexes) { + if (!index->field_pattern().empty()) { + continue; + } + auto index_ptr = std::make_shared(*index); + index_ptr->set_escaped_escaped_index_suffix_path( + inheritance_column.path_info_ptr()->get_path()); + inherited.emplace_back(std::move(index_ptr)); + } + } return inherited; } diff --git a/be/src/exec/common/variant_util.h b/be/src/exec/common/variant_util.h index 21354054fc8590..c051b1b89b6270 100644 --- a/be/src/exec/common/variant_util.h +++ b/be/src/exec/common/variant_util.h @@ -192,6 +192,10 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, // - collector side: pass the schema column directly. // - segment side: pass a TabletColumn synthesized from inferred storage // type with nested-group-aware path. +// If the collector only has a scan-time materialized VARIANT placeholder for a +// dynamic path, clone plain parent indexes with that path suffix instead of +// using the post-cast expression type. field_pattern template indexes must have +// matched in step 1; unmatched templates are not inherited as plain indexes. // Returns cloned TabletIndex shared_ptrs (caller keeps them alive). TabletIndexes resolve_subcolumn_indexes_inheritance(const TabletSchema& schema, int32_t parent_unique_id, diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 3dbdd5a5eed76d..464dcd36cf4868 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -694,13 +694,11 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantSubcolumnIndex) { EXPECT_EQ(it->second.index_meta->index_name(), "variant_subcolumn_idx"); } -// Synthetic defensive case for the inherit_index fallback. The schema state -// here does NOT match production: _init_variant_columns keeps extracted -// sub-columns at OLAP_FIELD_TYPE_VARIANT and inherit_column_attributes does -// not mutate column.type(). The sub-column is set to STRING here purely so -// the inherit_index slice_type branch is exercised. The CAST(slot AS STRING) -// wrapper is intentionally preserved to validate that the new collector does -// not consult the cast result type when resolving indexes. +// Regression for score on a dynamic variant sub-column inherited from a parent +// variant inverted index. This matches the scan-time schema shape: +// _init_variant_columns materializes the accessed path as an extracted VARIANT +// placeholder, so the collector must not depend on the post-cast expression type +// to recover the parent text index for the concrete Lucene field. TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutTemplate) { auto tablet_schema = std::make_shared(); @@ -715,7 +713,7 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutT TabletColumn sub_col; sub_col.set_unique_id(-1); sub_col.set_name("v.key"); - sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); sub_col.set_parent_unique_id(kVariantUid); PathInData path("v.key"); sub_col.set_path_info(path); From 452a20edc5cadf01fbcbf4eea231ccce789973f3 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 02:01:24 +0800 Subject: [PATCH 09/18] [fix](inverted index) format variant column reader --- be/src/storage/segment/variant/variant_column_reader.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp b/be/src/storage/segment/variant/variant_column_reader.cpp index 0f62a6bffd69ca..adfb185718a4b4 100644 --- a/be/src/storage/segment/variant/variant_column_reader.cpp +++ b/be/src/storage/segment/variant/variant_column_reader.cpp @@ -1369,7 +1369,8 @@ TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletCol .path_info = index_path}); return variant_util::resolve_subcolumn_indexes_inheritance( - *_tablet_schema, column.parent_unique_id(), relative_path.get_path(), inheritance_column); + *_tablet_schema, column.parent_unique_id(), relative_path.get_path(), + inheritance_column); } void VariantColumnReader::get_subcolumns_types( From a1730c2a54bfbf29ebb23122cc4b532ec82a975e Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 14:46:22 +0800 Subject: [PATCH 10/18] [fix](inverted index) restore find_subcolumn_tablet_indexes to master Revert VariantColumnReader::find_subcolumn_tablet_indexes to the master implementation. The previous refactor reorganized this function around the shared variant_util helper, which inadvertently introduced a behavioral edge: when index_data_type is unwrapped to VARIANT in a nested-group scenario, the helper's plain-parent-index clone branch fired, returning indexes the original logic would not have returned. Segment-level resolution is intentionally left in this function with its data_type-driven nested/inherit logic, addressing review comment r3185932957. --- .../segment/variant/variant_column_reader.cpp | 87 ++++++++++--------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp b/be/src/storage/segment/variant/variant_column_reader.cpp index adfb185718a4b4..24544984dc02e1 100644 --- a/be/src/storage/segment/variant/variant_column_reader.cpp +++ b/be/src/storage/segment/variant/variant_column_reader.cpp @@ -1325,52 +1325,55 @@ Status VariantColumnReader::load_external_meta_once() { TabletIndexes VariantColumnReader::find_subcolumn_tablet_indexes(const TabletColumn& column, const DataTypePtr& data_type) { + TabletSchema::SubColumnInfo sub_column_info; + const auto& parent_index = _tablet_schema->inverted_indexs(column.parent_unique_id()); auto relative_path = column.path_info_ptr()->copy_pop_front(); + // if subcolumn has index, add index to _variant_subcolumns_indexes + if (variant_util::generate_sub_column_info(*_tablet_schema, column.parent_unique_id(), + relative_path.get_path(), &sub_column_info) && + !sub_column_info.indexes.empty()) { + return sub_column_info.indexes; + } + + // Otherwise, inherit index from the VARIANT parent column. + if (!parent_index.empty() && data_type->get_primitive_type() != PrimitiveType::TYPE_VARIANT && + data_type->get_primitive_type() != PrimitiveType::TYPE_MAP /*SPARSE COLUMN*/) { + // type in column maynot be real type, so use data_type to get the real type + PathInData index_path {*column.path_info_ptr()}; + DataTypePtr index_data_type = data_type; + if (!relative_path.empty()) { + auto [nested_reader, _] = find_nested_group_for_path(relative_path.get_path()); + const std::string root_path(kRootNestedGroupPath); + + if (nested_reader != nullptr) { + const bool is_root_ng = nested_reader->array_path == root_path; + if (!is_root_ng) { + // Named NG — use variant-relative path (consistent with write path) + index_path = relative_path; + } else { + // $root NG — prefix path with __D0_root__ + index_path = PathInData(root_path + "." + relative_path.get_path()); + } - // Sparse columns (VARIANT/MAP) only resolve via field_pattern templates; - // no parent index inheritance. - if (data_type->get_primitive_type() == PrimitiveType::TYPE_VARIANT || - data_type->get_primitive_type() == PrimitiveType::TYPE_MAP) { - TabletSchema::SubColumnInfo sub_column_info; - if (variant_util::generate_sub_column_info(*_tablet_schema, column.parent_unique_id(), - relative_path.get_path(), &sub_column_info) && - !sub_column_info.indexes.empty()) { - return std::move(sub_column_info.indexes); - } - return {}; - } - - // Build inheritance column from inferred type + nested-group-aware path. - // type in column maynot be real type, so use data_type to get the real type. - PathInData index_path {*column.path_info_ptr()}; - DataTypePtr index_data_type = data_type; - if (!relative_path.empty()) { - auto [nested_reader, _] = find_nested_group_for_path(relative_path.get_path()); - const std::string root_path(kRootNestedGroupPath); - if (nested_reader != nullptr) { - const bool is_root_ng = nested_reader->array_path == root_path; - if (!is_root_ng) { - index_path = relative_path; - } else { - index_path = PathInData(root_path + "." + relative_path.get_path()); - } - if (data_type->is_nullable()) { - auto base = variant_util::get_base_type_of_array(remove_nullable(data_type)); - index_data_type = base->is_nullable() ? base : make_nullable(base); - } else { - index_data_type = variant_util::get_base_type_of_array(data_type); + // Unwrap Nullable(Array(...)) → element type for NG subcolumns + if (data_type->is_nullable()) { + auto base = variant_util::get_base_type_of_array(remove_nullable(data_type)); + index_data_type = base->is_nullable() ? base : make_nullable(base); + } else { + index_data_type = variant_util::get_base_type_of_array(data_type); + } } + // else: non-NG scalar field — keep index_path and index_data_type unchanged } - } - TabletColumn inheritance_column = - variant_util::get_column_by_type(index_data_type, column.name(), - {.unique_id = -1, - .parent_unique_id = column.parent_unique_id(), - .path_info = index_path}); - - return variant_util::resolve_subcolumn_indexes_inheritance( - *_tablet_schema, column.parent_unique_id(), relative_path.get_path(), - inheritance_column); + TabletColumn target_column = + variant_util::get_column_by_type(index_data_type, column.name(), + {.unique_id = -1, + .parent_unique_id = column.parent_unique_id(), + .path_info = index_path}); + variant_util::inherit_index(parent_index, sub_column_info.indexes, target_column); + } + // Return shared_ptr directly to maintain object lifetime + return sub_column_info.indexes; } void VariantColumnReader::get_subcolumns_types( From efdde31068bf181020260c74d57a5c918cd022d1 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 14:46:36 +0800 Subject: [PATCH 11/18] [fix](inverted index) restrict collector fallback to schema-only field_pattern lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MatchPredicateCollector::collect runs at tablet level (per-scan, before any segment is opened), so it cannot perform segment-level operations such as nested-group inference or inherit_index runtime-type dispatch — those need data_type and _subcolumns_meta_info that only segment readers have. Collapse the variant fallback to a schema-only generate_sub_column_info lookup, which covers field_pattern templates (MATCH_NAME / MATCH_NAME_GLOB). Plain inherited sub-column indexes are already resolved through the column-aware inverted_indexs(column) call (populated by inherit_column_attributes during OlapScanner::_init_variant_columns). Rewrite the synthetic VARIANT-placeholder UT to document this boundary: when neither schema lookup resolves any index, collect emits no CollectInfo in BE_TEST builds (production raises INVERTED_INDEX_NOT_SUPPORTED via the #ifndef BE_TEST guard, covered by regression tests). --- be/src/storage/predicate_collector.cpp | 20 ++++++++++++----- .../compaction/collection_statistics_test.cpp | 22 +++++++++---------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index 4a28871fb51e26..a5b74ff47ea0f7 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -98,13 +98,21 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS std::vector> owned_index_metas; std::string index_suffix_path = column.suffix_path(); + // Schema-only fallback for variant sub-column field_pattern templates. + // Collector runs at tablet level without segment context, so it cannot do + // nested-group inference or inherit_index runtime-type dispatch — those + // need segment data_type and _subcolumns_meta_info. Plain inherited + // sub-column indexes are already resolved by inverted_indexs(column) above + // (populated by inherit_column_attributes during _init_variant_columns). if (index_metas.empty() && column.is_extracted_column()) { - auto inherited = variant_util::resolve_subcolumn_indexes_inheritance( - *tablet_schema, column.parent_unique_id(), - column.path_info_ptr()->copy_pop_front().get_path(), column); - if (!inherited.empty()) { - index_suffix_path = column.suffix_path(); - for (auto& idx : inherited) { + TabletSchema::SubColumnInfo sub_column_info; + const std::string relative_path = + column.path_info_ptr()->copy_pop_front().get_path(); + if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(), + relative_path, &sub_column_info) && + !sub_column_info.indexes.empty()) { + index_suffix_path = sub_column_info.column.suffix_path(); + for (auto& idx : sub_column_info.indexes) { index_metas.push_back(idx.get()); owned_index_metas.emplace_back(std::move(idx)); } diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 464dcd36cf4868..fe8ad2131c801e 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -694,12 +694,12 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantSubcolumnIndex) { EXPECT_EQ(it->second.index_meta->index_name(), "variant_subcolumn_idx"); } -// Regression for score on a dynamic variant sub-column inherited from a parent -// variant inverted index. This matches the scan-time schema shape: -// _init_variant_columns materializes the accessed path as an extracted VARIANT -// placeholder, so the collector must not depend on the post-cast expression type -// to recover the parent text index for the concrete Lucene field. -TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutTemplate) { +// Schema state where neither inverted_indexs(column) nor generate_sub_column_info +// resolves any index for the variant sub-column. Collector runs at tablet level +// and intentionally has no segment-side fallback (no inherit_index, no +// nested-group inference). In BE_TEST builds the empty-index check is bypassed, +// so collect() returns OK and emits no CollectInfo for this expression. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoEmptyWhenSchemaResolvesNoIndex) { auto tablet_schema = std::make_shared(); constexpr int32_t kVariantUid = 9004; @@ -732,6 +732,9 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutT index.init_from_pb(index_pb); tablet_schema->append_index(std::move(index)); + // Pre-conditions for the schema-only resolution path: + // (1) column-aware lookup empty (no inheritance pre-populated) + // (2) generate_sub_column_info returns false (no field_pattern template) ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); ASSERT_EQ(tablet_schema->inverted_indexs(kVariantUid).size(), 1u); TabletSchema::SubColumnInfo sub_column_info; @@ -758,12 +761,7 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutT auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, &collect_infos); ASSERT_TRUE(status.ok()) << status.msg(); - ASSERT_EQ(collect_infos.size(), 1u); - auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kVariantUid) + ".v.key")); - ASSERT_NE(it, collect_infos.end()); - ASSERT_NE(it->second.index_meta, nullptr); - ASSERT_NE(it->second.owned_index_meta, nullptr); - EXPECT_EQ(it->second.index_meta->index_name(), "variant_parent_idx"); + EXPECT_TRUE(collect_infos.empty()); } namespace { From 3d90512c58459dd6ff9b82a1a3847e17b01e7754 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 14:46:41 +0800 Subject: [PATCH 12/18] [refactor](inverted index) drop unused resolve_subcolumn_indexes_inheritance helper After reverting VariantColumnReader::find_subcolumn_tablet_indexes to its master implementation and restricting the collector fallback to a schema-only generate_sub_column_info call, the shared helper has zero callers. Remove the declaration and definition. --- be/src/exec/common/variant_util.cpp | 30 ----------------------------- be/src/exec/common/variant_util.h | 17 ---------------- 2 files changed, 47 deletions(-) diff --git a/be/src/exec/common/variant_util.cpp b/be/src/exec/common/variant_util.cpp index 866f2da8ac48e7..4faa43d131ea6a 100644 --- a/be/src/exec/common/variant_util.cpp +++ b/be/src/exec/common/variant_util.cpp @@ -2180,34 +2180,4 @@ Status parse_and_materialize_variant_columns(Block& block, const TabletSchema& t return Status::OK(); } -TabletIndexes resolve_subcolumn_indexes_inheritance(const TabletSchema& schema, - int32_t parent_unique_id, - const std::string& relative_path, - const TabletColumn& inheritance_column) { - TabletSchema::SubColumnInfo sub_column_info; - if (generate_sub_column_info(schema, parent_unique_id, relative_path, &sub_column_info) && - !sub_column_info.indexes.empty()) { - return std::move(sub_column_info.indexes); - } - - TabletIndexes inherited; - const auto parent_indexes = schema.inverted_indexs(parent_unique_id); - if (inherit_index(parent_indexes, inherited, inheritance_column)) { - return inherited; - } - - if (inheritance_column.is_extracted_column() && inheritance_column.is_variant_type()) { - for (const auto* index : parent_indexes) { - if (!index->field_pattern().empty()) { - continue; - } - auto index_ptr = std::make_shared(*index); - index_ptr->set_escaped_escaped_index_suffix_path( - inheritance_column.path_info_ptr()->get_path()); - inherited.emplace_back(std::move(index_ptr)); - } - } - return inherited; -} - } // namespace doris::variant_util diff --git a/be/src/exec/common/variant_util.h b/be/src/exec/common/variant_util.h index c051b1b89b6270..f4302146972e2c 100644 --- a/be/src/exec/common/variant_util.h +++ b/be/src/exec/common/variant_util.h @@ -185,23 +185,6 @@ bool generate_sub_column_info(const TabletSchema& schema, int32_t col_unique_id, const std::string& path, TabletSchema::SubColumnInfo* sub_column_info); -// Resolve inverted indexes for an extracted variant sub-column by trying: -// 1. generate_sub_column_info (field_pattern MATCH_NAME / MATCH_NAME_GLOB) -// 2. inherit_index from parent variant indexes -// inheritance_column drives inherit_index's slice/numeric branch decision: -// - collector side: pass the schema column directly. -// - segment side: pass a TabletColumn synthesized from inferred storage -// type with nested-group-aware path. -// If the collector only has a scan-time materialized VARIANT placeholder for a -// dynamic path, clone plain parent indexes with that path suffix instead of -// using the post-cast expression type. field_pattern template indexes must have -// matched in step 1; unmatched templates are not inherited as plain indexes. -// Returns cloned TabletIndex shared_ptrs (caller keeps them alive). -TabletIndexes resolve_subcolumn_indexes_inheritance(const TabletSchema& schema, - int32_t parent_unique_id, - const std::string& relative_path, - const TabletColumn& inheritance_column); - class VariantCompactionUtil { public: // get the subpaths and sparse paths for the variant column From 6c4fff4ac4733896193f6de279a624b9b258be88 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 15:32:03 +0800 Subject: [PATCH 13/18] [test](inverted index) cover all branches of MatchPredicateCollector::collect Add unit tests for the schema-level branches of MatchPredicateCollector::collect that were previously not directly covered: - E1: missing slot reference in left expression - E2: slot descriptor not in runtime descriptor table - E3: column name absent from tablet schema - I1/L3/O1: direct inverted_indexs(column) hit, owned_index_meta null - I2: non-extracted column with no index, fallback skipped - L1: index missing parser is skipped (should_analyzer false) - L2: index without support_phrase is skipped (is_need_similarity_score false) - L4: same field_name across multiple predicates merges term_infos I3/O2 (field_pattern fallback with owned cloned indexes) already covered by ExtractCollectInfoForVariantFieldPatternIndex / ...GlobIndex tests. I4 (neither schema lookup resolves) covered by ExtractCollectInfoEmptyWhenSchemaResolvesNoIndex. E4 (production-only INVERTED_INDEX_NOT_SUPPORTED on empty result) is gated by #ifndef BE_TEST and covered by the test_bm25_score_variant regression suite. --- .../compaction/collection_statistics_test.cpp | 305 ++++++++++++++++++ 1 file changed, 305 insertions(+) diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index fe8ad2131c801e..831d307d1c0d23 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -926,6 +926,311 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantFieldPatternGlobInd EXPECT_EQ(it->second.index_meta->index_name(), "variant_field_pattern_glob_idx"); } +// E1: Match predicate whose left subtree contains no VSlotRef. +// find_slot_ref recurses through children; when it returns nullptr the +// collector reports INVERTED_INDEX_NOT_SUPPORTED. +TEST_F(CollectionStatisticsTest, CollectMissingSlotRefReturnsError) { + auto tablet_schema = std::make_shared(); + TabletColumn col; + col.set_unique_id(1001); + col.set_name("c"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto literal_left = std::make_shared("foo"); + auto literal_right = std::make_shared("bar"); + match_expr->_children.push_back(literal_left); + match_expr->_children.push_back(literal_right); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_FALSE(status.ok()); + EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); + EXPECT_TRUE(status.msg().find("Cannot find slot reference") != std::string::npos); +} + +// E2: SlotRef points to a slot_id absent from the runtime descriptor table. +TEST_F(CollectionStatisticsTest, CollectMissingSlotDescriptorReturnsError) { + auto tablet_schema = std::make_shared(); + TabletColumn col; + col.set_unique_id(1002); + col.set_name("c"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + constexpr int kAbsentSlotId = 99999; + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("c", SlotId(kAbsentSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_FALSE(status.ok()); + EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); + EXPECT_TRUE(status.msg().find("Cannot find slot descriptor") != std::string::npos); +} + +// E3: SlotRef name does not exist in tablet_schema (field_index returns -1). +TEST_F(CollectionStatisticsTest, CollectUnknownColumnNameReturnsError) { + auto tablet_schema = std::make_shared(); + TabletColumn col; + col.set_unique_id(1003); + col.set_name("declared"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + constexpr int kSlotId = 50; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), 1003, "missing", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("missing", SlotId(kSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_FALSE(status.ok()); + EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); + EXPECT_TRUE(status.msg().find("Cannot find column index") != std::string::npos); +} + +// I1 + L3 + O1: Plain string column with a direct inverted index. +// Direct hit produces a CollectInfo whose owned_index_meta is null +// (the meta lives in the schema and is not cloned). +TEST_F(CollectionStatisticsTest, CollectDirectIndexHitFromSchema) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1100; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("note"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2100); + index_pb.set_index_name("note_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; + (*props)["support_phrase"] = "true"; + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 60; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "note", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("note", SlotId(kSlotId)); + auto literal = std::make_shared("hello world"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); + ASSERT_NE(it, collect_infos.end()); + EXPECT_NE(it->second.index_meta, nullptr); + EXPECT_EQ(it->second.owned_index_meta, nullptr); // O1: schema-direct meta is not owned + EXPECT_FALSE(it->second.term_infos.empty()); +} + +// I2: Plain string column with no index and not an extracted variant +// sub-column. Fallback path does not apply (column.is_extracted_column() +// is false). In BE_TEST builds the empty-index check is skipped, so +// collect returns OK with no CollectInfo emitted. +TEST_F(CollectionStatisticsTest, CollectNotExtractedColumnSkipsFallback) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1200; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("plain"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + // no index appended + + constexpr int kSlotId = 70; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "plain", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("plain", SlotId(kSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + EXPECT_TRUE(collect_infos.empty()); +} + +// L1: Index whose properties do not request an analyzer +// (should_analyzer returns false). The matching index_meta is iterated +// but skipped before insertion. +TEST_F(CollectionStatisticsTest, CollectSkipsIndexWithoutAnalyzer) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1300; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("kw"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2300); + index_pb.set_index_name("kw_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + // No "parser" property -> should_analyzer returns false + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 80; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "kw", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("kw", SlotId(kSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + EXPECT_TRUE(collect_infos.empty()); +} + +// L2: Index whose analyzer is set (should_analyzer returns true) but does +// not declare "support_phrase=true". MockVExpr drives MATCH_PHRASE opcode, +// so is_need_similarity_score returns false and the index is skipped. +TEST_F(CollectionStatisticsTest, CollectSkipsIndexWithoutSimilarityScore) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1350; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("body"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2350); + index_pb.set_index_name("body_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; // should_analyzer == true + // Intentionally omit "support_phrase" -> is_need_similarity_score == false + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 85; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "body", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("body", SlotId(kSlotId)); + auto literal = std::make_shared("hello"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + EXPECT_TRUE(collect_infos.empty()); +} + +// L4: Two MATCH predicates on the same column produce CollectInfo entries +// keyed on the same field_name; the second insertion merges term_infos +// into the first entry. +TEST_F(CollectionStatisticsTest, CollectMergesTermsForSameFieldName) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1400; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("doc"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2400); + index_pb.set_index_name("doc_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; + (*props)["support_phrase"] = "true"; + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 90; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "doc", {}); + + auto build_match = [&](const std::string& term) { + auto m = std::make_shared(TExprNodeType::MATCH_PRED); + auto s = std::make_shared("doc", SlotId(kSlotId)); + auto l = std::make_shared(term); + m->_children.push_back(s); + m->_children.push_back(l); + return m; + }; + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(build_match("alpha"))); + contexts.push_back(std::make_shared(build_match("beta"))); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); + ASSERT_NE(it, collect_infos.end()); + EXPECT_GE(it->second.term_infos.size(), 2u); // both "alpha" and "beta" present +} + TEST(TermInfoComparerTest, OrdersByTermAndDedups) { using doris::TermInfoComparer; using doris::segment_v2::TermInfo; From 00738ddf5fd67b9ae2f86306d79bbb7eccd2b5cd Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 16:52:30 +0800 Subject: [PATCH 14/18] [fix](inverted index) clone parent plain index for VARIANT placeholder sub-column MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OlapScanner::_init_variant_columns materializes dynamic variant paths (e.g. v["key"]) as extracted sub-columns whose schema-level type is the OLAP_FIELD_TYPE_VARIANT placeholder before the segment's inferred type is resolved. When the parent variant column has a plain inverted index — for example one added by ALTER TABLE — neither resolution path the previous commit relied on succeeds: - inverted_indexs(column) consults _path_set_info_map.subcolumn_indexes, which inherit_column_attributes only populates for declared typed paths and field_pattern outputs. Plain parent indexes do not flow into it. - generate_sub_column_info matches MATCH_NAME / MATCH_NAME_GLOB templates only. The collector therefore raised INVERTED_INDEX_NOT_SUPPORTED for the test_bm25_score case `cast(v["key"] as string) match_phrase "abc"` against a parent variant with a plain idx_v inverted index. Add an inline branch (collector-side, no segment-side change) that, when the schema column is the VARIANT placeholder, clones the parent's non-field-pattern indexes with the variant path as suffix so segment-side BM25 statistics can be collected. field_pattern templates remain exclusively resolved through generate_sub_column_info — they are not inherited as plain indexes. Restore ExtractCollectInfoForVariantParentIndexWithoutTemplate to its original 1-entry assertion since this is exactly the case it was written to cover. --- be/src/storage/predicate_collector.cpp | 32 +++++++++++++++---- .../compaction/collection_statistics_test.cpp | 26 +++++++++------ 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index a5b74ff47ea0f7..e1157021c10f54 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -98,12 +98,20 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS std::vector> owned_index_metas; std::string index_suffix_path = column.suffix_path(); - // Schema-only fallback for variant sub-column field_pattern templates. - // Collector runs at tablet level without segment context, so it cannot do - // nested-group inference or inherit_index runtime-type dispatch — those - // need segment data_type and _subcolumns_meta_info. Plain inherited - // sub-column indexes are already resolved by inverted_indexs(column) above - // (populated by inherit_column_attributes during _init_variant_columns). + // Schema-only fallback for variant sub-columns. Collector runs at tablet + // level without segment context, so we cannot do nested-group inference + // or inherit_index runtime-type dispatch. Two paths cover what is + // resolvable from schema alone: + // 1. field_pattern templates (MATCH_NAME / MATCH_NAME_GLOB) via + // generate_sub_column_info. + // 2. Plain parent inverted index when the schema column is the dynamic + // path's VARIANT placeholder produced by _init_variant_columns. In + // that state inverted_indexs(column) misses because + // _path_set_info_map.subcolumn_indexes is only populated for typed + // paths / field_pattern outputs, not for plain parent indexes added + // by ALTER. Clone the parent's non-field-pattern indexes with the + // variant path as suffix so segment-side BM25 statistics can be + // collected. if (index_metas.empty() && column.is_extracted_column()) { TabletSchema::SubColumnInfo sub_column_info; const std::string relative_path = @@ -116,6 +124,18 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS index_metas.push_back(idx.get()); owned_index_metas.emplace_back(std::move(idx)); } + } else if (column.is_variant_type()) { + const auto parent_indexes = tablet_schema->inverted_indexs(column.parent_unique_id()); + for (const auto* index : parent_indexes) { + if (!index->field_pattern().empty()) { + continue; + } + auto index_ptr = std::make_shared(*index); + index_ptr->set_escaped_escaped_index_suffix_path( + column.path_info_ptr()->get_path()); + index_metas.push_back(index_ptr.get()); + owned_index_metas.emplace_back(std::move(index_ptr)); + } } } diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 831d307d1c0d23..52c9f3afa8fbd5 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -694,12 +694,13 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantSubcolumnIndex) { EXPECT_EQ(it->second.index_meta->index_name(), "variant_subcolumn_idx"); } -// Schema state where neither inverted_indexs(column) nor generate_sub_column_info -// resolves any index for the variant sub-column. Collector runs at tablet level -// and intentionally has no segment-side fallback (no inherit_index, no -// nested-group inference). In BE_TEST builds the empty-index check is bypassed, -// so collect() returns OK and emits no CollectInfo for this expression. -TEST_F(CollectionStatisticsTest, ExtractCollectInfoEmptyWhenSchemaResolvesNoIndex) { +// Regression for score on a dynamic variant sub-column inherited from a plain +// parent variant inverted index (no field_pattern template). Matches the +// scan-time schema shape: _init_variant_columns materializes the accessed +// path as an extracted VARIANT placeholder, so neither inverted_indexs(column) +// nor generate_sub_column_info resolves the parent index. Collector clones +// the parent's non-field-pattern indexes with the variant path as suffix. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutTemplate) { auto tablet_schema = std::make_shared(); constexpr int32_t kVariantUid = 9004; @@ -732,9 +733,9 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoEmptyWhenSchemaResolvesNoInde index.init_from_pb(index_pb); tablet_schema->append_index(std::move(index)); - // Pre-conditions for the schema-only resolution path: - // (1) column-aware lookup empty (no inheritance pre-populated) - // (2) generate_sub_column_info returns false (no field_pattern template) + // Pre-conditions: column-aware lookup is empty (no inheritance pre-populated) + // and generate_sub_column_info returns false (no field_pattern template). + // The collector must still resolve through the VARIANT-placeholder branch. ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); ASSERT_EQ(tablet_schema->inverted_indexs(kVariantUid).size(), 1u); TabletSchema::SubColumnInfo sub_column_info; @@ -761,7 +762,12 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoEmptyWhenSchemaResolvesNoInde auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, &collect_infos); ASSERT_TRUE(status.ok()) << status.msg(); - EXPECT_TRUE(collect_infos.empty()); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kVariantUid) + ".v.key")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + ASSERT_NE(it->second.owned_index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_parent_idx"); } namespace { From 2c160939784bde03a095d6c48bb3d59db42e9fe2 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 17:00:24 +0800 Subject: [PATCH 15/18] [chore](format) clang-format pass on PR #62992 changes No functional changes. Whitespace/comment-alignment fixes from build-support/clang-format.sh (clang-format v16) on the two files touched by this PR. --- be/src/storage/predicate_collector.cpp | 3 +-- .../compaction/collection_statistics_test.cpp | 12 +++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index e1157021c10f54..fa8fc0117ce34f 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -114,8 +114,7 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS // collected. if (index_metas.empty() && column.is_extracted_column()) { TabletSchema::SubColumnInfo sub_column_info; - const std::string relative_path = - column.path_info_ptr()->copy_pop_front().get_path(); + const std::string relative_path = column.path_info_ptr()->copy_pop_front().get_path(); if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(), relative_path, &sub_column_info) && !sub_column_info.indexes.empty()) { diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 52c9f3afa8fbd5..30b677c744ecbd 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -972,7 +972,8 @@ TEST_F(CollectionStatisticsTest, CollectMissingSlotDescriptorReturnsError) { constexpr int kAbsentSlotId = 99999; auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); - auto slot_ref = std::make_shared("c", SlotId(kAbsentSlotId)); + auto slot_ref = + std::make_shared("c", SlotId(kAbsentSlotId)); auto literal = std::make_shared("v"); match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); @@ -1001,7 +1002,8 @@ TEST_F(CollectionStatisticsTest, CollectUnknownColumnNameReturnsError) { runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), 1003, "missing", {}); auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); - auto slot_ref = std::make_shared("missing", SlotId(kSlotId)); + auto slot_ref = + std::make_shared("missing", SlotId(kSlotId)); auto literal = std::make_shared("v"); match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); @@ -1062,7 +1064,7 @@ TEST_F(CollectionStatisticsTest, CollectDirectIndexHitFromSchema) { auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); ASSERT_NE(it, collect_infos.end()); EXPECT_NE(it->second.index_meta, nullptr); - EXPECT_EQ(it->second.owned_index_meta, nullptr); // O1: schema-direct meta is not owned + EXPECT_EQ(it->second.owned_index_meta, nullptr); // O1: schema-direct meta is not owned EXPECT_FALSE(it->second.term_infos.empty()); } @@ -1161,7 +1163,7 @@ TEST_F(CollectionStatisticsTest, CollectSkipsIndexWithoutSimilarityScore) { index_pb.set_index_type(IndexType::INVERTED); index_pb.add_col_unique_id(kColUid); auto* props = index_pb.mutable_properties(); - (*props)["parser"] = "english"; // should_analyzer == true + (*props)["parser"] = "english"; // should_analyzer == true // Intentionally omit "support_phrase" -> is_need_similarity_score == false TabletIndex index; index.init_from_pb(index_pb); @@ -1234,7 +1236,7 @@ TEST_F(CollectionStatisticsTest, CollectMergesTermsForSameFieldName) { ASSERT_EQ(collect_infos.size(), 1u); auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); ASSERT_NE(it, collect_infos.end()); - EXPECT_GE(it->second.term_infos.size(), 2u); // both "alpha" and "beta" present + EXPECT_GE(it->second.term_infos.size(), 2u); // both "alpha" and "beta" present } TEST(TermInfoComparerTest, OrdersByTermAndDedups) { From 910092ce288f8f55366b5f98471790833ff38646 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Tue, 5 May 2026 20:26:52 +0800 Subject: [PATCH 16/18] [test](inverted index) add direct UTs for PredicateCollector helpers Coverage report on PR #62992 marked many lines in find_slot_ref and build_field_name as new+uncovered, even though end-to-end tests through extract_collect_info exercise them. The increment-coverage tool seems to under-attribute coverage for templated returns and for branches inside helper functions exercised indirectly. Add a small TestablePredicateCollector that exposes the protected helpers, plus 5 direct unit tests: - FindSlotRefHandlesNullExpr -> null shared_ptr early return - FindSlotRefRecursesIntoChildren -> non-CAST wrapper recursion - FindSlotRefReturnsNullForLeafNonSlot -> leaf non-slot fall-through - BuildFieldNameWithSuffix -> non-empty suffix path - BuildFieldNameWithoutSuffix -> empty suffix path All 32 CollectionStatisticsTest cases (27 existing + 5 new) pass locally. --- .../compaction/collection_statistics_test.cpp | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 30b677c744ecbd..8120888f61754e 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -1239,6 +1239,49 @@ TEST_F(CollectionStatisticsTest, CollectMergesTermsForSameFieldName) { EXPECT_GE(it->second.term_infos.size(), 2u); // both "alpha" and "beta" present } +// Test-only subclass that exposes the protected helpers of PredicateCollector. +class TestablePredicateCollector : public MatchPredicateCollector { +public: + using MatchPredicateCollector::build_field_name; + using MatchPredicateCollector::find_slot_ref; +}; + +// find_slot_ref: null shared_ptr returns nullptr (early-return branch). +TEST_F(CollectionStatisticsTest, FindSlotRefHandlesNullExpr) { + TestablePredicateCollector collector; + VExprSPtr null_expr; + EXPECT_EQ(collector.find_slot_ref(null_expr), nullptr); +} + +// find_slot_ref: when expr is a non-CAST wrapper containing a SLOT_REF in its +// children, the recursive descent finds the slot via the for-loop body. +TEST_F(CollectionStatisticsTest, FindSlotRefRecursesIntoChildren) { + TestablePredicateCollector collector; + auto wrapper = std::make_shared(TExprNodeType::FUNCTION_CALL); + auto slot_ref = std::make_shared("c", SlotId(99)); + wrapper->_children.push_back(slot_ref); + EXPECT_EQ(collector.find_slot_ref(wrapper), slot_ref.get()); +} + +// find_slot_ref: leaf non-slot (no children) returns nullptr after for-loop. +TEST_F(CollectionStatisticsTest, FindSlotRefReturnsNullForLeafNonSlot) { + TestablePredicateCollector collector; + auto literal = std::make_shared("x"); + EXPECT_EQ(collector.find_slot_ref(literal), nullptr); +} + +// build_field_name: non-empty suffix is appended with a dot separator. +TEST_F(CollectionStatisticsTest, BuildFieldNameWithSuffix) { + TestablePredicateCollector collector; + EXPECT_EQ(collector.build_field_name(42, "a.b"), "42.a.b"); +} + +// build_field_name: empty suffix returns just the unique id as string. +TEST_F(CollectionStatisticsTest, BuildFieldNameWithoutSuffix) { + TestablePredicateCollector collector; + EXPECT_EQ(collector.build_field_name(42, ""), "42"); +} + TEST(TermInfoComparerTest, OrdersByTermAndDedups) { using doris::TermInfoComparer; using doris::segment_v2::TermInfo; From ca04990c68e6da85c46799d3941a7ab55e1cee17 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 6 May 2026 10:02:33 +0800 Subject: [PATCH 17/18] [test](inverted index) call MatchPredicateCollector::collect() directly in branch tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous version of these branch tests dispatched through CollectionStatistics::extract_collect_info, which internally calls collect() via a unique_ptr::operator->() virtual dispatch. Some LLVM coverage tooling under-attributes line counts inside such indirectly- dispatched callees — for example the templated Status::Error<> returns at the E1/E2/E3 sites and the variant fallback block all showed count=0 in the increment_report even though the tests assertively confirm those branches were taken. Rewrite the eight Collect* branch tests (E1, E2, E3, I1+L3+O1, I2, L1, L2, L4) to instantiate MatchPredicateCollector directly and call collect() without going through extract_collect_info. The integration-flow tests (ExtractCollectInfoFor*) continue to exercise extract_collect_info as before, so the dispatch path is still covered. All 32 CollectionStatistics tests pass locally. --- .../compaction/collection_statistics_test.cpp | 72 ++++++++----------- 1 file changed, 30 insertions(+), 42 deletions(-) diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 8120888f61754e..2a27e8737de016 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -935,6 +935,8 @@ TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantFieldPatternGlobInd // E1: Match predicate whose left subtree contains no VSlotRef. // find_slot_ref recurses through children; when it returns nullptr the // collector reports INVERTED_INDEX_NOT_SUPPORTED. +// Calls MatchPredicateCollector::collect() directly so coverage attribution +// is not muddied by extract_collect_info's virtual-dispatch indirection. TEST_F(CollectionStatisticsTest, CollectMissingSlotRefReturnsError) { auto tablet_schema = std::make_shared(); TabletColumn col; @@ -949,12 +951,10 @@ TEST_F(CollectionStatisticsTest, CollectMissingSlotRefReturnsError) { match_expr->_children.push_back(literal_left); match_expr->_children.push_back(literal_right); - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(match_expr)); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); ASSERT_FALSE(status.ok()); EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); EXPECT_TRUE(status.msg().find("Cannot find slot reference") != std::string::npos); @@ -978,12 +978,10 @@ TEST_F(CollectionStatisticsTest, CollectMissingSlotDescriptorReturnsError) { match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(match_expr)); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); ASSERT_FALSE(status.ok()); EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); EXPECT_TRUE(status.msg().find("Cannot find slot descriptor") != std::string::npos); @@ -1008,12 +1006,10 @@ TEST_F(CollectionStatisticsTest, CollectUnknownColumnNameReturnsError) { match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(match_expr)); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); ASSERT_FALSE(status.ok()); EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); EXPECT_TRUE(status.msg().find("Cannot find column index") != std::string::npos); @@ -1053,12 +1049,10 @@ TEST_F(CollectionStatisticsTest, CollectDirectIndexHitFromSchema) { match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(match_expr)); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); ASSERT_TRUE(status.ok()) << status.msg(); ASSERT_EQ(collect_infos.size(), 1u); auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); @@ -1092,12 +1086,10 @@ TEST_F(CollectionStatisticsTest, CollectNotExtractedColumnSkipsFallback) { match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(match_expr)); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); ASSERT_TRUE(status.ok()) << status.msg(); EXPECT_TRUE(collect_infos.empty()); } @@ -1134,12 +1126,10 @@ TEST_F(CollectionStatisticsTest, CollectSkipsIndexWithoutAnalyzer) { match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(match_expr)); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); ASSERT_TRUE(status.ok()) << status.msg(); EXPECT_TRUE(collect_infos.empty()); } @@ -1178,12 +1168,10 @@ TEST_F(CollectionStatisticsTest, CollectSkipsIndexWithoutSimilarityScore) { match_expr->_children.push_back(slot_ref); match_expr->_children.push_back(literal); - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(match_expr)); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); ASSERT_TRUE(status.ok()) << status.msg(); EXPECT_TRUE(collect_infos.empty()); } @@ -1225,14 +1213,14 @@ TEST_F(CollectionStatisticsTest, CollectMergesTermsForSameFieldName) { return m; }; - VExprContextSPtrs contexts; - contexts.push_back(std::make_shared(build_match("alpha"))); - contexts.push_back(std::make_shared(build_match("beta"))); - + MatchPredicateCollector collector; std::unordered_map collect_infos; - auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, - &collect_infos); - ASSERT_TRUE(status.ok()) << status.msg(); + auto first = + collector.collect(runtime_state_.get(), tablet_schema, build_match("alpha"), &collect_infos); + ASSERT_TRUE(first.ok()) << first.msg(); + auto second = + collector.collect(runtime_state_.get(), tablet_schema, build_match("beta"), &collect_infos); + ASSERT_TRUE(second.ok()) << second.msg(); ASSERT_EQ(collect_infos.size(), 1u); auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); ASSERT_NE(it, collect_infos.end()); From 30605110b3aa5cefa9d6de2e1837a3d0e4df61a1 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 6 May 2026 12:15:37 +0800 Subject: [PATCH 18/18] [chore](format) clang-format pass on collector branch tests --- be/test/storage/compaction/collection_statistics_test.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 2a27e8737de016..b21c2264bd9516 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -1215,11 +1215,11 @@ TEST_F(CollectionStatisticsTest, CollectMergesTermsForSameFieldName) { MatchPredicateCollector collector; std::unordered_map collect_infos; - auto first = - collector.collect(runtime_state_.get(), tablet_schema, build_match("alpha"), &collect_infos); + auto first = collector.collect(runtime_state_.get(), tablet_schema, build_match("alpha"), + &collect_infos); ASSERT_TRUE(first.ok()) << first.msg(); - auto second = - collector.collect(runtime_state_.get(), tablet_schema, build_match("beta"), &collect_infos); + auto second = collector.collect(runtime_state_.get(), tablet_schema, build_match("beta"), + &collect_infos); ASSERT_TRUE(second.ok()) << second.msg(); ASSERT_EQ(collect_infos.size(), 1u); auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid)));