diff --git a/be/src/storage/predicate_collector.cpp b/be/src/storage/predicate_collector.cpp index 8e319ae329fd0e..fa8fc0117ce34f 100644 --- a/be/src/storage/predicate_collector.cpp +++ b/be/src/storage/predicate_collector.cpp @@ -19,6 +19,9 @@ #include +#include + +#include "exec/common/variant_util.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" #include "exprs/vliteral.h" @@ -91,7 +94,49 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS } const auto& column = tablet_schema->column(col_idx); - auto index_metas = tablet_schema->inverted_indexs(sd->col_unique_id(), column.suffix_path()); + auto index_metas = tablet_schema->inverted_indexs(column); + std::vector> owned_index_metas; + std::string index_suffix_path = column.suffix_path(); + + // Schema-only fallback for variant sub-columns. Collector runs at tablet + // level without segment context, so we cannot do nested-group inference + // or inherit_index runtime-type dispatch. Two paths cover what is + // resolvable from schema alone: + // 1. field_pattern templates (MATCH_NAME / MATCH_NAME_GLOB) via + // generate_sub_column_info. + // 2. Plain parent inverted index when the schema column is the dynamic + // path's VARIANT placeholder produced by _init_variant_columns. In + // that state inverted_indexs(column) misses because + // _path_set_info_map.subcolumn_indexes is only populated for typed + // paths / field_pattern outputs, not for plain parent indexes added + // by ALTER. Clone the parent's non-field-pattern indexes with the + // variant path as suffix so segment-side BM25 statistics can be + // collected. + if (index_metas.empty() && column.is_extracted_column()) { + TabletSchema::SubColumnInfo sub_column_info; + const std::string relative_path = column.path_info_ptr()->copy_pop_front().get_path(); + if (variant_util::generate_sub_column_info(*tablet_schema, column.parent_unique_id(), + relative_path, &sub_column_info) && + !sub_column_info.indexes.empty()) { + index_suffix_path = sub_column_info.column.suffix_path(); + for (auto& idx : sub_column_info.indexes) { + index_metas.push_back(idx.get()); + owned_index_metas.emplace_back(std::move(idx)); + } + } else if (column.is_variant_type()) { + const auto parent_indexes = tablet_schema->inverted_indexs(column.parent_unique_id()); + for (const auto* index : parent_indexes) { + if (!index->field_pattern().empty()) { + continue; + } + auto index_ptr = std::make_shared(*index); + index_ptr->set_escaped_escaped_index_suffix_path( + column.path_info_ptr()->get_path()); + index_metas.push_back(index_ptr.get()); + owned_index_metas.emplace_back(std::move(index_ptr)); + } + } + } #ifndef BE_TEST if (index_metas.empty()) { @@ -117,7 +162,7 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS index_meta->properties()); std::string field_name = - build_field_name(index_meta->col_unique_ids()[0], column.suffix_path()); + build_field_name(index_meta->col_unique_ids()[0], index_suffix_path); std::wstring ws_field_name = StringHelper::to_wstring(field_name); auto iter = collect_infos->find(ws_field_name); @@ -125,6 +170,12 @@ Status MatchPredicateCollector::collect(RuntimeState* state, const TabletSchemaS CollectInfo collect_info; collect_info.term_infos.insert(term_infos.begin(), term_infos.end()); collect_info.index_meta = index_meta; + for (const auto& owned_index_meta : owned_index_metas) { + if (owned_index_meta.get() == index_meta) { + collect_info.owned_index_meta = owned_index_meta; + break; + } + } (*collect_infos)[ws_field_name] = std::move(collect_info); } else { iter->second.term_infos.insert(term_infos.begin(), term_infos.end()); @@ -260,4 +311,4 @@ SearchPredicateCollector::ClauseTypeCategory SearchPredicateCollector::get_claus } } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/storage/predicate_collector.h b/be/src/storage/predicate_collector.h index aa5a49344b98c2..c96e0af9c45ed5 100644 --- a/be/src/storage/predicate_collector.h +++ b/be/src/storage/predicate_collector.h @@ -43,6 +43,7 @@ struct TermInfoComparer { struct CollectInfo { std::set term_infos; + std::shared_ptr owned_index_meta; const TabletIndex* index_meta = nullptr; }; using CollectInfoMap = std::unordered_map; diff --git a/be/test/storage/compaction/collection_statistics_test.cpp b/be/test/storage/compaction/collection_statistics_test.cpp index 92b1522f76738c..b21c2264bd9516 100644 --- a/be/test/storage/compaction/collection_statistics_test.cpp +++ b/be/test/storage/compaction/collection_statistics_test.cpp @@ -25,6 +25,8 @@ #include #include "common/exception.h" +#include "core/data_type/data_type_string.h" +#include "exec/common/variant_util.h" #include "exprs/vexpr.h" #include "exprs/vexpr_context.h" #include "exprs/vliteral.h" @@ -43,7 +45,11 @@ namespace collection_statistics { class MockVExpr : public VExpr { public: - MockVExpr(TExprNodeType::type node_type) : _mock_node_type(node_type) {} + MockVExpr(TExprNodeType::type node_type) : _mock_node_type(node_type) { + if (node_type == TExprNodeType::MATCH_PRED) { + _opcode = TExprOpcode::MATCH_PHRASE; + } + } TExprNodeType::type node_type() const override { return _mock_node_type; } @@ -100,6 +106,7 @@ class MockVLiteral : public VLiteral { MockVLiteral(const std::string& value) : _value(value) {} std::string value() const override { return _value; } + std::string value(const DataTypeSerDe::FormatOptions& options) const override { return _value; } const std::string& expr_name() const override { return _value; } std::string debug_string() const override { return "MockVLiteral: " + _value; } @@ -268,6 +275,7 @@ class CollectionStatisticsTest : public ::testing::Test { index._col_unique_ids.push_back(1); std::map properties; properties["parser"] = "standard"; + properties["support_phrase"] = "true"; index._properties = properties; tablet_schema->append_index(std::move(index)); @@ -614,6 +622,654 @@ TEST_F(CollectionStatisticsTest, CollectWithDoubleCastWrappedSlotRef) { EXPECT_TRUE(status.ok()) << status.msg(); } +// Regression for AIR-36: match score collection must resolve indexes for +// variant sub-columns whose indexes live in _path_set_info_map (typed paths or +// inherited sub-column indexes). The previous simple lookup using +// inverted_indexs(col_unique_id, suffix_path) missed those indexes. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantSubcolumnIndex) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9001; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("v"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("v.host"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("v.host"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + auto sub_index = std::make_shared(); + TabletIndexPB index_pb; + index_pb.set_index_id(2001); + index_pb.set_index_name("variant_subcolumn_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "standard"; + (*props)["support_phrase"] = "true"; + sub_index->init_from_pb(index_pb); + + TabletSchema::PathsSetInfo path_set_info; + TabletIndexes sub_indexes = {sub_index}; + path_set_info.subcolumn_indexes["host"] = sub_indexes; + std::unordered_map path_set_info_map; + path_set_info_map[kVariantUid] = std::move(path_set_info); + tablet_schema->set_path_set_info(std::move(path_set_info_map)); + + EXPECT_TRUE(tablet_schema->inverted_indexs(kVariantUid, "host").empty()); + + auto found = tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)); + ASSERT_EQ(found.size(), 1u); + EXPECT_EQ(found[0]->index_name(), "variant_subcolumn_idx"); + + constexpr int kSlotId = 42; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = + std::make_shared("v.host", SlotId(kSlotId)); + auto literal = std::make_shared("foo"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kVariantUid) + ".v.host")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_subcolumn_idx"); +} + +// Regression for score on a dynamic variant sub-column inherited from a plain +// parent variant inverted index (no field_pattern template). Matches the +// scan-time schema shape: _init_variant_columns materializes the accessed +// path as an extracted VARIANT placeholder, so neither inverted_indexs(column) +// nor generate_sub_column_info resolves the parent index. Collector clones +// the parent's non-field-pattern indexes with the variant path as suffix. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantParentIndexWithoutTemplate) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9004; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("v"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("v.key"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("v.key"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2004); + index_pb.set_index_name("variant_parent_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; + (*props)["support_phrase"] = "true"; + + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + // Pre-conditions: column-aware lookup is empty (no inheritance pre-populated) + // and generate_sub_column_info returns false (no field_pattern template). + // The collector must still resolve through the VARIANT-placeholder branch. + ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); + ASSERT_EQ(tablet_schema->inverted_indexs(kVariantUid).size(), 1u); + TabletSchema::SubColumnInfo sub_column_info; + ASSERT_FALSE(variant_util::generate_sub_column_info(*tablet_schema, kVariantUid, "key", + &sub_column_info)); + + constexpr int kSlotId = 45; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid, "v.key", + {"key"}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto cast_expr = std::make_shared(TExprNodeType::CAST_EXPR); + cast_expr->_data_type = std::make_shared(); + auto slot_ref = std::make_shared("v.key", SlotId(kSlotId)); + auto literal = std::make_shared("abc"); + cast_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(cast_expr); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kVariantUid) + ".v.key")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + ASSERT_NE(it->second.owned_index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_parent_idx"); +} + +namespace { + +// Build a sub-column template for the parent variant column. pattern_type has no +// public setter on TabletColumn, so construct through ColumnPB. +TabletColumn make_subcolumn_template(const std::string& pattern, PatternTypePB pattern_type) { + ColumnPB column_pb; + column_pb.set_unique_id(-1); + column_pb.set_name(pattern); + column_pb.set_type("STRING"); + column_pb.set_is_nullable(true); + column_pb.set_pattern_type(pattern_type); + + TabletColumn templ; + templ.init_from_pb(column_pb); + return templ; +} + +} // namespace + +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantFieldPatternIndex) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9002; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("meta"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + TabletColumn host_template = make_subcolumn_template("host", PatternTypePB::MATCH_NAME); + variant_col.add_sub_column(host_template); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("meta.host"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("meta.host"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2002); + index_pb.set_index_name("variant_field_pattern_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "standard"; + (*props)["support_phrase"] = "true"; + (*props)["field_pattern"] = "host"; + + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); + ASSERT_EQ(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "host").size(), 1u); + + constexpr int kSlotId = 43; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid, "meta.host", + {"host"}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = + std::make_shared("meta.host", SlotId(kSlotId)); + auto literal = std::make_shared("alpha"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find( + StringHelper::to_wstring(std::to_string(kVariantUid) + ".meta.host")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + ASSERT_NE(it->second.owned_index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_field_pattern_idx"); +} + +// Regression: field_pattern="user.*" is registered under the pattern string, +// while the query slot resolves to column_paths=["user", "name"]. The fallback +// must match the parent variant's sub-column template first, then use the +// matched pattern to fetch the index, and collect under the actual Lucene field. +TEST_F(CollectionStatisticsTest, ExtractCollectInfoForVariantFieldPatternGlobIndex) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kVariantUid = 9003; + + TabletColumn variant_col; + variant_col.set_unique_id(kVariantUid); + variant_col.set_name("meta"); + variant_col.set_type(FieldType::OLAP_FIELD_TYPE_VARIANT); + TabletColumn glob_template = make_subcolumn_template("user.*", PatternTypePB::MATCH_NAME_GLOB); + variant_col.add_sub_column(glob_template); + tablet_schema->append_column(variant_col); + + TabletColumn sub_col; + sub_col.set_unique_id(-1); + sub_col.set_name("meta.user.name"); + sub_col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + sub_col.set_parent_unique_id(kVariantUid); + PathInData path("meta.user.name"); + sub_col.set_path_info(path); + tablet_schema->append_column(sub_col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2003); + index_pb.set_index_name("variant_field_pattern_glob_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kVariantUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "standard"; + (*props)["support_phrase"] = "true"; + (*props)["field_pattern"] = "user.*"; + + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + ASSERT_TRUE(tablet_schema->inverted_indexs(tablet_schema->column(/*ordinal=*/1)).empty()); + ASSERT_TRUE(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "user.name").empty()); + ASSERT_EQ(tablet_schema->inverted_index_by_field_pattern(kVariantUid, "user.*").size(), 1u); + TabletSchema::SubColumnInfo sub_column_info; + ASSERT_TRUE(variant_util::generate_sub_column_info(*tablet_schema, kVariantUid, "user.name", + &sub_column_info)); + ASSERT_EQ(sub_column_info.indexes.size(), 1u); + EXPECT_EQ(sub_column_info.column.suffix_path(), "meta.user.name"); + EXPECT_EQ(sub_column_info.indexes[0]->index_name(), "variant_field_pattern_glob_idx"); + + constexpr int kSlotId = 44; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kVariantUid, + "meta.user.name", {"user", "name"}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("meta.user.name", + SlotId(kSlotId)); + auto literal = std::make_shared("alice"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + VExprContextSPtrs contexts; + contexts.push_back(std::make_shared(match_expr)); + + std::unordered_map collect_infos; + auto status = stats_->extract_collect_info(runtime_state_.get(), contexts, tablet_schema, + &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find( + StringHelper::to_wstring(std::to_string(kVariantUid) + ".meta.user.name")); + ASSERT_NE(it, collect_infos.end()); + ASSERT_NE(it->second.index_meta, nullptr); + ASSERT_NE(it->second.owned_index_meta, nullptr); + EXPECT_EQ(it->second.index_meta->index_name(), "variant_field_pattern_glob_idx"); +} + +// E1: Match predicate whose left subtree contains no VSlotRef. +// find_slot_ref recurses through children; when it returns nullptr the +// collector reports INVERTED_INDEX_NOT_SUPPORTED. +// Calls MatchPredicateCollector::collect() directly so coverage attribution +// is not muddied by extract_collect_info's virtual-dispatch indirection. +TEST_F(CollectionStatisticsTest, CollectMissingSlotRefReturnsError) { + auto tablet_schema = std::make_shared(); + TabletColumn col; + col.set_unique_id(1001); + col.set_name("c"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto literal_left = std::make_shared("foo"); + auto literal_right = std::make_shared("bar"); + match_expr->_children.push_back(literal_left); + match_expr->_children.push_back(literal_right); + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); + ASSERT_FALSE(status.ok()); + EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); + EXPECT_TRUE(status.msg().find("Cannot find slot reference") != std::string::npos); +} + +// E2: SlotRef points to a slot_id absent from the runtime descriptor table. +TEST_F(CollectionStatisticsTest, CollectMissingSlotDescriptorReturnsError) { + auto tablet_schema = std::make_shared(); + TabletColumn col; + col.set_unique_id(1002); + col.set_name("c"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + constexpr int kAbsentSlotId = 99999; + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = + std::make_shared("c", SlotId(kAbsentSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); + ASSERT_FALSE(status.ok()); + EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); + EXPECT_TRUE(status.msg().find("Cannot find slot descriptor") != std::string::npos); +} + +// E3: SlotRef name does not exist in tablet_schema (field_index returns -1). +TEST_F(CollectionStatisticsTest, CollectUnknownColumnNameReturnsError) { + auto tablet_schema = std::make_shared(); + TabletColumn col; + col.set_unique_id(1003); + col.set_name("declared"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + constexpr int kSlotId = 50; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), 1003, "missing", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = + std::make_shared("missing", SlotId(kSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); + ASSERT_FALSE(status.ok()); + EXPECT_EQ(status.code(), ErrorCode::INVERTED_INDEX_NOT_SUPPORTED); + EXPECT_TRUE(status.msg().find("Cannot find column index") != std::string::npos); +} + +// I1 + L3 + O1: Plain string column with a direct inverted index. +// Direct hit produces a CollectInfo whose owned_index_meta is null +// (the meta lives in the schema and is not cloned). +TEST_F(CollectionStatisticsTest, CollectDirectIndexHitFromSchema) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1100; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("note"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2100); + index_pb.set_index_name("note_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; + (*props)["support_phrase"] = "true"; + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 60; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "note", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("note", SlotId(kSlotId)); + auto literal = std::make_shared("hello world"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); + ASSERT_NE(it, collect_infos.end()); + EXPECT_NE(it->second.index_meta, nullptr); + EXPECT_EQ(it->second.owned_index_meta, nullptr); // O1: schema-direct meta is not owned + EXPECT_FALSE(it->second.term_infos.empty()); +} + +// I2: Plain string column with no index and not an extracted variant +// sub-column. Fallback path does not apply (column.is_extracted_column() +// is false). In BE_TEST builds the empty-index check is skipped, so +// collect returns OK with no CollectInfo emitted. +TEST_F(CollectionStatisticsTest, CollectNotExtractedColumnSkipsFallback) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1200; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("plain"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + // no index appended + + constexpr int kSlotId = 70; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "plain", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("plain", SlotId(kSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + EXPECT_TRUE(collect_infos.empty()); +} + +// L1: Index whose properties do not request an analyzer +// (should_analyzer returns false). The matching index_meta is iterated +// but skipped before insertion. +TEST_F(CollectionStatisticsTest, CollectSkipsIndexWithoutAnalyzer) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1300; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("kw"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2300); + index_pb.set_index_name("kw_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + // No "parser" property -> should_analyzer returns false + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 80; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "kw", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("kw", SlotId(kSlotId)); + auto literal = std::make_shared("v"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + EXPECT_TRUE(collect_infos.empty()); +} + +// L2: Index whose analyzer is set (should_analyzer returns true) but does +// not declare "support_phrase=true". MockVExpr drives MATCH_PHRASE opcode, +// so is_need_similarity_score returns false and the index is skipped. +TEST_F(CollectionStatisticsTest, CollectSkipsIndexWithoutSimilarityScore) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1350; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("body"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2350); + index_pb.set_index_name("body_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; // should_analyzer == true + // Intentionally omit "support_phrase" -> is_need_similarity_score == false + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 85; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "body", {}); + + auto match_expr = std::make_shared(TExprNodeType::MATCH_PRED); + auto slot_ref = std::make_shared("body", SlotId(kSlotId)); + auto literal = std::make_shared("hello"); + match_expr->_children.push_back(slot_ref); + match_expr->_children.push_back(literal); + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto status = + collector.collect(runtime_state_.get(), tablet_schema, match_expr, &collect_infos); + ASSERT_TRUE(status.ok()) << status.msg(); + EXPECT_TRUE(collect_infos.empty()); +} + +// L4: Two MATCH predicates on the same column produce CollectInfo entries +// keyed on the same field_name; the second insertion merges term_infos +// into the first entry. +TEST_F(CollectionStatisticsTest, CollectMergesTermsForSameFieldName) { + auto tablet_schema = std::make_shared(); + + constexpr int32_t kColUid = 1400; + TabletColumn col; + col.set_unique_id(kColUid); + col.set_name("doc"); + col.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + tablet_schema->append_column(col); + + TabletIndexPB index_pb; + index_pb.set_index_id(2400); + index_pb.set_index_name("doc_idx"); + index_pb.set_index_type(IndexType::INVERTED); + index_pb.add_col_unique_id(kColUid); + auto* props = index_pb.mutable_properties(); + (*props)["parser"] = "english"; + (*props)["support_phrase"] = "true"; + TabletIndex index; + index.init_from_pb(index_pb); + tablet_schema->append_index(std::move(index)); + + constexpr int kSlotId = 90; + runtime_state_->_mock_desc_tbl->add_slot_descriptor(SlotId(kSlotId), kColUid, "doc", {}); + + auto build_match = [&](const std::string& term) { + auto m = std::make_shared(TExprNodeType::MATCH_PRED); + auto s = std::make_shared("doc", SlotId(kSlotId)); + auto l = std::make_shared(term); + m->_children.push_back(s); + m->_children.push_back(l); + return m; + }; + + MatchPredicateCollector collector; + std::unordered_map collect_infos; + auto first = collector.collect(runtime_state_.get(), tablet_schema, build_match("alpha"), + &collect_infos); + ASSERT_TRUE(first.ok()) << first.msg(); + auto second = collector.collect(runtime_state_.get(), tablet_schema, build_match("beta"), + &collect_infos); + ASSERT_TRUE(second.ok()) << second.msg(); + ASSERT_EQ(collect_infos.size(), 1u); + auto it = collect_infos.find(StringHelper::to_wstring(std::to_string(kColUid))); + ASSERT_NE(it, collect_infos.end()); + EXPECT_GE(it->second.term_infos.size(), 2u); // both "alpha" and "beta" present +} + +// Test-only subclass that exposes the protected helpers of PredicateCollector. +class TestablePredicateCollector : public MatchPredicateCollector { +public: + using MatchPredicateCollector::build_field_name; + using MatchPredicateCollector::find_slot_ref; +}; + +// find_slot_ref: null shared_ptr returns nullptr (early-return branch). +TEST_F(CollectionStatisticsTest, FindSlotRefHandlesNullExpr) { + TestablePredicateCollector collector; + VExprSPtr null_expr; + EXPECT_EQ(collector.find_slot_ref(null_expr), nullptr); +} + +// find_slot_ref: when expr is a non-CAST wrapper containing a SLOT_REF in its +// children, the recursive descent finds the slot via the for-loop body. +TEST_F(CollectionStatisticsTest, FindSlotRefRecursesIntoChildren) { + TestablePredicateCollector collector; + auto wrapper = std::make_shared(TExprNodeType::FUNCTION_CALL); + auto slot_ref = std::make_shared("c", SlotId(99)); + wrapper->_children.push_back(slot_ref); + EXPECT_EQ(collector.find_slot_ref(wrapper), slot_ref.get()); +} + +// find_slot_ref: leaf non-slot (no children) returns nullptr after for-loop. +TEST_F(CollectionStatisticsTest, FindSlotRefReturnsNullForLeafNonSlot) { + TestablePredicateCollector collector; + auto literal = std::make_shared("x"); + EXPECT_EQ(collector.find_slot_ref(literal), nullptr); +} + +// build_field_name: non-empty suffix is appended with a dot separator. +TEST_F(CollectionStatisticsTest, BuildFieldNameWithSuffix) { + TestablePredicateCollector collector; + EXPECT_EQ(collector.build_field_name(42, "a.b"), "42.a.b"); +} + +// build_field_name: empty suffix returns just the unique id as string. +TEST_F(CollectionStatisticsTest, BuildFieldNameWithoutSuffix) { + TestablePredicateCollector collector; + EXPECT_EQ(collector.build_field_name(42, ""), "42"); +} + TEST(TermInfoComparerTest, OrdersByTermAndDedups) { using doris::TermInfoComparer; using doris::segment_v2::TermInfo; @@ -651,4 +1307,4 @@ TEST(TermInfoComparerTest, OrdersByTermAndDedups) { EXPECT_THAT(ordered, ::testing::ElementsAre("apple", "banana", "cherry")); } -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/test/testutil/mock/mock_descriptors.h b/be/test/testutil/mock/mock_descriptors.h index 4fec22bf7a11c0..cb8833cf8d8f1e 100644 --- a/be/test/testutil/mock/mock_descriptors.h +++ b/be/test/testutil/mock/mock_descriptors.h @@ -20,6 +20,8 @@ #include #include +#include +#include #include #include "core/data_type/data_type.h" @@ -106,13 +108,41 @@ class MockDescriptorTbl1 : public DescriptorTbl { _slot_descriptors[slot_id] = std::move(slot_desc); } + void add_slot_descriptor(SlotId slot_id, int32_t col_unique_id, const std::string& col_name, + const std::vector& column_paths) { + TTypeNode type_node; + type_node.__set_type(TTypeNodeType::SCALAR); + TScalarType scalar_type; + scalar_type.__set_type(TPrimitiveType::STRING); + type_node.__set_scalar_type(scalar_type); + TTypeDesc type_desc; + type_desc.types.push_back(type_node); + + TSlotDescriptor slot_desc; + slot_desc.__set_id(slot_id); + slot_desc.__set_parent(0); + slot_desc.__set_slotType(type_desc); + slot_desc.__set_columnPos(0); + slot_desc.__set_byteOffset(0); + slot_desc.__set_nullIndicatorByte(0); + slot_desc.__set_nullIndicatorBit(-1); + slot_desc.__set_colName(col_name); + slot_desc.__set_slotIdx(0); + slot_desc.__set_isMaterialized(true); + slot_desc.__set_col_unique_id(col_unique_id); + slot_desc.__set_is_key(false); + slot_desc.__set_column_paths(column_paths); + slot_desc.__set_primitive_type(TPrimitiveType::STRING); + _slot_descriptors[slot_id] = std::make_unique(slot_desc); + } + SlotDescriptor* get_slot_descriptor(SlotId id) const override { auto it = _slot_descriptors.find(id); return it != _slot_descriptors.end() ? it->second.get() : nullptr; } private: - mutable std::unordered_map> _slot_descriptors; + mutable std::unordered_map> _slot_descriptors; }; } // namespace doris \ No newline at end of file diff --git a/regression-test/suites/inverted_index_p0/test_bm25_score.groovy b/regression-test/suites/inverted_index_p0/test_bm25_score.groovy index 2686011e89e3b2..3a8ad125dc5076 100644 --- a/regression-test/suites/inverted_index_p0/test_bm25_score.groovy +++ b/regression-test/suites/inverted_index_p0/test_bm25_score.groovy @@ -226,6 +226,53 @@ suite("test_bm25_score", "p0") { } finally { } + try { + sql """ set enable_common_expr_pushdown = true; """ + sql """ set enable_match_without_inverted_index = false; """ + sql """ set default_variant_enable_typed_paths_to_sparse = false; """ + sql """ set default_variant_enable_doc_mode = false; """ + + sql "DROP TABLE IF EXISTS test_variant_field_pattern_score" + sql """ + CREATE TABLE test_variant_field_pattern_score ( + id INT, + meta VARIANT, + INDEX idx_meta_user(meta) USING INVERTED PROPERTIES( + "parser"="english", + "support_phrase"="true", + "field_pattern"="user.*" + ) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + + sql """ insert into test_variant_field_pattern_score values(3, '{"other": "alice"}'); """ + sql """ sync """ + sql """ + insert into test_variant_field_pattern_score values + (1, '{"user": {"name": "alice alpha"}}'), + (2, '{"user": {"name": "bob beta"}}'); + """ + sql """ sync """ + + def res = sql """ + select id, score() as score + from test_variant_field_pattern_score + where cast(meta["user"]["name"] as string) match_phrase "alice" + order by score() desc + limit 10; + """ + assertEquals(1, res.size()) + assertEquals(1, res[0][0] as int) + assertTrue(Double.parseDouble(res[0][1].toString()) > 0.0) + } finally { + } + try { sql "DROP TABLE IF EXISTS t2" sql """ create table t2(a int, b int, s text) unique key(a) DISTRIBUTED BY HASH(a) buckets 1 PROPERTIES ("replication_allocation" = "tag.location.default: 1"); """ @@ -247,4 +294,4 @@ suite("test_bm25_score", "p0") { } finally { } } -} \ No newline at end of file +} diff --git a/regression-test/suites/inverted_index_p0/test_bm25_score_variant.groovy b/regression-test/suites/inverted_index_p0/test_bm25_score_variant.groovy new file mode 100644 index 00000000000000..885d311bdfc01a --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_bm25_score_variant.groovy @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_bm25_score_variant", "p0") { + if (isCloudMode()) { + return + } + + sql """ set enable_common_expr_pushdown = true """ + sql """ set enable_match_without_inverted_index = false """ + sql """ set default_variant_enable_typed_paths_to_sparse = false """ + sql """ set default_variant_enable_doc_mode = false """ + + // A1: field_pattern exact name (MATCH_NAME) + try { + sql "DROP TABLE IF EXISTS test_bm25_score_variant_a1" + sql """ + CREATE TABLE test_bm25_score_variant_a1 ( + id INT, + v variant< + MATCH_NAME 'host' : text, + PROPERTIES("variant_max_subcolumns_count"="0") + >, + INDEX idx_v_host (v) USING INVERTED PROPERTIES( + "parser"="english", + "support_phrase"="true", + "field_pattern"="host" + ) + ) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + sql """ insert into test_bm25_score_variant_a1 values + (1, '{"host":"alpha database server"}'), + (2, '{"host":"beta server cluster"}'), + (3, '{"other":"alpha"}') + """ + sql " sync " + + def res = sql """ + select id, score() as score + from test_bm25_score_variant_a1 + where cast(v["host"] as string) match_phrase "alpha" + order by score() desc + limit 10 + """ + assertEquals(1, res.size()) + assertEquals(1, res[0][0] as int) + assertTrue(Double.parseDouble(res[0][1].toString()) > 0.0) + } finally { + } + + // C: plain parent inverted index (baseline; not the fallback path) + try { + sql "DROP TABLE IF EXISTS test_bm25_score_variant_c" + sql """ + CREATE TABLE test_bm25_score_variant_c ( + id INT, + v VARIANT, + INDEX idx_v_plain (v) USING INVERTED PROPERTIES( + "parser"="english", + "support_phrase"="true" + ) + ) ENGINE=OLAP DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ) + """ + sql """ insert into test_bm25_score_variant_c values + (1, '{"note":"latency spike at noon"}'), + (2, '{"note":"all green"}') + """ + sql " sync " + + def res = sql """ + select id, score() as score + from test_bm25_score_variant_c + where cast(v["note"] as string) match_phrase "latency" + order by score() desc + limit 10 + """ + assertEquals(1, res.size()) + assertEquals(1, res[0][0] as int) + assertTrue(Double.parseDouble(res[0][1].toString()) > 0.0) + } finally { + } +}