diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp index b502704e843923..501ab66322fff5 100644 --- a/be/src/cloud/config.cpp +++ b/be/src/cloud/config.cpp @@ -131,6 +131,7 @@ DEFINE_mBool(enable_warmup_immediately_on_new_rowset, "false"); // Packed file manager config DEFINE_mBool(enable_packed_file, "true"); +DEFINE_mBool(enable_file_cache_write_index_file_only, "false"); DEFINE_mInt64(packed_file_size_threshold_bytes, "5242880"); // 5MB DEFINE_mInt64(packed_file_time_threshold_ms, "100"); // 100ms DEFINE_mInt64(packed_file_try_lock_timeout_ms, "5"); // 5ms diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h index c52e09bc0987a3..c3a95de70afe85 100644 --- a/be/src/cloud/config.h +++ b/be/src/cloud/config.h @@ -175,6 +175,7 @@ DECLARE_mBool(enable_warmup_immediately_on_new_rowset); // Packed file manager config DECLARE_mBool(enable_packed_file); +DECLARE_mBool(enable_file_cache_write_index_file_only); DECLARE_mInt64(packed_file_size_threshold_bytes); DECLARE_mInt64(packed_file_time_threshold_ms); DECLARE_mInt64(packed_file_try_lock_timeout_ms); diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index b521fa57ec3dfd..4cbc5399faeee0 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1203,6 +1203,8 @@ DEFINE_Int64(file_cache_each_block_size, "1048576"); // 1MB DEFINE_Bool(clear_file_cache, "false"); DEFINE_mBool(enable_file_cache_query_limit, "false"); +// Whether segment footer and segment metadata count toward file cache query limit. 
+DEFINE_mBool(file_cache_query_limit_segment_meta, "false"); DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "90"); DEFINE_mInt32(file_cache_exit_disk_resource_limit_mode_percent, "88"); DEFINE_mBool(enable_evict_file_cache_in_advance, "true"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 0b415ed5d2c4ae..df4f11ceea6d8f 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1245,6 +1245,7 @@ DECLARE_String(file_cache_path); DECLARE_Int64(file_cache_each_block_size); DECLARE_Bool(clear_file_cache); DECLARE_mBool(enable_file_cache_query_limit); +DECLARE_mBool(file_cache_query_limit_segment_meta); DECLARE_Int32(file_cache_enter_disk_resource_limit_mode_percent); DECLARE_Int32(file_cache_exit_disk_resource_limit_mode_percent); DECLARE_mBool(enable_evict_file_cache_in_advance); diff --git a/be/src/exec/operator/materialization_opertor.cpp b/be/src/exec/operator/materialization_opertor.cpp index 7ceeae0f261f23..1a475af0bccfa6 100644 --- a/be/src/exec/operator/materialization_opertor.cpp +++ b/be/src/exec/operator/materialization_opertor.cpp @@ -21,8 +21,12 @@ #include #include +#include +#include #include +#include "cloud/config.h" +#include "common/config.h" #include "common/status.h" #include "core/block/block.h" #include "core/column/column.h" @@ -31,9 +35,107 @@ #include "exec/scan/file_scanner.h" #include "util/brpc_client_cache.h" #include "util/brpc_closure.h" +#include "util/pretty_printer.h" namespace doris { +namespace { + +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND = + "TopNLazyMaterializationSecondPhasePerBackend"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_ROWS_READ = + "TopNLazyMaterializationSecondPhasePerBackendRowsRead"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_SEGMENTS_READ = + "TopNLazyMaterializationSecondPhasePerBackendSegmentsRead"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_LOCAL_IO_COUNT = + 
"TopNLazyMaterializationSecondPhasePerBackendLocalIOCount"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_LOCAL_IO_BYTES = + "TopNLazyMaterializationSecondPhasePerBackendLocalIOBytes"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_REMOTE_IO_COUNT = + "TopNLazyMaterializationSecondPhasePerBackendRemoteIOCount"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_REMOTE_IO_BYTES = + "TopNLazyMaterializationSecondPhasePerBackendRemoteIOBytes"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_SKIP_CACHE_IO_COUNT = + "TopNLazyMaterializationSecondPhasePerBackendSkipCacheIOCount"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_WRITE_CACHE_BYTES = + "TopNLazyMaterializationSecondPhasePerBackendWriteCacheBytes"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_LOCAL_IO_TIME = + "TopNLazyMaterializationSecondPhasePerBackendLocalIOTime"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_REMOTE_IO_TIME = + "TopNLazyMaterializationSecondPhasePerBackendRemoteIOTime"; +constexpr const char* TOPN_LAZY_MAT_PHASE2_PER_BACKEND_WRITE_CACHE_IO_TIME = + "TopNLazyMaterializationSecondPhasePerBackendWriteCacheIOTime"; + +void update_counter(RuntimeProfile* profile, const std::string& name, TUnit::type unit, + int64_t value) { + COUNTER_UPDATE(ADD_COUNTER_WITH_LEVEL(profile, name, unit, 2), value); +} + +void update_topn_lazy_materialization_profile(RuntimeProfile* profile, + const PTopNLazyMaterializationFileCacheStats& stats) { + if (profile == nullptr) { + return; + } + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseLocalIOCount, + TUnit::UNIT, stats.local_io_count()); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseLocalIOBytes, + TUnit::BYTES, stats.local_io_bytes()); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseRemoteIOCount, + TUnit::UNIT, stats.remote_io_count()); + update_counter(profile, 
RowIdStorageReader::TopNLazyMaterializationSecondPhaseRemoteIOBytes, + TUnit::BYTES, stats.remote_io_bytes()); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseSkipCacheIOCount, + TUnit::UNIT, stats.skip_cache_io_count()); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseWriteCacheBytes, + TUnit::BYTES, stats.write_cache_bytes()); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseLocalIOTime, + TUnit::TIME_NS, stats.local_io_time()); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseRemoteIOTime, + TUnit::TIME_NS, stats.remote_io_time()); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseWriteCacheIOTime, + TUnit::TIME_NS, stats.write_cache_io_time()); +} + +int64_t count_request_rows(const PMultiGetRequestV2& request) { + int64_t rows = 0; + for (const auto& request_block_desc : request.request_block_descs()) { + rows += request_block_desc.row_id_size(); + } + return rows; +} + +int64_t count_request_segments(const PMultiGetRequestV2& request) { + std::set file_ids; + for (const auto& request_block_desc : request.request_block_descs()) { + DCHECK_EQ(request_block_desc.file_id_size(), request_block_desc.row_id_size()); + for (const auto file_id : request_block_desc.file_id()) { + file_ids.insert(file_id); + } + } + return file_ids.size(); +} + +template +std::string format_array(size_t size, AppendValue append_value) { + std::stringstream values; + values << "["; + for (size_t i = 0; i < size; ++i) { + append_value(values, i); + values << ", "; + } + values << "]"; + return values.str(); +} + +template +std::string format_counter_array(size_t size, TUnit::type unit, GetValue get_value) { + return format_array(size, [&](std::stringstream& values, size_t i) { + values << PrettyPrinter::print(static_cast(get_value(i)), unit); + }); +} + +} // namespace + void MaterializationSharedState::get_block(Block* block) { for 
(int i = 0, j = 0, rowid_to_block_loc = rowid_locs[j]; i < origin_block.columns(); i++) { if (i != rowid_to_block_loc) { @@ -53,6 +155,98 @@ void MaterializationSharedState::get_block(Block* block) { origin_block.clear(); } +void MaterializationSharedState::_update_topn_lazy_materialization_profile( + RuntimeProfile* profile) { + DORIS_CHECK(profile != nullptr); + for (const auto& [backend_id, rpc_struct] : rpc_struct_map) { + const int64_t rows_read = count_request_rows(rpc_struct.request); + const int64_t segments_read = count_request_segments(rpc_struct.request); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseRowsRead, + TUnit::UNIT, rows_read); + update_counter(profile, RowIdStorageReader::TopNLazyMaterializationSecondPhaseSegmentsRead, + TUnit::UNIT, segments_read); + + auto& stats = _topn_lazy_materialization_backend_stats[backend_id]; + if (stats.backend.empty()) { + stats.backend = rpc_struct.backend_address.empty() ? fmt::format("id={}", backend_id) + : rpc_struct.backend_address; + } + stats.rows_read += rows_read; + stats.segments_read += segments_read; + if (!rpc_struct.response.has_topn_lazy_materialization_file_cache_stats()) { + continue; + } + + const auto& file_cache_stats = + rpc_struct.response.topn_lazy_materialization_file_cache_stats(); + update_topn_lazy_materialization_profile(profile, file_cache_stats); + stats.local_io_count += file_cache_stats.local_io_count(); + stats.local_io_bytes += file_cache_stats.local_io_bytes(); + stats.remote_io_count += file_cache_stats.remote_io_count(); + stats.remote_io_bytes += file_cache_stats.remote_io_bytes(); + stats.skip_cache_io_count += file_cache_stats.skip_cache_io_count(); + stats.write_cache_bytes += file_cache_stats.write_cache_bytes(); + stats.local_io_time += file_cache_stats.local_io_time(); + stats.remote_io_time += file_cache_stats.remote_io_time(); + stats.write_cache_io_time += file_cache_stats.write_cache_io_time(); + } + + std::vector stats; + 
stats.reserve(_topn_lazy_materialization_backend_stats.size()); + for (const auto& [_, backend_stats] : _topn_lazy_materialization_backend_stats) { + stats.push_back(&backend_stats); + } + + const size_t size = stats.size(); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND, + format_array(size, [&](std::stringstream& values, size_t i) { + values << stats[i]->backend; + })); + profile->add_info_string( + TOPN_LAZY_MAT_PHASE2_PER_BACKEND_ROWS_READ, + format_counter_array(size, TUnit::UNIT, [&](size_t i) { return stats[i]->rows_read; })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_SEGMENTS_READ, + format_counter_array(size, TUnit::UNIT, [&](size_t i) { + return stats[i]->segments_read; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_LOCAL_IO_COUNT, + format_counter_array(size, TUnit::UNIT, [&](size_t i) { + return stats[i]->local_io_count; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_LOCAL_IO_BYTES, + format_counter_array(size, TUnit::BYTES, [&](size_t i) { + return stats[i]->local_io_bytes; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_REMOTE_IO_COUNT, + format_counter_array(size, TUnit::UNIT, [&](size_t i) { + return stats[i]->remote_io_count; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_REMOTE_IO_BYTES, + format_counter_array(size, TUnit::BYTES, [&](size_t i) { + return stats[i]->remote_io_bytes; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_SKIP_CACHE_IO_COUNT, + format_counter_array(size, TUnit::UNIT, [&](size_t i) { + return stats[i]->skip_cache_io_count; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_WRITE_CACHE_BYTES, + format_counter_array(size, TUnit::BYTES, [&](size_t i) { + return stats[i]->write_cache_bytes; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_LOCAL_IO_TIME, + format_counter_array(size, TUnit::TIME_NS, [&](size_t i) { + return stats[i]->local_io_time; + })); + 
profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_REMOTE_IO_TIME, + format_counter_array(size, TUnit::TIME_NS, [&](size_t i) { + return stats[i]->remote_io_time; + })); + profile->add_info_string(TOPN_LAZY_MAT_PHASE2_PER_BACKEND_WRITE_CACHE_IO_TIME, + format_counter_array(size, TUnit::TIME_NS, [&](size_t i) { + return stats[i]->write_cache_io_time; + })); +} + // Merges RPC responses from multiple BEs into `response_blocks` in the original row order. // // After parallel multiget_data_v2 RPCs complete, each BE's response contains a partial block @@ -63,7 +257,9 @@ void MaterializationSharedState::get_block(Block* block) { // rpc_struct_map[backend_id].response (per-BE partial blocks, unordered across BEs) // + block_order_results[i][j] (maps each output row → its source backend_id) // → response_blocks[i] (final merged result in original TopN row order) -Status MaterializationSharedState::merge_multi_response() { +Status MaterializationSharedState::merge_multi_response(RuntimeProfile* profile) { + _update_topn_lazy_materialization_profile(profile); + // Outer loop: iterate over each relation (i.e., each rowid column / table). // A query with lazy materialization on 2 tables would have block_order_results.size() == 2, // each with its own set of response_blocks and RPC request_block_descs. 
@@ -265,6 +461,9 @@ Status MaterializationSharedState::init_multi_requests( // Initialize the base struct of PMultiGetRequestV2 multi_get_request.set_be_exec_version(state->be_exec_version()); multi_get_request.set_wg_id(state->get_query_ctx()->workload_group()->id()); + multi_get_request.set_file_cache_remote_only_on_miss( + config::is_cloud_mode() && + state->query_options().enable_topn_lazy_mat_phase2_no_write_file_cache); auto* query_id = multi_get_request.mutable_query_id(); query_id->set_hi(state->query_id().hi); query_id->set_lo(state->query_id().lo); @@ -315,7 +514,10 @@ Status MaterializationSharedState::init_multi_requests( FetchRpcStruct {.stub = std::move(client), .cntl = std::make_unique(), .request = multi_get_request, - .response = PMultiGetResponseV2()}); + .response = PMultiGetResponseV2(), + .backend_address = fmt::format( + "id={} {}:{}", node_info.id, node_info.host, + node_info.async_internal_port)}); } return Status::OK(); @@ -427,7 +629,8 @@ Status MaterializationOperator::push(RuntimeState* state, Block* in_block, bool if (local_state._materialization_state.need_merge_block) { SCOPED_TIMER(local_state._merge_response_timer); - RETURN_IF_ERROR(local_state._materialization_state.merge_multi_response()); + RETURN_IF_ERROR(local_state._materialization_state.merge_multi_response( + local_state.operator_profile())); local_state._max_rows_per_backend_counter->set( (int64_t)local_state._materialization_state._max_rows_per_backend); } diff --git a/be/src/exec/operator/materialization_opertor.h b/be/src/exec/operator/materialization_opertor.h index 5cf2bf1ee9ee1b..889d94a11ac33a 100644 --- a/be/src/exec/operator/materialization_opertor.h +++ b/be/src/exec/operator/materialization_opertor.h @@ -19,6 +19,10 @@ #include +#include +#include +#include + #include "common/status.h" #include "exec/operator/operator.h" @@ -32,6 +36,7 @@ struct FetchRpcStruct { std::unique_ptr cntl; PMultiGetRequestV2 request; PMultiGetResponseV2 response; + std::string 
backend_address; }; struct MaterializationSharedState { @@ -41,11 +46,27 @@ struct MaterializationSharedState { Status init_multi_requests(const TMaterializationNode& tnode, RuntimeState* state); Status create_muiltget_result(const Columns& columns, bool eos); - Status merge_multi_response(); + Status merge_multi_response(RuntimeProfile* profile); void get_block(Block* block); private: void _update_profile_info(int64_t backend_id, RuntimeProfile* response_profile); + void _update_topn_lazy_materialization_profile(RuntimeProfile* profile); + + struct TopNLazyMaterializationBackendStats { + std::string backend; + int64_t rows_read = 0; + int64_t segments_read = 0; + int64_t local_io_count = 0; + int64_t local_io_bytes = 0; + int64_t remote_io_count = 0; + int64_t remote_io_bytes = 0; + int64_t skip_cache_io_count = 0; + int64_t write_cache_bytes = 0; + int64_t local_io_time = 0; + int64_t remote_io_time = 0; + int64_t write_cache_io_time = 0; + }; public: bool rpc_struct_inited = false; @@ -68,6 +89,10 @@ struct MaterializationSharedState { uint32_t _max_rows_per_backend = 0; // Store the number of rows processed by each backend std::unordered_map _backend_rows_count; // backend_id => rows_count + +private: + // backend id => accumulated TopN phase-2 profile stats. 
+ std::map _topn_lazy_materialization_backend_stats; }; class MaterializationLocalState final : public PipelineXLocalState { diff --git a/be/src/exec/rowid_fetcher.cpp b/be/src/exec/rowid_fetcher.cpp index c2317a103e5ee3..1c2a6f1729a99e 100644 --- a/be/src/exec/rowid_fetcher.cpp +++ b/be/src/exec/rowid_fetcher.cpp @@ -53,6 +53,7 @@ #include "exec/scan/file_scanner.h" #include "format/orc/vorc_reader.h" #include "format/parquet/vparquet_reader.h" +#include "io/io_common.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" // ExecEnv #include "runtime/fragment_mgr.h" // FragmentMgr @@ -73,6 +74,23 @@ namespace doris { +namespace { + +void set_topn_lazy_materialization_file_cache_stats( + const io::FileCacheStatistics& stats, PTopNLazyMaterializationFileCacheStats* pstats) { + pstats->set_local_io_count(stats.num_local_io_total); + pstats->set_local_io_bytes(stats.bytes_read_from_local); + pstats->set_remote_io_count(stats.num_remote_io_total); + pstats->set_remote_io_bytes(stats.bytes_read_from_remote); + pstats->set_skip_cache_io_count(stats.num_skip_cache_io_total); + pstats->set_write_cache_bytes(stats.bytes_write_into_cache); + pstats->set_local_io_time(stats.local_io_timer); + pstats->set_remote_io_time(stats.remote_io_timer); + pstats->set_write_cache_io_time(stats.write_cache_io_timer); +} + +} // namespace + Status RowIDFetcher::init() { DorisNodesInfo nodes_info; nodes_info.setNodes(_fetch_option.t_fetch_opt.nodes_info); @@ -548,6 +566,11 @@ Status RowIdStorageReader::read_by_rowids(const PMultiGetRequestV2& request, int64_t external_get_block_avg_ms = 0; size_t external_scan_range_cnt = 0; + const auto file_cache_miss_policy = + request.file_cache_remote_only_on_miss() + ? 
io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS + : io::FileCacheMissPolicy::READ_THROUGH_AND_WRITE_BACK; + // Add counters for different file mapping types std::unordered_map file_type_counts; @@ -589,7 +612,7 @@ Status RowIdStorageReader::read_by_rowids(const PMultiGetRequestV2& request, RETURN_IF_ERROR(read_batch_doris_format_row( request_block_desc, id_file_map, slots, tquery_id, result_blocks[i], stats, &acquire_tablet_ms, &acquire_rowsets_ms, - &acquire_segments_ms, &lookup_row_data_ms)); + &acquire_segments_ms, &lookup_row_data_ms, file_cache_miss_policy)); } else { RETURN_IF_ERROR(read_batch_external_row( request.wg_id(), request_block_desc, id_file_map, slots, @@ -637,6 +660,9 @@ Status RowIdStorageReader::read_by_rowids(const PMultiGetRequestV2& request, acquire_rowsets_ms, acquire_segments_ms, lookup_row_data_ms, file_type_stats, external_init_reader_avg_ms, external_get_block_avg_ms, external_scan_range_cnt); + set_topn_lazy_materialization_file_cache_stats( + stats.file_cache_stats, + response->mutable_topn_lazy_materialization_file_cache_stats()); } return Status::OK(); @@ -646,7 +672,8 @@ Status RowIdStorageReader::read_batch_doris_format_row( const PRequestBlockDesc& request_block_desc, std::shared_ptr id_file_map, std::vector& slots, const TUniqueId& query_id, Block& result_block, OlapReaderStatistics& stats, int64_t* acquire_tablet_ms, int64_t* acquire_rowsets_ms, - int64_t* acquire_segments_ms, int64_t* lookup_row_data_ms) { + int64_t* acquire_segments_ms, int64_t* lookup_row_data_ms, + io::FileCacheMissPolicy file_cache_miss_policy) { if (result_block.is_empty_column()) [[likely]] { result_block = Block(slots, request_block_desc.row_id_size()); } @@ -724,11 +751,11 @@ Status RowIdStorageReader::read_batch_doris_format_row( } scan_blocks[batch_idx] = Block(slots, row_ids.size()); - RETURN_IF_ERROR(read_doris_format_row(id_file_map, scan_batch.file_mapping, row_ids, slots, - full_read_schema, row_store_read_struct, stats, - acquire_tablet_ms, 
acquire_rowsets_ms, - acquire_segments_ms, lookup_row_data_ms, seg_map, - iterator_map, scan_blocks[batch_idx])); + RETURN_IF_ERROR(read_doris_format_row( + id_file_map, scan_batch.file_mapping, row_ids, slots, full_read_schema, + row_store_read_struct, stats, acquire_tablet_ms, acquire_rowsets_ms, + acquire_segments_ms, lookup_row_data_ms, seg_map, iterator_map, + file_cache_miss_policy, scan_blocks[batch_idx])); } scatter_scan_blocks_to_result_block(row_id_block_idx, scan_blocks, result_block); @@ -740,6 +767,28 @@ const std::string RowIdStorageReader::ScannersRunningTimeProfile = "ScannersRunn const std::string RowIdStorageReader::InitReaderAvgTimeProfile = "InitReaderAvgTime"; const std::string RowIdStorageReader::GetBlockAvgTimeProfile = "GetBlockAvgTime"; const std::string RowIdStorageReader::FileReadLinesProfile = "FileReadLines"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseLocalIOCount = + "TopNLazyMaterializationSecondPhaseLocalIOCount"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseLocalIOBytes = + "TopNLazyMaterializationSecondPhaseLocalIOBytes"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseRemoteIOCount = + "TopNLazyMaterializationSecondPhaseRemoteIOCount"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseRemoteIOBytes = + "TopNLazyMaterializationSecondPhaseRemoteIOBytes"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseSkipCacheIOCount = + "TopNLazyMaterializationSecondPhaseSkipCacheIOCount"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseWriteCacheBytes = + "TopNLazyMaterializationSecondPhaseWriteCacheBytes"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseLocalIOTime = + "TopNLazyMaterializationSecondPhaseLocalIOTime"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseRemoteIOTime = + "TopNLazyMaterializationSecondPhaseRemoteIOTime"; +const 
std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseWriteCacheIOTime = + "TopNLazyMaterializationSecondPhaseWriteCacheIOTime"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseRowsRead = + "TopNLazyMaterializationSecondPhaseRowsRead"; +const std::string RowIdStorageReader::TopNLazyMaterializationSecondPhaseSegmentsRead = + "TopNLazyMaterializationSecondPhaseSegmentsRead"; Status RowIdStorageReader::read_external_row_from_file_mapping( size_t idx, const std::multimap& row_ids, @@ -1034,7 +1083,7 @@ Status RowIdStorageReader::read_doris_format_row( int64_t* acquire_tablet_ms, int64_t* acquire_rowsets_ms, int64_t* acquire_segments_ms, int64_t* lookup_row_data_ms, std::unordered_map& seg_map, std::unordered_map& iterator_map, - Block& result_block) { + io::FileCacheMissPolicy file_cache_miss_policy, Block& result_block) { auto [tablet_id, rowset_id, segment_id] = file_mapping->get_doris_format_info(); SegKey seg_key {.tablet_id = tablet_id, .rowset_id = rowset_id, .segment_id = segment_id}; @@ -1104,13 +1153,18 @@ Status RowIdStorageReader::read_doris_format_row( } auto result_columns_guard = result_block.mutate_columns_scoped(); MutableColumns& result_columns = result_columns_guard.mutable_columns(); + io::IOContext io_ctx; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.file_cache_stats = &stats.file_cache_stats; + io_ctx.file_cache_miss_policy = file_cache_miss_policy; for (auto row_id : row_ids) { RowLocation loc(rowset_id, segment->id(), cast_set(row_id)); row_store_read_struct.row_store_buffer.clear(); RETURN_IF_ERROR(scope_timer_run( [&]() { return tablet->lookup_row_data({}, loc, rowset, stats, - row_store_read_struct.row_store_buffer); + row_store_read_struct.row_store_buffer, + false, &io_ctx); }, lookup_row_data_ms)); @@ -1133,6 +1187,8 @@ Status RowIdStorageReader::read_doris_format_row( iterator_map[iterator_key].segment = segment; iterator_item.storage_read_options.stats = &stats; 
iterator_item.storage_read_options.io_ctx.reader_type = ReaderType::READER_QUERY; + iterator_item.storage_read_options.io_ctx.file_cache_miss_policy = + file_cache_miss_policy; } set_slot_access_paths(slots[x], full_read_schema, iterator_item.storage_read_options); RETURN_IF_ERROR(segment->seek_and_read_by_rowid( diff --git a/be/src/exec/rowid_fetcher.h b/be/src/exec/rowid_fetcher.h index 7641103666922b..790f9cf17e7e4e 100644 --- a/be/src/exec/rowid_fetcher.h +++ b/be/src/exec/rowid_fetcher.h @@ -38,6 +38,9 @@ namespace doris { class DorisNodesInfo; class RuntimeState; class TupleDescriptor; +namespace io { +enum class FileCacheMissPolicy : uint8_t; +} struct FileMapping; struct SegKey; @@ -97,6 +100,17 @@ class RowIdStorageReader { static const std::string InitReaderAvgTimeProfile; static const std::string GetBlockAvgTimeProfile; static const std::string FileReadLinesProfile; + static const std::string TopNLazyMaterializationSecondPhaseLocalIOCount; + static const std::string TopNLazyMaterializationSecondPhaseLocalIOBytes; + static const std::string TopNLazyMaterializationSecondPhaseRemoteIOCount; + static const std::string TopNLazyMaterializationSecondPhaseRemoteIOBytes; + static const std::string TopNLazyMaterializationSecondPhaseSkipCacheIOCount; + static const std::string TopNLazyMaterializationSecondPhaseWriteCacheBytes; + static const std::string TopNLazyMaterializationSecondPhaseLocalIOTime; + static const std::string TopNLazyMaterializationSecondPhaseRemoteIOTime; + static const std::string TopNLazyMaterializationSecondPhaseWriteCacheIOTime; + static const std::string TopNLazyMaterializationSecondPhaseRowsRead; + static const std::string TopNLazyMaterializationSecondPhaseSegmentsRead; static Status read_by_rowids(const PMultiGetRequest& request, PMultiGetResponse* response); static Status read_by_rowids(const PMultiGetRequestV2& request, PMultiGetResponseV2* response); @@ -112,13 +126,14 @@ class RowIdStorageReader { int64_t* acquire_tablet_ms, int64_t* 
acquire_rowsets_ms, int64_t* acquire_segments_ms, int64_t* lookup_row_data_ms, std::unordered_map& seg_map, std::unordered_map& iterator_map, - Block& result_block); + io::FileCacheMissPolicy file_cache_miss_policy, Block& result_block); static Status read_batch_doris_format_row( const PRequestBlockDesc& request_block_desc, std::shared_ptr id_file_map, std::vector& slots, const TUniqueId& query_id, Block& result_block, OlapReaderStatistics& stats, int64_t* acquire_tablet_ms, int64_t* acquire_rowsets_ms, - int64_t* acquire_segments_ms, int64_t* lookup_row_data_ms); + int64_t* acquire_segments_ms, int64_t* lookup_row_data_ms, + io::FileCacheMissPolicy file_cache_miss_policy); static Status read_batch_external_row( const uint64_t workload_group_id, const PRequestBlockDesc& request_block_desc, diff --git a/be/src/exec/scan/parallel_scanner_builder.cpp b/be/src/exec/scan/parallel_scanner_builder.cpp index 4ddd2753acb3a1..500427375af243 100644 --- a/be/src/exec/scan/parallel_scanner_builder.cpp +++ b/be/src/exec/scan/parallel_scanner_builder.cpp @@ -25,12 +25,34 @@ #include "common/status.h" #include "exec/operator/olap_scan_operator.h" #include "exec/scan/olap_scanner.h" +#include "io/io_common.h" +#include "runtime/query_context.h" #include "storage/rowset/beta_rowset.h" #include "storage/segment/segment_loader.h" #include "storage/tablet/base_tablet.h" namespace doris { +namespace { + +io::IOContext create_preload_io_context(RuntimeState* state, OlapReaderStatistics* preload_stats) { + io::IOContext io_ctx; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.file_cache_stats = preload_stats ? 
&preload_stats->file_cache_stats : nullptr; + if (state == nullptr) { + return io_ctx; + } + io_ctx.query_id = &state->query_id(); + io_ctx.read_file_cache = state->query_options().enable_file_cache; + io_ctx.is_disposable = state->query_options().disable_file_cache; + if (auto* query_ctx = state->get_query_ctx(); query_ctx != nullptr) { + io_ctx.remote_scan_cache_write_limiter = query_ctx->remote_scan_cache_write_limiter(); + } + return io_ctx; +} + +} // namespace + Status ParallelScannerBuilder::build_scanners(std::list& scanners) { RETURN_IF_ERROR(_load()); if (_scan_parallelism_by_per_segment) { @@ -234,8 +256,9 @@ Status ParallelScannerBuilder::_load() { auto beta_rowset = std::dynamic_pointer_cast(rowset); std::vector segment_rows; + auto preload_io_ctx = create_preload_io_context(_state, &_builder_stats); RETURN_IF_ERROR(beta_rowset->get_segment_num_rows(&segment_rows, enable_segment_cache, - &_builder_stats)); + &_builder_stats, &preload_io_ctx)); auto segment_count = rowset->num_segments(); for (int64_t i = 0; i != segment_count; i++) { _all_segments_rows[rowset_id].emplace_back(segment_rows[i]); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 47940b9e83afa4..e3c9ce0ba1cfa0 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -52,6 +52,7 @@ #include "io/cache/file_cache_common.h" #include "io/cache/fs_file_cache_storage.h" #include "io/cache/mem_file_cache_storage.h" +#include "io/cache/remote_scan_cache_write_limiter.h" #include "runtime/runtime_profile.h" #include "util/concurrency_stats.h" #include "util/stack_util.h" @@ -720,6 +721,97 @@ void BlockFileCache::add_need_update_lru_block(FileBlockSPtr block) { } } +Status BlockFileCache::get_downloaded_blocks_if_fully_covered(const UInt128Wrapper& hash, + size_t offset, size_t size, + const CacheContext& context, + FileBlocks* blocks, + bool* fully_covered) { + DCHECK(blocks != nullptr); + DCHECK(fully_covered != 
nullptr); + blocks->clear(); + *fully_covered = false; + if (size == 0) { + *fully_covered = true; + return Status::OK(); + } + + FileBlock::Range range(offset, offset + size - 1); + std::lock_guard cache_lock(_mutex); + auto it = _files.find(hash); + if (it == _files.end()) { + if (_async_open_done) { + return Status::OK(); + } + FileCacheKey key; + key.hash = hash; + key.meta.type = context.cache_type; + key.meta.expiration_time = context.expiration_time; + key.meta.tablet_id = context.tablet_id; + _storage->load_blocks_directly_unlocked(this, key, cache_lock); + + it = _files.find(hash); + if (it == _files.end()) { + return Status::OK(); + } + } + + auto& file_blocks = it->second; + if (file_blocks.empty()) { + LOG(WARNING) << "file_blocks is empty for hash=" << hash.to_string() + << " cache type=" << context.cache_type + << " cache expiration time=" << context.expiration_time + << " cache range=" << range.left << " " << range.right + << " query id=" << context.query_id; + DCHECK(false); + _files.erase(hash); + return Status::OK(); + } + + std::vector covered_cells; + auto block_it = file_blocks.lower_bound(range.left); + if (block_it == file_blocks.end() || block_it->second.file_block->range().left > range.left) { + if (block_it == file_blocks.begin()) { + return Status::OK(); + } + --block_it; + } + + size_t current_pos = range.left; + while (current_pos <= range.right) { + if (block_it == file_blocks.end()) { + return Status::OK(); + } + + auto& cell = block_it->second; + const auto& block_range = cell.file_block->range(); + if (block_range.right < current_pos) { + ++block_it; + continue; + } + if (block_range.left > current_pos || + cell.file_block->state() != FileBlock::State::DOWNLOADED) { + return Status::OK(); + } + + covered_cells.push_back(&cell); + if (range.right <= block_range.right) { + *fully_covered = true; + break; + } + current_pos = block_range.right + 1; + ++block_it; + } + + if (!*fully_covered) { + return Status::OK(); + } + for (const 
auto* cell : covered_cells) { + use_cell(*cell, blocks, need_to_move(cell->file_block->cache_type(), context.cache_type), + cache_lock); + } + return Status::OK(); +} + std::string BlockFileCache::clear_file_cache_async() { return clear_file_cache_impl(false); } @@ -799,10 +891,21 @@ FileBlocks BlockFileCache::split_range_into_cells(const UInt128Wrapper& hash, while (current_pos < end_pos_non_included) { current_size = std::min(remaining_size, _max_file_block_size); remaining_size -= current_size; - state = try_reserve(hash, context, current_pos, current_size, cache_lock) - ? state - : FileBlock::State::SKIP_CACHE; - if (state == FileBlock::State::SKIP_CACHE) [[unlikely]] { + auto block_state = state; + if (block_state != FileBlock::State::SKIP_CACHE && + context.admit_cache_write_by_remote_scan_limiter) { + auto* limiter = context.remote_scan_cache_write_limiter; + DCHECK(limiter != nullptr); + if (!limiter->try_admit_cache_write(static_cast(current_size))) { + block_state = FileBlock::State::SKIP_CACHE; + } + } + if (block_state != FileBlock::State::SKIP_CACHE) { + block_state = try_reserve(hash, context, current_pos, current_size, cache_lock) + ? 
block_state + : FileBlock::State::SKIP_CACHE; + } + if (block_state == FileBlock::State::SKIP_CACHE) [[unlikely]] { FileCacheKey key; key.hash = hash; key.offset = current_pos; @@ -813,7 +916,8 @@ FileBlocks BlockFileCache::split_range_into_cells(const UInt128Wrapper& hash, FileBlock::State::SKIP_CACHE); file_blocks.push_back(std::move(file_block)); } else { - auto* cell = add_cell(hash, context, current_pos, current_size, state, cache_lock); + auto* cell = + add_cell(hash, context, current_pos, current_size, block_state, cache_lock); if (cell) { file_blocks.push_back(cell->file_block); if (!context.is_cold_data) { diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index 1a19df5eb4ae5e..9a01d07d136a62 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -237,6 +237,14 @@ class BlockFileCache { FileBlocksHolder get_or_set(const UInt128Wrapper& hash, size_t offset, size_t size, CacheContext& context); + /** + * Return existing downloaded blocks only if they fully cover [offset, offset + size). + * This lookup is read-only: it does not reserve cache space or create EMPTY blocks. 
+ */ + Status get_downloaded_blocks_if_fully_covered(const UInt128Wrapper& hash, size_t offset, + size_t size, const CacheContext& context, + FileBlocks* blocks, bool* fully_covered); + /** * record blocks read directly by CachedRemoteFileReader */ diff --git a/be/src/io/cache/block_file_cache_profile.cpp b/be/src/io/cache/block_file_cache_profile.cpp index 8f9c167c9989e6..4e2cb74ccf31e5 100644 --- a/be/src/io/cache/block_file_cache_profile.cpp +++ b/be/src/io/cache/block_file_cache_profile.cpp @@ -98,6 +98,22 @@ FileCacheStatistics diff_file_cache_statistics(const FileCacheStatistics& curren SUBTRACT_FIELD(inverted_index_remote_io_timer); SUBTRACT_FIELD(inverted_index_peer_io_timer); SUBTRACT_FIELD(inverted_index_io_timer); + SUBTRACT_FIELD(inverted_index_write_cache_io_timer); + SUBTRACT_FIELD(inverted_index_bytes_write_into_cache); + + SUBTRACT_FIELD(segment_footer_index_num_local_io_total); + SUBTRACT_FIELD(segment_footer_index_num_remote_io_total); + SUBTRACT_FIELD(segment_footer_index_num_peer_io_total); + SUBTRACT_FIELD(segment_footer_index_bytes_read_from_local); + SUBTRACT_FIELD(segment_footer_index_bytes_read_from_remote); + SUBTRACT_FIELD(segment_footer_index_bytes_read_from_peer); + SUBTRACT_FIELD(segment_footer_index_local_io_timer); + SUBTRACT_FIELD(segment_footer_index_remote_io_timer); + SUBTRACT_FIELD(segment_footer_index_peer_io_timer); + SUBTRACT_FIELD(segment_footer_index_write_cache_io_timer); + SUBTRACT_FIELD(segment_footer_index_bytes_write_into_cache); + SUBTRACT_FIELD(remote_only_on_miss_triggered); + SUBTRACT_FIELD(remote_only_on_miss_threshold_bytes); #undef SUBTRACT_FIELD return diff; } @@ -135,6 +151,10 @@ FileCacheProfileReporter::FileCacheProfileReporter(RuntimeProfile* profile) { lock_wait_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "LockWaitTimer", cache_profile, 1); get_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "GetTimer", cache_profile, 1); set_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "SetTimer", cache_profile, 1); + 
remote_only_on_miss_triggered = profile->AddHighWaterMarkCounter("RemoteOnlyOnMissTriggered", + TUnit::UNIT, cache_profile, 1); + remote_only_on_miss_threshold_bytes = profile->AddHighWaterMarkCounter( + "RemoteOnlyOnMissThresholdBytes", TUnit::BYTES, cache_profile, 1); inverted_index_num_local_io_total = ADD_CHILD_COUNTER_WITH_LEVEL( profile, "InvertedIndexNumLocalIOTotal", TUnit::UNIT, cache_profile, 1); @@ -156,6 +176,33 @@ FileCacheProfileReporter::FileCacheProfileReporter(RuntimeProfile* profile) { ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexPeerIOUseTimer", cache_profile, 1); inverted_index_io_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "InvertedIndexIOTimer", cache_profile, 1); + inverted_index_write_cache_io_timer = ADD_CHILD_TIMER_WITH_LEVEL( + profile, "InvertedIndexWriteCacheIOUseTimer", cache_profile, 1); + inverted_index_bytes_write_into_cache = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "InvertedIndexBytesWriteIntoCache", TUnit::BYTES, cache_profile, 1); + + segment_footer_index_num_local_io_total = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "SegmentFooterIndexNumLocalIOTotal", TUnit::UNIT, cache_profile, 1); + segment_footer_index_num_remote_io_total = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "SegmentFooterIndexNumRemoteIOTotal", TUnit::UNIT, cache_profile, 1); + segment_footer_index_num_peer_io_total = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "SegmentFooterIndexNumPeerIOTotal", TUnit::UNIT, cache_profile, 1); + segment_footer_index_bytes_scanned_from_cache = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "SegmentFooterIndexBytesScannedFromCache", TUnit::BYTES, cache_profile, 1); + segment_footer_index_bytes_scanned_from_remote = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "SegmentFooterIndexBytesScannedFromRemote", TUnit::BYTES, cache_profile, 1); + segment_footer_index_bytes_scanned_from_peer = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "SegmentFooterIndexBytesScannedFromPeer", TUnit::BYTES, cache_profile, 1); + segment_footer_index_local_io_timer = 
ADD_CHILD_TIMER_WITH_LEVEL( + profile, "SegmentFooterIndexLocalIOUseTimer", cache_profile, 1); + segment_footer_index_remote_io_timer = ADD_CHILD_TIMER_WITH_LEVEL( + profile, "SegmentFooterIndexRemoteIOUseTimer", cache_profile, 1); + segment_footer_index_peer_io_timer = ADD_CHILD_TIMER_WITH_LEVEL( + profile, "SegmentFooterIndexPeerIOUseTimer", cache_profile, 1); + segment_footer_index_write_cache_io_timer = ADD_CHILD_TIMER_WITH_LEVEL( + profile, "SegmentFooterIndexWriteCacheIOUseTimer", cache_profile, 1); + segment_footer_index_bytes_write_into_cache = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "SegmentFooterIndexBytesWriteIntoCache", TUnit::BYTES, cache_profile, 1); } void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) const { @@ -177,6 +224,8 @@ void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) con COUNTER_UPDATE(lock_wait_timer, statistics->lock_wait_timer); COUNTER_UPDATE(get_timer, statistics->get_timer); COUNTER_UPDATE(set_timer, statistics->set_timer); + remote_only_on_miss_triggered->set(statistics->remote_only_on_miss_triggered); + remote_only_on_miss_threshold_bytes->set(statistics->remote_only_on_miss_threshold_bytes); COUNTER_UPDATE(inverted_index_num_local_io_total, statistics->inverted_index_num_local_io_total); @@ -193,6 +242,33 @@ void FileCacheProfileReporter::update(const FileCacheStatistics* statistics) con COUNTER_UPDATE(inverted_index_remote_io_timer, statistics->inverted_index_remote_io_timer); COUNTER_UPDATE(inverted_index_peer_io_timer, statistics->inverted_index_peer_io_timer); COUNTER_UPDATE(inverted_index_io_timer, statistics->inverted_index_io_timer); + COUNTER_UPDATE(inverted_index_write_cache_io_timer, + statistics->inverted_index_write_cache_io_timer); + COUNTER_UPDATE(inverted_index_bytes_write_into_cache, + statistics->inverted_index_bytes_write_into_cache); + + COUNTER_UPDATE(segment_footer_index_num_local_io_total, + statistics->segment_footer_index_num_local_io_total); + 
COUNTER_UPDATE(segment_footer_index_num_remote_io_total, + statistics->segment_footer_index_num_remote_io_total); + COUNTER_UPDATE(segment_footer_index_num_peer_io_total, + statistics->segment_footer_index_num_peer_io_total); + COUNTER_UPDATE(segment_footer_index_bytes_scanned_from_cache, + statistics->segment_footer_index_bytes_read_from_local); + COUNTER_UPDATE(segment_footer_index_bytes_scanned_from_remote, + statistics->segment_footer_index_bytes_read_from_remote); + COUNTER_UPDATE(segment_footer_index_bytes_scanned_from_peer, + statistics->segment_footer_index_bytes_read_from_peer); + COUNTER_UPDATE(segment_footer_index_local_io_timer, + statistics->segment_footer_index_local_io_timer); + COUNTER_UPDATE(segment_footer_index_remote_io_timer, + statistics->segment_footer_index_remote_io_timer); + COUNTER_UPDATE(segment_footer_index_peer_io_timer, + statistics->segment_footer_index_peer_io_timer); + COUNTER_UPDATE(segment_footer_index_write_cache_io_timer, + statistics->segment_footer_index_write_cache_io_timer); + COUNTER_UPDATE(segment_footer_index_bytes_write_into_cache, + statistics->segment_footer_index_bytes_write_into_cache); } } // namespace doris::io diff --git a/be/src/io/cache/block_file_cache_profile.h b/be/src/io/cache/block_file_cache_profile.h index 6c95e49791c054..b26176724db744 100644 --- a/be/src/io/cache/block_file_cache_profile.h +++ b/be/src/io/cache/block_file_cache_profile.h @@ -86,6 +86,8 @@ struct FileCacheProfileReporter { RuntimeProfile::Counter* lock_wait_timer = nullptr; RuntimeProfile::Counter* get_timer = nullptr; RuntimeProfile::Counter* set_timer = nullptr; + RuntimeProfile::HighWaterMarkCounter* remote_only_on_miss_triggered = nullptr; + RuntimeProfile::HighWaterMarkCounter* remote_only_on_miss_threshold_bytes = nullptr; RuntimeProfile::Counter* inverted_index_num_local_io_total = nullptr; RuntimeProfile::Counter* inverted_index_num_remote_io_total = nullptr; @@ -97,6 +99,20 @@ struct FileCacheProfileReporter { 
RuntimeProfile::Counter* inverted_index_remote_io_timer = nullptr; RuntimeProfile::Counter* inverted_index_peer_io_timer = nullptr; RuntimeProfile::Counter* inverted_index_io_timer = nullptr; + RuntimeProfile::Counter* inverted_index_write_cache_io_timer = nullptr; + RuntimeProfile::Counter* inverted_index_bytes_write_into_cache = nullptr; + + RuntimeProfile::Counter* segment_footer_index_num_local_io_total = nullptr; + RuntimeProfile::Counter* segment_footer_index_num_remote_io_total = nullptr; + RuntimeProfile::Counter* segment_footer_index_num_peer_io_total = nullptr; + RuntimeProfile::Counter* segment_footer_index_bytes_scanned_from_cache = nullptr; + RuntimeProfile::Counter* segment_footer_index_bytes_scanned_from_remote = nullptr; + RuntimeProfile::Counter* segment_footer_index_bytes_scanned_from_peer = nullptr; + RuntimeProfile::Counter* segment_footer_index_local_io_timer = nullptr; + RuntimeProfile::Counter* segment_footer_index_remote_io_timer = nullptr; + RuntimeProfile::Counter* segment_footer_index_peer_io_timer = nullptr; + RuntimeProfile::Counter* segment_footer_index_write_cache_io_timer = nullptr; + RuntimeProfile::Counter* segment_footer_index_bytes_write_into_cache = nullptr; FileCacheProfileReporter(RuntimeProfile* profile); void update(const FileCacheStatistics* statistics) const; diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index 7c297737d60a44..f53ef569ee7d9f 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -46,6 +46,7 @@ #include "io/cache/file_block.h" #include "io/cache/file_cache_common.h" #include "io/cache/peer_file_cache_reader.h" +#include "io/cache/remote_scan_cache_write_limiter.h" #include "io/fs/file_reader.h" #include "io/fs/local_file_system.h" #include "io/io_common.h" @@ -89,6 +90,14 @@ bvar::Window> g_read_cache_indirect_total_bytes_1min_windo bvar::Adder g_failed_get_peer_addr_counter( 
"cached_remote_reader_failed_get_peer_addr_counter"); +static bool use_remote_only_on_cache_miss(const IOContext* io_ctx) { + if (io_ctx->file_cache_miss_policy == FileCacheMissPolicy::REMOTE_ONLY_ON_MISS) { + return true; + } + auto* limiter = io_ctx->remote_scan_cache_write_limiter; + return limiter != nullptr && limiter->remote_only_on_miss(); +} + CachedRemoteFileReader::CachedRemoteFileReader(FileReaderSPtr remote_file_reader, const FileReaderOptions& opts) : _tablet_id(opts.tablet_id), _remote_file_reader(std::move(remote_file_reader)) { @@ -271,6 +280,94 @@ Status CachedRemoteFileReader::_execute_remote_read(const std::vector Status { + stats.hit_cache = false; + stats.from_peer_cache = false; + stats.skip_cache = true; + s3_read_counter << 1; + if (is_dryrun) [[unlikely]] { + *bytes_read = bytes_req; + g_read_cache_indirect_bytes << 0; + g_read_cache_indirect_total_bytes << bytes_req; + return Status::OK(); + } + + size_t remote_bytes_read = bytes_req; + SCOPED_RAW_TIMER(&stats.remote_read_timer); + RETURN_IF_ERROR(_remote_file_reader->read_at(offset, Slice(result.data, bytes_req), + &remote_bytes_read, io_ctx)); + *bytes_read = remote_bytes_read; + DCHECK_EQ(*bytes_read, bytes_req); + stats.bytes_read_from_remote += remote_bytes_read; + g_read_cache_indirect_bytes << remote_bytes_read; + g_read_cache_indirect_total_bytes << remote_bytes_read; + return Status::OK(); + }; + + g_read_cache_indirect_num << 1; + CacheContext cache_context(io_ctx); + cache_context.stats = &stats; + cache_context.tablet_id = _tablet_id; + FileBlocks file_blocks; + bool fully_covered = false; + { + SCOPED_RAW_TIMER(&stats.get_timer); + RETURN_IF_ERROR(_cache->get_downloaded_blocks_if_fully_covered( + _cache_hash, offset, bytes_req, cache_context, &file_blocks, &fully_covered)); + } + if (!fully_covered) { + return read_remote(); + } + + size_t local_read_bytes = 0; + size_t current_offset = offset; + size_t end_offset = offset + bytes_req - 1; + for (auto& block : file_blocks) { + 
if (current_offset > end_offset) { + break; + } + const auto& block_range = block->range(); + if (block_range.right < current_offset) { + continue; + } + + size_t read_left = std::max(current_offset, block_range.left); + size_t read_right = std::min(end_offset, block_range.right); + size_t read_size = read_right - read_left + 1; + if (is_dryrun) [[unlikely]] { + g_skip_local_cache_io_sum_bytes << read_size; + } else { + SCOPED_RAW_TIMER(&stats.local_read_timer); + Status st = block->read(Slice(result.data + (read_left - offset), read_size), + read_left - block_range.left); + if (!st.ok()) { + if (st.is()) { + _cache->remove_if_cached_async(_cache_hash); + } + LOG_EVERY_N(WARNING, 100) + << "Read data failed from file cache in remote-only-on-miss path. " + << "Fallback to remote. err=" << st.msg() + << ", block state=" << block->state(); + return read_remote(); + } + stats.bytes_read_from_local += read_size; + local_read_bytes += read_size; + } + current_offset = read_right + 1; + } + + *bytes_read = bytes_req; + stats.hit_cache = true; + g_read_cache_indirect_bytes << local_read_bytes; + g_read_cache_indirect_total_bytes << bytes_req; + return Status::OK(); +} + Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* bytes_read, const IOContext* io_ctx) { size_t already_read = 0; @@ -321,16 +418,42 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* } if (!io_ctx->is_warmup) { // update stats increment in this reading procedure for file cache metrics + const auto file_cache_read_type = + io_ctx->is_inverted_index + ? FileCacheReadType::INVERTED_INDEX + : (io_ctx->is_index_data ? 
FileCacheReadType::SEGMENT_FOOTER_INDEX + : FileCacheReadType::DATA); FileCacheStatistics fcache_stats_increment; - _update_stats(stats, &fcache_stats_increment, io_ctx->is_inverted_index); + _update_stats(stats, &fcache_stats_increment, file_cache_read_type); io::FileCacheMetrics::instance().update(&fcache_stats_increment); } if (io_ctx->file_cache_stats) { // update stats in io_ctx, for query profile - _update_stats(stats, io_ctx->file_cache_stats, io_ctx->is_inverted_index); + const auto file_cache_read_type = + io_ctx->is_inverted_index + ? FileCacheReadType::INVERTED_INDEX + : (io_ctx->is_index_data ? FileCacheReadType::SEGMENT_FOOTER_INDEX + : FileCacheReadType::DATA); + _update_stats(stats, io_ctx->file_cache_stats, file_cache_read_type); + auto* limiter = io_ctx->remote_scan_cache_write_limiter; + if (limiter != nullptr) { + io_ctx->file_cache_stats->remote_only_on_miss_triggered = + io_ctx->file_cache_stats->remote_only_on_miss_triggered || + limiter->remote_only_on_miss(); + io_ctx->file_cache_stats->remote_only_on_miss_threshold_bytes = + limiter->threshold_bytes(); + } } }; std::unique_ptr defer((int*)0x01, std::move(defer_func)); + + if (use_remote_only_on_cache_miss(io_ctx)) { + RETURN_IF_ERROR(_read_remote_only_on_cache_miss(offset, result, bytes_read, bytes_req, + stats, io_ctx, is_dryrun)); + read_success = true; + return Status::OK(); + } + if (_is_doris_table && config::enable_read_cache_file_directly) { // read directly SCOPED_RAW_TIMER(&stats.read_cache_file_directly_timer); @@ -609,7 +732,7 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* void CachedRemoteFileReader::_update_stats(const ReadStatistics& read_stats, FileCacheStatistics* statis, - bool is_inverted_index) const { + FileCacheReadType read_type) const { if (statis == nullptr) { return; } @@ -639,22 +762,59 @@ void CachedRemoteFileReader::_update_stats(const ReadStatistics& read_stats, statis->get_timer += read_stats.get_timer; statis->set_timer += 
read_stats.set_timer; - if (is_inverted_index) { + auto update_index_stats = [&](int64_t& num_local_io_total, int64_t& num_remote_io_total, + int64_t& num_peer_io_total, int64_t& bytes_read_from_local, + int64_t& bytes_read_from_remote, int64_t& bytes_read_from_peer, + int64_t& local_io_timer, int64_t& remote_io_timer, + int64_t& peer_io_timer, int64_t& write_cache_io_timer, + int64_t& bytes_write_into_cache) { if (read_stats.bytes_read_from_local > 0) { - statis->inverted_index_num_local_io_total++; - statis->inverted_index_bytes_read_from_local += read_stats.bytes_read_from_local; + num_local_io_total++; + bytes_read_from_local += read_stats.bytes_read_from_local; } if (read_stats.bytes_read_from_remote > 0) { - statis->inverted_index_num_remote_io_total++; - statis->inverted_index_bytes_read_from_remote += read_stats.bytes_read_from_remote; - statis->inverted_index_remote_io_timer += read_stats.remote_read_timer; + num_remote_io_total++; + bytes_read_from_remote += read_stats.bytes_read_from_remote; + remote_io_timer += read_stats.remote_read_timer; } if (read_stats.bytes_read_from_peer > 0) { - statis->inverted_index_num_peer_io_total++; - statis->inverted_index_bytes_read_from_peer += read_stats.bytes_read_from_peer; - statis->inverted_index_peer_io_timer += read_stats.peer_read_timer; + num_peer_io_total++; + bytes_read_from_peer += read_stats.bytes_read_from_peer; + peer_io_timer += read_stats.peer_read_timer; } - statis->inverted_index_local_io_timer += read_stats.local_read_timer; + local_io_timer += read_stats.local_read_timer; + write_cache_io_timer += read_stats.local_write_timer; + bytes_write_into_cache += read_stats.bytes_write_into_file_cache; + }; + + switch (read_type) { + case FileCacheReadType::DATA: + break; + case FileCacheReadType::INVERTED_INDEX: + update_index_stats( + statis->inverted_index_num_local_io_total, + statis->inverted_index_num_remote_io_total, + statis->inverted_index_num_peer_io_total, + 
statis->inverted_index_bytes_read_from_local, + statis->inverted_index_bytes_read_from_remote, + statis->inverted_index_bytes_read_from_peer, statis->inverted_index_local_io_timer, + statis->inverted_index_remote_io_timer, statis->inverted_index_peer_io_timer, + statis->inverted_index_write_cache_io_timer, + statis->inverted_index_bytes_write_into_cache); + break; + case FileCacheReadType::SEGMENT_FOOTER_INDEX: + update_index_stats(statis->segment_footer_index_num_local_io_total, + statis->segment_footer_index_num_remote_io_total, + statis->segment_footer_index_num_peer_io_total, + statis->segment_footer_index_bytes_read_from_local, + statis->segment_footer_index_bytes_read_from_remote, + statis->segment_footer_index_bytes_read_from_peer, + statis->segment_footer_index_local_io_timer, + statis->segment_footer_index_remote_io_timer, + statis->segment_footer_index_peer_io_timer, + statis->segment_footer_index_write_cache_io_timer, + statis->segment_footer_index_bytes_write_into_cache); + break; } g_skip_cache_sum << read_stats.skip_cache; diff --git a/be/src/io/cache/cached_remote_file_reader.h b/be/src/io/cache/cached_remote_file_reader.h index 3f2e1ceb2e1395..ddc18d1968b93c 100644 --- a/be/src/io/cache/cached_remote_file_reader.h +++ b/be/src/io/cache/cached_remote_file_reader.h @@ -75,6 +75,12 @@ class CachedRemoteFileReader final : public FileReader, const IOContext* io_ctx) override; private: + enum class FileCacheReadType { + DATA, + INVERTED_INDEX, + SEGMENT_FOOTER_INDEX, + }; + void _insert_file_reader(FileBlockSPtr file_block); // Execute remote read (S3 or peer). 
@@ -82,8 +88,12 @@ class CachedRemoteFileReader final : public FileReader, size_t& size, std::unique_ptr& buffer, ReadStatistics& stats, const IOContext* io_ctx); + Status _read_remote_only_on_cache_miss(size_t offset, Slice result, size_t* bytes_read, + size_t bytes_req, ReadStatistics& stats, + const IOContext* io_ctx, bool is_dryrun); + void _update_stats(const ReadStatistics& stats, FileCacheStatistics* state, - bool is_inverted_index) const; + FileCacheReadType read_type) const; bool _is_doris_table = false; int64_t _tablet_id = -1; diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index af91c6dcb478ba..9f9628d7237391 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -22,6 +22,7 @@ #include #include +#include "common/config.h" #include "core/uint128.h" #include "io/io_common.h" @@ -163,6 +164,13 @@ struct CacheContext { } query_id = io_context->query_id ? *io_context->query_id : TUniqueId(); is_warmup = io_context->is_warmup; + remote_scan_cache_write_limiter = io_context->remote_scan_cache_write_limiter; + admit_cache_write_by_remote_scan_limiter = + remote_scan_cache_write_limiter != nullptr && + io_context->reader_type == ReaderType::READER_QUERY && + (!io_context->is_index_data || io_context->is_inverted_index || + config::file_cache_query_limit_segment_meta) && + !io_context->is_warmup; } CacheContext() = default; bool operator==(const CacheContext& rhs) const { @@ -176,6 +184,8 @@ struct CacheContext { ReadStatistics* stats; bool is_warmup {false}; int64_t tablet_id {0}; + RemoteScanCacheWriteLimiter* remote_scan_cache_write_limiter = nullptr; + bool admit_cache_write_by_remote_scan_limiter {false}; }; template diff --git a/be/src/io/cache/remote_scan_cache_write_limiter.cpp b/be/src/io/cache/remote_scan_cache_write_limiter.cpp new file mode 100644 index 00000000000000..339ae6c6ac534a --- /dev/null +++ b/be/src/io/cache/remote_scan_cache_write_limiter.cpp @@ -0,0 +1,78 @@ 
+// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io/cache/remote_scan_cache_write_limiter.h" + +#include + +#include "common/logging.h" +#include "util/uid_util.h" + +namespace doris::io { + +bvar::Adder g_remote_scan_no_write_file_cache_query_total( + "remote_scan_no_write_file_cache_query_total"); + +RemoteScanCacheWriteLimiter::RemoteScanCacheWriteLimiter(TUniqueId query_id, + int64_t threshold_bytes) + : _query_id(query_id), + _threshold_bytes(threshold_bytes), + _remote_only_on_miss(threshold_bytes == 0) {} + +bool RemoteScanCacheWriteLimiter::remote_only_on_miss() const { + return _remote_only_on_miss.load(std::memory_order_acquire); +} + +int64_t RemoteScanCacheWriteLimiter::admitted_data_bytes() const { + std::lock_guard lock(_mutex); + return _admitted_data_bytes; +} + +bool RemoteScanCacheWriteLimiter::try_admit_cache_write(int64_t bytes) { + if (!enabled()) { + return true; + } + if (_remote_only_on_miss.load(std::memory_order_acquire)) { + return false; + } + if (bytes <= 0) { + return true; + } + + std::lock_guard lock(_mutex); + if (_remote_only_on_miss.load(std::memory_order_relaxed)) { + return false; + } + + const int64_t remaining_budget_bytes = _threshold_bytes - _admitted_data_bytes; + 
if (bytes > remaining_budget_bytes) { + _remote_only_on_miss.store(true, std::memory_order_release); + g_remote_scan_no_write_file_cache_query_total << 1; + VLOG_DEBUG << "Remote scan file cache write threshold reached" + << ", query_id=" << print_id(_query_id) + << ", admitted_data_bytes=" << _admitted_data_bytes + << ", request_bytes=" << bytes + << ", remaining_budget_bytes=" << remaining_budget_bytes + << ", threshold_bytes=" << _threshold_bytes; + return false; + } + + _admitted_data_bytes += bytes; + return true; +} + +} // namespace doris::io diff --git a/be/src/io/cache/remote_scan_cache_write_limiter.h b/be/src/io/cache/remote_scan_cache_write_limiter.h new file mode 100644 index 00000000000000..e6524f711b6c89 --- /dev/null +++ b/be/src/io/cache/remote_scan_cache_write_limiter.h @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include +#include +#include + +namespace doris::io { + +class RemoteScanCacheWriteLimiter { +public: + RemoteScanCacheWriteLimiter(TUniqueId query_id, int64_t threshold_bytes); + + bool enabled() const { return _threshold_bytes >= 0; } + bool remote_only_on_miss() const; + bool try_admit_cache_write(int64_t bytes); + + int64_t admitted_data_bytes() const; + int64_t threshold_bytes() const { return _threshold_bytes; } + +private: + const TUniqueId _query_id; + const int64_t _threshold_bytes; + mutable std::mutex _mutex; + std::atomic _remote_only_on_miss {false}; + int64_t _admitted_data_bytes {0}; +}; + +} // namespace doris::io diff --git a/be/src/io/fs/file_reader.h b/be/src/io/fs/file_reader.h index 3df912cbad4af9..d9eb02cc9c2d42 100644 --- a/be/src/io/fs/file_reader.h +++ b/be/src/io/fs/file_reader.h @@ -21,6 +21,7 @@ #include #include +#include #include "common/status.h" #include "io/fs/path.h" diff --git a/be/src/io/fs/file_writer.h b/be/src/io/fs/file_writer.h index 9e3f75525032b6..9402fdef18303c 100644 --- a/be/src/io/fs/file_writer.h +++ b/be/src/io/fs/file_writer.h @@ -44,6 +44,7 @@ struct FileWriterOptions { // this shortens the inconsistent time window. 
bool used_by_s3_committer = false; bool write_file_cache = false; + bool allow_adaptive_file_cache_write = true; bool is_cold_data = false; bool sync_file_data = true; // Whether flush data into storage system uint64_t file_cache_expiration_time = 0; // Relative time @@ -109,13 +110,15 @@ class FileWriter { io::UInt128Wrapper path_hash = BlockFileCache::hash(path.filename().native()); BlockFileCache* file_cache_ptr = FileCacheFactory::instance()->get_by_path(path_hash); - bool has_enough_file_cache_space = config::enable_file_cache_adaptive_write && + bool has_enough_file_cache_space = opts->allow_adaptive_file_cache_write && + config::enable_file_cache_adaptive_write && (opts->approximate_bytes_to_write > 0) && (file_cache_ptr->approximate_available_cache_size() > opts->approximate_bytes_to_write); VLOG_DEBUG << "path:" << path.filename().native() << ", write_file_cache:" << opts->write_file_cache + << ", allow_adaptive_file_cache_write:" << opts->allow_adaptive_file_cache_write << ", has_enough_file_cache_space:" << has_enough_file_cache_space << ", approximate_bytes_to_write:" << opts->approximate_bytes_to_write << ", file_cache_available_size:" diff --git a/be/src/io/fs/packed_file_system.cpp b/be/src/io/fs/packed_file_system.cpp index 9c593d8b2f71da..7c1fa5bf6ffef3 100644 --- a/be/src/io/fs/packed_file_system.cpp +++ b/be/src/io/fs/packed_file_system.cpp @@ -20,6 +20,7 @@ #include #include +#include "cloud/config.h" #include "common/status.h" #include "io/fs/file_reader.h" #include "io/fs/packed_file_reader.h" @@ -96,8 +97,13 @@ Status PackedFileSystem::create_file_impl(const Path& file, FileWriterPtr* write return Status::OK(); } + auto append_info = _append_info; + if (config::enable_file_cache_write_index_file_only && opts != nullptr) { + append_info.write_file_cache = opts->write_file_cache; + } + // Wrap with PackedFileWriter - *writer = std::make_unique(std::move(inner_writer), file, _append_info); + *writer = std::make_unique(std::move(inner_writer), 
file, append_info); return Status::OK(); } @@ -106,11 +112,19 @@ Status PackedFileSystem::open_file_impl(const Path& file, FileReaderSPtr* reader // Check if this file is in a packed file std::string file_path = file.native(); auto it = _index_map.find(file_path); - bool is_packed_file = (it != _index_map.end()); + PackedSliceLocation manager_location; + const PackedSliceLocation* packed_location = nullptr; + if (it != _index_map.end()) { + packed_location = &it->second; + } else if (PackedFileManager::instance() + ->get_packed_slice_location(file_path, &manager_location) + .ok()) { + packed_location = &manager_location; + } - if (is_packed_file) { + if (packed_location != nullptr) { // File is in packed file, open packed file and wrap with PackedFileReader - const auto& index = it->second; + const auto& index = *packed_location; FileReaderSPtr inner_reader; // Create options for opening the packed file diff --git a/be/src/io/fs/s3_file_system.cpp b/be/src/io/fs/s3_file_system.cpp index 1ec5b0a83774cd..63be8f1955a59c 100644 --- a/be/src/io/fs/s3_file_system.cpp +++ b/be/src/io/fs/s3_file_system.cpp @@ -35,6 +35,7 @@ #include "common/config.h" #include "common/logging.h" #include "common/status.h" +#include "cpp/sync_point.h" #include "io/fs/err_utils.h" #include "io/fs/file_system.h" #include "io/fs/file_writer.h" @@ -195,6 +196,7 @@ Status S3FileSystem::create_file_impl(const Path& file, FileWriterPtr* writer, Status S3FileSystem::open_file_internal(const Path& file, FileReaderSPtr* reader, const FileReaderOptions& opts) { + TEST_SYNC_POINT_CALLBACK("S3FileSystem::open_file_internal", &file, &opts); auto key = DORIS_TRY(get_key(file)); *reader = DORIS_TRY(S3FileReader::create(_client, _bucket, key, opts.file_size, nullptr)); return Status::OK(); diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 36b20517afb87c..c4e91ffeb745f0 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -37,6 +37,13 @@ enum class ReaderType : uint8_t { namespace 
io { +class RemoteScanCacheWriteLimiter; + +enum class FileCacheMissPolicy : uint8_t { + READ_THROUGH_AND_WRITE_BACK = 0, + REMOTE_ONLY_ON_MISS = 1, +}; + struct FileReaderStats { size_t read_calls = 0; size_t read_bytes = 0; @@ -74,6 +81,22 @@ struct FileCacheStatistics { int64_t inverted_index_remote_io_timer = 0; int64_t inverted_index_peer_io_timer = 0; int64_t inverted_index_io_timer = 0; + int64_t inverted_index_write_cache_io_timer = 0; + int64_t inverted_index_bytes_write_into_cache = 0; + + int64_t segment_footer_index_num_local_io_total = 0; + int64_t segment_footer_index_num_remote_io_total = 0; + int64_t segment_footer_index_num_peer_io_total = 0; + int64_t segment_footer_index_bytes_read_from_local = 0; + int64_t segment_footer_index_bytes_read_from_remote = 0; + int64_t segment_footer_index_bytes_read_from_peer = 0; + int64_t segment_footer_index_local_io_timer = 0; + int64_t segment_footer_index_remote_io_timer = 0; + int64_t segment_footer_index_peer_io_timer = 0; + int64_t segment_footer_index_write_cache_io_timer = 0; + int64_t segment_footer_index_bytes_write_into_cache = 0; + int64_t remote_only_on_miss_triggered = 0; + int64_t remote_only_on_miss_threshold_bytes = 0; }; struct IOContext { @@ -97,6 +120,8 @@ struct IOContext { // if `is_warmup` == true, this I/O request is from a warm up task bool is_warmup {false}; int64_t condition_cache_filtered_rows = 0; + FileCacheMissPolicy file_cache_miss_policy = FileCacheMissPolicy::READ_THROUGH_AND_WRITE_BACK; + RemoteScanCacheWriteLimiter* remote_scan_cache_write_limiter = nullptr; // Ref }; } // namespace io diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index 4e1fa2ab9ecab8..3da8523c25ba8a 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -29,6 +29,7 @@ #include #include +#include "cloud/config.h" #include "common/logging.h" #include "common/status.h" #include "exec/operator/rec_cte_scan_operator.h" @@ -36,6 +37,7 @@ #include 
"exec/pipeline/pipeline_fragment_context.h" #include "exec/runtime_filter/runtime_filter_definitions.h" #include "exec/spill/spill_file_manager.h" +#include "io/cache/remote_scan_cache_write_limiter.h" #include "runtime/exec_env.h" #include "runtime/fragment_mgr.h" #include "runtime/memory/heap_profiler.h" @@ -123,6 +125,16 @@ QueryContext::QueryContext(TUniqueId query_id, ExecEnv* exec_env, _query_id, query_options.file_cache_query_limit_percent); } + const bool initialize_remote_scan_cache_write_limiter = + config::is_cloud_mode() && config::enable_file_cache && + query_options.__isset.file_cache_query_limit_bytes && + query_options.file_cache_query_limit_bytes >= 0 && + query_options.query_type == TQueryType::SELECT; + if (initialize_remote_scan_cache_write_limiter) { + _remote_scan_cache_write_limiter = std::make_unique( + _query_id, query_options.file_cache_query_limit_bytes); + } + bool is_query_type_valid = query_options.query_type == TQueryType::SELECT || query_options.query_type == TQueryType::LOAD || query_options.query_type == TQueryType::EXTERNAL; diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h index 3327d48076f5ad..ce2f48dff003a4 100644 --- a/be/src/runtime/query_context.h +++ b/be/src/runtime/query_context.h @@ -46,6 +46,10 @@ namespace doris { +namespace io { +class RemoteScanCacheWriteLimiter; +} // namespace io + class PipelineFragmentContext; class PipelineTask; class QueryTaskController; @@ -246,6 +250,10 @@ class QueryContext : public std::enable_shared_from_this { std::shared_ptr resource_ctx() { return _resource_ctx; } + io::RemoteScanCacheWriteLimiter* remote_scan_cache_write_limiter() const { + return _remote_scan_cache_write_limiter.get(); + } + // plan node id -> TFileScanRangeParams // only for file scan node std::map file_scan_range_params_map; @@ -398,6 +406,7 @@ class QueryContext : public std::enable_shared_from_this { std::map, RecCTEScanLocalState*> _cte_scan; std::mutex _cte_scan_lock; std::shared_ptr 
_mem_arb = nullptr; + std::unique_ptr _remote_scan_cache_write_limiter; public: // when fragment of pipeline is closed, it will register its profile to this map by using add_fragment_profile diff --git a/be/src/storage/compaction/compaction.cpp b/be/src/storage/compaction/compaction.cpp index df2fee8b1146d8..f27f55d309661d 100644 --- a/be/src/storage/compaction/compaction.cpp +++ b/be/src/storage/compaction/compaction.cpp @@ -39,6 +39,7 @@ #include "cloud/cloud_meta_mgr.h" #include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet.h" +#include "cloud/config.h" #include "cloud/pb_convert.h" #include "common/config.h" #include "common/metrics/doris_metrics.h" @@ -2020,6 +2021,10 @@ int64_t CloudCompactionMixin::num_input_rowsets() const { } bool CloudCompactionMixin::should_cache_compaction_output() { + if (config::enable_file_cache_write_index_file_only) { + return false; + } + if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION) { return true; } diff --git a/be/src/storage/index/bloom_filter/bloom_filter_index_reader.cpp b/be/src/storage/index/bloom_filter/bloom_filter_index_reader.cpp index b8c1c9b37440ef..2d0a9a0ab8b6a7 100644 --- a/be/src/storage/index/bloom_filter/bloom_filter_index_reader.cpp +++ b/be/src/storage/index/bloom_filter/bloom_filter_index_reader.cpp @@ -32,10 +32,11 @@ namespace doris::segment_v2 { Status BloomFilterIndexReader::load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { // TODO yyq: implement a new once flag to avoid status construct. 
- return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats] { - return _load(use_page_cache, kept_in_memory, index_load_stats); + return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats, io_ctx] { + return _load(use_page_cache, kept_in_memory, index_load_stats, io_ctx); }); } @@ -45,21 +46,24 @@ int64_t BloomFilterIndexReader::get_metadata_size() const { } Status BloomFilterIndexReader::_load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { const IndexedColumnMetaPB& bf_index_meta = _bloom_filter_index_meta->bloom_filter(); _bloom_filter_reader = std::make_unique(_file_reader, bf_index_meta); - RETURN_IF_ERROR(_bloom_filter_reader->load(use_page_cache, kept_in_memory, index_load_stats)); + RETURN_IF_ERROR( + _bloom_filter_reader->load(use_page_cache, kept_in_memory, index_load_stats, io_ctx)); update_metadata_size(); return Status::OK(); } Status BloomFilterIndexReader::new_iterator(std::unique_ptr* iterator, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { DBUG_EXECUTE_IF("BloomFilterIndexReader::new_iterator.fail", { return Status::InternalError("new_iterator for bloom filter index failed"); }); - *iterator = std::make_unique(this, index_load_stats); + *iterator = std::make_unique(this, index_load_stats, io_ctx); return Status::OK(); } diff --git a/be/src/storage/index/bloom_filter/bloom_filter_index_reader.h b/be/src/storage/index/bloom_filter/bloom_filter_index_reader.h index dc0b78f16c3c60..4395865174447b 100644 --- a/be/src/storage/index/bloom_filter/bloom_filter_index_reader.h +++ b/be/src/storage/index/bloom_filter/bloom_filter_index_reader.h @@ -31,6 +31,9 @@ #include "util/once.h" namespace doris { +namespace io { +struct IOContext; +} namespace segment_v2 { @@ -46,19 +49,21 @@ class BloomFilterIndexReader : public MetadataAdder { 
_bloom_filter_index_meta.reset(new BloomFilterIndexPB(bloom_filter_index_meta)); } - Status load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* bf_index_load_stats); + Status load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* bf_index_load_stats, + const io::IOContext* io_ctx = nullptr); BloomFilterAlgorithmPB algorithm() { return _bloom_filter_index_meta->algorithm(); } // create a new column iterator. Status new_iterator(std::unique_ptr* iterator, - OlapReaderStatistics* index_load_stats); + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx = nullptr); FieldType type() const { return FieldType::OLAP_FIELD_TYPE_VARCHAR; } private: - Status _load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats); + Status _load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx); int64_t get_metadata_size() const override; @@ -73,8 +78,10 @@ class BloomFilterIndexReader : public MetadataAdder { class BloomFilterIndexIterator { public: - explicit BloomFilterIndexIterator(BloomFilterIndexReader* reader, OlapReaderStatistics* stats) - : _reader(reader), _bloom_filter_iter(reader->_bloom_filter_reader.get(), stats) {} + explicit BloomFilterIndexIterator(BloomFilterIndexReader* reader, OlapReaderStatistics* stats, + const io::IOContext* io_ctx) + : _reader(reader), + _bloom_filter_iter(reader->_bloom_filter_reader.get(), stats, io_ctx) {} // Read bloom filter at the given ordinal into `bf`. 
Status read_bloom_filter(rowid_t ordinal, std::unique_ptr* bf); diff --git a/be/src/storage/index/indexed_column_reader.cpp b/be/src/storage/index/indexed_column_reader.cpp index d30c2c82dbc41b..29510f21452c2e 100644 --- a/be/src/storage/index/indexed_column_reader.cpp +++ b/be/src/storage/index/indexed_column_reader.cpp @@ -60,7 +60,8 @@ int64_t IndexedColumnReader::get_metadata_size() const { } Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { _use_page_cache = use_page_cache; _kept_in_memory = kept_in_memory; @@ -78,7 +79,7 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, } else { RETURN_IF_ERROR(load_index_page(_meta.ordinal_index_meta().root_page(), &_ordinal_index_page_handle, - _ordinal_index_reader.get(), index_load_stats)); + _ordinal_index_reader.get(), index_load_stats, io_ctx)); _has_index_page = true; } } @@ -90,7 +91,7 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, } else { RETURN_IF_ERROR(load_index_page(_meta.value_index_meta().root_page(), &_value_index_page_handle, _value_index_reader.get(), - index_load_stats)); + index_load_stats, io_ctx)); _has_index_page = true; } } @@ -102,13 +103,14 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* handle, IndexPageReader* reader, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { Slice body; PageFooterPB footer; BlockCompressionCodec* local_compress_codec; RETURN_IF_ERROR(get_block_compression_codec(_meta.compression(), &local_compress_codec)); RETURN_IF_ERROR(read_page(PagePointer(pp), handle, &body, &footer, INDEX_PAGE, - local_compress_codec, false, index_load_stats)); + local_compress_codec, false, index_load_stats, io_ctx)); 
RETURN_IF_ERROR(reader->parse(body, footer.index_page_footer())); _mem_size += body.get_size(); return Status::OK(); @@ -117,11 +119,14 @@ Status IndexedColumnReader::load_index_page(const PagePointerPB& pp, PageHandle* Status IndexedColumnReader::read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, PageTypePB type, BlockCompressionCodec* codec, bool pre_decode, - OlapReaderStatistics* stats) const { + OlapReaderStatistics* stats, + const io::IOContext* io_ctx) const { OlapReaderStatistics tmp_stats; OlapReaderStatistics* stats_ptr = stats != nullptr ? stats : &tmp_stats; - PageReadOptions opts(io::IOContext {.is_index_data = true, - .file_cache_stats = &stats_ptr->file_cache_stats}); + io::IOContext page_io_ctx = io_ctx != nullptr ? *io_ctx : io::IOContext {}; + page_io_ctx.is_index_data = true; + page_io_ctx.file_cache_stats = &stats_ptr->file_cache_stats; + PageReadOptions opts(page_io_ctx); opts.use_page_cache = _use_page_cache; opts.kept_in_memory = _kept_in_memory; opts.pre_decode = pre_decode; @@ -158,7 +163,7 @@ Status IndexedColumnIterator::_read_data_page(const PagePointer& pp) { Slice body; PageFooterPB footer; RETURN_IF_ERROR(_reader->read_page(pp, &handle, &body, &footer, DATA_PAGE, _compress_codec, - true, _stats)); + true, _stats, _io_ctx)); // parse data page // note that page_index is not used in IndexedColumnIterator, so we pass 0 PageDecoderOptions opts; diff --git a/be/src/storage/index/indexed_column_reader.h b/be/src/storage/index/indexed_column_reader.h index 1cea4641595dc8..b1562409582679 100644 --- a/be/src/storage/index/indexed_column_reader.h +++ b/be/src/storage/index/indexed_column_reader.h @@ -40,6 +40,9 @@ namespace doris { class KeyCoder; class BlockCompressionCodec; +namespace io { +struct IOContext; +} namespace segment_v2 { @@ -57,12 +60,14 @@ class IndexedColumnReader : public MetadataAdder { ~IndexedColumnReader() override; Status load(bool use_page_cache, bool kept_in_memory, - 
OlapReaderStatistics* index_load_stats = nullptr); + OlapReaderStatistics* index_load_stats = nullptr, + const io::IOContext* io_ctx = nullptr); // read a page specified by `pp' from `file' into `handle' Status read_page(const PagePointer& pp, PageHandle* handle, Slice* body, PageFooterPB* footer, PageTypePB type, BlockCompressionCodec* codec, bool pre_decode, - OlapReaderStatistics* stats = nullptr) const; + OlapReaderStatistics* stats = nullptr, + const io::IOContext* io_ctx = nullptr) const; int64_t num_values() const { return _num_values; } const EncodingInfo* encoding_info() const { return _encoding_info; } @@ -76,7 +81,7 @@ class IndexedColumnReader : public MetadataAdder { private: Status load_index_page(const PagePointerPB& pp, PageHandle* handle, IndexPageReader* reader, - OlapReaderStatistics* index_load_stats); + OlapReaderStatistics* index_load_stats, const io::IOContext* io_ctx); int64_t get_metadata_size() const override; @@ -108,11 +113,13 @@ class IndexedColumnReader : public MetadataAdder { class IndexedColumnIterator { public: explicit IndexedColumnIterator(const IndexedColumnReader* reader, - OlapReaderStatistics* stats = nullptr) + OlapReaderStatistics* stats = nullptr, + const io::IOContext* io_ctx = nullptr) : _reader(reader), _ordinal_iter(reader->_ordinal_index_reader.get()), _value_iter(reader->_value_index_reader.get()), - _stats(stats) {} + _stats(stats), + _io_ctx(io_ctx) {} // Seek to the given ordinal entry. Entry 0 is the first entry. // Return Status::Error if provided seek point is past the end. 
@@ -162,6 +169,7 @@ class IndexedColumnIterator { // iterator owned compress codec, should NOT be shared by threads, initialized before used BlockCompressionCodec* _compress_codec = nullptr; OlapReaderStatistics* _stats = nullptr; + const io::IOContext* _io_ctx = nullptr; }; } // namespace segment_v2 diff --git a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp index e65025b25a4fc7..30f168e8b14e02 100644 --- a/be/src/storage/index/inverted/inverted_index_fs_directory.cpp +++ b/be/src/storage/index/inverted/inverted_index_fs_directory.cpp @@ -184,10 +184,14 @@ void DorisFSDirectory::FSIndexInput::setIoContext(const void* io_ctx) { _io_ctx.reader_type = ctx->reader_type; _io_ctx.query_id = ctx->query_id; _io_ctx.file_cache_stats = ctx->file_cache_stats; + _io_ctx.file_cache_miss_policy = ctx->file_cache_miss_policy; + _io_ctx.remote_scan_cache_write_limiter = ctx->remote_scan_cache_write_limiter; } else { _io_ctx.reader_type = ReaderType::UNKNOWN; _io_ctx.query_id = nullptr; _io_ctx.file_cache_stats = nullptr; + _io_ctx.file_cache_miss_policy = io::FileCacheMissPolicy::READ_THROUGH_AND_WRITE_BACK; + _io_ctx.remote_scan_cache_write_limiter = nullptr; } } diff --git a/be/src/storage/index/inverted/util/term_position_iterator.h b/be/src/storage/index/inverted/util/term_position_iterator.h index 82cbd5b63ee634..ed8787177b24f2 100644 --- a/be/src/storage/index/inverted/util/term_position_iterator.h +++ b/be/src/storage/index/inverted/util/term_position_iterator.h @@ -61,4 +61,4 @@ class TermPositionsIterator : public TermIterator { TermPositions* term_poss_ = nullptr; }; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/src/storage/index/ordinal_page_index.cpp b/be/src/storage/index/ordinal_page_index.cpp index 054f30f67f2445..688657f575579d 100644 --- a/be/src/storage/index/ordinal_page_index.cpp +++ 
b/be/src/storage/index/ordinal_page_index.cpp @@ -71,16 +71,17 @@ Status OrdinalIndexWriter::finish(io::FileWriter* file_writer, ColumnIndexMetaPB } Status OrdinalIndexReader::load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { // TODO yyq: implement a new once flag to avoid status construct. - return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats] { - return _load(use_page_cache, kept_in_memory, std::move(_meta_pb), index_load_stats); + return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats, io_ctx] { + return _load(use_page_cache, kept_in_memory, std::move(_meta_pb), index_load_stats, io_ctx); }); } Status OrdinalIndexReader::_load(bool use_page_cache, bool kept_in_memory, std::unique_ptr index_meta, - OlapReaderStatistics* stats) { + OlapReaderStatistics* stats, const io::IOContext* io_ctx) { if (index_meta->root_page().is_root_data_page()) { // only one data page, no index page _num_pages = 1; @@ -92,8 +93,10 @@ Status OrdinalIndexReader::_load(bool use_page_cache, bool kept_in_memory, // need to read index page OlapReaderStatistics tmp_stats; OlapReaderStatistics* stats_ptr = stats != nullptr ? stats : &tmp_stats; - PageReadOptions opts(io::IOContext {.is_index_data = true, - .file_cache_stats = &stats_ptr->file_cache_stats}); + io::IOContext page_io_ctx = io_ctx != nullptr ? 
*io_ctx : io::IOContext {}; + page_io_ctx.is_index_data = true; + page_io_ctx.file_cache_stats = &stats_ptr->file_cache_stats; + PageReadOptions opts(page_io_ctx); opts.use_page_cache = use_page_cache; opts.kept_in_memory = kept_in_memory; opts.type = INDEX_PAGE; diff --git a/be/src/storage/index/ordinal_page_index.h b/be/src/storage/index/ordinal_page_index.h index f17d80c20ed120..87bc4dc31775d7 100644 --- a/be/src/storage/index/ordinal_page_index.h +++ b/be/src/storage/index/ordinal_page_index.h @@ -36,7 +36,8 @@ namespace doris { namespace io { class FileWriter; -} +struct IOContext; +} // namespace io namespace segment_v2 { class ColumnIndexMetaPB; @@ -75,7 +76,8 @@ class OrdinalIndexReader : public MetadataAdder { virtual ~OrdinalIndexReader(); // load and parse the index page into memory - Status load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats); + Status load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx = nullptr); // the returned iter points to the largest element which is less than `ordinal`, // or points to the first element if all elements are greater than `ordinal`, @@ -94,8 +96,8 @@ class OrdinalIndexReader : public MetadataAdder { private: Status _load(bool use_page_cache, bool kept_in_memory, - std::unique_ptr index_meta, - OlapReaderStatistics* index_load_stats); + std::unique_ptr index_meta, OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx); private: friend class OrdinalPageIndexIterator; diff --git a/be/src/storage/index/primary_key_index.cpp b/be/src/storage/index/primary_key_index.cpp index 654584948e7724..4f7e34e443f514 100644 --- a/be/src/storage/index/primary_key_index.cpp +++ b/be/src/storage/index/primary_key_index.cpp @@ -33,6 +33,19 @@ namespace doris { +namespace { +io::IOContext create_index_io_context(const io::IOContext* source, OlapReaderStatistics* stats) { + io::IOContext io_ctx; + if (source != nullptr) { + 
io_ctx = *source; + } + io_ctx.is_index_data = true; + io_ctx.is_inverted_index = false; + io_ctx.file_cache_stats = stats ? &stats->file_cache_stats : nullptr; + return io_ctx; +} +} // namespace + static bvar::Adder g_primary_key_index_memory_bytes("doris_primary_key_index_memory_bytes"); Status PrimaryKeyIndexBuilder::init() { @@ -96,12 +109,14 @@ Status PrimaryKeyIndexBuilder::finalize(segment_v2::PrimaryKeyIndexMetaPB* meta) Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta, - OlapReaderStatistics* pk_index_load_stats) { + OlapReaderStatistics* pk_index_load_stats, + const io::IOContext* source_io_ctx) { // parse primary key index _index_reader.reset(new segment_v2::IndexedColumnReader(file_reader, meta.primary_key_index())); _index_reader->set_is_pk_index(true); + auto io_ctx = create_index_io_context(source_io_ctx, pk_index_load_stats); RETURN_IF_ERROR(_index_reader->load(!config::disable_pk_storage_page_cache, false, - pk_index_load_stats)); + pk_index_load_stats, &io_ctx)); _index_parsed = true; return Status::OK(); @@ -109,13 +124,15 @@ Status PrimaryKeyIndexReader::parse_index(io::FileReaderSPtr file_reader, Status PrimaryKeyIndexReader::parse_bf(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta, - OlapReaderStatistics* pk_index_load_stats) { + OlapReaderStatistics* pk_index_load_stats, + const io::IOContext* source_io_ctx) { // parse bloom filter segment_v2::ColumnIndexMetaPB column_index_meta = meta.bloom_filter_index(); segment_v2::BloomFilterIndexReader bf_index_reader(std::move(file_reader), column_index_meta.bloom_filter_index()); + auto io_ctx = create_index_io_context(source_io_ctx, pk_index_load_stats); RETURN_IF_ERROR(bf_index_reader.load(!config::disable_pk_storage_page_cache, false, - pk_index_load_stats)); + pk_index_load_stats, &io_ctx)); std::unique_ptr bf_iter; RETURN_IF_ERROR(bf_index_reader.new_iterator(&bf_iter, pk_index_load_stats)); 
RETURN_IF_ERROR(bf_iter->read_bloom_filter(0, &_bf)); diff --git a/be/src/storage/index/primary_key_index.h b/be/src/storage/index/primary_key_index.h index fbb2e39c539c72..353688aea719c0 100644 --- a/be/src/storage/index/primary_key_index.h +++ b/be/src/storage/index/primary_key_index.h @@ -37,6 +37,7 @@ namespace doris { namespace io { class FileWriter; +struct IOContext; } // namespace io namespace segment_v2 { @@ -108,10 +109,12 @@ class PrimaryKeyIndexReader { Status parse_index(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta, - OlapReaderStatistics* pk_index_load_stats); + OlapReaderStatistics* pk_index_load_stats, + const io::IOContext* io_ctx = nullptr); Status parse_bf(io::FileReaderSPtr file_reader, const segment_v2::PrimaryKeyIndexMetaPB& meta, - OlapReaderStatistics* pk_index_load_stats); + OlapReaderStatistics* pk_index_load_stats, + const io::IOContext* io_ctx = nullptr); Status new_iterator(std::unique_ptr* index_iterator, OlapReaderStatistics* stats) const { diff --git a/be/src/storage/index/zone_map/zone_map_index.cpp b/be/src/storage/index/zone_map/zone_map_index.cpp index b12de9d03b6859..23e2e5a4a0feec 100644 --- a/be/src/storage/index/zone_map/zone_map_index.cpp +++ b/be/src/storage/index/zone_map/zone_map_index.cpp @@ -297,20 +297,22 @@ Status TypedZoneMapIndexWriter::finish(io::FileWriter* file_writer, } Status ZoneMapIndexReader::load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { // TODO yyq: implement a new once flag to avoid status construct. 
- return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats] { + return _load_once.call([this, use_page_cache, kept_in_memory, index_load_stats, io_ctx] { return _load(use_page_cache, kept_in_memory, std::move(_page_zone_maps_meta), - index_load_stats); + index_load_stats, io_ctx); }); } Status ZoneMapIndexReader::_load(bool use_page_cache, bool kept_in_memory, std::unique_ptr page_zone_maps_meta, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { IndexedColumnReader reader(_file_reader, *page_zone_maps_meta); - RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory, index_load_stats)); - IndexedColumnIterator iter(&reader, index_load_stats); + RETURN_IF_ERROR(reader.load(use_page_cache, kept_in_memory, index_load_stats, io_ctx)); + IndexedColumnIterator iter(&reader, index_load_stats, io_ctx); _page_zone_maps.resize(reader.num_values()); diff --git a/be/src/storage/index/zone_map/zone_map_index.h b/be/src/storage/index/zone_map/zone_map_index.h index 96d9a7a26e5919..80c6928f60bb07 100644 --- a/be/src/storage/index/zone_map/zone_map_index.h +++ b/be/src/storage/index/zone_map/zone_map_index.h @@ -38,6 +38,7 @@ namespace doris { namespace io { class FileWriter; +struct IOContext; } // namespace io namespace segment_v2 { @@ -182,7 +183,8 @@ class ZoneMapIndexReader : public MetadataAdder { // load all page zone maps into memory Status load(bool use_page_cache, bool kept_in_memory, - OlapReaderStatistics* index_load_stats = nullptr); + OlapReaderStatistics* index_load_stats = nullptr, + const io::IOContext* io_ctx = nullptr); const std::vector& page_zone_maps() const { return _page_zone_maps; } @@ -190,7 +192,7 @@ class ZoneMapIndexReader : public MetadataAdder { private: Status _load(bool use_page_cache, bool kept_in_memory, std::unique_ptr, - OlapReaderStatistics* index_load_stats); + OlapReaderStatistics* index_load_stats, const io::IOContext* io_ctx); int64_t 
get_metadata_size() const override; diff --git a/be/src/storage/rowset/beta_rowset.cpp b/be/src/storage/rowset/beta_rowset.cpp index 70950dfe065634..8c0555037856dd 100644 --- a/be/src/storage/rowset/beta_rowset.cpp +++ b/be/src/storage/rowset/beta_rowset.cpp @@ -75,10 +75,11 @@ Status BetaRowset::init() { namespace { Status load_segment_rows_from_footer(BetaRowsetSharedPtr rowset, std::vector* segment_rows, bool enable_segment_cache, - OlapReaderStatistics* read_stats) { + OlapReaderStatistics* read_stats, + const io::IOContext* io_ctx) { SegmentCacheHandle segment_cache_handle; RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( - rowset, &segment_cache_handle, enable_segment_cache, false, read_stats)); + rowset, &segment_cache_handle, enable_segment_cache, false, read_stats, io_ctx)); for (const auto& segment : segment_cache_handle.get_segments()) { segment_rows->emplace_back(segment->num_rows()); } @@ -106,14 +107,14 @@ Status check_segment_rows_consistency(const std::vector& rows_from_met } // namespace Status BetaRowset::get_segment_num_rows(std::vector* segment_rows, - bool enable_segment_cache, - OlapReaderStatistics* read_stats) { + bool enable_segment_cache, OlapReaderStatistics* read_stats, + const io::IOContext* io_ctx) { #ifndef BE_TEST // `ROWSET_UNLOADING` is state for closed() called but owned by some readers. // So here `ROWSET_UNLOADING` is allowed. 
DCHECK_NE(_rowset_state_machine.rowset_state(), ROWSET_UNLOADED); #endif - RETURN_IF_ERROR(_load_segment_rows_once.call([this, enable_segment_cache, read_stats] { + RETURN_IF_ERROR(_load_segment_rows_once.call([this, enable_segment_cache, read_stats, io_ctx] { auto segment_count = num_segments(); if (segment_count == 0) { return Status::OK(); @@ -130,7 +131,7 @@ Status BetaRowset::get_segment_num_rows(std::vector* segment_rows, std::vector rows_from_footer; auto self = std::dynamic_pointer_cast(shared_from_this()); auto load_status = load_segment_rows_from_footer( - self, &rows_from_footer, enable_segment_cache, read_stats); + self, &rows_from_footer, enable_segment_cache, read_stats, io_ctx); if (load_status.ok()) { return check_segment_rows_consistency( _segments_rows, rows_from_footer, _rowset_meta->tablet_id(), @@ -162,7 +163,7 @@ Status BetaRowset::get_segment_num_rows(std::vector* segment_rows, TEST_SYNC_POINT("BetaRowset::get_segment_num_rows:load_from_segment_footer"); auto self = std::dynamic_pointer_cast(shared_from_this()); return load_segment_rows_from_footer(self, &_segments_rows, enable_segment_cache, - read_stats); + read_stats, io_ctx); })); segment_rows->assign(_segments_rows.cbegin(), _segments_rows.cend()); return Status::OK(); @@ -258,7 +259,8 @@ Status BetaRowset::load_segments(int64_t seg_id_begin, int64_t seg_id_end, } Status BetaRowset::load_segment(int64_t seg_id, OlapReaderStatistics* stats, - segment_v2::SegmentSharedPtr* segment) { + segment_v2::SegmentSharedPtr* segment, + const io::IOContext* io_ctx) { auto fs = _rowset_meta->fs(); if (!fs) { return Status::Error("get fs failed"); @@ -278,7 +280,7 @@ Status BetaRowset::load_segment(int64_t seg_id, OlapReaderStatistics* stats, auto s = segment_v2::Segment::open( fs, seg_path, _rowset_meta->tablet_id(), static_cast(seg_id), rowset_id(), _schema, reader_options, segment, - _rowset_meta->inverted_index_file_info(static_cast(seg_id)), stats); + 
_rowset_meta->inverted_index_file_info(static_cast(seg_id)), stats, io_ctx); if (!s.ok()) { LOG(WARNING) << "failed to open segment. " << seg_path << " under rowset " << rowset_id() << " : " << s.to_string(); diff --git a/be/src/storage/rowset/beta_rowset.h b/be/src/storage/rowset/beta_rowset.h index a338d28984927b..8a2c13efe928a4 100644 --- a/be/src/storage/rowset/beta_rowset.h +++ b/be/src/storage/rowset/beta_rowset.h @@ -38,6 +38,7 @@ namespace doris { class BetaRowset; namespace io { +struct IOContext; class RemoteFileSystem; } // namespace io struct RowsetId; @@ -78,7 +79,8 @@ class BetaRowset final : public Rowset { std::vector* segments); Status load_segment(int64_t seg_id, OlapReaderStatistics* read_stats, - segment_v2::SegmentSharedPtr* segment); + segment_v2::SegmentSharedPtr* segment, + const io::IOContext* io_ctx = nullptr); Status get_segments_size(std::vector* segments_size); @@ -92,7 +94,8 @@ class BetaRowset final : public Rowset { rapidjson::Document::AllocatorType& allocator); Status get_segment_num_rows(std::vector* segment_rows, bool enable_segment_cache, - OlapReaderStatistics* read_stats); + OlapReaderStatistics* read_stats, + const io::IOContext* io_ctx = nullptr); protected: BetaRowset(const TabletSchemaSPtr& schema, const RowsetMetaSharedPtr& rowset_meta, diff --git a/be/src/storage/rowset/beta_rowset_reader.cpp b/be/src/storage/rowset/beta_rowset_reader.cpp index 56b580e152d20d..ade12dbf79b04f 100644 --- a/be/src/storage/rowset/beta_rowset_reader.cpp +++ b/be/src/storage/rowset/beta_rowset_reader.cpp @@ -33,6 +33,7 @@ #include "core/block/block.h" #include "io/io_common.h" #include "runtime/descriptors.h" +#include "runtime/query_context.h" #include "runtime/runtime_profile.h" #include "storage/binlog.h" #include "storage/delete/delete_handler.h" @@ -238,6 +239,11 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context _read_context->runtime_state->query_options().enable_file_cache; 
_read_options.io_ctx.is_disposable = _read_context->runtime_state->query_options().disable_file_cache; + auto* query_ctx = _read_context->runtime_state->get_query_ctx(); + if (_read_context->reader_type == ReaderType::READER_QUERY && query_ctx != nullptr) { + _read_options.io_ctx.remote_scan_cache_write_limiter = + query_ctx->remote_scan_cache_write_limiter(); + } } if (_read_context->condition_cache_digest) { diff --git a/be/src/storage/rowset/beta_rowset_writer.cpp b/be/src/storage/rowset/beta_rowset_writer.cpp index e614028447e17c..197bee4cdfef77 100644 --- a/be/src/storage/rowset/beta_rowset_writer.cpp +++ b/be/src/storage/rowset/beta_rowset_writer.cpp @@ -43,6 +43,7 @@ #include "core/block/block.h" #include "core/column/column.h" #include "core/data_type/data_type_factory.hpp" +#include "cpp/sync_point.h" #include "io/fs/file_reader.h" #include "io/fs/file_system.h" #include "io/fs/file_writer.h" @@ -201,6 +202,7 @@ Status SegmentFileCollection::close() { if (writer->state() != io::FileWriter::State::CLOSED) { RETURN_IF_ERROR(writer->close()); } + TEST_SYNC_POINT_CALLBACK("SegmentFileCollection::close_file_writer", writer.get()); } return Status::OK(); @@ -1126,8 +1128,8 @@ Status BaseBetaRowsetWriter::_build_tmp(RowsetSharedPtr& rowset_ptr) { Status BaseBetaRowsetWriter::_create_file_writer(const std::string& path, io::FileWriterPtr& file_writer, - bool is_index_file) { - io::FileWriterOptions opts = _context.get_file_writer_options(is_index_file); + FileType file_type) { + io::FileWriterOptions opts = _context.get_file_writer_options(file_type); Status st = _context.fs()->create_file(path, &file_writer, &opts); if (!st.ok()) { LOG(WARNING) << "failed to create writable file. 
path=" << path << ", err: " << st; @@ -1135,6 +1137,8 @@ Status BaseBetaRowsetWriter::_create_file_writer(const std::string& path, } DCHECK(file_writer != nullptr); + TEST_SYNC_POINT_CALLBACK("BaseBetaRowsetWriter::_create_file_writer", &path, &file_type, + file_writer.get(), &opts); return Status::OK(); } @@ -1145,9 +1149,9 @@ Status BaseBetaRowsetWriter::create_file_writer(uint32_t segment_id, io::FileWri std::string prefix = std::string {InvertedIndexDescriptor::get_index_file_path_prefix(segment_path)}; std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(prefix); - return _create_file_writer(index_path, file_writer, true /* is_index_file */); + return _create_file_writer(index_path, file_writer, file_type); } else if (file_type == FileType::SEGMENT_FILE) { - return _create_file_writer(segment_path, file_writer, false /* is_index_file */); + return _create_file_writer(segment_path, file_writer, file_type); } return Status::Error( fmt::format("failed to create file = {}, file type = {}", segment_path, file_type)); @@ -1158,7 +1162,9 @@ Status BaseBetaRowsetWriter::create_index_file_writer(uint32_t segment_id, RETURN_IF_ERROR(RowsetWriter::create_index_file_writer(segment_id, index_file_writer)); // used for inverted index format v1 (*index_file_writer) - ->set_file_writer_opts(_context.get_file_writer_options(true /* is_index_file */)); + ->set_file_writer_opts(_context.get_file_writer_options(FileType::INVERTED_INDEX_FILE)); + TEST_SYNC_POINT_CALLBACK("BaseBetaRowsetWriter::create_inverted_index_file_writer", &segment_id, + index_file_writer->get()); return Status::OK(); } @@ -1168,7 +1174,7 @@ Status BetaRowsetWriter::create_segment_writer_for_segcompaction( std::string path = BetaRowset::local_segment_path_segcompacted(_context.tablet_path, _context.rowset_id, begin, end); io::FileWriterPtr file_writer; - RETURN_IF_ERROR(_create_file_writer(path, file_writer, false /* is_index_file */)); + RETURN_IF_ERROR(_create_file_writer(path, 
file_writer, FileType::SEGMENT_FILE)); IndexFileWriterPtr index_file_writer; if (_context.tablet_schema->has_inverted_index() || _context.tablet_schema->has_ann_index()) { @@ -1177,13 +1183,15 @@ Status BetaRowsetWriter::create_segment_writer_for_segcompaction( if (_context.tablet_schema->get_inverted_index_storage_format() != InvertedIndexStorageFormatPB::V1) { std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(prefix); - RETURN_IF_ERROR( - _create_file_writer(index_path, idx_file_writer, true /* is_index_file */)); + RETURN_IF_ERROR(_create_file_writer(index_path, idx_file_writer, + FileType::INVERTED_INDEX_FILE)); } index_file_writer = std::make_unique( _context.fs(), prefix, _context.rowset_id.to_string(), _num_segcompacted, _context.tablet_schema->get_inverted_index_storage_format(), std::move(idx_file_writer), true /* can_use_ram_dir */, _context.tablet_id); + index_file_writer->set_file_writer_opts( + _context.get_file_writer_options(FileType::INVERTED_INDEX_FILE)); } segment_v2::SegmentWriterOptions writer_options; diff --git a/be/src/storage/rowset/beta_rowset_writer.h b/be/src/storage/rowset/beta_rowset_writer.h index a7f044c10e74a2..1e6dc235c88a26 100644 --- a/be/src/storage/rowset/beta_rowset_writer.h +++ b/be/src/storage/rowset/beta_rowset_writer.h @@ -211,7 +211,7 @@ class BaseBetaRowsetWriter : public RowsetWriter { Status _generate_delete_bitmap(int32_t segment_id); virtual Status _build_rowset_meta(RowsetMeta* rowset_meta, bool check_segment_num = false); Status _create_file_writer(const std::string& path, io::FileWriterPtr& file_writer, - bool is_index_file = false); + FileType file_type = FileType::SEGMENT_FILE); virtual Status _close_file_writers(); virtual Status _check_segment_number_limit(size_t segnum); virtual int64_t _num_seg() const; diff --git a/be/src/storage/rowset/rowset_writer_context.h b/be/src/storage/rowset/rowset_writer_context.h index 58dd12fc1ffa8a..58de04f8f26a5f 100644 --- 
a/be/src/storage/rowset/rowset_writer_context.h +++ b/be/src/storage/rowset/rowset_writer_context.h @@ -247,17 +247,26 @@ struct RowsetWriterContext { io::FileSystem& fs_ref() const { return *fs(); } - io::FileWriterOptions get_file_writer_options(bool is_index_file = false) { - bool should_write_cache = write_file_cache; - // If configured to only write index files to cache, skip cache for data files - if (compaction_output_write_index_only && !is_index_file) { - should_write_cache = false; + io::FileWriterOptions get_file_writer_options(FileType file_type = FileType::SEGMENT_FILE) { + io::FileWriterOptions opts {.write_file_cache = write_file_cache, + .is_cold_data = is_hot_data, + .file_cache_expiration_time = file_cache_ttl_sec, + .approximate_bytes_to_write = approximate_bytes_to_write}; + + if (config::enable_file_cache_write_index_file_only) { + opts.allow_adaptive_file_cache_write = false; + opts.approximate_bytes_to_write = 0; + opts.write_file_cache = file_type == FileType::INVERTED_INDEX_FILE; + return opts; } - return io::FileWriterOptions {.write_file_cache = should_write_cache, - .is_cold_data = is_hot_data, - .file_cache_expiration_time = file_cache_ttl_sec, - .approximate_bytes_to_write = approximate_bytes_to_write}; + if (compaction_output_write_index_only && file_type == FileType::SEGMENT_FILE) { + opts.write_file_cache = false; + opts.allow_adaptive_file_cache_write = false; + opts.approximate_bytes_to_write = 0; + } + + return opts; } struct BinlogOptions { diff --git a/be/src/storage/rowset/segment_creator.cpp b/be/src/storage/rowset/segment_creator.cpp index 37b847ce216fd3..d320d31256bd34 100644 --- a/be/src/storage/rowset/segment_creator.cpp +++ b/be/src/storage/rowset/segment_creator.cpp @@ -40,10 +40,12 @@ #include "core/column/column_variant.h" #include "core/data_type/data_type.h" #include "core/types.h" +#include "cpp/sync_point.h" #include "io/fs/file_writer.h" #include "storage/olap_define.h" #include 
"storage/rowset/beta_rowset_writer.h" // SegmentStatistics #include "storage/segment/row_binlog_segment_writer.h" +#include "storage/segment/segment_index_file_cache_loader.h" #include "storage/segment/segment_writer.h" #include "storage/segment/vertical_segment_writer.h" #include "storage/tablet/tablet_schema.h" @@ -88,10 +90,27 @@ Status SegmentFlusher::flush_single_block(const Block* block, int32_t segment_id Status SegmentFlusher::close() { RETURN_IF_ERROR(_seg_files.close()); + RETURN_IF_ERROR(_preload_segment_indexes_to_file_cache()); RETURN_IF_ERROR(_idx_files.finish_close()); return Status::OK(); } +void SegmentFlusher::_record_segment_index_file_cache_preload( + uint32_t segment_id, const segment_v2::SegmentIndexFileCacheInfo& info) { + std::lock_guard lock(_segment_index_file_cache_preloads_lock); + _segment_index_file_cache_preloads.push_back({segment_id, info}); +} + +Status SegmentFlusher::_preload_segment_indexes_to_file_cache() { + std::vector tasks; + { + std::lock_guard lock(_segment_index_file_cache_preloads_lock); + tasks.swap(_segment_index_file_cache_preloads); + } + return segment_v2::SegmentIndexFileCacheLoader::preload_segment_indexes_to_file_cache(_context, + tasks); +} + Status SegmentFlusher::_add_rows(std::unique_ptr& segment_writer, const Block* block, size_t row_pos, size_t num_rows) { RETURN_IF_ERROR(segment_writer->append_block(block, row_pos, num_rows)); @@ -209,7 +228,8 @@ Status SegmentFlusher::_flush_segment_writer( finalize_timer.start(); uint64_t segment_file_size; uint64_t common_index_size; - Status s = writer->finalize(&segment_file_size, &common_index_size); + segment_v2::SegmentIndexFileCacheInfo index_file_cache_info; + Status s = writer->finalize(&segment_file_size, &common_index_size, &index_file_cache_info); finalize_timer.stop(); if (!s.ok()) { @@ -237,6 +257,7 @@ Status SegmentFlusher::_flush_segment_writer( key_bounds.set_max_key(max_key.to_string()); uint32_t segment_id = writer->segment_id(); + 
TEST_SYNC_POINT_CALLBACK("SegmentFlusher::flush_vertical_segment_writer", &segment_id); SegmentStatistics segstat; segstat.row_num = row_num; segstat.data_size = segment_file_size; @@ -244,6 +265,7 @@ Status SegmentFlusher::_flush_segment_writer( segstat.key_bounds = key_bounds; writer.reset(); + _record_segment_index_file_cache_preload(segment_id, index_file_cache_info); MonotonicStopWatch collector_timer; collector_timer.start(); @@ -287,7 +309,8 @@ Status SegmentFlusher::_flush_segment_writer(std::unique_ptrfinalize(&segment_file_size, &common_index_size); + segment_v2::SegmentIndexFileCacheInfo index_file_cache_info; + Status s = writer->finalize(&segment_file_size, &common_index_size, &index_file_cache_info); finalize_timer.stop(); if (!s.ok()) { @@ -322,6 +345,7 @@ Status SegmentFlusher::_flush_segment_writer(std::unique_ptr #include +#include +#include + #include "common/status.h" #include "core/block/block.h" #include "io/fs/file_reader_writer_fwd.h" #include "storage/index/index_file_writer.h" #include "storage/rowset/rowset_writer_context.h" +#include "storage/segment/segment_index_file_cache_loader.h" #include "storage/tablet/tablet_fwd.h" namespace doris { @@ -149,6 +153,9 @@ class SegmentFlusher { int64_t* flush_size = nullptr); Status _flush_segment_writer(std::unique_ptr& writer, int64_t* flush_size = nullptr); + void _record_segment_index_file_cache_preload( + uint32_t segment_id, const segment_v2::SegmentIndexFileCacheInfo& info); + Status _preload_segment_indexes_to_file_cache(); private: RowsetWriterContext& _context; @@ -161,6 +168,8 @@ class SegmentFlusher { std::atomic _num_rows_new_added = 0; std::atomic _num_rows_deleted = 0; std::atomic _num_rows_filtered = 0; + std::mutex _segment_index_file_cache_preloads_lock; + std::vector _segment_index_file_cache_preloads; }; class SegmentCreator { diff --git a/be/src/storage/rowset/vertical_beta_rowset_writer.cpp b/be/src/storage/rowset/vertical_beta_rowset_writer.cpp index 
260c6a87410cc9..154c1e546e47f2 100644 --- a/be/src/storage/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/storage/rowset/vertical_beta_rowset_writer.cpp @@ -32,11 +32,13 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" #include "core/block/block.h" +#include "cpp/sync_point.h" #include "io/fs/file_system.h" #include "io/fs/file_writer.h" #include "storage/rowset/beta_rowset.h" #include "storage/rowset/rowset_meta.h" #include "storage/rowset/rowset_writer_context.h" +#include "storage/segment/segment_index_file_cache_loader.h" #include "util/slice.h" namespace doris { @@ -172,7 +174,7 @@ Status VerticalBetaRowsetWriter::_create_segment_writer( IndexFileWriterPtr index_file_writer; if (context.tablet_schema->has_inverted_index() || context.tablet_schema->has_ann_index()) { - RETURN_IF_ERROR(RowsetWriter::create_index_file_writer(seg_id, &index_file_writer)); + RETURN_IF_ERROR(this->create_index_file_writer(seg_id, &index_file_writer)); } segment_v2::SegmentWriterOptions writer_options; @@ -205,13 +207,18 @@ Status VerticalBetaRowsetWriter::final_flush() { for (auto& segment_writer : _segment_writers) { uint64_t segment_size = 0; //uint64_t footer_position = 0; - auto st = segment_writer->finalize_footer(&segment_size); + segment_v2::SegmentIndexFileCacheInfo index_file_cache_info; + auto segment_id = segment_writer->get_segment_id(); + auto st = segment_writer->finalize_footer(&segment_size, &index_file_cache_info); if (!st.ok()) { LOG(WARNING) << "Fail to finalize segment footer, " << st; return st; } this->_total_data_size += segment_size; + TEST_SYNC_POINT_CALLBACK("VerticalBetaRowsetWriter::final_flush_segment_writer", + &segment_id); segment_writer.reset(); + _record_segment_index_file_cache_preload(segment_id, index_file_cache_info); } return Status::OK(); } @@ -220,7 +227,28 @@ template requires std::is_base_of_v Status VerticalBetaRowsetWriter::_close_file_writers() { 
RETURN_IF_ERROR(BaseBetaRowsetWriter::_close_inverted_index_file_writers()); - return this->_seg_files.close(); + RETURN_IF_ERROR(this->_seg_files.close()); + return _preload_segment_indexes_to_file_cache(); +} + +template + requires std::is_base_of_v +void VerticalBetaRowsetWriter::_record_segment_index_file_cache_preload( + uint32_t segment_id, const segment_v2::SegmentIndexFileCacheInfo& info) { + std::lock_guard lock(_segment_index_file_cache_preloads_lock); + _segment_index_file_cache_preloads.push_back({segment_id, info}); +} + +template + requires std::is_base_of_v +Status VerticalBetaRowsetWriter::_preload_segment_indexes_to_file_cache() { + std::vector tasks; + { + std::lock_guard lock(_segment_index_file_cache_preloads_lock); + tasks.swap(_segment_index_file_cache_preloads); + } + return segment_v2::SegmentIndexFileCacheLoader::preload_segment_indexes_to_file_cache( + this->_context, tasks); } } // namespace doris diff --git a/be/src/storage/rowset/vertical_beta_rowset_writer.h b/be/src/storage/rowset/vertical_beta_rowset_writer.h index a45952503509c7..7789dd98103797 100644 --- a/be/src/storage/rowset/vertical_beta_rowset_writer.h +++ b/be/src/storage/rowset/vertical_beta_rowset_writer.h @@ -18,11 +18,13 @@ #pragma once #include +#include #include #include #include "common/status.h" #include "storage/rowset/beta_rowset_writer.h" +#include "storage/segment/segment_index_file_cache_loader.h" #include "storage/segment/segment_writer.h" namespace doris { @@ -55,10 +57,15 @@ class VerticalBetaRowsetWriter : public T { Status _flush_columns(segment_v2::SegmentWriter* segment_writer, bool is_key = false); Status _create_segment_writer(const std::vector& column_ids, bool is_key, std::unique_ptr* writer); + void _record_segment_index_file_cache_preload( + uint32_t segment_id, const segment_v2::SegmentIndexFileCacheInfo& info); + Status _preload_segment_indexes_to_file_cache(); std::vector> _segment_writers; size_t _cur_writer_idx = 0; size_t _total_key_group_rows 
= 0; + std::mutex _segment_index_file_cache_preloads_lock; + std::vector _segment_index_file_cache_preloads; }; template diff --git a/be/src/storage/segment/column_reader.cpp b/be/src/storage/segment/column_reader.cpp index ebb1887c8ee920..6ac459cbbb97dd 100644 --- a/be/src/storage/segment/column_reader.cpp +++ b/be/src/storage/segment/column_reader.cpp @@ -572,7 +572,8 @@ Status ColumnReader::get_row_ranges_by_bloom_filter(const AndBlockColumnPredicat _load_bloom_filter_index(_use_index_page_cache, _opts.kept_in_memory, iter_opts)); RowRanges bf_row_ranges; std::unique_ptr bf_iter; - RETURN_IF_ERROR(_bloom_filter_index->new_iterator(&bf_iter, iter_opts.stats)); + RETURN_IF_ERROR( + _bloom_filter_index->new_iterator(&bf_iter, iter_opts.stats, &iter_opts.io_ctx)); size_t range_size = row_ranges->range_size(); // get covered page ids std::set page_ids; @@ -604,13 +605,14 @@ Status ColumnReader::_load_ordinal_index(bool use_page_cache, bool kept_in_memor if (!_ordinal_index) { return Status::InternalError("ordinal_index not inited"); } - return _ordinal_index->load(use_page_cache, kept_in_memory, iter_opts.stats); + return _ordinal_index->load(use_page_cache, kept_in_memory, iter_opts.stats, &iter_opts.io_ctx); } Status ColumnReader::_load_zone_map_index(bool use_page_cache, bool kept_in_memory, const ColumnIteratorOptions& iter_opts) { if (_zone_map_index != nullptr) { - return _zone_map_index->load(use_page_cache, kept_in_memory, iter_opts.stats); + return _zone_map_index->load(use_page_cache, kept_in_memory, iter_opts.stats, + &iter_opts.io_ctx); } return Status::OK(); } @@ -693,7 +695,8 @@ bool ColumnReader::has_bloom_filter_index(bool ngram) const { Status ColumnReader::_load_bloom_filter_index(bool use_page_cache, bool kept_in_memory, const ColumnIteratorOptions& iter_opts) { if (_bloom_filter_index != nullptr) { - return _bloom_filter_index->load(use_page_cache, kept_in_memory, iter_opts.stats); + return _bloom_filter_index->load(use_page_cache, kept_in_memory, 
iter_opts.stats, + &iter_opts.io_ctx); } return Status::OK(); } diff --git a/be/src/storage/segment/column_reader_cache.cpp b/be/src/storage/segment/column_reader_cache.cpp index 1c8ed7728ada81..a0cb0fc256ce1a 100644 --- a/be/src/storage/segment/column_reader_cache.cpp +++ b/be/src/storage/segment/column_reader_cache.cpp @@ -31,7 +31,8 @@ namespace doris::segment_v2 { ColumnReaderCache::ColumnReaderCache( ColumnMetaAccessor* accessor, TabletSchemaSPtr tablet_schema, io::FileReaderSPtr file_reader, uint64_t num_rows, - std::function&, OlapReaderStatistics*)> + std::function&, OlapReaderStatistics*, + const io::IOContext*)> get_footer_cb) : _accessor(accessor), _tablet_schema(std::move(tablet_schema)), @@ -94,7 +95,8 @@ std::map> ColumnReaderCache::get_availabl Status ColumnReaderCache::get_column_reader(int32_t col_uid, std::shared_ptr* column_reader, - OlapReaderStatistics* stats) { + OlapReaderStatistics* stats, + const io::IOContext* source_io_ctx) { // Attempt to find in cache if (auto cached = _lookup({col_uid, {}})) { *column_reader = cached; @@ -104,7 +106,7 @@ Status ColumnReaderCache::get_column_reader(int32_t col_uid, std::shared_ptr footer_pb_shared; { std::lock_guard lock(_cache_mutex); - RETURN_IF_ERROR(_get_footer_cb(footer_pb_shared, stats)); + RETURN_IF_ERROR(_get_footer_cb(footer_pb_shared, stats, source_io_ctx)); } // Lookup column meta by uid via ColumnMetaAccessor. If not initialized or not found, return NOT_FOUND. 
@@ -143,7 +145,8 @@ Status ColumnReaderCache::get_column_reader(int32_t col_uid, Status ColumnReaderCache::get_path_column_reader(int32_t col_uid, PathInData relative_path, std::shared_ptr* column_reader, OlapReaderStatistics* stats, - const SubcolumnColumnMetaInfo::Node* node_hint) { + const SubcolumnColumnMetaInfo::Node* node_hint, + const io::IOContext* source_io_ctx) { // Attempt to find in cache first if (auto cached = _lookup({col_uid, relative_path})) { *column_reader = cached; @@ -160,7 +163,7 @@ Status ColumnReaderCache::get_path_column_reader(int32_t col_uid, PathInData rel std::shared_ptr footer_pb_shared; { std::lock_guard lock(_cache_mutex); - RETURN_IF_ERROR(_get_footer_cb(footer_pb_shared, stats)); + RETURN_IF_ERROR(_get_footer_cb(footer_pb_shared, stats, source_io_ctx)); } // Ensure variant root reader is available in cache. @@ -168,7 +171,7 @@ Status ColumnReaderCache::get_path_column_reader(int32_t col_uid, PathInData rel .be_exec_version = _be_exec_version, .tablet_schema = _tablet_schema}; std::shared_ptr variant_column_reader; - RETURN_IF_ERROR(get_column_reader(col_uid, &variant_column_reader, stats)); + RETURN_IF_ERROR(get_column_reader(col_uid, &variant_column_reader, stats, source_io_ctx)); if (relative_path.empty()) { *column_reader = std::move(variant_column_reader); @@ -193,4 +196,4 @@ Status ColumnReaderCache::get_path_column_reader(int32_t col_uid, PathInData rel return Status::OK(); } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/src/storage/segment/column_reader_cache.h b/be/src/storage/segment/column_reader_cache.h index bc672c7b408d61..91398b9c85485a 100644 --- a/be/src/storage/segment/column_reader_cache.h +++ b/be/src/storage/segment/column_reader_cache.h @@ -22,6 +22,10 @@ #include "storage/tablet/tablet_fwd.h" #include "util/json/path_in_data.h" +namespace doris::io { +struct IOContext; +} // namespace doris::io + namespace doris::segment_v2 { class ColumnReader; 
@@ -45,11 +49,11 @@ class ColumnReaderCache { // Main constructor used in production: cache is bound to a specific segment's // ColumnMetaAccessor, TabletSchema, file reader and row count, plus a footer // getter callback (Segment::_get_segment_footer). - ColumnReaderCache( - ColumnMetaAccessor* accessor, TabletSchemaSPtr tablet_schema, - io::FileReaderSPtr file_reader, uint64_t num_rows, - std::function&, OlapReaderStatistics*)> - get_footer_cb); + ColumnReaderCache(ColumnMetaAccessor* accessor, TabletSchemaSPtr tablet_schema, + io::FileReaderSPtr file_reader, uint64_t num_rows, + std::function&, OlapReaderStatistics*, + const io::IOContext*)> + get_footer_cb); virtual ~ColumnReaderCache(); // Get all available readers // if include_subcolumns is true, return all available readers, including subcolumn readers @@ -58,13 +62,14 @@ class ColumnReaderCache { // Get column reader by column unique id Status get_column_reader(int32_t col_uid, std::shared_ptr* column_reader, - OlapReaderStatistics* stats); + OlapReaderStatistics* stats, const io::IOContext* io_ctx = nullptr); // Get column reader by column unique id and path(leaf node of variant's subcolumn) virtual Status get_path_column_reader(int32_t col_uid, PathInData relative_path, std::shared_ptr* column_reader, OlapReaderStatistics* stats, - const SubcolumnColumnMetaInfo::Node* node_hint = nullptr); + const SubcolumnColumnMetaInfo::Node* node_hint = nullptr, + const io::IOContext* io_ctx = nullptr); private: // Lookup function remains similar @@ -91,7 +96,9 @@ class ColumnReaderCache { io::FileReaderSPtr _file_reader; uint64_t _num_rows = 0; // Callback to get footer, usually bound to Segment::_get_segment_footer. 
- std::function&, OlapReaderStatistics*)> _get_footer_cb; + std::function&, OlapReaderStatistics*, + const io::IOContext*)> + _get_footer_cb; }; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/src/storage/segment/lazy_init_segment_iterator.cpp b/be/src/storage/segment/lazy_init_segment_iterator.cpp index 723233d4cbc210..8e61384556e9ea 100644 --- a/be/src/storage/segment/lazy_init_segment_iterator.cpp +++ b/be/src/storage/segment/lazy_init_segment_iterator.cpp @@ -41,7 +41,8 @@ Status LazyInitSegmentIterator::init(const StorageReadOptions& opts) { { SegmentCacheHandle segment_cache_handle; RETURN_IF_ERROR(SegmentLoader::instance()->load_segment( - _rowset, _segment_id, &segment_cache_handle, _should_use_cache, false, opts.stats)); + _rowset, _segment_id, &segment_cache_handle, _should_use_cache, false, opts.stats, + &opts.io_ctx)); const auto& tmp_segments = segment_cache_handle.get_segments(); segment = tmp_segments[0]; } diff --git a/be/src/storage/segment/segment.cpp b/be/src/storage/segment/segment.cpp index ca2d51498437b7..53e7bbf578b08f 100644 --- a/be/src/storage/segment/segment.cpp +++ b/be/src/storage/segment/segment.cpp @@ -85,16 +85,30 @@ namespace doris::segment_v2 { class InvertedIndexIterator; +namespace { +io::IOContext create_index_io_context(const io::IOContext* source, OlapReaderStatistics* stats) { + io::IOContext io_ctx; + if (source != nullptr) { + io_ctx = *source; + } + io_ctx.is_index_data = true; + io_ctx.is_inverted_index = false; + io_ctx.file_cache_stats = stats ? 
&stats->file_cache_stats : nullptr; + return io_ctx; +} +} // namespace + Status Segment::open(io::FileSystemSPtr fs, const std::string& path, int64_t tablet_id, uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, const io::FileReaderOptions& reader_options, std::shared_ptr* output, - InvertedIndexFileInfo idx_file_info, OlapReaderStatistics* stats) { + InvertedIndexFileInfo idx_file_info, OlapReaderStatistics* stats, + const io::IOContext* source_io_ctx) { // Ensure tablet_id is available in reader_options for CachedRemoteFileReader peer read. io::FileReaderOptions opts_with_tablet = reader_options; opts_with_tablet.tablet_id = tablet_id; auto s = _open(fs, path, segment_id, rowset_id, tablet_schema, opts_with_tablet, output, - idx_file_info, stats); + idx_file_info, stats, source_io_ctx); if (s.ok() && output && *output) { (*output)->_tablet_id = tablet_id; } @@ -115,7 +129,8 @@ Status Segment::open(io::FileSystemSPtr fs, const std::string& path, int64_t tab Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, const io::FileReaderOptions& reader_options, std::shared_ptr* output, - InvertedIndexFileInfo idx_file_info, OlapReaderStatistics* stats) { + InvertedIndexFileInfo idx_file_info, OlapReaderStatistics* stats, + const io::IOContext* source_io_ctx) { io::FileReaderSPtr file_reader; auto st = fs->open_file(path, &file_reader, &reader_options); TEST_INJECTION_POINT_CALLBACK("Segment::open:corruption", &st); @@ -125,7 +140,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t s if (st) { segment->_fs = fs; segment->_file_reader = std::move(file_reader); - st = segment->_open(stats); + st = segment->_open(stats, source_io_ctx); } // Three-tier retry for CORRUPTION errors when file cache is enabled. 
@@ -144,7 +159,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t s if (st) { segment->_fs = fs; segment->_file_reader = std::move(file_reader); - st = segment->_open(stats); + st = segment->_open(stats, source_io_ctx); } TEST_INJECTION_POINT_CALLBACK("Segment::open:corruption1", &st); if (st.is()) { // corrupt again @@ -160,7 +175,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const std::string& path, uint32_t s RETURN_IF_ERROR(fs->open_file(path, &file_reader, &opt)); segment->_fs = fs; segment->_file_reader = std::move(file_reader); - st = segment->_open(stats); + st = segment->_open(stats, source_io_ctx); if (!st.ok()) { // Tier 3: Remote source itself is corrupt. LOG(WARNING) << "failed to try to read remote source file directly," @@ -205,9 +220,9 @@ void Segment::update_metadata_size() { _tracked_meta_mem_usage = _meta_mem_usage; } -Status Segment::_open(OlapReaderStatistics* stats) { +Status Segment::_open(OlapReaderStatistics* stats, const io::IOContext* source_io_ctx) { std::shared_ptr footer_pb_shared; - RETURN_IF_ERROR(_get_segment_footer(footer_pb_shared, stats)); + RETURN_IF_ERROR(_get_segment_footer(footer_pb_shared, stats, source_io_ctx)); _pk_index_meta.reset( footer_pb_shared->has_primary_key_index_meta() @@ -267,7 +282,7 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o if (read_options.runtime_state != nullptr) { _be_exec_version = read_options.runtime_state->be_exec_version(); } - RETURN_IF_ERROR(_create_column_meta_once(read_options.stats)); + RETURN_IF_ERROR(_create_column_meta_once(read_options.stats, &read_options.io_ctx)); read_options.stats->total_segment_number++; // trying to prune the current segment by segment-level zone map @@ -279,7 +294,7 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o } const TabletColumn& col = read_options.tablet_schema->column(column_id); std::shared_ptr reader; - Status st = get_column_reader(col, &reader, 
read_options.stats); + Status st = get_column_reader(col, &reader, read_options.stats, &read_options.io_ctx); // not found in this segment, skip if (st.is()) { continue; @@ -329,7 +344,7 @@ Status Segment::new_iterator(SchemaSPtr schema, const StorageReadOptions& read_o { SCOPED_RAW_TIMER(&read_options.stats->segment_load_index_timer_ns); - RETURN_IF_ERROR(load_index(read_options.stats)); + RETURN_IF_ERROR(load_index(read_options.stats, &read_options.io_ctx)); } if (read_options.delete_condition_predicates->num_of_column_predicate() == 0 && @@ -435,8 +450,8 @@ Status Segment::_write_error_file(size_t file_size, size_t offset, size_t bytes_ return Status::OK(); // already exists }; -Status Segment::_parse_footer(std::shared_ptr& footer, - OlapReaderStatistics* stats) { +Status Segment::_parse_footer(std::shared_ptr& footer, OlapReaderStatistics* stats, + const io::IOContext* source_io_ctx) { // Footer := SegmentFooterPB, FooterPBSize(4), FooterPBChecksum(4), MagicNumber(4) auto file_size = _file_reader->size(); if (file_size < 12) { @@ -447,9 +462,8 @@ Status Segment::_parse_footer(std::shared_ptr& footer, uint8_t fixed_buf[12]; size_t bytes_read = 0; - // TODO(plat1ko): Support session variable `enable_file_cache` - io::IOContext io_ctx {.is_index_data = true, - .file_cache_stats = stats ? 
&stats->file_cache_stats : nullptr}; + auto io_ctx = create_index_io_context(source_io_ctx, stats); + TEST_SYNC_POINT_CALLBACK("Segment::_parse_footer::io_ctx", &io_ctx); RETURN_IF_ERROR( _file_reader->read_at(file_size - 12, Slice(fixed_buf, 12), &bytes_read, &io_ctx)); DCHECK_EQ(bytes_read, 12); @@ -522,7 +536,8 @@ Status Segment::_parse_footer(std::shared_ptr& footer, return Status::OK(); } -Status Segment::_load_pk_bloom_filter(OlapReaderStatistics* stats) { +Status Segment::_load_pk_bloom_filter(OlapReaderStatistics* stats, + const io::IOContext* source_io_ctx) { #ifdef BE_TEST if (_pk_index_meta == nullptr) { // for BE UT "segment_cache_test" @@ -537,37 +552,40 @@ Status Segment::_load_pk_bloom_filter(OlapReaderStatistics* stats) { DCHECK(_pk_index_meta != nullptr); DCHECK(_pk_index_reader != nullptr); - return _load_pk_bf_once.call([this, stats] { - RETURN_IF_ERROR(_pk_index_reader->parse_bf(_file_reader, *_pk_index_meta, stats)); + return _load_pk_bf_once.call([this, stats, source_io_ctx] { + RETURN_IF_ERROR( + _pk_index_reader->parse_bf(_file_reader, *_pk_index_meta, stats, source_io_ctx)); // _meta_mem_usage += _pk_index_reader->get_bf_memory_size(); return Status::OK(); }); } -Status Segment::load_pk_index_and_bf(OlapReaderStatistics* index_load_stats) { +Status Segment::load_pk_index_and_bf(OlapReaderStatistics* index_load_stats, + const io::IOContext* source_io_ctx) { // `DorisCallOnce` may catch exception in calling stack A and re-throw it in // a different calling stack B which doesn't have catch block. 
So we add catch block here // to prevent coreudmp RETURN_IF_CATCH_EXCEPTION({ - RETURN_IF_ERROR(load_index(index_load_stats)); - RETURN_IF_ERROR(_load_pk_bloom_filter(index_load_stats)); + RETURN_IF_ERROR(load_index(index_load_stats, source_io_ctx)); + RETURN_IF_ERROR(_load_pk_bloom_filter(index_load_stats, source_io_ctx)); }); return Status::OK(); } -Status Segment::load_index(OlapReaderStatistics* stats) { - return _load_index_once.call([this, stats] { +Status Segment::load_index(OlapReaderStatistics* stats, const io::IOContext* source_io_ctx) { + return _load_index_once.call([this, stats, source_io_ctx] { if (_tablet_schema->keys_type() == UNIQUE_KEYS && _pk_index_meta != nullptr) { _pk_index_reader = std::make_unique(); - RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta, stats)); + RETURN_IF_ERROR(_pk_index_reader->parse_index(_file_reader, *_pk_index_meta, stats, + source_io_ctx)); // _meta_mem_usage += _pk_index_reader->get_memory_size(); return Status::OK(); } else { // read and parse short key index page OlapReaderStatistics tmp_stats; OlapReaderStatistics* stats_ptr = stats != nullptr ? 
stats : &tmp_stats; - PageReadOptions opts(io::IOContext {.is_index_data = true, - .file_cache_stats = &stats_ptr->file_cache_stats}); + auto page_io_ctx = create_index_io_context(source_io_ctx, stats_ptr); + PageReadOptions opts(page_io_ctx); opts.use_page_cache = true; opts.type = INDEX_PAGE; opts.file_reader = _file_reader.get(); @@ -650,11 +668,12 @@ DataTypePtr Segment::get_data_type_of(const TabletColumn& column, return type; } -Status Segment::_create_column_meta_once(OlapReaderStatistics* stats) { +Status Segment::_create_column_meta_once(OlapReaderStatistics* stats, + const io::IOContext* source_io_ctx) { SCOPED_RAW_TIMER(&stats->segment_create_column_readers_timer_ns); - return _create_column_meta_once_call.call([&] { + return _create_column_meta_once_call.call([this, stats, source_io_ctx] { std::shared_ptr footer_pb_shared; - RETURN_IF_ERROR(_get_segment_footer(footer_pb_shared, stats)); + RETURN_IF_ERROR(_get_segment_footer(footer_pb_shared, stats, source_io_ctx)); return _create_column_meta(*footer_pb_shared); }); } @@ -681,8 +700,9 @@ Status Segment::_create_column_meta(const SegmentFooterPB& footer) { _column_reader_cache = std::make_unique( _column_meta_accessor.get(), _tablet_schema, _file_reader, _num_rows, - [this](std::shared_ptr& footer_pb, OlapReaderStatistics* stats) { - return _get_segment_footer(footer_pb, stats); + [this](std::shared_ptr& footer_pb, OlapReaderStatistics* stats, + const io::IOContext* io_ctx) { + return _get_segment_footer(footer_pb, stats, io_ctx); }); return Status::OK(); } @@ -722,7 +742,7 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column, if (opt->runtime_state != nullptr) { _be_exec_version = opt->runtime_state->be_exec_version(); } - RETURN_IF_ERROR(_create_column_meta_once(opt->stats)); + RETURN_IF_ERROR(_create_column_meta_once(opt->stats, &opt->io_ctx)); // For compability reason unique_id may less than 0 for variant extracted column int32_t unique_id = tablet_column.unique_id() >= 0 ? 
tablet_column.unique_id() @@ -736,7 +756,7 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column, // init iterator by unique id std::shared_ptr reader; - RETURN_IF_ERROR(get_column_reader(unique_id, &reader, opt->stats)); + RETURN_IF_ERROR(get_column_reader(unique_id, &reader, opt->stats, &opt->io_ctx)); if (reader == nullptr) { return Status::InternalError("column reader is nullptr, unique_id={}", unique_id); } @@ -786,8 +806,8 @@ Status Segment::new_column_iterator(const TabletColumn& tablet_column, } Status Segment::get_column_reader(int32_t col_uid, std::shared_ptr* column_reader, - OlapReaderStatistics* stats) { - RETURN_IF_ERROR(_create_column_meta_once(stats)); + OlapReaderStatistics* stats, const io::IOContext* source_io_ctx) { + RETURN_IF_ERROR(_create_column_meta_once(stats, source_io_ctx)); SCOPED_RAW_TIMER(&stats->segment_create_column_readers_timer_ns); // The column is not in this segment, return nullptr if (!_tablet_schema->has_column_unique_id(col_uid)) { @@ -795,7 +815,7 @@ Status Segment::get_column_reader(int32_t col_uid, std::shared_ptr return Status::Error("column not found in segment, col_uid={}", col_uid); } - return _column_reader_cache->get_column_reader(col_uid, column_reader, stats); + return _column_reader_cache->get_column_reader(col_uid, column_reader, stats, source_io_ctx); } Status Segment::traverse_column_meta_pbs(const std::function& visitor) { @@ -809,8 +829,8 @@ Status Segment::traverse_column_meta_pbs(const std::function* column_reader, - OlapReaderStatistics* stats) { - RETURN_IF_ERROR(_create_column_meta_once(stats)); + OlapReaderStatistics* stats, const io::IOContext* source_io_ctx) { + RETURN_IF_ERROR(_create_column_meta_once(stats, source_io_ctx)); SCOPED_RAW_TIMER(&stats->segment_create_column_readers_timer_ns); int col_uid = col.unique_id() >= 0 ? 
col.unique_id() : col.parent_unique_id(); // The column is not in this segment, return nullptr @@ -822,9 +842,9 @@ Status Segment::get_column_reader(const TabletColumn& col, if (col.has_path_info()) { PathInData relative_path = col.path_info_ptr()->copy_pop_front(); return _column_reader_cache->get_path_column_reader(col_uid, relative_path, column_reader, - stats); + stats, nullptr, source_io_ctx); } - return _column_reader_cache->get_column_reader(col_uid, column_reader, stats); + return _column_reader_cache->get_column_reader(col_uid, column_reader, stats, source_io_ctx); } Status Segment::new_index_iterator(const TabletColumn& tablet_column, const TabletIndex* index_meta, @@ -833,9 +853,9 @@ Status Segment::new_index_iterator(const TabletColumn& tablet_column, const Tabl if (read_options.runtime_state != nullptr) { _be_exec_version = read_options.runtime_state->be_exec_version(); } - RETURN_IF_ERROR(_create_column_meta_once(read_options.stats)); + RETURN_IF_ERROR(_create_column_meta_once(read_options.stats, &read_options.io_ctx)); std::shared_ptr reader; - auto st = get_column_reader(tablet_column, &reader, read_options.stats); + auto st = get_column_reader(tablet_column, &reader, read_options.stats, &read_options.io_ctx); if (st.is()) { return Status::OK(); } @@ -1038,19 +1058,21 @@ Status Segment::seek_and_read_by_rowid(const TabletSchema& schema, SlotDescripto DORIS_CHECK(std::adjacent_find(row_ids.begin(), row_ids.end()) == row_ids.end()); // ColumnIterator::seek_and_read expects monotonically increasing row_ids without // duplicates for correct ordinal scanning. Enforce this contract at the entry point. 
+ auto io_ctx = storage_read_options.io_ctx; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.file_cache_stats = &storage_read_options.stats->file_cache_stats; segment_v2::ColumnIteratorOptions opt { .use_page_cache = !config::disable_storage_page_cache, .file_reader = file_reader().get(), .stats = storage_read_options.stats, - .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY, - .file_cache_stats = - &storage_read_options.stats->file_cache_stats}, + .io_ctx = io_ctx, }; if (!slot->column_paths().empty()) { // here need create column readers to make sure column reader is created before seek_and_read_by_rowid // if segment cache miss, column reader will be created to make sure the variant column result not coredump - RETURN_IF_ERROR(_create_column_meta_once(storage_read_options.stats)); + RETURN_IF_ERROR( + _create_column_meta_once(storage_read_options.stats, &storage_read_options.io_ctx)); const auto& dt_variant = assert_cast(*remove_nullable(slot->type())); @@ -1094,7 +1116,8 @@ Status Segment::seek_and_read_by_rowid(const TabletSchema& schema, SlotDescripto } Status Segment::_get_segment_footer(std::shared_ptr& footer_pb, - OlapReaderStatistics* stats) { + OlapReaderStatistics* stats, + const io::IOContext* source_io_ctx) { std::shared_ptr footer_pb_shared = _footer_pb.lock(); if (footer_pb_shared != nullptr) { footer_pb = footer_pb_shared; @@ -1119,7 +1142,7 @@ Status Segment::_get_segment_footer(std::shared_ptr& footer_pb, // as other index/metadata pages and avoids competing with DATA_PAGE budget. 
if (!segment_footer_cache->lookup(cache_key, &cache_handle, segment_v2::PageTypePB::INDEX_PAGE)) { - RETURN_IF_ERROR(_parse_footer(footer_pb_shared, stats)); + RETURN_IF_ERROR(_parse_footer(footer_pb_shared, stats, source_io_ctx)); segment_footer_cache->insert(cache_key, footer_pb_shared, footer_pb_shared->ByteSizeLong(), &cache_handle, segment_v2::PageTypePB::INDEX_PAGE); } else { diff --git a/be/src/storage/segment/segment.h b/be/src/storage/segment/segment.h index fb337fe0b7c4d6..f4dd3553490ee6 100644 --- a/be/src/storage/segment/segment.h +++ b/be/src/storage/segment/segment.h @@ -36,6 +36,7 @@ #include "io/fs/file_reader.h" #include "io/fs/file_reader_writer_fwd.h" #include "io/fs/file_system.h" +#include "io/io_common.h" #include "runtime/descriptors.h" #include "storage/cache/page_cache.h" #include "storage/olap_common.h" @@ -94,7 +95,8 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, const io::FileReaderOptions& reader_options, std::shared_ptr* output, InvertedIndexFileInfo idx_file_info = {}, - OlapReaderStatistics* stats = nullptr); + OlapReaderStatistics* stats = nullptr, + const io::IOContext* io_ctx = nullptr); static io::UInt128Wrapper file_cache_key(std::string_view rowset_id, uint32_t seg_id); io::UInt128Wrapper file_cache_key() const { @@ -150,9 +152,9 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd StorageReadOptions& storage_read_options, std::unique_ptr& iterator_hint); - Status load_index(OlapReaderStatistics* stats); + Status load_index(OlapReaderStatistics* stats, const io::IOContext* io_ctx = nullptr); - Status load_pk_index_and_bf(OlapReaderStatistics* stats); + Status load_pk_index_and_bf(OlapReaderStatistics* stats, const io::IOContext* io_ctx = nullptr); void update_healthy_status(Status new_status) { _healthy_status.update(new_status); } // The segment is loaded into SegmentCache and then will load indices, if 
there are something wrong @@ -214,11 +216,11 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd // get the column reader by tablet column, return NOT_FOUND if not found reader in this segment Status get_column_reader(const TabletColumn& col, std::shared_ptr* column_reader, - OlapReaderStatistics* stats); + OlapReaderStatistics* stats, const io::IOContext* io_ctx = nullptr); // get the column reader by column unique id, return NOT_FOUND if not found reader in this segment Status get_column_reader(int32_t col_uid, std::shared_ptr* column_reader, - OlapReaderStatistics* stats); + OlapReaderStatistics* stats, const io::IOContext* io_ctx = nullptr); Status traverse_column_meta_pbs(const std::function& visitor); @@ -240,25 +242,28 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd RowsetId rowset_id, TabletSchemaSPtr tablet_schema, const io::FileReaderOptions& reader_options, std::shared_ptr* output, InvertedIndexFileInfo idx_file_info, - OlapReaderStatistics* stats); + OlapReaderStatistics* stats = nullptr, + const io::IOContext* io_ctx = nullptr); // open segment file and read the minimum amount of necessary information (footer) - Status _open(OlapReaderStatistics* stats); + Status _open(OlapReaderStatistics* stats, const io::IOContext* io_ctx = nullptr); Status _parse_footer(std::shared_ptr& footer, - OlapReaderStatistics* stats = nullptr); + OlapReaderStatistics* stats = nullptr, + const io::IOContext* io_ctx = nullptr); Status _create_column_meta(const SegmentFooterPB& footer); - Status _load_pk_bloom_filter(OlapReaderStatistics* stats); - // Must ensure _create_column_readers_once has been called before calling this function. 
- ColumnReader* _get_column_reader(const TabletColumn& col); + Status _load_pk_bloom_filter(OlapReaderStatistics* stats, + const io::IOContext* io_ctx = nullptr); Status _write_error_file(size_t file_size, size_t offset, size_t bytes_read, char* data, io::IOContext& io_ctx); Status _open_index_file_reader(); - Status _create_column_meta_once(OlapReaderStatistics* stats); + Status _create_column_meta_once(OlapReaderStatistics* stats, + const io::IOContext* io_ctx = nullptr); virtual Status _get_segment_footer(std::shared_ptr&, - OlapReaderStatistics* stats); + OlapReaderStatistics* stats, + const io::IOContext* io_ctx = nullptr); StoragePageCache::CacheKey get_segment_footer_cache_key() const; diff --git a/be/src/storage/segment/segment_index_file_cache_loader.cpp b/be/src/storage/segment/segment_index_file_cache_loader.cpp new file mode 100644 index 00000000000000..13a93460cbc2a3 --- /dev/null +++ b/be/src/storage/segment/segment_index_file_cache_loader.cpp @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "storage/segment/segment_index_file_cache_loader.h" + +#include + +#include "cloud/config.h" +#include "cpp/sync_point.h" +#include "io/fs/file_reader.h" +#include "io/io_common.h" +#include "storage/rowset/rowset_writer_context.h" +#include "util/slice.h" + +namespace doris::segment_v2 { + +namespace { + +bvar::Adder g_segment_index_file_cache_load_total("segment_index_file_cache_load_total"); +bvar::Adder g_segment_index_file_cache_load_failed("segment_index_file_cache_load_failed"); +bvar::Adder g_segment_index_file_cache_load_bytes("segment_index_file_cache_load_bytes"); + +bool enable_cloud_index_only_file_cache() { + return config::is_cloud_mode() && config::enable_file_cache && + config::enable_file_cache_write_index_file_only; +} + +const char* reason_to_string(SegmentIndexFileCacheLoadReason reason) { + switch (reason) { + case SegmentIndexFileCacheLoadReason::LOAD: + return "load"; + case SegmentIndexFileCacheLoadReason::CUMULATIVE_COMPACTION: + return "cumulative_compaction"; + case SegmentIndexFileCacheLoadReason::BASE_COMPACTION: + return "base_compaction"; + case SegmentIndexFileCacheLoadReason::SCHEMA_CHANGE: + return "schema_change"; + } + return "unknown"; +} + +SegmentIndexFileCacheLoadReason reason_from_context(const RowsetWriterContext& context) { + if (context.write_type == DataWriteType::TYPE_SCHEMA_CHANGE) { + return SegmentIndexFileCacheLoadReason::SCHEMA_CHANGE; + } + if (context.write_type == DataWriteType::TYPE_COMPACTION) { + if (context.compaction_type == ReaderType::READER_BASE_COMPACTION || + context.compaction_type == ReaderType::READER_FULL_COMPACTION) { + return SegmentIndexFileCacheLoadReason::BASE_COMPACTION; + } + return SegmentIndexFileCacheLoadReason::CUMULATIVE_COMPACTION; + } + return SegmentIndexFileCacheLoadReason::LOAD; +} + +Status read_range_to_file_cache(io::FileReaderSPtr reader, uint64_t offset, uint64_t size, + const io::IOContext& io_ctx) { + const auto read_size = static_cast(size); + size_t bytes_read 
= 0; + RETURN_IF_ERROR(reader->read_at(offset, Slice(static_cast(nullptr), read_size), + &bytes_read, &io_ctx)); + if (bytes_read != read_size) { + return Status::InternalError( + "short dry-run read when preloading segment index to file cache, offset={}, " + "expected={}, actual={}", + offset, read_size, bytes_read); + } + return Status::OK(); +} + +} // namespace + +Status SegmentIndexFileCacheLoader::preload_segment_index_to_file_cache( + const RowsetWriterContext& context, uint32_t segment_id, const std::string& segment_path, + const SegmentIndexFileCacheInfo& info) { + if (!enable_cloud_index_only_file_cache() || context.is_local_rowset()) { + return Status::OK(); + } + + for (const auto& range : info.index_ranges) { + auto st = load_segment_index_to_file_cache({ + .fs = context.fs(), + .segment_path = segment_path, + .rowset_id = context.rowset_id, + .tablet_id = context.tablet_id, + .segment_id = segment_id, + .range = range, + .segment_file_size = info.segment_file_size, + .reason = reason_from_context(context), + }); + if (!st.ok()) { + g_segment_index_file_cache_load_failed << 1; + LOG(WARNING) << "failed to preload segment index to file cache, tablet_id=" + << context.tablet_id << ", rowset_id=" << context.rowset_id + << ", segment_id=" << segment_id << ", segment_path=" << segment_path + << ", index_start=" << range.offset << ", index_size=" << range.size + << ", segment_file_size=" << info.segment_file_size << ", status=" << st; + } + } + return Status::OK(); +} + +Status SegmentIndexFileCacheLoader::preload_segment_indexes_to_file_cache( + const RowsetWriterContext& context, + const std::vector& tasks) { + TEST_SYNC_POINT_CALLBACK("SegmentIndexFileCacheLoader::preload_segment_indexes_to_file_cache", + &context, &tasks); + for (const auto& task : tasks) { + RETURN_IF_ERROR(preload_segment_index_to_file_cache( + context, task.segment_id, context.segment_path(task.segment_id), task.info)); + } + return Status::OK(); +} + +Status 
SegmentIndexFileCacheLoader::load_segment_index_to_file_cache( + const SegmentIndexFileCacheLoadContext& ctx) { + if (!enable_cloud_index_only_file_cache()) { + return Status::OK(); + } + const auto& range = ctx.range; + if (range.empty()) { + return Status::OK(); + } + if (!range.is_valid_for(ctx.segment_file_size)) { + return Status::InvalidArgument( + "invalid segment index cache range, path={}, index_start={}, index_size={}, " + "segment_file_size={}", + ctx.segment_path, range.offset, range.size, ctx.segment_file_size); + } + if (ctx.fs == nullptr) { + return Status::InternalError("file system is null"); + } + + io::IOContext io_ctx { + .reader_type = ReaderType::READER_QUERY, + .is_index_data = true, + .is_dryrun = true, + .is_warmup = false, + }; + TEST_SYNC_POINT_RETURN_WITH_VALUE( + "SegmentIndexFileCacheLoader::load_segment_index_to_file_cache", Status::OK(), &ctx, + &io_ctx); + + io::FileReaderOptions reader_opts; + reader_opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; + reader_opts.is_doris_table = true; + reader_opts.file_size = static_cast(ctx.segment_file_size); + reader_opts.tablet_id = ctx.tablet_id; + + io::FileReaderSPtr reader; + RETURN_IF_ERROR(ctx.fs->open_file(ctx.segment_path, &reader, &reader_opts)); + + RETURN_IF_ERROR(read_range_to_file_cache(reader, range.offset, range.size, io_ctx)); + g_segment_index_file_cache_load_total << 1; + g_segment_index_file_cache_load_bytes << static_cast(range.size); + + VLOG_DEBUG << "preloaded segment index to file cache, tablet_id=" << ctx.tablet_id + << ", rowset_id=" << ctx.rowset_id << ", segment_id=" << ctx.segment_id + << ", segment_path=" << ctx.segment_path << ", index_start=" << range.offset + << ", index_size=" << range.size << ", segment_file_size=" << ctx.segment_file_size + << ", reason=" << reason_to_string(ctx.reason); + return Status::OK(); +} + +} // namespace doris::segment_v2 diff --git a/be/src/storage/segment/segment_index_file_cache_loader.h 
b/be/src/storage/segment/segment_index_file_cache_loader.h new file mode 100644 index 00000000000000..62d08339405bdd --- /dev/null +++ b/be/src/storage/segment/segment_index_file_cache_loader.h @@ -0,0 +1,102 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include +#include + +#include "common/status.h" +#include "io/fs/file_reader_writer_fwd.h" +#include "io/fs/file_system.h" +#include "storage/olap_common.h" + +namespace doris { + +struct RowsetWriterContext; + +namespace segment_v2 { + +struct SegmentIndexFileCacheRange { + uint64_t offset = 0; + uint64_t size = 0; + + bool empty() const { return size == 0; } + + bool is_valid_for(uint64_t file_size) const { + return !empty() && offset < file_size && size <= file_size - offset; + } +}; + +struct SegmentIndexFileCacheInfo { + uint64_t segment_file_size = 0; + std::vector index_ranges; + + void add_index_range(uint64_t offset, uint64_t size) { + if (size == 0) { + return; + } + index_ranges.push_back({offset, size}); + } + + bool empty() const { return index_ranges.empty(); } + + uint64_t cache_start_offset() const { + return empty() ? 
segment_file_size : index_ranges.front().offset; + } +}; + +struct SegmentIndexFileCachePreloadTask { + uint32_t segment_id = 0; + SegmentIndexFileCacheInfo info; +}; + +enum class SegmentIndexFileCacheLoadReason { + LOAD, + CUMULATIVE_COMPACTION, + BASE_COMPACTION, + SCHEMA_CHANGE, +}; + +struct SegmentIndexFileCacheLoadContext { + io::FileSystemSPtr fs; + std::string segment_path; + RowsetId rowset_id; + int64_t tablet_id = 0; + uint32_t segment_id = 0; + SegmentIndexFileCacheRange range; + uint64_t segment_file_size = 0; + SegmentIndexFileCacheLoadReason reason = SegmentIndexFileCacheLoadReason::LOAD; +}; + +class SegmentIndexFileCacheLoader { +public: + static Status preload_segment_index_to_file_cache(const RowsetWriterContext& context, + uint32_t segment_id, + const std::string& segment_path, + const SegmentIndexFileCacheInfo& info); + + static Status preload_segment_indexes_to_file_cache( + const RowsetWriterContext& context, + const std::vector& tasks); + + static Status load_segment_index_to_file_cache(const SegmentIndexFileCacheLoadContext& ctx); +}; + +} // namespace segment_v2 +} // namespace doris diff --git a/be/src/storage/segment/segment_loader.cpp b/be/src/storage/segment/segment_loader.cpp index 9d44948467f81f..12b46b5f733732 100644 --- a/be/src/storage/segment/segment_loader.cpp +++ b/be/src/storage/segment/segment_loader.cpp @@ -55,7 +55,8 @@ void SegmentCache::erase(const SegmentCache::CacheKey& key) { Status SegmentLoader::load_segment(const BetaRowsetSharedPtr& rowset, int64_t segment_id, SegmentCacheHandle* cache_handle, bool use_cache, bool need_load_pk_index_and_bf, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { auto start = MonotonicMicros(); SegmentCache::CacheKey cache_key(rowset->rowset_id(), segment_id); if (_segment_cache->lookup(cache_key, cache_handle)) { @@ -72,7 +73,7 @@ Status SegmentLoader::load_segment(const BetaRowsetSharedPtr& rowset, int64_t se } // If 
the segment is not healthy, then will create a new segment and will replace the unhealthy one in SegmentCache. segment_v2::SegmentSharedPtr segment; - RETURN_IF_ERROR(rowset->load_segment(segment_id, index_load_stats, &segment)); + RETURN_IF_ERROR(rowset->load_segment(segment_id, index_load_stats, &segment, io_ctx)); if (need_load_pk_index_and_bf) { RETURN_IF_ERROR(segment->load_pk_index_and_bf(index_load_stats)); } @@ -95,13 +96,14 @@ Status SegmentLoader::load_segment(const BetaRowsetSharedPtr& rowset, int64_t se Status SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, bool use_cache, bool need_load_pk_index_and_bf, - OlapReaderStatistics* index_load_stats) { + OlapReaderStatistics* index_load_stats, + const io::IOContext* io_ctx) { if (cache_handle->is_inited()) { return Status::OK(); } for (int64_t i = 0; i < rowset->num_segments(); i++) { RETURN_IF_ERROR(load_segment(rowset, i, cache_handle, use_cache, need_load_pk_index_and_bf, - index_load_stats)); + index_load_stats, io_ctx)); } cache_handle->set_inited(); return Status::OK(); diff --git a/be/src/storage/segment/segment_loader.h b/be/src/storage/segment/segment_loader.h index 1384c6b36195f5..e8d9d354938305 100644 --- a/be/src/storage/segment/segment_loader.h +++ b/be/src/storage/segment/segment_loader.h @@ -121,14 +121,16 @@ class SegmentLoader { // If use_cache is true, it will be loaded from _cache. Status load_segments(const BetaRowsetSharedPtr& rowset, SegmentCacheHandle* cache_handle, bool use_cache = false, bool need_load_pk_index_and_bf = false, - OlapReaderStatistics* index_load_stats = nullptr); + OlapReaderStatistics* index_load_stats = nullptr, + const io::IOContext* io_ctx = nullptr); // Load one segment of "rowset", return the "cache_handle" which contains segments. // If use_cache is true, it will be loaded from _cache. 
Status load_segment(const BetaRowsetSharedPtr& rowset, int64_t segment_id, SegmentCacheHandle* cache_handle, bool use_cache = false, bool need_load_pk_index_and_bf = false, - OlapReaderStatistics* index_load_stats = nullptr); + OlapReaderStatistics* index_load_stats = nullptr, + const io::IOContext* io_ctx = nullptr); void erase_segment(const SegmentCache::CacheKey& key); diff --git a/be/src/storage/segment/segment_writer.cpp b/be/src/storage/segment/segment_writer.cpp index f5f63d923f7750..6f4ecef1140d67 100644 --- a/be/src/storage/segment/segment_writer.cpp +++ b/be/src/storage/segment/segment_writer.cpp @@ -943,6 +943,10 @@ Status SegmentWriter::finalize_columns_data() { Status SegmentWriter::finalize_columns_index(uint64_t* index_size) { uint64_t index_start = _file_writer->bytes_appended(); + // Record each index range separately. Vertical compaction writes column groups as + // data+index pairs, so a single [first index, EOF) range would include later column data. + // This SegmentWriter path is shared by cloud load, non-vertical compaction, schema change + // final output, and vertical compaction via VerticalBetaRowsetWriter. 
RETURN_IF_ERROR(_write_ordinal_index()); RETURN_IF_ERROR(_write_zone_map()); RETURN_IF_ERROR(_write_inverted_index()); @@ -978,24 +982,36 @@ Status SegmentWriter::finalize_columns_index(uint64_t* index_size) { *index_size = _file_writer->bytes_appended() - index_start; } } + uint64_t file_index_end = _file_writer->bytes_appended(); + _index_file_cache_info.add_index_range(index_start, file_index_end - index_start); // reset all column writers and data_conveter clear(); return Status::OK(); } -Status SegmentWriter::finalize_footer(uint64_t* segment_file_size) { +Status SegmentWriter::finalize_footer(uint64_t* segment_file_size, + SegmentIndexFileCacheInfo* index_file_cache_info) { + uint64_t footer_start = _file_writer->bytes_appended(); RETURN_IF_ERROR(_write_footer()); // finish RETURN_IF_ERROR(_file_writer->close(true)); *segment_file_size = _file_writer->bytes_appended(); + // The closed size completes the preload range recorded above. Local temporary rowsets, such as + // schema-change internal sorting output, are filtered by SegmentIndexFileCacheLoader. + _index_file_cache_info.segment_file_size = *segment_file_size; + _index_file_cache_info.add_index_range(footer_start, *segment_file_size - footer_start); + if (index_file_cache_info != nullptr) { + *index_file_cache_info = _index_file_cache_info; + } if (*segment_file_size == 0) { return Status::Corruption("Bad segment, file size = 0"); } return Status::OK(); } -Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size) { +Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size, + SegmentIndexFileCacheInfo* index_file_cache_info) { MonotonicStopWatch timer; timer.start(); // check disk capacity @@ -1005,12 +1021,10 @@ Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size } // write data RETURN_IF_ERROR(finalize_columns_data()); - // Get the index start before finalize_footer since this function would write new data. 
- uint64_t index_start = _file_writer->bytes_appended(); // write index RETURN_IF_ERROR(finalize_columns_index(index_size)); // write footer - RETURN_IF_ERROR(finalize_footer(segment_file_size)); + RETURN_IF_ERROR(finalize_footer(segment_file_size, index_file_cache_info)); if (timer.elapsed_time() > 5000000000l) { LOG(INFO) << "segment flush consumes a lot time_ns " << timer.elapsed_time() @@ -1021,6 +1035,7 @@ Status SegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size if (auto* cache_builder = _file_writer->cache_builder(); cache_builder != nullptr && cache_builder->_expiration_time == 0 && config::is_cloud_mode()) { + auto index_start = _index_file_cache_info.cache_start_offset(); auto size = *index_size + *segment_file_size; auto holder = cache_builder->allocate_cache_holder(index_start, size, _tablet->tablet_id()); for (auto& segment : holder->file_blocks) { diff --git a/be/src/storage/segment/segment_writer.h b/be/src/storage/segment/segment_writer.h index e6bba442bd4a10..4f6ce5c866159b 100644 --- a/be/src/storage/segment/segment_writer.h +++ b/be/src/storage/segment/segment_writer.h @@ -33,6 +33,7 @@ #include "storage/index/index_file_writer.h" #include "storage/olap_define.h" #include "storage/segment/column_writer.h" +#include "storage/segment/segment_index_file_cache_loader.h" #include "storage/tablet/tablet.h" #include "storage/tablet/tablet_schema.h" #include "util/faststring.h" @@ -116,13 +117,15 @@ class SegmentWriter { uint32_t row_count() const { return _row_count; } - Status finalize(uint64_t* segment_file_size, uint64_t* index_size); + Status finalize(uint64_t* segment_file_size, uint64_t* index_size, + SegmentIndexFileCacheInfo* index_file_cache_info = nullptr); uint32_t get_segment_id() const { return _segment_id; } Status finalize_columns_data(); Status finalize_columns_index(uint64_t* index_size); - Status finalize_footer(uint64_t* segment_file_size); + Status finalize_footer(uint64_t* segment_file_size, + 
SegmentIndexFileCacheInfo* index_file_cache_info = nullptr); void init_column_meta(ColumnMetaPB* meta, uint32_t column_id, const TabletColumn& column, const ColumnWriterOptions& opts); @@ -213,6 +216,7 @@ class SegmentWriter { IndexFileWriter* _index_file_writer = nullptr; SegmentFooterPB _footer; + SegmentIndexFileCacheInfo _index_file_cache_info; // for mow tables with cluster key, the sort key is the cluster keys not unique keys // for other tables, the sort key is the keys size_t _num_sort_key_columns; diff --git a/be/src/storage/segment/variant/hierarchical_data_iterator.cpp b/be/src/storage/segment/variant/hierarchical_data_iterator.cpp index d1fbdb6f24aa49..3ad017fc5d8556 100644 --- a/be/src/storage/segment/variant/hierarchical_data_iterator.cpp +++ b/be/src/storage/segment/variant/hierarchical_data_iterator.cpp @@ -45,7 +45,8 @@ Status HierarchicalDataIterator::create(ColumnIteratorUPtr* reader, int32_t col_ std::unique_ptr&& binary_column_reader, std::unique_ptr&& root_column_reader, ColumnReaderCache* column_reader_cache, - OlapReaderStatistics* stats, ReadType read_type) { + OlapReaderStatistics* stats, ReadType read_type, + const io::IOContext* io_ctx) { // None leave node need merge with root std::unique_ptr stream_iter( new HierarchicalDataIterator(path, read_type)); @@ -66,8 +67,8 @@ Status HierarchicalDataIterator::create(ColumnIteratorUPtr* reader, int32_t col_ VLOG_DEBUG << "Skipping NestedGroup subcolumn: " << leaf_path; continue; } - RETURN_IF_ERROR( - stream_iter->add_stream(col_uid, leaves[i], column_reader_cache, stats)); + RETURN_IF_ERROR(stream_iter->add_stream(col_uid, leaves[i], column_reader_cache, stats, + io_ctx)); } } // need read from root column if not null @@ -141,7 +142,8 @@ Status HierarchicalDataIterator::read_by_rowids(const rowid_t* rowids, const siz Status HierarchicalDataIterator::add_stream(int32_t col_uid, const SubcolumnColumnMetaInfo::Node* node, ColumnReaderCache* column_reader_cache, - OlapReaderStatistics* stats) { + 
OlapReaderStatistics* stats, + const io::IOContext* io_ctx) { if (_substream_reader.find_leaf(node->path)) { VLOG_DEBUG << "Already exist sub column " << node->path.get_path(); return Status::OK(); @@ -150,7 +152,7 @@ Status HierarchicalDataIterator::add_stream(int32_t col_uid, ColumnIteratorUPtr it; std::shared_ptr column_reader; RETURN_IF_ERROR(column_reader_cache->get_path_column_reader(col_uid, node->path, &column_reader, - stats, node)); + stats, node, io_ctx)); RETURN_IF_ERROR(column_reader->new_iterator(&it, nullptr)); SubstreamIterator reader(node->data.file_column_type->create_column(), std::move(it), node->data.file_column_type); diff --git a/be/src/storage/segment/variant/hierarchical_data_iterator.h b/be/src/storage/segment/variant/hierarchical_data_iterator.h index b54e8a20ab4a6e..5a419205d26435 100644 --- a/be/src/storage/segment/variant/hierarchical_data_iterator.h +++ b/be/src/storage/segment/variant/hierarchical_data_iterator.h @@ -72,7 +72,7 @@ class HierarchicalDataIterator : public ColumnIterator { std::unique_ptr&& sparse_reader, std::unique_ptr&& root_column_reader, ColumnReaderCache* column_reader_cache, OlapReaderStatistics* stats, - ReadType read_type); + ReadType read_type, const io::IOContext* io_ctx = nullptr); Status init(const ColumnIteratorOptions& opts) override; @@ -86,7 +86,8 @@ class HierarchicalDataIterator : public ColumnIterator { ordinal_t get_current_ordinal() const override; Status add_stream(int32_t col_uid, const SubcolumnColumnMetaInfo::Node* node, - ColumnReaderCache* column_reader_cache, OlapReaderStatistics* stats); + ColumnReaderCache* column_reader_cache, OlapReaderStatistics* stats, + const io::IOContext* io_ctx = nullptr); Status init_prefetcher(const SegmentPrefetchParams& params) override; void collect_prefetchers( diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp b/be/src/storage/segment/variant/variant_column_reader.cpp index df9561ada7cd7f..77d12eb37bd7ef 100644 --- 
a/be/src/storage/segment/variant/variant_column_reader.cpp +++ b/be/src/storage/segment/variant/variant_column_reader.cpp @@ -231,7 +231,7 @@ Status VariantColumnReader::_create_hierarchical_reader( ColumnIteratorUPtr* reader, int32_t col_uid, PathInData path, const SubcolumnColumnMetaInfo::Node* node, const SubcolumnColumnMetaInfo::Node* root, ColumnReaderCache* column_reader_cache, OlapReaderStatistics* stats, - HierarchicalDataIterator::ReadType read_type) { + HierarchicalDataIterator::ReadType read_type, const io::IOContext* io_ctx) { // make sure external meta is loaded otherwise can't find any meta data for extracted columns // TODO(lhy): this will load all external meta if not loaded, and memory will be consumed. RETURN_IF_ERROR(load_external_meta_once()); @@ -273,7 +273,7 @@ Status VariantColumnReader::_create_hierarchical_reader( } RETURN_IF_ERROR(HierarchicalDataIterator::create( reader, col_uid, path, node, std::move(sparse_iter), std::move(root_column_reader), - column_reader_cache, stats, read_type)); + column_reader_cache, stats, read_type, io_ctx)); return Status::OK(); } @@ -318,7 +318,7 @@ Status VariantColumnReader::_create_sparse_merge_reader(ColumnIteratorUPtr* iter std::shared_ptr column_reader; RETURN_IF_ERROR(column_reader_cache->get_path_column_reader( target_col.parent_unique_id(), subcolumn_reader->path, &column_reader, opts->stats, - subcolumn_reader.get())); + subcolumn_reader.get(), &opts->io_ctx)); ColumnIteratorUPtr it; RETURN_IF_ERROR(column_reader->new_iterator(&it, nullptr)); // Create substream reader and add to tree @@ -366,7 +366,8 @@ Status VariantColumnReader::_new_default_iter_with_same_nested( std::unique_ptr sibling_iter; std::shared_ptr column_reader; RETURN_IF_ERROR(column_reader_cache->get_path_column_reader( - tablet_column.parent_unique_id(), leaf->path, &column_reader, opt->stats, leaf)); + tablet_column.parent_unique_id(), leaf->path, &column_reader, opt->stats, leaf, + &opt->io_ctx)); 
RETURN_IF_ERROR(column_reader->new_iterator(&sibling_iter, nullptr)); *iterator = std::make_unique(std::move(sibling_iter), leaf->data.file_column_type); @@ -514,8 +515,9 @@ Status VariantColumnReader::_build_read_plan_flat_leaves( } VLOG_DEBUG << "new iterator: " << target_col.path_info_ptr()->get_path(); std::shared_ptr column_reader; - RETURN_IF_ERROR(column_reader_cache->get_path_column_reader( - target_col.parent_unique_id(), node->path, &column_reader, opts->stats, node)); + RETURN_IF_ERROR(column_reader_cache->get_path_column_reader(target_col.parent_unique_id(), + node->path, &column_reader, + opts->stats, node, &opts->io_ctx)); plan->kind = ReadKind::LEAF; plan->type = column_reader->get_vec_data_type(); plan->relative_path = relative_path; @@ -761,7 +763,8 @@ Status VariantColumnReader::_try_build_leaf_plan(ReadPlan* plan, int32_t col_uid const PathInData& relative_path, const SubcolumnColumnMetaInfo::Node* node, ColumnReaderCache* column_reader_cache, - OlapReaderStatistics* stats) { + OlapReaderStatistics* stats, + const io::IOContext* io_ctx) { if (node == nullptr) { return Status::OK(); } @@ -771,7 +774,7 @@ Status VariantColumnReader::_try_build_leaf_plan(ReadPlan* plan, int32_t col_uid std::shared_ptr leaf_column_reader; RETURN_IF_ERROR(column_reader_cache->get_path_column_reader( - col_uid, leaf_node->path, &leaf_column_reader, stats, leaf_node)); + col_uid, leaf_node->path, &leaf_column_reader, stats, leaf_node, io_ctx)); plan->kind = ReadKind::LEAF; plan->type = leaf_column_reader->get_vec_data_type(); plan->relative_path = relative_path; @@ -782,14 +785,15 @@ Status VariantColumnReader::_try_build_leaf_plan(ReadPlan* plan, int32_t col_uid Status VariantColumnReader::_try_build_external_leaf_plan(ReadPlan* plan, int32_t col_uid, const PathInData& relative_path, ColumnReaderCache* column_reader_cache, - OlapReaderStatistics* stats) { + OlapReaderStatistics* stats, + const io::IOContext* io_ctx) { if (!_ext_meta_reader || 
!_ext_meta_reader->available()) { return Status::OK(); } std::shared_ptr leaf_column_reader; - Status st = column_reader_cache->get_path_column_reader(col_uid, relative_path, - &leaf_column_reader, stats, nullptr); + Status st = column_reader_cache->get_path_column_reader( + col_uid, relative_path, &leaf_column_reader, stats, nullptr, io_ctx); DCHECK(!_has_prefix_path_unlocked(relative_path)); if (st.ok()) { plan->kind = ReadKind::LEAF; @@ -919,13 +923,13 @@ Status VariantColumnReader::_build_read_plan(ReadPlan* plan, const TabletColumn& } RETURN_IF_ERROR(_try_build_leaf_plan(plan, col_uid, relative_path, node, column_reader_cache, - opt->stats)); + opt->stats, &opt->io_ctx)); if (plan->kind == ReadKind::LEAF) { return Status::OK(); } if (node == nullptr) { - RETURN_IF_ERROR(_try_build_external_leaf_plan(plan, col_uid, relative_path, - column_reader_cache, opt->stats)); + RETURN_IF_ERROR(_try_build_external_leaf_plan( + plan, col_uid, relative_path, column_reader_cache, opt->stats, &opt->io_ctx)); if (plan->kind == ReadKind::LEAF) { return Status::OK(); } @@ -997,7 +1001,8 @@ Status VariantColumnReader::_create_iterator_from_plan( : target_col.parent_unique_id(); RETURN_IF_ERROR(_create_hierarchical_reader( iterator, col_uid, plan.relative_path, plan.node, plan.root, column_reader_cache, - opt->stats, HierarchicalDataIterator::ReadType::SUBCOLUMNS_AND_SPARSE)); + opt->stats, HierarchicalDataIterator::ReadType::SUBCOLUMNS_AND_SPARSE, + &opt->io_ctx)); return _maybe_wrap_root_merge_iterator(iterator, plan, opt); } case ReadKind::LEAF: { @@ -1051,7 +1056,7 @@ Status VariantColumnReader::_create_iterator_from_plan( : target_col.parent_unique_id(); RETURN_IF_ERROR(_create_hierarchical_reader( iterator, col_uid, plan.relative_path, plan.node, plan.root, column_reader_cache, - opt->stats, HierarchicalDataIterator::ReadType::DOC_VALUE_COLUMN)); + opt->stats, HierarchicalDataIterator::ReadType::DOC_VALUE_COLUMN, &opt->io_ctx)); if (opt && opt->stats) { 
opt->stats->variant_doc_value_column_iter_count++; } diff --git a/be/src/storage/segment/variant/variant_column_reader.h b/be/src/storage/segment/variant/variant_column_reader.h index 1630f3bc799c35..84a5570ce71ddf 100644 --- a/be/src/storage/segment/variant/variant_column_reader.h +++ b/be/src/storage/segment/variant/variant_column_reader.h @@ -369,12 +369,12 @@ class VariantColumnReader : public ColumnReader { const PathInData& relative_path) const; Status _try_build_leaf_plan(ReadPlan* plan, int32_t col_uid, const PathInData& relative_path, const SubcolumnColumnMetaInfo::Node* node, - ColumnReaderCache* column_reader_cache, - OlapReaderStatistics* stats); + ColumnReaderCache* column_reader_cache, OlapReaderStatistics* stats, + const io::IOContext* io_ctx); Status _try_build_external_leaf_plan(ReadPlan* plan, int32_t col_uid, const PathInData& relative_path, ColumnReaderCache* column_reader_cache, - OlapReaderStatistics* stats); + OlapReaderStatistics* stats, const io::IOContext* io_ctx); // Materialize a concrete ColumnIterator according to the previously built plan. Status _create_iterator_from_plan(ColumnIteratorUPtr* iterator, const ReadPlan& plan, @@ -395,7 +395,8 @@ class VariantColumnReader : public ColumnReader { const SubcolumnColumnMetaInfo::Node* root, ColumnReaderCache* column_reader_cache, OlapReaderStatistics* stats, - HierarchicalDataIterator::ReadType read_type); + HierarchicalDataIterator::ReadType read_type, + const io::IOContext* io_ctx); // Create a reader that merges subcolumns into the destination sparse column. // If bucket_index is set, only subcolumns whose path belongs to this bucket will be merged. 
Status _create_sparse_merge_reader(ColumnIteratorUPtr* iterator, const StorageReadOptions* opts, diff --git a/be/src/storage/segment/vertical_segment_writer.cpp b/be/src/storage/segment/vertical_segment_writer.cpp index 6cf4cf1a1c6ed7..f5b60ebadb756c 100644 --- a/be/src/storage/segment/vertical_segment_writer.cpp +++ b/be/src/storage/segment/vertical_segment_writer.cpp @@ -1322,6 +1322,9 @@ uint64_t VerticalSegmentWriter::_estimated_remaining_size() { Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { uint64_t index_start = _file_writer->bytes_appended(); + // Record the common index range for cloud index-only file-cache preload. + // This VerticalSegmentWriter path is used when cloud load, compaction, or schema change flushes + // a whole block through SegmentCreator with enable_vertical_segment_writer enabled. RETURN_IF_ERROR(_write_ordinal_index()); RETURN_IF_ERROR(_write_zone_map()); RETURN_IF_ERROR(_write_inverted_index()); @@ -1343,6 +1346,8 @@ Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { RETURN_IF_ERROR(_write_short_key_index()); *index_size = _file_writer->bytes_appended() - index_start; } + uint64_t file_index_end = _file_writer->bytes_appended(); + _index_file_cache_info.add_index_range(index_start, file_index_end - index_start); // reset all column writers and data_conveter clear(); @@ -1350,18 +1355,28 @@ Status VerticalSegmentWriter::finalize_columns_index(uint64_t* index_size) { return Status::OK(); } -Status VerticalSegmentWriter::finalize_footer(uint64_t* segment_file_size) { +Status VerticalSegmentWriter::finalize_footer(uint64_t* segment_file_size, + SegmentIndexFileCacheInfo* index_file_cache_info) { + uint64_t footer_start = _file_writer->bytes_appended(); RETURN_IF_ERROR(_write_footer()); // finish RETURN_IF_ERROR(_file_writer->close(true)); *segment_file_size = _file_writer->bytes_appended(); + // The closed size completes the preload range recorded above. 
SegmentIndexFileCacheLoader + // later decides whether this is a remote cloud rowset that should actually be preloaded. + _index_file_cache_info.segment_file_size = *segment_file_size; + _index_file_cache_info.add_index_range(footer_start, *segment_file_size - footer_start); + if (index_file_cache_info != nullptr) { + *index_file_cache_info = _index_file_cache_info; + } if (*segment_file_size == 0) { return Status::Corruption("Bad segment, file size = 0"); } return Status::OK(); } -Status VerticalSegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size) { +Status VerticalSegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* index_size, + SegmentIndexFileCacheInfo* index_file_cache_info) { MonotonicStopWatch timer; timer.start(); // check disk capacity @@ -1375,7 +1390,7 @@ Status VerticalSegmentWriter::finalize(uint64_t* segment_file_size, uint64_t* in // write index RETURN_IF_ERROR(finalize_columns_index(index_size)); // write footer - RETURN_IF_ERROR(finalize_footer(segment_file_size)); + RETURN_IF_ERROR(finalize_footer(segment_file_size, index_file_cache_info)); if (timer.elapsed_time() > 5000000000L) { LOG(INFO) << "segment flush consumes a lot time_ns " << timer.elapsed_time() diff --git a/be/src/storage/segment/vertical_segment_writer.h b/be/src/storage/segment/vertical_segment_writer.h index 1a0645efa92daf..1398cce94fdda7 100644 --- a/be/src/storage/segment/vertical_segment_writer.h +++ b/be/src/storage/segment/vertical_segment_writer.h @@ -34,6 +34,7 @@ #include "storage/olap_define.h" #include "storage/partial_update_info.h" #include "storage/segment/column_writer.h" +#include "storage/segment/segment_index_file_cache_loader.h" #include "storage/tablet/tablet.h" #include "storage/tablet/tablet_schema.h" #include "util/faststring.h" @@ -107,10 +108,12 @@ class VerticalSegmentWriter { [[nodiscard]] uint32_t row_count() const { return _row_count; } [[nodiscard]] uint32_t segment_id() const { return _segment_id; } - Status 
finalize(uint64_t* segment_file_size, uint64_t* index_size); + Status finalize(uint64_t* segment_file_size, uint64_t* index_size, + SegmentIndexFileCacheInfo* index_file_cache_info = nullptr); Status finalize_columns_index(uint64_t* index_size); - Status finalize_footer(uint64_t* segment_file_size); + Status finalize_footer(uint64_t* segment_file_size, + SegmentIndexFileCacheInfo* index_file_cache_info = nullptr); Slice min_encoded_key(); Slice max_encoded_key(); @@ -220,6 +223,7 @@ class VerticalSegmentWriter { IndexFileWriter* _index_file_writer = nullptr; SegmentFooterPB _footer; + SegmentIndexFileCacheInfo _index_file_cache_info; // for mow tables with cluster key, the sort key is the cluster keys not unique keys // for other tables, the sort key is the keys size_t _num_sort_key_columns; diff --git a/be/src/storage/tablet/base_tablet.cpp b/be/src/storage/tablet/base_tablet.cpp index 09f73ec43d27be..3cd8747ee798fb 100644 --- a/be/src/storage/tablet/base_tablet.cpp +++ b/be/src/storage/tablet/base_tablet.cpp @@ -81,7 +81,8 @@ Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t const TabletColumn& target_column, SegmentCacheHandle* segment_cache_handle, std::unique_ptr* column_iterator, - OlapReaderStatistics* stats) { + OlapReaderStatistics* stats, + const io::IOContext* input_io_ctx = nullptr) { RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(rowset, segment_cache_handle, true)); // find segment auto it = std::find_if( @@ -95,13 +96,18 @@ Status _get_segment_column_iterator(const BetaRowsetSharedPtr& rowset, uint32_t segment_v2::SegmentSharedPtr segment = *it; StorageReadOptions opts; opts.stats = stats; + if (input_io_ctx != nullptr) { + opts.io_ctx = *input_io_ctx; + } RETURN_IF_ERROR(segment->new_column_iterator(target_column, column_iterator, &opts)); + auto io_ctx = opts.io_ctx; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.file_cache_stats = &stats->file_cache_stats; segment_v2::ColumnIteratorOptions opt { 
.use_page_cache = !config::disable_storage_page_cache, .file_reader = segment->file_reader().get(), .stats = stats, - .io_ctx = io::IOContext {.reader_type = ReaderType::READER_QUERY, - .file_cache_stats = &stats->file_cache_stats}, + .io_ctx = io_ctx, }; RETURN_IF_ERROR((*column_iterator)->init(opt)); return Status::OK(); @@ -428,7 +434,8 @@ std::vector BaseTablet::get_rowset_by_ids( Status BaseTablet::lookup_row_data(const Slice& encoded_key, const RowLocation& row_location, RowsetSharedPtr input_rowset, OlapReaderStatistics& stats, - std::string& values, bool write_to_cache) { + std::string& values, bool write_to_cache, + const io::IOContext* io_ctx) { MonotonicStopWatch watch; size_t row_size = 1; watch.start(); @@ -444,7 +451,8 @@ Status BaseTablet::lookup_row_data(const Slice& encoded_key, const RowLocation& std::unique_ptr column_iterator; const auto& column = *DORIS_TRY(tablet_schema->column(BeConsts::ROW_STORE_COL)); RETURN_IF_ERROR(_get_segment_column_iterator(rowset, row_location.segment_id, column, - &segment_cache_handle, &column_iterator, &stats)); + &segment_cache_handle, &column_iterator, &stats, + io_ctx)); // get and parse tuple row MutableColumnPtr column_ptr = ColumnString::create(); std::vector rowids {static_cast(row_location.row_id)}; diff --git a/be/src/storage/tablet/base_tablet.h b/be/src/storage/tablet/base_tablet.h index a835e2465ee131..c26e944b01ca6b 100644 --- a/be/src/storage/tablet/base_tablet.h +++ b/be/src/storage/tablet/base_tablet.h @@ -170,7 +170,7 @@ class BaseTablet : public std::enable_shared_from_this { // Lookup a row with TupleDescriptor and fill Block Status lookup_row_data(const Slice& encoded_key, const RowLocation& row_location, RowsetSharedPtr rowset, OlapReaderStatistics& stats, std::string& values, - bool write_to_cache = false); + bool write_to_cache = false, const io::IOContext* io_ctx = nullptr); // Lookup the row location of `encoded_key`, the function sets `row_location` on success. 
// NOTE: the method only works in unique key model with primary key index, you will got a // not supported error in other data model. diff --git a/be/test/cloud/cloud_compaction_test.cpp b/be/test/cloud/cloud_compaction_test.cpp index 05eec3149f996a..f7916ab511ef55 100644 --- a/be/test/cloud/cloud_compaction_test.cpp +++ b/be/test/cloud/cloud_compaction_test.cpp @@ -28,12 +28,14 @@ #include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet.h" #include "cloud/cloud_tablet_mgr.h" +#include "cloud/config.h" #include "json2pb/json_to_pb.h" #include "storage/olap_common.h" #include "storage/rowset/rowset_factory.h" #include "storage/rowset/rowset_meta.h" #include "storage/storage_policy.h" #include "storage/tablet/tablet_meta.h" +#include "util/defer_op.h" #include "util/uid_util.h" namespace doris { @@ -395,6 +397,15 @@ TEST_F(CloudCompactionTest, test_set_storage_resource_from_input_rowsets) { } } TEST_F(CloudCompactionTest, should_cache_compaction_output) { + auto old_write_index_file_only = config::enable_file_cache_write_index_file_only; + auto old_keep_base_compaction_output = config::enable_file_cache_keep_base_compaction_output; + Defer restore_config {[&] { + config::enable_file_cache_write_index_file_only = old_write_index_file_only; + config::enable_file_cache_keep_base_compaction_output = old_keep_base_compaction_output; + }}; + config::enable_file_cache_write_index_file_only = false; + config::enable_file_cache_keep_base_compaction_output = false; + CloudTabletSPtr tablet = std::make_shared(_engine, std::make_shared()); CloudBaseCompaction cloud_base_compaction(_engine, tablet); cloud_base_compaction._input_rowsets_total_size = 0; @@ -436,6 +447,12 @@ TEST_F(CloudCompactionTest, should_cache_compaction_output) { cloud_base_compaction._input_rowsets_cached_data_size = 50; cloud_base_compaction._input_rowsets_cached_index_size = 50; ASSERT_EQ(cloud_base_compaction.should_cache_compaction_output(), true); + + 
config::enable_file_cache_keep_base_compaction_output = true; + ASSERT_EQ(cloud_base_compaction.should_cache_compaction_output(), true); + + config::enable_file_cache_write_index_file_only = true; + ASSERT_EQ(cloud_base_compaction.should_cache_compaction_output(), false); LOG(INFO) << "should_cache_compaction_output done"; } diff --git a/be/test/exec/operator/materialization_shared_state_test.cpp b/be/test/exec/operator/materialization_shared_state_test.cpp index f03a150496eefa..b1950482a702f2 100644 --- a/be/test/exec/operator/materialization_shared_state_test.cpp +++ b/be/test/exec/operator/materialization_shared_state_test.cpp @@ -23,9 +23,19 @@ #include "core/field.h" #include "exec/operator/materialization_opertor.h" #include "exec/pipeline/dependency.h" +#include "runtime/runtime_profile.h" namespace doris { +namespace { + +void add_request_row(PRequestBlockDesc* request_block_desc, uint32_t row_id, uint32_t file_id) { + request_block_desc->add_row_id(row_id); + request_block_desc->add_file_id(file_id); +} + +} // namespace + class MaterializationSharedStateTest : public testing::Test { protected: void SetUp() override { @@ -107,12 +117,10 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponse) { // 2. 
Setup response blocks from multiple backends // Backend 1's response { - _shared_state->rpc_struct_map[_backend_id1] - .request.mutable_request_block_descs(0) - ->add_row_id(0); - _shared_state->rpc_struct_map[_backend_id1] - .request.mutable_request_block_descs(0) - ->add_row_id(1); + auto* request_block_desc = + _shared_state->rpc_struct_map[_backend_id1].request.mutable_request_block_descs(0); + add_request_row(request_block_desc, 0, 1); + add_request_row(request_block_desc, 1, 1); Block resp_block1; auto resp_value_col1 = _int_type->create_column(); auto* value_col_data1 = reinterpret_cast(resp_value_col1.get()); @@ -137,9 +145,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponse) { // Backend 2's response { - _shared_state->rpc_struct_map[_backend_id2] - .request.mutable_request_block_descs(0) - ->add_row_id(2); + add_request_row( + _shared_state->rpc_struct_map[_backend_id2].request.mutable_request_block_descs(0), + 2, 2); Block resp_block2; auto resp_value_col2 = _int_type->create_column(); auto* value_col_data2 = reinterpret_cast(resp_value_col2.get()); @@ -166,7 +174,8 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponse) { // 4. Test merging responses Block result_block; - Status st = _shared_state->merge_multi_response(); + RuntimeProfile profile("MaterializationSharedStateTest"); + Status st = _shared_state->merge_multi_response(&profile); _shared_state->get_block(&result_block); EXPECT_TRUE(st.ok()); @@ -219,9 +228,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseMultiBlocks) { // 2. 
Setup response blocks from multiple backends for first rowid { - _shared_state->rpc_struct_map[_backend_id1] - .request.mutable_request_block_descs(0) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id1].request.mutable_request_block_descs(0), + 0, 1); Block resp_block1; auto resp_value_col1 = _int_type->create_column(); auto* value_col_data1 = reinterpret_cast(resp_value_col1.get()); @@ -244,9 +253,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseMultiBlocks) { // Backend 2's response for first rowid { - _shared_state->rpc_struct_map[_backend_id2] - .request.mutable_request_block_descs(0) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id2].request.mutable_request_block_descs(0), + 0, 2); Block resp_block2; auto resp_value_col2 = _int_type->create_column(); auto* value_col_data2 = reinterpret_cast(resp_value_col2.get()); @@ -270,9 +279,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseMultiBlocks) { _shared_state->rpc_struct_map[_backend_id1].request.add_request_block_descs(); _shared_state->rpc_struct_map[_backend_id2].request.add_request_block_descs(); { - _shared_state->rpc_struct_map[_backend_id1] - .request.mutable_request_block_descs(1) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id1].request.mutable_request_block_descs(1), + 0, 3); Block resp_block1; auto resp_value_col1 = _int_type->create_column(); auto* value_col_data1 = reinterpret_cast(resp_value_col1.get()); @@ -292,9 +301,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseMultiBlocks) { } { - _shared_state->rpc_struct_map[_backend_id2] - .request.mutable_request_block_descs(1) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id2].request.mutable_request_block_descs(1), + 0, 4); Block resp_block2; auto resp_value_col2 = _int_type->create_column(); auto* value_col_data2 = reinterpret_cast(resp_value_col2.get()); @@ -320,7 +329,8 
@@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseMultiBlocks) { // 4. Test merging responses Block result_block; - Status st = _shared_state->merge_multi_response(); + RuntimeProfile profile("MaterializationSharedStateTest"); + Status st = _shared_state->merge_multi_response(&profile); EXPECT_TRUE(st.ok()); _shared_state->get_block(&result_block); @@ -361,9 +371,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseBackendNotFound) { // --- BE_1: valid response with 1 row --- { - _shared_state->rpc_struct_map[_backend_id1] - .request.mutable_request_block_descs(0) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id1].request.mutable_request_block_descs(0), + 0, 1); Block resp_block; auto col = _int_type->create_column(); reinterpret_cast(col.get())->insert( @@ -388,9 +398,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseBackendNotFound) { // After deserialization this produces a Block with 0 columns, // so is_empty_column() == true and it won't be inserted into block_maps. 
{ - _shared_state->rpc_struct_map[_backend_id2] - .request.mutable_request_block_descs(0) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id2].request.mutable_request_block_descs(0), + 0, 2); PMultiGetResponseV2 response; response.add_blocks(); // empty PMultiGetBlockV2, no mutable_block() data _shared_state->rpc_struct_map[_backend_id2].response = std::move(response); @@ -410,7 +420,8 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseBackendNotFound) { _shared_state->rowid_locs = {0}; // merge_multi_response() should return InternalError - Status st = _shared_state->merge_multi_response(); + RuntimeProfile profile("MaterializationSharedStateTest"); + Status st = _shared_state->merge_multi_response(&profile); ASSERT_FALSE(st.ok()); ASSERT_TRUE(st.is()); ASSERT_TRUE(st.to_string().find("not match request row id count") != std::string::npos) @@ -444,9 +455,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseStaleBlockMaps) { // --- Build BE_1's response: blocks[0]=1 row (INT), blocks[1]=empty --- { - _shared_state->rpc_struct_map[_backend_id1] - .request.mutable_request_block_descs(0) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id1].request.mutable_request_block_descs(0), + 0, 1); PMultiGetResponseV2 response; // blocks[0]: 1 row of INT for relation 0 @@ -481,9 +492,9 @@ TEST_F(MaterializationSharedStateTest, TestMergeMultiResponseStaleBlockMaps) { reinterpret_cast(col.get())->insert_data("Alice", 5); rel1_block.insert({make_nullable(std::move(col)), make_nullable(_string_type), "name"}); - _shared_state->rpc_struct_map[_backend_id2] - .request.mutable_request_block_descs(1) - ->add_row_id(0); + add_request_row( + _shared_state->rpc_struct_map[_backend_id2].request.mutable_request_block_descs(1), + 0, 2); auto* pb1 = response.add_blocks()->mutable_block(); size_t us = 0, cs = 0; int64_t ct = 0; @@ -509,7 +520,8 @@ TEST_F(MaterializationSharedStateTest, 
TestMergeMultiResponseStaleBlockMaps) { _shared_state->rowid_locs = {0, 1}; // merge should succeed — each relation only references the BE that has data - Status st = _shared_state->merge_multi_response(); + RuntimeProfile profile("MaterializationSharedStateTest"); + Status st = _shared_state->merge_multi_response(&profile); ASSERT_TRUE(st.ok()) << "merge_multi_response failed: " << st.to_string(); // Verify results diff --git a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp index e74ad758ac1db3..c2aea1cd825253 100644 --- a/be/test/io/cache/block_file_cache_profile_reporter_test.cpp +++ b/be/test/io/cache/block_file_cache_profile_reporter_test.cpp @@ -52,6 +52,8 @@ io::FileCacheStatistics make_file_cache_stats(int64_t multiplier) { stats.inverted_index_remote_io_timer = multiplier * 26; stats.inverted_index_peer_io_timer = multiplier * 27; stats.inverted_index_io_timer = multiplier * 28; + stats.remote_only_on_miss_triggered = multiplier * 29; + stats.remote_only_on_miss_threshold_bytes = multiplier * 30; return stats; } @@ -89,6 +91,9 @@ void expect_file_cache_stats_eq(const io::FileCacheStatistics& actual, EXPECT_EQ(actual.inverted_index_remote_io_timer, expected.inverted_index_remote_io_timer); EXPECT_EQ(actual.inverted_index_peer_io_timer, expected.inverted_index_peer_io_timer); EXPECT_EQ(actual.inverted_index_io_timer, expected.inverted_index_io_timer); + EXPECT_EQ(actual.remote_only_on_miss_triggered, expected.remote_only_on_miss_triggered); + EXPECT_EQ(actual.remote_only_on_miss_threshold_bytes, + expected.remote_only_on_miss_threshold_bytes); } } // namespace diff --git a/be/test/io/cache/block_file_cache_test.cpp b/be/test/io/cache/block_file_cache_test.cpp index dcf210e3cb1af7..9f97c866b3b9ef 100644 --- a/be/test/io/cache/block_file_cache_test.cpp +++ b/be/test/io/cache/block_file_cache_test.cpp @@ -22,6 +22,7 @@ #include #include "io/cache/block_file_cache_test_common.h" +#include 
"io/cache/remote_scan_cache_write_limiter.h" #include "io/fs/buffered_reader.h" #include "storage/olap_define.h" @@ -208,6 +209,650 @@ void complete_into_memory(const io::FileBlocksHolder& holder) { } } +io::FileCacheSettings remote_only_on_miss_test_settings() { + io::FileCacheSettings settings; + settings.query_queue_size = 8_mb; + settings.query_queue_elements = 128; + settings.index_queue_size = 1_mb; + settings.index_queue_elements = 16; + settings.disposable_queue_size = 1_mb; + settings.disposable_queue_elements = 16; + settings.capacity = 10_mb; + settings.max_file_block_size = 1_mb; + settings.max_query_cache_size = 8_mb; + return settings; +} + +TEST_F(BlockFileCacheTest, file_cache_profile_remote_only_on_miss_state_counters) { + RuntimeProfile profile("file_cache_profile_test"); + FileCacheProfileReporter reporter(&profile); + + FileCacheStatistics stats; + stats.remote_only_on_miss_triggered = 1; + stats.remote_only_on_miss_threshold_bytes = 128; + reporter.update(&stats); + reporter.update(&stats); + + auto* triggered = profile.get_counter("RemoteOnlyOnMissTriggered"); + auto* threshold_bytes = profile.get_counter("RemoteOnlyOnMissThresholdBytes"); + ASSERT_NE(triggered, nullptr); + ASSERT_NE(threshold_bytes, nullptr); + ASSERT_NE(dynamic_cast(triggered), nullptr); + ASSERT_NE(dynamic_cast(threshold_bytes), nullptr); + EXPECT_EQ(triggered->value(), 1); + EXPECT_EQ(threshold_bytes->value(), 128); + + FileCacheStatistics later_stats; + later_stats.remote_only_on_miss_threshold_bytes = 64; + reporter.update(&later_stats); + EXPECT_EQ(triggered->value(), 1); + EXPECT_EQ(threshold_bytes->value(), 128); + + later_stats.remote_only_on_miss_threshold_bytes = 256; + reporter.update(&later_stats); + EXPECT_EQ(threshold_bytes->value(), 256); +} + +TEST_F(BlockFileCacheTest, file_cache_profile_specialized_write_cache_counters) { + RuntimeProfile profile("file_cache_profile_specialized_write_test"); + FileCacheProfileReporter reporter(&profile); + + 
FileCacheStatistics stats; + stats.inverted_index_write_cache_io_timer = 11; + stats.inverted_index_bytes_write_into_cache = 17; + stats.segment_footer_index_write_cache_io_timer = 23; + stats.segment_footer_index_bytes_write_into_cache = 29; + reporter.update(&stats); + + auto* inverted_index_write_timer = profile.get_counter("InvertedIndexWriteCacheIOUseTimer"); + auto* inverted_index_write_bytes = profile.get_counter("InvertedIndexBytesWriteIntoCache"); + auto* segment_footer_write_timer = + profile.get_counter("SegmentFooterIndexWriteCacheIOUseTimer"); + auto* segment_footer_write_bytes = profile.get_counter("SegmentFooterIndexBytesWriteIntoCache"); + ASSERT_NE(inverted_index_write_timer, nullptr); + ASSERT_NE(inverted_index_write_bytes, nullptr); + ASSERT_NE(segment_footer_write_timer, nullptr); + ASSERT_NE(segment_footer_write_bytes, nullptr); + EXPECT_EQ(inverted_index_write_timer->value(), 11); + EXPECT_EQ(inverted_index_write_bytes->value(), 17); + EXPECT_EQ(segment_footer_write_timer->value(), 23); + EXPECT_EQ(segment_footer_write_bytes->value(), 29); +} + +TEST_F(BlockFileCacheTest, get_downloaded_blocks_if_fully_covered_is_read_only) { + const std::string local_cache_path = + (caches_dir / "remote_only_on_miss_helper_cache" / "").string(); + if (fs::exists(local_cache_path)) { + fs::remove_all(local_cache_path); + } + fs::create_directories(local_cache_path); + + io::BlockFileCache mgr(local_cache_path, cached_remote_reader_cache_settings()); + ASSERT_TRUE(mgr.initialize().ok()); + + ReadStatistics read_stats; + io::CacheContext context; + context.stats = &read_stats; + context.cache_type = io::FileCacheType::NORMAL; + auto key = io::BlockFileCache::hash("remote_only_on_miss_helper_key"); + const size_t block_size = cached_remote_reader_cache_settings().max_file_block_size; + + auto assert_not_fully_covered = [&](size_t offset, size_t size, size_t expected_blocks_num) { + io::FileBlocks blocks; + bool fully_covered = true; + 
ASSERT_TRUE(mgr.get_downloaded_blocks_if_fully_covered(key, offset, size, context, &blocks, + &fully_covered) + .ok()); + EXPECT_FALSE(fully_covered); + EXPECT_TRUE(blocks.empty()); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), expected_blocks_num); + }; + + auto assert_fully_covered = [&](size_t offset, size_t size, size_t expected_blocks_size) { + io::FileBlocks blocks; + bool fully_covered = false; + ASSERT_TRUE(mgr.get_downloaded_blocks_if_fully_covered(key, offset, size, context, &blocks, + &fully_covered) + .ok()); + EXPECT_TRUE(fully_covered); + EXPECT_EQ(blocks.size(), expected_blocks_size); + }; + + assert_not_fully_covered(0, 10, 0); + + complete_into_memory(mgr.get_or_set(key, 0, block_size, context)); + ASSERT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), 1); + assert_fully_covered(2, 5, 1); + assert_not_fully_covered(block_size - 4, 8, 1); + + complete_into_memory(mgr.get_or_set(key, block_size, block_size, context)); + ASSERT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), 2); + assert_fully_covered(block_size - 4, 8, 2); + + complete_into_memory(mgr.get_or_set(key, 2 * block_size, block_size, context)); + ASSERT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), 3); + assert_fully_covered(block_size - 4, block_size + 8, 3); + + const size_t partial_block_size = 123 * 1024 + 17; + complete_into_memory(mgr.get_or_set(key, 3 * block_size, partial_block_size, context)); + ASSERT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), 4); + assert_fully_covered(3 * block_size + 7, 31, 1); + assert_fully_covered(3 * block_size - 4, 16, 2); + assert_fully_covered(0, 3 * block_size + partial_block_size, 4); + assert_not_fully_covered(3 * block_size + partial_block_size - 4, 8, 4); + assert_not_fully_covered(0, 3 * block_size + partial_block_size + 1, 4); +} + +TEST_F(BlockFileCacheTest, cached_remote_file_reader_remote_only_on_miss) { + const std::string local_cache_path = + (caches_dir / 
"remote_only_on_miss_reader_cache" / "").string(); + BlockFileCache* cache = nullptr; + ASSERT_TRUE(create_cached_remote_reader_cache(local_cache_path, &cache).ok()); + + { + const auto remote_file = caches_dir / "remote_only_on_miss_reader_file"; + { + std::ofstream ofs(remote_file, std::ios::binary | std::ios::trunc); + ASSERT_TRUE(ofs.is_open()); + for (int i = 0; i < 2; ++i) { + std::string data(1_mb, static_cast('a' + i)); + ofs.write(data.data(), data.size()); + } + } + + FileReaderSPtr local_reader; + ASSERT_TRUE(global_local_filesystem()->open_file(remote_file, &local_reader).ok()); + io::FileReaderOptions opts; + opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; + opts.is_doris_table = true; + opts.tablet_id = 10086; + auto reader = std::make_shared(local_reader, opts); + + auto key = io::BlockFileCache::hash(remote_file.filename().string()); + cache->remove_if_cached(key); + + std::string buffer(4_kb, '\0'); + size_t bytes_read = 0; + io::IOContext io_ctx; + io::FileCacheStatistics stats; + io_ctx.file_cache_stats = &stats; + io_ctx.file_cache_miss_policy = io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS; + + ASSERT_TRUE(reader->read_at(123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx) + .ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_EQ(stats.num_remote_io_total, 1); + EXPECT_EQ(stats.bytes_read_from_remote, buffer.size()); + EXPECT_EQ(stats.num_skip_cache_io_total, 1); + EXPECT_EQ(stats.bytes_write_into_cache, 0); + EXPECT_TRUE(cache->get_blocks_by_key(key).empty()); + + io::FileCacheStatistics write_back_stats; + io_ctx.file_cache_stats = &write_back_stats; + io_ctx.file_cache_miss_policy = io::FileCacheMissPolicy::READ_THROUGH_AND_WRITE_BACK; + ASSERT_TRUE(reader->read_at(123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx) + .ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_GT(write_back_stats.bytes_write_into_cache, 0); + EXPECT_FALSE(cache->get_blocks_by_key(key).empty()); + + io::FileCacheStatistics 
full_hit_stats; + io_ctx.file_cache_stats = &full_hit_stats; + io_ctx.file_cache_miss_policy = io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS; + ASSERT_TRUE(reader->read_at(123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx) + .ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_EQ(full_hit_stats.num_local_io_total, 1); + EXPECT_EQ(full_hit_stats.bytes_read_from_local, buffer.size()); + EXPECT_EQ(full_hit_stats.num_remote_io_total, 0); + EXPECT_EQ(full_hit_stats.bytes_write_into_cache, 0); + } + + cleanup_cached_remote_reader_cache(local_cache_path); +} + +TEST_F(BlockFileCacheTest, remote_scan_cache_write_limiter_strict_budget) { + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 2; + RemoteScanCacheWriteLimiter state(query_id, 100); + + EXPECT_TRUE(state.enabled()); + EXPECT_TRUE(state.try_admit_cache_write(60)); + EXPECT_EQ(state.admitted_data_bytes(), 60); + EXPECT_FALSE(state.remote_only_on_miss()); + EXPECT_TRUE(state.try_admit_cache_write(40)); + EXPECT_EQ(state.admitted_data_bytes(), 100); + EXPECT_FALSE(state.remote_only_on_miss()); + + EXPECT_FALSE(state.try_admit_cache_write(1)); + EXPECT_TRUE(state.remote_only_on_miss()); + EXPECT_EQ(state.admitted_data_bytes(), 100); + EXPECT_FALSE(state.try_admit_cache_write(1)); +} + +TEST_F(BlockFileCacheTest, remote_scan_cache_write_limiter_threshold_zero_and_negative) { + TUniqueId query_id; + query_id.hi = 11; + query_id.lo = 12; + + RemoteScanCacheWriteLimiter disabled(query_id, -1); + EXPECT_FALSE(disabled.enabled()); + EXPECT_TRUE(disabled.try_admit_cache_write(1_mb)); + EXPECT_FALSE(disabled.remote_only_on_miss()); + EXPECT_EQ(disabled.admitted_data_bytes(), 0); + + RemoteScanCacheWriteLimiter no_write(query_id, 0); + EXPECT_TRUE(no_write.enabled()); + EXPECT_TRUE(no_write.remote_only_on_miss()); + EXPECT_FALSE(no_write.try_admit_cache_write(1)); + EXPECT_EQ(no_write.admitted_data_bytes(), 0); +} + +TEST_F(BlockFileCacheTest, remote_scan_cache_write_limiter_concurrent_budget) { + TUniqueId 
query_id; + query_id.hi = 3; + query_id.lo = 4; + RemoteScanCacheWriteLimiter state(query_id, 100); + + std::atomic admitted_count {0}; + std::vector threads; + for (int i = 0; i < 32; ++i) { + threads.emplace_back([&] { + if (state.try_admit_cache_write(10)) { + admitted_count.fetch_add(1, std::memory_order_relaxed); + } + }); + } + for (auto& thread : threads) { + thread.join(); + } + + EXPECT_LE(admitted_count.load(), 10); + EXPECT_TRUE(state.remote_only_on_miss()); + EXPECT_LE(state.admitted_data_bytes(), 100); +} + +TEST_F(BlockFileCacheTest, get_or_set_remote_scan_cache_write_limiter_admission) { + const std::string local_cache_path = + (caches_dir / "remote_only_on_miss_threshold_cache" / "").string(); + if (fs::exists(local_cache_path)) { + fs::remove_all(local_cache_path); + } + fs::create_directories(local_cache_path); + + io::BlockFileCache mgr(local_cache_path, remote_only_on_miss_test_settings()); + ASSERT_TRUE(mgr.initialize().ok()); + + TUniqueId query_id; + query_id.hi = 5; + query_id.lo = 6; + RemoteScanCacheWriteLimiter state(query_id, 1_mb); + + ReadStatistics read_stats; + io::CacheContext context; + context.stats = &read_stats; + context.cache_type = io::FileCacheType::NORMAL; + context.remote_scan_cache_write_limiter = &state; + context.admit_cache_write_by_remote_scan_limiter = true; + + auto key = io::BlockFileCache::hash("remote_only_on_miss_threshold_key"); + auto first = mgr.get_or_set(key, 0, 1_mb, context); + ASSERT_EQ(first.file_blocks.size(), 1); + EXPECT_EQ(first.file_blocks.front()->state(), FileBlock::State::EMPTY); + complete_into_memory(first); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), 1); + EXPECT_FALSE(state.remote_only_on_miss()); + + auto second = mgr.get_or_set(key, 1_mb, 1_mb, context); + ASSERT_EQ(second.file_blocks.size(), 1); + EXPECT_EQ(second.file_blocks.front()->state(), FileBlock::State::SKIP_CACHE); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), 1); + 
EXPECT_TRUE(state.remote_only_on_miss()); + + auto hit_after_threshold = mgr.get_or_set(key, 0, 1_mb, context); + ASSERT_EQ(hit_after_threshold.file_blocks.size(), 1); + EXPECT_EQ(hit_after_threshold.file_blocks.front()->state(), FileBlock::State::DOWNLOADED); +} + +TEST_F(BlockFileCacheTest, get_or_set_remote_scan_cache_write_limiter_is_query_wide_for_index) { + const std::string local_cache_path = + (caches_dir / "remote_only_on_miss_query_wide_index_cache" / "").string(); + if (fs::exists(local_cache_path)) { + fs::remove_all(local_cache_path); + } + fs::create_directories(local_cache_path); + + io::BlockFileCache mgr(local_cache_path, remote_only_on_miss_test_settings()); + ASSERT_TRUE(mgr.initialize().ok()); + + TUniqueId query_id; + query_id.hi = 15; + query_id.lo = 16; + RemoteScanCacheWriteLimiter state(query_id, 1_mb); + + io::IOContext data_io_ctx; + data_io_ctx.reader_type = ReaderType::READER_QUERY; + data_io_ctx.query_id = &query_id; + data_io_ctx.remote_scan_cache_write_limiter = &state; + io::CacheContext data_context(&data_io_ctx); + ReadStatistics data_read_stats; + data_context.stats = &data_read_stats; + EXPECT_TRUE(data_context.admit_cache_write_by_remote_scan_limiter); + EXPECT_EQ(data_context.cache_type, io::FileCacheType::NORMAL); + + auto data_key = io::BlockFileCache::hash("remote_only_on_miss_query_wide_data_key"); + auto data_holder = mgr.get_or_set(data_key, 0, 1_mb, data_context); + ASSERT_EQ(data_holder.file_blocks.size(), 1); + EXPECT_EQ(data_holder.file_blocks.front()->state(), FileBlock::State::EMPTY); + complete_into_memory(data_holder); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::NORMAL), 1); + EXPECT_FALSE(state.remote_only_on_miss()); + EXPECT_EQ(state.admitted_data_bytes(), 1_mb); + + io::IOContext index_io_ctx; + index_io_ctx.reader_type = ReaderType::READER_QUERY; + index_io_ctx.query_id = &query_id; + index_io_ctx.is_index_data = true; + index_io_ctx.is_inverted_index = true; + 
index_io_ctx.remote_scan_cache_write_limiter = &state; + io::CacheContext index_context(&index_io_ctx); + ReadStatistics index_read_stats; + index_context.stats = &index_read_stats; + EXPECT_TRUE(index_context.admit_cache_write_by_remote_scan_limiter); + EXPECT_EQ(index_context.cache_type, io::FileCacheType::INDEX); + + auto index_key = io::BlockFileCache::hash("remote_only_on_miss_query_wide_index_key"); + auto index_holder = mgr.get_or_set(index_key, 0, 1_mb, index_context); + ASSERT_EQ(index_holder.file_blocks.size(), 1); + EXPECT_EQ(index_holder.file_blocks.front()->state(), FileBlock::State::SKIP_CACHE); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::INDEX), 0); + EXPECT_TRUE(state.remote_only_on_miss()); + EXPECT_EQ(state.admitted_data_bytes(), 1_mb); +} + +TEST_F(BlockFileCacheTest, get_or_set_remote_scan_cache_write_limiter_segment_meta_config) { + const bool old_file_cache_query_limit_segment_meta = + config::file_cache_query_limit_segment_meta; + Defer restore_config {[&] { + config::file_cache_query_limit_segment_meta = old_file_cache_query_limit_segment_meta; + }}; + + const std::string local_cache_path = + (caches_dir / "remote_only_on_miss_query_segment_meta_cache" / "").string(); + if (fs::exists(local_cache_path)) { + fs::remove_all(local_cache_path); + } + fs::create_directories(local_cache_path); + + io::BlockFileCache mgr(local_cache_path, remote_only_on_miss_test_settings()); + ASSERT_TRUE(mgr.initialize().ok()); + + TUniqueId query_id; + query_id.hi = 25; + query_id.lo = 26; + + io::IOContext segment_footer_io_ctx; + segment_footer_io_ctx.reader_type = ReaderType::READER_QUERY; + segment_footer_io_ctx.query_id = &query_id; + segment_footer_io_ctx.is_index_data = true; + segment_footer_io_ctx.is_inverted_index = false; + + config::file_cache_query_limit_segment_meta = false; + RemoteScanCacheWriteLimiter default_state(query_id, 1_mb); + segment_footer_io_ctx.remote_scan_cache_write_limiter = &default_state; + io::CacheContext 
default_context(&segment_footer_io_ctx); + ReadStatistics default_read_stats; + default_context.stats = &default_read_stats; + EXPECT_FALSE(default_context.admit_cache_write_by_remote_scan_limiter); + EXPECT_EQ(default_context.cache_type, io::FileCacheType::INDEX); + + auto default_key = io::BlockFileCache::hash("remote_only_on_miss_default_segment_meta_key"); + auto default_holder = mgr.get_or_set(default_key, 0, 1_mb, default_context); + ASSERT_EQ(default_holder.file_blocks.size(), 1); + EXPECT_EQ(default_holder.file_blocks.front()->state(), FileBlock::State::EMPTY); + complete_into_memory(default_holder); + EXPECT_FALSE(default_state.remote_only_on_miss()); + EXPECT_EQ(default_state.admitted_data_bytes(), 0); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::INDEX), 1); + + config::file_cache_query_limit_segment_meta = true; + RemoteScanCacheWriteLimiter enabled_state(query_id, 1_mb); + segment_footer_io_ctx.remote_scan_cache_write_limiter = &enabled_state; + io::CacheContext enabled_context(&segment_footer_io_ctx); + ReadStatistics enabled_read_stats; + enabled_context.stats = &enabled_read_stats; + EXPECT_TRUE(enabled_context.admit_cache_write_by_remote_scan_limiter); + EXPECT_EQ(enabled_context.cache_type, io::FileCacheType::INDEX); + + auto enabled_key = io::BlockFileCache::hash("remote_only_on_miss_enabled_segment_meta_key"); + auto first = mgr.get_or_set(enabled_key, 0, 1_mb, enabled_context); + ASSERT_EQ(first.file_blocks.size(), 1); + EXPECT_EQ(first.file_blocks.front()->state(), FileBlock::State::EMPTY); + complete_into_memory(first); + EXPECT_FALSE(enabled_state.remote_only_on_miss()); + EXPECT_EQ(enabled_state.admitted_data_bytes(), 1_mb); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::INDEX), 2); + + auto second = mgr.get_or_set(enabled_key, 1_mb, 1, enabled_context); + ASSERT_EQ(second.file_blocks.size(), 1); + EXPECT_EQ(second.file_blocks.front()->state(), FileBlock::State::SKIP_CACHE); + 
EXPECT_TRUE(enabled_state.remote_only_on_miss()); + EXPECT_EQ(enabled_state.admitted_data_bytes(), 1_mb); + EXPECT_EQ(mgr.get_file_blocks_num(io::FileCacheType::INDEX), 2); +} + +TEST_F(BlockFileCacheTest, cached_remote_file_reader_specialized_write_cache_stats) { + const fs::path cache_path = caches_dir / "specialized_write_cache_stats_cache"; + BlockFileCache* cache = nullptr; + Defer cleanup_cache {[&]() { + std::error_code ignore; + fs::remove_all(cache_path, ignore); + cleanup_cached_remote_reader_cache(cache_path.string()); + }}; + + ASSERT_TRUE(create_cached_remote_reader_cache(cache_path.string(), &cache).ok()); + + auto read_and_check = [&](const std::string& name, bool is_inverted_index, int64_t mtime) { + const auto remote_file = caches_dir / name; + { + std::ofstream ofs(remote_file, std::ios::binary | std::ios::trunc); + ASSERT_TRUE(ofs.is_open()); + ofs << "abcdefghijklmnop"; + } + Defer cleanup_file {[&]() { + std::error_code ec; + fs::remove(remote_file, ec); + }}; + + FileReaderSPtr local_reader; + ASSERT_TRUE(global_local_filesystem()->open_file(remote_file.string(), &local_reader).ok()); + io::FileReaderOptions opts; + opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; + opts.is_doris_table = false; + opts.cache_base_path = cache_path.string(); + opts.mtime = mtime; + auto reader = std::make_shared(local_reader, opts); + + std::string buffer(4, '#'); + size_t bytes_read = 0; + io::IOContext io_ctx; + io::FileCacheStatistics stats; + io_ctx.file_cache_stats = &stats; + io_ctx.is_index_data = true; + io_ctx.is_inverted_index = is_inverted_index; + + ASSERT_TRUE( + reader->read_at(2, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx).ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_EQ(buffer, "cdef"); + EXPECT_GT(stats.bytes_write_into_cache, 0); + + if (is_inverted_index) { + EXPECT_EQ(stats.inverted_index_bytes_write_into_cache, stats.bytes_write_into_cache); + EXPECT_EQ(stats.inverted_index_write_cache_io_timer, 
stats.write_cache_io_timer); + EXPECT_EQ(stats.segment_footer_index_bytes_write_into_cache, 0); + EXPECT_EQ(stats.segment_footer_index_write_cache_io_timer, 0); + } else { + EXPECT_EQ(stats.segment_footer_index_bytes_write_into_cache, + stats.bytes_write_into_cache); + EXPECT_EQ(stats.segment_footer_index_write_cache_io_timer, stats.write_cache_io_timer); + EXPECT_EQ(stats.inverted_index_bytes_write_into_cache, 0); + EXPECT_EQ(stats.inverted_index_write_cache_io_timer, 0); + } + }; + + read_and_check("specialized_write_cache_stats_inverted_index", true, 1); + read_and_check("specialized_write_cache_stats_segment_footer_index", false, 2); +} + +TEST_F(BlockFileCacheTest, cached_remote_file_reader_policy_remote_only_with_scan_limiter) { + const std::string local_cache_path = + (caches_dir / "remote_only_on_miss_policy_with_threshold_reader_cache" / "").string(); + if (fs::exists(local_cache_path)) { + fs::remove_all(local_cache_path); + } + fs::create_directories(local_cache_path); + ASSERT_TRUE(FileCacheFactory::instance() + ->create_file_cache(local_cache_path, remote_only_on_miss_test_settings()) + .ok()); + Defer cleanup_cache {[&] { cleanup_cached_remote_reader_cache(local_cache_path); }}; + + const auto remote_file = caches_dir / "remote_only_on_miss_policy_with_threshold_reader_file"; + { + std::ofstream ofs(remote_file, std::ios::binary | std::ios::trunc); + ASSERT_TRUE(ofs.is_open()); + std::string data(1_mb, 'a'); + ofs.write(data.data(), data.size()); + } + + FileReaderSPtr local_reader; + ASSERT_TRUE(global_local_filesystem()->open_file(remote_file, &local_reader).ok()); + io::FileReaderOptions opts; + opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; + opts.is_doris_table = true; + opts.tablet_id = 10086; + auto reader = std::make_shared(local_reader, opts); + + auto key = io::BlockFileCache::hash(remote_file.filename().string()); + auto* cache = FileCacheFactory::instance()->get_by_path(key); + ASSERT_NE(cache, nullptr); + 
cache->remove_if_cached(key); + + TUniqueId query_id; + query_id.hi = 9; + query_id.lo = 10; + RemoteScanCacheWriteLimiter state(query_id, 1_mb); + + std::string buffer(4_kb, '\0'); + size_t bytes_read = 0; + io::IOContext io_ctx; + io::FileCacheStatistics stats; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.file_cache_stats = &stats; + io_ctx.file_cache_miss_policy = FileCacheMissPolicy::REMOTE_ONLY_ON_MISS; + io_ctx.remote_scan_cache_write_limiter = &state; + + ASSERT_TRUE( + reader->read_at(123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx).ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_EQ(stats.num_remote_io_total, 1); + EXPECT_EQ(stats.num_skip_cache_io_total, 1); + EXPECT_EQ(stats.bytes_write_into_cache, 0); + EXPECT_FALSE(state.remote_only_on_miss()); + EXPECT_EQ(state.admitted_data_bytes(), 0); + EXPECT_TRUE(cache->get_blocks_by_key(key).empty()); +} + +TEST_F(BlockFileCacheTest, cached_remote_file_reader_remote_scan_cache_write_limiter) { + const std::string local_cache_path = + (caches_dir / "remote_only_on_miss_threshold_reader_cache" / "").string(); + if (fs::exists(local_cache_path)) { + fs::remove_all(local_cache_path); + } + fs::create_directories(local_cache_path); + ASSERT_TRUE(FileCacheFactory::instance() + ->create_file_cache(local_cache_path, remote_only_on_miss_test_settings()) + .ok()); + Defer cleanup_cache {[&] { cleanup_cached_remote_reader_cache(local_cache_path); }}; + + const auto remote_file = caches_dir / "remote_only_on_miss_threshold_reader_file"; + { + std::ofstream ofs(remote_file, std::ios::binary | std::ios::trunc); + ASSERT_TRUE(ofs.is_open()); + for (int i = 0; i < 3; ++i) { + std::string data(1_mb, static_cast('a' + i)); + ofs.write(data.data(), data.size()); + } + } + + FileReaderSPtr local_reader; + ASSERT_TRUE(global_local_filesystem()->open_file(remote_file, &local_reader).ok()); + io::FileReaderOptions opts; + opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; + opts.is_doris_table 
= true; + opts.tablet_id = 10086; + auto reader = std::make_shared(local_reader, opts); + + auto key = io::BlockFileCache::hash(remote_file.filename().string()); + auto* cache = FileCacheFactory::instance()->get_by_path(key); + ASSERT_NE(cache, nullptr); + cache->remove_if_cached(key); + + TUniqueId query_id; + query_id.hi = 7; + query_id.lo = 8; + RemoteScanCacheWriteLimiter state(query_id, 1_mb); + + std::string buffer(4_kb, '\0'); + size_t bytes_read = 0; + io::IOContext io_ctx; + io::FileCacheStatistics stats; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.file_cache_stats = &stats; + io_ctx.remote_scan_cache_write_limiter = &state; + + ASSERT_TRUE( + reader->read_at(123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx).ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_GT(stats.bytes_write_into_cache, 0); + EXPECT_FALSE(state.remote_only_on_miss()); + EXPECT_EQ(cache->get_blocks_by_key(key).size(), 1); + + io::FileCacheStatistics threshold_stats; + io_ctx.file_cache_stats = &threshold_stats; + ASSERT_TRUE( + reader->read_at(1_mb + 123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx) + .ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_TRUE(state.remote_only_on_miss()); + EXPECT_EQ(threshold_stats.bytes_write_into_cache, 0); + EXPECT_EQ(threshold_stats.num_skip_cache_io_total, 1); + EXPECT_EQ(threshold_stats.remote_only_on_miss_triggered, 1); + EXPECT_EQ(threshold_stats.remote_only_on_miss_threshold_bytes, 1_mb); + EXPECT_EQ(cache->get_blocks_by_key(key).size(), 1); + + io::FileCacheStatistics full_hit_stats; + io_ctx.file_cache_stats = &full_hit_stats; + ASSERT_TRUE( + reader->read_at(123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx).ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_EQ(full_hit_stats.num_local_io_total, 1); + EXPECT_EQ(full_hit_stats.num_remote_io_total, 0); + EXPECT_EQ(full_hit_stats.bytes_write_into_cache, 0); + + io::FileCacheStatistics remote_only_stats; + 
io_ctx.file_cache_stats = &remote_only_stats; + ASSERT_TRUE( + reader->read_at(2_mb + 123, Slice(buffer.data(), buffer.size()), &bytes_read, &io_ctx) + .ok()); + EXPECT_EQ(bytes_read, buffer.size()); + EXPECT_EQ(remote_only_stats.num_remote_io_total, 1); + EXPECT_EQ(remote_only_stats.num_skip_cache_io_total, 1); + EXPECT_EQ(remote_only_stats.bytes_write_into_cache, 0); + EXPECT_EQ(cache->get_blocks_by_key(key).size(), 1); +} + void test_file_cache(io::FileCacheType cache_type) { TUniqueId query_id; query_id.hi = 1; diff --git a/be/test/storage/cloud_file_cache_write_index_only_test.cpp b/be/test/storage/cloud_file_cache_write_index_only_test.cpp new file mode 100644 index 00000000000000..c330b93fee489b --- /dev/null +++ b/be/test/storage/cloud_file_cache_write_index_only_test.cpp @@ -0,0 +1,799 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cloud/config.h" +#include "common/config.h" +#include "core/block/block.h" +#include "cpp/sync_point.h" +#include "io/cache/block_file_cache_factory.h" +#include "io/fs/file_writer.h" +#include "io/fs/local_file_system.h" +#include "io/fs/s3_file_system.h" +#include "io/fs/s3_file_writer.h" +#include "io/fs/s3_obj_storage_client.h" +#include "io/io_common.h" +#include "runtime/exec_env.h" +#include "storage/index/inverted/inverted_index_writer.h" +#include "storage/options.h" +#include "storage/rowset/rowset_factory.h" +#include "storage/rowset/rowset_writer.h" +#include "storage/rowset/rowset_writer_context.h" +#include "storage/segment/segment_index_file_cache_loader.h" +#include "storage/storage_engine.h" +#include "util/threadpool.h" +#include "util/time.h" + +namespace doris { +namespace { + +using segment_v2::SegmentIndexFileCacheLoadContext; +using segment_v2::SegmentIndexFileCacheLoadReason; +using segment_v2::SegmentIndexFileCachePreloadTask; + +constexpr int64_t kIndexOnlyTabletId = 10005; +constexpr int64_t kIndexOnlyPartitionId = 10006; +constexpr int64_t kIndexOnlyTabletSchemaHash = 10007; +constexpr std::string_view kTestDir = "ut_dir/cloud_file_cache_write_index_only_e2e"; +constexpr std::string_view kTmpDir = "ut_dir/cloud_file_cache_write_index_only_e2e/tmp"; + +bool has_suffix(std::string_view value, std::string_view suffix) { + return value.size() >= suffix.size() && value.substr(value.size() - suffix.size()) == suffix; +} + +struct CreatedS3File { + std::string path; + FileType file_type; + bool is_s3_writer = false; + bool has_cache_builder = false; + bool write_file_cache = false; + bool allow_adaptive_file_cache_write = false; + uint64_t approximate_bytes_to_write = 0; + size_t bytes_appended = 0; + bool saw_put_object = false; +}; + +struct ObservedIndexPreload { + SegmentIndexFileCacheLoadReason reason; + uint32_t segment_id = 0; + 
std::string segment_path; + uint64_t range_offset = 0; + uint64_t range_size = 0; + uint64_t segment_file_size = 0; + int closed_segment_files = 0; +}; + +struct WriterFlushCounters { + int vertical_segment_writer_flush = 0; + int segment_writer_final_flush = 0; +}; + +struct S3WriteCounters { + int segment_file_close = 0; + int open_file = 0; +}; + +} // namespace + +class CloudFileCacheWriteIndexOnlyConfigTest : public testing::Test { +protected: + void SetUp() override { + _origin_index_only = config::enable_file_cache_write_index_file_only; + _origin_enable_file_cache = config::enable_file_cache; + _origin_cloud_unique_id = config::cloud_unique_id; + } + + void TearDown() override { + auto sp = SyncPoint::get_instance(); + sp->disable_processing(); + sp->clear_all_call_backs(); + sp->clear_trace(); + + config::enable_file_cache_write_index_file_only = _origin_index_only; + config::enable_file_cache = _origin_enable_file_cache; + config::cloud_unique_id = _origin_cloud_unique_id; + } + +private: + bool _origin_index_only = false; + bool _origin_enable_file_cache = false; + std::string _origin_cloud_unique_id; +}; + +class CloudFileCacheWriteIndexOnlyTest : public testing::Test { +protected: + void SetUp() override { + _origin_index_only = config::enable_file_cache_write_index_file_only; + _origin_enable_file_cache = config::enable_file_cache; + _origin_enable_flush_file_cache_async = config::enable_flush_file_cache_async; + _origin_cloud_unique_id = config::cloud_unique_id; + _origin_enable_packed_file = config::enable_packed_file; + _origin_enable_vertical_segment_writer = config::enable_vertical_segment_writer; + + config::enable_file_cache_write_index_file_only = true; + config::enable_file_cache = true; + config::enable_flush_file_cache_async = false; + config::cloud_unique_id = "cloud_file_cache_write_index_only_e2e"; + config::enable_packed_file = false; + config::enable_vertical_segment_writer = true; + + 
ASSERT_TRUE(io::global_local_filesystem()->delete_directory(std::string(kTestDir)).ok()); + ASSERT_TRUE(io::global_local_filesystem()->create_directory(std::string(kTestDir)).ok()); + + _origin_file_cache_factory = ExecEnv::GetInstance()->_file_cache_factory; + _owned_file_cache_factory = std::make_unique(); + ExecEnv::GetInstance()->_file_cache_factory = _owned_file_cache_factory.get(); + io::FileCacheSettings settings; + settings.query_queue_size = 64 * 1024 * 1024; + settings.query_queue_elements = 64; + settings.index_queue_size = 64 * 1024 * 1024; + settings.index_queue_elements = 64; + settings.disposable_queue_size = 1024 * 1024; + settings.disposable_queue_elements = 16; + settings.capacity = 128 * 1024 * 1024; + settings.max_file_block_size = 1024 * 1024; + settings.max_query_cache_size = 0; + settings.storage = "memory"; + ASSERT_TRUE(io::FileCacheFactory::instance()->create_file_cache("memory", settings).ok()); + + std::vector paths; + paths.emplace_back(std::string(kTmpDir), -1); + auto tmp_file_dirs = std::make_unique(paths); + ASSERT_TRUE(tmp_file_dirs->init().ok()); + ExecEnv::GetInstance()->set_tmp_file_dir(std::move(tmp_file_dirs)); + + if (ExecEnv::GetInstance()->s3_file_upload_thread_pool() == nullptr) { + std::unique_ptr pool; + ASSERT_TRUE(ThreadPoolBuilder("cloud_file_cache_write_index_only_s3_upload") + .set_min_threads(1) + .set_max_threads(4) + .build(&pool) + .ok()); + ExecEnv::GetInstance()->_s3_file_upload_thread_pool = std::move(pool); + _created_s3_upload_pool = true; + } + + S3Conf s3_conf; + s3_conf.client_conf.ak = "fake_ak"; + s3_conf.client_conf.sk = "fake_sk"; + s3_conf.client_conf.endpoint = "fake_s3_endpoint"; + s3_conf.client_conf.region = "fake_s3_region"; + s3_conf.bucket = "fake_s3_bucket"; + s3_conf.prefix = "cloud_file_cache_write_index_only_e2e"; + auto fs = io::S3FileSystem::create(std::move(s3_conf), "cloud-file-cache-index-only-ut-fs"); + ASSERT_TRUE(fs.has_value()) << fs.error(); + _remote_fs = fs.value(); + + auto 
engine = std::make_unique(EngineOptions {}); + _engine = engine.get(); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + + void TearDown() override { + auto sp = SyncPoint::get_instance(); + sp->disable_processing(); + sp->clear_all_call_backs(); + sp->clear_trace(); + + _remote_fs.reset(); + _engine = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + + if (_created_s3_upload_pool) { + ExecEnv::GetInstance()->_s3_file_upload_thread_pool.reset(); + } + ExecEnv::GetInstance()->set_tmp_file_dir(nullptr); + + _owned_file_cache_factory.reset(); + ExecEnv::GetInstance()->_file_cache_factory = _origin_file_cache_factory; + + ASSERT_TRUE(io::global_local_filesystem()->delete_directory(std::string(kTestDir)).ok()); + + config::enable_file_cache_write_index_file_only = _origin_index_only; + config::enable_file_cache = _origin_enable_file_cache; + config::enable_flush_file_cache_async = _origin_enable_flush_file_cache_async; + config::cloud_unique_id = _origin_cloud_unique_id; + config::enable_packed_file = _origin_enable_packed_file; + config::enable_vertical_segment_writer = _origin_enable_vertical_segment_writer; + } + + TabletSchemaSPtr create_schema(bool with_inverted_index = false, + InvertedIndexStorageFormatPB inverted_index_storage_format = + InvertedIndexStorageFormatPB::V2) { + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(KeysType::DUP_KEYS); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(3); + + auto* key = tablet_schema_pb.add_column(); + key->set_unique_id(1); + key->set_name("k1"); + key->set_type("INT"); + key->set_is_key(true); + key->set_length(4); + key->set_index_length(4); + key->set_is_nullable(false); + key->set_is_bf_column(false); + + auto* value = tablet_schema_pb.add_column(); + value->set_unique_id(2); + value->set_name("v1"); + 
value->set_type("INT"); + value->set_is_key(false); + value->set_length(4); + value->set_index_length(4); + value->set_is_nullable(false); + value->set_is_bf_column(false); + + if (with_inverted_index) { + tablet_schema_pb.set_inverted_index_storage_format(inverted_index_storage_format); + auto* index = tablet_schema_pb.add_index(); + index->set_index_id(10000); + index->set_index_name("v1_idx"); + index->set_index_type(IndexType::INVERTED); + index->add_col_unique_id(2); + } + + auto tablet_schema = std::make_shared(); + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + + RowsetWriterContext create_context(const TabletSchemaSPtr& tablet_schema, + DataWriteType write_type = DataWriteType::TYPE_DEFAULT, + ReaderType compaction_type = ReaderType::UNKNOWN) { + RowsetId rowset_id; + rowset_id.init(_next_rowset_id++); + + RowsetWriterContext context; + context.rowset_id = rowset_id; + context.tablet_id = kIndexOnlyTabletId; + context.partition_id = kIndexOnlyPartitionId; + context.tablet_schema_hash = kIndexOnlyTabletSchemaHash; + context.rowset_type = BETA_ROWSET; + context.tablet_schema = tablet_schema; + context.rowset_state = VISIBLE; + context.version = Version(_next_rowset_id, _next_rowset_id); + context.segments_overlap = OVERLAPPING; + context.max_rows_per_segment = UINT32_MAX; + context.data_dir = nullptr; + context.write_type = write_type; + context.compaction_type = compaction_type; + context.storage_resource = StorageResource(_remote_fs); + context.tablet_path = "unused_local_tablet_path"; + context.write_file_cache = true; + context.approximate_bytes_to_write = 4096; + context.newest_write_timestamp = UnixSeconds(); + context.allow_packed_file = false; + context.encrypt_algorithm = EncryptionAlgorithmPB::PLAINTEXT; + return context; + } + + Block create_full_block(const TabletSchemaSPtr& tablet_schema, int32_t start_key = 1) { + auto block = tablet_schema->create_block(); + auto columns = std::move(block).mutate_columns(); + for 
(int32_t i = 0; i < 8; ++i) { + int32_t key = start_key + i; + int32_t value = key * 10; + columns[0]->insert_data(reinterpret_cast(&key), sizeof(key)); + columns[1]->insert_data(reinterpret_cast(&value), sizeof(value)); + } + block.set_columns(std::move(columns)); + return block; + } + + Block create_column_block(const TabletSchemaSPtr& tablet_schema, + const std::vector& column_ids, int32_t row_count = 8, + int32_t start_key = 1) { + auto block = tablet_schema->create_block(column_ids); + auto columns = std::move(block).mutate_columns(); + for (int32_t i = 0; i < row_count; ++i) { + int32_t key = start_key + i; + int32_t value = column_ids[0] == 0 ? key : key * 10; + columns[0]->insert_data(reinterpret_cast(&value), sizeof(value)); + } + block.set_columns(std::move(columns)); + return block; + } + + void install_observers( + std::vector* observed, std::vector* created_files, + int* preload_task_count, WriterFlushCounters* writer_flush_counters, + S3WriteCounters* s3_write_counters, SyncPoint::CallbackGuard* load_guard, + SyncPoint::CallbackGuard* task_guard, SyncPoint::CallbackGuard* vertical_writer_guard, + SyncPoint::CallbackGuard* segment_writer_guard, + SyncPoint::CallbackGuard* s3_client_guard, SyncPoint::CallbackGuard* s3_put_guard, + SyncPoint::CallbackGuard* create_file_guard, SyncPoint::CallbackGuard* close_file_guard, + SyncPoint::CallbackGuard* s3_open_file_guard) { + auto sp = SyncPoint::get_instance(); + sp->clear_all_call_backs(); + sp->enable_processing(); + sp->set_call_back( + "s3_client_factory::create", + [](auto&& args) { + auto* ret = try_any_cast_ret>(args); + ret->second = true; + }, + s3_client_guard); + sp->set_call_back( + "S3FileWriter::_put_object", + [created_files](auto&& args) { + auto* writer = try_any_cast(args[0]); + for (auto& file : *created_files) { + if (has_suffix(writer->path().native(), file.path)) { + file.bytes_appended = writer->bytes_appended(); + file.saw_put_object = true; + break; + } + } + auto* should_return = 
try_any_cast(args.back()); + *should_return = true; + }, + s3_put_guard); + sp->set_call_back( + "BaseBetaRowsetWriter::_create_file_writer", + [created_files](auto&& args) { + auto* path = try_any_cast(args[0]); + auto* file_type = try_any_cast(args[1]); + auto* writer = try_any_cast(args[2]); + auto* opts = try_any_cast(args[3]); + created_files->push_back(CreatedS3File { + .path = *path, + .file_type = *file_type, + .is_s3_writer = dynamic_cast(writer) != nullptr, + .has_cache_builder = writer->cache_builder() != nullptr, + .write_file_cache = opts->write_file_cache, + .allow_adaptive_file_cache_write = + opts->allow_adaptive_file_cache_write, + .approximate_bytes_to_write = opts->approximate_bytes_to_write}); + }, + create_file_guard); + sp->set_call_back( + "SegmentFileCollection::close_file_writer", + [s3_write_counters](auto&& args) { + auto* writer = try_any_cast(args[0]); + if (has_suffix(writer->path().native(), ".dat")) { + ++s3_write_counters->segment_file_close; + } + }, + close_file_guard); + sp->set_call_back( + "S3FileSystem::open_file_internal", + [s3_write_counters](auto&& /*args*/) { ++s3_write_counters->open_file; }, + s3_open_file_guard); + sp->set_call_back( + "SegmentIndexFileCacheLoader::preload_segment_indexes_to_file_cache", + [preload_task_count](auto&& args) { + auto* tasks = + try_any_cast*>( + args[1]); + *preload_task_count += static_cast(tasks->size()); + }, + task_guard); + sp->set_call_back( + "SegmentIndexFileCacheLoader::load_segment_index_to_file_cache", + [observed, s3_write_counters](auto&& args) { + auto* ctx = try_any_cast(args[0]); + auto* io_ctx = try_any_cast(args[1]); + EXPECT_TRUE(io_ctx->is_index_data); + EXPECT_TRUE(io_ctx->is_dryrun); + EXPECT_FALSE(io_ctx->is_warmup); + observed->push_back(ObservedIndexPreload { + .reason = ctx->reason, + .segment_id = ctx->segment_id, + .segment_path = ctx->segment_path, + .range_offset = ctx->range.offset, + .range_size = ctx->range.size, + .segment_file_size = 
ctx->segment_file_size, + .closed_segment_files = s3_write_counters->segment_file_close}); + + auto* ret = try_any_cast_ret(args); + ret->first = Status::OK(); + ret->second = true; + }, + load_guard); + sp->set_call_back( + "SegmentFlusher::flush_vertical_segment_writer", + [writer_flush_counters](auto&& args) { + static_cast(try_any_cast(args[0])); + ++writer_flush_counters->vertical_segment_writer_flush; + }, + vertical_writer_guard); + sp->set_call_back( + "VerticalBetaRowsetWriter::final_flush_segment_writer", + [writer_flush_counters](auto&& args) { + static_cast(try_any_cast(args[0])); + ++writer_flush_counters->segment_writer_final_flush; + }, + segment_writer_guard); + } + + void expect_segment_write_bypasses_file_cache(const std::vector& created_files) { + bool saw_segment_file = false; + for (const auto& file : created_files) { + if (file.file_type != FileType::SEGMENT_FILE) { + continue; + } + saw_segment_file = true; + EXPECT_TRUE(file.is_s3_writer) << file.path; + EXPECT_FALSE(file.write_file_cache) << file.path; + EXPECT_FALSE(file.allow_adaptive_file_cache_write) << file.path; + EXPECT_EQ(file.approximate_bytes_to_write, 0) << file.path; + EXPECT_FALSE(file.has_cache_builder) << file.path; + + auto cache_key = std::filesystem::path(file.path).filename().native(); + auto cache_blocks = io::FileCacheFactory::instance()->get_cache_data_by_path(cache_key); + EXPECT_TRUE(cache_blocks.empty()) << file.path; + } + EXPECT_TRUE(saw_segment_file); + } + + void expect_inverted_index_writes_file_cache(const std::vector& created_files) { + bool saw_index_file = false; + for (const auto& file : created_files) { + if (file.file_type != FileType::INVERTED_INDEX_FILE) { + continue; + } + saw_index_file = true; + EXPECT_TRUE(file.is_s3_writer) << file.path; + EXPECT_TRUE(file.write_file_cache) << file.path; + EXPECT_FALSE(file.allow_adaptive_file_cache_write) << file.path; + EXPECT_EQ(file.approximate_bytes_to_write, 0) << file.path; + 
EXPECT_TRUE(file.has_cache_builder) << file.path; + EXPECT_TRUE(file.saw_put_object) << file.path; + + auto cache_key = std::filesystem::path(file.path).filename().native(); + auto cache_blocks = io::FileCacheFactory::instance()->get_cache_data_by_path(cache_key); + if (file.bytes_appended == 0) { + EXPECT_TRUE(cache_blocks.empty()) << file.path; + } else { + EXPECT_FALSE(cache_blocks.empty()) + << file.path << ", bytes_appended=" << file.bytes_appended; + } + } + EXPECT_TRUE(saw_index_file); + } + + void expect_loader_open_file_is_mocked_out(const S3WriteCounters& s3_write_counters) { + EXPECT_EQ(s3_write_counters.open_file, 0); + } + + StorageEngine* _engine = nullptr; + std::shared_ptr _remote_fs; + + io::FileCacheFactory* _origin_file_cache_factory = nullptr; + std::unique_ptr _owned_file_cache_factory; + bool _created_s3_upload_pool = false; + + bool _origin_index_only = false; + bool _origin_enable_file_cache = false; + bool _origin_enable_flush_file_cache_async = false; + std::string _origin_cloud_unique_id; + bool _origin_enable_packed_file = false; + bool _origin_enable_vertical_segment_writer = false; + int64_t _next_rowset_id = 20000; +}; + +TEST_F(CloudFileCacheWriteIndexOnlyConfigTest, FileWriterOptionsKeepLegacyWhenIndexOnlyDisabled) { + config::enable_file_cache_write_index_file_only = false; + + RowsetWriterContext context; + context.write_file_cache = false; + context.approximate_bytes_to_write = 12345; + + auto segment_opts = context.get_file_writer_options(FileType::SEGMENT_FILE); + EXPECT_FALSE(segment_opts.write_file_cache); + EXPECT_TRUE(segment_opts.allow_adaptive_file_cache_write); + EXPECT_EQ(segment_opts.approximate_bytes_to_write, 12345); + + auto index_opts = context.get_file_writer_options(FileType::INVERTED_INDEX_FILE); + EXPECT_FALSE(index_opts.write_file_cache); + EXPECT_TRUE(index_opts.allow_adaptive_file_cache_write); + EXPECT_EQ(index_opts.approximate_bytes_to_write, 12345); +} + +TEST_F(CloudFileCacheWriteIndexOnlyConfigTest, 
IndexOnlyOptionsSplitSegmentAndInvertedIndexFiles) { + config::enable_file_cache_write_index_file_only = true; + + RowsetWriterContext context; + context.write_file_cache = false; + context.approximate_bytes_to_write = 12345; + + auto segment_opts = context.get_file_writer_options(FileType::SEGMENT_FILE); + EXPECT_FALSE(segment_opts.write_file_cache); + EXPECT_FALSE(segment_opts.allow_adaptive_file_cache_write); + EXPECT_EQ(segment_opts.approximate_bytes_to_write, 0); + + auto index_opts = context.get_file_writer_options(FileType::INVERTED_INDEX_FILE); + EXPECT_TRUE(index_opts.write_file_cache); + EXPECT_FALSE(index_opts.allow_adaptive_file_cache_write); + EXPECT_EQ(index_opts.approximate_bytes_to_write, 0); +} + +TEST_F(CloudFileCacheWriteIndexOnlyConfigTest, + IndexOnlyIgnoresRequestWriteFileCacheForSegmentData) { + config::enable_file_cache_write_index_file_only = true; + + RowsetWriterContext context; + context.write_file_cache = true; + context.approximate_bytes_to_write = 12345; + + auto segment_opts = context.get_file_writer_options(FileType::SEGMENT_FILE); + EXPECT_FALSE(segment_opts.write_file_cache); + EXPECT_FALSE(segment_opts.allow_adaptive_file_cache_write); + + auto index_opts = context.get_file_writer_options(FileType::INVERTED_INDEX_FILE); + EXPECT_TRUE(index_opts.write_file_cache); + EXPECT_FALSE(index_opts.allow_adaptive_file_cache_write); +} + +TEST_F(CloudFileCacheWriteIndexOnlyConfigTest, SegmentIndexFileCacheLoaderSkipsWhenConfigDisabled) { + config::enable_file_cache = false; + config::enable_file_cache_write_index_file_only = true; + + segment_v2::SegmentIndexFileCacheLoadContext context; + context.range = {.offset = 1, .size = 1}; + context.segment_file_size = 2; + + EXPECT_TRUE(segment_v2::SegmentIndexFileCacheLoader::load_segment_index_to_file_cache(context) + .ok()); +} + +TEST_F(CloudFileCacheWriteIndexOnlyConfigTest, + SegmentIndexFileCacheLoaderSkipsEmptyRangeBeforeOpenFile) { + config::enable_file_cache = true; + 
config::enable_file_cache_write_index_file_only = true; + config::cloud_unique_id = "cloud_file_cache_empty_range_ut"; + + S3Conf s3_conf; + s3_conf.client_conf.ak = "fake_ak"; + s3_conf.client_conf.sk = "fake_sk"; + s3_conf.client_conf.endpoint = "fake_s3_endpoint"; + s3_conf.client_conf.region = "fake_s3_region"; + s3_conf.bucket = "fake_s3_bucket"; + s3_conf.prefix = "cloud_file_cache_empty_range_ut"; + auto fs = io::S3FileSystem::create(std::move(s3_conf), "cloud-file-cache-empty-range-ut-fs"); + ASSERT_TRUE(fs.has_value()) << fs.error(); + + int open_file_count = 0; + SyncPoint::CallbackGuard s3_open_file_guard; + auto sp = SyncPoint::get_instance(); + sp->clear_all_call_backs(); + sp->enable_processing(); + sp->set_call_back( + "S3FileSystem::open_file_internal", + [&open_file_count](auto&& /*args*/) { + ++open_file_count; + ADD_FAILURE() << "empty range should return before opening segment"; + }, + &s3_open_file_guard); + + segment_v2::SegmentIndexFileCacheLoadContext context; + context.fs = fs.value(); + context.segment_path = "empty_range_should_not_open.dat"; + context.tablet_id = kIndexOnlyTabletId; + context.segment_file_size = 2; + + EXPECT_TRUE(segment_v2::SegmentIndexFileCacheLoader::load_segment_index_to_file_cache(context) + .ok()); + EXPECT_EQ(open_file_count, 0); +} + +TEST_F(CloudFileCacheWriteIndexOnlyTest, + LoadUsesVerticalSegmentWriterAndPreloadsAfterAllSegmentFilesClosed) { + auto tablet_schema = create_schema(true); + RowsetWriterContext context = create_context(tablet_schema); + + std::vector observed; + std::vector created_files; + int preload_task_count = 0; + WriterFlushCounters writer_flush_counters; + S3WriteCounters s3_write_counters; + SyncPoint::CallbackGuard load_guard; + SyncPoint::CallbackGuard task_guard; + SyncPoint::CallbackGuard vertical_writer_guard; + SyncPoint::CallbackGuard segment_writer_guard; + SyncPoint::CallbackGuard s3_client_guard; + SyncPoint::CallbackGuard s3_put_guard; + SyncPoint::CallbackGuard 
create_file_guard; + SyncPoint::CallbackGuard close_file_guard; + SyncPoint::CallbackGuard s3_open_file_guard; + install_observers(&observed, &created_files, &preload_task_count, &writer_flush_counters, + &s3_write_counters, &load_guard, &task_guard, &vertical_writer_guard, + &segment_writer_guard, &s3_client_guard, &s3_put_guard, &create_file_guard, + &close_file_guard, &s3_open_file_guard); + + auto writer_result = RowsetFactory::create_rowset_writer(*_engine, context, false); + ASSERT_TRUE(writer_result.has_value()) << writer_result.error(); + auto rowset_writer = std::move(writer_result).value(); + + auto block = create_full_block(tablet_schema, 1); + auto st = rowset_writer->flush_single_block(&block); + ASSERT_TRUE(st.ok()) << st; + auto second_block = create_full_block(tablet_schema, 100); + st = rowset_writer->flush_single_block(&second_block); + ASSERT_TRUE(st.ok()) << st; + + RowsetSharedPtr rowset; + st = rowset_writer->build(rowset); + ASSERT_TRUE(st.ok()) << st; + ASSERT_NE(rowset, nullptr); + EXPECT_EQ(rowset->rowset_meta()->num_segments(), 2); + + EXPECT_EQ(writer_flush_counters.vertical_segment_writer_flush, 2); + EXPECT_EQ(writer_flush_counters.segment_writer_final_flush, 0); + EXPECT_EQ(preload_task_count, 2); + ASSERT_EQ(observed.size(), 4); + std::vector ranges_per_segment(2, 0); + for (const auto& item : observed) { + ASSERT_LT(item.segment_id, ranges_per_segment.size()); + ++ranges_per_segment[item.segment_id]; + EXPECT_EQ(item.reason, SegmentIndexFileCacheLoadReason::LOAD); + EXPECT_EQ(item.segment_path, context.segment_path(item.segment_id)); + EXPECT_GT(item.range_offset, 0); + EXPECT_GT(item.range_size, 0); + EXPECT_LE(item.range_offset + item.range_size, item.segment_file_size); + EXPECT_EQ(item.closed_segment_files, 2); + } + EXPECT_EQ(ranges_per_segment[0], 2); + EXPECT_EQ(ranges_per_segment[1], 2); + + expect_segment_write_bypasses_file_cache(created_files); + expect_inverted_index_writes_file_cache(created_files); + 
expect_loader_open_file_is_mocked_out(s3_write_counters); +} + +TEST_F(CloudFileCacheWriteIndexOnlyTest, + VerticalCompactionUsesSegmentWriterAndPreloadsAfterAllSegmentFilesClosed) { + auto tablet_schema = create_schema(true); + RowsetWriterContext context = create_context(tablet_schema, DataWriteType::TYPE_COMPACTION, + ReaderType::READER_CUMULATIVE_COMPACTION); + + std::vector observed; + std::vector created_files; + int preload_task_count = 0; + WriterFlushCounters writer_flush_counters; + S3WriteCounters s3_write_counters; + SyncPoint::CallbackGuard load_guard; + SyncPoint::CallbackGuard task_guard; + SyncPoint::CallbackGuard vertical_writer_guard; + SyncPoint::CallbackGuard segment_writer_guard; + SyncPoint::CallbackGuard s3_client_guard; + SyncPoint::CallbackGuard s3_put_guard; + SyncPoint::CallbackGuard create_file_guard; + SyncPoint::CallbackGuard close_file_guard; + SyncPoint::CallbackGuard s3_open_file_guard; + install_observers(&observed, &created_files, &preload_task_count, &writer_flush_counters, + &s3_write_counters, &load_guard, &task_guard, &vertical_writer_guard, + &segment_writer_guard, &s3_client_guard, &s3_put_guard, &create_file_guard, + &close_file_guard, &s3_open_file_guard); + + auto writer_result = RowsetFactory::create_rowset_writer(*_engine, context, true); + ASSERT_TRUE(writer_result.has_value()) << writer_result.error(); + auto rowset_writer = std::move(writer_result).value(); + + std::vector key_column_ids = {0}; + auto key_block = create_column_block(tablet_schema, key_column_ids, 8, 1); + auto st = rowset_writer->add_columns(&key_block, key_column_ids, true, 4, false); + ASSERT_TRUE(st.ok()) << st; + auto second_key_block = create_column_block(tablet_schema, key_column_ids, 8, 100); + st = rowset_writer->add_columns(&second_key_block, key_column_ids, true, 4, false); + ASSERT_TRUE(st.ok()) << st; + st = rowset_writer->flush_columns(true); + ASSERT_TRUE(st.ok()) << st; + + std::vector value_column_ids = {1}; + auto value_block = 
create_column_block(tablet_schema, value_column_ids, 16, 1); + st = rowset_writer->add_columns(&value_block, value_column_ids, false, UINT32_MAX, false); + ASSERT_TRUE(st.ok()) << st; + st = rowset_writer->flush_columns(false); + ASSERT_TRUE(st.ok()) << st; + st = rowset_writer->final_flush(); + ASSERT_TRUE(st.ok()) << st; + + RowsetSharedPtr rowset; + st = rowset_writer->build(rowset); + ASSERT_TRUE(st.ok()) << st; + ASSERT_NE(rowset, nullptr); + EXPECT_EQ(rowset->rowset_meta()->num_segments(), 2); + + EXPECT_EQ(writer_flush_counters.vertical_segment_writer_flush, 0); + EXPECT_EQ(writer_flush_counters.segment_writer_final_flush, 2); + EXPECT_EQ(preload_task_count, 2); + ASSERT_EQ(observed.size(), 6); + std::vector ranges_per_segment(2, 0); + for (const auto& item : observed) { + ASSERT_LT(item.segment_id, ranges_per_segment.size()); + ++ranges_per_segment[item.segment_id]; + EXPECT_EQ(item.reason, SegmentIndexFileCacheLoadReason::CUMULATIVE_COMPACTION); + EXPECT_EQ(item.segment_path, context.segment_path(item.segment_id)); + EXPECT_GT(item.range_offset, 0); + EXPECT_GT(item.range_size, 0); + EXPECT_LE(item.range_offset + item.range_size, item.segment_file_size); + EXPECT_EQ(item.closed_segment_files, 2); + } + EXPECT_EQ(ranges_per_segment[0], 3); + EXPECT_EQ(ranges_per_segment[1], 3); + + expect_segment_write_bypasses_file_cache(created_files); + expect_inverted_index_writes_file_cache(created_files); + expect_loader_open_file_is_mocked_out(s3_write_counters); +} + +TEST_F(CloudFileCacheWriteIndexOnlyTest, + VerticalCompactionV1InvertedIndexUsesIndexOnlyFileWriterOptions) { + auto tablet_schema = create_schema(true, InvertedIndexStorageFormatPB::V1); + RowsetWriterContext context = create_context(tablet_schema, DataWriteType::TYPE_COMPACTION, + ReaderType::READER_CUMULATIVE_COMPACTION); + + std::vector observed; + std::vector created_files; + int preload_task_count = 0; + WriterFlushCounters writer_flush_counters; + S3WriteCounters s3_write_counters; + 
SyncPoint::CallbackGuard load_guard; + SyncPoint::CallbackGuard task_guard; + SyncPoint::CallbackGuard vertical_writer_guard; + SyncPoint::CallbackGuard segment_writer_guard; + SyncPoint::CallbackGuard s3_client_guard; + SyncPoint::CallbackGuard s3_put_guard; + SyncPoint::CallbackGuard create_file_guard; + SyncPoint::CallbackGuard close_file_guard; + SyncPoint::CallbackGuard s3_open_file_guard; + install_observers(&observed, &created_files, &preload_task_count, &writer_flush_counters, + &s3_write_counters, &load_guard, &task_guard, &vertical_writer_guard, + &segment_writer_guard, &s3_client_guard, &s3_put_guard, &create_file_guard, + &close_file_guard, &s3_open_file_guard); + int index_writer_create_count = 0; + SyncPoint::CallbackGuard index_writer_create_guard; + SyncPoint::get_instance()->set_call_back( + "BaseBetaRowsetWriter::create_inverted_index_file_writer", + [&index_writer_create_count](auto&& args) { + static_cast(try_any_cast(args[0])); + ++index_writer_create_count; + }, + &index_writer_create_guard); + + auto writer_result = RowsetFactory::create_rowset_writer(*_engine, context, true); + ASSERT_TRUE(writer_result.has_value()) << writer_result.error(); + auto rowset_writer = std::move(writer_result).value(); + + std::vector key_column_ids = {0}; + auto key_block = create_column_block(tablet_schema, key_column_ids, 8, 1); + auto st = rowset_writer->add_columns(&key_block, key_column_ids, true, 4, false); + ASSERT_TRUE(st.ok()) << st; + + EXPECT_EQ(writer_flush_counters.vertical_segment_writer_flush, 0); + EXPECT_EQ(writer_flush_counters.segment_writer_final_flush, 0); + EXPECT_EQ(preload_task_count, 0); + EXPECT_EQ(index_writer_create_count, 1); + expect_segment_write_bypasses_file_cache(created_files); +} + +} // namespace doris diff --git a/be/test/storage/compaction/compaction_file_cache_test.cpp b/be/test/storage/compaction/compaction_file_cache_test.cpp index 41303a9ff9e9be..765df31bf1c7b3 100644 --- 
a/be/test/storage/compaction/compaction_file_cache_test.cpp +++ b/be/test/storage/compaction/compaction_file_cache_test.cpp @@ -34,17 +34,22 @@ class CompactionFileCacheTest : public testing::Test { public: void SetUp() override { // Save original configuration + _orig_index_file_only_config = config::enable_file_cache_write_index_file_only; _orig_base_config = config::enable_file_cache_write_base_compaction_index_only; _orig_cumu_config = config::enable_file_cache_write_cumu_compaction_index_only; + + config::enable_file_cache_write_index_file_only = false; } void TearDown() override { // Restore original configuration + config::enable_file_cache_write_index_file_only = _orig_index_file_only_config; config::enable_file_cache_write_base_compaction_index_only = _orig_base_config; config::enable_file_cache_write_cumu_compaction_index_only = _orig_cumu_config; } private: + bool _orig_index_file_only_config; bool _orig_base_config; bool _orig_cumu_config; }; @@ -63,7 +68,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_IndexOnly_False_IndexFile) { ctx.compaction_output_write_index_only = false; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should be true EXPECT_TRUE(opts.write_file_cache); @@ -79,7 +84,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_IndexOnly_False_DataFile) { ctx.compaction_output_write_index_only = false; // Test: Get file writer options for data file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should be true EXPECT_TRUE(opts.write_file_cache); @@ -95,7 +100,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_IndexOnly_True_IndexFile) { ctx.compaction_output_write_index_only = true; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = 
ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should be true (index files are always cached) EXPECT_TRUE(opts.write_file_cache); @@ -111,7 +116,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_IndexOnly_True_DataFile) { ctx.compaction_output_write_index_only = true; // Test: Get file writer options for data file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should be false (data files are NOT cached when index-only is enabled) EXPECT_FALSE(opts.write_file_cache); @@ -131,7 +136,7 @@ TEST_F(CompactionFileCacheTest, CumuCompaction_IndexOnly_False_IndexFile) { ctx.compaction_output_write_index_only = false; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should be true EXPECT_TRUE(opts.write_file_cache); @@ -147,7 +152,7 @@ TEST_F(CompactionFileCacheTest, CumuCompaction_IndexOnly_False_DataFile) { ctx.compaction_output_write_index_only = false; // Test: Get file writer options for data file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should be true EXPECT_TRUE(opts.write_file_cache); @@ -163,7 +168,7 @@ TEST_F(CompactionFileCacheTest, CumuCompaction_IndexOnly_True_IndexFile) { ctx.compaction_output_write_index_only = true; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should be true (index files are always cached) EXPECT_TRUE(opts.write_file_cache); @@ -179,7 +184,7 @@ TEST_F(CompactionFileCacheTest, CumuCompaction_IndexOnly_True_DataFile) { ctx.compaction_output_write_index_only = true; // Test: Get file writer options for data 
file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should be false (data files are NOT cached when index-only is enabled) EXPECT_FALSE(opts.write_file_cache); @@ -199,7 +204,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_WriteCacheFalse_IndexOnly_False_I ctx.compaction_output_write_index_only = false; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should remain false EXPECT_FALSE(opts.write_file_cache); @@ -215,7 +220,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_WriteCacheFalse_IndexOnly_False_D ctx.compaction_output_write_index_only = false; // Test: Get file writer options for data file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should remain false EXPECT_FALSE(opts.write_file_cache); @@ -231,7 +236,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_WriteCacheFalse_IndexOnly_True_In ctx.compaction_output_write_index_only = true; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should remain false (base cache setting takes precedence) EXPECT_FALSE(opts.write_file_cache); @@ -247,7 +252,7 @@ TEST_F(CompactionFileCacheTest, BaseCompaction_WriteCacheFalse_IndexOnly_True_Da ctx.compaction_output_write_index_only = true; // Test: Get file writer options for data file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should remain false EXPECT_FALSE(opts.write_file_cache); @@ -267,7 +272,7 @@ TEST_F(CompactionFileCacheTest, 
CumuCompaction_WriteCacheFalse_IndexOnly_False_I ctx.compaction_output_write_index_only = false; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should remain false EXPECT_FALSE(opts.write_file_cache); @@ -283,7 +288,7 @@ TEST_F(CompactionFileCacheTest, CumuCompaction_WriteCacheFalse_IndexOnly_False_D ctx.compaction_output_write_index_only = false; // Test: Get file writer options for data file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should remain false EXPECT_FALSE(opts.write_file_cache); @@ -299,7 +304,7 @@ TEST_F(CompactionFileCacheTest, CumuCompaction_WriteCacheFalse_IndexOnly_True_In ctx.compaction_output_write_index_only = true; // Test: Get file writer options for index file - auto opts = ctx.get_file_writer_options(true); + auto opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); // Verify: write_file_cache should remain false (base cache setting takes precedence) EXPECT_FALSE(opts.write_file_cache); @@ -315,12 +320,33 @@ TEST_F(CompactionFileCacheTest, CumuCompaction_WriteCacheFalse_IndexOnly_True_Da ctx.compaction_output_write_index_only = true; // Test: Get file writer options for data file - auto opts = ctx.get_file_writer_options(false); + auto opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); // Verify: write_file_cache should remain false EXPECT_FALSE(opts.write_file_cache); } +TEST_F(CompactionFileCacheTest, GlobalIndexFileOnlyTakesPrecedenceOverCompactionConfigs) { + config::enable_file_cache_write_index_file_only = true; + config::enable_file_cache_write_base_compaction_index_only = true; + config::enable_file_cache_write_cumu_compaction_index_only = true; + + RowsetWriterContext ctx; + ctx.write_file_cache = false; + ctx.compaction_output_write_index_only = false; + 
ctx.approximate_bytes_to_write = 12345; + + auto segment_opts = ctx.get_file_writer_options(FileType::SEGMENT_FILE); + EXPECT_FALSE(segment_opts.write_file_cache); + EXPECT_FALSE(segment_opts.allow_adaptive_file_cache_write); + EXPECT_EQ(segment_opts.approximate_bytes_to_write, 0); + + auto index_opts = ctx.get_file_writer_options(FileType::INVERTED_INDEX_FILE); + EXPECT_TRUE(index_opts.write_file_cache); + EXPECT_FALSE(index_opts.allow_adaptive_file_cache_write); + EXPECT_EQ(index_opts.approximate_bytes_to_write, 0); +} + // ============================================================================ // Tests for should_enable_compaction_cache_index_only function // ============================================================================ diff --git a/be/test/storage/segment/column_reader_cache_test.cpp b/be/test/storage/segment/column_reader_cache_test.cpp index da8aa13d3bfff8..bab89d4a9cbfa4 100644 --- a/be/test/storage/segment/column_reader_cache_test.cpp +++ b/be/test/storage/segment/column_reader_cache_test.cpp @@ -82,9 +82,10 @@ class ColumnReaderCacheTest : public ::testing::Test { // once the test-specific footer has been constructed. 
io::FileReaderSPtr file_reader; // nullptr is fine for inline-only tests auto footer_cb = [this](std::shared_ptr& footer_pb_shared, - OlapReaderStatistics* stats) -> Status { + OlapReaderStatistics* stats, + const io::IOContext* io_ctx) -> Status { // Delegate to MockSegment::_get_segment_footer - return _mock_segment->_get_segment_footer(footer_pb_shared, stats); + return _mock_segment->_get_segment_footer(footer_pb_shared, stats, io_ctx); }; _cache = std::make_unique(&_accessor, _mock_segment->tablet_schema(), file_reader, _mock_segment->num_rows(), @@ -105,10 +106,10 @@ class ColumnReaderCacheTest : public ::testing::Test { EXPECT_CALL(*_mock_segment, num_rows()).WillRepeatedly(Return(1000)); // mock _get_segment_footer - EXPECT_CALL(*_mock_segment, _get_segment_footer(_, _)) + EXPECT_CALL(*_mock_segment, _get_segment_footer(_, _, _)) .WillRepeatedly( testing::Invoke([this](std::shared_ptr& footer_pb_shared, - OlapReaderStatistics*) { + OlapReaderStatistics*, const io::IOContext*) { if (_mock_segment->_footer) { footer_pb_shared = _mock_segment->_footer; return Status::OK(); @@ -490,4 +491,4 @@ TEST_F(ColumnReaderCacheTest, EmptyPath) { EXPECT_TRUE(status.is()); } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/test/storage/segment/inverted_index_fs_directory_test.cpp b/be/test/storage/segment/inverted_index_fs_directory_test.cpp index d42559a0e39975..4109dba52beaff 100644 --- a/be/test/storage/segment/inverted_index_fs_directory_test.cpp +++ b/be/test/storage/segment/inverted_index_fs_directory_test.cpp @@ -24,6 +24,7 @@ #include #include "common/config.h" +#include "io/cache/remote_scan_cache_write_limiter.h" #include "io/fs/file_system.h" #include "io/fs/local_file_system.h" #include "runtime/exec_env.h" @@ -820,6 +821,61 @@ TEST_F(DorisFSDirectoryTest, FSIndexInputReadInternalTimer) { _CLDELETE(input1); } +TEST_F(DorisFSDirectoryTest, FSIndexInputSetIoContextPropagatesQueryLimiter) { + 
std::string file_name = "test_io_context_file"; + std::filesystem::path test_file = _tmp_dir / file_name; + std::ofstream ofs(test_file); + std::string content = "some test content for io context"; + ofs << content; + ofs.close(); + + lucene::store::IndexInput* input = nullptr; + CLuceneError error; + bool result = + DorisFSDirectory::FSIndexInput::open(_fs, test_file.string().c_str(), input, error); + EXPECT_TRUE(result); + ASSERT_NE(input, nullptr); + + auto* fs_input = dynamic_cast(input); + ASSERT_NE(fs_input, nullptr); + + TUniqueId query_id; + query_id.hi = 1; + query_id.lo = 2; + io::FileCacheStatistics stats; + io::RemoteScanCacheWriteLimiter limiter(query_id, 0); + io::IOContext io_ctx; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.query_id = &query_id; + io_ctx.file_cache_stats = &stats; + io_ctx.file_cache_miss_policy = io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS; + io_ctx.remote_scan_cache_write_limiter = &limiter; + + fs_input->setIndexFile(true); + fs_input->setIoContext(&io_ctx); + + const auto* actual = static_cast(fs_input->getIoContext()); + EXPECT_EQ(actual->reader_type, ReaderType::READER_QUERY); + EXPECT_EQ(actual->query_id, &query_id); + EXPECT_EQ(actual->file_cache_stats, &stats); + EXPECT_EQ(actual->file_cache_miss_policy, io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS); + EXPECT_EQ(actual->remote_scan_cache_write_limiter, &limiter); + EXPECT_TRUE(actual->is_inverted_index); + EXPECT_TRUE(actual->is_index_data); + + fs_input->setIoContext(nullptr); + actual = static_cast(fs_input->getIoContext()); + EXPECT_EQ(actual->reader_type, ReaderType::UNKNOWN); + EXPECT_EQ(actual->query_id, nullptr); + EXPECT_EQ(actual->file_cache_stats, nullptr); + EXPECT_EQ(actual->file_cache_miss_policy, io::FileCacheMissPolicy::READ_THROUGH_AND_WRITE_BACK); + EXPECT_EQ(actual->remote_scan_cache_write_limiter, nullptr); + EXPECT_TRUE(actual->is_inverted_index); + EXPECT_TRUE(actual->is_index_data); + + _CLDELETE(input); +} + TEST_F(DorisFSDirectoryTest, 
PrivGetFN) { { std::string file_name = "my_file.txt"; @@ -841,4 +897,4 @@ TEST_F(DorisFSDirectoryTest, PrivGetFN) { } } -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/test/storage/segment/mock/mock_segment.h b/be/test/storage/segment/mock/mock_segment.h index 6c715a53dee612..798e79bccea231 100644 --- a/be/test/storage/segment/mock/mock_segment.h +++ b/be/test/storage/segment/mock/mock_segment.h @@ -45,7 +45,8 @@ class MockSegment : public Segment { // Mock methods for footer - make it virtual and public MOCK_METHOD(Status, _get_segment_footer, - (std::shared_ptr&, OlapReaderStatistics*), ()); + (std::shared_ptr&, OlapReaderStatistics*, const io::IOContext*), + (override)); // Helper methods for test setup void add_column_uid_mapping(int32_t col_uid, int32_t footer_ordinal) { @@ -68,4 +69,4 @@ class MockSegment : public Segment { friend class ColumnReaderCache; }; -} // namespace doris::segment_v2 \ No newline at end of file +} // namespace doris::segment_v2 diff --git a/be/test/storage/segment/segment_footer_cache_test.cpp b/be/test/storage/segment/segment_footer_cache_test.cpp index 0e75b3945f3ba8..d6af7b5146f84d 100644 --- a/be/test/storage/segment/segment_footer_cache_test.cpp +++ b/be/test/storage/segment/segment_footer_cache_test.cpp @@ -20,6 +20,8 @@ #include #include "core/field.h" +#include "cpp/sync_point.h" +#include "io/cache/remote_scan_cache_write_limiter.h" #include "storage/cache/page_cache.h" #include "storage/segment/segment.h" #include "storage/segment/segment_writer.h" @@ -139,6 +141,8 @@ class SegmentFooterCacheTest : public ::testing::Test { } void TearDown() override { + _segments.clear(); + ExecEnv::GetInstance()->set_storage_engine(nullptr); EXPECT_TRUE(io::global_local_filesystem() ->delete_directory(segment_footer_cache_test_segment_dir) .ok()); @@ -162,6 +166,106 @@ TEST_F(SegmentFooterCacheTest, TestGetSegmentFooter) { } } +TEST_F(SegmentFooterCacheTest, 
GetSegmentFooterPropagatesIoContext) { + auto segment_ptr = _segments.front(); + auto* segment_footer_cache = ExecEnv::GetInstance()->get_storage_page_cache(); + ASSERT_NE(segment_footer_cache, nullptr); + segment_footer_cache->erase(segment_ptr->get_segment_footer_cache_key(), + segment_v2::PageTypePB::INDEX_PAGE); + segment_ptr->_footer_pb.reset(); + + TUniqueId query_id; + query_id.hi = 100; + query_id.lo = 200; + io::RemoteScanCacheWriteLimiter limiter(query_id, 1024); + OlapReaderStatistics stats; + io::IOContext io_ctx; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.query_id = &query_id; + io_ctx.file_cache_miss_policy = io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS; + io_ctx.remote_scan_cache_write_limiter = &limiter; + + auto* sp = SyncPoint::get_instance(); + sp->enable_processing(); + bool observed_parse_footer_io_ctx = false; + SyncPoint::CallbackGuard guard; + sp->set_call_back( + "Segment::_parse_footer::io_ctx", + [&](auto&& args) { + auto* actual = try_any_cast(args[0]); + observed_parse_footer_io_ctx = true; + EXPECT_EQ(actual->reader_type, ReaderType::READER_QUERY); + EXPECT_EQ(actual->query_id, &query_id); + EXPECT_EQ(actual->file_cache_stats, &stats.file_cache_stats); + EXPECT_EQ(actual->file_cache_miss_policy, + io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS); + EXPECT_EQ(actual->remote_scan_cache_write_limiter, &limiter); + EXPECT_TRUE(actual->is_index_data); + EXPECT_FALSE(actual->is_inverted_index); + }, + &guard); + + std::shared_ptr footer; + Status st = segment_ptr->_get_segment_footer(footer, &stats, &io_ctx); + sp->disable_processing(); + + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_NE(footer, nullptr); + EXPECT_TRUE(observed_parse_footer_io_ctx); +} + +TEST_F(SegmentFooterCacheTest, OpenPropagatesIoContextToFooter) { + auto segment_ptr = _segments.front(); + auto* segment_footer_cache = ExecEnv::GetInstance()->get_storage_page_cache(); + ASSERT_NE(segment_footer_cache, nullptr); + 
segment_footer_cache->erase(segment_ptr->get_segment_footer_cache_key(), + segment_v2::PageTypePB::INDEX_PAGE); + + TUniqueId query_id; + query_id.hi = 300; + query_id.lo = 400; + io::RemoteScanCacheWriteLimiter limiter(query_id, 1024); + OlapReaderStatistics stats; + io::IOContext io_ctx; + io_ctx.reader_type = ReaderType::READER_QUERY; + io_ctx.query_id = &query_id; + io_ctx.file_cache_miss_policy = io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS; + io_ctx.remote_scan_cache_write_limiter = &limiter; + + auto* sp = SyncPoint::get_instance(); + sp->enable_processing(); + bool observed_parse_footer_io_ctx = false; + SyncPoint::CallbackGuard guard; + sp->set_call_back( + "Segment::_parse_footer::io_ctx", + [&](auto&& args) { + auto* actual = try_any_cast(args[0]); + observed_parse_footer_io_ctx = true; + EXPECT_EQ(actual->reader_type, ReaderType::READER_QUERY); + EXPECT_EQ(actual->query_id, &query_id); + EXPECT_EQ(actual->file_cache_stats, &stats.file_cache_stats); + EXPECT_EQ(actual->file_cache_miss_policy, + io::FileCacheMissPolicy::REMOTE_ONLY_ON_MISS); + EXPECT_EQ(actual->remote_scan_cache_write_limiter, &limiter); + EXPECT_TRUE(actual->is_index_data); + EXPECT_FALSE(actual->is_inverted_index); + }, + &guard); + + std::shared_ptr opened_segment; + io::FileReaderOptions reader_options; + reader_options.file_size = segment_ptr->file_reader()->size(); + auto st = segment_v2::Segment::open( + io::global_local_filesystem(), segment_ptr->file_reader()->path().native(), + /*tablet_id=*/1, segment_ptr->id(), segment_ptr->rowset_id(), + segment_ptr->tablet_schema(), reader_options, &opened_segment, {}, &stats, &io_ctx); + sp->disable_processing(); + + ASSERT_TRUE(st.ok()) << st.to_string(); + ASSERT_NE(opened_segment, nullptr); + EXPECT_TRUE(observed_parse_footer_io_ctx); +} + TEST_F(SegmentFooterCacheTest, TestGetSegmentFooterCacheKey) { for (auto segment_ptr : _segments) { StoragePageCache::CacheKey cache_key = segment_ptr->get_segment_footer_cache_key(); diff --git 
a/be/test/storage/variant/variant_column_writer_reader_test.cpp b/be/test/storage/variant/variant_column_writer_reader_test.cpp index 8b92a1983af967..191189092bdb24 100644 --- a/be/test/storage/variant/variant_column_writer_reader_test.cpp +++ b/be/test/storage/variant/variant_column_writer_reader_test.cpp @@ -153,17 +153,17 @@ class MockColumnReaderCache : public segment_v2::ColumnReaderCache { MockColumnReaderCache(const SegmentFooterPB& footer, const io::FileReaderSPtr& file_reader, const std::shared_ptr& tablet_schema) : ColumnReaderCache(nullptr, nullptr, nullptr, 0, - [](std::shared_ptr&, OlapReaderStatistics*) { - return Status::OK(); - }), + [](std::shared_ptr&, OlapReaderStatistics*, + const io::IOContext*) { return Status::OK(); }), _footer(footer), _file_reader(file_reader), _tablet_schema(tablet_schema) {} - Status get_path_column_reader( - int32_t col_uid, PathInData relative_path, - std::shared_ptr* column_reader, OlapReaderStatistics* stats, - const SubcolumnColumnMetaInfo::Node* node_hint = nullptr) override { + Status get_path_column_reader(int32_t col_uid, PathInData relative_path, + std::shared_ptr* column_reader, + OlapReaderStatistics* stats, + const SubcolumnColumnMetaInfo::Node* node_hint = nullptr, + const io::IOContext* io_ctx = nullptr) override { DCHECK(node_hint != nullptr); // Use node_hint's footer_ordinal to locate the specific ColumnMeta int32_t footer_ordinal = node_hint->data.footer_ordinal; diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 2870c3443fefd8..31d9d020c644d4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -507,8 +507,14 @@ public String toString() { public static final String DISABLE_FILE_CACHE = "disable_file_cache"; + public static final String ENABLE_TOPN_LAZY_MAT_PHASE2_NO_WRITE_FILE_CACHE + = 
"enable_topn_lazy_mat_phase2_no_write_file_cache"; + public static final String FILE_CACHE_QUERY_LIMIT_PERCENT = "file_cache_query_limit_percent"; + public static final String FILE_CACHE_QUERY_LIMIT_BYTES = + "file_cache_query_limit_bytes"; + public static final String FILE_CACHE_BASE_PATH = "file_cache_base_path"; public static final String ENABLE_INVERTED_INDEX_QUERY = "enable_inverted_index_query"; @@ -2269,6 +2275,14 @@ public boolean isEnableHboNonStrictMatchingMode() { @VarAttrDef.VarAttr(name = DISABLE_FILE_CACHE, needForward = true) public boolean disableFileCache = false; + @VarAttrDef.VarAttr(name = ENABLE_TOPN_LAZY_MAT_PHASE2_NO_WRITE_FILE_CACHE, needForward = true, + description = { + "开启后,TopN 延迟物化第二阶段读取在 file cache miss 时直接读远端且不写回 file cache。", + "When enabled, TopN lazy materialization phase-2 reads go remote-only on " + + "file-cache miss and do not write the missed range back to file cache." + }) + public boolean enableTopnLazyMatPhase2NoWriteFileCache = false; + // Whether enable block file cache. Only take effect when BE config item enable_file_cache is true. @VarAttrDef.VarAttr(name = ENABLE_FILE_CACHE, needForward = true, description = { "是否启用 file cache。该变量只有在 be.conf 中 enable_file_cache=true 时才有效," @@ -3079,6 +3093,14 @@ public void checkFileCacheQueryLimitPercent(String fileCacheQueryLimitPercentStr } } + @VarAttrDef.VarAttr(name = FILE_CACHE_QUERY_LIMIT_BYTES, needForward = true, + description = {"单个查询在每个 BE 上最多允许 read-through 写入 file cache 的远端 scan bytes。" + + "< 0 表示关闭,= 0 表示查询开始即不写 file cache,> 0 表示达到阈值后不写 file cache。", + "Maximum remote scan bytes allowed to write file cache per query on each BE. 
" + + "< 0 disables it, = 0 disables file cache writes from query start, " + + "> 0 disables file cache writes after the threshold is reached."}) + public long fileCacheQueryLimitBytes = -1; + public void setAggPhase(int phase) { aggPhase = phase; } @@ -5611,6 +5633,7 @@ public TQueryOptions toThrift() { tResult.setParallelScanMinRowsPerScanner(parallelScanMinRowsPerScanner); tResult.setOptimizeIndexScanParallelism(optimizeIndexScanParallelism); tResult.setDisableFileCache(disableFileCache); + tResult.setEnableTopnLazyMatPhase2NoWriteFileCache(enableTopnLazyMatPhase2NoWriteFileCache); tResult.setEnablePreferCachedRowset(getEnablePreferCachedRowset()); tResult.setQueryFreshnessToleranceMs(getQueryFreshnessToleranceMs()); @@ -5696,7 +5719,7 @@ public TQueryOptions toThrift() { tResult.setIcebergWriteTargetFileSizeBytes(icebergWriteTargetFileSizeBytes); tResult.setEnableLocalShufflePlanner(enableLocalShufflePlanner); - + tResult.setFileCacheQueryLimitBytes(fileCacheQueryLimitBytes); return tResult; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/qe/SessionVariablesTest.java b/fe/fe-core/src/test/java/org/apache/doris/qe/SessionVariablesTest.java index a8446efb451fee..bc3c44e05d0e0b 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/qe/SessionVariablesTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/qe/SessionVariablesTest.java @@ -27,6 +27,7 @@ import org.apache.doris.common.FeConstants; import org.apache.doris.nereids.parser.NereidsParser; import org.apache.doris.nereids.rules.rewrite.eageraggregation.EagerAggHints.Action; +import org.apache.doris.thrift.TQueryOptions; import org.apache.doris.utframe.TestWithFeService; import org.junit.jupiter.api.Assertions; @@ -345,4 +346,16 @@ public void testAnnSessionVariableChecker() throws Exception { Assertions.assertTrue(nprobeException.getMessage().contains("ivf_nprobe must be >= 1")); Assertions.assertEquals(2, sv.ivfNprobe); } + + @Test + public void testFileCacheQueryLimitBytesToThrift() throws 
Exception { + SessionVariable variable = new SessionVariable(); + VariableMgr.setVar(variable, new SetVar(SetType.SESSION, + SessionVariable.FILE_CACHE_QUERY_LIMIT_BYTES, + new IntLiteral(262144))); + + TQueryOptions queryOptions = variable.toThrift(); + Assertions.assertTrue(queryOptions.isSetFileCacheQueryLimitBytes()); + Assertions.assertEquals(262144L, queryOptions.getFileCacheQueryLimitBytes()); + } } diff --git a/gensrc/proto/internal_service.proto b/gensrc/proto/internal_service.proto index c5818339f3a66e..e4b41726bd76c3 100644 --- a/gensrc/proto/internal_service.proto +++ b/gensrc/proto/internal_service.proto @@ -854,6 +854,18 @@ message PRequestBlockDesc { repeated uint32 column_idxs = 7; } +message PTopNLazyMaterializationFileCacheStats { + optional int64 local_io_count = 1; + optional int64 local_io_bytes = 2; + optional int64 remote_io_count = 3; + optional int64 remote_io_bytes = 4; + optional int64 skip_cache_io_count = 5; + optional int64 write_cache_bytes = 6; + optional int64 local_io_time = 7; + optional int64 remote_io_time = 8; + optional int64 write_cache_io_time = 9; +} + message PMultiGetRequestV2 { repeated PRequestBlockDesc request_block_descs = 1; @@ -862,6 +874,7 @@ message PMultiGetRequestV2 { optional PUniqueId query_id = 3; optional bool gc_id_map = 4; optional uint64 wg_id = 5; + optional bool file_cache_remote_only_on_miss = 6; }; message PMultiGetBlockV2 { @@ -878,6 +891,7 @@ message PMultiGetBlockV2 { message PMultiGetResponseV2 { optional PStatus status = 1; repeated PMultiGetBlockV2 blocks = 2; + optional PTopNLazyMaterializationFileCacheStats topn_lazy_materialization_file_cache_stats = 3; }; message PFetchColIdsRequest { @@ -1267,4 +1281,3 @@ service PBackendService { rpc fetch_peer_data(PFetchPeerDataRequest) returns (PFetchPeerDataResponse); rpc request_cdc_client(PRequestCdcClientRequest) returns (PRequestCdcClientResult); }; - diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift 
index 0d8618dbc78a0f..3aca68e1c3c21d 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -505,6 +505,8 @@ struct TQueryOptions { // In read path, read from file cache or remote storage when execute query. 1000: optional bool disable_file_cache = false 1001: optional i32 file_cache_query_limit_percent = -1 + 1002: optional bool enable_topn_lazy_mat_phase2_no_write_file_cache = false + 1003: optional i64 file_cache_query_limit_bytes = -1 } diff --git a/regression-test/suites/cloud_p0/cache/remote_scan_no_write_file_cache/test_file_cache_query_limit_segment_meta_profile.groovy b/regression-test/suites/cloud_p0/cache/remote_scan_no_write_file_cache/test_file_cache_query_limit_segment_meta_profile.groovy new file mode 100644 index 00000000000000..536becaca24f19 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/remote_scan_no_write_file_cache/test_file_cache_query_limit_segment_meta_profile.groovy @@ -0,0 +1,475 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.Http + +import java.util.Random + +suite("test_file_cache_query_limit_segment_meta_profile", "docker") { + def cacheBlockSize = 262144L + def options = new ClusterOptions() + options.cloudMode = true + options.setFeNum(1) + options.setBeNum(1) + options.msNum = 1 + options.beConfigs += [ + "enable_file_cache=true", + "enable_read_cache_file_directly=false", + "disable_storage_page_cache=true", + "disable_segment_cache=true", + "enable_java_support=false", + "enable_evict_file_cache_in_advance=false", + "file_cache_enter_disk_resource_limit_mode_percent=99", + "file_cache_each_block_size=${cacheBlockSize}", + "file_cache_path=[{\"path\":\"/opt/apache-doris/be/storage/file_cache\",\"total_size\":134217728,\"query_limit\":134217728}]" + ] + + docker(options) { + def clusters = sql "SHOW CLUSTERS" + assert !clusters.isEmpty() + def computeGroup = clusters[0][0] + sql "use @${computeGroup}" + + def backends = sql "SHOW BACKENDS" + assert backends.size() == 1 + def beHost = backends[0][1] + def beHttpPort = backends[0][4] + def supportMaxScannersConcurrency = + !(sql "show variables like 'max_scanners_concurrency'").isEmpty() + + def rowsPerTable = 4096 + def batchRows = 512 + def chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + + def clearFileCache = { + def result = Http.GET("http://${beHost}:${beHttpPort}/api/file_cache?op=clear&sync=true", true) + logger.info("clear file cache result: ${result}") + } + + def getBeParam = { String paramName -> + def (code, out, err) = curl( + "GET", String.format("http://%s:%s/api/show_config?conf_item=%s", + beHost, beHttpPort, paramName)) + assert code == 0 : "show_config ${paramName} failed, out=${out}, err=${err}" + assert out.contains(paramName) : "show_config ${paramName} missing item, out=${out}" + def resultList = parseJson(out)[0] + assert resultList.size() == 4 : "unexpected show_config result=${out}" 
+ def paramValue = resultList[2].toString() + logger.info("BE config ${paramName} current value: ${paramValue}") + return paramValue + } + + def setBeParam = { String paramName, String paramValue -> + logger.info("set BE config ${paramName}=${paramValue}") + def (code, out, err) = curl( + "POST", String.format("http://%s:%s/api/update_config?%s=%s", + beHost, beHttpPort, paramName, paramValue)) + assert code == 0 && out.contains("OK") : + "update_config ${paramName}=${paramValue} failed, out=${out}, err=${err}" + } + + def randomText = { Random random, int length -> + def builder = new StringBuilder(length) + for (int i = 0; i < length; i++) { + builder.append(chars.charAt(random.nextInt(chars.length()))) + } + return builder.toString() + } + + def insertRows = { String tableName -> + for (int batchStart = 0; batchStart < rowsPerTable; batchStart += batchRows) { + def random = new Random(((long) tableName.hashCode()) * 131 + batchStart) + def data = new StringBuilder(batchRows * 3500) + for (int i = 0; i < batchRows; i++) { + int id = batchStart + i + data.append(id).append('\t') + data.append(id % 128).append('\t') + data.append(randomText(random, 2048)).append('\t') + data.append(randomText(random, 1024)).append('\n') + } + streamLoad { + table tableName + set 'column_separator', '\t' + set 'columns', 'id,group_id,payload,pad' + inputText data.toString() + time 60000 + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("stream load ${tableName} result: ${result}") + def json = parseJson(result) + assert json.Status.toString().equalsIgnoreCase("success") : result + assert json.NumberLoadedRows.toString().toInteger() == batchRows : result + } + } + } + sql "SYNC" + } + + def createTableAndLoad = { String tableName, boolean verifyRowCount = true -> + sql "DROP TABLE IF EXISTS ${tableName} FORCE" + sql """ + CREATE TABLE ${tableName} ( + id INT NOT NULL, + group_id INT NOT NULL, + payload VARCHAR(4096) NOT 
NULL, + pad VARCHAR(4096) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true" + ) + """ + insertRows(tableName) + + if (verifyRowCount) { + def rowCount = sql "SELECT COUNT(*) FROM ${tableName}" + assert rowCount[0][0] == rowsPerTable + } + } + + def parseProfileCounterValue = { String valueText -> + def exact = (valueText =~ /\((\d+)\)/) + if (exact.find()) { + return exact.group(1).toLong() + } + def number = (valueText =~ /([0-9]+(?:\.[0-9]+)?)\s*(B|KB|MB|GB)?/) + if (!number.find()) { + return 0L + } + BigDecimal value = new BigDecimal(number.group(1)) + long multiplier = 1L + if (number.group(2) == "KB") { + multiplier = 1024L + } else if (number.group(2) == "MB") { + multiplier = 1024L * 1024L + } else if (number.group(2) == "GB") { + multiplier = 1024L * 1024L * 1024L + } + return (value * multiplier).toLong() + } + + def parseProfileCounterAggregateValue = { String valueText, String aggregateName -> + def aggregate = (valueText =~ ("(?:^|,\\s*)" + + java.util.regex.Pattern.quote(aggregateName) + "\\s+([^,]+)")) + if (aggregate.find()) { + return parseProfileCounterValue(aggregate.group(1).toString()) + } + if (aggregateName.equalsIgnoreCase("max")) { + def highWaterMarkPeak = (valueText =~ /(?:^|\s)\(Peak:\s*(.+)\)\s*$/) + if (highWaterMarkPeak.find()) { + return parseProfileCounterValue(highWaterMarkPeak.group(1).toString()) + } + } + return parseProfileCounterValue(valueText) + } + + def detailProfileSection = { String profileString -> + int start = profileString.indexOf("\nDetailProfile(") + if (start < 0) { + start = profileString.indexOf("DetailProfile(") + } + if (start < 0) { + return profileString + } + int end = profileString.indexOf("\nAppendix:", start) + return end < 0 ? 
profileString.substring(start) : profileString.substring(start, end) + } + + def profileCounterValues = { String profileString, String counterName, + String aggregateName = null -> + def values = [] + def profileSection = detailProfileSection(profileString) + def matcher = (profileSection =~ ("(?m)^\\s*(?:-\\s*)?" + + java.util.regex.Pattern.quote(counterName) + "(?::\\s+|\\s+Current:\\s+)([^\\n]+)")) + while (matcher.find()) { + def valueText = matcher.group(1).toString() + values.add(aggregateName == null ? parseProfileCounterValue(valueText) : + parseProfileCounterAggregateValue(valueText, aggregateName)) + } + return values + } + + def sumProfileCounter = { String profileString, String counterName -> + long total = 0 + profileCounterValues(profileString, counterName).each { + total += it + } + return total + } + + def maxProfileCounter = { String profileString, String counterName -> + def values = profileCounterValues(profileString, counterName, "max") + values.addAll(profileCounterValues(profileString, "${counterName}Peak")) + return values.isEmpty() ? 
0L : values.max() + } + + def collectFileCacheCounters = { String label, String profileString -> + def counters = [ + remoteOnlyTriggered: maxProfileCounter(profileString, "RemoteOnlyOnMissTriggered"), + thresholdMetricBytes: maxProfileCounter(profileString, "RemoteOnlyOnMissThresholdBytes"), + skipCacheIo: sumProfileCounter(profileString, "NumSkipCacheIOTotal"), + writeCacheBytes: sumProfileCounter(profileString, "BytesWriteIntoCache"), + segmentMetaWriteCacheBytes: + sumProfileCounter(profileString, "SegmentFooterIndexBytesWriteIntoCache"), + segmentMetaRemoteIo: + sumProfileCounter(profileString, "SegmentFooterIndexNumRemoteIOTotal"), + scannerNum: sumProfileCounter(profileString, "NumScanners") + ] + logger.info("${label} file-cache counters: ${counters}") + return counters + } + + def setupQuerySession = { long thresholdBytes -> + sql "set enable_profile = true" + sql "set profile_level = 2" + sql "set enable_sql_cache = false" + sql "set enable_query_cache = false" + sql "set enable_file_cache = false" + sql "set enable_page_cache = false" + sql "set enable_segment_cache = false" + sql "set parallel_fragment_exec_instance_num = 1" + sql "set enable_parallel_scan = true" + sql "set parallel_scan_max_scanners_count = 1" + sql "set parallel_scan_min_rows_per_scanner = 1024" + if (supportMaxScannersConcurrency) { + sql "set max_scanners_concurrency = 1" + } + sql "set parallel_pipeline_task_num = 1" + sql "set file_cache_query_limit_bytes = ${thresholdBytes}" + logger.info("query file_cache_query_limit_bytes set to ${thresholdBytes}") + } + + def setupParallelPreloadQuerySession = { long thresholdBytes -> + setupQuerySession(thresholdBytes) + sql "set parallel_scan_max_scanners_count = 4" + sql "set parallel_scan_min_rows_per_scanner = 1024" + if (supportMaxScannersConcurrency) { + sql "set max_scanners_concurrency = 4" + } + sql "set parallel_pipeline_task_num = 4" + } + + def runProfileQuery = { String name, String query, long thresholdBytes -> + def counters = 
null + logger.info("run profile query ${name}, thresholdBytes=${thresholdBytes}, sql=${query}") + profile(name) { + run { + sql "/* ${name} */ ${query}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + logger.error("Profile failed, profile result:\n${profileString}", exception) + throw exception + } + assert profileString.contains("Is Cached: No") || + profileString.contains("Is Cached: No") : + "query should not hit SQL/query cache, profile=${profileString}" + counters = collectFileCacheCounters(name, profileString) + if (thresholdBytes >= 0) { + assert counters.thresholdMetricBytes == thresholdBytes : + "${name}: expected threshold bytes ${thresholdBytes}, " + + "profile=${profileString}" + } + } + } + assert counters != null : "${name}: profile check was not executed" + return counters + } + + def runCase = { String tableName, boolean segmentMetaCounts, long thresholdBytes -> + setBeParam("file_cache_query_limit_segment_meta", segmentMetaCounts.toString()) + createTableAndLoad(tableName, false) + setupQuerySession(thresholdBytes) + clearFileCache() + def query = "SELECT SUM(LENGTH(payload) + LENGTH(pad)) FROM ${tableName}" + def counters = runProfileQuery(tableName, query, thresholdBytes) + logger.info("case ${tableName} finished, segmentMetaCounts=${segmentMetaCounts}, " + + "thresholdBytes=${thresholdBytes}, counters=${counters}") + return counters + } + + def runParallelPreloadCase = { String tableName, boolean segmentMetaCounts -> + long tinyThresholdBytes = 1L + createTableAndLoad(tableName, false) + setBeParam("file_cache_query_limit_segment_meta", segmentMetaCounts.toString()) + setupParallelPreloadQuerySession(tinyThresholdBytes) + clearFileCache() + def query = "SELECT SUM(id + group_id + LENGTH(payload)) FROM ${tableName} " + + "WHERE group_id >= 0" + def counters = runProfileQuery(tableName, query, tinyThresholdBytes) + assert counters.scannerNum > 1L : + "parallel preload case should use multiple scanners, 
counters=${counters}" + assert counters.remoteOnlyTriggered == 1L : + "tiny threshold should trigger remote-only-on-miss, counters=${counters}" + assert counters.skipCacheIo > 0L : + "tiny threshold should skip later cache writes, counters=${counters}" + logger.info("parallel preload case ${tableName} finished, " + + "segmentMetaCounts=${segmentMetaCounts}, counters=${counters}") + return counters + } + + def originalSegmentMetaConfig = getBeParam("file_cache_query_limit_segment_meta") + logger.info("original file_cache_query_limit_segment_meta=${originalSegmentMetaConfig}") + assert getBeParam("enable_read_cache_file_directly").equalsIgnoreCase("false") + try { + def baseline = runCase("file_cache_limit_segment_meta_baseline", false, -1L) + assert baseline.remoteOnlyTriggered == 0L : + "baseline should not trigger remote-only-on-miss, counters=${baseline}" + assert baseline.skipCacheIo == 0L : + "baseline should not skip cache IO, counters=${baseline}" + assert baseline.segmentMetaWriteCacheBytes > 0L : + "baseline should write segment footer/meta file cache, counters=${baseline}" + assert baseline.writeCacheBytes > baseline.segmentMetaWriteCacheBytes : + "baseline query should also write data pages, counters=${baseline}" + + long thresholdBytes = Math.max(cacheBlockSize, baseline.segmentMetaWriteCacheBytes) + logger.info("derived positive query limit threshold: thresholdBytes=${thresholdBytes}, " + + "cacheBlockSize=${cacheBlockSize}, " + + "baselineSegmentMetaWriteCacheBytes=${baseline.segmentMetaWriteCacheBytes}, " + + "baselineWriteCacheBytes=${baseline.writeCacheBytes}") + assert baseline.writeCacheBytes > thresholdBytes : + "baseline write bytes should exceed derived threshold, " + + "thresholdBytes=${thresholdBytes}, counters=${baseline}" + + def withoutSegmentMeta = runCase( + "file_cache_limit_segment_meta_not_counted", false, thresholdBytes) + assert withoutSegmentMeta.remoteOnlyTriggered == 1L : + "threshold should still trigger on data cache writes, 
counters=${withoutSegmentMeta}" + assert withoutSegmentMeta.skipCacheIo > 0L : + "threshold should skip later cache writes, counters=${withoutSegmentMeta}" + assert withoutSegmentMeta.segmentMetaWriteCacheBytes > 0L : + "segment footer/meta should write cache before data threshold triggers, " + + "counters=${withoutSegmentMeta}" + assert withoutSegmentMeta.writeCacheBytes > thresholdBytes : + "when segment footer/meta is not counted, total profile writes can exceed " + + "the query threshold by those bytes, thresholdBytes=${thresholdBytes}, " + + "counters=${withoutSegmentMeta}" + logger.info("segment meta not counted result: thresholdBytes=${thresholdBytes}, " + + "writeCacheBytes=${withoutSegmentMeta.writeCacheBytes}, " + + "segmentMetaWriteCacheBytes=${withoutSegmentMeta.segmentMetaWriteCacheBytes}, " + + "writeMinusThreshold=${withoutSegmentMeta.writeCacheBytes - thresholdBytes}, " + + "remoteOnlyTriggered=${withoutSegmentMeta.remoteOnlyTriggered}, " + + "skipCacheIo=${withoutSegmentMeta.skipCacheIo}") + + def withSegmentMeta = runCase( + "file_cache_limit_segment_meta_counted", true, thresholdBytes) + assert withSegmentMeta.remoteOnlyTriggered == 1L : + "counted segment footer/meta should trigger remote-only-on-miss, " + + "counters=${withSegmentMeta}" + assert withSegmentMeta.skipCacheIo > 0L : + "counted segment footer/meta should make later misses skip cache writes, " + + "counters=${withSegmentMeta}" + assert withSegmentMeta.segmentMetaWriteCacheBytes > 0L : + "counted segment footer/meta should still have admitted cache writes, " + + "counters=${withSegmentMeta}" + assert withSegmentMeta.writeCacheBytes <= thresholdBytes : + "when segment footer/meta is counted, total admitted profile writes should " + + "respect the query threshold, thresholdBytes=${thresholdBytes}, " + + "counters=${withSegmentMeta}" + logger.info("segment meta counted result: thresholdBytes=${thresholdBytes}, " + + "writeCacheBytes=${withSegmentMeta.writeCacheBytes}, " + + 
"segmentMetaWriteCacheBytes=${withSegmentMeta.segmentMetaWriteCacheBytes}, " + + "thresholdMinusWrite=${thresholdBytes - withSegmentMeta.writeCacheBytes}, " + + "remoteOnlyTriggered=${withSegmentMeta.remoteOnlyTriggered}, " + + "skipCacheIo=${withSegmentMeta.skipCacheIo}") + + long tinyThresholdBytes = 1L + def tinyWithoutSegmentMeta = runCase( + "file_cache_limit_segment_meta_tiny_not_counted", false, tinyThresholdBytes) + assert tinyWithoutSegmentMeta.remoteOnlyTriggered == 1L : + "tiny threshold should trigger remote-only-on-miss, " + + "counters=${tinyWithoutSegmentMeta}" + assert tinyWithoutSegmentMeta.skipCacheIo > 0L : + "tiny threshold should skip data cache writes, " + + "counters=${tinyWithoutSegmentMeta}" + assert tinyWithoutSegmentMeta.segmentMetaWriteCacheBytes > 0L : + "when segment footer/meta is not counted, tiny threshold should still allow " + + "segment footer/meta cache writes, counters=${tinyWithoutSegmentMeta}" + assert tinyWithoutSegmentMeta.writeCacheBytes > 0L : + "when segment footer/meta is not counted, total profile writes should be " + + "greater than zero, counters=${tinyWithoutSegmentMeta}" + logger.info("tiny threshold segment meta not counted result: " + + "thresholdBytes=${tinyThresholdBytes}, " + + "writeCacheBytes=${tinyWithoutSegmentMeta.writeCacheBytes}, " + + "segmentMetaWriteCacheBytes=" + + "${tinyWithoutSegmentMeta.segmentMetaWriteCacheBytes}, " + + "remoteOnlyTriggered=${tinyWithoutSegmentMeta.remoteOnlyTriggered}, " + + "skipCacheIo=${tinyWithoutSegmentMeta.skipCacheIo}") + + def tinyWithSegmentMeta = runCase( + "file_cache_limit_segment_meta_tiny_counted", true, tinyThresholdBytes) + assert tinyWithSegmentMeta.remoteOnlyTriggered == 1L : + "tiny threshold should trigger remote-only-on-miss when segment footer/meta " + + "is counted, counters=${tinyWithSegmentMeta}" + assert tinyWithSegmentMeta.skipCacheIo > 0L : + "tiny threshold should skip cache writes when segment footer/meta is counted, " + + 
"counters=${tinyWithSegmentMeta}" + assert tinyWithSegmentMeta.segmentMetaWriteCacheBytes == 0L : + "when segment footer/meta is counted, tiny threshold should block segment " + + "footer/meta cache writes, counters=${tinyWithSegmentMeta}" + assert tinyWithSegmentMeta.writeCacheBytes == 0L : + "when segment footer/meta is counted, tiny threshold should block all " + + "profile cache writes, counters=${tinyWithSegmentMeta}" + logger.info("tiny threshold segment meta counted result: " + + "thresholdBytes=${tinyThresholdBytes}, " + + "writeCacheBytes=${tinyWithSegmentMeta.writeCacheBytes}, " + + "segmentMetaWriteCacheBytes=${tinyWithSegmentMeta.segmentMetaWriteCacheBytes}, " + + "remoteOnlyTriggered=${tinyWithSegmentMeta.remoteOnlyTriggered}, " + + "skipCacheIo=${tinyWithSegmentMeta.skipCacheIo}") + + def preloadWithoutSegmentMeta = runParallelPreloadCase( + "file_cache_limit_segment_meta_preload_not_counted", false) + assert preloadWithoutSegmentMeta.segmentMetaWriteCacheBytes > 0L : + "without segment meta accounting, parallel preload footer/meta should still " + + "write cache, counters=${preloadWithoutSegmentMeta}" + assert preloadWithoutSegmentMeta.writeCacheBytes >= + preloadWithoutSegmentMeta.segmentMetaWriteCacheBytes : + "aggregate file-cache writes should include segment footer/meta writes, " + + "counters=${preloadWithoutSegmentMeta}" + logger.info("parallel preload segment meta not counted result: " + + "writeCacheBytes=${preloadWithoutSegmentMeta.writeCacheBytes}, " + + "segmentMetaWriteCacheBytes=" + + "${preloadWithoutSegmentMeta.segmentMetaWriteCacheBytes}, " + + "scannerNum=${preloadWithoutSegmentMeta.scannerNum}") + + def preloadWithSegmentMeta = runParallelPreloadCase( + "file_cache_limit_segment_meta_preload_counted", true) + assert preloadWithSegmentMeta.segmentMetaWriteCacheBytes == 0L : + "when segment footer/meta is counted, tiny threshold should block " + + "parallel preload footer/meta cache writes, " + + "counters=${preloadWithSegmentMeta}" 
+ assert preloadWithSegmentMeta.writeCacheBytes == 0L : + "when segment footer/meta is counted, tiny threshold should block all " + + "profile cache writes in the parallel preload query, " + + "counters=${preloadWithSegmentMeta}" + logger.info("parallel preload segment meta counted result: " + + "writeCacheBytes=${preloadWithSegmentMeta.writeCacheBytes}, " + + "segmentMetaWriteCacheBytes=${preloadWithSegmentMeta.segmentMetaWriteCacheBytes}, " + + "scannerNum=${preloadWithSegmentMeta.scannerNum}") + } finally { + logger.info("restore file_cache_query_limit_segment_meta=${originalSegmentMetaConfig}") + setBeParam("file_cache_query_limit_segment_meta", originalSegmentMetaConfig) + sql "set file_cache_query_limit_bytes = -1" + } + } +} diff --git a/regression-test/suites/cloud_p0/cache/remote_scan_no_write_file_cache/test_remote_scan_no_write_file_cache_threshold.groovy b/regression-test/suites/cloud_p0/cache/remote_scan_no_write_file_cache/test_remote_scan_no_write_file_cache_threshold.groovy new file mode 100644 index 00000000000000..4f2e3f54582024 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/remote_scan_no_write_file_cache/test_remote_scan_no_write_file_cache_threshold.groovy @@ -0,0 +1,648 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.Http + +import java.util.Random + +suite("test_remote_scan_no_write_file_cache_threshold", "docker") { + def cacheBlockSize = 262144L + def options = new ClusterOptions() + options.cloudMode = true + options.setFeNum(1) + options.setBeNum(1) + options.msNum = 1 + options.beConfigs += [ + "enable_file_cache=true", + "disable_storage_page_cache=true", + "enable_java_support=false", + "enable_evict_file_cache_in_advance=false", + "enable_read_cache_file_directly=false", + "inverted_index_query_cache_limit=67108864", + "inverted_index_searcher_cache_limit=67108864", + "file_cache_enter_disk_resource_limit_mode_percent=99", + "file_cache_each_block_size=${cacheBlockSize}", + "file_cache_path=[{\"path\":\"/opt/apache-doris/be/storage/file_cache\",\"total_size\":134217728,\"query_limit\":134217728}]" + ] + + docker(options) { + def clusters = sql "SHOW CLUSTERS" + assert !clusters.isEmpty() + def computeGroup = clusters[0][0] + sql "use @${computeGroup}" + + def backends = sql "SHOW BACKENDS" + assert backends.size() == 1 + def beHost = backends[0][1] + def beHttpPort = backends[0][4] + def supportMaxScannersConcurrency = + !(sql "show variables like 'max_scanners_concurrency'").isEmpty() + + def rowsPerTable = 4096 + def batchRows = 512 + def invertedIndexBatchRows = 128 + def halfCacheBlockSize = (cacheBlockSize / 2).toLong() + def chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789" + + def clearFileCache = { + def result = Http.GET("http://${beHost}:${beHttpPort}/api/file_cache?op=clear&sync=true", true) + logger.info("clear file cache result: ${result}") + assert result.status.toString().equalsIgnoreCase("OK") : result + } + + def randomText = { Random random, int length -> + def builder = new StringBuilder(length) + for (int i = 0; i < length; 
i++) { + builder.append(chars.charAt(random.nextInt(chars.length()))) + } + return builder.toString() + } + + def insertRows = { String tableName -> + for (int batchStart = 0; batchStart < rowsPerTable; batchStart += batchRows) { + def random = new Random(((long) tableName.hashCode()) * 131 + batchStart) + def data = new StringBuilder(batchRows * 3500) + for (int i = 0; i < batchRows; i++) { + int id = batchStart + i + data.append(id).append('\t') + data.append(id % 128).append('\t') + data.append(randomText(random, 2048)).append('\t') + data.append(randomText(random, 1024)).append('\n') + } + streamLoad { + table tableName + set 'column_separator', '\t' + set 'columns', 'id,group_id,payload,pad' + inputText data.toString() + time 60000 + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("stream load ${tableName} result: ${result}") + def json = parseJson(result) + assert json.Status.toString().equalsIgnoreCase("success") : result + assert json.NumberLoadedRows.toString().toInteger() == batchRows : result + } + } + } + sql "SYNC" + } + + def insertInvertedIndexRows = { String tableName -> + for (int batchStart = 0; batchStart < rowsPerTable; batchStart += invertedIndexBatchRows) { + def random = new Random(((long) tableName.hashCode()) * 257 + batchStart) + def data = new StringBuilder(invertedIndexBatchRows * 3500) + for (int i = 0; i < invertedIndexBatchRows; i++) { + int id = batchStart + i + data.append(id).append('\t') + data.append("needle group").append(id % 16).append(' ') + data.append(randomText(random, 2048)).append('\t') + data.append(randomText(random, 1024)).append('\n') + } + streamLoad { + table tableName + set 'column_separator', '\t' + set 'columns', 'id,body,pad' + inputText data.toString() + time 60000 + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("stream load ${tableName} result: ${result}") + def json = 
parseJson(result) + assert json.Status.toString().equalsIgnoreCase("success") : result + assert json.NumberLoadedRows.toString().toInteger() == invertedIndexBatchRows : + result + } + } + } + sql "SYNC" + } + + def parseProfileCounterValue = { String valueText -> + def exact = (valueText =~ /\((\d+)\)/) + if (exact.find()) { + return exact.group(1).toLong() + } + def number = (valueText =~ /([0-9]+(?:\.[0-9]+)?)\s*(B|KB|MB|GB)?/) + if (!number.find()) { + return 0L + } + BigDecimal value = new BigDecimal(number.group(1)) + long multiplier = 1L + if (number.group(2) == "KB") { + multiplier = 1024L + } else if (number.group(2) == "MB") { + multiplier = 1024L * 1024L + } else if (number.group(2) == "GB") { + multiplier = 1024L * 1024L * 1024L + } + return (value * multiplier).toLong() + } + + def parseProfileCounterAggregateValue = { String valueText, String aggregateName -> + def aggregate = (valueText =~ ("(?:^|,\\s*)" + + java.util.regex.Pattern.quote(aggregateName) + "\\s+([^,]+)")) + if (aggregate.find()) { + return parseProfileCounterValue(aggregate.group(1).toString()) + } + if (aggregateName.equalsIgnoreCase("max")) { + def highWaterMarkPeak = (valueText =~ /(?:^|\s)\(Peak:\s*(.+)\)\s*$/) + if (highWaterMarkPeak.find()) { + return parseProfileCounterValue(highWaterMarkPeak.group(1).toString()) + } + } + return parseProfileCounterValue(valueText) + } + + def detailProfileSection = { String profileString -> + int start = profileString.indexOf("\nDetailProfile(") + if (start < 0) { + start = profileString.indexOf("DetailProfile(") + } + if (start < 0) { + return profileString + } + int end = profileString.indexOf("\nAppendix:", start) + return end < 0 ? profileString.substring(start) : profileString.substring(start, end) + } + + def profileCounterValues = { String profileString, String counterName, String aggregateName = null -> + def values = [] + def profileSection = detailProfileSection(profileString) + def matcher = (profileSection =~ ("(?m)^\\s*(?:-\\s*)?" 
+ + java.util.regex.Pattern.quote(counterName) + "(?::\\s+|\\s+Current:\\s+)([^\\n]+)")) + while (matcher.find()) { + def valueText = matcher.group(1).toString() + values.add(aggregateName == null ? parseProfileCounterValue(valueText) : + parseProfileCounterAggregateValue(valueText, aggregateName)) + } + return values + } + + def sumProfileCounter = { String profileString, String counterName -> + long total = 0 + profileCounterValues(profileString, counterName).each { + total += it + } + return total + } + + def maxProfileCounter = { String profileString, String counterName -> + def values = profileCounterValues(profileString, counterName, "max") + values.addAll(profileCounterValues(profileString, "${counterName}Peak")) + return values.isEmpty() ? 0L : values.max() + } + + def assertProfileCounterExists = { String profileString, String counterName, + String label -> + assert profileString.contains(counterName) : + "${label}: expected profile counter ${counterName}, profile=${profileString}" + } + + def runProfileQuery = { String name, String query, Closure checker -> + profile(name) { + run { + sql "/* ${name} */ ${query}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + logger.error("Profile failed, profile result:\n${profileString}", exception) + throw exception + } + assert profileString.contains("Is Cached: No") || + profileString.contains("Is Cached: No") : + "query should not hit SQL/query cache, profile=${profileString}" + checker(profileString) + } + } + } + + def assertScannerShape = { String label, long scannerNum, boolean expectMultipleScanners, + String profileString -> + if (expectMultipleScanners) { + assert scannerNum > 1L : + "${label}: expected more than one scanner, profile=${profileString}" + } else { + assert scannerNum == 1L : + "${label}: expected single scanner, profile=${profileString}" + } + } + + def assertColdWriteThrough = { String label, String profileString, + boolean expectMultipleScanners -> + def 
remoteOnlyTriggered = maxProfileCounter(profileString, "RemoteOnlyOnMissTriggered") + def skipCacheIo = sumProfileCounter(profileString, "NumSkipCacheIOTotal") + def remoteBytes = sumProfileCounter(profileString, "BytesScannedFromRemote") + def writeCacheBytes = sumProfileCounter(profileString, "BytesWriteIntoCache") + def segmentFooterWriteCacheBytes = + sumProfileCounter(profileString, "SegmentFooterIndexBytesWriteIntoCache") + def segmentFooterRemoteIo = + sumProfileCounter(profileString, "SegmentFooterIndexNumRemoteIOTotal") + def scannerNum = sumProfileCounter(profileString, "NumScanners") + logger.info("${label} cold write-through counters: remoteOnlyTriggered=${remoteOnlyTriggered}, " + + "skipCacheIo=${skipCacheIo}, remoteBytes=${remoteBytes}, " + + "writeCacheBytes=${writeCacheBytes}, " + + "segmentFooterWriteCacheBytes=${segmentFooterWriteCacheBytes}, " + + "segmentFooterRemoteIo=${segmentFooterRemoteIo}, " + + "scannerNum=${scannerNum}") + + assert remoteOnlyTriggered == 0L : + "${label}: threshold should be disabled, profile=${profileString}" + assert skipCacheIo == 0L : + "${label}: cold write-through should not use skip-cache IO, profile=${profileString}" + assert writeCacheBytes > 0L : + "${label}: expected cold read to write file cache, profile=${profileString}" + assertProfileCounterExists(profileString, "SegmentFooterIndexWriteCacheIOUseTimer", + label) + assertProfileCounterExists(profileString, "SegmentFooterIndexBytesWriteIntoCache", + label) + assertProfileCounterExists(profileString, "SegmentFooterIndexNumRemoteIOTotal", + label) + if (segmentFooterRemoteIo > 0L) { + assert segmentFooterWriteCacheBytes > 0L : + "${label}: expected segment footer index read to write file cache, " + + "profile=${profileString}" + } + assert writeCacheBytes >= segmentFooterWriteCacheBytes : + "${label}: aggregate file-cache writes should cover segment footer index " + + "writes, profile=${profileString}" + assertScannerShape(label, scannerNum, 
expectMultipleScanners, profileString) + return [ + remoteBytes: remoteBytes, + writeCacheBytes: writeCacheBytes, + segmentFooterWriteCacheBytes: segmentFooterWriteCacheBytes, + scannerNum: scannerNum + ] + } + + def assertThresholdRemoteOnly = { String label, String profileString, + boolean expectMultipleScanners, long thresholdBytes -> + def remoteOnlyTriggered = maxProfileCounter(profileString, "RemoteOnlyOnMissTriggered") + def thresholdMetricBytes = + maxProfileCounter(profileString, "RemoteOnlyOnMissThresholdBytes") + def skipCacheIo = sumProfileCounter(profileString, "NumSkipCacheIOTotal") + def remoteBytes = sumProfileCounter(profileString, "BytesScannedFromRemote") + def writeCacheBytes = sumProfileCounter(profileString, "BytesWriteIntoCache") + def scannerNum = sumProfileCounter(profileString, "NumScanners") + logger.info("${label} threshold counters: remoteOnlyTriggered=${remoteOnlyTriggered}, " + + "thresholdMetricBytes=${thresholdMetricBytes}, skipCacheIo=${skipCacheIo}, " + + "remoteBytes=${remoteBytes}, writeCacheBytes=${writeCacheBytes}, " + + "scannerNum=${scannerNum}") + + assert remoteOnlyTriggered == 1L : + "${label}: expected threshold to switch the query to remote-only-on-miss, profile=${profileString}" + assert thresholdMetricBytes == thresholdBytes : + "${label}: expected threshold bytes in profile, thresholdBytes=${thresholdBytes}, " + + "profile=${profileString}" + assert skipCacheIo > 0L : + "${label}: expected later cache misses to skip cache writes, profile=${profileString}" + assert writeCacheBytes <= thresholdBytes : + "${label}: file cache writes should not exceed threshold, thresholdBytes=" + + "${thresholdBytes}, profile=${profileString}" + if (thresholdBytes == 0L) { + assert writeCacheBytes == 0L : + "${label}: zero threshold should not write any file cache, profile=${profileString}" + } else if (thresholdBytes >= cacheBlockSize) { + assert writeCacheBytes > 0L : + "${label}: threshold should allow at least one cache block write 
before trigger, " + + "profile=${profileString}" + } + assertScannerShape(label, scannerNum, expectMultipleScanners, profileString) + return [ + remoteBytes: remoteBytes, + writeCacheBytes: writeCacheBytes, + skipCacheIo: skipCacheIo, + thresholdMetricBytes: thresholdMetricBytes, + scannerNum: scannerNum + ] + } + + def assertInvertedIndexColdWriteThrough = { String label, String profileString, + long minReaderMisses, long minInvertedRemoteBytes -> + def remoteOnlyTriggered = maxProfileCounter(profileString, "RemoteOnlyOnMissTriggered") + def skipCacheIo = sumProfileCounter(profileString, "NumSkipCacheIOTotal") + def writeCacheBytes = sumProfileCounter(profileString, "BytesWriteIntoCache") + def invertedWriteCacheBytes = + sumProfileCounter(profileString, "InvertedIndexBytesWriteIntoCache") + def invertedRemoteBytes = + sumProfileCounter(profileString, "InvertedIndexBytesScannedFromRemote") + def queryCacheMiss = sumProfileCounter(profileString, "InvertedIndexQueryCacheMiss") + def searcherCacheMiss = + sumProfileCounter(profileString, "InvertedIndexSearcherCacheMiss") + logger.info("${label} inverted cold write-through counters: " + + "remoteOnlyTriggered=${remoteOnlyTriggered}, skipCacheIo=${skipCacheIo}, " + + "writeCacheBytes=${writeCacheBytes}, invertedRemoteBytes=${invertedRemoteBytes}, " + + "invertedWriteCacheBytes=${invertedWriteCacheBytes}, " + + "queryCacheMiss=${queryCacheMiss}, searcherCacheMiss=${searcherCacheMiss}, " + + "minReaderMisses=${minReaderMisses}, " + + "minInvertedRemoteBytes=${minInvertedRemoteBytes}") + + assert remoteOnlyTriggered == 0L : + "${label}: threshold should be disabled, profile=${profileString}" + assert skipCacheIo == 0L : + "${label}: cold inverted-index read should not skip cache writes, profile=${profileString}" + assert writeCacheBytes > 0L : + "${label}: expected inverted-index read to write file cache, profile=${profileString}" + assertProfileCounterExists(profileString, "InvertedIndexWriteCacheIOUseTimer", + label) + 
assertProfileCounterExists(profileString, "InvertedIndexBytesWriteIntoCache", + label) + assert invertedWriteCacheBytes > 0L : + "${label}: expected inverted-index read to write file cache, " + + "profile=${profileString}" + assert writeCacheBytes >= invertedWriteCacheBytes : + "${label}: aggregate file-cache writes should cover inverted-index writes, " + + "profile=${profileString}" + assert invertedRemoteBytes >= minInvertedRemoteBytes : + "${label}: query should trigger enough inverted-index remote reads, " + + "minInvertedRemoteBytes=${minInvertedRemoteBytes}, profile=${profileString}" + assert queryCacheMiss >= minReaderMisses : + "${label}: query cache should not hide inverted-index reads, profile=${profileString}" + assert searcherCacheMiss >= minReaderMisses : + "${label}: searcher cache should not hide inverted-index reads, profile=${profileString}" + } + + def assertInvertedIndexThresholdRemoteOnly = { String label, String profileString, + long thresholdBytes, long minReaderMisses, long minInvertedRemoteBytes -> + def remoteOnlyTriggered = maxProfileCounter(profileString, "RemoteOnlyOnMissTriggered") + def thresholdMetricBytes = + maxProfileCounter(profileString, "RemoteOnlyOnMissThresholdBytes") + def skipCacheIo = sumProfileCounter(profileString, "NumSkipCacheIOTotal") + def writeCacheBytes = sumProfileCounter(profileString, "BytesWriteIntoCache") + def invertedRemoteBytes = + sumProfileCounter(profileString, "InvertedIndexBytesScannedFromRemote") + def queryCacheMiss = sumProfileCounter(profileString, "InvertedIndexQueryCacheMiss") + def searcherCacheMiss = + sumProfileCounter(profileString, "InvertedIndexSearcherCacheMiss") + logger.info("${label} inverted threshold counters: " + + "remoteOnlyTriggered=${remoteOnlyTriggered}, " + + "thresholdMetricBytes=${thresholdMetricBytes}, skipCacheIo=${skipCacheIo}, " + + "writeCacheBytes=${writeCacheBytes}, invertedRemoteBytes=${invertedRemoteBytes}, " + + "queryCacheMiss=${queryCacheMiss}, 
searcherCacheMiss=${searcherCacheMiss}, " + + "minReaderMisses=${minReaderMisses}, " + + "minInvertedRemoteBytes=${minInvertedRemoteBytes}") + + assert remoteOnlyTriggered == 1L : + "${label}: expected threshold to switch the query to remote-only-on-miss, " + + "profile=${profileString}" + assert thresholdMetricBytes == thresholdBytes : + "${label}: expected positive threshold in profile, thresholdBytes=${thresholdBytes}, " + + "profile=${profileString}" + assert skipCacheIo > 0L : + "${label}: expected inverted-index cache misses to skip cache writes, " + + "profile=${profileString}" + assert writeCacheBytes > 0L && writeCacheBytes <= thresholdBytes : + "${label}: positive threshold should allow some writes before blocking later " + + "inverted-index cache writes, thresholdBytes=${thresholdBytes}, " + + "profile=${profileString}" + assert invertedRemoteBytes >= minInvertedRemoteBytes : + "${label}: query should trigger enough inverted-index remote reads, " + + "minInvertedRemoteBytes=${minInvertedRemoteBytes}, profile=${profileString}" + assert queryCacheMiss >= minReaderMisses : + "${label}: query cache should not hide inverted-index reads, profile=${profileString}" + assert searcherCacheMiss >= minReaderMisses : + "${label}: searcher cache should not hide inverted-index reads, profile=${profileString}" + } + + def assertInvertedIndexSearcherCacheHitThresholdRemoteOnly = { String label, + String profileString, long thresholdBytes, long minReaderHits, + long minInvertedRemoteBytes -> + def remoteOnlyTriggered = maxProfileCounter(profileString, "RemoteOnlyOnMissTriggered") + def thresholdMetricBytes = + maxProfileCounter(profileString, "RemoteOnlyOnMissThresholdBytes") + def skipCacheIo = sumProfileCounter(profileString, "NumSkipCacheIOTotal") + def writeCacheBytes = sumProfileCounter(profileString, "BytesWriteIntoCache") + def invertedRemoteBytes = + sumProfileCounter(profileString, "InvertedIndexBytesScannedFromRemote") + def queryCacheMiss = 
sumProfileCounter(profileString, "InvertedIndexQueryCacheMiss") + def searcherCacheHit = + sumProfileCounter(profileString, "InvertedIndexSearcherCacheHit") + logger.info("${label} inverted searcher-cache-hit counters: " + + "remoteOnlyTriggered=${remoteOnlyTriggered}, " + + "thresholdMetricBytes=${thresholdMetricBytes}, skipCacheIo=${skipCacheIo}, " + + "writeCacheBytes=${writeCacheBytes}, invertedRemoteBytes=${invertedRemoteBytes}, " + + "queryCacheMiss=${queryCacheMiss}, searcherCacheHit=${searcherCacheHit}, " + + "minReaderHits=${minReaderHits}, minInvertedRemoteBytes=${minInvertedRemoteBytes}") + + assert remoteOnlyTriggered == 1L : + "${label}: expected threshold to switch the query to remote-only-on-miss, " + + "profile=${profileString}" + assert thresholdMetricBytes == thresholdBytes : + "${label}: expected threshold bytes in profile, thresholdBytes=${thresholdBytes}, " + + "profile=${profileString}" + assert skipCacheIo > 0L : + "${label}: expected cached-searcher posting reads to skip cache writes, " + + "profile=${profileString}" + assert writeCacheBytes == 0L : + "${label}: zero threshold should not write inverted-index file cache, " + + "profile=${profileString}" + assert invertedRemoteBytes >= minInvertedRemoteBytes : + "${label}: query should trigger enough inverted-index remote posting reads, " + + "minInvertedRemoteBytes=${minInvertedRemoteBytes}, profile=${profileString}" + assert queryCacheMiss >= minReaderHits : + "${label}: query cache should miss for the different term, profile=${profileString}" + assert searcherCacheHit >= minReaderHits : + "${label}: query should reuse cached inverted-index searchers, profile=${profileString}" + } + + def runCase = { String tableName, int buckets, long scannerLimit, + long scannerConcurrency, boolean expectMultipleScanners -> + sql "DROP TABLE IF EXISTS ${tableName} FORCE" + sql """ + CREATE TABLE ${tableName} ( + id INT NOT NULL, + group_id INT NOT NULL, + payload VARCHAR(4096) NOT NULL, + pad VARCHAR(4096) 
NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS ${buckets} + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true" + ) + """ + insertRows(tableName) + + def rowCount = sql "SELECT COUNT(*) FROM ${tableName}" + assert rowCount[0][0] == rowsPerTable + + sql "set enable_profile = true" + sql "set profile_level = 2" + sql "set enable_sql_cache = false" + sql "set enable_query_cache = false" + sql "set enable_file_cache = false" + sql "set enable_page_cache = false" + sql "set parallel_fragment_exec_instance_num = 1" + sql "set enable_parallel_scan = true" + sql "set parallel_scan_max_scanners_count = ${scannerLimit}" + sql "set parallel_scan_min_rows_per_scanner = 1024" + if (supportMaxScannersConcurrency) { + sql "set max_scanners_concurrency = ${scannerConcurrency}" + } + sql "set parallel_pipeline_task_num = ${expectMultipleScanners ? 4 : 1}" + + def query = "SELECT SUM(LENGTH(payload) + LENGTH(pad)) FROM ${tableName}" + + clearFileCache() + sql "set file_cache_query_limit_bytes = -1" + runProfileQuery("${tableName}_cold_write_through", query) { profileString -> + assertColdWriteThrough("${tableName}_cold_write_through", profileString, + expectMultipleScanners) + } + + def thresholdCases = [ + [name: "zero", bytes: 0L], + [name: "half_block", bytes: halfCacheBlockSize], + [name: "one_block", bytes: cacheBlockSize], + [name: "two_and_half_blocks", bytes: cacheBlockSize * 2 + halfCacheBlockSize] + ] + thresholdCases.each { thresholdCase -> + clearFileCache() + sql "set file_cache_query_limit_bytes = ${thresholdCase.bytes}" + def thresholdVariable = sql """ + show variables like 'file_cache_query_limit_bytes' + """ + assert !thresholdVariable.isEmpty() && + thresholdVariable[0][1].toString().toLong() == thresholdCase.bytes.toLong() : + "failed to set file_cache_query_limit_bytes to " + + "${thresholdCase.bytes}, actual=${thresholdVariable}" + def profileName = "${tableName}_threshold_${thresholdCase.name}" + 
runProfileQuery(profileName, query) { profileString -> + assertThresholdRemoteOnly( + profileName, profileString, expectMultipleScanners, thresholdCase.bytes.toLong()) + } + } + } + + def createInvertedIndexTable = { String tableName -> + sql "DROP TABLE IF EXISTS ${tableName} FORCE" + sql """ + CREATE TABLE ${tableName} ( + id INT NOT NULL, + body TEXT NOT NULL, + pad VARCHAR(4096) NOT NULL, + INDEX body_idx(body) USING INVERTED + PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true", + "inverted_index_storage_format" = "V2" + ) + """ + insertInvertedIndexRows(tableName) + + def rowCount = sql "SELECT COUNT(*) FROM ${tableName}" + assert rowCount[0][0] == rowsPerTable + } + + def setupInvertedIndexQuery = { + sql "set enable_profile = true" + sql "set profile_level = 2" + sql "set enable_sql_cache = false" + sql "set enable_query_cache = false" + sql "set enable_file_cache = false" + sql "set enable_page_cache = false" + sql "set enable_inverted_index_query = true" + sql "set enable_common_expr_pushdown_for_inverted_index = true" + sql "set enable_match_without_inverted_index = false" + sql "set parallel_fragment_exec_instance_num = 1" + sql "set enable_parallel_scan = true" + sql "set parallel_scan_max_scanners_count = 1" + sql "set parallel_scan_min_rows_per_scanner = 1024" + if (supportMaxScannersConcurrency) { + sql "set max_scanners_concurrency = 1" + } + sql "set parallel_pipeline_task_num = 1" + } + + def runInvertedIndexCase = { String tableNamePrefix -> + def minReaderMisses = (rowsPerTable / invertedIndexBatchRows).toLong() + def minInvertedRemoteBytes = minReaderMisses * 1024L + + def coldTable = "${tableNamePrefix}_cold" + createInvertedIndexTable(coldTable) + setupInvertedIndexQuery() + def coldQuery = "SELECT COUNT() FROM ${coldTable} WHERE body MATCH 'needle'" + + clearFileCache() + 
sql "set file_cache_query_limit_bytes = -1" + runProfileQuery("${coldTable}_inverted_cold_write_through", coldQuery) { profileString -> + assertInvertedIndexColdWriteThrough( + "${coldTable}_inverted_cold_write_through", profileString, + minReaderMisses, minInvertedRemoteBytes) + } + + def thresholdTable = "${tableNamePrefix}_threshold" + createInvertedIndexTable(thresholdTable) + setupInvertedIndexQuery() + def thresholdQuery = "SELECT COUNT() FROM ${thresholdTable} WHERE body MATCH 'needle'" + + clearFileCache() + def invertedThresholdBytes = cacheBlockSize + sql "set file_cache_query_limit_bytes = ${invertedThresholdBytes}" + runProfileQuery("${thresholdTable}_inverted_threshold_one_block", + thresholdQuery) { profileString -> + assertInvertedIndexThresholdRemoteOnly( + "${thresholdTable}_inverted_threshold_one_block", profileString, + invertedThresholdBytes, minReaderMisses, minInvertedRemoteBytes) + } + + def cachedSearcherTable = "${tableNamePrefix}_sch_hit" + createInvertedIndexTable(cachedSearcherTable) + setupInvertedIndexQuery() + def buildSearcherQuery = + "SELECT COUNT() FROM ${cachedSearcherTable} WHERE body MATCH 'needle'" + def cachedSearcherQuery = + "SELECT COUNT() FROM ${cachedSearcherTable} WHERE body MATCH 'group1'" + + clearFileCache() + sql "set file_cache_query_limit_bytes = -1" + runProfileQuery("${cachedSearcherTable}_build_searcher_cache", + buildSearcherQuery) { profileString -> + def searcherCacheMiss = + sumProfileCounter(profileString, "InvertedIndexSearcherCacheMiss") + assert searcherCacheMiss >= minReaderMisses : + "${cachedSearcherTable}: first query should build searcher cache, " + + "profile=${profileString}" + } + + clearFileCache() + def cachedSearcherThresholdBytes = 0L + sql "set file_cache_query_limit_bytes = ${cachedSearcherThresholdBytes}" + runProfileQuery("${cachedSearcherTable}_inverted_searcher_cache_hit_zero_threshold", + cachedSearcherQuery) { profileString -> + assertInvertedIndexSearcherCacheHitThresholdRemoteOnly( 
+ "${cachedSearcherTable}_inverted_searcher_cache_hit_zero_threshold", + profileString, cachedSearcherThresholdBytes, minReaderMisses, + minInvertedRemoteBytes) + } + } + + runCase("remote_scan_no_write_file_cache_single", 1, 1L, 1L, false) + runCase("remote_scan_no_write_file_cache_multi", 8, 8L, 4L, true) + runInvertedIndexCase("remote_scan_no_write_file_cache_inverted_index") + } +} diff --git a/regression-test/suites/cloud_p0/cache/topn_lazy_file_cache/test_topn_lazy_mat_phase2_no_write_file_cache.groovy b/regression-test/suites/cloud_p0/cache/topn_lazy_file_cache/test_topn_lazy_mat_phase2_no_write_file_cache.groovy new file mode 100644 index 00000000000000..2e3ea83265c973 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/topn_lazy_file_cache/test_topn_lazy_mat_phase2_no_write_file_cache.groovy @@ -0,0 +1,294 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.Http + +import java.util.regex.Pattern + +suite("test_topn_lazy_mat_phase2_no_write_file_cache", "docker") { + def options = new ClusterOptions() + options.feNum = 1 + options.beNum = 1 + options.msNum = 1 + options.cloudMode = true + options.beConfigs += [ + "enable_file_cache=true", + "disable_storage_page_cache=true", + "enable_java_support=false", + "enable_evict_file_cache_in_advance=false", + "file_cache_enter_disk_resource_limit_mode_percent=99", + "file_cache_each_block_size=4096", + "file_cache_path=[{\"path\":\"/opt/apache-doris/be/storage/file_cache\"," + + "\"total_size\":104857600,\"query_limit\":104857600}]" + ] + + docker(options) { + def clusters = sql "SHOW CLUSTERS" + assert !clusters.isEmpty() + def computeGroup = clusters[0][0] + sql "use @${computeGroup}" + + def backends = sql "SHOW BACKENDS" + assert backends.size() == 1 + def beHost = backends[0][1] + def beHttpPort = backends[0][4] + def clearFileCache = { + def result = Http.GET("http://${beHost}:${beHttpPort}/api/file_cache?op=clear&sync=true", true) + logger.info("clear file cache result: ${result}") + } + + sql "set enable_profile = true" + sql "set profile_level = 2" + sql "set enable_sql_cache = false" + sql "set enable_query_cache = false" + sql "set enable_page_cache = false" + sql "set topn_lazy_materialization_threshold = 1024" + + def metricValue = { String profileText, String metricName -> + def matcher = profileText =~ + /(?m)^\s*-\s*${Pattern.quote(metricName)}:\s*(?:sum\s+)?([0-9]+(?:\.[0-9]+)?).*$/ + assert matcher.find() : "missing metric ${metricName} in profile:\n${profileText}" + return new BigDecimal(matcher.group(1)) + } + + def metricValues = { String profileText, String metricName -> + def matcher = profileText =~ + /(?m)^\s*-\s*${Pattern.quote(metricName)}:\s*(?:sum\s+)?([0-9]+(?:\.[0-9]+)?).*$/ + def values = [] + while (matcher.find()) { + values.add(new 
BigDecimal(matcher.group(1))) + } + assert !values.isEmpty() : "missing metric ${metricName} in profile:\n${profileText}" + return values + } + + def numericValues = { String text -> + def matcher = text =~ /([0-9]+(?:\.[0-9]+)?)/ + def values = [] + while (matcher.find()) { + values.add(new BigDecimal(matcher.group(1))) + } + return values + } + + def metricLineValues = { String profileText, String metricName -> + def matcher = profileText =~ /(?m)^\s*-\s*${Pattern.quote(metricName)}:\s*(.*)$/ + def values = [] + while (matcher.find()) { + values.add(matcher.group(1).trim()) + } + return values + } + + def topnSecondPhaseMetricNames = [ + "TopNLazyMaterializationSecondPhaseLocalIOCount", + "TopNLazyMaterializationSecondPhaseLocalIOBytes", + "TopNLazyMaterializationSecondPhaseRemoteIOCount", + "TopNLazyMaterializationSecondPhaseRemoteIOBytes", + "TopNLazyMaterializationSecondPhaseSkipCacheIOCount", + "TopNLazyMaterializationSecondPhaseWriteCacheBytes", + "TopNLazyMaterializationSecondPhaseLocalIOTime", + "TopNLazyMaterializationSecondPhaseRemoteIOTime", + "TopNLazyMaterializationSecondPhaseWriteCacheIOTime", + "TopNLazyMaterializationSecondPhaseRowsRead", + "TopNLazyMaterializationSecondPhaseSegmentsRead", + "TopNLazyMaterializationSecondPhasePerBackend", + "TopNLazyMaterializationSecondPhasePerBackendRowsRead", + "TopNLazyMaterializationSecondPhasePerBackendSegmentsRead", + "TopNLazyMaterializationSecondPhasePerBackendLocalIOCount", + "TopNLazyMaterializationSecondPhasePerBackendLocalIOBytes", + "TopNLazyMaterializationSecondPhasePerBackendRemoteIOCount", + "TopNLazyMaterializationSecondPhasePerBackendRemoteIOBytes", + "TopNLazyMaterializationSecondPhasePerBackendSkipCacheIOCount", + "TopNLazyMaterializationSecondPhasePerBackendWriteCacheBytes", + "TopNLazyMaterializationSecondPhasePerBackendLocalIOTime", + "TopNLazyMaterializationSecondPhasePerBackendRemoteIOTime", + "TopNLazyMaterializationSecondPhasePerBackendWriteCacheIOTime" + ] + + def 
logTopnSecondPhaseMetrics = { String name, String profileText -> + def metrics = topnSecondPhaseMetricNames.collectEntries { metricName -> + def values = metricLineValues(profileText, metricName) + assert !values.isEmpty() : "missing metric ${metricName} in profile:\n${profileText}" + [(metricName): values] + } + logger.info("${name} TopN lazy materialization second phase metrics: ${metrics}") + logger.info("${name} CachedPagesNum values: ${metricLineValues(profileText, 'CachedPagesNum')}") + } + + def runProfileQuery = { String name, String query, Closure checker -> + profile(name) { + run { + sql "/* ${name} */ ${query}" + sleep(1000) + } + check { profileString, exception -> + if (exception != null) { + logger.error("Profile failed, profile result:\n${profileString}", exception) + throw exception + } + assert profileString.contains("TopNLazyMaterializationSecondPhase") : + "missing TopN lazy materialization profile:\n${profileString}" + assert profileString.contains("Is Cached: No") + || profileString.contains("Is Cached: No") : + "query should not hit SQL/query cache:\n${profileString}" + logTopnSecondPhaseMetrics(name, profileString) + checker(profileString) + } + } + } + + def assertRemoteOnlyMiss = { String profileText -> + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseRemoteIOCount") > 0 + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseRemoteIOBytes") > 0 + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseSkipCacheIOCount") > 0 + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseWriteCacheBytes") + .compareTo(BigDecimal.ZERO) == 0 + } + + def assertPerBackendRowsMatchAggregate = { String profileText -> + def aggregateRows = metricValues(profileText, + "TopNLazyMaterializationSecondPhaseRowsRead") + .inject(BigDecimal.ZERO) { sum, value -> sum + value } + def perBackendRows = metricLineValues(profileText, + "TopNLazyMaterializationSecondPhasePerBackendRowsRead") + .collectMany { line 
-> numericValues(line) } + assert !perBackendRows.isEmpty() : + "missing per-backend rows-read values:\n${profileText}" + def perBackendRowsSum = perBackendRows.inject(BigDecimal.ZERO) { sum, value -> sum + value } + assert aggregateRows.compareTo(perBackendRowsSum) == 0 : + "per-backend rows-read sum ${perBackendRowsSum} should equal aggregate ${aggregateRows}:\n${profileText}" + } + + def assertLocalFullHit = { String profileText -> + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseLocalIOCount") > 0 + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseLocalIOBytes") > 0 + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseRemoteIOCount") + .compareTo(BigDecimal.ZERO) == 0 + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseRemoteIOBytes") + .compareTo(BigDecimal.ZERO) == 0 + assert metricValue(profileText, "TopNLazyMaterializationSecondPhaseWriteCacheBytes") + .compareTo(BigDecimal.ZERO) == 0 + } + + sql "DROP TABLE IF EXISTS topn_lazy_remote_only_no_row_store" + sql """ + CREATE TABLE topn_lazy_remote_only_no_row_store ( + k INT NOT NULL, + sort_key INT NOT NULL, + payload VARCHAR(4096) NOT NULL, + pad VARCHAR(4096) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(k) + DISTRIBUTED BY HASH(k) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true", + "store_row_column" = "false" + ) + """ + sql """ + INSERT INTO topn_lazy_remote_only_no_row_store + SELECT number, + 4096 - number, + repeat(cast(number as string), 128), + repeat('x', 256) + FROM numbers("number" = "4096") + """ + + def noRowStoreQuery = + "SELECT k, payload, pad FROM topn_lazy_remote_only_no_row_store ORDER BY sort_key LIMIT 16" + explain { + sql noRowStoreQuery + contains("MaterializeNode") + } + + clearFileCache() + sql "set enable_topn_lazy_mat_phase2_no_write_file_cache = true" + runProfileQuery("topn_lazy_remote_only_no_row_store_remote_only_miss", + noRowStoreQuery, assertRemoteOnlyMiss) + + 
clearFileCache() + sql "set enable_topn_lazy_mat_phase2_no_write_file_cache = false" + sql "/* topn_lazy_remote_only_no_row_store_local_full_hit_warm */ ${noRowStoreQuery}" + sleep(1000) + + sql "set enable_topn_lazy_mat_phase2_no_write_file_cache = true" + runProfileQuery("topn_lazy_remote_only_no_row_store_local_full_hit", + noRowStoreQuery, assertLocalFullHit) + + clearFileCache() + sql "set enable_topn_lazy_mat_phase2_no_write_file_cache = true" + sql "set batch_size = 1" + try { + runProfileQuery("topn_lazy_remote_only_no_row_store_multi_fetch_accumulate", + noRowStoreQuery, { profileText -> + assertRemoteOnlyMiss(profileText) + assertPerBackendRowsMatchAggregate(profileText) + }) + } finally { + sql "set batch_size = 4064" /* restore the default batch size so the row-store cases below are unaffected */ + } + + sql "DROP TABLE IF EXISTS topn_lazy_remote_only_row_store" + sql """ + CREATE TABLE topn_lazy_remote_only_row_store ( + k INT NOT NULL, + sort_key INT NOT NULL, + payload VARCHAR(4096) NOT NULL, + pad VARCHAR(4096) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(k) + DISTRIBUTED BY HASH(k) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true", + "store_row_column" = "true" + ) + """ + sql """ + INSERT INTO topn_lazy_remote_only_row_store + SELECT number, + 4096 - number, + repeat(cast(number as string), 128), + repeat('x', 256) + FROM numbers("number" = "4096") + """ + + def rowStoreQuery = + "SELECT k, payload, pad FROM topn_lazy_remote_only_row_store ORDER BY sort_key LIMIT 16" + explain { + sql rowStoreQuery + contains("MaterializeNode") + } + + clearFileCache() + sql "set enable_topn_lazy_mat_phase2_no_write_file_cache = true" + runProfileQuery("topn_lazy_remote_only_row_store_remote_only_miss", + rowStoreQuery, assertRemoteOnlyMiss) + + clearFileCache() + sql "set enable_topn_lazy_mat_phase2_no_write_file_cache = false" + sql "/* topn_lazy_remote_only_row_store_local_full_hit_warm */ ${rowStoreQuery}" + sleep(1000) + + sql "set enable_topn_lazy_mat_phase2_no_write_file_cache = true" +
runProfileQuery("topn_lazy_remote_only_row_store_local_full_hit", + rowStoreQuery, assertLocalFullHit) + } +} diff --git a/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only.groovy b/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only.groovy new file mode 100644 index 00000000000000..647d0936d80d01 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only.groovy @@ -0,0 +1,344 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.Http + +suite("test_file_cache_write_index_file_only", "docker") { + def options = new ClusterOptions() + options.cloudMode = true + options.setFeNum(1) + options.setBeNum(1) + + options.beConfigs += [ + 'enable_file_cache=true', + 'enable_file_cache_write_index_file_only=true', + 'enable_file_cache_adaptive_write=true', + 'enable_file_cache_keep_base_compaction_output=true', + 'file_cache_keep_base_compaction_output_min_hit_ratio=0', + 'enable_flush_file_cache_async=false', + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'enable_packed_file=false', + 'file_cache_each_block_size=4096', + 'file_cache_path=[{"path":"/opt/apache-doris/be/storage/file_cache","total_size":83886080,"query_limit":83886080}]' + ] + + def tableName = "test_file_cache_write_index_file_only" + def loadRows = 1200 + def compactionBatchRows = 200 + def indexTokenPrefix = "tok" + def backendIdToIp = [:] + def backendIdToHttpPort = [:] + def backendIdToBrpcPort = [:] + + def clearFileCache = { beHost, beHttpPort -> + def result = Http.GET("http://${beHost}:${beHttpPort}/api/file_cache?op=clear&sync=true", true) + logger.info("clear file cache result: ${result}") + } + + def uniqueIndexToken = { int seed -> + def alphabet = "abcdefghijklmnopqrstuvwxyz" + def value = seed + def suffix = new StringBuilder() + for (int i = 0; i < 5; i++) { + suffix.append(alphabet.charAt(value % alphabet.length())) + value = (int) (value / alphabet.length()) + } + return "${indexTokenPrefix}${suffix}" + } + + def insertRows = { int batch, int rowCount -> + def data = new StringBuilder() + (0..<rowCount).each { idx -> // append one tab-separated row per index in [0, rowCount)
+ def payload = (1..24).collect { java.util.UUID.randomUUID().toString() }.join("") + def indexToken = uniqueIndexToken(batch * 100000 + idx) + data.append("${batch * 100000 + idx + 1}\t") + data.append("${rowCount - idx}\t") + data.append("tag_${batch}_${idx}\t") + data.append("quick brown profile text ${indexToken} row ${batch} ${idx}\t") + data.append("${payload}\n") + } + streamLoad { + table "${tableName}" + set 'column_separator', '\t' + set 'columns', 'id,sort_key,tag,body,payload' + inputText data.toString() + time 60000 + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("stream load result: ${result}") + def json = parseJson(result) + assert json.Status.toString().equalsIgnoreCase("success") : result + assert json.NumberLoadedRows.toString().toInteger() == rowCount : result + } + } + } + + def waitForLatestAlterOnTableFinish = { String table -> + for (int i = 0; i < 300; i++) { + def alterResult = sql """ + SHOW ALTER TABLE COLUMN WHERE TableName = "${table}" ORDER BY CreateTime DESC LIMIT 1 + """ + logger.info("latest alter table column result: ${alterResult}") + if (alterResult.size() > 0) { + def alterResultString = alterResult.toString() + assert !alterResultString.contains("CANCELLED") : + "schema change was cancelled, result=${alterResultString}" + if (alterResultString.contains("FINISHED")) { + sleep(3000) + return + } + } + sleep(1000) + } + assert false : "wait alter table column timeout, table=${table}" + } + + def parseProfileCounterValue = { String valueText -> + def exact = (valueText =~ /\((\d+)\)/) + if (exact.find()) { + return exact.group(1).toLong() + } + def number = (valueText =~ /([0-9]+(?:\.[0-9]+)?)\s*(B|KB|MB|GB)?/) + if (!number.find()) { + return 0L + } + BigDecimal value = new BigDecimal(number.group(1)) + long multiplier = 1L + if (number.group(2) == "KB") { + multiplier = 1024L + } else if (number.group(2) == "MB") { + multiplier = 1024L * 1024L + } else if 
(number.group(2) == "GB") { + multiplier = 1024L * 1024L * 1024L + } + return (value * multiplier).toLong() + } + + def sumProfileCounter = { String profileString, String counterName -> + long total = 0 + def matcher = (profileString =~ ("(?m)^\\s*(?:-\\s*)?" + + java.util.regex.Pattern.quote(counterName) + ":\\s+([^\\n]+)")) + while (matcher.find()) { + total += parseProfileCounterValue(matcher.group(1).toString()) + } + return total + } + + def sumProfileSummaryCounter = { String profileString, String counterName -> + long total = 0 + def counterRegex = counterName.split(/\s+/).collect { + java.util.regex.Pattern.quote(it) + }.join("\\s+") + def matcher = (profileString =~ ("(?m)^\\s*(?:-\\s*)?" + counterRegex + ":\\s+([^\\n]+)")) + while (matcher.find()) { + total += parseProfileCounterValue(matcher.group(1).toString()) + } + return total + } + + def assertIndexOnlyProfile = { String profileString, String label, + boolean requireSegmentFooterCacheBytes, boolean requireDataPageRemoteBytes -> + def invertedIndexCacheBytes = + sumProfileCounter(profileString, "InvertedIndexBytesScannedFromCache") + def invertedIndexRemoteBytes = + sumProfileCounter(profileString, "InvertedIndexBytesScannedFromRemote") + def segmentFooterIndexCacheBytes = + sumProfileCounter(profileString, "SegmentFooterIndexBytesScannedFromCache") + def segmentFooterIndexRemoteBytes = + sumProfileCounter(profileString, "SegmentFooterIndexBytesScannedFromRemote") + def totalCacheBytes = sumProfileCounter(profileString, "BytesScannedFromCache") + def totalRemoteBytes = sumProfileCounter(profileString, "BytesScannedFromRemote") + def parallelFragmentExecInstanceNum = + sumProfileSummaryCounter(profileString, "Parallel Fragment Exec Instance Num") + def totalInstancesNum = sumProfileSummaryCounter(profileString, "Total Instances Num") + def classifiedRemoteBytes = invertedIndexRemoteBytes + segmentFooterIndexRemoteBytes + def dataPageRemoteBytes = totalRemoteBytes - classifiedRemoteBytes + + 
logger.info("${label} profile counters: invertedIndexCacheBytes=${invertedIndexCacheBytes}, " + + "invertedIndexRemoteBytes=${invertedIndexRemoteBytes}, " + + "segmentFooterIndexCacheBytes=${segmentFooterIndexCacheBytes}, " + + "segmentFooterIndexRemoteBytes=${segmentFooterIndexRemoteBytes}, " + + "parallelFragmentExecInstanceNum=${parallelFragmentExecInstanceNum}, " + + "totalInstancesNum=${totalInstancesNum}, totalCacheBytes=${totalCacheBytes}, " + + "totalRemoteBytes=${totalRemoteBytes}, dataPageRemoteBytes=${dataPageRemoteBytes}") + + assert parallelFragmentExecInstanceNum == 1L : + "${label}: expected profile query to use one parallel fragment exec instance, profile=${profileString}" + assert invertedIndexCacheBytes > 0 : + "${label}: expected independent inverted index file to be read from local file cache, profile=${profileString}" + assert invertedIndexCacheBytes > 4096 : + "${label}: expected query to read more than the initial inverted index buffer, profile=${profileString}" + assert invertedIndexRemoteBytes == 0L : + "${label}: independent inverted index should not be read from remote storage, profile=${profileString}" + if (requireSegmentFooterCacheBytes) { + assert segmentFooterIndexCacheBytes > 0 : + "${label}: expected segment footer/index to be read from local file cache, profile=${profileString}" + } + assert segmentFooterIndexRemoteBytes == 0L : + "${label}: segment footer/index should not be read from remote storage, profile=${profileString}" + if (requireDataPageRemoteBytes) { + assert totalRemoteBytes > classifiedRemoteBytes : + "${label}: expected ordinary data pages to be read from remote storage, " + + "totalRemoteBytes=${totalRemoteBytes}, classifiedRemoteBytes=" + + "${classifiedRemoteBytes}, dataPageRemoteBytes=${dataPageRemoteBytes}, " + + "profile=${profileString}" + } + // File cache is block-aligned, so footer/index cache ranges may include nearby data pages. 
+ // Only assert data-page remote reads for stages where the query reaches uncached data pages. + } + + def logRowsetsLayout = { String label -> + def currentTablets = sql_return_maparray """ SHOW TABLETS FROM ${tableName} """ + assert currentTablets.size() == 1 + def currentTablet = currentTablets[0] + def tabletStatus = show_tablet_compaction(currentTablet) + def rowsets = tabletStatus.rowsets == null ? [] : tabletStatus.rowsets + def staleRowsets = tabletStatus.stale_rowsets == null ? [] : tabletStatus.stale_rowsets + logger.info("${label} rowsets layout: tabletId=${currentTablet.TabletId}, " + + "rowsetCount=${rowsets.size()}, staleRowsetCount=${staleRowsets.size()}, " + + "rowsets=${rowsets}, staleRowsets=${staleRowsets}") + } + + docker(options) { + getBackendIpHttpAndBrpcPort(backendIdToIp, backendIdToHttpPort, backendIdToBrpcPort) + + sql """ DROP TABLE IF EXISTS ${tableName} FORCE """ + sql """ + CREATE TABLE ${tableName} ( + id INT, + sort_key INT, + tag VARCHAR(64), + body VARCHAR(2048), + payload STRING, + INDEX body_idx(body) USING INVERTED PROPERTIES("parser" = "english") COMMENT '' + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true", + "inverted_index_storage_format" = "V2" + ) + """ + + def tablets = sql_return_maparray """ SHOW TABLETS FROM ${tableName} """ + assert tablets.size() == 1 + def tablet = tablets[0] + def beHost = backendIdToIp[tablet.BackendId] + def beHttpPort = backendIdToHttpPort[tablet.BackendId] + + clearFileCache(beHost, beHttpPort) + insertRows(1, loadRows) + sql """ SYNC """ + + def loadProfileTag = "file_cache_write_index_only_load_profile" + def loadProfileChecked = false + profile(loadProfileTag) { + sql """ SET enable_profile = true """ + sql """ SET profile_level = 2 """ + sql """ SET parallel_pipeline_task_num = 1 """ + sql """ SET inverted_index_max_expansions = 4096 """ + run { + logRowsetsLayout("before load query") + sql """ /* 
${loadProfileTag} */ SELECT id + 1 FROM ${tableName} WHERE body MATCH_REGEXP '^${indexTokenPrefix}.*' ORDER BY sort_key LIMIT 10 """ + sleep(500) + } + check { profileString, exception -> + loadProfileChecked = true + if (exception != null) { + throw exception + } + logger.info("profile snippet: {}", profileString.take(3000)) + assertIndexOnlyProfile(profileString, "load", true, true) + } + } + assert loadProfileChecked : "load profile check was not executed" + + (2..3).each { batch -> + insertRows(batch, compactionBatchRows) + } + sql """ SYNC """ + + clearFileCache(beHost, beHttpPort) + trigger_and_wait_compaction(tableName, "full") + + def compactionProfileTag = "file_cache_write_index_only_compaction_profile" + def compactionProfileChecked = false + profile(compactionProfileTag) { + sql """ SET enable_profile = true """ + sql """ SET profile_level = 2 """ + sql """ SET parallel_pipeline_task_num = 1 """ + sql """ SET inverted_index_max_expansions = 4096 """ + run { + logRowsetsLayout("before full compaction query") + sql """ + /* ${compactionProfileTag} */ + SELECT COUNT(*), SUM(id), SUM(sort_key), SUM(LENGTH(payload)) + FROM ${tableName} + WHERE body MATCH_REGEXP '^${indexTokenPrefix}.*' + """ + sleep(500) + } + check { profileString, exception -> + compactionProfileChecked = true + if (exception != null) { + throw exception + } + logger.info("profile snippet: {}", profileString.take(3000)) + assertIndexOnlyProfile(profileString, "full compaction", false, true) + } + } + assert compactionProfileChecked : "full compaction profile check was not executed" + + clearFileCache(beHost, beHttpPort) + sql """ ALTER TABLE ${tableName} ADD COLUMN sc_key BIGINT KEY DEFAULT "0" AFTER id """ + waitForLatestAlterOnTableFinish(tableName) + sql """ SYNC """ + + def schemaChangeProfileTag = "file_cache_write_index_only_schema_change_profile" + def schemaChangeProfileChecked = false + profile(schemaChangeProfileTag) { + sql """ SET enable_profile = true """ + sql """ SET 
profile_level = 2 """ + sql """ SET parallel_pipeline_task_num = 1 """ + sql """ SET inverted_index_max_expansions = 4096 """ + run { + logRowsetsLayout("before heavy schema change query") + sql """ + /* ${schemaChangeProfileTag} */ + SELECT COUNT(*), SUM(id), SUM(sort_key), SUM(LENGTH(payload)) + FROM ${tableName} + WHERE body MATCH_REGEXP '^${indexTokenPrefix}.*' + """ + sleep(500) + } + check { profileString, exception -> + schemaChangeProfileChecked = true + if (exception != null) { + throw exception + } + logger.info("profile snippet: {}", profileString.take(3000)) + assertIndexOnlyProfile(profileString, "heavy schema change", false, true) + } + } + assert schemaChangeProfileChecked : "heavy schema change profile check was not executed" + } +} diff --git a/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only_compaction_segment_data.groovy b/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only_compaction_segment_data.groovy new file mode 100644 index 00000000000000..7e7dd7b7071632 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only_compaction_segment_data.groovy @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.Http + +suite("test_file_cache_write_index_file_only_compaction_segment_data", "docker") { + def options = new ClusterOptions() + options.cloudMode = true + options.setFeNum(1) + options.setBeNum(1) + + options.beConfigs += [ + 'enable_file_cache=true', + 'enable_file_cache_write_index_file_only=true', + 'enable_file_cache_adaptive_write=true', + 'enable_file_cache_keep_base_compaction_output=true', + 'file_cache_keep_base_compaction_output_min_hit_ratio=0', + 'enable_flush_file_cache_async=false', + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'enable_packed_file=false', + 'file_cache_each_block_size=4096', + 'file_cache_path=[{"path":"/opt/apache-doris/be/storage/file_cache","total_size":83886080,"query_limit":83886080}]' + ] + + def tableName = "test_file_cache_write_index_file_only_compaction_segment_data" + def batchRows = 1200 + def backendIdToIp = [:] + def backendIdToHttpPort = [:] + def backendIdToBrpcPort = [:] + + def clearFileCache = { beHost, beHttpPort -> + def result = Http.GET("http://${beHost}:${beHttpPort}/api/file_cache?op=clear&sync=true", true) + logger.info("clear file cache result: ${result}") + } + + def insertRows = { int batch, int rowCount -> + def data = new StringBuilder() + (0..<rowCount).each { idx -> // append one tab-separated row per index in [0, rowCount)
+ def payload = (1..32).collect { java.util.UUID.randomUUID().toString() }.join("") + def rowId = batch * 100000 + idx + 1 + data.append("${rowId}\t") + data.append("${idx}\t") + data.append("tag_${batch}_${idx}\t") + data.append("scan all segment data row ${batch} ${idx}\t") + data.append("${payload}\n") + } + streamLoad { + table "${tableName}" + set 'column_separator', '\t' + set 'columns', 'id,sort_key,tag,body,payload' + inputText data.toString() + time 60000 + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("stream load result: ${result}") + def json = parseJson(result) + assert json.Status.toString().equalsIgnoreCase("success") : result + assert json.NumberLoadedRows.toString().toInteger() == rowCount : result + } + } + } + + def parseProfileCounterValue = { String valueText -> + def exact = (valueText =~ /\((\d+)\)/) + if (exact.find()) { + return exact.group(1).toLong() + } + def number = (valueText =~ /([0-9]+(?:\.[0-9]+)?)\s*(B|KB|MB|GB)?/) + if (!number.find()) { + return 0L + } + BigDecimal value = new BigDecimal(number.group(1)) + long multiplier = 1L + if (number.group(2) == "KB") { + multiplier = 1024L + } else if (number.group(2) == "MB") { + multiplier = 1024L * 1024L + } else if (number.group(2) == "GB") { + multiplier = 1024L * 1024L * 1024L + } + return (value * multiplier).toLong() + } + + def sumProfileCounter = { String profileString, String counterName -> + long total = 0 + def matcher = (profileString =~ ("(?m)^\\s*(?:-\\s*)?" + + java.util.regex.Pattern.quote(counterName) + ":\\s+([^\\n]+)")) + while (matcher.find()) { + total += parseProfileCounterValue(matcher.group(1).toString()) + } + return total + } + + def sumProfileSummaryCounter = { String profileString, String counterName -> + long total = 0 + def counterRegex = counterName.split(/\s+/).collect { + java.util.regex.Pattern.quote(it) + }.join("\\s+") + def matcher = (profileString =~ ("(?m)^\\s*(?:-\\s*)?" 
+ counterRegex + ":\\s+([^\\n]+)")) + while (matcher.find()) { + total += parseProfileCounterValue(matcher.group(1).toString()) + } + return total + } + + def logRowsetsLayout = { String label -> + def currentTablets = sql_return_maparray """ SHOW TABLETS FROM ${tableName} """ + assert currentTablets.size() == 1 + def currentTablet = currentTablets[0] + def tabletStatus = show_tablet_compaction(currentTablet) + def rowsets = tabletStatus.rowsets == null ? [] : tabletStatus.rowsets + def staleRowsets = tabletStatus.stale_rowsets == null ? [] : tabletStatus.stale_rowsets + logger.info("${label} rowsets layout: tabletId=${currentTablet.TabletId}, " + + "rowsetCount=${rowsets.size()}, staleRowsetCount=${staleRowsets.size()}, " + + "rowsets=${rowsets}, staleRowsets=${staleRowsets}") + } + + def assertCompactionSegmentDataProfile = { String profileString, String label -> + def invertedIndexCacheBytes = + sumProfileCounter(profileString, "InvertedIndexBytesScannedFromCache") + def invertedIndexRemoteBytes = + sumProfileCounter(profileString, "InvertedIndexBytesScannedFromRemote") + def segmentFooterIndexCacheBytes = + sumProfileCounter(profileString, "SegmentFooterIndexBytesScannedFromCache") + def segmentFooterIndexRemoteBytes = + sumProfileCounter(profileString, "SegmentFooterIndexBytesScannedFromRemote") + def totalCacheBytes = sumProfileCounter(profileString, "BytesScannedFromCache") + def totalRemoteBytes = sumProfileCounter(profileString, "BytesScannedFromRemote") + def parallelFragmentExecInstanceNum = + sumProfileSummaryCounter(profileString, "Parallel Fragment Exec Instance Num") + def totalInstancesNum = sumProfileSummaryCounter(profileString, "Total Instances Num") + def dataPageRemoteBytes = totalRemoteBytes - segmentFooterIndexRemoteBytes + + logger.info("${label} profile counters: invertedIndexCacheBytes=${invertedIndexCacheBytes}, " + + "invertedIndexRemoteBytes=${invertedIndexRemoteBytes}, " + + 
"segmentFooterIndexCacheBytes=${segmentFooterIndexCacheBytes}, " + + "segmentFooterIndexRemoteBytes=${segmentFooterIndexRemoteBytes}, " + + "parallelFragmentExecInstanceNum=${parallelFragmentExecInstanceNum}, " + + "totalInstancesNum=${totalInstancesNum}, totalCacheBytes=${totalCacheBytes}, " + + "totalRemoteBytes=${totalRemoteBytes}, dataPageRemoteBytes=${dataPageRemoteBytes}") + + assert parallelFragmentExecInstanceNum == 1L : + "${label}: expected profile query to use one parallel fragment exec instance, profile=${profileString}" + assert invertedIndexCacheBytes == 0L : + "${label}: table has no inverted index, profile=${profileString}" + assert invertedIndexRemoteBytes == 0L : + "${label}: table has no inverted index, profile=${profileString}" + assert segmentFooterIndexRemoteBytes == 0L : + "${label}: segment footer/index should not be read from remote storage, profile=${profileString}" + assert dataPageRemoteBytes > 0L : + "${label}: expected compacted segment data pages to be read from remote storage, " + + "profile=${profileString}" + assert totalRemoteBytes > totalCacheBytes : + "${label}: expected compacted segment data to remain mostly uncached; " + + "totalRemoteBytes=${totalRemoteBytes}, totalCacheBytes=${totalCacheBytes}, " + + "profile=${profileString}" + } + + docker(options) { + getBackendIpHttpAndBrpcPort(backendIdToIp, backendIdToHttpPort, backendIdToBrpcPort) + + sql """ DROP TABLE IF EXISTS ${tableName} FORCE """ + sql """ + CREATE TABLE ${tableName} ( + id INT, + sort_key INT, + tag VARCHAR(64), + body VARCHAR(2048), + payload STRING + ) + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true" + ) + """ + + def tablets = sql_return_maparray """ SHOW TABLETS FROM ${tableName} """ + assert tablets.size() == 1 + def tablet = tablets[0] + def beHost = backendIdToIp[tablet.BackendId] + def beHttpPort = backendIdToHttpPort[tablet.BackendId] + + clearFileCache(beHost, 
beHttpPort) + (1..3).each { batch -> + insertRows(batch, batchRows) + } + sql """ SYNC """ + + clearFileCache(beHost, beHttpPort) + trigger_and_wait_compaction(tableName, "full") + + def compactionProfileTag = "file_cache_write_index_only_compaction_segment_data_profile" + def compactionProfileChecked = false + profile(compactionProfileTag) { + sql """ SET enable_profile = true """ + sql """ SET profile_level = 2 """ + sql """ SET parallel_pipeline_task_num = 1 """ + run { + logRowsetsLayout("before full scan query after compaction") + sql """ + /* ${compactionProfileTag} */ + SELECT COUNT(DISTINCT id), SUM(id), SUM(sort_key), SUM(LENGTH(tag)), + SUM(LENGTH(body)), SUM(LENGTH(payload)) + FROM ${tableName} + """ + sleep(500) + } + check { profileString, exception -> + compactionProfileChecked = true + if (exception != null) { + throw exception + } + logger.info("profile snippet: {}", profileString.take(3000)) + assertCompactionSegmentDataProfile(profileString, "full compaction segment data") + } + } + assert compactionProfileChecked : "full compaction segment data profile check was not executed" + } +} diff --git a/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only_packed_file.groovy b/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only_packed_file.groovy new file mode 100644 index 00000000000000..d7d797aa9f4454 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/write_index_only/test_file_cache_write_index_file_only_packed_file.groovy @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.Http
+
+// Docker regression suite (cloud mode, 1 FE / 1 BE) with packed files enabled.
+//
+// Flow: stream-load one batch into a single-tablet table that has a V2 inverted
+// index on `body`, run a MATCH_REGEXP query and check its profile; then load two
+// more batches, clear the file cache, run a full compaction and check the profile
+// of a second MATCH_REGEXP query. In both stages the inverted index must be read
+// from the file cache only and segment footer/index bytes must not be read from
+// remote storage.
+suite("test_file_cache_write_index_file_only_packed_file", "docker") {
+    def options = new ClusterOptions()
+    options.cloudMode = true
+    options.setFeNum(1)
+    options.setBeNum(1)
+
+    // BE configs appended for this cluster. Packed files are enabled with both
+    // thresholds at 100 MiB (104857600 B); the file cache uses 4096-byte blocks
+    // and an 80 MiB (83886080 B) directory.
+    options.beConfigs += [
+            'enable_file_cache=true',
+            'enable_file_cache_write_index_file_only=true',
+            'enable_file_cache_adaptive_write=true',
+            'enable_file_cache_keep_base_compaction_output=true',
+            'file_cache_keep_base_compaction_output_min_hit_ratio=0',
+            'enable_flush_file_cache_async=false',
+            'file_cache_enter_disk_resource_limit_mode_percent=99',
+            'enable_evict_file_cache_in_advance=false',
+            'enable_packed_file=true',
+            'small_file_threshold_bytes=104857600',
+            'packed_file_size_threshold_bytes=104857600',
+            'file_cache_each_block_size=4096',
+            'file_cache_path=[{"path":"/opt/apache-doris/be/storage/file_cache","total_size":83886080,"query_limit":83886080}]'
+    ]
+
+    def tableName = "test_file_cache_write_index_file_only_packed_file"
+    def loadRows = 1200
+    def compactionBatchRows = 200
+    // Common prefix of every generated index token; the queries match on it.
+    def indexTokenPrefix = "tok"
+    // Filled by getBackendIpHttpAndBrpcPort() inside the docker block.
+    def backendIdToIp = [:]
+    def backendIdToHttpPort = [:]
+    def backendIdToBrpcPort = [:]
+
+    // Calls the BE file cache HTTP endpoint with op=clear&sync=true and logs the response.
+    def clearFileCache = { beHost, beHttpPort ->
+        def result = Http.GET("http://${beHost}:${beHttpPort}/api/file_cache?op=clear&sync=true", true)
+        logger.info("clear file cache result: ${result}")
+    }
+
+    // Encodes `seed` as five lowercase letters (base 26, least significant digit
+    // first) appended to indexTokenPrefix. Seeds below 26^5 map to distinct tokens.
+    def uniqueIndexToken = { int seed ->
+        def alphabet = "abcdefghijklmnopqrstuvwxyz"
+        def value = seed
+        def suffix = new StringBuilder()
+        for (int i = 0; i < 5; i++) {
+            suffix.append(alphabet.charAt(value % alphabet.length()))
+            // Groovy's `/` on ints is not integer division; the cast truncates.
+            value = (int) (value / alphabet.length())
+        }
+        return "${indexTokenPrefix}${suffix}"
+    }
+
+    // Stream-loads `rowCount` tab-separated rows for the given batch and asserts
+    // that the load succeeded and loaded exactly `rowCount` rows.
+    // Each row's body carries its own token from uniqueIndexToken; sort_key
+    // counts down from rowCount to 1.
+    def insertRows = { int batch, int rowCount ->
+        def data = new StringBuilder()
+        // Half-open range: idx takes the values 0 .. rowCount - 1.
+        (0..<rowCount).each { idx ->
+            // 24 concatenated random UUID strings form the payload column.
+            def payload = (1..24).collect { java.util.UUID.randomUUID().toString() }.join("")
+            def indexToken = uniqueIndexToken(batch * 100000 + idx)
+            data.append("${batch * 100000 + idx + 1}\t")
+            data.append("${rowCount - idx}\t")
+            data.append("tag_${batch}_${idx}\t")
+            data.append("quick brown profile text ${indexToken} row ${batch} ${idx}\t")
+            data.append("${payload}\n")
+        }
+        streamLoad {
+            table "${tableName}"
+            set 'column_separator', '\t'
+            set 'columns', 'id,sort_key,tag,body,payload'
+            inputText data.toString()
+            time 60000
+            check { result, exception, startTime, endTime ->
+                if (exception != null) {
+                    throw exception
+                }
+                logger.info("stream load result: ${result}")
+                def json = parseJson(result)
+                assert json.Status.toString().equalsIgnoreCase("success") : result
+                assert json.NumberLoadedRows.toString().toInteger() == rowCount : result
+            }
+        }
+    }
+
+    // Converts the text following a counter's colon into a long.
+    // An integer in parentheses, e.g. "(12345)", wins when present. Otherwise the
+    // first decimal number is used, scaled by an optional KB/MB/GB suffix
+    // (1024-based; "B" or no suffix means 1). Returns 0 when no number is found.
+    def parseProfileCounterValue = { String valueText ->
+        def exact = (valueText =~ /\((\d+)\)/)
+        if (exact.find()) {
+            return exact.group(1).toLong()
+        }
+        def number = (valueText =~ /([0-9]+(?:\.[0-9]+)?)\s*(B|KB|MB|GB)?/)
+        if (!number.find()) {
+            return 0L
+        }
+        BigDecimal value = new BigDecimal(number.group(1))
+        long multiplier = 1L
+        if (number.group(2) == "KB") {
+            multiplier = 1024L
+        } else if (number.group(2) == "MB") {
+            multiplier = 1024L * 1024L
+        } else if (number.group(2) == "GB") {
+            multiplier = 1024L * 1024L * 1024L
+        }
+        return (value * multiplier).toLong()
+    }
+
+    // Sums the parsed value of every profile line of the form
+    // "<optional whitespace><optional '-'><counterName>: <value>".
+    // The name is anchored at the line start, so "BytesScannedFromCache" does not
+    // also match "InvertedIndexBytesScannedFromCache".
+    def sumProfileCounter = { String profileString, String counterName ->
+        long total = 0
+        def matcher = (profileString =~ ("(?m)^\\s*(?:-\\s*)?" +
+                java.util.regex.Pattern.quote(counterName) + ":\\s+([^\\n]+)"))
+        while (matcher.find()) {
+            total += parseProfileCounterValue(matcher.group(1).toString())
+        }
+        return total
+    }
+
+    // Same as sumProfileCounter, but for multi-word names: any run of whitespace
+    // is accepted between the words of counterName.
+    def sumProfileSummaryCounter = { String profileString, String counterName ->
+        long total = 0
+        def counterRegex = counterName.split(/\s+/).collect {
+            java.util.regex.Pattern.quote(it)
+        }.join("\\s+")
+        def matcher = (profileString =~ ("(?m)^\\s*(?:-\\s*)?" + counterRegex + ":\\s+([^\\n]+)"))
+        while (matcher.find()) {
+            total += parseProfileCounterValue(matcher.group(1).toString())
+        }
+        return total
+    }
+
+    // Extracts the cache/remote byte counters from a query profile, logs them and
+    // asserts the index-only caching expectations. `label` prefixes every message.
+    //   requireSegmentFooterCacheBytes - also require footer/index cache bytes > 0
+    //   requireDataPageRemoteBytes     - also require remote bytes beyond the
+    //                                    inverted-index and footer/index remote bytes
+    def assertIndexOnlyProfile = { String profileString, String label,
+            boolean requireSegmentFooterCacheBytes, boolean requireDataPageRemoteBytes ->
+        def invertedIndexCacheBytes =
+                sumProfileCounter(profileString, "InvertedIndexBytesScannedFromCache")
+        def invertedIndexRemoteBytes =
+                sumProfileCounter(profileString, "InvertedIndexBytesScannedFromRemote")
+        def segmentFooterIndexCacheBytes =
+                sumProfileCounter(profileString, "SegmentFooterIndexBytesScannedFromCache")
+        def segmentFooterIndexRemoteBytes =
+                sumProfileCounter(profileString, "SegmentFooterIndexBytesScannedFromRemote")
+        def totalCacheBytes = sumProfileCounter(profileString, "BytesScannedFromCache")
+        def totalRemoteBytes = sumProfileCounter(profileString, "BytesScannedFromRemote")
+        def parallelFragmentExecInstanceNum =
+                sumProfileSummaryCounter(profileString, "Parallel Fragment Exec Instance Num")
+        def totalInstancesNum = sumProfileSummaryCounter(profileString, "Total Instances Num")
+        // Remote bytes attributed to inverted index or segment footer/index; the
+        // remainder of the remote total is treated as data page bytes.
+        def classifiedRemoteBytes = invertedIndexRemoteBytes + segmentFooterIndexRemoteBytes
+        def dataPageRemoteBytes = totalRemoteBytes - classifiedRemoteBytes
+
+        logger.info("${label} profile counters: invertedIndexCacheBytes=${invertedIndexCacheBytes}, " +
+                "invertedIndexRemoteBytes=${invertedIndexRemoteBytes}, " +
+                "segmentFooterIndexCacheBytes=${segmentFooterIndexCacheBytes}, " +
+                "segmentFooterIndexRemoteBytes=${segmentFooterIndexRemoteBytes}, " +
+                "parallelFragmentExecInstanceNum=${parallelFragmentExecInstanceNum}, " +
+                "totalInstancesNum=${totalInstancesNum}, totalCacheBytes=${totalCacheBytes}, " +
+                "totalRemoteBytes=${totalRemoteBytes}, dataPageRemoteBytes=${dataPageRemoteBytes}")
+
+        // Counters are summed over the whole profile, so a single instance is required.
+        assert parallelFragmentExecInstanceNum == 1L :
+                "${label}: expected profile query to use one parallel fragment exec instance, profile=${profileString}"
+        assert invertedIndexCacheBytes > 0 :
+                "${label}: expected independent inverted index file to be read from local file cache, profile=${profileString}"
+        assert invertedIndexCacheBytes > 4096 :
+                "${label}: expected query to read more than the initial inverted index buffer, profile=${profileString}"
+        assert invertedIndexRemoteBytes == 0L :
+                "${label}: independent inverted index should not be read from remote storage, profile=${profileString}"
+        if (requireSegmentFooterCacheBytes) {
+            assert segmentFooterIndexCacheBytes > 0 :
+                    "${label}: expected segment footer/index to be read from local file cache, profile=${profileString}"
+        }
+        assert segmentFooterIndexRemoteBytes == 0L :
+                "${label}: segment footer/index should not be read from remote storage, profile=${profileString}"
+        if (requireDataPageRemoteBytes) {
+            assert totalRemoteBytes > classifiedRemoteBytes :
+                    "${label}: expected ordinary data pages to be read from remote storage, " +
+                    "totalRemoteBytes=${totalRemoteBytes}, classifiedRemoteBytes=" +
+                    "${classifiedRemoteBytes}, dataPageRemoteBytes=${dataPageRemoteBytes}, " +
+                    "profile=${profileString}"
+        }
+        // File cache is block-aligned, so footer/index cache ranges may include nearby data pages.
+        // Only assert data-page remote reads for stages where the query reaches uncached data pages.
+    }
+
+    // Logs the rowsets and stale rowsets reported by show_tablet_compaction for the
+    // table's tablet. Asserts that the table has exactly one tablet.
+    def logRowsetsLayout = { String label ->
+        def currentTablets = sql_return_maparray """ SHOW TABLETS FROM ${tableName} """
+        assert currentTablets.size() == 1
+        def currentTablet = currentTablets[0]
+        def tabletStatus = show_tablet_compaction(currentTablet)
+        def rowsets = tabletStatus.rowsets == null ? [] : tabletStatus.rowsets
+        def staleRowsets = tabletStatus.stale_rowsets == null ? [] : tabletStatus.stale_rowsets
+        logger.info("${label} rowsets layout: tabletId=${currentTablet.TabletId}, " +
+                "rowsetCount=${rowsets.size()}, staleRowsetCount=${staleRowsets.size()}, " +
+                "rowsets=${rowsets}, staleRowsets=${staleRowsets}")
+    }
+
+    docker(options) {
+        getBackendIpHttpAndBrpcPort(backendIdToIp, backendIdToHttpPort, backendIdToBrpcPort)
+
+        sql """ DROP TABLE IF EXISTS ${tableName} FORCE """
+        sql """
+            CREATE TABLE ${tableName} (
+                id INT,
+                sort_key INT,
+                tag VARCHAR(64),
+                body VARCHAR(2048),
+                payload STRING,
+                INDEX body_idx(body) USING INVERTED PROPERTIES("parser" = "english") COMMENT ''
+            )
+            DUPLICATE KEY(id)
+            DISTRIBUTED BY HASH(id) BUCKETS 1
+            PROPERTIES (
+                "replication_num" = "1",
+                "disable_auto_compaction" = "true",
+                "inverted_index_storage_format" = "V2"
+            )
+        """
+
+        // One bucket => one tablet; its backend is the one whose cache is cleared.
+        def tablets = sql_return_maparray """ SHOW TABLETS FROM ${tableName} """
+        assert tablets.size() == 1
+        def tablet = tablets[0]
+        def beHost = backendIdToIp[tablet.BackendId]
+        def beHttpPort = backendIdToHttpPort[tablet.BackendId]
+
+        // Stage 1: profile a query right after the first load.
+        clearFileCache(beHost, beHttpPort)
+        insertRows(1, loadRows)
+        sql """ SYNC """
+
+        def loadProfileTag = "file_cache_write_index_only_packed_file_load_profile"
+        // Guards against the check closure silently never running.
+        def loadProfileChecked = false
+        profile(loadProfileTag) {
+            sql """ SET enable_profile = true """
+            sql """ SET profile_level = 2 """
+            sql """ SET parallel_pipeline_task_num = 1 """
+            sql """ SET inverted_index_max_expansions = 4096 """
+            run {
+                logRowsetsLayout("before packed file load query")
+                sql """ /* ${loadProfileTag} */ SELECT id + 1 FROM ${tableName} WHERE body MATCH_REGEXP '^${indexTokenPrefix}.*' ORDER BY sort_key LIMIT 10 """
+                sleep(500)
+            }
+            check { profileString, exception ->
+                loadProfileChecked = true
+                if (exception != null) {
+                    throw exception
+                }
+                logger.info("profile snippet: {}", profileString.take(3000))
+                assertIndexOnlyProfile(profileString, "packed file load", true, true)
+            }
+        }
+        assert loadProfileChecked : "packed file load profile check was not executed"
+
+        // Stage 2: add two more rowsets, clear the cache, full-compact, profile again.
+        (2..3).each { batch ->
+            insertRows(batch, compactionBatchRows)
+        }
+        sql """ SYNC """
+
+        clearFileCache(beHost, beHttpPort)
+        trigger_and_wait_compaction(tableName, "full")
+
+        def compactionProfileTag = "file_cache_write_index_only_packed_file_compaction_profile"
+        def compactionProfileChecked = false
+        profile(compactionProfileTag) {
+            sql """ SET enable_profile = true """
+            sql """ SET profile_level = 2 """
+            sql """ SET parallel_pipeline_task_num = 1 """
+            sql """ SET inverted_index_max_expansions = 4096 """
+            run {
+                logRowsetsLayout("before packed file full compaction query")
+                sql """
+                    /* ${compactionProfileTag} */
+                    SELECT COUNT(*), SUM(id), SUM(sort_key), SUM(LENGTH(payload))
+                    FROM ${tableName}
+                    WHERE body MATCH_REGEXP '^${indexTokenPrefix}.*'
+                """
+                sleep(500)
+            }
+            check { profileString, exception ->
+                compactionProfileChecked = true
+                if (exception != null) {
+                    throw exception
+                }
+                logger.info("profile snippet: {}", profileString.take(3000))
+                // Footer/index cache bytes are not required to be non-zero after compaction.
+                assertIndexOnlyProfile(profileString, "packed file full compaction", false, true)
+            }
+        }
+        assert compactionProfileChecked : "packed file full compaction profile check was not executed"
+    }
+}