diff --git a/src/duckdb/extension/parquet/include/parquet_crypto.hpp b/src/duckdb/extension/parquet/include/parquet_crypto.hpp index 2189f00f8..a29e07f13 100644 --- a/src/duckdb/extension/parquet/include/parquet_crypto.hpp +++ b/src/duckdb/extension/parquet/include/parquet_crypto.hpp @@ -36,6 +36,9 @@ class ParquetKeys : public ObjectCacheEntry { public: static string ObjectType(); string GetObjectType() override; + optional_idx GetEstimatedCacheMemory() const override { + return optional_idx {}; + } private: unordered_map keys; diff --git a/src/duckdb/extension/parquet/include/parquet_file_metadata_cache.hpp b/src/duckdb/extension/parquet/include/parquet_file_metadata_cache.hpp index 443bcd653..0d148247b 100644 --- a/src/duckdb/extension/parquet/include/parquet_file_metadata_cache.hpp +++ b/src/duckdb/extension/parquet/include/parquet_file_metadata_cache.hpp @@ -40,6 +40,7 @@ class ParquetFileMetadataCache : public ObjectCacheEntry { public: static string ObjectType(); string GetObjectType() override; + optional_idx GetEstimatedCacheMemory() const override; bool IsValid(CachingFileHandle &new_handle) const; //! Return if a cache entry is valid. diff --git a/src/duckdb/extension/parquet/include/parquet_reader.hpp b/src/duckdb/extension/parquet/include/parquet_reader.hpp index c8fde1eed..a82bf737d 100644 --- a/src/duckdb/extension/parquet/include/parquet_reader.hpp +++ b/src/duckdb/extension/parquet/include/parquet_reader.hpp @@ -9,6 +9,7 @@ #pragma once #include "duckdb.hpp" +#include "duckdb/common/helper.hpp" #include "duckdb/storage/caching_file_system.hpp" #include "duckdb/common/common.hpp" #include "duckdb/common/encryption_functions.hpp" @@ -206,7 +207,9 @@ class ParquetReader : public BaseFileReader { void AddVirtualColumn(column_t virtual_column_id) override; void GetPartitionStats(vector &result); - static void GetPartitionStats(const duckdb_parquet::FileMetaData &metadata, vector &result); + static void GetPartitionStats(const duckdb_parquet::FileMetaData &metadata, vector &result, + optional_ptr root_schema = nullptr, + optional_ptr parquet_options = nullptr); static bool MetadataCacheEnabled(ClientContext &context); static shared_ptr GetMetadataCacheEntry(ClientContext &context, const OpenFileInfo &file); diff --git a/src/duckdb/extension/parquet/parquet_crypto.cpp b/src/duckdb/extension/parquet/parquet_crypto.cpp index 1c72385dc..7adba1cdf 100644 --- a/src/duckdb/extension/parquet/parquet_crypto.cpp +++ b/src/duckdb/extension/parquet/parquet_crypto.cpp @@ -17,10 +17,7 @@ namespace duckdb { ParquetKeys &ParquetKeys::Get(ClientContext &context) { auto &cache = ObjectCache::GetObjectCache(context); - if (!cache.Get(ParquetKeys::ObjectType())) { - cache.Put(ParquetKeys::ObjectType(), make_shared_ptr()); - } - return *cache.Get(ParquetKeys::ObjectType()); + return *cache.GetOrCreate(ParquetKeys::ObjectType()); } void ParquetKeys::AddKey(const string &key_name, const string &key) { diff --git a/src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp b/src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp index a3db6a9fe..f24bde79b 100644 --- a/src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp +++ b/src/duckdb/extension/parquet/parquet_file_metadata_cache.cpp @@ -23,6 +23,31 @@ string ParquetFileMetadataCache::GetObjectType() { return ObjectType(); } +optional_idx ParquetFileMetadataCache::GetEstimatedCacheMemory() const { + // Base memory consumption + idx_t memory = sizeof(*this); + + if (metadata) { + const auto num_cols = metadata->schema.size(); + memory += sizeof(duckdb_parquet::FileMetaData); + memory += num_cols * sizeof(duckdb_parquet::SchemaElement); + memory += metadata->row_groups.size() * sizeof(duckdb_parquet::RowGroup) + + num_cols * sizeof(duckdb_parquet::ColumnChunk); + } + if (geo_metadata) { + memory += + sizeof(GeoParquetFileMetadata) + geo_metadata->GetColumnMeta().size() * sizeof(GeoParquetColumnMetadata); + } + if (crypto_metadata) { + memory += sizeof(FileCryptoMetaData); + } + + memory += footer_size; + memory += version_tag.size(); + + return memory; +} + bool ParquetFileMetadataCache::IsValid(CachingFileHandle &new_handle) const { return ExternalFileCache::IsValid(validate, version_tag, last_modified, new_handle.GetVersionTag(), new_handle.GetLastModifiedTime()); diff --git a/src/duckdb/extension/parquet/parquet_multi_file_info.cpp b/src/duckdb/extension/parquet/parquet_multi_file_info.cpp index a9d2fef44..fae40a9d6 100644 --- a/src/duckdb/extension/parquet/parquet_multi_file_info.cpp +++ b/src/duckdb/extension/parquet/parquet_multi_file_info.cpp @@ -525,17 +525,18 @@ shared_ptr ParquetMultiFileInfo::CreateReader(ClientContext &con shared_ptr ParquetReader::GetUnionData(idx_t file_idx) { auto result = make_uniq(file); + result->names.reserve(columns.size()); + result->types.reserve(columns.size()); for (auto &column : columns) { result->names.push_back(column.name); result->types.push_back(column.type); } + + result->options = parquet_options; + result->metadata = metadata; if (file_idx == 0) { - result->options = parquet_options; - result->metadata = metadata; result->reader = shared_from_this(); } else { - result->options = std::move(parquet_options); - result->metadata = std::move(metadata); result->root_schema = std::move(root_schema); } return std::move(result); diff --git a/src/duckdb/extension/parquet/parquet_reader.cpp b/src/duckdb/extension/parquet/parquet_reader.cpp index 32462cf31..52c9099db 100644 --- a/src/duckdb/extension/parquet/parquet_reader.cpp +++ b/src/duckdb/extension/parquet/parquet_reader.cpp @@ -1,42 +1,30 @@ #include "parquet_reader.hpp" -#include "reader/boolean_column_reader.hpp" -#include "reader/callback_column_reader.hpp" +#include "duckdb/common/optional_ptr.hpp" +#include "duckdb/function/partition_stats.hpp" +#include "parquet_types.h" #include "column_reader.hpp" -#include "duckdb.hpp" #include "reader/expression_column_reader.hpp" #include "parquet_geometry.hpp" #include "reader/list_column_reader.hpp" #include "parquet_crypto.hpp" #include "parquet_file_metadata_cache.hpp" #include "parquet_statistics.hpp" -#include "parquet_timestamp.hpp" #include "mbedtls_wrapper.hpp" #include "reader/row_number_column_reader.hpp" -#include "reader/string_column_reader.hpp" #include "reader/variant_column_reader.hpp" #include "reader/struct_column_reader.hpp" -#include "reader/templated_column_reader.hpp" #include "thrift_tools.hpp" #include "duckdb/main/config.hpp" #include "duckdb/common/encryption_state.hpp" #include "duckdb/common/file_system.hpp" #include "duckdb/common/helper.hpp" -#include "duckdb/common/hive_partitioning.hpp" #include "duckdb/common/string_util.hpp" #include "duckdb/planner/table_filter.hpp" #include "duckdb/storage/object_cache.hpp" #include "duckdb/optimizer/statistics_propagator.hpp" #include "duckdb/planner/table_filter_state.hpp" #include "duckdb/common/multi_file/multi_file_reader.hpp" -#include "duckdb/logging/log_manager.hpp" -#include "duckdb/common/multi_file/multi_file_column_mapper.hpp" -#include "duckdb/common/encryption_functions.hpp" - -#include -#include -#include -#include namespace duckdb { @@ -176,7 +164,7 @@ LoadMetadata(ClientContext &context, Allocator &allocator, CachingFileHandle &fi } ParquetCrypto::GenerateAdditionalAuthenticatedData(allocator, aad_crypto_metadata); ParquetCrypto::Read(*metadata, *file_proto, encryption_config->GetFooterKey(), encryption_util, - std::move(aad_crypto_metadata)); + aad_crypto_metadata); } else { metadata->read(file_proto.get()); } @@ -650,8 +638,8 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d if (is_repeated) { auto list_type = LogicalType::LIST(result.type); vector list_child = {std::move(result)}; - result = ParquetColumnSchema::FromChildSchemas(s_ele.name, std::move(list_type), max_define, max_repeat, - this_idx, next_file_idx, std::move(list_child)); + result = ParquetColumnSchema::FromChildSchemas(s_ele.name, list_type, max_define, max_repeat, this_idx, + next_file_idx, std::move(list_child)); } result.parent_schema_index = this_idx; return result; @@ -665,8 +653,8 @@ ParquetColumnSchema ParquetReader::ParseSchemaRecursive(idx_t depth, idx_t max_d if (s_ele.repetition_type == FieldRepetitionType::REPEATED) { auto list_type = LogicalType::LIST(result.type); vector list_child = {std::move(result)}; - return ParquetColumnSchema::FromChildSchemas(s_ele.name, std::move(list_type), max_define, max_repeat, - this_idx, next_file_idx, std::move(list_child)); + return ParquetColumnSchema::FromChildSchemas(s_ele.name, list_type, max_define, max_repeat, this_idx, + next_file_idx, std::move(list_child)); } return result; @@ -1233,17 +1221,64 @@ void ParquetReader::InitializeScan(ClientContext &context, ParquetReaderScanStat } void ParquetReader::GetPartitionStats(vector &result) { - GetPartitionStats(*GetFileMetadata(), result); + GetPartitionStats(*GetFileMetadata(), result, *root_schema, parquet_options); } -void ParquetReader::GetPartitionStats(const duckdb_parquet::FileMetaData &metadata, - vector &result) { +struct ParquetPartitionRowGroup : public PartitionRowGroup { + ParquetPartitionRowGroup(const duckdb_parquet::FileMetaData &metadata_p, + optional_ptr root_schema_p, + optional_ptr parquet_options_p, const idx_t row_group_idx_p) + : metadata(metadata_p), root_schema(root_schema_p), parquet_options(parquet_options_p), + row_group_idx(row_group_idx_p) { + } + + const duckdb_parquet::FileMetaData &metadata; + const optional_ptr root_schema; + const optional_ptr parquet_options; + const idx_t row_group_idx; + + unique_ptr GetColumnStatistics(const StorageIndex &storage_index) override { + const idx_t primary_index = storage_index.GetPrimaryIndex(); + D_ASSERT(metadata.row_groups.size() > row_group_idx); + D_ASSERT(root_schema->children.size() > primary_index); + + const auto &row_group = metadata.row_groups[row_group_idx]; + const auto &column_schema = root_schema->children[primary_index]; + return column_schema.Stats(metadata, *parquet_options, row_group_idx, row_group.columns); + } + + bool MinMaxIsExact(const BaseStatistics &, const StorageIndex &storage_index) override { + const idx_t primary_index = storage_index.GetPrimaryIndex(); + D_ASSERT(metadata.row_groups.size() > row_group_idx); + D_ASSERT(root_schema->children.size() > primary_index); + + const auto &row_group = metadata.row_groups[row_group_idx]; + const auto &column_chunk = row_group.columns[primary_index]; + + if (column_chunk.__isset.meta_data && column_chunk.meta_data.__isset.statistics && + column_chunk.meta_data.statistics.__isset.is_min_value_exact && + column_chunk.meta_data.statistics.__isset.is_max_value_exact) { + const auto &stats = column_chunk.meta_data.statistics; + return stats.is_min_value_exact && stats.is_max_value_exact; + } + return false; + } +}; + +void ParquetReader::GetPartitionStats(const duckdb_parquet::FileMetaData &metadata, vector &result, + optional_ptr root_schema, + optional_ptr parquet_options) { idx_t offset = 0; - for (auto &row_group : metadata.row_groups) { + for (idx_t i = 0; i < metadata.row_groups.size(); i++) { + auto &row_group = metadata.row_groups[i]; PartitionStatistics partition_stats; partition_stats.row_start = offset; partition_stats.count = row_group.num_rows; partition_stats.count_type = CountType::COUNT_EXACT; + if (root_schema && parquet_options) { + partition_stats.partition_row_group = + make_shared_ptr(metadata, root_schema, parquet_options, i); + } offset += row_group.num_rows; result.push_back(partition_stats); } diff --git a/src/duckdb/src/catalog/default/default_table_functions.cpp b/src/duckdb/src/catalog/default/default_table_functions.cpp index 94079bbcb..0351e8ce3 100644 --- a/src/duckdb/src/catalog/default/default_table_functions.cpp +++ b/src/duckdb/src/catalog/default/default_table_functions.cpp @@ -70,6 +70,17 @@ FROM histogram_values(source, col_name, bin_count := bin_count, technique := tec SELECT * EXCLUDE (message), UNNEST(parse_duckdb_log_message(log_type, message)) FROM duckdb_logs(denormalized_table=1) WHERE type ILIKE log_type +)"}, + {DEFAULT_SCHEMA, "duckdb_profiling_settings", {}, {}, R"( +SELECT * EXCLUDE(input_type, scope, aliases) + FROM duckdb_settings() + WHERE name IN ( + 'enable_profiling', + 'profiling_coverage', + 'profiling_output', + 'profiling_mode', + 'custom_profiling_settings' + ); )"}, {nullptr, nullptr, {nullptr}, {{nullptr, nullptr}}, nullptr} }; diff --git a/src/duckdb/src/common/arrow/schema_metadata.cpp b/src/duckdb/src/common/arrow/schema_metadata.cpp index d408d2bb3..4e17c4a47 100644 --- a/src/duckdb/src/common/arrow/schema_metadata.cpp +++ b/src/duckdb/src/common/arrow/schema_metadata.cpp @@ -108,7 +108,7 @@ unsafe_unique_array ArrowSchemaMetadata::SerializeMetadata() const { memcpy(metadata_ptr, &key_size, sizeof(int32_t)); metadata_ptr += sizeof(int32_t); // Key - memcpy(metadata_ptr, key.c_str(), key_size); + memcpy(metadata_ptr, key.c_str(), key.size()); metadata_ptr += key_size; const std::string &value = pair.second; const int32_t value_size = static_cast(value.size()); @@ -116,7 +116,7 @@ unsafe_unique_array ArrowSchemaMetadata::SerializeMetadata() const { memcpy(metadata_ptr, &value_size, sizeof(int32_t)); metadata_ptr += sizeof(int32_t); // Value - memcpy(metadata_ptr, value.c_str(), value_size); + memcpy(metadata_ptr, value.c_str(), value.size()); metadata_ptr += value_size; } return metadata_array_ptr; diff --git a/src/duckdb/src/common/encryption_key_manager.cpp b/src/duckdb/src/common/encryption_key_manager.cpp index b0044a5f0..409fe605c 100644 --- a/src/duckdb/src/common/encryption_key_manager.cpp +++ b/src/duckdb/src/common/encryption_key_manager.cpp @@ -51,10 +51,7 @@ void EncryptionKey::UnlockEncryptionKey(data_ptr_t key, idx_t key_len) { } EncryptionKeyManager &EncryptionKeyManager::GetInternal(ObjectCache &cache) { - if (!cache.Get(EncryptionKeyManager::ObjectType())) { - cache.Put(EncryptionKeyManager::ObjectType(), make_shared_ptr()); - } - return *cache.Get(EncryptionKeyManager::ObjectType()); + return *cache.GetOrCreate(EncryptionKeyManager::ObjectType()); } EncryptionKeyManager &EncryptionKeyManager::Get(ClientContext &context) { diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index cfb6de9af..fb5743566 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -3029,7 +3029,7 @@ const StringUtil::EnumStringLiteral *GetMetricTypeValues() { { static_cast(MetricType::OPTIMIZER_CTE_INLINING), "OPTIMIZER_CTE_INLINING" }, { static_cast(MetricType::OPTIMIZER_COMMON_SUBPLAN), "OPTIMIZER_COMMON_SUBPLAN" }, { static_cast(MetricType::OPTIMIZER_JOIN_ELIMINATION), "OPTIMIZER_JOIN_ELIMINATION" }, - { static_cast(MetricType::OPTIMIZER_COUNT_WINDOW_ELIMINATION), "OPTIMIZER_COUNT_WINDOW_ELIMINATION" }, + { static_cast(MetricType::OPTIMIZER_WINDOW_SELF_JOIN), "OPTIMIZER_WINDOW_SELF_JOIN" }, { static_cast(MetricType::ALL_OPTIMIZERS), "ALL_OPTIMIZERS" }, { static_cast(MetricType::CUMULATIVE_OPTIMIZER_TIMING), "CUMULATIVE_OPTIMIZER_TIMING" }, { static_cast(MetricType::PHYSICAL_PLANNER), "PHYSICAL_PLANNER" }, @@ -3286,7 +3286,7 @@ const StringUtil::EnumStringLiteral *GetOptimizerTypeValues() { { static_cast(OptimizerType::CTE_INLINING), "CTE_INLINING" }, { static_cast(OptimizerType::COMMON_SUBPLAN), "COMMON_SUBPLAN" }, { static_cast(OptimizerType::JOIN_ELIMINATION), "JOIN_ELIMINATION" }, - { static_cast(OptimizerType::COUNT_WINDOW_ELIMINATION), "COUNT_WINDOW_ELIMINATION" } + { static_cast(OptimizerType::WINDOW_SELF_JOIN), "WINDOW_SELF_JOIN" } }; return values; } diff --git a/src/duckdb/src/common/enums/optimizer_type.cpp b/src/duckdb/src/common/enums/optimizer_type.cpp index 353073f2a..4de651f73 100644 --- a/src/duckdb/src/common/enums/optimizer_type.cpp +++ b/src/duckdb/src/common/enums/optimizer_type.cpp @@ -45,7 +45,7 @@ static const DefaultOptimizerType internal_optimizer_types[] = { {"cte_inlining", OptimizerType::CTE_INLINING}, {"common_subplan", OptimizerType::COMMON_SUBPLAN}, {"join_elimination", OptimizerType::JOIN_ELIMINATION}, - {"count_window_elimination", OptimizerType::COUNT_WINDOW_ELIMINATION}, + {"window_self_join", OptimizerType::WINDOW_SELF_JOIN}, {nullptr, OptimizerType::INVALID}}; string OptimizerTypeToString(OptimizerType type) { diff --git a/src/duckdb/src/execution/index/art/prefix_handle.cpp b/src/duckdb/src/execution/index/art/prefix_handle.cpp index 577233905..a8d954988 100644 --- a/src/duckdb/src/execution/index/art/prefix_handle.cpp +++ b/src/duckdb/src/execution/index/art/prefix_handle.cpp @@ -20,7 +20,7 @@ PrefixHandle::PrefixHandle(FixedSizeAllocator &allocator, const Node node, const } PrefixHandle::PrefixHandle(PrefixHandle &&other) noexcept - : segment_handle(std::move(other.segment_handle)), data(other.data), child(other.child) { + : data(other.data), child(other.child), segment_handle(std::move(other.segment_handle)) { other.data = nullptr; other.child = nullptr; } diff --git a/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp b/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp index e78f8f99b..db4494ed2 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/state_machine/csv_state_machine_cache.cpp @@ -486,7 +486,7 @@ CSVStateMachineCache::CSVStateMachineCache() { const StateMachine &CSVStateMachineCache::Get(const CSVStateMachineOptions &state_machine_options) { // Custom State Machine, we need to create it and cache it first - lock_guard parallel_lock(main_mutex); + const lock_guard parallel_lock(main_mutex); if (state_machine_cache.find(state_machine_options) == state_machine_cache.end()) { Insert(state_machine_options); } diff --git a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp index b6f6a267f..990fb8828 100644 --- a/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp +++ b/src/duckdb/src/execution/operator/csv_scanner/table_function/global_csv_state.cpp @@ -180,7 +180,7 @@ void CSVGlobalState::FillRejectsTable(CSVFileScan &scan) { auto limit = options.rejects_limit; auto rejects = CSVRejectsTable::GetOrCreate(context, options.rejects_scan_name.GetValue(), options.rejects_table_name.GetValue()); - lock_guard lock(rejects->write_lock); + const lock_guard lock(rejects->write_lock); auto &errors_table = rejects->GetErrorsTable(context); auto &scans_table = rejects->GetScansTable(context); InternalAppender errors_appender(context, errors_table); diff --git a/src/duckdb/src/execution/operator/join/physical_iejoin.cpp b/src/duckdb/src/execution/operator/join/physical_iejoin.cpp index 995b1c9da..5d26425e3 100644 --- a/src/duckdb/src/execution/operator/join/physical_iejoin.cpp +++ b/src/duckdb/src/execution/operator/join/physical_iejoin.cpp @@ -252,7 +252,7 @@ struct IEJoinSourceTask { //! The thread index (for local state) idx_t thread_idx = 0; //! The chunk range - ChunkRange l_range; + ChunkRange range; }; class IEJoinLocalSourceState; @@ -992,43 +992,9 @@ bool IEJoinLocalSourceState::TryAssignTask() { // Because downstream operators may be using our internal buffers, // we can't "finish" a task until we are about to get the next one. if (task) { - switch (task->stage) { - case IEJoinSourceStage::SINK_L1: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::FINALIZE_L1: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::MATERIALIZE_L1: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::EXTRACT_LI: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::SINK_L2: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::FINALIZE_L2: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::MATERIALIZE_L2: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::EXTRACT_P: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::INNER: - ++gsource.GetStageNext(task->stage); - break; - case IEJoinSourceStage::OUTER: - ++gsource.GetStageNext(task->stage); - left_matches = nullptr; - right_matches = nullptr; - break; - case IEJoinSourceStage::INIT: - case IEJoinSourceStage::DONE: - break; - } + ++gsource.GetStageNext(task->stage); + left_matches = nullptr; + right_matches = nullptr; } if (!gsource.TryNextTask(task, task_local)) { @@ -1057,18 +1023,18 @@ bool IEJoinLocalSourceState::TryAssignTask() { right_block_index = 0; right_base = 0; - joiner = make_uniq(gsource, task->l_range); + joiner = make_uniq(gsource, task->range); break; case IEJoinSourceStage::OUTER: if (task->thread_idx < gsource.left_outers) { - left_block_index = task->l_range.first; + left_block_index = task->range.first; left_base = left_table.BlockStart(left_block_index); left_matches = left_table.found_match.get() + left_base; outer_idx = 0; outer_count = left_table.BlockSize(left_block_index); } else { - right_block_index = task->l_range.first; + right_block_index = task->range.first; right_base = right_table.BlockStart(right_block_index); right_matches = right_table.found_match.get() + right_base; @@ -1096,8 +1062,8 @@ void IEJoinLocalSourceState::ExecuteSinkL1Task(ExecutionContext &context, Interr auto &l1 = gsource.l1; // Process the LHS sub-range - if (task->l_range.first < gsource.left_blocks) { - auto range = task->l_range; + if (task->range.first < gsource.left_blocks) { + auto range = task->range; range.second = MinValue(gsource.left_blocks, range.second); // LHS has positive rids @@ -1112,8 +1078,8 @@ void IEJoinLocalSourceState::ExecuteSinkL1Task(ExecutionContext &context, Interr } // Process the RHS sub-range - if (task->l_range.second > gsource.left_blocks) { - auto range = task->l_range; + if (task->range.second > gsource.left_blocks) { + auto range = task->range; range.first = MaxValue(gsource.left_blocks, range.first) - gsource.left_blocks; range.second -= gsource.left_blocks; @@ -1147,8 +1113,8 @@ void IEJoinLocalSourceState::ExecuteSinkL2Task(ExecutionContext &context, Interr ExpressionExecutor executor(context.client); executor.AddExpression(*ref); - const auto rid = UnsafeNumericCast(l1.BlockStart(task->l_range.first)); - IEJoinUnion::AppendKey(context, interrupt, l1, executor, l2, 1, rid, task->l_range); + const auto rid = UnsafeNumericCast(l1.BlockStart(task->range.first)); + IEJoinUnion::AppendKey(context, interrupt, l1, executor, l2, 1, rid, task->range); } void IEJoinLocalSourceState::ExecuteFinalizeL2Task(ExecutionContext &context, InterruptState &interrupt) { @@ -1367,75 +1333,15 @@ bool IEJoinGlobalSourceState::TryPrepareNextStage() { // Inside lock const auto stage_count = GetStageCount(stage); const auto stage_next = GetStageNext(stage).load(); - switch (stage.load()) { - case IEJoinSourceStage::INIT: - stage = IEJoinSourceStage::SINK_L1; - return true; - case IEJoinSourceStage::SINK_L1: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::FINALIZE_L1; - return true; - } - break; - case IEJoinSourceStage::FINALIZE_L1: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::MATERIALIZE_L1; - return true; - } - break; - case IEJoinSourceStage::MATERIALIZE_L1: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::EXTRACT_LI; - return true; - } - break; - case IEJoinSourceStage::EXTRACT_LI: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::SINK_L2; - return true; - } - break; - case IEJoinSourceStage::SINK_L2: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::FINALIZE_L2; - return true; - } - break; - case IEJoinSourceStage::FINALIZE_L2: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::MATERIALIZE_L2; - return true; - } - break; - case IEJoinSourceStage::MATERIALIZE_L2: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::EXTRACT_P; - return true; - } - break; - case IEJoinSourceStage::EXTRACT_P: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::INNER; - return true; - } - break; - case IEJoinSourceStage::INNER: - if (stage_next >= stage_count) { - if (GetStageCount(IEJoinSourceStage::OUTER)) { - stage = IEJoinSourceStage::OUTER; - } else { - stage = IEJoinSourceStage::DONE; + if (stage_next >= stage_count) { + auto stage_curr = stage.load(); + while (stage_curr < IEJoinSourceStage::DONE) { + stage_curr = IEJoinSourceStage(size_t(stage_curr) + 1); + if (GetStageCount(stage_curr)) { + break; } - return true; } - break; - case IEJoinSourceStage::OUTER: - if (stage_next >= stage_count) { - stage = IEJoinSourceStage::DONE; - return true; - } - break; - case IEJoinSourceStage::DONE: + stage = stage_curr; return true; } @@ -1443,9 +1349,8 @@ bool IEJoinGlobalSourceState::TryPrepareNextStage() { } idx_t IEJoinGlobalSourceState::MaxThreads() { - // We can't leverage any more threads than block pairs. - const auto &sink_state = (op.sink_state->Cast()); - return sink_state.tables[0]->BlockCount() * sink_state.tables[1]->BlockCount(); + // We can't leverage any more threads than tasks. + return *max_element(stage_tasks.begin(), stage_tasks.end()); } void IEJoinGlobalSourceState::FinishTask(TaskPtr task) { @@ -1502,35 +1407,35 @@ bool IEJoinGlobalSourceState::TryNextTask(Task &task) { switch (stage.load()) { case IEJoinSourceStage::SINK_L1: - task.l_range.first = MinValue(task.thread_idx * per_thread, left_blocks + right_blocks); - task.l_range.second = MinValue(task.l_range.first + per_thread, left_blocks + right_blocks); + task.range.first = MinValue(task.thread_idx * per_thread, left_blocks + right_blocks); + task.range.second = MinValue(task.range.first + per_thread, left_blocks + right_blocks); break; case IEJoinSourceStage::FINALIZE_L1: case IEJoinSourceStage::MATERIALIZE_L1: case IEJoinSourceStage::EXTRACT_LI: break; case IEJoinSourceStage::SINK_L2: - task.l_range.first = MinValue(task.thread_idx * per_thread, l1->BlockCount()); - task.l_range.second = MinValue(task.l_range.first + per_thread, l1->BlockCount()); + task.range.first = MinValue(task.thread_idx * per_thread, l1->BlockCount()); + task.range.second = MinValue(task.range.first + per_thread, l1->BlockCount()); break; case IEJoinSourceStage::FINALIZE_L2: case IEJoinSourceStage::MATERIALIZE_L2: case IEJoinSourceStage::EXTRACT_P: break; case IEJoinSourceStage::INNER: { - task.l_range.first = task.thread_idx * per_thread; - task.l_range.second = MinValue(task.l_range.first + per_thread, l2_blocks); + task.range.first = task.thread_idx * per_thread; + task.range.second = MinValue(task.range.first + per_thread, l2_blocks); break; } case IEJoinSourceStage::OUTER: if (task.thread_idx < left_outers) { // Left outer blocks const auto left_task = task.thread_idx; - task.l_range = {left_task, left_task + 1}; + task.range = {left_task, left_task + 1}; } else { // Right outer blocks const auto right_task = task.thread_idx - left_outers; - task.l_range = {right_task, right_task + 1}; + task.range = {right_task, right_task + 1}; } break; case IEJoinSourceStage::INIT: diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index f2eb883fb..84e30b743 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "0-dev5021" +#define DUCKDB_PATCH_VERSION "0-dev5079" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 5 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.5.0-dev5021" +#define DUCKDB_VERSION "v1.5.0-dev5079" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "041f4cac68" +#define DUCKDB_SOURCE_ID "9fffe27818" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/function/window/window_boundaries_state.cpp b/src/duckdb/src/function/window/window_boundaries_state.cpp index 84fbc7929..26af368a0 100644 --- a/src/duckdb/src/function/window/window_boundaries_state.cpp +++ b/src/duckdb/src/function/window/window_boundaries_state.cpp @@ -132,7 +132,7 @@ struct WindowColumnIterator { return coll->GetCell(0, pos + m); } - friend inline iterator &operator+(const iterator &a, difference_type n) { + friend inline iterator operator+(const iterator &a, difference_type n) { return iterator(a.coll, a.pos + n); } diff --git a/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp b/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp index d37423430..cec709f4e 100644 --- a/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp +++ b/src/duckdb/src/include/duckdb/common/encryption_key_manager.hpp @@ -55,6 +55,9 @@ class EncryptionKeyManager : public ObjectCacheEntry { public: static string ObjectType(); string GetObjectType() override; + optional_idx GetEstimatedCacheMemory() const override { + return optional_idx {}; + } public: public: diff --git a/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp b/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp index 38c3b94c2..60db5da1f 100644 --- a/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp +++ b/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp @@ -93,7 +93,7 @@ enum class MetricType : uint8_t { OPTIMIZER_CTE_INLINING, OPTIMIZER_COMMON_SUBPLAN, OPTIMIZER_JOIN_ELIMINATION, - OPTIMIZER_COUNT_WINDOW_ELIMINATION, + OPTIMIZER_WINDOW_SELF_JOIN, // PhaseTiming metrics ALL_OPTIMIZERS, CUMULATIVE_OPTIMIZER_TIMING, @@ -129,7 +129,7 @@ class MetricsUtils { static constexpr uint8_t END_OPERATOR = static_cast(MetricType::OPERATOR_TYPE); static constexpr uint8_t START_OPTIMIZER = static_cast(MetricType::OPTIMIZER_EXPRESSION_REWRITER); - static constexpr uint8_t END_OPTIMIZER = static_cast(MetricType::OPTIMIZER_COUNT_WINDOW_ELIMINATION); + static constexpr uint8_t END_OPTIMIZER = static_cast(MetricType::OPTIMIZER_WINDOW_SELF_JOIN); static constexpr uint8_t START_PHASE_TIMING = static_cast(MetricType::ALL_OPTIMIZERS); static constexpr uint8_t END_PHASE_TIMING = static_cast(MetricType::PLANNER_BINDING); diff --git a/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp b/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp index 8d2928af4..7207dc873 100644 --- a/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp +++ b/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp @@ -47,7 +47,7 @@ enum class OptimizerType : uint32_t { CTE_INLINING, COMMON_SUBPLAN, JOIN_ELIMINATION, - COUNT_WINDOW_ELIMINATION + WINDOW_SELF_JOIN }; string OptimizerTypeToString(OptimizerType type); diff --git a/src/duckdb/src/include/duckdb/common/lru_cache.hpp b/src/duckdb/src/include/duckdb/common/lru_cache.hpp new file mode 100644 index 000000000..7b8db14be --- /dev/null +++ b/src/duckdb/src/include/duckdb/common/lru_cache.hpp @@ -0,0 +1,146 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/common/lru_cache.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +#include "duckdb/common/common.hpp" +#include "duckdb/common/list.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/shared_ptr.hpp" +#include "duckdb/common/string.hpp" +#include "duckdb/common/unordered_map.hpp" + +namespace duckdb { + +// A LRU cache implementation, whose value could be accessed in a shared manner with shared pointer. +// Notice, it's not thread-safe. +template , typename KeyEqual = std::equal_to> +class SharedLruCache { +public: + using key_type = Key; + using mapped_type = shared_ptr; + using hasher = KeyHash; + using key_equal = KeyEqual; + + // @param max_memory_p: Maximum total memory (in bytes) of entries. 0 means unlimited. + explicit SharedLruCache(idx_t max_memory_p) : max_memory(max_memory_p), current_memory(0) { + } + + // Disable copy and move + SharedLruCache(const SharedLruCache &) = delete; + SharedLruCache &operator=(const SharedLruCache &) = delete; + + ~SharedLruCache() = default; + + // Insert `value` with key `key` and explicit memory size. This will replace any previous entry with the same key. + void Put(Key key, shared_ptr value, idx_t memory_size) { + // Remove existing entry if present + auto existing_it = entry_map.find(key); + if (existing_it != entry_map.end()) { + DeleteImpl(existing_it); + } + + // Evict entries if needed to make room + if (max_memory > 0 && memory_size > 0) { + EvictIfNeeded(memory_size); + } + + // Add new entry + lru_list.emplace_front(key); + Entry new_entry; + new_entry.value = std::move(value); + new_entry.memory = memory_size; + new_entry.lru_iterator = lru_list.begin(); + + entry_map[std::move(key)] = std::move(new_entry); + current_memory += memory_size; + } + + // Delete the entry with key `key`. + // Return whether the requested `key` is found in the cache. + bool Delete(const Key &key) { + auto it = entry_map.find(key); + if (it == entry_map.end()) { + return false; + } + DeleteImpl(it); + return true; + } + + // Look up the entry with key `key`. Return nullptr if not found. + shared_ptr Get(const Key &key) { + auto entry_map_iter = entry_map.find(key); + if (entry_map_iter == entry_map.end()) { + return nullptr; + } + + // Move to front, which indicates most recently used. + lru_list.splice(lru_list.begin(), lru_list, entry_map_iter->second.lru_iterator); + return entry_map_iter->second.value; + } + + // Clear the whole cache. + void Clear() { + entry_map.clear(); + lru_list.clear(); + current_memory = 0; + } + + idx_t MaxMemory() const { + return max_memory; + } + idx_t CurrentMemory() const { + return current_memory; + } + size_t Size() const { + return entry_map.size(); + } + +private: + struct Entry { + shared_ptr value; + idx_t memory; + typename list::iterator lru_iterator; + }; + + using EntryMap = unordered_map; + + void DeleteImpl(typename EntryMap::iterator iter) { + current_memory -= iter->second.memory; + D_ASSERT(current_memory >= 0); + lru_list.erase(iter->second.lru_iterator); + entry_map.erase(iter); + } + + void EvictIfNeeded(idx_t required_memory) { + if (max_memory == 0) { + return; + } + + // Evict LRU entries until we have enough space + while (!lru_list.empty() && (current_memory + required_memory > max_memory)) { + const auto &stale_key = lru_list.back(); + auto stale_it = entry_map.find(stale_key); + if (stale_it != entry_map.end()) { + DeleteImpl(stale_it); + } else { + // Should not happen, but be defensive + lru_list.pop_back(); + } + } + } + + const idx_t max_memory; + idx_t current_memory; + EntryMap entry_map; + list lru_list; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp index 0e2bb1d0a..fe523276a 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/csv_scanner/csv_state_machine_cache.hpp @@ -80,6 +80,10 @@ class CSVStateMachineCache : public ObjectCacheEntry { return ObjectType(); } + optional_idx GetEstimatedCacheMemory() const override { + return optional_idx {}; + } + private: void Insert(const CSVStateMachineOptions &state_machine_options); //! Cache on delimiter|quote|escape|newline diff --git a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp index 2cae7e3ce..415f217b2 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_rejects_table.hpp @@ -48,6 +48,11 @@ class CSVRejectsTable : public ObjectCacheEntry { return ObjectType(); } + // Rejects table records the overall error counts, which is relatively small and should not be evicted. + optional_idx GetEstimatedCacheMemory() const override { + return optional_idx {}; + } + private: //! Current File Index being used in the query idx_t current_file_idx = 0; diff --git a/src/duckdb/src/include/duckdb/function/partition_stats.hpp b/src/duckdb/src/include/duckdb/function/partition_stats.hpp index 8247861ec..508a8997a 100644 --- a/src/duckdb/src/include/duckdb/function/partition_stats.hpp +++ b/src/duckdb/src/include/duckdb/function/partition_stats.hpp @@ -29,7 +29,7 @@ enum class CountType { COUNT_EXACT, COUNT_APPROXIMATE }; struct PartitionRowGroup { virtual ~PartitionRowGroup() = default; virtual unique_ptr GetColumnStatistics(const StorageIndex &storage_index) = 0; - virtual bool MinMaxIsExact(const BaseStatistics &stats) = 0; + virtual bool MinMaxIsExact(const BaseStatistics &stats, const StorageIndex &storage_index) = 0; }; struct PartitionStatistics { diff --git a/src/duckdb/src/include/duckdb/optimizer/count_window_elimination.hpp b/src/duckdb/src/include/duckdb/optimizer/window_self_join.hpp similarity index 100% rename from src/duckdb/src/include/duckdb/optimizer/count_window_elimination.hpp rename to src/duckdb/src/include/duckdb/optimizer/window_self_join.hpp diff --git a/src/duckdb/src/include/duckdb/storage/object_cache.hpp b/src/duckdb/src/include/duckdb/storage/object_cache.hpp index 96c3bf718..1adc154e5 100644 --- a/src/duckdb/src/include/duckdb/storage/object_cache.hpp +++ b/src/duckdb/src/include/duckdb/storage/object_cache.hpp @@ -9,9 +9,10 @@ #pragma once #include "duckdb/common/common.hpp" +#include "duckdb/common/lru_cache.hpp" +#include "duckdb/common/mutex.hpp" #include "duckdb/common/string.hpp" #include "duckdb/common/unordered_map.hpp" -#include "duckdb/common/mutex.hpp" #include "duckdb/main/client_context.hpp" #include "duckdb/main/database.hpp" @@ -24,17 +25,35 @@ class ObjectCacheEntry { } virtual string GetObjectType() = 0; + + //! Get the rough cache memory usage in bytes for this entry. + //! Used for eviction decisions. Return invalid index to prevent eviction. + virtual optional_idx GetEstimatedCacheMemory() const = 0; }; class ObjectCache { public: + //! Default max memory 8GiB for non-evictable cache entries. + // + // TODO(hjiang): Hard-code a large enough memory consumption upper bound, which is likely a non-regression change. + // I will followup with another PR before v1.5.0 release to provide a user option to tune. + // + // A few consideration here: should we cap object cache memory consumption with duckdb max memory or separate. + static constexpr idx_t DEFAULT_MAX_MEMORY = 8ULL * 1024 * 1024 * 1024; + + ObjectCache() : ObjectCache(DEFAULT_MAX_MEMORY) { + } + + explicit ObjectCache(idx_t max_memory) : lru_cache(max_memory) { + } + shared_ptr GetObject(const string &key) { - lock_guard glock(lock); - auto entry = cache.find(key); - if (entry == cache.end()) { - return nullptr; + const lock_guard lock(lock_mutex); + auto non_evictable_it = non_evictable_entries.find(key); + if (non_evictable_it != non_evictable_entries.end()) { + return non_evictable_it->second; } - return entry->second; + return lru_cache.Get(key); } template @@ -48,37 +67,86 @@ class ObjectCache { template shared_ptr GetOrCreate(const string &key, ARGS &&... args) { - lock_guard glock(lock); + const lock_guard lock(lock_mutex); - auto entry = cache.find(key); - if (entry == cache.end()) { - auto value = make_shared_ptr(args...); - cache[key] = value; - return value; + // Check non-evictable entries first + auto non_evictable_it = non_evictable_entries.find(key); + if (non_evictable_it != non_evictable_entries.end()) { + auto &existing = non_evictable_it->second; + if (existing->GetObjectType() != T::ObjectType()) { + return nullptr; + } + return shared_ptr_cast(existing); } - auto object = entry->second; - if (!object || object->GetObjectType() != T::ObjectType()) { - return nullptr; + + // Check evictable cache + auto existing = lru_cache.Get(key); + if (existing) { + if (existing->GetObjectType() != T::ObjectType()) { + return nullptr; + } + return shared_ptr_cast(existing); } - return shared_ptr_cast(object); + + // Create new entry while holding lock + auto value = make_shared_ptr(args...); + const auto estimated_memory = value->GetEstimatedCacheMemory(); + const bool is_evictable = estimated_memory.IsValid(); + if (!is_evictable) { + non_evictable_entries[key] = value; + } else { + lru_cache.Put(key, value, estimated_memory.GetIndex()); + } + + return value; } void Put(string key, shared_ptr value) { - lock_guard glock(lock); - cache.insert(make_pair(std::move(key), std::move(value))); + if (!value) { + return; + } + + const lock_guard lock(lock_mutex); + const auto estimated_memory = value->GetEstimatedCacheMemory(); + const bool is_evictable = estimated_memory.IsValid(); + if (!is_evictable) { + non_evictable_entries[std::move(key)] = std::move(value); + return; + } + lru_cache.Put(std::move(key), std::move(value), estimated_memory.GetIndex()); } void Delete(const string &key) { - lock_guard glock(lock); - cache.erase(key); + const lock_guard lock(lock_mutex); + auto iter = non_evictable_entries.find(key); + if (iter != non_evictable_entries.end()) { + non_evictable_entries.erase(iter); + return; + } + lru_cache.Delete(key); } DUCKDB_API static ObjectCache &GetObjectCache(ClientContext &context); + idx_t GetMaxMemory() const { + const lock_guard lock(lock_mutex); + return lru_cache.MaxMemory(); + } + idx_t GetCurrentMemory() const { + const lock_guard lock(lock_mutex); + return lru_cache.CurrentMemory(); + } + size_t GetEntryCount() const { + const lock_guard lock(lock_mutex); + return lru_cache.Size() + non_evictable_entries.size(); + } + private: - //! Object Cache - unordered_map> cache; - mutex lock; + mutable mutex lock_mutex; + //! LRU cache for evictable entries + SharedLruCache lru_cache; + //! Separate storage for non-evictable entries (i.e., encryption keys) + unordered_map> non_evictable_entries; }; } // namespace duckdb diff --git a/src/duckdb/src/main/capi/file_system-c.cpp b/src/duckdb/src/main/capi/file_system-c.cpp index e697c363d..8d9e643b6 100644 --- a/src/duckdb/src/main/capi/file_system-c.cpp +++ b/src/duckdb/src/main/capi/file_system-c.cpp @@ -99,11 +99,17 @@ void duckdb_destroy_file_open_options(duckdb_file_open_options *options) { duckdb_state duckdb_file_system_open(duckdb_file_system fs, const char *path, duckdb_file_open_options options, duckdb_file_handle *out_file) { if (!fs) { - *out_file = nullptr; + if (out_file) { + *out_file = nullptr; + } return DuckDBError; } auto cfs = reinterpret_cast(fs); - if (!path || !options || !out_file) { + if (!out_file) { + cfs->SetError("Invalid out file to duckdb_file_system_open"); + return DuckDBError; + } + if (!path || !options) { cfs->SetError("Invalid input to duckdb_file_system_open"); *out_file = nullptr; return DuckDBError; diff --git a/src/duckdb/src/main/query_profiler.cpp b/src/duckdb/src/main/query_profiler.cpp index 37efd44e0..fef9d9fe2 100644 --- a/src/duckdb/src/main/query_profiler.cpp +++ b/src/duckdb/src/main/query_profiler.cpp @@ -262,11 +262,11 @@ void QueryProfiler::AddToCounter(const MetricType type, const idx_t amount) { } idx_t QueryProfiler::GetBytesRead() const { - return query_metrics.GetMetricsIndex(MetricType::TOTAL_BYTES_READ); + return query_metrics.GetMetricValue(MetricType::TOTAL_BYTES_READ); } idx_t QueryProfiler::GetBytesWritten() const { - return query_metrics.GetMetricsIndex(MetricType::TOTAL_BYTES_WRITTEN); + return query_metrics.GetMetricValue(MetricType::TOTAL_BYTES_WRITTEN); } ActiveTimer QueryProfiler::StartTimer(const MetricType type) { diff --git a/src/duckdb/src/optimizer/optimizer.cpp b/src/duckdb/src/optimizer/optimizer.cpp index 4f811bd84..f6b767d25 100644 --- a/src/duckdb/src/optimizer/optimizer.cpp +++ b/src/duckdb/src/optimizer/optimizer.cpp @@ -38,7 +38,7 @@ #include "duckdb/optimizer/unnest_rewriter.hpp" #include "duckdb/optimizer/late_materialization.hpp" #include "duckdb/optimizer/common_subplan_optimizer.hpp" -#include "duckdb/optimizer/count_window_elimination.hpp" +#include "duckdb/optimizer/window_self_join.hpp" #include "duckdb/planner/binder.hpp" #include "duckdb/planner/planner.hpp" @@ -187,7 +187,8 @@ void Optimizer::RunBuiltInOptimizers() { plan = empty_result_pullup.Optimize(std::move(plan)); }); - RunOptimizer(OptimizerType::COUNT_WINDOW_ELIMINATION, [&]() { + // Replaces some window computations with self-joins + RunOptimizer(OptimizerType::WINDOW_SELF_JOIN, [&]() { WindowSelfJoinOptimizer window_self_join_optimizer(*this); plan = window_self_join_optimizer.Optimize(std::move(plan)); }); diff --git a/src/duckdb/src/optimizer/statistics/operator/propagate_aggregate.cpp b/src/duckdb/src/optimizer/statistics/operator/propagate_aggregate.cpp index 5c17124f4..7417b8112 100644 --- a/src/duckdb/src/optimizer/statistics/operator/propagate_aggregate.cpp +++ b/src/duckdb/src/optimizer/statistics/operator/propagate_aggregate.cpp @@ -73,7 +73,7 @@ bool TryGetValueFromStats(const PartitionStatistics &stats, const StorageIndex & return false; } auto column_stats = stats.partition_row_group->GetColumnStatistics(storage_index); - if (!stats.partition_row_group->MinMaxIsExact(*column_stats)) { + if (!stats.partition_row_group->MinMaxIsExact(*column_stats, storage_index)) { return false; } if (column_stats->GetStatsType() == StatisticsType::NUMERIC_STATS) { diff --git a/src/duckdb/src/optimizer/topn_window_elimination.cpp b/src/duckdb/src/optimizer/topn_window_elimination.cpp index 3026e0a95..31cb43215 100644 --- a/src/duckdb/src/optimizer/topn_window_elimination.cpp +++ b/src/duckdb/src/optimizer/topn_window_elimination.cpp @@ -466,7 +466,13 @@ bool TopNWindowElimination::CanOptimize(LogicalOperator &op) { return false; } - if (filter.expressions[0]->type != ExpressionType::COMPARE_LESSTHANOREQUALTO) { + const auto comparison = filter.expressions[0]->type; + switch (comparison) { + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + case ExpressionType::COMPARE_LESSTHAN: + case ExpressionType::COMPARE_EQUAL: + break; + default: return false; } @@ -478,7 +484,26 @@ bool TopNWindowElimination::CanOptimize(LogicalOperator &op) { if (filter_value.value.type() != LogicalType::BIGINT) { return false; } - if (filter_value.value.GetValue() < 1) { + + const auto bigint_value = filter_value.value.GetValue(); + switch (comparison) { + case ExpressionType::COMPARE_LESSTHANOREQUALTO: + if (bigint_value < 1) { + return false; + } + break; + case ExpressionType::COMPARE_LESSTHAN: + if (bigint_value < 2) { + return false; + } + break; + case ExpressionType::COMPARE_EQUAL: + // TODO: Handle other values + if (bigint_value != 1) { + return false; + } + break; + default: return false; } @@ -688,8 +713,12 @@ TopNWindowElimination::ExtractOptimizerParameters(const LogicalWindow &window, c vector> &aggregate_payload) { TopNWindowEliminationParameters params; - auto &limit_expr = filter.expressions[0]->Cast().right; + auto &filter_expr = filter.expressions[0]->Cast(); + auto &limit_expr = filter_expr.right; params.limit = limit_expr->Cast().value.GetValue(); + if (filter_expr.GetExpressionType() == ExpressionType::COMPARE_LESSTHAN) { + --params.limit; + } params.include_row_number = BindingsReferenceRowNumber(bindings, window); params.payload_type = aggregate_payload.size() > 1 ? TopNPayloadType::STRUCT_PACK : TopNPayloadType::SINGLE_COLUMN; auto &window_expr = window.expressions[0]->Cast(); diff --git a/src/duckdb/src/optimizer/count_window_elimination.cpp b/src/duckdb/src/optimizer/window_self_join.cpp similarity index 97% rename from src/duckdb/src/optimizer/count_window_elimination.cpp rename to src/duckdb/src/optimizer/window_self_join.cpp index 53eaf0f00..def09507b 100644 --- a/src/duckdb/src/optimizer/count_window_elimination.cpp +++ b/src/duckdb/src/optimizer/window_self_join.cpp @@ -1,4 +1,4 @@ -#include "duckdb/optimizer/count_window_elimination.hpp" +#include "duckdb/optimizer/window_self_join.hpp" #include "duckdb/optimizer/optimizer.hpp" #include "duckdb/planner/binder.hpp" #include "duckdb/planner/operator/logical_filter.hpp" @@ -20,9 +20,9 @@ namespace duckdb { -class CountWindowTableRebinder : public LogicalOperatorVisitor { +class WindowSelfJoinTableRebinder : public LogicalOperatorVisitor { public: - explicit CountWindowTableRebinder(Optimizer &optimizer) : optimizer(optimizer) { + explicit WindowSelfJoinTableRebinder(Optimizer &optimizer) : optimizer(optimizer) { } unordered_map table_map; @@ -147,7 +147,7 @@ unique_ptr WindowSelfJoinOptimizer::OptimizeInternal(unique_ptr auto copy_child = original_child->Copy(optimizer.context); // Rebind copy_child to avoid duplicate table indices - CountWindowTableRebinder rebinder(optimizer); + WindowSelfJoinTableRebinder rebinder(optimizer); rebinder.VisitOperator(*copy_child); auto aggregate_index = optimizer.binder.GenerateTableIndex(); diff --git a/src/duckdb/src/storage/compression/roaring/analyze.cpp b/src/duckdb/src/storage/compression/roaring/analyze.cpp index 5332b0333..080cee1f2 100644 --- a/src/duckdb/src/storage/compression/roaring/analyze.cpp +++ b/src/duckdb/src/storage/compression/roaring/analyze.cpp @@ -23,7 +23,7 @@ static unsafe_unique_array CreateBitmaskTable() { result = make_unsafe_uniq_array_uninitialized(NumericLimits::Maximum() + 1); for (uint16_t val = 0; val < NumericLimits::Maximum() + 1; val++) { - bool previous_bit; + bool previous_bit = false; auto &entry = result[val]; entry.valid_count = 0; entry.run_count = 0; @@ -36,7 +36,7 @@ static unsafe_unique_array CreateBitmaskTable() { } entry.valid_count += bit_set; - if (i && !bit_set && previous_bit == true) { + if (!bit_set && previous_bit) { entry.run_count++; } previous_bit = bit_set; diff --git a/src/duckdb/src/storage/table/row_group.cpp b/src/duckdb/src/storage/table/row_group.cpp index 1834b2cf7..aaecfe71e 100644 --- a/src/duckdb/src/storage/table/row_group.cpp +++ b/src/duckdb/src/storage/table/row_group.cpp @@ -191,7 +191,7 @@ void RowGroup::InitializeEmpty(const vector &types, ColumnDataType static unique_ptr CreateCast(ClientContext &context, const LogicalType &original_type, const LogicalType &cast_type) { - auto input = make_uniq(original_type, 0); + auto input = make_uniq(original_type, 0U); auto cast_expression = BoundCastExpression::AddCastToType(context, std::move(input), cast_type); auto res = make_uniq(context); res->target.Initialize(context, {cast_type}); @@ -1429,7 +1429,7 @@ struct DuckDBPartitionRowGroup : public PartitionRowGroup { return row_group.GetStatistics(storage_index); } - bool MinMaxIsExact(const BaseStatistics &stats) override { + bool MinMaxIsExact(const BaseStatistics &stats, const StorageIndex &) override { if (!is_exact || row_group.HasChanges()) { return false; } diff --git a/src/duckdb/ub_src_optimizer.cpp b/src/duckdb/ub_src_optimizer.cpp index 89c5e7a6c..fb099e429 100644 --- a/src/duckdb/ub_src_optimizer.cpp +++ b/src/duckdb/ub_src_optimizer.cpp @@ -12,8 +12,6 @@ #include "src/optimizer/compressed_materialization.cpp" -#include "src/optimizer/count_window_elimination.cpp" - #include "src/optimizer/cse_optimizer.cpp" #include "src/optimizer/cte_filter_pusher.cpp" @@ -62,6 +60,8 @@ #include "src/optimizer/unnest_rewriter.cpp" +#include "src/optimizer/window_self_join.cpp" + #include "src/optimizer/sampling_pushdown.cpp" #include "src/optimizer/sum_rewriter.cpp"