diff --git a/src/duckdb/extension/core_functions/aggregate/holistic/mode.cpp b/src/duckdb/extension/core_functions/aggregate/holistic/mode.cpp index 15b7dd18f..8694c4370 100644 --- a/src/duckdb/extension/core_functions/aggregate/holistic/mode.cpp +++ b/src/duckdb/extension/core_functions/aggregate/holistic/mode.cpp @@ -234,15 +234,12 @@ struct BaseModeFunction { } template - static void Combine(const STATE &source, STATE &target, AggregateInputData &) { + static void Combine(const STATE &source, STATE &target, AggregateInputData &aggr_input_data) { if (!source.frequency_map) { return; } if (!target.frequency_map) { - // Copy - don't destroy! Otherwise windowing will break. - target.frequency_map = new typename STATE::Counts(*source.frequency_map); - target.count = source.count; - return; + target.frequency_map = TYPE_OP::CreateEmpty(aggr_input_data.allocator); } for (auto &val : *source.frequency_map) { auto &i = (*target.frequency_map)[val.first]; diff --git a/src/duckdb/src/common/adbc/adbc.cpp b/src/duckdb/src/common/adbc/adbc.cpp index b461a88c2..63f4c0a29 100644 --- a/src/duckdb/src/common/adbc/adbc.cpp +++ b/src/duckdb/src/common/adbc/adbc.cpp @@ -1320,12 +1320,21 @@ AdbcStatusCode StatementSetOption(struct AdbcStatement *statement, const char *k return ADBC_STATUS_INVALID_ARGUMENT; } +std::string createFilter(const char *input) { + if (input) { + auto quoted = duckdb::KeywordHelper::WriteQuoted(input, '\''); + return quoted; + } + return "'%'"; +} + AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth, const char *catalog, const char *db_schema, const char *table_name, const char **table_type, const char *column_name, struct ArrowArrayStream *out, struct AdbcError *error) { - std::string catalog_filter = catalog ? catalog : "%"; - std::string db_schema_filter = db_schema ? db_schema : "%"; - std::string table_name_filter = table_name ? 
table_name : "%"; + std::string catalog_filter = createFilter(catalog); + std::string db_schema_filter = createFilter(db_schema); + std::string table_name_filter = createFilter(table_name); + std::string column_name_filter = createFilter(column_name); std::string table_type_condition = ""; if (table_type && table_type[0]) { table_type_condition = " AND table_type IN ("; @@ -1341,13 +1350,10 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth if (i > 0) { table_type_condition += ", "; } - table_type_condition += "'"; - table_type_condition += table_type[i]; - table_type_condition += "'"; + table_type_condition += createFilter(table_type[i]); } table_type_condition += ")"; } - std::string column_name_filter = column_name ? column_name : "%"; std::string query; switch (depth) { @@ -1392,7 +1398,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth )[] catalog_db_schemas FROM information_schema.schemata - WHERE catalog_name LIKE '%s' + WHERE catalog_name LIKE %s GROUP BY catalog_name )", catalog_filter); @@ -1405,7 +1411,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth catalog_name, schema_name, FROM information_schema.schemata - WHERE schema_name LIKE '%s' + WHERE schema_name LIKE %s ) SELECT @@ -1448,7 +1454,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth information_schema.schemata LEFT JOIN db_schemas dbs USING (catalog_name, schema_name) - WHERE catalog_name LIKE '%s' + WHERE catalog_name LIKE %s GROUP BY catalog_name )", db_schema_filter, catalog_filter); @@ -1492,7 +1498,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth )[], }) db_schema_tables FROM information_schema.tables - WHERE table_name LIKE '%s'%s + WHERE table_name LIKE %s%s GROUP BY table_catalog, table_schema ), db_schemas AS ( @@ -1503,7 +1509,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth 
FROM information_schema.schemata LEFT JOIN tables USING (catalog_name, schema_name) - WHERE schema_name LIKE '%s' + WHERE schema_name LIKE %s ) SELECT @@ -1516,7 +1522,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth information_schema.schemata LEFT JOIN db_schemas dbs USING (catalog_name, schema_name) - WHERE catalog_name LIKE '%s' + WHERE catalog_name LIKE %s GROUP BY catalog_name )", table_name_filter, table_type_condition, db_schema_filter, catalog_filter); @@ -1551,7 +1557,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth xdbc_is_generatedcolumn: NULL::BOOLEAN, }) table_columns FROM information_schema.columns - WHERE column_name LIKE '%s' + WHERE column_name LIKE %s GROUP BY table_catalog, table_schema, table_name ), constraints AS ( @@ -1580,7 +1586,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth constraint_column_names, list_filter( constraint_column_names, - lambda name: name LIKE '%s' + lambda name: name LIKE %s ) ) GROUP BY database_name, schema_name, table_name @@ -1600,7 +1606,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth USING (table_catalog, table_schema, table_name) LEFT JOIN constraints USING (table_catalog, table_schema, table_name) - WHERE table_name LIKE '%s'%s + WHERE table_name LIKE %s%s GROUP BY table_catalog, table_schema ), db_schemas AS ( @@ -1611,7 +1617,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth FROM information_schema.schemata LEFT JOIN tables USING (catalog_name, schema_name) - WHERE schema_name LIKE '%s' + WHERE schema_name LIKE %s ) SELECT @@ -1624,7 +1630,7 @@ AdbcStatusCode ConnectionGetObjects(struct AdbcConnection *connection, int depth information_schema.schemata LEFT JOIN db_schemas dbs USING (catalog_name, schema_name) - WHERE catalog_name LIKE '%s' + WHERE catalog_name LIKE %s GROUP BY catalog_name )", column_name_filter, 
column_name_filter, table_name_filter, diff --git a/src/duckdb/src/common/arrow/schema_metadata.cpp b/src/duckdb/src/common/arrow/schema_metadata.cpp index 0728df9a1..d408d2bb3 100644 --- a/src/duckdb/src/common/arrow/schema_metadata.cpp +++ b/src/duckdb/src/common/arrow/schema_metadata.cpp @@ -97,13 +97,13 @@ unsafe_unique_array ArrowSchemaMetadata::SerializeMetadata() const { auto metadata_array_ptr = make_unsafe_uniq_array(total_size); auto metadata_ptr = metadata_array_ptr.get(); // 1. number of key-value pairs (int32) - const idx_t map_size = schema_metadata_map.size(); + const int32_t map_size = static_cast(schema_metadata_map.size()); memcpy(metadata_ptr, &map_size, sizeof(int32_t)); metadata_ptr += sizeof(int32_t); // Iterate through each key-value pair in the map for (const auto &pair : schema_metadata_map) { const std::string &key = pair.first; - idx_t key_size = key.size(); + int32_t key_size = static_cast(key.size()); // Length of the key (int32) memcpy(metadata_ptr, &key_size, sizeof(int32_t)); metadata_ptr += sizeof(int32_t); @@ -111,7 +111,7 @@ unsafe_unique_array ArrowSchemaMetadata::SerializeMetadata() const { memcpy(metadata_ptr, key.c_str(), key_size); metadata_ptr += key_size; const std::string &value = pair.second; - const idx_t value_size = value.size(); + const int32_t value_size = static_cast(value.size()); // Length of the value (int32) memcpy(metadata_ptr, &value_size, sizeof(int32_t)); metadata_ptr += sizeof(int32_t); diff --git a/src/duckdb/src/common/enum_util.cpp b/src/duckdb/src/common/enum_util.cpp index d2bb2f772..cfb6de9af 100644 --- a/src/duckdb/src/common/enum_util.cpp +++ b/src/duckdb/src/common/enum_util.cpp @@ -3029,6 +3029,7 @@ const StringUtil::EnumStringLiteral *GetMetricTypeValues() { { static_cast(MetricType::OPTIMIZER_CTE_INLINING), "OPTIMIZER_CTE_INLINING" }, { static_cast(MetricType::OPTIMIZER_COMMON_SUBPLAN), "OPTIMIZER_COMMON_SUBPLAN" }, { static_cast(MetricType::OPTIMIZER_JOIN_ELIMINATION), 
"OPTIMIZER_JOIN_ELIMINATION" }, + { static_cast(MetricType::OPTIMIZER_COUNT_WINDOW_ELIMINATION), "OPTIMIZER_COUNT_WINDOW_ELIMINATION" }, { static_cast(MetricType::ALL_OPTIMIZERS), "ALL_OPTIMIZERS" }, { static_cast(MetricType::CUMULATIVE_OPTIMIZER_TIMING), "CUMULATIVE_OPTIMIZER_TIMING" }, { static_cast(MetricType::PHYSICAL_PLANNER), "PHYSICAL_PLANNER" }, @@ -3043,12 +3044,12 @@ const StringUtil::EnumStringLiteral *GetMetricTypeValues() { template<> const char* EnumUtil::ToChars(MetricType value) { - return StringUtil::EnumToString(GetMetricTypeValues(), 66, "MetricType", static_cast(value)); + return StringUtil::EnumToString(GetMetricTypeValues(), 67, "MetricType", static_cast(value)); } template<> MetricType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetMetricTypeValues(), 66, "MetricType", value)); + return static_cast(StringUtil::StringToEnum(GetMetricTypeValues(), 67, "MetricType", value)); } const StringUtil::EnumStringLiteral *GetMultiFileColumnMappingModeValues() { @@ -3284,19 +3285,20 @@ const StringUtil::EnumStringLiteral *GetOptimizerTypeValues() { { static_cast(OptimizerType::LATE_MATERIALIZATION), "LATE_MATERIALIZATION" }, { static_cast(OptimizerType::CTE_INLINING), "CTE_INLINING" }, { static_cast(OptimizerType::COMMON_SUBPLAN), "COMMON_SUBPLAN" }, - { static_cast(OptimizerType::JOIN_ELIMINATION), "JOIN_ELIMINATION" } + { static_cast(OptimizerType::JOIN_ELIMINATION), "JOIN_ELIMINATION" }, + { static_cast(OptimizerType::COUNT_WINDOW_ELIMINATION), "COUNT_WINDOW_ELIMINATION" } }; return values; } template<> const char* EnumUtil::ToChars(OptimizerType value) { - return StringUtil::EnumToString(GetOptimizerTypeValues(), 33, "OptimizerType", static_cast(value)); + return StringUtil::EnumToString(GetOptimizerTypeValues(), 34, "OptimizerType", static_cast(value)); } template<> OptimizerType EnumUtil::FromString(const char *value) { - return static_cast(StringUtil::StringToEnum(GetOptimizerTypeValues(), 33, 
"OptimizerType", value)); + return static_cast(StringUtil::StringToEnum(GetOptimizerTypeValues(), 34, "OptimizerType", value)); } const StringUtil::EnumStringLiteral *GetOrderByNullTypeValues() { diff --git a/src/duckdb/src/common/enums/optimizer_type.cpp b/src/duckdb/src/common/enums/optimizer_type.cpp index f62af9626..353073f2a 100644 --- a/src/duckdb/src/common/enums/optimizer_type.cpp +++ b/src/duckdb/src/common/enums/optimizer_type.cpp @@ -45,6 +45,7 @@ static const DefaultOptimizerType internal_optimizer_types[] = { {"cte_inlining", OptimizerType::CTE_INLINING}, {"common_subplan", OptimizerType::COMMON_SUBPLAN}, {"join_elimination", OptimizerType::JOIN_ELIMINATION}, + {"count_window_elimination", OptimizerType::COUNT_WINDOW_ELIMINATION}, {nullptr, OptimizerType::INVALID}}; string OptimizerTypeToString(OptimizerType type) { diff --git a/src/duckdb/src/common/types/geometry.cpp b/src/duckdb/src/common/types/geometry.cpp index d565d36f8..cc9bacfda 100644 --- a/src/duckdb/src/common/types/geometry.cpp +++ b/src/duckdb/src/common/types/geometry.cpp @@ -16,7 +16,8 @@ class BlobWriter { public: template void Write(const T &value) { - auto ptr = reinterpret_cast(&value); + auto le_value = BSwapIfBE(value); + auto ptr = reinterpret_cast(&le_value); buffer.insert(buffer.end(), ptr, ptr + sizeof(T)); } @@ -38,16 +39,12 @@ class BlobWriter { if (reserved.offset + sizeof(T) > buffer.size()) { throw InternalException("Write out of bounds in BinaryWriter"); } - auto ptr = reinterpret_cast(&reserved.value); + auto le_value = BSwapIfBE(reserved.value); + auto ptr = reinterpret_cast(&le_value); // We've reserved 0 bytes, so we can safely memcpy memcpy(buffer.data() + reserved.offset, ptr, sizeof(T)); } - void Write(const char *data, size_t size) { - D_ASSERT(data != nullptr); - buffer.insert(buffer.end(), data, data + size); - } - const vector &GetBuffer() const { return buffer; } @@ -70,18 +67,11 @@ class FixedSizeBlobWriter { if (pos + sizeof(T) > end) { throw 
InvalidInputException("Writing beyond end of binary data at position %zu", pos - beg); } - memcpy(pos, &value, sizeof(T)); + auto le_value = BSwapIfBE(value); + memcpy(pos, &le_value, sizeof(T)); pos += sizeof(T); } - void Write(const char *data, size_t size) { - if (pos + size > end) { - throw InvalidInputException("Writing beyond end of binary data at position %zu", pos - beg); - } - memcpy(pos, data, size); - pos += size; - } - size_t GetPosition() const { return static_cast(pos - beg); } @@ -112,17 +102,9 @@ class BlobReader { throw InvalidInputException("Unexpected end of binary data at position %zu", pos - beg); } T value; - if (LE) { - memcpy(&value, pos, sizeof(T)); - pos += sizeof(T); - } else { - char temp[sizeof(T)]; - for (size_t i = 0; i < sizeof(T); ++i) { - temp[i] = pos[sizeof(T) - 1 - i]; - } - memcpy(&value, temp, sizeof(T)); - pos += sizeof(T); - } + memcpy(&value, pos, sizeof(T)); + value = LE ? BSwapIfBE(value) : BSwapIfLE(value); + pos += sizeof(T); return value; } @@ -1060,9 +1042,20 @@ static uint32_t ParseVerticesInternal(BlobReader &reader, GeometryExtent &extent // Issue a single .Reserve() for all vertices, to minimize bounds checking overhead const auto ptr = const_data_ptr_cast(reader.Reserve(vert_count * sizeof(VERTEX_TYPE))); - +#if DUCKDB_IS_BIG_ENDIAN + double be_buffer[sizeof(VERTEX_TYPE)]; + auto be_ptr = reinterpret_cast(be_buffer); +#endif for (uint32_t vert_idx = 0; vert_idx < vert_count; vert_idx++) { +#if DUCKDB_IS_BIG_ENDIAN + auto vert_ofs = vert_idx * sizeof(VERTEX_TYPE); + for (idx_t i = 0; i < sizeof(VERTEX_TYPE) / sizeof(double); ++i) { + be_buffer[i] = LoadLE(ptr + vert_ofs + i * sizeof(double)); + } + VERTEX_TYPE vertex = Load(be_ptr); +#else VERTEX_TYPE vertex = Load(ptr + vert_idx * sizeof(VERTEX_TYPE)); +#endif if (check_nan && vertex.AllNan()) { continue; } diff --git a/src/duckdb/src/common/types/hash.cpp b/src/duckdb/src/common/types/hash.cpp index b8453ac3d..17505081e 100644 --- 
a/src/duckdb/src/common/types/hash.cpp +++ b/src/duckdb/src/common/types/hash.cpp @@ -84,7 +84,7 @@ hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { // Hash/combine in blocks of 8 bytes const auto remainder = len & 7U; for (const auto end = ptr + len - remainder; ptr != end; ptr += 8U) { - h ^= Load(ptr); + h ^= LoadLE(ptr); h *= 0xd6e8feb86659fd93U; } @@ -93,7 +93,7 @@ hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { D_ASSERT(len >= 8); // Load remaining (<8) bytes (with a Load instead of a memcpy) const auto inv_rem = 8U - remainder; - const auto hr = Load(ptr - inv_rem) >> (inv_rem * 8U); + const auto hr = LoadLE(ptr - inv_rem) >> (inv_rem * 8U); h ^= hr; h *= 0xd6e8feb86659fd93U; @@ -101,6 +101,7 @@ hash_t HashBytes(const_data_ptr_t ptr, const idx_t len) noexcept { // Load remaining (<8) bytes (with a memcpy) hash_t hr = 0; memcpy(&hr, ptr, remainder); + hr = BSwapIfBE(hr); h ^= hr; h *= 0xd6e8feb86659fd93U; @@ -122,7 +123,7 @@ hash_t Hash(string_t val) { // Hash/combine the first 8-byte block if (!val.Empty()) { - h ^= Load(const_data_ptr_cast(val.GetPrefix())); + h ^= LoadLE(const_data_ptr_cast(val.GetPrefix())); h *= 0xd6e8feb86659fd93U; } @@ -130,6 +131,7 @@ hash_t Hash(string_t val) { if (val.GetSize() > sizeof(hash_t)) { hash_t hr = 0; memcpy(&hr, const_data_ptr_cast(val.GetPrefix()) + sizeof(hash_t), 4U); + hr = BSwapIfBE(hr); h ^= hr; h *= 0xd6e8feb86659fd93U; diff --git a/src/duckdb/src/common/types/vector.cpp b/src/duckdb/src/common/types/vector.cpp index 7363f952a..f8b2d23e3 100644 --- a/src/duckdb/src/common/types/vector.cpp +++ b/src/duckdb/src/common/types/vector.cpp @@ -1,11 +1,8 @@ #include "duckdb/common/types/vector.hpp" -#include "duckdb/common/algorithm.hpp" #include "duckdb/common/assert.hpp" #include "duckdb/common/exception.hpp" #include "duckdb/common/fsst.hpp" -#include "duckdb/common/operator/comparison_operators.hpp" -#include "duckdb/common/pair.hpp" #include "duckdb/common/printer.hpp" #include 
"duckdb/common/serializer/deserializer.hpp" #include "duckdb/common/serializer/serializer.hpp" @@ -20,11 +17,8 @@ #include "duckdb/common/types/vector_cache.hpp" #include "duckdb/common/uhugeint.hpp" #include "duckdb/common/vector_operations/vector_operations.hpp" -#include "duckdb/function/scalar/nested_functions.hpp" #include "duckdb/storage/buffer/buffer_handle.hpp" -#include "duckdb/storage/string_uncompressed.hpp" #include "duckdb/common/types/uuid.hpp" -#include "fsst.h" #include // strlen() on Solaris namespace duckdb { @@ -746,6 +740,9 @@ Value Vector::GetValueInternal(const Vector &v_p, idx_t index_p) { auto str = reinterpret_cast(data)[index]; return Value::BIT(const_data_ptr_cast(str.GetData()), str.GetSize()); } + case LogicalTypeId::SQLNULL: { + return Value(); + } case LogicalTypeId::MAP: { auto offlen = reinterpret_cast(data)[index]; auto &child_vec = ListVector::GetEntry(*vector); diff --git a/src/duckdb/src/execution/index/art/art.cpp b/src/duckdb/src/execution/index/art/art.cpp index 770db8818..800135f0d 100644 --- a/src/duckdb/src/execution/index/art/art.cpp +++ b/src/duckdb/src/execution/index/art/art.cpp @@ -231,6 +231,9 @@ unique_ptr ART::TryInitializeScan(const Expression &expr, const return InitializeScanSinglePredicate(high_value, high_comparison_type); } +unique_ptr ART::InitializeFullScan() { + return make_uniq(); +} //===--------------------------------------------------------------------===// // ART Keys //===--------------------------------------------------------------------===// @@ -466,11 +469,6 @@ ErrorData ART::Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids, IndexAppe unsafe_vector row_id_keys(row_count); GenerateKeyVectors(arena, chunk, row_ids, keys, row_id_keys); - optional_ptr delete_art; - if (info.delete_index) { - delete_art = info.delete_index->Cast(); - } - auto conflict_type = ARTConflictType::NO_CONFLICT; optional_idx conflict_idx; auto was_empty = !tree.HasMetadata(); @@ -481,7 +479,7 @@ ErrorData 
ART::Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids, IndexAppe continue; } conflict_type = ARTOperator::Insert(arena, *this, tree, keys[i], 0, row_id_keys[i], GateStatus::GATE_NOT_SET, - delete_art, info.append_mode); + DeleteIndexInfo(info.delete_indexes), info.append_mode); if (conflict_type != ARTConflictType::NO_CONFLICT) { conflict_idx = i; break; @@ -569,26 +567,37 @@ void ART::CommitDrop(IndexLock &index_lock) { tree.Clear(); } -void ART::Delete(IndexLock &state, DataChunk &input, Vector &row_ids) { +idx_t ART::TryDelete(IndexLock &state, DataChunk &entries, Vector &row_ids, optional_ptr deleted_sel, + optional_ptr non_deleted_sel) { // FIXME: We could pass a row_count in here, as we sometimes don't have to delete all row IDs in the chunk, // FIXME: but rather all row IDs up to the conflicting row. - auto row_count = input.size(); + auto row_count = entries.size(); DataChunk expr_chunk; expr_chunk.Initialize(Allocator::DefaultAllocator(), logical_types); - ExecuteExpressions(input, expr_chunk); + ExecuteExpressions(entries, expr_chunk); ArenaAllocator allocator(BufferAllocator::Get(db)); unsafe_vector keys(row_count); unsafe_vector row_id_keys(row_count); GenerateKeyVectors(allocator, expr_chunk, row_ids, keys, row_id_keys); + idx_t delete_count = 0; for (idx_t i = 0; i < row_count; i++) { - if (keys[i].Empty()) { - continue; + bool deleted = true; + if (!keys[i].Empty()) { + D_ASSERT(tree.GetGateStatus() == GateStatus::GATE_NOT_SET); + deleted = ARTOperator::Delete(*this, tree, keys[i], row_id_keys[i]); + } + if (deleted) { + if (deleted_sel) { + deleted_sel->set_index(delete_count, i); + } + delete_count++; + } else if (non_deleted_sel) { + idx_t non_delete_count = i - delete_count; + non_deleted_sel->set_index(non_delete_count, i); } - D_ASSERT(tree.GetGateStatus() == GateStatus::GATE_NOT_SET); - ARTOperator::Delete(*this, tree, keys[i], row_id_keys[i]); } if (!tree.HasMetadata()) { @@ -608,11 +617,21 @@ void ART::Delete(IndexLock &state, 
DataChunk &input, Vector &row_ids) { } } #endif + return delete_count; } //===--------------------------------------------------------------------===// // Point and range lookups //===--------------------------------------------------------------------===// +bool ART::FullScan(idx_t max_count, set &row_ids) { + if (!tree.HasMetadata()) { + return true; + } + Iterator it(*this); + it.FindMinimum(tree); + ARTKey empty_key = ARTKey(); + return it.Scan(empty_key, max_count, row_ids, false); +} bool ART::SearchEqual(ARTKey &key, idx_t max_count, set &row_ids) { auto leaf = ARTOperator::Lookup(*this, tree, key, 0); @@ -678,15 +697,20 @@ bool ART::SearchCloseRange(ARTKey &lower_bound, ARTKey &upper_bound, bool left_e bool ART::Scan(IndexScanState &state, const idx_t max_count, set &row_ids) { auto &scan_state = state.Cast(); + if (scan_state.values[0].IsNull()) { + // full scan + lock_guard l(lock); + return FullScan(max_count, row_ids); + } D_ASSERT(scan_state.values[0].type().InternalType() == types[0]); ArenaAllocator arena_allocator(Allocator::Get(db)); auto key = ARTKey::CreateKey(arena_allocator, types[0], scan_state.values[0]); auto max_len = MAX_KEY_LEN * prefix_count; key.VerifyKeyLength(max_len); + lock_guard l(lock); if (scan_state.values[1].IsNull()) { // Single predicate. - lock_guard l(lock); switch (scan_state.expressions[0]) { case ExpressionType::COMPARE_EQUAL: return SearchEqual(key, max_count, row_ids); @@ -704,7 +728,6 @@ bool ART::Scan(IndexScanState &state, const idx_t max_count, set &row_ids } // Two predicates. 
- lock_guard l(lock); D_ASSERT(scan_state.values[1].type().InternalType() == types[0]); auto upper_bound = ARTKey::CreateKey(arena_allocator, types[0], scan_state.values[1]); upper_bound.VerifyKeyLength(max_len); @@ -758,39 +781,36 @@ string ART::GenerateConstraintErrorMessage(VerifyExistenceType verify_type, cons } } -void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr delete_art, ConflictManager &manager, +void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, DeleteIndexInfo delete_index_info, ConflictManager &manager, optional_idx &conflict_idx, idx_t i) { - // Fast path, the leaf is inlined, and the delete ART does not exist. - if (leaf.GetType() == NType::LEAF_INLINED && !delete_art) { - if (manager.AddHit(i, leaf.GetRowId())) { - conflict_idx = i; - } - return; - } - - // Get the delete_leaf. - // All leaves in the delete ART are inlined. - unsafe_optional_ptr deleted_leaf; - if (delete_art) { - deleted_leaf = ARTOperator::Lookup(*delete_art, delete_art->tree, key, 0); - } - - // The leaf is inlined, and there is no deleted leaf with the same key. - if (leaf.GetType() == NType::LEAF_INLINED && !deleted_leaf) { - if (manager.AddHit(i, leaf.GetRowId())) { - conflict_idx = i; + // Get the set of deleted row ids for this value if we have any delete indexes + vector deleted_row_ids; + if (delete_index_info.delete_indexes) { + for (auto &index : *delete_index_info.delete_indexes) { + auto &delete_art = index.get().Cast(); + auto deleted_leaf = ARTOperator::Lookup(delete_art, delete_art.tree, key, 0); + if (!deleted_leaf) { + continue; + } + // All leaves in the delete ART are inlined. + if (deleted_leaf->GetType() != NType::LEAF_INLINED) { + throw InternalException("Non-inlined leaf?"); + } + auto deleted_row_id = deleted_leaf->GetRowId(); + deleted_row_ids.push_back(deleted_row_id); } - return; } - // The leaf is inlined, and the same key exists in the delete ART. 
- if (leaf.GetType() == NType::LEAF_INLINED && deleted_leaf) { - D_ASSERT(deleted_leaf->GetType() == NType::LEAF_INLINED); - auto deleted_row_id = deleted_leaf->GetRowId(); + if (leaf.GetType() == NType::LEAF_INLINED) { auto this_row_id = leaf.GetRowId(); - - if (deleted_row_id == this_row_id) { - return; + if (!deleted_row_ids.empty()) { + // The leaf is inlined, and the same key exists in the delete ART. + // check if the row-id matches - if it does there is no conflict + for (auto &deleted_row_id : deleted_row_ids) { + if (deleted_row_id == this_row_id) { + return; + } + } } if (manager.AddHit(i, this_row_id)) { @@ -803,7 +823,7 @@ void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr dele // Up to here, the above code paths work implicitly for FKs, as the leaf is inlined. // FIXME: proper foreign key + delete ART support. if (index_constraint_type == IndexConstraintType::FOREIGN) { - D_ASSERT(!deleted_leaf); + D_ASSERT(deleted_row_ids.empty()); // We don't handle FK conflicts in UPSERT, so the row ID should not matter. 
if (manager.AddHit(i, MAX_ROW_ID)) { conflict_idx = i; @@ -821,11 +841,12 @@ void ART::VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr dele throw InternalException("VerifyLeaf expects exactly two row IDs to be scanned"); } - if (deleted_leaf) { - auto deleted_row_id = deleted_leaf->GetRowId(); + if (!deleted_row_ids.empty()) { for (const auto row_id : row_ids) { - if (deleted_row_id == row_id) { - return; + for (auto deleted_row_id : deleted_row_ids) { + if (deleted_row_id == row_id) { + return; + } } } } @@ -850,11 +871,6 @@ void ART::VerifyConstraint(DataChunk &chunk, IndexAppendInfo &info, ConflictMana unsafe_vector keys(expr_chunk.size()); GenerateKeys<>(arena_allocator, expr_chunk, keys); - optional_ptr delete_art; - if (info.delete_index) { - delete_art = info.delete_index->Cast(); - } - optional_idx conflict_idx; for (idx_t i = 0; !conflict_idx.IsValid() && i < chunk.size(); i++) { if (keys[i].Empty()) { @@ -868,7 +884,7 @@ void ART::VerifyConstraint(DataChunk &chunk, IndexAppendInfo &info, ConflictMana if (!leaf) { continue; } - VerifyLeaf(*leaf, keys[i], delete_art, manager, conflict_idx, i); + VerifyLeaf(*leaf, keys[i], DeleteIndexInfo(info.delete_indexes), manager, conflict_idx, i); } manager.FinishLookup(); @@ -882,6 +898,7 @@ void ART::VerifyConstraint(DataChunk &chunk, IndexAppendInfo &info, ConflictMana } string ART::GetConstraintViolationMessage(VerifyExistenceType verify_type, idx_t failed_index, DataChunk &input) { + lock_guard l(lock); auto key_name = GenerateErrorKeyName(input, failed_index); auto exception_msg = GenerateConstraintErrorMessage(verify_type, key_name); return exception_msg; @@ -1053,13 +1070,19 @@ idx_t ART::GetInMemorySize(IndexLock &index_lock) { return in_memory_size; } -bool ART::RequiresTransactionality() const { +bool ART::SupportsDeltaIndexes() const { return true; } -unique_ptr ART::CreateEmptyCopy(const string &name_prefix, IndexConstraintType constraint_type) const { - return make_uniq(name_prefix + name, 
constraint_type, GetColumnIds(), table_io_manager, unbound_expressions, - db); +unique_ptr ART::CreateDeltaIndex(DeltaIndexType target_delta_index) const { + auto constraint_type = index_constraint_type; + if (target_delta_index == DeltaIndexType::DELETED_ROWS_IN_USE) { + // deleted_rows_in_use allows duplicates regardless of whether or not the main index is a unique index or not + constraint_type = IndexConstraintType::NONE; + } + auto result = make_uniq(name, constraint_type, GetColumnIds(), table_io_manager, unbound_expressions, db); + result->delta_index_type = target_delta_index; + return std::move(result); } //===-------------------------------------------------------------------===// @@ -1190,11 +1213,7 @@ bool ART::MergeIndexes(IndexLock &state, BoundIndex &other_index) { if (other_art.owns_data) { if (prefix_count != other_art.prefix_count) { - // this ART uses the deprecated form and the other one does not - transform the other one prior to merging - if (prefix_count != Prefix::DEPRECATED_COUNT) { - throw InternalException("Failed to merge ARTs - other ART is deprecated but this one is not"); - } - other_art.TransformToDeprecated(); + throw InternalException("Failed to merge ARTs - prefix count does not match"); } if (tree.HasMetadata()) { // Fully deserialize other_index, and traverse it to increment its buffer IDs. diff --git a/src/duckdb/src/execution/index/art/art_builder.cpp b/src/duckdb/src/execution/index/art/art_builder.cpp index 92b719ea8..f6721a943 100644 --- a/src/duckdb/src/execution/index/art/art_builder.cpp +++ b/src/duckdb/src/execution/index/art/art_builder.cpp @@ -49,7 +49,7 @@ ARTConflictType ARTBuilder::Build() { // We cannot iterate into the nested leaf with the builder // because row IDs are not sorted. 
for (idx_t i = entry.start; i < entry.start + row_id_count; i++) { - ARTOperator::Insert(arena, art, ref, row_ids[i], 0, row_ids[i], GateStatus::GATE_SET, nullptr, + ARTOperator::Insert(arena, art, ref, row_ids[i], 0, row_ids[i], GateStatus::GATE_SET, DeleteIndexInfo(), IndexAppendMode::DEFAULT); } ref.get().SetGateStatus(GateStatus::GATE_SET); diff --git a/src/duckdb/src/execution/index/art/art_index.cpp b/src/duckdb/src/execution/index/art/art_index.cpp index 05e97f847..c4ba2c504 100644 --- a/src/duckdb/src/execution/index/art/art_index.cpp +++ b/src/duckdb/src/execution/index/art/art_index.cpp @@ -100,8 +100,9 @@ void ARTBuildSinkUnsorted(IndexBuildSinkInput &input, DataChunk &key_chunk, Data // Insert each key and its corresponding row ID. for (idx_t i = 0; i < row_count; i++) { auto status = art.tree.GetGateStatus(); - auto conflict_type = ARTOperator::Insert(l_state.arena_allocator, art, art.tree, l_state.keys[i], 0, - l_state.row_ids[i], status, nullptr, IndexAppendMode::DEFAULT); + auto conflict_type = + ARTOperator::Insert(l_state.arena_allocator, art, art.tree, l_state.keys[i], 0, l_state.row_ids[i], status, + DeleteIndexInfo(), IndexAppendMode::DEFAULT); D_ASSERT(conflict_type != ARTConflictType::TRANSACTION); if (conflict_type == ARTConflictType::CONSTRAINT) { throw ConstraintException("Data contains duplicates on indexed column(s)"); diff --git a/src/duckdb/src/execution/index/art/art_merger.cpp b/src/duckdb/src/execution/index/art/art_merger.cpp index f2fe5a8d8..3b562374c 100644 --- a/src/duckdb/src/execution/index/art/art_merger.cpp +++ b/src/duckdb/src/execution/index/art/art_merger.cpp @@ -107,7 +107,7 @@ ARTConflictType ARTMerger::MergeNodeAndInlined(NodeEntry &entry) { // We fall back to the ART insertion code. 
auto row_id_key = ARTKey::CreateARTKey(arena, entry.right.GetRowId()); return ARTOperator::Insert(arena, art, entry.left, row_id_key, entry.depth, row_id_key, GateStatus::GATE_SET, - nullptr, IndexAppendMode::DEFAULT); + DeleteIndexInfo(), IndexAppendMode::DEFAULT); } array_ptr ARTMerger::GetBytes(Node &leaf) { diff --git a/src/duckdb/src/execution/index/art/leaf.cpp b/src/duckdb/src/execution/index/art/leaf.cpp index 3f1190216..4895f996b 100644 --- a/src/duckdb/src/execution/index/art/leaf.cpp +++ b/src/duckdb/src/execution/index/art/leaf.cpp @@ -88,8 +88,8 @@ void Leaf::TransformToNested(ART &art, Node &node) { auto &leaf = Node::Ref(art, leaf_ref, LEAF); for (uint8_t i = 0; i < leaf.count; i++) { auto row_id = ARTKey::CreateARTKey(arena, leaf.row_ids[i]); - auto conflict_type = ARTOperator::Insert(arena, art, root, row_id, 0, row_id, GateStatus::GATE_SET, nullptr, - IndexAppendMode::INSERT_DUPLICATES); + auto conflict_type = ARTOperator::Insert(arena, art, root, row_id, 0, row_id, GateStatus::GATE_SET, + DeleteIndexInfo(), IndexAppendMode::INSERT_DUPLICATES); if (conflict_type != ARTConflictType::NO_CONFLICT) { throw InternalException("invalid conflict type in Leaf::TransformToNested"); } diff --git a/src/duckdb/src/execution/index/bound_index.cpp b/src/duckdb/src/execution/index/bound_index.cpp index c60886c31..cb10bc1f1 100644 --- a/src/duckdb/src/execution/index/bound_index.cpp +++ b/src/duckdb/src/execution/index/bound_index.cpp @@ -64,12 +64,33 @@ void BoundIndex::CommitDrop() { CommitDrop(index_lock); } +idx_t BoundIndex::TryDelete(DataChunk &entries, Vector &row_identifiers, optional_ptr deleted_sel, + optional_ptr non_deleted_sel) { + IndexLock state; + InitializeLock(state); + return TryDelete(state, entries, row_identifiers, deleted_sel, non_deleted_sel); +} + +idx_t BoundIndex::TryDelete(IndexLock &state, DataChunk &entries, Vector &row_identifiers, + optional_ptr deleted_sel, optional_ptr non_deleted_sel) { + throw InternalException("TryDelete not 
implemented"); +} + void BoundIndex::Delete(DataChunk &entries, Vector &row_identifiers) { IndexLock state; InitializeLock(state); Delete(state, entries, row_identifiers); } +void BoundIndex::Delete(IndexLock &state, DataChunk &entries, Vector &row_identifiers) { + TryDelete(state, entries, row_identifiers); + // FIXME: enable this + // if (deleted_rows != entries.size()) { + // throw InvalidInputException("Failed to delete all rows from index. Only deleted %d out of %d rows.\nChunk: %s", + // deleted_rows, entries.size(), entries.ToString()); + // } +} + ErrorData BoundIndex::Insert(IndexLock &l, DataChunk &chunk, Vector &row_ids, IndexAppendInfo &info) { throw NotImplementedException("this implementation of Insert does not exist."); } @@ -142,13 +163,12 @@ bool BoundIndex::IndexIsUpdated(const vector &column_ids_p) const return false; } -bool BoundIndex::RequiresTransactionality() const { +bool BoundIndex::SupportsDeltaIndexes() const { return false; } -unique_ptr BoundIndex::CreateEmptyCopy(const string &name_prefix, - IndexConstraintType constraint_type) const { - throw InternalException("BoundIndex::CreateEmptyCopy is not supported for this index type"); +unique_ptr BoundIndex::CreateDeltaIndex(DeltaIndexType delta_index_type) const { + throw InternalException("BoundIndex::CreateDeltaIndex is not supported for this index type"); } IndexStorageInfo BoundIndex::SerializeToDisk(QueryContext context, const case_insensitive_map_t &options) { diff --git a/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp b/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp index 0f465620c..097994855 100644 --- a/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp +++ b/src/duckdb/src/execution/operator/aggregate/physical_streaming_window.cpp @@ -17,11 +17,12 @@ PhysicalStreamingWindow::PhysicalStreamingWindow(PhysicalPlan &physical_plan, ve class StreamingWindowGlobalState : public GlobalOperatorState { public: - 
StreamingWindowGlobalState() : row_number(1) { - } + explicit StreamingWindowGlobalState(ClientContext &client); //! The next row number. std::atomic row_number; + //! The single local state + unique_ptr local_state; }; class StreamingWindowState : public OperatorState { @@ -348,6 +349,10 @@ class StreamingWindowState : public OperatorState { SelectionVector sel; }; +StreamingWindowGlobalState::StreamingWindowGlobalState(ClientContext &client) : row_number(1) { + local_state = make_uniq(client); +} + bool PhysicalStreamingWindow::IsStreamingFunction(ClientContext &context, unique_ptr &expr) { auto &wexpr = expr->Cast(); if (!wexpr.partitions.empty() || !wexpr.orders.empty() || !wexpr.arg_orders.empty() || @@ -392,12 +397,8 @@ bool PhysicalStreamingWindow::IsStreamingFunction(ClientContext &context, unique } } -unique_ptr PhysicalStreamingWindow::GetGlobalOperatorState(ClientContext &context) const { - return make_uniq(); -} - -unique_ptr PhysicalStreamingWindow::GetOperatorState(ExecutionContext &context) const { - return make_uniq(context.client); +unique_ptr PhysicalStreamingWindow::GetGlobalOperatorState(ClientContext &client) const { + return make_uniq(client); } void StreamingWindowState::AggregateState::Execute(ExecutionContext &context, DataChunk &input, Vector &result) { @@ -505,9 +506,9 @@ void StreamingWindowState::AggregateState::Execute(ExecutionContext &context, Da } void PhysicalStreamingWindow::ExecuteFunctions(ExecutionContext &context, DataChunk &output, DataChunk &delayed, - GlobalOperatorState &gstate_p, OperatorState &state_p) const { + GlobalOperatorState &gstate_p) const { auto &gstate = gstate_p.Cast(); - auto &state = state_p.Cast(); + auto &state = gstate.local_state->Cast(); // Compute window functions const idx_t count = output.size(); @@ -624,9 +625,9 @@ void PhysicalStreamingWindow::ExecuteFunctions(ExecutionContext &context, DataCh } void PhysicalStreamingWindow::ExecuteInput(ExecutionContext &context, DataChunk &delayed, DataChunk 
&input, - DataChunk &output, GlobalOperatorState &gstate_p, - OperatorState &state_p) const { - auto &state = state_p.Cast(); + DataChunk &output, GlobalOperatorState &gstate_p) const { + auto &gstate = gstate_p.Cast(); + auto &state = gstate.local_state->Cast(); // Put payload columns in place for (idx_t col_idx = 0; col_idx < input.data.size(); col_idx++) { @@ -642,13 +643,13 @@ void PhysicalStreamingWindow::ExecuteInput(ExecutionContext &context, DataChunk } output.SetCardinality(count); - ExecuteFunctions(context, output, state.delayed, gstate_p, state_p); + ExecuteFunctions(context, output, state.delayed, gstate_p); } void PhysicalStreamingWindow::ExecuteShifted(ExecutionContext &context, DataChunk &delayed, DataChunk &input, - DataChunk &output, GlobalOperatorState &gstate_p, - OperatorState &state_p) const { - auto &state = state_p.Cast(); + DataChunk &output, GlobalOperatorState &gstate_p) const { + auto &gstate = gstate_p.Cast(); + auto &state = gstate.local_state->Cast(); auto &shifted = state.shifted; idx_t out = output.size(); @@ -670,12 +671,11 @@ void PhysicalStreamingWindow::ExecuteShifted(ExecutionContext &context, DataChun } delayed.SetCardinality(delay - out + in); - ExecuteFunctions(context, output, delayed, gstate_p, state_p); + ExecuteFunctions(context, output, delayed, gstate_p); } void PhysicalStreamingWindow::ExecuteDelayed(ExecutionContext &context, DataChunk &delayed, DataChunk &input, - DataChunk &output, GlobalOperatorState &gstate_p, - OperatorState &state_p) const { + DataChunk &output, GlobalOperatorState &gstate_p) const { // Put payload columns in place for (idx_t col_idx = 0; col_idx < delayed.data.size(); col_idx++) { output.data[col_idx].Reference(delayed.data[col_idx]); @@ -683,12 +683,13 @@ void PhysicalStreamingWindow::ExecuteDelayed(ExecutionContext &context, DataChun idx_t count = delayed.size(); output.SetCardinality(count); - ExecuteFunctions(context, output, input, gstate_p, state_p); + ExecuteFunctions(context, output, 
input, gstate_p); } OperatorResultType PhysicalStreamingWindow::Execute(ExecutionContext &context, DataChunk &input, DataChunk &output, - GlobalOperatorState &gstate_p, OperatorState &state_p) const { - auto &state = state_p.Cast(); + GlobalOperatorState &gstate_p, OperatorState &) const { + auto &gstate = gstate_p.Cast(); + auto &state = gstate.local_state->Cast(); if (!state.initialized) { state.Initialize(context.client, input, select_list); } @@ -709,27 +710,27 @@ OperatorResultType PhysicalStreamingWindow::Execute(ExecutionContext &context, D // If we can't consume all of the delayed values, // we need to split them instead of referencing them all output.SetCardinality(input.size()); - ExecuteShifted(context, delayed, input, output, gstate_p, state_p); + ExecuteShifted(context, delayed, input, output, gstate_p); // We delayed the unused input so ask for more return OperatorResultType::NEED_MORE_INPUT; } else if (delayed.size()) { // We have enough delayed rows so flush them - ExecuteDelayed(context, delayed, input, output, gstate_p, state_p); + ExecuteDelayed(context, delayed, input, output, gstate_p); // Defer resetting delayed as it may be referenced. delayed.SetCardinality(0); // Come back to process the input return OperatorResultType::HAVE_MORE_OUTPUT; } else { // No delayed rows, so emit what we can and delay the rest. 
- ExecuteInput(context, delayed, input, output, gstate_p, state_p); + ExecuteInput(context, delayed, input, output, gstate_p); return OperatorResultType::NEED_MORE_INPUT; } } OperatorFinalizeResultType PhysicalStreamingWindow::FinalExecute(ExecutionContext &context, DataChunk &output, - GlobalOperatorState &gstate_p, - OperatorState &state_p) const { - auto &state = state_p.Cast(); + GlobalOperatorState &gstate_p, OperatorState &) const { + auto &gstate = gstate_p.Cast(); + auto &state = gstate.local_state->Cast(); if (state.initialized && state.lead_count) { auto &delayed = state.delayed; @@ -740,10 +741,10 @@ OperatorFinalizeResultType PhysicalStreamingWindow::FinalExecute(ExecutionContex if (output.GetCapacity() < delayed.size()) { // More than one output buffer was delayed, so shift in what we can output.SetCardinality(output.GetCapacity()); - ExecuteShifted(context, delayed, input, output, gstate_p, state_p); + ExecuteShifted(context, delayed, input, output, gstate_p); return OperatorFinalizeResultType::HAVE_MORE_OUTPUT; } - ExecuteDelayed(context, delayed, input, output, gstate_p, state_p); + ExecuteDelayed(context, delayed, input, output, gstate_p); } return OperatorFinalizeResultType::FINISHED; diff --git a/src/duckdb/src/execution/operator/persistent/physical_merge_into.cpp b/src/duckdb/src/execution/operator/persistent/physical_merge_into.cpp index 672a9b861..796d3cec8 100644 --- a/src/duckdb/src/execution/operator/persistent/physical_merge_into.cpp +++ b/src/duckdb/src/execution/operator/persistent/physical_merge_into.cpp @@ -472,10 +472,17 @@ SourceResultType PhysicalMergeInto::GetDataInternal(ExecutionContext &context, D // no action to scan from continue; } + // found a good one + break; + } + if (lstate.index < actions.size()) { + auto &action = *actions[lstate.index]; + auto &child_gstate = *gstate.global_states[lstate.index]; auto &child_lstate = *lstate.local_states[lstate.index]; OperatorSourceInput source_input {child_gstate, child_lstate, 
input.interrupt_state}; + lstate.scan_chunk.Reset(); auto result = action.op->GetData(context, lstate.scan_chunk, source_input); if (lstate.scan_chunk.size() > 0) { // construct the result chunk @@ -504,9 +511,13 @@ SourceResultType PhysicalMergeInto::GetDataInternal(ExecutionContext &context, D if (result != SourceResultType::FINISHED) { return result; - } - if (chunk.size() != 0) { - return SourceResultType::HAVE_MORE_OUTPUT; + } else { + lstate.index++; + if (lstate.index < actions.size()) { + return SourceResultType::HAVE_MORE_OUTPUT; + } else { + return SourceResultType::FINISHED; + } } } return SourceResultType::FINISHED; diff --git a/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp b/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp index eee2d4a8d..c638f45b8 100644 --- a/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp +++ b/src/duckdb/src/execution/operator/schema/physical_create_art_index.cpp @@ -91,8 +91,9 @@ SinkResultType PhysicalCreateARTIndex::SinkUnsorted(OperatorSinkInput &input) co // Insert each key and its corresponding row ID. 
for (idx_t i = 0; i < row_count; i++) { auto status = art.tree.GetGateStatus(); - auto conflict_type = ARTOperator::Insert(l_state.arena_allocator, art, art.tree, l_state.keys[i], 0, - l_state.row_ids[i], status, nullptr, IndexAppendMode::DEFAULT); + auto conflict_type = + ARTOperator::Insert(l_state.arena_allocator, art, art.tree, l_state.keys[i], 0, l_state.row_ids[i], status, + DeleteIndexInfo(), IndexAppendMode::DEFAULT); D_ASSERT(conflict_type != ARTConflictType::TRANSACTION); if (conflict_type == ARTConflictType::CONSTRAINT) { throw ConstraintException("Data contains duplicates on indexed column(s)"); diff --git a/src/duckdb/src/function/scalar/string/concat.cpp b/src/duckdb/src/function/scalar/string/concat.cpp index 97a74cebe..7d1bab8be 100644 --- a/src/duckdb/src/function/scalar/string/concat.cpp +++ b/src/duckdb/src/function/scalar/string/concat.cpp @@ -1,15 +1,10 @@ #include "duckdb/common/exception.hpp" -#include "duckdb/common/types/date.hpp" +#include "duckdb/common/types/vector.hpp" #include "duckdb/common/vector_operations/binary_executor.hpp" -#include "duckdb/common/vector_operations/vector_operations.hpp" -#include "duckdb/function/scalar/nested_functions.hpp" #include "duckdb/function/scalar/string_functions.hpp" -#include "duckdb/planner/expression/bound_cast_expression.hpp" #include "duckdb/planner/expression/bound_function_expression.hpp" -#include - namespace duckdb { namespace { @@ -209,6 +204,7 @@ void ConcatFunction(DataChunk &args, ExpressionState &state, Vector &result) { auto &func_expr = state.expr.Cast(); auto &info = func_expr.bind_info->Cast(); if (info.return_type.id() == LogicalTypeId::SQLNULL) { + result.SetVectorType(VectorType::CONSTANT_VECTOR); return; } if (info.return_type.id() == LogicalTypeId::LIST) { diff --git a/src/duckdb/src/function/scalar/string/md5.cpp b/src/duckdb/src/function/scalar/string/md5.cpp index 94a035ee7..8c9894977 100644 --- a/src/duckdb/src/function/scalar/string/md5.cpp +++ 
b/src/duckdb/src/function/scalar/string/md5.cpp @@ -28,7 +28,7 @@ struct MD5Number128Operator { MD5Context context; context.Add(input); context.Finish(digest); - return *reinterpret_cast(digest); + return BSwapIfBE(*reinterpret_cast(digest)); } }; diff --git a/src/duckdb/src/function/table/arrow.cpp b/src/duckdb/src/function/table/arrow.cpp index f2f932768..f7b5e3ff7 100644 --- a/src/duckdb/src/function/table/arrow.cpp +++ b/src/duckdb/src/function/table/arrow.cpp @@ -245,10 +245,10 @@ static bool CanPushdown(const ArrowType &type) { case LogicalTypeId::UBIGINT: case LogicalTypeId::FLOAT: case LogicalTypeId::DOUBLE: - case LogicalTypeId::VARCHAR: return true; + case LogicalTypeId::VARCHAR: case LogicalTypeId::BLOB: - // PyArrow doesn't support binary view filters yet + // PyArrow doesn't support binary and string view filters yet return type.GetTypeInfo().GetSizeType() != ArrowVariableSizeType::VIEW; case LogicalTypeId::DECIMAL: { switch (duck_type.InternalType()) { diff --git a/src/duckdb/src/function/table/arrow_conversion.cpp b/src/duckdb/src/function/table/arrow_conversion.cpp index 511a272dc..5eba8026b 100644 --- a/src/duckdb/src/function/table/arrow_conversion.cpp +++ b/src/duckdb/src/function/table/arrow_conversion.cpp @@ -55,7 +55,7 @@ static void GetValidityMask(ValidityMask &mask, ArrowArray &array, idx_t chunk_o if (array.null_count != 0 && array.n_buffers > 0 && array.buffers[0]) { auto bit_offset = GetEffectiveOffset(array, parent_offset, chunk_offset, nested_offset); mask.EnsureWritable(); -#if STANDARD_VECTOR_SIZE > 64 +#if STANDARD_VECTOR_SIZE > 64 && !DUCKDB_IS_BIG_ENDIAN auto n_bitmask_bytes = (size + 8 - 1) / 8; if (bit_offset % 8 == 0) { //! 
just memcpy nullmask diff --git a/src/duckdb/src/function/table/table_scan.cpp b/src/duckdb/src/function/table/table_scan.cpp index 563189942..596860bea 100644 --- a/src/duckdb/src/function/table/table_scan.cpp +++ b/src/duckdb/src/function/table/table_scan.cpp @@ -387,7 +387,7 @@ unique_ptr DuckTableScanInitGlobal(ClientContext &cont g_state->state.local_state.reorderer = make_uniq(*bind_data.order_options); } - storage.InitializeParallelScan(context, g_state->state); + storage.InitializeParallelScan(context, g_state->state, input.column_indexes); if (!input.CanRemoveFilterColumns()) { return std::move(g_state); } diff --git a/src/duckdb/src/function/table/version/pragma_version.cpp b/src/duckdb/src/function/table/version/pragma_version.cpp index 12500c8f1..ae94dca3b 100644 --- a/src/duckdb/src/function/table/version/pragma_version.cpp +++ b/src/duckdb/src/function/table/version/pragma_version.cpp @@ -1,5 +1,5 @@ #ifndef DUCKDB_PATCH_VERSION -#define DUCKDB_PATCH_VERSION "0-dev4892" +#define DUCKDB_PATCH_VERSION "0-dev5016" #endif #ifndef DUCKDB_MINOR_VERSION #define DUCKDB_MINOR_VERSION 5 @@ -8,10 +8,10 @@ #define DUCKDB_MAJOR_VERSION 1 #endif #ifndef DUCKDB_VERSION -#define DUCKDB_VERSION "v1.5.0-dev4892" +#define DUCKDB_VERSION "v1.5.0-dev5016" #endif #ifndef DUCKDB_SOURCE_ID -#define DUCKDB_SOURCE_ID "c46a01b579" +#define DUCKDB_SOURCE_ID "b5761ca54c" #endif #include "duckdb/function/table/system_functions.hpp" #include "duckdb/main/database.hpp" diff --git a/src/duckdb/src/function/window/window_value_function.cpp b/src/duckdb/src/function/window/window_value_function.cpp index adf60be11..276b99fad 100644 --- a/src/duckdb/src/function/window/window_value_function.cpp +++ b/src/duckdb/src/function/window/window_value_function.cpp @@ -465,7 +465,11 @@ void WindowFirstValueExecutor::EvaluateInternal(ExecutionContext &context, DataC if (frame_width) { const auto first_idx = gvstate.value_tree->SelectNth(frames, 0); D_ASSERT(first_idx.second == 0); - 
cursor.CopyCell(0, first_idx.first, result, i); + if (first_idx.first < cursor.Count()) { + cursor.CopyCell(0, first_idx.first, result, i); + } else { + FlatVector::SetNull(result, i, true); + } } else { FlatVector::SetNull(result, i, true); } @@ -519,7 +523,7 @@ void WindowLastValueExecutor::EvaluateInternal(ExecutionContext &context, DataCh n -= last_idx.second; last_idx = gvstate.value_tree->SelectNth(frames, n); } - if (last_idx.second) { + if (last_idx.second || last_idx.first >= cursor.Count()) { // No last value - give up. FlatVector::SetNull(result, i, true); } else { @@ -589,7 +593,7 @@ void WindowNthValueExecutor::EvaluateInternal(ExecutionContext &context, DataChu if (n < frame_width) { const auto nth_index = gvstate.value_tree->SelectNth(frames, n - 1); - if (nth_index.second) { + if (nth_index.second || nth_index.first >= cursor.Count()) { // Past end of frame FlatVector::SetNull(result, i, true); } else { diff --git a/src/duckdb/src/include/duckdb/common/bswap.hpp b/src/duckdb/src/include/duckdb/common/bswap.hpp index a1434da73..db82f237b 100644 --- a/src/duckdb/src/include/duckdb/common/bswap.hpp +++ b/src/duckdb/src/include/duckdb/common/bswap.hpp @@ -8,8 +8,8 @@ #pragma once -#include "duckdb/common/common.hpp" -#include "duckdb/common/numeric_utils.hpp" +#include "duckdb/common/hugeint.hpp" +#include "duckdb/common/uhugeint.hpp" #include diff --git a/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp b/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp index 82208c895..38c3b94c2 100644 --- a/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp +++ b/src/duckdb/src/include/duckdb/common/enums/metric_type.hpp @@ -93,6 +93,7 @@ enum class MetricType : uint8_t { OPTIMIZER_CTE_INLINING, OPTIMIZER_COMMON_SUBPLAN, OPTIMIZER_JOIN_ELIMINATION, + OPTIMIZER_COUNT_WINDOW_ELIMINATION, // PhaseTiming metrics ALL_OPTIMIZERS, CUMULATIVE_OPTIMIZER_TIMING, @@ -128,7 +129,7 @@ class MetricsUtils { static constexpr uint8_t END_OPERATOR = 
static_cast(MetricType::OPERATOR_TYPE); static constexpr uint8_t START_OPTIMIZER = static_cast(MetricType::OPTIMIZER_EXPRESSION_REWRITER); - static constexpr uint8_t END_OPTIMIZER = static_cast(MetricType::OPTIMIZER_JOIN_ELIMINATION); + static constexpr uint8_t END_OPTIMIZER = static_cast(MetricType::OPTIMIZER_COUNT_WINDOW_ELIMINATION); static constexpr uint8_t START_PHASE_TIMING = static_cast(MetricType::ALL_OPTIMIZERS); static constexpr uint8_t END_PHASE_TIMING = static_cast(MetricType::PLANNER_BINDING); diff --git a/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp b/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp index 7f6864aac..8d2928af4 100644 --- a/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp +++ b/src/duckdb/src/include/duckdb/common/enums/optimizer_type.hpp @@ -46,7 +46,8 @@ enum class OptimizerType : uint32_t { LATE_MATERIALIZATION, CTE_INLINING, COMMON_SUBPLAN, - JOIN_ELIMINATION + JOIN_ELIMINATION, + COUNT_WINDOW_ELIMINATION }; string OptimizerTypeToString(OptimizerType type); diff --git a/src/duckdb/src/include/duckdb/common/helper.hpp b/src/duckdb/src/include/duckdb/common/helper.hpp index 118bada1e..13a66aa68 100644 --- a/src/duckdb/src/include/duckdb/common/helper.hpp +++ b/src/duckdb/src/include/duckdb/common/helper.hpp @@ -8,6 +8,7 @@ #pragma once +#include "duckdb/common/bswap.hpp" #include "duckdb/common/constants.hpp" #include "duckdb/common/shared_ptr.hpp" #include @@ -220,6 +221,11 @@ const T Load(const_data_ptr_t ptr) { return ret; } +template +const T LoadLE(const_data_ptr_t ptr) { + return BSwapIfBE(Load(ptr)); +} + template void Store(const T &val, data_ptr_t ptr) { memcpy(ptr, (void *)&val, sizeof(val)); // NOLINT diff --git a/src/duckdb/src/include/duckdb/common/http_util.hpp b/src/duckdb/src/include/duckdb/common/http_util.hpp index 11fc26c48..a493647b3 100644 --- a/src/duckdb/src/include/duckdb/common/http_util.hpp +++ b/src/duckdb/src/include/duckdb/common/http_util.hpp @@ -139,7 +139,7 @@ 
struct BaseRequest { const string &url; string path; string proto_host_port; - const HTTPHeaders &headers; + HTTPHeaders headers; HTTPParams ¶ms; //! Whether or not to return failed requests (instead of throwing) bool try_request = false; @@ -157,6 +157,14 @@ struct BaseRequest { const TARGET &Cast() const { return reinterpret_cast(*this); } + + static HTTPHeaders MergeHeaders(const HTTPHeaders &headers, HTTPParams ¶ms) { + HTTPHeaders result = headers; + for (const auto &header : params.extra_headers) { + result.Insert(header.first, header.second); + } + return result; + } }; struct GetRequestInfo : public BaseRequest { diff --git a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp index 1a14b24c1..09e10aa80 100644 --- a/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/art.hpp @@ -25,6 +25,15 @@ class FixedSizeAllocator; struct ARTIndexScanState; +struct DeleteIndexInfo { + DeleteIndexInfo() : delete_indexes(nullptr) { + } + explicit DeleteIndexInfo(vector> &delete_indexes) : delete_indexes(delete_indexes) { + } + + optional_ptr>> delete_indexes; +}; + class ART : public BoundIndex { public: friend class Leaf; @@ -67,6 +76,7 @@ class ART : public BoundIndex { public: //! Try to initialize a scan on the ART with the given expression and filter. unique_ptr TryInitializeScan(const Expression &expr, const Expression &filter_expr); + unique_ptr InitializeFullScan(); //! Perform a lookup on the ART, fetching up to max_count row IDs. //! If all row IDs were fetched, it return true, else false. bool Scan(IndexScanState &state, idx_t max_count, set &row_ids); @@ -85,7 +95,8 @@ class ART : public BoundIndex { void VerifyAppend(DataChunk &chunk, IndexAppendInfo &info, optional_ptr manager) override; //! Delete a chunk from the ART. 
- void Delete(IndexLock &lock, DataChunk &entries, Vector &row_ids) override; + idx_t TryDelete(IndexLock &state, DataChunk &entries, Vector &row_identifiers, + optional_ptr deleted_sel, optional_ptr non_deleted_sel) override; //! Drop the ART. void CommitDrop(IndexLock &index_lock) override; @@ -107,9 +118,8 @@ class ART : public BoundIndex { //! Returns the in-memory usage of the ART. idx_t GetInMemorySize(IndexLock &index_lock) override; - bool RequiresTransactionality() const override; - unique_ptr CreateEmptyCopy(const string &name_prefix, - IndexConstraintType constraint_type) const override; + bool SupportsDeltaIndexes() const override; + unique_ptr CreateDeltaIndex(DeltaIndexType delta_index_type) const override; //! ART key generation. template @@ -136,6 +146,7 @@ class ART : public BoundIndex { //! The number of bytes fitting in the prefix. uint8_t prefix_count; + bool FullScan(idx_t max_count, set &row_ids); bool SearchEqual(ARTKey &key, idx_t max_count, set &row_ids); bool SearchGreater(ARTKey &key, bool equal, idx_t max_count, set &row_ids); bool SearchLess(ARTKey &upper_bound, bool equal, idx_t max_count, set &row_ids); @@ -144,7 +155,7 @@ class ART : public BoundIndex { string GenerateErrorKeyName(DataChunk &input, idx_t row); string GenerateConstraintErrorMessage(VerifyExistenceType verify_type, const string &key_name); - void VerifyLeaf(const Node &leaf, const ARTKey &key, optional_ptr delete_art, ConflictManager &manager, + void VerifyLeaf(const Node &leaf, const ARTKey &key, DeleteIndexInfo delete_index_info, ConflictManager &manager, optional_idx &conflict_idx, idx_t i); void VerifyConstraint(DataChunk &chunk, IndexAppendInfo &info, ConflictManager &manager) override; string GetConstraintViolationMessage(VerifyExistenceType verify_type, idx_t failed_index, diff --git a/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp b/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp index 0efadc991..7d66df318 100644 --- 
a/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/art/art_operator.hpp @@ -120,7 +120,7 @@ class ARTOperator { //! Starts at depth (in the key). //! status indicates if the insert happens inside a gate or not. static ARTConflictType Insert(ArenaAllocator &arena, ART &art, Node &node, const ARTKey &key, idx_t depth, - const ARTKey &row_id, GateStatus status, optional_ptr delete_art, + const ARTKey &row_id, GateStatus status, DeleteIndexInfo delete_index_info, const IndexAppendMode append_mode) { reference active_node_ref(node); reference active_key_ref(key); @@ -164,7 +164,8 @@ class ARTOperator { const auto type = active_node.GetType(); switch (type) { case NType::LEAF_INLINED: { - return InsertIntoInlined(arena, art, active_node, key, row_id, depth, status, delete_art, append_mode); + return InsertIntoInlined(arena, art, active_node, key, row_id, depth, status, delete_index_info, + append_mode); } case NType::LEAF: { Leaf::TransformToNested(art, active_node); @@ -217,7 +218,7 @@ class ARTOperator { //! Delete a key and its row ID. //! Assumes that deletion starts at the root of the tree. - static void Delete(ART &art, Node &node, const ARTKey &key, const ARTKey &row_id) { + static bool Delete(ART &art, Node &node, const ARTKey &key, const ARTKey &row_id) { // If we need to compress a Node4 into a one-way node, // then we need the previous prefix before the Node4. Node empty; @@ -246,12 +247,12 @@ class ARTOperator { switch (type) { case NType::LEAF_INLINED: { if (current.get().GetRowId() != row_id.GetRowId()) { - return; + return false; } if (!passed_node && parent.get().GetType() == NType::PREFIX) { // The tree contains exactly one element with a prefix. Node::FreeTree(art, parent); - return; + return true; } if (parent.get().GetType() == NType::PREFIX) { // We might have to compress: @@ -260,10 +261,10 @@ class ARTOperator { // Then, when we delete that child, we also free it. 
Node::DeleteChild(art, grandparent, greatgrandparent, current_key.get()[grandparent_depth], status, row_id); - return; + return true; } Node::DeleteChild(art, parent, grandparent, current_key.get()[parent_depth], status, row_id); - return; + return true; } case NType::LEAF: { D_ASSERT(status == GateStatus::GATE_NOT_SET); @@ -282,7 +283,7 @@ class ARTOperator { Prefix prefix(art, current, true); for (idx_t i = 0; i < prefix.data[art.PrefixCount()]; i++) { if (prefix.data[i] != current_key.get()[depth]) { - return; + return false; } depth++; } @@ -307,7 +308,7 @@ class ARTOperator { auto child = current.get().GetChildMutable(art, current_key.get()[depth]); if (!child) { // No child at the byte: nothing to erase. - return; + return false; } current = *child; @@ -321,16 +322,17 @@ class ARTOperator { if (current.get().HasByte(art, byte)) { Node::DeleteChild(art, current, parent, byte, status, row_id); } - return; + return true; } } } + return false; } private: static ARTConflictType InsertIntoInlined(ArenaAllocator &arena, ART &art, Node &node, const ARTKey &key, const ARTKey &row_id, const idx_t depth, const GateStatus status, - optional_ptr delete_art, const IndexAppendMode append_mode) { + DeleteIndexInfo delete_index_info, const IndexAppendMode append_mode) { Node row_id_node; Leaf::New(row_id_node, row_id.GetRowId()); @@ -339,31 +341,33 @@ class ARTOperator { return ARTConflictType::NO_CONFLICT; } - if (!delete_art) { - if (append_mode == IndexAppendMode::IGNORE_DUPLICATES) { + if (delete_index_info.delete_indexes) { + // Lookup in the delete_art. + for (auto &delete_index : *delete_index_info.delete_indexes) { + auto &delete_art = delete_index.get().Cast(); + auto delete_leaf = Lookup(delete_art, delete_art.tree, key, 0); + if (!delete_leaf) { + continue; + } + + // The row ID has changed. + // Thus, the local index has a newer (local) row ID, and this is a constraint violation. 
+ D_ASSERT(delete_leaf->GetType() == NType::LEAF_INLINED); + auto deleted_row_id = delete_leaf->GetRowId(); + auto this_row_id = node.GetRowId(); + if (deleted_row_id != this_row_id) { + continue; + } + + // The deleted key and its row ID match the current key and its row ID. + Leaf::MergeInlined(arena, art, node, row_id_node, status, depth); return ARTConflictType::NO_CONFLICT; } - return ARTConflictType::CONSTRAINT; - } - - // Lookup in the delete_art. - auto delete_leaf = Lookup(*delete_art, delete_art->tree, key, 0); - if (!delete_leaf) { - return ARTConflictType::CONSTRAINT; } - - // The row ID has changed. - // Thus, the local index has a newer (local) row ID, and this is a constraint violation. - D_ASSERT(delete_leaf->GetType() == NType::LEAF_INLINED); - auto deleted_row_id = delete_leaf->GetRowId(); - auto this_row_id = node.GetRowId(); - if (deleted_row_id != this_row_id) { - return ARTConflictType::CONSTRAINT; + if (append_mode == IndexAppendMode::IGNORE_DUPLICATES) { + return ARTConflictType::NO_CONFLICT; } - - // The deleted key and its row ID match the current key and its row ID. 
- Leaf::MergeInlined(arena, art, node, row_id_node, status, depth); - return ARTConflictType::NO_CONFLICT; + return ARTConflictType::CONSTRAINT; } static void InsertIntoNode(ART &art, Node &node, const ARTKey &key, const ARTKey &row_id, const idx_t depth, diff --git a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp index ae6daa0cd..9bc40582c 100644 --- a/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp +++ b/src/duckdb/src/include/duckdb/execution/index/bound_index.hpp @@ -33,13 +33,27 @@ enum class IndexAppendMode : uint8_t { DEFAULT = 0, IGNORE_DUPLICATES = 1, INSER class IndexAppendInfo { public: - IndexAppendInfo() : append_mode(IndexAppendMode::DEFAULT), delete_index(nullptr) {}; - IndexAppendInfo(const IndexAppendMode append_mode, const optional_ptr delete_index) - : append_mode(append_mode), delete_index(delete_index) {}; + IndexAppendInfo() : append_mode(IndexAppendMode::DEFAULT) { + } + IndexAppendInfo(const IndexAppendMode append_mode, optional_ptr delete_index) + : append_mode(append_mode) { + if (delete_index) { + delete_indexes.push_back(*delete_index); + } + } public: IndexAppendMode append_mode; - optional_ptr delete_index; + vector> delete_indexes; +}; + +enum class DeltaIndexType { + NONE, + LOCAL_APPEND, + LOCAL_DELETE, + ADDED_DURING_CHECKPOINT, + REMOVED_DURING_CHECKPOINT, + DELETED_ROWS_IN_USE }; //! The index is an abstract base class that serves as the basis for indexes @@ -73,6 +87,9 @@ class BoundIndex : public Index { //! and we use them when binding the unbound expressions. vector> unbound_expressions; + //! Whether or not this is a delta index - and if it is, which type it is + DeltaIndexType delta_index_type = DeltaIndexType::NONE; + public: bool IsBound() const override { return true; @@ -108,8 +125,18 @@ class BoundIndex : public Index { virtual void CommitDrop(IndexLock &index_lock) = 0; //! 
Deletes all data from the index void CommitDrop() override; - //! Delete a chunk of entries from the index. The lock obtained from InitializeLock must be held - virtual void Delete(IndexLock &state, DataChunk &entries, Vector &row_identifiers) = 0; + //! Delete a chunk of entries from the index. The lock obtained from InitializeLock must be held. + //! Returns the amount of rows successfully deleted from the index. + //! If either deleted_sel or non_deleted_sel are provided the exact rows that were (not) deleted are written there + virtual idx_t TryDelete(IndexLock &state, DataChunk &entries, Vector &row_identifiers, + optional_ptr deleted_sel = nullptr, + optional_ptr non_deleted_sel = nullptr); + //! Obtains a lock and calls TryDelete while holding that lock + idx_t TryDelete(DataChunk &entries, Vector &row_identifiers, optional_ptr deleted_sel = nullptr, + optional_ptr non_deleted_sel = nullptr); + //! Delete a chunk of entries from the index. The lock obtained from InitializeLock must be held. + //! Throws an error if not all rows are deleted + virtual void Delete(IndexLock &state, DataChunk &entries, Vector &row_identifiers); //! Obtains a lock and calls Delete while holding that lock void Delete(DataChunk &entries, Vector &row_identifiers); @@ -130,12 +157,11 @@ class BoundIndex : public Index { //! Obtains a lock and calls Vacuum while holding that lock. void Vacuum(); - //! Whether or not the index requires transactionality. If true we will create delta indexes - virtual bool RequiresTransactionality() const; - //! Creates an empty copy of the index with the same schema, etc, but a different constraint type - //! This will only be called if RequiresTransactionality returns true - virtual unique_ptr CreateEmptyCopy(const string &name_prefix, - IndexConstraintType constraint_type) const; + //! Whether or not the index supports the creation of delta indexes + virtual bool SupportsDeltaIndexes() const; + //! 
Creates a delta index - an empty copy of the index with the same schema, etc + //! This will only be called if SupportsDeltaIndexes returns true + virtual unique_ptr CreateDeltaIndex(DeltaIndexType delta_index_type) const; //! Returns the in-memory usage of the index. The lock obtained from InitializeLock must be held virtual idx_t GetInMemorySize(IndexLock &state) = 0; diff --git a/src/duckdb/src/include/duckdb/execution/merge_sort_tree.hpp b/src/duckdb/src/include/duckdb/execution/merge_sort_tree.hpp index d17e6944f..1ab568fb5 100644 --- a/src/duckdb/src/include/duckdb/execution/merge_sort_tree.hpp +++ b/src/duckdb/src/include/duckdb/execution/merge_sort_tree.hpp @@ -86,6 +86,8 @@ struct MergeSortTree { using RunElements = array; using Games = array; + static constexpr ElementType INVALID = std::numeric_limits::max(); + struct CompareElements { explicit CompareElements(const CMP &cmp) : cmp(cmp) { } @@ -122,6 +124,9 @@ struct MergeSortTree { pair SelectNth(const SubFrames &frames, idx_t n) const; inline ElementType NthElement(idx_t i) const { + if (tree.empty() || tree.front().first.empty()) { + return INVALID; + } return tree.front().first[i]; } diff --git a/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_streaming_window.hpp b/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_streaming_window.hpp index 52e32d3fd..dfca3fa5e 100644 --- a/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_streaming_window.hpp +++ b/src/duckdb/src/include/duckdb/execution/operator/aggregate/physical_streaming_window.hpp @@ -30,7 +30,6 @@ class PhysicalStreamingWindow : public PhysicalOperator { public: unique_ptr GetGlobalOperatorState(ClientContext &context) const override; - unique_ptr GetOperatorState(ExecutionContext &context) const override; OperatorResultType Execute(ExecutionContext &context, DataChunk &input, DataChunk &chunk, GlobalOperatorState &gstate, OperatorState &state) const override; @@ -50,13 +49,13 @@ class 
PhysicalStreamingWindow : public PhysicalOperator { private: void ExecuteFunctions(ExecutionContext &context, DataChunk &chunk, DataChunk &delayed, - GlobalOperatorState &gstate_p, OperatorState &state_p) const; + GlobalOperatorState &gstate_p) const; void ExecuteInput(ExecutionContext &context, DataChunk &delayed, DataChunk &input, DataChunk &chunk, - GlobalOperatorState &gstate, OperatorState &state) const; + GlobalOperatorState &gstate) const; void ExecuteDelayed(ExecutionContext &context, DataChunk &delayed, DataChunk &input, DataChunk &chunk, - GlobalOperatorState &gstate, OperatorState &state) const; + GlobalOperatorState &gstate) const; void ExecuteShifted(ExecutionContext &context, DataChunk &delayed, DataChunk &input, DataChunk &chunk, - GlobalOperatorState &gstate, OperatorState &state) const; + GlobalOperatorState &gstate) const; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/function/window/window_collection.hpp b/src/duckdb/src/include/duckdb/function/window/window_collection.hpp index 2dae27c6a..7f828c1d9 100644 --- a/src/duckdb/src/include/duckdb/function/window/window_collection.hpp +++ b/src/duckdb/src/include/duckdb/function/window/window_collection.hpp @@ -86,6 +86,10 @@ class WindowCursor { WindowCursor(const WindowCollection &paged, column_t col_idx); WindowCursor(const WindowCollection &paged, vector column_ids); + //! The row count of the paged collection + idx_t Count() const { + return paged.size(); + } //! Is the scan in range? 
inline bool RowIsVisible(idx_t row_idx) const { return (row_idx < state.next_row_index && state.current_row_index <= row_idx); diff --git a/src/duckdb/src/include/duckdb/optimizer/count_window_elimination.hpp b/src/duckdb/src/include/duckdb/optimizer/count_window_elimination.hpp new file mode 100644 index 000000000..dc9c0a300 --- /dev/null +++ b/src/duckdb/src/include/duckdb/optimizer/count_window_elimination.hpp @@ -0,0 +1,29 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// duckdb/optimizer/count_window_elimination.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb/optimizer/optimizer.hpp" + +#include "duckdb/optimizer/column_binding_replacer.hpp" + +namespace duckdb { + +class WindowSelfJoinOptimizer { +public: + explicit WindowSelfJoinOptimizer(Optimizer &optimizer); + + unique_ptr Optimize(unique_ptr op); + +private: + unique_ptr OptimizeInternal(unique_ptr op, ColumnBindingReplacer &replacer); + + Optimizer &optimizer; +}; + +} // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/parser/transformer.hpp b/src/duckdb/src/include/duckdb/parser/transformer.hpp index 2afa96722..8a8ba2ecb 100644 --- a/src/duckdb/src/include/duckdb/parser/transformer.hpp +++ b/src/duckdb/src/include/duckdb/parser/transformer.hpp @@ -113,7 +113,7 @@ class Transformer { unique_ptr TransformSelectStmt(duckdb_libpgquery::PGSelectStmt &select, bool is_select = true); unique_ptr TransformSelectStmt(duckdb_libpgquery::PGNode &node, bool is_select = true); //! Transform a Postgres T_AlterStmt node into a AlterStatement - unique_ptr TransformAlter(duckdb_libpgquery::PGAlterTableStmt &stmt); + unique_ptr TransformAlter(duckdb_libpgquery::PGAlterTableStmt &stmt); //! Transform a Postgres T_AlterDatabaseStmt node into a AlterStatement unique_ptr TransformAlterDatabase(duckdb_libpgquery::PGAlterDatabaseStmt &stmt); //! 
Transform a Postgres duckdb_libpgquery::T_PGRenameStmt node into a RenameStatement diff --git a/src/duckdb/src/include/duckdb/storage/data_table.hpp b/src/duckdb/src/include/duckdb/storage/data_table.hpp index 3b37752cd..43ccf7350 100644 --- a/src/duckdb/src/include/duckdb/storage/data_table.hpp +++ b/src/duckdb/src/include/duckdb/storage/data_table.hpp @@ -85,7 +85,8 @@ class DataTable : public enable_shared_from_this { //! Returns the maximum amount of threads that should be assigned to scan this data table idx_t MaxThreads(ClientContext &context) const; - void InitializeParallelScan(ClientContext &context, ParallelTableScanState &state); + void InitializeParallelScan(ClientContext &context, ParallelTableScanState &state, + const vector &column_indexes); idx_t NextParallelScan(ClientContext &context, ParallelTableScanState &state, TableScanState &scan_state); //! Scans up to STANDARD_VECTOR_SIZE elements from the table starting @@ -97,6 +98,8 @@ class DataTable : public enable_shared_from_this { //! Fetch data from the specific row identifiers from the base table void Fetch(DuckTransaction &transaction, DataChunk &result, const vector &column_ids, const Vector &row_ids, idx_t fetch_count, ColumnFetchState &state); + void FetchCommitted(DataChunk &result, const vector &column_ids, const Vector &row_identifiers, + idx_t fetch_count, ColumnFetchState &state); //! Returns true, if the transaction can fetch the row ID. bool CanFetch(DuckTransaction &transaction, const row_t row_id); @@ -197,7 +200,7 @@ class DataTable : public enable_shared_from_this { void RevertIndexAppend(TableAppendState &state, DataChunk &chunk, Vector &row_identifiers); //! 
Remove the row identifiers from all the indexes of the table void RemoveFromIndexes(const QueryContext &context, Vector &row_identifiers, idx_t count, - IndexRemovalType removal_type); + IndexRemovalType removal_type, optional_idx checkpoint_id = optional_idx()); void SetAsMainTable() { this->version = DataTableVersion::MAIN_TABLE; diff --git a/src/duckdb/src/include/duckdb/storage/storage_manager.hpp b/src/duckdb/src/include/duckdb/storage/storage_manager.hpp index b7fa7ccec..261c4fb99 100644 --- a/src/duckdb/src/include/duckdb/storage/storage_manager.hpp +++ b/src/duckdb/src/include/duckdb/storage/storage_manager.hpp @@ -77,7 +77,7 @@ class StorageManager { //! Write that we started a checkpoint to the WAL if there is one - returns whether or not there is a WAL bool WALStartCheckpoint(MetaBlockPointer meta_block, CheckpointOptions &options); //! Finishes a checkpoint - void WALFinishCheckpoint(); + void WALFinishCheckpoint(lock_guard &wal_lock); // Get the WAL lock unique_ptr> GetWALLock(); diff --git a/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp b/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp index db959f4cd..45e8c33c2 100644 --- a/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/chunk_info.hpp @@ -51,9 +51,9 @@ class ChunkInfo { virtual bool Cleanup(transaction_t lowest_transaction) const; virtual string ToString(idx_t max_count) const = 0; - virtual bool HasDeletes() const = 0; + virtual bool HasDeletes(transaction_t transaction_id = MAX_TRANSACTION_ID) const = 0; - virtual void Write(WriteStream &writer) const; + virtual void Write(WriteStream &writer, transaction_t transaction_id) const; static unique_ptr Read(FixedSizeAllocator &allocator, ReadStream &reader); public: @@ -95,9 +95,9 @@ class ChunkConstantInfo : public ChunkInfo { bool Cleanup(transaction_t lowest_transaction) const override; string ToString(idx_t max_count) const override; - bool HasDeletes() const 
override; + bool HasDeletes(transaction_t transaction_id = MAX_TRANSACTION_ID) const override; - void Write(WriteStream &writer) const override; + void Write(WriteStream &writer, transaction_t transaction_id) const override; static unique_ptr Read(ReadStream &reader); private: @@ -137,12 +137,12 @@ class ChunkVectorInfo : public ChunkInfo { idx_t Delete(transaction_t transaction_id, row_t rows[], idx_t count); void CommitDelete(transaction_t commit_id, const DeleteInfo &info); - bool HasDeletes() const override; + bool HasDeletes(transaction_t transaction_id = MAX_TRANSACTION_ID) const override; bool AnyDeleted() const; bool HasConstantInsertionId() const; transaction_t ConstantInsertId() const; - void Write(WriteStream &writer) const override; + void Write(WriteStream &writer, transaction_t transaction_id) const override; static unique_ptr Read(FixedSizeAllocator &allocator, ReadStream &reader); private: diff --git a/src/duckdb/src/include/duckdb/storage/table/delete_state.hpp b/src/duckdb/src/include/duckdb/storage/table/delete_state.hpp index d3a05eeeb..662a26ebd 100644 --- a/src/duckdb/src/include/duckdb/storage/table/delete_state.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/delete_state.hpp @@ -18,6 +18,7 @@ struct TableDeleteState { bool has_delete_constraints = false; DataChunk verify_chunk; vector col_ids; + shared_ptr checkpoint_lock; }; } // namespace duckdb diff --git a/src/duckdb/src/include/duckdb/storage/table/row_group.hpp b/src/duckdb/src/include/duckdb/storage/table/row_group.hpp index 836759f0a..2d1f2424d 100644 --- a/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/row_group.hpp @@ -219,7 +219,7 @@ class RowGroup : public SegmentBase { static FilterPropagateResult CheckRowIdFilter(const TableFilter &filter, idx_t beg_row, idx_t end_row); idx_t GetColumnCount() const; - vector CheckpointDeletes(MetadataManager &manager); + vector CheckpointDeletes(RowGroupWriter &writer); private: 
optional_ptr GetVersionInfo(); diff --git a/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp b/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp index 28756838d..cd944ff1e 100644 --- a/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp @@ -82,6 +82,7 @@ class RowGroupCollection { void Fetch(TransactionData transaction, DataChunk &result, const vector &column_ids, const Vector &row_identifiers, idx_t fetch_count, ColumnFetchState &state); + //! Returns true, if the row group can fetch the row id for the transaction. bool CanFetch(TransactionData, const row_t row_id); @@ -103,7 +104,7 @@ class RowGroupCollection { bool IsPersistent() const; void RemoveFromIndexes(const QueryContext &context, TableIndexList &indexes, Vector &row_identifiers, idx_t count, - IndexRemovalType removal_type); + IndexRemovalType removal_type, optional_idx active_checkpoint = optional_idx()); idx_t Delete(TransactionData transaction, DataTable &table, row_t *ids, idx_t count); void Update(TransactionData transaction, DataTable &table, row_t *ids, const vector &column_ids, diff --git a/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp b/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp index 8856ce57b..ab761179c 100644 --- a/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/row_version_manager.hpp @@ -25,9 +25,6 @@ class RowVersionManager { public: explicit RowVersionManager(BufferManager &buffer_manager) noexcept; - FixedSizeAllocator &GetAllocator() { - return allocator; - } idx_t GetCommittedDeletedCount(idx_t count); bool ShouldCheckpointRowGroup(transaction_t checkpoint_id, idx_t count); @@ -44,7 +41,7 @@ class RowVersionManager { idx_t DeleteRows(idx_t vector_idx, transaction_t transaction_id, row_t rows[], idx_t count); void CommitDelete(idx_t 
vector_idx, transaction_t commit_id, const DeleteInfo &info); - vector Checkpoint(MetadataManager &manager); + vector Checkpoint(RowGroupWriter &writer); static shared_ptr Deserialize(MetaBlockPointer delete_pointer, MetadataManager &manager); bool HasUnserializedChanges(); @@ -54,10 +51,13 @@ class RowVersionManager { mutex version_lock; FixedSizeAllocator allocator; vector> vector_info; - bool has_unserialized_changes; + optional_idx uncheckpointed_delete_commit; vector storage_pointers; private: + FixedSizeAllocator &GetAllocator() { + return allocator; + } optional_ptr GetChunkInfo(idx_t vector_idx); ChunkVectorInfo &GetVectorInfo(idx_t vector_idx); void FillVectorInfo(idx_t vector_idx); diff --git a/src/duckdb/src/include/duckdb/storage/table/scan_state.hpp b/src/duckdb/src/include/duckdb/storage/table/scan_state.hpp index 6308dcfa8..7f6228819 100644 --- a/src/duckdb/src/include/duckdb/storage/table/scan_state.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/scan_state.hpp @@ -144,7 +144,15 @@ struct ColumnScanState { idx_t GetPositionInSegment() const; }; +enum class FetchType { + //! Verify if each row is valid for the transaction prior to fetching + TRANSACTIONAL_FETCH, + // Force fetch the row, regardless of it if is valid for the transaction or not + FORCE_FETCH +}; + struct ColumnFetchState { + FetchType fetch_type = FetchType::TRANSACTIONAL_FETCH; //! The query context for this fetch QueryContext context; //! The set of pinned block handles for this set of fetches diff --git a/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp b/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp index bb26084df..b21e90c08 100644 --- a/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp +++ b/src/duckdb/src/include/duckdb/storage/table/table_index_list.hpp @@ -34,6 +34,8 @@ struct IndexEntry { unique_ptr deleted_rows_in_use; //! 
Data that was added to the index during the last checkpoint unique_ptr added_data_during_checkpoint; + //! Data that was removed from the index during the last checkpoint + unique_ptr removed_data_during_checkpoint; //! The last checkpoint index that was written with this index optional_idx last_written_checkpoint; }; @@ -98,7 +100,7 @@ class TableIndexList { index_entries = std::move(other.index_entries); } //! Merge any changes added to deltas during a checkpoint back into the main indexes - void MergeCheckpointDeltas(transaction_t checkpoint_id); + void MergeCheckpointDeltas(DataTable &storage, transaction_t checkpoint_id); //! Returns true, if all indexes //! Find the foreign key matching the keys. optional_ptr FindForeignKeyIndex(const vector &fk_keys, const ForeignKeyType fk_type); diff --git a/src/duckdb/src/include/duckdb/transaction/cleanup_state.hpp b/src/duckdb/src/include/duckdb/transaction/cleanup_state.hpp index 0d91a9dc0..5998cbe54 100644 --- a/src/duckdb/src/include/duckdb/transaction/cleanup_state.hpp +++ b/src/duckdb/src/include/duckdb/transaction/cleanup_state.hpp @@ -23,14 +23,13 @@ struct UpdateInfo; class CleanupState { public: - explicit CleanupState(const QueryContext &context, transaction_t lowest_active_transaction, + explicit CleanupState(DuckTransaction &transaction, transaction_t lowest_active_transaction, ActiveTransactionState transaction_state); public: void CleanupEntry(UndoFlags type, data_ptr_t data); private: - QueryContext context; //! 
Lowest active transaction transaction_t lowest_active_transaction; ActiveTransactionState transaction_state; diff --git a/src/duckdb/src/include/duckdb/transaction/commit_state.hpp b/src/duckdb/src/include/duckdb/transaction/commit_state.hpp index 3ad975925..97ea0be77 100644 --- a/src/duckdb/src/include/duckdb/transaction/commit_state.hpp +++ b/src/duckdb/src/include/duckdb/transaction/commit_state.hpp @@ -29,7 +29,7 @@ enum class CommitMode { COMMIT, REVERT_COMMIT }; struct IndexDataRemover { public: - explicit IndexDataRemover(QueryContext context, IndexRemovalType removal_type); + explicit IndexDataRemover(DuckTransaction &transaction, QueryContext context, IndexRemovalType removal_type); void PushDelete(DeleteInfo &info); void Verify(); @@ -38,6 +38,7 @@ struct IndexDataRemover { void Flush(DataTable &table, row_t *row_numbers, idx_t count); private: + DuckTransaction &transaction; // data for index cleanup QueryContext context; //! While committing, we remove data from any indexes that was deleted diff --git a/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp b/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp index f0169b3b8..361424053 100644 --- a/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp +++ b/src/duckdb/src/include/duckdb/transaction/duck_transaction.hpp @@ -89,9 +89,6 @@ class DuckTransaction : public Transaction { } unique_ptr TryGetCheckpointLock(); - bool HasWriteLock() const { - return write_lock.get(); - } //! Get a shared lock on a table shared_ptr SharedLockTable(DataTableInfo &info); @@ -105,8 +102,10 @@ class DuckTransaction : public Transaction { UndoBuffer undo_buffer; //! The set of uncommitted appends for the transaction unique_ptr storage; - //! Write lock - unique_ptr write_lock; + //! Lock that prevents checkpoints from starting + unique_ptr checkpoint_lock; + //! Lock that prevents vacuums from starting + unique_ptr vacuum_lock; //! Lock for accessing sequence_usage mutex sequence_lock; //! 
Map of all sequences that were used during the transaction and the value they had in this transaction diff --git a/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp b/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp index 7466fd254..66cac1943 100644 --- a/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp +++ b/src/duckdb/src/include/duckdb/transaction/duck_transaction_manager.hpp @@ -82,6 +82,8 @@ class DuckTransactionManager : public TransactionManager { //! Try to obtain an exclusive checkpoint lock unique_ptr TryGetCheckpointLock(); unique_ptr TryUpgradeCheckpointLock(StorageLockKey &lock); + unique_ptr SharedVacuumLock(); + unique_ptr TryGetVacuumLock(); //! Returns the current version of the catalog (incremented whenever anything changes, not stored between restarts) DUCKDB_API idx_t GetCatalogVersion(Transaction &transaction); @@ -135,6 +137,8 @@ class DuckTransactionManager : public TransactionManager { mutex transaction_lock; //! The checkpoint lock StorageLock checkpoint_lock; + //! The vacuum lock - necessary to start vacuum operations + StorageLock vacuum_lock; //! 
Lock necessary to start transactions only - used by FORCE CHECKPOINT to prevent new transactions from starting mutex start_transaction_lock; diff --git a/src/duckdb/src/main/http/http_util.cpp b/src/duckdb/src/main/http/http_util.cpp index fb5a9491f..f562dd8cc 100644 --- a/src/duckdb/src/main/http/http_util.cpp +++ b/src/duckdb/src/main/http/http_util.cpp @@ -123,7 +123,7 @@ unique_ptr HTTPUtil::Request(BaseRequest &request, unique_ptr table_map; + Optimizer &optimizer; + + void VisitOperator(LogicalOperator &op) override { + // Rebind definitions + if (op.type == LogicalOperatorType::LOGICAL_GET) { + auto &get = op.Cast(); + auto new_idx = optimizer.binder.GenerateTableIndex(); + table_map[get.table_index] = new_idx; + get.table_index = new_idx; + } + if (op.type == LogicalOperatorType::LOGICAL_PROJECTION) { + auto &proj = op.Cast(); + auto new_idx = optimizer.binder.GenerateTableIndex(); + table_map[proj.table_index] = new_idx; + proj.table_index = new_idx; + } + if (op.type == LogicalOperatorType::LOGICAL_AGGREGATE_AND_GROUP_BY) { + auto &agg = op.Cast(); + auto new_agg_idx = optimizer.binder.GenerateTableIndex(); + auto new_grp_idx = optimizer.binder.GenerateTableIndex(); + table_map[agg.aggregate_index] = new_agg_idx; + table_map[agg.group_index] = new_grp_idx; + agg.aggregate_index = new_agg_idx; + agg.group_index = new_grp_idx; + } + // TODO: Handle other operators defining tables if needed + // But Get/Projection/Aggregate are most common in subplans. 
+ + VisitOperatorChildren(op); + VisitOperatorExpressions(op); + } + + void VisitExpression(unique_ptr *expression) override { + auto &expr = *expression; + if (expr->GetExpressionClass() == ExpressionClass::BOUND_COLUMN_REF) { + auto &bound = expr->Cast(); + if (table_map.count(bound.binding.table_index)) { + bound.binding.table_index = table_map[bound.binding.table_index]; + } + } + VisitExpressionChildren(**expression); + } +}; + +WindowSelfJoinOptimizer::WindowSelfJoinOptimizer(Optimizer &optimizer) : optimizer(optimizer) { +} + +unique_ptr WindowSelfJoinOptimizer::Optimize(unique_ptr op) { + ColumnBindingReplacer replacer; + op = OptimizeInternal(std::move(op), replacer); + if (!replacer.replacement_bindings.empty()) { + replacer.VisitOperator(*op); + } + return op; +} + +unique_ptr WindowSelfJoinOptimizer::OptimizeInternal(unique_ptr op, + ColumnBindingReplacer &replacer) { + if (op->type == LogicalOperatorType::LOGICAL_FILTER) { + auto &filter = op->Cast(); + if (filter.expressions.size() == 1 && filter.children.size() == 1 && + filter.children[0]->type == LogicalOperatorType::LOGICAL_WINDOW) { + auto &window = filter.children[0]->Cast(); + + // Check recursively + window.children[0] = OptimizeInternal(std::move(window.children[0]), replacer); + + if (window.expressions.size() != 1) { + return op; + } + if (window.expressions[0]->type != ExpressionType::WINDOW_AGGREGATE) { + return op; + } + + // We can only optimize if there is a single window function equality comparison + // Check matches + if (filter.expressions[0]->type != ExpressionType::COMPARE_EQUAL) { + return op; + } + auto &comp = filter.expressions[0]->Cast(); + if (comp.left->type != ExpressionType::BOUND_COLUMN_REF) { + return op; + } + auto &col_ref = comp.left->Cast(); + auto t_idx = col_ref.binding.table_index; + auto c_idx = col_ref.binding.column_index; + auto w_idx = window.window_index; + + if (t_idx != w_idx || c_idx != 0) { + return op; + } + + // Check right side is constant 1 + if 
(comp.right->type != ExpressionType::VALUE_CONSTANT) { + return op; + } + auto &const_expr = comp.right->Cast(); + if (!const_expr.value.type().IsIntegral()) { + return op; + } + if (const_expr.value.GetValue() != 1) { + return op; + } + + auto &w_expr = window.expressions[0]->Cast(); + if (w_expr.aggregate->name != "count" && w_expr.aggregate->name != "count_star") { + return op; + } + if (!w_expr.orders.empty()) { + return op; + } + if (w_expr.partitions.empty()) { + return op; + } + + // --- Transformation --- + + auto original_child = std::move(window.children[0]); + auto copy_child = original_child->Copy(optimizer.context); + + // Rebind copy_child to avoid duplicate table indices + CountWindowTableRebinder rebinder(optimizer); + rebinder.VisitOperator(*copy_child); + + auto aggregate_index = optimizer.binder.GenerateTableIndex(); + auto group_index = optimizer.binder.GenerateTableIndex(); + + vector> groups; + vector> aggregates; + + // Create Aggregate Operator + for (auto &part : w_expr.partitions) { + auto part_copy = part->Copy(); + rebinder.VisitExpression(&part_copy); // Update bindings + groups.push_back(std::move(part_copy)); + } + + auto count_func = *w_expr.aggregate; + unique_ptr bind_info; + if (w_expr.bind_info) { + bind_info = w_expr.bind_info->Copy(); + } else { + bind_info = nullptr; + } + + vector> children; + for (auto &child : w_expr.children) { + auto child_copy = child->Copy(); + rebinder.VisitExpression(&child_copy); // Update bindings + children.push_back(std::move(child_copy)); + } + + auto aggr_type = w_expr.distinct ? AggregateType::DISTINCT : AggregateType::NON_DISTINCT; + + auto agg_expr = make_uniq(std::move(count_func), std::move(children), nullptr, + std::move(bind_info), aggr_type); + + aggregates.push_back(std::move(agg_expr)); + + // args: group_index, aggregate_index, ... 
+ auto agg_op = make_uniq(group_index, aggregate_index, std::move(aggregates)); + + agg_op->groups = std::move(groups); + agg_op->children.push_back(std::move(copy_child)); + agg_op->ResolveOperatorTypes(); + + if (agg_op->types.size() <= agg_op->groups.size()) { + throw InternalException("LogicalAggregate types size mismatch"); + } + + // Filter on aggregate: count = 1 + // Count is the first aggregate, so it's at agg_op->groups.size() in the types list + // Bindings: Aggregates are at aggregate_index + auto cnt_ref = make_uniq(agg_op->types[agg_op->groups.size()], + ColumnBinding(aggregate_index, 0)); + + auto filter_expr = + make_uniq(ExpressionType::COMPARE_EQUAL, std::move(cnt_ref), + make_uniq(Value::BIGINT(1))); + + auto rhs_filter = make_uniq(); + rhs_filter->expressions.push_back(std::move(filter_expr)); + rhs_filter->children.push_back(std::move(agg_op)); + rhs_filter->ResolveOperatorTypes(); + + // Semi Join + auto join = make_uniq(JoinType::SEMI); + + for (size_t i = 0; i < w_expr.partitions.size(); ++i) { + JoinCondition cond; + cond.comparison = ExpressionType::COMPARE_NOT_DISTINCT_FROM; + cond.left = w_expr.partitions[i]->Copy(); + cond.right = make_uniq(w_expr.partitions[i]->return_type, + ColumnBinding(group_index, i)); + join->conditions.push_back(std::move(cond)); + } + + join->children.push_back(std::move(original_child)); + join->children.push_back(std::move(rhs_filter)); + join->ResolveOperatorTypes(); + + // Create Constant 1 + auto dummy_index = optimizer.binder.GenerateTableIndex(); + auto dummy = make_uniq(dummy_index); + dummy->ResolveOperatorTypes(); + + auto const_one = make_uniq(Value::BIGINT(1)); + const_one->alias = "count_window_result"; + + auto proj_index = optimizer.binder.GenerateTableIndex(); + vector> proj_expressions; + proj_expressions.push_back(std::move(const_one)); + + auto projection = make_uniq(proj_index, std::move(proj_expressions)); + projection->children.push_back(std::move(dummy)); + 
projection->ResolveOperatorTypes(); + + // Cross Product + auto cross = make_uniq(std::move(join), std::move(projection)); + cross->ResolveOperatorTypes(); + + // Replace Count binding + // Old window column: (window.window_index, 0) + // New constant column: (proj_index, 0) + ColumnBinding old_binding(window.window_index, 0); + ColumnBinding new_binding(proj_index, 0); + + replacer.replacement_bindings.emplace_back(old_binding, new_binding); + + // We do NOT need to replace other bindings because CrossProduct preserves left child bindings, + // and Window (presumably) passed through input bindings without re-binding. + + return std::move(cross); + } + } else if (!op->children.empty()) { + for (auto &child : op->children) { + child = OptimizeInternal(std::move(child), replacer); + } + } + return op; +} + +} // namespace duckdb diff --git a/src/duckdb/src/optimizer/filter_combiner.cpp b/src/duckdb/src/optimizer/filter_combiner.cpp index f7099c9a1..e2480b963 100644 --- a/src/duckdb/src/optimizer/filter_combiner.cpp +++ b/src/duckdb/src/optimizer/filter_combiner.cpp @@ -367,7 +367,7 @@ FilterPushdownResult FilterCombiner::TryPushdownConstantFilter(TableFilterSet &t void ReplaceWithBoundReference(unique_ptr &root_expr) { ExpressionIterator::VisitExpressionMutable( root_expr, [&](BoundColumnRefExpression &col_ref, unique_ptr &expr) { - expr = make_uniq(col_ref.return_type, 0ULL); + expr = make_uniq(col_ref.alias, col_ref.return_type, 0ULL); }); } diff --git a/src/duckdb/src/optimizer/optimizer.cpp b/src/duckdb/src/optimizer/optimizer.cpp index 9bf4fcf8d..4f811bd84 100644 --- a/src/duckdb/src/optimizer/optimizer.cpp +++ b/src/duckdb/src/optimizer/optimizer.cpp @@ -38,6 +38,7 @@ #include "duckdb/optimizer/unnest_rewriter.hpp" #include "duckdb/optimizer/late_materialization.hpp" #include "duckdb/optimizer/common_subplan_optimizer.hpp" +#include "duckdb/optimizer/count_window_elimination.hpp" #include "duckdb/planner/binder.hpp" #include "duckdb/planner/planner.hpp" @@ 
-186,6 +187,11 @@ void Optimizer::RunBuiltInOptimizers() { plan = empty_result_pullup.Optimize(std::move(plan)); }); + RunOptimizer(OptimizerType::COUNT_WINDOW_ELIMINATION, [&]() { + WindowSelfJoinOptimizer window_self_join_optimizer(*this); + plan = window_self_join_optimizer.Optimize(std::move(plan)); + }); + // then we perform the join ordering optimization // this also rewrites cross products + filters into joins and performs filter pushdowns RunOptimizer(OptimizerType::JOIN_ORDER, [&]() { diff --git a/src/duckdb/src/optimizer/statistics/operator/propagate_get.cpp b/src/duckdb/src/optimizer/statistics/operator/propagate_get.cpp index 105953fe4..708221883 100644 --- a/src/duckdb/src/optimizer/statistics/operator/propagate_get.cpp +++ b/src/duckdb/src/optimizer/statistics/operator/propagate_get.cpp @@ -15,13 +15,15 @@ namespace duckdb { -static void GetColumnIndex(unique_ptr &expr, idx_t &index) { +static void GetColumnIndex(unique_ptr &expr, idx_t &index, string &alias) { if (expr->type == ExpressionType::BOUND_REF) { auto &bound_ref = expr->Cast(); index = bound_ref.index; + alias = bound_ref.alias; return; } - ExpressionIterator::EnumerateChildren(*expr, [&](unique_ptr &child) { GetColumnIndex(child, index); }); + ExpressionIterator::EnumerateChildren(*expr, + [&](unique_ptr &child) { GetColumnIndex(child, index, alias); }); } FilterPropagateResult StatisticsPropagator::PropagateTableFilter(ColumnBinding stats_binding, BaseStatistics &stats, @@ -32,15 +34,16 @@ FilterPropagateResult StatisticsPropagator::PropagateTableFilter(ColumnBinding s // get physical storage index of the filter // since it is a table filter, every storage index is the same idx_t physical_index = DConstants::INVALID_INDEX; - GetColumnIndex(expr_filter.expr, physical_index); + string column_alias; + GetColumnIndex(expr_filter.expr, physical_index, column_alias); D_ASSERT(physical_index != DConstants::INVALID_INDEX); - auto column_ref = make_uniq(stats.GetType(), stats_binding); + auto 
column_ref = make_uniq(column_alias, stats.GetType(), stats_binding); auto filter_expr = expr_filter.ToExpression(*column_ref); // handle the filter before updating the statistics // otherwise the filter can be pruned by the updated statistics auto propagate_result = HandleFilter(filter_expr); - auto colref = make_uniq(stats.GetType(), physical_index); + auto colref = make_uniq(column_alias, stats.GetType(), physical_index); UpdateFilterStatistics(*filter_expr); // replace BoundColumnRefs with BoundRefs diff --git a/src/duckdb/src/parser/transform/statement/transform_alter_table.cpp b/src/duckdb/src/parser/transform/statement/transform_alter_table.cpp index 8e676f3c8..203d90fdc 100644 --- a/src/duckdb/src/parser/transform/statement/transform_alter_table.cpp +++ b/src/duckdb/src/parser/transform/statement/transform_alter_table.cpp @@ -2,8 +2,13 @@ #include "duckdb/parser/expression/cast_expression.hpp" #include "duckdb/parser/expression/columnref_expression.hpp" #include "duckdb/parser/statement/alter_statement.hpp" +#include "duckdb/parser/sql_statement.hpp" #include "duckdb/parser/transformer.hpp" #include "duckdb/common/exception/parser_exception.hpp" +#include "duckdb/parser/expression/constant_expression.hpp" +#include "duckdb/parser/statement/multi_statement.hpp" +#include "duckdb/parser/statement/update_statement.hpp" +#include "duckdb/parser/tableref/basetableref.hpp" namespace duckdb { @@ -19,7 +24,60 @@ vector Transformer::TransformNameList(duckdb_libpgquery::PGList &list) { return result; } -unique_ptr Transformer::TransformAlter(duckdb_libpgquery::PGAlterTableStmt &stmt) { +void AddToMultiStatement(const unique_ptr &multi_statement, unique_ptr alter_info) { + auto alter_statement = make_uniq(); + alter_statement->info = std::move(alter_info); + multi_statement->statements.push_back(std::move(alter_statement)); +} + +void AddUpdateToMultiStatement(const unique_ptr &multi_statement, const string &column_name, + const string &table_name, const unique_ptr 
&original_expression) { + auto update_statement = make_uniq(); + + auto table_ref = make_uniq(); + + table_ref->table_name = table_name; + update_statement->table = std::move(table_ref); + + auto set_info = make_uniq(); + set_info->columns.push_back(column_name); + set_info->expressions.push_back(original_expression->Copy()); + update_statement->set_info = std::move(set_info); + + multi_statement->statements.push_back(std::move(update_statement)); +} + +unique_ptr TransformAndMaterializeAlter(const duckdb_libpgquery::PGAlterTableStmt &stmt, + AlterEntryData &data, + unique_ptr info_with_null_placeholder, + const string &column_name, + unique_ptr expression) { + auto multi_statement = make_uniq(); + /* Here we do a workaround that consists of the following statements: + * 1. `ALTER TABLE t ADD COLUMN col DEFAULT NULL;` + * 2. `UPDATE t SET u = ;` + * 3. `ALTER TABLE t ALTER u SET DEFAULT ;` + * + * This workaround exists because, when statements like this were executed: + * `ALTER TABLE ... ADD COLUMN ... DEFAULT ` + * the WAL replay would re-run the default expression, and with expressions such as RANDOM or CURRENT_TIMESTAMP, the + * value would be different from that of the original run. By now doing an UPDATE, we force materialization of these + * values, which makes WAL replays consistent. + */ + + // 1. `ALTER TABLE t ADD COLUMN col DEFAULT NULL;` + AddToMultiStatement(multi_statement, std::move(info_with_null_placeholder)); + + // 2. `UPDATE t SET u = ;` + AddUpdateToMultiStatement(multi_statement, column_name, stmt.relation->relname, expression); + + // 3. `ALTER TABLE t ALTER u SET DEFAULT ;` + // Reinstate the original default expression. 
+ AddToMultiStatement(multi_statement, make_uniq(data, column_name, std::move(expression))); + return multi_statement; +} + +unique_ptr Transformer::TransformAlter(duckdb_libpgquery::PGAlterTableStmt &stmt) { D_ASSERT(stmt.relation); if (stmt.cmds->length != 1) { throw ParserException("Only one ALTER command per statement is supported"); @@ -62,7 +120,18 @@ unique_ptr Transformer::TransformAlter(duckdb_libpgquery::PGAlte column_entry.SetName(column_names.back()); if (column_names.size() == 1) { // ADD COLUMN - result->info = make_uniq(std::move(data), std::move(column_entry), command->missing_ok); + if (!column_entry.HasDefaultValue() || + column_entry.DefaultValue().GetExpressionClass() == ExpressionClass::CONSTANT) { + result->info = + make_uniq(std::move(data), std::move(column_entry), command->missing_ok); + break; + } + auto null_column = column_entry.Copy(); + null_column.SetDefaultValue(make_uniq(ConstantExpression(Value(nullptr)))); + return unique_ptr(std::move(TransformAndMaterializeAlter( + stmt, data, make_uniq(data, std::move(null_column), command->missing_ok), + column_entry.GetName(), column_entry.DefaultValue().Copy()))); + } else { // ADD FIELD column_names.pop_back(); @@ -158,7 +227,7 @@ unique_ptr Transformer::TransformAlter(duckdb_libpgquery::PGAlte throw NotImplementedException("No support for that ALTER TABLE option yet!"); } } - return result; + return unique_ptr(std::move(result)); } } // namespace duckdb diff --git a/src/duckdb/src/planner/planner.cpp b/src/duckdb/src/planner/planner.cpp index ca5e72d88..e6794dcfc 100644 --- a/src/duckdb/src/planner/planner.cpp +++ b/src/duckdb/src/planner/planner.cpp @@ -14,7 +14,7 @@ #include "duckdb/transaction/meta_transaction.hpp" #include "duckdb/execution/column_binding_resolver.hpp" #include "duckdb/main/attached_database.hpp" - +#include "duckdb/parser/statement/multi_statement.hpp" #include "duckdb/planner/subquery/flatten_dependent_join.hpp" namespace duckdb { diff --git 
a/src/duckdb/src/storage/checkpoint_manager.cpp b/src/duckdb/src/storage/checkpoint_manager.cpp index 854a1c11a..fb9859315 100644 --- a/src/duckdb/src/storage/checkpoint_manager.cpp +++ b/src/duckdb/src/storage/checkpoint_manager.cpp @@ -263,10 +263,19 @@ void SingleFileCheckpointWriter::CreateCheckpoint() { } // truncate the WAL + unique_ptr> wal_lock; if (has_wal) { - storage_manager.WALFinishCheckpoint(); + wal_lock = storage_manager.GetWALLock(); + storage_manager.WALFinishCheckpoint(*wal_lock); } + // FIXME: hold the WAL lock while we are merging checkpoint deltas + // this prevents any commits from happening while this is going on + // this is currently required because of the way that "deletes + inserts" of the same row are processed + // currently we FIRST append the new (duplicate) insert, THEN delete the old value + // if we append the duplicate value, then call MergeCheckpointDeltas, that will fail with a duplicate entry error + // we can fix this and stop holding the WAL lock once we fix / remove that order of operations in the commit + // for any indexes that were appended to while checkpointing, merge the delta back into the main index // FIXME: we only clean up appends made to tables that are part of this checkpoint // Currently, that is correct, since we don't allow creating tables DURING a checkpoint @@ -283,7 +292,7 @@ void SingleFileCheckpointWriter::CreateCheckpoint() { auto &storage = table.GetStorage(); auto &table_info = storage.GetDataTableInfo(); auto &index_list = table_info->GetIndexes(); - index_list.MergeCheckpointDeltas(options.transaction_id); + index_list.MergeCheckpointDeltas(storage, options.transaction_id); } } diff --git a/src/duckdb/src/storage/compression/numeric_constant.cpp b/src/duckdb/src/storage/compression/numeric_constant.cpp index f9cc79b47..e13a0748f 100644 --- a/src/duckdb/src/storage/compression/numeric_constant.cpp +++ b/src/duckdb/src/storage/compression/numeric_constant.cpp @@ -160,7 +160,8 @@ void 
ConstantFun::FiltersNullValues(const LogicalType &type, const TableFilter & auto &expr_filter = filter.Cast(); auto &state = filter_state.Cast(); Value val(type); - filters_nulls = expr_filter.EvaluateWithConstant(state.executor, val); + //! If the expression evaluates to true, containing only a NULL vector, it *must* be an IS NULL filter + filters_nulls = !expr_filter.EvaluateWithConstant(state.executor, val); filters_valid_values = false; break; } diff --git a/src/duckdb/src/storage/data_table.cpp b/src/duckdb/src/storage/data_table.cpp index aaae4de5b..330b2a882 100644 --- a/src/duckdb/src/storage/data_table.cpp +++ b/src/duckdb/src/storage/data_table.cpp @@ -142,11 +142,6 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, BoundConstraint : db(parent.db), info(parent.info), row_groups(parent.row_groups), version(DataTableVersion::MAIN_TABLE) { // ALTER COLUMN to add a new constraint. - // Clone the storage info vector or the table. - for (const auto &index_info : parent.info->index_storage_infos) { - info->index_storage_infos.push_back(IndexStorageInfo(index_info.name)); - } - // Bind all indexes. 
info->BindIndexes(context); @@ -268,7 +263,8 @@ idx_t DataTable::MaxThreads(ClientContext &context) const { return GetTotalRows() / parallel_scan_tuple_count + 1; } -void DataTable::InitializeParallelScan(ClientContext &context, ParallelTableScanState &state) { +void DataTable::InitializeParallelScan(ClientContext &context, ParallelTableScanState &state, + const vector &column_indexes) { auto &local_storage = LocalStorage::Get(context, db); row_groups->InitializeParallelScan(state.scan_state); @@ -426,6 +422,12 @@ void DataTable::Fetch(DuckTransaction &transaction, DataChunk &result, const vec row_groups->Fetch(transaction, result, column_ids, row_identifiers, fetch_count, state); } +void DataTable::FetchCommitted(DataChunk &result, const vector &column_ids, const Vector &row_identifiers, + idx_t fetch_count, ColumnFetchState &state) { + TransactionData commit_transaction(MAX_TRANSACTION_ID, TRANSACTION_ID_START - 1); + row_groups->Fetch(commit_transaction, result, column_ids, row_identifiers, fetch_count, state); +} + bool DataTable::CanFetch(DuckTransaction &transaction, const row_t row_id) { return row_groups->CanFetch(transaction, row_id); } @@ -683,21 +685,26 @@ void DataTable::VerifyUniqueIndexes(TableIndexList &indexes, optional_ptr manager) { // Verify the constraint without a conflict manager. 
if (!manager) { - return indexes.Scan([&](Index &index) { + return indexes.ScanEntries([&](IndexEntry &entry) { + auto &index = *entry.index; if (!index.IsUnique() || index.GetIndexType() != ART::TYPE_NAME) { return false; } D_ASSERT(index.IsBound()); auto &art = index.Cast(); + + lock_guard guard(entry.lock); + IndexAppendInfo index_append_info; if (storage) { auto delete_index = storage->delete_indexes.Find(art.GetIndexName()); - D_ASSERT(!delete_index || delete_index->IsBound()); - IndexAppendInfo index_append_info(IndexAppendMode::DEFAULT, delete_index); - art.VerifyAppend(chunk, index_append_info, nullptr); - } else { - IndexAppendInfo index_append_info; - art.VerifyAppend(chunk, index_append_info, nullptr); + if (delete_index) { + index_append_info.delete_indexes.push_back(*delete_index); + } + } + if (entry.removed_data_during_checkpoint) { + index_append_info.delete_indexes.push_back(*entry.removed_data_during_checkpoint); } + art.VerifyAppend(chunk, index_append_info, nullptr); return false; }); } @@ -729,9 +736,8 @@ void DataTable::VerifyUniqueIndexes(TableIndexList &indexes, optional_ptrSetMode(ConflictManagerMode::SCAN); auto &matching_indexes = manager->MatchingIndexes(); auto &matching_delete_indexes = manager->MatchingDeleteIndexes(); - IndexAppendInfo index_append_info(IndexAppendMode::DEFAULT, nullptr); for (idx_t i = 0; i < matching_indexes.size(); i++) { - index_append_info.delete_index = matching_delete_indexes[i]; + IndexAppendInfo index_append_info(IndexAppendMode::DEFAULT, matching_delete_indexes[i]); matching_indexes[i].get().VerifyAppend(chunk, index_append_info, *manager); } @@ -1243,22 +1249,24 @@ ErrorData DataTable::AppendToIndexes(TableIndexList &indexes, optional_ptr append_index = bound_index; - optional_ptr lookup_index; + optional_ptr lookup_index, lookup_delete_index; // check if there's an on-going checkpoint - if (active_checkpoint.IsValid() && bound_index.RequiresTransactionality()) { - // check if we've already written this 
index during the on-going checkpoint + if (active_checkpoint.IsValid() && bound_index.SupportsDeltaIndexes()) { + // there's an ongoing checkpoint - check if we need to use delta indexes or if we can write to the main + // index if (!entry.last_written_checkpoint.IsValid() || entry.last_written_checkpoint.GetIndex() != active_checkpoint.GetIndex()) { - // there's an on-going checkpoint and we haven't written the index to disk yet + // there's an on-going checkpoint and we haven't flushed the index yet // we need to append to the "added_data_during_checkpoint" instead // create it if it does not exist if (!entry.added_data_during_checkpoint) { entry.added_data_during_checkpoint = - bound_index.CreateEmptyCopy("added_during_checkpoint_", bound_index.index_constraint_type); + bound_index.CreateDeltaIndex(DeltaIndexType::ADDED_DURING_CHECKPOINT); } if (bound_index.IsUnique()) { // before appending we still need to look-up in the main index to verify there are no conflicts lookup_index = bound_index; + lookup_delete_index = delete_index; } append_index = entry.added_data_during_checkpoint; } @@ -1268,8 +1276,14 @@ ErrorData DataTable::AppendToIndexes(TableIndexList &indexes, optional_ptr
VerifyAppend(table_chunk, index_append_info, nullptr); + IndexAppendInfo lookup_append_info; + if (lookup_delete_index) { + lookup_append_info.delete_indexes.push_back(*lookup_delete_index); + } + if (entry.removed_data_during_checkpoint) { + lookup_append_info.delete_indexes.push_back(*entry.removed_data_during_checkpoint); + } + lookup_index->VerifyAppend(table_chunk, lookup_append_info, nullptr); } // Append the mock chunk containing empty columns for non-key columns. @@ -1330,9 +1344,9 @@ void DataTable::RevertIndexAppend(TableAppendState &state, DataChunk &chunk, Vec } void DataTable::RemoveFromIndexes(const QueryContext &context, Vector &row_identifiers, idx_t count, - IndexRemovalType removal_type) { + IndexRemovalType removal_type, optional_idx active_checkpoint) { D_ASSERT(IsMainTable()); - row_groups->RemoveFromIndexes(context, info->indexes, row_identifiers, count, removal_type); + row_groups->RemoveFromIndexes(context, info->indexes, row_identifiers, count, removal_type, active_checkpoint); } //===--------------------------------------------------------------------===// @@ -1382,6 +1396,7 @@ void DataTable::VerifyDeleteConstraints(optional_ptr storage, unique_ptr DataTable::InitializeDelete(TableCatalogEntry &table, ClientContext &context, const vector> &bound_constraints) { + auto &transaction = DuckTransaction::Get(context, db); // Bind all indexes. 
info->BindIndexes(context); @@ -1398,6 +1413,7 @@ unique_ptr DataTable::InitializeDelete(TableCatalogEntry &tabl result->verify_chunk.Initialize(Allocator::Get(context), types); result->constraint_state = make_uniq(table, bound_constraints); } + result->checkpoint_lock = transaction.SharedLockTable(*info); return result; } diff --git a/src/duckdb/src/storage/local_storage.cpp b/src/duckdb/src/storage/local_storage.cpp index 6cb4b82e1..d72374a50 100644 --- a/src/duckdb/src/storage/local_storage.cpp +++ b/src/duckdb/src/storage/local_storage.cpp @@ -30,31 +30,19 @@ LocalTableStorage::LocalTableStorage(ClientContext &context, DataTable &table) if (constraint == IndexConstraintType::NONE) { return false; } - if (index.GetIndexType() != ART::TYPE_NAME) { - return false; - } if (!index.IsBound()) { return false; } - auto &art = index.Cast(); - - // UNIQUE constraint. - vector> expressions; - vector> delete_expressions; - for (auto &expr : art.unbound_expressions) { - expressions.push_back(expr->Copy()); - delete_expressions.push_back(expr->Copy()); + auto &bound_index = index.Cast(); + if (!bound_index.SupportsDeltaIndexes()) { + return false; } // Create a delete index and a local index. 
- auto &name = art.GetIndexName(); - auto &io_manager = art.table_io_manager; - auto delete_index = - make_uniq(name, constraint, art.GetColumnIds(), io_manager, std::move(delete_expressions), art.db); + auto delete_index = bound_index.CreateDeltaIndex(DeltaIndexType::LOCAL_DELETE); delete_indexes.AddIndex(std::move(delete_index)); - auto append_index = - make_uniq(name, constraint, art.GetColumnIds(), io_manager, std::move(expressions), art.db); + auto append_index = bound_index.CreateDeltaIndex(DeltaIndexType::LOCAL_APPEND); append_indexes.AddIndex(std::move(append_index)); return false; }); diff --git a/src/duckdb/src/storage/storage_manager.cpp b/src/duckdb/src/storage/storage_manager.cpp index b4e9b521a..339bd6152 100644 --- a/src/duckdb/src/storage/storage_manager.cpp +++ b/src/duckdb/src/storage/storage_manager.cpp @@ -179,8 +179,7 @@ bool StorageManager::WALStartCheckpoint(MetaBlockPointer meta_block, CheckpointO return true; } -void StorageManager::WALFinishCheckpoint() { - lock_guard guard(wal_lock); +void StorageManager::WALFinishCheckpoint(lock_guard &) { D_ASSERT(wal.get()); // "wal" points to the checkpoint WAL @@ -616,6 +615,20 @@ void SingleFileStorageManager::CreateCheckpoint(QueryContext context, Checkpoint if (read_only || !load_complete) { return; } + unique_ptr vacuum_lock; + if (options.type != CheckpointType::CONCURRENT_CHECKPOINT) { + auto &transaction_manager = GetAttached().GetTransactionManager().Cast(); + vacuum_lock = transaction_manager.TryGetVacuumLock(); + if (!vacuum_lock) { + if (options.type == CheckpointType::FULL_CHECKPOINT) { + options.type = CheckpointType::CONCURRENT_CHECKPOINT; + } else { + // nothing to do + return; + } + } + } + if (db.GetStorageExtension()) { db.GetStorageExtension()->OnCheckpointStart(db, options); } diff --git a/src/duckdb/src/storage/table/chunk_info.cpp b/src/duckdb/src/storage/table/chunk_info.cpp index dfef0b4a1..3a1708663 100644 --- a/src/duckdb/src/storage/table/chunk_info.cpp +++ 
b/src/duckdb/src/storage/table/chunk_info.cpp @@ -38,7 +38,7 @@ bool ChunkInfo::Cleanup(transaction_t lowest_transaction) const { return false; } -void ChunkInfo::Write(WriteStream &writer) const { +void ChunkInfo::Write(WriteStream &writer, transaction_t checkpoint_id) const { writer.Write(type); } @@ -99,8 +99,11 @@ void ChunkConstantInfo::CommitAppend(transaction_t commit_id, idx_t start, idx_t insert_id = commit_id; } -bool ChunkConstantInfo::HasDeletes() const { - bool is_deleted = insert_id >= TRANSACTION_ID_START || delete_id < TRANSACTION_ID_START; +bool ChunkConstantInfo::HasDeletes(transaction_t transaction_id) const { + if (transaction_id == MAX_TRANSACTION_ID) { + transaction_id = TRANSACTION_ID_START - 1; + } + bool is_deleted = insert_id >= TRANSACTION_ID_START || delete_id <= transaction_id; return is_deleted; } @@ -120,9 +123,9 @@ bool ChunkConstantInfo::Cleanup(transaction_t lowest_transaction) const { return true; } -void ChunkConstantInfo::Write(WriteStream &writer) const { - D_ASSERT(HasDeletes()); - ChunkInfo::Write(writer); +void ChunkConstantInfo::Write(WriteStream &writer, transaction_t checkpoint_id) const { + D_ASSERT(HasDeletes(checkpoint_id)); + ChunkInfo::Write(writer, checkpoint_id); writer.Write(start); } @@ -418,8 +421,22 @@ bool ChunkVectorInfo::Cleanup(transaction_t lowest_transaction) const { return true; } -bool ChunkVectorInfo::HasDeletes() const { - return AnyDeleted(); +bool ChunkVectorInfo::HasDeletes(transaction_t transaction_id) const { + if (!AnyDeleted()) { + return false; + } + if (transaction_id == MAX_TRANSACTION_ID) { + return true; + } + auto segment = allocator.GetHandle(deleted_data); + auto deleted = segment.GetPtr(); + + for (idx_t i = 0; i < STANDARD_VECTOR_SIZE; i++) { + if (deleted[i] <= transaction_id) { + return true; + } + } + return false; } bool ChunkVectorInfo::AnyDeleted() const { @@ -476,9 +493,9 @@ idx_t ChunkVectorInfo::GetCommittedDeletedCount(idx_t max_count) const { return delete_count; } -void 
ChunkVectorInfo::Write(WriteStream &writer) const { +void ChunkVectorInfo::Write(WriteStream &writer, transaction_t checkpoint_id) const { SelectionVector sel(STANDARD_VECTOR_SIZE); - transaction_t start_time = TRANSACTION_ID_START - 1; + transaction_t start_time = checkpoint_id == MAX_TRANSACTION_ID ? TRANSACTION_ID_START - 1 : checkpoint_id + 1; transaction_t transaction_id = DConstants::INVALID_INDEX; idx_t count = GetSelVector(start_time, transaction_id, sel, STANDARD_VECTOR_SIZE); if (count == STANDARD_VECTOR_SIZE) { @@ -493,7 +510,7 @@ void ChunkVectorInfo::Write(WriteStream &writer) const { return; } // write a boolean vector - ChunkInfo::Write(writer); + ChunkInfo::Write(writer, checkpoint_id); writer.Write(start); ValidityMask mask(STANDARD_VECTOR_SIZE); mask.Initialize(STANDARD_VECTOR_SIZE); diff --git a/src/duckdb/src/storage/table/column_segment.cpp b/src/duckdb/src/storage/table/column_segment.cpp index e1739bc8a..88cccfe88 100644 --- a/src/duckdb/src/storage/table/column_segment.cpp +++ b/src/duckdb/src/storage/table/column_segment.cpp @@ -240,9 +240,7 @@ void ColumnSegment::ConvertToPersistent(QueryContext context, optional_ptr extra_metadata_block_pointers; extra_metadata_block_pointers.reserve(write_data.existing_extra_metadata_blocks.size()); @@ -1323,7 +1323,7 @@ RowGroupPointer RowGroup::Checkpoint(RowGroupWriteData write_data, RowGroupWrite metadata_blocks.insert(column_pointer.block_pointer); } if (metadata_manager) { - row_group_pointer.deletes_pointers = CheckpointDeletes(*metadata_manager); + row_group_pointer.deletes_pointers = CheckpointDeletes(writer); } // set up the pointers correctly within this row group for future operations column_pointers = row_group_pointer.data_pointers; @@ -1376,10 +1376,11 @@ PersistentRowGroupData RowGroup::SerializeRowGroupInfo(idx_t row_group_start) co return result; } -vector RowGroup::CheckpointDeletes(MetadataManager &manager) { +vector RowGroup::CheckpointDeletes(RowGroupWriter &writer) { if 
(HasUnloadedDeletes()) { // deletes were not loaded so they cannot be changed // re-use them as-is + auto &manager = *writer.GetMetadataManager(); manager.ClearModifiedBlocks(deletes_pointers); return deletes_pointers; } @@ -1388,7 +1389,7 @@ vector RowGroup::CheckpointDeletes(MetadataManager &manager) { // no version information: write nothing return vector(); } - return vinfo->Checkpoint(manager); + return vinfo->Checkpoint(writer); } void RowGroup::Serialize(RowGroupPointer &pointer, Serializer &serializer) { diff --git a/src/duckdb/src/storage/table/row_group_collection.cpp b/src/duckdb/src/storage/table/row_group_collection.cpp index 061649166..4069e2a91 100644 --- a/src/duckdb/src/storage/table/row_group_collection.cpp +++ b/src/duckdb/src/storage/table/row_group_collection.cpp @@ -364,7 +364,8 @@ void RowGroupCollection::Fetch(TransactionData transaction, DataChunk &result, c } auto ¤t_row_group = row_group->GetNode(); auto offset_in_row_group = UnsafeNumericCast(row_id) - row_group->GetRowStart(); - if (!current_row_group.Fetch(transaction, offset_in_row_group)) { + if (state.fetch_type == FetchType::TRANSACTIONAL_FETCH && + !current_row_group.Fetch(transaction, offset_in_row_group)) { continue; } state.row_group = row_group; @@ -739,46 +740,114 @@ void RowGroupCollection::Update(TransactionData transaction, DataTable &data_tab } while (pos < updates.size()); } -void GetIndexRemovalTargets(IndexEntry &entry, IndexRemovalType removal_type, optional_ptr &append_target, - optional_ptr &remove_target) { +struct IndexRemovalTargets { + optional_ptr append_target; + optional_ptr remove_target; + optional_ptr conditional_remove_target; + optional_ptr conditional_append_target; +}; + +void GetIndexRemovalTargetsActiveCheckpoint(IndexEntry &entry, IndexRemovalType removal_type, + IndexRemovalTargets &targets) { + auto &main_index = entry.index->Cast(); + + // create "removed_data_during_checkpoint" if it does not exist + if (!entry.removed_data_during_checkpoint) { 
+ entry.removed_data_during_checkpoint = main_index.CreateDeltaIndex(DeltaIndexType::REMOVED_DURING_CHECKPOINT); + } + if (removal_type == IndexRemovalType::MAIN_INDEX_ONLY || removal_type == IndexRemovalType::MAIN_INDEX) { + // removing from main index - but we cannot remove directly due to the concurrent checkpoint + // add removal to delta index + if (entry.added_data_during_checkpoint) { + // if we have also added data during this checkpoint - we might need to remove from there instead + // we FIRST try to remove from "added_data_during_checkpoint" + // any rows that are not there we add to "removed_data_during_checkpoint" + targets.conditional_remove_target = entry.added_data_during_checkpoint.get(); + targets.conditional_append_target = entry.removed_data_during_checkpoint.get(); + } else { + // add removed rows to "removed_data_during_checkpoint" + targets.conditional_append_target = entry.removed_data_during_checkpoint.get(); + } + if (removal_type == IndexRemovalType::MAIN_INDEX) { + // we also need to append to "deleted_rows_in_use" + if (!entry.deleted_rows_in_use) { + // create "deleted_rows_in_use" if it does not exist yet + entry.deleted_rows_in_use = main_index.CreateDeltaIndex(DeltaIndexType::DELETED_ROWS_IN_USE); + } + targets.append_target = entry.deleted_rows_in_use; + } + return; + } + if (removal_type == IndexRemovalType::REVERT_MAIN_INDEX_ONLY || + removal_type == IndexRemovalType::REVERT_MAIN_INDEX) { + // revert adding to main index + if (entry.added_data_during_checkpoint) { + // we have added data during this checkpoint as well, remove might have EITHER: + // (1) added to "removed_data_during_checkpoint" + // (2) removed data from "added_data_during_checkpoint" + // revert by first trying to remove from "removed_data_during_checkpoint" + // any rows that were not removed are re-added back to "added_data_during_checkpoint" + targets.conditional_remove_target = entry.removed_data_during_checkpoint.get(); + targets.conditional_append_target = 
entry.added_data_during_checkpoint.get(); + } else { + targets.conditional_remove_target = entry.removed_data_during_checkpoint.get(); + } + if (removal_type == IndexRemovalType::REVERT_MAIN_INDEX) { + // we also need to remove from "deleted_rows_in_use" + targets.remove_target = entry.deleted_rows_in_use.get(); + } + } +} +void GetIndexRemovalTargets(IndexEntry &entry, IndexRemovalType removal_type, IndexRemovalTargets &targets, + optional_idx active_checkpoint) { auto &main_index = entry.index->Cast(); // not all indexes require delta indexes - this is tracked through BoundIndex::RequiresTransactionality // if an index does not require this we skip creating to and appending to "deleted_rows_in_use" - bool index_requires_delta = main_index.RequiresTransactionality(); + bool supports_delta_indexes = main_index.SupportsDeltaIndexes(); + if (removal_type != IndexRemovalType::DELETED_ROWS_IN_USE && active_checkpoint.IsValid() && + supports_delta_indexes) { + // there's an ongoing checkpoint - check if we need to use delta indexes or if we can write to the main index + if (!entry.last_written_checkpoint.IsValid() || + entry.last_written_checkpoint.GetIndex() != active_checkpoint.GetIndex()) { + // there's an on-going checkpoint and we haven't flushed the index yet + // we can't modify the index in-place and need to modify the deltas - get the appropriate deltas to target + GetIndexRemovalTargetsActiveCheckpoint(entry, removal_type, targets); + return; + } + } switch (removal_type) { case IndexRemovalType::MAIN_INDEX_ONLY: // directly remove from main index without appending to delta indexes - remove_target = main_index; + targets.remove_target = main_index; break; case IndexRemovalType::REVERT_MAIN_INDEX_ONLY: // revert main index only append - just add back to index - append_target = main_index; + targets.append_target = main_index; break; case IndexRemovalType::MAIN_INDEX: // regular removal from main index - add rows to delta index if required - if 
(index_requires_delta) { + if (supports_delta_indexes) { if (!entry.deleted_rows_in_use) { // create "deleted_rows_in_use" if it does not exist yet - entry.deleted_rows_in_use = - main_index.CreateEmptyCopy("deleted_rows_in_use_", IndexConstraintType::NONE); + entry.deleted_rows_in_use = main_index.CreateDeltaIndex(DeltaIndexType::DELETED_ROWS_IN_USE); } - append_target = entry.deleted_rows_in_use; + targets.append_target = entry.deleted_rows_in_use; } - remove_target = main_index; + targets.remove_target = main_index; break; case IndexRemovalType::REVERT_MAIN_INDEX: // revert regular append to main index - remove from deleted_rows_in_use if we appended there before - append_target = main_index; - if (index_requires_delta) { - remove_target = entry.deleted_rows_in_use; + targets.append_target = main_index; + if (supports_delta_indexes) { + targets.remove_target = entry.deleted_rows_in_use; } break; case IndexRemovalType::DELETED_ROWS_IN_USE: // remove from removal index if we appended any rows - if (index_requires_delta) { - remove_target = entry.deleted_rows_in_use; + if (supports_delta_indexes) { + targets.remove_target = entry.deleted_rows_in_use; } break; default: @@ -787,9 +856,8 @@ void GetIndexRemovalTargets(IndexEntry &entry, IndexRemovalType removal_type, op } void RowGroupCollection::RemoveFromIndexes(const QueryContext &context, TableIndexList &indexes, - Vector &row_identifiers, idx_t count, IndexRemovalType removal_type) { - auto row_ids = FlatVector::GetData(row_identifiers); - + Vector &row_identifiers, idx_t count, IndexRemovalType removal_type, + optional_idx active_checkpoint) { // Collect all Indexed columns on the table. 
unordered_set indexed_column_id_set; indexes.Scan([&](Index &index) { @@ -810,115 +878,104 @@ void RowGroupCollection::RemoveFromIndexes(const QueryContext &context, TableInd for (auto &col : column_ids) { column_types.push_back(types[col.GetPrimaryIndex()]); } - auto row_groups = GetRowGroups(); - - // Initialize the fetch state. Only use indexed columns. - TableScanState state; - auto column_ids_copy = column_ids; - state.Initialize(std::move(column_ids_copy)); - state.table_state.max_row = row_groups->GetBaseRowId() + total_rows; DataChunk fetch_chunk; fetch_chunk.Initialize(GetAllocator(), column_types); + ColumnFetchState state; + state.fetch_type = FetchType::FORCE_FETCH; + TransactionData commit_transaction(MAX_TRANSACTION_ID, TRANSACTION_ID_START - 1); + Fetch(commit_transaction, fetch_chunk, column_ids, row_identifiers, count, state); + // Used for index value removal. // Contains all columns but only initializes indexed ones. DataChunk result_chunk; auto fetched_columns = vector(types.size(), false); result_chunk.Initialize(GetAllocator(), types, fetched_columns); - // Now set all to-be-fetched columns. for (auto &col : indexed_column_id_set) { fetched_columns[col] = true; } - // Iterate over the row ids. - SelectionVector sel(STANDARD_VECTOR_SIZE); - for (idx_t r = 0; r < count;) { - fetch_chunk.Reset(); - result_chunk.Reset(); - - // Figure out which row_group to fetch from. - auto row_id = row_ids[r]; - auto row_group = row_groups->GetSegment(UnsafeNumericCast(row_id)); - - auto ¤t_row_group = row_group->GetNode(); - auto row_start = row_group->GetRowStart(); - auto row_group_vector_idx = (UnsafeNumericCast(row_id) - row_start) / STANDARD_VECTOR_SIZE; - auto base_row_id = row_group_vector_idx * STANDARD_VECTOR_SIZE + row_start; - - // Fetch the current vector into fetch_chunk. 
- state.table_state.Initialize(context, GetTypes()); - current_row_group.InitializeScanWithOffset(state.table_state, *row_group, row_group_vector_idx); - current_row_group.ScanCommitted(state.table_state, fetch_chunk, TableScanType::TABLE_SCAN_COMMITTED_ROWS); - fetch_chunk.Verify(); - - // Check for any remaining row ids, if they also fall into this vector. - // We try to fetch as many rows as possible at the same time. - idx_t sel_count = 0; - for (; r < count; r++) { - idx_t current_row = idx_t(row_ids[r]); - if (current_row < base_row_id || current_row >= base_row_id + fetch_chunk.size()) { - // This row id does not fall into the current chunk. - break; - } - auto row_in_vector = current_row - base_row_id; - D_ASSERT(row_in_vector < fetch_chunk.size()); - sel.set_index(sel_count++, row_in_vector); - } - D_ASSERT(sel_count > 0); - - // Reference the necessary columns of the fetch_chunk. - idx_t fetch_idx = 0; - for (idx_t j = 0; j < types.size(); j++) { - if (fetched_columns[j]) { - result_chunk.data[j].Reference(fetch_chunk.data[fetch_idx++]); - continue; - } - result_chunk.data[j].Reference(Value(types[j])); + // Reference the necessary columns of the fetch_chunk. + idx_t fetch_idx = 0; + for (idx_t j = 0; j < types.size(); j++) { + if (fetched_columns[j]) { + result_chunk.data[j].Reference(fetch_chunk.data[fetch_idx++]); + continue; } - result_chunk.SetCardinality(fetch_chunk); - - // Slice the vector with all rows that are present in this vector. - // If the index is bound, delete the data. If unbound, buffer into unbound_index. 
- result_chunk.Slice(sel, sel_count); - indexes.ScanEntries([&](IndexEntry &entry) { - auto &index = *entry.index; - if (index.IsBound()) { - lock_guard guard(entry.lock); - // check which indexes we should append to or remove from - // note that this method might also involve appending to indexes - // the reason for that is that we have "delta" indexes that we must fill with data we are removing - // OR because we are actually reverting a previous removal - optional_ptr append_target, remove_target; - GetIndexRemovalTargets(entry, removal_type, append_target, remove_target); - - // perform the targeted append / removal - if (append_target) { - IndexAppendInfo append_info; - auto error = append_target->Append(result_chunk, row_identifiers, append_info); - if (error.HasError()) { - throw InternalException("Failed to append to %s: %s", append_target->name, error.Message()); + result_chunk.data[j].Reference(Value(types[j])); + } + result_chunk.SetCardinality(fetch_chunk); + + DataChunk remaining_result_chunk; + unique_ptr remaining_row_ids; + + indexes.ScanEntries([&](IndexEntry &entry) { + auto &index = *entry.index; + if (index.IsBound()) { + lock_guard guard(entry.lock); + + // check which indexes we should append to or remove from + // note that this method might also involve appending to indexes + // the reason for that is that we have "delta" indexes that we must fill with data we are removing + // OR because we are actually reverting a previous removal + IndexRemovalTargets targets; + GetIndexRemovalTargets(entry, removal_type, targets, active_checkpoint); + + bool removal_succeeded = false; + if (targets.conditional_remove_target) { + // if we have an conditional remove target, we first try to remove the chunk from there + idx_t delete_count = targets.conditional_remove_target->TryDelete(result_chunk, row_identifiers); + if (delete_count > 0) { + if (delete_count != result_chunk.size()) { + // it should not be possible to get here + // what this means is that 
we removed SOME rows from the "initial_remove_target" - but not all + // "initial_remove_target" contains rows that were INSERTED during the checkpoint + // the regular remove target contains rows that were ALREADY THERE during the checkpoint + // "RemoveFromIndexes" works on a per-row-group basis + // when appending during a checkpoint, we always insert new row groups for new data + // so the two groups of data should always be separate + throw InternalException("RowGroupCollection::RemoveFromIndexes - partially deleted from the " + "initial removal target"); } + removal_succeeded = true; } - if (remove_target) { - remove_target->Delete(result_chunk, row_identifiers); + } + if (targets.conditional_append_target && !removal_succeeded) { + // for any rows that were not removed - append them to the conditional append target instead + IndexAppendInfo append_info; + auto error = targets.conditional_append_target->Append(result_chunk, row_identifiers, append_info); + if (error.HasError()) { + throw InternalException("Failed to append to %s: %s", targets.conditional_append_target->name, + error.Message()); + } + } + // perform the targeted append / removal + if (targets.append_target) { + IndexAppendInfo append_info; + auto error = targets.append_target->Append(result_chunk, row_identifiers, append_info); + if (error.HasError()) { + throw InternalException("Failed to append to %s: %s", targets.append_target->name, error.Message()); } - return false; } - // Buffering takes only the indexed columns in ordering of the column_ids mapping. 
- DataChunk index_column_chunk; - index_column_chunk.InitializeEmpty(column_types); - for (idx_t i = 0; i < column_types.size(); i++) { - auto col_id = column_ids[i].GetPrimaryIndex(); - index_column_chunk.data[i].Reference(result_chunk.data[col_id]); + if (targets.remove_target) { + targets.remove_target->Delete(result_chunk, row_identifiers); } - index_column_chunk.SetCardinality(result_chunk.size()); - auto &unbound_index = index.Cast(); - unbound_index.BufferChunk(index_column_chunk, row_identifiers, column_ids, BufferedIndexReplay::DEL_ENTRY); return false; - }); - } + } + // Buffering takes only the indexed columns in ordering of the column_ids mapping. + DataChunk index_column_chunk; + index_column_chunk.InitializeEmpty(column_types); + for (idx_t i = 0; i < column_types.size(); i++) { + auto col_id = column_ids[i].GetPrimaryIndex(); + index_column_chunk.data[i].Reference(result_chunk.data[col_id]); + } + index_column_chunk.SetCardinality(result_chunk.size()); + auto &unbound_index = index.Cast(); + unbound_index.BufferChunk(index_column_chunk, row_identifiers, column_ids, BufferedIndexReplay::DEL_ENTRY); + return false; + }); } void RowGroupCollection::UpdateColumn(TransactionData transaction, DataTable &data_table, Vector &row_ids, @@ -1139,7 +1196,9 @@ class VacuumTask : public BaseCheckpointTask { total_append_count += append_counts[target_idx]; } if (total_append_count != merge_rows) { - throw InternalException("Mismatch in row group count vs verify count in RowGroupCollection::Checkpoint"); + throw InternalException( + "Mismatch in row group count %d vs verify count %d in RowGroupCollection::Checkpoint", merge_rows, + total_append_count); } // merging is complete - execute checkpoint tasks of the target row groups for (idx_t i = 0; i < target_count; i++) { @@ -1403,7 +1462,8 @@ void RowGroupCollection::Checkpoint(TableDataWriter &writer, TableStatistics &gl extra_metadata_block_pointers.emplace_back(block_pointer, 0); } 
metadata_manager.ClearModifiedBlocks(extra_metadata_block_pointers); - row_group.CheckpointDeletes(metadata_manager); + auto row_group_writer = checkpoint_state.writer.GetRowGroupWriter(row_group); + row_group.CheckpointDeletes(*row_group_writer); } writer.WriteUnchangedTable(metadata_pointer, total_rows.load()); diff --git a/src/duckdb/src/storage/table/row_version_manager.cpp b/src/duckdb/src/storage/table/row_version_manager.cpp index 20d0ebed4..8bf25b787 100644 --- a/src/duckdb/src/storage/table/row_version_manager.cpp +++ b/src/duckdb/src/storage/table/row_version_manager.cpp @@ -3,14 +3,13 @@ #include "duckdb/storage/metadata/metadata_manager.hpp" #include "duckdb/storage/metadata/metadata_reader.hpp" #include "duckdb/storage/metadata/metadata_writer.hpp" -#include "duckdb/common/pair.hpp" +#include "duckdb/storage/checkpoint/row_group_writer.hpp" namespace duckdb { RowVersionManager::RowVersionManager(BufferManager &buffer_manager_p) noexcept : allocator(STANDARD_VECTOR_SIZE * sizeof(transaction_t), buffer_manager_p.GetTemporaryBlockManager(), - MemoryTag::BASE_TABLE), - has_unserialized_changes(false) { + MemoryTag::BASE_TABLE) { } idx_t RowVersionManager::GetCommittedDeletedCount(idx_t count) { @@ -126,7 +125,6 @@ void RowVersionManager::FillVectorInfo(idx_t vector_idx) { void RowVersionManager::AppendVersionInfo(TransactionData transaction, idx_t count, idx_t row_group_start, idx_t row_group_end) { lock_guard lock(version_lock); - has_unserialized_changes = true; idx_t start_vector_idx = row_group_start / STANDARD_VECTOR_SIZE; idx_t end_vector_idx = (row_group_end - 1) / STANDARD_VECTOR_SIZE; @@ -179,7 +177,6 @@ void RowVersionManager::CommitAppend(transaction_t commit_id, idx_t row_group_st idx_t vend = vector_idx == end_vector_idx ? 
row_group_end - end_vector_idx * STANDARD_VECTOR_SIZE : STANDARD_VECTOR_SIZE; auto &info = *vector_info[vector_idx]; - D_ASSERT(has_unserialized_changes); info.CommitAppend(commit_id, vstart, vend); } } @@ -208,9 +205,6 @@ void RowVersionManager::CleanupAppend(transaction_t lowest_active_transaction, i // if we wrote the entire chunk info try to compress it auto cleanup = info.Cleanup(lowest_active_transaction); if (cleanup) { - if (info.HasDeletes()) { - has_unserialized_changes = true; - } vector_info[vector_idx].reset(); } } @@ -220,7 +214,6 @@ void RowVersionManager::RevertAppend(idx_t new_count) { lock_guard lock(version_lock); idx_t start_vector_idx = (new_count + (STANDARD_VECTOR_SIZE - 1)) / STANDARD_VECTOR_SIZE; for (idx_t vector_idx = start_vector_idx; vector_idx < vector_info.size(); vector_idx++) { - D_ASSERT(has_unserialized_changes); vector_info[vector_idx].reset(); } } @@ -243,19 +236,22 @@ ChunkVectorInfo &RowVersionManager::GetVectorInfo(idx_t vector_idx) { idx_t RowVersionManager::DeleteRows(idx_t vector_idx, transaction_t transaction_id, row_t rows[], idx_t count) { lock_guard lock(version_lock); - has_unserialized_changes = true; return GetVectorInfo(vector_idx).Delete(transaction_id, rows, count); } void RowVersionManager::CommitDelete(idx_t vector_idx, transaction_t commit_id, const DeleteInfo &info) { lock_guard lock(version_lock); - has_unserialized_changes = true; + if (!uncheckpointed_delete_commit.IsValid() || commit_id > uncheckpointed_delete_commit.GetIndex()) { + uncheckpointed_delete_commit = commit_id; + } GetVectorInfo(vector_idx).CommitDelete(commit_id, info); } -vector RowVersionManager::Checkpoint(MetadataManager &manager) { +vector RowVersionManager::Checkpoint(RowGroupWriter &writer) { lock_guard lock(version_lock); - if (!has_unserialized_changes) { + auto &manager = *writer.GetMetadataManager(); + auto options = writer.GetCheckpointOptions(); + if (!uncheckpointed_delete_commit.IsValid()) { // we can write the current pointer 
as-is // ensure the blocks we are pointing to are not marked as free manager.ClearModifiedBlocks(storage_pointers); @@ -269,7 +265,7 @@ vector RowVersionManager::Checkpoint(MetadataManager &manager) if (!chunk_info) { continue; } - if (!chunk_info->HasDeletes()) { + if (!chunk_info->HasDeletes(options.transaction_id)) { continue; } to_serialize.emplace_back(vector_idx, *chunk_info); @@ -278,19 +274,23 @@ vector RowVersionManager::Checkpoint(MetadataManager &manager) storage_pointers.clear(); if (!to_serialize.empty()) { - MetadataWriter writer(manager, &storage_pointers); + MetadataWriter metadata_writer(manager, &storage_pointers); // now serialize the actual version information - writer.Write(to_serialize.size()); + metadata_writer.Write(to_serialize.size()); for (auto &entry : to_serialize) { auto &vector_idx = entry.first; auto &chunk_info = entry.second.get(); - writer.Write(vector_idx); - chunk_info.Write(writer); + metadata_writer.Write(vector_idx); + chunk_info.Write(metadata_writer, options.transaction_id); } - writer.Flush(); + metadata_writer.Flush(); } - has_unserialized_changes = false; + if (uncheckpointed_delete_commit.IsValid() && uncheckpointed_delete_commit.GetIndex() <= options.transaction_id) { + // the last checkpointed id was either before or on the transaction we are checkpointing + // nothing to checkpoint in future commits until more deletes appear + uncheckpointed_delete_commit = optional_idx(); + } return storage_pointers; } @@ -314,18 +314,18 @@ shared_ptr RowVersionManager::Deserialize(MetaBlockPointer de version_info->FillVectorInfo(vector_index); version_info->vector_info[vector_index] = ChunkInfo::Read(version_info->GetAllocator(), source); } - version_info->has_unserialized_changes = false; + version_info->uncheckpointed_delete_commit = optional_idx(); return version_info; } bool RowVersionManager::HasUnserializedChanges() { lock_guard lock(version_lock); - return has_unserialized_changes; + return 
uncheckpointed_delete_commit.IsValid(); } vector RowVersionManager::GetStoragePointers() { lock_guard lock(version_lock); - D_ASSERT(!has_unserialized_changes); + D_ASSERT(!uncheckpointed_delete_commit.IsValid()); return storage_pointers; } diff --git a/src/duckdb/src/storage/table_index_list.cpp b/src/duckdb/src/storage/table_index_list.cpp index 49d81398c..186db5143 100644 --- a/src/duckdb/src/storage/table_index_list.cpp +++ b/src/duckdb/src/storage/table_index_list.cpp @@ -9,6 +9,7 @@ #include "duckdb/planner/expression_binder/index_binder.hpp" #include "duckdb/storage/data_table.hpp" #include "duckdb/storage/table/data_table_info.hpp" +#include "duckdb/storage/table/scan_state.hpp" namespace duckdb { @@ -214,14 +215,19 @@ void TableIndexList::VerifyForeignKey(optional_ptr storage, c // Check whether the chunk can be inserted in or deleted from the referenced table storage. auto entry = FindForeignKeyIndex(fk_keys, fk_type); auto &index = *entry->index; + lock_guard guard(entry->lock); D_ASSERT(index.IsBound()); - optional_ptr delete_index; + IndexAppendInfo index_append_info; if (storage) { - delete_index = storage->delete_indexes.Find(index.GetIndexName()); + auto delete_index = storage->delete_indexes.Find(index.GetIndexName()); + if (delete_index) { + index_append_info.delete_indexes.push_back(*delete_index); + } + } + if (entry->removed_data_during_checkpoint) { + index_append_info.delete_indexes.push_back(*entry->removed_data_during_checkpoint); } - IndexAppendInfo index_append_info(IndexAppendMode::DEFAULT, delete_index); - lock_guard entry_lock(entry->lock); auto &main_index = index.Cast(); main_index.VerifyConstraint(chunk, index_append_info, conflict_manager); if (entry->added_data_during_checkpoint) { @@ -263,7 +269,7 @@ vector TableIndexList::SerializeToDisk(QueryContext context, c return infos; } -void TableIndexList::MergeCheckpointDeltas(transaction_t checkpoint_id) { +void TableIndexList::MergeCheckpointDeltas(DataTable &storage, transaction_t 
checkpoint_id) { lock_guard lock(index_entries_lock); for (auto &entry : index_entries) { // merge any data appended to the index while the checkpoint was running @@ -272,12 +278,100 @@ void TableIndexList::MergeCheckpointDeltas(transaction_t checkpoint_id) { continue; } lock_guard guard(entry->lock); + auto &bound_index = index.Cast(); + vector> delta_indexes; + vector delta_index_is_delete; + if (entry->removed_data_during_checkpoint) { + delta_indexes.push_back(*entry->removed_data_during_checkpoint); + delta_index_is_delete.push_back(true); + } if (entry->added_data_during_checkpoint) { - // we have written data here while checkpointing - merge it into the main index - auto &bound_index = index.Cast(); - bound_index.MergeIndexes(*entry->added_data_during_checkpoint); - entry->added_data_during_checkpoint.reset(); + delta_indexes.push_back(*entry->added_data_during_checkpoint); + delta_index_is_delete.push_back(false); + } + for (idx_t i = 0; i < delta_indexes.size(); i++) { + auto &delta_index = delta_indexes[i].get(); + auto is_delete = delta_index_is_delete[i]; + // FIXME: this should use an optimized (removal) merge instead of doing fetches in the base table + // fetch all row-ids to delete + auto &art = delta_index.Cast(); + auto scan_state = art.InitializeFullScan(); + set all_row_ids; + art.Scan(*scan_state, NumericLimits::Maximum(), all_row_ids); + + // FIXME: this is mostly copied over from RowGroupCollection::RemoveFromIndexes, but we shouldn't be doing + // this anyway... 
+ if (!all_row_ids.empty()) { + // in a loop fetch the + Vector row_identifiers(LogicalType::BIGINT); + auto row_ids = FlatVector::GetData(row_identifiers); + idx_t count = 0; + + auto indexed_column_id_set = bound_index.GetColumnIdSet(); + vector column_ids; + for (auto &col : indexed_column_id_set) { + column_ids.emplace_back(col); + } + sort(column_ids.begin(), column_ids.end()); + + auto types = storage.GetTypes(); + vector column_types; + for (auto &col : column_ids) { + column_types.push_back(types[col.GetPrimaryIndex()]); + } + + DataChunk fetch_chunk; + fetch_chunk.Initialize(Allocator::DefaultAllocator(), column_types); + + ColumnFetchState state; + state.fetch_type = FetchType::FORCE_FETCH; + + DataChunk result_chunk; + auto fetched_columns = vector(types.size(), false); + result_chunk.Initialize(Allocator::DefaultAllocator(), types, fetched_columns); + // Now set all to-be-fetched columns. + for (auto &col : indexed_column_id_set) { + fetched_columns[col] = true; + } + auto last_row_id = *all_row_ids.rbegin(); + for (auto &row_id : all_row_ids) { + row_ids[count++] = row_id; + if (row_id == last_row_id || count == STANDARD_VECTOR_SIZE) { + fetch_chunk.Reset(); + storage.FetchCommitted(fetch_chunk, column_ids, row_identifiers, count, state); + + // Reference the necessary columns of the fetch_chunk. 
+ idx_t fetch_idx = 0; + for (idx_t j = 0; j < types.size(); j++) { + if (fetched_columns[j]) { + result_chunk.data[j].Reference(fetch_chunk.data[fetch_idx++]); + continue; + } + result_chunk.data[j].Reference(Value(types[j])); + } + result_chunk.SetCardinality(fetch_chunk); + if (is_delete) { + auto delete_count = bound_index.TryDelete(result_chunk, row_identifiers); + if (delete_count != result_chunk.size()) { + throw InternalException("Failed to remove all rows while merging checkpoint deltas - " + "this signifies a bug or broken index\nChunk: %s", + result_chunk.ToString()); + } + } else { + auto error = bound_index.Append(result_chunk, row_identifiers); + if (error.HasError()) { + throw InternalException("Failed to append while merging checkpoint deltas - this " + "signifies a bug or broken index: %s", + error.Message()); + } + } + count = 0; + } + } + } } + entry->removed_data_during_checkpoint.reset(); + entry->added_data_during_checkpoint.reset(); entry->last_written_checkpoint = checkpoint_id; } } diff --git a/src/duckdb/src/transaction/cleanup_state.cpp b/src/duckdb/src/transaction/cleanup_state.cpp index 1a07bf6ee..e6abec9ab 100644 --- a/src/duckdb/src/transaction/cleanup_state.cpp +++ b/src/duckdb/src/transaction/cleanup_state.cpp @@ -14,10 +14,10 @@ namespace duckdb { -CleanupState::CleanupState(const QueryContext &context, transaction_t lowest_active_transaction, +CleanupState::CleanupState(DuckTransaction &transaction, transaction_t lowest_active_transaction, ActiveTransactionState transaction_state) : lowest_active_transaction(lowest_active_transaction), transaction_state(transaction_state), - index_data_remover(context, IndexRemovalType::DELETED_ROWS_IN_USE) { + index_data_remover(transaction, QueryContext(), IndexRemovalType::DELETED_ROWS_IN_USE) { } void CleanupState::CleanupEntry(UndoFlags type, data_ptr_t data) { diff --git a/src/duckdb/src/transaction/commit_state.cpp b/src/duckdb/src/transaction/commit_state.cpp index 1819e0c46..be4806985 
100644 --- a/src/duckdb/src/transaction/commit_state.cpp +++ b/src/duckdb/src/transaction/commit_state.cpp @@ -18,14 +18,15 @@ #include "duckdb/transaction/delete_info.hpp" #include "duckdb/transaction/update_info.hpp" #include "duckdb/transaction/duck_transaction.hpp" +#include "duckdb/transaction/duck_transaction_manager.hpp" namespace duckdb { //===--------------------------------------------------------------------===// // IndexDataRemover //===--------------------------------------------------------------------===// -IndexDataRemover::IndexDataRemover(QueryContext context, IndexRemovalType removal_type) - : context(context), removal_type(removal_type) { +IndexDataRemover::IndexDataRemover(DuckTransaction &transaction_p, QueryContext context, IndexRemovalType removal_type) + : transaction(transaction_p), context(context), removal_type(removal_type) { } void IndexDataRemover::PushDelete(DeleteInfo &info) { @@ -74,11 +75,13 @@ void IndexDataRemover::Flush(DataTable &table, row_t *row_numbers, idx_t count) // set up the row identifiers vector Vector row_identifiers(LogicalType::ROW_TYPE, data_ptr_cast(row_numbers)); + auto active_checkpoint = transaction.GetTransactionManager().Cast().GetActiveCheckpoint(); + auto checkpoint_id = active_checkpoint == MAX_TRANSACTION_ID ? optional_idx() : active_checkpoint; // delete the tuples from all the indexes. // If there is any issue with removal, a FatalException must be thrown since there may be a corruption of // data, hence the transaction cannot be guaranteed. try { - table.RemoveFromIndexes(context, row_identifiers, count, removal_type); + table.RemoveFromIndexes(context, row_identifiers, count, removal_type, checkpoint_id); } catch (std::exception &ex) { throw FatalException(ErrorData(ex).Message()); } catch (...) 
{ @@ -94,7 +97,8 @@ void IndexDataRemover::Flush(DataTable &table, row_t *row_numbers, idx_t count) CommitState::CommitState(DuckTransaction &transaction_p, transaction_t commit_id, ActiveTransactionState transaction_state, CommitMode commit_mode) : transaction(transaction_p), commit_id(commit_id), - index_data_remover(*transaction.context.lock(), GetIndexRemovalType(transaction_state, commit_mode)) { + index_data_remover(transaction, *transaction.context.lock(), + GetIndexRemovalType(transaction_state, commit_mode)) { } IndexRemovalType CommitState::GetIndexRemovalType(ActiveTransactionState transaction_state, CommitMode commit_mode) { diff --git a/src/duckdb/src/transaction/duck_transaction.cpp b/src/duckdb/src/transaction/duck_transaction.cpp index 53bf74d6c..d4c73c5cf 100644 --- a/src/duckdb/src/transaction/duck_transaction.cpp +++ b/src/duckdb/src/transaction/duck_transaction.cpp @@ -259,6 +259,9 @@ ErrorData DuckTransaction::Commit(AttachedDatabase &db, CommitInfo &commit_info, try { storage->Commit(commit_state.get()); undo_buffer.Commit(iterator_state, commit_info); + // if (DebugForceAbortCommit()) { + // throw InvalidInputException("Force revert"); + // } if (commit_state) { // if we have written to the WAL - flush after the commit has been successful commit_state->FlushCommit(); @@ -289,30 +292,36 @@ void DuckTransaction::Cleanup(transaction_t lowest_active_transaction) { } void DuckTransaction::SetModifications(DatabaseModificationType type) { - if (write_lock) { - // already have a write lock - return; + if (!checkpoint_lock) { + bool require_write_lock = false; + require_write_lock = require_write_lock || type.UpdateData(); + require_write_lock = require_write_lock || type.AlterTable(); + require_write_lock = require_write_lock || type.CreateCatalogEntry(); + require_write_lock = require_write_lock || type.DropCatalogEntry(); + require_write_lock = require_write_lock || type.Sequence(); + require_write_lock = require_write_lock || type.CreateIndex(); 
+ + if (require_write_lock) { + // obtain a shared checkpoint lock to prevent concurrent checkpoints while this transaction is running + checkpoint_lock = GetTransactionManager().SharedCheckpointLock(); + } } - bool require_write_lock = false; - require_write_lock = require_write_lock || type.DeleteData(); - require_write_lock = require_write_lock || type.UpdateData(); - require_write_lock = require_write_lock || type.AlterTable(); - require_write_lock = require_write_lock || type.CreateCatalogEntry(); - require_write_lock = require_write_lock || type.DropCatalogEntry(); - require_write_lock = require_write_lock || type.Sequence(); - require_write_lock = require_write_lock || type.CreateIndex(); - - if (require_write_lock) { - // obtain a shared checkpoint lock to prevent concurrent checkpoints while this transaction is running - write_lock = GetTransactionManager().SharedCheckpointLock(); + if (!vacuum_lock) { + bool require_vacuum_lock = false; + require_vacuum_lock = require_vacuum_lock || type.InsertData(); + require_vacuum_lock = require_vacuum_lock || type.DeleteData(); + + if (require_vacuum_lock) { + vacuum_lock = GetTransactionManager().SharedVacuumLock(); + } } } unique_ptr DuckTransaction::TryGetCheckpointLock() { - if (!write_lock) { + if (!checkpoint_lock) { return GetTransactionManager().TryGetCheckpointLock(); } else { - return GetTransactionManager().TryUpgradeCheckpointLock(*write_lock); + return GetTransactionManager().TryUpgradeCheckpointLock(*checkpoint_lock); } } diff --git a/src/duckdb/src/transaction/duck_transaction_manager.cpp b/src/duckdb/src/transaction/duck_transaction_manager.cpp index 49128adee..29d2fbde3 100644 --- a/src/duckdb/src/transaction/duck_transaction_manager.cpp +++ b/src/duckdb/src/transaction/duck_transaction_manager.cpp @@ -267,6 +267,14 @@ unique_ptr DuckTransactionManager::TryGetCheckpointLock() { return checkpoint_lock.TryGetExclusiveLock(); } +unique_ptr DuckTransactionManager::SharedVacuumLock() { + return 
vacuum_lock.GetSharedLock(); +} + +unique_ptr DuckTransactionManager::TryGetVacuumLock() { + return vacuum_lock.TryGetExclusiveLock(); +} + transaction_t DuckTransactionManager::GetCommitTimestamp() { return current_start_timestamp++; } diff --git a/src/duckdb/src/transaction/undo_buffer.cpp b/src/duckdb/src/transaction/undo_buffer.cpp index 8adb8e2de..29f036934 100644 --- a/src/duckdb/src/transaction/undo_buffer.cpp +++ b/src/duckdb/src/transaction/undo_buffer.cpp @@ -181,7 +181,7 @@ void UndoBuffer::Cleanup(transaction_t lowest_active_transaction) { // the chunks) // (2) there is no active transaction with start_id < commit_id of this // transaction - CleanupState state(QueryContext(), lowest_active_transaction, active_transaction_state); + CleanupState state(transaction, lowest_active_transaction, active_transaction_state); UndoBuffer::IteratorState iterator_state; IterateEntries(iterator_state, [&](UndoFlags type, data_ptr_t data) { state.CleanupEntry(type, data); }); } diff --git a/src/duckdb/third_party/fsst/libfsst.cpp b/src/duckdb/third_party/fsst/libfsst.cpp index b8394a446..62d89702f 100644 --- a/src/duckdb/third_party/fsst/libfsst.cpp +++ b/src/duckdb/third_party/fsst/libfsst.cpp @@ -18,21 +18,23 @@ #include "libfsst.hpp" #include "duckdb/common/unique_ptr.hpp" +namespace libfsst { Symbol concat(Symbol a, Symbol b) { Symbol s; u32 length = a.length()+b.length(); if (length > Symbol::maxLength) length = Symbol::maxLength; s.set_code_len(FSST_CODE_MASK, length); - s.val.num = (b.val.num << (8*a.length())) | a.val.num; + s.store_num((b.load_num() << (8*a.length())) | a.load_num()); return s; } +} // namespace libfsst namespace std { template <> -class hash { -public: - size_t operator()(const QSymbol& q) const { - uint64_t k = q.symbol.val.num; +class hash { + public: + size_t operator()(const libfsst::QSymbol& q) const { + uint64_t k = q.symbol.load_num(); const uint64_t m = 0xc6a4a7935bd1e995; const int r = 47; uint64_t h = 0x8445d61a4e774912 ^ (8*m); @@ 
-49,6 +51,7 @@ class hash { }; } +namespace libfsst { bool isEscapeCode(u16 pos) { return pos < FSST_CODE_BASE; } std::ostream& operator<<(std::ostream& out, const Symbol& s) { @@ -57,7 +60,7 @@ std::ostream& operator<<(std::ostream& out, const Symbol& s) { return out; } -SymbolTable *buildSymbolTable(Counters& counters, vector line, size_t len[], bool zeroTerminated=false) { +SymbolTable *buildSymbolTable(Counters& counters, vector line, const size_t len[], bool zeroTerminated=false) { SymbolTable *st = new SymbolTable(), *bestTable = new SymbolTable(); int bestGain = (int) -FSST_SAMPLEMAXSZ; // worst case (everything exception) size_t sampleFrac = 128; @@ -70,8 +73,8 @@ SymbolTable *buildSymbolTable(Counters& counters, vector line, size_t len[] u16 byteHisto[256]; memset(byteHisto, 0, sizeof(byteHisto)); for(size_t i=0; iterminator = 256; @@ -91,15 +94,14 @@ SymbolTable *buildSymbolTable(Counters& counters, vector line, size_t len[] int gain = 0; for(size_t i=0; i sampleFrac) continue; } if (cur < end) { - u8* start = cur; u16 code2 = 255, code1 = st->findLongestSymbol(cur, end); cur += st->symbols[code1].length(); gain += (int) (st->symbols[code1].length()-(1+isEscapeCode(code1))); @@ -124,7 +126,7 @@ SymbolTable *buildSymbolTable(Counters& counters, vector line, size_t len[] Symbol s = st->hashTab[idx]; code2 = st->shortCodes[word & 0xFFFF] & FSST_CODE_MASK; word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) & (s.val.num == word)) { + if ((s.icl < FSST_ICL_FREE) & (s.load_num() == word)) { code2 = s.code(); cur += s.length(); } else if (code2 >= FSST_CODE_BASE) { @@ -188,10 +190,11 @@ SymbolTable *buildSymbolTable(Counters& counters, vector line, size_t len[] addOrInc(cands, s1, ((s1.length()==1)?8LL:1LL)*cnt1); if (sampleFrac >= 128 || // last round we do not create new (combined) symbols - s1.length() == Symbol::maxLength || // symbol cannot be extended - s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the 
terminator byte + s1.length() == Symbol::maxLength || // symbol cannot be extended + s1.val.str[0] == st->terminator) { // multi-byte symbols cannot contain the terminator byte continue; } + for (u32 pos2=0; pos2nSymbols; pos2++) { u32 cnt2 = counters.count2GetNext(pos1, pos2); // may advance pos2!! if (!cnt2) continue; @@ -205,7 +208,7 @@ SymbolTable *buildSymbolTable(Counters& counters, vector line, size_t len[] } // insert candidates into priority queue (by gain) - auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.val.num > q2.symbol.val.num); }; + auto cmpGn = [](const QSymbol& q1, const QSymbol& q2) { return (q1.gain < q2.gain) || (q1.gain == q2.gain && q1.symbol.load_num() > q2.symbol.load_num()); }; priority_queue,decltype(cmpGn)> pq(cmpGn); for (auto& q : cands) pq.push(q); @@ -244,11 +247,11 @@ SymbolTable *buildSymbolTable(Counters& counters, vector line, size_t len[] // optimized adaptive *scalar* compression method static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_t lenIn[], u8* strIn[], size_t size, u8* out, size_t lenOut[], u8* strOut[], bool noSuffixOpt, bool avoidBranch) { - u8 *cur = NULL, *end = NULL, *lim = out + size; + const u8 *cur = NULL, *end = NULL, *lim = out + size; size_t curLine, suffixLim = symbolTable.suffixLim; u8 byteLim = symbolTable.nSymbols + symbolTable.zeroTerminated - symbolTable.lenHisto[0]; - u8 buf[512+7] = {}; /* +7 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */ + u8 buf[512+8] = {}; /* +8 sentinel is to avoid 8-byte unaligned-loads going beyond 511 out-of-bounds */ // three variants are possible. 
dead code falls away since the bool arguments are constants auto compressVariant = [&](bool noSuffixOpt, bool avoidBranch) { @@ -264,7 +267,7 @@ static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_ Symbol s = symbolTable.hashTab[idx]; out[1] = (u8) word; // speculatively write out escaped byte word &= (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); - if ((s.icl < FSST_ICL_FREE) && s.val.num == word) { + if ((s.icl < FSST_ICL_FREE) && s.load_num() == word) { *out++ = (u8) s.code(); cur += s.length(); } else if (avoidBranch) { // could be a 2-byte or 1-byte code, or miss @@ -320,19 +323,20 @@ static inline size_t compressBulk(SymbolTable &symbolTable, size_t nlines, size_ #define FSST_SAMPLELINE ((size_t) 512) // quickly select a uniformly random set of lines such that we have between [FSST_SAMPLETARGET,FSST_SAMPLEMAXSZ) string bytes -vector makeSample(u8* sampleBuf, u8* strIn[], size_t *lenIn, size_t nlines, +vector makeSample(u8* sampleBuf, u8* strIn[], size_t *lenIn, size_t nlines, duckdb::unique_ptr>& sample_len_out) { size_t totSize = 0; - vector sample; + vector sample; for(size_t i=0; i>(new vector()); sample_len_out->reserve(nlines + FSST_SAMPLEMAXSZ/FSST_SAMPLELINE); @@ -365,9 +369,9 @@ vector makeSample(u8* sampleBuf, u8* strIn[], size_t *lenIn, size_t nlines, extern "C" duckdb_fsst_encoder_t* duckdb_fsst_create(size_t n, size_t lenIn[], u8 *strIn[], int zeroTerminated) { u8* sampleBuf = new u8[FSST_SAMPLEMAXSZ]; duckdb::unique_ptr> sample_sizes; - vector sample = makeSample(sampleBuf, strIn, lenIn, n?n:1, sample_sizes); // careful handling of input to get a right-size and representative sample + vector sample = makeSample(sampleBuf, strIn, lenIn, n?n:1, sample_sizes); // careful handling of input to get a right-size and representative sample Encoder *encoder = new Encoder(); - size_t* sampleLen = sample_sizes ? sample_sizes->data() : &lenIn[0]; + const size_t* sampleLen = sample_sizes ? 
sample_sizes->data() : &lenIn[0]; encoder->symbolTable = shared_ptr(buildSymbolTable(encoder->counters, sample, sampleLen, zeroTerminated)); delete[] sampleBuf; return (duckdb_fsst_encoder_t*) encoder; @@ -403,6 +407,8 @@ extern "C" u32 duckdb_fsst_export(duckdb_fsst_encoder_t *encoder, u8 *buf) { (((u64) e->symbolTable->nSymbols) << 8) | FSST_ENDIAN_MARKER; // least significant byte is nonzero + version = swap64_if_be(version); // ensure version is little-endian encoded + /* do not assume unaligned reads here */ memcpy(buf, &version, 8); buf[8] = e->symbolTable->zeroTerminated; @@ -427,6 +433,8 @@ extern "C" u32 duckdb_fsst_import(duckdb_fsst_decoder_t *decoder, u8 *buf) { // version field (first 8 bytes) is now there just for future-proofness, unused still (skipped) memcpy(&version, buf, 8); + version = swap64_if_be(version); // version is always little-endian encoded + if ((version>>32) != FSST_VERSION) return 0; decoder->zeroTerminated = buf[8]&1; memcpy(lenHisto, buf+9, 8); @@ -481,7 +489,9 @@ inline size_t _compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn size_t compressAuto(Encoder *e, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[], int simd) { return _compressAuto(e, nlines, lenIn, strIn, size, output, lenOut, strOut, simd); } +} // namespace libfsst +using namespace libfsst; // the main compression function (everything automatic) extern "C" size_t duckdb_fsst_compress(duckdb_fsst_encoder_t *encoder, size_t nlines, size_t lenIn[], u8 *strIn[], size_t size, u8 *output, size_t *lenOut, u8 *strOut[]) { // to be faster than scalar, simd needs 64 lines or more of length >=12; or fewer lines, but big ones (totLen > 32KB) diff --git a/src/duckdb/third_party/fsst/libfsst.hpp b/src/duckdb/third_party/fsst/libfsst.hpp index 0d556386a..fd33ce581 100644 --- a/src/duckdb/third_party/fsst/libfsst.hpp +++ b/src/duckdb/third_party/fsst/libfsst.hpp @@ -37,16 +37,17 @@ using namespace std; #include "fsst.h" 
// the official FSST API -- also usable by C mortals /* unsigned integers */ +namespace libfsst { typedef uint8_t u8; typedef uint16_t u16; typedef uint32_t u32; typedef uint64_t u64; +} // namespace libfsst -inline uint64_t fsst_unaligned_load(u8 const* V) { - uint64_t Ret; - memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible) - return Ret; -} +#if UINTPTR_MAX == 0xffffffffU +// We're on a 32-bit platform +#define NONOPT_FSST +#endif #define FSST_ENDIAN_MARKER ((u64) 1) #define FSST_VERSION_20190218 20190218 @@ -63,6 +64,29 @@ inline uint64_t fsst_unaligned_load(u8 const* V) { #define FSST_CODE_MAX (1UL<<FSST_CODE_BITS) +static inline uint64_t swap64_if_be(uint64_t v) { +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + #if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) + return __builtin_bswap64(v); + #else + return ((v&0xff00000000000000ull) >> 56) | ((v&0x00ff000000000000ull) >> 40) | \ + ((v&0x0000ff0000000000ull) >> 24) | ((v&0x000000ff00000000ull) >> 8) | \ + ((v&0x00000000ff000000ull) << 8) | ((v&0x0000000000ff0000ull) << 24) | \ + ((v&0x000000000000ff00ull) << 40) | ((v&0x00000000000000ffull) << 56); + + #endif +#else + return v; // little-endian (or unknown), so no swap needed +#endif +} + +inline uint64_t fsst_unaligned_load(u8 const* V) { + uint64_t Ret; + memcpy(&Ret, V, sizeof(uint64_t)); // compiler will generate efficient code (unaligned load, where possible) + return swap64_if_be(Ret); +} + struct Symbol { static const unsigned maxLength = 8; @@ -74,9 +98,9 @@ struct Symbol { Symbol() : icl(0) { val.num = 0; } - explicit Symbol(u8 c, u16 code) : icl((1<<28)|(code<<16)|56) { val.num = c; } // single-char symbol + explicit Symbol(u8 c, u16 code) : icl((1<<28)|(code<<16)|56) { store_num(c); } // single-char symbol explicit Symbol(const char* begin, const char* end) : Symbol(begin, (u32) (end-begin)) {} - explicit Symbol(u8* begin, u8* end) : Symbol((const char*)begin, (u32) (end-begin)) {} + explicit Symbol(const u8* begin, const u8* end) : Symbol((const char*)begin, (u32) (end-begin)) {} explicit Symbol(const char* input, u32 len)
{ val.num = 0; if (len>=8) { @@ -89,18 +113,21 @@ struct Symbol { } void set_code_len(u32 code, u32 len) { icl = (len<<28)|(code<<16)|((8-len)*8); } + u64 load_num() const { return swap64_if_be(val.num); } + void store_num(u64 v) { val.num = swap64_if_be(v); } + u32 length() const { return (u32) (icl >> 28); } u16 code() const { return (icl >> 16) & FSST_CODE_MASK; } u32 ignoredBits() const { return (u32) icl; } - u8 first() const { assert( length() >= 1); return 0xFF & val.num; } - u16 first2() const { assert( length() >= 2); return 0xFFFF & val.num; } + u8 first() const { assert( length() >= 1); return 0xFF & load_num(); } + u16 first2() const { assert( length() >= 2); return 0xFFFF & load_num(); } #define FSST_HASH_LOG2SIZE 10 #define FSST_HASH_PRIME 2971215073LL #define FSST_SHIFT 15 #define FSST_HASH(w) (((w)*FSST_HASH_PRIME)^(((w)*FSST_HASH_PRIME)>>FSST_SHIFT)) - size_t hash() const { size_t v = 0xFFFFFF & val.num; return FSST_HASH(v); } // hash on the next 3 bytes + size_t hash() const { size_t v = 0xFFFFFF & load_num(); return FSST_HASH(v); } // hash on the next 3 bytes }; // Symbol that can be put in a queue, ordered on gain @@ -117,7 +144,7 @@ struct QSymbol{ // two phases of compression, before and after optimize(): // // (1) to encode values we probe (and maintain) three datastructures: -// - u16 byteCodes[65536] array at the position of the next byte (s.length==1) +// - u16 byteCodes[256] array at the position of the next byte (s.length==1) // - u16 shortCodes[65536] array at the position of the next twobyte pattern (s.length==2) // - Symbol hashtable[1024] (keyed by the next three bytes, ie for s.length>2), // this search will yield a u16 code, it points into Symbol symbols[]. 
You always find a hit, because the first 256 codes are @@ -215,7 +242,7 @@ struct SymbolTable { bool taken = (hashTab[idx].icl < FSST_ICL_FREE); if (taken) return false; // collision in hash table hashTab[idx].icl = s.icl; - hashTab[idx].val.num = s.val.num & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl); + hashTab[idx].store_num(s.load_num() & (0xFFFFFFFFFFFFFFFF >> (u8) s.icl)); return true; } bool add(Symbol s) { @@ -236,8 +263,8 @@ struct SymbolTable { /// Find longest expansion, return code (= position in symbol table) u16 findLongestSymbol(Symbol s) const { size_t idx = s.hash() & (hashTabSize-1); - if (hashTab[idx].icl <= s.icl && hashTab[idx].val.num == (s.val.num & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) { - return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol + if (hashTab[idx].icl <= s.icl && hashTab[idx].load_num() == (s.load_num() & (0xFFFFFFFFFFFFFFFF >> ((u8) hashTab[idx].icl)))) { + return (hashTab[idx].icl>>16) & FSST_CODE_MASK; // matched a long symbol } if (s.length() >= 2) { u16 code = shortCodes[s.first2()] & FSST_CODE_MASK; @@ -245,7 +272,7 @@ struct SymbolTable { } return byteCodes[s.first()] & FSST_CODE_MASK; } - u16 findLongestSymbol(u8* cur, u8* end) const { + u16 findLongestSymbol(const u8* cur, const u8* end) const { return findLongestSymbol(Symbol(cur,end)); // represent the string as a temporary symbol } @@ -380,7 +407,7 @@ struct Counters { } u32 count1GetNext(u32 &pos1) { // note: we will advance pos1 to the next nonzero counter in register range // read 16-bits single symbol counter, split into two 8-bits numbers (count1Low, count1High), while skipping over zeros - u64 high = fsst_unaligned_load(&count1High[pos1]); + u64 high = fsst_unaligned_load(&count1High[pos1]); // note: this reads 8 subsequent counters [pos1..pos1+7] u32 zero = high?(__builtin_ctzll(high)>>3):7UL; // number of zero bytes high = (high >> (zero << 3)) & 255; // advance to nonzero counter @@ -393,7 +420,7 @@ struct Counters { } u32 
count2GetNext(u32 pos1, u32 &pos2) { // note: we will advance pos2 to the next nonzero counter in register range // read 12-bits pairwise symbol counter, split into low 8-bits and high 4-bits number while skipping over zeros - u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); + u64 high = fsst_unaligned_load(&count2High[pos1][pos2>>1]); // note: this reads 16 subsequent counters [pos2..pos2+15] high >>= ((pos2&1) << 2); // odd pos2: ignore the lowest 4 bits & we see only 15 counters u32 zero = high?(__builtin_ctzll(high)>>2):(15UL-(pos2&1UL)); // number of zero 4-bits counters @@ -434,5 +461,6 @@ struct SIMDjob { }; // C++ fsst-compress function with some more control of how the compression happens (algorithm flavor, simd unroll degree) -size_t compressImpl(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); -size_t compressAuto(Encoder *encoder, size_t n, size_t lenIn[], u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +size_t compressImpl(Encoder *encoder, size_t n, const size_t lenIn[], const u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], bool noSuffixOpt, bool avoidBranch, int simd); +size_t compressAuto(Encoder *encoder, size_t n, const size_t lenIn[], const u8 *strIn[], size_t size, u8 * output, size_t *lenOut, u8 *strOut[], int simd); +} // namespace libfsst diff --git a/src/duckdb/ub_src_optimizer.cpp b/src/duckdb/ub_src_optimizer.cpp index 0cbee13d3..89c5e7a6c 100644 --- a/src/duckdb/ub_src_optimizer.cpp +++ b/src/duckdb/ub_src_optimizer.cpp @@ -12,6 +12,8 @@ #include "src/optimizer/compressed_materialization.cpp" +#include "src/optimizer/count_window_elimination.cpp" + #include "src/optimizer/cse_optimizer.cpp" #include "src/optimizer/cte_filter_pusher.cpp"