From 608ea706537bc7020f2dda2514027aededccedbc Mon Sep 17 00:00:00 2001 From: liutang123 Date: Fri, 30 Jan 2026 00:06:32 +0800 Subject: [PATCH 1/2] [fix](parquet) Don't decompress dict page when dict page is empty When a string column's data are all null, the dict page may be empty. The error message is as follows: INTERNAL_ERROR]Read parquet file hdfs://HDFS82742/ydbi/original/server/tlbbgl/auction_zstd/dt=2024-12-13/084cadfc5200b4ad-c2b2568a00000045_1132749056_data.0.parq failed, reason = [INVALID_ARGUMENT]ZSTD_decompressDCtx error: Unknown frame descriptor. cur path: xxx We needn't decompress dict page data when dict page is empty and just cache empty data as decompressed data. --- .../parquet/vparquet_column_chunk_reader.cpp | 20 +++++++++++++++++-- .../format/parquet/vparquet_page_reader.h | 1 + 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 61568dc4f4c901..418f54ea933fd6 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -417,12 +417,21 @@ Status ColumnChunkReader::_decode_dict_page() { if (!dict_loaded) { // Load and decompress dictionary page from file if (_block_compress_codec != nullptr) { + auto dict_num = header->dictionary_page_header.num_values; + if (dict_num == 0 && uncompressed_size != 0) { + return Status::IOError( + "Dictionary page's num_values is {} but uncompressed_size is {}", dict_num, + uncompressed_size); + } Slice compressed_data; - RETURN_IF_ERROR(_page_reader->get_page_data(compressed_data)); Slice dict_slice(dict_data.get(), uncompressed_size); - RETURN_IF_ERROR(_block_compress_codec->decompress(compressed_data, &dict_slice)); + if (dict_num != 0) { + RETURN_IF_ERROR(_page_reader->get_page_data(compressed_data)); + RETURN_IF_ERROR(_block_compress_codec->decompress(compressed_data, 
&dict_slice)); + } // Decide whether to cache decompressed or compressed dictionary based on threshold + // If uncompressed_page_size == 0, should_cache_decompressed will return true bool cache_payload_decompressed = should_cache_decompressed(header, _metadata); if (_page_read_ctx.enable_parquet_file_page_cache && @@ -431,10 +440,12 @@ Status ColumnChunkReader::_decode_dict_page() { std::vector empty_levels; // Dictionary pages don't have levels if (cache_payload_decompressed) { // Cache the decompressed dictionary page + // If dict_num == 0, `dict_slice` will be empty _insert_page_into_cache(empty_levels, dict_slice); _chunk_statistics.page_cache_decompressed_write_counter += 1; } else { if (config::enable_parquet_cache_compressed_pages) { + DCHECK(!compressed_data.empty()); // Cache the compressed dictionary page _insert_page_into_cache(empty_levels, Slice(compressed_data.data, compressed_data.size)); @@ -442,6 +453,11 @@ Status ColumnChunkReader::_decode_dict_page() { } } } + // `get_page_data` not called, we should skip the page data + // Because `_insert_page_into_cache` will use _page_reader, we should exec `skip_page_data` after `_insert_page_into_cache` + if (dict_num == 0) { + _page_reader->skip_page_data(); + } } else { Slice dict_slice; RETURN_IF_ERROR(_page_reader->get_page_data(dict_slice)); diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h index 9aa5ba3171c851..5ffed29ed2f6c1 100644 --- a/be/src/vec/exec/format/parquet/vparquet_page_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h @@ -65,6 +65,7 @@ inline bool should_cache_decompressed(const tparquet::PageHeader* header, const tparquet::ColumnMetaData& metadata) { if (header->compressed_page_size <= 0) return true; if (metadata.codec == tparquet::CompressionCodec::UNCOMPRESSED) return true; + if (header->uncompressed_page_size == 0) return true; double ratio = static_cast(header->uncompressed_page_size) / 
static_cast(header->compressed_page_size); From 04583b01f2e089eb575bf90bb7d2d671bdd3b80b Mon Sep 17 00:00:00 2001 From: liutang123 Date: Sat, 31 Jan 2026 00:51:35 +0800 Subject: [PATCH 2/2] In FixLengthDictDecoder and ByteArrayDictDecoder, don't exec `ColumnDictI32::insert_many_dict_data` when dict data is empty. --- .../vec/exec/format/parquet/byte_array_dict_decoder.cpp | 9 ++++++++- .../vec/exec/format/parquet/fix_length_dict_decoder.hpp | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp index 49ab5cd584bb09..561d4cce4ca1b2 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp @@ -81,6 +81,13 @@ Status ByteArrayDictDecoder::read_dict_values_to_column(MutableColumnPtr& doris_ MutableColumnPtr ByteArrayDictDecoder::convert_dict_column_to_string_column( const ColumnInt32* dict_column) { auto res = ColumnString::create(); + if (_dict_items.empty()) { + if (dict_column->size() > 0) { + LOG(ERROR) << "Attempt to convert dict column with empty dictionary, column size: " + << dict_column->size(); + } + return res; + } std::vector dict_values(dict_column->size()); const auto& data = dict_column->get_data(); for (size_t i = 0; i < dict_column->size(); ++i) { @@ -106,7 +113,7 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); if (doris_column->is_column_dictionary()) { ColumnDictI32& dict_column = assert_cast(*doris_column); - if (dict_column.dict_size() == 0) { + if (dict_column.dict_size() == 0 && !_dict_items.empty()) { //If the dictionary grows too big, whether in size or number of distinct values, // the encoding will fall back to the plain encoding. 
dict_column.insert_many_dict_data(_dict_items.data(), diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index c47df37c4d15a0..086bc045eb7a92 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -81,7 +81,7 @@ class FixLengthDictDecoder final : public BaseDictDecoder { ColumnSelectVector& select_vector, bool is_dict_filter) { size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); if (doris_column->is_column_dictionary() && - assert_cast(*doris_column).dict_size() == 0) { + assert_cast(*doris_column).dict_size() == 0 && !_dict_items.empty()) { std::vector dict_items; char* dict_item_address = (char*)_dict.get(); @@ -213,6 +213,13 @@ class FixLengthDictDecoder final : public BaseDictDecoder { MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override { auto res = ColumnString::create(); + if (_dict_items.empty()) { + if (dict_column->size() > 0) { + LOG(ERROR) << "Attempt to convert dict column with empty dictionary, column size: " + << dict_column->size(); + } + return res; + } std::vector dict_values; dict_values.reserve(dict_column->size()); const auto& data = dict_column->get_data();