From d5a8bfab18ff41e03cf3a4be4b96358cc729f053 Mon Sep 17 00:00:00 2001 From: duanyyyyyy Date: Wed, 29 Apr 2026 11:25:04 +0800 Subject: [PATCH 1/3] feat(parquet): add metrics for parquet reader observability Add row groups, rows, batch count, and latency metrics to ParquetFileBatchReader, matching the observability level of the ORC reader. Co-Authored-By: Claude Opus 4.6 --- .../parquet/parquet_file_batch_reader.cpp | 15 ++++++++++ .../parquet/parquet_file_batch_reader.h | 4 +++ src/paimon/format/parquet/parquet_metrics.h | 30 +++++++++++++++++++ 3 files changed, 49 insertions(+) create mode 100644 src/paimon/format/parquet/parquet_metrics.h diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index f81c0bdc6..8be95d8c3 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -37,8 +37,10 @@ #include "paimon/common/metrics/metrics_impl.h" #include "paimon/common/utils/arrow/status_utils.h" #include "paimon/common/utils/options_utils.h" +#include "paimon/core/utils/duration.h" #include "paimon/format/parquet/parquet_field_id_converter.h" #include "paimon/format/parquet/parquet_format_defs.h" +#include "paimon/format/parquet/parquet_metrics.h" #include "paimon/format/parquet/parquet_timestamp_converter.h" #include "paimon/format/parquet/predicate_converter.h" #include "paimon/reader/batch_reader.h" @@ -151,6 +153,10 @@ Status ParquetFileBatchReader::SetReadSchema( read_row_groups_ = row_groups; read_column_indices_ = column_indices; + metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_TOTAL, + reader_->GetNumberOfRowGroups()); + metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_FILTERED, row_groups.size()); + PAIMON_ASSIGN_OR_RAISE(std::set ordered_row_groups, reader_->FilterRowGroupsByReadRanges(read_ranges_, read_row_groups_)); return reader_->PrepareForReadingLazy(ordered_row_groups, read_column_indices_); @@ -221,6 +227,7 @@ Result> ParquetFileBatchReader::FilterRowGroupsByBitmap( } Result ParquetFileBatchReader::NextBatch() { + Duration timer; PAIMON_ASSIGN_OR_RAISE(std::shared_ptr batch, reader_->Next()); if (batch == nullptr) { return BatchReader::MakeEofBatch(); @@ -243,6 +250,14 @@ Result ParquetFileBatchReader::NextBatch() { std::unique_ptr c_array = std::make_unique(); std::unique_ptr c_schema = std::make_unique(); PAIMON_RETURN_NOT_OK_FROM_ARROW(arrow::ExportArray(*array, c_array.get(), c_schema.get())); + + read_rows_ += array->length(); + read_batch_count_++; + read_next_batch_latency_ms_ += timer.Get(); + metrics_->SetCounter(ParquetMetrics::READ_ROWS, read_rows_); + metrics_->SetCounter(ParquetMetrics::READ_BATCH_COUNT, read_batch_count_); + metrics_->SetCounter(ParquetMetrics::READ_NEXT_BATCH_LATENCY_MS, read_next_batch_latency_ms_); + return make_pair(std::move(c_array), std::move(c_schema)); } diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.h b/src/paimon/format/parquet/parquet_file_batch_reader.h index 81fb2b8dc..7d18bafe7 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.h +++ b/src/paimon/format/parquet/parquet_file_batch_reader.h @@ -174,6 +174,10 @@ class ParquetFileBatchReader : public PrefetchFileBatchReader { std::shared_ptr metrics_; + uint64_t read_rows_ = 0; + uint64_t read_batch_count_ = 0; + uint64_t read_next_batch_latency_ms_ = 0; + // last time set read schema std::vector read_row_groups_; std::vector read_column_indices_; diff --git a/src/paimon/format/parquet/parquet_metrics.h b/src/paimon/format/parquet/parquet_metrics.h new file mode 100644 index 000000000..d53befbb1 --- /dev/null +++ b/src/paimon/format/parquet/parquet_metrics.h @@ -0,0 +1,30 @@ +/* + * Copyright 2024-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace paimon::parquet { + +class ParquetMetrics { + public: + static inline const char READ_ROW_GROUPS_TOTAL[] = "parquet.read.row_groups.total"; + static inline const char READ_ROW_GROUPS_FILTERED[] = "parquet.read.row_groups.filtered"; + static inline const char READ_ROWS[] = "parquet.read.rows"; + static inline const char READ_BATCH_COUNT[] = "parquet.read.batch.count"; + static inline const char READ_NEXT_BATCH_LATENCY_MS[] = "parquet.read.next_batch.latency.ms"; +}; + +} // namespace paimon::parquet From f3d8d812813e5dc2f3f8c5668fafcf1d7f6c3146 Mon Sep 17 00:00:00 2001 From: duanyyyyyy Date: Wed, 29 Apr 2026 12:55:38 +0800 Subject: [PATCH 2/3] fix(parquet): resolve ParquetMetrics class redefinition Move parquet read metric constants into the existing ParquetMetrics class in parquet_format_defs.h, rather than defining a duplicate class in a new header. Fixes build error introduced in d5a8bfa. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../parquet/parquet_file_batch_reader.cpp | 1 - .../format/parquet/parquet_format_defs.h | 7 +++++ src/paimon/format/parquet/parquet_metrics.h | 30 ------------------- 3 files changed, 7 insertions(+), 31 deletions(-) delete mode 100644 src/paimon/format/parquet/parquet_metrics.h diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 8be95d8c3..50c176b51 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -40,7 +40,6 @@ #include "paimon/core/utils/duration.h" #include "paimon/format/parquet/parquet_field_id_converter.h" #include "paimon/format/parquet/parquet_format_defs.h" -#include "paimon/format/parquet/parquet_metrics.h" #include "paimon/format/parquet/parquet_timestamp_converter.h" #include "paimon/format/parquet/predicate_converter.h" #include "paimon/reader/batch_reader.h" diff --git a/src/paimon/format/parquet/parquet_format_defs.h b/src/paimon/format/parquet/parquet_format_defs.h index 9022dfcf5..be1f7dc1a 100644 --- a/src/paimon/format/parquet/parquet_format_defs.h +++ b/src/paimon/format/parquet/parquet_format_defs.h @@ -61,6 +61,13 @@ static constexpr uint32_t DEFAULT_PARQUET_READ_PREDICATE_NODE_COUNT_LIMIT = 512; class ParquetMetrics { public: static inline const char WRITE_RECORD_COUNT[] = "parquet.write.record.count"; + + // read + static inline const char READ_ROW_GROUPS_TOTAL[] = "parquet.read.row_groups.total"; + static inline const char READ_ROW_GROUPS_FILTERED[] = "parquet.read.row_groups.filtered"; + static inline const char READ_ROWS[] = "parquet.read.rows"; + static inline const char READ_BATCH_COUNT[] = "parquet.read.batch.count"; + static inline const char READ_NEXT_BATCH_LATENCY_MS[] = "parquet.read.next_batch.latency.ms"; }; } // namespace paimon::parquet diff --git a/src/paimon/format/parquet/parquet_metrics.h b/src/paimon/format/parquet/parquet_metrics.h deleted file mode 100644 index d53befbb1..000000000 --- a/src/paimon/format/parquet/parquet_metrics.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright 2024-present Alibaba Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace paimon::parquet { - -class ParquetMetrics { - public: - static inline const char READ_ROW_GROUPS_TOTAL[] = "parquet.read.row_groups.total"; - static inline const char READ_ROW_GROUPS_FILTERED[] = "parquet.read.row_groups.filtered"; - static inline const char READ_ROWS[] = "parquet.read.rows"; - static inline const char READ_BATCH_COUNT[] = "parquet.read.batch.count"; - static inline const char READ_NEXT_BATCH_LATENCY_MS[] = "parquet.read.next_batch.latency.ms"; -}; - -} // namespace paimon::parquet From 9df53c63a3933f32ee6ac9541035539a67b2d832 Mon Sep 17 00:00:00 2001 From: duanyyyyyy Date: Wed, 29 Apr 2026 13:07:25 +0800 Subject: [PATCH 3/3] fix pre-commit --- src/paimon/format/parquet/parquet_file_batch_reader.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/paimon/format/parquet/parquet_file_batch_reader.cpp b/src/paimon/format/parquet/parquet_file_batch_reader.cpp index 50c176b51..80761b0be 100644 --- a/src/paimon/format/parquet/parquet_file_batch_reader.cpp +++ b/src/paimon/format/parquet/parquet_file_batch_reader.cpp @@ -152,8 +152,7 @@ Status ParquetFileBatchReader::SetReadSchema( read_row_groups_ = row_groups; read_column_indices_ = column_indices; - metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_TOTAL, - reader_->GetNumberOfRowGroups()); + metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_TOTAL, reader_->GetNumberOfRowGroups()); metrics_->SetCounter(ParquetMetrics::READ_ROW_GROUPS_FILTERED, row_groups.size()); PAIMON_ASSIGN_OR_RAISE(std::set ordered_row_groups,