From f00d8fd1c71f5e2ea6d33411a798c68d2ac30675 Mon Sep 17 00:00:00 2001 From: SteNicholas Date: Wed, 28 Jan 2026 13:27:59 +0800 Subject: [PATCH] feat: support scan metrics of FileStoreScan to align with ScanMetrics --- .../core/operation/file_store_commit_impl.cpp | 2 +- src/paimon/core/operation/file_store_scan.cpp | 52 ++++++++++++++----- src/paimon/core/operation/file_store_scan.h | 12 ++++- .../key_value_file_store_scan_test.cpp | 21 ++++++++ .../core/operation/metrics/scan_metrics.h | 31 +++++++++++ 5 files changed, 101 insertions(+), 17 deletions(-) create mode 100644 src/paimon/core/operation/metrics/scan_metrics.h diff --git a/src/paimon/core/operation/file_store_commit_impl.cpp b/src/paimon/core/operation/file_store_commit_impl.cpp index 1b0019e2..109bc37f 100644 --- a/src/paimon/core/operation/file_store_commit_impl.cpp +++ b/src/paimon/core/operation/file_store_commit_impl.cpp @@ -404,7 +404,7 @@ Status FileStoreCommitImpl::Commit(const std::shared_ptr& c } } metrics_->SetCounter(CommitMetrics::LAST_COMMIT_DURATION, - std::chrono::duration_cast( + std::chrono::duration_cast( std::chrono::high_resolution_clock::now() - started) .count()); metrics_->SetCounter(CommitMetrics::LAST_COMMIT_ATTEMPTS, attempt); diff --git a/src/paimon/core/operation/file_store_scan.cpp b/src/paimon/core/operation/file_store_scan.cpp index 7199a80f..34987d10 100644 --- a/src/paimon/core/operation/file_store_scan.cpp +++ b/src/paimon/core/operation/file_store_scan.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ #include "paimon/core/manifest/manifest_file.h" #include "paimon/core/manifest/manifest_file_meta.h" #include "paimon/core/manifest/manifest_list.h" +#include "paimon/core/operation/metrics/scan_metrics.h" #include "paimon/core/partition/partition_info.h" #include "paimon/core/stats/simple_stats.h" #include "paimon/core/utils/field_mapping.h" @@ -72,10 +74,12 @@ FileStoreScan::RawPlan::GroupFiles FileStoreScan::RawPlan::GroupByPartFiles( Result> FileStoreScan::ReadPartitionEntries() const { std::optional snapshot; - std::vector manifest_file_metas; - PAIMON_RETURN_NOT_OK(ReadManifests(&snapshot, &manifest_file_metas)); + std::vector all_manifest_file_metas; + std::vector filtered_manifest_file_metas; + PAIMON_RETURN_NOT_OK( + ReadManifests(&snapshot, &all_manifest_file_metas, &filtered_manifest_file_metas)); std::vector manifest_entries; - PAIMON_RETURN_NOT_OK(ReadFileEntries(manifest_file_metas, &manifest_entries)); + PAIMON_RETURN_NOT_OK(ReadFileEntries(filtered_manifest_file_metas, &manifest_entries)); std::unordered_map partitions; PAIMON_RETURN_NOT_OK(PartitionEntry::Merge(manifest_entries, &partitions)); @@ -90,13 +94,16 @@ Result> FileStoreScan::ReadPartitionEntries() const } Result> FileStoreScan::CreatePlan() const { + const auto started = std::chrono::high_resolution_clock::now(); std::optional snapshot; - std::vector manifest_file_metas; - PAIMON_RETURN_NOT_OK(ReadManifests(&snapshot, &manifest_file_metas)); - manifest_file_metas = PostFilterManifests(std::move(manifest_file_metas)); + std::vector all_manifest_file_metas; + std::vector filtered_manifest_file_metas; + PAIMON_RETURN_NOT_OK( + ReadManifests(&snapshot, &all_manifest_file_metas, &filtered_manifest_file_metas)); + filtered_manifest_file_metas = PostFilterManifests(std::move(filtered_manifest_file_metas)); std::vector manifest_entries; - PAIMON_RETURN_NOT_OK(ReadManifestEntries(manifest_file_metas, &manifest_entries)); + PAIMON_RETURN_NOT_OK(ReadManifestEntries(filtered_manifest_file_metas, &manifest_entries)); PAIMON_ASSIGN_OR_RAISE(manifest_entries, PostFilterManifestEntries(std::move(manifest_entries))); @@ -121,29 +128,46 @@ Result> FileStoreScan::CreatePlan() cons } } } + const int64_t all_data_files = std::accumulate( + all_manifest_file_metas.begin(), all_manifest_file_metas.end(), int64_t{0}, + [](const int64_t sum, const ManifestFileMeta& manifest_file_meta) { + return sum + manifest_file_meta.NumAddedFiles() - manifest_file_meta.NumDeletedFiles(); + }); + metrics_->SetCounter(ScanMetrics::LAST_SCAN_DURATION, + std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - started) + .count()); + metrics_->SetCounter(ScanMetrics::LAST_SCANNED_SNAPSHOT_ID, + snapshot.has_value() ? snapshot.value().Id() : int64_t{0}); + metrics_->SetCounter(ScanMetrics::LAST_SCANNED_MANIFESTS, filtered_manifest_file_metas.size()); + metrics_->SetCounter(ScanMetrics::LAST_SCAN_SKIPPED_TABLE_FILES, + all_data_files - manifest_entries.size()); + metrics_->SetCounter(ScanMetrics::LAST_SCAN_RESULTED_TABLE_FILES, manifest_entries.size()); return std::make_shared(scan_mode_, snapshot, std::move(manifest_entries)); } Status FileStoreScan::ReadManifests(std::optional* snapshot_ptr, - std::vector* manifests_ptr) const { + std::vector* all_manifests_ptr, + std::vector* filter_manifests_ptr) const { auto& snapshot = *snapshot_ptr; - auto& manifests = *manifests_ptr; + auto& all_manifests = *all_manifests_ptr; + auto& filtered_manifests = *filter_manifests_ptr; if (specified_snapshot_ != std::nullopt) { snapshot = specified_snapshot_; } else { PAIMON_ASSIGN_OR_RAISE(snapshot, snapshot_manager_->LatestSnapshot()); } if (snapshot == std::nullopt) { - manifests = std::vector(); + all_manifests = std::vector(); + filtered_manifests = std::vector(); return Status::OK(); } - std::vector unfiltered_manifest_metas; - PAIMON_RETURN_NOT_OK(ReadManifestsWithSnapshot(snapshot.value(), &unfiltered_manifest_metas)); - for (const auto& meta : unfiltered_manifest_metas) { + PAIMON_RETURN_NOT_OK(ReadManifestsWithSnapshot(snapshot.value(), &all_manifests)); + for (const auto& meta : all_manifests) { PAIMON_ASSIGN_OR_RAISE(bool filter_meta_result, FilterManifestFileMeta(meta)); if (filter_meta_result) { - manifests.push_back(meta); + filtered_manifests.push_back(meta); } } return Status::OK(); diff --git a/src/paimon/core/operation/file_store_scan.h b/src/paimon/core/operation/file_store_scan.h index 48432160..4e387024 100644 --- a/src/paimon/core/operation/file_store_scan.h +++ b/src/paimon/core/operation/file_store_scan.h @@ -28,6 +28,7 @@ #include #include "paimon/common/data/binary_row.h" +#include "paimon/common/metrics/metrics_impl.h" #include "paimon/common/predicate/compound_predicate_impl.h" #include "paimon/common/predicate/leaf_predicate_impl.h" #include "paimon/common/predicate/literal_converter.h" @@ -86,7 +87,8 @@ class FileStoreScan { snapshot_manager_(snapshot_manager), manifest_list_(manifest_list), manifest_file_(manifest_file), - executor_(executor) { + executor_(executor), + metrics_(std::make_shared()) { assert(executor_); } @@ -135,6 +137,10 @@ class FileStoreScan { return partition_filter_; } + std::shared_ptr GetScanMetrics() const { + return metrics_; + } + static Result> CreatePartitionPredicate( const std::vector& partition_keys, const std::string& partition_default_name, const std::shared_ptr& arrow_schema, @@ -210,7 +216,8 @@ class FileStoreScan { private: Status ReadManifests(std::optional* snapshot_ptr, - std::vector* manifests_ptr) const; + std::vector* all_manifests_ptr, + std::vector* filtered_manifests_ptr) const; Status ReadManifestsWithSnapshot(const Snapshot& snapshot, std::vector* manifests) const; @@ -255,5 +262,6 @@ class FileStoreScan { std::optional bucket_filter_; std::function level_filter_; std::optional specified_snapshot_; + std::shared_ptr metrics_; }; } // namespace paimon diff --git a/src/paimon/core/operation/key_value_file_store_scan_test.cpp b/src/paimon/core/operation/key_value_file_store_scan_test.cpp index 3e78be64..1c259d20 100644 --- a/src/paimon/core/operation/key_value_file_store_scan_test.cpp +++ b/src/paimon/core/operation/key_value_file_store_scan_test.cpp @@ -31,6 +31,7 @@ #include "paimon/core/manifest/file_source.h" #include "paimon/core/manifest/manifest_file.h" #include "paimon/core/manifest/manifest_list.h" +#include "paimon/core/operation/metrics/scan_metrics.h" #include "paimon/core/schema/schema_manager.h" #include "paimon/core/schema/table_schema.h" #include "paimon/core/snapshot.h" @@ -143,7 +144,27 @@ TEST_F(KeyValueFileStoreScanTest, TestMaxSequenceNumber) { CreateFileStoreScan(table_path, scan_filter, /*table_schema_id=*/0, /*snapshot_id=*/2)); + const auto started = std::chrono::high_resolution_clock::now(); ASSERT_OK_AND_ASSIGN(std::shared_ptr raw_plan, scan->CreatePlan()); + std::shared_ptr metrics = scan->GetScanMetrics(); + ASSERT_TRUE(metrics); + ASSERT_OK_AND_ASSIGN(uint64_t last_scan_duration, + metrics->GetCounter(ScanMetrics::LAST_SCAN_DURATION)); + ASSERT_LE(last_scan_duration, std::chrono::duration_cast( + std::chrono::high_resolution_clock::now() - started) + .count()); + ASSERT_OK_AND_ASSIGN(uint64_t last_scanned_snapshot_id, + metrics->GetCounter(ScanMetrics::LAST_SCANNED_SNAPSHOT_ID)); + ASSERT_EQ(last_scanned_snapshot_id, 2u); + ASSERT_OK_AND_ASSIGN(uint64_t last_scanned_manifests, + metrics->GetCounter(ScanMetrics::LAST_SCANNED_MANIFESTS)); + ASSERT_EQ(last_scanned_manifests, 2u); + ASSERT_OK_AND_ASSIGN(uint64_t last_scan_skipped_table_files, + metrics->GetCounter(ScanMetrics::LAST_SCAN_SKIPPED_TABLE_FILES)); + ASSERT_EQ(last_scan_skipped_table_files, 1u); + ASSERT_OK_AND_ASSIGN(uint64_t last_scan_resulted_table_files, + metrics->GetCounter(ScanMetrics::LAST_SCAN_RESULTED_TABLE_FILES)); + ASSERT_EQ(last_scan_resulted_table_files, 1u); int64_t max_sequence_num = GetMaxSequenceNumberOfRawPlan(raw_plan); ASSERT_EQ(max_sequence_num, 1); // test multiple scan diff --git a/src/paimon/core/operation/metrics/scan_metrics.h b/src/paimon/core/operation/metrics/scan_metrics.h new file mode 100644 index 00000000..501b1735 --- /dev/null +++ b/src/paimon/core/operation/metrics/scan_metrics.h @@ -0,0 +1,31 @@ +/* + * Copyright 2026-present Alibaba Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace paimon { + +/// Metrics to measure scan operation. +class ScanMetrics { + public: + static constexpr char LAST_SCAN_DURATION[] = "lastScanDuration"; + static constexpr char LAST_SCANNED_SNAPSHOT_ID[] = "lastScannedSnapshotId"; + static constexpr char LAST_SCANNED_MANIFESTS[] = "lastScannedManifests"; + static constexpr char LAST_SCAN_SKIPPED_TABLE_FILES[] = "lastScanSkippedTableFiles"; + static constexpr char LAST_SCAN_RESULTED_TABLE_FILES[] = "lastScanResultedTableFiles"; +}; + +} // namespace paimon