From 92d2177fef99c5180cde051dcc9ad3ef57f5d4bf Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Wed, 20 Aug 2025 16:57:46 +0800 Subject: [PATCH 01/34] in one --- be/src/cloud/cloud_storage_engine.cpp | 1 + be/src/cloud/cloud_storage_engine.h | 13 + be/src/cloud/cloud_tablet.cpp | 81 ++ be/src/cloud/cloud_tablet.h | 24 + be/src/cloud/pb_convert.cpp | 12 + be/src/olap/rowset/rowset.cpp | 4 + be/src/olap/rowset/rowset.h | 2 + be/src/olap/rowset/rowset_meta.h | 17 + be/src/olap/tablet_meta.h | 6 +- be/src/olap/version_graph.cpp | 112 ++ be/src/olap/version_graph.h | 31 + be/src/pipeline/exec/olap_scan_operator.cpp | 14 +- be/src/runtime/runtime_state.h | 9 + be/src/vec/exec/scan/olap_scanner.cpp | 15 +- ...cloud_tablet_query_with_tolerance_test.cpp | 1070 +++++++++++++++++ cloud/src/meta-service/meta_service_job.cpp | 39 +- cloud/src/meta-service/meta_service_txn.cpp | 5 + cloud/test/meta_service_job_test.cpp | 15 + cloud/test/meta_service_test.cpp | 63 + .../org/apache/doris/qe/SessionVariable.java | 7 + gensrc/proto/olap_file.proto | 4 + gensrc/thrift/PaloInternalService.thrift | 2 + 22 files changed, 1537 insertions(+), 9 deletions(-) create mode 100644 be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 4648f0f58c5cca..cd450714e38d41 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -100,6 +100,7 @@ CloudStorageEngine::CloudStorageEngine(const EngineOptions& options) std::make_shared(); _cumulative_compaction_policies[CUMULATIVE_TIME_SERIES_POLICY] = std::make_shared(); + _startup_timepoint = std::chrono::system_clock::now(); } CloudStorageEngine::~CloudStorageEngine() { diff --git a/be/src/cloud/cloud_storage_engine.h b/be/src/cloud/cloud_storage_engine.h index 000ba92144b099..fba9165c0f44c4 100644 --- a/be/src/cloud/cloud_storage_engine.h +++ b/be/src/cloud/cloud_storage_engine.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -161,6 +162,16 @@ class CloudStorageEngine final : public BaseStorageEngine { void unregister_index_change_compaction(int64_t tablet_id, bool is_base_compact); + std::chrono::time_point startup_timepoint() const { + return _startup_timepoint; + } + +#ifdef BE_TEST + void set_startup_timepoint(const std::chrono::time_point& tp) { + _startup_timepoint = tp; + } +#endif + private: void _refresh_storage_vault_info_thread_callback(); void _vacuum_stale_rowsets_thread_callback(); @@ -238,6 +249,8 @@ class CloudStorageEngine final : public BaseStorageEngine { EngineOptions _options; std::mutex _store_lock; + + std::chrono::time_point _startup_timepoint; }; } // namespace doris diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 04c3b76f976910..1748d7dff383a9 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -27,8 +27,11 @@ #include #include +#include #include #include +#include +#include #include #include #include @@ -70,6 +73,17 @@ bvar::LatencyRecorder g_base_compaction_get_delete_bitmap_lock_time_ms( bvar::Adder g_unused_rowsets_count("unused_rowsets_count"); bvar::Adder g_unused_rowsets_bytes("unused_rowsets_bytes"); +bvar::Adder g_capture_with_freshness_tolerance_count( + "capture_with_freshness_tolerance_count"); +bvar::Adder g_capture_with_freshness_tolerance_fallback_count( + "capture_with_freshness_tolerance_fallback_count"); +bvar::Window> g_capture_with_freshness_tolerance_count_window( + "capture_with_freshness_tolerance_count_window", 
&g_capture_with_freshness_tolerance_count,
+        30);
+bvar::Window> g_capture_with_freshness_tolerance_fallback_count_window(
+        "capture_with_freshness_tolerance_fallback_count_window",
+        &g_capture_with_freshness_tolerance_fallback_count, 30);
+
 static constexpr int LOAD_INITIATOR_ID = -1;
 
 bvar::Adder g_file_cache_cloud_tablet_submitted_segment_size(
@@ -165,6 +179,73 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version,
     return capture_rs_readers_unlocked(version_path, rs_splits);
 }
 
+Status CloudTablet::capture_rs_readers_with_freshness_tolerance(
+        const Version& spec_version, std::vector* rs_splits,
+        bool skip_missing_version, int64_t query_freshness_tolerance_ms) {
+    g_capture_with_freshness_tolerance_count << 1;
+    using namespace std::chrono;
+    auto freshness_limit_tp = system_clock::now() - milliseconds(query_freshness_tolerance_ms);
+    auto startup_timepoint = _engine.startup_timepoint();
+    // find a version path where every edge (rowset) has been warmed up
+    auto rowset_is_warmed_up = [&](int64_t start_version, int64_t end_version) -> bool {
+        if (start_version > end_version) {
+            return false;
+        }
+        Version version {start_version, end_version};
+        auto it = _rs_version_map.find(version);
+        if (it == _rs_version_map.end()) {
+            it = _stale_rs_version_map.find(version);
+            if (it == _stale_rs_version_map.end()) {
+                return Status::Error(
+                        "fail to find Rowset in stale_rs_version for version. tablet={}, "
+                        "version={}-{}",
+                        tablet_id(), version.first, version.second);
+            }
+        }
+        const auto& rs = it->second;
+        if (rs->visible_timestamp() < startup_timepoint) {
+            // We only care about rowsets that are created after the startup time point. For other rowsets,
+            // we assume they have been warmed up.
+            return true;
+        }
+        return is_rowset_warmed_up(rs->rowset_id());
+    };
+    Versions version_path;
+    std::shared_lock rlock(_meta_lock);
+    if (enable_unique_key_merge_on_write()) {
+        // For a merge-on-write table, newly generated delete bitmap marks land on the rowsets that are in the newest layout.
+        // So we can only capture rowsets that are in the newest data layout; otherwise there may be data correctness issues.
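+        // e.g. if stale rowsets [11-15],[16-16],[17-17] were compacted into [11-17], capturing the
+        // stale path plus a newer load like [18-18] could miss delete bitmap marks that later loads
+        // put on the compacted [11-17] (see testCaptureMow_2_3 in the new unit test).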
+        RETURN_IF_ERROR(
+                _timestamped_version_tracker.capture_newest_consistent_versions_with_validator(
+                        0, version_path, rowset_is_warmed_up));
+    } else {
+        RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator(
+                0, version_path, rowset_is_warmed_up));
+    }
+    int64_t path_max_version = version_path.back().second;
+    auto should_be_visible_but_not_warmed_up = [&](const auto& rs_meta) -> bool {
+        if (rs_meta->version() == Version {0, 1}) {
+            // skip rowset[0-1]
+            return false;
+        }
+        return rs_meta->start_version() > path_max_version &&
+               rs_meta->visible_timestamp() < freshness_limit_tp;
+    };
+    // use std::views::concat after C++26
+    bool should_fallback = std::ranges::any_of(_tablet_meta->all_rs_metas(),
+                                               should_be_visible_but_not_warmed_up) ||
+                           std::ranges::any_of(_tablet_meta->all_stale_rs_metas(),
+                                               should_be_visible_but_not_warmed_up);
+    if (should_fallback) {
+        g_capture_with_freshness_tolerance_fallback_count << 1;
+        // if there exists a rowset that satisfies the freshness tolerance, whose start version is larger
+        // than the path max version, but which has not been warmed up yet, fall back to capturing rowsets as usual
+        return capture_rs_readers(spec_version, rs_splits, skip_missing_version);
+    }
+
+    return capture_rs_readers_unlocked(version_path, rs_splits);
+}
+
 // There are only two tablet_states RUNNING and NOT_READY in cloud mode
 // This function will erase the tablet from `CloudTabletMgr` when it can't find this tablet in MS.
 Status CloudTablet::sync_rowsets(const SyncOptions& options, SyncRowsetStats* stats) {
diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index 8a4c1ae5ced7c4..fc850cecbe8616 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -70,6 +70,11 @@ class CloudTablet final : public BaseTablet {
     Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits,
                               bool skip_missing_version) override;
 
+    Status capture_rs_readers_with_freshness_tolerance(const Version& spec_version,
+                                                       std::vector* rs_splits,
+                                                       bool skip_missing_version,
+                                                       int64_t query_freshness_tolerance_ms);
+
     Status capture_consistent_rowsets_unlocked(
             const Version& spec_version, std::vector* rowsets) const override;
 
@@ -300,6 +305,22 @@ class CloudTablet final : public BaseTablet {
     bool add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state);
     WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status);
 
+    bool is_rowset_warmed_up(const RowsetId& rowset_id) const {
+        std::shared_lock rlock(_warmed_up_rowsets_mutex);
+        return _warmed_up_rowsets.contains(rowset_id);
+    }
+
+    // TODO: add to warm up callback when file cache download task is done
+    void add_warmed_up_rowset(const RowsetId& rowset_id) {
+        std::unique_lock wlock(_warmed_up_rowsets_mutex);
+        _warmed_up_rowsets.insert(rowset_id);
+    }
+
+    void remove_warmed_up_rowset(const RowsetId& rowset_id) {
+        std::unique_lock wlock(_warmed_up_rowsets_mutex);
+        _warmed_up_rowsets.erase(rowset_id);
+    }
+
 private:
     // FIXME(plat1ko): No need to record base size if rowsets are ordered by version
     void update_base_size(const Rowset& rs);
@@ -367,6 +388,9 @@ class CloudTablet final : public BaseTablet {
 
     // for warm up states management
     std::unordered_map> _rowset_warm_up_states;
+
+    mutable std::shared_mutex _warmed_up_rowsets_mutex;
+    std::unordered_set _warmed_up_rowsets;
 };
 
 using CloudTabletSPtr = std::shared_ptr;
diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp
index b4319cdb946f9b..113ee352bc7828 100644
--- 
a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -96,6 +96,9 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in) if (in.has___split_schema()) { out->mutable___split_schema()->CopyFrom(in.__split_schema()); } + if (in.has_visible_time_ms()) { + out->set_visible_time_ms(in.visible_time_ms()); + } } void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { @@ -159,6 +162,9 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { if (in.has___split_schema()) { out->mutable___split_schema()->Swap(in.mutable___split_schema()); } + if (in.has_visible_time_ms()) { + out->set_visible_time_ms(in.visible_time_ms()); + } } RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in) { @@ -232,6 +238,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) if (in.has___split_schema()) { out->mutable___split_schema()->CopyFrom(in.__split_schema()); } + if (in.has_visible_time_ms()) { + out->set_visible_time_ms(in.visible_time_ms()); + } } void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { @@ -294,6 +303,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { if (in.has___split_schema()) { out->mutable___split_schema()->Swap(in.mutable___split_schema()); } + if (in.has_visible_time_ms()) { + out->set_visible_time_ms(in.visible_time_ms()); + } } TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB& in) { diff --git a/be/src/olap/rowset/rowset.cpp b/be/src/olap/rowset/rowset.cpp index c318cce9a4228e..0ca50b118e1385 100644 --- a/be/src/olap/rowset/rowset.cpp +++ b/be/src/olap/rowset/rowset.cpp @@ -234,6 +234,10 @@ int64_t Rowset::approximate_cache_index_size() { return total_cache_size; } +std::chrono::time_point Rowset::visible_timestamp() const { + return _rowset_meta->visible_timestamp(); +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index ae68eb0bcf45da..9ab9ca3356b5f7 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -327,6 +327,8 @@ class Rowset : public std::enable_shared_from_this, public MetadataAdder int64_t approximate_cache_index_size(); + std::chrono::time_point visible_timestamp() const; + protected: friend class RowsetFactory; diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h index 47752181750964..ca532834296fac 100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -368,6 +369,22 @@ class RowsetMeta : public MetadataAdder { int64_t newest_write_timestamp() const { return _rowset_meta_pb.newest_write_timestamp(); } + // for cloud only + bool has_visible_time_ms() const { return _rowset_meta_pb.has_visible_time_ms(); } + int64_t visible_time_ms() const { return _rowset_meta_pb.visible_time_ms(); } + std::chrono::time_point visible_timestamp() const { + using namespace std::chrono; + if (has_visible_time_ms()) { + return time_point(milliseconds(visible_time_ms())); + } + return system_clock::from_time_t(newest_write_timestamp()); + } +#ifdef BE_TEST + void set_visible_time_ms(int64_t visible_time_ms) { + _rowset_meta_pb.set_visible_time_ms(visible_time_ms); + } +#endif + void set_tablet_schema(const TabletSchemaSPtr& tablet_schema); void set_tablet_schema(const TabletSchemaPB& tablet_schema); diff --git a/be/src/olap/tablet_meta.h 
b/be/src/olap/tablet_meta.h index 6101a96f5cf379..d858d577bc3e6a 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -249,7 +249,11 @@ class TabletMeta : public MetadataAdder { void remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version); bool enable_unique_key_merge_on_write() const { return _enable_unique_key_merge_on_write; } - +#ifdef BE_TEST + void set_enable_unique_key_merge_on_write(bool value) { + _enable_unique_key_merge_on_write = value; + } +#endif // TODO(Drogon): thread safety const BinlogConfig& binlog_config() const { return _binlog_config; } void set_binlog_config(BinlogConfig binlog_config) { diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp index b769c5895e1ce4..3cdac07da1d67a 100644 --- a/be/src/olap/version_graph.cpp +++ b/be/src/olap/version_graph.cpp @@ -25,6 +25,7 @@ #include // IWYU pragma: keep #include #include +#include #include #include @@ -336,6 +337,20 @@ Status TimestampedVersionTracker::capture_consistent_versions( return _version_graph.capture_consistent_versions(spec_version, version_path); } +Status TimestampedVersionTracker::capture_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_with_validator(start, version_path, + validator); +} + +Status TimestampedVersionTracker::capture_newest_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_newest_consistent_versions_with_validator(start, version_path, + validator); +} + void TimestampedVersionTracker::capture_expired_paths( int64_t stale_sweep_endtime, std::vector* path_version_vec) const { std::map::const_iterator iter = @@ -635,6 +650,103 @@ Status VersionGraph::capture_consistent_versions(const Version& spec_version, return Status::OK(); } +Status VersionGraph::capture_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const { + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == start) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. start {}", + start); + } + + while (true) { + int64_t next_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. + if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + +Status VersionGraph::capture_newest_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const { + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == start) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. 
start {}", + start); + } + + std::optional end_value; + while (!end_value.has_value() || _version_graph[cur_idx].value < end_value.value()) { + int64_t next_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. + if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + if (_version_graph[cur_idx].value + 1 == _version_graph[it].value) { + break; + } + if (!end_value.has_value() || _version_graph[it].value < end_value.value()) { + // when encounter a compaction's output rowset which is not valid, try to find a version path + // with smaller max version + end_value = _version_graph[it].value; + } + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + double VersionGraph::get_orphan_vertex_ratio() { int64_t vertex_num = _version_graph.size(); int64_t orphan_vertex_num = 0; diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h index 56d07a52871ae7..a6db8618d4048b 100644 --- a/be/src/olap/version_graph.h +++ b/be/src/olap/version_graph.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -55,6 +56,21 @@ class VersionGraph { Status capture_consistent_versions(const Version& spec_version, std::vector* version_path) const; + // Given a start, this method can find a version path which satisfy the following conditions: + // 1. all edges satisfy the conditions specified by `validator` in the graph. + // 2. the destination version is as far as possible. + // 3. the path is the shortest path. + // The version paths are added to version_path as return info. + // If this version not in main version, version_path can be included expired rowset. + // NOTE: this method may return edges which is in stale path + Status capture_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const; + + Status capture_newest_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const; + // See comment of TimestampedVersionTracker's get_orphan_vertex_ratio(); double get_orphan_vertex_ratio(); @@ -168,6 +184,21 @@ class TimestampedVersionTracker { Status capture_consistent_versions(const Version& spec_version, std::vector* version_path) const; + // Given a start, this method can find a version path which satisfy the following conditions: + // 1. all edges satisfy the conditions specified by `validator` in the graph. + // 2. the destination version is as far as possible. + // 3. the path is the shortest path. + // The version paths are added to version_path as return info. + // If this version not in main version, version_path can be included expired rowset. + // NOTE: this method may return edges which is in stale path + Status capture_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const; + + Status capture_newest_consistent_versions_with_validator( + int64_t start, std::vector& version_path, + const std::function& validator) const; + /// Capture all expired path version. 
/// When the last rowset create time of a path greater than expired time which can be expressed /// "now() - tablet_rowset_stale_sweep_time_sec" , this path will be remained. diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index b89575db580ec2..8b690b64acec35 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -691,9 +691,17 @@ Status OlapScanLocalState::prepare(RuntimeState* state) { } for (size_t i = 0; i < _scan_ranges.size(); i++) { - RETURN_IF_ERROR(_tablets[i].tablet->capture_rs_readers({0, _tablets[i].version}, - &_read_sources[i].rs_splits, - _state->skip_missing_version())); + if (config::is_cloud_mode() && _state->enable_query_freshness_tolerance()) { + RETURN_IF_ERROR(std::static_pointer_cast(_tablets[i].tablet) + ->capture_rs_readers_with_freshness_tolerance( + {0, _tablets[i].version}, &_read_sources[i].rs_splits, + _state->skip_missing_version(), + _state->query_freshness_tolerance_ms())); + } else { + RETURN_IF_ERROR(_tablets[i].tablet->capture_rs_readers({0, _tablets[i].version}, + &_read_sources[i].rs_splits, + _state->skip_missing_version())); + } if (!PipelineXLocalState<>::_state->skip_delete_predicate()) { _read_sources[i].fill_delete_predicates(); } diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 1d27944da314bb..044c888e45f3d6 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -423,6 +423,15 @@ class RuntimeState { bool enable_page_cache() const; + int64_t query_freshness_tolerance_ms() const { + return _query_options.query_freshness_tolerance_ms; + } + + bool enable_query_freshness_tolerance() const { + return _query_options.__isset.query_freshness_tolerance_ms && + _query_options.query_freshness_tolerance_ms > 0; + } + std::vector tablet_commit_infos() const { std::lock_guard lock(_tablet_infos_mutex); return _tablet_commit_infos; diff --git a/be/src/vec/exec/scan/olap_scanner.cpp b/be/src/vec/exec/scan/olap_scanner.cpp index 1e8a7f9321291e..3a1e890713704f 100644 --- a/be/src/vec/exec/scan/olap_scanner.cpp +++ b/be/src/vec/exec/scan/olap_scanner.cpp @@ -218,9 +218,18 @@ Status OlapScanner::prepare() { ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_hotspot().count(*tablet); } - auto st = tablet->capture_rs_readers(_tablet_reader_params.version, - &read_source.rs_splits, - _state->skip_missing_version()); + Status st {}; + if (config::is_cloud_mode() && _state->enable_query_freshness_tolerance()) { + st = std::static_pointer_cast(tablet) + ->capture_rs_readers_with_freshness_tolerance( + _tablet_reader_params.version, &read_source.rs_splits, + _state->skip_missing_version(), + _state->query_freshness_tolerance_ms()); + } else { + st = tablet->capture_rs_readers(_tablet_reader_params.version, + &read_source.rs_splits, + _state->skip_missing_version()); + } if (!st.ok()) { LOG(WARNING) << "fail to init reader.res=" << st; return st; diff --git a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp new file mode 100644 index 00000000000000..263b87f94d80b1 --- /dev/null +++ b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp @@ -0,0 +1,1070 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_meta.h" +#include "olap/tablet_meta.h" +#include "util/uid_util.h" + +namespace doris { + +using namespace std::chrono; + +class TestFreshnessTolerance : public testing::Test { +public: + TestFreshnessTolerance() : _engine(CloudStorageEngine(EngineOptions {})) {} + + void SetUp() override { + _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, + UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, + TCompressionType::LZ4F)); + } + void TearDown() override {} + + RowsetSharedPtr create_rowset_without_visible_time(Version version) { + auto rs_meta = std::make_shared(); + rs_meta->set_rowset_type(BETA_ROWSET); + rs_meta->set_version(version); + rs_meta->set_rowset_id(_engine.next_rowset_id()); + RowsetSharedPtr rowset; + Status st = RowsetFactory::create_rowset(nullptr, "", rs_meta, &rowset); + if (!st.ok()) { + return nullptr; + } + return rowset; + } + + RowsetSharedPtr create_rowset(Version version, + time_point visible_timestamp = system_clock::now() - + seconds(100)) { + auto rs = create_rowset_without_visible_time(version); + if (!rs) { + return nullptr; + } + rs->rowset_meta()->set_visible_time_ms( + duration_cast(visible_timestamp.time_since_epoch()).count()); + return rs; + } + + CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false) { + CloudTabletSPtr tablet = + std::make_shared(_engine, std::make_shared(*_tablet_meta)); + tablet->tablet_meta()->set_enable_unique_key_merge_on_write(is_mow); + std::vector rowsets; + auto rs1 = create_rowset(Version {0, 1}); + rowsets.emplace_back(rs1); + tablet->add_warmed_up_rowset(rs1->rowset_id()); + for (int ver = 2; ver <= max_version; ver++) { + auto rs = create_rowset(Version {ver, ver}); + tablet->add_warmed_up_rowset(rs->rowset_id()); + rowsets.emplace_back(rs); + } + { + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets(rowsets, false, wlock, false); + } + return tablet; + } + + void add_new_version_rowset(CloudTabletSPtr tablet, int64_t version, bool warmed_up, + time_point visible_timestamp) { + auto rowset = create_rowset(Version {version, version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(rowset->rowset_id()); + } + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets({rowset}, false, wlock, false); + } + + void do_cumu_compaction(CloudTabletSPtr tablet, int64_t start_version, int64_t end_version, + bool warmed_up, time_point visible_timestamp) { + std::unique_lock wrlock {tablet->get_header_lock()}; + std::vector input_rowsets; + auto output_rowset = create_rowset(Version {start_version, end_version}, visible_timestamp); + if (warmed_up) { + 
tablet->add_warmed_up_rowset(output_rowset->rowset_id()); + } + std::ranges::copy_if(std::views::values(tablet->rowset_map()), + std::back_inserter(input_rowsets), [=](const RowsetSharedPtr& rowset) { + return rowset->version().first >= start_version && + rowset->version().first <= end_version; + }); + if (input_rowsets.size() == 1) { + tablet->add_rowsets({output_rowset}, true, wrlock); + } else { + tablet->delete_rowsets(input_rowsets, wrlock); + tablet->add_rowsets({output_rowset}, false, wrlock); + } + } + + void check_capture_result(CloudTabletSPtr tablet, Version spec_version, + int64_t query_freshness_tolerance_ms, + const std::vector& expected_versions) { + std::vector rs_splits; + auto st = tablet->capture_rs_readers_with_freshness_tolerance( + spec_version, &rs_splits, false, query_freshness_tolerance_ms); + ASSERT_TRUE(st.ok()); + auto dump_versions = [](const std::vector& expected_versions, + const std::vector& splits) { + std::vector expected_str; + for (const auto& version : expected_versions) { + expected_str.push_back(version.to_string()); + } + std::vector versions; + for (const auto& split : splits) { + versions.push_back(split.rs_reader->rowset()->version().to_string()); + } + return fmt::format("expected_versions: {}, actual_versions: {}", + fmt::join(expected_str, ", "), fmt::join(versions, ", ")); + }; + ASSERT_EQ(rs_splits.size(), expected_versions.size()) + << dump_versions(expected_versions, rs_splits); + for (size_t i = 0; i < rs_splits.size(); i++) { + ASSERT_EQ(rs_splits[i].rs_reader->rowset()->version(), expected_versions[i]) + << dump_versions(expected_versions, rs_splits); + } + } + +protected: + std::string _json_rowset_meta; + TabletMetaSharedPtr _tablet_meta; + +private: + CloudStorageEngine _engine; +}; + +TEST_F(TestFreshnessTolerance, testVisibleTimestamp) { + { + // for historical rowset, visible time is not set, RowsetMeta::visible_timestamp() uses + // newest_write_timestamp + auto tp1 = system_clock::now() - seconds(100); + auto rs = create_rowset_without_visible_time({2, 2}); + auto d = duration_cast(tp1.time_since_epoch()).count(); + rs->rowset_meta()->set_newest_write_timestamp(d); + ASSERT_EQ(rs->rowset_meta()->visible_timestamp(), system_clock::from_time_t(d)); + } + + { + // when visible_time_ms is set, RowsetMeta::visible_timestamp() uses visible_time_ms which is more precise + auto tp1 = system_clock::now() - seconds(100); + auto tp2 = system_clock::now() - seconds(50); + auto rs = create_rowset_without_visible_time({2, 2}); + auto d1 = duration_cast(tp1.time_since_epoch()).count(); + auto d2 = duration_cast(tp2.time_since_epoch()).count(); + rs->rowset_meta()->set_newest_write_timestamp(d1); + rs->rowset_meta()->set_visible_time_ms(d2); + ASSERT_EQ(rs->rowset_meta()->visible_timestamp(), + time_point(milliseconds(d2))); + } +} + +TEST_F(TestFreshnessTolerance, testCapture_1_1) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, 
system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCapture_1_2) {
+    /*
+                                 now-10s                      now
+
+                                    │            10s          │
+                                    ◄───────────────────────────┤
+┌────────┐  ┌─────────┐  ┌─────────┐│  ┌────────┐  ┌───────┐   │
+│in cache│  │ in cache│  │         ││  │        │  │       │   │
+│        │  │         │  │         ││  │        │  │       │   │
+│ [2-10] │  │ [11-15] │  │ [16-16] ││  │ [17-17]│  │[18-18]│   │
+└────────┘  └─────────┘  └─────────┘│  └────────┘  └───────┘   │
+                                    │                          │
+  now-40s     now-20s      now-15s  │   now-7s      now-3s     │
+                                    │                          │
+                                    │                          │
+   return: [2-10],[11-15],[16-16],[17-17],[18-18]
+   NOTE: rowset[16-16] should be visible because it's within the query freshness tolerance time limit.
+   However, since the data files of rowset[16-16] are not in the cache, there is no difference between
+   capturing up to version 16 and capturing up to version 18. So we capture up to version 18.
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15},
+                                              {16, 16}, {17, 17}, {18, 18}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCapture_1_3) {
+    /*
+                                 now-10s                      now
+
+                                    │            10s          │
+                                    ◄───────────────────────────┤
+┌────────┐  ┌─────────┐  ┌─────────┐│  ┌────────┐  ┌───────┐   │
+│in cache│  │ in cache│  │in cache ││  │in cache│  │       │   │
+│        │  │         │  │         ││  │        │  │       │   │
+│ [2-10] │  │ [11-15] │  │ [16-16] ││  │ [17-17]│  │[18-18]│   │
+└────────┘  └─────────┘  └─────────┘│  └────────┘  └───────┘   │
+                                    │                          │
+  now-40s     now-20s      now-15s  │   now-7s      now-3s     │
+                                    │                          │
+                                    │                          │
+   return: [2-10],[11-15],[16-16],[17-17]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 
10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCapture_1_4) {
+    /*
+ be startup time                   now-10s                     now
+   now - 30s
+    │                                 │            10s          │
+    │                                 ◄───────────────────────────┤
+┌────────┐│  ┌─────────┐  ┌─────────┐│  ┌────────┐  ┌───────┐   │
+│        ││  │ in cache│  │in cache ││  │in cache│  │       │   │
+│        ││  │         │  │         ││  │        │  │       │   │
+│ [2-10] ││  │ [11-15] │  │ [16-16] ││  │ [17-17]│  │[18-18]│   │
+└────────┘│  └─────────┘  └─────────┘│  └────────┘  └───────┘   │
+          │                          │                          │
+  now-40s │   now-20s      now-15s   │   now-7s      now-3s     │
+          │                          │                          │
+          │                          │                          │
+
+   return: [2-10],[11-15],[16-16],[17-17]
+   note: We only care about rowsets that are created after the startup time point. For other historical rowsets,
+   we just assume that they are warmed up.
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(30));
+    auto tablet = create_tablet_with_initial_rowsets(15);
+    do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCapture_2_1) {
+    /*
+                          now-10s                  now
+                             │          10s         │
+                             ◄────────────────────────┼
+                             │                        │
+    ┌────────┐               │┌────────┐  ┌───────┐   │
+    │in cache│               ││        │  │       │   │
+    │        │               ││        │  │       │   │
+    │ [2-10] │               ││ [11-17]│  │[18-18]│   │
+    └────────┘               │└────────┘  └───────┘   │
+                             │                        │
+     now-40s                 │  now-1s     now-3s     │
+┌───────────────────────────────────────────────────────────────────────┐
+│                            │                        │                 │
+│ stale rowsets              │                        │                 │
+│   ┌─────────┐  ┌─────────┐ │┌────────┐              │                 │
+│   │ in cache│  │in cache │ ││        │              │                 │
+│   │         │  │         │ ││        │              │                 │
+│   │ [11-15] │  │ [16-16] │ ││ [17-17]│              │                 │
+│   └─────────┘  └─────────┘ │└────────┘              │                 │
+│                            │                        │                 │
+│    now-20s      now-15s    │  now-7s                │                 │
+└───────────────────────────────────────────────────────────────────────┘
+                             │                        │
+
+   return: [2-10],[11-15],[16-16]
+    */
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCapture_2_2) {
+    /*
+                          now-10s                  now
+                             │          10s         │
+                             ◄────────────────────────┼
+                             │                        │
+    ┌────────┐ 
│┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16],[17-17] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │in cache││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in 
cache││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_3_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │ │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-17],[18-18] + note: should fallback +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_1_1) { + /* + now-10s now + + │ 10s │ + 
◄───────────────────────────┤
+┌────────┐  ┌─────────┐  ┌─────────┐│  ┌────────┐  ┌───────┐   │
+│in cache│  │ in cache│  │in cache ││  │        │  │       │   │
+│        │  │         │  │         ││  │        │  │       │   │
+│ [2-10] │  │ [11-15] │  │ [16-16] ││  │ [17-17]│  │[18-18]│   │
+└────────┘  └─────────┘  └─────────┘│  └────────┘  └───────┘   │
+                                    │                          │
+  now-40s     now-20s      now-15s  │   now-7s      now-3s     │
+                                    │                          │
+                                    │                          │
+   return: [2-10],[11-15],[16-16]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_1_2) {
+    /*
+                                 now-10s                      now
+
+                                    │            10s          │
+                                    ◄───────────────────────────┤
+┌────────┐  ┌─────────┐  ┌─────────┐│  ┌────────┐  ┌───────┐   │
+│in cache│  │ in cache│  │         ││  │        │  │       │   │
+│        │  │         │  │         ││  │        │  │       │   │
+│ [2-10] │  │ [11-15] │  │ [16-16] ││  │ [17-17]│  │[18-18]│   │
+└────────┘  └─────────┘  └─────────┘│  └────────┘  └───────┘   │
+                                    │                          │
+  now-40s     now-20s      now-15s  │   now-7s      now-3s     │
+                                    │                          │
+                                    │                          │
+   return: [2-10],[11-15],[16-16],[17-17],[18-18]
+   NOTE: rowset[16-16] must be visible because it's within the query freshness tolerance time limit. 
+   However, since the data files of rowset[16-16] are not in the cache, there is no difference between
+   capturing up to version 16 and capturing up to version 18
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15},
+                                              {16, 16}, {17, 17}, {18, 18}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_1_3) {
+    /*
+                                 now-10s                      now
+
+                                    │            10s          │
+                                    ◄───────────────────────────┤
+┌────────┐  ┌─────────┐  ┌─────────┐│  ┌────────┐  ┌───────┐   │
+│in cache│  │ in cache│  │in cache ││  │in cache│  │       │   │
+│        │  │         │  │         ││  │        │  │       │   │
+│ [2-10] │  │ [11-15] │  │ [16-16] ││  │ [17-17]│  │[18-18]│   │
+└────────┘  └─────────┘  └─────────┘│  └────────┘  └───────┘   │
+                                    │                          │
+  now-40s     now-20s      now-15s  │   now-7s      now-3s     │
+                                    │                          │
+                                    │                          │
+   return: [2-10],[11-15],[16-16],[17-17]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_1_4) {
+    /*
+ be startup time                   now-10s                     now
+   now - 30s
+    │                                 │            10s          │
+    │                                 ◄───────────────────────────┤
+┌────────┐│  ┌─────────┐  ┌─────────┐│  ┌────────┐  ┌───────┐   │
+│        ││  │ in cache│  │in cache ││  │in cache│  │       │   │
+│        ││  │         │  │         ││  │        │  │       │   │
+│ [2-10] ││  │ [11-15] │  │ [16-16] ││  │ [17-17]│  │[18-18]│   │
+└────────┘│  └─────────┘  └─────────┘│  └────────┘  └───────┘   │
+          │                          │                          │
+  now-40s │   now-20s      now-15s   │   now-7s      now-3s     │
+          │                          │                          │
+          │                          │                          │
+
+   return: [2-10],[11-15],[16-16],[17-17]
+   note: We only care about rowsets that are created after the startup time point. For other historical rowsets,
+   we just assume that they are warmed up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16],[17-17] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + 
do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_3) {
+    /*
+                          now-10s                  now
+                             │          10s         │
+                             ◄────────────────────────┼
+                             │                        │
+    ┌────────┐               │┌────────┐  ┌────────┐│
+    │in cache│               ││        │  │        ││
+    │        │               ││        │  │in cache││
+    │ [2-10] │               ││ [11-17]│  │[18-18] ││
+    └────────┘               │└────────┘  └────────┘│
+                             │                        │
+     now-40s                 │  now-1s     now-3s     │
+┌───────────────────────────────────────────────────────────────────────┐
+│                            │                        │                 │
+│ stale rowsets              │                        │                 │
+│   ┌─────────┐  ┌─────────┐ │┌────────┐              │                 │
+│   │ in cache│  │in cache │ ││in cache│              │                 │
+│   │         │  │         │ ││        │              │                 │
+│   │ [11-15] │  │ [16-16] │ ││ [17-17]│              │                 │
+│   └─────────┘  └─────────┘ │└────────┘              │                 │
+│                            │                        │                 │
+│    now-20s      now-15s    │  now-7s                │                 │
+└───────────────────────────────────────────────────────────────────────┘
+                             │                        │
+   return: [2-10],[11-15],[16-16],[17-17]
+   note: due to the existence of rowset [11-17], we can only capture up to version 17
+   because newly added rowsets may generate delete bitmap marks on [11-17]. If we capture [18-18],
+   we may hit a data correctness issue if [18-18] has duplicate rows with [11-17]
+    */
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_4) {
+    /*
+                          now-10s                  now
+                             │          10s         │
+                             ◄────────────────────────┼
+                             │                        │
+    ┌────────┐               │            ┌────────┐│
+    │        │               │            │        ││
+    │        │               │            │in cache││
+    │ [2-16] │               │            │[18-18] ││
+    └────────┘               │            └────────┘│
+                             │                        │
+     now-13s                 │             now-3s     │
+                             │                        │
+                             │                        │
+┌───────────────────────────────────────────────────────────────────────┐
+│                            │                        │                 │
+│ stale rowsets              │                        │                 │
+│ ┌────────┐  ┌─────────┐  ┌─────────┐ │┌────────┐    │                 │
+│ │in cache│  │ in cache│  │in cache │ ││in cache│    │                 │
+│ │        │  │         │  │         │ ││        │    │                 │
+│ │ [2-10] │  │ [11-15] │  │ [16-16] │ ││ [17-17]│    │                 │
+│ └────────┘  └─────────┘  └─────────┘ │└────────┘    │                 │
+│                            │                        │                 │
+│  now-40s      now-20s      now-15s   │  now-7s      │                 │
+│                            │                        │                 │
+│                            │                        │                 │
+└───────────────────────────────────────────────────────────────────────┘
+                             │                        │
+   return: [2-10],[11-15],[16-16]
+   note: due to the existence of rowset [2-16], we can only capture up to version 16
+   because newly added rowsets may generate delete bitmap marks on [2-16]. If we capture [17-17],
+   we may hit a data correctness issue if [17-17] has duplicate rows with [2-16]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_5) {
+    /*
+                          now-10s                  now
+                             │          10s         │
+                             ◄────────────────────────┼
+                             │                        │
+                             │ ┌────────┐  ┌────────┐│
+                             │ │        │  │        ││
+                             │ │        │  │in cache││
+                             │ │ [2-17] │  │[18-18] ││
+                             │ └────────┘  └────────┘│
+                             │                        │
+                             │  now-1s      now-3s    │
+                             │                        │
+                             │                        │
+┌───────────────────────────────────────────────────────────────────────┐
+│                            │                        │                 │
+│ stale rowsets              │                        │                 │
+│                            │                        │                 │
+│ ┌────────┐                 │                        │                 │
+│ │        │                 │                        │                 │
+│ │        │                 │                        │                 │
+│ │ [2-16] │                 │                        │                 │
+│ └────────┘                 │                        │                 │
+│                            │                        │                 │
+│  now-13s                   │                        │                 │
+│                            │                        │                 │
+│                            │                        │                 │
+│ ┌────────┐  ┌─────────┐  ┌─────────┐ │┌────────┐    │                 │
+│ │in cache│  │ in cache│  │in cache │ ││in cache│    │                 │
+│ │        │  │         │  │         │ ││        │    │                 │
+│ │ [2-10] │  │ [11-15] │  │ [16-16] │ ││ [17-17]│    │                 │
+│ └────────┘  └─────────┘  └─────────┘ │└────────┘    │                 │
+│                            │                        │                 │
+│  now-40s      now-20s      now-15s   │  now-7s      │                 │
+│                            │                        │                 │
+│                            │                        │                 │
+└───────────────────────────────────────────────────────────────────────┘
+                             │                        │
+   return: [2-10],[11-15],[16-16]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13));
+    do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_6) {
+    /*
+                          now-10s                  now
+                             │          10s         │
+                             ◄────────────────────────┼
+                             │                        │
+                             │ ┌────────┐  ┌────────┐│
+                             │ │        │  │        ││
+                             │ │        │  │in cache││
+                             │ │ [2-17] │  │[18-18] ││
+                             │ └────────┘ 
└────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ │ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-17],[18-18] + note: because rowset [11-15] is not warmed up, we can only choose a path whose max verion is below 15 + but rowset version 16 is within the query freshness tolerance time limit. So we should fallback to + capture rowsets with tablet's max version +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, false, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_3_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │ │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-17],[18-18] + note: should fallback +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 
10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} +} // namespace doris diff --git a/cloud/src/meta-service/meta_service_job.cpp b/cloud/src/meta-service/meta_service_job.cpp index 68da6dcec8888d..4338ab7a0b0336 100644 --- a/cloud/src/meta-service/meta_service_job.cpp +++ b/cloud/src/meta-service/meta_service_job.cpp @@ -1249,9 +1249,22 @@ void process_compaction_job(MetaServiceCode& code, std::string& msg, std::string INSTANCE_LOG(INFO) << "remove tmp rowset meta, tablet_id=" << tablet_id << " tmp_rowset_key=" << hex(tmp_rowset_key); + using namespace std::chrono; + auto rowset_visible_time = + duration_cast(system_clock::now().time_since_epoch()).count(); + rs_meta.set_visible_time_ms(rowset_visible_time); + std::string rowset_val; + if (!rs_meta.SerializeToString(&rowset_val)) { + code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; + SS << "failed to serialize rowset meta, tablet_id=" << tablet_id + << " rowset_id=" << rowset_id; + msg = ss.str(); + return; + } + int64_t version = compaction.output_versions(0); auto rowset_key = meta_rowset_key({instance_id, tablet_id, version}); - txn->put(rowset_key, tmp_rowset_val); + txn->put(rowset_key, rowset_val); if (is_versioned_write) { std::string meta_rowset_compact_key = versioned::meta_rowset_compact_key({instance_id, tablet_id, version}); @@ -1867,9 +1880,31 @@ void process_schema_change_job(MetaServiceCode& code, std::string& msg, std::str : cast_as(err); return; } + + RowsetMetaCloudPB tmp_rowset_meta; + if (!tmp_rowset_meta.ParseFromString(tmp_rowset_val)) { + code = MetaServiceCode::PROTOBUF_PARSE_ERR; + SS << "malformed tmp rowset meta, unable to deserialize, tablet_id=" << new_tablet_id + << " key=" << hex(tmp_rowset_key); + msg = ss.str(); + return; + } + using namespace std::chrono; + auto rowset_visible_time = + duration_cast(system_clock::now().time_since_epoch()).count(); + tmp_rowset_meta.set_visible_time_ms(rowset_visible_time); + std::string rowset_val; + if (!tmp_rowset_meta.SerializeToString(&rowset_val)) { + code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; + SS << "failed to serialize rowset meta, tablet_id=" << new_tablet_id + << " rowset_id=" << tmp_rowset_meta.rowset_id_v2(); + msg = ss.str(); + return; + } + auto rowset_key = meta_rowset_key( {instance_id, new_tablet_id, schema_change.output_versions().at(i)}); - txn->put(rowset_key, tmp_rowset_val); + txn->put(rowset_key, rowset_val); txn->remove(tmp_rowset_key); if (is_versioned_write) { doris::RowsetMetaCloudPB rs_meta; diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index ce128f6d30e485..6eb046a01500ff 100644 --- a/cloud/src/meta-service/meta_service_txn.cpp +++ b/cloud/src/meta-service/meta_service_txn.cpp @@ -1316,6 +1316,10 @@ void MetaServiceImpl::commit_txn_immediately( std::vector, const RowsetMetaCloudPB&>> rowsets; std::unordered_map tablet_stats; // tablet_id -> stats rowsets.reserve(tmp_rowsets_meta.size()); + + int64_t rowsets_visible_time_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + for (auto& [_, i] : tmp_rowsets_meta) { int64_t tablet_id = i.tablet_id(); int64_t partition_id = i.partition_id(); @@ -1338,6 +1342,7 @@ void MetaServiceImpl::commit_txn_immediately( int64_t new_version = versions[partition_id] + 1; i.set_start_version(new_version); i.set_end_version(new_version); + i.set_visible_time_ms(rowsets_visible_time_ms); // Accumulate 
affected rows auto& stats = tablet_stats[tablet_id]; diff --git a/cloud/test/meta_service_job_test.cpp b/cloud/test/meta_service_job_test.cpp index b71fee9aa3f231..e6ff74cc927df1 100644 --- a/cloud/test/meta_service_job_test.cpp +++ b/cloud/test/meta_service_job_test.cpp @@ -1142,6 +1142,14 @@ TEST(MetaServiceJobTest, CompactionJobTest) { auto rowset_key = meta_rowset_key({instance_id, tablet_id, input_version_end}); std::string rowset_val; EXPECT_EQ(txn->get(rowset_key, &rowset_val), TxnErrorCode::TXN_OK) << hex(rowset_key); + doris::RowsetMetaCloudPB rowset_meta; + ASSERT_TRUE(rowset_meta.ParseFromString(rowset_val)); + ASSERT_TRUE(rowset_meta.has_visible_time_ms() && rowset_meta.visible_time_ms() > 0); + using namespace std::chrono; + auto visible_tp = time_point(milliseconds(rowset_meta.visible_time_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; }; auto test_abort_compaction_job = [&](int64_t table_id, int64_t index_id, int64_t partition_id, @@ -3630,6 +3638,13 @@ TEST(MetaServiceJobTest, SchemaChangeJobTest) { EXPECT_EQ(saved_rowset.start_version(), rs.start_version()); EXPECT_EQ(saved_rowset.end_version(), rs.end_version()); EXPECT_EQ(saved_rowset.rowset_id_v2(), rs.rowset_id_v2()); + ASSERT_TRUE(saved_rowset.has_visible_time_ms() && saved_rowset.visible_time_ms() > 0); + using namespace std::chrono; + auto visible_tp = + time_point(milliseconds(saved_rowset.visible_time_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; } for (int i = 3; i < 5; ++i) { // [14-14][15-15] auto [k, v] = it->next(); diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index ed51b9cb46eb0d..e67b662659ebb7 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -11873,4 +11874,66 @@ TEST(MetaServiceTest, CreateTabletIdempotentAndHandlingError) { ASSERT_EQ(res.status().code(), MetaServiceCode::KV_TXN_GET_ERR); } +TEST(MetaServiceTest, RowsetVisibleTimeTest) { + auto meta_service = get_meta_service(); + using namespace std::chrono; + int64_t txn_id = -1; + // begin txn + { + brpc::Controller cntl; + BeginTxnRequest req; + req.set_cloud_unique_id("test_cloud_unique_id"); + TxnInfoPB txn_info_pb; + txn_info_pb.set_db_id(666); + txn_info_pb.set_label("test_label"); + txn_info_pb.add_table_ids(1234); + txn_info_pb.set_timeout_ms(36000); + req.mutable_txn_info()->CopyFrom(txn_info_pb); + BeginTxnResponse res; + meta_service->begin_txn(reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &req, + &res, nullptr); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + txn_id = res.txn_id(); + } + + // mock rowset and tablet + int64_t tablet_id_base = 1103; + for (int i = 0; i < 5; ++i) { + create_tablet(meta_service.get(), 1234, 1235, 1236, tablet_id_base + i); + auto tmp_rowset = create_rowset(txn_id, tablet_id_base + i); + CreateRowsetResponse res; + commit_rowset(meta_service.get(), tmp_rowset, res); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + } + { + brpc::Controller cntl; + CommitTxnRequest req; + req.set_cloud_unique_id("test_cloud_unique_id"); + req.set_db_id(666); + req.set_txn_id(txn_id); + CommitTxnResponse res; + 
meta_service->commit_txn(reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &req, + &res, nullptr); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + } + + for (int i = 0; i < 5; ++i) { + int64_t tablet_id = tablet_id_base + i; + int64_t ver = 2; + std::string rowset_key = meta_rowset_key({mock_instance, tablet_id, ver}); + std::string val; + std::unique_ptr txn; + ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), TxnErrorCode::TXN_OK); + ASSERT_EQ(txn->get(rowset_key, &val), TxnErrorCode::TXN_OK); + RowsetMetaCloudPB rowset_pb; + ASSERT_TRUE(rowset_pb.ParseFromString(val)); + ASSERT_TRUE(rowset_pb.has_visible_time_ms()); + std::cout << rowset_pb.visible_time_ms() << "\n"; + ASSERT_GT(rowset_pb.visible_time_ms(), 0); + auto visible_tp = time_point(milliseconds(rowset_pb.visible_time_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; + } +} } // namespace doris::cloud diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index f9cccbbfd859e2..23cc71890e9e9c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -856,6 +856,8 @@ public static double getHotValueThreshold() { public static final String MULTI_DISTINCT_STRATEGY = "multi_distinct_strategy"; public static final String AGG_PHASE = "agg_phase"; + public static final String QUERY_FRESHNESS_TOLERANCE_MS = "query_freshness_tolerance_ms"; + /** * If set false, user couldn't submit analyze SQL and FE won't allocate any related resources. */ @@ -2504,6 +2506,9 @@ public boolean isEnableHboNonStrictMatchingMode() { }, checker = "checkSkewRewriteAggBucketNum") public int skewRewriteAggBucketNum = 1024; + @VariableMgr.VarAttr(name = QUERY_FRESHNESS_TOLERANCE_MS) + public long queryFreshnessToleranceMs = 5000; + public void setSkewRewriteAggBucketNum(int num) { this.skewRewriteAggBucketNum = num; } @@ -4556,6 +4561,8 @@ public TQueryOptions toThrift() { tResult.setSkipBadTablet(skipBadTablet); tResult.setDisableFileCache(disableFileCache); + tResult.setQueryFreshnessToleranceMs(queryFreshnessToleranceMs); + // for spill tResult.setEnableSpill(enableSpill); tResult.setEnableForceSpill(enableForceSpill); diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index a1d1719c5ebe51..9faaaaa362d43b 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -148,6 +148,8 @@ message RowsetMetaPB { optional SchemaDictKeyList schema_dict_key_list = 1008; // align to cloud rowset optional SplitSchemaPB __split_schema = 1009; // A special field, DO NOT change it. + + optional int64 visible_time_ms = 1010; } message SchemaDictKeyList { @@ -243,6 +245,8 @@ message RowsetMetaCloudPB { repeated InvertedIndexFileInfo inverted_index_file_info = 107; optional SplitSchemaPB __split_schema = 108; // A special field, DO NOT change it. 
+ + optional int64 visible_time_ms = 109; } message SegmentStatisticsPB { diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index b0456913037d2b..e90883d82bdbb9 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -404,6 +404,8 @@ struct TQueryOptions { 171: optional bool optimize_index_scan_parallelism = false; + 172: optional i64 query_freshness_tolerance_ms + // For cloud, to control if the content would be written into file cache // In write path, to control if the content would be written into file cache. // In read path, read from file cache or remote storage when execute query. From 060c74556c5962b3c6b32e2a595a723ce3f3c9e2 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 21 Aug 2025 11:53:02 +0800 Subject: [PATCH 02/34] add option enable_prefer_cached_rowset --- be/src/cloud/cloud_schema_change_job.cpp | 5 ++++- be/src/cloud/cloud_tablet.cpp | 17 ++++++++++++--- be/src/cloud/cloud_tablet.h | 8 ++++--- be/src/olap/base_tablet.h | 11 +++++++++- be/src/olap/tablet.cpp | 4 ++-- be/src/olap/tablet.h | 4 ++-- be/src/pipeline/exec/olap_scan_operator.cpp | 20 ++++++++---------- be/src/runtime/runtime_state.h | 5 +++++ be/src/vec/exec/scan/olap_scanner.cpp | 21 ++++++++----------- ...cloud_tablet_query_with_tolerance_test.cpp | 8 +++++-- be/test/olap/tablet_test.cpp | 4 ++-- gensrc/thrift/PaloInternalService.thrift | 3 ++- 12 files changed, 70 insertions(+), 40 deletions(-) diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index 42a2615b961318..ef3513530d55af 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -148,7 +148,10 @@ Status CloudSchemaChangeJob::process_alter_tablet(const TAlterTabletReqV2& reque if (request.alter_version > 1) { // [0-1] is a placeholder rowset, no need to convert RETURN_IF_ERROR(_base_tablet->capture_rs_readers({2, start_resp.alter_version()}, - &rs_splits, false)); + &rs_splits, + {.skip_missing_version = false, + .enable_prefer_cached_rowset = false, + .query_freshness_tolerance_ms = -1})); } Defer defer2 {[&]() { _new_tablet->set_alter_version(-1); diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 1748d7dff383a9..3da319a5f1fa26 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -153,7 +153,18 @@ Status CloudTablet::capture_consistent_rowsets_unlocked( Status CloudTablet::capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) { + const CaptureRsReaderOptions& opts) { + if (opts.query_freshness_tolerance_ms > 0) { + return capture_rs_readers_with_freshness_tolerance(spec_version, rs_splits, + opts.query_freshness_tolerance_ms); + } else if (opts.enable_prefer_cached_rowset) { + return capture_rs_readers_prefer_cache(spec_version, rs_splits); + } + return capture_rs_readers_internal(spec_version, rs_splits); +} + +Status CloudTablet::capture_rs_readers_internal(const Version& spec_version, + std::vector* rs_splits) { DBUG_EXECUTE_IF("CloudTablet.capture_rs_readers.return.e-230", { LOG_WARNING("CloudTablet.capture_rs_readers.return e-230").tag("tablet_id", tablet_id()); return Status::Error(-230, "injected error"); @@ -181,7 +192,7 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version, Status CloudTablet::capture_rs_readers_with_freshness_tolerance( const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version, int64_t 
query_freshness_tolerance_ms) { + int64_t query_freshness_tolerance_ms) { g_capture_with_freshness_tolerance_count << 1; using namespace std::chrono; auto freshness_limit_tp = system_clock::now() - milliseconds(query_freshness_tolerance_ms); @@ -240,7 +251,7 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( g_capture_with_freshness_tolerance_fallback_count << 1; // if there exists a rowset which satisfies freshness tolerance and its start version is larger than the path max version // but has not been warmuped up yet, fallback to capture rowsets as usual - return capture_rs_readers(spec_version, rs_splits, skip_missing_version); + return capture_rs_readers_internal(spec_version, rs_splits); } return capture_rs_readers_unlocked(version_path, rs_splits); diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index fc850cecbe8616..a205da7f36f542 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -68,11 +68,13 @@ class CloudTablet final : public BaseTablet { bool vertical) override; Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) override; - + const CaptureRsReaderOptions& opts) override; + Status capture_rs_readers_internal(const Version& spec_version, + std::vector* rs_splits); + Status capture_rs_readers_prefer_cache(const Version& spec_version, + std::vector* rs_splits); Status capture_rs_readers_with_freshness_tolerance(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version, int64_t query_freshness_tolerance_ms); Status capture_consistent_rowsets_unlocked( diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index c33e00ba3c598a..0044eb444bd3d5 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -50,6 +50,15 @@ struct TabletWithVersion { int64_t version; }; +struct CaptureRsReaderOptions { + // used by local mode only + bool skip_missing_version {false}; + + // used by cloud mode only + bool enable_prefer_cached_rowset {false}; + int64_t query_freshness_tolerance_ms {-1}; +}; + enum class CompactionStage { NOT_SCHEDULED, PENDING, EXECUTING }; // Base class for all tablet classes @@ -113,7 +122,7 @@ class BaseTablet : public std::enable_shared_from_this { virtual Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) = 0; + const CaptureRsReaderOptions& opts) = 0; virtual size_t tablet_footprint() = 0; diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 489eea4e439a30..ff71371f1bfd68 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -1007,11 +1007,11 @@ Status Tablet::capture_consistent_rowsets_unlocked(const Version& spec_version, } Status Tablet::capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) { + const CaptureRsReaderOptions& opts) { std::shared_lock rlock(_meta_lock); std::vector version_path; RETURN_IF_ERROR(capture_consistent_versions_unlocked(spec_version, &version_path, - skip_missing_version, false)); + opts.skip_missing_version, false)); RETURN_IF_ERROR(capture_rs_readers_unlocked(version_path, rs_splits)); return Status::OK(); } diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index be09bb64320bb6..28139a64f20ad9 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -197,9 +197,9 @@ class Tablet final : public BaseTablet { Status capture_consistent_rowsets_unlocked( const Version& spec_version, std::vector* rowsets) const override; - // If 
skip_missing_version is true, skip versions if they are missing. + // If opts.skip_missing_version is true, skip versions if they are missing. Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) override; + const CaptureRsReaderOptions& opts) override; // Find the missed versions until the spec_version. // diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 8b690b64acec35..8aab3c72512831 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -690,18 +690,16 @@ Status OlapScanLocalState::prepare(RuntimeState* state) { } } + CaptureRsReaderOptions opts { + .skip_missing_version = _state->skip_missing_version(), + .enable_prefer_cached_rowset = + config::is_cloud_mode() ? _state->enable_prefer_cached_rowset() : false, + .query_freshness_tolerance_ms = + config::is_cloud_mode() ? _state->query_freshness_tolerance_ms() : -1, + }; for (size_t i = 0; i < _scan_ranges.size(); i++) { - if (config::is_cloud_mode() && _state->enable_query_freshness_tolerance()) { - RETURN_IF_ERROR(std::static_pointer_cast(_tablets[i].tablet) - ->capture_rs_readers_with_freshness_tolerance( - {0, _tablets[i].version}, &_read_sources[i].rs_splits, - _state->skip_missing_version(), - _state->query_freshness_tolerance_ms())); - } else { - RETURN_IF_ERROR(_tablets[i].tablet->capture_rs_readers({0, _tablets[i].version}, - &_read_sources[i].rs_splits, - _state->skip_missing_version())); - } + RETURN_IF_ERROR(_tablets[i].tablet->capture_rs_readers({0, _tablets[i].version}, + &_read_sources[i].rs_splits, opts)); if (!PipelineXLocalState<>::_state->skip_delete_predicate()) { _read_sources[i].fill_delete_predicates(); } diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 044c888e45f3d6..24693dfd4bd240 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -423,6 +423,11 @@ class RuntimeState { bool enable_page_cache() const; + bool enable_prefer_cached_rowset() const { + return _query_options.__isset.enable_prefer_cached_rowset && + _query_options.enable_prefer_cached_rowset; + } + int64_t query_freshness_tolerance_ms() const { return _query_options.query_freshness_tolerance_ms; } diff --git a/be/src/vec/exec/scan/olap_scanner.cpp b/be/src/vec/exec/scan/olap_scanner.cpp index 3a1e890713704f..120a370ae31150 100644 --- a/be/src/vec/exec/scan/olap_scanner.cpp +++ b/be/src/vec/exec/scan/olap_scanner.cpp @@ -218,18 +218,15 @@ Status OlapScanner::prepare() { ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_hotspot().count(*tablet); } - Status st {}; - if (config::is_cloud_mode() && _state->enable_query_freshness_tolerance()) { - st = std::static_pointer_cast(tablet) - ->capture_rs_readers_with_freshness_tolerance( - _tablet_reader_params.version, &read_source.rs_splits, - _state->skip_missing_version(), - _state->query_freshness_tolerance_ms()); - } else { - st = tablet->capture_rs_readers(_tablet_reader_params.version, - &read_source.rs_splits, - _state->skip_missing_version()); - } + CaptureRsReaderOptions opts { + .skip_missing_version = _state->skip_missing_version(), + .enable_prefer_cached_rowset = + config::is_cloud_mode() ? _state->enable_prefer_cached_rowset() : false, + .query_freshness_tolerance_ms = + config::is_cloud_mode() ? 
_state->query_freshness_tolerance_ms() : -1, + }; + auto st = tablet->capture_rs_readers(_tablet_reader_params.version, + &read_source.rs_splits, opts); if (!st.ok()) { LOG(WARNING) << "fail to init reader.res=" << st; return st; diff --git a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp index 263b87f94d80b1..665a892b04365c 100644 --- a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp +++ b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp @@ -26,6 +26,7 @@ #include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet.h" +#include "olap/base_tablet.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_factory.h" #include "olap/rowset/rowset_meta.h" @@ -127,8 +128,11 @@ class TestFreshnessTolerance : public testing::Test { int64_t query_freshness_tolerance_ms, const std::vector& expected_versions) { std::vector rs_splits; - auto st = tablet->capture_rs_readers_with_freshness_tolerance( - spec_version, &rs_splits, false, query_freshness_tolerance_ms); + CaptureRsReaderOptions opts { + .skip_missing_version = false, .enable_prefer_cached_rowset = false, + .query_freshness_tolerance_ms = query_freshness_tolerance_ms + } + auto st = tablet->capture_rs_readers(spec_version, &rs_splits, opts); ASSERT_TRUE(st.ok()); auto dump_versions = [](const std::vector& expected_versions, const std::vector& splits) { diff --git a/be/test/olap/tablet_test.cpp b/be/test/olap/tablet_test.cpp index ea7ddc24bbf445..8b6700c1f16950 100644 --- a/be/test/olap/tablet_test.cpp +++ b/be/test/olap/tablet_test.cpp @@ -296,11 +296,11 @@ TEST_F(TestTablet, pad_rowset) { Version version(5, 5); std::vector splits; - ASSERT_FALSE(_tablet->capture_rs_readers(version, &splits, false).ok()); + ASSERT_FALSE(_tablet->capture_rs_readers(version, &splits, {}).ok()); splits.clear(); static_cast(PadRowsetAction::_pad_rowset(_tablet.get(), version)); - ASSERT_TRUE(_tablet->capture_rs_readers(version, &splits, false).ok()); + ASSERT_TRUE(_tablet->capture_rs_readers(version, &splits, {}).ok()); } TEST_F(TestTablet, cooldown_policy) { diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index e90883d82bdbb9..1e04385fd735aa 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -404,7 +404,8 @@ struct TQueryOptions { 171: optional bool optimize_index_scan_parallelism = false; - 172: optional i64 query_freshness_tolerance_ms + 172: optional bool enable_prefer_cached_rowset + 173: optional i64 query_freshness_tolerance_ms // For cloud, to control if the content would be written into file cache // In write path, to control if the content would be written into file cache. 
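
Taken together, PATCH 02 routes both knobs from the FE session variables
through TQueryOptions into the single CaptureRsReaderOptions struct that every
capture_rs_readers() override now accepts. The sketch below is a minimal model
of that dispatch for illustration only; the helper make_capture_opts, the
QueryState struct, and the is_cloud flag are hypothetical stand-ins, not code
from this series.

    #include <cstdint>

    // Hypothetical stand-ins for the runtime state consulted in
    // olap_scan_operator.cpp / olap_scanner.cpp above.
    struct QueryState {
        bool skip_missing_version = false;         // from TQueryOptions
        bool enable_prefer_cached_rowset = false;  // session variable (cloud only)
        int64_t query_freshness_tolerance_ms = -1; // session variable (cloud only)
    };

    struct CaptureRsReaderOptions {
        bool skip_missing_version {false};         // honored by local tablets only
        bool enable_prefer_cached_rowset {false};  // cloud mode only
        int64_t query_freshness_tolerance_ms {-1}; // cloud mode only, <= 0 disables
    };

    // Cloud-only knobs are neutralized outside cloud mode, so local tablets
    // always take the plain capture path.
    CaptureRsReaderOptions make_capture_opts(const QueryState& s, bool is_cloud) {
        return CaptureRsReaderOptions {
                .skip_missing_version = s.skip_missing_version,
                .enable_prefer_cached_rowset = is_cloud && s.enable_prefer_cached_rowset,
                .query_freshness_tolerance_ms = is_cloud ? s.query_freshness_tolerance_ms : -1,
        };
    }

On the CloudTablet side the precedence is fixed: a positive
query_freshness_tolerance_ms wins, then enable_prefer_cached_rowset, and
otherwise the ordinary capture path runs.
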
From f42b0671f14036d7b2d1b2d4e4cbf74ea33c2719 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 21 Aug 2025 12:14:08 +0800 Subject: [PATCH 03/34] set visible ts for rowset in lazy committer and sub txn commit' --- be/src/cloud/cloud_tablet.cpp | 8 +++++--- be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp | 7 +++---- cloud/src/meta-service/meta_service_job.cpp | 1 - cloud/src/meta-service/meta_service_txn.cpp | 4 ++++ cloud/src/meta-service/txn_lazy_committer.cpp | 4 ++++ 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 3da319a5f1fa26..a6bcb2a5e91f42 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -157,7 +157,7 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version, if (opts.query_freshness_tolerance_ms > 0) { return capture_rs_readers_with_freshness_tolerance(spec_version, rs_splits, opts.query_freshness_tolerance_ms); - } else if (opts.enable_prefer_cached_rowset) { + } else if (opts.enable_prefer_cached_rowset && enable_unique_key_merge_on_write()) { return capture_rs_readers_prefer_cache(spec_version, rs_splits); } return capture_rs_readers_internal(spec_version, rs_splits); @@ -207,10 +207,12 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( if (it == _rs_version_map.end()) { it = _stale_rs_version_map.find(version); if (it == _stale_rs_version_map.end()) { - return Status::Error( - "fail to find Rowset in stale_rs_version for version. tablet={}, " + LOG_INFO( + "fail to find Rowset in rs_version or stale_rs_version for version. " + "tablet={}, " "version={}-{}", tablet_id(), version.first, version.second); + return false; } } const auto& rs = it->second; diff --git a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp index 665a892b04365c..87bc4e0ddf83a8 100644 --- a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp +++ b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp @@ -128,10 +128,9 @@ class TestFreshnessTolerance : public testing::Test { int64_t query_freshness_tolerance_ms, const std::vector& expected_versions) { std::vector rs_splits; - CaptureRsReaderOptions opts { - .skip_missing_version = false, .enable_prefer_cached_rowset = false, - .query_freshness_tolerance_ms = query_freshness_tolerance_ms - } + CaptureRsReaderOptions opts {.skip_missing_version = false, + .enable_prefer_cached_rowset = false, + .query_freshness_tolerance_ms = query_freshness_tolerance_ms}; auto st = tablet->capture_rs_readers(spec_version, &rs_splits, opts); ASSERT_TRUE(st.ok()); auto dump_versions = [](const std::vector& expected_versions, diff --git a/cloud/src/meta-service/meta_service_job.cpp b/cloud/src/meta-service/meta_service_job.cpp index 4338ab7a0b0336..dca269980a2aa6 100644 --- a/cloud/src/meta-service/meta_service_job.cpp +++ b/cloud/src/meta-service/meta_service_job.cpp @@ -1234,7 +1234,6 @@ void process_compaction_job(MetaServiceCode& code, std::string& msg, std::string return; } - // We don't actually need to parse the rowset meta doris::RowsetMetaCloudPB rs_meta; rs_meta.ParseFromString(tmp_rowset_val); if (rs_meta.txn_id() <= 0) { diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index 6eb046a01500ff..92bca3ff8b5833 100644 --- a/cloud/src/meta-service/meta_service_txn.cpp +++ b/cloud/src/meta-service/meta_service_txn.cpp @@ -2333,6 +2333,9 @@ void MetaServiceImpl::commit_txn_with_sub_txn(const 
CommitTxnRequest* request, commit_txn_log.set_txn_id(txn_id); commit_txn_log.set_db_id(db_id); + int64_t rowsets_visible_time_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + // -> rowset meta std::vector, RowsetMetaCloudPB>> rowsets; std::unordered_map tablet_stats; // tablet_id -> stats @@ -2366,6 +2369,7 @@ void MetaServiceImpl::commit_txn_with_sub_txn(const CommitTxnRequest* request, } i.set_start_version(new_version); i.set_end_version(new_version); + i.set_visible_time_ms(rowsets_visible_time_ms); LOG(INFO) << "xxx update rowset version, txn_id=" << txn_id << ", sub_txn_id=" << sub_txn_id << ", table_id=" << table_id << ", partition_id=" << partition_id << ", tablet_id=" << tablet_id diff --git a/cloud/src/meta-service/txn_lazy_committer.cpp b/cloud/src/meta-service/txn_lazy_committer.cpp index 01192508bba6d4..233e2d33f6b97d 100644 --- a/cloud/src/meta-service/txn_lazy_committer.cpp +++ b/cloud/src/meta-service/txn_lazy_committer.cpp @@ -147,6 +147,9 @@ void convert_tmp_rowsets( // tablet_id -> stats std::unordered_map tablet_stats; + int64_t rowsets_visible_time_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + for (auto& [tmp_rowset_key, tmp_rowset_pb] : tmp_rowsets_meta) { std::string tmp_rowst_data; err = txn->get(tmp_rowset_key, &tmp_rowst_data); @@ -309,6 +312,7 @@ void convert_tmp_rowsets( tmp_rowset_pb.set_start_version(version); tmp_rowset_pb.set_end_version(version); + tmp_rowset_pb.set_visible_time_ms(rowsets_visible_time_ms); rowset_val.clear(); if (!tmp_rowset_pb.SerializeToString(&rowset_val)) { From 94af6df771ed6829f5a133065e0c4dfa9350b4f8 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 21 Aug 2025 14:07:38 +0800 Subject: [PATCH 04/34] change visible_time_ms to visible_ts_ms --- be/src/cloud/cloud_tablet.cpp | 9 +++++++++ be/src/cloud/pb_convert.cpp | 16 ++++++++-------- be/src/olap/rowset/rowset_meta.h | 12 ++++++------ .../cloud_tablet_query_with_tolerance_test.cpp | 6 +++--- cloud/src/meta-service/meta_service_job.cpp | 4 ++-- cloud/src/meta-service/meta_service_txn.cpp | 4 ++-- cloud/src/meta-service/txn_lazy_committer.cpp | 4 ++-- cloud/test/meta_service_job_test.cpp | 8 ++++---- cloud/test/meta_service_test.cpp | 8 ++++---- gensrc/proto/olap_file.proto | 4 ++-- 10 files changed, 42 insertions(+), 33 deletions(-) diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index a6bcb2a5e91f42..01024e75ac828c 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -73,10 +73,13 @@ bvar::LatencyRecorder g_base_compaction_get_delete_bitmap_lock_time_ms( bvar::Adder g_unused_rowsets_count("unused_rowsets_count"); bvar::Adder g_unused_rowsets_bytes("unused_rowsets_bytes"); +bvar::Adder g_capture_prefer_cache_count("capture_prefer_cache_count"); bvar::Adder g_capture_with_freshness_tolerance_count( "capture_with_freshness_tolerance_count"); bvar::Adder g_capture_with_freshness_tolerance_fallback_count( "capture_with_freshness_tolerance_fallback_count"); +bvar::Window> g_capture_prefer_cache_count_window( + "capture_prefer_cache_count_window", &g_capture_prefer_cache_count, 30); bvar::Window> g_capture_with_freshness_tolerance_count_window( "capture_with_freshness_tolerance_count_window", &g_capture_with_freshness_tolerance_count, 30); @@ -190,6 +193,12 @@ Status CloudTablet::capture_rs_readers_internal(const Version& spec_version, return capture_rs_readers_unlocked(version_path, rs_splits); } +Status CloudTablet::capture_rs_readers_prefer_cache(const Version& spec_version, + 
std::vector* rs_splits) { + g_capture_prefer_cache_count << 1; + return Status::OK(); +} + Status CloudTablet::capture_rs_readers_with_freshness_tolerance( const Version& spec_version, std::vector* rs_splits, int64_t query_freshness_tolerance_ms) { diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index 113ee352bc7828..9b51876104687a 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -96,8 +96,8 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in) if (in.has___split_schema()) { out->mutable___split_schema()->CopyFrom(in.__split_schema()); } - if (in.has_visible_time_ms()) { - out->set_visible_time_ms(in.visible_time_ms()); + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); } } @@ -162,8 +162,8 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { if (in.has___split_schema()) { out->mutable___split_schema()->Swap(in.mutable___split_schema()); } - if (in.has_visible_time_ms()) { - out->set_visible_time_ms(in.visible_time_ms()); + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); } } @@ -238,8 +238,8 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) if (in.has___split_schema()) { out->mutable___split_schema()->CopyFrom(in.__split_schema()); } - if (in.has_visible_time_ms()) { - out->set_visible_time_ms(in.visible_time_ms()); + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); } } @@ -303,8 +303,8 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { if (in.has___split_schema()) { out->mutable___split_schema()->Swap(in.mutable___split_schema()); } - if (in.has_visible_time_ms()) { - out->set_visible_time_ms(in.visible_time_ms()); + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); } } diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h index ca532834296fac..6d536d5c2a2d7b 100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -370,18 +370,18 @@ class RowsetMeta : public MetadataAdder { int64_t newest_write_timestamp() const { return _rowset_meta_pb.newest_write_timestamp(); } // for cloud only - bool has_visible_time_ms() const { return _rowset_meta_pb.has_visible_time_ms(); } - int64_t visible_time_ms() const { return _rowset_meta_pb.visible_time_ms(); } + bool has_visible_ts_ms() const { return _rowset_meta_pb.has_visible_ts_ms(); } + int64_t visible_ts_ms() const { return _rowset_meta_pb.visible_ts_ms(); } std::chrono::time_point visible_timestamp() const { using namespace std::chrono; - if (has_visible_time_ms()) { - return time_point(milliseconds(visible_time_ms())); + if (has_visible_ts_ms()) { + return time_point(milliseconds(visible_ts_ms())); } return system_clock::from_time_t(newest_write_timestamp()); } #ifdef BE_TEST - void set_visible_time_ms(int64_t visible_time_ms) { - _rowset_meta_pb.set_visible_time_ms(visible_time_ms); + void set_visible_ts_ms(int64_t visible_ts_ms) { + _rowset_meta_pb.set_visible_ts_ms(visible_ts_ms); } #endif diff --git a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp index 87bc4e0ddf83a8..61c4d84143ed4e 100644 --- a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp +++ b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp @@ -68,7 +68,7 @@ class TestFreshnessTolerance : public testing::Test { if (!rs) { return nullptr; } - 
rs->rowset_meta()->set_visible_time_ms( + rs->rowset_meta()->set_visible_ts_ms( duration_cast(visible_timestamp.time_since_epoch()).count()); return rs; } @@ -174,14 +174,14 @@ TEST_F(TestFreshnessTolerance, testVisibleTimestamp) { } { - // when visible_time_ms is set, RowsetMeta::visible_timestamp() uses visible_time_ms which is more precise + // when visible_ts_ms is set, RowsetMeta::visible_timestamp() uses visible_ts_ms which is more precise auto tp1 = system_clock::now() - seconds(100); auto tp2 = system_clock::now() - seconds(50); auto rs = create_rowset_without_visible_time({2, 2}); auto d1 = duration_cast(tp1.time_since_epoch()).count(); auto d2 = duration_cast(tp2.time_since_epoch()).count(); rs->rowset_meta()->set_newest_write_timestamp(d1); - rs->rowset_meta()->set_visible_time_ms(d2); + rs->rowset_meta()->set_visible_ts_ms(d2); ASSERT_EQ(rs->rowset_meta()->visible_timestamp(), time_point(milliseconds(d2))); } diff --git a/cloud/src/meta-service/meta_service_job.cpp b/cloud/src/meta-service/meta_service_job.cpp index dca269980a2aa6..0ea6f2d9483a44 100644 --- a/cloud/src/meta-service/meta_service_job.cpp +++ b/cloud/src/meta-service/meta_service_job.cpp @@ -1251,7 +1251,7 @@ void process_compaction_job(MetaServiceCode& code, std::string& msg, std::string using namespace std::chrono; auto rowset_visible_time = duration_cast(system_clock::now().time_since_epoch()).count(); - rs_meta.set_visible_time_ms(rowset_visible_time); + rs_meta.set_visible_ts_ms(rowset_visible_time); std::string rowset_val; if (!rs_meta.SerializeToString(&rowset_val)) { code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; @@ -1891,7 +1891,7 @@ void process_schema_change_job(MetaServiceCode& code, std::string& msg, std::str using namespace std::chrono; auto rowset_visible_time = duration_cast(system_clock::now().time_since_epoch()).count(); - tmp_rowset_meta.set_visible_time_ms(rowset_visible_time); + tmp_rowset_meta.set_visible_ts_ms(rowset_visible_time); std::string rowset_val; if (!tmp_rowset_meta.SerializeToString(&rowset_val)) { code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index 92bca3ff8b5833..289ce0a8917b81 100644 --- a/cloud/src/meta-service/meta_service_txn.cpp +++ b/cloud/src/meta-service/meta_service_txn.cpp @@ -1317,7 +1317,7 @@ void MetaServiceImpl::commit_txn_immediately( std::unordered_map tablet_stats; // tablet_id -> stats rowsets.reserve(tmp_rowsets_meta.size()); - int64_t rowsets_visible_time_ms = + int64_t rowsets_visible_ts_ms = duration_cast(system_clock::now().time_since_epoch()).count(); for (auto& [_, i] : tmp_rowsets_meta) { @@ -1342,7 +1342,7 @@ void MetaServiceImpl::commit_txn_immediately( int64_t new_version = versions[partition_id] + 1; i.set_start_version(new_version); i.set_end_version(new_version); - i.set_visible_time_ms(rowsets_visible_time_ms); + i.set_visible_ts_ms(rowsets_visible_ts_ms); // Accumulate affected rows auto& stats = tablet_stats[tablet_id]; diff --git a/cloud/src/meta-service/txn_lazy_committer.cpp b/cloud/src/meta-service/txn_lazy_committer.cpp index 233e2d33f6b97d..ddf0cd111e6ad8 100644 --- a/cloud/src/meta-service/txn_lazy_committer.cpp +++ b/cloud/src/meta-service/txn_lazy_committer.cpp @@ -147,7 +147,7 @@ void convert_tmp_rowsets( // tablet_id -> stats std::unordered_map tablet_stats; - int64_t rowsets_visible_time_ms = + int64_t rowsets_visible_ts_ms = duration_cast(system_clock::now().time_since_epoch()).count(); for (auto& [tmp_rowset_key, 
tmp_rowset_pb] : tmp_rowsets_meta) { @@ -312,7 +312,7 @@ void convert_tmp_rowsets( tmp_rowset_pb.set_start_version(version); tmp_rowset_pb.set_end_version(version); - tmp_rowset_pb.set_visible_time_ms(rowsets_visible_time_ms); + tmp_rowset_pb.set_visible_ts_ms(rowsets_visible_ts_ms); rowset_val.clear(); if (!tmp_rowset_pb.SerializeToString(&rowset_val)) { diff --git a/cloud/test/meta_service_job_test.cpp b/cloud/test/meta_service_job_test.cpp index e6ff74cc927df1..351de729493064 100644 --- a/cloud/test/meta_service_job_test.cpp +++ b/cloud/test/meta_service_job_test.cpp @@ -1144,9 +1144,9 @@ TEST(MetaServiceJobTest, CompactionJobTest) { EXPECT_EQ(txn->get(rowset_key, &rowset_val), TxnErrorCode::TXN_OK) << hex(rowset_key); doris::RowsetMetaCloudPB rowset_meta; ASSERT_TRUE(rowset_meta.ParseFromString(rowset_val)); - ASSERT_TRUE(rowset_meta.has_visible_time_ms() && rowset_meta.visible_time_ms() > 0); + ASSERT_TRUE(rowset_meta.has_visible_ts_ms() && rowset_meta.visible_ts_ms() > 0); using namespace std::chrono; - auto visible_tp = time_point(milliseconds(rowset_meta.visible_time_ms())); + auto visible_tp = time_point(milliseconds(rowset_meta.visible_ts_ms())); std::time_t visible_time = system_clock::to_time_t(visible_tp); std::cout << "visible time: " << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; @@ -3638,10 +3638,10 @@ TEST(MetaServiceJobTest, SchemaChangeJobTest) { EXPECT_EQ(saved_rowset.start_version(), rs.start_version()); EXPECT_EQ(saved_rowset.end_version(), rs.end_version()); EXPECT_EQ(saved_rowset.rowset_id_v2(), rs.rowset_id_v2()); - ASSERT_TRUE(saved_rowset.has_visible_time_ms() && saved_rowset.visible_time_ms() > 0); + ASSERT_TRUE(saved_rowset.has_visible_ts_ms() && saved_rowset.visible_ts_ms() > 0); using namespace std::chrono; auto visible_tp = - time_point(milliseconds(saved_rowset.visible_time_ms())); + time_point(milliseconds(saved_rowset.visible_ts_ms())); std::time_t visible_time = system_clock::to_time_t(visible_tp); std::cout << "visible time: " << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index e67b662659ebb7..f45158693dec86 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -11927,10 +11927,10 @@ TEST(MetaServiceTest, RowsetVisibleTimeTest) { ASSERT_EQ(txn->get(rowset_key, &val), TxnErrorCode::TXN_OK); RowsetMetaCloudPB rowset_pb; ASSERT_TRUE(rowset_pb.ParseFromString(val)); - ASSERT_TRUE(rowset_pb.has_visible_time_ms()); - std::cout << rowset_pb.visible_time_ms() << "\n"; - ASSERT_GT(rowset_pb.visible_time_ms(), 0); - auto visible_tp = time_point(milliseconds(rowset_pb.visible_time_ms())); + ASSERT_TRUE(rowset_pb.has_visible_ts_ms()); + std::cout << rowset_pb.visible_ts_ms() << "\n"; + ASSERT_GT(rowset_pb.visible_ts_ms(), 0); + auto visible_tp = time_point(milliseconds(rowset_pb.visible_ts_ms())); std::time_t visible_time = system_clock::to_time_t(visible_tp); std::cout << "visible time: " << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 9faaaaa362d43b..3dda0c132a9485 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -149,7 +149,7 @@ message RowsetMetaPB { optional SplitSchemaPB __split_schema = 1009; // A special field, DO NOT change it. 
- optional int64 visible_time_ms = 1010; + optional int64 visible_ts_ms = 1010; } message SchemaDictKeyList { @@ -246,7 +246,7 @@ message RowsetMetaCloudPB { optional SplitSchemaPB __split_schema = 108; // A special field, DO NOT change it. - optional int64 visible_time_ms = 109; + optional int64 visible_ts_ms = 109; } message SegmentStatisticsPB { From e823317a23d54de00d35d24a57854b60f3de57d8 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 21 Aug 2025 14:54:14 +0800 Subject: [PATCH 05/34] add callback --- be/src/cloud/cloud_backend_service.cpp | 1 + be/src/cloud/cloud_tablet.cpp | 14 +++++++++----- be/src/cloud/cloud_tablet.h | 3 ++- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/be/src/cloud/cloud_backend_service.cpp b/be/src/cloud/cloud_backend_service.cpp index a5caf89e2074af..bbc70fc1739cc3 100644 --- a/be/src/cloud/cloud_backend_service.cpp +++ b/be/src/cloud/cloud_backend_service.cpp @@ -213,6 +213,7 @@ void CloudBackendService::warm_up_cache_async(TWarmUpCacheAsyncResponse& respons if (!cntl.Failed()) { g_file_cache_warm_up_cache_async_submitted_segment_num << brpc_response.file_cache_block_metas().size(); + // TODO(bobhan1): add callback _engine.file_cache_block_downloader().submit_download_task( std::move(*brpc_response.mutable_file_cache_block_metas())); } else { diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 01024e75ac828c..296533e9b73eb3 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -216,18 +216,17 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( if (it == _rs_version_map.end()) { it = _stale_rs_version_map.find(version); if (it == _stale_rs_version_map.end()) { - LOG_INFO( + LOG_WARNING( "fail to find Rowset in rs_version or stale_rs_version for version. " - "tablet={}, " - "version={}-{}", - tablet_id(), version.first, version.second); + "tablet={}, version={}", + tablet_id(), version.to_string()); return false; } } const auto& rs = it->second; if (rs->visible_timestamp() < startup_timepoint) { // We only care about rowsets that are created after startup time point. For other rowsets, - // we assume they are warmuped up. + // we assume they are warmed up. 
             return true;
         }
         return is_rowset_warmed_up(rs->rowset_id());
     };
@@ -641,6 +640,10 @@ uint64_t CloudTablet::delete_expired_stale_rowsets() {
         auto& manager = ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager();
         manager.recycle_cache(tablet_id(), recycled_rowsets);
     }
+    // these rowsets will no longer be chosen for queries, so there is no need to
+    // keep tracking whether they are warmed up
+    for (const auto& rs : expired_rowsets) {
+        remove_warmed_up_rowset(rs->rowset_id());
+    }
     if (config::enable_mow_verbose_log) {
         LOG_INFO("finish delete_expired_stale_rowset for tablet={}", tablet_id());
     }
@@ -1602,6 +1605,7 @@ WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Stat
     _rowset_warm_up_states[rowset_id].second--;
     if (_rowset_warm_up_states[rowset_id].second <= 0) {
         g_file_cache_warm_up_rowset_complete_num << 1;
+        add_warmed_up_rowset(rowset_id);
         _rowset_warm_up_states[rowset_id].first = WarmUpState::DONE;
     }
     return _rowset_warm_up_states[rowset_id].first;
diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index a205da7f36f542..7f01da18b3b5ee 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -312,7 +312,8 @@ class CloudTablet final : public BaseTablet {
         return _warmed_up_rowsets.contains(rowset_id);
     }
 
-    // TODO: add to warm up callback when file cache donwload task is done
+    // mark a rowset as warmed up
+    // must be called when the file cache download task on this rowset is done
     void add_warmed_up_rowset(const RowsetId& rowset_id) {
         std::unique_lock wlock(_warmed_up_rowsets_mutex);
         _warmed_up_rowsets.insert(rowset_id);

From a9db01360771ba76f387ef544d783b0c36a4bb42 Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Thu, 21 Aug 2025 15:57:44 +0800
Subject: [PATCH 06/34] add query prefer cache impl

---
 be/src/cloud/cloud_tablet.cpp                 |  92 ++++--
 be/src/cloud/cloud_tablet.h                   |   3 +
 be/src/olap/version_graph.cpp                 |  51 +--
 be/src/olap/version_graph.h                   |   8 +-
 .../cloud_tablet_query_prefer_cache_test.cpp  | 296 ++++++++++++++++++
 5 files changed, 396 insertions(+), 54 deletions(-)
 create mode 100644 be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp

diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 296533e9b73eb3..55dbdb5bc14476 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -74,12 +74,17 @@ bvar::Adder g_unused_rowsets_count("unused_rowsets_count");
 bvar::Adder g_unused_rowsets_bytes("unused_rowsets_bytes");
 
 bvar::Adder g_capture_prefer_cache_count("capture_prefer_cache_count");
+bvar::Adder g_capture_prefer_cache_count_fallback_count(
+        "capture_prefer_cache_count_fallback_count");
 bvar::Adder g_capture_with_freshness_tolerance_count(
         "capture_with_freshness_tolerance_count");
 bvar::Adder g_capture_with_freshness_tolerance_fallback_count(
         "capture_with_freshness_tolerance_fallback_count");
 bvar::Window> g_capture_prefer_cache_count_window(
         "capture_prefer_cache_count_window", &g_capture_prefer_cache_count, 30);
+bvar::Window> g_capture_prefer_cache_count_fallback_count_window(
+        "capture_prefer_cache_count_fallback_count_window",
+        &g_capture_prefer_cache_count_fallback_count, 30);
 bvar::Window> g_capture_with_freshness_tolerance_count_window(
         "capture_with_freshness_tolerance_count_window", &g_capture_with_freshness_tolerance_count,
         30);
@@ -160,7 +165,7 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version,
     if (opts.query_freshness_tolerance_ms > 0) {
         return capture_rs_readers_with_freshness_tolerance(spec_version, rs_splits,
                                                            opts.query_freshness_tolerance_ms);
-    } else if (opts.enable_prefer_cached_rowset && enable_unique_key_merge_on_write()) {
+    } else if (opts.enable_prefer_cached_rowset && !enable_unique_key_merge_on_write()) {
         return capture_rs_readers_prefer_cache(spec_version, rs_splits);
     }
     return capture_rs_readers_internal(spec_version, rs_splits);
@@ -196,41 +201,66 @@ Status CloudTablet::capture_rs_readers_internal(const Version& spec_version,
 Status CloudTablet::capture_rs_readers_prefer_cache(const Version& spec_version,
                                                     std::vector* rs_splits) {
     g_capture_prefer_cache_count << 1;
-    return Status::OK();
+    Versions version_path;
+    std::shared_lock rlock(_meta_lock);
+    RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator(
+            spec_version, version_path,
+            [&](int64_t start, int64_t end) { return rowset_is_warmed_up(start, end); }));
+    int64_t path_max_version = version_path.back().second;
+    LOG_INFO("[verbose] CloudTablet::capture_rs_readers_prefer_cache, capture path: {}",
+             fmt::join(version_path | std::views::transform([](const auto& version) {
+                           return fmt::format("{}", version.to_string());
+                       }),
+                       ", "))
+            .tag("tablet_id", tablet_id())
+            .tag("spec_version", spec_version.to_string())
+            .tag("path_max_version", path_max_version);
+    bool should_fallback = path_max_version < spec_version.second;
+    if (should_fallback) {
+        rlock.unlock();
+        LOG_INFO(
+                "[verbose] CloudTablet::capture_rs_readers_prefer_cache, fallback, spec_version={}",
+                spec_version.to_string());
+        g_capture_prefer_cache_count_fallback_count << 1;
+        // if the path of fully warmed-up rowsets cannot reach the requested version,
+        // fall back to capturing rowsets as usual
+        return capture_rs_readers_internal(spec_version, rs_splits);
+    }
+    return capture_rs_readers_unlocked(version_path, rs_splits);
 }
 
+bool CloudTablet::rowset_is_warmed_up(int64_t start_version, int64_t end_version) {
+    if (start_version > end_version) {
+        return false;
+    }
+    Version version {start_version, end_version};
+    auto it = _rs_version_map.find(version);
+    if (it == _rs_version_map.end()) {
+        it = _stale_rs_version_map.find(version);
+        if (it == _stale_rs_version_map.end()) {
+            LOG_WARNING(
+                    "fail to find Rowset in rs_version or stale_rs_version for version. "
+                    "tablet={}, version={}",
+                    tablet_id(), version.to_string());
+            return false;
+        }
+    }
+    const auto& rs = it->second;
+    if (rs->visible_timestamp() < _engine.startup_timepoint()) {
+        // We only care about rowsets that are created after startup time point. For other rowsets,
+        // we assume they are warmed up.
+ return true; + } + return is_rowset_warmed_up(rs->rowset_id()); +}; + Status CloudTablet::capture_rs_readers_with_freshness_tolerance( const Version& spec_version, std::vector* rs_splits, int64_t query_freshness_tolerance_ms) { g_capture_with_freshness_tolerance_count << 1; using namespace std::chrono; auto freshness_limit_tp = system_clock::now() - milliseconds(query_freshness_tolerance_ms); - auto startup_timepoint = _engine.startup_timepoint(); // find a version path where every edge(rowset) has been warmuped - auto rowset_is_warmed_up = [&](int64_t start_version, int64_t end_version) -> bool { - if (start_version > end_version) { - return false; - } - Version version {start_version, end_version}; - auto it = _rs_version_map.find(version); - if (it == _rs_version_map.end()) { - it = _stale_rs_version_map.find(version); - if (it == _stale_rs_version_map.end()) { - LOG_WARNING( - "fail to find Rowset in rs_version or stale_rs_version for version. " - "tablet={}, version={}", - tablet_id(), version.to_string()); - return false; - } - } - const auto& rs = it->second; - if (rs->visible_timestamp() < startup_timepoint) { - // We only care about rowsets that are created after startup time point. For other rowsets, - // we assume they are warmed up. - return true; - } - return is_rowset_warmed_up(rs->rowset_id()); - }; Versions version_path; std::shared_lock rlock(_meta_lock); if (enable_unique_key_merge_on_write()) { @@ -238,10 +268,13 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( // So we can ony capture rowsets which are in newest data layout. Otherwise there may be data correctness issue. RETURN_IF_ERROR( _timestamped_version_tracker.capture_newest_consistent_versions_with_validator( - 0, version_path, rowset_is_warmed_up)); + spec_version, version_path, [&](int64_t start, int64_t end) { + return rowset_is_warmed_up(start, end); + })); } else { RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator( - 0, version_path, rowset_is_warmed_up)); + spec_version, version_path, + [&](int64_t start, int64_t end) { return rowset_is_warmed_up(start, end); })); } int64_t path_max_version = version_path.back().second; auto should_be_visible_but_not_warmed_up = [&](const auto& rs_meta) -> bool { @@ -258,6 +291,7 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( std::ranges::any_of(_tablet_meta->all_stale_rs_metas(), should_be_visible_but_not_warmed_up); if (should_fallback) { + rlock.unlock(); g_capture_with_freshness_tolerance_fallback_count << 1; // if there exists a rowset which satisfies freshness tolerance and its start version is larger than the path max version // but has not been warmuped up yet, fallback to capture rowsets as usual diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 7f01da18b3b5ee..3c442e75edbca1 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -332,6 +332,9 @@ class CloudTablet final : public BaseTablet { bool add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state); + // used by capture_rs_reader_xxx functions + bool rowset_is_warmed_up(int64_t start_version, int64_t end_version); + CloudStorageEngine& _engine; // this mutex MUST ONLY be used when sync meta diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp index 3cdac07da1d67a..b77eede9b0a78d 100644 --- a/be/src/olap/version_graph.cpp +++ b/be/src/olap/version_graph.cpp @@ -338,17 +338,17 @@ Status TimestampedVersionTracker::capture_consistent_versions( } 
Status TimestampedVersionTracker::capture_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const { - return _version_graph.capture_consistent_versions_with_validator(start, version_path, + return _version_graph.capture_consistent_versions_with_validator(spec_version, version_path, validator); } Status TimestampedVersionTracker::capture_newest_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const { - return _version_graph.capture_newest_consistent_versions_with_validator(start, version_path, - validator); + return _version_graph.capture_newest_consistent_versions_with_validator( + spec_version, version_path, validator); } void TimestampedVersionTracker::capture_expired_paths( @@ -651,22 +651,29 @@ Status VersionGraph::capture_consistent_versions(const Version& spec_version, } Status VersionGraph::capture_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + int64_t cur_idx = -1; for (size_t i = 0; i < _version_graph.size(); i++) { - if (_version_graph[i].value == start) { + if (_version_graph[i].value == spec_version.first) { cur_idx = i; break; } } if (cur_idx < 0) { - return Status::InternalError("failed to find path in version_graph. start {}", - start); + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); } - while (true) { + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { int64_t next_idx = -1; for (const auto& it : _version_graph[cur_idx].edges) { // Only consider incremental versions. @@ -695,23 +702,29 @@ Status VersionGraph::capture_consistent_versions_with_validator( } Status VersionGraph::capture_newest_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + int64_t cur_idx = -1; for (size_t i = 0; i < _version_graph.size(); i++) { - if (_version_graph[i].value == start) { + if (_version_graph[i].value == spec_version.first) { cur_idx = i; break; } } if (cur_idx < 0) { - return Status::InternalError("failed to find path in version_graph. start {}", - start); + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); } - std::optional end_value; - while (!end_value.has_value() || _version_graph[cur_idx].value < end_value.value()) { + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { int64_t next_idx = -1; for (const auto& it : _version_graph[cur_idx].edges) { // Only consider incremental versions. 
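
Both validator-guided walks in this file share one loop shape: start at the
node for spec_version.first and greedily take the largest outgoing edge whose
rowset passes the validator, where an edge from node v to node w (w > v)
stands for rowset [v, w-1]. The standalone model below is a sketch under those
assumptions (hypothetical types, not the real VersionGraph); the "newest"
variant differs by shrinking end_value when a multi-version edge fails
validation instead of giving up immediately.

    #include <cstdint>
    #include <functional>
    #include <map>
    #include <optional>
    #include <set>
    #include <utility>
    #include <vector>

    using Version = std::pair<int64_t, int64_t>; // inclusive [start, end]

    // graph[v] holds the forward targets of node v, largest first, mirroring the
    // "prefer the most compacted rowset" order of the walk above.
    std::optional<std::vector<Version>> capture_path(
            const std::map<int64_t, std::set<int64_t, std::greater<int64_t>>>& graph,
            int64_t start, int64_t end,
            const std::function<bool(int64_t, int64_t)>& validator) {
        std::vector<Version> path;
        int64_t cur = start;
        while (cur <= end) {
            auto node = graph.find(cur);
            if (node == graph.end()) {
                return std::nullopt; // dead end before reaching the spec version
            }
            int64_t next = -1;
            for (int64_t target : node->second) {
                if (target <= cur) {
                    continue; // only walk forward edges
                }
                // edge (cur, target) corresponds to rowset [cur, target - 1]
                if (validator(cur, target - 1)) {
                    next = target;
                    break;
                }
                if (target == cur + 1) {
                    break; // even the single-version rowset [cur, cur] failed
                }
            }
            if (next < 0) {
                return std::nullopt; // no validated edge out of `cur`
            }
            path.emplace_back(cur, next - 1);
            cur = next;
        }
        return path;
    }

With validator bound to CloudTablet::rowset_is_warmed_up, a std::nullopt here
plays the role of the error status that makes the callers above fall back to
the ordinary capture path.
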
@@ -723,11 +736,7 @@ Status VersionGraph::capture_newest_consistent_versions_with_validator( if (_version_graph[cur_idx].value + 1 == _version_graph[it].value) { break; } - if (!end_value.has_value() || _version_graph[it].value < end_value.value()) { - // when encounter a compaction's output rowset which is not valid, try to find a version path - // with smaller max version - end_value = _version_graph[it].value; - } + end_value = std::min(_version_graph[it].value, end_value); continue; } diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h index a6db8618d4048b..0b25c0f3d339c6 100644 --- a/be/src/olap/version_graph.h +++ b/be/src/olap/version_graph.h @@ -64,11 +64,11 @@ class VersionGraph { // If this version not in main version, version_path can be included expired rowset. // NOTE: this method may return edges which is in stale path Status capture_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const; Status capture_newest_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const; // See comment of TimestampedVersionTracker's get_orphan_vertex_ratio(); @@ -192,11 +192,11 @@ class TimestampedVersionTracker { // If this version not in main version, version_path can be included expired rowset. // NOTE: this method may return edges which is in stale path Status capture_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const; Status capture_newest_consistent_versions_with_validator( - int64_t start, std::vector& version_path, + const Version& spec_version, std::vector& version_path, const std::function& validator) const; /// Capture all expired path version. diff --git a/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp new file mode 100644 index 00000000000000..8f6a2bd24e67e3 --- /dev/null +++ b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
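Both validator-based routines above share one traversal pattern: start at the vertex spec_version.first, and at each vertex scan the outgoing edges (kept in descending target order, so the longest jump, i.e. the biggest compaction output, comes first) until one passes the validator; stop once the vertex spec_version.second + 1 is reached or no edge qualifies. The following is a minimal self-contained sketch of that walk, not the Doris API: Graph, Edge, and capture_with_validator are illustrative names, and the validator stands in for CloudTablet::rowset_is_warmed_up.

#include <cstdint>
#include <functional>
#include <map>
#include <set>
#include <utility>
#include <vector>

// Vertices are version boundaries; an edge cur -> to represents rowset
// [cur, to - 1]. Edges are iterated from the longest jump to the shortest,
// mirroring the descending edge order inside VersionGraph.
using Graph = std::map<int64_t, std::set<int64_t, std::greater<>>>;
using Edge = std::pair<int64_t, int64_t>; // [start_version, end_version]

std::vector<Edge> capture_with_validator(
        const Graph& graph, int64_t first, int64_t last,
        const std::function<bool(int64_t, int64_t)>& validator) {
    std::vector<Edge> path;
    int64_t cur = first;
    const int64_t end = last + 1; // vertex that closes version `last`
    while (cur < end) {
        auto vertex = graph.find(cur);
        if (vertex == graph.end()) break; // dead end: return the partial path
        int64_t next = -1;
        for (int64_t to : vertex->second) {
            if (to <= cur) break; // remaining edges point backwards, give up
            if (validator(cur, to - 1)) {
                next = to; // longest warmed-up edge wins
                break;
            }
        }
        if (next < 0) break; // every outgoing rowset is cold: stop early
        path.emplace_back(cur, next - 1);
        cur = next;
    }
    return path;
}

The caller then compares path.back().second with the requested version; in capture_rs_readers_with_freshness_tolerance, a shortfall combined with a rowset that is older than the freshness limit triggers the fallback counted by g_capture_with_freshness_tolerance_fallback_count.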
+ +#include +#include +#include +#include + +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet.h" +#include "olap/base_tablet.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_meta.h" +#include "olap/tablet_meta.h" +#include "util/uid_util.h" + +namespace doris { + +using namespace std::chrono; + +class TestQueryPreferCache : public testing::Test { +public: + TestQueryPreferCache() : _engine(CloudStorageEngine(EngineOptions {})) {} + + void SetUp() override { + _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, + UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, + TCompressionType::LZ4F)); + } + void TearDown() override {} + + RowsetSharedPtr create_rowset_without_visible_time(Version version) { + auto rs_meta = std::make_shared(); + rs_meta->set_rowset_type(BETA_ROWSET); + rs_meta->set_version(version); + rs_meta->set_rowset_id(_engine.next_rowset_id()); + RowsetSharedPtr rowset; + Status st = RowsetFactory::create_rowset(nullptr, "", rs_meta, &rowset); + if (!st.ok()) { + return nullptr; + } + return rowset; + } + + RowsetSharedPtr create_rowset(Version version, + time_point visible_timestamp = system_clock::now() - + seconds(100)) { + auto rs = create_rowset_without_visible_time(version); + if (!rs) { + return nullptr; + } + rs->rowset_meta()->set_visible_ts_ms( + duration_cast(visible_timestamp.time_since_epoch()).count()); + return rs; + } + + CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false) { + CloudTabletSPtr tablet = + std::make_shared(_engine, std::make_shared(*_tablet_meta)); + tablet->tablet_meta()->set_enable_unique_key_merge_on_write(is_mow); + std::vector rowsets; + auto rs1 = create_rowset(Version {0, 1}); + rowsets.emplace_back(rs1); + tablet->add_warmed_up_rowset(rs1->rowset_id()); + for (int ver = 2; ver <= max_version; ver++) { + auto rs = create_rowset(Version {ver, ver}); + tablet->add_warmed_up_rowset(rs->rowset_id()); + rowsets.emplace_back(rs); + } + { + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets(rowsets, false, wlock, false); + } + return tablet; + } + + void add_new_version_rowset(CloudTabletSPtr tablet, int64_t version, bool warmed_up, + time_point visible_timestamp) { + auto rowset = create_rowset(Version {version, version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(rowset->rowset_id()); + } + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets({rowset}, false, wlock, false); + } + + void do_cumu_compaction(CloudTabletSPtr tablet, int64_t start_version, int64_t end_version, + bool warmed_up, time_point visible_timestamp) { + std::unique_lock wrlock {tablet->get_header_lock()}; + std::vector input_rowsets; + auto output_rowset = create_rowset(Version {start_version, end_version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(output_rowset->rowset_id()); + } + std::ranges::copy_if(std::views::values(tablet->rowset_map()), + std::back_inserter(input_rowsets), [=](const RowsetSharedPtr& rowset) { + return rowset->version().first >= start_version && + rowset->version().first <= end_version; + }); + if (input_rowsets.size() == 1) { + tablet->add_rowsets({output_rowset}, true, wrlock); + } else { + tablet->delete_rowsets(input_rowsets, wrlock); + tablet->add_rowsets({output_rowset}, false, wrlock); + } + } + + void check_capture_result(CloudTabletSPtr tablet, Version spec_version, + 
const std::vector& expected_versions) { + std::vector rs_splits; + CaptureRsReaderOptions opts {.skip_missing_version = false, + .enable_prefer_cached_rowset = true, + .query_freshness_tolerance_ms = -1}; + auto st = tablet->capture_rs_readers(spec_version, &rs_splits, opts); + ASSERT_TRUE(st.ok()); + auto dump_versions = [](const std::vector& expected_versions, + const std::vector& splits) { + std::vector expected_str; + for (const auto& version : expected_versions) { + expected_str.push_back(version.to_string()); + } + std::vector versions; + for (const auto& split : splits) { + versions.push_back(split.rs_reader->rowset()->version().to_string()); + } + return fmt::format("expected_versions: {}, actual_versions: {}", + fmt::join(expected_str, ", "), fmt::join(versions, ", ")); + }; + ASSERT_EQ(rs_splits.size(), expected_versions.size()) + << dump_versions(expected_versions, rs_splits); + for (size_t i = 0; i < rs_splits.size(); i++) { + ASSERT_EQ(rs_splits[i].rs_reader->rowset()->version(), expected_versions[i]) + << dump_versions(expected_versions, rs_splits); + } + } + +protected: + std::string _json_rowset_meta; + TabletMetaSharedPtr _tablet_meta; + +private: + CloudStorageEngine _engine; +}; + +TEST_F(TestQueryPreferCache, testCapture_1_4) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │in cache││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, 
true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +} // namespace doris \ No newline at end of file From 9d2ef2f056df94ea57a8f55222baca3b619b26ec Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 21 Aug 2025 16:57:01 +0800 Subject: [PATCH 07/34] add user property for query_freshness_tolerance_ms --- .../org/apache/doris/mysql/privilege/Auth.java | 9 +++++++++ .../mysql/privilege/CommonUserProperties.java | 11 +++++++++++ .../doris/mysql/privilege/UserProperty.java | 18 ++++++++++++++++++ .../doris/mysql/privilege/UserPropertyMgr.java | 9 +++++++++ .../org/apache/doris/qe/SessionVariable.java | 14 +++++++++++++- .../apache/doris/catalog/UserPropertyTest.java | 2 ++ .../doris/planner/ResourceTagQueryTest.java | 2 +- 7 files changed, 63 insertions(+), 2 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java index 29d6c4f66344f9..464435cfea2c9b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java @@ -1269,6 +1269,15 @@ public Pair isWorkloadGroupInUse(String groupName) { } } + public long getQueryFreshnessToleranceMs(String qualifiedUser) { + readLock(); + try { + return propertyMgr.getQueryFreshnessToleranceMs(qualifiedUser); + } finally { + readUnlock(); + } + } + public void getAllDomains(Set allDomains) { readLock(); try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java index 24a33c5b6e3a9d..8c13f410932127 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java @@ -68,6 +68,9 @@ public class CommonUserProperties implements GsonPostProcessable { @SerializedName(value = "wg", alternate = {"workloadGroup"}) private String workloadGroup = WorkloadGroupMgr.DEFAULT_GROUP_NAME; + @SerializedName(value = "qft", alternate = {"queryFreshnessTolerance"}) + private long queryFreshnessToleranceMs = -1; + private String[] sqlBlockRulesSplit = {}; long getMaxConn() { @@ -168,6 +171,14 @@ public void setWorkloadGroup(String workloadGroup) { this.workloadGroup = workloadGroup; } + public long getQueryFreshnessToleranceMs() { + return queryFreshnessToleranceMs; + } + + public void setQueryFreshnessToleranceMs(long queryFreshnessToleranceMs) { + this.queryFreshnessToleranceMs = queryFreshnessToleranceMs; + } + @Override public void gsonPostProcess() throws IOException { if (!Strings.isNullOrEmpty(sqlBlockRules)) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java 
b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java index 6a148d84a40e5e..beee1ba10cf2c3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java @@ -75,6 +75,8 @@ public class UserProperty { public static final String DEFAULT_CLOUD_CLUSTER = "default_cloud_cluster"; public static final String DEFAULT_COMPUTE_GROUP = "default_compute_group"; + public static final String PROP_QUERY_FRESHNESS_TOLERANCE = "query_freshness_tolerance"; + // for system user public static final Set ADVANCED_PROPERTIES = Sets.newHashSet(); // for normal user @@ -114,6 +116,7 @@ public class UserProperty { COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_WORKLOAD_GROUP + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_CLOUD_CLUSTER + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_COMPUTE_GROUP + "$", Pattern.CASE_INSENSITIVE)); + COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_QUERY_FRESHNESS_TOLERANCE + "$", Pattern.CASE_INSENSITIVE)); } public UserProperty() { @@ -171,6 +174,10 @@ public long getExecMemLimit() { return commonProperties.getExecMemLimit(); } + public long getQueryFreshnessToleranceMs() { + return commonProperties.getQueryFreshnessToleranceMs(); + } + public void update(List> properties) throws UserException { update(properties, false); } @@ -188,6 +195,7 @@ public void update(List> properties, boolean isReplay) thro int insertTimeout = this.commonProperties.getInsertTimeout(); String initCatalog = this.commonProperties.getInitCatalog(); String workloadGroup = this.commonProperties.getWorkloadGroup(); + long queryFreshnessToleranceMs = this.commonProperties.getQueryFreshnessToleranceMs(); String newDefaultCloudCluster = defaultCloudCluster; @@ -320,6 +328,12 @@ public void update(List> properties, boolean isReplay) thro throw new DdlException("workload group " + value + " not exists"); } workloadGroup = value; + } else if (keyArr[0].equalsIgnoreCase(PROP_QUERY_FRESHNESS_TOLERANCE)) { + // set property "query_freshness_tolerance" = "1000"; + if (keyArr.length != 1) { + throw new DdlException(PROP_QUERY_FRESHNESS_TOLERANCE + " format error"); + } + queryFreshnessToleranceMs = getLongProperty(key, value, keyArr, PROP_QUERY_FRESHNESS_TOLERANCE); } else { if (isReplay) { // After using SET PROPERTY to modify the user property, if FE rolls back to a version without @@ -344,6 +358,7 @@ public void update(List> properties, boolean isReplay) thro this.commonProperties.setInsertTimeout(insertTimeout); this.commonProperties.setInitCatalog(initCatalog); this.commonProperties.setWorkloadGroup(workloadGroup); + this.commonProperties.setQueryFreshnessToleranceMs(queryFreshnessToleranceMs); defaultCloudCluster = newDefaultCloudCluster; } @@ -441,6 +456,9 @@ public List> fetchProperty() { result.add(Lists.newArrayList(PROP_WORKLOAD_GROUP, String.valueOf(commonProperties.getWorkloadGroup()))); + result.add(Lists.newArrayList(PROP_QUERY_FRESHNESS_TOLERANCE, + String.valueOf(commonProperties.getQueryFreshnessToleranceMs()))); + // default cloud cluster if (defaultCloudCluster != null) { result.add(Lists.newArrayList(DEFAULT_CLOUD_CLUSTER, defaultCloudCluster)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java index 7477c5a308211e..f83001f7491bf9 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java @@ -245,6 +245,15 @@ public Pair isWorkloadGroupInUse(String groupName) { return Pair.of(false, ""); } + public long getQueryFreshnessToleranceMs(String qualifiedUser) { + UserProperty existProperty = propertyMap.get(qualifiedUser); + existProperty = getPropertyIfNull(qualifiedUser, existProperty); + if (existProperty == null) { + return -1; + } + return existProperty.getQueryFreshnessToleranceMs(); + } + /** * The method determines which user property to return based on the existProperty parameter * and system configuration: diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 23cc71890e9e9c..f695c811e1b7f1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -3627,6 +3627,18 @@ public int getParallelExecInstanceNum() { } } + public long getQueryFreshnessToleranceMs() { + ConnectContext connectContext = ConnectContext.get(); + if (connectContext != null && connectContext.getEnv() != null && connectContext.getEnv().getAuth() != null) { + long userQueryFreshnessToleranceMs = connectContext.getEnv().getAuth() + .getQueryFreshnessToleranceMs(connectContext.getQualifiedUser()); + if (userQueryFreshnessToleranceMs > 0) { + return userQueryFreshnessToleranceMs; + } + } + return queryFreshnessToleranceMs; + } + public int getExchangeInstanceParallel() { return exchangeInstanceParallel; } @@ -4561,7 +4573,7 @@ public TQueryOptions toThrift() { tResult.setSkipBadTablet(skipBadTablet); tResult.setDisableFileCache(disableFileCache); - tResult.setQueryFreshnessToleranceMs(queryFreshnessToleranceMs); + tResult.setQueryFreshnessToleranceMs(getQueryFreshnessToleranceMs()); // for spill tResult.setEnableSpill(enableSpill); diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java index c8a7c9e037dd56..0ef8226a82b6fd 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java @@ -82,6 +82,7 @@ public void testUpdate() throws UserException { properties.add(Pair.of("sql_block_rules", "rule1,rule2")); properties.add(Pair.of("cpu_resource_limit", "2")); properties.add(Pair.of("query_timeout", "500")); + properties.add(Pair.of("query_freshness_tolerance_ms", "4500")); UserProperty userProperty = new UserProperty(); userProperty.update(properties); @@ -92,6 +93,7 @@ public void testUpdate() throws UserException { Assert.assertEquals(2, userProperty.getCpuResourceLimit()); Assert.assertEquals(500, userProperty.getQueryTimeout()); Assert.assertEquals(Sets.newHashSet(), userProperty.getCopiedResourceTags()); + Assert.assertEquals(4500, userProperty.getQueryFreshnessToleranceMs()); // fetch property List> rows = userProperty.fetchProperty(); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java index 767de59cae7f85..8add875a69eabc 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java @@ -307,7 +307,7 @@ public void test() throws 
Exception { Assert.assertEquals(1000000, execMemLimit); List> userProps = Env.getCurrentEnv().getAuth().getUserProperties(Auth.ROOT_USER); - Assert.assertEquals(13, userProps.size()); + Assert.assertEquals(14, userProps.size()); // now : // be1 be2 be3 ==>tag1; From 5a56cab2d9769725b2fe61b49108d93942187e09 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 21 Aug 2025 17:22:45 +0800 Subject: [PATCH 08/34] add enable_prefer_cached_rowset session var and user property --- .../apache/doris/mysql/privilege/Auth.java | 9 ++++++++ .../mysql/privilege/CommonUserProperties.java | 11 ++++++++++ .../doris/mysql/privilege/UserProperty.java | 19 +++++++++++++++++ .../mysql/privilege/UserPropertyMgr.java | 9 ++++++++ .../org/apache/doris/qe/SessionVariable.java | 21 ++++++++++++++++++- .../doris/catalog/UserPropertyTest.java | 2 ++ .../doris/planner/ResourceTagQueryTest.java | 2 +- 7 files changed, 71 insertions(+), 2 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java index 464435cfea2c9b..495753f9b7f039 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java @@ -1269,6 +1269,15 @@ public Pair isWorkloadGroupInUse(String groupName) { } } + public boolean getEnablePreferCachedRowset(String qualifiedUser) { + readLock(); + try { + return propertyMgr.getEnablePreferCachedRowset(qualifiedUser); + } finally { + readUnlock(); + } + } + public long getQueryFreshnessToleranceMs(String qualifiedUser) { readLock(); try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java index 8c13f410932127..277a206aa87c5a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java @@ -68,6 +68,9 @@ public class CommonUserProperties implements GsonPostProcessable { @SerializedName(value = "wg", alternate = {"workloadGroup"}) private String workloadGroup = WorkloadGroupMgr.DEFAULT_GROUP_NAME; + @SerializedName(value = "epcr", alternate = {"enablePreferCachedRowset"}) + private boolean enablePreferCachedRowset = false; + @SerializedName(value = "qft", alternate = {"queryFreshnessTolerance"}) private long queryFreshnessToleranceMs = -1; @@ -179,6 +182,14 @@ public void setQueryFreshnessToleranceMs(long queryFreshnessToleranceMs) { this.queryFreshnessToleranceMs = queryFreshnessToleranceMs; } + public boolean getEnablePreferCachedRowset() { + return enablePreferCachedRowset; + } + + public void setEnablePreferCachedRowset(boolean enablePreferCachedRowset) { + this.enablePreferCachedRowset = enablePreferCachedRowset; + } + @Override public void gsonPostProcess() throws IOException { if (!Strings.isNullOrEmpty(sqlBlockRules)) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java index beee1ba10cf2c3..10b25f47c1ab5b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java @@ -75,6 +75,7 @@ public class UserProperty { public static final String DEFAULT_CLOUD_CLUSTER = "default_cloud_cluster"; public static final String DEFAULT_COMPUTE_GROUP = 
"default_compute_group"; + public static final String PROP_ENABLE_PREFER_CACHED_ROWSET = "enable_prefer_cached_rowset"; public static final String PROP_QUERY_FRESHNESS_TOLERANCE = "query_freshness_tolerance"; // for system user @@ -117,6 +118,7 @@ public class UserProperty { COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_CLOUD_CLUSTER + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_COMPUTE_GROUP + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_QUERY_FRESHNESS_TOLERANCE + "$", Pattern.CASE_INSENSITIVE)); + COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_ENABLE_PREFER_CACHED_ROWSET + "$", Pattern.CASE_INSENSITIVE)); } public UserProperty() { @@ -178,6 +180,10 @@ public long getQueryFreshnessToleranceMs() { return commonProperties.getQueryFreshnessToleranceMs(); } + public boolean getEnablePreferCachedRowset() { + return commonProperties.getEnablePreferCachedRowset(); + } + public void update(List> properties) throws UserException { update(properties, false); } @@ -196,6 +202,7 @@ public void update(List> properties, boolean isReplay) thro String initCatalog = this.commonProperties.getInitCatalog(); String workloadGroup = this.commonProperties.getWorkloadGroup(); long queryFreshnessToleranceMs = this.commonProperties.getQueryFreshnessToleranceMs(); + boolean enablePreferCachedRowset = this.commonProperties.getEnablePreferCachedRowset(); String newDefaultCloudCluster = defaultCloudCluster; @@ -334,6 +341,15 @@ public void update(List> properties, boolean isReplay) thro throw new DdlException(PROP_QUERY_FRESHNESS_TOLERANCE + " format error"); } queryFreshnessToleranceMs = getLongProperty(key, value, keyArr, PROP_QUERY_FRESHNESS_TOLERANCE); + } else if (keyArr[0].equalsIgnoreCase(PROP_ENABLE_PREFER_CACHED_ROWSET)) { + if (keyArr.length != 1) { + throw new DdlException(PROP_ENABLE_PREFER_CACHED_ROWSET + " format error"); + } + try { + enablePreferCachedRowset = Boolean.parseBoolean(value); + } catch (NumberFormatException e) { + throw new DdlException(PROP_ENABLE_PREFER_CACHED_ROWSET + " is not boolean"); + } } else { if (isReplay) { // After using SET PROPERTY to modify the user property, if FE rolls back to a version without @@ -359,6 +375,7 @@ public void update(List> properties, boolean isReplay) thro this.commonProperties.setInitCatalog(initCatalog); this.commonProperties.setWorkloadGroup(workloadGroup); this.commonProperties.setQueryFreshnessToleranceMs(queryFreshnessToleranceMs); + this.commonProperties.setEnablePreferCachedRowset(enablePreferCachedRowset); defaultCloudCluster = newDefaultCloudCluster; } @@ -456,6 +473,8 @@ public List> fetchProperty() { result.add(Lists.newArrayList(PROP_WORKLOAD_GROUP, String.valueOf(commonProperties.getWorkloadGroup()))); + result.add(Lists.newArrayList(PROP_ENABLE_PREFER_CACHED_ROWSET, + String.valueOf(commonProperties.getEnablePreferCachedRowset()))); result.add(Lists.newArrayList(PROP_QUERY_FRESHNESS_TOLERANCE, String.valueOf(commonProperties.getQueryFreshnessToleranceMs()))); diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java index f83001f7491bf9..e068182308ed16 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java @@ -245,6 +245,15 @@ public Pair isWorkloadGroupInUse(String groupName) { return Pair.of(false, ""); } + public 
boolean getEnablePreferCachedRowset(String qualifiedUser) { + UserProperty existProperty = propertyMap.get(qualifiedUser); + existProperty = getPropertyIfNull(qualifiedUser, existProperty); + if (existProperty == null) { + return false; + } + return existProperty.getEnablePreferCachedRowset(); + } + public long getQueryFreshnessToleranceMs(String qualifiedUser) { UserProperty existProperty = propertyMap.get(qualifiedUser); existProperty = getPropertyIfNull(qualifiedUser, existProperty); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index f695c811e1b7f1..d5219ed8de37d1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -856,6 +856,7 @@ public static double getHotValueThreshold() { public static final String MULTI_DISTINCT_STRATEGY = "multi_distinct_strategy"; public static final String AGG_PHASE = "agg_phase"; + public static final String ENABLE_PREFER_CACHED_ROWSET = "enable_prefer_cached_rowset"; public static final String QUERY_FRESHNESS_TOLERANCE_MS = "query_freshness_tolerance_ms"; /** @@ -2506,7 +2507,12 @@ public boolean isEnableHboNonStrictMatchingMode() { }, checker = "checkSkewRewriteAggBucketNum") public int skewRewriteAggBucketNum = 1024; - @VariableMgr.VarAttr(name = QUERY_FRESHNESS_TOLERANCE_MS) + @VariableMgr.VarAttr(name = ENABLE_PREFER_CACHED_ROWSET, needForward = false, + description = {"是否启用 prefer cached rowset 功能", + "Whether to enable prefer cached rowset feature"}) + public boolean enablePreferCachedRowset = false; + + @VariableMgr.VarAttr(name = QUERY_FRESHNESS_TOLERANCE_MS, needForward = false) public long queryFreshnessToleranceMs = 5000; public void setSkewRewriteAggBucketNum(int num) { @@ -3627,6 +3633,18 @@ public int getParallelExecInstanceNum() { } } + public boolean getEnablePreferCachedRowset() { + ConnectContext connectContext = ConnectContext.get(); + if (connectContext != null && connectContext.getEnv() != null && connectContext.getEnv().getAuth() != null) { + boolean userEnablePreferCachedRowset = connectContext.getEnv().getAuth() + .getEnablePreferCachedRowset(connectContext.getQualifiedUser()); + if (userEnablePreferCachedRowset) { + return userEnablePreferCachedRowset; + } + } + return enablePreferCachedRowset; + } + public long getQueryFreshnessToleranceMs() { ConnectContext connectContext = ConnectContext.get(); if (connectContext != null && connectContext.getEnv() != null && connectContext.getEnv().getAuth() != null) { @@ -4573,6 +4591,7 @@ public TQueryOptions toThrift() { tResult.setSkipBadTablet(skipBadTablet); tResult.setDisableFileCache(disableFileCache); + tResult.setEnablePreferCachedRowset(getEnablePreferCachedRowset()); tResult.setQueryFreshnessToleranceMs(getQueryFreshnessToleranceMs()); // for spill diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java index 0ef8226a82b6fd..428fb6fb2f85b0 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java @@ -82,6 +82,7 @@ public void testUpdate() throws UserException { properties.add(Pair.of("sql_block_rules", "rule1,rule2")); properties.add(Pair.of("cpu_resource_limit", "2")); properties.add(Pair.of("query_timeout", "500")); + properties.add(Pair.of("enable_prefer_cached_rowset", "true")); 
properties.add(Pair.of("query_freshness_tolerance_ms", "4500")); UserProperty userProperty = new UserProperty(); @@ -93,6 +94,7 @@ public void testUpdate() throws UserException { Assert.assertEquals(2, userProperty.getCpuResourceLimit()); Assert.assertEquals(500, userProperty.getQueryTimeout()); Assert.assertEquals(Sets.newHashSet(), userProperty.getCopiedResourceTags()); + Assert.assertEquals(true, userProperty.getEnablePreferCachedRowset()); Assert.assertEquals(4500, userProperty.getQueryFreshnessToleranceMs()); // fetch property diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java index 8add875a69eabc..1789d79acd0f77 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java @@ -307,7 +307,7 @@ public void test() throws Exception { Assert.assertEquals(1000000, execMemLimit); List> userProps = Env.getCurrentEnv().getAuth().getUserProperties(Auth.ROOT_USER); - Assert.assertEquals(14, userProps.size()); + Assert.assertEquals(15, userProps.size()); // now : // be1 be2 be3 ==>tag1; From 78aef939dab55e48c1356c87a2fcd4f1b727bae4 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 22 Aug 2025 11:16:00 +0800 Subject: [PATCH 09/34] add capture_consistent_versions_prefer_cache --- be/src/cloud/cloud_tablet.cpp | 10 +- be/src/olap/version_graph.cpp | 75 ++- be/src/olap/version_graph.h | 12 +- .../cloud_tablet_query_prefer_cache_test.cpp | 521 +++++++++++++++++- 4 files changed, 599 insertions(+), 19 deletions(-) diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 55dbdb5bc14476..9eca5092d630a3 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -203,7 +203,7 @@ Status CloudTablet::capture_rs_readers_prefer_cache(const Version& spec_version, g_capture_prefer_cache_count << 1; Versions version_path; std::shared_lock rlock(_meta_lock); - RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator( + RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_prefer_cache( spec_version, version_path, [&](int64_t start, int64_t end) { return rowset_is_warmed_up(start, end); })); int64_t path_max_version = version_path.back().second; @@ -266,11 +266,9 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( if (enable_unique_key_merge_on_write()) { // For merge-on-write table, newly generated delete bitmap marks will be on the rowsets which are in newest layout. // So we can ony capture rowsets which are in newest data layout. Otherwise there may be data correctness issue. 
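For merge-on-write tablets the plain validator walk is replaced by capture_consistent_versions_with_validator_mow, whose distinctive move (see the end_value = std::min(...) hunk earlier) is to shrink the acceptable end version whenever a multi-version compaction output fails the validator, instead of detouring through stale rowsets. The sketch below reuses the Graph and Edge helpers from the earlier example; it is an assumed simplification, and the elided parts of the real loop may enforce additional newest-layout constraints.

#include <algorithm>

// Graph and Edge as defined in the sketch after the version_graph.cpp hunks.
std::vector<Edge> capture_newest_with_validator(
        const Graph& graph, int64_t first, int64_t last,
        const std::function<bool(int64_t, int64_t)>& validator) {
    std::vector<Edge> path;
    int64_t cur = first;
    int64_t end = last + 1; // shrinks when a cold compaction output is seen
    while (cur < end) {
        auto vertex = graph.find(cur);
        if (vertex == graph.end()) break;
        int64_t next = -1;
        for (int64_t to : vertex->second) {
            if (to <= cur) break;   // forward (incremental) edges only
            if (to > end) continue; // already ruled out by a cold output
            if (validator(cur, to - 1)) {
                next = to;
                break;
            }
            if (cur + 1 == to) break;  // even the single step is cold: stop
            end = std::min(end, to);   // cap the max version, try shorter edges
        }
        if (next < 0) break; // caller falls back to the usual capture
        path.emplace_back(cur, next - 1);
        cur = next;
    }
    return path;
}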
- RETURN_IF_ERROR( - _timestamped_version_tracker.capture_newest_consistent_versions_with_validator( - spec_version, version_path, [&](int64_t start, int64_t end) { - return rowset_is_warmed_up(start, end); - })); + RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator_mow( + spec_version, version_path, + [&](int64_t start, int64_t end) { return rowset_is_warmed_up(start, end); })); } else { RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator( spec_version, version_path, diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp index b77eede9b0a78d..711746528cd53e 100644 --- a/be/src/olap/version_graph.cpp +++ b/be/src/olap/version_graph.cpp @@ -344,11 +344,18 @@ Status TimestampedVersionTracker::capture_consistent_versions_with_validator( validator); } -Status TimestampedVersionTracker::capture_newest_consistent_versions_with_validator( +Status TimestampedVersionTracker::capture_consistent_versions_prefer_cache( const Version& spec_version, std::vector& version_path, const std::function& validator) const { - return _version_graph.capture_newest_consistent_versions_with_validator( - spec_version, version_path, validator); + return _version_graph.capture_consistent_versions_prefer_cache(spec_version, version_path, + validator); +} + +Status TimestampedVersionTracker::capture_consistent_versions_with_validator_mow( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_with_validator_mow(spec_version, version_path, + validator); } void TimestampedVersionTracker::capture_expired_paths( @@ -650,6 +657,66 @@ Status VersionGraph::capture_consistent_versions(const Version& spec_version, return Status::OK(); } +Status VersionGraph::capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == spec_version.first) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); + } + + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { + int64_t next_idx = -1; + int64_t first_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. 
+ if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + if (first_idx == -1) { + first_idx = it; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else if (first_idx != -1) { + // if all edges are not in cache, use the first edge if possible + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[first_idx].value - 1); + cur_idx = first_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + Status VersionGraph::capture_consistent_versions_with_validator( const Version& spec_version, std::vector& version_path, const std::function& validator) const { @@ -701,7 +768,7 @@ Status VersionGraph::capture_consistent_versions_with_validator( return Status::OK(); } -Status VersionGraph::capture_newest_consistent_versions_with_validator( +Status VersionGraph::capture_consistent_versions_with_validator_mow( const Version& spec_version, std::vector& version_path, const std::function& validator) const { if (spec_version.first > spec_version.second) { diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h index 0b25c0f3d339c6..a845eaffaa4706 100644 --- a/be/src/olap/version_graph.h +++ b/be/src/olap/version_graph.h @@ -56,6 +56,10 @@ class VersionGraph { Status capture_consistent_versions(const Version& spec_version, std::vector* version_path) const; + Status capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + // Given a start, this method can find a version path which satisfy the following conditions: // 1. all edges satisfy the conditions specified by `validator` in the graph. // 2. the destination version is as far as possible. @@ -67,7 +71,7 @@ class VersionGraph { const Version& spec_version, std::vector& version_path, const std::function& validator) const; - Status capture_newest_consistent_versions_with_validator( + Status capture_consistent_versions_with_validator_mow( const Version& spec_version, std::vector& version_path, const std::function& validator) const; @@ -184,6 +188,10 @@ class TimestampedVersionTracker { Status capture_consistent_versions(const Version& spec_version, std::vector* version_path) const; + Status capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + // Given a start, this method can find a version path which satisfy the following conditions: // 1. all edges satisfy the conditions specified by `validator` in the graph. // 2. the destination version is as far as possible. 
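capture_consistent_versions_prefer_cache above differs from the strict validator walk in exactly one branch: when none of the forward edges out of a vertex passes the validator, it falls back to first_idx, the first (and therefore longest) forward edge, so the path always reaches the requested version and cache misses are taken only where unavoidable. A sketch with the same conventions as the earlier examples:

// Graph and Edge as in the earlier sketches; illustrative, not the Doris API.
std::vector<Edge> capture_prefer_cache(
        const Graph& graph, int64_t first, int64_t last,
        const std::function<bool(int64_t, int64_t)>& validator) {
    std::vector<Edge> path;
    int64_t cur = first;
    const int64_t end = last + 1;
    while (cur < end) {
        auto vertex = graph.find(cur);
        if (vertex == graph.end()) break;
        int64_t next = -1;
        int64_t fallback = -1;
        for (int64_t to : vertex->second) {
            if (to <= cur) break;            // forward edges only
            if (fallback < 0) fallback = to; // remember the longest edge
            if (validator(cur, to - 1)) {
                next = to;
                break;
            }
        }
        if (next < 0) next = fallback; // all cold: take the longest edge anyway
        if (next < 0) break;           // no forward edge at all
        path.emplace_back(cur, next - 1);
        cur = next;
    }
    return path;
}

This is the behavior exercised by testCapture_4_1 below: with nothing warmed up at vertex 11, the walk picks the longest edge [11-16] rather than stopping short.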
@@ -195,7 +203,7 @@ class TimestampedVersionTracker { const Version& spec_version, std::vector& version_path, const std::function& validator) const; - Status capture_newest_consistent_versions_with_validator( + Status capture_consistent_versions_with_validator_mow( const Version& spec_version, std::vector& version_path, const std::function& validator) const; diff --git a/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp index 8f6a2bd24e67e3..b30c3cc4a75281 100644 --- a/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp +++ b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp @@ -73,7 +73,8 @@ class TestQueryPreferCache : public testing::Test { return rs; } - CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false) { + CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false, + bool warmup = true) { CloudTabletSPtr tablet = std::make_shared(_engine, std::make_shared(*_tablet_meta)); tablet->tablet_meta()->set_enable_unique_key_merge_on_write(is_mow); @@ -83,7 +84,9 @@ class TestQueryPreferCache : public testing::Test { tablet->add_warmed_up_rowset(rs1->rowset_id()); for (int ver = 2; ver <= max_version; ver++) { auto rs = create_rowset(Version {ver, ver}); - tablet->add_warmed_up_rowset(rs->rowset_id()); + if (warmup) { + tablet->add_warmed_up_rowset(rs->rowset_id()); + } rowsets.emplace_back(rs); } { @@ -161,14 +164,14 @@ class TestQueryPreferCache : public testing::Test { CloudStorageEngine _engine; }; -TEST_F(TestQueryPreferCache, testCapture_1_4) { +TEST_F(TestQueryPreferCache, testCapture_1_1) { /* be startup time now-10s now now - 30s │ │ 10s │ │ ◄───────────────────────────┤ ┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ -│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │incache│ │ │ ││ │ │ │ ││ │ │ │ │ │ │ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ └────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ @@ -177,7 +180,7 @@ TEST_F(TestQueryPreferCache, testCapture_1_4) { │ │ │ │ │ │ - return: [2-10],[11-15],[16-16],[17-17] + return: [2-10],[11-15],[16-16],[17-17],[18-18] note: We only care about rowsets that are created after startup time point. For other historical rowsets, we just assume that they are warmuped up. 
*/ @@ -198,7 +201,7 @@ TEST_F(TestQueryPreferCache, testCapture_1_4) { check_capture_result(tablet, Version {0, 18}, expected_versions); } -TEST_F(TestQueryPreferCache, testCapture_2_3) { +TEST_F(TestQueryPreferCache, testCapture_1_2) { /* now-10s now │ 10s │ @@ -243,7 +246,7 @@ TEST_F(TestQueryPreferCache, testCapture_2_3) { check_capture_result(tablet, Version {0, 18}, expected_versions); } -TEST_F(TestQueryPreferCache, testCapture_2_4) { +TEST_F(TestQueryPreferCache, testCapture_1_3) { /* now-10s now │ 10s │ @@ -293,4 +296,508 @@ TEST_F(TestQueryPreferCache, testCapture_2_4) { check_capture_result(tablet, Version {0, 18}, expected_versions); } +TEST_F(TestQueryPreferCache, testCapture_1_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_1) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │ ││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, 
system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_1) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │ │ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │ ││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - 
seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_4_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [11-16]│ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ │ │ │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-16],[17-17],[18-18] + note: 
when there is no warmed-up rowset at some vertex, choose the latest edge
+*/
+ _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+ auto tablet = create_tablet_with_initial_rowsets(15, false, false);
+ do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+ do_cumu_compaction(tablet, 11, 15, false, system_clock::now() - seconds(20));
+ add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15));
+ add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+ add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+ do_cumu_compaction(tablet, 11, 16, false, system_clock::now() - seconds(13));
+ do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1));
+
+ std::string compaction_status;
+ tablet->get_compaction_status(&compaction_status);
+ std::cout << compaction_status << std::endl;
+
+ std::vector expected_versions = {{0, 1}, {2, 10}, {11, 16}, {17, 17}, {18, 18}};
+ check_capture_result(tablet, Version {0, 18}, expected_versions);
+}
+
+} // namespace doris
\ No newline at end of file

From 97174cf8ce56787de5b9a5429bfa3f0c85257f2d Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Fri, 22 Aug 2025 15:04:59 +0800
Subject: [PATCH 10/34] fix typo

---
 .../java/org/apache/doris/mysql/privilege/UserProperty.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java
index 10b25f47c1ab5b..995b0e81af6ba9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java
@@ -76,7 +76,7 @@ public class UserProperty {
 public static final String DEFAULT_COMPUTE_GROUP = "default_compute_group";

 public static final String PROP_ENABLE_PREFER_CACHED_ROWSET = "enable_prefer_cached_rowset";
- public static final String PROP_QUERY_FRESHNESS_TOLERANCE = "query_freshness_tolerance";
+ public static final String PROP_QUERY_FRESHNESS_TOLERANCE = "query_freshness_tolerance_ms";

 // for system user
 public static final Set ADVANCED_PROPERTIES = Sets.newHashSet();

From d55db6d41c495c681338da08d869ffcb38d5dbce Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Fri, 22 Aug 2025 15:24:04 +0800
Subject: [PATCH 11/34] add session var property case

---
 .../org/apache/doris/qe/SessionVariable.java | 2 +-
 .../test_read_cluster_var_property.groovy | 207 ++++++++++++++++++
 2 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy

diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index d5219ed8de37d1..ab03fb3aabe7ff 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -2513,7 +2513,7 @@ public boolean isEnableHboNonStrictMatchingMode() {
 public boolean enablePreferCachedRowset = false;

 @VariableMgr.VarAttr(name = QUERY_FRESHNESS_TOLERANCE_MS, needForward = false)
- public long queryFreshnessToleranceMs = 5000;
+ public long queryFreshnessToleranceMs = -1;

 public void setSkewRewriteAggBucketNum(int num) {
 this.skewRewriteAggBucketNum = num;
diff --git a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy
new file mode 100644 index 00000000000000..47baf979b58648 --- /dev/null +++ b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy @@ -0,0 +1,207 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite('test_read_cluster_var_property') { + if (!isCloudMode()) { + return + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + def ret = matcher[0][1] as long + logger.info("getBrpcMetrics, ${url}, name:${name}, value:${ret}") + return ret + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + // test non-mow table + try { + def tableName = "test_read_cluster_var_property" + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ CREATE TABLE ${tableName} + (k int, v1 int, v2 int ) + DUPLICATE KEY(k) + DISTRIBUTED BY HASH (k) + BUCKETS 1 PROPERTIES( + "replication_num" = "1", + "disable_auto_compaction" = "true"); + """ + + (1..20).each{ id -> + sql """insert into ${tableName} select number, number, number from numbers("number"="10");""" + } + + sql "select * from ${tableName};" + + def backends = sql_return_maparray('show backends') + def tabletStats = sql_return_maparray("show tablets from ${tableName};") + assert tabletStats.size() == 1 + def tabletId = tabletStats[0].TabletId + def tabletBackendId = tabletStats[0].BackendId + def tabletBackend + for (def be : backends) { + if (be.BackendId == tabletBackendId) { + tabletBackend = be + break; + } + } + logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}"); + + try { + // 1. 
test enable_prefer_cached_rowset
+ sql "set enable_prefer_cached_rowset=true;"
+ def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + 1
+
+ sql "set enable_prefer_cached_rowset=false;"
+ preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+
+ // user property has higher priority than session variable
+ sql "set property for 'root' enable_prefer_cached_rowset=true;"
+ preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + 1
+ } finally {
+ sql "set enable_prefer_cached_rowset=false;"
+ sql "set property for 'root' enable_prefer_cached_rowset=false;"
+ }
+
+ try {
+ // 2. test query_freshness_tolerance_ms
+ sql "set query_freshness_tolerance_ms=1000;"
+ def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1
+
+ sql "set query_freshness_tolerance_ms=-1;"
+ queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance
+
+ // user property has higher priority than session variable
+ sql "set property for 'root' query_freshness_tolerance_ms=2000;"
+ queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1
+ } finally {
+ sql "set query_freshness_tolerance_ms=-1;"
+ sql "set property for 'root' query_freshness_tolerance_ms=-1;"
+ }
+ } catch (Exception e) {
+ logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}")
+ } finally {
+ sql "set enable_prefer_cached_rowset=false;"
+ sql "set query_freshness_tolerance_ms=-1;"
+ sql "set property for 'root' enable_prefer_cached_rowset=false;"
+ sql "set property for 'root' query_freshness_tolerance_ms=-1;"
+ }
+
+ // test mow table
+ try {
+ def tableName = "test_read_cluster_var_property_mow"
+ sql """ DROP TABLE IF EXISTS ${tableName} """
+ sql """ CREATE TABLE ${tableName}
+ (k int, v1 int, v2 int )
+ UNIQUE KEY(k) DISTRIBUTED BY HASH (k)
+ BUCKETS 1 PROPERTIES(
+ "replication_num" = "1",
+ "enable_unique_key_merge_on_write" = "true",
+ "disable_auto_compaction" = "true");
+ """
+
+ (1..20).each{ id ->
+ sql """insert into ${tableName} select number, number, number from numbers("number"="10");"""
+ }
+
+ sql "select * from ${tableName};"
+
+ def backends = sql_return_maparray('show
backends')
+ def tabletStats = sql_return_maparray("show tablets from ${tableName};")
+ assert tabletStats.size() == 1
+ def tabletId = tabletStats[0].TabletId
+ def tabletBackendId = tabletStats[0].BackendId
+ def tabletBackend
+ for (def be : backends) {
+ if (be.BackendId == tabletBackendId) {
+ tabletBackend = be
+ break;
+ }
+ }
+ logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}");
+
+ try {
+ // 1. test enable_prefer_cached_rowset
+ // enable_prefer_cached_rowset should not take effect on mow table
+ sql "set enable_prefer_cached_rowset=true;"
+ def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+
+ sql "set enable_prefer_cached_rowset=false;"
+ preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+
+ // user property has higher priority than session variable
+ sql "set property for 'root' enable_prefer_cached_rowset=true;"
+ preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+ } finally {
+ sql "set enable_prefer_cached_rowset=false;"
+ sql "set property for 'root' enable_prefer_cached_rowset=false;"
+ }
+
+ try {
+ // 2.
test query_freshness_tolerance_ms
+ sql "set query_freshness_tolerance_ms=1000;"
+ def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1
+
+ sql "set query_freshness_tolerance_ms=-1;"
+ queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance
+
+ // user property has higher priority than session variable
+ sql "set property for 'root' query_freshness_tolerance_ms=2000;"
+ queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ sql "select * from ${tableName};"
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1
+ } finally {
+ sql "set query_freshness_tolerance_ms=-1;"
+ sql "set property for 'root' query_freshness_tolerance_ms=-1;"
+ }
+ } catch (Exception e) {
+ logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}")
+ throw e
+ } finally {
+ sql "set enable_prefer_cached_rowset=false;"
+ sql "set query_freshness_tolerance_ms=-1;"
+ sql "set property for 'root' enable_prefer_cached_rowset=false;"
+ sql "set property for 'root' query_freshness_tolerance_ms=-1;"
+ }
+}
\ No newline at end of file

From 561b48ee1826c4a9423c58a1ca6b66f7a4ad36bc Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Fri, 22 Aug 2025 16:58:23 +0800
Subject: [PATCH 12/34] tmp add cases

---
 .../test_enable_prefer_cached_rowset.out | 15 ++
 .../test_query_freshness_tolerance.out | 16 ++
 .../test_enable_prefer_cached_rowset.groovy | 167 +++++++++++++++++
 .../test_query_freshness_tolerance.groovy | 172 ++++++++++++++++++
 4 files changed, 370 insertions(+)
 create mode 100644 regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out
 create mode 100644 regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out
 create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy
 create mode 100644 regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy

diff --git a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out
new file mode 100644
index 00000000000000..f3304b77661ae9
--- /dev/null
+++ b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out
@@ -0,0 +1,15 @@
+-- This file is automatically generated.
You should know what you did if you want to edit this +-- !sql -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !sql -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + diff --git a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out new file mode 100644 index 00000000000000..99e1f4ad641944 --- /dev/null +++ b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out @@ -0,0 +1,16 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !sql -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy new file mode 100644 index 00000000000000..cdac17d7d181df --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy @@ -0,0 +1,167 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
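+
+// A minimal sketch, illustrative only and not exercised by this suite: a way to
+// poll a brpc counter until it grows instead of sleeping for a fixed interval.
+// It assumes the getBrpcMetrics helper defined later in this file; the timeout
+// and poll interval below are arbitrary choices, not values from this patch.
+//
+// def waitBrpcMetricGrow = { ip, port, name, base, timeoutMs ->
+//     def deadline = System.currentTimeMillis() + timeoutMs
+//     while (System.currentTimeMillis() < deadline) {
+//         if (getBrpcMetrics(ip, port, name) > base) { return true }
+//         sleep(200)
+//     }
+//     return false
+// }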
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_enable_prefer_cached_rowset', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def injectS3FileReadSlow = {cluster, sleep_s -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + def injectName = 'S3FileReader::read_at_impl.io_slow' + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + GetDebugPoint().enableDebugPoint(ip, port as int, NodeType.BE, injectName, [sleep:sleep_s]) + } + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + sql """ + create table test ( + col0 int not null, + col1 variant NOT NULL + ) DUPLICATE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(15000) + + sql """insert into test values (1, '{"a" : 1.0}')""" 
+ sql """insert into test values (2, '{"a" : 111.1111}')""" + sql """insert into test values (3, '{"a" : "11111"}')""" + sql """insert into test values (4, '{"a" : 1111111111}')""" + sql """insert into test values (5, '{"a" : 1111.11111}')""" + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_sql """select * from test""" + + // switch to source cluster and trigger compaction + sql """use @${clusterName1}""" + trigger_and_wait_compaction("test", "cumulative") + // sql """insert into test values (6, '{"a" : 1111.11111}')""" + sleep(2000) + + // inject to let cluster2 read compaction rowset data slowly + injectS3FileReadSlow(clusterName2, 10) + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + def t1 = System.currentTimeMillis() + qt_sql """select * from test""" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + // wait until the injection complete + // sleep(1000) + + // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_download_submitted_num")) + // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_sync_rowset_num")) + // assertEquals(0, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_job_num")) + // sleep(5000) + // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_complete_num")) + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy new file mode 100644 index 00000000000000..b6dceb614966d5 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_query_freshness_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def injectS3FileReadSlow = {cluster, sleep_s -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + def injectName = 'S3FileReader::read_at_impl.io_slow' + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + GetDebugPoint().enableDebugPoint(ip, port as int, NodeType.BE, injectName, [sleep:sleep_s]) + } + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + sql """ + create table test ( + col0 int not null, + col1 variant NOT NULL + ) DUPLICATE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(15000) + + sql """insert into test values (1, '{"a" : 1.0}')""" + 
sql """insert into test values (2, '{"a" : 111.1111}')""" + sql """insert into test values (3, '{"a" : "11111"}')""" + sql """insert into test values (4, '{"a" : 1111111111}')""" + sql """insert into test values (5, '{"a" : 1111.11111}')""" + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_sql """select * from test""" + + // switch to source cluster and trigger compaction + sql """use @${clusterName1}""" + trigger_and_wait_compaction("test", "cumulative") + // load new data to increase the version + sql """insert into test values (6, '{"a" : 1111.11111}')""" + + sleep(5000) + + // inject to let cluster2 read compaction rowset data slowly + injectS3FileReadSlow(clusterName2, 10) + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + // sql "set query_freshness_tolerance_ms = 5000" + sql "set enable_profile=true;" + sql "set profile_level=2;" + def t1 = System.currentTimeMillis() + qt_sql """select * from test""" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + // wait until the injection complete + // sleep(1000) + + // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_download_submitted_num")) + // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_sync_rowset_num")) + // assertEquals(0, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_job_num")) + // sleep(5000) + // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_complete_num")) + } +} From aaa52ce75c130f709d17eb4e849d86f5a5d7c6ae Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 22 Aug 2025 17:25:14 +0800 Subject: [PATCH 13/34] fix test_query_freshness_tolerance --- .../test_query_freshness_tolerance.out | 18 +++++++- .../test_query_freshness_tolerance.groovy | 42 ++++++++++++------- .../test_read_cluster_var_property.groovy | 24 +++++------ 3 files changed, 56 insertions(+), 28 deletions(-) diff --git a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out index 99e1f4ad641944..4441657b9aa704 100644 --- a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out +++ b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out @@ -1,12 +1,19 @@ -- This file is automatically generated. 
You should know what you did if you want to edit this --- !sql -- +-- !cluster1 -- 1 {"a":1} 2 {"a":111.1111} 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} --- !sql -- +-- !cluster2 -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster1_new_data -- 1 {"a":1} 2 {"a":111.1111} 3 {"a":"11111"} @@ -14,3 +21,10 @@ 5 {"a":1111.11111} 6 {"a":1111.11111} +-- !cluster2 -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy index b6dceb614966d5..b077e603aa3986 100644 --- a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy @@ -73,7 +73,9 @@ suite('test_query_freshness_tolerance', 'docker') { def metrics = new URL(url).text def matcher = metrics =~ ~"${name}\\s+(\\d+)" if (matcher.find()) { - return matcher[0][1] as long + def ret = matcher[0][1] as long + logger.info("getBrpcMetrics, ${url}, name:${name}, value:${ret}") + return ret } else { throw new RuntimeException("${name} not found for ${ip}:${port}") } @@ -137,36 +139,48 @@ suite('test_query_freshness_tolerance', 'docker') { sql """insert into test values (4, '{"a" : 1111111111}')""" sql """insert into test values (5, '{"a" : 1111.11111}')""" + sql """use @${clusterName1}""" + qt_cluster1 """select * from test""" + // switch to read cluster, trigger a sync rowset sql """use @${clusterName2}""" - qt_sql """select * from test""" + qt_cluster2 """select * from test""" + + // sleep for 5s to let these rowsets meet the requirement of query freshness tolerance + sleep(5000) // switch to source cluster and trigger compaction sql """use @${clusterName1}""" trigger_and_wait_compaction("test", "cumulative") // load new data to increase the version sql """insert into test values (6, '{"a" : 1111.11111}')""" + qt_cluster1_new_data "select * from test;" - sleep(5000) - // inject to let cluster2 read compaction rowset data slowly injectS3FileReadSlow(clusterName2, 10) // switch to read cluster, trigger a sync rowset sql """use @${clusterName2}""" - // sql "set query_freshness_tolerance_ms = 5000" sql "set enable_profile=true;" sql "set profile_level=2;" + + /* + def t1 = System.currentTimeMillis() + qt_sql """select * from test""" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + + when don't set query_freshness_tolerance_ms, this will have to read 2 rowsets data and will last for more than 10s+10s=20s + */ + sql "set query_freshness_tolerance_ms = 5000" def t1 = System.currentTimeMillis() - qt_sql """select * from test""" + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + qt_cluster2 """select * from test""" def t2 = System.currentTimeMillis() logger.info("query in cluster2 cost=${t2 - t1} ms") - // wait until the injection complete - // sleep(1000) - - // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_download_submitted_num")) - // assertEquals(7, getBrpcMetricsByCluster(clusterName2, 
"file_cache_warm_up_rowset_triggered_by_sync_rowset_num")) - // assertEquals(0, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_job_num")) - // sleep(5000) - // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_complete_num")) + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + // query with freshness tolerance should not fallback + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount } } diff --git a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy index 47baf979b58648..7b5f442ac5f783 100644 --- a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy +++ b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy @@ -91,20 +91,20 @@ suite('test_read_cluster_var_property') { try { // 2. test query_freshness_tolerance_ms sql "set query_freshness_tolerance_ms=1000;" - def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") + def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1 + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 sql "set query_freshness_tolerance_ms=-1;" - queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance // user property has higher prioroty than session variable sql "set property for 'root' query_freshness_tolerance_ms=2000;" - queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1 + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 } finally { sql "set query_freshness_tolerance_ms=-1;" sql "set property for 'root' query_freshness_tolerance_ms=-1;" @@ -177,20 +177,20 @@ suite('test_read_cluster_var_property') { try { // 2. 
test query_freshness_tolerance_ms
 sql "set query_freshness_tolerance_ms=1000;"
- def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
 sql "select * from ${tableName};"
- assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1

 sql "set query_freshness_tolerance_ms=-1;"
- queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
 sql "select * from ${tableName};"
- assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance

 // user property has higher priority than session variable
 sql "set property for 'root' query_freshness_tolerance_ms=2000;"
- queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count")
+ queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
 sql "select * from ${tableName};"
- assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_fallback_count") == queryFreshnessTolerance + 1
+ assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1
 } finally {
 sql "set query_freshness_tolerance_ms=-1;"
 sql "set property for 'root' query_freshness_tolerance_ms=-1;"

From 2c8a7362c98629161cd919b15936d3cef020deda Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Fri, 22 Aug 2025 19:49:56 +0800
Subject: [PATCH 14/34] fix test_enable_prefer_cached_rowset case

---
 be/src/cloud/cloud_tablet.cpp | 14 +++++++
 .../test_enable_prefer_cached_rowset.out | 20 +++++++++-
 .../test_enable_prefer_cached_rowset.groovy | 38 ++++++++++---------
 3 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 9eca5092d630a3..1e6cbc0ae51540 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -442,6 +442,20 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_
 .is_dryrun = config::enable_reader_dryrun_when_download_file_cache,
 },
 .download_done {[=](Status st) {
+ DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_data.callback.block_compaction_rowset", {
+ // clang-format on
+ if (rs->version().second > rs->version().first) {
+ auto sleep_time = dp->param("sleep", 3);
+ LOG_INFO(
+ "[verbose] block download for rowset={}, "
+ "version={}, sleep={}",
+ rs->rowset_id().to_string(),
+ rs->version().to_string(), sleep_time);
+ std::this_thread::sleep_for(
+ std::chrono::seconds(sleep_time));
+ }
+ // clang-format off
+ });
 self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st);
 if (!st) {
 LOG_WARNING("add rowset warm up error ").error(st);
diff --git
a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out index f3304b77661ae9..c9382a70217ba3 100644 --- a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out +++ b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out @@ -1,15 +1,31 @@ -- This file is automatically generated. You should know what you did if you want to edit this --- !sql -- +-- !cluster1 -- 1 {"a":1} 2 {"a":111.1111} 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} --- !sql -- +-- !cluster2 -- 1 {"a":1} 2 {"a":111.1111} 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} +-- !cluster1_new_data -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + +-- !cluster2 -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy index cdac17d7d181df..561c29605712e6 100644 --- a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy @@ -89,10 +89,10 @@ suite('test_enable_prefer_cached_rowset', 'docker') { return getBrpcMetrics(ip, port, name) } - def injectS3FileReadSlow = {cluster, sleep_s -> + def injectCompactionRowsetDownloadSlow = {cluster, sleep_s -> def backends = sql """SHOW BACKENDS""" def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } - def injectName = 'S3FileReader::read_at_impl.io_slow' + def injectName = 'CloudTablet::add_rowsets.download_data.callback.block_compaction_rowset' for (be in cluster_bes) { def ip = be[1] def port = be[4] @@ -129,7 +129,7 @@ suite('test_enable_prefer_cached_rowset', 'docker') { """ clearFileCacheOnAllBackends() - sleep(15000) + sleep(10000) sql """insert into test values (1, '{"a" : 1.0}')""" sql """insert into test values (2, '{"a" : 111.1111}')""" @@ -137,31 +137,35 @@ suite('test_enable_prefer_cached_rowset', 'docker') { sql """insert into test values (4, '{"a" : 1111111111}')""" sql """insert into test values (5, '{"a" : 1111.11111}')""" + sql """use @${clusterName1}""" + qt_cluster1 """select * from test""" + // switch to read cluster, trigger a sync rowset sql """use @${clusterName2}""" - qt_sql """select * from test""" + qt_cluster2 """select * from test""" // switch to source cluster and trigger compaction sql """use @${clusterName1}""" trigger_and_wait_compaction("test", "cumulative") - // sql """insert into test values (6, '{"a" : 1111.11111}')""" - sleep(2000) - + // load new data to increase the version + sql """insert into test values (6, '{"a" : 1111.11111}')""" + qt_cluster1_new_data "select * from test;" + // inject to let cluster2 read compaction rowset data slowly - injectS3FileReadSlow(clusterName2, 10) + injectCompactionRowsetDownloadSlow(clusterName2, 10) // switch to read cluster, trigger a sync rowset sql """use @${clusterName2}""" + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set enable_prefer_cached_rowset = true" + // when 
enable_prefer_cached_rowset = true, only need to read newly loaded data, compaction rowsets data will be skipped
+ def t1 = System.currentTimeMillis()
+ def capturePreferCacheCount = getBrpcMetricsByCluster(clusterName2, "capture_prefer_cache_count")
+ qt_cluster2 """select * from test"""
+ def t2 = System.currentTimeMillis()
+ logger.info("query in cluster2 cost=${t2 - t1} ms")
- // wait until the injection complete
- // sleep(1000)
-
- // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_download_submitted_num"))
- // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_sync_rowset_num"))
- // assertEquals(0, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_job_num"))
- // sleep(5000)
- // assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_complete_num"))
+ assert t2 - t1 < 2000
+ assert getBrpcMetricsByCluster(clusterName2, "capture_prefer_cache_count") == capturePreferCacheCount + 1
 }
}

From cd6726f1d0451fcade7b9590eaebc0bd6ad0320e Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Sun, 24 Aug 2025 10:41:56 +0800
Subject: [PATCH 15/34] update

---
 be/src/cloud/cloud_backend_service.cpp | 1 -
 be/src/cloud/cloud_tablet.cpp | 32 +++++--------------
 cloud/src/meta-service/meta_service_txn.cpp | 6 ++--
 cloud/test/meta_service_job_test.cpp | 3 +-
 .../test_query_freshness_tolerance.groovy | 8 -----
 5 files changed, 12 insertions(+), 38 deletions(-)

diff --git a/be/src/cloud/cloud_backend_service.cpp b/be/src/cloud/cloud_backend_service.cpp
index bbc70fc1739cc3..a5caf89e2074af 100644
--- a/be/src/cloud/cloud_backend_service.cpp
+++ b/be/src/cloud/cloud_backend_service.cpp
@@ -213,7 +213,6 @@ void CloudBackendService::warm_up_cache_async(TWarmUpCacheAsyncResponse& respons
 if (!cntl.Failed()) {
 g_file_cache_warm_up_cache_async_submitted_segment_num
 << brpc_response.file_cache_block_metas().size();
- // TODO(bobhan1): add callback
 _engine.file_cache_block_downloader().submit_download_task(
 std::move(*brpc_response.mutable_file_cache_block_metas()));
 } else {
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 1e6cbc0ae51540..9d3683823573a1 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -74,17 +74,12 @@ bvar::Adder g_unused_rowsets_count("unused_rowsets_count");
 bvar::Adder g_unused_rowsets_bytes("unused_rowsets_bytes");

 bvar::Adder g_capture_prefer_cache_count("capture_prefer_cache_count");
-bvar::Adder g_capture_prefer_cache_count_fallback_count(
- "capture_prefer_cache_count_fallback_count");
 bvar::Adder g_capture_with_freshness_tolerance_count(
 "capture_with_freshness_tolerance_count");
 bvar::Adder g_capture_with_freshness_tolerance_fallback_count(
 "capture_with_freshness_tolerance_fallback_count");
 bvar::Window> g_capture_prefer_cache_count_window(
 "capture_prefer_cache_count_window", &g_capture_prefer_cache_count, 30);
-bvar::Window> g_capture_prefer_cache_count_fallback_count_window(
- "capture_prefer_cache_count_fallback_count_window",
- &g_capture_prefer_cache_count_fallback_count, 30);
 bvar::Window> g_capture_with_freshness_tolerance_count_window(
 "capture_with_freshness_tolerance_count_window", &g_capture_with_freshness_tolerance_count, 30);
@@ -207,25 +202,14 @@ Status CloudTablet::capture_rs_readers_prefer_cache(const Version& spec_version,
 spec_version, version_path, [&](int64_t start, int64_t end) {
 return rowset_is_warmed_up(start, end);
 }));
 int64_t path_max_version =
version_path.back().second; - LOG_INFO("[verbose] CloudTablet::capture_rs_readers_prefer_cache, capture path: {}", - fmt::join(version_path | std::views::transform([](const auto& version) { - return fmt::format("{}", version.to_string()); - }), - ", ")) - .tag("tablet_id", tablet_id()) - .tag("spec_version", spec_version.to_string()) - .tag("path_max_version", path_max_version); - bool should_fallback = path_max_version < spec_version.second; - if (should_fallback) { - rlock.unlock(); - LOG_INFO( - "[verbose] CloudTablet::capture_rs_readers_prefer_cache, fallback, spec_version={}", - spec_version.to_string()); - g_capture_prefer_cache_count_fallback_count << 1; - // if there exists a rowset which satisfies freshness tolerance and its start version is larger than the path max version - // but has not been warmuped up yet, fallback to capture rowsets as usual - return capture_rs_readers_internal(spec_version, rs_splits); - } + VLOG_DEBUG << fmt::format( + "[verbose] CloudTablet::capture_rs_readers_prefer_cache, capture path: {}, " + "tablet_id={}, spec_version={}, path_max_version={}", + fmt::join(version_path | std::views::transform([](const auto& version) { + return fmt::format("{}", version.to_string()); + }), + ", "), + tablet_id(), spec_version.to_string(), path_max_version); return capture_rs_readers_unlocked(version_path, rs_splits); } diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index 289ce0a8917b81..c17b2565456953 100644 --- a/cloud/src/meta-service/meta_service_txn.cpp +++ b/cloud/src/meta-service/meta_service_txn.cpp @@ -2333,8 +2333,8 @@ void MetaServiceImpl::commit_txn_with_sub_txn(const CommitTxnRequest* request, commit_txn_log.set_txn_id(txn_id); commit_txn_log.set_db_id(db_id); - int64_t rowsets_visible_time_ms = - duration_cast(system_clock::now().time_since_epoch()).count(); + int64_t rowsets_visible_ts_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); // -> rowset meta std::vector, RowsetMetaCloudPB>> rowsets; @@ -2369,7 +2369,7 @@ void MetaServiceImpl::commit_txn_with_sub_txn(const CommitTxnRequest* request, } i.set_start_version(new_version); i.set_end_version(new_version); - i.set_visible_time_ms(rowsets_visible_time_ms); + i.set_visible_ts_ms(rowsets_visible_ts_ms); LOG(INFO) << "xxx update rowset version, txn_id=" << txn_id << ", sub_txn_id=" << sub_txn_id << ", table_id=" << table_id << ", partition_id=" << partition_id << ", tablet_id=" << tablet_id diff --git a/cloud/test/meta_service_job_test.cpp b/cloud/test/meta_service_job_test.cpp index 351de729493064..c30be90e1dad35 100644 --- a/cloud/test/meta_service_job_test.cpp +++ b/cloud/test/meta_service_job_test.cpp @@ -3640,8 +3640,7 @@ TEST(MetaServiceJobTest, SchemaChangeJobTest) { EXPECT_EQ(saved_rowset.rowset_id_v2(), rs.rowset_id_v2()); ASSERT_TRUE(saved_rowset.has_visible_ts_ms() && saved_rowset.visible_ts_ms() > 0); using namespace std::chrono; - auto visible_tp = - time_point(milliseconds(saved_rowset.visible_ts_ms())); + auto visible_tp = time_point(milliseconds(saved_rowset.visible_ts_ms())); std::time_t visible_time = system_clock::to_time_t(visible_tp); std::cout << "visible time: " << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy index b077e603aa3986..c8dd0d261e8f62 100644 --- 
a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy @@ -163,14 +163,6 @@ suite('test_query_freshness_tolerance', 'docker') { sql "set enable_profile=true;" sql "set profile_level=2;" - /* - def t1 = System.currentTimeMillis() - qt_sql """select * from test""" - def t2 = System.currentTimeMillis() - logger.info("query in cluster2 cost=${t2 - t1} ms") - - when don't set query_freshness_tolerance_ms, this will have to read 2 rowsets data and will last for more than 10s+10s=20s - */ sql "set query_freshness_tolerance_ms = 5000" def t1 = System.currentTimeMillis() def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") From 9b65e964da0fcf8f759513d7e716899c3eab6d01 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 5 Sep 2025 18:56:34 +0800 Subject: [PATCH 16/34] add bvar for rowset warmup task --- be/src/cloud/cloud_tablet.cpp | 37 +++++++++++++++++++++++------------ be/src/cloud/cloud_tablet.h | 16 ++++++++++++--- 2 files changed, 37 insertions(+), 16 deletions(-) diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 9d3683823573a1..c0a400bbe73a56 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -18,6 +18,7 @@ #include "cloud/cloud_tablet.h" #include +#include #include #include #include @@ -123,6 +124,8 @@ bvar::Adder g_file_cache_warm_up_rowset_triggered_by_job_num( "file_cache_warm_up_rowset_triggered_by_job_num"); bvar::Adder g_file_cache_warm_up_rowset_triggered_by_sync_rowset_num( "file_cache_warm_up_rowset_triggered_by_sync_rowset_num"); +bvar::LatencyRecorder g_file_cache_warm_up_rowset_all_segments_latency( + "file_cache_warm_up_rowset_all_segments_latency"); CloudTablet::CloudTablet(CloudStorageEngine& engine, TabletMetaSharedPtr tablet_meta) : BaseTablet(std::move(tablet_meta)), _engine(engine) {} @@ -1598,19 +1601,21 @@ Status CloudTablet::check_delete_bitmap_cache(int64_t txn_id, WarmUpState CloudTablet::get_rowset_warmup_state(RowsetId rowset_id) { std::shared_lock rlock(_meta_lock); - if (_rowset_warm_up_states.find(rowset_id) == _rowset_warm_up_states.end()) { + if (!_rowset_warm_up_states.contains(rowset_id)) { return WarmUpState::NONE; } - return _rowset_warm_up_states[rowset_id].first; + return _rowset_warm_up_states[rowset_id].state; } -bool CloudTablet::add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state) { +bool CloudTablet::add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp) { std::lock_guard wlock(_meta_lock); - return add_rowset_warmup_state_unlocked(rowset, state); + return add_rowset_warmup_state_unlocked(rowset, state, start_tp); } -bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state) { - if (_rowset_warm_up_states.find(rowset.rowset_id()) != _rowset_warm_up_states.end()) { +bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp) { + if (_rowset_warm_up_states.contains(rowset.rowset_id())) { return false; } if (state == WarmUpState::TRIGGERED_BY_JOB) { @@ -1618,13 +1623,14 @@ bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, War } else if (state == WarmUpState::TRIGGERED_BY_SYNC_ROWSET) { g_file_cache_warm_up_rowset_triggered_by_sync_rowset_num << 
1; } - _rowset_warm_up_states[rowset.rowset_id()] = std::make_pair(state, rowset.num_segments()); + _rowset_warm_up_states[rowset.rowset_id()] = { + .state = state, .num_segments = rowset.num_segments(), .start_tp = start_tp}; return true; } WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Status status) { std::lock_guard wlock(_meta_lock); - if (_rowset_warm_up_states.find(rowset_id) == _rowset_warm_up_states.end()) { + if (!_rowset_warm_up_states.contains(rowset_id)) { return WarmUpState::NONE; } VLOG_DEBUG << "complete rowset segment warmup for rowset " << rowset_id << ", " << status; @@ -1632,13 +1638,18 @@ WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Stat if (!status.ok()) { g_file_cache_warm_up_segment_failed_num << 1; } - _rowset_warm_up_states[rowset_id].second--; - if (_rowset_warm_up_states[rowset_id].second <= 0) { + _rowset_warm_up_states[rowset_id].num_segments--; + if (_rowset_warm_up_states[rowset_id].num_segments <= 0) { g_file_cache_warm_up_rowset_complete_num << 1; add_warmed_up_rowset(rowset_id); - _rowset_warm_up_states[rowset_id].first = WarmUpState::DONE; - } - return _rowset_warm_up_states[rowset_id].first; + auto cost = std::chrono::duration_cast( + std::chrono::steady_clock::now() - + _rowset_warm_up_states[rowset_id].start_tp) + .count(); + g_file_cache_warm_up_rowset_all_segments_latency << cost; + _rowset_warm_up_states[rowset_id].state = WarmUpState::DONE; + } + return _rowset_warm_up_states[rowset_id].state; } #include "common/compile_check_end.h" diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 3c442e75edbca1..a76bba04d6f8ab 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -21,6 +21,7 @@ #include "olap/base_tablet.h" #include "olap/partial_update_info.h" +#include "olap/rowset/rowset.h" namespace doris { @@ -304,7 +305,9 @@ class CloudTablet final : public BaseTablet { // Add warmup state management WarmUpState get_rowset_warmup_state(RowsetId rowset_id); - bool add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state); + bool add_rowset_warmup_state( + const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status); bool is_rowset_warmed_up(const RowsetId& rowset_id) const { @@ -330,7 +333,9 @@ class CloudTablet final : public BaseTablet { Status sync_if_not_running(SyncRowsetStats* stats = nullptr); - bool add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state); + bool add_rowset_warmup_state_unlocked( + const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); // used by capture_rs_reader_xxx functions bool rowset_is_warmed_up(int64_t start_version, int64_t end_version); @@ -393,7 +398,12 @@ class CloudTablet final : public BaseTablet { std::vector, DeleteBitmapKeyRanges>> _unused_delete_bitmap; // for warm up states management - std::unordered_map> _rowset_warm_up_states; + struct RowsetWarmUpInfo { + WarmUpState state; + int64_t num_segments; + std::chrono::steady_clock::time_point start_tp; + }; + std::unordered_map _rowset_warm_up_states; mutable std::shared_mutex _warmed_up_rowsets_mutex; std::unordered_set _warmed_up_rowsets; From b0e23bbbebf3189b19dcdab539041372c7f14a24 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 5 Sep 2025 22:54:30 +0800 Subject: [PATCH 17/34] update case --- 
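Patch 16 above replaces the bare (state, remaining-segment-count) pair with a named RowsetWarmUpInfo struct and reports end-to-end rowset warmup latency through the new bvar::LatencyRecorder. A minimal, self-contained sketch of that countdown-plus-latency pattern, using illustrative stand-ins rather than the actual Doris types:

    #include <chrono>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <unordered_map>

    enum class WarmUpState { NONE, TRIGGERED_BY_JOB, TRIGGERED_BY_SYNC_ROWSET, DONE };

    struct RowsetWarmUpInfo {
        WarmUpState state;
        int64_t num_segments;  // segments still waiting for a download callback
        std::chrono::steady_clock::time_point start_tp;
    };

    std::unordered_map<std::string, RowsetWarmUpInfo> g_states;  // keyed by rowset id

    WarmUpState complete_one_segment(const std::string& rowset_id) {
        auto it = g_states.find(rowset_id);
        if (it == g_states.end()) return WarmUpState::NONE;
        if (--it->second.num_segments <= 0) {
            auto cost_us = std::chrono::duration_cast<std::chrono::microseconds>(
                                   std::chrono::steady_clock::now() - it->second.start_tp)
                                   .count();
            std::cout << "rowset " << rowset_id << " warmed up in " << cost_us << " us\n";
            it->second.state = WarmUpState::DONE;  // last pending segment reported back
        }
        return it->second.state;
    }

    int main() {
        g_states["rs1"] = {WarmUpState::TRIGGERED_BY_JOB, 2, std::chrono::steady_clock::now()};
        complete_one_segment("rs1");  // one segment still pending: state unchanged
        if (complete_one_segment("rs1") == WarmUpState::DONE) std::cout << "done\n";
    }

Using steady_clock for the start timestamp, as the patch does, keeps the measured latency immune to wall-clock adjustments.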
.../suites/cloud_p0/test_read_cluster_var_property.groovy | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy index 7b5f442ac5f783..8b79fb06d5bc49 100644 --- a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy +++ b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy @@ -82,7 +82,7 @@ suite('test_read_cluster_var_property') { sql "set property for 'root' enable_prefer_cached_rowset=true;" preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + 1 + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") > preferCachedRowsetCount } finally { sql "set enable_prefer_cached_rowset=false;" sql "set property for 'root' enable_prefer_cached_rowset=false;" @@ -104,7 +104,7 @@ suite('test_read_cluster_var_property') { sql "set property for 'root' query_freshness_tolerance_ms=2000;" queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") > queryFreshnessTolerance } finally { sql "set query_freshness_tolerance_ms=-1;" sql "set property for 'root' query_freshness_tolerance_ms=-1;" @@ -190,7 +190,7 @@ suite('test_read_cluster_var_property') { sql "set property for 'root' query_freshness_tolerance_ms=2000;" queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") > queryFreshnessTolerance } finally { sql "set query_freshness_tolerance_ms=-1;" sql "set property for 'root' query_freshness_tolerance_ms=-1;" From 4e871805f96f3ddad9e09773640b11fd68a17cab Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 8 Sep 2025 10:47:57 +0800 Subject: [PATCH 18/34] fix test_read_cluster_var_property case --- .../test_read_cluster_var_property.groovy | 311 +++++++++--------- 1 file changed, 159 insertions(+), 152 deletions(-) diff --git a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy index 8b79fb06d5bc49..3305d4ce5dbfdb 100644 --- a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy +++ b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy @@ -18,7 +18,12 @@ suite('test_read_cluster_var_property') { if (!isCloudMode()) { return - } + } + String userName = "test_read_cluster_var_property_user" + String pwd = '123456' + sql """drop user if exists ${userName}""" + sql """CREATE USER '${userName}' IDENTIFIED BY '${pwd}'""" + sql """GRANT ADMIN_PRIV ON *.*.* TO ${userName}""" def getBrpcMetrics = {ip, port, name -> def url = 
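Two different fixes for the same flakiness meet here. Patch 17 relaxes the assertions from "== old + 1" to "> old" because capture_prefer_cache_count and capture_with_freshness_tolerance_count are process-wide brpc counters that any concurrent query on the same backend can bump between the two reads; patch 18 instead runs the suite as a dedicated user so the strict +1 form becomes reliable again. A small sketch of why the strict check is racy, with a plain atomic standing in for the bvar counter:

    #include <atomic>
    #include <cstdint>
    #include <iostream>
    #include <thread>

    std::atomic<int64_t> g_capture_count{0};  // stand-in for a process-wide bvar counter

    void run_query() { g_capture_count.fetch_add(1); }

    int main() {
        int64_t before = g_capture_count.load();
        std::thread other(run_query);  // an unrelated session on the same BE
        run_query();                   // the query under test
        other.join();
        int64_t after = g_capture_count.load();
        // after == before + 2 here, so a strict "== before + 1" assertion fails;
        // "> before" always holds, at the cost of a weaker guarantee.
        std::cout << (after > before) << "\n";
    }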
"http://${ip}:${port}/brpc_metrics" @@ -33,175 +38,177 @@ suite('test_read_cluster_var_property') { } } - // test non-mow table - try { - def tableName = "test_read_cluster_var_property" - sql """ DROP TABLE IF EXISTS ${tableName} """ - sql """ CREATE TABLE ${tableName} - (k int, v1 int, v2 int ) - DUPLICATE KEY(k) - DISTRIBUTED BY HASH (k) - BUCKETS 1 PROPERTIES( - "replication_num" = "1", - "disable_auto_compaction" = "true"); - """ - - (1..20).each{ id -> - sql """insert into ${tableName} select number, number, number from numbers("number"="10");""" - } - - sql "select * from ${tableName};" - - def backends = sql_return_maparray('show backends') - def tabletStats = sql_return_maparray("show tablets from ${tableName};") - assert tabletStats.size() == 1 - def tabletId = tabletStats[0].TabletId - def tabletBackendId = tabletStats[0].BackendId - def tabletBackend - for (def be : backends) { - if (be.BackendId == tabletBackendId) { - tabletBackend = be - break; + connect(userName, "${pwd}", context.config.jdbcUrl) { + // test non-mow table + try { + def tableName = "test_read_cluster_var_property" + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ CREATE TABLE ${tableName} + (k int, v1 int, v2 int ) + DUPLICATE KEY(k) + DISTRIBUTED BY HASH (k) + BUCKETS 1 PROPERTIES( + "replication_num" = "1", + "disable_auto_compaction" = "true"); + """ + + (1..20).each{ id -> + sql """insert into ${tableName} select number, number, number from numbers("number"="10");""" } - } - logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}"); - try { - // 1. test enable_prefer_cached_rowset - sql "set enable_prefer_cached_rowset=true;" - def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + 1 - sql "set enable_prefer_cached_rowset=false;" - preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + def backends = sql_return_maparray('show backends') + def tabletStats = sql_return_maparray("show tablets from ${tableName};") + assert tabletStats.size() == 1 + def tabletId = tabletStats[0].TabletId + def tabletBackendId = tabletStats[0].BackendId + def tabletBackend + for (def be : backends) { + if (be.BackendId == tabletBackendId) { + tabletBackend = be + break; + } + } + logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}"); + + try { + // 1. 
test enable_prefer_cached_rowset + sql "set enable_prefer_cached_rowset=true;" + def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + 1 + + sql "set enable_prefer_cached_rowset=false;" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + + // user property has higher prioroty than session variable + sql "set property for '${userName}' enable_prefer_cached_rowset=true;" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == 1 + preferCachedRowsetCount + } finally { + sql "set enable_prefer_cached_rowset=false;" + sql "set property for '${userName}' enable_prefer_cached_rowset=false;" + } - // user property has higher prioroty than session variable - sql "set property for 'root' enable_prefer_cached_rowset=true;" - preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") > preferCachedRowsetCount + try { + // 2. test query_freshness_tolerance_ms + sql "set query_freshness_tolerance_ms=1000;" + def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 + + sql "set query_freshness_tolerance_ms=-1;" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + + // user property has higher prioroty than session variable + sql "set property for '${userName}' query_freshness_tolerance_ms=2000;" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == 1 + queryFreshnessTolerance + } finally { + sql "set query_freshness_tolerance_ms=-1;" + sql "set property for '${userName}' query_freshness_tolerance_ms=-1;" + } + } catch (Exception e) { + logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}") } finally { sql "set enable_prefer_cached_rowset=false;" - sql "set property for 'root' enable_prefer_cached_rowset=false;" - } - - try { - // 2. 
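The "== 1 + preferCachedRowsetCount" assertions above exercise the precedence rule this series relies on: a value set via SET PROPERTY FOR a user overrides that user's session variable, so the counter still advances even though the session has enable_prefer_cached_rowset=false. The real resolution happens on the FE (SessionVariable.java and the user-property machinery); the sketch below is only a hypothetical restatement of the rule, using -1 as "unset" to match the query_freshness_tolerance_ms convention in these tests:

    #include <cstdint>
    #include <iostream>
    #include <optional>

    int64_t effective_tolerance_ms(std::optional<int64_t> user_property, int64_t session_value) {
        // A set, non-negative user property wins over the session variable.
        if (user_property && *user_property >= 0) return *user_property;
        return session_value;
    }

    int main() {
        std::cout << effective_tolerance_ms(std::nullopt, 1000) << "\n";  // 1000
        std::cout << effective_tolerance_ms(2000, 1000) << "\n";          // 2000: property wins
        std::cout << effective_tolerance_ms(-1, 1000) << "\n";            // 1000: -1 means unset
    }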
test query_freshness_tolerance_ms - sql "set query_freshness_tolerance_ms=1000;" - def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 - - sql "set query_freshness_tolerance_ms=-1;" - queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance - - // user property has higher prioroty than session variable - sql "set property for 'root' query_freshness_tolerance_ms=2000;" - queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") > queryFreshnessTolerance - } finally { sql "set query_freshness_tolerance_ms=-1;" - sql "set property for 'root' query_freshness_tolerance_ms=-1;" - } - } catch (Exception e) { - logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}") - } finally { - sql "set enable_prefer_cached_rowset=false;" - sql "set query_freshness_tolerance_ms=-1;" - sql "set property for 'root' enable_prefer_cached_rowset=false;" - sql "set property for 'root' query_freshness_tolerance_ms=-1;" - } - - // test mow table - try { - def tableName = "test_read_cluster_var_property_mow" - sql """ DROP TABLE IF EXISTS ${tableName} """ - sql """ CREATE TABLE ${tableName} - (k int, v1 int, v2 int ) - UNIQUE KEY(k) DISTRIBUTED BY HASH (k) - BUCKETS 1 PROPERTIES( - "replication_num" = "1", - "enable_unique_key_merge_on_write" = "true", - "disable_auto_compaction" = "true"); - """ - - (1..20).each{ id -> - sql """insert into ${tableName} select number, number, number from numbers("number"="10");""" + sql "set property for '${userName}' enable_prefer_cached_rowset=false;" + sql "set property for '${userName}' query_freshness_tolerance_ms=-1;" } - sql "select * from ${tableName};" - - def backends = sql_return_maparray('show backends') - def tabletStats = sql_return_maparray("show tablets from ${tableName};") - assert tabletStats.size() == 1 - def tabletId = tabletStats[0].TabletId - def tabletBackendId = tabletStats[0].BackendId - def tabletBackend - for (def be : backends) { - if (be.BackendId == tabletBackendId) { - tabletBackend = be - break; + // test mow table + try { + def tableName = "test_read_cluster_var_property_mow" + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ CREATE TABLE ${tableName} + (k int, v1 int, v2 int ) + UNIQUE KEY(k) DISTRIBUTED BY HASH (k) + BUCKETS 1 PROPERTIES( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "disable_auto_compaction" = "true"); + """ + + (1..20).each{ id -> + sql """insert into ${tableName} select number, number, number from numbers("number"="10");""" } - } - logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}"); - try { - // 1. 
test enable_prefer_cached_rowset - // enable_prefer_cached_rowset should not take effect on mow table - sql "set enable_prefer_cached_rowset=true;" - def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount - sql "set enable_prefer_cached_rowset=false;" - preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + def backends = sql_return_maparray('show backends') + def tabletStats = sql_return_maparray("show tablets from ${tableName};") + assert tabletStats.size() == 1 + def tabletId = tabletStats[0].TabletId + def tabletBackendId = tabletStats[0].BackendId + def tabletBackend + for (def be : backends) { + if (be.BackendId == tabletBackendId) { + tabletBackend = be + break; + } + } + logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}"); + + try { + // 1. test enable_prefer_cached_rowset + // enable_prefer_cached_rowset should not take effect on mow table + sql "set enable_prefer_cached_rowset=true;" + def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + + sql "set enable_prefer_cached_rowset=false;" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + + // user property has higher prioroty than session variable + sql "set property for '${userName}' enable_prefer_cached_rowset=true;" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + } finally { + sql "set enable_prefer_cached_rowset=false;" + sql "set property for '${userName}' enable_prefer_cached_rowset=false;" + } - // user property has higher prioroty than session variable - sql "set property for 'root' enable_prefer_cached_rowset=true;" - preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + try { + // 2. 
test query_freshness_tolerance_ms + sql "set query_freshness_tolerance_ms=1000;" + def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 + + sql "set query_freshness_tolerance_ms=-1;" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + + // user property has higher prioroty than session variable + sql "set property for '${userName}' query_freshness_tolerance_ms=2000;" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == 1 + queryFreshnessTolerance + } finally { + sql "set query_freshness_tolerance_ms=-1;" + sql "set property for '${userName}' query_freshness_tolerance_ms=-1;" + } + } catch (Exception e) { + logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}") + throw e } finally { sql "set enable_prefer_cached_rowset=false;" - sql "set property for 'root' enable_prefer_cached_rowset=false;" - } - - try { - // 2. test query_freshness_tolerance_ms - sql "set query_freshness_tolerance_ms=1000;" - def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 - - sql "set query_freshness_tolerance_ms=-1;" - queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance - - // user property has higher prioroty than session variable - sql "set property for 'root' query_freshness_tolerance_ms=2000;" - queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") - sql "select * from ${tableName};" - assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") > queryFreshnessTolerance - } finally { sql "set query_freshness_tolerance_ms=-1;" - sql "set property for 'root' query_freshness_tolerance_ms=-1;" + sql "set property for '${userName}' enable_prefer_cached_rowset=false;" + sql "set property for '${userName}' query_freshness_tolerance_ms=-1;" } - } catch (Exception e) { - logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}") - throw e - } finally { - sql "set enable_prefer_cached_rowset=false;" - sql "set query_freshness_tolerance_ms=-1;" - sql "set property for 'root' enable_prefer_cached_rowset=false;" - sql "set property for 'root' query_freshness_tolerance_ms=-1;" } } \ No newline at end of file From 7ec1fd47639f3d64c027d602de3565c644404af1 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 8 Sep 2025 11:26:49 +0800 
Subject: [PATCH 19/34] move cases --- .../no_warmup}/test_enable_prefer_cached_rowset.out | 0 .../no_warmup}/test_query_freshness_tolerance.out | 0 .../no_warmup}/test_enable_prefer_cached_rowset.groovy | 0 .../no_warmup}/test_query_freshness_tolerance.groovy | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename regression-test/data/cloud_p0/{cache/multi_cluster/warm_up/cluster => read_cluster_cache/no_warmup}/test_enable_prefer_cached_rowset.out (100%) rename regression-test/data/cloud_p0/{cache/multi_cluster/warm_up/cluster => read_cluster_cache/no_warmup}/test_query_freshness_tolerance.out (100%) rename regression-test/suites/cloud_p0/{cache/multi_cluster/warm_up/cluster => read_cluster_cache/no_warmup}/test_enable_prefer_cached_rowset.groovy (100%) rename regression-test/suites/cloud_p0/{cache/multi_cluster/warm_up/cluster => read_cluster_cache/no_warmup}/test_query_freshness_tolerance.groovy (100%) diff --git a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out similarity index 100% rename from regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.out rename to regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out diff --git a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out similarity index 100% rename from regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.out rename to regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy similarity index 100% rename from regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_enable_prefer_cached_rowset.groovy rename to regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy similarity index 100% rename from regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_query_freshness_tolerance.groovy rename to regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy From d8cdd865b620802dcbc9b7244edc0301e6ddc58e Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 8 Sep 2025 19:07:06 +0800 Subject: [PATCH 20/34] fix --- be/src/cloud/cloud_internal_service.cpp | 36 +++++++++---------- .../test_enable_prefer_cached_rowset.groovy | 4 +-- .../test_query_freshness_tolerance.groovy | 4 +-- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 59a198ee1710c9..94ba7505fafdda 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -190,6 +190,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c continue; } int64_t tablet_id = rs_meta.tablet_id(); + auto rowset_id = 
rs_meta.rowset_id(); bool local_only = !(request->has_skip_existence_check() && request->skip_existence_check()); auto res = _engine.tablet_mgr().get_tablet(tablet_id, /* warmup_data = */ false, /* sync_delete_bitmap = */ true, @@ -216,7 +217,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c g_file_cache_warm_up_rowset_request_to_handle_slow_count << 1; LOG(INFO) << "warm up rowset (request to handle) took " << handle_ts - request_ts << " us, tablet_id: " << rs_meta.tablet_id() - << ", rowset_id: " << rs_meta.rowset_id().to_string(); + << ", rowset_id: " << rowset_id.to_string(); } int64_t expiration_time = tablet_meta->ttl_seconds() == 0 || rs_meta.newest_write_timestamp() <= 0 @@ -227,16 +228,14 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c } if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpState::TRIGGERED_BY_JOB)) { - LOG(INFO) << "found duplicate warmup task for rowset " << rs_meta.rowset_id() + LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() << ", skip it"; continue; } for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { - auto download_done = [&, tablet_id = rs_meta.tablet_id(), - rowset_id = rs_meta.rowset_id().to_string(), - segment_size = rs_meta.segment_file_size(segment_id), - wait](Status st) { + auto segment_size = rs_meta.segment_file_size(segment_id); + auto download_done = [=, version = rs_meta.version()](Status st) { if (st.ok()) { g_file_cache_event_driven_warm_up_finished_segment_num << 1; g_file_cache_event_driven_warm_up_finished_segment_size << segment_size; @@ -250,25 +249,26 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c now_ts - request_ts > config::warm_up_rowset_slow_log_ms * 1000) { g_file_cache_warm_up_rowset_slow_count << 1; LOG(INFO) << "warm up rowset took " << now_ts - request_ts - << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id + << " us, tablet_id: " << tablet_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } if (now_ts - handle_ts > config::warm_up_rowset_slow_log_ms * 1000) { g_file_cache_warm_up_rowset_handle_to_finish_slow_count << 1; LOG(INFO) << "warm up rowset (handle to finish) took " << now_ts - handle_ts - << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id + << " us, tablet_id: " << tablet_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } } else { g_file_cache_event_driven_warm_up_failed_segment_num << 1; g_file_cache_event_driven_warm_up_failed_segment_size << segment_size; LOG(WARNING) << "download segment failed, tablet_id: " << tablet_id - << " rowset_id: " << rowset_id << ", error: " << st; + << " rowset_id: " << rowset_id.to_string() << ", error: " << st; } - if (tablet->complete_rowset_segment_warmup(rs_meta.rowset_id(), st) == - WarmUpState::DONE) { - VLOG_DEBUG << "warmup rowset " << rs_meta.version() << "(" << rowset_id - << ") completed"; + if (tablet->complete_rowset_segment_warmup(rowset_id, st) == WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" + << rowset_id.to_string() << ") completed"; } if (wait) { wait->signal(); @@ -277,9 +277,9 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c io::DownloadFileMeta download_meta { .path = storage_resource.value()->remote_segment_path(rs_meta, segment_id), - .file_size = rs_meta.segment_file_size(segment_id), + .file_size = segment_size, .offset = 
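The capture-list rewrite above is the substance of patch 20: download_done previously captured pieces of rs_meta by reference through [&, ...], but the callback runs on the downloader thread after the request-handling loop has moved on, so everything it touches must be copied in ([=, version = rs_meta.version()], plus the hoisted rowset_id and segment_size). A distilled sketch of the dangling-capture hazard this avoids, not the production code:

    #include <functional>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in for an async downloader: callbacks run after the submitting
    // scope has exited.
    std::vector<std::function<void()>> g_pending;

    void submit(std::function<void()> cb) { g_pending.push_back(std::move(cb)); }

    void enqueue_downloads() {
        for (int seg = 0; seg < 2; ++seg) {
            std::string rowset_id = "rs_" + std::to_string(seg);  // loop-local
            // Capturing rowset_id by reference here would dangle once this
            // iteration ends; capture by value (or init-capture) instead.
            submit([rowset_id, seg] {
                std::cout << "finished " << rowset_id << " segment " << seg << "\n";
            });
        }
    }

    int main() {
        enqueue_downloads();
        for (auto& cb : g_pending) cb();  // runs long after the loop is gone
    }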
0, - .download_size = rs_meta.segment_file_size(segment_id), + .download_size = segment_size, .file_system = storage_resource.value()->fs, .ctx = { @@ -291,8 +291,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c .download_done = std::move(download_done), }; g_file_cache_event_driven_warm_up_submitted_segment_num << 1; - g_file_cache_event_driven_warm_up_submitted_segment_size - << rs_meta.segment_file_size(segment_id); + g_file_cache_event_driven_warm_up_submitted_segment_size << segment_size; if (wait) { wait->add_count(); } @@ -300,8 +299,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c auto download_inverted_index = [&](std::string index_path, uint64_t idx_size) { auto storage_resource = rs_meta.remote_storage_resource(); - auto download_done = [=, tablet_id = rs_meta.tablet_id(), - rowset_id = rs_meta.rowset_id().to_string()](Status st) { + auto download_done = [=, rowset_id = rowset_id.to_string()](Status st) { if (st.ok()) { g_file_cache_event_driven_warm_up_finished_index_num << 1; g_file_cache_event_driven_warm_up_finished_index_size << idx_size; diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy index 561c29605712e6..b0d6795d5c5fab 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy @@ -54,7 +54,7 @@ suite('test_enable_prefer_cached_rowset', 'docker') { } // clear file cache is async, wait it done - sleep(5000) + sleep(2000) } def updateBeConf = {cluster, key, value -> @@ -129,7 +129,7 @@ suite('test_enable_prefer_cached_rowset', 'docker') { """ clearFileCacheOnAllBackends() - sleep(10000) + sleep(2000) sql """insert into test values (1, '{"a" : 1.0}')""" sql """insert into test values (2, '{"a" : 111.1111}')""" diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy index c8dd0d261e8f62..13e9ba837344ca 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy @@ -54,7 +54,7 @@ suite('test_query_freshness_tolerance', 'docker') { } // clear file cache is async, wait it done - sleep(5000) + sleep(2000) } def updateBeConf = {cluster, key, value -> @@ -131,7 +131,7 @@ suite('test_query_freshness_tolerance', 'docker') { """ clearFileCacheOnAllBackends() - sleep(15000) + sleep(2000) sql """insert into test values (1, '{"a" : 1.0}')""" sql """insert into test values (2, '{"a" : 111.1111}')""" From e1161655aa5435cdabbc4331d7ca2d801aca9ab1 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Tue, 9 Sep 2025 16:05:54 +0800 Subject: [PATCH 21/34] tmo --- be/src/cloud/cloud_internal_service.cpp | 7 + ...armup_delay_compaction_query_tolerance.out | 9 + ...up_delay_compaction_query_tolerance.groovy | 309 ++++++++++++++++++ 3 files changed, 325 insertions(+) create mode 100644 regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out create mode 100644 
regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 94ba7505fafdda..c7a958b40b64dc 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -26,6 +26,7 @@ #include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" +#include "util/debug_points.h" namespace doris { #include "common/compile_check_avoid_begin.h" @@ -236,6 +237,12 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { auto segment_size = rs_meta.segment_file_size(segment_id); auto download_done = [=, version = rs_meta.version()](Status st) { + DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_segment", { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO("[verbose] block download for rowset={}, version={}, sleep={}", + rowset_id.to_string(), version.to_string(), sleep_time); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + }); if (st.ok()) { g_file_cache_event_driven_warm_up_finished_segment_num << 1; g_file_cache_event_driven_warm_up_finished_segment_size << segment_size; diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out new file mode 100644 index 00000000000000..7cefab58718a8e --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2 -- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 + diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy new file mode 100644 index 00000000000000..7c1d870d300a56 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy @@ -0,0 +1,309 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
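The DBUG_EXECUTE_IF hook added above is what the new regression test leans on: the suite enables the named point with a sleep parameter through the debug-point API, and the download callback then stalls for that long, simulating a slow warmup. A simplified stand-in for the mechanism (the real implementation lives behind util/debug_points.h):

    #include <chrono>
    #include <functional>
    #include <iostream>
    #include <string>
    #include <thread>
    #include <unordered_map>

    // name -> sleep seconds; tests would populate this remotely.
    std::unordered_map<std::string, int> g_enabled_points;

    void execute_if(const std::string& name, const std::function<void(int)>& block) {
        auto it = g_enabled_points.find(name);
        if (it != g_enabled_points.end()) block(it->second);  // runs only when enabled
    }

    void download_done_callback() {
        execute_if("warm_up_rowset.download_segment", [](int sleep_s) {
            std::cout << "debug point hit, sleeping " << sleep_s << "s\n";
            std::this_thread::sleep_for(std::chrono::seconds(sleep_s));
        });
        std::cout << "segment warmup completed\n";
    }

    int main() {
        g_enabled_points["warm_up_rowset.download_segment"] = 1;  // as the test does
        download_done_callback();
    }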
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_compaction_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=20000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, 
"file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job 
ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(5000) + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test values (2, 2)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(3000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + // trigger and wait compaction async + def future = thread { + sql """use @${clusterName1}""" + do_cumu_compaction(src_be, "test", tablet_id, 2, 5) + } + // wait until the warmup for compaction started + waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + assertEquals(num_submitted + 1, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")) + assertEquals(num_finished, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")) + + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + + + // in this moment, compaction has completed, but not commited, it's waiting for warm up + // trigger a query on read cluster, can't read the compaction data + sql """use @${clusterName2}""" + sql "select * from test" + def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id) + def rowsets = tablet_status ["rowsets"] + assert rowsets[1].contains("[2-2]") + assert rowsets[2].contains("[3-3]") + assert rowsets[3].contains("[4-4]") + assert rowsets[4].contains("[5-5]") + assert rowsets[5].contains("[6-6]") + assert rowsets[6].contains("[7-7]") + assert rowsets[7].contains("[8-8]") + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + qt_cluster2 """select * from test""" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + 
assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + future.get() + } +} From 2e68f6b3da6dfcb2ef8ea074673f249c8338703a Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Wed, 10 Sep 2025 15:11:03 +0800 Subject: [PATCH 22/34] consider inverted idx file --- be/src/cloud/cloud_internal_service.cpp | 27 ++++++-- be/src/cloud/cloud_tablet.cpp | 60 +++++++++++++++--- be/src/cloud/cloud_tablet.h | 20 +++++- be/src/cloud/cloud_warm_up_manager.cpp | 36 +++++++++-- be/test/cloud/cloud_tablet_test.cpp | 84 +++++++++++++++++++++---- 5 files changed, 191 insertions(+), 36 deletions(-) diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index c7a958b40b64dc..53470088566992 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -273,7 +273,8 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c LOG(WARNING) << "download segment failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id.to_string() << ", error: " << st; } - if (tablet->complete_rowset_segment_warmup(rowset_id, st) == WarmUpState::DONE) { + if (tablet->complete_rowset_segment_warmup(rowset_id, st, 1, 0) == + WarmUpState::DONE) { VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" << rowset_id.to_string() << ") completed"; } @@ -304,9 +305,18 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c } _engine.file_cache_block_downloader().submit_download_task(download_meta); - auto download_inverted_index = [&](std::string index_path, uint64_t idx_size) { + auto download_inverted_index = [&, tablet](std::string index_path, uint64_t idx_size) { auto storage_resource = rs_meta.remote_storage_resource(); - auto download_done = [=, rowset_id = rowset_id.to_string()](Status st) { + auto download_done = [=, version = rs_meta.version()](Status st) { + DBUG_EXECUTE_IF( + "CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for rowset={}, inverted index " + "file={}, sleep={}", + rowset_id.to_string(), index_path, sleep_time); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + }); if (st.ok()) { g_file_cache_event_driven_warm_up_finished_index_num << 1; g_file_cache_event_driven_warm_up_finished_index_size << idx_size; @@ -323,14 +333,14 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c g_file_cache_warm_up_rowset_slow_count << 1; LOG(INFO) << "warm up rowset took " << now_ts - request_ts << " us, tablet_id: " << tablet_id - << ", rowset_id: " << rowset_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } if (now_ts - handle_ts > config::warm_up_rowset_slow_log_ms * 1000) { g_file_cache_warm_up_rowset_handle_to_finish_slow_count << 1; LOG(INFO) << "warm up rowset (handle to finish) took " << now_ts - handle_ts << " us, tablet_id: " << tablet_id - << ", rowset_id: " << rowset_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } } else { @@ -339,6 +349,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c LOG(WARNING) << "download inverted index failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id << ", error: " << st; } + if (tablet->complete_rowset_segment_warmup(rowset_id, st, 0, 1) == + 
WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" + << rowset_id.to_string() << ") completed"; + } if (wait) { wait->signal(); } @@ -358,7 +373,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c }; g_file_cache_event_driven_warm_up_submitted_index_num << 1; g_file_cache_event_driven_warm_up_submitted_index_size << idx_size; - + tablet->update_rowset_warmup_state_inverted_idx_num(rowset_id, 1); if (wait) { wait->add_count(); } diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index c0a400bbe73a56..8896d92c7b4539 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -118,6 +118,10 @@ bvar::Adder g_file_cache_warm_up_segment_complete_num( "file_cache_warm_up_segment_complete_num"); bvar::Adder g_file_cache_warm_up_segment_failed_num( "file_cache_warm_up_segment_failed_num"); +bvar::Adder g_file_cache_warm_up_inverted_idx_complete_num( + "file_cache_warm_up_inverted_idx_complete_num"); +bvar::Adder g_file_cache_warm_up_inverted_idx_failed_num( + "file_cache_warm_up_inverted_idx_failed_num"); bvar::Adder g_file_cache_warm_up_rowset_complete_num( "file_cache_warm_up_rowset_complete_num"); bvar::Adder g_file_cache_warm_up_rowset_triggered_by_job_num( @@ -443,14 +447,14 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ } // clang-format off }); - self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st); + self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st, 1, 0); if (!st) { LOG_WARNING("add rowset warm up error ").error(st); } }}, }); - auto download_idx_file = [&](const io::Path& idx_path, int64_t idx_size) { + auto download_idx_file = [&, self](const io::Path& idx_path, int64_t idx_size) { io::DownloadFileMeta meta { .path = idx_path, .file_size = idx_size, @@ -460,12 +464,27 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ .expiration_time = expiration_time, .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, }, - .download_done {[](Status st) { + .download_done {[=](Status st) { + DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_idx.callback.block", { + // clang-format on + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for " + "rowset={}, inverted_idx_file={}, " + "sleep={}", + rs->rowset_id().to_string(), + idx_path.string(), sleep_time); + std::this_thread::sleep_for( + std::chrono::seconds(sleep_time)); + // clang-format off + }); + self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st, 0, 1); if (!st) { LOG_WARNING("add rowset warm up error ").error(st); } }}, }; + self->update_rowset_warmup_state_inverted_idx_num_unlocked(rowset_meta->rowset_id(), 1); _engine.file_cache_block_downloader().submit_download_task(std::move(meta)); g_file_cache_cloud_tablet_submitted_index_num << 1; g_file_cache_cloud_tablet_submitted_index_size << idx_size; @@ -1613,6 +1632,19 @@ bool CloudTablet::add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState return add_rowset_warmup_state_unlocked(rowset, state, start_tp); } +void CloudTablet::update_rowset_warmup_state_inverted_idx_num(RowsetId rowset_id, int64_t delta) { + std::lock_guard wlock(_meta_lock); + update_rowset_warmup_state_inverted_idx_num_unlocked(rowset_id, delta); +} + +void CloudTablet::update_rowset_warmup_state_inverted_idx_num_unlocked(RowsetId rowset_id, + int64_t delta) { + if (!_rowset_warm_up_states.contains(rowset_id)) { + return; + } + _rowset_warm_up_states[rowset_id].num_inverted_idx += 
delta; +} + bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state, std::chrono::steady_clock::time_point start_tp) { if (_rowset_warm_up_states.contains(rowset.rowset_id())) { @@ -1628,18 +1660,28 @@ bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, War return true; } -WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Status status) { +WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Status status, + int64_t segment_num, + int64_t inverted_idx_num) { std::lock_guard wlock(_meta_lock); if (!_rowset_warm_up_states.contains(rowset_id)) { return WarmUpState::NONE; } VLOG_DEBUG << "complete rowset segment warmup for rowset " << rowset_id << ", " << status; - g_file_cache_warm_up_segment_complete_num << 1; - if (!status.ok()) { - g_file_cache_warm_up_segment_failed_num << 1; + if (segment_num > 0) { + g_file_cache_warm_up_segment_complete_num << segment_num; + if (!status.ok()) { + g_file_cache_warm_up_segment_failed_num << segment_num; + } + } + if (inverted_idx_num > 0) { + g_file_cache_warm_up_inverted_idx_complete_num << inverted_idx_num; + if (!status.ok()) { + g_file_cache_warm_up_inverted_idx_failed_num << inverted_idx_num; + } } - _rowset_warm_up_states[rowset_id].num_segments--; - if (_rowset_warm_up_states[rowset_id].num_segments <= 0) { + _rowset_warm_up_states[rowset_id].done(segment_num, inverted_idx_num); + if (_rowset_warm_up_states[rowset_id].has_finished()) { g_file_cache_warm_up_rowset_complete_num << 1; add_warmed_up_rowset(rowset_id); auto cost = std::chrono::duration_cast( diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index a76bba04d6f8ab..91b4cb3f64702d 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -308,7 +308,10 @@ class CloudTablet final : public BaseTablet { bool add_rowset_warmup_state( const RowsetMeta& rowset, WarmUpState state, std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); - WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status); + void update_rowset_warmup_state_inverted_idx_num(RowsetId rowset_id, int64_t delta); + void update_rowset_warmup_state_inverted_idx_num_unlocked(RowsetId rowset_id, int64_t delta); + WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status, + int64_t segment_num, int64_t inverted_idx_num); bool is_rowset_warmed_up(const RowsetId& rowset_id) const { std::shared_lock rlock(_warmed_up_rowsets_mutex); @@ -400,8 +403,21 @@ class CloudTablet final : public BaseTablet { // for warm up states management struct RowsetWarmUpInfo { WarmUpState state; - int64_t num_segments; + int64_t num_segments = 0; + int64_t num_inverted_idx = 0; + int64_t num_segments_warmed_up = 0; + int64_t num_inverted_idx_warmed_up = 0; std::chrono::steady_clock::time_point start_tp; + + void done(int64_t num_segments, int64_t num_inverted_idx) { + num_segments_warmed_up += num_segments; + num_inverted_idx_warmed_up += num_inverted_idx; + } + + bool has_finished() const { + return (num_segments_warmed_up >= num_segments) && + (num_inverted_idx_warmed_up >= num_inverted_idx); + } }; std::unordered_map _rowset_warm_up_states; diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 8310d493009293..a92d25772b382d 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -242,8 +242,8 @@ void CloudWarmUpManager::handle_jobs() { [tablet, rs, 
seg_id](Status st) {
                                VLOG_DEBUG << "warmup rowset " << rs->version() << " segment "
                                           << seg_id << " completed";
-                                if (tablet->complete_rowset_segment_warmup(rs->rowset_id(), st) ==
-                                    WarmUpState::DONE) {
+                                if (tablet->complete_rowset_segment_warmup(
+                                            rs->rowset_id(), st, 1, 0) == WarmUpState::DONE) {
                                    VLOG_DEBUG << "warmup rowset " << rs->version() << " completed";
                                }
                            });
@@ -277,8 +277,20 @@ void CloudWarmUpManager::handle_jobs() {
                             }
                         }
                     }
-                    submit_download_tasks(idx_path, file_size, storage_resource.value()->fs,
-                                          expiration_time, wait, true);
+                    tablet->update_rowset_warmup_state_inverted_idx_num(rs->rowset_id(), 1);
+                    submit_download_tasks(
+                            idx_path, file_size, storage_resource.value()->fs,
+                            expiration_time, wait, true, [=](Status st) {
+                                VLOG_DEBUG << "warmup rowset " << rs->version()
+                                           << " segment " << seg_id
+                                           << " inverted idx: " << idx_path << " completed";
+                                if (tablet->complete_rowset_segment_warmup(rs->rowset_id(),
+                                                                           st, 0, 1) ==
+                                    WarmUpState::DONE) {
+                                    VLOG_DEBUG << "warmup rowset " << rs->version()
+                                               << " completed";
+                                }
+                            });
                 }
             } else {
                 if (schema_ptr->has_inverted_index() || schema_ptr->has_ann_index()) {
@@ -286,8 +298,20 @@ void CloudWarmUpManager::handle_jobs() {
                             storage_resource.value()->remote_idx_v2_path(*rs, seg_id);
                     file_size = idx_file_info.has_index_size() ? idx_file_info.index_size() : -1;
-                    submit_download_tasks(idx_path, file_size, storage_resource.value()->fs,
-                                          expiration_time, wait, true);
+                    tablet->update_rowset_warmup_state_inverted_idx_num(rs->rowset_id(), 1);
+                    submit_download_tasks(
+                            idx_path, file_size, storage_resource.value()->fs,
+                            expiration_time, wait, true, [=](Status st) {
+                                VLOG_DEBUG << "warmup rowset " << rs->version()
+                                           << " segment " << seg_id
+                                           << " inverted idx: " << idx_path << " completed";
+                                if (tablet->complete_rowset_segment_warmup(rs->rowset_id(),
+                                                                           st, 0, 1) ==
+                                    WarmUpState::DONE) {
+                                    VLOG_DEBUG << "warmup rowset " << rs->version()
+                                               << " completed";
+                                }
+                            });
                 }
             }
         }
diff --git a/be/test/cloud/cloud_tablet_test.cpp b/be/test/cloud/cloud_tablet_test.cpp
index 5ec3df0417591c..fe9751ff7bfbc9 100644
--- a/be/test/cloud/cloud_tablet_test.cpp
+++ b/be/test/cloud/cloud_tablet_test.cpp
@@ -135,7 +135,8 @@ TEST_F(CloudTabletWarmUpStateTest, TestAddDuplicateRowsetWarmupState) {
 
 TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupNonExistent) {
     auto non_existent_id = _engine.next_rowset_id();
-    WarmUpState result = _tablet->complete_rowset_segment_warmup(non_existent_id, Status::OK());
+    WarmUpState result =
+            _tablet->complete_rowset_segment_warmup(non_existent_id, Status::OK(), 1, 0);
    EXPECT_EQ(result, WarmUpState::NONE);
 }
 
@@ -151,12 +152,12 @@ TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupPartial) {
 
     // Complete one segment, should still be in TRIGGERED_BY_JOB state
     WarmUpState result1 =
-            _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK());
+            _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0);
     EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB);
 
     // Complete second segment, should still be in TRIGGERED_BY_JOB state
     WarmUpState result2 =
-            _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK());
+            _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0);
     EXPECT_EQ(result2, WarmUpState::TRIGGERED_BY_JOB);
 
     // Verify current state is still TRIGGERED_BY_JOB
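// A minimal, self-contained sketch of the accounting that the new
// (segment_num, inverted_idx_num) parameters introduce. `WarmUpProgress` is an
// illustrative stand-in for CloudTablet's RowsetWarmUpInfo, not the real type;
// it only shows why DONE is reached once both counters hit their expected totals.
#include <cstdint>

struct WarmUpProgress {
    int64_t num_segments = 2;     // expected segment downloads for the rowset
    int64_t num_inverted_idx = 1; // expected inverted index downloads
    int64_t segments_done = 0;
    int64_t idx_done = 0;

    void done(int64_t seg, int64_t idx) {
        segments_done += seg;
        idx_done += idx;
    }

    bool finished() const {
        return segments_done >= num_segments && idx_done >= num_inverted_idx;
    }
};

int main() {
    WarmUpProgress p;
    p.done(1, 0);                // first segment callback
    p.done(1, 0);                // second segment callback: still not finished
    p.done(0, 1);                // inverted index callback completes the rowset
    return p.finished() ? 0 : 1; // exits 0: warm-up would transition to DONE
}

@@ -176,12 +177,67 @@
     // Complete first segment
     WarmUpState result1 =
-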
_tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_SYNC_ROWSET); // Complete second segment, should transition to DONE state WarmUpState result2 = - _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result2, WarmUpState::DONE); + + // Verify final state is DONE + WarmUpState final_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(final_state, WarmUpState::DONE); +} + +// Test complete_rowset_segment_warmup with inverted index file, partial completion +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithInvertedIndexPartial) { + auto rowset = create_rowset(Version(6, 6), 1); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + + // Complete one segment file + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); + + // Complete inverted index file, should still be in TRIGGERED_BY_JOB state + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 0, 1); + EXPECT_EQ(result2, WarmUpState::TRIGGERED_BY_JOB); + + // Verify current state is still TRIGGERED_BY_JOB + WarmUpState current_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(current_state, WarmUpState::TRIGGERED_BY_JOB); +} + +// Test complete_rowset_segment_warmup with inverted index file, full completion +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithInvertedIndexFull) { + auto rowset = create_rowset(Version(6, 6), 1); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + + // Complete segment file + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); + + // Complete inverted index file + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 0, 1); EXPECT_EQ(result2, WarmUpState::DONE); // Verify final state is DONE @@ -201,7 +257,8 @@ TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithError) { // Complete with error status, should still transition to DONE when all segments complete Status error_status = Status::InternalError("Test error"); - WarmUpState result = _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), error_status); + WarmUpState result = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), error_status, 1, 0); EXPECT_EQ(result, WarmUpState::DONE); // Verify final state is DONE even with error @@ -235,13 +292,13 @@ TEST_F(CloudTabletWarmUpStateTest, TestMultipleRowsetsWarmupState) { WarmUpState::TRIGGERED_BY_JOB); // Complete rowset1 (2 segments) - 
EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_JOB); - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::DONE); // Complete rowset3 (1 segment) - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset3->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset3->rowset_id(), Status::OK(), 1, 0), WarmUpState::DONE); // Verify states after completion @@ -266,7 +323,8 @@ TEST_F(CloudTabletWarmUpStateTest, TestWarmupStateWithZeroSegments) { EXPECT_EQ(state, WarmUpState::TRIGGERED_BY_JOB); // Any completion call should handle the edge case gracefully - WarmUpState result = _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + WarmUpState result = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); // With 0 segments, the counter should already be 0, so this should transition to DONE EXPECT_EQ(result, WarmUpState::DONE); } @@ -285,11 +343,11 @@ TEST_F(CloudTabletWarmUpStateTest, TestConcurrentWarmupStateAccess) { WarmUpState::TRIGGERED_BY_SYNC_ROWSET)); // Interleaved completion operations - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_JOB); - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset2->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset2->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_SYNC_ROWSET); - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_JOB); // Check states are maintained correctly From 1aa0b8ebc43ec3c352a28f92d82a8c6ad710dba7 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Wed, 10 Sep 2025 18:54:22 +0800 Subject: [PATCH 23/34] t,p --- .../test_warmup_delay_sc_query_tolerance.out | 3 + ...lay_timeout_compaction_query_tolerance.out | 9 + ...up_delay_compaction_query_tolerance.groovy | 4 + ...est_warmup_delay_sc_query_tolerance.groovy | 293 ++++++++++++++++ ..._timeout_compaction_query_tolerance.groovy | 313 ++++++++++++++++++ 5 files changed, 622 insertions(+) create mode 100644 regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out create mode 100644 regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out create mode 100644 regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy create mode 100644 regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out new file mode 100644 index 00000000000000..1ae21eb6bac993 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out @@ -0,0 +1,3 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !cluster2 -- + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out new file mode 100644 index 00000000000000..7cefab58718a8e --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2 -- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 + diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy index 7c1d870d300a56..deea395eef27a7 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy @@ -66,6 +66,7 @@ suite('test_warmup_delay_compaction_query_tolerance', 'docker') { def metrics = new URL(url).text def matcher = metrics =~ ~"${name}\\s+(\\d+)" if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") return matcher[0][1] as long } else { throw new RuntimeException("${name} not found for ${ip}:${port}") @@ -294,6 +295,7 @@ suite('test_warmup_delay_compaction_query_tolerance', 'docker') { def t1 = System.currentTimeMillis() def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // should not contains (9,9) qt_cluster2 """select * from test""" def t2 = System.currentTimeMillis() logger.info("query in cluster2 cost=${t2 - t1} ms") @@ -305,5 +307,7 @@ suite('test_warmup_delay_compaction_query_tolerance', 'docker') { logWarmUpRowsetMetrics(clusterName2) future.get() + assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert 0 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") } } diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy new file mode 100644 index 00000000000000..7cd0475981eca2 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy @@ -0,0 +1,293 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_sc_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', + 'warm_up_rowset_sync_wait_max_timeout_ms=100', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + 
def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER 
${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(5000) + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sleep(5100) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_submitted >= 1 + assert num_finished == num_submitted + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + sql """insert into test values (9, 9)""" + + do_cumu_compaction(src_be, "test", tablet_id, 2, 3) + + // trigger a heavy SC + sql "alter table test modify column col1 varchar(1000);" + + waitForSchemaChangeDone { + sql """ SHOW ALTER TABLE COLUMN WHERE TableName='test' ORDER BY createtime DESC LIMIT 1 """ + time 1000 + } + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + // assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + // assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + + + // trigger a query on read cluster, can't read the SC converted data and new load data + sql """use @${clusterName2}""" + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + qt_cluster2 """select * from test""" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy new file mode 100644 index 00000000000000..f7e5254a6726c5 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy @@ -0,0 +1,313 
@@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=5000', + 'warm_up_rowset_sync_wait_max_timeout_ms=5000', // to cause timeout + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader 
submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return 
getBrpcMetrics(ip, port, name)
+    }
+
+    docker(options) {
+        def clusterName1 = "warmup_source"
+        def clusterName2 = "warmup_target"
+
+        // Add two clusters
+        cluster.addBackend(1, clusterName1)
+        cluster.addBackend(1, clusterName2)
+
+        def tag1 = getCloudBeTagByName(clusterName1)
+        def tag2 = getCloudBeTagByName(clusterName2)
+
+        logger.info("Cluster tag1: {}", tag1)
+        logger.info("Cluster tag2: {}", tag2)
+
+        def jsonSlurper = new JsonSlurper()
+
+        def getJobState = { jobId ->
+            def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}"""
+            return jobStateResult[0][3]
+        }
+
+        // Ensure we are in source cluster
+        sql """use @${clusterName1}"""
+
+        // Start warm up job
+        def jobId_ = sql """
+            WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1}
+            PROPERTIES (
+                "sync_mode" = "event_driven",
+                "sync_event" = "load"
+            )
+        """
+
+        def jobId = jobId_[0][0]
+        logger.info("Warm-up job ID: ${jobId}")
+
+        sql """
+            create table test (
+                col0 int not null,
+                col1 int NOT NULL
+            ) UNIQUE KEY(`col0`)
+            DISTRIBUTED BY HASH(col0) BUCKETS 1
+            PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true");
+        """
+
+        clearFileCacheOnAllBackends()
+        sleep(5000)
+
+        sql """use @${clusterName1}"""
+        // load data
+        sql """insert into test values (1, 1)"""
+        sql """insert into test values (2, 2)"""
+        sql """insert into test values (3, 3)"""
+        sql """insert into test values (4, 4)"""
+        sql """insert into test values (5, 5)"""
+        sql """insert into test values (6, 6)"""
+        sleep(3000)
+
+        def tablets = sql_return_maparray """ show tablets from test; """
+        logger.info("tablets: " + tablets)
+        assertEquals(1, tablets.size())
+        def tablet = tablets[0]
+        String tablet_id = tablet.TabletId
+
+        def be = getBeIpAndPort(clusterName2)
+        def src_be = getBeIpAndPort(clusterName1)
+
+        logFileCacheDownloadMetrics(clusterName2)
+        logWarmUpRowsetMetrics(clusterName2)
+        def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")
+        def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")
+        assert num_submitted >= 6
+        assert num_finished == num_submitted
+
+        // inject a sleep into the read cluster's warm-up of rowsets produced by compaction and load
+        GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10])
+
+        // trigger compaction asynchronously; completion is awaited later via future.get()
+        def future = thread {
+            sql """use @${clusterName1}"""
+            do_cumu_compaction(src_be, "test", tablet_id, 2, 5)
+        }
+        // wait until the warm-up for the compaction output has started
+        waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000)
+        logFileCacheDownloadMetrics(clusterName2)
+        logWarmUpRowsetMetrics(clusterName2)
+        assertEquals(num_submitted + 1, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num"))
+        assertEquals(num_finished, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num"))
+
+
+        // a new insert will trigger the sync rowset operation in the following query
+        sql """insert into test values (9, 9)"""
+
+
+        // at this moment, the compaction has completed but is not yet committed; it is waiting for warm-up.
+        // trigger a query on the read cluster; it must not see the compacted rowset yet
+        sql """use @${clusterName2}"""
+        sql "select * from test"
+        def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id)
+        def rowsets = tablet_status["rowsets"]
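+        // Expected pre-commit layout (derived from the steps above): table creation
+        // holds version 1, the six loads produced rowsets [2-2]..[7-7], and the
+        // (9,9) insert added [8-8]; the compacted rowset [2-5] must not become
+        // visible until the warm-up gate lets the compaction commit.
+        assert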
rowsets[1].contains("[2-2]") + assert rowsets[2].contains("[3-3]") + assert rowsets[3].contains("[4-4]") + assert rowsets[4].contains("[5-5]") + assert rowsets[5].contains("[6-6]") + assert rowsets[6].contains("[7-7]") + assert rowsets[7].contains("[8-8]") + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // should not contains (9,9) + qt_cluster2 """select * from test""" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + future.get() + assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + } +} From e256beee72426b9f1ff0a3fa1037f92e922c970b Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Wed, 10 Sep 2025 20:10:10 +0800 Subject: [PATCH 24/34] print stack --- be/src/common/config.cpp | 2 ++ be/src/common/config.h | 2 ++ be/src/io/cache/cached_remote_file_reader.cpp | 5 +++++ 3 files changed, 9 insertions(+) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index ba21728d916caf..199cc34519071c 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1592,6 +1592,8 @@ DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true"); DEFINE_mBool(enable_wal_tde, "false"); +DEFINE_mBool(print_stack_when_cache_miss, "true"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 959b46c9747c12..6ae2a490a052c9 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1646,6 +1646,8 @@ DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction); DECLARE_mBool(enable_wal_tde); +DECLARE_mBool(print_stack_when_cache_miss); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index 5d4d9cc556724a..66eb5f523cc346 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -151,6 +151,11 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* ReadStatistics stats; stats.bytes_read += bytes_req; auto defer_func = [&](int*) { + if (config::print_stack_when_cache_miss) { + if (io_ctx->file_cache_stats == nullptr && !stats.hit_cache) { + LOG_INFO("[verbose] {}", Status::InternalError("not hit cache")); + } + } if (io_ctx->file_cache_stats && !is_dryrun) { // update stats in io_ctx, for query profile _update_stats(stats, io_ctx->file_cache_stats, io_ctx->is_inverted_index); From 29f872fb44871b306ca95e7d82b7ae5b704d0cf8 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Thu, 11 Sep 2025 17:34:34 +0800 Subject: [PATCH 25/34] tmp --- be/src/cloud/cloud_tablet.cpp | 24 +- be/src/cloud/cloud_tablet.h | 19 + be/src/common/config.cpp | 2 + 
be/src/common/config.h | 2 + be/src/olap/base_tablet.h | 5 + be/src/olap/version_graph.cpp | 18 + be/src/olap/version_graph.h | 4 + .../test_warmup_delay_idx_query_tolerance.out | 23 ++ ...st_warmup_delay_idx_query_tolerance.groovy | 327 ++++++++++++++++++ 9 files changed, 422 insertions(+), 2 deletions(-) create mode 100644 regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out create mode 100644 regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 8896d92c7b4539..0a0703ca93d8df 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -271,8 +271,28 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( // skip rowset[0-1] return false; } - return rs_meta->start_version() > path_max_version && - rs_meta->visible_timestamp() < freshness_limit_tp; + bool ret = rs_meta->start_version() > path_max_version && + rs_meta->visible_timestamp() < freshness_limit_tp; + if (ret && config::read_cluster_cache_opt_verbose_log) { + std::time_t t1 = system_clock::to_time_t(rs_meta->visible_timestamp()); + std::tm tm1 = *std::localtime(&t1); + std::ostringstream oss1; + oss1 << std::put_time(&tm1, "%Y-%m-%d %H:%M:%S"); + + std::time_t t2 = system_clock::to_time_t(freshness_limit_tp); + std::tm tm2 = *std::localtime(&t2); + std::ostringstream oss2; + oss2 << std::put_time(&tm2, "%Y-%m-%d %H:%M:%S"); + LOG_INFO( + "[verbose] CloudTablet::capture_rs_readers_with_freshness_tolerance, " + "find a rowset which should be visible but not warmed up, tablet_id={}, " + "path_max_version={}, rowset_id={}, version={}, visible_time={}, " + "freshness_limit={}, version_graph={}, rowset_warmup_digest={}", + tablet_id(), path_max_version, rs_meta->rowset_id().to_string(), + rs_meta->version().to_string(), oss1.str(), oss2.str(), + _timestamped_version_tracker.debug_string(), rowset_warmup_digest()); + } + return ret; }; // use std::views::concat after C++26 bool should_fallback = std::ranges::any_of(_tablet_meta->all_rs_metas(), diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 91b4cb3f64702d..68948aa55348c5 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -330,6 +330,25 @@ class CloudTablet final : public BaseTablet { _warmed_up_rowsets.erase(rowset_id); } + std::string rowset_warmup_digest() { + std::string res; + auto add_log = [&](const RowsetSharedPtr& rs) { + auto tmp = fmt::format("{}{}", rs->rowset_id().to_string(), rs->version().to_string()); + if (_rowset_warm_up_states.contains(rs->rowset_id())) { + tmp += fmt::format( + ", state={}, segments_warmed_up={}/{}, inverted_idx_warmed_up={}/{}", + _rowset_warm_up_states.at(rs->rowset_id()).state, + _rowset_warm_up_states.at(rs->rowset_id()).num_segments_warmed_up, + _rowset_warm_up_states.at(rs->rowset_id()).num_segments, + _rowset_warm_up_states.at(rs->rowset_id()).num_inverted_idx_warmed_up, + _rowset_warm_up_states.at(rs->rowset_id()).num_inverted_idx); + } + res += fmt::format("[{}],", tmp); + }; + traverse_rowsets_unlocked(add_log, true); + return res; + } + private: // FIXME(plat1ko): No need to record base size if rowsets are ordered by version void update_base_size(const Rowset& rs); diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 199cc34519071c..a0dc069972702a 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1594,6 +1594,8 @@ 
DEFINE_mBool(enable_wal_tde, "false"); DEFINE_mBool(print_stack_when_cache_miss, "true"); +DEFINE_mBool(read_cluster_cache_opt_verbose_log, "true"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 6ae2a490a052c9..3c801e94d43e17 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1648,6 +1648,8 @@ DECLARE_mBool(enable_wal_tde); DECLARE_mBool(print_stack_when_cache_miss); +DECLARE_mBool(read_cluster_cache_opt_verbose_log); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index 0044eb444bd3d5..6f79ebd5758c83 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -306,6 +306,11 @@ class BaseTablet : public std::enable_shared_from_this { void traverse_rowsets(std::function visitor, bool include_stale = false) { std::shared_lock rlock(_meta_lock); + traverse_rowsets_unlocked(visitor, include_stale); + } + + void traverse_rowsets_unlocked(std::function visitor, + bool include_stale = false) { for (auto& [v, rs] : _rs_version_map) { visitor(rs); } diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp index 711746528cd53e..49cf73e22a7ac1 100644 --- a/be/src/olap/version_graph.cpp +++ b/be/src/olap/version_graph.cpp @@ -435,6 +435,10 @@ double TimestampedVersionTracker::get_orphan_vertex_ratio() { return _version_graph.get_orphan_vertex_ratio(); } +std::string TimestampedVersionTracker::debug_string() const { + return _version_graph.debug_string(); +} + void TimestampedVersionPathContainer::add_timestamped_version(TimestampedVersionSharedPtr version) { // Compare and refresh `_max_create_time`. if (version->get_create_time() > _max_create_time) { @@ -834,5 +838,19 @@ double VersionGraph::get_orphan_vertex_ratio() { return static_cast(orphan_vertex_num) / static_cast(vertex_num); } +std::string VersionGraph::debug_string() const { + std::stringstream ss; + ss << "VersionGraph: ["; + for (size_t i = 0; i < _version_graph.size(); ++i) { + ss << "{value: " << _version_graph[i].value << ", edges: ["; + for (const auto& edge : _version_graph[i].edges) { + ss << _version_graph[edge].value << ", "; + } + ss << "]}, "; + } + ss << "]"; + return ss.str(); +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h index a845eaffaa4706..6a15a6ad0e0158 100644 --- a/be/src/olap/version_graph.h +++ b/be/src/olap/version_graph.h @@ -78,6 +78,8 @@ class VersionGraph { // See comment of TimestampedVersionTracker's get_orphan_vertex_ratio(); double get_orphan_vertex_ratio(); + std::string debug_string() const; + private: /// Private method add a version to graph. void _add_vertex_to_graph(int64_t vertex_value); @@ -232,6 +234,8 @@ class TimestampedVersionTracker { // If a vertex is no longer the starting point of any edge, then this vertex is defined as orphan vertex double get_orphan_vertex_ratio(); + std::string debug_string() const; + private: /// Construct rowsets version tracker with main path rowset meta. 
void _construct_versioned_tracker(const std::vector& rs_metas); diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out new file mode 100644 index 00000000000000..8191de7859e2eb --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2_0 -- +3 3 +4 4 +5 5 +6 6 + +-- !cluster2_1 -- +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 +9 9 0 8 + +-- !cluster2_2 -- +1 1 0 2 +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 + diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy new file mode 100644 index 00000000000000..0bb18cfdba9992 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy @@ -0,0 +1,327 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
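+
+// Scenario sketch (a summary of the steps below, not extra coverage): warm-up of
+// inverted index files on the read cluster is delayed via a debug point while a
+// compaction on the source cluster waits for it; a query that sets
+// query_freshness_tolerance_ms = 5000 should then be served from the already
+// warmed-up version path instead of waiting on cold index files, and the
+// capture_with_freshness_tolerance fallback counter should stay unchanged.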
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.NodeType
+import groovy.json.JsonSlurper
+
+suite('test_warmup_delay_idx_query_tolerance', 'docker') {
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_tablet_rebalancer_interval_second=1',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+        'warm_up_rowset_slow_log_ms=1',
+        'enable_compaction_delay_commit_for_warm_up=true',
+        'warm_up_rowset_sync_wait_min_timeout_ms=100', // 100ms, to cause the sync wait to time out
+        'warm_up_rowset_sync_wait_max_timeout_ms=100',
+    ]
+    options.enableDebugPoints()
+    options.cloudMode = true
+
+    def clearFileCache = {ip, port ->
+        def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true"
+        def response = new URL(url).text
+        def json = new JsonSlurper().parseText(response)
+
+        // Check the status
+        if (json.status != "OK") {
+            throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}")
+        }
+    }
+
+    def clearFileCacheOnAllBackends = {
+        def backends = sql """SHOW BACKENDS"""
+
+        for (be in backends) {
+            def ip = be[1]
+            def port = be[4]
+            clearFileCache(ip, port)
+        }
+
+        // clearing the file cache is async; wait until it is done
+        sleep(5000)
+    }
+
+    def getBrpcMetrics = {ip, port, name ->
+        def url = "http://${ip}:${port}/brpc_metrics"
+        def metrics = new URL(url).text
+        def matcher = metrics =~ ~"${name}\\s+(\\d+)"
+        if (matcher.find()) {
+            logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}")
+            return matcher[0][1] as long
+        } else {
+            throw new RuntimeException("${name} not found for ${ip}:${port}")
+        }
+    }
+
+    def getBeIpAndPort = { cluster ->
+        def backends = sql """SHOW BACKENDS"""
+        def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") }
+
+        if (cluster_bes.isEmpty()) {
+            throw new RuntimeException("No BE found for cluster: ${cluster}")
+        }
+
+        def firstBe = cluster_bes[0]
+        return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]]
+    }
+
+    def logFileCacheDownloadMetrics = { cluster ->
+        def backends = sql """SHOW BACKENDS"""
+        def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") }
+        for (be in cluster_bes) {
+            def ip = be[1]
+            def port = be[5]
+            def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num")
+            def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num")
+            def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num")
+            logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" +
+                    ", finished=${finished}, failed=${failed}")
+        }
+    }
+
+    def logWarmUpRowsetMetrics = { cluster ->
+        def backends = sql """SHOW BACKENDS"""
+        def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") }
+        for (be in cluster_bes) {
+            def ip = be[1]
+            def port = be[5]
+            def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num")
+            def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num")
+            def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num")
+            def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num")
+            def finished_index = getBrpcMetrics(ip, port,
"file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = 
"event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL, + INDEX idx1(col1) USING INVERTED + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test(col0,col1,__DORIS_DELETE_SIGN__) values (1, 2, 1)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_idx_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_index_num") + def num_idx_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + assert num_idx_submitted >= 6 + assert num_idx_finished == num_idx_submitted + + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", [sleep:10]) + + // trigger and wait compaction async + def future = thread { + sql """use @${clusterName1}""" + do_cumu_compaction(src_be, "test", tablet_id, 2, 5) + } + sleep(500) + // wait until the warmup for compaction started + waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + assert num_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + assert num_finished + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_idx_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_index_num") + assert num_idx_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + + // trigger a query on read cluster without query tolerance, read the origin data + sql """use @${clusterName2}""" + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2_1 "select * from test order by 
col0, __DORIS_VERSION_COL__;"
+    def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id)
+    def rowsets = tablet_status["rowsets"]
+    assert rowsets[1].contains("[2-5]")
+    assert rowsets[2].contains("[6-6]")
+    assert rowsets[3].contains("[7-7]")
+    assert rowsets[4].contains("[8-8]")
+
+    sql "set enable_profile=true;"
+    sql "set profile_level=2;"
+
+    // trigger a query on read cluster with query freshness tolerance; it should read along the warmed-up path and skip rowsets that are not warmed up yet
+    sql "set query_freshness_tolerance_ms = 5000"
+    def t1 = System.currentTimeMillis()
+    def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count")
+    def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count")
+    // should not contain (9, 9)
+    sql "set skip_delete_sign=true;"
+    sql "set show_hidden_columns=true;"
+    sql "set skip_storage_engine_merge=true;"
+    qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;"
+    def t2 = System.currentTimeMillis()
+    logger.info("query in cluster2 cost=${t2 - t1} ms")
+    assert t2 - t1 < 3000
+    assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1
+    assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount
+
+    logFileCacheDownloadMetrics(clusterName2)
+    logWarmUpRowsetMetrics(clusterName2)
+
+    sleep(10000)
+    // assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")
+    assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num")
+    }
+}

From b9b3ddac8c199bb71626df155b387ce868382991 Mon Sep 17 00:00:00 2001
From: bobhan1 
Date: Fri, 12 Sep 2025 10:24:35 +0800
Subject: [PATCH 26/34] update

---
 .../test_enable_prefer_cached_rowset.out      | 29 +++++-----
 .../test_query_freshness_tolerance.out        | 20 +++----
 .../test_warmup_delay_sc_query_tolerance.out  | 10 +++-
 ...lay_timeout_compaction_query_tolerance.out | 20 ++++++-
 .../test_enable_prefer_cached_rowset.groovy   | 21 ++++---
 .../test_query_freshness_tolerance.groovy     | 19 ++++---
 ...est_warmup_delay_sc_query_tolerance.groovy | 29 +++++++---
 ..._timeout_compaction_query_tolerance.groovy | 56 ++++++++++++------
 8 files changed, 130 insertions(+), 74 deletions(-)

diff --git a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out
index c9382a70217ba3..04cf3be33e8192 100644
--- a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out
+++ b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out
@@ -1,31 +1,32 @@
 -- This file is automatically generated. 
You should know what you did if you want to edit this -- !cluster1 -- -1 {"a":1} -2 {"a":111.1111} 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} --- !cluster2 -- -1 {"a":1} -2 {"a":111.1111} +-- !cluster2_0 -- 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} -- !cluster1_new_data -- -1 {"a":1} -2 {"a":111.1111} 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} 6 {"a":1111.11111} --- !cluster2 -- -1 {"a":1} -2 {"a":111.1111} -3 {"a":"11111"} -4 {"a":1111111111} -5 {"a":1111.11111} -6 {"a":1111.11111} +-- !cluster2_1 -- +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 +6 {"a":1111.11111} 0 7 + +-- !cluster2_2 -- +1 {"a":1} 0 2 +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 +6 {"a":1111.11111} 0 7 diff --git a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out index 4441657b9aa704..b99240d21e24ff 100644 --- a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out +++ b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out @@ -1,30 +1,24 @@ -- This file is automatically generated. You should know what you did if you want to edit this -- !cluster1 -- -1 {"a":1} -2 {"a":111.1111} 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} --- !cluster2 -- -1 {"a":1} -2 {"a":111.1111} +-- !cluster2_0 -- 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} -- !cluster1_new_data -- -1 {"a":1} -2 {"a":111.1111} 3 {"a":"11111"} 4 {"a":1111111111} 5 {"a":1111.11111} 6 {"a":1111.11111} --- !cluster2 -- -1 {"a":1} -2 {"a":111.1111} -3 {"a":"11111"} -4 {"a":1111111111} -5 {"a":1111.11111} +-- !cluster2_1 -- +1 {"a":1} 0 2 +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out index 1ae21eb6bac993..e7fffb3bcb7ec1 100644 --- a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out @@ -1,3 +1,11 @@ -- This file is automatically generated. You should know what you did if you want to edit this --- !cluster2 -- +-- !cluster2_0 -- +2 2 + +-- !cluster2_1 -- +1 \N 1 3 +2 2 0 2 +9 9 0 4 + +-- !cluster2_2 -- diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out index 7cefab58718a8e..b915a1b6c9449a 100644 --- a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out @@ -1,9 +1,23 @@ -- This file is automatically generated. 
You should know what you did if you want to edit this --- !cluster2 -- -1 1 -2 2 +-- !cluster2_0 -- 3 3 4 4 5 5 6 6 +-- !cluster2_1 -- +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 +9 9 0 8 + +-- !cluster2 -- +1 1 0 2 +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 + diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy index b0d6795d5c5fab..34ca1d7e8a4b4e 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy @@ -122,17 +122,17 @@ suite('test_enable_prefer_cached_rowset', 'docker') { sql """ create table test ( col0 int not null, - col1 variant NOT NULL - ) DUPLICATE KEY(`col0`) + col1 variant NULL + ) UNIQUE KEY(`col0`) DISTRIBUTED BY HASH(col0) BUCKETS 1 - PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); """ clearFileCacheOnAllBackends() - sleep(2000) sql """insert into test values (1, '{"a" : 1.0}')""" - sql """insert into test values (2, '{"a" : 111.1111}')""" + sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);""" sql """insert into test values (3, '{"a" : "11111"}')""" sql """insert into test values (4, '{"a" : 1111111111}')""" sql """insert into test values (5, '{"a" : 1111.11111}')""" @@ -142,7 +142,7 @@ suite('test_enable_prefer_cached_rowset', 'docker') { // switch to read cluster, trigger a sync rowset sql """use @${clusterName2}""" - qt_cluster2 """select * from test""" + qt_cluster2_0 """select * from test""" // switch to source cluster and trigger compaction sql """use @${clusterName1}""" @@ -158,11 +158,18 @@ suite('test_enable_prefer_cached_rowset', 'docker') { sql "set enable_profile=true;" sql "set profile_level=2;" + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + // when enable_prefer_cached_rowset = false, need to read all data including compaction rowsets + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql "set enable_prefer_cached_rowset = true" // when enable_prefer_cached_rowset = true, only need to read newly load data, compaction rowsets data will be skipped def t1 = System.currentTimeMillis() def capturePreferCacheCount = getBrpcMetricsByCluster(clusterName2, "capture_prefer_cache_count") - qt_cluster2 """select * from test""" + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" def t2 = System.currentTimeMillis() logger.info("query in cluster2 cost=${t2 - t1} ms") assert t2 - t1 < 2000 diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy index 13e9ba837344ca..215e588137ed8e 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy @@ -124,17 +124,17 @@ suite('test_query_freshness_tolerance', 'docker') { sql """ create table test ( col0 int not null, - col1 variant NOT NULL - ) DUPLICATE KEY(`col0`) + col1 variant NULL + ) 
UNIQUE KEY(`col0`)
     DISTRIBUTED BY HASH(col0) BUCKETS 1
-    PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true");
+    PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true",
+    "enable_unique_key_merge_on_write" = "false");
     """
 
     clearFileCacheOnAllBackends()
-    sleep(2000)
 
     sql """insert into test values (1, '{"a" : 1.0}')"""
-    sql """insert into test values (2, '{"a" : 111.1111}')"""
+    sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);"""
     sql """insert into test values (3, '{"a" : "11111"}')"""
     sql """insert into test values (4, '{"a" : 1111111111}')"""
     sql """insert into test values (5, '{"a" : 1111.11111}')"""
@@ -144,7 +144,7 @@
 
     // switch to read cluster, trigger a sync rowset
     sql """use @${clusterName2}"""
-    qt_cluster2 """select * from test"""
+    qt_cluster2_0 """select * from test"""
 
     // sleep for 5s to let these rowsets meet the requirement of query freshness tolerance
     sleep(5000)
@@ -163,11 +163,16 @@
     sql "set enable_profile=true;"
     sql "set profile_level=2;"
 
+    sql "set skip_delete_sign=true;"
+    sql "set show_hidden_columns=true;"
+    sql "set skip_storage_engine_merge=true;"
+
     sql "set query_freshness_tolerance_ms = 5000"
     def t1 = System.currentTimeMillis()
     def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count")
     def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count")
-    qt_cluster2 """select * from test"""
+    // when query_freshness_tolerance_ms is set, newly loaded data and compaction rowset data will be skipped
+    qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;"
     def t2 = System.currentTimeMillis()
     logger.info("query in cluster2 cost=${t2 - t1} ms")
     assert t2 - t1 < 3000
diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy
index 7cd0475981eca2..f047d81204a5e0 100644
--- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy
+++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy
@@ -58,7 +58,7 @@ suite('test_warmup_delay_sc_query_tolerance', 'docker') {
     }
 
     // clear file cache is async, wait it done
-    sleep(5000)
+    sleep(1000)
 }
 
 def getBrpcMetrics = {ip, port, name ->
@@ -219,19 +219,20 @@
     sql """
     create table test (
         col0 int not null,
-        col1 int NOT NULL
+        col1 int NULL
     ) UNIQUE KEY(`col0`)
     DISTRIBUTED BY HASH(col0) BUCKETS 1
-    PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true");
+    PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true",
+    "enable_unique_key_merge_on_write" = "false");
     """
 
     clearFileCacheOnAllBackends()
-    sleep(5000)
 
     sql """use @${clusterName1}"""
     // load data
-    sql """insert into test values (1, 1)"""
-    sleep(5100)
+    sql """insert into test values (1, 1),(2,2);"""
+    sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);"""
+    sleep(5000)
 
     def tablets = sql_return_maparray """ show tablets from test; """
     logger.info("tablets: " + tablets)
@@ -249,12 +250,17 @@
     assert num_submitted >= 1
     assert num_finished == num_submitted
 
+    sql 
"""use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + // inject sleep when read cluster warm up rowset for compaction and load GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) sql """insert into test values (9, 9)""" - do_cumu_compaction(src_be, "test", tablet_id, 2, 3) + do_cumu_compaction(src_be, "test", tablet_id, 2, 4) // trigger a heavy SC sql "alter table test modify column col1 varchar(1000);" @@ -270,17 +276,22 @@ suite('test_warmup_delay_sc_query_tolerance', 'docker') { // assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") - // trigger a query on read cluster, can't read the SC converted data and new load data sql """use @${clusterName2}""" sql "set enable_profile=true;" sql "set profile_level=2;" + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql "set query_freshness_tolerance_ms = 5000" def t1 = System.currentTimeMillis() def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") - qt_cluster2 """select * from test""" + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" def t2 = System.currentTimeMillis() logger.info("query in cluster2 cost=${t2 - t1} ms") assert t2 - t1 < 3000 diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy index f7e5254a6726c5..7cb580c4816bc2 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy @@ -31,8 +31,9 @@ suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { 'file_cache_background_monitor_interval_ms=1000', 'warm_up_rowset_slow_log_ms=1', 'enable_compaction_delay_commit_for_warm_up=true', - 'warm_up_rowset_sync_wait_min_timeout_ms=5000', - 'warm_up_rowset_sync_wait_max_timeout_ms=5000', // to cause timeout + 'read_cluster_cache_opt_verbose_log=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', + 'warm_up_rowset_sync_wait_max_timeout_ms=100', // to cause timeout ] options.enableDebugPoints() options.cloudMode = true @@ -58,7 +59,7 @@ suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { } // clear file cache is async, wait it done - sleep(5000) + sleep(2000) } def getBrpcMetrics = {ip, port, name -> @@ -222,21 +223,21 @@ suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { col1 int NOT NULL ) UNIQUE KEY(`col0`) DISTRIBUTED BY HASH(col0) BUCKETS 1 - PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); """ clearFileCacheOnAllBackends() - sleep(5000) sql """use @${clusterName1}""" // load data sql 
"""insert into test values (1, 1)""" - sql """insert into test values (2, 2)""" + sql """insert into test(col0,col1,__DORIS_DELETE_SIGN__) values (1, 2, 1)""" sql """insert into test values (3, 3)""" sql """insert into test values (4, 4)""" sql """insert into test values (5, 5)""" sql """insert into test values (6, 6)""" - sleep(3000) + sleep(5000) def tablets = sql_return_maparray """ show tablets from test; """ logger.info("tablets: " + tablets) @@ -254,6 +255,11 @@ suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { assert num_submitted >= 6 assert num_finished == num_submitted + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + // inject sleep when read cluster warm up rowset for compaction and load GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) @@ -266,37 +272,47 @@ suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) logFileCacheDownloadMetrics(clusterName2) logWarmUpRowsetMetrics(clusterName2) - assertEquals(num_submitted + 1, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")) - assertEquals(num_finished, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")) + assert num_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") // a new insert will trigger the sync rowset operation in the following query sql """insert into test values (9, 9)""" + sleep(500) + assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") - // in this moment, compaction has completed, but not commited, it's waiting for warm up - // trigger a query on read cluster, can't read the compaction data + // trigger a query on read cluster without query tolerance, read the origin data sql """use @${clusterName2}""" - sql "select * from test" + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id) def rowsets = tablet_status ["rowsets"] - assert rowsets[1].contains("[2-2]") - assert rowsets[2].contains("[3-3]") - assert rowsets[3].contains("[4-4]") - assert rowsets[4].contains("[5-5]") - assert rowsets[5].contains("[6-6]") - assert rowsets[6].contains("[7-7]") - assert rowsets[7].contains("[8-8]") + assert rowsets[1].contains("[2-5]") + assert rowsets[2].contains("[6-6]") + assert rowsets[3].contains("[7-7]") + assert rowsets[4].contains("[8-8]") + + // this query will trigger sync_rowsets, due to compaction cnts changes, version_overlap will be true, so that compaction rowset + // and new load rowset's warmup task will be triggered. 
However, these rowsets' warmup tasks have already been triggered by passive warmup,
+    // so we check that they are not triggered again
+    assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")
 
     sql "set enable_profile=true;"
     sql "set profile_level=2;"
 
+    // trigger a query on read cluster with query freshness tolerance; it should read along the warmed-up stale path and skip rowsets that are not warmed up yet
     sql "set query_freshness_tolerance_ms = 5000"
     def t1 = System.currentTimeMillis()
     def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count")
     def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count")
     // should not contains (9,9)
-    qt_cluster2 """select * from test"""
+    sql "set skip_delete_sign=true;"
+    sql "set show_hidden_columns=true;"
+    sql "set skip_storage_engine_merge=true;"
+    qt_cluster2 "select * from test order by col0, __DORIS_VERSION_COL__;"
     def t2 = System.currentTimeMillis()
     logger.info("query in cluster2 cost=${t2 - t1} ms")
     assert t2 - t1 < 3000

From 9e315ab2d4d2f7e22eb6b0f907f3bf80729ff3b1 Mon Sep 17 00:00:00 2001
From: bobhan1 
Date: Fri, 12 Sep 2025 14:12:40 +0800
Subject: [PATCH 27/34] update

---
 be/src/cloud/cloud_internal_service.cpp       |   6 +
 be/src/cloud/cloud_tablet.cpp                 |  21 +-
 be/src/cloud/cloud_tablet.h                   |  17 +-
 be/src/common/config.cpp                      |   4 +-
 be/src/olap/version_graph.cpp                 |   4 +-
 .../cloud_tablet_query_prefer_cache_test.cpp  |   3 +-
 ...cloud_tablet_query_with_tolerance_test.cpp |   3 +-
 .../warmup/test_warmup_download_fail.out      |  10 +
 ...up_delay_compaction_query_tolerance.groovy |   2 +-
 .../warmup/test_warmup_download_fail.groovy   | 251 ++++++++++++++++++
 10 files changed, 294 insertions(+), 27 deletions(-)
 create mode 100644 regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out
 create mode 100644 regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy

diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp
index 53470088566992..fe5571d7c24e75 100644
--- a/be/src/cloud/cloud_internal_service.cpp
+++ b/be/src/cloud/cloud_internal_service.cpp
@@ -243,6 +243,12 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c
                     rowset_id.to_string(), version.to_string(), sleep_time);
             std::this_thread::sleep_for(std::chrono::seconds(sleep_time));
         });
+        DBUG_EXECUTE_IF(
+                "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error", {
+                    st = Status::InternalError("injected error");
+                    LOG_INFO("[verbose] inject error, tablet={}, rowset={}, st={}",
+                             tablet_id, rowset_id.to_string(), st.to_string());
+                });
         if (st.ok()) {
             g_file_cache_event_driven_warm_up_finished_segment_num << 1;
             g_file_cache_event_driven_warm_up_finished_segment_size << segment_size;
diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index 0a0703ca93d8df..b6002777999d40 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -644,7 +644,6 @@ void CloudTablet::delete_rowsets(const std::vector<RowsetSharedPtr>& to_delete,
     _timestamped_version_tracker.add_stale_path_version(rs_metas);
     for (auto&& rs : to_delete) {
         _rs_version_map.erase(rs->version());
-        _rowset_warm_up_states.erase(rs->rowset_id());
     }
 
     _tablet_meta->modify_rs_metas({}, rs_metas, false);
@@ -712,10 +711,6 @@ uint64_t CloudTablet::delete_expired_stale_rowsets() {
         auto& manager = ExecEnv::GetInstance()->storage_engine().to_cloud().cloud_warm_up_manager();
         manager.recycle_cache(tablet_id(), 
recycled_rowsets); } - // these rowsets will not be choosen for query any more, so don't need to maintain if they are warmed up - for (const auto& rs : expired_rowsets) { - remove_warmed_up_rowset(rs->rowset_id()); - } if (config::enable_mow_verbose_log) { LOG_INFO("finish delete_expired_stale_rowset for tablet={}", tablet_id()); } @@ -777,6 +772,7 @@ void CloudTablet::remove_unused_rowsets() { continue; } tablet_meta()->remove_rowset_delete_bitmap(rs->rowset_id(), rs->version()); + _rowset_warm_up_states.erase(rs->rowset_id()); rs->clear_cache(); g_unused_rowsets_count << -1; g_unused_rowsets_bytes << -rs->total_disk_size(); @@ -1703,7 +1699,6 @@ WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Stat _rowset_warm_up_states[rowset_id].done(segment_num, inverted_idx_num); if (_rowset_warm_up_states[rowset_id].has_finished()) { g_file_cache_warm_up_rowset_complete_num << 1; - add_warmed_up_rowset(rowset_id); auto cost = std::chrono::duration_cast( std::chrono::steady_clock::now() - _rowset_warm_up_states[rowset_id].start_tp) @@ -1714,5 +1709,19 @@ WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Stat return _rowset_warm_up_states[rowset_id].state; } +bool CloudTablet::is_rowset_warmed_up(const RowsetId& rowset_id) const { + auto it = _rowset_warm_up_states.find(rowset_id); + if (it == _rowset_warm_up_states.end()) { + return false; + } + return it->second.state == WarmUpState::DONE; +} + +void CloudTablet::add_warmed_up_rowset(const RowsetId& rowset_id) { + _rowset_warm_up_states[rowset_id] = {.state = WarmUpState::DONE, + .num_segments = 1, + .start_tp = std::chrono::steady_clock::now()}; +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 68948aa55348c5..53a8869b6daaa0 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -313,22 +313,9 @@ class CloudTablet final : public BaseTablet { WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status, int64_t segment_num, int64_t inverted_idx_num); - bool is_rowset_warmed_up(const RowsetId& rowset_id) const { - std::shared_lock rlock(_warmed_up_rowsets_mutex); - return _warmed_up_rowsets.contains(rowset_id); - } - - // mark a rowset that it has been warmed up - // must be called when file cache donwload task on this rowset is done - void add_warmed_up_rowset(const RowsetId& rowset_id) { - std::unique_lock wlock(_warmed_up_rowsets_mutex); - _warmed_up_rowsets.insert(rowset_id); - } + bool is_rowset_warmed_up(const RowsetId& rowset_id) const; - void remove_warmed_up_rowset(const RowsetId& rowset_id) { - std::unique_lock wlock(_warmed_up_rowsets_mutex); - _warmed_up_rowsets.erase(rowset_id); - } + void add_warmed_up_rowset(const RowsetId& rowset_id); std::string rowset_warmup_digest() { std::string res; diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index a0dc069972702a..d9d9d8e47c9760 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1592,9 +1592,9 @@ DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true"); DEFINE_mBool(enable_wal_tde, "false"); -DEFINE_mBool(print_stack_when_cache_miss, "true"); +DEFINE_mBool(print_stack_when_cache_miss, "false"); -DEFINE_mBool(read_cluster_cache_opt_verbose_log, "true"); +DEFINE_mBool(read_cluster_cache_opt_verbose_log, "false"); // clang-format off #ifdef BE_TEST diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp index 49cf73e22a7ac1..1894e0953d3c4f 
100644 --- a/be/src/olap/version_graph.cpp +++ b/be/src/olap/version_graph.cpp @@ -844,7 +844,9 @@ std::string VersionGraph::debug_string() const { for (size_t i = 0; i < _version_graph.size(); ++i) { ss << "{value: " << _version_graph[i].value << ", edges: ["; for (const auto& edge : _version_graph[i].edges) { - ss << _version_graph[edge].value << ", "; + if (_version_graph[edge].value > _version_graph[i].value) { + ss << _version_graph[edge].value << ", "; + } } ss << "]}, "; } diff --git a/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp index b30c3cc4a75281..8d4d5a37bf7a49 100644 --- a/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp +++ b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp @@ -42,11 +42,12 @@ class TestQueryPreferCache : public testing::Test { TestQueryPreferCache() : _engine(CloudStorageEngine(EngineOptions {})) {} void SetUp() override { + config::read_cluster_cache_opt_verbose_log = true; _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, TCompressionType::LZ4F)); } - void TearDown() override {} + void TearDown() override { config::read_cluster_cache_opt_verbose_log = false; } RowsetSharedPtr create_rowset_without_visible_time(Version version) { auto rs_meta = std::make_shared(); diff --git a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp index 61c4d84143ed4e..5fe5c2e51bcc50 100644 --- a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp +++ b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp @@ -42,11 +42,12 @@ class TestFreshnessTolerance : public testing::Test { TestFreshnessTolerance() : _engine(CloudStorageEngine(EngineOptions {})) {} void SetUp() override { + config::read_cluster_cache_opt_verbose_log = true; _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, TCompressionType::LZ4F)); } - void TearDown() override {} + void TearDown() override { config::read_cluster_cache_opt_verbose_log = false; } RowsetSharedPtr create_rowset_without_visible_time(Version version) { auto rs_meta = std::make_shared(); diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out new file mode 100644 index 00000000000000..59c500c665d402 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out @@ -0,0 +1,10 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !cluster2 -- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +9 9 + diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy index deea395eef27a7..c70340fa3f51dd 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy @@ -58,7 +58,7 @@ suite('test_warmup_delay_compaction_query_tolerance', 'docker') { } // clear file cache is async, wait it done - sleep(5000) + sleep(1000) } def getBrpcMetrics = {ip, port, name -> diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy new file mode 100644 index 00000000000000..0283025e9e76f2 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy @@ -0,0 +1,251 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_download_fail', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=20000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(1000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def 
failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(1000) + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test values (2, 2)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_failed = 
getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + assert num_failed == 0 + + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error") + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + sleep(1000) + + assert num_failed + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num") + + + sql """use @${clusterName2}""" + sql "set enable_profile=true;" + sql "set profile_level=2;" + + // although download failed, the query should still read the newly inserted data + sql "set query_freshness_tolerance_ms = 5000" + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + qt_cluster2 """select * from test""" + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + } +} From 1dd24066df871911b7100253db53bd28ed4a053e Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 12 Sep 2025 15:14:44 +0800 Subject: [PATCH 28/34] add metrics exclude warmup --- be/src/cloud/cloud_internal_service.cpp | 22 ++++----- be/src/cloud/cloud_tablet.cpp | 4 +- be/src/cloud/cloud_warm_up_manager.cpp | 8 ++-- be/src/io/cache/block_file_cache.cpp | 47 +++++++++++++++++++ be/src/io/cache/block_file_cache.h | 11 +++++ be/src/io/cache/cached_remote_file_reader.cpp | 4 ++ be/src/io/cache/file_cache_common.h | 2 + be/src/io/io_common.h | 2 + 8 files changed, 79 insertions(+), 21 deletions(-) diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index fe5571d7c24e75..07880da083d22e 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -295,13 +295,10 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c .offset = 0, .download_size = segment_size, .file_system = storage_resource.value()->fs, - .ctx = - { - .is_index_data = false, - .expiration_time = expiration_time, - .is_dryrun = - config::enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.is_index_data = false, + .expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = std::move(download_done), }; g_file_cache_event_driven_warm_up_submitted_segment_num << 1; @@ -368,13 +365,10 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c .path = io::Path(index_path), .file_size = static_cast(idx_size), .file_system = storage_resource.value()->fs, - .ctx = - { - .is_index_data = false, // DORIS-20877 - .expiration_time = expiration_time, - .is_dryrun = config:: - enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.is_index_data = false, // DORIS-20877 + .expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = std::move(download_done), }; g_file_cache_event_driven_warm_up_submitted_index_num << 1; diff --git 
a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index b6002777999d40..f83384342da77e 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -451,10 +451,10 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ { .expiration_time = expiration_time, .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true }, .download_done {[=](Status st) { DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_data.callback.block_compaction_rowset", { - // clang-format on if (rs->version().second > rs->version().first) { auto sleep_time = dp->param("sleep", 3); LOG_INFO( @@ -465,7 +465,6 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ std::this_thread::sleep_for( std::chrono::seconds(sleep_time)); } - // clang-format off }); self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st, 1, 0); if (!st) { @@ -483,6 +482,7 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ { .expiration_time = expiration_time, .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true }, .download_done {[=](Status st) { DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_idx.callback.block", { diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index a92d25772b382d..340b189b0babb0 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -141,11 +141,9 @@ void CloudWarmUpManager::submit_download_tasks(io::Path path, int64_t file_size, .offset = offset, .download_size = current_chunk_size, .file_system = file_system, - .ctx = - { - .expiration_time = expiration_time, - .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = [&](Status st) { if (done_cb) done_cb(st); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 2efbf6340b0ac1..f9013f8d2c7407 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -204,12 +204,38 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _cache_base_path.c_str(), "file_cache_num_read_blocks_1h", _num_read_blocks.get(), 3600); + _no_warmup_num_read_blocks = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks"); + _no_warmup_num_hit_blocks = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks"); + + _no_warmup_num_hit_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks_5m", + _no_warmup_num_hit_blocks.get(), 300); + _no_warmup_num_read_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks_5m", + _no_warmup_num_read_blocks.get(), 300); + _no_warmup_num_hit_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks_1h", + _no_warmup_num_hit_blocks.get(), 3600); + _no_warmup_num_read_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks_1h", + _no_warmup_num_read_blocks.get(), 3600); + _hit_ratio = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio", 0.0); _hit_ratio_5m = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); + + 
_no_warmup_hit_ratio = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio", 0.0); + _no_warmup_hit_ratio_5m = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio_5m", 0.0); + _no_warmup_hit_ratio_1h = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio_1h", 0.0); + _disk_limit_mode_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_disk_limit_mode", 0); _need_evict_cache_in_advance_metrics = std::make_shared>( @@ -795,9 +821,15 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o } DCHECK(!file_blocks.empty()); *_num_read_blocks << file_blocks.size(); + if (!context.is_warmup) { + *_no_warmup_num_read_blocks << file_blocks.size(); + } for (auto& block : file_blocks) { if (block->state_unsafe() == FileBlock::State::DOWNLOADED) { *_num_hit_blocks << 1; + if (!context.is_warmup) { + *_no_warmup_num_hit_blocks << 1; + } } } } @@ -1940,6 +1972,21 @@ void BlockFileCache::run_background_monitor() { _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / (double)_num_read_blocks_1h->get_value()); } + + if (_no_warmup_num_hit_blocks->get_value() > 0) { + _no_warmup_hit_ratio->set_value((double)_no_warmup_num_hit_blocks->get_value() / + (double)_no_warmup_num_read_blocks->get_value()); + } + if (_no_warmup_num_hit_blocks_5m->get_value() > 0) { + _no_warmup_hit_ratio_5m->set_value( + (double)_no_warmup_num_hit_blocks_5m->get_value() / + (double)_no_warmup_num_read_blocks_5m->get_value()); + } + if (_no_warmup_num_hit_blocks_1h->get_value() > 0) { + _no_warmup_hit_ratio_1h->set_value( + (double)_no_warmup_num_hit_blocks_1h->get_value() / + (double)_no_warmup_num_read_blocks_1h->get_value()); + } } } } diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index a85a36b5520802..46fbc56bd30de3 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -534,9 +534,20 @@ class BlockFileCache { std::shared_ptr> _num_hit_blocks; std::shared_ptr> _num_removed_blocks; + std::shared_ptr> _no_warmup_num_read_blocks; + std::shared_ptr> _no_warmup_num_hit_blocks; + + std::shared_ptr>> _no_warmup_num_hit_blocks_5m; + std::shared_ptr>> _no_warmup_num_read_blocks_5m; + std::shared_ptr>> _no_warmup_num_hit_blocks_1h; + std::shared_ptr>> _no_warmup_num_read_blocks_1h; + std::shared_ptr> _hit_ratio; std::shared_ptr> _hit_ratio_5m; std::shared_ptr> _hit_ratio_1h; + std::shared_ptr> _no_warmup_hit_ratio; + std::shared_ptr> _no_warmup_hit_ratio_5m; + std::shared_ptr> _no_warmup_hit_ratio_1h; std::shared_ptr> _disk_limit_mode_metrics; std::shared_ptr> _need_evict_cache_in_advance_metrics; diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index 66eb5f523cc346..9c5f18ea95c733 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -156,6 +156,10 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* LOG_INFO("[verbose] {}", Status::InternalError("not hit cache")); } } + if (!stats.hit_cache && config::read_cluster_cache_opt_verbose_log) { + LOG_INFO("[verbose] not hit cache, path: {}, offset: {}, size: {}", path().native(), + offset, bytes_req); + } if (io_ctx->file_cache_stats && !is_dryrun) { // update stats in io_ctx, for query profile _update_stats(stats, io_ctx->file_cache_stats, io_ctx->is_inverted_index); diff --git a/be/src/io/cache/file_cache_common.h 
b/be/src/io/cache/file_cache_common.h index f9ac525d0bef86..abbc4ff12fb735 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -148,6 +148,7 @@ struct CacheContext { cache_type = FileCacheType::NORMAL; } query_id = io_context->query_id ? *io_context->query_id : TUniqueId(); + is_warmup = io_context->is_warmup; } CacheContext() = default; bool operator==(const CacheContext& rhs) const { @@ -159,6 +160,7 @@ struct CacheContext { int64_t expiration_time {0}; bool is_cold_data {false}; ReadStatistics* stats; + bool is_warmup {false}; }; template diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 6934aa6a75a519..11ebb8d769b29c 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -85,6 +85,8 @@ struct IOContext { // if is_dryrun, read IO will download data to cache but return no data to reader // useful to skip cache data read from local disk to accelarate warm up bool is_dryrun = false; + + bool is_warmup {false}; }; } // namespace io From efe1928ae8734139bb66524cd66798847066e898 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Fri, 12 Sep 2025 19:28:06 +0800 Subject: [PATCH 29/34] update --- be/src/io/cache/cached_remote_file_reader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index 9c5f18ea95c733..00c4427da9b69d 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -152,7 +152,7 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* stats.bytes_read += bytes_req; auto defer_func = [&](int*) { if (config::print_stack_when_cache_miss) { - if (io_ctx->file_cache_stats == nullptr && !stats.hit_cache) { + if (io_ctx->file_cache_stats == nullptr && !stats.hit_cache && !io_ctx->is_warmup) { LOG_INFO("[verbose] {}", Status::InternalError("not hit cache")); } } From 9a4e9339cca95c88660598de32684f4564b2ffe2 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 15 Sep 2025 12:19:36 +0800 Subject: [PATCH 30/34] fix --- be/src/cloud/cloud_tablet.cpp | 14 ++++++++------ be/src/cloud/cloud_tablet.h | 2 +- ..._warmup_delay_compaction_query_tolerance.groovy | 3 +++ .../test_warmup_delay_idx_query_tolerance.groovy | 5 +++++ .../test_warmup_delay_sc_query_tolerance.groovy | 4 ++++ ...delay_timeout_compaction_query_tolerance.groovy | 6 ++++++ .../warmup/test_warmup_download_fail.groovy | 3 +++ 7 files changed, 30 insertions(+), 7 deletions(-) diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index f83384342da77e..b63dd065a2975e 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -207,7 +207,7 @@ Status CloudTablet::capture_rs_readers_prefer_cache(const Version& spec_version, std::shared_lock rlock(_meta_lock); RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_prefer_cache( spec_version, version_path, - [&](int64_t start, int64_t end) { return rowset_is_warmed_up(start, end); })); + [&](int64_t start, int64_t end) { return rowset_is_warmed_up_unlocked(start, end); })); int64_t path_max_version = version_path.back().second; VLOG_DEBUG << fmt::format( "[verbose] CloudTablet::capture_rs_readers_prefer_cache, capture path: {}, " @@ -220,7 +220,7 @@ Status CloudTablet::capture_rs_readers_prefer_cache(const Version& spec_version, return capture_rs_readers_unlocked(version_path, rs_splits); } -bool CloudTablet::rowset_is_warmed_up(int64_t start_version, int64_t end_version) { 
+bool CloudTablet::rowset_is_warmed_up_unlocked(int64_t start_version, int64_t end_version) { if (start_version > end_version) { return false; } @@ -258,12 +258,14 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance( // For merge-on-write table, newly generated delete bitmap marks will be on the rowsets which are in newest layout. // So we can ony capture rowsets which are in newest data layout. Otherwise there may be data correctness issue. RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator_mow( - spec_version, version_path, - [&](int64_t start, int64_t end) { return rowset_is_warmed_up(start, end); })); + spec_version, version_path, [&](int64_t start, int64_t end) { + return rowset_is_warmed_up_unlocked(start, end); + })); } else { RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator( - spec_version, version_path, - [&](int64_t start, int64_t end) { return rowset_is_warmed_up(start, end); })); + spec_version, version_path, [&](int64_t start, int64_t end) { + return rowset_is_warmed_up_unlocked(start, end); + })); } int64_t path_max_version = version_path.back().second; auto should_be_visible_but_not_warmed_up = [&](const auto& rs_meta) -> bool { diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 53a8869b6daaa0..9a0307d4ac0245 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -347,7 +347,7 @@ class CloudTablet final : public BaseTablet { std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); // used by capture_rs_reader_xxx functions - bool rowset_is_warmed_up(int64_t start_version, int64_t end_version); + bool rowset_is_warmed_up_unlocked(int64_t start_version, int64_t end_version); CloudStorageEngine& _engine; diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy index c70340fa3f51dd..3ce4ee58f4b771 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy @@ -309,5 +309,8 @@ suite('test_warmup_delay_compaction_query_tolerance', 'docker') { future.get() assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") assert 0 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") } } diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy index 0bb18cfdba9992..de4887624e4945 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy @@ -323,5 +323,10 @@ suite('test_warmup_delay_idx_query_tolerance', 'docker') { sleep(10000) // assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") 
assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_index_num") } } diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy index f047d81204a5e0..688aa5e4446a57 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy @@ -300,5 +300,9 @@ suite('test_warmup_delay_sc_query_tolerance', 'docker') { logFileCacheDownloadMetrics(clusterName2) logWarmUpRowsetMetrics(clusterName2) + + sleep(10000) + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") } } diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy index 7cb580c4816bc2..b13609ed42e1e8 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy @@ -252,8 +252,10 @@ suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { logWarmUpRowsetMetrics(clusterName2) def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_requested = getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") assert num_submitted >= 6 assert num_finished == num_submitted + assert num_requested == num_finished sql """use @${clusterName2}""" // ensure that base rowsets' meta are loaded on target cluster @@ -325,5 +327,9 @@ suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { future.get() assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + sleep(10000) + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") } } diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy index 0283025e9e76f2..ea1aafbc44cf37 100644 --- a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy +++ 
From 12932b94398aef02e07fd1a52c1267896f8f5860 Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Tue, 16 Sep 2025 13:20:45 +0800
Subject: [PATCH 31/34] add log

---
 be/src/cloud/cloud_tablet.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp
index b63dd065a2975e..6e5db7257ce0f9 100644
--- a/be/src/cloud/cloud_tablet.cpp
+++ b/be/src/cloud/cloud_tablet.cpp
@@ -308,7 +308,14 @@ Status CloudTablet::capture_rs_readers_with_freshness_tolerance(
         // but has not been warmed up yet, fall back to capturing rowsets as usual
         return capture_rs_readers_internal(spec_version, rs_splits);
     }
-
+    VLOG_DEBUG << fmt::format(
+            "[verbose] CloudTablet::capture_rs_readers_with_freshness_tolerance, capture path: {}, "
+            "tablet_id={}, spec_version={}, path_max_version={}",
+            fmt::join(version_path | std::views::transform([](const auto& version) {
+                          return fmt::format("{}", version.to_string());
+                      }),
+                      ", "),
+            tablet_id(), spec_version.to_string(), path_max_version);
     return capture_rs_readers_unlocked(version_path, rs_splits);
 }
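The VLOG_DEBUG added above leans on fmt's range support to print the captured path. The following is a standalone reproduction of the same idiom, assuming fmt (with fmt/ranges.h) and C++20 ranges are available; the Version struct here is a simplified stand-in for Doris' version pair.

#include <fmt/format.h>
#include <fmt/ranges.h>

#include <cstdint>
#include <ranges>
#include <string>
#include <vector>

// Simplified stand-in for Doris' Version pair; only to_string() matters here.
struct Version {
    int64_t first;
    int64_t second;
    std::string to_string() const { return fmt::format("[{}-{}]", first, second); }
};

int main() {
    std::vector<Version> version_path {{2, 5}, {6, 6}, {7, 9}};
    // Same idiom as the VLOG_DEBUG above: lazily stringify each version in the
    // captured path and join the results with ", ".
    fmt::print("capture path: {}\n",
               fmt::join(version_path | std::views::transform(
                                 [](const auto& v) { return v.to_string(); }),
                         ", "));
    // prints: capture path: [2-5], [6-6], [7-9]
}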
From 7e3cfc601880ad0ee53a84f26c5d4bde2e7cf82b Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Tue, 16 Sep 2025 14:25:55 +0800
Subject: [PATCH 32/34] add log

---
 be/src/io/cache/cached_remote_file_reader.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp
index 00c4427da9b69d..3d9b41be61ba93 100644
--- a/be/src/io/cache/cached_remote_file_reader.cpp
+++ b/be/src/io/cache/cached_remote_file_reader.cpp
@@ -157,8 +157,8 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t*
         }
     }
     if (!stats.hit_cache && config::read_cluster_cache_opt_verbose_log) {
-        LOG_INFO("[verbose] not hit cache, path: {}, offset: {}, size: {}", path().native(),
-                 offset, bytes_req);
+        LOG_INFO("[verbose] not hit cache, path: {}, offset: {}, size: {}, warmup: {}",
+                 path().native(), offset, bytes_req, io_ctx->is_warmup);
     }
     if (io_ctx->file_cache_stats && !is_dryrun) {
         // update stats in io_ctx, for query profile

From 8dca3c14504b5213ca5690bc673e04cab98a4a7c Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Tue, 16 Sep 2025 14:31:26 +0800
Subject: [PATCH 33/34] update

---
 be/src/io/cache/cached_remote_file_reader.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp
index 3d9b41be61ba93..08c5960fb036db 100644
--- a/be/src/io/cache/cached_remote_file_reader.cpp
+++ b/be/src/io/cache/cached_remote_file_reader.cpp
@@ -150,6 +150,8 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t*
     }
     ReadStatistics stats;
    stats.bytes_read += bytes_req;
+    MonotonicStopWatch read_at_sw;
+    read_at_sw.start();
     auto defer_func = [&](int*) {
         if (config::print_stack_when_cache_miss) {
             if (io_ctx->file_cache_stats == nullptr && !stats.hit_cache &&
                 !io_ctx->is_warmup) {
@@ -157,8 +159,11 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t*
         }
     }
     if (!stats.hit_cache && config::read_cluster_cache_opt_verbose_log) {
-        LOG_INFO("[verbose] not hit cache, path: {}, offset: {}, size: {}, warmup: {}",
-                 path().native(), offset, bytes_req, io_ctx->is_warmup);
+        LOG_INFO(
+                "[verbose] not hit cache, path: {}, offset: {}, size: {}, cost: {} ms, warmup: "
+                "{}",
+                path().native(), offset, bytes_req, read_at_sw.elapsed_time_milliseconds(),
+                io_ctx->is_warmup);
     }
     if (io_ctx->file_cache_stats && !is_dryrun) {
         // update stats in io_ctx, for query profile
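The stopwatch added in the last two patches pairs with a scope-exit callback so the cache-miss cost is reported on every return path. Here is a minimal sketch of that pattern with std::chrono in place of Doris' MonotonicStopWatch; simulated_read is a hypothetical caller, not code from this patch.

#include <chrono>
#include <cstdio>
#include <memory>

// A stopwatch started before the work, plus a unique_ptr whose deleter fires
// on scope exit, so the elapsed time is logged exactly once however the
// function returns. This mirrors read_at_impl's defer_func instrumentation.
int simulated_read() {
    auto start = std::chrono::steady_clock::now();
    auto log_cost = [&](int*) {
        auto cost_ms = std::chrono::duration_cast<std::chrono::milliseconds>(
                               std::chrono::steady_clock::now() - start)
                               .count();
        std::printf("read cost: %lld ms\n", static_cast<long long>(cost_ms));
    };
    // non-null sentinel pointer so the deleter is guaranteed to run
    std::unique_ptr<int, decltype(log_cost)> defer((int*)0x01, std::move(log_cost));
    // ... perform the actual read here; early returns still trigger log_cost ...
    return 0;
}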
From 42534cbbdbed5aa0fc69718bcf149c270e46795b Mon Sep 17 00:00:00 2001
From: bobhan1
Date: Wed, 17 Sep 2025 11:58:10 +0800
Subject: [PATCH 34/34] add comment

---
 be/src/cloud/cloud_tablet.h | 20 ++++++++++++++++++++
 be/src/io/io_common.h       |  2 +-
 be/src/olap/base_tablet.h   | 17 +++++++++++++++--
 be/src/olap/version_graph.h | 20 ++++++++++++++++++++
 4 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h
index 9a0307d4ac0245..425196301ec1af 100644
--- a/be/src/cloud/cloud_tablet.h
+++ b/be/src/cloud/cloud_tablet.h
@@ -72,8 +72,28 @@ class CloudTablet final : public BaseTablet {
                                const CaptureRsReaderOptions& opts) override;
 
     Status capture_rs_readers_internal(const Version& spec_version,
                                        std::vector<RowSetSplits>* rs_splits);
+
+    // Capture rowset readers with cache preference optimization.
+    // This method prioritizes cached/warmed-up rowsets when building version paths,
+    // avoiding cold data reads when possible. It uses capture_consistent_versions_prefer_cache
+    // to find a consistent version path that prefers already warmed-up rowsets.
     Status capture_rs_readers_prefer_cache(const Version& spec_version,
                                            std::vector<RowSetSplits>* rs_splits);
+
+    // Capture rowset readers with query freshness tolerance.
+    // This method finds a consistent version path where all rowsets are warmed up,
+    // but allows falling back to normal capture if there are newer rowsets that should
+    // be visible (based on the freshness tolerance) but have not been warmed up yet.
+    // For merge-on-write tables, special validation is used to ensure data correctness.
+    //
+    // IMPORTANT: The returned version may be smaller than the requested version if newer
+    // data has not been warmed up yet. This can cause different tablets in the same query
+    // to read from different versions, potentially leading to inconsistent query results.
+    //
+    // @param query_freshness_tolerance_ms: Time tolerance in milliseconds. Rowsets that
+    // became visible within this time range (after current_time - query_freshness_tolerance_ms)
+    // can be skipped if not warmed up. However, if older rowsets (before this time point)
+    // are not warmed up, the method falls back to normal capture.
     Status capture_rs_readers_with_freshness_tolerance(const Version& spec_version,
                                                        std::vector<RowSetSplits>* rs_splits,
                                                        int64_t query_freshness_tolerance_ms);
diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h
index 11ebb8d769b29c..82e9ae30ecada2 100644
--- a/be/src/io/io_common.h
+++ b/be/src/io/io_common.h
@@ -85,7 +85,7 @@ struct IOContext {
     // if is_dryrun, read IO will download data to cache but return no data to reader
     // useful to skip cache data read from local disk to accelerate warm up
     bool is_dryrun = false;
-
+    // if `is_warmup` == true, this I/O request comes from a warm up task
     bool is_warmup {false};
 };
diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h
index 6f79ebd5758c83..bf1c600c0cebb7 100644
--- a/be/src/olap/base_tablet.h
+++ b/be/src/olap/base_tablet.h
@@ -51,11 +51,24 @@ struct TabletWithVersion {
 };
 
 struct CaptureRsReaderOptions {
-    // used by local mode only
+    // Used by local mode only.
+    // If true, allows skipping missing versions during rowset capture.
+    // This can be useful when some versions are temporarily unavailable.
     bool skip_missing_version {false};
 
-    // used by cloud mode only
+    // ======== only take effect in cloud mode ========
+
+    // Enable preference for cached/warmed-up rowsets when building version paths.
+    // When enabled, the capture process will prioritize already cached rowsets
+    // to avoid cold data reads and improve query performance.
     bool enable_prefer_cached_rowset {false};
+
+    // Query freshness tolerance in milliseconds.
+    // Defines the time window for considering data as "fresh enough".
+    // Rowsets that became visible within this time range can be skipped if not warmed up,
+    // but older rowsets (before current_time - query_freshness_tolerance_ms) that are
+    // not warmed up will trigger fallback to normal capture.
+    // Set to -1 to disable freshness tolerance checking.
     int64_t query_freshness_tolerance_ms {-1};
 };
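The options struct documented above is intended to be filled per query from session variables. The following is a sketch under assumed names: make_capture_opts is hypothetical, and the real wiring (in olap_scan_operator.cpp / olap_scanner.cpp) may differ.

#include <cstdint>

// Mirrors the struct documented above; cloud-only fields are ignored in local mode.
struct CaptureRsReaderOptions {
    bool skip_missing_version {false};
    bool enable_prefer_cached_rowset {false};
    int64_t query_freshness_tolerance_ms {-1};
};

// Hypothetical helper showing how a scan node might derive the options
// from session variables.
CaptureRsReaderOptions make_capture_opts(bool prefer_cached_rowset,
                                         int64_t freshness_tolerance_ms) {
    CaptureRsReaderOptions opts;
    opts.enable_prefer_cached_rowset = prefer_cached_rowset;
    // A non-positive session value keeps freshness checking disabled (-1);
    // a positive value allows serving a version whose missing tail became
    // visible within the last freshness_tolerance_ms milliseconds.
    opts.query_freshness_tolerance_ms =
            freshness_tolerance_ms > 0 ? freshness_tolerance_ms : -1;
    return opts;
}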
diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h
index 6a15a6ad0e0158..4c65d9208614c1 100644
--- a/be/src/olap/version_graph.h
+++ b/be/src/olap/version_graph.h
@@ -67,10 +67,20 @@ class VersionGraph {
     // The version paths are added to version_path as return info.
     // If this version is not on the main path, version_path may include expired rowsets.
     // NOTE: this method may return edges which are in a stale path
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    // and returns true if the rowset should be included in the path, false to skip it
     Status capture_consistent_versions_with_validator(
             const Version& spec_version, std::vector<Version>& version_path,
             const std::function<bool(int64_t, int64_t)>& validator) const;
 
+    // Capture consistent versions with validator for merge-on-write (MOW) tables.
+    // Similar to capture_consistent_versions_with_validator but with special handling for MOW tables.
+    // For MOW tables, newly generated delete bitmap marks will be on the rowsets which are in the newest layout.
+    // So we can only capture rowsets which are in the newest data layout to ensure data correctness.
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    // and returns true if the rowset is warmed up, false if not warmed up
     Status capture_consistent_versions_with_validator_mow(
             const Version& spec_version, std::vector<Version>& version_path,
             const std::function<bool(int64_t, int64_t)>& validator) const;
@@ -201,10 +211,20 @@ class TimestampedVersionTracker {
     // The version paths are added to version_path as return info.
     // If this version is not on the main path, version_path may include expired rowsets.
     // NOTE: this method may return edges which are in a stale path
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    // and returns true if the rowset should be included in the path, false to skip it
     Status capture_consistent_versions_with_validator(
             const Version& spec_version, std::vector<Version>& version_path,
             const std::function<bool(int64_t, int64_t)>& validator) const;
 
+    // Capture consistent versions with validator for merge-on-write (MOW) tables.
+    // Similar to capture_consistent_versions_with_validator but with special handling for MOW tables.
+    // For MOW tables, newly generated delete bitmap marks will be on the rowsets which are in the newest layout.
+    // So we can only capture rowsets which are in the newest data layout to ensure data correctness.
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    // and returns true if the rowset is warmed up, false if not warmed up
     Status capture_consistent_versions_with_validator_mow(
             const Version& spec_version, std::vector<Version>& version_path,
             const std::function<bool(int64_t, int64_t)>& validator) const;
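Finally, the validator contract documented above can be illustrated with a toy walk over version edges. This is a simplified greedy model, not VersionGraph's actual implementation (which walks an adjacency list and may backtrack through stale paths); capture_with_validator and the edge multimap are hypothetical.

#include <cstdint>
#include <functional>
#include <map>
#include <utility>
#include <vector>

using Version = std::pair<int64_t, int64_t>; // inclusive [start, end] of a rowset

// Toy model of a validator-driven capture: starting from version 0, repeatedly
// pick the longest outgoing edge (rowset) that the validator accepts, until
// spec_version is covered. Edges map start_version -> end_version.
bool capture_with_validator(const std::multimap<int64_t, int64_t>& edges,
                            int64_t spec_version, std::vector<Version>& version_path,
                            const std::function<bool(int64_t, int64_t)>& validator) {
    int64_t next = 0;
    while (next <= spec_version) {
        auto [lo, hi] = edges.equal_range(next);
        int64_t best_end = -1;
        for (auto it = lo; it != hi; ++it) {
            // e.g. validator(next, it->second) == "rowset [next, end] is warmed up"
            if (it->second <= spec_version && it->second > best_end &&
                validator(next, it->second)) {
                best_end = it->second;
            }
        }
        if (best_end < 0) {
            return false; // no consistent path whose rowsets all pass the validator
        }
        version_path.emplace_back(next, best_end);
        next = best_end + 1;
    }
    return true;
}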