diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 59a198ee1710c9..07880da083d22e 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -26,6 +26,7 @@ #include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" +#include "util/debug_points.h" namespace doris { #include "common/compile_check_avoid_begin.h" @@ -190,6 +191,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c continue; } int64_t tablet_id = rs_meta.tablet_id(); + auto rowset_id = rs_meta.rowset_id(); bool local_only = !(request->has_skip_existence_check() && request->skip_existence_check()); auto res = _engine.tablet_mgr().get_tablet(tablet_id, /* warmup_data = */ false, /* sync_delete_bitmap = */ true, @@ -216,7 +218,7 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c g_file_cache_warm_up_rowset_request_to_handle_slow_count << 1; LOG(INFO) << "warm up rowset (request to handle) took " << handle_ts - request_ts << " us, tablet_id: " << rs_meta.tablet_id() - << ", rowset_id: " << rs_meta.rowset_id().to_string(); + << ", rowset_id: " << rowset_id.to_string(); } int64_t expiration_time = tablet_meta->ttl_seconds() == 0 || rs_meta.newest_write_timestamp() <= 0 @@ -227,16 +229,26 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c } if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpState::TRIGGERED_BY_JOB)) { - LOG(INFO) << "found duplicate warmup task for rowset " << rs_meta.rowset_id() + LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() << ", skip it"; continue; } for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { - auto download_done = [&, tablet_id = rs_meta.tablet_id(), - rowset_id = rs_meta.rowset_id().to_string(), - segment_size = rs_meta.segment_file_size(segment_id), - wait](Status st) { + auto segment_size = rs_meta.segment_file_size(segment_id); + auto download_done = [=, version = rs_meta.version()](Status st) { + DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_segment", { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO("[verbose] block download for rowset={}, version={}, sleep={}", + rowset_id.to_string(), version.to_string(), sleep_time); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + }); + DBUG_EXECUTE_IF( + "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error", { + st = Status::InternalError("injected error"); + LOG_INFO("[verbose] inject error, tablet={}, rowset={}, st={}", + tablet_id, rowset_id.to_string(), st.to_string()); + }); if (st.ok()) { g_file_cache_event_driven_warm_up_finished_segment_num << 1; g_file_cache_event_driven_warm_up_finished_segment_size << segment_size; @@ -250,25 +262,27 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c now_ts - request_ts > config::warm_up_rowset_slow_log_ms * 1000) { g_file_cache_warm_up_rowset_slow_count << 1; LOG(INFO) << "warm up rowset took " << now_ts - request_ts - << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id + << " us, tablet_id: " << tablet_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } if (now_ts - handle_ts > config::warm_up_rowset_slow_log_ms * 1000) { g_file_cache_warm_up_rowset_handle_to_finish_slow_count << 1; LOG(INFO) << "warm up rowset (handle to finish) took " << 
now_ts - handle_ts - << " us, tablet_id: " << tablet_id << ", rowset_id: " << rowset_id + << " us, tablet_id: " << tablet_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } } else { g_file_cache_event_driven_warm_up_failed_segment_num << 1; g_file_cache_event_driven_warm_up_failed_segment_size << segment_size; LOG(WARNING) << "download segment failed, tablet_id: " << tablet_id - << " rowset_id: " << rowset_id << ", error: " << st; + << " rowset_id: " << rowset_id.to_string() << ", error: " << st; } - if (tablet->complete_rowset_segment_warmup(rs_meta.rowset_id(), st) == + if (tablet->complete_rowset_segment_warmup(rowset_id, st, 1, 0) == WarmUpState::DONE) { - VLOG_DEBUG << "warmup rowset " << rs_meta.version() << "(" << rowset_id - << ") completed"; + VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" + << rowset_id.to_string() << ") completed"; } if (wait) { wait->signal(); @@ -277,31 +291,35 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c io::DownloadFileMeta download_meta { .path = storage_resource.value()->remote_segment_path(rs_meta, segment_id), - .file_size = rs_meta.segment_file_size(segment_id), + .file_size = segment_size, .offset = 0, - .download_size = rs_meta.segment_file_size(segment_id), + .download_size = segment_size, .file_system = storage_resource.value()->fs, - .ctx = - { - .is_index_data = false, - .expiration_time = expiration_time, - .is_dryrun = - config::enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.is_index_data = false, + .expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = std::move(download_done), }; g_file_cache_event_driven_warm_up_submitted_segment_num << 1; - g_file_cache_event_driven_warm_up_submitted_segment_size - << rs_meta.segment_file_size(segment_id); + g_file_cache_event_driven_warm_up_submitted_segment_size << segment_size; if (wait) { wait->add_count(); } _engine.file_cache_block_downloader().submit_download_task(download_meta); - auto download_inverted_index = [&](std::string index_path, uint64_t idx_size) { + auto download_inverted_index = [&, tablet](std::string index_path, uint64_t idx_size) { auto storage_resource = rs_meta.remote_storage_resource(); - auto download_done = [=, tablet_id = rs_meta.tablet_id(), - rowset_id = rs_meta.rowset_id().to_string()](Status st) { + auto download_done = [=, version = rs_meta.version()](Status st) { + DBUG_EXECUTE_IF( + "CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for rowset={}, inverted index " + "file={}, sleep={}", + rowset_id.to_string(), index_path, sleep_time); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + }); if (st.ok()) { g_file_cache_event_driven_warm_up_finished_index_num << 1; g_file_cache_event_driven_warm_up_finished_index_size << idx_size; @@ -318,14 +336,14 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c g_file_cache_warm_up_rowset_slow_count << 1; LOG(INFO) << "warm up rowset took " << now_ts - request_ts << " us, tablet_id: " << tablet_id - << ", rowset_id: " << rowset_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } if (now_ts - handle_ts > config::warm_up_rowset_slow_log_ms * 1000) { g_file_cache_warm_up_rowset_handle_to_finish_slow_count << 1; LOG(INFO) << "warm up rowset (handle to finish) took " << 
now_ts - handle_ts << " us, tablet_id: " << tablet_id - << ", rowset_id: " << rowset_id + << ", rowset_id: " << rowset_id.to_string() << ", segment_id: " << segment_id; } } else { @@ -334,6 +352,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c LOG(WARNING) << "download inverted index failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id << ", error: " << st; } + if (tablet->complete_rowset_segment_warmup(rowset_id, st, 0, 1) == + WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" + << rowset_id.to_string() << ") completed"; + } if (wait) { wait->signal(); } @@ -342,18 +365,15 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c .path = io::Path(index_path), .file_size = static_cast(idx_size), .file_system = storage_resource.value()->fs, - .ctx = - { - .is_index_data = false, // DORIS-20877 - .expiration_time = expiration_time, - .is_dryrun = config:: - enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.is_index_data = false, // DORIS-20877 + .expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = std::move(download_done), }; g_file_cache_event_driven_warm_up_submitted_index_num << 1; g_file_cache_event_driven_warm_up_submitted_index_size << idx_size; - + tablet->update_rowset_warmup_state_inverted_idx_num(rowset_id, 1); if (wait) { wait->add_count(); } diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index 42a2615b961318..ef3513530d55af 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -148,7 +148,10 @@ Status CloudSchemaChangeJob::process_alter_tablet(const TAlterTabletReqV2& reque if (request.alter_version > 1) { // [0-1] is a placeholder rowset, no need to convert RETURN_IF_ERROR(_base_tablet->capture_rs_readers({2, start_resp.alter_version()}, - &rs_splits, false)); + &rs_splits, + {.skip_missing_version = false, + .enable_prefer_cached_rowset = false, + .query_freshness_tolerance_ms = -1})); } Defer defer2 {[&]() { _new_tablet->set_alter_version(-1); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 4648f0f58c5cca..cd450714e38d41 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -100,6 +100,7 @@ CloudStorageEngine::CloudStorageEngine(const EngineOptions& options) std::make_shared(); _cumulative_compaction_policies[CUMULATIVE_TIME_SERIES_POLICY] = std::make_shared(); + _startup_timepoint = std::chrono::system_clock::now(); } CloudStorageEngine::~CloudStorageEngine() { diff --git a/be/src/cloud/cloud_storage_engine.h b/be/src/cloud/cloud_storage_engine.h index 000ba92144b099..fba9165c0f44c4 100644 --- a/be/src/cloud/cloud_storage_engine.h +++ b/be/src/cloud/cloud_storage_engine.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -161,6 +162,16 @@ class CloudStorageEngine final : public BaseStorageEngine { void unregister_index_change_compaction(int64_t tablet_id, bool is_base_compact); + std::chrono::time_point startup_timepoint() const { + return _startup_timepoint; + } + +#ifdef BE_TEST + void set_startup_timepoint(const std::chrono::time_point& tp) { + _startup_timepoint = tp; + } +#endif + private: void _refresh_storage_vault_info_thread_callback(); void _vacuum_stale_rowsets_thread_callback(); @@ -238,6 +249,8 @@ class CloudStorageEngine final : public 
BaseStorageEngine { EngineOptions _options; std::mutex _store_lock; + + std::chrono::time_point _startup_timepoint; }; } // namespace doris diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index 04c3b76f976910..6e5db7257ce0f9 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -18,6 +18,7 @@ #include "cloud/cloud_tablet.h" #include +#include #include #include #include @@ -27,8 +28,11 @@ #include #include +#include #include #include +#include +#include #include #include #include @@ -70,6 +74,20 @@ bvar::LatencyRecorder g_base_compaction_get_delete_bitmap_lock_time_ms( bvar::Adder g_unused_rowsets_count("unused_rowsets_count"); bvar::Adder g_unused_rowsets_bytes("unused_rowsets_bytes"); +bvar::Adder g_capture_prefer_cache_count("capture_prefer_cache_count"); +bvar::Adder g_capture_with_freshness_tolerance_count( + "capture_with_freshness_tolerance_count"); +bvar::Adder g_capture_with_freshness_tolerance_fallback_count( + "capture_with_freshness_tolerance_fallback_count"); +bvar::Window> g_capture_prefer_cache_count_window( + "capture_prefer_cache_count_window", &g_capture_prefer_cache_count, 30); +bvar::Window> g_capture_with_freshness_tolerance_count_window( + "capture_with_freshness_tolerance_count_window", &g_capture_with_freshness_tolerance_count, + 30); +bvar::Window> g_capture_with_freshness_tolerance_fallback_count_window( + "capture_with_freshness_tolerance_fallback_count_window", + &g_capture_with_freshness_tolerance_fallback_count, 30); + static constexpr int LOAD_INITIATOR_ID = -1; bvar::Adder g_file_cache_cloud_tablet_submitted_segment_size( @@ -100,12 +118,18 @@ bvar::Adder g_file_cache_warm_up_segment_complete_num( "file_cache_warm_up_segment_complete_num"); bvar::Adder g_file_cache_warm_up_segment_failed_num( "file_cache_warm_up_segment_failed_num"); +bvar::Adder g_file_cache_warm_up_inverted_idx_complete_num( + "file_cache_warm_up_inverted_idx_complete_num"); +bvar::Adder g_file_cache_warm_up_inverted_idx_failed_num( + "file_cache_warm_up_inverted_idx_failed_num"); bvar::Adder g_file_cache_warm_up_rowset_complete_num( "file_cache_warm_up_rowset_complete_num"); bvar::Adder g_file_cache_warm_up_rowset_triggered_by_job_num( "file_cache_warm_up_rowset_triggered_by_job_num"); bvar::Adder g_file_cache_warm_up_rowset_triggered_by_sync_rowset_num( "file_cache_warm_up_rowset_triggered_by_sync_rowset_num"); +bvar::LatencyRecorder g_file_cache_warm_up_rowset_all_segments_latency( + "file_cache_warm_up_rowset_all_segments_latency"); CloudTablet::CloudTablet(CloudStorageEngine& engine, TabletMetaSharedPtr tablet_meta) : BaseTablet(std::move(tablet_meta)), _engine(engine) {} @@ -139,7 +163,18 @@ Status CloudTablet::capture_consistent_rowsets_unlocked( Status CloudTablet::capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) { + const CaptureRsReaderOptions& opts) { + if (opts.query_freshness_tolerance_ms > 0) { + return capture_rs_readers_with_freshness_tolerance(spec_version, rs_splits, + opts.query_freshness_tolerance_ms); + } else if (opts.enable_prefer_cached_rowset && !enable_unique_key_merge_on_write()) { + return capture_rs_readers_prefer_cache(spec_version, rs_splits); + } + return capture_rs_readers_internal(spec_version, rs_splits); +} + +Status CloudTablet::capture_rs_readers_internal(const Version& spec_version, + std::vector* rs_splits) { DBUG_EXECUTE_IF("CloudTablet.capture_rs_readers.return.e-230", { LOG_WARNING("CloudTablet.capture_rs_readers.return 
e-230").tag("tablet_id", tablet_id()); return Status::Error(-230, "injected error"); @@ -165,6 +200,125 @@ Status CloudTablet::capture_rs_readers(const Version& spec_version, return capture_rs_readers_unlocked(version_path, rs_splits); } +Status CloudTablet::capture_rs_readers_prefer_cache(const Version& spec_version, + std::vector* rs_splits) { + g_capture_prefer_cache_count << 1; + Versions version_path; + std::shared_lock rlock(_meta_lock); + RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_prefer_cache( + spec_version, version_path, + [&](int64_t start, int64_t end) { return rowset_is_warmed_up_unlocked(start, end); })); + int64_t path_max_version = version_path.back().second; + VLOG_DEBUG << fmt::format( + "[verbose] CloudTablet::capture_rs_readers_prefer_cache, capture path: {}, " + "tablet_id={}, spec_version={}, path_max_version={}", + fmt::join(version_path | std::views::transform([](const auto& version) { + return fmt::format("{}", version.to_string()); + }), + ", "), + tablet_id(), spec_version.to_string(), path_max_version); + return capture_rs_readers_unlocked(version_path, rs_splits); +} + +bool CloudTablet::rowset_is_warmed_up_unlocked(int64_t start_version, int64_t end_version) { + if (start_version > end_version) { + return false; + } + Version version {start_version, end_version}; + auto it = _rs_version_map.find(version); + if (it == _rs_version_map.end()) { + it = _stale_rs_version_map.find(version); + if (it == _stale_rs_version_map.end()) { + LOG_WARNING( + "fail to find Rowset in rs_version or stale_rs_version for version. " + "tablet={}, version={}", + tablet_id(), version.to_string()); + return false; + } + } + const auto& rs = it->second; + if (rs->visible_timestamp() < _engine.startup_timepoint()) { + // We only care about rowsets that are created after startup time point. For other rowsets, + // we assume they are warmed up. + return true; + } + return is_rowset_warmed_up(rs->rowset_id()); +}; + +Status CloudTablet::capture_rs_readers_with_freshness_tolerance( + const Version& spec_version, std::vector* rs_splits, + int64_t query_freshness_tolerance_ms) { + g_capture_with_freshness_tolerance_count << 1; + using namespace std::chrono; + auto freshness_limit_tp = system_clock::now() - milliseconds(query_freshness_tolerance_ms); + // find a version path where every edge(rowset) has been warmuped + Versions version_path; + std::shared_lock rlock(_meta_lock); + if (enable_unique_key_merge_on_write()) { + // For merge-on-write table, newly generated delete bitmap marks will be on the rowsets which are in newest layout. + // So we can ony capture rowsets which are in newest data layout. Otherwise there may be data correctness issue. 
+Status CloudTablet::capture_rs_readers_with_freshness_tolerance(
+        const Version& spec_version, std::vector<RowSetSplits>* rs_splits,
+        int64_t query_freshness_tolerance_ms) {
+    g_capture_with_freshness_tolerance_count << 1;
+    using namespace std::chrono;
+    auto freshness_limit_tp = system_clock::now() - milliseconds(query_freshness_tolerance_ms);
+    // find a version path where every edge (rowset) has been warmed up
+    Versions version_path;
+    std::shared_lock rlock(_meta_lock);
+    if (enable_unique_key_merge_on_write()) {
+        // For merge-on-write tables, newly generated delete bitmap marks will be on the rowsets
+        // which are in the newest layout. So we can only capture rowsets which are in the newest
+        // data layout; otherwise there may be data correctness issues.
+        RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator_mow(
+                spec_version, version_path, [&](int64_t start, int64_t end) {
+                    return rowset_is_warmed_up_unlocked(start, end);
+                }));
+    } else {
+        RETURN_IF_ERROR(_timestamped_version_tracker.capture_consistent_versions_with_validator(
+                spec_version, version_path, [&](int64_t start, int64_t end) {
+                    return rowset_is_warmed_up_unlocked(start, end);
+                }));
+    }
+    int64_t path_max_version = version_path.back().second;
+    auto should_be_visible_but_not_warmed_up = [&](const auto& rs_meta) -> bool {
+        if (rs_meta->version() == Version {0, 1}) {
+            // skip rowset[0-1]
+            return false;
+        }
+        bool ret = rs_meta->start_version() > path_max_version &&
+                   rs_meta->visible_timestamp() < freshness_limit_tp;
+        if (ret && config::read_cluster_cache_opt_verbose_log) {
+            std::time_t t1 = system_clock::to_time_t(rs_meta->visible_timestamp());
+            std::tm tm1 = *std::localtime(&t1);
+            std::ostringstream oss1;
+            oss1 << std::put_time(&tm1, "%Y-%m-%d %H:%M:%S");
+
+            std::time_t t2 = system_clock::to_time_t(freshness_limit_tp);
+            std::tm tm2 = *std::localtime(&t2);
+            std::ostringstream oss2;
+            oss2 << std::put_time(&tm2, "%Y-%m-%d %H:%M:%S");
+            LOG_INFO(
+                    "[verbose] CloudTablet::capture_rs_readers_with_freshness_tolerance, "
+                    "find a rowset which should be visible but not warmed up, tablet_id={}, "
+                    "path_max_version={}, rowset_id={}, version={}, visible_time={}, "
+                    "freshness_limit={}, version_graph={}, rowset_warmup_digest={}",
+                    tablet_id(), path_max_version, rs_meta->rowset_id().to_string(),
+                    rs_meta->version().to_string(), oss1.str(), oss2.str(),
+                    _timestamped_version_tracker.debug_string(), rowset_warmup_digest());
+        }
+        return ret;
+    };
+    // use std::views::concat after C++26
+    bool should_fallback = std::ranges::any_of(_tablet_meta->all_rs_metas(),
+                                               should_be_visible_but_not_warmed_up) ||
+                           std::ranges::any_of(_tablet_meta->all_stale_rs_metas(),
+                                               should_be_visible_but_not_warmed_up);
+    if (should_fallback) {
+        rlock.unlock();
+        g_capture_with_freshness_tolerance_fallback_count << 1;
+        // if there exists a rowset whose start version is larger than the path max version and
+        // whose visible time is older than the freshness limit, but which has not been warmed up
+        // yet, fall back to capturing rowsets as usual
+        return capture_rs_readers_internal(spec_version, rs_splits);
+    }
+    VLOG_DEBUG << fmt::format(
+            "[verbose] CloudTablet::capture_rs_readers_with_freshness_tolerance, capture path: {}, "
+            "tablet_id={}, spec_version={}, path_max_version={}",
+            fmt::join(version_path | std::views::transform([](const auto& version) {
+                          return fmt::format("{}", version.to_string());
+                      }),
+                      ", "),
+            tablet_id(), spec_version.to_string(), path_max_version);
+    return capture_rs_readers_unlocked(version_path, rs_splits);
+}
+
 // There are only two tablet_states RUNNING and NOT_READY in cloud mode
 // This function will erase the tablet from `CloudTabletMgr` when it can't find this tablet in MS.
Status CloudTablet::sync_rowsets(const SyncOptions& options, SyncRowsetStats* stats) { @@ -306,16 +460,29 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ { .expiration_time = expiration_time, .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true }, .download_done {[=](Status st) { - self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st); + DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_data.callback.block_compaction_rowset", { + if (rs->version().second > rs->version().first) { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for rowset={}, " + "version={}, sleep={}", + rs->rowset_id().to_string(), + rs->version().to_string(), sleep_time); + std::this_thread::sleep_for( + std::chrono::seconds(sleep_time)); + } + }); + self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st, 1, 0); if (!st) { LOG_WARNING("add rowset warm up error ").error(st); } }}, }); - auto download_idx_file = [&](const io::Path& idx_path, int64_t idx_size) { + auto download_idx_file = [&, self](const io::Path& idx_path, int64_t idx_size) { io::DownloadFileMeta meta { .path = idx_path, .file_size = idx_size, @@ -324,13 +491,29 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ { .expiration_time = expiration_time, .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true }, - .download_done {[](Status st) { + .download_done {[=](Status st) { + DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_idx.callback.block", { + // clang-format on + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for " + "rowset={}, inverted_idx_file={}, " + "sleep={}", + rs->rowset_id().to_string(), + idx_path.string(), sleep_time); + std::this_thread::sleep_for( + std::chrono::seconds(sleep_time)); + // clang-format off + }); + self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st, 0, 1); if (!st) { LOG_WARNING("add rowset warm up error ").error(st); } }}, }; + self->update_rowset_warmup_state_inverted_idx_num_unlocked(rowset_meta->rowset_id(), 1); _engine.file_cache_block_downloader().submit_download_task(std::move(meta)); g_file_cache_cloud_tablet_submitted_index_num << 1; g_file_cache_cloud_tablet_submitted_index_size << idx_size; @@ -470,7 +653,6 @@ void CloudTablet::delete_rowsets(const std::vector& to_delete, _timestamped_version_tracker.add_stale_path_version(rs_metas); for (auto&& rs : to_delete) { _rs_version_map.erase(rs->version()); - _rowset_warm_up_states.erase(rs->rowset_id()); } _tablet_meta->modify_rs_metas({}, rs_metas, false); @@ -599,6 +781,7 @@ void CloudTablet::remove_unused_rowsets() { continue; } tablet_meta()->remove_rowset_delete_bitmap(rs->rowset_id(), rs->version()); + _rowset_warm_up_states.erase(rs->rowset_id()); rs->clear_cache(); g_unused_rowsets_count << -1; g_unused_rowsets_bytes << -rs->total_disk_size(); @@ -1462,19 +1645,34 @@ Status CloudTablet::check_delete_bitmap_cache(int64_t txn_id, WarmUpState CloudTablet::get_rowset_warmup_state(RowsetId rowset_id) { std::shared_lock rlock(_meta_lock); - if (_rowset_warm_up_states.find(rowset_id) == _rowset_warm_up_states.end()) { + if (!_rowset_warm_up_states.contains(rowset_id)) { return WarmUpState::NONE; } - return _rowset_warm_up_states[rowset_id].first; + return _rowset_warm_up_states[rowset_id].state; +} + +bool CloudTablet::add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp) { + std::lock_guard 
wlock(_meta_lock); + return add_rowset_warmup_state_unlocked(rowset, state, start_tp); } -bool CloudTablet::add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state) { +void CloudTablet::update_rowset_warmup_state_inverted_idx_num(RowsetId rowset_id, int64_t delta) { std::lock_guard wlock(_meta_lock); - return add_rowset_warmup_state_unlocked(rowset, state); + update_rowset_warmup_state_inverted_idx_num_unlocked(rowset_id, delta); +} + +void CloudTablet::update_rowset_warmup_state_inverted_idx_num_unlocked(RowsetId rowset_id, + int64_t delta) { + if (!_rowset_warm_up_states.contains(rowset_id)) { + return; + } + _rowset_warm_up_states[rowset_id].num_inverted_idx += delta; } -bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state) { - if (_rowset_warm_up_states.find(rowset.rowset_id()) != _rowset_warm_up_states.end()) { +bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp) { + if (_rowset_warm_up_states.contains(rowset.rowset_id())) { return false; } if (state == WarmUpState::TRIGGERED_BY_JOB) { @@ -1482,26 +1680,56 @@ bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, War } else if (state == WarmUpState::TRIGGERED_BY_SYNC_ROWSET) { g_file_cache_warm_up_rowset_triggered_by_sync_rowset_num << 1; } - _rowset_warm_up_states[rowset.rowset_id()] = std::make_pair(state, rowset.num_segments()); + _rowset_warm_up_states[rowset.rowset_id()] = { + .state = state, .num_segments = rowset.num_segments(), .start_tp = start_tp}; return true; } -WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Status status) { +WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Status status, + int64_t segment_num, + int64_t inverted_idx_num) { std::lock_guard wlock(_meta_lock); - if (_rowset_warm_up_states.find(rowset_id) == _rowset_warm_up_states.end()) { + if (!_rowset_warm_up_states.contains(rowset_id)) { return WarmUpState::NONE; } VLOG_DEBUG << "complete rowset segment warmup for rowset " << rowset_id << ", " << status; - g_file_cache_warm_up_segment_complete_num << 1; - if (!status.ok()) { - g_file_cache_warm_up_segment_failed_num << 1; + if (segment_num > 0) { + g_file_cache_warm_up_segment_complete_num << segment_num; + if (!status.ok()) { + g_file_cache_warm_up_segment_failed_num << segment_num; + } + } + if (inverted_idx_num > 0) { + g_file_cache_warm_up_inverted_idx_complete_num << inverted_idx_num; + if (!status.ok()) { + g_file_cache_warm_up_inverted_idx_failed_num << inverted_idx_num; + } } - _rowset_warm_up_states[rowset_id].second--; - if (_rowset_warm_up_states[rowset_id].second <= 0) { + _rowset_warm_up_states[rowset_id].done(segment_num, inverted_idx_num); + if (_rowset_warm_up_states[rowset_id].has_finished()) { g_file_cache_warm_up_rowset_complete_num << 1; - _rowset_warm_up_states[rowset_id].first = WarmUpState::DONE; + auto cost = std::chrono::duration_cast( + std::chrono::steady_clock::now() - + _rowset_warm_up_states[rowset_id].start_tp) + .count(); + g_file_cache_warm_up_rowset_all_segments_latency << cost; + _rowset_warm_up_states[rowset_id].state = WarmUpState::DONE; + } + return _rowset_warm_up_states[rowset_id].state; +} + +bool CloudTablet::is_rowset_warmed_up(const RowsetId& rowset_id) const { + auto it = _rowset_warm_up_states.find(rowset_id); + if (it == _rowset_warm_up_states.end()) { + return false; } - return _rowset_warm_up_states[rowset_id].first; + return 
it->second.state == WarmUpState::DONE; +} + +void CloudTablet::add_warmed_up_rowset(const RowsetId& rowset_id) { + _rowset_warm_up_states[rowset_id] = {.state = WarmUpState::DONE, + .num_segments = 1, + .start_tp = std::chrono::steady_clock::now()}; } #include "common/compile_check_end.h" diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 8a4c1ae5ced7c4..425196301ec1af 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -21,6 +21,7 @@ #include "olap/base_tablet.h" #include "olap/partial_update_info.h" +#include "olap/rowset/rowset.h" namespace doris { @@ -68,7 +69,34 @@ class CloudTablet final : public BaseTablet { bool vertical) override; Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) override; + const CaptureRsReaderOptions& opts) override; + Status capture_rs_readers_internal(const Version& spec_version, + std::vector* rs_splits); + + // Capture rowset readers with cache preference optimization. + // This method prioritizes using cached/warmed-up rowsets when building version paths, + // avoiding cold data reads when possible. It uses capture_consistent_versions_prefer_cache + // to find a consistent version path that prefers already warmed-up rowsets. + Status capture_rs_readers_prefer_cache(const Version& spec_version, + std::vector* rs_splits); + + // Capture rowset readers with query freshness tolerance. + // This method finds a consistent version path where all rowsets are warmed up, + // but allows fallback to normal capture if there are newer rowsets that should be + // visible (based on freshness tolerance) but haven't been warmed up yet. + // For merge-on-write tables, uses special validation to ensure data correctness. + // + // IMPORTANT: The returned version may be smaller than the requested version if newer + // data hasn't been warmed up yet. This can cause different tablets in the same query + // to read from different versions, potentially leading to inconsistent query results. + // + // @param query_freshness_tolerance_ms: Time tolerance in milliseconds. Rowsets that + // became visible within this time range (after current_time - query_freshness_tolerance_ms) + // can be skipped if not warmed up. However, if older rowsets (before this time point) + // are not warmed up, the method will fallback to normal capture. 
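+    //
+    // Usage sketch (illustrative; assumes a CloudTablet `tablet` and a target
+    // `max_version` are in scope):
+    //
+    //   std::vector<RowSetSplits> splits;
+    //   // accept results that are at most 5 seconds stale
+    //   RETURN_IF_ERROR(tablet->capture_rs_readers_with_freshness_tolerance(
+    //           {0, max_version}, &splits, /*query_freshness_tolerance_ms=*/5000));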
+ Status capture_rs_readers_with_freshness_tolerance(const Version& spec_version, + std::vector* rs_splits, + int64_t query_freshness_tolerance_ms); Status capture_consistent_rowsets_unlocked( const Version& spec_version, std::vector* rowsets) const override; @@ -297,8 +325,36 @@ class CloudTablet final : public BaseTablet { // Add warmup state management WarmUpState get_rowset_warmup_state(RowsetId rowset_id); - bool add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state); - WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status); + bool add_rowset_warmup_state( + const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); + void update_rowset_warmup_state_inverted_idx_num(RowsetId rowset_id, int64_t delta); + void update_rowset_warmup_state_inverted_idx_num_unlocked(RowsetId rowset_id, int64_t delta); + WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status, + int64_t segment_num, int64_t inverted_idx_num); + + bool is_rowset_warmed_up(const RowsetId& rowset_id) const; + + void add_warmed_up_rowset(const RowsetId& rowset_id); + + std::string rowset_warmup_digest() { + std::string res; + auto add_log = [&](const RowsetSharedPtr& rs) { + auto tmp = fmt::format("{}{}", rs->rowset_id().to_string(), rs->version().to_string()); + if (_rowset_warm_up_states.contains(rs->rowset_id())) { + tmp += fmt::format( + ", state={}, segments_warmed_up={}/{}, inverted_idx_warmed_up={}/{}", + _rowset_warm_up_states.at(rs->rowset_id()).state, + _rowset_warm_up_states.at(rs->rowset_id()).num_segments_warmed_up, + _rowset_warm_up_states.at(rs->rowset_id()).num_segments, + _rowset_warm_up_states.at(rs->rowset_id()).num_inverted_idx_warmed_up, + _rowset_warm_up_states.at(rs->rowset_id()).num_inverted_idx); + } + res += fmt::format("[{}],", tmp); + }; + traverse_rowsets_unlocked(add_log, true); + return res; + } private: // FIXME(plat1ko): No need to record base size if rowsets are ordered by version @@ -306,7 +362,12 @@ class CloudTablet final : public BaseTablet { Status sync_if_not_running(SyncRowsetStats* stats = nullptr); - bool add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state); + bool add_rowset_warmup_state_unlocked( + const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); + + // used by capture_rs_reader_xxx functions + bool rowset_is_warmed_up_unlocked(int64_t start_version, int64_t end_version); CloudStorageEngine& _engine; @@ -366,7 +427,28 @@ class CloudTablet final : public BaseTablet { std::vector, DeleteBitmapKeyRanges>> _unused_delete_bitmap; // for warm up states management - std::unordered_map> _rowset_warm_up_states; + struct RowsetWarmUpInfo { + WarmUpState state; + int64_t num_segments = 0; + int64_t num_inverted_idx = 0; + int64_t num_segments_warmed_up = 0; + int64_t num_inverted_idx_warmed_up = 0; + std::chrono::steady_clock::time_point start_tp; + + void done(int64_t num_segments, int64_t num_inverted_idx) { + num_segments_warmed_up += num_segments; + num_inverted_idx_warmed_up += num_inverted_idx; + } + + bool has_finished() const { + return (num_segments_warmed_up >= num_segments) && + (num_inverted_idx_warmed_up >= num_inverted_idx); + } + }; + std::unordered_map _rowset_warm_up_states; + + mutable std::shared_mutex _warmed_up_rowsets_mutex; + std::unordered_set _warmed_up_rowsets; }; using CloudTabletSPtr = std::shared_ptr; diff --git 
a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 8310d493009293..340b189b0babb0 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ b/be/src/cloud/cloud_warm_up_manager.cpp @@ -141,11 +141,9 @@ void CloudWarmUpManager::submit_download_tasks(io::Path path, int64_t file_size, .offset = offset, .download_size = current_chunk_size, .file_system = file_system, - .ctx = - { - .expiration_time = expiration_time, - .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = [&](Status st) { if (done_cb) done_cb(st); @@ -242,8 +240,8 @@ void CloudWarmUpManager::handle_jobs() { [tablet, rs, seg_id](Status st) { VLOG_DEBUG << "warmup rowset " << rs->version() << " segment " << seg_id << " completed"; - if (tablet->complete_rowset_segment_warmup(rs->rowset_id(), st) == - WarmUpState::DONE) { + if (tablet->complete_rowset_segment_warmup( + rs->rowset_id(), st, 1, 0) == WarmUpState::DONE) { VLOG_DEBUG << "warmup rowset " << rs->version() << " completed"; } }); @@ -277,8 +275,20 @@ void CloudWarmUpManager::handle_jobs() { } } } - submit_download_tasks(idx_path, file_size, storage_resource.value()->fs, - expiration_time, wait, true); + tablet->update_rowset_warmup_state_inverted_idx_num(rs->rowset_id(), 1); + submit_download_tasks( + idx_path, file_size, storage_resource.value()->fs, + expiration_time, wait, true, [=](Status st) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " segment " << seg_id + << "inverted idx:" << idx_path << " completed"; + if (tablet->complete_rowset_segment_warmup(rs->rowset_id(), + st, 0, 1) == + WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " completed"; + } + }); } } else { if (schema_ptr->has_inverted_index() || schema_ptr->has_ann_index()) { @@ -286,8 +296,20 @@ void CloudWarmUpManager::handle_jobs() { storage_resource.value()->remote_idx_v2_path(*rs, seg_id); file_size = idx_file_info.has_index_size() ? 
idx_file_info.index_size() : -1; - submit_download_tasks(idx_path, file_size, storage_resource.value()->fs, - expiration_time, wait, true); + tablet->update_rowset_warmup_state_inverted_idx_num(rs->rowset_id(), 1); + submit_download_tasks( + idx_path, file_size, storage_resource.value()->fs, + expiration_time, wait, true, [=](Status st) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " segment " << seg_id + << "inverted idx:" << idx_path << " completed"; + if (tablet->complete_rowset_segment_warmup(rs->rowset_id(), + st, 0, 1) == + WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " completed"; + } + }); } } } diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index b4319cdb946f9b..9b51876104687a 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -96,6 +96,9 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in) if (in.has___split_schema()) { out->mutable___split_schema()->CopyFrom(in.__split_schema()); } + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { @@ -159,6 +162,9 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { if (in.has___split_schema()) { out->mutable___split_schema()->Swap(in.mutable___split_schema()); } + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in) { @@ -232,6 +238,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in) if (in.has___split_schema()) { out->mutable___split_schema()->CopyFrom(in.__split_schema()); } + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { @@ -294,6 +303,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in) { if (in.has___split_schema()) { out->mutable___split_schema()->Swap(in.mutable___split_schema()); } + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB& in) { diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index ba21728d916caf..d9d9d8e47c9760 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1592,6 +1592,10 @@ DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true"); DEFINE_mBool(enable_wal_tde, "false"); +DEFINE_mBool(print_stack_when_cache_miss, "false"); + +DEFINE_mBool(read_cluster_cache_opt_verbose_log, "false"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 959b46c9747c12..3c801e94d43e17 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1646,6 +1646,10 @@ DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction); DECLARE_mBool(enable_wal_tde); +DECLARE_mBool(print_stack_when_cache_miss); + +DECLARE_mBool(read_cluster_cache_opt_verbose_log); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index 2efbf6340b0ac1..f9013f8d2c7407 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -204,12 +204,38 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _cache_base_path.c_str(), "file_cache_num_read_blocks_1h", 
_num_read_blocks.get(), 3600); + _no_warmup_num_read_blocks = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks"); + _no_warmup_num_hit_blocks = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks"); + + _no_warmup_num_hit_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks_5m", + _no_warmup_num_hit_blocks.get(), 300); + _no_warmup_num_read_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks_5m", + _no_warmup_num_read_blocks.get(), 300); + _no_warmup_num_hit_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks_1h", + _no_warmup_num_hit_blocks.get(), 3600); + _no_warmup_num_read_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks_1h", + _no_warmup_num_read_blocks.get(), 3600); + _hit_ratio = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio", 0.0); _hit_ratio_5m = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); + + _no_warmup_hit_ratio = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio", 0.0); + _no_warmup_hit_ratio_5m = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio_5m", 0.0); + _no_warmup_hit_ratio_1h = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio_1h", 0.0); + _disk_limit_mode_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_disk_limit_mode", 0); _need_evict_cache_in_advance_metrics = std::make_shared>( @@ -795,9 +821,15 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o } DCHECK(!file_blocks.empty()); *_num_read_blocks << file_blocks.size(); + if (!context.is_warmup) { + *_no_warmup_num_read_blocks << file_blocks.size(); + } for (auto& block : file_blocks) { if (block->state_unsafe() == FileBlock::State::DOWNLOADED) { *_num_hit_blocks << 1; + if (!context.is_warmup) { + *_no_warmup_num_hit_blocks << 1; + } } } } @@ -1940,6 +1972,21 @@ void BlockFileCache::run_background_monitor() { _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / (double)_num_read_blocks_1h->get_value()); } + + if (_no_warmup_num_hit_blocks->get_value() > 0) { + _no_warmup_hit_ratio->set_value((double)_no_warmup_num_hit_blocks->get_value() / + (double)_no_warmup_num_read_blocks->get_value()); + } + if (_no_warmup_num_hit_blocks_5m->get_value() > 0) { + _no_warmup_hit_ratio_5m->set_value( + (double)_no_warmup_num_hit_blocks_5m->get_value() / + (double)_no_warmup_num_read_blocks_5m->get_value()); + } + if (_no_warmup_num_hit_blocks_1h->get_value() > 0) { + _no_warmup_hit_ratio_1h->set_value( + (double)_no_warmup_num_hit_blocks_1h->get_value() / + (double)_no_warmup_num_read_blocks_1h->get_value()); + } } } } diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index a85a36b5520802..46fbc56bd30de3 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -534,9 +534,20 @@ class BlockFileCache { std::shared_ptr> _num_hit_blocks; std::shared_ptr> _num_removed_blocks; + std::shared_ptr> _no_warmup_num_read_blocks; + std::shared_ptr> _no_warmup_num_hit_blocks; + + std::shared_ptr>> _no_warmup_num_hit_blocks_5m; + std::shared_ptr>> _no_warmup_num_read_blocks_5m; + std::shared_ptr>> _no_warmup_num_hit_blocks_1h; + 
std::shared_ptr<bvar::Window<bvar::Adder<size_t>>> _no_warmup_num_read_blocks_1h;
+
     std::shared_ptr<bvar::Status<double>> _hit_ratio;
     std::shared_ptr<bvar::Status<double>> _hit_ratio_5m;
     std::shared_ptr<bvar::Status<double>> _hit_ratio_1h;
+    std::shared_ptr<bvar::Status<double>> _no_warmup_hit_ratio;
+    std::shared_ptr<bvar::Status<double>> _no_warmup_hit_ratio_5m;
+    std::shared_ptr<bvar::Status<double>> _no_warmup_hit_ratio_1h;
     std::shared_ptr<bvar::Status<size_t>> _disk_limit_mode_metrics;
     std::shared_ptr<bvar::Status<size_t>> _need_evict_cache_in_advance_metrics;
diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp
index 5d4d9cc556724a..08c5960fb036db 100644
--- a/be/src/io/cache/cached_remote_file_reader.cpp
+++ b/be/src/io/cache/cached_remote_file_reader.cpp
@@ -150,7 +150,21 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t*
     }
     ReadStatistics stats;
     stats.bytes_read += bytes_req;
+    MonotonicStopWatch read_at_sw;
+    read_at_sw.start();
     auto defer_func = [&](int*) {
+        if (config::print_stack_when_cache_miss) {
+            if (io_ctx->file_cache_stats == nullptr && !stats.hit_cache && !io_ctx->is_warmup) {
+                LOG_INFO("[verbose] {}", Status::InternalError("not hit cache"));
+            }
+        }
+        if (!stats.hit_cache && config::read_cluster_cache_opt_verbose_log) {
+            LOG_INFO(
+                    "[verbose] not hit cache, path: {}, offset: {}, size: {}, cost: {} ms, warmup: "
+                    "{}",
+                    path().native(), offset, bytes_req, read_at_sw.elapsed_time_milliseconds(),
+                    io_ctx->is_warmup);
+        }
         if (io_ctx->file_cache_stats && !is_dryrun) {
             // update stats in io_ctx, for query profile
             _update_stats(stats, io_ctx->file_cache_stats, io_ctx->is_inverted_index);
diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h
index f9ac525d0bef86..abbc4ff12fb735 100644
--- a/be/src/io/cache/file_cache_common.h
+++ b/be/src/io/cache/file_cache_common.h
@@ -148,6 +148,7 @@ struct CacheContext {
             cache_type = FileCacheType::NORMAL;
         }
         query_id = io_context->query_id ? *io_context->query_id : TUniqueId();
+        is_warmup = io_context->is_warmup;
     }
     CacheContext() = default;
     bool operator==(const CacheContext& rhs) const {
@@ -159,6 +160,7 @@ struct CacheContext {
     int64_t expiration_time {0};
     bool is_cold_data {false};
     ReadStatistics* stats;
+    bool is_warmup {false};
 };
 
 template
diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h
index 6934aa6a75a519..82e9ae30ecada2 100644
--- a/be/src/io/io_common.h
+++ b/be/src/io/io_common.h
@@ -85,6 +85,8 @@ struct IOContext {
     // if is_dryrun, read IO will download data to cache but return no data to reader
     // useful to skip cache data read from local disk to accelerate warm up
     bool is_dryrun = false;
+    // if `is_warmup` == true, this I/O request is from a warm up task
+    bool is_warmup {false};
 };
 
 } // namespace io
diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h
index c33e00ba3c598a..bf1c600c0cebb7 100644
--- a/be/src/olap/base_tablet.h
+++ b/be/src/olap/base_tablet.h
@@ -50,6 +50,28 @@ struct TabletWithVersion {
     int64_t version;
 };
 
+struct CaptureRsReaderOptions {
+    // Used by local mode only.
+    // If true, allows skipping missing versions during rowset capture.
+    // This can be useful when some versions are temporarily unavailable.
+    bool skip_missing_version {false};
+
+    // ======== only take effect in cloud mode ========
+
+    // Enable preference for cached/warmed-up rowsets when building version paths.
+    // When enabled, the capture process will prioritize already cached rowsets
+    // to avoid cold data reads and improve query performance.
+    bool enable_prefer_cached_rowset {false};
+
+    // Query freshness tolerance in milliseconds.
+ // Defines the time window for considering data as "fresh enough". + // Rowsets that became visible within this time range can be skipped if not warmed up, + // but older rowsets (before current_time - query_freshness_tolerance_ms) that are + // not warmed up will trigger fallback to normal capture. + // Set to -1 to disable freshness tolerance checking. + int64_t query_freshness_tolerance_ms {-1}; +}; + enum class CompactionStage { NOT_SCHEDULED, PENDING, EXECUTING }; // Base class for all tablet classes @@ -113,7 +135,7 @@ class BaseTablet : public std::enable_shared_from_this { virtual Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) = 0; + const CaptureRsReaderOptions& opts) = 0; virtual size_t tablet_footprint() = 0; @@ -297,6 +319,11 @@ class BaseTablet : public std::enable_shared_from_this { void traverse_rowsets(std::function visitor, bool include_stale = false) { std::shared_lock rlock(_meta_lock); + traverse_rowsets_unlocked(visitor, include_stale); + } + + void traverse_rowsets_unlocked(std::function visitor, + bool include_stale = false) { for (auto& [v, rs] : _rs_version_map) { visitor(rs); } diff --git a/be/src/olap/rowset/rowset.cpp b/be/src/olap/rowset/rowset.cpp index c318cce9a4228e..0ca50b118e1385 100644 --- a/be/src/olap/rowset/rowset.cpp +++ b/be/src/olap/rowset/rowset.cpp @@ -234,6 +234,10 @@ int64_t Rowset::approximate_cache_index_size() { return total_cache_size; } +std::chrono::time_point Rowset::visible_timestamp() const { + return _rowset_meta->visible_timestamp(); +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index ae68eb0bcf45da..9ab9ca3356b5f7 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -327,6 +327,8 @@ class Rowset : public std::enable_shared_from_this, public MetadataAdder int64_t approximate_cache_index_size(); + std::chrono::time_point visible_timestamp() const; + protected: friend class RowsetFactory; diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h index 47752181750964..6d536d5c2a2d7b 100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -368,6 +369,22 @@ class RowsetMeta : public MetadataAdder { int64_t newest_write_timestamp() const { return _rowset_meta_pb.newest_write_timestamp(); } + // for cloud only + bool has_visible_ts_ms() const { return _rowset_meta_pb.has_visible_ts_ms(); } + int64_t visible_ts_ms() const { return _rowset_meta_pb.visible_ts_ms(); } + std::chrono::time_point visible_timestamp() const { + using namespace std::chrono; + if (has_visible_ts_ms()) { + return time_point(milliseconds(visible_ts_ms())); + } + return system_clock::from_time_t(newest_write_timestamp()); + } +#ifdef BE_TEST + void set_visible_ts_ms(int64_t visible_ts_ms) { + _rowset_meta_pb.set_visible_ts_ms(visible_ts_ms); + } +#endif + void set_tablet_schema(const TabletSchemaSPtr& tablet_schema); void set_tablet_schema(const TabletSchemaPB& tablet_schema); diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 489eea4e439a30..ff71371f1bfd68 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -1007,11 +1007,11 @@ Status Tablet::capture_consistent_rowsets_unlocked(const Version& spec_version, } Status Tablet::capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) { + const 
CaptureRsReaderOptions& opts) { std::shared_lock rlock(_meta_lock); std::vector version_path; RETURN_IF_ERROR(capture_consistent_versions_unlocked(spec_version, &version_path, - skip_missing_version, false)); + opts.skip_missing_version, false)); RETURN_IF_ERROR(capture_rs_readers_unlocked(version_path, rs_splits)); return Status::OK(); } diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index be09bb64320bb6..28139a64f20ad9 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -197,9 +197,9 @@ class Tablet final : public BaseTablet { Status capture_consistent_rowsets_unlocked( const Version& spec_version, std::vector* rowsets) const override; - // If skip_missing_version is true, skip versions if they are missing. + // If opts.skip_missing_version is true, skip versions if they are missing. Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) override; + const CaptureRsReaderOptions& opts) override; // Find the missed versions until the spec_version. // diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 6101a96f5cf379..d858d577bc3e6a 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -249,7 +249,11 @@ class TabletMeta : public MetadataAdder { void remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version); bool enable_unique_key_merge_on_write() const { return _enable_unique_key_merge_on_write; } - +#ifdef BE_TEST + void set_enable_unique_key_merge_on_write(bool value) { + _enable_unique_key_merge_on_write = value; + } +#endif // TODO(Drogon): thread safety const BinlogConfig& binlog_config() const { return _binlog_config; } void set_binlog_config(BinlogConfig binlog_config) { diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp index b769c5895e1ce4..1894e0953d3c4f 100644 --- a/be/src/olap/version_graph.cpp +++ b/be/src/olap/version_graph.cpp @@ -25,6 +25,7 @@ #include // IWYU pragma: keep #include #include +#include #include #include @@ -336,6 +337,27 @@ Status TimestampedVersionTracker::capture_consistent_versions( return _version_graph.capture_consistent_versions(spec_version, version_path); } +Status TimestampedVersionTracker::capture_consistent_versions_with_validator( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_with_validator(spec_version, version_path, + validator); +} + +Status TimestampedVersionTracker::capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_prefer_cache(spec_version, version_path, + validator); +} + +Status TimestampedVersionTracker::capture_consistent_versions_with_validator_mow( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_with_validator_mow(spec_version, version_path, + validator); +} + void TimestampedVersionTracker::capture_expired_paths( int64_t stale_sweep_endtime, std::vector* path_version_vec) const { std::map::const_iterator iter = @@ -413,6 +435,10 @@ double TimestampedVersionTracker::get_orphan_vertex_ratio() { return _version_graph.get_orphan_vertex_ratio(); } +std::string TimestampedVersionTracker::debug_string() const { + return _version_graph.debug_string(); +} + void 
TimestampedVersionPathContainer::add_timestamped_version(TimestampedVersionSharedPtr version) { // Compare and refresh `_max_create_time`. if (version->get_create_time() > _max_create_time) { @@ -635,6 +661,172 @@ Status VersionGraph::capture_consistent_versions(const Version& spec_version, return Status::OK(); } +Status VersionGraph::capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == spec_version.first) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); + } + + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { + int64_t next_idx = -1; + int64_t first_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. + if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + if (first_idx == -1) { + first_idx = it; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else if (first_idx != -1) { + // if all edges are not in cache, use the first edge if possible + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[first_idx].value - 1); + cur_idx = first_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + +Status VersionGraph::capture_consistent_versions_with_validator( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == spec_version.first) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); + } + + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { + int64_t next_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. + if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + +Status VersionGraph::capture_consistent_versions_with_validator_mow( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. 
spec_version={}-{}", spec_version.first, + spec_version.second); + } + + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == spec_version.first) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); + } + + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { + int64_t next_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. + if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + if (_version_graph[cur_idx].value + 1 == _version_graph[it].value) { + break; + } + end_value = std::min(_version_graph[it].value, end_value); + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + double VersionGraph::get_orphan_vertex_ratio() { int64_t vertex_num = _version_graph.size(); int64_t orphan_vertex_num = 0; @@ -646,5 +838,21 @@ double VersionGraph::get_orphan_vertex_ratio() { return static_cast(orphan_vertex_num) / static_cast(vertex_num); } +std::string VersionGraph::debug_string() const { + std::stringstream ss; + ss << "VersionGraph: ["; + for (size_t i = 0; i < _version_graph.size(); ++i) { + ss << "{value: " << _version_graph[i].value << ", edges: ["; + for (const auto& edge : _version_graph[i].edges) { + if (_version_graph[edge].value > _version_graph[i].value) { + ss << _version_graph[edge].value << ", "; + } + } + ss << "]}, "; + } + ss << "]"; + return ss.str(); +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h index 56d07a52871ae7..4c65d9208614c1 100644 --- a/be/src/olap/version_graph.h +++ b/be/src/olap/version_graph.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -55,9 +56,40 @@ class VersionGraph { Status capture_consistent_versions(const Version& spec_version, std::vector* version_path) const; + Status capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + + // Given a start, this method can find a version path which satisfy the following conditions: + // 1. all edges satisfy the conditions specified by `validator` in the graph. + // 2. the destination version is as far as possible. + // 3. the path is the shortest path. + // The version paths are added to version_path as return info. + // If this version not in main version, version_path can be included expired rowset. + // NOTE: this method may return edges which is in stale path + // + // @param validator: Function that takes (start_version, end_version) representing a rowset + // and returns true if the rowset should be included in the path, false to skip it + Status capture_consistent_versions_with_validator( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + + // Capture consistent versions with validator for merge-on-write (MOW) tables. + // Similar to capture_consistent_versions_with_validator but with special handling for MOW tables. 
+
 double VersionGraph::get_orphan_vertex_ratio() {
     int64_t vertex_num = _version_graph.size();
     int64_t orphan_vertex_num = 0;
@@ -646,5 +838,21 @@ double VersionGraph::get_orphan_vertex_ratio() {
     return static_cast<double>(orphan_vertex_num) / static_cast<double>(vertex_num);
 }
 
+std::string VersionGraph::debug_string() const {
+    std::stringstream ss;
+    ss << "VersionGraph: [";
+    for (size_t i = 0; i < _version_graph.size(); ++i) {
+        ss << "{value: " << _version_graph[i].value << ", edges: [";
+        for (const auto& edge : _version_graph[i].edges) {
+            if (_version_graph[edge].value > _version_graph[i].value) {
+                ss << _version_graph[edge].value << ", ";
+            }
+        }
+        ss << "]}, ";
+    }
+    ss << "]";
+    return ss.str();
+}
+
 #include "common/compile_check_end.h"
 } // namespace doris
diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h
index 56d07a52871ae7..4c65d9208614c1 100644
--- a/be/src/olap/version_graph.h
+++ b/be/src/olap/version_graph.h
@@ -20,6 +20,7 @@
 #include
 #include
+#include <functional>
 #include
 #include
 #include
@@ -55,9 +56,40 @@ class VersionGraph {
     Status capture_consistent_versions(const Version& spec_version,
                                        std::vector<Version>* version_path) const;
 
+    Status capture_consistent_versions_prefer_cache(
+            const Version& spec_version, std::vector<Version>& version_path,
+            const std::function<bool(int64_t, int64_t)>& validator) const;
+
+    // Given a start version, this method finds a version path which satisfies the following
+    // conditions:
+    //   1. every edge on the path satisfies the condition specified by `validator`;
+    //   2. the destination version is as far as possible;
+    //   3. the path is the shortest such path.
+    // The captured versions are appended to `version_path`.
+    // If the requested version is not on the main path, `version_path` may include expired
+    // rowsets.
+    // NOTE: this method may return edges which are on a stale path.
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    //                   and returns true if the rowset should be included in the path, false to
+    //                   skip it.
+    Status capture_consistent_versions_with_validator(
+            const Version& spec_version, std::vector<Version>& version_path,
+            const std::function<bool(int64_t, int64_t)>& validator) const;
+
+    // Capture consistent versions with validator for merge-on-write (MOW) tables.
+    // Similar to capture_consistent_versions_with_validator, but with special handling for MOW
+    // tables: newly generated delete bitmap marks are applied to the rowsets in the newest
+    // layout, so only rowsets in the newest data layout can be captured to ensure data
+    // correctness.
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    //                   and returns true if the rowset is warmed up, false if not.
+    Status capture_consistent_versions_with_validator_mow(
+            const Version& spec_version, std::vector<Version>& version_path,
+            const std::function<bool(int64_t, int64_t)>& validator) const;
+
     // See comment of TimestampedVersionTracker's get_orphan_vertex_ratio();
     double get_orphan_vertex_ratio();
 
+    std::string debug_string() const;
+
 private:
     /// Private method add a version to graph.
     void _add_vertex_to_graph(int64_t vertex_value);
@@ -168,6 +200,35 @@ class TimestampedVersionTracker {
     Status capture_consistent_versions(const Version& spec_version,
                                        std::vector<Version>* version_path) const;
 
+    Status capture_consistent_versions_prefer_cache(
+            const Version& spec_version, std::vector<Version>& version_path,
+            const std::function<bool(int64_t, int64_t)>& validator) const;
+
+    // Given a start version, this method finds a version path which satisfies the following
+    // conditions:
+    //   1. every edge on the path satisfies the condition specified by `validator`;
+    //   2. the destination version is as far as possible;
+    //   3. the path is the shortest such path.
+    // The captured versions are appended to `version_path`.
+    // If the requested version is not on the main path, `version_path` may include expired
+    // rowsets.
+    // NOTE: this method may return edges which are on a stale path.
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    //                   and returns true if the rowset should be included in the path, false to
+    //                   skip it.
+    Status capture_consistent_versions_with_validator(
+            const Version& spec_version, std::vector<Version>& version_path,
+            const std::function<bool(int64_t, int64_t)>& validator) const;
+
+    // Capture consistent versions with validator for merge-on-write (MOW) tables.
+    // Similar to capture_consistent_versions_with_validator, but with special handling for MOW
+    // tables: newly generated delete bitmap marks are applied to the rowsets in the newest
+    // layout, so only rowsets in the newest data layout can be captured to ensure data
+    // correctness.
+    //
+    // @param validator: Function that takes (start_version, end_version) representing a rowset
+    //                   and returns true if the rowset is warmed up, false if not.
+    Status capture_consistent_versions_with_validator_mow(
+            const Version& spec_version, std::vector<Version>& version_path,
+            const std::function<bool(int64_t, int64_t)>& validator) const;
+
     /// Capture all expired path version.
     /// When the last rowset create time of a path greater than expired time which can be expressed
     /// "now() - tablet_rowset_stale_sweep_time_sec" , this path will be remained.
@@ -193,6 +254,8 @@
     // If a vertex is no longer the starting point of any edge, then this vertex is defined as orphan vertex
     double get_orphan_vertex_ratio();
 
+    std::string debug_string() const;
+
 private:
     /// Construct rowsets version tracker with main path rowset meta.
    void _construct_versioned_tracker(const std::vector<RowsetMetaSharedPtr>& rs_metas);
diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp
index b89575db580ec2..8aab3c72512831 100644
--- a/be/src/pipeline/exec/olap_scan_operator.cpp
+++ b/be/src/pipeline/exec/olap_scan_operator.cpp
@@ -690,10 +690,16 @@ Status OlapScanLocalState::prepare(RuntimeState* state) {
         }
     }
 
+    CaptureRsReaderOptions opts {
+            .skip_missing_version = _state->skip_missing_version(),
+            .enable_prefer_cached_rowset =
+                    config::is_cloud_mode() ? _state->enable_prefer_cached_rowset() : false,
+            .query_freshness_tolerance_ms =
+                    config::is_cloud_mode() ? _state->query_freshness_tolerance_ms() : -1,
+    };
     for (size_t i = 0; i < _scan_ranges.size(); i++) {
         RETURN_IF_ERROR(_tablets[i].tablet->capture_rs_readers({0, _tablets[i].version},
-                                                               &_read_sources[i].rs_splits,
-                                                               _state->skip_missing_version()));
+                                                               &_read_sources[i].rs_splits, opts));
         if (!PipelineXLocalState<>::_state->skip_delete_predicate()) {
             _read_sources[i].fill_delete_predicates();
         }
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
index c6ab70ad85c2c5..9a96b25deb1982 100644
--- a/be/src/runtime/runtime_state.h
+++ b/be/src/runtime/runtime_state.h
@@ -425,6 +425,20 @@ class RuntimeState {
     bool enable_page_cache() const;
 
+    bool enable_prefer_cached_rowset() const {
+        return _query_options.__isset.enable_prefer_cached_rowset &&
+               _query_options.enable_prefer_cached_rowset;
+    }
+
+    int64_t query_freshness_tolerance_ms() const {
+        return _query_options.query_freshness_tolerance_ms;
+    }
+
+    bool enable_query_freshness_tolerance() const {
+        return _query_options.__isset.query_freshness_tolerance_ms &&
+               _query_options.query_freshness_tolerance_ms > 0;
+    }
+
     std::vector<TTabletCommitInfo> tablet_commit_infos() const {
         std::lock_guard lock(_tablet_infos_mutex);
         return _tablet_commit_infos;
diff --git a/be/src/vec/exec/scan/olap_scanner.cpp b/be/src/vec/exec/scan/olap_scanner.cpp
index 1e8a7f9321291e..120a370ae31150 100644
--- a/be/src/vec/exec/scan/olap_scanner.cpp
+++ b/be/src/vec/exec/scan/olap_scanner.cpp
@@ -218,9 +218,15 @@ Status OlapScanner::prepare() {
         ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_hotspot().count(*tablet);
     }
 
+    CaptureRsReaderOptions opts {
+            .skip_missing_version = _state->skip_missing_version(),
+            .enable_prefer_cached_rowset =
+                    config::is_cloud_mode() ? _state->enable_prefer_cached_rowset() : false,
+            .query_freshness_tolerance_ms =
+                    config::is_cloud_mode() ? _state->query_freshness_tolerance_ms() : -1,
+    };
     auto st = tablet->capture_rs_readers(_tablet_reader_params.version,
-                                         &read_source.rs_splits,
-                                         _state->skip_missing_version());
+                                         &read_source.rs_splits, opts);
     if (!st.ok()) {
         LOG(WARNING) << "fail to init reader.res=" << st;
         return st;
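A minimal sketch of the cutoff that `query_freshness_tolerance_ms` implies, under the semantics the tests below assume: a rowset that became visible before `now - tolerance` must be captured, while a newer one may be skipped when its files are not yet in the local file cache.

// Sketch only; mirrors the RowsetMeta::visible_timestamp() semantics used by the tests below.
bool must_be_captured(std::chrono::system_clock::time_point visible_ts,
                      int64_t query_freshness_tolerance_ms) {
    auto cutoff = std::chrono::system_clock::now() -
                  std::chrono::milliseconds(query_freshness_tolerance_ms);
    return visible_ts <= cutoff; // older than the tolerance window: cannot be skipped
}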
diff --git a/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp
new file mode 100644
index 00000000000000..8d4d5a37bf7a49
--- /dev/null
+++ b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp
@@ -0,0 +1,804 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "cloud/cloud_storage_engine.h"
+#include "cloud/cloud_tablet.h"
+#include "olap/base_tablet.h"
+#include "olap/rowset/rowset.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/rowset_meta.h"
+#include "olap/tablet_meta.h"
+#include "util/uid_util.h"
+
+namespace doris {
+
+using namespace std::chrono;
+
+class TestQueryPreferCache : public testing::Test {
+public:
+    TestQueryPreferCache() : _engine(CloudStorageEngine(EngineOptions {})) {}
+
+    void SetUp() override {
+        config::read_cluster_cache_opt_verbose_log = true;
+        _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}},
+                                          UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK,
+                                          TCompressionType::LZ4F));
+    }
+    void TearDown() override { config::read_cluster_cache_opt_verbose_log = false; }
+
+    RowsetSharedPtr create_rowset_without_visible_time(Version version) {
+        auto rs_meta = std::make_shared<RowsetMeta>();
+        rs_meta->set_rowset_type(BETA_ROWSET);
+        rs_meta->set_version(version);
+        rs_meta->set_rowset_id(_engine.next_rowset_id());
+        RowsetSharedPtr rowset;
+        Status st = RowsetFactory::create_rowset(nullptr, "", rs_meta, &rowset);
+        if (!st.ok()) {
+            return nullptr;
+        }
+        return rowset;
+    }
+
+    RowsetSharedPtr create_rowset(Version version,
+                                  time_point<system_clock> visible_timestamp =
+                                          system_clock::now() - seconds(100)) {
+        auto rs = create_rowset_without_visible_time(version);
+        if (!rs) {
+            return nullptr;
+        }
+        rs->rowset_meta()->set_visible_ts_ms(
+                duration_cast<milliseconds>(visible_timestamp.time_since_epoch()).count());
+        return rs;
+    }
+
+    CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false,
+                                                       bool warmup = true) {
+        CloudTabletSPtr tablet = std::make_shared<CloudTablet>(
+                _engine, std::make_shared<TabletMeta>(*_tablet_meta));
+        tablet->tablet_meta()->set_enable_unique_key_merge_on_write(is_mow);
+        std::vector<RowsetSharedPtr> rowsets;
+        auto rs1 = create_rowset(Version {0, 1});
+        rowsets.emplace_back(rs1);
+        tablet->add_warmed_up_rowset(rs1->rowset_id());
+        for (int ver = 2; ver <= max_version; ver++) {
+            auto rs = create_rowset(Version {ver, ver});
+            if (warmup) {
+                tablet->add_warmed_up_rowset(rs->rowset_id());
+            }
+            rowsets.emplace_back(rs);
+        }
+        {
+            std::unique_lock wlock {tablet->get_header_lock()};
+            tablet->add_rowsets(rowsets, false, wlock, false);
+        }
+        return tablet;
+    }
+
+    void add_new_version_rowset(CloudTabletSPtr tablet, int64_t version, bool warmed_up,
+                                time_point<system_clock> visible_timestamp) {
+        auto rowset = create_rowset(Version {version, version}, visible_timestamp);
+        if (warmed_up) {
+            tablet->add_warmed_up_rowset(rowset->rowset_id());
+        }
+        std::unique_lock wlock {tablet->get_header_lock()};
+        tablet->add_rowsets({rowset}, false, wlock, false);
+    }
+
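+    // Simulates a cumulative compaction: rowsets whose versions fall within
+    // [start_version, end_version] are replaced by a single output rowset, which is
+    // optionally marked as warmed up (i.e. already present in the file cache).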
+    void do_cumu_compaction(CloudTabletSPtr tablet, int64_t start_version, int64_t end_version,
+                            bool warmed_up, time_point<system_clock> visible_timestamp) {
+        std::unique_lock wrlock {tablet->get_header_lock()};
+        std::vector<RowsetSharedPtr> input_rowsets;
+        auto output_rowset = create_rowset(Version {start_version, end_version}, visible_timestamp);
+        if (warmed_up) {
+            tablet->add_warmed_up_rowset(output_rowset->rowset_id());
+        }
+        std::ranges::copy_if(std::views::values(tablet->rowset_map()),
+                             std::back_inserter(input_rowsets), [=](const RowsetSharedPtr& rowset) {
+                                 return rowset->version().first >= start_version &&
+                                        rowset->version().first <= end_version;
+                             });
+        if (input_rowsets.size() == 1) {
+            tablet->add_rowsets({output_rowset}, true, wrlock);
+        } else {
+            tablet->delete_rowsets(input_rowsets, wrlock);
+            tablet->add_rowsets({output_rowset}, false, wrlock);
+        }
+    }
+
+    void check_capture_result(CloudTabletSPtr tablet, Version spec_version,
+                              const std::vector<Version>& expected_versions) {
+        std::vector<RowSetSplits> rs_splits;
+        CaptureRsReaderOptions opts {.skip_missing_version = false,
+                                     .enable_prefer_cached_rowset = true,
+                                     .query_freshness_tolerance_ms = -1};
+        auto st = tablet->capture_rs_readers(spec_version, &rs_splits, opts);
+        ASSERT_TRUE(st.ok());
+        auto dump_versions = [](const std::vector<Version>& expected_versions,
+                                const std::vector<RowSetSplits>& splits) {
+            std::vector<std::string> expected_str;
+            for (const auto& version : expected_versions) {
+                expected_str.push_back(version.to_string());
+            }
+            std::vector<std::string> versions;
+            for (const auto& split : splits) {
+                versions.push_back(split.rs_reader->rowset()->version().to_string());
+            }
+            return fmt::format("expected_versions: {}, actual_versions: {}",
+                               fmt::join(expected_str, ", "), fmt::join(versions, ", "));
+        };
+        ASSERT_EQ(rs_splits.size(), expected_versions.size())
+                << dump_versions(expected_versions, rs_splits);
+        for (size_t i = 0; i < rs_splits.size(); i++) {
+            ASSERT_EQ(rs_splits[i].rs_reader->rowset()->version(), expected_versions[i])
+                    << dump_versions(expected_versions, rs_splits);
+        }
+    }
+
+protected:
+    std::string _json_rowset_meta;
+    TabletMetaSharedPtr _tablet_meta;
+
+private:
+    CloudStorageEngine _engine;
+};
+
+TEST_F(TestQueryPreferCache, testCapture_1_1) {
+    /*
+     be startup time                       now-10s                      now
+       now - 30s
+          │                                   │          10s             │
+          │                                   ◄───────────────────────────┤
+┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │
+│        ││ │ in cache│ │in cache ││ │in cache│ │incache│ │
+│        ││ │         │ │         ││ │        │ │       │ │
+│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │
+└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │
+          │                        │                      │
+ now-40s  │   now-20s     now-15s  │   now-7s    now-3s   │
+          │                        │                      │
+          │                        │                      │
+
+    return: [2-10],[11-15],[16-16],[17-17],[18-18]
+    note: We only care about rowsets that are created after the startup time point. For other
+    historical rowsets, we just assume that they are warmed up.
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_1_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │in cache││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_1_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, 
true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_1_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_1) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │ ││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, 
system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_1) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │ │ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │ ││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - 
seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_4_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [11-16]│ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ │ │ │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-16],[17-17],[18-18] + note: 
when there are no warmed up rowset at some vertex, choose the latest edge +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, false, false); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, false, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +} // namespace doris \ No newline at end of file diff --git a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp new file mode 100644 index 00000000000000..5fe5c2e51bcc50 --- /dev/null +++ b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp @@ -0,0 +1,1074 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet.h" +#include "olap/base_tablet.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_meta.h" +#include "olap/tablet_meta.h" +#include "util/uid_util.h" + +namespace doris { + +using namespace std::chrono; + +class TestFreshnessTolerance : public testing::Test { +public: + TestFreshnessTolerance() : _engine(CloudStorageEngine(EngineOptions {})) {} + + void SetUp() override { + config::read_cluster_cache_opt_verbose_log = true; + _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, + UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, + TCompressionType::LZ4F)); + } + void TearDown() override { config::read_cluster_cache_opt_verbose_log = false; } + + RowsetSharedPtr create_rowset_without_visible_time(Version version) { + auto rs_meta = std::make_shared(); + rs_meta->set_rowset_type(BETA_ROWSET); + rs_meta->set_version(version); + rs_meta->set_rowset_id(_engine.next_rowset_id()); + RowsetSharedPtr rowset; + Status st = RowsetFactory::create_rowset(nullptr, "", rs_meta, &rowset); + if (!st.ok()) { + return nullptr; + } + return rowset; + } + + RowsetSharedPtr create_rowset(Version version, + time_point visible_timestamp = system_clock::now() - + seconds(100)) { + auto rs = create_rowset_without_visible_time(version); + if (!rs) { + return nullptr; + } + rs->rowset_meta()->set_visible_ts_ms( + duration_cast(visible_timestamp.time_since_epoch()).count()); + return rs; + } + + CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false) { + CloudTabletSPtr tablet = + std::make_shared(_engine, std::make_shared(*_tablet_meta)); + tablet->tablet_meta()->set_enable_unique_key_merge_on_write(is_mow); + std::vector rowsets; + auto rs1 = create_rowset(Version {0, 1}); + rowsets.emplace_back(rs1); + tablet->add_warmed_up_rowset(rs1->rowset_id()); + for (int ver = 2; ver <= max_version; ver++) { + auto rs = create_rowset(Version {ver, ver}); + tablet->add_warmed_up_rowset(rs->rowset_id()); + rowsets.emplace_back(rs); + } + { + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets(rowsets, false, wlock, false); + } + return tablet; + } + + void add_new_version_rowset(CloudTabletSPtr tablet, int64_t version, bool warmed_up, + time_point visible_timestamp) { + auto rowset = create_rowset(Version {version, version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(rowset->rowset_id()); + } + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets({rowset}, false, wlock, false); + } + + void do_cumu_compaction(CloudTabletSPtr tablet, int64_t start_version, int64_t end_version, + bool warmed_up, time_point visible_timestamp) { + std::unique_lock wrlock {tablet->get_header_lock()}; + std::vector input_rowsets; + auto output_rowset = create_rowset(Version {start_version, end_version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(output_rowset->rowset_id()); + } + std::ranges::copy_if(std::views::values(tablet->rowset_map()), + std::back_inserter(input_rowsets), [=](const RowsetSharedPtr& rowset) { + return rowset->version().first >= start_version && + rowset->version().first <= end_version; + }); + if (input_rowsets.size() == 1) { + tablet->add_rowsets({output_rowset}, true, wrlock); + } else { + tablet->delete_rowsets(input_rowsets, wrlock); + 
tablet->add_rowsets({output_rowset}, false, wrlock); + } + } + + void check_capture_result(CloudTabletSPtr tablet, Version spec_version, + int64_t query_freshness_tolerance_ms, + const std::vector& expected_versions) { + std::vector rs_splits; + CaptureRsReaderOptions opts {.skip_missing_version = false, + .enable_prefer_cached_rowset = false, + .query_freshness_tolerance_ms = query_freshness_tolerance_ms}; + auto st = tablet->capture_rs_readers(spec_version, &rs_splits, opts); + ASSERT_TRUE(st.ok()); + auto dump_versions = [](const std::vector& expected_versions, + const std::vector& splits) { + std::vector expected_str; + for (const auto& version : expected_versions) { + expected_str.push_back(version.to_string()); + } + std::vector versions; + for (const auto& split : splits) { + versions.push_back(split.rs_reader->rowset()->version().to_string()); + } + return fmt::format("expected_versions: {}, actual_versions: {}", + fmt::join(expected_str, ", "), fmt::join(versions, ", ")); + }; + ASSERT_EQ(rs_splits.size(), expected_versions.size()) + << dump_versions(expected_versions, rs_splits); + for (size_t i = 0; i < rs_splits.size(); i++) { + ASSERT_EQ(rs_splits[i].rs_reader->rowset()->version(), expected_versions[i]) + << dump_versions(expected_versions, rs_splits); + } + } + +protected: + std::string _json_rowset_meta; + TabletMetaSharedPtr _tablet_meta; + +private: + CloudStorageEngine _engine; +}; + +TEST_F(TestFreshnessTolerance, testVisibleTimestamp) { + { + // for historical rowset, visible time is not set, RowsetMeta::visible_timestamp() uses + // newest_write_timestamp + auto tp1 = system_clock::now() - seconds(100); + auto rs = create_rowset_without_visible_time({2, 2}); + auto d = duration_cast(tp1.time_since_epoch()).count(); + rs->rowset_meta()->set_newest_write_timestamp(d); + ASSERT_EQ(rs->rowset_meta()->visible_timestamp(), system_clock::from_time_t(d)); + } + + { + // when visible_ts_ms is set, RowsetMeta::visible_timestamp() uses visible_ts_ms which is more precise + auto tp1 = system_clock::now() - seconds(100); + auto tp2 = system_clock::now() - seconds(50); + auto rs = create_rowset_without_visible_time({2, 2}); + auto d1 = duration_cast(tp1.time_since_epoch()).count(); + auto d2 = duration_cast(tp2.time_since_epoch()).count(); + rs->rowset_meta()->set_newest_write_timestamp(d1); + rs->rowset_meta()->set_visible_ts_ms(d2); + ASSERT_EQ(rs->rowset_meta()->visible_timestamp(), + time_point(milliseconds(d2))); + } +} + +TEST_F(TestFreshnessTolerance, testCapture_1_1) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + 
std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_1_2) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │ ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + NOTE: rowset[16-16] should be visible becasue it's within the query freshness tolerance time limit. + However, since the data files of rowset[16-16] is not in the cache, there is no difference between + capturing up to version 16 and capturing up to version 18. So we capture up to version 18. +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_1_3) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16],[17-17] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_1_4) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ 
│in cache ││ │in cache│ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ 
│ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16],[17-17] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │in cache││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ 
[16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_3_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │ │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-17],[18-18] + note: should fallback +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_1_1) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16] +*/ + 
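+    // MOW tablet: the capture cannot skip past a rowset that is not warmed up, because
+    // delete bitmaps are maintained against the newest rowset layout, so the expected
+    // path ends at the last warmed-up rowset, [16-16].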
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_1_2) {
+    /*
+ now-10s now
+
+ │ 10s │
+ ◄───────────────────────────┤
+┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │
+│in cache│ │ in cache│ │ ││ │ │ │ │ │
+│ │ │ │ │ ││ │ │ │ │ │
+│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │
+└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │
+ │ │
+ now-40s now-20s now-15s │ now-7s now-3s │
+ │ │
+ │ │
+ return: [2-10],[11-15],[16-16],[17-17],[18-18]
+ NOTE: rowset[16-16] must be visible because it's within the query freshness tolerance time limit.
+ However, since the data files of rowset[16-16] are not in the cache, there is no difference between
+ capturing up to version 16 and capturing up to version 18
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15},
+                                              {16, 16}, {17, 17}, {18, 18}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_1_3) {
+    /*
+ now-10s now
+
+ │ 10s │
+ ◄───────────────────────────┤
+┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │
+│in cache│ │ in cache│ │in cache ││ │in cache│ │ │ │
+│ │ │ │ │ ││ │ │ │ │ │
+│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │
+└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │
+ │ │
+ now-40s now-20s now-15s │ now-7s now-3s │
+ │ │
+ │ │
+ return: [2-10],[11-15],[16-16],[17-17]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
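+    // check_capture_result (assumed harness behavior) captures a rowset path
+    // for Version {0, 18} under query_freshness_tolerance_ms and compares the
+    // captured version ranges with expected_versions; the status dump below is
+    // only there to help debug a failed capture.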
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_1_4) {
+    /*
+ be startup time now-10s now
+ now - 30s
+ │ │ 10s │
+ │ ◄───────────────────────────┤
+┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │
+│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │
+│ ││ │ │ │ ││ │ │ │ │ │
+│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │
+└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │
+ │ │ │
+ now-40s │ now-20s now-15s │ now-7s now-3s │
+ │ │ │
+ │ │ │
+
+ return: [2-10],[11-15],[16-16],[17-17]
+ note: We only care about rowsets that are created after the startup time point. For other historical rowsets,
+ we just assume that they are warmed up.
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(30));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_1) {
+    /*
+ now-10s now
+ │ 10s │
+ ◄────────────────────────┼
+ │ │
+ ┌────────┐ │┌────────┐ ┌───────┐ │
+ │in cache│ ││ │ │ │ │
+ │ │ ││ │ │ │ │
+ │ [2-10] │ ││ [11-17]│ │[18-18]│ │
+ └────────┘ │└────────┘ └───────┘ │
+ │ │
+ now-40s │ now-1s now-3s │
+┌───────────────────────────────────────────────────────────────────────┐
+│ │ │ │
+│ stale rowsets │ │ │
+│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │
+│ │ in cache│ │in cache │ ││ │ │ │
+│ │ │ │ │ ││ │ │ │
+│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │
+│ └─────────┘ └─────────┘ │└────────┘ │ │
+│ │ │ │
+│ now-20s now-15s │ now-7s │ │
+└───────────────────────────────────────────────────────────────────────┘
+ │ │
+
+ return: [2-10],[11-15],[16-16]
+ */
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_2) {
+    /*
+ now-10s now
+ │ 10s │
+ ◄────────────────────────┼
+ │ │
+ ┌────────┐ │┌────────┐ ┌───────┐ │
+ │in cache│ ││ │ │ │ │
+ │ │ ││ │ │ │ │
+ │ [2-10] │ ││ [11-17]│ │[18-18]│ │
+ └────────┘ │└────────┘ └───────┘ │
+ │ │
+ now-40s │ now-1s now-3s │
+┌───────────────────────────────────────────────────────────────────────┐
+│ │ │ │
+│ stale rowsets │ │ │
+│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │
+│ │ in cache│ │in cache │ ││in cache│ │ │
+│ │ │ │ │ ││ │ │ │
+│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │
+│ └─────────┘ └─────────┘ │└────────┘ │ │
+│ │ │ │
+│ now-20s now-15s │ now-7s │ │
+└───────────────────────────────────────────────────────────────────────┘
+ │ │
+
+ return: [2-10],[11-15],[16-16],[17-17]
+ */
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_3) {
+    /*
+ now-10s now
+ │ 10s │
+ ◄────────────────────────┼
+ │ │
+ ┌────────┐ │┌────────┐ ┌────────┐│
+ │in cache│ ││ │ │ ││
+ │ │ ││ │ │in cache││
+ │ [2-10] │ ││ [11-17]│ │[18-18] ││
+ └────────┘ │└────────┘ └────────┘│
+ │ │
+ now-40s │ now-1s now-3s │
+┌───────────────────────────────────────────────────────────────────────┐
+│ │ │ │
+│ stale rowsets │ │ │
+│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │
+│ │ in cache│ │in cache │ ││in cache│ │ │
+│ │ │ │ │ ││ │ │ │
+│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │
+│ └─────────┘ └─────────┘ │└────────┘ │ │
+│ │ │ │
+│ now-20s now-15s │ now-7s │ │
+└───────────────────────────────────────────────────────────────────────┘
+ │ │
+ return: [2-10],[11-15],[16-16],[17-17]
+ note: due to the existence of rowset [11-17], we can only capture up to version 17
+ because newly added rowsets may generate delete bitmap marks on [11-17]. If we capture [18-18],
+ we may meet data correctness issues if [18-18] has duplicate rows with [11-17]
+ */
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_4) {
+    /*
+ now-10s now
+ │ 10s │
+ ◄────────────────────────┼
+ │ │
+ ┌────────┐ │ ┌────────┐│
+ │ │ │ │ ││
+ │ │ │ │in cache││
+ │ [2-16] │ │ │[18-18] ││
+ └────────┘ │ └────────┘│
+ │ │
+ now-13s │ now-3s │
+ │ │
+ │ │
+┌───────────────────────────────────────────────────────────────────────┐
+│ │ │ │
+│ stale rowsets │ │ │
+│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │
+│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │
+│ │ │ │ │ │ │ ││ │ │ │
+│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │
+│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │
+│ │ │ │
+│ now-40s now-20s now-15s │ now-7s │ │
+│ │ │ │
+│ │ │ │
+└───────────────────────────────────────────────────────────────────────┘
+ │ │
+ return: [2-10],[11-15],[16-16]
+ note: due to the existence of rowset [2-16], we can only capture up to version 16
+ because newly added rowsets may generate delete bitmap marks on [2-16]. If we capture [17-17],
+ we may meet data correctness issues if [17-17] has duplicate rows with [2-16]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_5) {
+    /*
+ now-10s now
+ │ 10s │
+ ◄────────────────────────┼
+ │ │
+ │ ┌────────┐ ┌────────┐│
+ │ │ │ │ ││
+ │ │ │ │in cache││
+ │ │ [2-17] │ │[18-18] ││
+ │ └────────┘ └────────┘│
+ │ │
+ │ now-1s now-3s │
+ │ │
+ │ │
+┌───────────────────────────────────────────────────────────────────────┐
+│ │ │ │
+│ stale rowsets │ │ │
+│ │ │ │
+│ ┌────────┐ │ │ │
+│ │ │ │ │ │
+│ │ │ │ │ │
+│ │ [2-16] │ │ │ │
+│ └────────┘ │ │ │
+│ │ │ │
+│ now-13s │ │ │
+│ │ │ │
+│ │ │ │
+│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │
+│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │
+│ │ │ │ │ │ │ ││ │ │ │
+│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │
+│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │
+│ │ │ │
+│ now-40s now-20s now-15s │ now-7s │ │
+│ │ │ │
+│ │ │ │
+└───────────────────────────────────────────────────────────────────────┘
+ │ │
+ return: [2-10],[11-15],[16-16]
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13));
+    do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_2_6) {
+    /*
+ now-10s now
+ │ 10s │
+ ◄────────────────────────┼
+ │ │
+ │ ┌────────┐ ┌────────┐│
+ │ │ │ │ ││
+ │ │ │ │in cache││
+ │ │ [2-17] │ │[18-18] ││
+ │ └────────┘ └────────┘│
+ │ │
+ │ now-1s now-3s │
+ │ │
+ │ │
+┌───────────────────────────────────────────────────────────────────────┐
+│ │ │ │
+│ stale rowsets │ │ │
+│ │ │ │
+│ ┌────────┐ │ │ │
+│ │ │ │ │ │
+│ │ │ │ │ │
+│ │ [2-16] │ │ │ │
+│ └────────┘ │ │ │
+│ │ │ │
+│ now-13s │ │ │
+│ │ │ │
+│ │ │ │
+│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │
+│ │in cache│ │ │ │in cache │ ││in cache│ │ │
+│ │ │ │ │ │ │ ││ │ │ │
+│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │
+│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │
+│ │ │ │
+│ now-40s now-20s now-15s │ now-7s │ │
+│ │ │ │
+│ │ │ │
+└───────────────────────────────────────────────────────────────────────┘
+ │ │
+ return: [2-17],[18-18]
+ note: because rowset [11-15] is not warmed up, we can only choose a path whose max version is below 15,
+ but rowset version 16 is within the query freshness tolerance time limit. So we should fall back to
+ capturing rowsets up to the tablet's max version
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, false, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13));
+    do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 17}, {18, 18}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+
+TEST_F(TestFreshnessTolerance, testCaptureMow_3_1) {
+    /*
+ now-10s now
+ │ 10s │
+ ◄────────────────────────┼
+ │ │
+ ┌────────┐ │┌────────┐ ┌───────┐ │
+ │in cache│ ││ │ │ │ │
+ │ │ ││ │ │ │ │
+ │ [2-10] │ ││ [11-17]│ │[18-18]│ │
+ └────────┘ │└────────┘ └───────┘ │
+ │ │
+ now-40s │ now-1s now-3s │
+┌───────────────────────────────────────────────────────────────────────┐
+│ │ │ │
+│ stale rowsets │ │ │
+│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │
+│ │ in cache│ │ │ ││ │ │ │
+│ │ │ │ │ ││ │ │ │
+│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │
+│ └─────────┘ └─────────┘ │└────────┘ │ │
+│ │ │ │
+│ now-20s now-15s │ now-7s │ │
+└───────────────────────────────────────────────────────────────────────┘
+ │ │
+
+ return: [2-10],[11-17],[18-18]
+ note: should fall back
+*/
+    _engine.set_startup_timepoint(system_clock::now() - seconds(200));
+    auto tablet = create_tablet_with_initial_rowsets(15, true);
+    do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40));
+    do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20));
+    add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15));
+    add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7));
+    add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3));
+    do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1));
+
+    std::string compaction_status;
+    tablet->get_compaction_status(&compaction_status);
+    std::cout << compaction_status << std::endl;
+
+    int64_t query_freshness_tolerance_ms = 10000; // 10s
+    std::vector<Version> expected_versions = {{0, 1}, {2, 10}, {11, 17}, {18, 18}};
+    check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions);
+}
+} // namespace doris
diff --git a/be/test/cloud/cloud_tablet_test.cpp
b/be/test/cloud/cloud_tablet_test.cpp index 5ec3df0417591c..fe9751ff7bfbc9 100644 --- a/be/test/cloud/cloud_tablet_test.cpp +++ b/be/test/cloud/cloud_tablet_test.cpp @@ -135,7 +135,8 @@ TEST_F(CloudTabletWarmUpStateTest, TestAddDuplicateRowsetWarmupState) { TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupNonExistent) { auto non_existent_id = _engine.next_rowset_id(); - WarmUpState result = _tablet->complete_rowset_segment_warmup(non_existent_id, Status::OK()); + WarmUpState result = + _tablet->complete_rowset_segment_warmup(non_existent_id, Status::OK(), 1, 0); EXPECT_EQ(result, WarmUpState::NONE); } @@ -151,12 +152,12 @@ TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupPartial) { // Complete one segment, should still be in TRIGGERED_BY_JOB state WarmUpState result1 = - _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); // Complete second segment, should still be in TRIGGERED_BY_JOB state WarmUpState result2 = - _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); EXPECT_EQ(result2, WarmUpState::TRIGGERED_BY_JOB); // Verify current state is still TRIGGERED_BY_JOB @@ -176,12 +177,67 @@ TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupFull) { // Complete first segment WarmUpState result1 = - _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_SYNC_ROWSET); // Complete second segment, should transition to DONE state WarmUpState result2 = - _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result2, WarmUpState::DONE); + + // Verify final state is DONE + WarmUpState final_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(final_state, WarmUpState::DONE); +} + +// Test complete_rowset_segment_warmup with inverted index file, partial completion +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithInvertedIndexPartial) { + auto rowset = create_rowset(Version(6, 6), 1); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + + // Complete one segment file + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); + + // Complete inverted index file, should still be in TRIGGERED_BY_JOB state + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 0, 1); + EXPECT_EQ(result2, WarmUpState::TRIGGERED_BY_JOB); + + // Verify current state is still TRIGGERED_BY_JOB + WarmUpState current_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(current_state, WarmUpState::TRIGGERED_BY_JOB); +} + +// Test complete_rowset_segment_warmup with inverted index file, full completion +TEST_F(CloudTabletWarmUpStateTest, 
TestCompleteRowsetSegmentWarmupWithInvertedIndexFull) { + auto rowset = create_rowset(Version(6, 6), 1); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + + // Complete segment file + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); + + // Complete inverted index file + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 0, 1); EXPECT_EQ(result2, WarmUpState::DONE); // Verify final state is DONE @@ -201,7 +257,8 @@ TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithError) { // Complete with error status, should still transition to DONE when all segments complete Status error_status = Status::InternalError("Test error"); - WarmUpState result = _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), error_status); + WarmUpState result = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), error_status, 1, 0); EXPECT_EQ(result, WarmUpState::DONE); // Verify final state is DONE even with error @@ -235,13 +292,13 @@ TEST_F(CloudTabletWarmUpStateTest, TestMultipleRowsetsWarmupState) { WarmUpState::TRIGGERED_BY_JOB); // Complete rowset1 (2 segments) - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_JOB); - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::DONE); // Complete rowset3 (1 segment) - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset3->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset3->rowset_id(), Status::OK(), 1, 0), WarmUpState::DONE); // Verify states after completion @@ -266,7 +323,8 @@ TEST_F(CloudTabletWarmUpStateTest, TestWarmupStateWithZeroSegments) { EXPECT_EQ(state, WarmUpState::TRIGGERED_BY_JOB); // Any completion call should handle the edge case gracefully - WarmUpState result = _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK()); + WarmUpState result = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); // With 0 segments, the counter should already be 0, so this should transition to DONE EXPECT_EQ(result, WarmUpState::DONE); } @@ -285,11 +343,11 @@ TEST_F(CloudTabletWarmUpStateTest, TestConcurrentWarmupStateAccess) { WarmUpState::TRIGGERED_BY_SYNC_ROWSET)); // Interleaved completion operations - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_JOB); - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset2->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset2->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_SYNC_ROWSET); - EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK()), + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), WarmUpState::TRIGGERED_BY_JOB); // Check states are 
maintained correctly diff --git a/be/test/olap/tablet_test.cpp b/be/test/olap/tablet_test.cpp index ea7ddc24bbf445..8b6700c1f16950 100644 --- a/be/test/olap/tablet_test.cpp +++ b/be/test/olap/tablet_test.cpp @@ -296,11 +296,11 @@ TEST_F(TestTablet, pad_rowset) { Version version(5, 5); std::vector splits; - ASSERT_FALSE(_tablet->capture_rs_readers(version, &splits, false).ok()); + ASSERT_FALSE(_tablet->capture_rs_readers(version, &splits, {}).ok()); splits.clear(); static_cast(PadRowsetAction::_pad_rowset(_tablet.get(), version)); - ASSERT_TRUE(_tablet->capture_rs_readers(version, &splits, false).ok()); + ASSERT_TRUE(_tablet->capture_rs_readers(version, &splits, {}).ok()); } TEST_F(TestTablet, cooldown_policy) { diff --git a/cloud/src/meta-service/meta_service_job.cpp b/cloud/src/meta-service/meta_service_job.cpp index 68da6dcec8888d..0ea6f2d9483a44 100644 --- a/cloud/src/meta-service/meta_service_job.cpp +++ b/cloud/src/meta-service/meta_service_job.cpp @@ -1234,7 +1234,6 @@ void process_compaction_job(MetaServiceCode& code, std::string& msg, std::string return; } - // We don't actually need to parse the rowset meta doris::RowsetMetaCloudPB rs_meta; rs_meta.ParseFromString(tmp_rowset_val); if (rs_meta.txn_id() <= 0) { @@ -1249,9 +1248,22 @@ void process_compaction_job(MetaServiceCode& code, std::string& msg, std::string INSTANCE_LOG(INFO) << "remove tmp rowset meta, tablet_id=" << tablet_id << " tmp_rowset_key=" << hex(tmp_rowset_key); + using namespace std::chrono; + auto rowset_visible_time = + duration_cast(system_clock::now().time_since_epoch()).count(); + rs_meta.set_visible_ts_ms(rowset_visible_time); + std::string rowset_val; + if (!rs_meta.SerializeToString(&rowset_val)) { + code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; + SS << "failed to serialize rowset meta, tablet_id=" << tablet_id + << " rowset_id=" << rowset_id; + msg = ss.str(); + return; + } + int64_t version = compaction.output_versions(0); auto rowset_key = meta_rowset_key({instance_id, tablet_id, version}); - txn->put(rowset_key, tmp_rowset_val); + txn->put(rowset_key, rowset_val); if (is_versioned_write) { std::string meta_rowset_compact_key = versioned::meta_rowset_compact_key({instance_id, tablet_id, version}); @@ -1867,9 +1879,31 @@ void process_schema_change_job(MetaServiceCode& code, std::string& msg, std::str : cast_as(err); return; } + + RowsetMetaCloudPB tmp_rowset_meta; + if (!tmp_rowset_meta.ParseFromString(tmp_rowset_val)) { + code = MetaServiceCode::PROTOBUF_PARSE_ERR; + SS << "malformed tmp rowset meta, unable to deserialize, tablet_id=" << new_tablet_id + << " key=" << hex(tmp_rowset_key); + msg = ss.str(); + return; + } + using namespace std::chrono; + auto rowset_visible_time = + duration_cast(system_clock::now().time_since_epoch()).count(); + tmp_rowset_meta.set_visible_ts_ms(rowset_visible_time); + std::string rowset_val; + if (!tmp_rowset_meta.SerializeToString(&rowset_val)) { + code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; + SS << "failed to serialize rowset meta, tablet_id=" << new_tablet_id + << " rowset_id=" << tmp_rowset_meta.rowset_id_v2(); + msg = ss.str(); + return; + } + auto rowset_key = meta_rowset_key( {instance_id, new_tablet_id, schema_change.output_versions().at(i)}); - txn->put(rowset_key, tmp_rowset_val); + txn->put(rowset_key, rowset_val); txn->remove(tmp_rowset_key); if (is_versioned_write) { doris::RowsetMetaCloudPB rs_meta; diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index ce128f6d30e485..c17b2565456953 100644 --- 
a/cloud/src/meta-service/meta_service_txn.cpp +++ b/cloud/src/meta-service/meta_service_txn.cpp @@ -1316,6 +1316,10 @@ void MetaServiceImpl::commit_txn_immediately( std::vector, const RowsetMetaCloudPB&>> rowsets; std::unordered_map tablet_stats; // tablet_id -> stats rowsets.reserve(tmp_rowsets_meta.size()); + + int64_t rowsets_visible_ts_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + for (auto& [_, i] : tmp_rowsets_meta) { int64_t tablet_id = i.tablet_id(); int64_t partition_id = i.partition_id(); @@ -1338,6 +1342,7 @@ void MetaServiceImpl::commit_txn_immediately( int64_t new_version = versions[partition_id] + 1; i.set_start_version(new_version); i.set_end_version(new_version); + i.set_visible_ts_ms(rowsets_visible_ts_ms); // Accumulate affected rows auto& stats = tablet_stats[tablet_id]; @@ -2328,6 +2333,9 @@ void MetaServiceImpl::commit_txn_with_sub_txn(const CommitTxnRequest* request, commit_txn_log.set_txn_id(txn_id); commit_txn_log.set_db_id(db_id); + int64_t rowsets_visible_ts_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + // -> rowset meta std::vector, RowsetMetaCloudPB>> rowsets; std::unordered_map tablet_stats; // tablet_id -> stats @@ -2361,6 +2369,7 @@ void MetaServiceImpl::commit_txn_with_sub_txn(const CommitTxnRequest* request, } i.set_start_version(new_version); i.set_end_version(new_version); + i.set_visible_ts_ms(rowsets_visible_ts_ms); LOG(INFO) << "xxx update rowset version, txn_id=" << txn_id << ", sub_txn_id=" << sub_txn_id << ", table_id=" << table_id << ", partition_id=" << partition_id << ", tablet_id=" << tablet_id diff --git a/cloud/src/meta-service/txn_lazy_committer.cpp b/cloud/src/meta-service/txn_lazy_committer.cpp index 01192508bba6d4..ddf0cd111e6ad8 100644 --- a/cloud/src/meta-service/txn_lazy_committer.cpp +++ b/cloud/src/meta-service/txn_lazy_committer.cpp @@ -147,6 +147,9 @@ void convert_tmp_rowsets( // tablet_id -> stats std::unordered_map tablet_stats; + int64_t rowsets_visible_ts_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + for (auto& [tmp_rowset_key, tmp_rowset_pb] : tmp_rowsets_meta) { std::string tmp_rowst_data; err = txn->get(tmp_rowset_key, &tmp_rowst_data); @@ -309,6 +312,7 @@ void convert_tmp_rowsets( tmp_rowset_pb.set_start_version(version); tmp_rowset_pb.set_end_version(version); + tmp_rowset_pb.set_visible_ts_ms(rowsets_visible_ts_ms); rowset_val.clear(); if (!tmp_rowset_pb.SerializeToString(&rowset_val)) { diff --git a/cloud/test/meta_service_job_test.cpp b/cloud/test/meta_service_job_test.cpp index b71fee9aa3f231..c30be90e1dad35 100644 --- a/cloud/test/meta_service_job_test.cpp +++ b/cloud/test/meta_service_job_test.cpp @@ -1142,6 +1142,14 @@ TEST(MetaServiceJobTest, CompactionJobTest) { auto rowset_key = meta_rowset_key({instance_id, tablet_id, input_version_end}); std::string rowset_val; EXPECT_EQ(txn->get(rowset_key, &rowset_val), TxnErrorCode::TXN_OK) << hex(rowset_key); + doris::RowsetMetaCloudPB rowset_meta; + ASSERT_TRUE(rowset_meta.ParseFromString(rowset_val)); + ASSERT_TRUE(rowset_meta.has_visible_ts_ms() && rowset_meta.visible_ts_ms() > 0); + using namespace std::chrono; + auto visible_tp = time_point(milliseconds(rowset_meta.visible_ts_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; }; auto test_abort_compaction_job = [&](int64_t table_id, int64_t index_id, int64_t partition_id, @@ -3630,6 +3638,12 @@ 
TEST(MetaServiceJobTest, SchemaChangeJobTest) { EXPECT_EQ(saved_rowset.start_version(), rs.start_version()); EXPECT_EQ(saved_rowset.end_version(), rs.end_version()); EXPECT_EQ(saved_rowset.rowset_id_v2(), rs.rowset_id_v2()); + ASSERT_TRUE(saved_rowset.has_visible_ts_ms() && saved_rowset.visible_ts_ms() > 0); + using namespace std::chrono; + auto visible_tp = time_point(milliseconds(saved_rowset.visible_ts_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; } for (int i = 3; i < 5; ++i) { // [14-14][15-15] auto [k, v] = it->next(); diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index ed51b9cb46eb0d..f45158693dec86 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -11873,4 +11874,66 @@ TEST(MetaServiceTest, CreateTabletIdempotentAndHandlingError) { ASSERT_EQ(res.status().code(), MetaServiceCode::KV_TXN_GET_ERR); } +TEST(MetaServiceTest, RowsetVisibleTimeTest) { + auto meta_service = get_meta_service(); + using namespace std::chrono; + int64_t txn_id = -1; + // begin txn + { + brpc::Controller cntl; + BeginTxnRequest req; + req.set_cloud_unique_id("test_cloud_unique_id"); + TxnInfoPB txn_info_pb; + txn_info_pb.set_db_id(666); + txn_info_pb.set_label("test_label"); + txn_info_pb.add_table_ids(1234); + txn_info_pb.set_timeout_ms(36000); + req.mutable_txn_info()->CopyFrom(txn_info_pb); + BeginTxnResponse res; + meta_service->begin_txn(reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &req, + &res, nullptr); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + txn_id = res.txn_id(); + } + + // mock rowset and tablet + int64_t tablet_id_base = 1103; + for (int i = 0; i < 5; ++i) { + create_tablet(meta_service.get(), 1234, 1235, 1236, tablet_id_base + i); + auto tmp_rowset = create_rowset(txn_id, tablet_id_base + i); + CreateRowsetResponse res; + commit_rowset(meta_service.get(), tmp_rowset, res); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + } + { + brpc::Controller cntl; + CommitTxnRequest req; + req.set_cloud_unique_id("test_cloud_unique_id"); + req.set_db_id(666); + req.set_txn_id(txn_id); + CommitTxnResponse res; + meta_service->commit_txn(reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &req, + &res, nullptr); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + } + + for (int i = 0; i < 5; ++i) { + int64_t tablet_id = tablet_id_base + i; + int64_t ver = 2; + std::string rowset_key = meta_rowset_key({mock_instance, tablet_id, ver}); + std::string val; + std::unique_ptr txn; + ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), TxnErrorCode::TXN_OK); + ASSERT_EQ(txn->get(rowset_key, &val), TxnErrorCode::TXN_OK); + RowsetMetaCloudPB rowset_pb; + ASSERT_TRUE(rowset_pb.ParseFromString(val)); + ASSERT_TRUE(rowset_pb.has_visible_ts_ms()); + std::cout << rowset_pb.visible_ts_ms() << "\n"; + ASSERT_GT(rowset_pb.visible_ts_ms(), 0); + auto visible_tp = time_point(milliseconds(rowset_pb.visible_ts_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; + } +} } // namespace doris::cloud diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java index 
83a1f79b0ffd10..464ec680e9212f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java @@ -1254,6 +1254,24 @@ public Pair isWorkloadGroupInUse(String groupName) { } } + public boolean getEnablePreferCachedRowset(String qualifiedUser) { + readLock(); + try { + return propertyMgr.getEnablePreferCachedRowset(qualifiedUser); + } finally { + readUnlock(); + } + } + + public long getQueryFreshnessToleranceMs(String qualifiedUser) { + readLock(); + try { + return propertyMgr.getQueryFreshnessToleranceMs(qualifiedUser); + } finally { + readUnlock(); + } + } + public void getAllDomains(Set allDomains) { readLock(); try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java index 24a33c5b6e3a9d..277a206aa87c5a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java @@ -68,6 +68,12 @@ public class CommonUserProperties implements GsonPostProcessable { @SerializedName(value = "wg", alternate = {"workloadGroup"}) private String workloadGroup = WorkloadGroupMgr.DEFAULT_GROUP_NAME; + @SerializedName(value = "epcr", alternate = {"enablePreferCachedRowset"}) + private boolean enablePreferCachedRowset = false; + + @SerializedName(value = "qft", alternate = {"queryFreshnessTolerance"}) + private long queryFreshnessToleranceMs = -1; + private String[] sqlBlockRulesSplit = {}; long getMaxConn() { @@ -168,6 +174,22 @@ public void setWorkloadGroup(String workloadGroup) { this.workloadGroup = workloadGroup; } + public long getQueryFreshnessToleranceMs() { + return queryFreshnessToleranceMs; + } + + public void setQueryFreshnessToleranceMs(long queryFreshnessToleranceMs) { + this.queryFreshnessToleranceMs = queryFreshnessToleranceMs; + } + + public boolean getEnablePreferCachedRowset() { + return enablePreferCachedRowset; + } + + public void setEnablePreferCachedRowset(boolean enablePreferCachedRowset) { + this.enablePreferCachedRowset = enablePreferCachedRowset; + } + @Override public void gsonPostProcess() throws IOException { if (!Strings.isNullOrEmpty(sqlBlockRules)) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java index 3d0de61a49b321..6244634ecdb3d8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java @@ -75,6 +75,9 @@ public class UserProperty { public static final String DEFAULT_CLOUD_CLUSTER = "default_cloud_cluster"; public static final String DEFAULT_COMPUTE_GROUP = "default_compute_group"; + public static final String PROP_ENABLE_PREFER_CACHED_ROWSET = "enable_prefer_cached_rowset"; + public static final String PROP_QUERY_FRESHNESS_TOLERANCE = "query_freshness_tolerance_ms"; + // for system user public static final Set ADVANCED_PROPERTIES = Sets.newHashSet(); // for normal user @@ -114,6 +117,8 @@ public class UserProperty { COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_WORKLOAD_GROUP + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_CLOUD_CLUSTER + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_COMPUTE_GROUP + "$", 
Pattern.CASE_INSENSITIVE)); + COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_QUERY_FRESHNESS_TOLERANCE + "$", Pattern.CASE_INSENSITIVE)); + COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_ENABLE_PREFER_CACHED_ROWSET + "$", Pattern.CASE_INSENSITIVE)); } public UserProperty() { @@ -171,6 +176,14 @@ public long getExecMemLimit() { return commonProperties.getExecMemLimit(); } + public long getQueryFreshnessToleranceMs() { + return commonProperties.getQueryFreshnessToleranceMs(); + } + + public boolean getEnablePreferCachedRowset() { + return commonProperties.getEnablePreferCachedRowset(); + } + public void update(List> properties) throws UserException { update(properties, false); } @@ -188,6 +201,8 @@ public void update(List> properties, boolean isReplay) thro int insertTimeout = this.commonProperties.getInsertTimeout(); String initCatalog = this.commonProperties.getInitCatalog(); String workloadGroup = this.commonProperties.getWorkloadGroup(); + long queryFreshnessToleranceMs = this.commonProperties.getQueryFreshnessToleranceMs(); + boolean enablePreferCachedRowset = this.commonProperties.getEnablePreferCachedRowset(); String newDefaultCloudCluster = defaultCloudCluster; @@ -320,6 +335,21 @@ public void update(List> properties, boolean isReplay) thro throw new DdlException("workload group " + value + " not exists"); } workloadGroup = value; + } else if (keyArr[0].equalsIgnoreCase(PROP_QUERY_FRESHNESS_TOLERANCE)) { + // set property "query_freshness_tolerance" = "1000"; + if (keyArr.length != 1) { + throw new DdlException(PROP_QUERY_FRESHNESS_TOLERANCE + " format error"); + } + queryFreshnessToleranceMs = getLongProperty(key, value, keyArr, PROP_QUERY_FRESHNESS_TOLERANCE); + } else if (keyArr[0].equalsIgnoreCase(PROP_ENABLE_PREFER_CACHED_ROWSET)) { + if (keyArr.length != 1) { + throw new DdlException(PROP_ENABLE_PREFER_CACHED_ROWSET + " format error"); + } + try { + enablePreferCachedRowset = Boolean.parseBoolean(value); + } catch (NumberFormatException e) { + throw new DdlException(PROP_ENABLE_PREFER_CACHED_ROWSET + " is not boolean"); + } } else { if (isReplay) { // After using SET PROPERTY to modify the user property, if FE rolls back to a version without @@ -344,6 +374,8 @@ public void update(List> properties, boolean isReplay) thro this.commonProperties.setInsertTimeout(insertTimeout); this.commonProperties.setInitCatalog(initCatalog); this.commonProperties.setWorkloadGroup(workloadGroup); + this.commonProperties.setQueryFreshnessToleranceMs(queryFreshnessToleranceMs); + this.commonProperties.setEnablePreferCachedRowset(enablePreferCachedRowset); defaultCloudCluster = newDefaultCloudCluster; } @@ -441,6 +473,11 @@ public List> fetchProperty() { result.add(Lists.newArrayList(PROP_WORKLOAD_GROUP, String.valueOf(commonProperties.getWorkloadGroup()))); + result.add(Lists.newArrayList(PROP_ENABLE_PREFER_CACHED_ROWSET, + String.valueOf(commonProperties.getEnablePreferCachedRowset()))); + result.add(Lists.newArrayList(PROP_QUERY_FRESHNESS_TOLERANCE, + String.valueOf(commonProperties.getQueryFreshnessToleranceMs()))); + // default cloud cluster if (defaultCloudCluster != null) { result.add(Lists.newArrayList(DEFAULT_CLOUD_CLUSTER, defaultCloudCluster)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java index 7477c5a308211e..e068182308ed16 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java @@ -245,6 +245,24 @@ public Pair isWorkloadGroupInUse(String groupName) { return Pair.of(false, ""); } + public boolean getEnablePreferCachedRowset(String qualifiedUser) { + UserProperty existProperty = propertyMap.get(qualifiedUser); + existProperty = getPropertyIfNull(qualifiedUser, existProperty); + if (existProperty == null) { + return false; + } + return existProperty.getEnablePreferCachedRowset(); + } + + public long getQueryFreshnessToleranceMs(String qualifiedUser) { + UserProperty existProperty = propertyMap.get(qualifiedUser); + existProperty = getPropertyIfNull(qualifiedUser, existProperty); + if (existProperty == null) { + return -1; + } + return existProperty.getQueryFreshnessToleranceMs(); + } + /** * The method determines which user property to return based on the existProperty parameter * and system configuration: diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index e4d7a6bcbb7490..af197bfd73492a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -859,6 +859,9 @@ public static double getHotValueThreshold() { public static final String MULTI_DISTINCT_STRATEGY = "multi_distinct_strategy"; public static final String AGG_PHASE = "agg_phase"; + public static final String ENABLE_PREFER_CACHED_ROWSET = "enable_prefer_cached_rowset"; + public static final String QUERY_FRESHNESS_TOLERANCE_MS = "query_freshness_tolerance_ms"; + static { affectQueryResultFields = Arrays.stream(SessionVariable.class.getDeclaredFields()) .filter(f -> { @@ -2526,6 +2529,14 @@ public boolean isEnableHboNonStrictMatchingMode() { }, checker = "checkSkewRewriteAggBucketNum") public int skewRewriteAggBucketNum = 1024; + @VariableMgr.VarAttr(name = ENABLE_PREFER_CACHED_ROWSET, needForward = false, + description = {"是否启用 prefer cached rowset 功能", + "Whether to enable prefer cached rowset feature"}) + public boolean enablePreferCachedRowset = false; + + @VariableMgr.VarAttr(name = QUERY_FRESHNESS_TOLERANCE_MS, needForward = false) + public long queryFreshnessToleranceMs = -1; + public void setSkewRewriteAggBucketNum(int num) { this.skewRewriteAggBucketNum = num; } @@ -3644,6 +3655,30 @@ public int getParallelExecInstanceNum() { } } + public boolean getEnablePreferCachedRowset() { + ConnectContext connectContext = ConnectContext.get(); + if (connectContext != null && connectContext.getEnv() != null && connectContext.getEnv().getAuth() != null) { + boolean userEnablePreferCachedRowset = connectContext.getEnv().getAuth() + .getEnablePreferCachedRowset(connectContext.getQualifiedUser()); + if (userEnablePreferCachedRowset) { + return userEnablePreferCachedRowset; + } + } + return enablePreferCachedRowset; + } + + public long getQueryFreshnessToleranceMs() { + ConnectContext connectContext = ConnectContext.get(); + if (connectContext != null && connectContext.getEnv() != null && connectContext.getEnv().getAuth() != null) { + long userQueryFreshnessToleranceMs = connectContext.getEnv().getAuth() + .getQueryFreshnessToleranceMs(connectContext.getQualifiedUser()); + if (userQueryFreshnessToleranceMs > 0) { + return userQueryFreshnessToleranceMs; + } + } + return queryFreshnessToleranceMs; + } + public int getExchangeInstanceParallel() { return exchangeInstanceParallel; } @@ -4586,6 +4621,9 @@ public TQueryOptions toThrift() { 
tResult.setSkipBadTablet(skipBadTablet); tResult.setDisableFileCache(disableFileCache); + tResult.setEnablePreferCachedRowset(getEnablePreferCachedRowset()); + tResult.setQueryFreshnessToleranceMs(getQueryFreshnessToleranceMs()); + // for spill tResult.setEnableSpill(enableSpill); tResult.setEnableForceSpill(enableForceSpill); diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java index c8a7c9e037dd56..428fb6fb2f85b0 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java @@ -82,6 +82,8 @@ public void testUpdate() throws UserException { properties.add(Pair.of("sql_block_rules", "rule1,rule2")); properties.add(Pair.of("cpu_resource_limit", "2")); properties.add(Pair.of("query_timeout", "500")); + properties.add(Pair.of("enable_prefer_cached_rowset", "true")); + properties.add(Pair.of("query_freshness_tolerance_ms", "4500")); UserProperty userProperty = new UserProperty(); userProperty.update(properties); @@ -92,6 +94,8 @@ public void testUpdate() throws UserException { Assert.assertEquals(2, userProperty.getCpuResourceLimit()); Assert.assertEquals(500, userProperty.getQueryTimeout()); Assert.assertEquals(Sets.newHashSet(), userProperty.getCopiedResourceTags()); + Assert.assertEquals(true, userProperty.getEnablePreferCachedRowset()); + Assert.assertEquals(4500, userProperty.getQueryFreshnessToleranceMs()); // fetch property List> rows = userProperty.fetchProperty(); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java index 767de59cae7f85..1789d79acd0f77 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java @@ -307,7 +307,7 @@ public void test() throws Exception { Assert.assertEquals(1000000, execMemLimit); List> userProps = Env.getCurrentEnv().getAuth().getUserProperties(Auth.ROOT_USER); - Assert.assertEquals(13, userProps.size()); + Assert.assertEquals(15, userProps.size()); // now : // be1 be2 be3 ==>tag1; diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index a1d1719c5ebe51..3dda0c132a9485 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -148,6 +148,8 @@ message RowsetMetaPB { optional SchemaDictKeyList schema_dict_key_list = 1008; // align to cloud rowset optional SplitSchemaPB __split_schema = 1009; // A special field, DO NOT change it. + + optional int64 visible_ts_ms = 1010; } message SchemaDictKeyList { @@ -243,6 +245,8 @@ message RowsetMetaCloudPB { repeated InvertedIndexFileInfo inverted_index_file_info = 107; optional SplitSchemaPB __split_schema = 108; // A special field, DO NOT change it. 
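+    // Wall-clock time (ms since epoch) at which this rowset became visible;
+    // set by the meta-service when a txn commits, a compaction output is
+    // committed, or a schema change output is committed (see the
+    // meta_service_txn.cpp / meta_service_job.cpp changes in this patch).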
+ + optional int64 visible_ts_ms = 109; } message SegmentStatisticsPB { diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 6e749c266c83a2..71179e17c487ae 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -404,6 +404,9 @@ struct TQueryOptions { 171: optional bool optimize_index_scan_parallelism = false; + 172: optional bool enable_prefer_cached_rowset + 173: optional i64 query_freshness_tolerance_ms + // For cloud, to control if the content would be written into file cache // In write path, to control if the content would be written into file cache. // In read path, read from file cache or remote storage when execute query. diff --git a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out new file mode 100644 index 00000000000000..04cf3be33e8192 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out @@ -0,0 +1,32 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster1 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster2_0 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster1_new_data -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + +-- !cluster2_1 -- +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 +6 {"a":1111.11111} 0 7 + +-- !cluster2_2 -- +1 {"a":1} 0 2 +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 +6 {"a":1111.11111} 0 7 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out new file mode 100644 index 00000000000000..b99240d21e24ff --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out @@ -0,0 +1,24 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster1 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster2_0 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster1_new_data -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + +-- !cluster2_1 -- +1 {"a":1} 0 2 +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out new file mode 100644 index 00000000000000..7cefab58718a8e --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !cluster2 -- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out new file mode 100644 index 00000000000000..8191de7859e2eb --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2_0 -- +3 3 +4 4 +5 5 +6 6 + +-- !cluster2_1 -- +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 +9 9 0 8 + +-- !cluster2_2 -- +1 1 0 2 +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out new file mode 100644 index 00000000000000..e7fffb3bcb7ec1 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out @@ -0,0 +1,11 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2_0 -- +2 2 + +-- !cluster2_1 -- +1 \N 1 3 +2 2 0 2 +9 9 0 4 + +-- !cluster2_2 -- + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out new file mode 100644 index 00000000000000..b915a1b6c9449a --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2_0 -- +3 3 +4 4 +5 5 +6 6 + +-- !cluster2_1 -- +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 +9 9 0 8 + +-- !cluster2 -- +1 1 0 2 +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out new file mode 100644 index 00000000000000..59c500c665d402 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out @@ -0,0 +1,10 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2 -- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +9 9 + diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy new file mode 100644 index 00000000000000..34ca1d7e8a4b4e --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_enable_prefer_cached_rowset', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(2000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def injectCompactionRowsetDownloadSlow = {cluster, sleep_s -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + def injectName = 'CloudTablet::add_rowsets.download_data.callback.block_compaction_rowset' + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + GetDebugPoint().enableDebugPoint(ip, port as int, NodeType.BE, injectName, [sleep:sleep_s]) + } + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + 
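+        // enable_warmup_immediately_on_new_rowset (assumed behavior, inferred
+        // from its name and usage here): the read cluster starts warming up a
+        // rowset's files as soon as it syncs the rowset meta, rather than
+        // waiting for a periodic warmup job, so the queries below can observe
+        // warmup state deterministically.
+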
updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + sql """ + create table test ( + col0 int not null, + col1 variant NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """insert into test values (1, '{"a" : 1.0}')""" + sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);""" + sql """insert into test values (3, '{"a" : "11111"}')""" + sql """insert into test values (4, '{"a" : 1111111111}')""" + sql """insert into test values (5, '{"a" : 1111.11111}')""" + + sql """use @${clusterName1}""" + qt_cluster1 """select * from test""" + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_cluster2_0 """select * from test""" + + // switch to source cluster and trigger compaction + sql """use @${clusterName1}""" + trigger_and_wait_compaction("test", "cumulative") + // load new data to increase the version + sql """insert into test values (6, '{"a" : 1111.11111}')""" + qt_cluster1_new_data "select * from test;" + + // inject to let cluster2 read compaction rowset data slowly + injectCompactionRowsetDownloadSlow(clusterName2, 10) + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + // when enable_prefer_cached_rowset = false, the query needs to read all data, including compaction rowsets + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + + sql "set enable_prefer_cached_rowset = true" + // when enable_prefer_cached_rowset = true, the query only needs to read the newly loaded data; compaction rowset data will be skipped + def t1 = System.currentTimeMillis() + def capturePreferCacheCount = getBrpcMetricsByCluster(clusterName2, "capture_prefer_cache_count") + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 2000 + assert getBrpcMetricsByCluster(clusterName2, "capture_prefer_cache_count") == capturePreferCacheCount + 1 + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy new file mode 100644 index 00000000000000..215e588137ed8e --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_query_freshness_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(2000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + def ret = matcher[0][1] as long + logger.info("getBrpcMetrics, ${url}, name:${name}, value:${ret}") + return ret + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def injectS3FileReadSlow = {cluster, sleep_s -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + def injectName = 'S3FileReader::read_at_impl.io_slow' + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + GetDebugPoint().enableDebugPoint(ip, port as int, NodeType.BE, injectName, [sleep:sleep_s]) + } + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + sql """ + create table test ( + col0 int not null, + col1 variant NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES 
("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """insert into test values (1, '{"a" : 1.0}')""" + sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);""" + sql """insert into test values (3, '{"a" : "11111"}')""" + sql """insert into test values (4, '{"a" : 1111111111}')""" + sql """insert into test values (5, '{"a" : 1111.11111}')""" + + sql """use @${clusterName1}""" + qt_cluster1 """select * from test""" + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_cluster2_0 """select * from test""" + + // sleep for 5s to let these rowsets meet the requirement of query freshness tolerance + sleep(5000) + + // switch to source cluster and trigger compaction + sql """use @${clusterName1}""" + trigger_and_wait_compaction("test", "cumulative") + // load new data to increase the version + sql """insert into test values (6, '{"a" : 1111.11111}')""" + qt_cluster1_new_data "select * from test;" + + // inject to let cluster2 read compaction rowset data slowly + injectS3FileReadSlow(clusterName2, 10) + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // when query_freshness_tolerance_ms is set, the newly loaded data and compaction rowset data will be skipped + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + // a query with freshness tolerance should not fall back + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy new file mode 100644 index 00000000000000..3ce4ee58f4b771 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy @@ -0,0 +1,316 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_compaction_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=20000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clearing the file cache is async, wait for it to finish + sleep(1000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + sleep(delta_time) + } + + // reaching here means the metric never hit the target within timeoutMs + assertTrue(false, "waitForBrpcMetricValue timeout: ${metricName} did not reach ${targetValue} within ${timeoutMs} ms") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: code=" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(5000) + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test values (2, 2)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(3000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + // trigger compaction asynchronously and wait for it + def future = thread { + sql """use @${clusterName1}""" + do_cumu_compaction(src_be, "test", tablet_id, 2, 5) + } + // wait until the warmup for compaction has started + waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + assertEquals(num_submitted + 1, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")) + assertEquals(num_finished, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")) + + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + + + // at this moment, compaction has completed but is not yet committed; it is waiting for warm up + // trigger a query on the read cluster; it cannot read the compacted data yet + sql """use @${clusterName2}""" + sql "select * from test" + def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id) + def rowsets = tablet_status["rowsets"] + assert rowsets[1].contains("[2-2]") + assert rowsets[2].contains("[3-3]") + assert rowsets[3].contains("[4-4]") + assert rowsets[4].contains("[5-5]") + assert rowsets[5].contains("[6-6]") + assert rowsets[6].contains("[7-7]") + assert rowsets[7].contains("[8-8]") + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // the result should not contain (9,9) + qt_cluster2 """select * from test""" + def t2
= System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + future.get() + assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert 0 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy new file mode 100644 index 00000000000000..de4887624e4945 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy @@ -0,0 +1,332 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
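+// The suites below scrape BE counters from the /brpc_metrics endpoint, which is assumed to emit plain-text lines of the form "<metric_name> <value>". A minimal Groovy sketch of the extraction idiom used by the getBrpcMetrics helpers (the sample payload and value are made up): +// def sample = "file_cache_download_submitted_num 7" +// def m = sample =~ ~"file_cache_download_submitted_num\\s+(\\d+)" +// assert m.find() && (m[0][1] as long) == 7L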
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_idx_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', // to cause a timeout + 'warm_up_rowset_sync_wait_max_timeout_ms=100', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clearing the file cache is async, wait for it to finish + sleep(5000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port,
"file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = 
"event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL, + INDEX idx1(col1) USING INVERTED + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test(col0,col1,__DORIS_DELETE_SIGN__) values (1, 2, 1)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_idx_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_index_num") + def num_idx_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + assert num_idx_submitted >= 6 + assert num_idx_finished == num_idx_submitted + + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", [sleep:10]) + + // trigger and wait compaction async + def future = thread { + sql """use @${clusterName1}""" + do_cumu_compaction(src_be, "test", tablet_id, 2, 5) + } + sleep(500) + // wait until the warmup for compaction started + waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + assert num_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + assert num_finished + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_idx_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_index_num") + assert num_idx_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + + // trigger a query on read cluster without query tolerance, read the origin data + sql """use @${clusterName2}""" + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2_1 "select * from test order by 
col0, __DORIS_VERSION_COL__;" + def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id) + def rowsets = tablet_status["rowsets"] + assert rowsets[1].contains("[2-5]") + assert rowsets[2].contains("[6-6]") + assert rowsets[3].contains("[7-7]") + assert rowsets[4].contains("[8-8]") + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + // trigger a query on the read cluster with query freshness tolerance; it reads the compacted data + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // the result should not contain (9,9) + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + sleep(10000) + // assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_index_num") + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy new file mode 100644 index 00000000000000..688aa5e4446a57 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
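+// This suite stalls warmup downloads through BE debug points; the regression framework is assumed to forward the hook name and its parameter map to the BE. A sketch of the enable/disable pair used below (the 10 is an illustrative delay in seconds): +// GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, +//         "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) +// GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, +//         "CloudInternalServiceImpl::warm_up_rowset.download_segment")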
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_sc_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', + 'warm_up_rowset_sync_wait_max_timeout_ms=100', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(1000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def 
failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = 
jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1),(2,2);""" + sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_submitted >= 1 + assert num_finished == num_submitted + + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + sql """insert into test values (9, 9)""" + + do_cumu_compaction(src_be, "test", tablet_id, 2, 4) + + // trigger a heavy SC + sql "alter table test modify column col1 varchar(1000);" + + waitForSchemaChangeDone { + sql """ SHOW ALTER TABLE COLUMN WHERE TableName='test' ORDER BY createtime DESC LIMIT 1 """ + time 1000 + } + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + // assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + // assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + + + sql """use @${clusterName2}""" + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + sleep(10000) + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == 
getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy new file mode 100644 index 00000000000000..b13609ed42e1e8 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy @@ -0,0 +1,335 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'read_cluster_cache_opt_verbose_log=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', + 'warm_up_rowset_sync_wait_max_timeout_ms=100', // to cause timeout + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(2000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: 
firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", 
start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test(col0,col1,__DORIS_DELETE_SIGN__) values (1, 2, 1)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_requested = getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + assert num_requested == num_finished + + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + // trigger and wait compaction async + def future = thread { + sql """use @${clusterName1}""" 
+            do_cumu_compaction(src_be, "test", tablet_id, 2, 5)
+        }
+        // wait until the warm-up for compaction has started
+        waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000)
+        logFileCacheDownloadMetrics(clusterName2)
+        logWarmUpRowsetMetrics(clusterName2)
+        assert num_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")
+        assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")
+
+        // a new insert will trigger the sync rowset operation in the following query
+        sql """insert into test values (9, 9)"""
+        sleep(500)
+        assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")
+
+        // trigger a query on the read cluster without query tolerance; it reads the original data
+        sql """use @${clusterName2}"""
+        sql "set skip_delete_sign=true;"
+        sql "set show_hidden_columns=true;"
+        sql "set skip_storage_engine_merge=true;"
+        qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;"
+        def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id)
+        def rowsets = tablet_status["rowsets"]
+        assert rowsets[1].contains("[2-5]")
+        assert rowsets[2].contains("[6-6]")
+        assert rowsets[3].contains("[7-7]")
+        assert rowsets[4].contains("[8-8]")
+
+        // That query triggers sync_rowsets; because the compaction counts changed,
+        // version_overlap will be true, so warm-up tasks for the compaction rowset and the
+        // newly loaded rowset would be triggered. However, those rowsets' warm-up tasks were
+        // already triggered by passive warm-up, so we check that they are not triggered again.
+        assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")
+
+        sql "set enable_profile=true;"
+        sql "set profile_level=2;"
+
+        // trigger a query on the read cluster with query freshness tolerance; it reads the compacted data
+        sql "set query_freshness_tolerance_ms = 5000"
+        def t1 = System.currentTimeMillis()
+        def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count")
+        def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count")
+        // should not contain (9, 9)
+        sql "set skip_delete_sign=true;"
+        sql "set show_hidden_columns=true;"
+        sql "set skip_storage_engine_merge=true;"
+        qt_cluster2 "select * from test order by col0, __DORIS_VERSION_COL__;"
+        def t2 = System.currentTimeMillis()
+        logger.info("query in cluster2 cost=${t2 - t1} ms")
+        assert t2 - t1 < 3000
+        assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1
+        assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount
+
+        logFileCacheDownloadMetrics(clusterName2)
+        logWarmUpRowsetMetrics(clusterName2)
+
+        future.get()
+        assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")
+        assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num")
+
+        sleep(10000)
+        assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") ==
+                getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num")
+    }
+}
diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy
new file mode 100644
index 00000000000000..ea1aafbc44cf37
--- /dev/null
+++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy
@@ -0,0 +1,254 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.ClusterOptions
+import org.apache.doris.regression.util.NodeType
+import groovy.json.JsonSlurper
+
+suite('test_warmup_download_fail', 'docker') {
+    def options = new ClusterOptions()
+    options.feConfigs += [
+        'cloud_cluster_check_interval_second=1',
+        'cloud_tablet_rebalancer_interval_second=1',
+    ]
+    options.beConfigs += [
+        'file_cache_enter_disk_resource_limit_mode_percent=99',
+        'enable_evict_file_cache_in_advance=false',
+        'file_cache_background_monitor_interval_ms=1000',
+        'warm_up_rowset_slow_log_ms=1',
+        'enable_compaction_delay_commit_for_warm_up=true',
+        'warm_up_rowset_sync_wait_min_timeout_ms=20000',
+        'warm_up_rowset_sync_wait_max_timeout_ms=20000',
+    ]
+    options.enableDebugPoints()
+    options.cloudMode = true
+
+    def clearFileCache = { ip, port ->
+        def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true"
+        def response = new URL(url).text
+        def json = new JsonSlurper().parseText(response)
+
+        // Check the status
+        if (json.status != "OK") {
+            throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}")
+        }
+    }
+
+    def clearFileCacheOnAllBackends = {
+        def backends = sql """SHOW BACKENDS"""
+
+        for (be in backends) {
+            def ip = be[1]
+            def port = be[4]
+            clearFileCache(ip, port)
+        }
+
+        // clearing the file cache is async, wait for it to finish
+        sleep(1000)
+    }
+
+    def getBrpcMetrics = { ip, port, name ->
+        def url = "http://${ip}:${port}/brpc_metrics"
+        def metrics = new URL(url).text
+        def matcher = metrics =~ ~"${name}\\s+(\\d+)"
+        if (matcher.find()) {
+            logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}")
+            return matcher[0][1] as long
+        } else {
+            throw new RuntimeException("${name} not found for ${ip}:${port}")
+        }
+    }
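+
+    // The /brpc_metrics endpoint serves plain-text "name value" pairs, e.g. a line such as
+    //   file_cache_download_submitted_num 42
+    // (illustrative sample, not captured output); getBrpcMetrics extracts the integer
+    // value with the regex above.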
"file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY 
+
+        clearFileCacheOnAllBackends()
+        sleep(1000)
+
+        sql """use @${clusterName1}"""
+        // load data
+        sql """insert into test values (1, 1)"""
+        sql """insert into test values (2, 2)"""
+        sql """insert into test values (3, 3)"""
+        sql """insert into test values (4, 4)"""
+        sql """insert into test values (5, 5)"""
+        sql """insert into test values (6, 6)"""
+        sleep(5000)
+
+        def tablets = sql_return_maparray """ show tablets from test; """
+        logger.info("tablets: " + tablets)
+        assertEquals(1, tablets.size())
+        def tablet = tablets[0]
+        String tablet_id = tablet.TabletId
+
+        def be = getBeIpAndPort(clusterName2)
+        def src_be = getBeIpAndPort(clusterName1)
+
+        logFileCacheDownloadMetrics(clusterName2)
+        logWarmUpRowsetMetrics(clusterName2)
+        def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")
+        def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")
+        def num_failed = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num")
+        assert num_submitted >= 6
+        assert num_finished == num_submitted
+        assert num_failed == 0
+
+        GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error")
+
+        // a new insert will trigger the sync rowset operation in the following query
+        sql """insert into test values (9, 9)"""
+        sleep(1000)
+
+        assert num_failed + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num")
+
+        sql """use @${clusterName2}"""
+        sql "set enable_profile=true;"
+        sql "set profile_level=2;"
+
+        // although the download failed, the query should still read the newly inserted data
+        sql "set query_freshness_tolerance_ms = 5000"
+        def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count")
+        def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count")
+        qt_cluster2 """select * from test"""
+        assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1
+        assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount
+
+        logFileCacheDownloadMetrics(clusterName2)
+        logWarmUpRowsetMetrics(clusterName2)
+
+        assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") +
+                getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num") ==
+                getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num")
+    }
+}
diff --git a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy
new file mode 100644
index 00000000000000..3305d4ce5dbfdb
--- /dev/null
+++ b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy
@@ -0,0 +1,214 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite('test_read_cluster_var_property') {
+    if (!isCloudMode()) {
+        return
+    }
+    String userName = "test_read_cluster_var_property_user"
+    String pwd = '123456'
+    sql """drop user if exists ${userName}"""
+    sql """CREATE USER '${userName}' IDENTIFIED BY '${pwd}'"""
+    sql """GRANT ADMIN_PRIV ON *.*.* TO ${userName}"""
+
+    def getBrpcMetrics = { ip, port, name ->
+        def url = "http://${ip}:${port}/brpc_metrics"
+        def metrics = new URL(url).text
+        def matcher = metrics =~ ~"${name}\\s+(\\d+)"
+        if (matcher.find()) {
+            def ret = matcher[0][1] as long
+            logger.info("getBrpcMetrics, ${url}, name:${name}, value:${ret}")
+            return ret
+        } else {
+            throw new RuntimeException("${name} not found for ${ip}:${port}")
+        }
+    }
+
+    connect(userName, "${pwd}", context.config.jdbcUrl) {
+        // test non-mow table
+        try {
+            def tableName = "test_read_cluster_var_property"
+            sql """ DROP TABLE IF EXISTS ${tableName} """
+            sql """ CREATE TABLE ${tableName}
+                    (k int, v1 int, v2 int )
+                    DUPLICATE KEY(k)
+                    DISTRIBUTED BY HASH (k)
+                    BUCKETS 1 PROPERTIES(
+                        "replication_num" = "1",
+                        "disable_auto_compaction" = "true");
+            """
+
+            (1..20).each{ id ->
+                sql """insert into ${tableName} select number, number, number from numbers("number"="10");"""
+            }
+
+            sql "select * from ${tableName};"
+
+            def backends = sql_return_maparray('show backends')
+            def tabletStats = sql_return_maparray("show tablets from ${tableName};")
+            assert tabletStats.size() == 1
+            def tabletId = tabletStats[0].TabletId
+            def tabletBackendId = tabletStats[0].BackendId
+            def tabletBackend
+            for (def be : backends) {
+                if (be.BackendId == tabletBackendId) {
+                    tabletBackend = be
+                    break;
+                }
+            }
+            logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}");
+
+            try {
+                // 1. test enable_prefer_cached_rowset
+                sql "set enable_prefer_cached_rowset=true;"
+                def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + 1
+
+                sql "set enable_prefer_cached_rowset=false;"
+                preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+
+                // user property has higher priority than session variable
+                sql "set property for '${userName}' enable_prefer_cached_rowset=true;"
+                preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == 1 + preferCachedRowsetCount
+            } finally {
+                sql "set enable_prefer_cached_rowset=false;"
+                sql "set property for '${userName}' enable_prefer_cached_rowset=false;"
+            }
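+
+            // Precedence checked above: SET PROPERTY FOR '<user>' overrides that user's
+            // session variable of the same name; the session variable is false here, yet
+            // the query is still captured via the prefer-cached-rowset path.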
+
+            try {
+                // 2. test query_freshness_tolerance_ms
+                sql "set query_freshness_tolerance_ms=1000;"
+                def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1
+
+                sql "set query_freshness_tolerance_ms=-1;"
+                queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance
+
+                // user property has higher priority than session variable
+                sql "set property for '${userName}' query_freshness_tolerance_ms=2000;"
+                queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == 1 + queryFreshnessTolerance
+            } finally {
+                sql "set query_freshness_tolerance_ms=-1;"
+                sql "set property for '${userName}' query_freshness_tolerance_ms=-1;"
+            }
+        } catch (Exception e) {
+            logger.error("Error occurred while testing on the non-mow table: ${e.message}")
+        } finally {
+            sql "set enable_prefer_cached_rowset=false;"
+            sql "set query_freshness_tolerance_ms=-1;"
+            sql "set property for '${userName}' enable_prefer_cached_rowset=false;"
+            sql "set property for '${userName}' query_freshness_tolerance_ms=-1;"
+        }
+
+        // test mow table
+        try {
+            def tableName = "test_read_cluster_var_property_mow"
+            sql """ DROP TABLE IF EXISTS ${tableName} """
+            sql """ CREATE TABLE ${tableName}
+                    (k int, v1 int, v2 int )
+                    UNIQUE KEY(k) DISTRIBUTED BY HASH (k)
+                    BUCKETS 1 PROPERTIES(
+                        "replication_num" = "1",
+                        "enable_unique_key_merge_on_write" = "true",
+                        "disable_auto_compaction" = "true");
+            """
+
+            (1..20).each{ id ->
+                sql """insert into ${tableName} select number, number, number from numbers("number"="10");"""
+            }
+
+            sql "select * from ${tableName};"
+
+            def backends = sql_return_maparray('show backends')
+            def tabletStats = sql_return_maparray("show tablets from ${tableName};")
+            assert tabletStats.size() == 1
+            def tabletId = tabletStats[0].TabletId
+            def tabletBackendId = tabletStats[0].BackendId
+            def tabletBackend
+            for (def be : backends) {
+                if (be.BackendId == tabletBackendId) {
+                    tabletBackend = be
+                    break;
+                }
+            }
+            logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}");
+
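+            // For MOW (merge-on-write) tables, the checks below expect
+            // enable_prefer_cached_rowset to be ignored, both as a session variable and as
+            // a user property, presumably because serving a stale cached rowset could break
+            // unique-key semantics, while query_freshness_tolerance_ms still takes effect.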
+            try {
+                // 1. test enable_prefer_cached_rowset
+                // enable_prefer_cached_rowset should not take effect on a mow table
+                sql "set enable_prefer_cached_rowset=true;"
+                def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+
+                sql "set enable_prefer_cached_rowset=false;"
+                preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+
+                // the user property should not take effect on a mow table either
+                sql "set property for '${userName}' enable_prefer_cached_rowset=true;"
+                preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount
+            } finally {
+                sql "set enable_prefer_cached_rowset=false;"
+                sql "set property for '${userName}' enable_prefer_cached_rowset=false;"
+            }
+
+            try {
+                // 2. test query_freshness_tolerance_ms
+                sql "set query_freshness_tolerance_ms=1000;"
+                def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1
+
+                sql "set query_freshness_tolerance_ms=-1;"
+                queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance
+
+                // user property has higher priority than session variable
+                sql "set property for '${userName}' query_freshness_tolerance_ms=2000;"
+                queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count")
+                sql "select * from ${tableName};"
+                assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == 1 + queryFreshnessTolerance
+            } finally {
+                sql "set query_freshness_tolerance_ms=-1;"
+                sql "set property for '${userName}' query_freshness_tolerance_ms=-1;"
+            }
+        } catch (Exception e) {
+            logger.error("Error occurred while testing on the mow table: ${e.message}")
+            throw e
+        } finally {
+            sql "set enable_prefer_cached_rowset=false;"
+            sql "set query_freshness_tolerance_ms=-1;"
+            sql "set property for '${userName}' enable_prefer_cached_rowset=false;"
+            sql "set property for '${userName}' query_freshness_tolerance_ms=-1;"
+        }
+    }
+}
\ No newline at end of file