From a1423fd5e2f988489aa2c5736e8fb349765b3293 Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Tue, 22 Aug 2023 14:15:41 +0800 Subject: [PATCH 1/3] add check in commit phase --- be/src/olap/rowset_builder.cpp | 6 ++++++ be/src/olap/tablet.cpp | 6 +++--- be/src/olap/tablet.h | 6 +++--- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index aad17ce2fdfdc7..0698013838e5f6 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -249,6 +249,12 @@ Status RowsetBuilder::wait_calc_delete_bitmap() { } Status RowsetBuilder::commit_txn() { + if (_tablet->enable_unique_key_merge_on_write() && + config::enable_merge_on_write_correctness_check) { + RETURN_IF_ERROR(_tablet->check_delete_bitmap_correctness( + _delete_bitmap, _rowset->end_version() - 1, _req.txn_id, _rowset_ids)); + } + std::lock_guard l(_lock); SCOPED_TIMER(_commit_txn_timer); Status res = _storage_engine->txn_manager()->commit_txn(_req.partition_id, _tablet, _req.txn_id, diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 644c6a4c59d557..a81ce52a94620e 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -3256,7 +3256,7 @@ Status Tablet::update_delete_bitmap_without_lock(const RowsetSharedPtr& rowset) << "(us), total rows: " << total_rows; if (config::enable_merge_on_write_correctness_check) { // check if all the rowset has ROWSET_SENTINEL_MARK - RETURN_IF_ERROR(_check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, -1, + RETURN_IF_ERROR(check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, -1, cur_rowset_ids, &specified_rowsets)); _remove_sentinel_mark_from_delete_bitmap(delete_bitmap); } @@ -3360,7 +3360,7 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, if (config::enable_merge_on_write_correctness_check && rowset->num_rows() != 0) { // only do correctness check if the rowset has at least one row written // check if all the rowset has ROWSET_SENTINEL_MARK - RETURN_IF_ERROR(_check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, txn_id, + RETURN_IF_ERROR(check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, txn_id, cur_rowset_ids)); _remove_sentinel_mark_from_delete_bitmap(delete_bitmap); } @@ -3717,7 +3717,7 @@ void Tablet::_remove_sentinel_mark_from_delete_bitmap(DeleteBitmapPtr delete_bit } } -Status Tablet::_check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap, int64_t max_version, +Status Tablet::check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap, int64_t max_version, int64_t txn_id, const RowsetIdUnorderedSet& rowset_ids, std::vector* rowsets) { diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 5ea64cce78ea3e..a1684fe9e65c5f 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -552,6 +552,9 @@ class Tablet : public BaseTablet { void set_binlog_config(BinlogConfig binlog_config); void add_sentinel_mark_to_delete_bitmap(DeleteBitmap* delete_bitmap, const RowsetIdUnorderedSet& rowsetids); + Status check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap, int64_t max_version, + int64_t txn_id, const RowsetIdUnorderedSet& rowset_ids, + std::vector* rowsets = nullptr); private: Status _init_once_action(); @@ -597,9 +600,6 @@ class Tablet : public BaseTablet { //////////////////////////////////////////////////////////////////////////// void _remove_sentinel_mark_from_delete_bitmap(DeleteBitmapPtr delete_bitmap); - Status _check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap, int64_t max_version, - int64_t txn_id, const RowsetIdUnorderedSet& rowset_ids, - std::vector* rowsets = nullptr); std::string _get_rowset_info_str(RowsetSharedPtr rowset, bool delete_flag); public: From f4789313b878e7d29e0e836d731f46cb8ad3e96d Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Mon, 28 Aug 2023 18:59:57 +0800 Subject: [PATCH 2/3] format --- be/src/olap/tablet.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index a81ce52a94620e..616237cf859bd3 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -3257,7 +3257,7 @@ Status Tablet::update_delete_bitmap_without_lock(const RowsetSharedPtr& rowset) if (config::enable_merge_on_write_correctness_check) { // check if all the rowset has ROWSET_SENTINEL_MARK RETURN_IF_ERROR(check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, -1, - cur_rowset_ids, &specified_rowsets)); + cur_rowset_ids, &specified_rowsets)); _remove_sentinel_mark_from_delete_bitmap(delete_bitmap); } for (auto iter = delete_bitmap->delete_bitmap.begin(); @@ -3361,7 +3361,7 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, // only do correctness check if the rowset has at least one row written // check if all the rowset has ROWSET_SENTINEL_MARK RETURN_IF_ERROR(check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, txn_id, - cur_rowset_ids)); + cur_rowset_ids)); _remove_sentinel_mark_from_delete_bitmap(delete_bitmap); } @@ -3718,9 +3718,9 @@ void Tablet::_remove_sentinel_mark_from_delete_bitmap(DeleteBitmapPtr delete_bit } Status Tablet::check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap, int64_t max_version, - int64_t txn_id, - const RowsetIdUnorderedSet& rowset_ids, - std::vector* rowsets) { + int64_t txn_id, + const RowsetIdUnorderedSet& rowset_ids, + std::vector* rowsets) { RowsetIdUnorderedSet missing_ids; for (const auto& rowsetid : rowset_ids) { if (!delete_bitmap->delete_bitmap.contains( From 12c380764c610530b2ff7d4b8171c39ef2f202aa Mon Sep 17 00:00:00 2001 From: bobhan1 Date: Wed, 30 Aug 2023 11:24:36 +0800 Subject: [PATCH 3/3] update --- be/src/olap/delta_writer.cpp | 2 +- be/src/olap/rowset_builder.cpp | 14 +++++++++++--- be/src/olap/tablet.cpp | 18 +++++++++++++----- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index fac198f73c71d7..8e443779157c1e 100644 --- a/be/src/olap/delta_writer.cpp +++ b/be/src/olap/delta_writer.cpp @@ -163,7 +163,7 @@ Status DeltaWriter::commit_txn(const PSlaveTabletNodes& slave_tablet_nodes, const bool write_single_replica) { std::lock_guard l(_lock); SCOPED_TIMER(_commit_txn_timer); - _rowset_builder.commit_txn(); + RETURN_IF_ERROR(_rowset_builder.commit_txn()); if (write_single_replica) { for (auto node_info : slave_tablet_nodes.slave_nodes()) { diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index 0698013838e5f6..7aafb8f83eaccf 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -250,9 +250,17 @@ Status RowsetBuilder::wait_calc_delete_bitmap() { Status RowsetBuilder::commit_txn() { if (_tablet->enable_unique_key_merge_on_write() && - config::enable_merge_on_write_correctness_check) { - RETURN_IF_ERROR(_tablet->check_delete_bitmap_correctness( - _delete_bitmap, _rowset->end_version() - 1, _req.txn_id, _rowset_ids)); + config::enable_merge_on_write_correctness_check && _rowset->num_rows() != 0) { + auto st = _tablet->check_delete_bitmap_correctness( + _delete_bitmap, _rowset->end_version() - 1, _req.txn_id, _rowset_ids); + if (!st.ok()) { + LOG(WARNING) << fmt::format( + "[tablet_id:{}][txn_id:{}][load_id:{}][partition_id:{}] " + "delete bitmap correctness check failed in commit phase!", + _req.tablet_id, _req.txn_id, UniqueId(_req.load_id).to_string(), + _req.partition_id); + return st; + } } std::lock_guard l(_lock); diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 616237cf859bd3..a572503efb59f6 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -3256,8 +3256,11 @@ Status Tablet::update_delete_bitmap_without_lock(const RowsetSharedPtr& rowset) << "(us), total rows: " << total_rows; if (config::enable_merge_on_write_correctness_check) { // check if all the rowset has ROWSET_SENTINEL_MARK - RETURN_IF_ERROR(check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, -1, - cur_rowset_ids, &specified_rowsets)); + auto st = check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, -1, + cur_rowset_ids, &specified_rowsets); + if (!st.ok()) { + LOG(WARNING) << fmt::format("delete bitmap correctness check failed in publish phase!"); + } _remove_sentinel_mark_from_delete_bitmap(delete_bitmap); } for (auto iter = delete_bitmap->delete_bitmap.begin(); @@ -3360,8 +3363,11 @@ Status Tablet::update_delete_bitmap(const RowsetSharedPtr& rowset, if (config::enable_merge_on_write_correctness_check && rowset->num_rows() != 0) { // only do correctness check if the rowset has at least one row written // check if all the rowset has ROWSET_SENTINEL_MARK - RETURN_IF_ERROR(check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, txn_id, - cur_rowset_ids)); + auto st = check_delete_bitmap_correctness(delete_bitmap, cur_version - 1, -1, + cur_rowset_ids, &specified_rowsets); + if (!st.ok()) { + LOG(WARNING) << fmt::format("delete bitmap correctness check failed in publish phase!"); + } _remove_sentinel_mark_from_delete_bitmap(delete_bitmap); } @@ -3779,7 +3785,9 @@ Status Tablet::check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap, in root.Accept(writer); std::string rowset_status_string = std::string(strbuf.GetString()); LOG_EVERY_SECOND(WARNING) << rowset_status_string; - DCHECK(false) << "check delete bitmap correctness failed!"; + // let it crash if correctness check failed in Debug mode + DCHECK(false) << "delete bitmap correctness check failed in publish phase!"; + return Status::InternalError("check delete bitmap failed!"); } return Status::OK(); }