Skip to content

Commit 02f36c2

Browse files
morningman authored and imay committed
Set tablet as bad when loading index failed (#1146)
Bad tablets will be reported to FE and handled there. Also add a config, auto_recover_index_loading_failure, to control how index loading failures are processed.
1 parent 6117227 commit 02f36c2

File tree

7 files changed

+50
-20
lines changed

7 files changed

+50
-20
lines changed

be/src/common/config.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,10 @@ namespace config {
399399

400400
// max consumer num in one data consumer group, for routine load
401401
CONF_Int32(max_consumer_num_per_group, "3");
402+
403+
// If set to true, an index loading failure will not cause BE to exit,
404+
// and the tablet will be marked as bad, so that FE will try to repair it.
405+
CONF_Bool(auto_recover_index_loading_failure, "false");
402406
} // namespace config
403407

404408
} // namespace doris

be/src/exec/olap_meta_reader.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,14 @@ Status EngineMetaReader::get_hints(
4343
RuntimeProfile* profile) {
4444
auto tablet_id = scan_range->scan_range().tablet_id;
4545
int32_t schema_hash = strtoul(scan_range->scan_range().schema_hash.c_str(), NULL, 10);
46+
std::string err;
4647
OLAPTablePtr table = OLAPEngine::get_instance()->get_table(
47-
tablet_id, schema_hash);
48+
tablet_id, schema_hash, true, &err);
4849
if (table.get() == NULL) {
49-
LOG(WARNING) << "tablet does not exist. tablet_id=" << tablet_id << ", schema_hash="
50-
<< schema_hash;
5150
std::stringstream ss;
52-
ss << "tablet does not exist: " << tablet_id;
51+
ss << "failed to get tablet: " << tablet_id << "with schema hash: "
52+
<< schema_hash << ", reason: " << err;
53+
LOG(WARNING) << ss.str();
5354
return Status(ss.str());
5455
}
5556

be/src/exec/olap_scanner.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,13 @@ Status OlapScanner::_prepare(
7979
VersionHash version_hash =
8080
strtoul(scan_range->scan_range().version_hash.c_str(), nullptr, 10);
8181
{
82-
_olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash);
82+
std::string err;
83+
_olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash, true, &err);
8384
if (_olap_table.get() == nullptr) {
84-
OLAP_LOG_WARNING("tablet does not exist. [tablet_id=%ld schema_hash=%d]",
85-
tablet_id, schema_hash);
86-
8785
std::stringstream ss;
88-
ss << "tablet does not exist: " << tablet_id;
86+
ss << "failed to get tablet: " << tablet_id << " with schema hash: " << schema_hash
87+
<< ", reason: " << err;
88+
LOG(WARNING) << ss.str();
8989
return Status(ss.str());
9090
}
9191
{

be/src/olap/olap_engine.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ OLAPTablePtr OLAPEngine::_get_table_with_no_lock(TTabletId tablet_id, SchemaHash
785785
return olap_table;
786786
}
787787

788-
OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table) {
788+
OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table, std::string* err) {
789789
_tablet_map_lock.rdlock();
790790
OLAPTablePtr olap_table;
791791
olap_table = _get_table_with_no_lock(tablet_id, schema_hash);
@@ -794,13 +794,18 @@ OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash,
794794
if (olap_table.get() != NULL) {
795795
if (!olap_table->is_used()) {
796796
OLAP_LOG_WARNING("olap table cannot be used. [table=%ld]", tablet_id);
797+
if (err != nullptr) { *err = "tablet cannot be used"; }
797798
olap_table.reset();
798799
} else if (load_table && !olap_table->is_loaded()) {
799-
if (olap_table->load() != OLAP_SUCCESS) {
800+
OLAPStatus ost = olap_table->load();
801+
if (ost != OLAP_SUCCESS) {
800802
OLAP_LOG_WARNING("fail to load olap table. [table=%ld]", tablet_id);
803+
if (err != nullptr) { *err = "load tablet failed"; }
801804
olap_table.reset();
802805
}
803806
}
807+
} else if (err != nullptr) {
808+
*err = "tablet does not exist";
804809
}
805810

806811
return olap_table;
@@ -835,6 +840,10 @@ OLAPStatus OLAPEngine::get_tables_by_id(
835840
it = table_list->erase(it);
836841
continue;
837842
}
843+
} else if ((*it)->is_used()) {
844+
LOG(WARNING) << "table is bad: " << (*it)->full_name().c_str();
845+
it = table_list->erase(it);
846+
continue;
838847
}
839848
++it;
840849
}
@@ -1884,7 +1893,7 @@ OLAPTablePtr OLAPEngine::_find_best_tablet_to_compaction(CompactionType compacti
18841893
OLAPTablePtr best_table;
18851894
for (tablet_map_t::value_type& table_ins : _tablet_map){
18861895
for (OLAPTablePtr& table_ptr : table_ins.second.table_arr) {
1887-
if (!table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) {
1896+
if (!table_ptr->is_used() || !table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) {
18881897
continue;
18891898
}
18901899

be/src/olap/olap_engine.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,11 @@ class OLAPEngine {
8686
}
8787

8888
// Get table pointer
89-
OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true);
89+
// TODO(cmy): I think it is better to return Status instead of OLAPTablePtr,
90+
// so that the caller can decide what to do next based on Status.
91+
// Currently, I just add a new parameter 'err' to save the error msg.
92+
// This should be redesigned later.
93+
OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true, std::string* err = nullptr);
9094

9195
OLAPStatus get_tables_by_id(TTabletId tablet_id, std::list<OLAPTablePtr>* table_list);
9296

be/src/olap/olap_table.cpp

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,8 @@ OLAPTable::OLAPTable(OLAPHeader* header, OlapStore* store) :
148148
_num_key_fields(0),
149149
_id(0),
150150
_store(store),
151-
_is_loaded(false) {
151+
_is_loaded(false),
152+
_is_bad(false) {
152153
if (header == NULL) {
153154
return; // for convenience of mock test.
154155
}
@@ -310,13 +311,18 @@ OLAPStatus OLAPTable::load() {
310311
<< "res=" << res << ", root=" << one_schema_root;
311312
goto EXIT;
312313
} else if (res != OLAP_SUCCESS) {
313-
OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true);
314-
return res;
314+
// OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true);
315+
goto EXIT;
315316
}
316317
res = load_indices();
317318

318319
if (res != OLAP_SUCCESS) {
319-
LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
320+
if (config::auto_recover_index_loading_failure) {
321+
LOG(WARNING) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
322+
} else {
323+
// fatal log will let BE process exit
324+
LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
325+
}
320326
goto EXIT;
321327
}
322328

@@ -333,11 +339,14 @@ OLAPStatus OLAPTable::load() {
333339
}
334340
release_header_lock();
335341

342+
EXIT:
343+
// always set _is_loaded to true, so that this tablet will not be loaded again
336344
_is_loaded = true;
337345

338-
EXIT:
339346
if (res != OLAP_SUCCESS) {
340-
OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash());
347+
_is_bad = true;
348+
// Do not drop table directly here, FE will get the report and handle it.
349+
// OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash());
341350
}
342351

343352
return res;
@@ -2227,7 +2236,7 @@ void OLAPTable::set_io_error() {
22272236
}
22282237

22292238
bool OLAPTable::is_used() {
2230-
return _store->is_used();
2239+
return !_is_bad && _store->is_used();
22312240
}
22322241

22332242
VersionEntity OLAPTable::get_version_entity_by_version(const Version& version) {

be/src/olap/olap_table.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -626,6 +626,8 @@ class OLAPTable : public std::enable_shared_from_this<OLAPTable> {
626626

627627
bool is_used();
628628

629+
void set_bad(bool is_bad) { _is_bad = is_bad; }
630+
629631
// 得到当前table的root path路径,路径末尾不带斜杠(/)
630632
std::string storage_root_path_name() {
631633
return _storage_root_path;
@@ -753,6 +755,7 @@ class OLAPTable : public std::enable_shared_from_this<OLAPTable> {
753755
std::string _tablet_path;
754756

755757
bool _table_for_check;
758+
std::atomic<bool> _is_bad; // if this tablet is broken, set to true. default is false
756759

757760
DISALLOW_COPY_AND_ASSIGN(OLAPTable);
758761
};

0 commit comments

Comments
 (0)