From 02f36c23ede193bc18fbba6f2228ff6ed462161f Mon Sep 17 00:00:00 2001 From: Mingyu Chen Date: Mon, 13 May 2019 10:22:04 +0800 Subject: [PATCH] Set tablet as bad when loading index failed (#1146) Bad tablets will be reported to FE and handled. Also add a config, auto_recover_index_loading_failure, to control how index loading failures are processed. --- be/src/common/config.h | 4 ++++ be/src/exec/olap_meta_reader.cpp | 9 +++++---- be/src/exec/olap_scanner.cpp | 10 +++++----- be/src/olap/olap_engine.cpp | 15 ++++++++++++--- be/src/olap/olap_engine.h | 6 +++++- be/src/olap/olap_table.cpp | 23 ++++++++++++++++------- be/src/olap/olap_table.h | 3 +++ 7 files changed, 50 insertions(+), 20 deletions(-) diff --git a/be/src/common/config.h b/be/src/common/config.h index f3b664b4e5..86566ebae5 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -399,6 +399,10 @@ namespace config { // max consumer num in one data consumer group, for routine load CONF_Int32(max_consumer_num_per_group, "3"); + + // If set to true, an index loading failure will not cause BE to exit, + // and the tablet will be marked as bad so that FE will try to repair it. + CONF_Bool(auto_recover_index_loading_failure, "false"); } // namespace config } // namespace doris diff --git a/be/src/exec/olap_meta_reader.cpp b/be/src/exec/olap_meta_reader.cpp index ceeda76dc7..5a2bdd5119 100644 --- a/be/src/exec/olap_meta_reader.cpp +++ b/be/src/exec/olap_meta_reader.cpp @@ -43,13 +43,14 @@ Status EngineMetaReader::get_hints( RuntimeProfile* profile) { auto tablet_id = scan_range->scan_range().tablet_id; int32_t schema_hash = strtoul(scan_range->scan_range().schema_hash.c_str(), NULL, 10); + std::string err; OLAPTablePtr table = OLAPEngine::get_instance()->get_table( - tablet_id, schema_hash); + tablet_id, schema_hash, true, &err); if (table.get() == NULL) { - LOG(WARNING) << "tablet does not exist. 
tablet_id=" << tablet_id << ", schema_hash=" - << schema_hash; std::stringstream ss; - ss << "tablet does not exist: " << tablet_id; + ss << "failed to get tablet: " << tablet_id << "with schema hash: " + << schema_hash << ", reason: " << err; + LOG(WARNING) << ss.str(); return Status(ss.str()); } diff --git a/be/src/exec/olap_scanner.cpp b/be/src/exec/olap_scanner.cpp index 80854d6947..f8ce87a0c0 100644 --- a/be/src/exec/olap_scanner.cpp +++ b/be/src/exec/olap_scanner.cpp @@ -79,13 +79,13 @@ Status OlapScanner::_prepare( VersionHash version_hash = strtoul(scan_range->scan_range().version_hash.c_str(), nullptr, 10); { - _olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash); + std::string err; + _olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash, true, &err); if (_olap_table.get() == nullptr) { - OLAP_LOG_WARNING("tablet does not exist. [tablet_id=%ld schema_hash=%d]", - tablet_id, schema_hash); - std::stringstream ss; - ss << "tablet does not exist: " << tablet_id; + ss << "failed to get tablet: " << tablet_id << " with schema hash: " << schema_hash + << ", reason: " << err; + LOG(WARNING) << ss.str(); return Status(ss.str()); } { diff --git a/be/src/olap/olap_engine.cpp b/be/src/olap/olap_engine.cpp index f16791b7c5..9601f63c71 100644 --- a/be/src/olap/olap_engine.cpp +++ b/be/src/olap/olap_engine.cpp @@ -785,7 +785,7 @@ OLAPTablePtr OLAPEngine::_get_table_with_no_lock(TTabletId tablet_id, SchemaHash return olap_table; } -OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table) { +OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table, std::string* err) { _tablet_map_lock.rdlock(); OLAPTablePtr olap_table; olap_table = _get_table_with_no_lock(tablet_id, schema_hash); @@ -794,13 +794,18 @@ OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, if (olap_table.get() != NULL) { if (!olap_table->is_used()) { 
OLAP_LOG_WARNING("olap table cannot be used. [table=%ld]", tablet_id); + if (err != nullptr) { *err = "tablet cannot be used"; } olap_table.reset(); } else if (load_table && !olap_table->is_loaded()) { - if (olap_table->load() != OLAP_SUCCESS) { + OLAPStatus ost = olap_table->load(); + if (ost != OLAP_SUCCESS) { OLAP_LOG_WARNING("fail to load olap table. [table=%ld]", tablet_id); + if (err != nullptr) { *err = "load tablet failed"; } olap_table.reset(); } } + } else if (err != nullptr) { + *err = "tablet does not exist"; } return olap_table; @@ -835,6 +840,10 @@ OLAPStatus OLAPEngine::get_tables_by_id( it = table_list->erase(it); continue; } + } else if ((*it)->is_used()) { + LOG(WARNING) << "table is bad: " << (*it)->full_name().c_str(); + it = table_list->erase(it); + continue; } ++it; } @@ -1884,7 +1893,7 @@ OLAPTablePtr OLAPEngine::_find_best_tablet_to_compaction(CompactionType compacti OLAPTablePtr best_table; for (tablet_map_t::value_type& table_ins : _tablet_map){ for (OLAPTablePtr& table_ptr : table_ins.second.table_arr) { - if (!table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) { + if (!table_ptr->is_used() || !table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) { continue; } diff --git a/be/src/olap/olap_engine.h b/be/src/olap/olap_engine.h index 532a7750a5..8928571b35 100644 --- a/be/src/olap/olap_engine.h +++ b/be/src/olap/olap_engine.h @@ -86,7 +86,11 @@ public: } // Get table pointer - OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true); + // TODO(cmy): I think it is better to return Status instead of OLAPTablePtr, + // so that the caller can decide what to do next based on Status. + // Currently, I just add a new parameter 'err' to save the error msg. + // This should be redesigned later. 
+ OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true, std::string* err = nullptr); OLAPStatus get_tables_by_id(TTabletId tablet_id, std::list<OLAPTablePtr>* table_list); diff --git a/be/src/olap/olap_table.cpp b/be/src/olap/olap_table.cpp index 2a52a2c288..408e6033e5 100644 --- a/be/src/olap/olap_table.cpp +++ b/be/src/olap/olap_table.cpp @@ -148,7 +148,8 @@ OLAPTable::OLAPTable(OLAPHeader* header, OlapStore* store) : _num_key_fields(0), _id(0), _store(store), - _is_loaded(false) { + _is_loaded(false), + _is_bad(false) { if (header == NULL) { return; // for convenience of mock test. } @@ -310,13 +311,18 @@ OLAPStatus OLAPTable::load() { << "res=" << res << ", root=" << one_schema_root; goto EXIT; } else if (res != OLAP_SUCCESS) { - OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true); - return res; + // OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true); + goto EXIT; } res = load_indices(); if (res != OLAP_SUCCESS) { - LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']"; + if (config::auto_recover_index_loading_failure) { + LOG(WARNING) << "fail to load indices. [res=" << res << " table='" << _full_name << "']"; + } else { + // fatal log will let BE process exit + LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']"; + } goto EXIT; } @@ -333,11 +339,14 @@ OLAPStatus OLAPTable::load() { } release_header_lock(); +EXIT: + // always set _is_loaded to true, so that this tablet will not be loaded again _is_loaded = true; -EXIT: if (res != OLAP_SUCCESS) { - OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash()); + _is_bad = true; + // Do not drop table directly here, FE will get the report and handle it. 
+ // OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash()); } return res; @@ -2227,7 +2236,7 @@ void OLAPTable::set_io_error() { } bool OLAPTable::is_used() { - return _store->is_used(); + return !_is_bad && _store->is_used(); } VersionEntity OLAPTable::get_version_entity_by_version(const Version& version) { diff --git a/be/src/olap/olap_table.h b/be/src/olap/olap_table.h index eda2e0d214..85e7e84626 100644 --- a/be/src/olap/olap_table.h +++ b/be/src/olap/olap_table.h @@ -626,6 +626,8 @@ public: bool is_used(); + void set_bad(bool is_bad) { _is_bad = is_bad; } + // 得到当前table的root path路径,路径末尾不带斜杠(/) std::string storage_root_path_name() { return _storage_root_path; @@ -753,6 +755,7 @@ private: std::string _tablet_path; bool _table_for_check; + std::atomic<bool> _is_bad; // if this tablet is broken, set to true. default is false DISALLOW_COPY_AND_ASSIGN(OLAPTable); };