Set tablet as bad when loading index failed (#1146)

A bad tablet will be reported to FE, which will then handle it.

Also add a config option, auto_recover_index_loading_failure, to control how index loading failures are handled.
This commit is contained in:
Mingyu Chen
2019-05-13 10:22:04 +08:00
committed by ZHAO Chun
parent 6117227754
commit 02f36c23ed
7 changed files with 50 additions and 20 deletions

View File

@ -399,6 +399,10 @@ namespace config {
// max consumer num in one data consumer group, for routine load
CONF_Int32(max_consumer_num_per_group, "3");
// If set to true, an index loading failure will not cause BE to exit;
// instead the tablet will be marked as bad, so that FE will try to repair it.
CONF_Bool(auto_recover_index_loading_failure, "false");
} // namespace config
} // namespace doris

View File

@ -43,13 +43,14 @@ Status EngineMetaReader::get_hints(
RuntimeProfile* profile) {
auto tablet_id = scan_range->scan_range().tablet_id;
int32_t schema_hash = strtoul(scan_range->scan_range().schema_hash.c_str(), NULL, 10);
std::string err;
OLAPTablePtr table = OLAPEngine::get_instance()->get_table(
tablet_id, schema_hash);
tablet_id, schema_hash, true, &err);
if (table.get() == NULL) {
LOG(WARNING) << "tablet does not exist. tablet_id=" << tablet_id << ", schema_hash="
<< schema_hash;
std::stringstream ss;
ss << "tablet does not exist: " << tablet_id;
ss << "failed to get tablet: " << tablet_id << "with schema hash: "
<< schema_hash << ", reason: " << err;
LOG(WARNING) << ss.str();
return Status(ss.str());
}

View File

@ -79,13 +79,13 @@ Status OlapScanner::_prepare(
VersionHash version_hash =
strtoul(scan_range->scan_range().version_hash.c_str(), nullptr, 10);
{
_olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash);
std::string err;
_olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash, true, &err);
if (_olap_table.get() == nullptr) {
OLAP_LOG_WARNING("tablet does not exist. [tablet_id=%ld schema_hash=%d]",
tablet_id, schema_hash);
std::stringstream ss;
ss << "tablet does not exist: " << tablet_id;
ss << "failed to get tablet: " << tablet_id << " with schema hash: " << schema_hash
<< ", reason: " << err;
LOG(WARNING) << ss.str();
return Status(ss.str());
}
{

View File

@ -785,7 +785,7 @@ OLAPTablePtr OLAPEngine::_get_table_with_no_lock(TTabletId tablet_id, SchemaHash
return olap_table;
}
OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table) {
OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table, std::string* err) {
_tablet_map_lock.rdlock();
OLAPTablePtr olap_table;
olap_table = _get_table_with_no_lock(tablet_id, schema_hash);
@ -794,13 +794,18 @@ OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash,
if (olap_table.get() != NULL) {
if (!olap_table->is_used()) {
OLAP_LOG_WARNING("olap table cannot be used. [table=%ld]", tablet_id);
if (err != nullptr) { *err = "tablet cannot be used"; }
olap_table.reset();
} else if (load_table && !olap_table->is_loaded()) {
if (olap_table->load() != OLAP_SUCCESS) {
OLAPStatus ost = olap_table->load();
if (ost != OLAP_SUCCESS) {
OLAP_LOG_WARNING("fail to load olap table. [table=%ld]", tablet_id);
if (err != nullptr) { *err = "load tablet failed"; }
olap_table.reset();
}
}
} else if (err != nullptr) {
*err = "tablet does not exist";
}
return olap_table;
@ -835,6 +840,10 @@ OLAPStatus OLAPEngine::get_tables_by_id(
it = table_list->erase(it);
continue;
}
} else if ((*it)->is_used()) {
LOG(WARNING) << "table is bad: " << (*it)->full_name().c_str();
it = table_list->erase(it);
continue;
}
++it;
}
@ -1884,7 +1893,7 @@ OLAPTablePtr OLAPEngine::_find_best_tablet_to_compaction(CompactionType compacti
OLAPTablePtr best_table;
for (tablet_map_t::value_type& table_ins : _tablet_map){
for (OLAPTablePtr& table_ptr : table_ins.second.table_arr) {
if (!table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) {
if (!table_ptr->is_used() || !table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) {
continue;
}

View File

@ -86,7 +86,11 @@ public:
}
// Get table pointer
OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true);
// TODO(cmy): I think it is better to return Status instead of OLAPTablePtr,
// so that the caller can decide what to do next based on Status.
// Currently, I just add a new parameter 'err' to save the error msg.
// This should be redesigned later.
OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true, std::string* err = nullptr);
OLAPStatus get_tables_by_id(TTabletId tablet_id, std::list<OLAPTablePtr>* table_list);

View File

@ -148,7 +148,8 @@ OLAPTable::OLAPTable(OLAPHeader* header, OlapStore* store) :
_num_key_fields(0),
_id(0),
_store(store),
_is_loaded(false) {
_is_loaded(false),
_is_bad(false) {
if (header == NULL) {
return; // for convenience of mock test.
}
@ -310,13 +311,18 @@ OLAPStatus OLAPTable::load() {
<< "res=" << res << ", root=" << one_schema_root;
goto EXIT;
} else if (res != OLAP_SUCCESS) {
OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true);
return res;
// OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true);
goto EXIT;
}
res = load_indices();
if (res != OLAP_SUCCESS) {
LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
if (config::auto_recover_index_loading_failure) {
LOG(WARNING) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
} else {
// fatal log will let BE process exit
LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
}
goto EXIT;
}
@ -333,11 +339,14 @@ OLAPStatus OLAPTable::load() {
}
release_header_lock();
EXIT:
// always set _is_loaded to true, so that this tablet will not be loaded again
_is_loaded = true;
EXIT:
if (res != OLAP_SUCCESS) {
OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash());
_is_bad = true;
// Do not drop table directly here, FE will get the report and handle it.
// OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash());
}
return res;
@ -2227,7 +2236,7 @@ void OLAPTable::set_io_error() {
}
bool OLAPTable::is_used() {
return _store->is_used();
return !_is_bad && _store->is_used();
}
VersionEntity OLAPTable::get_version_entity_by_version(const Version& version) {

View File

@ -626,6 +626,8 @@ public:
bool is_used();
// Set or clear the bad flag of this tablet by storing is_bad into _is_bad.
void set_bad(bool is_bad) { _is_bad = is_bad; }
// 得到当前table的root path路径,路径末尾不带斜杠(/)
std::string storage_root_path_name() {
return _storage_root_path;
@ -753,6 +755,7 @@ private:
std::string _tablet_path;
bool _table_for_check;
std::atomic<bool> _is_bad; // if this tablet is broken, set to true. default is false
DISALLOW_COPY_AND_ASSIGN(OLAPTable);
};