Set tablet as bad when index loading fails (#1146)
A bad tablet will be reported to FE and handled there. Also add a config, auto_recover_index_loading_failure, to control how an index loading failure is processed.
This commit is contained in:
@ -399,6 +399,10 @@ namespace config {
|
||||
|
||||
// max consumer num in one data consumer group, for routine load
|
||||
CONF_Int32(max_consumer_num_per_group, "3");
|
||||
|
||||
// If set to true, an index loading failure will not cause BE to exit,
|
||||
// and the tablet will be marked as bad, so that FE will try to repair it.
|
||||
CONF_Bool(auto_recover_index_loading_failure, "false");
|
||||
} // namespace config
|
||||
|
||||
} // namespace doris
|
||||
|
||||
@ -43,13 +43,14 @@ Status EngineMetaReader::get_hints(
|
||||
RuntimeProfile* profile) {
|
||||
auto tablet_id = scan_range->scan_range().tablet_id;
|
||||
int32_t schema_hash = strtoul(scan_range->scan_range().schema_hash.c_str(), NULL, 10);
|
||||
std::string err;
|
||||
OLAPTablePtr table = OLAPEngine::get_instance()->get_table(
|
||||
tablet_id, schema_hash);
|
||||
tablet_id, schema_hash, true, &err);
|
||||
if (table.get() == NULL) {
|
||||
LOG(WARNING) << "tablet does not exist. tablet_id=" << tablet_id << ", schema_hash="
|
||||
<< schema_hash;
|
||||
std::stringstream ss;
|
||||
ss << "tablet does not exist: " << tablet_id;
|
||||
ss << "failed to get tablet: " << tablet_id << "with schema hash: "
|
||||
<< schema_hash << ", reason: " << err;
|
||||
LOG(WARNING) << ss.str();
|
||||
return Status(ss.str());
|
||||
}
|
||||
|
||||
|
||||
@ -79,13 +79,13 @@ Status OlapScanner::_prepare(
|
||||
VersionHash version_hash =
|
||||
strtoul(scan_range->scan_range().version_hash.c_str(), nullptr, 10);
|
||||
{
|
||||
_olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash);
|
||||
std::string err;
|
||||
_olap_table = OLAPEngine::get_instance()->get_table(tablet_id, schema_hash, true, &err);
|
||||
if (_olap_table.get() == nullptr) {
|
||||
OLAP_LOG_WARNING("tablet does not exist. [tablet_id=%ld schema_hash=%d]",
|
||||
tablet_id, schema_hash);
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "tablet does not exist: " << tablet_id;
|
||||
ss << "failed to get tablet: " << tablet_id << " with schema hash: " << schema_hash
|
||||
<< ", reason: " << err;
|
||||
LOG(WARNING) << ss.str();
|
||||
return Status(ss.str());
|
||||
}
|
||||
{
|
||||
|
||||
@ -785,7 +785,7 @@ OLAPTablePtr OLAPEngine::_get_table_with_no_lock(TTabletId tablet_id, SchemaHash
|
||||
return olap_table;
|
||||
}
|
||||
|
||||
OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table) {
|
||||
OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table, std::string* err) {
|
||||
_tablet_map_lock.rdlock();
|
||||
OLAPTablePtr olap_table;
|
||||
olap_table = _get_table_with_no_lock(tablet_id, schema_hash);
|
||||
@ -794,13 +794,18 @@ OLAPTablePtr OLAPEngine::get_table(TTabletId tablet_id, SchemaHash schema_hash,
|
||||
if (olap_table.get() != NULL) {
|
||||
if (!olap_table->is_used()) {
|
||||
OLAP_LOG_WARNING("olap table cannot be used. [table=%ld]", tablet_id);
|
||||
if (err != nullptr) { *err = "tablet cannot be used"; }
|
||||
olap_table.reset();
|
||||
} else if (load_table && !olap_table->is_loaded()) {
|
||||
if (olap_table->load() != OLAP_SUCCESS) {
|
||||
OLAPStatus ost = olap_table->load();
|
||||
if (ost != OLAP_SUCCESS) {
|
||||
OLAP_LOG_WARNING("fail to load olap table. [table=%ld]", tablet_id);
|
||||
if (err != nullptr) { *err = "load tablet failed"; }
|
||||
olap_table.reset();
|
||||
}
|
||||
}
|
||||
} else if (err != nullptr) {
|
||||
*err = "tablet does not exist";
|
||||
}
|
||||
|
||||
return olap_table;
|
||||
@ -835,6 +840,10 @@ OLAPStatus OLAPEngine::get_tables_by_id(
|
||||
it = table_list->erase(it);
|
||||
continue;
|
||||
}
|
||||
} else if ((*it)->is_used()) {
|
||||
LOG(WARNING) << "table is bad: " << (*it)->full_name().c_str();
|
||||
it = table_list->erase(it);
|
||||
continue;
|
||||
}
|
||||
++it;
|
||||
}
|
||||
@ -1884,7 +1893,7 @@ OLAPTablePtr OLAPEngine::_find_best_tablet_to_compaction(CompactionType compacti
|
||||
OLAPTablePtr best_table;
|
||||
for (tablet_map_t::value_type& table_ins : _tablet_map){
|
||||
for (OLAPTablePtr& table_ptr : table_ins.second.table_arr) {
|
||||
if (!table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) {
|
||||
if (!table_ptr->is_used() || !table_ptr->is_loaded() || !_can_do_compaction(table_ptr)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
@ -86,7 +86,11 @@ public:
|
||||
}
|
||||
|
||||
// Get table pointer
|
||||
OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true);
|
||||
// TODO(cmy): I think it is better to return Status instead of OLAPTablePtr,
|
||||
// so that the caller can decide what to do next based on Status.
|
||||
// Currently, I just add a new parameter 'err' to save the error msg.
|
||||
// This should be redesigned later.
|
||||
OLAPTablePtr get_table(TTabletId tablet_id, SchemaHash schema_hash, bool load_table = true, std::string* err = nullptr);
|
||||
|
||||
OLAPStatus get_tables_by_id(TTabletId tablet_id, std::list<OLAPTablePtr>* table_list);
|
||||
|
||||
|
||||
@ -148,7 +148,8 @@ OLAPTable::OLAPTable(OLAPHeader* header, OlapStore* store) :
|
||||
_num_key_fields(0),
|
||||
_id(0),
|
||||
_store(store),
|
||||
_is_loaded(false) {
|
||||
_is_loaded(false),
|
||||
_is_bad(false) {
|
||||
if (header == NULL) {
|
||||
return; // for convenience of mock test.
|
||||
}
|
||||
@ -310,13 +311,18 @@ OLAPStatus OLAPTable::load() {
|
||||
<< "res=" << res << ", root=" << one_schema_root;
|
||||
goto EXIT;
|
||||
} else if (res != OLAP_SUCCESS) {
|
||||
OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true);
|
||||
return res;
|
||||
// OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash(), true);
|
||||
goto EXIT;
|
||||
}
|
||||
res = load_indices();
|
||||
|
||||
if (res != OLAP_SUCCESS) {
|
||||
LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
|
||||
if (config::auto_recover_index_loading_failure) {
|
||||
LOG(WARNING) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
|
||||
} else {
|
||||
// fatal log will let BE process exit
|
||||
LOG(FATAL) << "fail to load indices. [res=" << res << " table='" << _full_name << "']";
|
||||
}
|
||||
goto EXIT;
|
||||
}
|
||||
|
||||
@ -333,11 +339,14 @@ OLAPStatus OLAPTable::load() {
|
||||
}
|
||||
release_header_lock();
|
||||
|
||||
EXIT:
|
||||
// always set _is_loaded to true, so that this tablet will not be loaded again
|
||||
_is_loaded = true;
|
||||
|
||||
EXIT:
|
||||
if (res != OLAP_SUCCESS) {
|
||||
OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash());
|
||||
_is_bad = true;
|
||||
// Do not drop table directly here, FE will get the report and handle it.
|
||||
// OLAPEngine::get_instance()->drop_table(tablet_id(), schema_hash());
|
||||
}
|
||||
|
||||
return res;
|
||||
@ -2227,7 +2236,7 @@ void OLAPTable::set_io_error() {
|
||||
}
|
||||
|
||||
bool OLAPTable::is_used() {
|
||||
return _store->is_used();
|
||||
return !_is_bad && _store->is_used();
|
||||
}
|
||||
|
||||
VersionEntity OLAPTable::get_version_entity_by_version(const Version& version) {
|
||||
|
||||
@ -626,6 +626,8 @@ public:
|
||||
|
||||
bool is_used();
|
||||
|
||||
void set_bad(bool is_bad) { _is_bad = is_bad; }
|
||||
|
||||
// 得到当前table的root path路径,路径末尾不带斜杠(/)
|
||||
std::string storage_root_path_name() {
|
||||
return _storage_root_path;
|
||||
@ -753,6 +755,7 @@ private:
|
||||
std::string _tablet_path;
|
||||
|
||||
bool _table_for_check;
|
||||
std::atomic<bool> _is_bad; // if this tablet is broken, set to true. default is false
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(OLAPTable);
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user