diff --git a/be/src/olap/data_dir.cpp b/be/src/olap/data_dir.cpp index 3820f2c186..6cf9e0ca64 100644 --- a/be/src/olap/data_dir.cpp +++ b/be/src/olap/data_dir.cpp @@ -477,7 +477,7 @@ OLAPStatus DataDir::load() { // 1. add committed rowset to txn map // 2. add visible rowset to tablet // ignore any errors when load tablet or rowset, because fe will repair them after report - int64_t tablet_not_found = 0; + int64_t invalid_rowset_counter = 0; for (auto rowset_meta : dir_rowset_metas) { TabletSharedPtr tablet = _tablet_manager->get_tablet(rowset_meta->tablet_id(), rowset_meta->tablet_schema_hash()); @@ -486,7 +486,7 @@ OLAPStatus DataDir::load() { VLOG_NOTICE << "could not find tablet id: " << rowset_meta->tablet_id() << ", schema hash: " << rowset_meta->tablet_schema_hash() << ", for rowset: " << rowset_meta->rowset_id() << ", skip this rowset"; - ++tablet_not_found; + ++invalid_rowset_counter; continue; } RowsetSharedPtr rowset; @@ -534,10 +534,15 @@ OLAPStatus DataDir::load() { << " schema hash: " << rowset_meta->tablet_schema_hash() << " txn: " << rowset_meta->txn_id() << " current valid tablet uid: " << tablet->tablet_uid(); + ++invalid_rowset_counter; } } + // At startup, we only count these invalid rowsets, but do not actually delete them. + // The actual deletion is done in StorageEngine::_clean_unused_rowset_metas, + // which cleans them up uniformly in the background cleanup thread. 
LOG(INFO) << "finish to load tablets from " << _path_desc.filepath << ", total rowset meta: " - << dir_rowset_metas.size() << ", tablet not found: " << tablet_not_found; + << dir_rowset_metas.size() << ", invalid rowset num: " << invalid_rowset_counter; + return OLAP_SUCCESS; } diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 201bffb664..823565be28 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -714,11 +714,12 @@ void StorageEngine::_clean_unused_rowset_metas() { std::vector invalid_rowset_metas; auto clean_rowset_func = [this, &invalid_rowset_metas](TabletUid tablet_uid, RowsetId rowset_id, const std::string& meta_str) -> bool { + // return false will break meta iterator, return true to skip this error RowsetMetaSharedPtr rowset_meta(new AlphaRowsetMeta()); bool parsed = rowset_meta->init(meta_str); if (!parsed) { LOG(WARNING) << "parse rowset meta string failed for rowset_id:" << rowset_id; - // return false will break meta iterator, return true to skip this error + invalid_rowset_metas.push_back(rowset_meta); return true; } if (rowset_meta->tablet_uid() != tablet_uid) { @@ -726,12 +727,32 @@ void StorageEngine::_clean_unused_rowset_metas() { << ", rowset_id=" << rowset_meta->rowset_id() << ", in_put_tablet_uid=" << tablet_uid << ", tablet_uid in rowset meta=" << rowset_meta->tablet_uid(); + invalid_rowset_metas.push_back(rowset_meta); return true; } TabletSharedPtr tablet = _tablet_manager->get_tablet( - rowset_meta->tablet_id(), rowset_meta->tablet_schema_hash(), tablet_uid); + rowset_meta->tablet_id(), rowset_meta->tablet_schema_hash()); if (tablet == nullptr) { + // tablet may be dropped + // TODO(cmy): this is better to be a VLOG, because drop table is a very common case. + // leave it as INFO log for observation. Maybe change it in future. 
+ LOG(INFO) << "failed to find tablet " << rowset_meta->tablet_id() << " for rowset: " << rowset_meta->rowset_id() + << ", tablet may be dropped"; + invalid_rowset_metas.push_back(rowset_meta); + return true; + } + if (tablet->tablet_uid() != rowset_meta->tablet_uid()) { + // In this case, we get the tablet using the tablet id recorded in the rowset meta, + // but the uid in the tablet is different from the one recorded in the rowset meta. + // How this happened: + // Replica1 of Tablet A exists on BE1. Because of the clone task, a new replica2 is created on BE2, + // and then replica1 is deleted from BE1. After some time, we create a replica again on BE1, + // which creates a new tablet with the same id but a different uid. + // And in the historical version, when we deleted the replica, we did not delete the corresponding rowset meta, + // thus causing the original rowset meta to remain (with the same tablet id but a different uid). + LOG(WARNING) << "rowset's tablet uid " << rowset_meta->tablet_uid() << " does not equal to tablet uid: " << tablet->tablet_uid(); + invalid_rowset_metas.push_back(rowset_meta); return true; } if (rowset_meta->rowset_state() == RowsetStatePB::VISIBLE && @@ -750,8 +771,9 @@ void StorageEngine::_clean_unused_rowset_metas() { RowsetMetaManager::traverse_rowset_metas(data_dir->get_meta(), clean_rowset_func); for (auto& rowset_meta : invalid_rowset_metas) { RowsetMetaManager::remove(data_dir->get_meta(), rowset_meta->tablet_uid(), - rowset_meta->rowset_id()); + rowset_meta->rowset_id()); } + LOG(INFO) << "remove " << invalid_rowset_metas.size() << " invalid rowset meta from dir: " << data_dir->path(); invalid_rowset_metas.clear(); } }