[fix](gc tablet) fix get shutdown tablet cost a lot time (#27693)
This commit is contained in:
@ -619,24 +619,12 @@ TabletSharedPtr TabletManager::_get_tablet_unlocked(TTabletId tablet_id, bool in
|
||||
TabletSharedPtr tablet;
|
||||
tablet = _get_tablet_unlocked(tablet_id);
|
||||
if (tablet == nullptr && include_deleted) {
|
||||
{
|
||||
std::shared_lock rdlock(_shutdown_tablets_lock);
|
||||
for (auto& deleted_tablet : _shutdown_tablets) {
|
||||
CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr";
|
||||
if (deleted_tablet->tablet_id() == tablet_id) {
|
||||
tablet = deleted_tablet;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (tablet == nullptr) {
|
||||
std::shared_lock rdlock(_shutdown_deleting_tablets_lock);
|
||||
for (auto& deleted_tablet : _shutdown_deleting_tablets) {
|
||||
CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr";
|
||||
if (deleted_tablet->tablet_id() == tablet_id) {
|
||||
tablet = deleted_tablet;
|
||||
break;
|
||||
}
|
||||
std::shared_lock rdlock(_shutdown_tablets_lock);
|
||||
for (auto& deleted_tablet : _shutdown_tablets) {
|
||||
CHECK(deleted_tablet != nullptr) << "deleted tablet is nullptr";
|
||||
if (deleted_tablet->tablet_id() == tablet_id) {
|
||||
tablet = deleted_tablet;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1040,119 +1028,155 @@ Status TabletManager::build_all_report_tablets_info(std::map<TTabletId, TTablet>
|
||||
}
|
||||
|
||||
Status TabletManager::start_trash_sweep() {
|
||||
SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
|
||||
{
|
||||
for_each_tablet(
|
||||
[](const TabletSharedPtr& tablet) { tablet->delete_expired_stale_rowset(); },
|
||||
filter_all_tablets);
|
||||
std::unique_lock<std::mutex> lock(_gc_tablets_lock, std::defer_lock);
|
||||
if (!lock.try_lock()) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
int32_t clean_num = 0;
|
||||
do {
|
||||
SCOPED_CONSUME_MEM_TRACKER(_mem_tracker);
|
||||
for_each_tablet([](const TabletSharedPtr& tablet) { tablet->delete_expired_stale_rowset(); },
|
||||
filter_all_tablets);
|
||||
|
||||
std::list<TabletSharedPtr>::iterator last_it;
|
||||
{
|
||||
std::shared_lock rdlock(_shutdown_tablets_lock);
|
||||
last_it = _shutdown_tablets.begin();
|
||||
if (last_it == _shutdown_tablets.end()) {
|
||||
return Status::OK();
|
||||
}
|
||||
}
|
||||
|
||||
auto get_batch_tablets = [this, &last_it](int limit) {
|
||||
std::vector<TabletSharedPtr> batch_tablets;
|
||||
std::lock_guard<std::shared_mutex> wrdlock(_shutdown_tablets_lock);
|
||||
while (last_it != _shutdown_tablets.end() && batch_tablets.size() < limit) {
|
||||
// it means current tablet is referenced by other thread
|
||||
if (last_it->use_count() > 1) {
|
||||
last_it++;
|
||||
} else {
|
||||
batch_tablets.push_back(*last_it);
|
||||
last_it = _shutdown_tablets.erase(last_it);
|
||||
}
|
||||
}
|
||||
|
||||
return batch_tablets;
|
||||
};
|
||||
|
||||
std::list<TabletSharedPtr> failed_tablets;
|
||||
// return true if need continue delete
|
||||
auto delete_one_batch = [this, get_batch_tablets, &failed_tablets]() -> bool {
|
||||
int limit = 200;
|
||||
for (;;) {
|
||||
auto batch_tablets = get_batch_tablets(limit);
|
||||
for (const auto& tablet : batch_tablets) {
|
||||
if (_move_tablet_to_trash(tablet)) {
|
||||
limit--;
|
||||
} else {
|
||||
failed_tablets.push_back(tablet);
|
||||
}
|
||||
}
|
||||
if (limit <= 0) {
|
||||
return true;
|
||||
}
|
||||
if (batch_tablets.empty()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
};
|
||||
|
||||
while (delete_one_batch()) {
|
||||
#ifndef BE_TEST
|
||||
sleep(1);
|
||||
#endif
|
||||
clean_num = 0;
|
||||
// should get write lock here, because it will remove tablet from shut_down_tablets
|
||||
// and get tablet will access shut_down_tablets
|
||||
{
|
||||
std::lock_guard<std::shared_mutex> wrlock1(_shutdown_tablets_lock);
|
||||
std::lock_guard<std::shared_mutex> wrlock2(_shutdown_deleting_tablets_lock);
|
||||
for (const auto& tablet : _shutdown_tablets) {
|
||||
_shutdown_deleting_tablets.push_back(tablet);
|
||||
}
|
||||
_shutdown_tablets.clear();
|
||||
}
|
||||
std::lock_guard<std::shared_mutex> wrlock(_shutdown_deleting_tablets_lock);
|
||||
auto it = _shutdown_deleting_tablets.begin();
|
||||
while (it != _shutdown_deleting_tablets.end()) {
|
||||
// check if the meta has the tablet info and its state is shutdown
|
||||
if (it->use_count() > 1) {
|
||||
// it means current tablet is referenced by other thread
|
||||
++it;
|
||||
continue;
|
||||
}
|
||||
TabletMetaSharedPtr tablet_meta(new TabletMeta());
|
||||
int64_t get_meta_ts = MonotonicMicros();
|
||||
Status check_st = TabletMetaManager::get_meta((*it)->data_dir(), (*it)->tablet_id(),
|
||||
(*it)->schema_hash(), tablet_meta);
|
||||
if (check_st.ok()) {
|
||||
if (tablet_meta->tablet_state() != TABLET_SHUTDOWN ||
|
||||
tablet_meta->tablet_uid() != (*it)->tablet_uid()) {
|
||||
LOG(WARNING) << "tablet's state changed to normal, skip remove dirs"
|
||||
<< " tablet id = " << tablet_meta->tablet_id()
|
||||
<< " schema hash = " << tablet_meta->schema_hash()
|
||||
<< " old tablet_uid=" << (*it)->tablet_uid()
|
||||
<< " cur tablet_uid=" << tablet_meta->tablet_uid();
|
||||
// remove it from list
|
||||
it = _shutdown_deleting_tablets.erase(it);
|
||||
continue;
|
||||
}
|
||||
// move data to trash
|
||||
const auto& tablet_path = (*it)->tablet_path();
|
||||
bool exists = false;
|
||||
Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
|
||||
if (!exists_st) {
|
||||
continue;
|
||||
}
|
||||
if (exists) {
|
||||
// take snapshot of tablet meta
|
||||
auto meta_file_path = fmt::format("{}/{}.hdr", tablet_path, (*it)->tablet_id());
|
||||
int64_t save_meta_ts = MonotonicMicros();
|
||||
RETURN_IF_ERROR((*it)->tablet_meta()->save(meta_file_path));
|
||||
int64_t now = MonotonicMicros();
|
||||
LOG(INFO) << "start to move tablet to trash. " << tablet_path
|
||||
<< ". rocksdb get meta cost " << (save_meta_ts - get_meta_ts)
|
||||
<< " us, rocksdb save meta cost " << (now - save_meta_ts)
|
||||
<< " us, clean_num=" << clean_num;
|
||||
Status rm_st = (*it)->data_dir()->move_to_trash(tablet_path);
|
||||
if (!rm_st.ok()) {
|
||||
LOG(WARNING) << "fail to move dir to trash. " << tablet_path;
|
||||
++it;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// remove tablet meta
|
||||
RETURN_IF_ERROR(TabletMetaManager::remove((*it)->data_dir(), (*it)->tablet_id(),
|
||||
(*it)->schema_hash()));
|
||||
LOG(INFO) << "successfully move tablet to trash. "
|
||||
<< "tablet_id=" << (*it)->tablet_id()
|
||||
<< ", schema_hash=" << (*it)->schema_hash()
|
||||
<< ", tablet_path=" << tablet_path;
|
||||
it = _shutdown_deleting_tablets.erase(it);
|
||||
++clean_num;
|
||||
} else {
|
||||
// if could not find tablet info in meta store, then check if dir existed
|
||||
const auto& tablet_path = (*it)->tablet_path();
|
||||
bool exists = false;
|
||||
Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
|
||||
if (!exists_st) {
|
||||
continue;
|
||||
}
|
||||
if (exists) {
|
||||
LOG(WARNING) << "errors while load meta from store, skip this tablet. "
|
||||
<< "tablet_id=" << (*it)->tablet_id()
|
||||
<< ", schema_hash=" << (*it)->schema_hash();
|
||||
++it;
|
||||
} else {
|
||||
LOG(INFO) << "could not find tablet dir, skip it and remove it from gc-queue. "
|
||||
<< "tablet_id=" << (*it)->tablet_id()
|
||||
<< ", schema_hash=" << (*it)->schema_hash()
|
||||
<< ", tablet_path=" << tablet_path;
|
||||
it = _shutdown_deleting_tablets.erase(it);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// yield to avoid holding _tablet_map_lock for too long
|
||||
if (clean_num >= 200) {
|
||||
break;
|
||||
if (!failed_tablets.empty()) {
|
||||
std::lock_guard<std::shared_mutex> wrlock(_shutdown_tablets_lock);
|
||||
_shutdown_tablets.splice(_shutdown_tablets.end(), failed_tablets);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Tries to move one shut-down tablet's data directory to trash and remove its meta.
//
// Returns true when the tablet needs no further handling by the gc queue:
//   - its stored meta is no longer TABLET_SHUTDOWN or carries a different tablet_uid
//     (nothing is removed in that case),
//   - its directory was moved to trash (when it existed) and its meta was removed, or
//   - its meta could not be loaded and its directory does not exist.
// Returns false when a step failed (existence check, meta snapshot save, move to
// trash, meta removal) or when the meta could not be loaded but the directory
// still exists.
// In this file the caller (delete_one_batch in start_trash_sweep) collects tablets
// that returned false into failed_tablets, which start_trash_sweep splices back
// onto _shutdown_tablets.
bool TabletManager::_move_tablet_to_trash(const TabletSharedPtr& tablet) {
|
||||
TabletMetaSharedPtr tablet_meta(new TabletMeta());
|
||||
int64_t get_meta_ts = MonotonicMicros();
|
||||
// Load the tablet's meta as currently stored, to compare it with the in-memory tablet.
Status check_st = TabletMetaManager::get_meta(tablet->data_dir(), tablet->tablet_id(),
|
||||
tablet->schema_hash(), tablet_meta);
|
||||
if (check_st.ok()) {
|
||||
// The stored meta is not in shutdown state, or belongs to a different tablet_uid:
// leave the data alone and report the tablet as handled.
if (tablet_meta->tablet_state() != TABLET_SHUTDOWN ||
|
||||
tablet_meta->tablet_uid() != tablet->tablet_uid()) {
|
||||
LOG(WARNING) << "tablet's state changed to normal, skip remove dirs"
|
||||
<< " tablet id = " << tablet_meta->tablet_id()
|
||||
<< " schema hash = " << tablet_meta->schema_hash()
|
||||
<< " old tablet_uid=" << tablet->tablet_uid()
|
||||
<< " cur tablet_uid=" << tablet_meta->tablet_uid();
|
||||
return true;
|
||||
}
|
||||
// move data to trash
|
||||
const auto& tablet_path = tablet->tablet_path();
|
||||
bool exists = false;
|
||||
Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
|
||||
// Could not determine whether the directory exists: report failure.
if (!exists_st) {
|
||||
return false;
|
||||
}
|
||||
if (exists) {
|
||||
// take snapshot of tablet meta
|
||||
// The snapshot is written as "<tablet_path>/<tablet_id>.hdr" before the directory is moved.
auto meta_file_path = fmt::format("{}/{}.hdr", tablet_path, tablet->tablet_id());
|
||||
int64_t save_meta_ts = MonotonicMicros();
|
||||
auto save_st = tablet->tablet_meta()->save(meta_file_path);
|
||||
if (!save_st.ok()) {
|
||||
LOG(WARNING) << "failed to save meta, tablet_id=" << tablet_meta->tablet_id()
|
||||
<< ", tablet_uid=" << tablet_meta->tablet_uid()
|
||||
<< ", error=" << save_st;
|
||||
return false;
|
||||
}
|
||||
int64_t now = MonotonicMicros();
|
||||
// NOTE(review): the "get meta cost" interval runs from before get_meta() to just
// before save(), so it also covers the existence check above.
LOG(INFO) << "start to move tablet to trash. " << tablet_path
|
||||
<< ". rocksdb get meta cost " << (save_meta_ts - get_meta_ts)
|
||||
<< " us, rocksdb save meta cost " << (now - save_meta_ts) << " us";
|
||||
Status rm_st = tablet->data_dir()->move_to_trash(tablet_path);
|
||||
if (!rm_st.ok()) {
|
||||
LOG(WARNING) << "fail to move dir to trash. " << tablet_path;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// NOTE(review): the next five lines (through "} // start_trash_sweep") look like
// removed lines of the previous start_trash_sweep loop that the diff rendering
// interleaved here; they do not appear to belong to this function - confirm
// against the real file.
// >= 200 means there may be more tablets need to be handled
|
||||
// So continue
|
||||
} while (clean_num >= 200);
|
||||
return Status::OK();
|
||||
} // start_trash_sweep
|
||||
// remove tablet meta
|
||||
auto remove_st = TabletMetaManager::remove(tablet->data_dir(), tablet->tablet_id(),
|
||||
tablet->schema_hash());
|
||||
if (!remove_st.ok()) {
|
||||
LOG(WARNING) << "failed to remove meta, tablet_id=" << tablet_meta->tablet_id()
|
||||
<< ", tablet_uid=" << tablet_meta->tablet_uid() << ", error=" << remove_st;
|
||||
return false;
|
||||
}
|
||||
LOG(INFO) << "successfully move tablet to trash. "
|
||||
<< "tablet_id=" << tablet->tablet_id()
|
||||
<< ", schema_hash=" << tablet->schema_hash() << ", tablet_path=" << tablet_path;
|
||||
return true;
|
||||
} else {
|
||||
// if could not find tablet info in meta store, then check if dir existed
|
||||
const auto& tablet_path = tablet->tablet_path();
|
||||
bool exists = false;
|
||||
Status exists_st = io::global_local_filesystem()->exists(tablet_path, &exists);
|
||||
if (!exists_st) {
|
||||
return false;
|
||||
}
|
||||
// Meta could not be loaded but the directory is still on disk: keep the tablet queued (false).
if (exists) {
|
||||
LOG(WARNING) << "errors while load meta from store, skip this tablet. "
|
||||
<< "tablet_id=" << tablet->tablet_id()
|
||||
<< ", schema_hash=" << tablet->schema_hash();
|
||||
return false;
|
||||
} else {
|
||||
// Neither meta nor directory is left, so the tablet can leave the gc queue (true).
LOG(INFO) << "could not find tablet dir, skip it and remove it from gc-queue. "
|
||||
<< "tablet_id=" << tablet->tablet_id()
|
||||
<< ", schema_hash=" << tablet->schema_hash()
|
||||
<< ", tablet_path=" << tablet_path;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool TabletManager::register_clone_tablet(int64_t tablet_id) {
|
||||
tablets_shard& shard = _get_tablets_shard(tablet_id);
|
||||
|
||||
@ -206,6 +206,8 @@ private:
|
||||
|
||||
std::shared_mutex& _get_tablets_shard_lock(TTabletId tabletId);
|
||||
|
||||
bool _move_tablet_to_trash(const TabletSharedPtr& tablet);
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(TabletManager);
|
||||
|
||||
@ -237,11 +239,9 @@ private:
|
||||
std::shared_mutex _shutdown_tablets_lock;
|
||||
// partition_id => tablet_info
|
||||
std::map<int64_t, std::set<TabletInfo>> _partition_tablet_map;
|
||||
std::vector<TabletSharedPtr> _shutdown_tablets;
|
||||
|
||||
// gc thread will move _shutdown_tablets to _shutdown_deleting_tablets
|
||||
std::shared_mutex _shutdown_deleting_tablets_lock;
|
||||
std::list<TabletSharedPtr> _shutdown_deleting_tablets;
|
||||
// The deleted (shut-down) tablets. Note: only the function `start_trash_sweep` is allowed to erase tablets from _shutdown_tablets.
|
||||
std::list<TabletSharedPtr> _shutdown_tablets;
|
||||
std::mutex _gc_tablets_lock;
|
||||
|
||||
std::mutex _tablet_stat_cache_mutex;
|
||||
std::shared_ptr<std::vector<TTabletStat>> _tablet_stat_list_cache =
|
||||
|
||||
Reference in New Issue
Block a user