From d29bf30f74a3989d2b8da666644fb5a8e380a319 Mon Sep 17 00:00:00 2001 From: ZhangYu0123 <67053339+ZhangYu0123@users.noreply.github.com> Date: Tue, 8 Sep 2020 18:52:53 +0800 Subject: [PATCH] [BUG] Fix stale path delete checking logic when current main path is missing. (#4549) Fix stale path delete checking logic. When current main path is version missing, then delete checking logic is always core dumped. So we fix the checking logic to tolerate current main version missing. --- be/src/olap/tablet.cpp | 64 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index e020767ac6..ea841642c0 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -440,6 +440,11 @@ void Tablet::delete_expired_stale_rowset() { LOG(WARNING) << "lastest_delta is null " << tablet_id(); return; } + + // fetch missing version before delete + std::vector missed_versions; + calc_missed_versions_unlocked(lastest_delta->end_version(), &missed_versions); + // do check consistent operation auto path_id_iter = path_id_vec.begin(); @@ -452,21 +457,58 @@ void Tablet::delete_expired_stale_rowset() { stale_version_path_map[*path_id_iter] = version_path; OLAPStatus status = capture_consistent_versions(test_version, nullptr); - // When there is no consistent versions, we must reconstruct the tracker. + // 1. When there is no consistent versions, we must reconstruct the tracker. if (status != OLAP_SUCCESS) { - LOG(WARNING) << "The consistent version check fails, there are bugs. " - << "Reconstruct the tracker to recover versions in tablet=" << tablet_id(); + + // 2. fetch missing version after delete + std::vector after_missed_versions; + calc_missed_versions_unlocked(lastest_delta->end_version(), &after_missed_versions); - _timestamped_version_tracker.recover_versioned_tracker(stale_version_path_map); + // 2.1 check whether missed_versions and after_missed_versions are the same. + // when they are the same, it means we can delete the path securely. + bool is_missng = missed_versions.size() != after_missed_versions.size(); + + if (!is_missng) { + for (int ver_index = 0; ver_index < missed_versions.size(); ver_index++) { + if(missed_versions[ver_index] != after_missed_versions[ver_index]) { + is_missng = true; + break; + } + } + } - // double check the consistent versions - status = capture_consistent_versions(test_version, nullptr); + if (is_missng) { + LOG(WARNING) << "The consistent version check fails, there are bugs. " + << "Reconstruct the tracker to recover versions in tablet=" << tablet_id(); - if (status != OLAP_SUCCESS) { - if (!config::ignore_rowset_stale_unconsistent_delete) { - LOG(FATAL) << "rowset stale unconsistent delete. tablet= " << tablet_id(); - } else { - LOG(WARNING) << "rowset stale unconsistent delete. tablet= " << tablet_id(); + // 3. try to recover + _timestamped_version_tracker.recover_versioned_tracker(stale_version_path_map); + + // 4. double check the consistent versions + // fetch missing version after recover + std::vector recover_missed_versions; + calc_missed_versions_unlocked(lastest_delta->end_version(), &recover_missed_versions); + + // 4.1 check whether missed_versions and recover_missed_versions are the same. + // when they are the same, it means we recover successlly. + bool is_recover_missng = missed_versions.size() != recover_missed_versions.size(); + + if (!is_recover_missng) { + for (int ver_index = 0; ver_index < missed_versions.size(); ver_index++) { + if(missed_versions[ver_index] != recover_missed_versions[ver_index]) { + is_recover_missng = true; + break; + } + } + } + + // 5. check recover fail, version is mission + if (is_recover_missng) { + if (!config::ignore_rowset_stale_unconsistent_delete) { + LOG(FATAL) << "rowset stale unconsistent delete. tablet= " << tablet_id(); + } else { + LOG(WARNING) << "rowset stale unconsistent delete. tablet= " << tablet_id(); + } } } return;