Enhance the clog replay checks to prevent replaying clog after a tablet delete / finish-transfer-out transaction has been committed

This commit is contained in:
hiddenbomb
2024-03-26 02:45:53 +00:00
committed by ob-robot
parent 0251ce4679
commit d57716b902
6 changed files with 64 additions and 9 deletions

View File

@ -1895,6 +1895,10 @@ DEF_BOOL(_enable_choose_migration_source_policy, OB_TENANT_PARAMETER, "True",
DEF_BOOL(_global_enable_rich_vector_format, OB_CLUSTER_PARAMETER, "True",
"Control whether use rich vector format in vectorization engine",
ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
// Emergency switch, declared as a tenant parameter with DYNAMIC_EFFECTIVE edit level and a
// default of "FALSE". It is read by ObLS::check_tablet_status_and_scn and
// ObTxReplayExecutor::replay_row_ to decide whether redo log that arrives after a tablet's
// delete transaction has committed may be skipped instead of being reported as an error.
// NOTE: the identifier keeps its historical spelling ("detete") because the replay code and
// the parameter-name listing reference it verbatim; only the description text is corrected.
DEF_BOOL(_allow_skip_replay_redo_after_detete_tablet, OB_TENANT_PARAMETER, "FALSE",
         "allow skip replay invalid redo log after tablet delete transaction is committed. "
         "The default value is FALSE. Value: TRUE means we allow skip replaying this invalid redo log, False means we do not allow such behavior.",
         ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
//check os params
DEF_BOOL(strict_check_os_params, OB_CLUSTER_PARAMETER, "False",

View File

@ -24,6 +24,7 @@
#include "observer/net/ob_ingress_bw_alloc_service.h"
#include "observer/ob_srv_network_frame.h"
#include "observer/report/ob_i_meta_report.h"
#include "observer/omt/ob_tenant_config_mgr.h"
#include "rootserver/freeze/ob_major_freeze_service.h"
#include "rootserver/tenant_snapshot/ob_tenant_snapshot_scheduler.h"
#include "rootserver/restore/ob_clone_scheduler.h"
@ -46,6 +47,7 @@
#include "share/leak_checker/obj_leak_checker.h"
#include "share/ob_ls_id.h"
#include "share/ob_global_autoinc_service.h"
#include "share/ob_force_print_log.h"
#include "sql/das/ob_das_id_service.h"
#include "storage/compaction/ob_tenant_tablet_scheduler.h"
#include "storage/ls/ob_ls.h"
@ -1780,6 +1782,8 @@ int ObLS::replay_get_tablet(
&& ObTabletStatus::TRANSFER_OUT_DELETED != tablet_status) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("tablet is empty shell but user data is unexpected", K(ret), KPC(tablet));
} else if (OB_FAIL(check_tablet_status_and_scn(data, scn))) {
LOG_ERROR("fail to check tablet status and scn", K(ret), K(ls_id), K(tablet_id), K(data), K(scn));
} else {
ret = OB_OBSOLETE_CLOG_NEED_SKIP;
LOG_INFO("tablet is already deleted, need skip", KR(ret), K(ls_id), K(tablet_id), K(scn));
@ -1800,6 +1804,8 @@ int ObLS::replay_get_tablet(
LOG_INFO("latest transaction has not committed yet, should retry", KR(ret), K(ls_id), K(tablet_id),
K(scn), "clog_checkpoint_scn", tablet->get_clog_checkpoint_scn(), K(data));
}
} else if (OB_FAIL(check_tablet_status_and_scn(data, scn))) {
LOG_ERROR("fail to check tablet status and scn", K(ret), K(ls_id), K(tablet_id), K(data), K(scn));
}
}
@ -1810,6 +1816,30 @@ int ObLS::replay_get_tablet(
return ret;
}
// Sanity check used on the clog replay path: detects a log whose scn is at or beyond the
// delete commit scn of a tablet that is already DELETED / TRANSFER_OUT_DELETED.
//
// @param data  tablet create/delete mds user data (tablet_status_ and delete_commit_scn_ are read)
// @param scn   scn of the log being replayed
// @return OB_SUCCESS when the log is not past the delete commit point, or when the tenant
//         parameter _allow_skip_replay_redo_after_detete_tablet permits skipping it;
//         OB_ERR_UNEXPECTED when the tenant config is invalid or skipping is not permitted.
int ObLS::check_tablet_status_and_scn(
    const ObTabletCreateDeleteMdsUserData &data,
    const share::SCN &scn)
{
  int ret = OB_SUCCESS;

  const bool tablet_deleted = (ObTabletStatus::DELETED == data.tablet_status_
                               || ObTabletStatus::TRANSFER_OUT_DELETED == data.tablet_status_);
  // The scn comparison is only evaluated for deleted tablets (short-circuit).
  const bool replay_after_delete = tablet_deleted && scn >= data.delete_commit_scn_;

  if (OB_UNLIKELY(replay_after_delete)) {
    omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID()));
    if (OB_UNLIKELY(!tenant_config.is_valid())) {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("tenant config is invalid", K(ret));
    } else if (!tenant_config->_allow_skip_replay_redo_after_detete_tablet) {
      // Skipping is not enabled: surface the inconsistency to the caller.
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("scn is bigger than tablet delete commit scn", K(ret), K(data), K(scn));
    } else {
      // Emergency switch is on: keep OB_SUCCESS and leave a forced log entry.
      FLOG_WARN("scn is bigger than tablet delete commit scn, allow to skip replaying this clog for emergency",
          K(ret), K(data), K(scn));
    }
  }

  return ret;
}
int ObLS::logstream_freeze(const int64_t trace_id, const bool is_sync, const int64_t abs_timeout_ts)
{
int ret = OB_SUCCESS;

View File

@ -85,7 +85,7 @@ class ObCompactionScheduleIterator;
}
namespace storage
{
const static int64_t LS_INNER_TABLET_FROZEN_TIMESTAMP = 1;
class ObTabletCreateDeleteMdsUserData;
struct ObLSVTInfo
{
@ -928,7 +928,10 @@ public:
DELEGATE_WITH_RET(reserved_snapshot_mgr_, add_dependent_medium_tablet, int);
DELEGATE_WITH_RET(reserved_snapshot_mgr_, del_dependent_medium_tablet, int);
int set_ls_migration_gc(bool &allow_gc);
private:
// Checks whether a log with the given scn may still be handled for a tablet described by `data`.
// Returns OB_SUCCESS when the tablet status is neither DELETED nor TRANSFER_OUT_DELETED, or
// when scn is not >= data.delete_commit_scn_. Otherwise the tenant parameter
// _allow_skip_replay_redo_after_detete_tablet decides: OB_SUCCESS (with a warning log) when it
// is set, OB_ERR_UNEXPECTED when it is not set or the tenant config is invalid.
static int check_tablet_status_and_scn(
const ObTabletCreateDeleteMdsUserData &data,
const share::SCN &scn);
private:
// StorageBaseUtil
// table manager: create, remove and guard get.

View File

@ -5508,7 +5508,8 @@ int ObTablet::get_ddl_info(int64_t &schema_version, int64_t &schema_refreshed_ts
return ret;
}
int ObTablet::get_rec_log_scn(SCN &rec_scn) {
int ObTablet::get_rec_log_scn(SCN &rec_scn)
{
int ret = OB_SUCCESS;
rec_scn = SCN::max_scn();
ObTableHandleV2 handle;

View File

@ -11,6 +11,7 @@
*/
#include "share/throttle/ob_throttle_unit.h"
#include "observer/omt/ob_tenant_config_mgr.h"
#include "storage/ls/ob_ls.h"
#include "storage/ls/ob_ls_tx_service.h"
#include "storage/memtable/ob_memtable.h"
@ -765,7 +766,9 @@ int ObTxReplayExecutor::replay_row_(storage::ObStoreCtx &store_ctx,
memtable::ObMemtableMutatorIterator *mmi_ptr)
{
int ret = OB_SUCCESS;
common::ObTimeGuard timeguard("replay_row_in_memtable", 10 * 1000);
const share::ObLSID &ls_id = tablet->get_ls_id();
const common::ObTabletID &tablet_id = tablet->get_tablet_id();
common::ObTimeGuard timeguard("replay_row_in_memtable", 10_ms);
ObIMemtable *mem_ptr = nullptr;
ObMemtable *data_mem_ptr = nullptr;
ObStorageTableGuard w_guard(tablet, store_ctx, true, true, log_ts_ns_);
@ -774,12 +777,25 @@ int ObTxReplayExecutor::replay_row_(storage::ObStoreCtx &store_ctx,
TRANS_LOG(WARN, "[Replay Tx] invaild arguments", K(ret), KP(mmi_ptr));
} else if (FALSE_IT(timeguard.click("start"))) {
} else if (OB_FAIL(prepare_memtable_replay_(w_guard, mem_ptr))) {
if (OB_NO_NEED_UPDATE != ret) {
TRANS_LOG(WARN, "[Replay Tx] prepare for replay failed", K(ret), KP(mem_ptr), KP(mmi_ptr));
if (OB_NO_NEED_UPDATE == ret) {
TRANS_LOG(DEBUG, "[Replay Tx] Not need replay row for tablet",
K(ret), K(ls_id), K(tablet_id), K(log_ts_ns_),
K(tx_part_log_no_), K(mmi_ptr->get_row_head()));
} else if (OB_TABLET_NOT_EXIST == ret) {
omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID()));
if (OB_UNLIKELY(!tenant_config.is_valid())) {
ret = OB_ERR_UNEXPECTED;
TRANS_LOG(WARN, "tenant config is invalid", K(ret));
} else if (tenant_config->_allow_skip_replay_redo_after_detete_tablet) {
ret = OB_NO_NEED_UPDATE;
TRANS_LOG(WARN, "[Replay Tx] tablet does not exist while preparing memtable for replay, allow to skip this clog replaying for emergency",
K(ret), K(ls_id), K(tablet_id), K_(log_ts_ns));
} else {
TRANS_LOG(ERROR, "[Replay Tx] tablet does not exist while preparing memtable for replay",
K(ret), K(ls_id), K(tablet_id), K_(log_ts_ns));
}
} else {
TRANS_LOG(DEBUG, "[Replay Tx] Not need replay row for tablet", K(log_ts_ns_),
K(tx_part_log_no_), K(mmi_ptr->get_row_head()),
K(tablet->get_tablet_meta().tablet_id_));
TRANS_LOG(WARN, "[Replay Tx] prepare for replay failed", K(ret), K(ls_id), K(tablet_id), KP(mem_ptr), KP(mmi_ptr));
}
// dynamic_cast will check whether this is really a ObMemtable.
} else if (OB_ISNULL(data_mem_ptr = static_cast<ObMemtable *>(mem_ptr))) {

View File

@ -254,6 +254,7 @@ writing_throttling_maximum_duration
writing_throttling_trigger_percentage
zone
_advance_checkpoint_timeout
_allow_skip_replay_redo_after_detete_tablet
_audit_mode
_auto_broadcast_tablet_location_rate_limit
_auto_drop_recovering_auxiliary_tenant