[CP] fix standby replay 4719

This commit is contained in:
maosy
2024-02-08 20:32:20 +00:00
committed by ob-robot
parent 9005f0a5a9
commit b325b511ca
10 changed files with 279 additions and 147 deletions

View File

@ -1077,10 +1077,9 @@ int ObRecoveryLSService::do_standby_balance_()
int ObRecoveryLSService::do_ls_balance_task_()
{
int ret = OB_SUCCESS;
ObBalanceTaskHelper ls_balance_task;
ObArray<ObBalanceTaskHelper> ls_balance_tasks;
ObTenantInfoLoader *tenant_info_loader = MTL(rootserver::ObTenantInfoLoader*);
ObAllTenantInfo tenant_info;
bool has_next_task = true;
if (OB_UNLIKELY(!inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("not init", KR(ret), K(inited_));
@ -1090,97 +1089,156 @@ int ObRecoveryLSService::do_ls_balance_task_()
} else if (OB_ISNULL(tenant_info_loader)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("mtl pointer is null", KR(ret), KP(tenant_info_loader));
}
while (OB_SUCC(ret) && has_next_task) {
ret = ObBalanceTaskHelperTableOperator::pop_task(tenant_id_,
*proxy_, ls_balance_task);
} else if (OB_FAIL(tenant_info_loader->get_tenant_info(tenant_info))) {
LOG_WARN("get_tenant_info failed", K(ret));
} else if (OB_FAIL(ObBalanceTaskHelperTableOperator::load_tasks_order_by_scn(
tenant_id_, *proxy_, tenant_info.get_standby_scn(),
ls_balance_tasks))) {
if (OB_ENTRY_NOT_EXIST == ret) {
ret = OB_SUCCESS;
has_next_task = false;
} else if (OB_FAIL(ret)) {
LOG_WARN("failed to get balance task", KR(ret), K(tenant_id_));
} else if (has_set_stop()) {
ret = OB_IN_STOP_STATE;
LOG_WARN("thread is in stop state", KR(ret));
} else if (OB_FAIL(tenant_info_loader->get_tenant_info(tenant_info))) {
LOG_WARN("get_tenant_info failed", K(ret));
} else if (tenant_info.get_standby_scn() >= ls_balance_task.get_operation_scn()) {
const uint64_t exec_tenant_id = gen_meta_tenant_id(tenant_id_);
//transfer_scn maybe sync_scn
SCN transfer_scn;
START_TRANSACTION(proxy_, exec_tenant_id)
if (OB_FAIL(ret)) {
LOG_WARN("failed to start trans", KR(ret));
} else if (ls_balance_task.get_task_op().is_ls_alter()) {
if (OB_FAIL(do_ls_balance_alter_task_(ls_balance_task, trans))) {
LOG_WARN("failed to do ls alter task", KR(ret), K(ls_balance_task));
}
} else if (ls_balance_task.get_task_op().is_transfer_end()) {
transfer_scn = ls_balance_task.get_operation_scn();
} else if (ls_balance_task.get_task_op().is_transfer_begin()) {
//find transfer end, or tenant is in flashback
ObBalanceTaskHelper transfer_end_task;
ret = ObBalanceTaskHelperTableOperator::try_find_transfer_end(tenant_id_,
ls_balance_task.get_operation_scn(), ls_balance_task.get_src_ls(),
ls_balance_task.get_dest_ls(), trans, transfer_end_task);
if (OB_SUCC(ret)) {
//if has transfer end, can remove transfer begin
transfer_scn = ls_balance_task.get_operation_scn();
LOG_INFO("has transfer end task", KR(ret), K(ls_balance_task), K(transfer_end_task));
} else if (OB_ENTRY_NOT_EXIST != ret) {
LOG_WARN("failed to find transfer end task", KR(ret), K(tenant_id_), K(ls_balance_task));
} else if (tenant_info.is_prepare_flashback_for_switch_to_primary_status()
|| tenant_info.is_prepare_flashback_for_failover_to_primary_status()) {
//check tenant_info status and check wait readable_scn is equal to sync_scn
ret = OB_SUCCESS;
transfer_scn = tenant_info.get_sync_scn();
if (tenant_info.get_sync_scn() != tenant_info.get_standby_scn()) {
ret = OB_NEED_WAIT;
LOG_WARN("must wait repay to newest", KR(ret), K(tenant_id_), K(tenant_info),
K(ls_balance_task));
} else {
LOG_INFO("replay to newest", K(tenant_info), K(ls_balance_task));
}
} else {
ret = OB_NEED_RETRY;
LOG_WARN("can not find transfer end task, can not end transfer begin task", KR(ret), K(tenant_info), K(ls_balance_task));
}
}
if (OB_FAIL(ret)) {
} else if (ls_balance_task.get_task_op().is_transfer_begin()
|| ls_balance_task.get_task_op().is_transfer_end()) {
bool is_replay_finish = false;
if (!transfer_scn.is_valid()) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("transfer scn is invalid", KR(ret), K(transfer_scn), K(tenant_info), K(ls_balance_task));
} else if (OB_FAIL(ObLSServiceHelper::check_transfer_task_replay(
tenant_id_, ls_balance_task.get_src_ls(),
ls_balance_task.get_dest_ls(), transfer_scn, is_replay_finish))) {
LOG_WARN("failed to check transfer task replay", KR(ret), K(tenant_id_), K(ls_balance_task),
K(tenant_info), K(transfer_scn));
} else if (!is_replay_finish) {
ret = OB_NEED_RETRY;
LOG_WARN("can not remove ls balance task helper", KR(ret), K(ls_balance_task), K(transfer_scn));
}
}
if (FAILEDx(ObBalanceTaskHelperTableOperator::remove_task(tenant_id_,
ls_balance_task.get_operation_scn(), trans))) {
LOG_WARN("failed to remove task", KR(ret), K(tenant_id_), K(ls_balance_task));
} else {
LOG_INFO("task can be remove", KR(ret), K(ls_balance_task));
}
END_TRANSACTION(trans)
LOG_INFO("no need process balance task", K(tenant_info));
} else {
if (REACH_TENANT_TIME_INTERVAL(10 * 1000 * 1000)) { // 10s
LOG_INFO("can not remove ls balance task helper", K(ls_balance_task), K(tenant_info));
}
has_next_task = false;
LOG_WARN("failed to load task", KR(ret), K(tenant_id_), K(tenant_info));
}
}//end while
} else {
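//load_tasks_order_by_scn is expected to return only tasks whose operation scn has already been
//passed by the readable scn, i.e. tasks that can be processed now. Tasks are removed in order; this ordering must be guaranteed.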
for (int64_t i = 0; OB_SUCC(ret) && i < ls_balance_tasks.count() && !has_set_stop(); ++i) {
const ObBalanceTaskHelper &ls_balance_task = ls_balance_tasks.at(i);
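//Walk the tasks in order; the pass should not be abandoned just because one task cannot be handled yet.
//NOTE(review): an error code returned by try_do_ls_balance_task_ still ends this loop via OB_SUCC(ret).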
if (OB_FAIL(try_do_ls_balance_task_(ls_balance_task, tenant_info))) {
LOG_WARN("failed to ls balance task", KR(ret), K(ls_balance_task), K(tenant_info));
}
}
}
return ret;
}
/**
 * Handle one balance-task-helper record inside a single transaction opened on
 * the meta tenant of tenant_id_.
 *
 * Per task operation:
 *  - LS alter:       run do_ls_balance_alter_task_() and then remove the record.
 *  - transfer end:   remove the record once check_transfer_task_replay() reports
 *                    that replay has finished for the task's operation scn;
 *                    otherwise fail with OB_NEED_RETRY.
 *  - transfer begin: check_transfer_begin_can_remove_() decides. If it reports
 *                    that the record cannot be removed yet, the record is kept
 *                    and the function still returns OB_SUCCESS.
 *  - anything else:  OB_ERR_UNEXPECTED.
 *
 * @param ls_balance_task  record to handle; must be valid.
 * @param tenant_info      tenant info snapshot; must be valid.
 * @return OB_SUCCESS on success (whether or not the record was removed);
 *         OB_NOT_INIT, OB_INVALID_ARGUMENT, OB_ERR_UNEXPECTED, OB_NEED_RETRY,
 *         or the error code of a failed callee.
 */
int ObRecoveryLSService::try_do_ls_balance_task_(
    const share::ObBalanceTaskHelper &ls_balance_task,
    const share::ObAllTenantInfo &tenant_info)
{
  int ret = OB_SUCCESS;
  if (OB_UNLIKELY(!inited_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("not init", KR(ret), K(inited_));
  } else if (OB_UNLIKELY(!(ls_balance_task.is_valid() && tenant_info.is_valid()))) {
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", KR(ret), K(ls_balance_task), K(tenant_info));
  } else if (OB_ISNULL(proxy_)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("sql is null", KR(ret), KP(proxy_));
  } else {
    // Whether the record may be deleted at the end of this call.
    bool removable = false;
    // exec_tenant_id keeps its name: it is handed to START_TRANSACTION, which may log it by name.
    const uint64_t exec_tenant_id = gen_meta_tenant_id(tenant_id_);
    START_TRANSACTION(proxy_, exec_tenant_id)
    if (OB_FAIL(ret)) {
      LOG_WARN("failed to start trans", KR(ret));
    } else if (ls_balance_task.get_task_op().is_ls_alter()) {
      // An LS alter record is always removable once the alter itself succeeded.
      removable = true;
      ret = do_ls_balance_alter_task_(ls_balance_task, trans);
      if (OB_FAIL(ret)) {
        LOG_WARN("failed to do ls alter task", KR(ret), K(ls_balance_task));
      }
    } else if (ls_balance_task.get_task_op().is_transfer_end()) {
      // A transfer end record is removable as soon as replay has reached its operation scn.
      bool replay_done = true;
      removable = true;
      ret = ObLSServiceHelper::check_transfer_task_replay(
          tenant_id_, ls_balance_task.get_src_ls(),
          ls_balance_task.get_dest_ls(), ls_balance_task.get_operation_scn(),
          replay_done);
      if (OB_FAIL(ret)) {
        LOG_WARN("failed to check transfer task replay", KR(ret), K(tenant_id_),
            K(ls_balance_task), K(tenant_info));
      } else if (!replay_done) {
        ret = OB_NEED_RETRY;
        LOG_WARN("can not remove ls balance task helper", KR(ret), K(ls_balance_task));
      }
    } else if (ls_balance_task.get_task_op().is_transfer_begin()) {
      // For transfer begin the helper fills in the removable verdict.
      ret = check_transfer_begin_can_remove_(ls_balance_task, tenant_info, removable);
      if (OB_FAIL(ret)) {
        LOG_WARN("failed to check transfer begin can remove", KR(ret),
            K(ls_balance_task), K(tenant_info));
      }
    } else {
      ret = OB_ERR_UNEXPECTED;
      LOG_WARN("ls balance task op is unexpected", KR(ret), K(ls_balance_task));
    }

    if (OB_SUCC(ret)) {
      if (!removable) {
        // Not an error: the record simply stays in the table.
        LOG_INFO("balance task helper can not remove", K(ls_balance_task), K(tenant_info));
      } else if (OB_FAIL(ObBalanceTaskHelperTableOperator::remove_task(tenant_id_,
              ls_balance_task.get_operation_scn(), trans))) {
        LOG_WARN("failed to remove task", KR(ret), K(tenant_id_), K(ls_balance_task));
      } else {
        LOG_INFO("task can be remove", KR(ret), K(ls_balance_task));
      }
    }
    // Closes the transaction opened by START_TRANSACTION above.
    END_TRANSACTION(trans)
  }
  return ret;
}
/**
 * Decide whether a "transfer begin" balance-task-helper record may be removed.
 *
 * The record is removable when either
 *  - a matching "transfer end" record is found (try_find_transfer_end), or
 *  - the tenant is in a prepare-flashback status (switchover/failover to
 *    primary) and its sync scn equals its standby scn,
 * and, in both cases, check_transfer_task_replay() reports that replay has
 * finished for the chosen transfer scn.
 *
 * @param ls_balance_task  record to check; must be valid and a transfer begin.
 * @param tenant_info      tenant info snapshot; must be valid.
 * @param can_remove       [out] true only when the function returns OB_SUCCESS
 *                         and removal was positively confirmed; false on every
 *                         other path, including all error returns.
 * @return OB_SUCCESS when a verdict was reached (can_remove true or false);
 *         OB_NOT_INIT, OB_INVALID_ARGUMENT, OB_ERR_UNEXPECTED;
 *         OB_NEED_RETRY when the record qualifies but replay has not finished;
 *         or the error code of a failed callee.
 */
int ObRecoveryLSService::check_transfer_begin_can_remove_(
    const share::ObBalanceTaskHelper &ls_balance_task,
    const share::ObAllTenantInfo &tenant_info,
    bool &can_remove)
{
  int ret = OB_SUCCESS;
  // Fail-safe default: never report "removable" unless it is confirmed below.
  can_remove = false;
  if (OB_UNLIKELY(!inited_)) {
    ret = OB_NOT_INIT;
    LOG_WARN("not init", KR(ret), K(inited_));
  } else if (OB_UNLIKELY(!ls_balance_task.is_valid()
                         || !ls_balance_task.get_task_op().is_transfer_begin()
                         || !tenant_info.is_valid())) {
    // tenant_info is read below (sync scn / standby scn / status), so validate it too.
    ret = OB_INVALID_ARGUMENT;
    LOG_WARN("invalid argument", KR(ret), K(ls_balance_task), K(tenant_info));
  } else if (OB_ISNULL(proxy_)) {
    ret = OB_ERR_UNEXPECTED;
    LOG_WARN("sql can't null", KR(ret), KP(proxy_));
  } else {
    //find transfer end, or tenant is in flashback
    ObBalanceTaskHelper transfer_end_task;
    SCN transfer_scn;
    bool is_replay_finish = false;
    // Set once one of the two removal preconditions holds; replay is checked only then.
    bool need_check_replay = false;
    ret = ObBalanceTaskHelperTableOperator::try_find_transfer_end(tenant_id_,
        ls_balance_task.get_operation_scn(), ls_balance_task.get_src_ls(),
        ls_balance_task.get_dest_ls(), *proxy_, transfer_end_task);
    if (OB_SUCC(ret)) {
      //if has transfer end, can remove transfer begin
      need_check_replay = true;
      transfer_scn = ls_balance_task.get_operation_scn();
      LOG_INFO("has transfer end task, can remove transfer begin", KR(ret),
          K(ls_balance_task), K(transfer_end_task));
    } else if (OB_ENTRY_NOT_EXIST != ret) {
      LOG_WARN("failed to find transfer end task", KR(ret), K(tenant_id_), K(ls_balance_task));
    } else if (tenant_info.is_prepare_flashback_for_switch_to_primary_status()
               || tenant_info.is_prepare_flashback_for_failover_to_primary_status()) {
      //check tenant_info status and check wait readable_scn is equal to sync_scn
      ret = OB_SUCCESS;
      transfer_scn = tenant_info.get_sync_scn();
      if (tenant_info.get_sync_scn() != tenant_info.get_standby_scn()) {
        // Not caught up yet: keep the record, but this is not an error.
        LOG_WARN("There are transfer tasks in progress. Must wait for replay to newest",
            KR(ret), K(tenant_id_), K(tenant_info), K(ls_balance_task));
      } else {
        need_check_replay = true;
        LOG_INFO("replay to newest, can remove transfer begin before switchover/failover to primary",
            K(tenant_info), K(ls_balance_task));
      }
    } else {
      // No transfer end and no flashback in progress: keep the record and report
      // success so the caller can carry on. Log first so the original
      // OB_ENTRY_NOT_EXIST code is visible in the message.
      LOG_WARN("can not find transfer end task, can not end transfer begin task", KR(ret), K(tenant_info), K(ls_balance_task));
      ret = OB_SUCCESS;
    }
    if (OB_FAIL(ret) || !need_check_replay) {
    } else if (OB_FAIL(ObLSServiceHelper::check_transfer_task_replay(
            tenant_id_, ls_balance_task.get_src_ls(),
            ls_balance_task.get_dest_ls(), transfer_scn, is_replay_finish))) {
      LOG_WARN("failed to check transfer task replay", KR(ret), K(tenant_id_), K(ls_balance_task),
          K(tenant_info), K(transfer_scn));
    } else if (!is_replay_finish) {
      ret = OB_NEED_RETRY;
      LOG_WARN("can not remove ls balance task helper", KR(ret), K(ls_balance_task), K(transfer_scn));
    } else {
      can_remove = true;
    }
  }
  return ret;
}
int ObRecoveryLSService::do_ls_balance_alter_task_(const share::ObBalanceTaskHelper &ls_balance_task,
common::ObMySQLTransaction &trans)

View File

@ -136,6 +136,11 @@ private:
//thread1
int do_standby_balance_();
int do_ls_balance_task_();
// Handles one balance task helper record inside a transaction on the meta
// tenant: runs the LS-alter / transfer-end / transfer-begin handling and
// removes the record when it can be removed. A record that cannot be removed
// yet is kept and OB_SUCCESS is still returned.
int try_do_ls_balance_task_(const share::ObBalanceTaskHelper &ls_balance_task,
const share::ObAllTenantInfo &tenant_info);
// Checks whether a transfer-begin record may be removed: either a matching
// transfer-end record exists, or the tenant is in a prepare-flashback status
// with sync scn equal to standby scn; replay must also have finished.
// @param can_remove [out] verdict; only meaningful when OB_SUCCESS is returned.
// Returns OB_NEED_RETRY when the record qualifies but replay has not finished.
int check_transfer_begin_can_remove_(const share::ObBalanceTaskHelper &ls_balance_task,
const share::ObAllTenantInfo &tenant_info,
bool &can_remove);
int do_ls_balance_alter_task_(const share::ObBalanceTaskHelper &ls_balance_task,
common::ObMySQLTransaction &trans);
int reset_restore_proxy_(ObRestoreSourceServiceAttr &service_attr);

View File

@ -691,7 +691,7 @@ int ObTenantBalanceService::lock_and_check_balance_job_(
trans,
tenant_id,
OB_ALL_BALANCE_JOB_TID,
EXCLUSIVE))) {
EXCLUSIVE, false))) {
LOG_WARN("lock inner table failed", KR(ret), K(tenant_id));
} else if (OB_FAIL(ObBalanceJobTableOperator::get_balance_job(
tenant_id,

View File

@ -525,13 +525,15 @@ int ObTenantRoleTransitionService::wait_ls_balance_task_finish_()
LOG_INFO("data version is smaller than 4200, no need check", K(user_compat_version));
} else {
bool is_finish = false;
ObBalanceTaskHelper ls_balance_task;
ObArray<ObBalanceTaskHelper> ls_balance_tasks;
ObBalanceTaskArray balance_task_array;
share::ObAllTenantInfo cur_tenant_info;
int tmp_ret = OB_SUCCESS;
SCN max_scn;
max_scn.set_max();
while (!THIS_WORKER.is_timeout() && OB_SUCC(ret) && !is_finish) {
if (FALSE_IT(ret = ObBalanceTaskHelperTableOperator::pop_task(tenant_id_,
*sql_proxy_, ls_balance_task))) {
if (FALSE_IT(ret = ObBalanceTaskHelperTableOperator::load_tasks_order_by_scn(tenant_id_,
*sql_proxy_, max_scn, ls_balance_tasks))) {
} else if (OB_ENTRY_NOT_EXIST == ret) {
ret = OB_SUCCESS;
balance_task_array.reset();
@ -579,7 +581,7 @@ int ObTenantRoleTransitionService::wait_ls_balance_task_finish_()
LOG_WARN("failed to notify recovery ls service", KR(tmp_ret));
}
usleep(100L * 1000L);
LOG_INFO("has balance task not finish", K(ls_balance_task),
LOG_INFO("has balance task not finish", K(ls_balance_tasks),
K(balance_task_array), K(cur_tenant_info));
}
}

View File

@ -1391,7 +1391,7 @@ int ObTenantSnapshotUtil::check_tenant_is_in_transfer_procedure_(
trans,
tenant_id,
OB_ALL_BALANCE_JOB_TID,
EXCLUSIVE))) {
EXCLUSIVE, false/*is_from_sql*/))) {
LOG_WARN("lock inner table failed", KR(ret), K(tenant_id));
} else if (OB_FAIL(ObBalanceJobTableOperator::get_balance_job(
tenant_id,