Fix bug where killing transactions returns error 4023

This commit is contained in:
godyangfight
2023-07-19 09:48:26 +00:00
committed by ob-robot
parent cd6e2d650c
commit d35479be24
7 changed files with 63 additions and 21 deletions

File diff suppressed because one or more lines are too long

View File

@ -1797,6 +1797,7 @@ DEFINE_ERROR(OB_TRANSFER_SRC_LS_NOT_EXIST, -7115, -1, "HY000", "transfer src ls
DEFINE_ERROR(OB_TRANSFER_SRC_TABLET_NOT_EXIST, -7116, -1, "HY000", "transfer src tablet does not exist");
DEFINE_ERROR(OB_LS_NEED_REBUILD, -7117, -1, "HY000", "ls need rebuild");
DEFINE_ERROR(OB_OBSOLETE_CLOG_NEED_SKIP, -7118, -1, "HY000", "obsolete clog need skip");
// Reported by the transfer path when the wait for active transactions to end times out.
DEFINE_ERROR(OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT, -7119, -1, "HY000", "transfer wait transactions end timeout");
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//error code for gis -7201 ---- -7300
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1406,6 +1406,7 @@ constexpr int OB_TRANSFER_SRC_LS_NOT_EXIST = -7115;
constexpr int OB_TRANSFER_SRC_TABLET_NOT_EXIST = -7116;
constexpr int OB_LS_NEED_REBUILD = -7117;
constexpr int OB_OBSOLETE_CLOG_NEED_SKIP = -7118;
constexpr int OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT = -7119;
constexpr int OB_ERR_INVALID_XML_DATATYPE = -7402;
constexpr int OB_ERR_XML_MISSING_COMMA = -7403;
constexpr int OB_ERR_INVALID_XPATH_EXPRESSION = -7404;
@ -3394,6 +3395,7 @@ constexpr int OB_ERR_INVALID_DATE_MSG_FMT_V2 = -4219;
#define OB_TRANSFER_SRC_TABLET_NOT_EXIST__USER_ERROR_MSG "transfer src tablet does not exist"
#define OB_LS_NEED_REBUILD__USER_ERROR_MSG "ls need rebuild"
#define OB_OBSOLETE_CLOG_NEED_SKIP__USER_ERROR_MSG "obsolete clog need skip"
#define OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT__USER_ERROR_MSG "transfer wait transactions end timeout"
#define OB_ERR_GIS_DIFFERENT_SRIDS__USER_ERROR_MSG "Binary geometry function %s given two geometries of different srids: %u and %u, which should have been identical."
#define OB_ERR_GIS_UNSUPPORTED_ARGUMENT__USER_ERROR_MSG "Calling geometry function %s with unsupported types of arguments."
#define OB_ERR_GIS_UNKNOWN_ERROR__USER_ERROR_MSG "Unknown GIS error occurred in function %s."
@ -5493,6 +5495,7 @@ constexpr int OB_ERR_INVALID_DATE_MSG_FMT_V2 = -4219;
#define OB_TRANSFER_SRC_TABLET_NOT_EXIST__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7116, transfer src tablet does not exist"
#define OB_LS_NEED_REBUILD__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7117, ls need rebuild"
#define OB_OBSOLETE_CLOG_NEED_SKIP__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7118, obsolete clog need skip"
#define OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7119, transfer wait transactions end timeout"
#define OB_ERR_GIS_DIFFERENT_SRIDS__ORA_USER_ERROR_MSG "ORA-00600: Binary geometry function %s given two geometries of different srids: %u and %u, which should have been identical."
#define OB_ERR_GIS_UNSUPPORTED_ARGUMENT__ORA_USER_ERROR_MSG "ORA-00600: Calling geometry function %s with unsupported types of arguments."
#define OB_ERR_GIS_UNKNOWN_ERROR__ORA_USER_ERROR_MSG "ORA-00600: Unknown GIS error occurred in function %s."
@ -5960,7 +5963,7 @@ constexpr int OB_ERR_INVALID_DATE_MSG_FMT_V2 = -4219;
#define OB_ERR_DATA_TOO_LONG_MSG_FMT_V2__ORA_USER_ERROR_MSG "ORA-12899: value too large for column %.*s (actual: %ld, maximum: %ld)"
#define OB_ERR_INVALID_DATE_MSG_FMT_V2__ORA_USER_ERROR_MSG "ORA-01861: Incorrect datetime value for column '%.*s' at row %ld"
extern int g_all_ob_errnos[2095];
extern int g_all_ob_errnos[2096];
const char *ob_error_name(const int oberr);
const char* ob_error_cause(const int oberr);

View File

@ -1486,6 +1486,10 @@ DEF_TIME(_balance_kill_transaction_threshold, OB_TENANT_PARAMETER, "100ms", "[1m
"the time given to the transaction to execute when do balance"
"before it will be killed. Range: [1ms, 60s]",
ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
// Tenant-level, dynamically effective wait threshold (default 100ms, allowed range
// [10ms, 60s]): how long to wait, after transactions have been killed, for them to end.
// The description is built from adjacent string literals, so the first literal must end
// with a space; otherwise the text reads "...until they end.Range: ...".
DEF_TIME(_balance_wait_killing_transaction_end_threshold, OB_TENANT_PARAMETER, "100ms", "[10ms, 60s]",
         "the threshold for waiting time after killing transactions until they end. "
         "Range: [10ms, 60s]",
         ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
// Cluster-level, dynamically effective boolean switch (default True) for the PX fast
// reclaim function. Only the spelling of "detecting" in the description is corrected.
DEF_BOOL(_enable_px_fast_reclaim, OB_CLUSTER_PARAMETER, "True",
         "Enable the fast reclaim function through PX tasks detecting for survival by detect manager. The default value is True.",
         ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));

View File

@ -418,7 +418,6 @@ int ObTransferHandler::do_with_start_status_(const share::ObTransferTaskInfo &ta
ObTimeoutCtx timeout_ctx;
ObMySQLTransaction trans;
bool enable_kill_trx = false;
int64_t kill_trx_threshold = 0;
if (!is_inited_) {
ret = OB_NOT_INIT;
@ -450,14 +449,13 @@ int ObTransferHandler::do_with_start_status_(const share::ObTransferTaskInfo &ta
omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID()));
if (tenant_config.is_valid()) {
enable_kill_trx = tenant_config->_enable_balance_kill_transaction;
kill_trx_threshold = tenant_config->_balance_kill_transaction_threshold;
}
if (OB_FAIL(ret)) {
} else if (OB_FAIL(lock_src_and_dest_ls_member_list_(task_info, task_info.src_ls_id_, task_info.dest_ls_id_))) {
LOG_WARN("failed to lock src and dest ls member list", K(ret), K(task_info));
} else if (!enable_kill_trx && OB_FAIL(check_src_ls_has_active_trans_(task_info.src_ls_id_))) {
LOG_WARN("failed to check src ls active trans", K(ret), K(task_info));
} else if (OB_FAIL(block_and_kill_tx_(task_info, enable_kill_trx, kill_trx_threshold, timeout_ctx))) {
} else if (OB_FAIL(block_and_kill_tx_(task_info, enable_kill_trx, timeout_ctx))) {
LOG_WARN("failed to block and kill tx", K(ret), K(task_info));
} else if (OB_FAIL(check_start_status_transfer_tablets_(task_info))) {
LOG_WARN("failed to check start status transfer tablets", K(ret), K(task_info));
@ -905,7 +903,11 @@ int ObTransferHandler::start_trans_(
omt::ObTenantConfigGuard tenant_config(TENANT_CONF(tenant_id));
int64_t stmt_timeout = 10_s;
if (tenant_config.is_valid()) {
stmt_timeout = tenant_config->_transfer_start_trans_timeout + tenant_config->_balance_kill_transaction_threshold;
stmt_timeout = tenant_config->_transfer_start_trans_timeout;
if (tenant_config->_enable_balance_kill_transaction) {
stmt_timeout += tenant_config->_balance_kill_transaction_threshold;
stmt_timeout += tenant_config->_balance_wait_killing_transaction_end_threshold;
}
}
if (!is_inited_) {
@ -1746,22 +1748,34 @@ int ObTransferHandler::do_worker_transfer_()
int ObTransferHandler::block_and_kill_tx_(
const share::ObTransferTaskInfo &task_info,
const bool enable_kill_trx,
const int64_t kill_trx_threshold,
ObTimeoutCtx &timeout_ctx)
{
int ret = OB_SUCCESS;
const uint64_t tenant_id = task_info.tenant_id_;
const share::ObLSID &src_ls_id = task_info.src_ls_id_;
const int64_t start_ts = ObTimeUtil::current_time();
int64_t before_kill_trx_threshold = 0;
int64_t after_kill_trx_threshold = 0;
omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID()));
int64_t active_trans_count = 0;
if (tenant_config.is_valid()) {
before_kill_trx_threshold = tenant_config->_balance_kill_transaction_threshold;
after_kill_trx_threshold = tenant_config->_balance_wait_killing_transaction_end_threshold;
}
if (OB_FAIL(block_tx_(tenant_id, src_ls_id))) {
LOG_WARN("failed to block tx", K(ret), K(task_info));
} else if (!enable_kill_trx) {
LOG_INFO("transfer no need kill tx", K(task_info));
} else if (OB_FAIL(check_for_kill_(tenant_id, src_ls_id, kill_trx_threshold, false/*is_after_kill*/, timeout_ctx))) {
LOG_WARN("failed to check before kill", K(ret));
} else if (OB_FAIL(kill_tx_(tenant_id, src_ls_id))) {
LOG_WARN("failed to kill tx", K(ret));
} else if (OB_FAIL(check_for_kill_(tenant_id, src_ls_id, kill_trx_threshold, true/*is_after_kill*/, timeout_ctx))) {
if (OB_FAIL(get_ls_active_trans_count_(src_ls_id, active_trans_count))) {
LOG_WARN("failed to get src ls has active trans", K(ret));
} else if (0 != active_trans_count) {
ret = OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT;
LOG_WARN("transfer src ls still has active transactions, cannot do transfer", K(ret), K(src_ls_id),
K(active_trans_count));
}
} else if (OB_FAIL(check_and_kill_tx_(tenant_id, src_ls_id, before_kill_trx_threshold, false/*with_trans_kill*/, timeout_ctx))) {
LOG_WARN("failed to check after kill", K(ret));
} else if (OB_FAIL(check_and_kill_tx_(tenant_id, src_ls_id, after_kill_trx_threshold, true/*with_trans_kill*/, timeout_ctx))) {
LOG_WARN("failed to check after kill", K(ret));
} else {
LOG_INFO("[TRANSFER] success to block and kill tx", "cost", ObTimeUtil::current_time() - start_ts);
@ -1773,11 +1787,11 @@ int ObTransferHandler::block_and_kill_tx_(
return ret;
}
int ObTransferHandler::check_for_kill_(
int ObTransferHandler::check_and_kill_tx_(
const uint64_t tenant_id,
const share::ObLSID &ls_id,
const int64_t timeout,
const bool is_after_kill,
const bool with_trans_kill,
ObTimeoutCtx &timeout_ctx)
{
int ret = OB_SUCCESS;
@ -1790,9 +1804,9 @@ int ObTransferHandler::check_for_kill_(
ret = OB_TIMEOUT;
LOG_WARN("trans ctx already timeout", K(ret));
} else if (cur_ts - start_ts > timeout) {
if (is_after_kill) {
ret = OB_TIMEOUT;
LOG_WARN("check active trans after kill timeout", K(cur_ts), K(start_ts));
if (with_trans_kill) {
ret = OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT;
LOG_WARN("wait active trans finish timeout", K(ret), K(cur_ts), K(start_ts));
} else {
break;
}
@ -1806,6 +1820,13 @@ int ObTransferHandler::check_for_kill_(
LOG_WARN("failed to get src ls has active trans", K(ret));
} else if (0 != active_trans_count) {
LOG_INFO("still has active trans", K(tenant_id), K(ls_id), K(active_trans_count));
if (with_trans_kill && OB_FAIL(kill_tx_(tenant_id, ls_id))) {
if (OB_EAGAIN == ret) {
ret = OB_SUCCESS;
} else {
LOG_WARN("failed to kill tx", K(ret), K(tenant_id), K(ls_id));
}
}
} else {
break;
}

View File

@ -191,16 +191,15 @@ private:
int block_and_kill_tx_(
const share::ObTransferTaskInfo &task_info,
const bool enable_kill_trx,
const int64_t kill_trx_threshold,
ObTimeoutCtx &timeout_ctx);
// Blocks transactions on log stream `ls_id` of tenant `tenant_id`.
// NOTE(review): the exact blocking semantics live in the implementation, which is not
// visible here -- confirm. Returns an int error code that callers check with OB_FAIL.
int block_tx_(
const uint64_t tenant_id,
const share::ObLSID &ls_id);
int check_for_kill_(
int check_and_kill_tx_(
const uint64_t tenant_id,
const share::ObLSID &ls_id,
const int64_t timeout,
const bool is_after_kill,
const bool with_trans_kill,
ObTimeoutCtx &timeout_ctx);
int kill_tx_(
const uint64_t tenant_id,

View File

@ -243,6 +243,7 @@ _backup_idle_time
_backup_task_keep_alive_interval
_backup_task_keep_alive_timeout
_balance_kill_transaction_threshold
_balance_wait_killing_transaction_end_threshold
_bloom_filter_enabled
_bloom_filter_ratio
_cache_wash_interval