Fix bug where killing transactions returns error 4023

This commit is contained in:
godyangfight
2023-07-19 09:48:26 +00:00
committed by ob-robot
parent cd6e2d650c
commit d35479be24
7 changed files with 63 additions and 21 deletions

File diff suppressed because one or more lines are too long

View File

@ -1797,6 +1797,7 @@ DEFINE_ERROR(OB_TRANSFER_SRC_LS_NOT_EXIST, -7115, -1, "HY000", "transfer src ls
DEFINE_ERROR(OB_TRANSFER_SRC_TABLET_NOT_EXIST, -7116, -1, "HY000", "transfer src tablet does not exist"); DEFINE_ERROR(OB_TRANSFER_SRC_TABLET_NOT_EXIST, -7116, -1, "HY000", "transfer src tablet does not exist");
DEFINE_ERROR(OB_LS_NEED_REBUILD, -7117, -1, "HY000", "ls need rebuild"); DEFINE_ERROR(OB_LS_NEED_REBUILD, -7117, -1, "HY000", "ls need rebuild");
DEFINE_ERROR(OB_OBSOLETE_CLOG_NEED_SKIP, -7118, -1, "HY000", "obsolete clog need skip"); DEFINE_ERROR(OB_OBSOLETE_CLOG_NEED_SKIP, -7118, -1, "HY000", "obsolete clog need skip");
DEFINE_ERROR(OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT, -7119, -1, "HY000", "transfer wait transactions end timeout");
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//error code for gis -7201 ---- -7300 //error code for gis -7201 ---- -7300
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

View File

@ -1406,6 +1406,7 @@ constexpr int OB_TRANSFER_SRC_LS_NOT_EXIST = -7115;
constexpr int OB_TRANSFER_SRC_TABLET_NOT_EXIST = -7116; constexpr int OB_TRANSFER_SRC_TABLET_NOT_EXIST = -7116;
constexpr int OB_LS_NEED_REBUILD = -7117; constexpr int OB_LS_NEED_REBUILD = -7117;
constexpr int OB_OBSOLETE_CLOG_NEED_SKIP = -7118; constexpr int OB_OBSOLETE_CLOG_NEED_SKIP = -7118;
constexpr int OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT = -7119;
constexpr int OB_ERR_INVALID_XML_DATATYPE = -7402; constexpr int OB_ERR_INVALID_XML_DATATYPE = -7402;
constexpr int OB_ERR_XML_MISSING_COMMA = -7403; constexpr int OB_ERR_XML_MISSING_COMMA = -7403;
constexpr int OB_ERR_INVALID_XPATH_EXPRESSION = -7404; constexpr int OB_ERR_INVALID_XPATH_EXPRESSION = -7404;
@ -3394,6 +3395,7 @@ constexpr int OB_ERR_INVALID_DATE_MSG_FMT_V2 = -4219;
#define OB_TRANSFER_SRC_TABLET_NOT_EXIST__USER_ERROR_MSG "transfer src tablet does not exist" #define OB_TRANSFER_SRC_TABLET_NOT_EXIST__USER_ERROR_MSG "transfer src tablet does not exist"
#define OB_LS_NEED_REBUILD__USER_ERROR_MSG "ls need rebuild" #define OB_LS_NEED_REBUILD__USER_ERROR_MSG "ls need rebuild"
#define OB_OBSOLETE_CLOG_NEED_SKIP__USER_ERROR_MSG "obsolete clog need skip" #define OB_OBSOLETE_CLOG_NEED_SKIP__USER_ERROR_MSG "obsolete clog need skip"
#define OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT__USER_ERROR_MSG "transfer wait transactions end timeout"
#define OB_ERR_GIS_DIFFERENT_SRIDS__USER_ERROR_MSG "Binary geometry function %s given two geometries of different srids: %u and %u, which should have been identical." #define OB_ERR_GIS_DIFFERENT_SRIDS__USER_ERROR_MSG "Binary geometry function %s given two geometries of different srids: %u and %u, which should have been identical."
#define OB_ERR_GIS_UNSUPPORTED_ARGUMENT__USER_ERROR_MSG "Calling geometry function %s with unsupported types of arguments." #define OB_ERR_GIS_UNSUPPORTED_ARGUMENT__USER_ERROR_MSG "Calling geometry function %s with unsupported types of arguments."
#define OB_ERR_GIS_UNKNOWN_ERROR__USER_ERROR_MSG "Unknown GIS error occurred in function %s." #define OB_ERR_GIS_UNKNOWN_ERROR__USER_ERROR_MSG "Unknown GIS error occurred in function %s."
@ -5493,6 +5495,7 @@ constexpr int OB_ERR_INVALID_DATE_MSG_FMT_V2 = -4219;
#define OB_TRANSFER_SRC_TABLET_NOT_EXIST__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7116, transfer src tablet does not exist" #define OB_TRANSFER_SRC_TABLET_NOT_EXIST__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7116, transfer src tablet does not exist"
#define OB_LS_NEED_REBUILD__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7117, ls need rebuild" #define OB_LS_NEED_REBUILD__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7117, ls need rebuild"
#define OB_OBSOLETE_CLOG_NEED_SKIP__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7118, obsolete clog need skip" #define OB_OBSOLETE_CLOG_NEED_SKIP__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7118, obsolete clog need skip"
#define OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT__ORA_USER_ERROR_MSG "ORA-00600: internal error code, arguments: -7119, transfer wait transactions end timeout"
#define OB_ERR_GIS_DIFFERENT_SRIDS__ORA_USER_ERROR_MSG "ORA-00600: Binary geometry function %s given two geometries of different srids: %u and %u, which should have been identical." #define OB_ERR_GIS_DIFFERENT_SRIDS__ORA_USER_ERROR_MSG "ORA-00600: Binary geometry function %s given two geometries of different srids: %u and %u, which should have been identical."
#define OB_ERR_GIS_UNSUPPORTED_ARGUMENT__ORA_USER_ERROR_MSG "ORA-00600: Calling geometry function %s with unsupported types of arguments." #define OB_ERR_GIS_UNSUPPORTED_ARGUMENT__ORA_USER_ERROR_MSG "ORA-00600: Calling geometry function %s with unsupported types of arguments."
#define OB_ERR_GIS_UNKNOWN_ERROR__ORA_USER_ERROR_MSG "ORA-00600: Unknown GIS error occurred in function %s." #define OB_ERR_GIS_UNKNOWN_ERROR__ORA_USER_ERROR_MSG "ORA-00600: Unknown GIS error occurred in function %s."
@ -5960,7 +5963,7 @@ constexpr int OB_ERR_INVALID_DATE_MSG_FMT_V2 = -4219;
#define OB_ERR_DATA_TOO_LONG_MSG_FMT_V2__ORA_USER_ERROR_MSG "ORA-12899: value too large for column %.*s (actual: %ld, maximum: %ld)" #define OB_ERR_DATA_TOO_LONG_MSG_FMT_V2__ORA_USER_ERROR_MSG "ORA-12899: value too large for column %.*s (actual: %ld, maximum: %ld)"
#define OB_ERR_INVALID_DATE_MSG_FMT_V2__ORA_USER_ERROR_MSG "ORA-01861: Incorrect datetime value for column '%.*s' at row %ld" #define OB_ERR_INVALID_DATE_MSG_FMT_V2__ORA_USER_ERROR_MSG "ORA-01861: Incorrect datetime value for column '%.*s' at row %ld"
extern int g_all_ob_errnos[2095]; extern int g_all_ob_errnos[2096];
const char *ob_error_name(const int oberr); const char *ob_error_name(const int oberr);
const char* ob_error_cause(const int oberr); const char* ob_error_cause(const int oberr);

View File

@ -1486,6 +1486,10 @@ DEF_TIME(_balance_kill_transaction_threshold, OB_TENANT_PARAMETER, "100ms", "[1m
"the time given to the transaction to execute when do balance" "the time given to the transaction to execute when do balance"
"before it will be killed. Range: [1ms, 60s]", "before it will be killed. Range: [1ms, 60s]",
ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
// Tenant-level time parameter: how long to keep waiting, after transactions
// have been killed, for them to actually end. Default 100ms, valid range
// [10ms, 60s]; declared with EditLevel::DYNAMIC_EFFECTIVE.
// The description is built from adjacent string literals, so the first piece
// must end with a space to keep "end." and "Range:" from running together.
DEF_TIME(_balance_wait_killing_transaction_end_threshold, OB_TENANT_PARAMETER, "100ms", "[10ms, 60s]",
         "the threshold for waiting time after killing transactions until they end. "
         "Range: [10ms, 60s]",
         ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_BOOL(_enable_px_fast_reclaim, OB_CLUSTER_PARAMETER, "True", DEF_BOOL(_enable_px_fast_reclaim, OB_CLUSTER_PARAMETER, "True",
"Enable the fast reclaim function through PX tasks deteting for survival by detect manager. The default value is True.", "Enable the fast reclaim function through PX tasks deteting for survival by detect manager. The default value is True.",
ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE)); ObParameterAttr(Section::OBSERVER, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));

View File

@ -418,7 +418,6 @@ int ObTransferHandler::do_with_start_status_(const share::ObTransferTaskInfo &ta
ObTimeoutCtx timeout_ctx; ObTimeoutCtx timeout_ctx;
ObMySQLTransaction trans; ObMySQLTransaction trans;
bool enable_kill_trx = false; bool enable_kill_trx = false;
int64_t kill_trx_threshold = 0;
if (!is_inited_) { if (!is_inited_) {
ret = OB_NOT_INIT; ret = OB_NOT_INIT;
@ -450,14 +449,13 @@ int ObTransferHandler::do_with_start_status_(const share::ObTransferTaskInfo &ta
omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID())); omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID()));
if (tenant_config.is_valid()) { if (tenant_config.is_valid()) {
enable_kill_trx = tenant_config->_enable_balance_kill_transaction; enable_kill_trx = tenant_config->_enable_balance_kill_transaction;
kill_trx_threshold = tenant_config->_balance_kill_transaction_threshold;
} }
if (OB_FAIL(ret)) { if (OB_FAIL(ret)) {
} else if (OB_FAIL(lock_src_and_dest_ls_member_list_(task_info, task_info.src_ls_id_, task_info.dest_ls_id_))) { } else if (OB_FAIL(lock_src_and_dest_ls_member_list_(task_info, task_info.src_ls_id_, task_info.dest_ls_id_))) {
LOG_WARN("failed to lock src and dest ls member list", K(ret), K(task_info)); LOG_WARN("failed to lock src and dest ls member list", K(ret), K(task_info));
} else if (!enable_kill_trx && OB_FAIL(check_src_ls_has_active_trans_(task_info.src_ls_id_))) { } else if (!enable_kill_trx && OB_FAIL(check_src_ls_has_active_trans_(task_info.src_ls_id_))) {
LOG_WARN("failed to check src ls active trans", K(ret), K(task_info)); LOG_WARN("failed to check src ls active trans", K(ret), K(task_info));
} else if (OB_FAIL(block_and_kill_tx_(task_info, enable_kill_trx, kill_trx_threshold, timeout_ctx))) { } else if (OB_FAIL(block_and_kill_tx_(task_info, enable_kill_trx, timeout_ctx))) {
LOG_WARN("failed to block and kill tx", K(ret), K(task_info)); LOG_WARN("failed to block and kill tx", K(ret), K(task_info));
} else if (OB_FAIL(check_start_status_transfer_tablets_(task_info))) { } else if (OB_FAIL(check_start_status_transfer_tablets_(task_info))) {
LOG_WARN("failed to check start status transfer tablets", K(ret), K(task_info)); LOG_WARN("failed to check start status transfer tablets", K(ret), K(task_info));
@ -905,7 +903,11 @@ int ObTransferHandler::start_trans_(
omt::ObTenantConfigGuard tenant_config(TENANT_CONF(tenant_id)); omt::ObTenantConfigGuard tenant_config(TENANT_CONF(tenant_id));
int64_t stmt_timeout = 10_s; int64_t stmt_timeout = 10_s;
if (tenant_config.is_valid()) { if (tenant_config.is_valid()) {
stmt_timeout = tenant_config->_transfer_start_trans_timeout + tenant_config->_balance_kill_transaction_threshold; stmt_timeout = tenant_config->_transfer_start_trans_timeout;
if (tenant_config->_enable_balance_kill_transaction) {
stmt_timeout += tenant_config->_balance_kill_transaction_threshold;
stmt_timeout += tenant_config->_balance_wait_killing_transaction_end_threshold;
}
} }
if (!is_inited_) { if (!is_inited_) {
@ -1746,22 +1748,34 @@ int ObTransferHandler::do_worker_transfer_()
int ObTransferHandler::block_and_kill_tx_( int ObTransferHandler::block_and_kill_tx_(
const share::ObTransferTaskInfo &task_info, const share::ObTransferTaskInfo &task_info,
const bool enable_kill_trx, const bool enable_kill_trx,
const int64_t kill_trx_threshold,
ObTimeoutCtx &timeout_ctx) ObTimeoutCtx &timeout_ctx)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
const uint64_t tenant_id = task_info.tenant_id_; const uint64_t tenant_id = task_info.tenant_id_;
const share::ObLSID &src_ls_id = task_info.src_ls_id_; const share::ObLSID &src_ls_id = task_info.src_ls_id_;
const int64_t start_ts = ObTimeUtil::current_time(); const int64_t start_ts = ObTimeUtil::current_time();
int64_t before_kill_trx_threshold = 0;
int64_t after_kill_trx_threshold = 0;
omt::ObTenantConfigGuard tenant_config(TENANT_CONF(MTL_ID()));
int64_t active_trans_count = 0;
if (tenant_config.is_valid()) {
before_kill_trx_threshold = tenant_config->_balance_kill_transaction_threshold;
after_kill_trx_threshold = tenant_config->_balance_wait_killing_transaction_end_threshold;
}
if (OB_FAIL(block_tx_(tenant_id, src_ls_id))) { if (OB_FAIL(block_tx_(tenant_id, src_ls_id))) {
LOG_WARN("failed to block tx", K(ret), K(task_info)); LOG_WARN("failed to block tx", K(ret), K(task_info));
} else if (!enable_kill_trx) { } else if (!enable_kill_trx) {
LOG_INFO("transfer no need kill tx", K(task_info)); if (OB_FAIL(get_ls_active_trans_count_(src_ls_id, active_trans_count))) {
} else if (OB_FAIL(check_for_kill_(tenant_id, src_ls_id, kill_trx_threshold, false/*is_after_kill*/, timeout_ctx))) { LOG_WARN("failed to get src ls has active trans", K(ret));
LOG_WARN("failed to check before kill", K(ret)); } else if (0 != active_trans_count) {
} else if (OB_FAIL(kill_tx_(tenant_id, src_ls_id))) { ret = OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT;
LOG_WARN("failed to kill tx", K(ret)); LOG_WARN("transfer src ls still has active transactions, cannot do transfer", K(ret), K(src_ls_id),
} else if (OB_FAIL(check_for_kill_(tenant_id, src_ls_id, kill_trx_threshold, true/*is_after_kill*/, timeout_ctx))) { K(active_trans_count));
}
} else if (OB_FAIL(check_and_kill_tx_(tenant_id, src_ls_id, before_kill_trx_threshold, false/*with_trans_kill*/, timeout_ctx))) {
LOG_WARN("failed to check after kill", K(ret));
} else if (OB_FAIL(check_and_kill_tx_(tenant_id, src_ls_id, after_kill_trx_threshold, true/*with_trans_kill*/, timeout_ctx))) {
LOG_WARN("failed to check after kill", K(ret)); LOG_WARN("failed to check after kill", K(ret));
} else { } else {
LOG_INFO("[TRANSFER] success to block and kill tx", "cost", ObTimeUtil::current_time() - start_ts); LOG_INFO("[TRANSFER] success to block and kill tx", "cost", ObTimeUtil::current_time() - start_ts);
@ -1773,11 +1787,11 @@ int ObTransferHandler::block_and_kill_tx_(
return ret; return ret;
} }
int ObTransferHandler::check_for_kill_( int ObTransferHandler::check_and_kill_tx_(
const uint64_t tenant_id, const uint64_t tenant_id,
const share::ObLSID &ls_id, const share::ObLSID &ls_id,
const int64_t timeout, const int64_t timeout,
const bool is_after_kill, const bool with_trans_kill,
ObTimeoutCtx &timeout_ctx) ObTimeoutCtx &timeout_ctx)
{ {
int ret = OB_SUCCESS; int ret = OB_SUCCESS;
@ -1790,9 +1804,9 @@ int ObTransferHandler::check_for_kill_(
ret = OB_TIMEOUT; ret = OB_TIMEOUT;
LOG_WARN("trans ctx already timeout", K(ret)); LOG_WARN("trans ctx already timeout", K(ret));
} else if (cur_ts - start_ts > timeout) { } else if (cur_ts - start_ts > timeout) {
if (is_after_kill) { if (with_trans_kill) {
ret = OB_TIMEOUT; ret = OB_TRANSFER_WAIT_TRANSACTION_END_TIMEOUT;
LOG_WARN("check active trans after kill timeout", K(cur_ts), K(start_ts)); LOG_WARN("wait active trans finish timeout", K(ret), K(cur_ts), K(start_ts));
} else { } else {
break; break;
} }
@ -1806,6 +1820,13 @@ int ObTransferHandler::check_for_kill_(
LOG_WARN("failed to get src ls has active trans", K(ret)); LOG_WARN("failed to get src ls has active trans", K(ret));
} else if (0 != active_trans_count) { } else if (0 != active_trans_count) {
LOG_INFO("still has active trans", K(tenant_id), K(ls_id), K(active_trans_count)); LOG_INFO("still has active trans", K(tenant_id), K(ls_id), K(active_trans_count));
if (with_trans_kill && OB_FAIL(kill_tx_(tenant_id, ls_id))) {
if (OB_EAGAIN == ret) {
ret = OB_SUCCESS;
} else {
LOG_WARN("failed to kill tx", K(ret), K(tenant_id), K(ls_id));
}
}
} else { } else {
break; break;
} }

View File

@ -191,16 +191,15 @@ private:
int block_and_kill_tx_( int block_and_kill_tx_(
const share::ObTransferTaskInfo &task_info, const share::ObTransferTaskInfo &task_info,
const bool enable_kill_trx, const bool enable_kill_trx,
const int64_t kill_trx_threshold,
ObTimeoutCtx &timeout_ctx); ObTimeoutCtx &timeout_ctx);
int block_tx_( int block_tx_(
const uint64_t tenant_id, const uint64_t tenant_id,
const share::ObLSID &ls_id); const share::ObLSID &ls_id);
int check_for_kill_( int check_and_kill_tx_(
const uint64_t tenant_id, const uint64_t tenant_id,
const share::ObLSID &ls_id, const share::ObLSID &ls_id,
const int64_t timeout, const int64_t timeout,
const bool is_after_kill, const bool with_trans_kill,
ObTimeoutCtx &timeout_ctx); ObTimeoutCtx &timeout_ctx);
int kill_tx_( int kill_tx_(
const uint64_t tenant_id, const uint64_t tenant_id,

View File

@ -243,6 +243,7 @@ _backup_idle_time
_backup_task_keep_alive_interval _backup_task_keep_alive_interval
_backup_task_keep_alive_timeout _backup_task_keep_alive_timeout
_balance_kill_transaction_threshold _balance_kill_transaction_threshold
_balance_wait_killing_transaction_end_threshold
_bloom_filter_enabled _bloom_filter_enabled
_bloom_filter_ratio _bloom_filter_ratio
_cache_wash_interval _cache_wash_interval