[CP] [UPGRADE] Calculate default timeout according to tenant count

This commit is contained in:
tino247 2023-12-28 05:19:19 +00:00 committed by ob-robot
parent 901bf26411
commit 9093d820a0
6 changed files with 406 additions and 104 deletions

View File

@ -127,16 +127,45 @@ def set_parameter(cur, parameter, value, timeout = 0):
cur.execute(sql)
wait_parameter_sync(cur, False, parameter, value, timeout)
def set_session_timeout(cur, seconds):
sql = "set @@session.ob_query_timeout = {0}".format(seconds * 1000 * 1000)
logging.info(sql)
cur.execute(sql)
def set_default_timeout_by_tenant(cur, timeout, timeout_per_tenant, min_timeout):
if timeout > 0:
logging.info("use timeout from opt, timeout(s):{0}".format(timeout))
else:
query_cur = QueryCursor(cur)
tenant_id_list = fetch_tenant_ids(query_cur)
cal_timeout = len(tenant_id_list) * timeout_per_tenant
timeout = (cal_timeout if cal_timeout > min_timeout else min_timeout)
logging.info("use default timeout caculated by tenants, "
"timeout(s):{0}, tenant_count:{1}, "
"timeout_per_tenant(s):{2}, min_timeout(s):{3}"
.format(timeout, len(tenant_id_list), timeout_per_tenant, min_timeout))
return timeout
def set_tenant_parameter(cur, parameter, value, timeout = 0):
tenants_list = []
if get_min_cluster_version(cur) < get_version("4.2.1.0"):
tenants_list = ['all']
else:
tenants_list = ['sys', 'all_user', 'all_meta']
query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
set_session_timeout(cur, query_timeout)
for tenants in tenants_list:
sql = """alter system set {0} = '{1}' tenant = '{2}'""".format(parameter, value, tenants)
logging.info(sql)
cur.execute(sql)
set_session_timeout(cur, 10)
wait_parameter_sync(cur, True, parameter, value, timeout)
def get_ori_enable_ddl(cur, timeout):
@ -225,7 +254,20 @@ def wait_parameter_sync(cur, is_tenant_config, key, value, timeout):
table_name = "GV$OB_PARAMETERS" if not is_tenant_config else "__all_virtual_tenant_parameter_info"
sql = """select count(*) as cnt from oceanbase.{0}
where name = '{1}' and value != '{2}'""".format(table_name, key, value)
times = (timeout if timeout > 0 else 60) / 5
wait_timeout = 0
query_timeout = 0
if not is_tenant_config or timeout > 0:
wait_timeout = (timeout if timeout > 0 else 60)
query_timeout = wait_timeout
else:
# is_tenant_config & timeout not set
wait_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
query_timeout = set_default_timeout_by_tenant(cur, timeout, 2, 60)
set_session_timeout(cur, query_timeout)
times = wait_timeout / 5
while times >= 0:
logging.info(sql)
cur.execute(sql)
@ -245,6 +287,8 @@ def wait_parameter_sync(cur, is_tenant_config, key, value, timeout):
raise e
time.sleep(5)
set_session_timeout(cur, 10)
def do_begin_upgrade(cur, timeout):
if not check_parameter(cur, False, "enable_upgrade_mode", "True"):
@ -316,24 +360,38 @@ def do_suspend_merge(cur, timeout):
tenants_list = ['all']
else:
tenants_list = ['sys', 'all_user', 'all_meta']
query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
set_session_timeout(cur, query_timeout)
for tenants in tenants_list:
action_sql = "alter system suspend merge tenant = {0}".format(tenants)
rollback_sql = "alter system resume merge tenant = {0}".format(tenants)
logging.info(action_sql)
cur.execute(action_sql)
set_session_timeout(cur, 10)
def do_resume_merge(cur, timeout):
tenants_list = []
if get_min_cluster_version(cur) < get_version("4.2.1.0"):
tenants_list = ['all']
else:
tenants_list = ['sys', 'all_user', 'all_meta']
query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
set_session_timeout(cur, query_timeout)
for tenants in tenants_list:
action_sql = "alter system resume merge tenant = {0}".format(tenants)
rollback_sql = "alter system suspend merge tenant = {0}".format(tenants)
logging.info(action_sql)
cur.execute(action_sql)
set_session_timeout(cur, 10)
class Cursor:
__cursor = None
def __init__(self, cursor):

View File

@ -21,17 +21,7 @@ def do_upgrade(conn, cur, timeout, user, pwd):
else:
run_upgrade_job(conn, cur, "UPGRADE_VIRTUAL_SCHEMA", timeout)
# just to make __all_virtual_upgrade_inspection avaliable
timeout_ts = (timeout if timeout > 0 else 600) * 1000 * 1000
sql = "set @@session.ob_query_timeout = {0}".format(timeout_ts)
logging.info(sql)
cur.execute(sql)
sql = "alter system run job 'root_inspection'"
logging.info(sql)
cur.execute(sql)
sql = "set @@session.ob_query_timeout = 10000000"
logging.info(sql)
cur.execute(sql)
run_root_inspection(cur, timeout)
####========******####======== actions begin ========####******========####
upgrade_syslog_level(conn, cur)
return
@ -47,7 +37,6 @@ def upgrade_syslog_level(conn, cur):
info_cnt = result[0][0]
if info_cnt > 0:
actions.set_parameter(cur, "syslog_level", "WDIAG")
except Exception, e:
logging.warn("upgrade syslog level failed!")
raise e
@ -62,6 +51,18 @@ def query(cur, sql):
def get_tenant_ids(cur):
return [_[0] for _ in query(cur, 'select tenant_id from oceanbase.__all_tenant')]
def run_root_inspection(cur, timeout):
query_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 600)
actions.set_session_timeout(cur, query_timeout)
sql = "alter system run job 'root_inspection'"
logging.info(sql)
cur.execute(sql)
actions.set_session_timeout(cur, 10)
def upgrade_across_version(cur):
current_data_version = actions.get_current_data_version()
int_current_data_version = actions.get_version(current_data_version)
@ -167,7 +168,9 @@ def check_can_run_upgrade_job(cur, job_name):
def check_upgrade_job_result(cur, job_name, timeout, max_used_job_id):
try:
times = (timeout if timeout > 0 else 3600) / 10
wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 100, 3600)
times = wait_timeout / 10
while (times >= 0):
sql = """select job_status, rs_svr_ip, rs_svr_port, gmt_create from oceanbase.__all_rootservice_job
where job_type = '{0}' and job_id > {1} order by job_id desc limit 1

View File

@ -73,7 +73,7 @@ sys.argv[0] + """ [OPTIONS]""" +\
' that all modules should be run. They are splitted by ",".\n' +\
' For example: -m all, or --module=ddl,normal_dml,special_action\n' +\
'-l, --log-file=name Log file path. If log file path is not given it\'s ' + os.path.splitext(sys.argv[0])[0] + '.log\n' +\
'-t, --timeout=name check timeout, default: 600(s).\n' + \
'-t, --timeout=name check timeout.\n' + \
'-z, --zone=name If zone is not specified, check all servers status in cluster. \n' +\
' Otherwise, only check servers status in specified zone. \n' + \
'\n\n' +\
@ -135,8 +135,7 @@ Option('p', 'password', True, False, ''),\
Option('m', 'module', True, False, 'all'),\
# 日志文件路径,不同脚本的main函数中中会改成不同的默认值
Option('l', 'log-file', True, False),\
# 一些检查的超时时间,默认是600s
Option('t', 'timeout', True, False, '600'),\
Option('t', 'timeout', True, False, 0),\
Option('z', 'zone', True, False, ''),\
]\
@ -288,13 +287,38 @@ def check_zone_valid(query_cur, zone):
else:
logging.info("zone is empty, check all servers in cluster")
def fetch_tenant_ids(query_cur):
try:
tenant_id_list = []
(desc, results) = query_cur.exec_query("""select distinct tenant_id from oceanbase.__all_tenant order by tenant_id desc""")
for r in results:
tenant_id_list.append(r[0])
return tenant_id_list
except Exception, e:
logging.exception('fail to fetch distinct tenant ids')
raise e
def set_default_timeout_by_tenant(query_cur, timeout, timeout_per_tenant, min_timeout):
if timeout > 0:
logging.info("use timeout from opt, timeout(s):{0}".format(timeout))
else:
tenant_id_list = fetch_tenant_ids(query_cur)
cal_timeout = len(tenant_id_list) * timeout_per_tenant
timeout = (cal_timeout if cal_timeout > min_timeout else min_timeout)
logging.info("use default timeout caculated by tenants, "
"timeout(s):{0}, tenant_count:{1}, "
"timeout_per_tenant(s):{2}, min_timeout(s):{3}"
.format(timeout, len(tenant_id_list), timeout_per_tenant, min_timeout))
return timeout
#### START ####
# 0. 检查server版本是否严格一致
def check_server_version_by_zone(query_cur, zone):
if zone == '':
logging.info("skip check server version by cluster")
else:
sql = """select distinct(substring_index(build_version, '_', 1)) from __all_server where zone = '{0}'""".format(zone);
sql = """select distinct(substring_index(build_version, '_', 1)) from oceanbase.__all_server where zone = '{0}'""".format(zone);
(desc, results) = query_cur.exec_query(sql);
if len(results) != 1:
raise MyError("servers build_version not match")
@ -304,8 +328,9 @@ def check_server_version_by_zone(query_cur, zone):
# 1. 检查paxos副本是否同步, paxos副本是否缺失
def check_paxos_replica(query_cur, timeout):
# 1.1 检查paxos副本是否同步
sql = """select count(*) from GV$OB_LOG_STAT where in_sync = 'NO'"""
check_until_timeout(query_cur, sql, 0, timeout)
sql = """select count(*) from oceanbase.GV$OB_LOG_STAT where in_sync = 'NO'"""
wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
check_until_timeout(query_cur, sql, 0, wait_timeout)
# 1.2 检查paxos副本是否有缺失 TODO
logging.info('check paxos replica success')
@ -315,26 +340,29 @@ def check_observer_status(query_cur, zone, timeout):
sql = """select count(*) from oceanbase.__all_server where (start_service_time <= 0 or status='inactive')"""
if zone != '':
sql += """ and zone = '{0}'""".format(zone)
check_until_timeout(query_cur, sql, 0, timeout)
wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
check_until_timeout(query_cur, sql, 0, wait_timeout)
# 3. 检查schema是否刷新成功
def check_schema_status(query_cur, timeout):
sql = """select if (a.cnt = b.cnt, 1, 0) as passed from (select count(*) as cnt from oceanbase.__all_virtual_server_schema_info where refreshed_schema_version > 1 and refreshed_schema_version % 8 = 0) as a join (select count(*) as cnt from oceanbase.__all_server join oceanbase.__all_tenant) as b"""
check_until_timeout(query_cur, sql, 1, timeout)
wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
check_until_timeout(query_cur, sql, 1, wait_timeout)
# 4. check major finish
def check_major_merge(query_cur, timeout):
need_check = 0
(desc, results) = query_cur.exec_query("""select distinct value from GV$OB_PARAMETERs where name = 'enable_major_freeze';""")
(desc, results) = query_cur.exec_query("""select distinct value from oceanbase.GV$OB_PARAMETERS where name = 'enable_major_freeze';""")
if len(results) != 1:
need_check = 1
elif results[0][0] != 'True':
need_check = 1
if need_check == 1:
sql = """select count(1) from CDB_OB_MAJOR_COMPACTION where (GLOBAL_BROADCAST_SCN > LAST_SCN or STATUS != 'IDLE')"""
check_until_timeout(query_cur, sql, 0, timeout)
sql2 = """select /*+ query_timeout(1000000000) */ count(1) from __all_virtual_tablet_compaction_info where max_received_scn > finished_scn and max_received_scn > 0"""
check_until_timeout(query_cur, sql2, 0, timeout)
wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
sql = """select count(1) from oceanbase.CDB_OB_MAJOR_COMPACTION where (GLOBAL_BROADCAST_SCN > LAST_SCN or STATUS != 'IDLE')"""
check_until_timeout(query_cur, sql, 0, wait_timeout)
sql2 = """select /*+ query_timeout(1000000000) */ count(1) from oceanbase.__all_virtual_tablet_compaction_info where max_received_scn > finished_scn and max_received_scn > 0"""
check_until_timeout(query_cur, sql2, 0, wait_timeout)
def check_until_timeout(query_cur, sql, value, timeout):
times = timeout / 10
@ -366,7 +394,6 @@ def do_check(my_host, my_port, my_user, my_passwd, upgrade_params, timeout, need
raise_on_warnings = True)
conn.autocommit = True
cur = conn.cursor(buffered=True)
timeout = timeout if timeout > 0 else 600
try:
query_cur = QueryCursor(cur)
check_zone_valid(query_cur, zone)

View File

@ -135,16 +135,45 @@
# cur.execute(sql)
# wait_parameter_sync(cur, False, parameter, value, timeout)
#
#def set_session_timeout(cur, seconds):
# sql = "set @@session.ob_query_timeout = {0}".format(seconds * 1000 * 1000)
# logging.info(sql)
# cur.execute(sql)
#
#def set_default_timeout_by_tenant(cur, timeout, timeout_per_tenant, min_timeout):
# if timeout > 0:
# logging.info("use timeout from opt, timeout(s):{0}".format(timeout))
# else:
# query_cur = QueryCursor(cur)
# tenant_id_list = fetch_tenant_ids(query_cur)
# cal_timeout = len(tenant_id_list) * timeout_per_tenant
# timeout = (cal_timeout if cal_timeout > min_timeout else min_timeout)
# logging.info("use default timeout caculated by tenants, "
# "timeout(s):{0}, tenant_count:{1}, "
# "timeout_per_tenant(s):{2}, min_timeout(s):{3}"
# .format(timeout, len(tenant_id_list), timeout_per_tenant, min_timeout))
#
# return timeout
#
#def set_tenant_parameter(cur, parameter, value, timeout = 0):
#
# tenants_list = []
# if get_min_cluster_version(cur) < get_version("4.2.1.0"):
# tenants_list = ['all']
# else:
# tenants_list = ['sys', 'all_user', 'all_meta']
#
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
#
# set_session_timeout(cur, query_timeout)
#
# for tenants in tenants_list:
# sql = """alter system set {0} = '{1}' tenant = '{2}'""".format(parameter, value, tenants)
# logging.info(sql)
# cur.execute(sql)
#
# set_session_timeout(cur, 10)
#
# wait_parameter_sync(cur, True, parameter, value, timeout)
#
#def get_ori_enable_ddl(cur, timeout):
@ -233,7 +262,20 @@
# table_name = "GV$OB_PARAMETERS" if not is_tenant_config else "__all_virtual_tenant_parameter_info"
# sql = """select count(*) as cnt from oceanbase.{0}
# where name = '{1}' and value != '{2}'""".format(table_name, key, value)
# times = (timeout if timeout > 0 else 60) / 5
#
# wait_timeout = 0
# query_timeout = 0
# if not is_tenant_config or timeout > 0:
# wait_timeout = (timeout if timeout > 0 else 60)
# query_timeout = wait_timeout
# else:
# # is_tenant_config & timeout not set
# wait_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 2, 60)
#
# set_session_timeout(cur, query_timeout)
#
# times = wait_timeout / 5
# while times >= 0:
# logging.info(sql)
# cur.execute(sql)
@ -253,6 +295,8 @@
# raise e
# time.sleep(5)
#
# set_session_timeout(cur, 10)
#
#def do_begin_upgrade(cur, timeout):
#
# if not check_parameter(cur, False, "enable_upgrade_mode", "True"):
@ -324,24 +368,38 @@
# tenants_list = ['all']
# else:
# tenants_list = ['sys', 'all_user', 'all_meta']
#
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
#
# set_session_timeout(cur, query_timeout)
#
# for tenants in tenants_list:
# action_sql = "alter system suspend merge tenant = {0}".format(tenants)
# rollback_sql = "alter system resume merge tenant = {0}".format(tenants)
# logging.info(action_sql)
# cur.execute(action_sql)
#
# set_session_timeout(cur, 10)
#
#def do_resume_merge(cur, timeout):
# tenants_list = []
# if get_min_cluster_version(cur) < get_version("4.2.1.0"):
# tenants_list = ['all']
# else:
# tenants_list = ['sys', 'all_user', 'all_meta']
#
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
#
# set_session_timeout(cur, query_timeout)
#
# for tenants in tenants_list:
# action_sql = "alter system resume merge tenant = {0}".format(tenants)
# rollback_sql = "alter system suspend merge tenant = {0}".format(tenants)
# logging.info(action_sql)
# cur.execute(action_sql)
#
# set_session_timeout(cur, 10)
#
#class Cursor:
# __cursor = None
# def __init__(self, cursor):
@ -1334,17 +1392,7 @@
# else:
# run_upgrade_job(conn, cur, "UPGRADE_VIRTUAL_SCHEMA", timeout)
#
# # just to make __all_virtual_upgrade_inspection avaliable
# timeout_ts = (timeout if timeout > 0 else 600) * 1000 * 1000
# sql = "set @@session.ob_query_timeout = {0}".format(timeout_ts)
# logging.info(sql)
# cur.execute(sql)
# sql = "alter system run job 'root_inspection'"
# logging.info(sql)
# cur.execute(sql)
# sql = "set @@session.ob_query_timeout = 10000000"
# logging.info(sql)
# cur.execute(sql)
# run_root_inspection(cur, timeout)
#####========******####======== actions begin ========####******========####
# upgrade_syslog_level(conn, cur)
# return
@ -1360,7 +1408,6 @@
# info_cnt = result[0][0]
# if info_cnt > 0:
# actions.set_parameter(cur, "syslog_level", "WDIAG")
#
# except Exception, e:
# logging.warn("upgrade syslog level failed!")
# raise e
@ -1375,6 +1422,18 @@
#def get_tenant_ids(cur):
# return [_[0] for _ in query(cur, 'select tenant_id from oceanbase.__all_tenant')]
#
#def run_root_inspection(cur, timeout):
#
# query_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 600)
#
# actions.set_session_timeout(cur, query_timeout)
#
# sql = "alter system run job 'root_inspection'"
# logging.info(sql)
# cur.execute(sql)
#
# actions.set_session_timeout(cur, 10)
#
#def upgrade_across_version(cur):
# current_data_version = actions.get_current_data_version()
# int_current_data_version = actions.get_version(current_data_version)
@ -1480,7 +1539,9 @@
#
#def check_upgrade_job_result(cur, job_name, timeout, max_used_job_id):
# try:
# times = (timeout if timeout > 0 else 3600) / 10
# wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 100, 3600)
#
# times = wait_timeout / 10
# while (times >= 0):
# sql = """select job_status, rs_svr_ip, rs_svr_port, gmt_create from oceanbase.__all_rootservice_job
# where job_type = '{0}' and job_id > {1} order by job_id desc limit 1
@ -2217,7 +2278,8 @@
# fail_list.append("""still has tenant snapshot task, upgrade is not allowed temporarily""")
# else:
# logging.info('check tenant snapshot task success')
## 19. 检查是否有租户在升到4.3.0版本之前已将binlog_row_image设为MINIMAL
#
## 17. 检查是否有租户在升到4.3.0版本之前已将binlog_row_image设为MINIMAL
#def check_variable_binlog_row_image(query_cur):
## 4.3.0.0之前的版本,MINIMAL模式生成的日志CDC无法正常消费(DELETE日志).
## 4.3.0版本开始,MINIMAL模式做了改进,支持CDC消费,需要在升级到4.3.0.0之后再打开.
@ -2405,7 +2467,7 @@
#' that all modules should be run. They are splitted by ",".\n' +\
#' For example: -m all, or --module=ddl,normal_dml,special_action\n' +\
#'-l, --log-file=name Log file path. If log file path is not given it\'s ' + os.path.splitext(sys.argv[0])[0] + '.log\n' +\
#'-t, --timeout=name check timeout, default: 600(s).\n' + \
#'-t, --timeout=name check timeout.\n' + \
#'-z, --zone=name If zone is not specified, check all servers status in cluster. \n' +\
#' Otherwise, only check servers status in specified zone. \n' + \
#'\n\n' +\
@ -2467,8 +2529,7 @@
#Option('m', 'module', True, False, 'all'),\
## 日志文件路径,不同脚本的main函数中中会改成不同的默认值
#Option('l', 'log-file', True, False),\
## 一些检查的超时时间,默认是600s
#Option('t', 'timeout', True, False, '600'),\
#Option('t', 'timeout', True, False, 0),\
#Option('z', 'zone', True, False, ''),\
#]\
#
@ -2620,13 +2681,38 @@
# else:
# logging.info("zone is empty, check all servers in cluster")
#
#def fetch_tenant_ids(query_cur):
# try:
# tenant_id_list = []
# (desc, results) = query_cur.exec_query("""select distinct tenant_id from oceanbase.__all_tenant order by tenant_id desc""")
# for r in results:
# tenant_id_list.append(r[0])
# return tenant_id_list
# except Exception, e:
# logging.exception('fail to fetch distinct tenant ids')
# raise e
#
#def set_default_timeout_by_tenant(query_cur, timeout, timeout_per_tenant, min_timeout):
# if timeout > 0:
# logging.info("use timeout from opt, timeout(s):{0}".format(timeout))
# else:
# tenant_id_list = fetch_tenant_ids(query_cur)
# cal_timeout = len(tenant_id_list) * timeout_per_tenant
# timeout = (cal_timeout if cal_timeout > min_timeout else min_timeout)
# logging.info("use default timeout caculated by tenants, "
# "timeout(s):{0}, tenant_count:{1}, "
# "timeout_per_tenant(s):{2}, min_timeout(s):{3}"
# .format(timeout, len(tenant_id_list), timeout_per_tenant, min_timeout))
#
# return timeout
#
##### START ####
## 0. 检查server版本是否严格一致
#def check_server_version_by_zone(query_cur, zone):
# if zone == '':
# logging.info("skip check server version by cluster")
# else:
# sql = """select distinct(substring_index(build_version, '_', 1)) from __all_server where zone = '{0}'""".format(zone);
# sql = """select distinct(substring_index(build_version, '_', 1)) from oceanbase.__all_server where zone = '{0}'""".format(zone);
# (desc, results) = query_cur.exec_query(sql);
# if len(results) != 1:
# raise MyError("servers build_version not match")
@ -2636,8 +2722,9 @@
## 1. 检查paxos副本是否同步, paxos副本是否缺失
#def check_paxos_replica(query_cur, timeout):
# # 1.1 检查paxos副本是否同步
# sql = """select count(*) from GV$OB_LOG_STAT where in_sync = 'NO'"""
# check_until_timeout(query_cur, sql, 0, timeout)
# sql = """select count(*) from oceanbase.GV$OB_LOG_STAT where in_sync = 'NO'"""
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
# check_until_timeout(query_cur, sql, 0, wait_timeout)
#
# # 1.2 检查paxos副本是否有缺失 TODO
# logging.info('check paxos replica success')
@ -2647,26 +2734,29 @@
# sql = """select count(*) from oceanbase.__all_server where (start_service_time <= 0 or status='inactive')"""
# if zone != '':
# sql += """ and zone = '{0}'""".format(zone)
# check_until_timeout(query_cur, sql, 0, timeout)
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
# check_until_timeout(query_cur, sql, 0, wait_timeout)
#
## 3. 检查schema是否刷新成功
#def check_schema_status(query_cur, timeout):
# sql = """select if (a.cnt = b.cnt, 1, 0) as passed from (select count(*) as cnt from oceanbase.__all_virtual_server_schema_info where refreshed_schema_version > 1 and refreshed_schema_version % 8 = 0) as a join (select count(*) as cnt from oceanbase.__all_server join oceanbase.__all_tenant) as b"""
# check_until_timeout(query_cur, sql, 1, timeout)
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
# check_until_timeout(query_cur, sql, 1, wait_timeout)
#
## 4. check major finish
#def check_major_merge(query_cur, timeout):
# need_check = 0
# (desc, results) = query_cur.exec_query("""select distinct value from GV$OB_PARAMETERs where name = 'enable_major_freeze';""")
# (desc, results) = query_cur.exec_query("""select distinct value from oceanbase.GV$OB_PARAMETERS where name = 'enable_major_freeze';""")
# if len(results) != 1:
# need_check = 1
# elif results[0][0] != 'True':
# need_check = 1
# if need_check == 1:
# sql = """select count(1) from CDB_OB_MAJOR_COMPACTION where (GLOBAL_BROADCAST_SCN > LAST_SCN or STATUS != 'IDLE')"""
# check_until_timeout(query_cur, sql, 0, timeout)
# sql2 = """select /*+ query_timeout(1000000000) */ count(1) from __all_virtual_tablet_compaction_info where max_received_scn > finished_scn and max_received_scn > 0"""
# check_until_timeout(query_cur, sql2, 0, timeout)
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
# sql = """select count(1) from oceanbase.CDB_OB_MAJOR_COMPACTION where (GLOBAL_BROADCAST_SCN > LAST_SCN or STATUS != 'IDLE')"""
# check_until_timeout(query_cur, sql, 0, wait_timeout)
# sql2 = """select /*+ query_timeout(1000000000) */ count(1) from oceanbase.__all_virtual_tablet_compaction_info where max_received_scn > finished_scn and max_received_scn > 0"""
# check_until_timeout(query_cur, sql2, 0, wait_timeout)
#
#def check_until_timeout(query_cur, sql, value, timeout):
# times = timeout / 10
@ -2698,7 +2788,6 @@
# raise_on_warnings = True)
# conn.autocommit = True
# cur = conn.cursor(buffered=True)
# timeout = timeout if timeout > 0 else 600
# try:
# query_cur = QueryCursor(cur)
# check_zone_valid(query_cur, zone)
@ -2795,8 +2884,14 @@
# # check compatible sync
# parameter_count = int(server_count) * int(tenant_count)
# current_data_version = actions.get_current_data_version()
#
# query_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 2, 60)
# actions.set_session_timeout(cur, query_timeout)
#
# sql = """select count(*) as cnt from oceanbase.__all_virtual_tenant_parameter_info where name = 'compatible' and value = '{0}' and tenant_id in ({1})""".format(current_data_version, tenant_ids_str)
# times = (timeout if timeout > 0 else 60) / 5
#
# wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 60)
# times = wait_timeout / 5
# while times >= 0:
# logging.info(sql)
# cur.execute(sql)
@ -2816,6 +2911,8 @@
# raise e
# time.sleep(5)
#
# actions.set_session_timeout(cur, 10)
#
# # check target_data_version/current_data_version from __all_core_table
# int_current_data_version = actions.get_version(current_data_version)
# sql = "select count(*) from __all_virtual_core_table where column_name in ('target_data_version', 'current_data_version') and column_value = {0} and tenant_id in ({1})".format(int_current_data_version, tenant_ids_str)
@ -2830,16 +2927,20 @@
# logging.info("all tenant's target_data_version/current_data_version are match with {0}".format(current_data_version))
#
## 3 检查内部表自检是否成功
#def check_root_inspection(query_cur, timeout):
#def check_root_inspection(cur, query_cur, timeout):
# sql = "select count(*) from oceanbase.__all_virtual_upgrade_inspection where info != 'succeed'"
# times = timeout if timeout > 0 else 180
# while times > 0 :
#
# wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 600)
#
# times = wait_timeout / 10
# while times >= 0 :
# (desc, results) = query_cur.exec_query(sql)
# if results[0][0] == 0:
# break
# time.sleep(10)
# times -= 1
# if times == 0:
#
# if times == -1:
# logging.warn('check root inspection failed!')
# raise e
# logging.info('check root inspection success')
@ -2867,7 +2968,7 @@
# try:
# check_cluster_version(cur, timeout)
# check_data_version(cur, query_cur, timeout)
# check_root_inspection(query_cur, timeout)
# check_root_inspection(cur, query_cur, timeout)
# enable_ddl(cur, timeout)
# enable_rebalance(cur, timeout)
# enable_rereplication(cur, timeout)

View File

@ -41,8 +41,14 @@ def check_data_version(cur, query_cur, timeout):
# check compatible sync
parameter_count = int(server_count) * int(tenant_count)
current_data_version = actions.get_current_data_version()
query_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 2, 60)
actions.set_session_timeout(cur, query_timeout)
sql = """select count(*) as cnt from oceanbase.__all_virtual_tenant_parameter_info where name = 'compatible' and value = '{0}' and tenant_id in ({1})""".format(current_data_version, tenant_ids_str)
times = (timeout if timeout > 0 else 60) / 5
wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 60)
times = wait_timeout / 5
while times >= 0:
logging.info(sql)
cur.execute(sql)
@ -62,6 +68,8 @@ def check_data_version(cur, query_cur, timeout):
raise e
time.sleep(5)
actions.set_session_timeout(cur, 10)
# check target_data_version/current_data_version from __all_core_table
int_current_data_version = actions.get_version(current_data_version)
sql = "select count(*) from __all_virtual_core_table where column_name in ('target_data_version', 'current_data_version') and column_value = {0} and tenant_id in ({1})".format(int_current_data_version, tenant_ids_str)
@ -76,16 +84,20 @@ def check_data_version(cur, query_cur, timeout):
logging.info("all tenant's target_data_version/current_data_version are match with {0}".format(current_data_version))
# 3 检查内部表自检是否成功
def check_root_inspection(query_cur, timeout):
def check_root_inspection(cur, query_cur, timeout):
sql = "select count(*) from oceanbase.__all_virtual_upgrade_inspection where info != 'succeed'"
times = timeout if timeout > 0 else 180
while times > 0 :
wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 600)
times = wait_timeout / 10
while times >= 0 :
(desc, results) = query_cur.exec_query(sql)
if results[0][0] == 0:
break
time.sleep(10)
times -= 1
if times == 0:
if times == -1:
logging.warn('check root inspection failed!')
raise e
logging.info('check root inspection success')
@ -113,7 +125,7 @@ def do_check(conn, cur, query_cur, timeout):
try:
check_cluster_version(cur, timeout)
check_data_version(cur, query_cur, timeout)
check_root_inspection(query_cur, timeout)
check_root_inspection(cur, query_cur, timeout)
enable_ddl(cur, timeout)
enable_rebalance(cur, timeout)
enable_rereplication(cur, timeout)

View File

@ -135,16 +135,45 @@
# cur.execute(sql)
# wait_parameter_sync(cur, False, parameter, value, timeout)
#
#def set_session_timeout(cur, seconds):
# sql = "set @@session.ob_query_timeout = {0}".format(seconds * 1000 * 1000)
# logging.info(sql)
# cur.execute(sql)
#
#def set_default_timeout_by_tenant(cur, timeout, timeout_per_tenant, min_timeout):
# if timeout > 0:
# logging.info("use timeout from opt, timeout(s):{0}".format(timeout))
# else:
# query_cur = QueryCursor(cur)
# tenant_id_list = fetch_tenant_ids(query_cur)
# cal_timeout = len(tenant_id_list) * timeout_per_tenant
# timeout = (cal_timeout if cal_timeout > min_timeout else min_timeout)
# logging.info("use default timeout caculated by tenants, "
# "timeout(s):{0}, tenant_count:{1}, "
# "timeout_per_tenant(s):{2}, min_timeout(s):{3}"
# .format(timeout, len(tenant_id_list), timeout_per_tenant, min_timeout))
#
# return timeout
#
#def set_tenant_parameter(cur, parameter, value, timeout = 0):
#
# tenants_list = []
# if get_min_cluster_version(cur) < get_version("4.2.1.0"):
# tenants_list = ['all']
# else:
# tenants_list = ['sys', 'all_user', 'all_meta']
#
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
#
# set_session_timeout(cur, query_timeout)
#
# for tenants in tenants_list:
# sql = """alter system set {0} = '{1}' tenant = '{2}'""".format(parameter, value, tenants)
# logging.info(sql)
# cur.execute(sql)
#
# set_session_timeout(cur, 10)
#
# wait_parameter_sync(cur, True, parameter, value, timeout)
#
#def get_ori_enable_ddl(cur, timeout):
@ -233,7 +262,20 @@
# table_name = "GV$OB_PARAMETERS" if not is_tenant_config else "__all_virtual_tenant_parameter_info"
# sql = """select count(*) as cnt from oceanbase.{0}
# where name = '{1}' and value != '{2}'""".format(table_name, key, value)
# times = (timeout if timeout > 0 else 60) / 5
#
# wait_timeout = 0
# query_timeout = 0
# if not is_tenant_config or timeout > 0:
# wait_timeout = (timeout if timeout > 0 else 60)
# query_timeout = wait_timeout
# else:
# # is_tenant_config & timeout not set
# wait_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 2, 60)
#
# set_session_timeout(cur, query_timeout)
#
# times = wait_timeout / 5
# while times >= 0:
# logging.info(sql)
# cur.execute(sql)
@ -253,6 +295,8 @@
# raise e
# time.sleep(5)
#
# set_session_timeout(cur, 10)
#
#def do_begin_upgrade(cur, timeout):
#
# if not check_parameter(cur, False, "enable_upgrade_mode", "True"):
@ -324,24 +368,38 @@
# tenants_list = ['all']
# else:
# tenants_list = ['sys', 'all_user', 'all_meta']
#
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
#
# set_session_timeout(cur, query_timeout)
#
# for tenants in tenants_list:
# action_sql = "alter system suspend merge tenant = {0}".format(tenants)
# rollback_sql = "alter system resume merge tenant = {0}".format(tenants)
# logging.info(action_sql)
# cur.execute(action_sql)
#
# set_session_timeout(cur, 10)
#
#def do_resume_merge(cur, timeout):
# tenants_list = []
# if get_min_cluster_version(cur) < get_version("4.2.1.0"):
# tenants_list = ['all']
# else:
# tenants_list = ['sys', 'all_user', 'all_meta']
#
# query_timeout = set_default_timeout_by_tenant(cur, timeout, 10, 60)
#
# set_session_timeout(cur, query_timeout)
#
# for tenants in tenants_list:
# action_sql = "alter system resume merge tenant = {0}".format(tenants)
# rollback_sql = "alter system suspend merge tenant = {0}".format(tenants)
# logging.info(action_sql)
# cur.execute(action_sql)
#
# set_session_timeout(cur, 10)
#
#class Cursor:
# __cursor = None
# def __init__(self, cursor):
@ -1334,17 +1392,7 @@
# else:
# run_upgrade_job(conn, cur, "UPGRADE_VIRTUAL_SCHEMA", timeout)
#
# # just to make __all_virtual_upgrade_inspection avaliable
# timeout_ts = (timeout if timeout > 0 else 600) * 1000 * 1000
# sql = "set @@session.ob_query_timeout = {0}".format(timeout_ts)
# logging.info(sql)
# cur.execute(sql)
# sql = "alter system run job 'root_inspection'"
# logging.info(sql)
# cur.execute(sql)
# sql = "set @@session.ob_query_timeout = 10000000"
# logging.info(sql)
# cur.execute(sql)
# run_root_inspection(cur, timeout)
#####========******####======== actions begin ========####******========####
# upgrade_syslog_level(conn, cur)
# return
@ -1360,7 +1408,6 @@
# info_cnt = result[0][0]
# if info_cnt > 0:
# actions.set_parameter(cur, "syslog_level", "WDIAG")
#
# except Exception, e:
# logging.warn("upgrade syslog level failed!")
# raise e
@ -1375,6 +1422,18 @@
#def get_tenant_ids(cur):
# return [_[0] for _ in query(cur, 'select tenant_id from oceanbase.__all_tenant')]
#
#def run_root_inspection(cur, timeout):
#
# query_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 600)
#
# actions.set_session_timeout(cur, query_timeout)
#
# sql = "alter system run job 'root_inspection'"
# logging.info(sql)
# cur.execute(sql)
#
# actions.set_session_timeout(cur, 10)
#
#def upgrade_across_version(cur):
# current_data_version = actions.get_current_data_version()
# int_current_data_version = actions.get_version(current_data_version)
@ -1480,7 +1539,9 @@
#
#def check_upgrade_job_result(cur, job_name, timeout, max_used_job_id):
# try:
# times = (timeout if timeout > 0 else 3600) / 10
# wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 100, 3600)
#
# times = wait_timeout / 10
# while (times >= 0):
# sql = """select job_status, rs_svr_ip, rs_svr_port, gmt_create from oceanbase.__all_rootservice_job
# where job_type = '{0}' and job_id > {1} order by job_id desc limit 1
@ -2217,7 +2278,8 @@
# fail_list.append("""still has tenant snapshot task, upgrade is not allowed temporarily""")
# else:
# logging.info('check tenant snapshot task success')
## 19. 检查是否有租户在升到4.3.0版本之前已将binlog_row_image设为MINIMAL
#
## 17. 检查是否有租户在升到4.3.0版本之前已将binlog_row_image设为MINIMAL
#def check_variable_binlog_row_image(query_cur):
## 4.3.0.0之前的版本,MINIMAL模式生成的日志CDC无法正常消费(DELETE日志).
## 4.3.0版本开始,MINIMAL模式做了改进,支持CDC消费,需要在升级到4.3.0.0之后再打开.
@ -2405,7 +2467,7 @@
#' that all modules should be run. They are splitted by ",".\n' +\
#' For example: -m all, or --module=ddl,normal_dml,special_action\n' +\
#'-l, --log-file=name Log file path. If log file path is not given it\'s ' + os.path.splitext(sys.argv[0])[0] + '.log\n' +\
#'-t, --timeout=name check timeout, default: 600(s).\n' + \
#'-t, --timeout=name check timeout.\n' + \
#'-z, --zone=name If zone is not specified, check all servers status in cluster. \n' +\
#' Otherwise, only check servers status in specified zone. \n' + \
#'\n\n' +\
@ -2467,8 +2529,7 @@
#Option('m', 'module', True, False, 'all'),\
## 日志文件路径,不同脚本的main函数中中会改成不同的默认值
#Option('l', 'log-file', True, False),\
## 一些检查的超时时间,默认是600s
#Option('t', 'timeout', True, False, '600'),\
#Option('t', 'timeout', True, False, 0),\
#Option('z', 'zone', True, False, ''),\
#]\
#
@ -2620,13 +2681,38 @@
# else:
# logging.info("zone is empty, check all servers in cluster")
#
#def fetch_tenant_ids(query_cur):
# try:
# tenant_id_list = []
# (desc, results) = query_cur.exec_query("""select distinct tenant_id from oceanbase.__all_tenant order by tenant_id desc""")
# for r in results:
# tenant_id_list.append(r[0])
# return tenant_id_list
# except Exception, e:
# logging.exception('fail to fetch distinct tenant ids')
# raise e
#
#def set_default_timeout_by_tenant(query_cur, timeout, timeout_per_tenant, min_timeout):
# if timeout > 0:
# logging.info("use timeout from opt, timeout(s):{0}".format(timeout))
# else:
# tenant_id_list = fetch_tenant_ids(query_cur)
# cal_timeout = len(tenant_id_list) * timeout_per_tenant
# timeout = (cal_timeout if cal_timeout > min_timeout else min_timeout)
# logging.info("use default timeout caculated by tenants, "
# "timeout(s):{0}, tenant_count:{1}, "
# "timeout_per_tenant(s):{2}, min_timeout(s):{3}"
# .format(timeout, len(tenant_id_list), timeout_per_tenant, min_timeout))
#
# return timeout
#
##### START ####
## 0. 检查server版本是否严格一致
#def check_server_version_by_zone(query_cur, zone):
# if zone == '':
# logging.info("skip check server version by cluster")
# else:
# sql = """select distinct(substring_index(build_version, '_', 1)) from __all_server where zone = '{0}'""".format(zone);
# sql = """select distinct(substring_index(build_version, '_', 1)) from oceanbase.__all_server where zone = '{0}'""".format(zone);
# (desc, results) = query_cur.exec_query(sql);
# if len(results) != 1:
# raise MyError("servers build_version not match")
@ -2636,8 +2722,9 @@
## 1. 检查paxos副本是否同步, paxos副本是否缺失
#def check_paxos_replica(query_cur, timeout):
# # 1.1 检查paxos副本是否同步
# sql = """select count(*) from GV$OB_LOG_STAT where in_sync = 'NO'"""
# check_until_timeout(query_cur, sql, 0, timeout)
# sql = """select count(*) from oceanbase.GV$OB_LOG_STAT where in_sync = 'NO'"""
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
# check_until_timeout(query_cur, sql, 0, wait_timeout)
#
# # 1.2 检查paxos副本是否有缺失 TODO
# logging.info('check paxos replica success')
@ -2647,26 +2734,29 @@
# sql = """select count(*) from oceanbase.__all_server where (start_service_time <= 0 or status='inactive')"""
# if zone != '':
# sql += """ and zone = '{0}'""".format(zone)
# check_until_timeout(query_cur, sql, 0, timeout)
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
# check_until_timeout(query_cur, sql, 0, wait_timeout)
#
## 3. 检查schema是否刷新成功
#def check_schema_status(query_cur, timeout):
# sql = """select if (a.cnt = b.cnt, 1, 0) as passed from (select count(*) as cnt from oceanbase.__all_virtual_server_schema_info where refreshed_schema_version > 1 and refreshed_schema_version % 8 = 0) as a join (select count(*) as cnt from oceanbase.__all_server join oceanbase.__all_tenant) as b"""
# check_until_timeout(query_cur, sql, 1, timeout)
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
# check_until_timeout(query_cur, sql, 1, wait_timeout)
#
## 4. check major finish
#def check_major_merge(query_cur, timeout):
# need_check = 0
# (desc, results) = query_cur.exec_query("""select distinct value from GV$OB_PARAMETERs where name = 'enable_major_freeze';""")
# (desc, results) = query_cur.exec_query("""select distinct value from oceanbase.GV$OB_PARAMETERS where name = 'enable_major_freeze';""")
# if len(results) != 1:
# need_check = 1
# elif results[0][0] != 'True':
# need_check = 1
# if need_check == 1:
# sql = """select count(1) from CDB_OB_MAJOR_COMPACTION where (GLOBAL_BROADCAST_SCN > LAST_SCN or STATUS != 'IDLE')"""
# check_until_timeout(query_cur, sql, 0, timeout)
# sql2 = """select /*+ query_timeout(1000000000) */ count(1) from __all_virtual_tablet_compaction_info where max_received_scn > finished_scn and max_received_scn > 0"""
# check_until_timeout(query_cur, sql2, 0, timeout)
# wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
# sql = """select count(1) from oceanbase.CDB_OB_MAJOR_COMPACTION where (GLOBAL_BROADCAST_SCN > LAST_SCN or STATUS != 'IDLE')"""
# check_until_timeout(query_cur, sql, 0, wait_timeout)
# sql2 = """select /*+ query_timeout(1000000000) */ count(1) from oceanbase.__all_virtual_tablet_compaction_info where max_received_scn > finished_scn and max_received_scn > 0"""
# check_until_timeout(query_cur, sql2, 0, wait_timeout)
#
#def check_until_timeout(query_cur, sql, value, timeout):
# times = timeout / 10
@ -2698,7 +2788,6 @@
# raise_on_warnings = True)
# conn.autocommit = True
# cur = conn.cursor(buffered=True)
# timeout = timeout if timeout > 0 else 600
# try:
# query_cur = QueryCursor(cur)
# check_zone_valid(query_cur, zone)
@ -2795,8 +2884,14 @@
# # check compatible sync
# parameter_count = int(server_count) * int(tenant_count)
# current_data_version = actions.get_current_data_version()
#
# query_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 2, 60)
# actions.set_session_timeout(cur, query_timeout)
#
# sql = """select count(*) as cnt from oceanbase.__all_virtual_tenant_parameter_info where name = 'compatible' and value = '{0}' and tenant_id in ({1})""".format(current_data_version, tenant_ids_str)
# times = (timeout if timeout > 0 else 60) / 5
#
# wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 60)
# times = wait_timeout / 5
# while times >= 0:
# logging.info(sql)
# cur.execute(sql)
@ -2816,6 +2911,8 @@
# raise e
# time.sleep(5)
#
# actions.set_session_timeout(cur, 10)
#
# # check target_data_version/current_data_version from __all_core_table
# int_current_data_version = actions.get_version(current_data_version)
# sql = "select count(*) from __all_virtual_core_table where column_name in ('target_data_version', 'current_data_version') and column_value = {0} and tenant_id in ({1})".format(int_current_data_version, tenant_ids_str)
@ -2830,16 +2927,20 @@
# logging.info("all tenant's target_data_version/current_data_version are match with {0}".format(current_data_version))
#
## 3 检查内部表自检是否成功
#def check_root_inspection(query_cur, timeout):
#def check_root_inspection(cur, query_cur, timeout):
# sql = "select count(*) from oceanbase.__all_virtual_upgrade_inspection where info != 'succeed'"
# times = timeout if timeout > 0 else 180
# while times > 0 :
#
# wait_timeout = actions.set_default_timeout_by_tenant(cur, timeout, 10, 600)
#
# times = wait_timeout / 10
# while times >= 0 :
# (desc, results) = query_cur.exec_query(sql)
# if results[0][0] == 0:
# break
# time.sleep(10)
# times -= 1
# if times == 0:
#
# if times == -1:
# logging.warn('check root inspection failed!')
# raise e
# logging.info('check root inspection success')
@ -2867,7 +2968,7 @@
# try:
# check_cluster_version(cur, timeout)
# check_data_version(cur, query_cur, timeout)
# check_root_inspection(query_cur, timeout)
# check_root_inspection(cur, query_cur, timeout)
# enable_ddl(cur, timeout)
# enable_rebalance(cur, timeout)
# enable_rereplication(cur, timeout)