[CP] [UPGRADE] Calculate default timeout according to tenant count

This commit is contained in:
tino247
2023-12-28 05:19:19 +00:00
committed by ob-robot
parent 901bf26411
commit 9093d820a0
6 changed files with 406 additions and 104 deletions

View File

@ -73,7 +73,7 @@ sys.argv[0] + """ [OPTIONS]""" +\
' that all modules should be run. They are splitted by ",".\n' +\
' For example: -m all, or --module=ddl,normal_dml,special_action\n' +\
'-l, --log-file=name Log file path. If log file path is not given it\'s ' + os.path.splitext(sys.argv[0])[0] + '.log\n' +\
'-t, --timeout=name check timeout, default: 600(s).\n' + \
'-t, --timeout=name check timeout.\n' + \
'-z, --zone=name If zone is not specified, check all servers status in cluster. \n' +\
' Otherwise, only check servers status in specified zone. \n' + \
'\n\n' +\
@ -135,8 +135,7 @@ Option('p', 'password', True, False, ''),\
Option('m', 'module', True, False, 'all'),\
# Log file path; the main function of each script changes this to its own default value
Option('l', 'log-file', True, False),\
# 一些检查的超时时间,默认是600s
Option('t', 'timeout', True, False, '600'),\
Option('t', 'timeout', True, False, 0),\
Option('z', 'zone', True, False, ''),\
]\
@ -288,13 +287,38 @@ def check_zone_valid(query_cur, zone):
else:
logging.info("zone is empty, check all servers in cluster")
def fetch_tenant_ids(query_cur):
    """Return the distinct tenant ids stored in oceanbase.__all_tenant.

    Args:
        query_cur: object exposing exec_query(sql), which returns a
            (description, rows) pair.

    Returns:
        A list holding the first column of every returned row, in the order
        the query yields them (the SQL orders by tenant_id descending).

    Raises:
        Whatever exec_query (or row access) raises; the failure is logged
        with its traceback and then re-raised unchanged.
    """
    try:
        (_, results) = query_cur.exec_query("""select distinct tenant_id from oceanbase.__all_tenant order by tenant_id desc""")
        return [row[0] for row in results]
    except Exception:
        logging.exception('fail to fetch distinct tenant ids')
        # A bare raise keeps the original traceback; `raise e` would reset it
        # on Python 2.
        raise
def set_default_timeout_by_tenant(query_cur, timeout, timeout_per_tenant, min_timeout):
    """Pick the timeout (in seconds) a check should wait for.

    Args:
        query_cur: cursor wrapper passed to fetch_tenant_ids when a default
            has to be calculated; untouched when timeout is positive.
        timeout: timeout supplied by the caller; any value > 0 is used as is.
        timeout_per_tenant: seconds budgeted per tenant for the default.
        min_timeout: lower bound applied to the calculated default.

    Returns:
        timeout itself when it is > 0, otherwise the larger of
        tenant_count * timeout_per_tenant and min_timeout.
    """
    if timeout > 0:
        logging.info("use timeout from opt, timeout(s):{0}".format(timeout))
        return timeout

    tenant_count = len(fetch_tenant_ids(query_cur))
    timeout = max(tenant_count * timeout_per_tenant, min_timeout)
    logging.info("use default timeout caculated by tenants, "
                 "timeout(s):{0}, tenant_count:{1}, "
                 "timeout_per_tenant(s):{2}, min_timeout(s):{3}"
                 .format(timeout, tenant_count, timeout_per_tenant, min_timeout))
    return timeout
#### START ####
# 0. Check whether the server versions are strictly identical
def check_server_version_by_zone(query_cur, zone):
if zone == '':
logging.info("skip check server version by cluster")
else:
sql = """select distinct(substring_index(build_version, '_', 1)) from __all_server where zone = '{0}'""".format(zone);
sql = """select distinct(substring_index(build_version, '_', 1)) from oceanbase.__all_server where zone = '{0}'""".format(zone);
(desc, results) = query_cur.exec_query(sql);
if len(results) != 1:
raise MyError("servers build_version not match")
@ -304,8 +328,9 @@ def check_server_version_by_zone(query_cur, zone):
# 1. Check whether paxos replicas are in sync and whether any paxos replica is missing
def check_paxos_replica(query_cur, timeout):
    """Run the paxos replica sync check and log success afterwards.

    Args:
        query_cur: cursor wrapper handed to the timeout and polling helpers.
        timeout: caller-supplied timeout in seconds; it is resolved through
            set_default_timeout_by_tenant with 10s per tenant and a 600s
            floor, so a non-positive value yields a tenant-based default.

    NOTE(review): check_until_timeout is defined elsewhere in this file;
    presumably it polls until the query result equals the expected value (0
    here) or the timeout elapses -- confirm against its definition.
    """
    # 1.1 Check whether the paxos replicas are in sync.
    sql = """select count(*) from oceanbase.GV$OB_LOG_STAT where in_sync = 'NO'"""
    # Only the resolved timeout is used for waiting; the raw option value may
    # be 0 when no timeout was given.
    wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
    check_until_timeout(query_cur, sql, 0, wait_timeout)
    # 1.2 Check whether any paxos replica is missing. TODO
    logging.info('check paxos replica success')
@ -315,26 +340,29 @@ def check_observer_status(query_cur, zone, timeout):
sql = """select count(*) from oceanbase.__all_server where (start_service_time <= 0 or status='inactive')"""
if zone != '':
sql += """ and zone = '{0}'""".format(zone)
check_until_timeout(query_cur, sql, 0, timeout)
wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 10, 600)
check_until_timeout(query_cur, sql, 0, wait_timeout)
# 3. Check whether the schema has been refreshed successfully
def check_schema_status(query_cur, timeout):
    """Run the schema refresh check.

    The query yields 1 when the number of rows in
    oceanbase.__all_virtual_server_schema_info with a refreshed schema
    version (> 1 and divisible by 8) equals the row count of
    __all_server joined with __all_tenant, and 0 otherwise.

    Args:
        query_cur: cursor wrapper handed to the timeout and polling helpers.
        timeout: caller-supplied timeout in seconds; it is resolved through
            set_default_timeout_by_tenant with 30s per tenant and a 600s
            floor, so a non-positive value yields a tenant-based default.

    NOTE(review): check_until_timeout is defined elsewhere in this file;
    presumably it polls until the query result equals the expected value (1
    here) or the timeout elapses -- confirm against its definition.
    """
    sql = """select if (a.cnt = b.cnt, 1, 0) as passed from (select count(*) as cnt from oceanbase.__all_virtual_server_schema_info where refreshed_schema_version > 1 and refreshed_schema_version % 8 = 0) as a join (select count(*) as cnt from oceanbase.__all_server join oceanbase.__all_tenant) as b"""
    # Only the resolved timeout is used for waiting; the raw option value may
    # be 0 when no timeout was given.
    wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
    check_until_timeout(query_cur, sql, 1, wait_timeout)
# 4. check major finish
def check_major_merge(query_cur, timeout):
    """Run the major compaction checks when the parameter query calls for it.

    The distinct values of the enable_major_freeze parameter are read once.
    The compaction checks run unless that query returns exactly one row
    whose value is the string 'True'.

    Args:
        query_cur: cursor wrapper; must expose exec_query(sql) returning a
            (description, rows) pair, and is handed to the helpers.
        timeout: caller-supplied timeout in seconds; it is resolved through
            set_default_timeout_by_tenant with 30s per tenant and a 600s
            floor, so a non-positive value yields a tenant-based default.

    NOTE(review): check_until_timeout is defined elsewhere in this file;
    presumably it polls until the query result equals the expected value (0
    here) or the timeout elapses -- confirm against its definition.
    """
    (_, results) = query_cur.exec_query("""select distinct value from oceanbase.GV$OB_PARAMETERS where name = 'enable_major_freeze';""")
    need_check = len(results) != 1 or results[0][0] != 'True'
    if not need_check:
        return

    # One resolved timeout is shared by both compaction waits.
    wait_timeout = set_default_timeout_by_tenant(query_cur, timeout, 30, 600)
    sql = """select count(1) from oceanbase.CDB_OB_MAJOR_COMPACTION where (GLOBAL_BROADCAST_SCN > LAST_SCN or STATUS != 'IDLE')"""
    check_until_timeout(query_cur, sql, 0, wait_timeout)
    sql2 = """select /*+ query_timeout(1000000000) */ count(1) from oceanbase.__all_virtual_tablet_compaction_info where max_received_scn > finished_scn and max_received_scn > 0"""
    check_until_timeout(query_cur, sql2, 0, wait_timeout)
def check_until_timeout(query_cur, sql, value, timeout):
times = timeout / 10
@ -366,7 +394,6 @@ def do_check(my_host, my_port, my_user, my_passwd, upgrade_params, timeout, need
raise_on_warnings = True)
conn.autocommit = True
cur = conn.cursor(buffered=True)
timeout = timeout if timeout > 0 else 600
try:
query_cur = QueryCursor(cur)
check_zone_valid(query_cur, zone)