#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Pre-upgrade cluster health checker.

This part of the file holds the shared infrastructure: the error type, a thin
cursor wrapper, command-line option handling and logging configuration.
"""

import sys
import os
import time
import mysql.connector
from mysql.connector import errorcode
import logging
import getopt


class UpgradeParams:
    """Holder for parameters shared by the upgrade scripts."""

    # Default log file name; the main section overrides it with --log-file.
    log_filename = 'upgrade_cluster_health_checker.log'


#### --------------start : my_error.py --------------
class MyError(Exception):
    """Error raised by this script when a check or an option is invalid."""

    def __init__(self, value):
        # Populate Exception.args as well so the value survives generic
        # handling (e.args, pickling).
        super(MyError, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)


#### --------------start : actions.py (only query statements are allowed) --------------
class QueryCursor(object):
    """Thin wrapper around a DB-API cursor that logs every statement.

    Intended for query statements; nothing in this class enforces that.
    """

    def __init__(self, cursor):
        self.__cursor = cursor

    def exec_sql(self, sql, print_when_succ=True):
        """Execute ``sql`` and return the cursor's rowcount.

        Any exception is logged with its traceback and re-raised unchanged.
        """
        try:
            self.__cursor.execute(sql)
            rowcount = self.__cursor.rowcount
            if print_when_succ:
                logging.info('succeed to execute sql: %s, rowcount = %d', sql, rowcount)
            return rowcount
        except mysql.connector.Error:
            logging.exception('mysql connector error, fail to execute sql: %s', sql)
            raise
        except Exception:
            logging.exception('normal error, fail to execute sql: %s', sql)
            raise

    def exec_query(self, sql, print_when_succ=True):
        """Execute ``sql`` and return ``(cursor.description, all_rows)``.

        Any exception is logged with its traceback and re-raised unchanged.
        """
        try:
            self.__cursor.execute(sql)
            results = self.__cursor.fetchall()
            rowcount = self.__cursor.rowcount
            if print_when_succ:
                logging.info('succeed to execute query: %s, rowcount = %d', sql, rowcount)
            return (self.__cursor.description, results)
        except mysql.connector.Error:
            logging.exception('mysql connector error, fail to execute sql: %s', sql)
            raise
        except Exception:
            logging.exception('normal error, fail to execute sql: %s', sql)
            raise
#### ---------------end----------------------


#### --------------start : opt.py --------------
help_str = (
    """ Help: """ +
    sys.argv[0] + """ [OPTIONS]""" +
    '\n\n' +
    '-I, --help Display this help and exit.\n' +
    '-V, --version Output version information and exit.\n' +
    '-h, --host=name Connect to host.\n' +
    '-P, --port=name Port number to use for connection.\n' +
    '-u, --user=name User for login.\n' +
    '-p, --password=name Password to use when connecting to server. If password is\n' +
    ' not given it\'s empty string "".\n' +
    '-m, --module=name Modules to run. Modules should be a string combined by some of\n' +
    ' the following strings: ddl, normal_dml, each_tenant_dml,\n' +
    ' system_variable_dml, special_action, all. "all" represents\n' +
    ' that all modules should be run. They are splitted by ",".\n' +
    ' For example: -m all, or --module=ddl,normal_dml,special_action\n' +
    '-l, --log-file=name Log file path. If log file path is not given it\'s ' + os.path.splitext(sys.argv[0])[0] + '.log\n' +
    '-t, --timeout=name check timeout, default: 600(s).\n' +
    '\n\n' +
    'Maybe you want to run cmd like that:\n' +
    sys.argv[0] + ' -h 127.0.0.1 -P 3306 -u xxx -p xxx\n'
)

version_str = """version 1.0.0"""


class Option(object):
    """One command-line option (short name, long name, optional value).

    Short and long names are registered in class-level sets so that declaring
    the same name twice raises MyError.
    """

    # Registries shared by all instances, used for duplicate detection.
    __g_short_name_set = set([])
    __g_long_name_set = set([])

    def __init__(self, short_name, long_name, is_with_param, is_local_opt, default_value=None):
        if short_name in Option.__g_short_name_set:
            raise MyError('duplicate option short name: {0}'.format(short_name))
        elif long_name in Option.__g_long_name_set:
            raise MyError('duplicate option long name: {0}'.format(long_name))
        Option.__g_short_name_set.add(short_name)
        Option.__g_long_name_set.add(long_name)
        self.__short_name = short_name
        self.__long_name = long_name
        self.__is_with_param = is_with_param
        self.__is_local_opt = is_local_opt
        self.__has_value = False
        self.__value = None
        if default_value is not None:
            self.set_value(default_value)

    def is_with_param(self):
        return self.__is_with_param

    def get_short_name(self):
        return self.__short_name

    def get_long_name(self):
        return self.__long_name

    def has_value(self):
        return self.__has_value

    def get_value(self):
        return self.__value

    def set_value(self, value):
        self.__value = value
        self.__has_value = True

    def is_local_opt(self):
        return self.__is_local_opt

    def is_valid(self):
        """Return True when both names are set and a non-None value was given."""
        return (self.__short_name is not None
                and self.__long_name is not None
                and self.__has_value
                and self.__value is not None)


g_opts = [
    Option('I', 'help', False, True),
    Option('V', 'version', False, True),
    Option('h', 'host', True, False),
    Option('P', 'port', True, False),
    Option('u', 'user', True, False),
    Option('p', 'password', True, False, ''),
    # Which modules to run; all of them by default.
    Option('m', 'module', True, False, 'all'),
    # Log file path; each script's main section sets its own default.
    Option('l', 'log-file', True, False),
    # Timeout for some of the checks; 600s by default.
    Option('t', 'timeout', True, False, '600'),
]


def change_opt_defult_value(opt_long_name, opt_default_val):
    """Set the value of the option named ``opt_long_name`` (no-op if unknown)."""
    for opt in g_opts:
        if opt.get_long_name() == opt_long_name:
            opt.set_value(opt_default_val)
            return


def has_no_local_opts():
    """Return True when no local option (help/version) has been given."""
    return not any(opt.is_local_opt() and opt.has_value() for opt in g_opts)


def check_db_client_opts():
    """Raise MyError if any non-local (connection) option has no value."""
    for opt in g_opts:
        if not opt.is_local_opt() and not opt.has_value():
            raise MyError('option "-{0}" has not been specified, maybe you should run "{1} --help" for help'
                          .format(opt.get_short_name(), sys.argv[0]))


def parse_option(opt_name, opt_val):
    """Store ``opt_val`` on the option matching ``-x`` / ``--long`` name ``opt_name``."""
    for opt in g_opts:
        if opt_name in (('-' + opt.get_short_name()), ('--' + opt.get_long_name())):
            opt.set_value(opt_val)


def parse_options(argv):
    """Parse ``argv`` with getopt and record the values in ``g_opts``.

    When no local option was given, the connection options are validated and
    MyError is raised if one is missing.
    """
    short_opt_str = ''
    long_opt_list = []
    for opt in g_opts:
        # getopt marks options taking an argument with ':' (short) or '=' (long).
        if opt.is_with_param():
            short_opt_str += opt.get_short_name() + ':'
            long_opt_list.append(opt.get_long_name() + '=')
        else:
            short_opt_str += opt.get_short_name()
            long_opt_list.append(opt.get_long_name())
    (opts, args) = getopt.getopt(argv, short_opt_str, long_opt_list)
    for (opt_name, opt_val) in opts:
        parse_option(opt_name, opt_val)
    if has_no_local_opts():
        check_db_client_opts()


def deal_with_local_opt(opt):
    """Print the help or version text for the given local option."""
    if 'help' == opt.get_long_name():
        print(help_str)
    elif 'version' == opt.get_long_name():
        print(version_str)


def deal_with_local_opts():
    """Handle the first local option that was given; raise MyError if none."""
    if has_no_local_opts():
        raise MyError('no local options, can not deal with local options')
    for opt in g_opts:
        if opt.is_local_opt() and opt.has_value():
            deal_with_local_opt(opt)
            # Only handle one local option.
            return


def _get_opt_value(long_name):
    """Return the value of the option with this long name, or None if unknown."""
    for opt in g_opts:
        if long_name == opt.get_long_name():
            return opt.get_value()
    return None


def get_opt_host():
    return _get_opt_value('host')


def get_opt_port():
    return _get_opt_value('port')


def get_opt_user():
    return _get_opt_value('user')


def get_opt_password():
    return _get_opt_value('password')


def get_opt_module():
    return _get_opt_value('module')


def get_opt_log_file():
    return _get_opt_value('log-file')


def get_opt_timeout():
    return _get_opt_value('timeout')
#### ---------------end----------------------


#### --------------start : do_upgrade_pre.py--------------
def config_logging_module(log_filenamme):
    """Send INFO+ logs to ``log_filenamme`` (truncated on open) and to stdout."""
    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(levelname)s %(filename)s:%(lineno)d %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        filename=log_filenamme,
                        filemode='w')
    # Log line format.
    formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(filename)s:%(lineno)d %(message)s', '%Y-%m-%d %H:%M:%S')
    #######################################
    # Handler that prints INFO and above to sys.stdout.
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setLevel(logging.INFO)
    # Apply the log line format.
    stdout_handler.setFormatter(formatter)
    # Attach the stdout handler to the root logger.
    logging.getLogger('').addHandler(stdout_handler)
#### ---------------end----------------------


#### START ####
# 1. Check that paxos replicas are in sync and that none are missing.
def check_paxos_replica(query_cur):
    """Raise MyError if any online paxos replica is reported as out of sync.

    query_cur: object exposing exec_query(sql) -> (description, rows).
    """
    # 1.1 Check that paxos replicas are in sync.
    (_, results) = query_cur.exec_query("""select count(1) as unsync_cnt from __all_virtual_clog_stat where is_in_sync = 0 and is_offline = 0 and replica_type != 16""")
    if results[0][0] > 0:
        raise MyError('{0} replicas unsync, please check'.format(results[0][0]))
    # 1.2 Check for missing paxos replicas: TODO
    logging.info('check paxos replica success')


# 2. Check that no balance task or locality change is in progress.
def check_rebalance_task(query_cur):
    """Raise MyError if a rootservice job or a rebalance task is still running."""
    # 2.1 Check for an in-progress locality change.
    (_, results) = query_cur.exec_query("""select count(1) as cnt from __all_rootservice_job where job_status='INPROGRESS' and return_code is null""")
    if results[0][0] > 0:
        raise MyError('{0} locality tasks is doing, please check'.format(results[0][0]))
    # 2.2 Check for an in-progress balance task.
    (_, results) = query_cur.exec_query("""select count(1) as rebalance_task_cnt from __all_virtual_rebalance_task_stat""")
    if results[0][0] > 0:
        raise MyError('{0} rebalance tasks is doing, please check'.format(results[0][0]))
    logging.info('check rebalance task success')


# 3. Check the cluster status.
def check_cluster_status(query_cur):
    """Raise MyError unless the global merge status is IDLE and last_merged_version >= 2."""
    # 3.1 Check that the cluster is not merging.
    (_, results) = query_cur.exec_query("""select info from __all_zone where zone='' and name='merge_status'""")
    if not results:
        # Without this guard a missing row would surface as an IndexError.
        raise MyError('global merge_status not found in __all_zone')
    # Plain inequality; cmp() does not exist on Python 3.
    if results[0][0] != 'IDLE':
        raise MyError('global status expected = {0}, actual = {1}'.format('IDLE', results[0][0]))
    logging.info('check cluster status success')
    # 3.2 Check that the merged version is >= 2.
    (_, results) = query_cur.exec_query("""select cast(value as unsigned) value from __all_zone where zone='' and name='last_merged_version'""")
    if not results:
        raise MyError('global last_merged_version not found in __all_zone')
    if results[0][0] < 2:
        raise MyError('global last_merged_version expected >= 2 actual = {0}'.format(results[0][0]))
    logging.info('check global last_merged_version success')
# 4. Check that the tenant partition count does not exceed the memory limit.
def check_tenant_part_num(query_cur):
    """Raise MyError if a tenant holds too many partitions on a server.

    For every (server, tenant) partition count, each unit of that tenant on
    that server is checked against a limit derived from the unit's max_memory
    and the server's memstore_limit_percentage. Servers with no
    memstore_limit_percentage row are not checked.

    query_cur: object exposing exec_query(sql) -> (description, rows).
    """
    # Partition count of each tenant on each server.
    (_, res_part_num) = query_cur.exec_query("""select svr_ip, svr_port, table_id >> 40 as tenant_id, count(*) as part_num from __all_virtual_clog_stat group by 1,2,3 order by 1,2,3""")
    # max_memory of each tenant on each server.
    (_, res_unit_memory) = query_cur.exec_query("""select u.svr_ip, u.svr_port, t.tenant_id, uc.max_memory, p.replica_type from __all_unit u, __All_resource_pool p, __all_tenant t, __all_unit_config uc where p.resource_pool_id = u.resource_pool_id and t.tenant_id = p.tenant_id and p.unit_config_id = uc.unit_config_id""")
    # memstore_limit_percentage of each server.
    (_, res_svr_memstore_percent) = query_cur.exec_query("""select svr_ip, svr_port, name, value from __all_virtual_sys_parameter_stat where name = 'memstore_limit_percentage'""")

    part_static_cost = 128 * 1024
    part_dynamic_cost = 400 * 1024
    # Tables may need to be created during the upgrade, so reserve 512 partitions.
    part_num_reserved = 512

    # Index the memstore rows by server; only the first row per server is used.
    memstore_by_server = {}
    for mpline in res_svr_memstore_percent:
        memstore_by_server.setdefault((mpline[0], mpline[1]), mpline)

    # Index the unit rows by (server, tenant), keeping the query order.
    units_by_server_tenant = {}
    for uline in res_unit_memory:
        units_by_server_tenant.setdefault((uline[0], uline[1], uline[2]), []).append(uline)

    for line in res_part_num:
        svr_ip, svr_port, tenant_id, part_num = line[0], line[1], line[2], line[3]
        mpline = memstore_by_server.get((svr_ip, svr_port))
        if mpline is None:
            continue
        for uline in units_by_server_tenant.get((svr_ip, svr_port, tenant_id), ()):
            umem = uline[3]
            utype = uline[4]
            mspercent = int(mpline[3])
            mem_limit = umem
            if 0 == utype:
                # Units of type "full" must reserve memory for the memstore.
                mem_limit = umem * (100 - mspercent) // 100
            # Floor division keeps the limits integral on Python 2 and 3 alike.
            part_num_limit = mem_limit // (part_static_cost + part_dynamic_cost // 10)
            if part_num_limit <= 1000:
                part_num_limit = mem_limit // (part_static_cost + part_dynamic_cost)
            if part_num >= (part_num_limit - part_num_reserved):
                raise MyError('{0} {1} {2} exceed tenant partition num limit, please check'.format(line, uline, mpline))
    logging.info('check tenant partition num success')
# 5. Check for observers that hold tenant partitions but have no unit.
def check_tenant_resource(query_cur):
    """Raise MyError for the first (tenant, server) that has partitions but no unit."""
    (_, res_unit) = query_cur.exec_query(
        """select tenant_id, svr_ip, svr_port from __all_virtual_partition_info """
        """where (tenant_id, svr_ip, svr_port) not in """
        """(select tenant_id, svr_ip, svr_port from __all_unit, __all_resource_pool """
        """where __all_unit.resource_pool_id = __all_resource_pool.resource_pool_id """
        """group by tenant_id, svr_ip, svr_port) group by tenant_id, svr_ip, svr_port""")
    if res_unit:
        raise MyError('{0} tenant unit not exist but partition exist'.format(res_unit[0]))
    logging.info("check tenant resource success")


# 6. Check that progressive_merge_round has been raised to 1 everywhere.
def check_progressive_merge_round(query_cur):
    """Raise MyError if a main table or index still has progressive_merge_round = 0."""
    (_, results) = query_cur.exec_query(
        """select count(*) as cnt from __all_virtual_table """
        """where progressive_merge_round = 0 and table_type not in (1,2,4) and data_table_id = 0""")
    if results[0][0] != 0:
        raise MyError("""progressive_merge_round of main table should all be 1""")
    (_, results) = query_cur.exec_query(
        """select count(*) as cnt from __all_virtual_table """
        """where progressive_merge_round = 0 and table_type not in (1,2,4) and data_table_id > 0 """
        """and data_table_id in (select table_id from __all_virtual_table """
        """where table_type not in (1,2,4) and data_table_id = 0)""")
    if results[0][0] != 0:
        raise MyError("""progressive_merge_round of index should all be 1""")
    logging.info("""check progressive_merge_round status success""")


def _wait_for_zero_count(query_cur, query_sql, timeout):
    """Poll ``query_sql`` once per second until its single value is 0.

    Returns True as soon as the query yields exactly one row whose first
    column is 0, and False after ``timeout`` unsuccessful attempts. A timeout
    of zero or less yields False without running the query, so callers never
    report success for a check that did not run.
    """
    remaining = timeout
    while remaining > 0:
        (_, rows) = query_cur.exec_query(query_sql)
        if len(rows) == 1 and rows[0][0] == 0:
            return True
        time.sleep(1)
        remaining -= 1
    return False


# Primary cluster status check.
def check_primary_cluster_sync_status(query_cur, timeout):
    """Wait up to ``timeout`` polls for every standby to catch up with the primary.

    Raises MyError if the primary's current_scn cannot be read as a single row
    or if the standbys are not all valid and caught up in time.
    """
    (_, res) = query_cur.exec_query("""select current_scn from oceanbase.v$ob_cluster where cluster_role='PRIMARY' and cluster_status='VALID'""")
    if len(res) != 1:
        raise MyError('query results count is not 1')
    query_sql = "select count(*) from oceanbase.v$ob_standby_status where cluster_role != 'PHYSICAL STANDBY' or cluster_status != 'VALID' or current_scn < {0}".format(res[0][0])
    if not _wait_for_zero_count(query_cur, query_sql, timeout):
        raise MyError("there exists standby cluster not synchronizing, checking primary cluster status failed!!!")
    logging.info("check primary cluster sync status success")


# Standby cluster status check.
def check_standby_cluster_sync_status(query_cur, timeout):
    """Wait up to ``timeout`` polls for this standby cluster to be valid and caught up.

    Raises MyError if the condition is not met in time.
    """
    (_, res) = query_cur.exec_query("""select time_to_usec(now(6)) from dual""")
    query_sql = "select count(*) from oceanbase.v$ob_cluster where (cluster_role != 'PHYSICAL STANDBY') or (cluster_status != 'VALID') or (current_scn < {0}) or (switchover_status != 'NOT ALLOWED')".format(res[0][0])
    if not _wait_for_zero_count(query_cur, query_sql, timeout):
        raise MyError('current standby cluster not synchronizing, please check!!!')
    logging.info("check standby cluster sync status success")


# Decide whether this is the primary or a standby cluster.
def check_cluster_sync_status(query_cur, timeout):
    """Run the primary or the standby sync check depending on the cluster role."""
    (_, res) = query_cur.exec_query("""select cluster_role from oceanbase.v$ob_cluster""")
    if res[0][0] == 'PRIMARY':
        check_primary_cluster_sync_status(query_cur, timeout)
    else:
        check_standby_cluster_sync_status(query_cur, timeout)


# Checks run before the upgrade starts.
def do_check(my_host, my_port, my_user, my_passwd, upgrade_params, timeout):
    """Connect to the cluster and run every pre-upgrade check.

    ``upgrade_params`` is accepted for interface compatibility and is not used
    here. Any failure is logged with its traceback and re-raised; the cursor
    and the connection are closed on every path.
    """
    try:
        conn = mysql.connector.connect(user=my_user,
                                       password=my_passwd,
                                       host=my_host,
                                       port=my_port,
                                       database='oceanbase',
                                       raise_on_warnings=True)
        try:
            conn.autocommit = True
            cur = conn.cursor(buffered=True)
            try:
                query_cur = QueryCursor(cur)
                check_paxos_replica(query_cur)
                check_rebalance_task(query_cur)
                check_cluster_status(query_cur)
                check_tenant_part_num(query_cur)
                check_tenant_resource(query_cur)
                check_cluster_sync_status(query_cur, timeout)
            except Exception:
                logging.exception('run error')
                raise
            finally:
                cur.close()
        finally:
            # Separate finally so the connection is released even if creating
            # or closing the cursor fails.
            conn.close()
    except mysql.connector.Error:
        logging.exception('connection error')
        raise
    except Exception:
        logging.exception('normal error')
        raise


def main():
    """Parse the command line, then either print help/version or run the checks."""
    upgrade_params = UpgradeParams()
    change_opt_defult_value('log-file', upgrade_params.log_filename)
    parse_options(sys.argv[1:])
    if not has_no_local_opts():
        deal_with_local_opts()
        return
    check_db_client_opts()
    log_filename = get_opt_log_file()
    upgrade_params.log_filename = log_filename
    # Logging is configured only here so that the earlier steps do not
    # overwrite the log file.
    config_logging_module(upgrade_params.log_filename)
    try:
        host = get_opt_host()
        port = int(get_opt_port())
        user = get_opt_user()
        password = get_opt_password()
        timeout = int(get_opt_timeout())
        # The password is masked: credentials must not end up in the log.
        logging.info('parameters from cmd: host=\"%s\", port=%s, user=\"%s\", password=\"%s\", log-file=\"%s\", timeout=%s',
                     host, port, user, '******', log_filename, timeout)
        do_check(host, port, user, password, upgrade_params, timeout)
    except mysql.connector.Error:
        logging.exception('mysql connctor error')
        raise
    except Exception:
        logging.exception('normal error')
        raise


if __name__ == '__main__':
    main()