From 46495bb59edff2d7d0e4d6b3c3b1cc4b5f50c961 Mon Sep 17 00:00:00 2001 From: VilhoRaatikka Date: Tue, 3 Jun 2014 19:39:50 +0300 Subject: [PATCH] Bug #443, http://bugs.skysql.com/show_bug.cgi?id=443 monitors used mysql_ping without MYSQL_OPT_READ_TIMEOUT which caused read to block. Fixed in mysql and galera monitor. Added log writing per each status change of each server and repeatedly if server is not running. Removed SERVER_IS_JOINED checks from rwsplit router. --- server/modules/include/readwritesplit.h | 3 +- server/modules/monitor/galera_mon.c | 23 +++++++++++ server/modules/monitor/mysql_mon.c | 41 +++++++++++++++---- server/modules/routing/readconnroute.c | 2 +- .../routing/readwritesplit/readwritesplit.c | 2 +- utils/skygw_debug.h | 8 +++- 6 files changed, 66 insertions(+), 13 deletions(-) diff --git a/server/modules/include/readwritesplit.h b/server/modules/include/readwritesplit.h index a4eecf4d5..00857ae1b 100644 --- a/server/modules/include/readwritesplit.h +++ b/server/modules/include/readwritesplit.h @@ -229,7 +229,6 @@ typedef struct router_instance { } ROUTER_INSTANCE; #define BACKEND_TYPE(b) (SERVER_IS_MASTER((b)->backend_server) ? BE_MASTER : \ - (SERVER_IS_SLAVE((b)->backend_server) ? BE_SLAVE : \ - (SERVER_IS_JOINED((b)->backend_server) ? BE_JOINED : BE_UNDEFINED))); + (SERVER_IS_SLAVE((b)->backend_server) ? 
BE_SLAVE : BE_UNDEFINED)); #endif /*< _RWSPLITROUTER_H */ diff --git a/server/modules/monitor/galera_mon.c b/server/modules/monitor/galera_mon.c index 715edfb96..a9f242756 100644 --- a/server/modules/monitor/galera_mon.c +++ b/server/modules/monitor/galera_mon.c @@ -317,10 +317,22 @@ char *server_string; if (database->con == NULL || mysql_ping(database->con) != 0) { char *dpwd = decryptPassword(passwd); + int rc; + int read_timeout = 1; + database->con = mysql_init(NULL); + rc = mysql_options(database->con, MYSQL_OPT_READ_TIMEOUT, (void *)&read_timeout); + if (mysql_real_connect(database->con, database->server->name, uname, dpwd, NULL, database->server->port, NULL, 0) == NULL) { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error : Monitor was unable to connect to " + "server %s:%d : \"%s\"", + database->server->name, + database->server->port, + mysql_error(database->con)))); server_clear_status(database->server, SERVER_RUNNING); database->server->node_id = -1; free(dpwd); @@ -416,6 +428,7 @@ long master_id; while (ptr) { + unsigned int prev_status = ptr->server->status; monitorDatabase(ptr, handle->defaultUser, handle->defaultPasswd); /* set master_id to the lowest value of ptr->server->node_id */ @@ -433,6 +446,16 @@ long master_id; server_clear_status(ptr->server, SERVER_SLAVE); server_clear_status(ptr->server, SERVER_MASTER); } + if (ptr->server->status != prev_status || + SERVER_IS_DOWN(ptr->server)) + { + LOGIF(LM, (skygw_log_write_flush( + LOGFILE_MESSAGE, + "Backend server %s:%d state : %s", + ptr->server->name, + ptr->server->port, + STRSRVSTATUS(ptr->server)))); + } ptr = ptr->next; } diff --git a/server/modules/monitor/mysql_mon.c b/server/modules/monitor/mysql_mon.c index 78ced4d75..d643a00b9 100644 --- a/server/modules/monitor/mysql_mon.c +++ b/server/modules/monitor/mysql_mon.c @@ -302,7 +302,7 @@ char *sep; * Monitor an individual server * * @param handle The MySQL Monitor object - * @param database The database to probe + * @param database 
The database to probe */ static void monitorDatabase(MYSQL_MONITOR *handle, MONITOR_SERVERS *database) @@ -324,7 +324,7 @@ int replication_heartbeat = handle->replicationHeartbeat; } if (uname == NULL) return; - + /* Don't probe servers in maintenance mode */ if (SERVER_IN_MAINT(database->server)) return; @@ -332,7 +332,11 @@ int replication_heartbeat = handle->replicationHeartbeat; if (database->con == NULL || mysql_ping(database->con) != 0) { char *dpwd = decryptPassword(passwd); + int rc; + int read_timeout = 1; database->con = mysql_init(NULL); + rc = mysql_options(database->con, MYSQL_OPT_READ_TIMEOUT, (void *)&read_timeout); + if (mysql_real_connect(database->con, database->server->name, uname, @@ -342,6 +346,14 @@ int replication_heartbeat = handle->replicationHeartbeat; NULL, 0) == NULL) { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error : Monitor was unable to connect to " + "server %s:%d : \"%s\"", + database->server->name, + database->server->port, + mysql_error(database->con)))); + free(dpwd); server_clear_status(database->server, SERVER_RUNNING); return; @@ -626,7 +638,6 @@ int replication_heartbeat = handle->replicationHeartbeat; server_clear_status(database->server, SERVER_SLAVE); server_clear_status(database->server, SERVER_MASTER); } - } /** @@ -661,13 +672,27 @@ MONITOR_SERVERS *ptr; ptr = handle->databases; while (ptr) { + unsigned int prev_status = ptr->server->status; + monitorDatabase(handle, ptr); + + if (ptr->server->status != prev_status || + SERVER_IS_DOWN(ptr->server)) + { + LOGIF(LM, (skygw_log_write_flush( + LOGFILE_MESSAGE, + "Backend server %s:%d state : %s", + ptr->server->name, + ptr->server->port, + STRSRVSTATUS(ptr->server)))); + } + ptr = ptr->next; } thread_millisleep(handle->interval); } } - + /** * Set the default id to use in the monitor. 
 * @@ -676,11 +701,11 @@ MONITOR_SERVERS *ptr; */ static void defaultId(void *arg, unsigned long id) -{ + { MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg; memcpy(&handle->id, &id, sizeof(unsigned long)); -} - + } + /** * Set the monitor sampling interval. * @@ -692,7 +717,7 @@ setInterval(void *arg, unsigned long interval) { MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg; memcpy(&handle->interval, &interval, sizeof(unsigned long)); -} + } /** * Enable/Disable the MySQL Replication hearbeat, detecting slave lag behind master. diff --git a/server/modules/routing/readconnroute.c b/server/modules/routing/readconnroute.c index 7a28241cc..0652f9f0c 100644 --- a/server/modules/routing/readconnroute.c +++ b/server/modules/routing/readconnroute.c @@ -352,7 +352,7 @@ int master_host = -1; inst->bitmask))); } - if (SERVER_IN_MAINT(inst->server)) + if (SERVER_IN_MAINT(inst->servers[i]->server)) continue; /* diff --git a/server/modules/routing/readwritesplit/readwritesplit.c b/server/modules/routing/readwritesplit/readwritesplit.c index fca3bca92..ddb0422c7 100644 --- a/server/modules/routing/readwritesplit/readwritesplit.c +++ b/server/modules/routing/readwritesplit/readwritesplit.c @@ -808,7 +808,7 @@ static bool get_dcb( } ss_dassert(succp); } - else if (btype == BE_MASTER || BE_JOINED) + else if (btype == BE_MASTER) { for (i=0; i<rses->rses_nbackends; i++) { diff --git a/utils/skygw_debug.h b/utils/skygw_debug.h index 7277cb2e5..835b30aaa 100644 --- a/utils/skygw_debug.h +++ b/utils/skygw_debug.h @@ -228,7 +228,13 @@ typedef enum skygw_chk_t { ((c) == LEAST_GLOBAL_CONNECTIONS ? "LEAST_GLOBAL_CONNECTIONS" : \ ((c) == LEAST_ROUTER_CONNECTIONS ? "LEAST_ROUTER_CONNECTIONS" : \ ((c) == LEAST_BEHIND_MASTER ? "LEAST_BEHIND_MASTER" : "Unknown criteria")))) - + +#define STRSRVSTATUS(s) ((SERVER_IS_RUNNING(s) && SERVER_IS_MASTER(s)) ? "RUNNING MASTER" : \ + ((SERVER_IS_RUNNING(s) && SERVER_IS_SLAVE(s)) ? "RUNNING SLAVE" : \ + ((SERVER_IS_RUNNING(s) && SERVER_IS_JOINED(s)) ? 
"RUNNING JOINED" : \ + ((SERVER_IS_RUNNING(s) && SERVER_IS_MAINT(s)) ? "RUNNING MAINTENANCE" : \ + (SERVER_IS_RUNNING(s) ? "RUNNING (only)" : "NO STATUS"))))) + #define CHK_MLIST(l) { \ ss_info_dassert((l->mlist_chk_top == CHK_NUM_MLIST && \ l->mlist_chk_tail == CHK_NUM_MLIST), \