diff --git a/server/modules/monitor/galera_mon.c b/server/modules/monitor/galera_mon.c index 689fabb29..5d5b34b38 100644 --- a/server/modules/monitor/galera_mon.c +++ b/server/modules/monitor/galera_mon.c @@ -77,6 +77,8 @@ static MONITOR_SERVERS *get_candidate_master(MONITOR_SERVERS *); static MONITOR_SERVERS *set_cluster_master(MONITOR_SERVERS *, MONITOR_SERVERS *, int); static void disableMasterFailback(void *, int); static void setNetworkTimeout(void *arg, int type, int value); +static bool mon_status_changed(MONITOR_SERVERS* mon_srv); +static bool mon_print_fail_status(MONITOR_SERVERS* mon_srv); static MONITOR_OBJECT MyObject = { startMonitor, @@ -348,6 +350,9 @@ char *server_string; if (SERVER_IN_MAINT(database->server)) return; + /** Store previous status */ + database->mon_prev_status = database->server->status; + if (database->con == NULL || mysql_ping(database->con) != 0) { char *dpwd = decryptPassword(passwd); @@ -365,13 +370,7 @@ char *server_string; if (mysql_real_connect(database->con, database->server->name, uname, dpwd, NULL, database->server->port, NULL, 0) == NULL) { - LOGIF(LE, (skygw_log_write_flush( - LOGFILE_ERROR, - "Error : Monitor was unable to connect to " - "server %s:%d : \"%s\"", - database->server->name, - database->server->port, - mysql_error(database->con)))); + free(dpwd); server_clear_status(database->server, SERVER_RUNNING); @@ -385,8 +384,20 @@ char *server_string; { server_set_status(database->server, SERVER_AUTH_ERROR); } + database->server->node_id = -1; - free(dpwd); + + if (mon_status_changed(database) && mon_print_fail_status(database)) + { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error : Monitor was unable to connect to " + "server %s:%d : \"%s\"", + database->server->name, + database->server->port, + mysql_error(database->con)))); + } + return; } else @@ -461,6 +472,8 @@ MONITOR_SERVERS *ptr; size_t nrounds = 0; MONITOR_SERVERS *candidate_master = NULL; int master_stickiness = handle->disableMasterFailback; 
+int is_cluster=0; +int log_no_members = 1; if (mysql_thread_init()) { @@ -501,8 +514,6 @@ int master_stickiness = handle->disableMasterFailback; while (ptr) { - unsigned int prev_status = ptr->server->status; - monitorDatabase(handle, ptr); /* clear bits for non member nodes */ @@ -518,8 +529,7 @@ int master_stickiness = handle->disableMasterFailback; } /* Log server status change */ - if (ptr->server->status != prev_status || - SERVER_IS_DOWN(ptr->server)) + if (mon_status_changed(ptr)) { LOGIF(LD, (skygw_log_write_flush( LOGFILE_DEBUG, @@ -529,6 +539,17 @@ int master_stickiness = handle->disableMasterFailback; STRSRVSTATUS(ptr->server)))); } + if (SERVER_IS_DOWN(ptr->server)) + { + /** Increase this server's error count */ + ptr->mon_err_count += 1; + } + else + { + /** Reset this server's error count */ + ptr->mon_err_count = 0; + } + ptr = ptr->next; } @@ -574,8 +595,24 @@ int master_stickiness = handle->disableMasterFailback; } } + is_cluster++; + ptr = ptr->next; } + + if (is_cluster == 0 && log_no_members) { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error: there are no cluster members"))); + log_no_members = 0; + } else { + if (is_cluster > 0 && log_no_members == 0) { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Info: found %i cluster members", is_cluster))); + log_no_members = 1; + } + } } } @@ -678,6 +715,13 @@ MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg; memcpy(&handle->disableMasterFailback, &disable, sizeof(int)); } +/** + * Set a network timeout to use in the monitor. 
+ * + * @param arg The handle allocated by startMonitor + * @param type The connect timeout type + * @param value The timeout value to set + */ static void setNetworkTimeout(void *arg, int type, int value) { @@ -731,3 +775,51 @@ int new_timeout = max_timeout -1; break; } } + +/** + * Check if current monitored server status has changed + * + * @param mon_srv The monitored server + * @return true if status has changed or false + */ +static bool mon_status_changed( + MONITOR_SERVERS* mon_srv) +{ + bool succp; + + if (mon_srv->mon_prev_status != mon_srv->server->status) + { + succp = true; + } + else + { + succp = false; + } + return succp; +} + +/** + * Check if current monitored server has a loggable failure status + * + * @param mon_srv The monitored server + * @return true if failed status can be logged or false + */ +static bool mon_print_fail_status( + MONITOR_SERVERS* mon_srv) +{ + bool succp; + int errcount = mon_srv->mon_err_count; + uint8_t modval; + + modval = 1<<(MIN(errcount/10, 7)); + + if (SERVER_IS_DOWN(mon_srv->server) && errcount == 0) + { + succp = true; + } + else + { + succp = false; + } + return succp; +} diff --git a/server/modules/monitor/mysql_mon.c b/server/modules/monitor/mysql_mon.c index eb70ec233..29a56c1c2 100644 --- a/server/modules/monitor/mysql_mon.c +++ b/server/modules/monitor/mysql_mon.c @@ -386,7 +386,7 @@ char *server_string; if (SERVER_IN_MAINT(database->server)) return; - /** Store prevous status */ + /** Store previous status */ database->mon_prev_status = database->server->status; if (database->con == NULL || mysql_ping(database->con) != 0) @@ -414,17 +414,6 @@ char *server_string; { free(dpwd); - if (mon_print_fail_status(database)) - { - LOGIF(LE, (skygw_log_write_flush( - LOGFILE_ERROR, - "Error : Monitor was unable to connect to " - "server %s:%d : \"%s\"", - database->server->name, - database->server->port, - mysql_error(database->con)))); - } - /* The current server is not running * * Store server NOT running in server 
and monitor server pending struct @@ -450,6 +439,18 @@ char *server_string; monitor_clear_pending_status(database, SERVER_SLAVE_OF_EXTERNAL_MASTER); monitor_clear_pending_status(database, SERVER_STALE_STATUS); + /* Log connect failure only once */ + if (mon_status_changed(database) && mon_print_fail_status(database)) + { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error : Monitor was unable to connect to " + "server %s:%d : \"%s\"", + database->server->name, + database->server->port, + mysql_error(database->con)))); + } + return; } else @@ -608,6 +609,7 @@ int detect_stale_master = handle->detectStaleMaster; int num_servers=0; MONITOR_SERVERS *root_master = NULL; size_t nrounds = 0; +int log_no_master = 1; if (mysql_thread_init()) { @@ -672,21 +674,21 @@ size_t nrounds = 0; dcb_call_foreach(DCB_REASON_NOT_RESPONDING); } - if (mon_status_changed(ptr) || - mon_print_fail_status(ptr)) + if (mon_status_changed(ptr)) { LOGIF(LD, (skygw_log_write_flush( LOGFILE_DEBUG, "Backend server %s:%d state : %s", ptr->server->name, ptr->server->port, - STRSRVSTATUS(ptr->server)))); - } - if (SERVER_IS_DOWN(ptr->server)) - { - /** Increase this server'e error count */ - ptr->mon_err_count += 1; + STRSRVSTATUS(ptr->server)))); } + + if (SERVER_IS_DOWN(ptr->server)) + { + /** Increase this server's error count */ + ptr->mon_err_count += 1; + } else { /** Reset this server's error count */ @@ -724,11 +726,21 @@ size_t nrounds = 0; if (! 
SERVER_IN_MAINT(ptr->server)) { /* If "detect_stale_master" option is On, let's use the previus master */ if (detect_stale_master && root_master && (!strcmp(ptr->server->name, root_master->server->name) && ptr->server->port == root_master->server->port) && (ptr->server->status & SERVER_MASTER) && !(ptr->pending_status & SERVER_MASTER)) { - /* in this case server->status will not be updated from pending_status */ - LOGIF(LM, (skygw_log_write_flush( - LOGFILE_MESSAGE, "[mysql_mon]: root server [%s:%i] is no longer Master, let's use it again even if it could be a stale master, you have been warned!", ptr->server->name, ptr->server->port))); - /* Set the STALE bit for this server in server struct */ + /** + * In this case server->status will not be updated from pending_status. + * Set the STALE bit for this server in server struct + */ server_set_status(ptr->server, SERVER_STALE_STATUS); + + /* log it once */ + if (mon_status_changed(ptr)) { + LOGIF(LM, (skygw_log_write_flush( + LOGFILE_MESSAGE, "[mysql_mon]: root server [%s:%i] is no longer Master," + " let's use it again even if it could be a stale master," + " you have been warned!", + ptr->server->name, + ptr->server->port))); + } } else { ptr->server->status = ptr->pending_status; } @@ -737,6 +749,33 @@ size_t nrounds = 0; ptr = ptr->next; } + /* log master detection failure or first master becomes available after failure */ + if (root_master && mon_status_changed(root_master) && !(root_master->server->status & SERVER_STALE_STATUS)) { + if (root_master->pending_status & (SERVER_MASTER)) { + if (!(root_master->mon_prev_status & SERVER_STALE_STATUS)) { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Info: A Master Server is now available: %s:%i", + root_master->server->name, + root_master->server->port))); + } + } else { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error: No Master can be determined. 
Last known was %s:%i", + root_master->server->name, + root_master->server->port))); + } + log_no_master = 1; + } else { + if (!root_master && log_no_master) { + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error: No Master can be determined"))); + log_no_master = 0; + } + } + /* Do now the heartbeat replication set/get for MySQL Replication Consistency */ if (replication_heartbeat && root_master && (SERVER_IS_MASTER(root_master->server) || SERVER_IS_RELAY_SERVER(root_master->server))) { set_master_heartbeat(handle, root_master); @@ -808,6 +847,12 @@ MYSQL_MONITOR *handle = (MYSQL_MONITOR *)arg; memcpy(&handle->detectStaleMaster, &enable, sizeof(int)); } +/** + * Check if current monitored server status has changed + * + * @param mon_srv The monitored server + * @return true if status has changed or false + */ static bool mon_status_changed( MONITOR_SERVERS* mon_srv) { @@ -824,6 +869,12 @@ static bool mon_status_changed( return succp; } +/** + * Check if current monitored server has a loggable failure status + * + * @param mon_srv The monitored server + * @return true if failed status can be logged or false + */ static bool mon_print_fail_status( MONITOR_SERVERS* mon_srv) { @@ -833,7 +884,7 @@ static bool mon_print_fail_status( modval = 1<<(MIN(errcount/10, 7)); - if (SERVER_IS_DOWN(mon_srv->server) && errcount%modval == 0) + if (SERVER_IS_DOWN(mon_srv->server) && errcount == 0) { succp = true; } @@ -1178,6 +1229,7 @@ static MONITOR_SERVERS *get_replication_tree(MYSQL_MONITOR *handle, int num_serv add_slave_to_master(master->server->slaves, MONITOR_MAX_NUM_SLAVES, current->node_id); master->server->depth = current->depth -1; monitor_set_pending_status(master, SERVER_MASTER); + handle->master = master; } else { if (current->master_id > 0) { /* this server is slave of another server not in MaxScale configuration