From 5f4db64ac7f4cd74b736cfd589bd382d5b9d949f Mon Sep 17 00:00:00 2001
From: Esa Korhonen
Date: Fri, 5 Jan 2018 10:46:21 +0200
Subject: [PATCH] Better timing for switchover, check slaves for IO/SQL errors

Time elapsed is now properly tracked during a switchover. After slave
redirection, an event is added to the master. Then, the slaves are queried
repeatedly until they advance to the newest event. I/O and SQL errors are
also detected.
---
 .../modules/monitor/mariadbmon/mysql_mon.cc | 231 ++++++++++++++----
 1 file changed, 189 insertions(+), 42 deletions(-)

diff --git a/server/modules/monitor/mariadbmon/mysql_mon.cc b/server/modules/monitor/mariadbmon/mysql_mon.cc
index cc979dc0a..1c7cfbb96 100644
--- a/server/modules/monitor/mariadbmon/mysql_mon.cc
+++ b/server/modules/monitor/mariadbmon/mysql_mon.cc
@@ -707,6 +707,7 @@ public:
                              *   binary log file. */
     Gtid gtid_io_pos;       /**< Gtid I/O position of the slave thread. Only shows the triplet with
                              *   the current master domain. */
+    string last_error;      /**< Last IO or SQL error encountered. */

     SlaveStatusInfo()
         : master_server_id(0),
@@ -1253,9 +1254,14 @@ static bool do_show_slave_status(MYSQL_MONITOR* mon,
             const char* using_gtid = mxs_mysql_get_value(result, row, "Using_Gtid");
             const char* master_host = mxs_mysql_get_value(result, row, "Master_Host");
             const char* master_port = mxs_mysql_get_value(result, row, "Master_Port");
-            ss_dassert(beats && period && using_gtid && master_host && master_port);
+            const char* last_io_error = mxs_mysql_get_value(result, row, "Last_IO_Error");
+            const char* last_sql_error = mxs_mysql_get_value(result, row, "Last_SQL_Error");
+            ss_dassert(beats && period && using_gtid && master_host && master_port &&
+                       last_io_error && last_sql_error);
             serv_info->slave_status.master_host = master_host;
             serv_info->slave_status.master_port = atoi(master_port);
+            serv_info->slave_status.last_error = *last_io_error ? last_io_error :
+                                                 (*last_sql_error ? last_sql_error : "");

             int heartbeats = atoi(beats);
             if (serv_info->slave_heartbeats < heartbeats)
@@ -3415,6 +3421,7 @@ bool redirect_one_slave(MXS_MONITORED_SERVER* slave, const char* change_cmd)
 {
     bool rval = false;
     if (mxs_mysql_query(slave->con, "STOP SLAVE;") == 0 &&
+        mxs_mysql_query(slave->con, "RESET SLAVE;") == 0 && // To erase any old I/O or SQL errors
         mxs_mysql_query(slave->con, change_cmd) == 0 &&
         mxs_mysql_query(slave->con, "START SLAVE;") == 0)
     {
@@ -3435,9 +3442,11 @@ bool redirect_one_slave(MXS_MONITORED_SERVER* slave, const char* change_cmd)
  * @param mon The monitor
  * @param slaves An array of slaves
  * @param new_master The replication master
+ * @param redirected_slaves A vector where to insert successfully redirected slaves. Can be NULL.
  * @return The number of slaves successfully redirected.
  */
-int redirect_slaves(MYSQL_MONITOR* mon, ServerVector& slaves, MXS_MONITORED_SERVER* new_master)
+int redirect_slaves(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, const ServerVector& slaves,
+                    ServerVector* redirected_slaves = NULL)
 {
     MXS_NOTICE("Redirecting slaves to new master.");
     std::string change_cmd = generate_change_master_cmd(mon, new_master);
@@ -3447,6 +3456,10 @@ int redirect_slaves(MYSQL_MONITOR* mon, ServerVector& slaves, MXS_MONITORED_SERV
         if (redirect_one_slave(*iter, change_cmd.c_str()))
         {
             successes++;
+            if (redirected_slaves)
+            {
+                redirected_slaves->push_back(*iter);
+            }
         }
     }
     return successes;
@@ -3481,7 +3494,7 @@ static bool do_failover(MYSQL_MONITOR* mon, json_t** err_out)
         promote_new_master(new_master, err_out))
     {
         // Step 4: Redirect slaves.
-        int redirects = redirect_slaves(mon, slaves, new_master);
+        int redirects = redirect_slaves(mon, new_master, slaves);
         rval = slaves.empty() ? true : redirects > 0;
     }
     return rval;
@@ -3671,7 +3684,7 @@ static bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const Gti
     StringVector output;
     bool gtid_reached = false;
     bool error = false;
-    double seconds_remaining = total_timeout > 0 ? total_timeout : 0.01;
+    double seconds_remaining = total_timeout;

     // Determine a reasonable timeout for the MASTER_GTID_WAIT-function depending on the
     // backend_read_timeout setting (should be >= 1) and time remaining.
@@ -3735,7 +3748,7 @@ static bool switchover_wait_slaves_catchup(const ServerVector& slaves, const Gti
          iter != slaves.end() && success;
          iter++)
     {
-        if (seconds_remaining < 0)
+        if (seconds_remaining <= 0)
         {
             success = false;
         }
@@ -3786,25 +3799,19 @@ static bool switchover_start_slave(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* old
 }

 /**
- * Get error strings from all MySQL connections, form one string.
+ * Get MySQL connection error strings from all the given servers, form one string.
  *
- * @param slaves Slave servers
- * @param old_master Old master server
+ * @param servers Servers with errors
  * @return Concatenated string.
  */
-static string get_connection_errors(ServerVector& slaves, MXS_MONITORED_SERVER* old_master)
+static string get_connection_errors(const ServerVector& servers)
 {
     // Get errors from all connections, form a string.
-    ss_dassert(old_master);
-    ServerVector servers;
-    servers.reserve(1 + slaves.size());
-    servers.push_back(old_master);
-    servers.insert(servers.end(), slaves.begin(), slaves.end());
     std::stringstream ss;
     for (ServerVector::const_iterator iter = servers.begin(); iter != servers.end(); iter++)
     {
         const char* error = mysql_error((*iter)->con);
-        ss_dassert(strlen(error) > 0); // Every connection should have an error.
+        ss_dassert(*error); // Every connection should have an error.
         ss << (*iter)->server->unique_name << ": '" << error << "'";
         if (iter + 1 != servers.end())
         {
@@ -3814,6 +3821,100 @@ static string get_connection_errors(ServerVector& slaves, MXS_MONITORED_SERVER*
     return ss.str();
 }

+/**
+ * Send an event to new master and wait for slaves to get the event.
+ *
+ * @param mon Cluster monitor
+ * @param new_master Where to send the event
+ * @param slaves Servers to monitor
+ * @param seconds_remaining How long can we wait
+ * @return True, if at least one slave got the new event within the time limit
+ */
+static bool wait_cluster_stabilization(MYSQL_MONITOR* mon,
+                                       MXS_MONITORED_SERVER* new_master,
+                                       const ServerVector& slaves,
+                                       int seconds_remaining)
+{
+    ss_dassert(!slaves.empty());
+    bool rval = false;
+    time_t begin = time(NULL);
+    MySqlServerInfo* new_master_info = get_server_info(mon, new_master);
+
+    if (mxs_mysql_query(new_master->con, "FLUSH TABLES;") == 0 &&
+        update_gtids(mon, new_master, new_master_info))
+    {
+        int query_fails = 0;
+        int repl_fails = 0;
+        int successes = 0;
+        const Gtid target = new_master_info->gtid_current_pos;
+        ServerVector wait_list = slaves; // Check all the servers in the list
+        bool first_round = true;
+        bool time_is_up = false;
+
+        while (!wait_list.empty() && !time_is_up)
+        {
+            if (!first_round)
+            {
+                thread_millisleep(500);
+            }
+
+            // Erasing elements from an array, so iterate from last to first
+            int i = wait_list.size() - 1;
+            while (i >= 0)
+            {
+                MXS_MONITORED_SERVER* slave = wait_list[i];
+                MySqlServerInfo* slave_info = get_server_info(mon, slave);
+                if (update_gtids(mon, slave, slave_info) && do_show_slave_status(mon, slave_info, slave))
+                {
+                    if (!slave_info->slave_status.last_error.empty())
+                    {
+                        // IO or SQL error on slave, replication is a fail
+                        MXS_WARNING("Slave '%s' cannot start replication: '%s'.",
+                                    slave->server->unique_name,
+                                    slave_info->slave_status.last_error.c_str());
+                        wait_list.erase(wait_list.begin() + i);
+                        repl_fails++;
+                    }
+                    else if (slave_info->gtid_current_pos.sequence >= target.sequence)
+                    {
+                        // This slave has reached the same gtid as master, remove from list
+                        wait_list.erase(wait_list.begin() + i);
+                        successes++;
+                    }
+                }
+                else
+                {
+                    wait_list.erase(wait_list.begin() + i);
+                    query_fails++;
+                }
+                i--;
+            }
+
+            first_round = false; // Sleep at start of next iteration
+            if (difftime(time(NULL), begin) >= seconds_remaining)
+            {
+                time_is_up = true;
+            }
+        }
+
+        ServerVector::size_type fails = repl_fails + query_fails + wait_list.size();
+        if (fails > 0)
+        {
+            const char MSG[] = "Replication from the new master could not be confirmed for %lu slaves. "
+                               "%d encountered an I/O or SQL error, %d failed to reply and %lu did not "
+                               "advance in Gtid until time ran out.";
+            MXS_WARNING(MSG, fails, repl_fails, query_fails, wait_list.size());
+        }
+        rval = (successes > 0);
+    }
+    else
+    {
+        MXS_ERROR("Could not confirm replication after switchover/failover because query to "
+                  "the new master failed.");
+    }
+    return rval;
+}
+
 /**
  * Performs switchover for a simple topology (1 master, N slaves, no intermediate masters). If an intermediate
  * step fails, the cluster may be left without a master.
@@ -3837,20 +3938,24 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast
         PRINT_MXS_JSON_ERROR(err_out, "Cluster gtid domain is unknown. Cannot switchover.");
         return false;
     }
+    // Total time limit on how long this operation may take. Checked and modified after significant steps are
+    // completed.
+    int seconds_remaining = mon->switchover_timeout;
+    time_t start_time = time(NULL);

     // Step 1: Select promotion candidate, save all slaves except promotion target to an array. If we have a
     // user-defined master candidate, check it. Otherwise, autoselect.
     MXS_MONITORED_SERVER* promotion_target = NULL;
-    ServerVector slaves;
+    ServerVector redirectable_slaves;
     if (new_master)
     {
-        if (switchover_check_preferred_master(mon, new_master, &slaves, err_out))
+        if (switchover_check_preferred_master(mon, new_master, &redirectable_slaves, err_out))
        {
             promotion_target = new_master;
         }
     }
     else
     {
-        promotion_target = select_new_master(mon, &slaves, err_out);
+        promotion_target = select_new_master(mon, &redirectable_slaves, err_out);
     }
     if (promotion_target == NULL)
     {
@@ -3859,39 +3964,81 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast
     bool rval = false;
     MySqlServerInfo* curr_master_info = get_server_info(mon, demotion_target);
-    // Step 2: Set read-only to 1, flush logs.
+
+    // Step 2: Set read-only to on, flush logs.
     if (switchover_demote_master(mon, demotion_target, curr_master_info, err_out))
     {
-        // Step 3a: Wait for the selected slave to catch up with master.
-        if (switchover_wait_slave_catchup(promotion_target, curr_master_info->gtid_binlog_pos,
-                                          mon->switchover_timeout, mon->monitor->read_timeout, err_out) &&
-            // Step 3b: Wait for other slaves to catch up with master.
-            switchover_wait_slaves_catchup(slaves, curr_master_info->gtid_binlog_pos,
-                                           mon->switchover_timeout, mon->monitor->read_timeout, err_out) &&
-            // Step 4: Stop and reset slave, set read-only to 0.
-            promote_new_master(promotion_target, err_out))
+        bool catchup_and_promote_success = false;
+        time_t step2_time = time(NULL);
+        seconds_remaining -= difftime(step2_time, start_time);
+
+        // Step 3: Wait for the slaves (including promotion target) to catch up with master.
+        ServerVector catchup_slaves = redirectable_slaves;
+        catchup_slaves.push_back(promotion_target);
+        if (switchover_wait_slaves_catchup(catchup_slaves, curr_master_info->gtid_binlog_pos,
+                                           seconds_remaining, mon->monitor->read_timeout, err_out))
         {
-            // Step 5: Redirect slaves.
-            int redirects = redirect_slaves(mon, slaves, promotion_target);
-            // Step 6: Set the old master to replicate from the new.
-            bool start_ok = switchover_start_slave(mon, demotion_target, promotion_target);
-            rval = slaves.empty() ? start_ok : start_ok || redirects > 0;
-            if (rval == false)
+            time_t step3_time = time(NULL);
+            int seconds_step3 = difftime(step3_time, step2_time);
+            MXS_DEBUG("Switchover: slave catchup took %d seconds.", seconds_step3);
+            seconds_remaining -= seconds_step3;
+
+            // Step 4: On new master STOP and RESET SLAVE, set read-only to off.
+            if (promote_new_master(promotion_target, err_out))
             {
-                // This is a special case. Individual server errors have already been printed to the log.
-                // For JSON, gather the errors again.
-                const char MSG[] = "Could not redirect any slaves to the new master.";
-                MXS_ERROR(MSG);
-                if (err_out)
+                catchup_and_promote_success = true;
+                // Step 5: Redirect slaves and start replication on old master.
+                ServerVector redirected_slaves;
+                bool start_ok = switchover_start_slave(mon, demotion_target, promotion_target);
+                if (start_ok)
                 {
-                    string combined_error = get_connection_errors(slaves, demotion_target);
-                    *err_out = mxs_json_error_append(*err_out, "%s Errors: %s.", MSG, combined_error.c_str());
+                    redirected_slaves.push_back(demotion_target);
+                }
+                int redirects = redirect_slaves(mon, promotion_target, redirectable_slaves, &redirected_slaves);
+
+                bool success = redirectable_slaves.empty() ? start_ok : start_ok || redirects > 0;
+                if (success == false)
+                {
+                    rval = false;
+                    // This is a special case. Individual server errors have already been printed to the log.
+                    // For JSON, gather the errors again.
+                    const char MSG[] = "Could not redirect any slaves to the new master.";
+                    MXS_ERROR(MSG);
+                    if (err_out)
+                    {
+                        ServerVector failed_slaves;
+                        failed_slaves.push_back(demotion_target);
+                        failed_slaves.insert(failed_slaves.end(),
+                                             redirectable_slaves.begin(), redirectable_slaves.end());
+                        string combined_error = get_connection_errors(failed_slaves);
+                        *err_out = mxs_json_error_append(*err_out,
+                                                         "%s Errors: %s.", MSG, combined_error.c_str());
+                    }
+                }
+                else
+                {
+                    time_t step5_time = time(NULL);
+                    seconds_remaining -= difftime(step5_time, step3_time);
+
+                    // Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves
+                    // to receive it.
+                    if (wait_cluster_stabilization(mon, promotion_target, redirected_slaves,
+                                                   seconds_remaining))
+                    {
+                        rval = true;
+                        time_t step6_time = time(NULL);
+                        int seconds_step6 = difftime(step6_time, step5_time);
+                        seconds_remaining -= seconds_step6;
+                        MXS_DEBUG("Switchover: slave replication confirmation took %d seconds with "
+                                  "%d seconds to spare.", seconds_step6, seconds_remaining);
+                    }
+                }
+            }
+        }
-        else
+
+        if (!catchup_and_promote_success)
         {
-            // Step 3a, 3b or 4 failed, try to undo step 2.
+            // Step 3 or 4 failed, try to undo step 2.
             const char QUERY_UNDO[] = "SET GLOBAL read_only=0;";
             if (mxs_mysql_query(demotion_target->con, QUERY_UNDO) == 0)
             {
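The confirmation step added by this patch (wait_cluster_stabilization) boils down to a deadline-driven polling loop: write a harmless event on the new master (the patch uses FLUSH TABLES), then re-query each slave until it reaches the resulting gtid, reports an I/O or SQL error, or the time budget runs out. The snippet below is a minimal standalone sketch of that pattern, not MaxScale code: SlaveState, refresh() and wait_for_slaves() are hypothetical stand-ins for the monitor's MySqlServerInfo, update_gtids() and do_show_slave_status() machinery, and the gtid is reduced to its sequence number.

// Hedged sketch only; assumes C++11. The names below are illustrative and not
// part of MaxScale.
#include <chrono>
#include <string>
#include <thread>
#include <vector>

struct SlaveState
{
    std::string name;            // server name, for logging
    long        gtid_sequence;   // sequence part of the slave's current gtid
    std::string last_error;      // last I/O or SQL error, empty if none

    // Placeholder: a real implementation would re-query the slave's gtid
    // position and slave status here and return false if the query fails.
    bool refresh() { return true; }
};

// Poll every slave until it reaches target_sequence, reports a replication
// error, or the time budget runs out. Mirrors the erase-from-the-back loop of
// wait_cluster_stabilization(): a slave leaves the wait list as soon as its
// outcome is known, and only undecided slaves are queried again.
bool wait_for_slaves(std::vector<SlaveState> wait_list, long target_sequence,
                     std::chrono::seconds budget)
{
    const auto deadline = std::chrono::steady_clock::now() + budget;
    int successes = 0;

    while (!wait_list.empty())
    {
        for (int i = static_cast<int>(wait_list.size()) - 1; i >= 0; i--)
        {
            SlaveState& slave = wait_list[i];
            if (!slave.refresh() || !slave.last_error.empty())
            {
                // Query failure or replication error: give up on this slave.
                wait_list.erase(wait_list.begin() + i);
            }
            else if (slave.gtid_sequence >= target_sequence)
            {
                // Caught up with the event written to the new master.
                wait_list.erase(wait_list.begin() + i);
                successes++;
            }
        }

        if (wait_list.empty() || std::chrono::steady_clock::now() >= deadline)
        {
            break;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds(500));
    }
    return successes > 0;   // at least one slave confirmed the new master
}

As in the patch, iterating from the back keeps the indices valid while erasing, and a single confirmed slave is enough to count the switchover as verified; the patch additionally keeps separate counters for query failures and replication errors so the final warning can say why each slave dropped out.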