MXS-1588: Wait on all slaves during switchover
During switchover, MASTER_GTID_WAIT is now called on all slaves. This makes switchover complete more slowly than before, but it is safer if log_slave_updates is not enabled on the new master server. Also, read_only is disabled on the demoted server if waiting on the slaves or promoting the new master fails. This should effectively cancel the switchover for the old master.
This commit is contained in:
		| @ -3360,7 +3360,7 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste | |||||||
|  * @param err_out json object for error printing. Can be NULL. |  * @param err_out json object for error printing. Can be NULL. | ||||||
|  * @return True if successful |  * @return True if successful | ||||||
|  */ |  */ | ||||||
| bool promote_new_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, json_t** err_out) | bool promote_new_master(MXS_MONITORED_SERVER* new_master, json_t** err_out) | ||||||
| { | { | ||||||
|     bool success = false; |     bool success = false; | ||||||
|     MXS_NOTICE("Promoting server '%s' to master.", new_master->server->unique_name); |     MXS_NOTICE("Promoting server '%s' to master.", new_master->server->unique_name); | ||||||
| @ -3478,7 +3478,7 @@ static bool do_failover(MYSQL_MONITOR* mon, json_t** err_out) | |||||||
|     // Step 2: Wait until relay log consumed. |     // Step 2: Wait until relay log consumed. | ||||||
|     if (failover_wait_relay_log(mon, new_master, err_out) && |     if (failover_wait_relay_log(mon, new_master, err_out) && | ||||||
|         // Step 3: Stop and reset slave, set read-only to 0. |         // Step 3: Stop and reset slave, set read-only to 0. | ||||||
|         promote_new_master(mon, new_master, err_out)) |         promote_new_master(new_master, err_out)) | ||||||
|     { |     { | ||||||
|         // Step 4: Redirect slaves. |         // Step 4: Redirect slaves. | ||||||
|         int redirects = redirect_slaves(mon, slaves, new_master); |         int redirects = redirect_slaves(mon, slaves, new_master); | ||||||
| @ -3714,6 +3714,48 @@ static bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave, const Gti | |||||||
|     return gtid_reached; |     return gtid_reached; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /** | ||||||
|  |  * Wait until slave replication catches up with the master gtid for all slaves in the vector. | ||||||
|  |  * | ||||||
|  |  * @param slaves Slaves to wait on | ||||||
|  |  * @param gtid Which gtid must be reached | ||||||
|  |  * @param total_timeout Maximum wait time in seconds | ||||||
|  |  * @param read_timeout The value of read_timeout for the connection | ||||||
|  |  * @param err_out json object for error printing. Can be NULL. | ||||||
|  |  * @return True, if target gtid was reached within allotted time for all servers | ||||||
|  |  */ | ||||||
|  | static bool switchover_wait_slaves_catchup(const ServerVector& slaves, const Gtid& gtid, | ||||||
|  |                                            int total_timeout, int read_timeout, | ||||||
|  |                                            json_t** err_out) | ||||||
|  | { | ||||||
|  |     bool success = true; | ||||||
|  |     int seconds_remaining = total_timeout; | ||||||
|  |  | ||||||
|  |     for (ServerVector::const_iterator iter = slaves.begin(); | ||||||
|  |          iter != slaves.end() && success; | ||||||
|  |          iter++) | ||||||
|  |     { | ||||||
|  |         if (seconds_remaining < 0) | ||||||
|  |         { | ||||||
|  |             success = false; | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |             time_t begin = time(NULL); | ||||||
|  |             MXS_MONITORED_SERVER* slave = *iter; | ||||||
|  |             if (switchover_wait_slave_catchup(slave, gtid, seconds_remaining, read_timeout, err_out)) | ||||||
|  |             { | ||||||
|  |                 seconds_remaining -= difftime(time(NULL), begin); | ||||||
|  |             } | ||||||
|  |             else | ||||||
|  |             { | ||||||
|  |                 success = false; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     return success; | ||||||
|  | } | ||||||
|  |  | ||||||
| /** | /** | ||||||
|  * Starts a new slave connection on a server. Should be used on a demoted master server. |  * Starts a new slave connection on a server. Should be used on a demoted master server. | ||||||
|  * |  * | ||||||
| @ -3818,28 +3860,48 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast | |||||||
|     bool rval = false; |     bool rval = false; | ||||||
|     MySqlServerInfo* curr_master_info = get_server_info(mon, demotion_target); |     MySqlServerInfo* curr_master_info = get_server_info(mon, demotion_target); | ||||||
|     // Step 2: Set read-only to 1, flush logs. |     // Step 2: Set read-only to 1, flush logs. | ||||||
|     if (switchover_demote_master(mon, demotion_target, curr_master_info, err_out) && |     if (switchover_demote_master(mon, demotion_target, curr_master_info, err_out)) | ||||||
|         // Step 3: Wait for the selected slave to catch up with master. |  | ||||||
|         switchover_wait_slave_catchup(promotion_target, curr_master_info->gtid_binlog_pos, |  | ||||||
|                                       mon->switchover_timeout, mon->monitor->read_timeout, err_out) && |  | ||||||
|         // Step 4: Stop and reset slave, set read-only to 0. |  | ||||||
|         promote_new_master(mon, promotion_target, err_out)) |  | ||||||
|     { |     { | ||||||
|         // Step 5: Redirect slaves. |         // Step 3a: Wait for the selected slave to catch up with master. | ||||||
|         int redirects = redirect_slaves(mon, slaves, promotion_target); |         if (switchover_wait_slave_catchup(promotion_target, curr_master_info->gtid_binlog_pos, | ||||||
|         // Step 6: Set the old master to replicate from the new. |                                           mon->switchover_timeout, mon->monitor->read_timeout, err_out) && | ||||||
|         bool start_ok = switchover_start_slave(mon, demotion_target, promotion_target); |             // Step 3b: Wait for other slaves to catch up with master. | ||||||
|         rval = slaves.empty() ? start_ok : start_ok || redirects > 0; |             switchover_wait_slaves_catchup(slaves, curr_master_info->gtid_binlog_pos, | ||||||
|         if (rval == false) |                                            mon->switchover_timeout, mon->monitor->read_timeout, err_out) && | ||||||
|  |             // Step 4: Stop and reset slave, set read-only to 0. | ||||||
|  |             promote_new_master(promotion_target, err_out)) | ||||||
|         { |         { | ||||||
|             // This is a special case. Individual server errors have already been printed to the log. |             // Step 5: Redirect slaves. | ||||||
|             // For JSON, gather the errors again. |             int redirects = redirect_slaves(mon, slaves, promotion_target); | ||||||
|             const char MSG[] = "Could not redirect any slaves to the new master."; |             // Step 6: Set the old master to replicate from the new. | ||||||
|             MXS_ERROR(MSG); |             bool start_ok = switchover_start_slave(mon, demotion_target, promotion_target); | ||||||
|             if (err_out) |             rval = slaves.empty() ? start_ok : start_ok || redirects > 0; | ||||||
|  |             if (rval == false) | ||||||
|             { |             { | ||||||
|                 string combined_error = get_connection_errors(slaves, demotion_target); |                 // This is a special case. Individual server errors have already been printed to the log. | ||||||
|                 *err_out = mxs_json_error_append(*err_out, "%s Errors: %s.", MSG, combined_error.c_str()); |                 // For JSON, gather the errors again. | ||||||
|  |                 const char MSG[] = "Could not redirect any slaves to the new master."; | ||||||
|  |                 MXS_ERROR(MSG); | ||||||
|  |                 if (err_out) | ||||||
|  |                 { | ||||||
|  |                     string combined_error = get_connection_errors(slaves, demotion_target); | ||||||
|  |                     *err_out = mxs_json_error_append(*err_out, "%s Errors: %s.", MSG, combined_error.c_str()); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         else | ||||||
|  |         { | ||||||
|  |             // Step 3a, 3b or 4 failed, try to undo step 2. | ||||||
|  |             const char QUERY_UNDO[] = "SET GLOBAL read_only=0;"; | ||||||
|  |             if (mxs_mysql_query(demotion_target->con, QUERY_UNDO) == 0) | ||||||
|  |             { | ||||||
|  |                 PRINT_MXS_JSON_ERROR(err_out, "read_only disabled on server %s.", | ||||||
|  |                     demotion_target->server->unique_name); | ||||||
|  |             } | ||||||
|  |             else | ||||||
|  |             { | ||||||
|  |                 PRINT_MXS_JSON_ERROR(err_out, "Could not disable read_only on server %s: '%s'.", | ||||||
|  |                     demotion_target->server->unique_name, mysql_error(demotion_target->con)); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  | |||||||
		Reference in New Issue
	
	Block a user
	 Esa Korhonen
					Esa Korhonen