Merge branch '2.2' into develop

Johan Wikman
2018-02-12 14:00:40 +02:00
31 changed files with 900 additions and 352 deletions

View File

@ -58,7 +58,7 @@ int atomic_load_int(const int *variable)
#ifdef MXS_USE_ATOMIC_BUILTINS
return __atomic_load_n(variable, __ATOMIC_SEQ_CST);
#else
return __sync_fetch_and_or(variable, 0);
return __sync_fetch_and_or((int*)variable, 0);
#endif
}
@ -67,7 +67,7 @@ int32_t atomic_load_int32(const int32_t *variable)
#ifdef MXS_USE_ATOMIC_BUILTINS
return __atomic_load_n(variable, __ATOMIC_SEQ_CST);
#else
return __sync_fetch_and_or(variable, 0);
return __sync_fetch_and_or((int32_t*)variable, 0);
#endif
}
@ -76,7 +76,7 @@ int64_t atomic_load_int64(const int64_t *variable)
#ifdef MXS_USE_ATOMIC_BUILTINS
return __atomic_load_n(variable, __ATOMIC_SEQ_CST);
#else
return __sync_fetch_and_or(variable, 0);
return __sync_fetch_and_or((int64_t*)variable, 0);
#endif
}
@ -85,7 +85,7 @@ uint32_t atomic_load_uint32(const uint32_t *variable)
#ifdef MXS_USE_ATOMIC_BUILTINS
return __atomic_load_n(variable, __ATOMIC_SEQ_CST);
#else
return __sync_fetch_and_or(variable, 0);
return __sync_fetch_and_or((uint32_t*)variable, 0);
#endif
}
@ -94,7 +94,7 @@ uint64_t atomic_load_uint64(const uint64_t *variable)
#ifdef MXS_USE_ATOMIC_BUILTINS
return __atomic_load_n(variable, __ATOMIC_SEQ_CST);
#else
return __sync_fetch_and_or(variable, 0);
return __sync_fetch_and_or((uint64_t*)variable, 0);
#endif
}
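The casts above are needed because the legacy __sync_* builtins take a non-const pointer, whereas __atomic_load_n() accepts the const-qualified argument directly. A minimal sketch of the fallback pattern, assuming GCC/Clang __sync builtins (the function name is illustrative, not part of the patch):

/* OR-ing with 0 leaves the stored value unchanged but performs a
 * full-barrier atomic read. The const qualifier must be cast away
 * because __sync_fetch_and_or() expects a mutable pointer. */
static inline int fallback_atomic_load_int(const int *variable)
{
    return __sync_fetch_and_or((int *)variable, 0);
}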

View File

@ -146,6 +146,7 @@ const char CN_TYPE[] = "type";
const char CN_UNIX[] = "unix";
const char CN_USER[] = "user";
const char CN_USERS[] = "users";
const char CN_USERS_REFRESH_TIME[] = "users_refresh_time";
const char CN_VERSION_STRING[] = "version_string";
const char CN_WEIGHTBY[] = "weightby";
const char CN_SESSION_TRACK_TRX_STATE[] = "session_track_trx_state";
@ -1668,6 +1669,44 @@ handle_global_item(const char *name, const char *value)
{
gateway.local_address = MXS_STRDUP_A(value);
}
else if (strcmp(name, CN_USERS_REFRESH_TIME) == 0)
{
char* endptr;
long users_refresh_time = strtol(value, &endptr, 0);
if (*endptr == '\0')
{
if (users_refresh_time < 0)
{
MXS_NOTICE("Value of '%s' is less than 0, users will "
"not be automatically refreshed.", CN_USERS_REFRESH_TIME);
// Strictly speaking they will be refreshed once every 68 years,
// but I just don't believe the uptime will be that long.
users_refresh_time = INT32_MAX;
}
else if (users_refresh_time < USERS_REFRESH_TIME_MIN)
{
MXS_WARNING("%s is less than the allowed minimum value of %d for the "
"configuration option '%s', using the minimum value.",
value, USERS_REFRESH_TIME_MIN, CN_USERS_REFRESH_TIME);
users_refresh_time = USERS_REFRESH_TIME_MIN;
}
if (users_refresh_time > INT32_MAX)
{
// Clamp to INT32_MAX so that later arithmetic cannot overflow.
users_refresh_time = INT32_MAX;
}
gateway.users_refresh_time = users_refresh_time;
}
else
{
MXS_ERROR("%s is an invalid value for '%s', using default %d instead.",
value, CN_USERS_REFRESH_TIME, USERS_REFRESH_TIME_DEFAULT);
gateway.users_refresh_time = USERS_REFRESH_TIME_DEFAULT;
}
}
else
{
for (i = 0; lognames[i].name; i++)
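The new users_refresh_time handling above is driven by a global configuration entry. An illustrative maxscale.cnf fragment (the value is an example; a negative value disables automatic refreshes, as handled above):

[maxscale]
# Allow the user account data of a service to be reloaded at most
# once every 120 seconds; a negative value disables automatic refreshes.
users_refresh_time=120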

View File

@ -515,15 +515,15 @@ monitorShow(DCB *dcb, MXS_MONITOR *monitor)
break;
}
dcb_printf(dcb, "Monitor: %p\n", monitor);
dcb_printf(dcb, "Name: %s\n", monitor->name);
dcb_printf(dcb, "State: %s\n", state);
dcb_printf(dcb, "Sampling interval: %lu milliseconds\n", monitor->interval);
dcb_printf(dcb, "Connect Timeout: %i seconds\n", monitor->connect_timeout);
dcb_printf(dcb, "Read Timeout: %i seconds\n", monitor->read_timeout);
dcb_printf(dcb, "Write Timeout: %i seconds\n", monitor->write_timeout);
dcb_printf(dcb, "Connect attempts: %i \n", monitor->connect_attempts);
dcb_printf(dcb, "Monitored servers: ");
dcb_printf(dcb, "Monitor: %p\n", monitor);
dcb_printf(dcb, "Name: %s\n", monitor->name);
dcb_printf(dcb, "State: %s\n", state);
dcb_printf(dcb, "Sampling interval: %lu milliseconds\n", monitor->interval);
dcb_printf(dcb, "Connect Timeout: %i seconds\n", monitor->connect_timeout);
dcb_printf(dcb, "Read Timeout: %i seconds\n", monitor->read_timeout);
dcb_printf(dcb, "Write Timeout: %i seconds\n", monitor->write_timeout);
dcb_printf(dcb, "Connect attempts: %i \n", monitor->connect_attempts);
dcb_printf(dcb, "Monitored servers: ");
const char *sep = "";
@ -543,12 +543,12 @@ monitorShow(DCB *dcb, MXS_MONITOR *monitor)
}
else
{
dcb_printf(dcb, "\t(no diagnostics)\n");
dcb_printf(dcb, " (no diagnostics)\n");
}
}
else
{
dcb_printf(dcb, "\tMonitor failed\n");
dcb_printf(dcb, " Monitor failed\n");
}
dcb_printf(dcb, "\n");
}
@ -2444,41 +2444,41 @@ MXS_MONITORED_SERVER* mon_get_monitored_server(const MXS_MONITOR* mon, SERVER* s
int mon_config_get_servers(const MXS_CONFIG_PARAMETER* params, const char* key, const MXS_MONITOR* mon,
MXS_MONITORED_SERVER*** monitored_servers_out)
{
ss_dassert(*monitored_servers_out == NULL);
ss_dassert(monitored_servers_out != NULL && *monitored_servers_out == NULL);
SERVER** servers = NULL;
int servers_size = config_get_server_list(params, key, &servers);
int rval = 0;
int found = 0;
// All servers in the array must be monitored by the given monitor.
if (servers_size > 0)
{
MXS_MONITORED_SERVER** monitored_array =
(MXS_MONITORED_SERVER**)MXS_CALLOC(servers_size, sizeof(MXS_MONITORED_SERVER*));
bool error = false;
for (int i = 0; i < servers_size && !error; i++)
for (int i = 0; i < servers_size; i++)
{
MXS_MONITORED_SERVER* mon_serv = mon_get_monitored_server(mon, servers[i]);
if (mon_serv != NULL)
{
monitored_array[i] = mon_serv;
monitored_array[found++] = mon_serv;
}
else
{
MXS_ERROR("Server '%s' is not monitored by monitor '%s'.", servers[i]->unique_name, mon->name);
error = true;
MXS_WARNING("Server '%s' is not monitored by monitor '%s'.",
servers[i]->unique_name, mon->name);
}
}
MXS_FREE(servers);
if (error)
ss_dassert(found <= servers_size);
if (found == 0)
{
MXS_FREE(monitored_array);
rval = -1;
monitored_array = NULL;
}
else
else if (found < servers_size)
{
*monitored_servers_out = monitored_array;
rval = servers_size;
monitored_array = (MXS_MONITORED_SERVER**)MXS_REALLOC(monitored_array, found * sizeof(MXS_MONITORED_SERVER*));
}
*monitored_servers_out = monitored_array;
}
return rval;
}
return found;
}
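With these changes mon_config_get_servers() returns the number of listed servers that actually belong to the monitor (0 when none do) instead of failing with -1 on the first mismatch. A hypothetical call site showing ownership of the returned array (the parameter key and variable names are assumptions, not from the patch):

MXS_MONITORED_SERVER** excluded = NULL;
int n_excluded = mon_config_get_servers(params, "servers_no_promotion", mon, &excluded);
if (n_excluded > 0)
{
    /* Use the array; the caller owns it and must eventually free it. */
    MXS_FREE(excluded);
}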

View File

@ -335,6 +335,34 @@ serviceStartPort(SERVICE *service, SERV_LISTENER *port)
}
}
MXS_CONFIG* config = config_get_global_options();
time_t last;
bool warned;
/**
* At service start, the last update time is set to config->users_refresh_time seconds
* in the past. This way MaxScale can attempt to reload users immediately after startup,
* but only if user refreshing has not been turned off.
*/
if (config->users_refresh_time == INT32_MAX)
{
last = time(NULL);
warned = true; // So that there will not be a refresh rate warning.
}
else
{
last = time(NULL) - config->users_refresh_time;
warned = false;
}
int nthreads = config_threadcount();
for (int i = 0; i < nthreads; ++i)
{
service->rate_limits[i].last = last;
service->rate_limits[i].warned = warned;
}
if (port->listener->func.listen(port->listener, config_bind))
{
port->listener->session = session_alloc(service, port->listener);
@ -1629,22 +1657,24 @@ int service_refresh_users(SERVICE *service)
self = 0;
}
MXS_CONFIG* config = config_get_global_options();
/* Check if refresh rate limit has been exceeded */
if ((now < service->rate_limits[self].last + USERS_REFRESH_TIME) ||
(service->rate_limits[self].nloads >= USERS_REFRESH_MAX_PER_TIME))
if (now < service->rate_limits[self].last + config->users_refresh_time)
{
MXS_ERROR("[%s] Refresh rate limit exceeded for load of users' table.", service->name);
if (!service->rate_limits[self].warned)
{
MXS_WARNING("[%s] Refresh rate limit (once every %ld seconds) exceeded for "
"load of users' table.",
service->name, config->users_refresh_time);
service->rate_limits[self].warned = true;
}
}
else
{
service->rate_limits[self].nloads++;
service->rate_limits[self].last = now;
service->rate_limits[self].warned = false;
/** If we have reached the limit on users refreshes, reset refresh time and count */
if (service->rate_limits[self].nloads >= USERS_REFRESH_MAX_PER_TIME)
{
service->rate_limits[self].nloads = 0;
service->rate_limits[self].last = now;
}
ret = 0;
LISTENER_ITERATOR iter;
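The refresh decision above now uses the configurable users_refresh_time instead of the fixed USERS_REFRESH_TIME constant, with a one-time warning per thread when the limit is hit. A condensed sketch of the per-thread check (a standalone helper written for illustration, not part of the patch):

#include <stdbool.h>
#include <time.h>

/* A reload is allowed once at least users_refresh_time seconds have
 * passed since the previous reload on this thread. When refreshes are
 * disabled, users_refresh_time is INT32_MAX and 'last' starts at the
 * service start time, so this stays false for roughly 68 years. */
static bool users_refresh_allowed(time_t now, time_t last, long users_refresh_time)
{
    return now >= last + users_refresh_time;
}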

View File

@ -703,8 +703,8 @@ static bool get_hostname(DCB *dcb, char *client_hostname, size_t size)
if (lookup_result != 0 && lookup_result != EAI_NONAME)
{
MXS_ERROR("Client hostname lookup failed for '%s', getnameinfo() returned: '%s'.",
dcb->remote, gai_strerror(lookup_result));
MXS_WARNING("Client hostname lookup failed for '%s', getnameinfo() returned: '%s'.",
dcb->remote, gai_strerror(lookup_result));
}
return lookup_result == 0;

View File

@ -129,6 +129,7 @@ static bool wait_cluster_stabilization(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER*
const ServerVector& slaves, int seconds_remaining);
static string get_connection_errors(const ServerVector& servers);
static int64_t scan_server_id(const char* id_string);
static string generate_change_master_cmd(MYSQL_MONITOR* mon, const string& master_host, int master_port);
static bool report_version_err = true;
static const char* hb_table_name = "maxscale_schema.replication_heartbeat";
@ -160,6 +161,9 @@ static const char CN_REPLICATION_PASSWORD[] = "replication_password";
/** Server id default value */
static const int64_t SERVER_ID_UNKNOWN = -1;
/** Default port */
static const int PORT_UNKNOWN = 0;
class Gtid
{
public:
@ -1056,6 +1060,8 @@ startMonitor(MXS_MONITOR *monitor, const MXS_CONFIG_PARAMETER* params)
handle->id = config_get_global_options()->id;
handle->warn_set_standalone_master = true;
handle->master_gtid_domain = -1;
handle->external_master_host[0] = '\0';
handle->external_master_port = PORT_UNKNOWN;
handle->monitor = monitor;
}
@ -1170,6 +1176,21 @@ static bool stop_monitor(MXS_MONITOR* mon)
return actually_stopped;
}
static string monitored_servers_to_string(MXS_MONITORED_SERVER** array, size_t array_size)
{
string rval;
if (array_size > 0)
{
const char* separator = "";
for (size_t i = 0; i < array_size; i++)
{
rval += separator;
rval += array[i]->server->unique_name;
separator = ",";
}
}
return rval;
}
/**
* Diagnostic interface
*
@ -1180,35 +1201,57 @@ static void diagnostics(DCB *dcb, const MXS_MONITOR *mon)
{
const MYSQL_MONITOR *handle = (const MYSQL_MONITOR *)mon->handle;
dcb_printf(dcb, "Automatic failover:\t%s\n", handle->auto_failover ? "Enabled" : "Disabled");
dcb_printf(dcb, "Failcount:\t\t%d\n", handle->failcount);
dcb_printf(dcb, "Failover Timeout:\t%u\n", handle->failover_timeout);
dcb_printf(dcb, "Switchover Timeout:\t%u\n", handle->switchover_timeout);
dcb_printf(dcb, "Auto rejoin:\t\t%s\n", handle->auto_rejoin ? "Enabled" : "Disabled");
dcb_printf(dcb, "MaxScale MonitorId:\t%lu\n", handle->id);
dcb_printf(dcb, "Replication lag:\t%s\n", (handle->replicationHeartbeat == 1) ? "enabled" : "disabled");
dcb_printf(dcb, "Detect Stale Master:\t%s\n", (handle->detectStaleMaster == 1) ? "enabled" : "disabled");
dcb_printf(dcb, "Server information\n\n");
dcb_printf(dcb, "Automatic failover: %s\n", handle->auto_failover ? "Enabled" : "Disabled");
dcb_printf(dcb, "Failcount: %d\n", handle->failcount);
dcb_printf(dcb, "Failover timeout: %u\n", handle->failover_timeout);
dcb_printf(dcb, "Switchover timeout: %u\n", handle->switchover_timeout);
dcb_printf(dcb, "Automatic rejoin: %s\n", handle->auto_rejoin ? "Enabled" : "Disabled");
dcb_printf(dcb, "MaxScale monitor ID: %lu\n", handle->id);
dcb_printf(dcb, "Detect replication lag: %s\n", (handle->replicationHeartbeat == 1) ?
"Enabled" : "Disabled");
dcb_printf(dcb, "Detect stale master: %s\n", (handle->detectStaleMaster == 1) ?
"Enabled" : "Disabled");
if (handle->n_excluded > 0)
{
dcb_printf(dcb, "Non-promotable servers (failover): ");
dcb_printf(dcb, "%s\n",
monitored_servers_to_string(handle->excluded_servers, handle->n_excluded).c_str());
}
dcb_printf(dcb, "\nServer information:\n-------------------\n\n");
for (MXS_MONITORED_SERVER *db = mon->monitored_servers; db; db = db->next)
{
MySqlServerInfo *serv_info = get_server_info(handle, db);
dcb_printf(dcb, "Server: %s\n", db->server->unique_name);
dcb_printf(dcb, "Server ID: %" PRId64 "\n", serv_info->server_id);
dcb_printf(dcb, "Read only: %s\n", serv_info->read_only ? "ON" : "OFF");
dcb_printf(dcb, "Slave configured: %s\n", serv_info->slave_configured ? "YES" : "NO");
dcb_printf(dcb, "Slave IO running: %s\n", serv_info->slave_status.slave_io_running ? "YES" : "NO");
dcb_printf(dcb, "Slave SQL running: %s\n", serv_info->slave_status.slave_sql_running ? "YES" : "NO");
dcb_printf(dcb, "Master ID: %" PRId64 "\n", serv_info->slave_status.master_server_id);
dcb_printf(dcb, "Master binlog file: %s\n", serv_info->slave_status.master_log_file.c_str());
dcb_printf(dcb, "Master binlog position: %lu\n", serv_info->slave_status.read_master_log_pos);
dcb_printf(dcb, "Server: %s\n", db->server->unique_name);
dcb_printf(dcb, "Server ID: %" PRId64 "\n", serv_info->server_id);
dcb_printf(dcb, "Read only: %s\n", serv_info->read_only ? "YES" : "NO");
dcb_printf(dcb, "Slave configured: %s\n", serv_info->slave_configured ? "YES" : "NO");
if (serv_info->slave_configured)
{
dcb_printf(dcb, "Slave IO running: %s\n", serv_info->slave_status.slave_io_running ? "YES" : "NO");
dcb_printf(dcb, "Slave SQL running: %s\n", serv_info->slave_status.slave_sql_running ? "YES" : "NO");
dcb_printf(dcb, "Master ID: %" PRId64 "\n", serv_info->slave_status.master_server_id);
dcb_printf(dcb, "Master binlog file: %s\n", serv_info->slave_status.master_log_file.c_str());
dcb_printf(dcb, "Master binlog position: %lu\n", serv_info->slave_status.read_master_log_pos);
}
if (serv_info->gtid_current_pos.server_id != SERVER_ID_UNKNOWN)
{
dcb_printf(dcb, "Gtid current position: %s\n",
serv_info->gtid_current_pos.to_string().c_str());
}
if (serv_info->gtid_binlog_pos.server_id != SERVER_ID_UNKNOWN)
{
dcb_printf(dcb, "Gtid binlog position: %s\n",
serv_info->gtid_binlog_pos.to_string().c_str());
}
if (serv_info->slave_status.gtid_io_pos.server_id != SERVER_ID_UNKNOWN)
{
dcb_printf(dcb, "Gtid_IO_Pos: %s\n", serv_info->slave_status.gtid_io_pos.to_string().c_str());
dcb_printf(dcb, "Gtid slave IO position: %s\n",
serv_info->slave_status.gtid_io_pos.to_string().c_str());
}
if (handle->multimaster)
{
dcb_printf(dcb, "Master group: %d\n", serv_info->group);
dcb_printf(dcb, "Master group: %d\n", serv_info->group);
}
dcb_printf(dcb, "\n");
@ -1243,7 +1286,11 @@ static json_t* diagnostics_json(const MXS_MONITOR *mon)
{
json_object_set_new(rval, "script", json_string(handle->script));
}
if (handle->n_excluded > 0)
{
string list = monitored_servers_to_string(handle->excluded_servers, handle->n_excluded);
json_object_set_new(rval, CN_NO_PROMOTE_SERVERS, json_string(list.c_str()));
}
if (mon->monitored_servers)
{
json_t* arr = json_array();
@ -1267,11 +1314,12 @@ static json_t* diagnostics_json(const MXS_MONITOR *mon)
json_string(serv_info->slave_status.master_log_file.c_str()));
json_object_set_new(srv, "master_binlog_position",
json_integer(serv_info->slave_status.read_master_log_pos));
if (serv_info->slave_status.gtid_io_pos.server_id != SERVER_ID_UNKNOWN)
{
json_object_set_new(srv, "gtid_io_pos",
json_object_set_new(srv, "gtid_current_pos",
json_string(serv_info->gtid_current_pos.to_string().c_str()));
json_object_set_new(srv, "gtid_binlog_pos",
json_string(serv_info->gtid_binlog_pos.to_string().c_str()));
json_object_set_new(srv, "gtid_io_pos",
json_string(serv_info->slave_status.gtid_io_pos.to_string().c_str()));
}
if (handle->multimaster)
{
json_object_set_new(srv, "master_group", json_integer(serv_info->group));
@ -1707,6 +1755,11 @@ monitorDatabase(MXS_MONITOR *mon, MXS_MONITORED_SERVER *database)
}
/* Query a few settings. */
read_server_variables(database, serv_info);
/* If the gtid domain is known and the server is a MariaDB 10 server, update its gtids */
if (handle->master_gtid_domain >= 0 && serv_info->version == MYSQL_SERVER_VERSION_100)
{
update_gtids(handle, database, serv_info);
}
/* Check for MariaDB 10.x.x and get status for multi-master replication */
if (serv_info->version == MYSQL_SERVER_VERSION_100 || serv_info->version == MYSQL_SERVER_VERSION_55)
{
@ -2216,13 +2269,49 @@ monitorMain(void *arg)
if (handle->master != NULL && SERVER_IS_MASTER(handle->master->server))
{
int64_t domain = get_server_info(handle, handle->master)->gtid_domain_id;
MySqlServerInfo* master_info = get_server_info(handle, handle->master);
// Update cluster gtid domain
int64_t domain = master_info->gtid_domain_id;
if (handle->master_gtid_domain >= 0 && domain != handle->master_gtid_domain)
{
MXS_INFO("gtid_domain_id of master has changed: %" PRId64 " -> %" PRId64 ".",
MXS_NOTICE("Gtid domain id of master has changed: %" PRId64 " -> %" PRId64 ".",
handle->master_gtid_domain, domain);
}
handle->master_gtid_domain = domain;
// Update cluster external master
if (SERVER_IS_SLAVE_OF_EXTERNAL_MASTER(handle->master->server))
{
if (master_info->slave_status.master_host != handle->external_master_host ||
master_info->slave_status.master_port != handle->external_master_port)
{
const char* new_ext_host = master_info->slave_status.master_host.c_str();
const int new_ext_port = master_info->slave_status.master_port;
if (handle->external_master_port == PORT_UNKNOWN)
{
MXS_NOTICE("Cluster master server is replicating from an external master: %s:%d",
new_ext_host, new_ext_port);
}
else
{
MXS_NOTICE("The external master of the cluster has changed: %s:%d -> %s:%d.",
handle->external_master_host, handle->external_master_port,
new_ext_host, new_ext_port);
}
snprintf(handle->external_master_host, sizeof(handle->external_master_host),
"%s", new_ext_host);
handle->external_master_port = new_ext_port;
}
}
else
{
if (handle->external_master_port != PORT_UNKNOWN)
{
MXS_NOTICE("Cluster lost the external master.");
}
handle->external_master_host[0] = '\0';
handle->external_master_port = PORT_UNKNOWN;
}
}
ptr = mon->monitored_servers;
@ -3359,17 +3448,14 @@ static bool check_replication_settings(const MXS_MONITORED_SERVER* server, MySql
}
/**
* Check that the given slave is a valid promotion candidate. Update the server info structs of all slaves.
* Also populate the output vector with other slave servers.
* Check that the given slave is a valid promotion candidate.
*
* @param mon Cluster monitor
* @param preferred Preferred new master
* @param slaves_out Output array for other slaves. These should be redirected to the new master. Can be NULL.
* @param err_out Json object for error printing. Can be NULL.
* @return True, if given slave is a valid promotion candidate.
*/
bool switchover_check_preferred_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* preferred,
ServerVector* slaves_out, json_t** err_out)
bool switchover_check_preferred_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* preferred, json_t** err_out)
{
ss_dassert(preferred);
bool rval = true;
@ -3380,20 +3466,6 @@ bool switchover_check_preferred_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER*
preferred->server->unique_name);
rval = false;
}
for (MXS_MONITORED_SERVER *slave = mon->monitor->monitored_servers; slave; slave = slave->next)
{
if (slave != preferred)
{
// The update_slave_info()-call is not strictly necessary here, but it should be ran to keep this
// function analogous with failover_select_new_master(). The later functions can then assume that
// slave server info is up to date.
MySqlServerInfo* slave_info = update_slave_info(mon, slave);
if (slave_info && slaves_out)
{
slaves_out->push_back(slave);
}
}
}
return rval;
}
@ -3459,7 +3531,8 @@ MXS_MONITORED_SERVER* select_new_master(MYSQL_MONITOR* mon, ServerVector* slaves
// If a server cannot be connected to, it won't be considered for promotion or redirected.
// Do not worry about the exclusion list yet, querying the excluded servers is ok.
MySqlServerInfo* cand_info = update_slave_info(mon, cand);
if (cand_info)
// If master is replicating from external master, it is updated but not added to array.
if (cand_info && cand != mon->master)
{
slaves_out->push_back(cand);
// Check that server is not in the exclusion list while still being a valid choice.
@ -3587,6 +3660,25 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste
return rval;
}
bool start_external_replication(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, json_t** err_out)
{
bool rval = false;
string change_cmd = generate_change_master_cmd(mon, mon->external_master_host, mon->external_master_port);
if (mxs_mysql_query(new_master->con, change_cmd.c_str()) == 0 &&
mxs_mysql_query(new_master->con, "START SLAVE;") == 0)
{
MXS_NOTICE("New master starting replication from external master %s:%d.",
mon->external_master_host, mon->external_master_port);
rval = true;
}
else
{
PRINT_MXS_JSON_ERROR(err_out, "Could not start replication from external master: '%s'.",
mysql_error(new_master->con));
}
return rval;
}
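The refactored generate_change_master_cmd() takes an explicit host and port, so the same helper now serves slave redirection, rejoin and the external-master case above. A minimal usage sketch (the address, port and 'server' variable are illustrative assumptions):

// Build a CHANGE MASTER TO statement for an arbitrary target and point a
// server at it; START SLAVE is issued separately, as in the code above.
std::string cmd = generate_change_master_cmd(mon, "192.0.2.10", 3306);
if (mxs_mysql_query(server->con, cmd.c_str()) == 0 &&
    mxs_mysql_query(server->con, "START SLAVE;") == 0)
{
    MXS_NOTICE("Server '%s' now replicates from 192.0.2.10:3306.",
               server->server->unique_name);
}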
/**
* Prepares a server for the replication master role.
*
@ -3595,7 +3687,7 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste
* @param err_out json object for error printing. Can be NULL.
* @return True if successful
*/
bool promote_new_master(MXS_MONITORED_SERVER* new_master, json_t** err_out)
bool promote_new_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, json_t** err_out)
{
bool success = false;
MXS_NOTICE("Promoting server '%s' to master.", new_master->server->unique_name);
@ -3612,19 +3704,35 @@ bool promote_new_master(MXS_MONITORED_SERVER* new_master, json_t** err_out)
}
}
}
if (!success)
{
PRINT_MXS_JSON_ERROR(err_out, "Promotion failed: '%s'. Query: '%s'.",
mysql_error(new_master->con), query);
}
// If the previous master was a slave to an external master, start the equivalent slave connection on
// the new master. Success of replication is not checked.
else if (mon->external_master_port != PORT_UNKNOWN &&
!start_external_replication(mon, new_master, err_out))
{
success = false;
}
return success;
}
string generate_change_master_cmd(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master)
/**
* Generate a CHANGE MASTER TO-query.
*
* @param mon Cluster monitor, needed for username & password
* @param master_host Master hostname/address
* @param master_port Master port
* @return Generated query
*/
string generate_change_master_cmd(MYSQL_MONITOR* mon, const string& master_host, int master_port)
{
std::stringstream change_cmd;
change_cmd << "CHANGE MASTER TO MASTER_HOST = '" << new_master->server->name << "', ";
change_cmd << "MASTER_PORT = " << new_master->server->port << ", ";
change_cmd << "CHANGE MASTER TO MASTER_HOST = '" << master_host << "', ";
change_cmd << "MASTER_PORT = " << master_port << ", ";
change_cmd << "MASTER_USE_GTID = current_pos, ";
change_cmd << "MASTER_USER = '" << mon->replication_user << "', ";
const char MASTER_PW[] = "MASTER_PASSWORD = '";
@ -3678,7 +3786,8 @@ int redirect_slaves(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, const
ServerVector* redirected_slaves = NULL)
{
MXS_NOTICE("Redirecting slaves to new master.");
std::string change_cmd = generate_change_master_cmd(mon, new_master);
std::string change_cmd = generate_change_master_cmd(mon,
new_master->server->name, new_master->server->port);
int successes = 0;
for (ServerVector::const_iterator iter = slaves.begin(); iter != slaves.end(); iter++)
{
@ -3763,7 +3872,7 @@ static bool do_failover(MYSQL_MONITOR* mon, json_t** err_out)
seconds_remaining -= seconds_step2;
// Step 3: Stop and reset slave, set read-only to 0.
if (promote_new_master(new_master, err_out))
if (promote_new_master(mon, new_master, err_out))
{
// Step 4: Redirect slaves.
ServerVector redirected_slaves;
@ -3776,8 +3885,20 @@ static bool do_failover(MYSQL_MONITOR* mon, json_t** err_out)
// Step 5: Finally, add an event to the new master to advance gtid and wait for the slaves
// to receive it. seconds_remaining can be 0 or less at this point. Even in such a case
// wait_cluster_stabilization() may succeed if replication is fast enough.
if (wait_cluster_stabilization(mon, new_master, redirected_slaves, seconds_remaining))
// wait_cluster_stabilization() may succeed if replication is fast enough. If using external
// replication, skip this step. Come up with an alternative later.
if (mon->external_master_port != PORT_UNKNOWN)
{
MXS_WARNING("Replicating from external master, skipping final check.");
rval = true;
}
else if (redirected_slaves.empty())
{
// No slaves to check. Assume success.
rval = true;
MXS_DEBUG("Failover: no slaves to redirect, skipping stabilization check.");
}
else if (wait_cluster_stabilization(mon, new_master, redirected_slaves, seconds_remaining))
{
rval = true;
time_t step5_time = time(NULL);
@ -3910,47 +4031,79 @@ static bool switchover_demote_master(MYSQL_MONITOR* mon,
json_t** err_out)
{
MXS_NOTICE("Demoting server '%s'.", current_master->server->unique_name);
string error;
bool success = false;
const char* query = "SET GLOBAL read_only=1;";
if (mxs_mysql_query(current_master->con, query) == 0)
bool query_error = false;
MYSQL* conn = current_master->con;
const char* query = "";
// The presence of an external master changes several things.
const bool external_master = SERVER_IS_SLAVE_OF_EXTERNAL_MASTER(current_master->server);
if (external_master)
{
query = "FLUSH TABLES;";
if (mxs_mysql_query(current_master->con, query) == 0)
// First need to stop slave. read_only is probably on already, although not certain.
query = "STOP SLAVE;";
query_error = (mxs_mysql_query(conn, query) != 0);
if (!query_error)
{
query = "FLUSH LOGS;";
if (mxs_mysql_query(current_master->con, query) == 0)
{
query = "";
if (update_gtids(mon, current_master, info))
{
success = true;
}
}
}
if (!success)
{
// Somehow, a step after "SET read_only" failed. Try to set read_only back to 0. It may not
// work since the connection is likely broken.
error = mysql_error(current_master->con);
mxs_mysql_query(current_master->con, "SET GLOBAL read_only=0;");
query = "RESET SLAVE ALL;";
query_error = (mxs_mysql_query(conn, query) != 0);
}
}
else
string error_desc;
if (!query_error)
{
error = mysql_error(current_master->con);
query = "SET GLOBAL read_only=1;";
query_error = (mxs_mysql_query(conn, query) != 0);
if (!query_error)
{
// If there is an external master, no writes are allowed, so skip this step. It's not
// essential; it just adds one to the gtid.
if (!external_master)
{
query = "FLUSH TABLES;";
query_error = (mxs_mysql_query(conn, query) != 0);
}
if (!query_error)
{
query = "FLUSH LOGS;";
query_error = (mxs_mysql_query(conn, query) != 0);
if (!query_error)
{
query = "";
if (update_gtids(mon, current_master, info))
{
success = true;
}
}
}
if (!success)
{
// Somehow, a step after "SET read_only" failed. Try to set read_only back to 0. It may not
// work since the connection is likely broken.
error_desc = mysql_error(conn);
mxs_mysql_query(conn, "SET GLOBAL read_only=0;");
}
}
}
if (query_error)
{
error_desc = mysql_error(conn);
}
if (!success)
{
if (error.empty())
if (error_desc.empty())
{
PRINT_MXS_JSON_ERROR(err_out, "Demotion failed due to an error in updating gtid:s.");
}
else
{
PRINT_MXS_JSON_ERROR(err_out, "Demotion failed due to a query error: '%s'. Query: '%s'.",
error.c_str(), query);
error_desc.c_str(), query);
}
}
return success;
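Because of the added external-master branches, the demotion above nests fairly deeply; flattened, the statement order issued on the old master is roughly the following (a sketch with error handling omitted; 'conn' and 'external_master' stand for the variables used above):

if (external_master)
{
    mxs_mysql_query(conn, "STOP SLAVE;");      // stop replication from the external master
    mxs_mysql_query(conn, "RESET SLAVE ALL;"); // forget the external slave connection
}
mxs_mysql_query(conn, "SET GLOBAL read_only=1;");
if (!external_master)
{
    mxs_mysql_query(conn, "FLUSH TABLES;");    // skipped when writes are already blocked
}
mxs_mysql_query(conn, "FLUSH LOGS;");
// Finally, update_gtids() refreshes the gtid info used later in the switchover.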
@ -4075,16 +4228,15 @@ static bool switchover_wait_slaves_catchup(const ServerVector& slaves, const Gti
* @return True if commands were accepted. This does not guarantee that replication proceeds
* successfully.
*/
static bool switchover_start_slave(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* old_master,
MXS_MONITORED_SERVER* new_master)
static bool switchover_start_slave(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* old_master, SERVER* new_master)
{
bool rval = false;
std::string change_cmd = generate_change_master_cmd(mon, new_master);
std::string change_cmd = generate_change_master_cmd(mon, new_master->name, new_master->port);
if (mxs_mysql_query(old_master->con, change_cmd.c_str()) == 0 &&
mxs_mysql_query(old_master->con, "START SLAVE;") == 0)
{
MXS_NOTICE("Old master '%s' starting replication from '%s'.",
old_master->server->unique_name, new_master->server->unique_name);
old_master->server->unique_name, new_master->unique_name);
rval = true;
}
else
@ -4245,9 +4397,26 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast
ServerVector redirectable_slaves;
if (new_master)
{
if (switchover_check_preferred_master(mon, new_master, &redirectable_slaves, err_out))
if (switchover_check_preferred_master(mon, new_master, err_out))
{
promotion_target = new_master;
/* User-given candidate is good. Update info on all slave servers.
* The update_slave_info() call is not strictly necessary here, but it should be run to keep this
* path analogous with failover_select_new_master(). Functions called later can then assume that
* slave server info is up to date.
*/
for (MXS_MONITORED_SERVER* slave = mon->monitor->monitored_servers; slave; slave = slave->next)
{
if (slave != promotion_target)
{
MySqlServerInfo* slave_info = update_slave_info(mon, slave);
// If master is replicating from external master, it is updated but not added to array.
if (slave_info && slave != current_master)
{
redirectable_slaves.push_back(slave);
}
}
}
}
}
else
@ -4262,7 +4431,7 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast
bool rval = false;
MySqlServerInfo* curr_master_info = get_server_info(mon, demotion_target);
// Step 2: Set read-only to on, flush logs.
// Step 2: Set read-only to on, flush logs, update master gtids
if (switchover_demote_master(mon, demotion_target, curr_master_info, err_out))
{
bool catchup_and_promote_success = false;
@ -4281,12 +4450,12 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast
seconds_remaining -= seconds_step3;
// Step 4: On new master STOP and RESET SLAVE, set read-only to off.
if (promote_new_master(promotion_target, err_out))
if (promote_new_master(mon, promotion_target, err_out))
{
catchup_and_promote_success = true;
// Step 5: Redirect slaves and start replication on old master.
ServerVector redirected_slaves;
bool start_ok = switchover_start_slave(mon, demotion_target, promotion_target);
bool start_ok = switchover_start_slave(mon, demotion_target, promotion_target->server);
if (start_ok)
{
redirected_slaves.push_back(demotion_target);
@ -4301,9 +4470,15 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast
seconds_remaining -= difftime(step5_time, step3_time);
// Step 6: Finally, add an event to the new master to advance gtid and wait for the slaves
// to receive it.
if (wait_cluster_stabilization(mon, promotion_target, redirected_slaves,
seconds_remaining))
// to receive it. If using external replication, skip this step. Come up with an
// alternative later.
if (mon->external_master_port != PORT_UNKNOWN)
{
MXS_WARNING("Replicating from external master, skipping final check.");
rval = true;
}
else if (wait_cluster_stabilization(mon, promotion_target, redirected_slaves,
seconds_remaining))
{
rval = true;
time_t step6_time = time(NULL);
@ -4334,6 +4509,12 @@ static bool do_switchover(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* current_mast
PRINT_MXS_JSON_ERROR(err_out, "Could not disable read_only on server %s: '%s'.",
demotion_target->server->unique_name, mysql_error(demotion_target->con));
}
// Try to reactivate external replication if any.
if (mon->external_master_port != PORT_UNKNOWN)
{
start_external_replication(mon, new_master, err_out);
}
}
}
return rval;
@ -4517,18 +4698,18 @@ static bool get_joinable_servers(MYSQL_MONITOR* mon, ServerVector* output)
*/
static uint32_t do_rejoin(MYSQL_MONITOR* mon, const ServerVector& joinable_servers)
{
MXS_MONITORED_SERVER* master = mon->master;
SERVER* master = mon->master->server;
uint32_t servers_joined = 0;
if (!joinable_servers.empty())
{
string change_cmd = generate_change_master_cmd(mon, master);
string change_cmd = generate_change_master_cmd(mon, master->name, master->port);
for (ServerVector::const_iterator iter = joinable_servers.begin();
iter != joinable_servers.end();
iter++)
{
MXS_MONITORED_SERVER* joinable = *iter;
const char* name = joinable->server->unique_name;
const char* master_name = master->server->unique_name;
const char* master_name = master->unique_name;
MySqlServerInfo* redir_info = get_server_info(mon, joinable);
bool op_success;

View File

@ -71,6 +71,8 @@ typedef struct
bool verify_master_failure; /**< Whether master failure is verified via slaves */
int master_failure_timeout; /**< Time in seconds to wait before doing failover */
int64_t master_gtid_domain; /**< Gtid domain currently used by the master */
char external_master_host[MAX_SERVER_ADDRESS_LEN]; /**< External master host, for fail/switchover */
int external_master_port; /**< External master port */
bool auto_rejoin; /**< Attempt to start slave replication on standalone servers or servers
replicating from the wrong master. */
int n_excluded; /**< Number of excluded servers */