Cleanup slave connection removal during promotion/demotion

The removing and slave status updating is now separated to a function.
As the MariaDBServer object now contains the updated slave connections,
keeping track of removed connections is no longer required.
This commit is contained in:
Esa Korhonen
2018-10-09 12:20:57 +03:00
parent c10fab977d
commit 2f1512a22d
3 changed files with 114 additions and 129 deletions

View File

@ -1195,14 +1195,6 @@ const SlaveStatus* MariaDBServer::slave_connection_status_host_port(const MariaD
return NULL; return NULL;
} }
/**
* Private, non-const version of 'slave_connection_status'.
*/
SlaveStatus* MariaDBServer::slave_connection_status_mutable(const MariaDBServer* target)
{
return const_cast<SlaveStatus*>(slave_connection_status(target));
}
bool MariaDBServer::enable_events(json_t** error_out) bool MariaDBServer::enable_events(json_t** error_out)
{ {
int found_disabled_events = 0; int found_disabled_events = 0;
@ -1446,7 +1438,7 @@ bool MariaDBServer::promote(ClusterOperation& op)
{ {
json_t** const error_out = op.error_out; json_t** const error_out = op.error_out;
// Function should only be called for a master-slave pair. // Function should only be called for a master-slave pair.
auto master_conn = slave_connection_status_mutable(op.demotion_target); auto master_conn = slave_connection_status(op.demotion_target);
mxb_assert(master_conn); mxb_assert(master_conn);
if (master_conn == NULL) if (master_conn == NULL)
{ {
@ -1461,66 +1453,17 @@ bool MariaDBServer::promote(ClusterOperation& op)
// Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion // Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion
// target. In case of switchover, remove other slave connections as well since the demotion target // target. In case of switchover, remove other slave connections as well since the demotion target
// will take them over. // will take them over.
bool stop_slave_error = false; bool stopped = false;
// Helper function for stopping a slave connection and setting error.
auto stop_slave_helper = [this, &timer, &stop_slave_error, &op, error_out](SlaveStatus* slave_conn) {
if (!stop_slave_conn(slave_conn, StopMode::RESET_ALL, op.time_remaining, error_out))
{
stop_slave_error = true;
}
op.time_remaining -= timer.restart();
};
if (op.type == OperationType::SWITCHOVER) if (op.type == OperationType::SWITCHOVER)
{ {
for (size_t i = 0; !stop_slave_error && i < m_slave_status.size(); i++) stopped = remove_slave_conns(op, m_slave_status);
{
stop_slave_helper(&m_slave_status[i]);
}
} }
else else if (op.type == OperationType::FAILOVER)
{ {
stop_slave_helper(master_conn); stopped = remove_slave_conns(op, {*master_conn});
} }
if (!stop_slave_error) if (stopped)
{
// Check that the deleted connection(s) are really gone.
string error_msg;
if (do_show_slave_status(&error_msg))
{
if (op.type == OperationType::SWITCHOVER)
{
if (!m_slave_status.empty())
{
stop_slave_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"%s has %lu slave connections although all connections should "
"have been removed.", name(), m_slave_status.size());
}
}
else
{
if (slave_connection_status(op.demotion_target) != NULL)
{
stop_slave_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"%s has a slave connection to %s although it should "
"have been removed.", name(), op.demotion_target->name());
}
}
}
else
{
stop_slave_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"Failed to update slave connections of %s: %s",
name(), error_msg.c_str());
}
}
if (!stop_slave_error)
{ {
// Step 2: If demotion target is master, meaning this server will become the master, // Step 2: If demotion target is master, meaning this server will become the master,
// enable writing and scheduled events. Also, run promotion_sql_file. // enable writing and scheduled events. Also, run promotion_sql_file.
@ -1568,20 +1511,7 @@ bool MariaDBServer::promote(ClusterOperation& op)
// operation. // operation.
if (!promotion_error) if (!promotion_error)
{ {
if (op.type == OperationType::FAILOVER) if (op.type == OperationType::SWITCHOVER)
{
if (merge_slave_conns(op, op.demotion_target_conns))
{
success = true;
}
else
{
PRINT_MXS_JSON_ERROR(error_out,
"Could not merge slave connections from %s to %s.",
op.demotion_target->name(), name());
}
}
else
{ {
if (copy_slave_conns(op, op.demotion_target_conns, op.demotion_target)) if (copy_slave_conns(op, op.demotion_target_conns, op.demotion_target))
{ {
@ -1594,6 +1524,19 @@ bool MariaDBServer::promote(ClusterOperation& op)
op.demotion_target->name(), name()); op.demotion_target->name(), name());
} }
} }
else if (op.type == OperationType::FAILOVER)
{
if (merge_slave_conns(op, op.demotion_target_conns))
{
success = true;
}
else
{
PRINT_MXS_JSON_ERROR(error_out,
"Could not merge slave connections from %s to %s.",
op.demotion_target->name(), name());
}
}
} }
} }
return success; return success;
@ -1604,44 +1547,10 @@ bool MariaDBServer::demote(ClusterOperation& op)
mxb_assert(op.type == OperationType::SWITCHOVER && op.demotion_target == this); mxb_assert(op.type == OperationType::SWITCHOVER && op.demotion_target == this);
json_t** error_out = op.error_out; json_t** error_out = op.error_out;
bool success = false; bool success = false;
StopWatch timer;
// Step 1: Stop & reset slave connections. The promotion target will copy them. The connection // Step 1: Stop & reset slave connections. The promotion target will copy them. The connection
// information has been backed up in the operation object. // information has been backed up in the operation object.
bool stop_slave_error = false; if (remove_slave_conns(op, m_slave_status))
for (size_t i = 0; !stop_slave_error && i < m_slave_status.size(); i++)
{
if (!stop_slave_conn(&m_slave_status[i], StopMode::RESET_ALL, op.time_remaining, error_out))
{
stop_slave_error = true;
}
op.time_remaining -= timer.lap();
}
if (!stop_slave_error)
{
// Check that slave connections are really gone.
string error_msg;
if (do_show_slave_status(&error_msg))
{
if (!m_slave_status.empty())
{
stop_slave_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"%s has %lu slave connections although all connections should have been "
"removed.", name(), m_slave_status.size());
}
}
else
{
stop_slave_error = true;
PRINT_MXS_JSON_ERROR(error_out,
"Failed to update slave connections of %s: %s",
name(), error_msg.c_str());
}
}
if (!stop_slave_error)
{ {
// Step 2: If this server is master, disable writes and scheduled events, flush logs, // Step 2: If this server is master, disable writes and scheduled events, flush logs,
// update gtid:s, run demotion_sql_file. // update gtid:s, run demotion_sql_file.
@ -1654,6 +1563,7 @@ bool MariaDBServer::demote(ClusterOperation& op)
if (op.demotion_target_is_master) if (op.demotion_target_is_master)
{ {
mxb_assert(is_master()); mxb_assert(is_master());
StopWatch timer;
// Step 2a: Enabling read-only can take time if writes are on or table locks taken. // Step 2a: Enabling read-only can take time if writes are on or table locks taken.
// TODO: use max_statement_time to be safe! // TODO: use max_statement_time to be safe!
bool ro_enabled = set_read_only(ReadOnlySetting::ENABLE, op.time_remaining, error_out); bool ro_enabled = set_read_only(ReadOnlySetting::ENABLE, op.time_remaining, error_out);
@ -1739,7 +1649,16 @@ bool MariaDBServer::demote(ClusterOperation& op)
return success; return success;
} }
bool MariaDBServer::stop_slave_conn(SlaveStatus* slave_conn, StopMode mode, Duration time_limit, /**
* Stop and optionally reset/reset-all a slave connection.
*
* @param conn_name Slave connection name. Use empty string for the nameless connection.
* @param mode STOP, RESET or RESET ALL
* @param time_limit Operation time limit
* @param error_out Error output
* @return True on success
*/
bool MariaDBServer::stop_slave_conn(const std::string& conn_name, StopMode mode, Duration time_limit,
json_t** error_out) json_t** error_out)
{ {
/* STOP SLAVE is a bit problematic, since sometimes it seems to take several seconds to complete. /* STOP SLAVE is a bit problematic, since sometimes it seems to take several seconds to complete.
@ -1749,8 +1668,7 @@ bool MariaDBServer::stop_slave_conn(SlaveStatus* slave_conn, StopMode mode, Dura
* an already stopped slave connection an error. */ * an already stopped slave connection an error. */
Duration time_left = time_limit; Duration time_left = time_limit;
StopWatch timer; StopWatch timer;
const char* conn_name = slave_conn->name.c_str(); string stop = string_printf("STOP SLAVE '%s';", conn_name.c_str());
string stop = string_printf("STOP SLAVE '%s';", conn_name);
string error_msg; string error_msg;
bool stop_success = execute_cmd_time_limit(stop, time_left, &error_msg); bool stop_success = execute_cmd_time_limit(stop, time_left, &error_msg);
time_left -= timer.restart(); time_left -= timer.restart();
@ -1763,20 +1681,15 @@ bool MariaDBServer::stop_slave_conn(SlaveStatus* slave_conn, StopMode mode, Dura
if (mode == StopMode::RESET || mode == StopMode::RESET_ALL) if (mode == StopMode::RESET || mode == StopMode::RESET_ALL)
{ {
string reset = string_printf("RESET SLAVE '%s'%s;", string reset = string_printf("RESET SLAVE '%s'%s;",
conn_name, (mode == StopMode::RESET_ALL) ? " ALL" : ""); conn_name.c_str(), (mode == StopMode::RESET_ALL) ? " ALL" : "");
if (execute_cmd_time_limit(reset, time_left, &error_msg)) if (execute_cmd_time_limit(reset, time_left, &error_msg))
{ {
rval = true; rval = true;
if (mode == StopMode::RESET_ALL)
{
slave_conn->exists = false; // The RESET_ALL means that the connection has been
// erased.
}
} }
else else
{ {
PRINT_MXS_JSON_ERROR(error_out, PRINT_MXS_JSON_ERROR(error_out,
"Failed to reset slave connection on '%s': %s", "Failed to reset slave connection on %s: %s",
name(), error_msg.c_str()); name(), error_msg.c_str());
} }
} }
@ -1788,12 +1701,86 @@ bool MariaDBServer::stop_slave_conn(SlaveStatus* slave_conn, StopMode mode, Dura
else else
{ {
PRINT_MXS_JSON_ERROR(error_out, PRINT_MXS_JSON_ERROR(error_out,
"Failed to stop slave connection on '%s': %s", "Failed to stop slave connection on %s: %s",
name(), error_msg.c_str()); name(), error_msg.c_str());
} }
return rval; return rval;
} }
/**
* Removes the given slave connections from the server and then updates slave connection status.
*
* @param op Operation descriptor
* @param conns_to_remove Which connections should be removed
* @return True if succesfull
*/
bool MariaDBServer::remove_slave_conns(ClusterOperation& op, const SlaveStatusArray& conns_to_remove)
{
json_t** error_out = op.error_out;
StopWatch timer;
bool stop_slave_error = false;
for (size_t i = 0; !stop_slave_error && i < conns_to_remove.size(); i++)
{
if (!stop_slave_conn(conns_to_remove[i].name, StopMode::RESET_ALL, op.time_remaining, error_out))
{
stop_slave_error = true;
}
op.time_remaining -= timer.lap();
}
bool success = false;
if (stop_slave_error)
{
PRINT_MXS_JSON_ERROR(error_out, "Failed to remove slave connection(s) from %s.", name());
}
else
{
// Check that the slave connections are really gone by comparing connection names. It's probably
// enough to just update the slave status. Checking that the connections are really gone is
// likely overkill, but doesn't hurt.
string error_msg;
if (do_show_slave_status(&error_msg))
{
// Insert all existing connection names to a set, then check that none of the removed ones are
// there.
std::set<string> connection_names;
for (auto& slave_conn : m_slave_status)
{
connection_names.insert(slave_conn.name);
}
int found = 0;
for (auto& removed_conn : conns_to_remove)
{
if (connection_names.count(removed_conn.name) > 0)
{
found++;
}
}
if (found == 0)
{
success = true;
}
else
{
// This means server is really bugging.
PRINT_MXS_JSON_ERROR(error_out,
"%s still has %i removed slave connections,"
"RESET SLAVE must have failed.", name(), found);
}
}
else
{
PRINT_MXS_JSON_ERROR(error_out,
"Failed to update slave connections of %s: %s",
name(), error_msg.c_str());
}
}
op.time_remaining -= timer.lap();
return success;
}
bool MariaDBServer::set_read_only(ReadOnlySetting setting, maxbase::Duration time_limit, json_t** error_out) bool MariaDBServer::set_read_only(ReadOnlySetting setting, maxbase::Duration time_limit, json_t** error_out)
{ {
int new_val = (setting == ReadOnlySetting::ENABLE) ? 1 : 0; int new_val = (setting == ReadOnlySetting::ENABLE) ? 1 : 0;
@ -2060,11 +2047,11 @@ bool MariaDBServer::redirect_existing_slave_conn(ClusterOperation& op)
const MariaDBServer* old_master = op.demotion_target; const MariaDBServer* old_master = op.demotion_target;
const MariaDBServer* new_master = op.promotion_target; const MariaDBServer* new_master = op.promotion_target;
auto old_conn = slave_connection_status_mutable(old_master); auto old_conn = slave_connection_status(old_master);
mxb_assert(old_conn); mxb_assert(old_conn);
bool success = false; bool success = false;
// First, just stop the slave connection. // First, just stop the slave connection.
bool stopped = stop_slave_conn(old_conn, StopMode::STOP_ONLY, op.time_remaining, op.error_out); bool stopped = stop_slave_conn(old_conn->name, StopMode::STOP_ONLY, op.time_remaining, op.error_out);
op.time_remaining -= timer.restart(); op.time_remaining -= timer.restart();
if (stopped) if (stopped)
{ {

View File

@ -523,16 +523,16 @@ private:
bool update_slave_status(std::string* errmsg_out = NULL); bool update_slave_status(std::string* errmsg_out = NULL);
bool sstatus_array_topology_equal(const SlaveStatusArray& new_slave_status); bool sstatus_array_topology_equal(const SlaveStatusArray& new_slave_status);
const SlaveStatus* sstatus_find_previous_row(const SlaveStatus& new_row, size_t guess); const SlaveStatus* sstatus_find_previous_row(const SlaveStatus& new_row, size_t guess);
SlaveStatus* slave_connection_status_mutable(const MariaDBServer* target);
void warn_event_scheduler(); void warn_event_scheduler();
bool events_foreach(ManipulatorFunc& func, json_t** error_out); bool events_foreach(ManipulatorFunc& func, json_t** error_out);
bool alter_event(const EventInfo& event, const std::string& target_status, bool alter_event(const EventInfo& event, const std::string& target_status,
json_t** error_out); json_t** error_out);
bool stop_slave_conn(SlaveStatus* slave_conn, StopMode mode, maxbase::Duration time_limit, bool stop_slave_conn(const std::string& conn_name, StopMode mode, maxbase::Duration time_limit,
json_t** error_out); json_t** error_out);
bool remove_slave_conns(ClusterOperation& op, const SlaveStatusArray& conns_to_remove);
bool execute_cmd_ex(const std::string& cmd, QueryRetryMode mode, bool execute_cmd_ex(const std::string& cmd, QueryRetryMode mode,
std::string* errmsg_out = NULL, unsigned int* errno_out = NULL); std::string* errmsg_out = NULL, unsigned int* errno_out = NULL);

View File

@ -170,8 +170,6 @@ public:
SLAVE_IO_NO, SLAVE_IO_NO,
}; };
bool exists = true; /* Has this connection been removed from the
* server but the monitor hasn't updated yet? */
bool seen_connected = false; /* Has this slave connection been seen connected, bool seen_connected = false; /* Has this slave connection been seen connected,
* meaning that the master server id is correct? * meaning that the master server id is correct?
**/ **/