Update master failure detection from slaves

The detection now works with multiple slave connections.
This commit is contained in:
Esa Korhonen
2018-08-27 10:24:52 +03:00
parent a593d00c65
commit 85d8a85cde
4 changed files with 116 additions and 119 deletions

View File

@ -13,6 +13,7 @@
#include "mariadbmon.hh"
#include <chrono>
#include <inttypes.h>
#include <sstream>
#include <maxscale/clock.h>
@ -20,6 +21,7 @@
#include <maxscale/utils.hh>
using std::string;
using std::chrono::steady_clock;
using maxscale::string_printf;
static const char RE_ENABLE_FMT[] = "To re-enable automatic %s, manually set '%s' to 'true' "
@ -1246,6 +1248,9 @@ void MariaDBMonitor::handle_auto_failover()
}
int master_down_count = m_master->m_server_base->mon_err_count;
const MariaDBServer* connected_slave = NULL;
Duration event_age;
if (m_failcount > 1 && m_warn_master_down)
{
int monitor_passes = m_failcount - master_down_count;
@ -1254,9 +1259,12 @@ void MariaDBMonitor::handle_auto_failover()
m_warn_master_down = false;
}
// If master seems to be down, check if slaves are receiving events.
else if (m_verify_master_failure && slave_receiving_events()) // TODO: Fix the events detection
else if (m_verify_master_failure &&
(connected_slave = slave_receiving_events(m_master, &event_age)) != NULL)
{
MXS_INFO("Master failure not yet confirmed by slaves, delaying failover.");
MXS_NOTICE("Slave '%s' is still connected to '%s' and received a new gtid or heartbeat event %.1f "
"seconds ago. Delaying failover.",
connected_slave->name(), m_master->name(), event_age.count());
}
else if (master_down_count >= m_failcount)
{
@ -1365,32 +1373,37 @@ void MariaDBMonitor::check_cluster_operations_support()
}
/**
* Check if a slave is receiving events from master.
* Check if a slave is receiving events from master. Returns the first slave that is both
* connected (or not realized the disconnect yet) and has an event more recent than
* master_failure_timeout. The age of the event is written in 'event_age_out'.
*
* @return True, if a slave has an event more recent than master_failure_timeout.
* @param demotion_target The server whose slaves should be checked
* @param event_age_out Output for event age
* @return The first connected slave or NULL if none found
*/
bool MariaDBMonitor::slave_receiving_events()
const MariaDBServer* MariaDBMonitor::slave_receiving_events(const MariaDBServer* demotion_target,
Duration* event_age_out)
{
mxb_assert(m_master);
bool received_event = false;
int64_t master_id = m_master->m_server_base->server->node_id;
steady_clock::time_point alive_after = steady_clock::now() -
std::chrono::seconds(m_master_failure_timeout);
for (MariaDBServer* server : m_servers)
const MariaDBServer* connected_slave = NULL;
for (MariaDBServer* slave : demotion_target->m_node.children)
{
if (!server->m_slave_status.empty() &&
server->m_slave_status[0].slave_io_running == SlaveStatus::SLAVE_IO_YES &&
server->m_slave_status[0].master_server_id == master_id &&
difftime(time(NULL), server->m_latest_event) < m_master_failure_timeout)
const SlaveStatus* slave_conn = NULL;
if (slave->is_running() &&
(slave_conn = slave->slave_connection_status(demotion_target)) != NULL &&
slave_conn->slave_io_running == SlaveStatus::SLAVE_IO_YES &&
slave_conn->last_data_time >= alive_after)
{
/**
* The slave is still connected to the correct master and has received events. This means that
* while MaxScale can't connect to the master, it's probably still alive.
*/
received_event = true;
// The slave is still connected to the correct master and has received events. This means that
// while MaxScale can't connect to the master, it's probably still alive.
connected_slave = slave;
*event_age_out = steady_clock::now() - slave_conn->last_data_time;
break;
}
}
return received_event;
return connected_slave;
}
/**

View File

@ -12,6 +12,7 @@
*/
#pragma once
#include "mariadbmon_common.hh"
#include <chrono>
#include <condition_variable>
#include <functional>
#include <string>
@ -103,6 +104,7 @@ protected:
void process_state_changes();
private:
typedef std::chrono::duration<double> Duration;
struct CycleInfo
{
@ -232,7 +234,8 @@ private:
MariaDBServer** demotion_target_out, json_t** error_out);
bool failover_perform(MariaDBServer* promotion_target, MariaDBServer* demotion_target,
json_t** error_out);
bool slave_receiving_events();
const MariaDBServer* slave_receiving_events(const MariaDBServer* demotion_target,
Duration* event_age_out);
bool manual_failover(json_t** output);
void handle_auto_failover();

View File

@ -13,6 +13,7 @@
#include "mariadbserver.hh"
#include <chrono>
#include <fstream>
#include <inttypes.h>
#include <iomanip>
@ -22,6 +23,7 @@
#include <maxscale/utils.hh>
using std::string;
using std::chrono::steady_clock;
using maxscale::string_printf;
namespace
@ -33,17 +35,12 @@ namespace
const char NO[] = "No";
}
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index)
: m_server_base(monitored_server)
, m_config_index(config_index)
, m_version(version::UNKNOWN)
, m_server_id(SERVER_ID_UNKNOWN)
, m_read_only(false)
, m_n_slaves_running(0)
, m_n_slave_heartbeats(0)
, m_heartbeat_period(0)
, m_latest_event(time(NULL))
, m_gtid_domain_id(GTID_DOMAIN_UNKNOWN)
, m_topology_changed(true)
, m_replication_lag(MXS_RLAG_UNDEFINED)
@ -175,54 +172,74 @@ bool MariaDBServer::do_show_slave_status(string* errmsg_out)
}
SlaveStatusArray slave_status_new;
int nrunning = 0;
while (result->next_row())
{
SlaveStatus sstatus_row;
sstatus_row.master_host = result->get_string(i_master_host);
sstatus_row.master_port = result->get_uint(i_master_port);
SlaveStatus new_row;
new_row.master_host = result->get_string(i_master_host);
new_row.master_port = result->get_uint(i_master_port);
string last_io_error = result->get_string(i_last_io_error);
string last_sql_error = result->get_string(i_last_sql_error);
sstatus_row.last_error = !last_io_error.empty() ? last_io_error : last_sql_error;
new_row.last_error = !last_io_error.empty() ? last_io_error : last_sql_error;
sstatus_row.slave_io_running =
new_row.slave_io_running =
SlaveStatus::slave_io_from_string(result->get_string(i_slave_io_running));
sstatus_row.slave_sql_running = (result->get_string(i_slave_sql_running) == "Yes");
sstatus_row.master_server_id = result->get_uint(i_master_server_id);
new_row.slave_sql_running = (result->get_string(i_slave_sql_running) == "Yes");
new_row.master_server_id = result->get_uint(i_master_server_id);
auto rlag = result->get_uint(i_seconds_behind_master);
// If slave connection is stopped, the value given by the backend is null -> -1.
sstatus_row.seconds_behind_master = (rlag < 0) ? MXS_RLAG_UNDEFINED :
new_row.seconds_behind_master = (rlag < 0) ? MXS_RLAG_UNDEFINED :
(rlag > INT_MAX) ? INT_MAX : rlag;
if (sstatus_row.slave_io_running == SlaveStatus::SLAVE_IO_YES && sstatus_row.slave_sql_running)
{
nrunning++;
// TODO: Fix for multisource replication, check changes to IO_Pos here and save somewhere.
}
if (all_slaves_status)
{
sstatus_row.name = result->get_string(i_connection_name);
auto heartbeats = result->get_uint(i_slave_rec_hbs);
if (m_n_slave_heartbeats < heartbeats) // TODO: Fix for multisource replication
{
m_latest_event = time(NULL);
m_n_slave_heartbeats = heartbeats;
m_heartbeat_period = result->get_uint(i_slave_hb_period);
}
new_row.name = result->get_string(i_connection_name);
new_row.received_heartbeats = result->get_uint(i_slave_rec_hbs);
string using_gtid = result->get_string(i_using_gtid);
string gtid_io_pos = result->get_string(i_gtid_io_pos);
if (!gtid_io_pos.empty() &&
(using_gtid == "Current_Pos" || using_gtid == "Slave_Pos"))
if (!gtid_io_pos.empty() && (using_gtid == "Current_Pos" || using_gtid == "Slave_Pos"))
{
sstatus_row.gtid_io_pos = GtidList::from_string(gtid_io_pos);
new_row.gtid_io_pos = GtidList::from_string(gtid_io_pos);
}
}
slave_status_new.push_back(sstatus_row);
// Before adding this row to the SlaveStatus array, compare the row to the one from the previous
// monitor tick and fill in the last pieces of data.
auto old_row = sstatus_find_previous_row(new_row, slave_status_new.size());
if (old_row)
{
// When the new row was created, 'last_data_time' was set to the current time. If it seems
// like the slave is not receiving data from the master, set the time to the one
// in the previous monitor tick.
if (new_row.received_heartbeats == old_row->received_heartbeats &&
new_row.gtid_io_pos == old_row->gtid_io_pos)
{
new_row.last_data_time = old_row->last_data_time;
}
}
// Finally, set the connection status.
if (new_row.slave_io_running == SlaveStatus::SLAVE_IO_YES)
{
mxb_assert(new_row.master_server_id > 0);
new_row.seen_connected = true;
}
else if (new_row.slave_io_running == SlaveStatus::SLAVE_IO_CONNECTING && old_row)
{
// Old connection data found. Even in this case the server id:s could be wrong if the
// slave connection was cleared and remade between monitor loops.
if (new_row.master_server_id == old_row->master_server_id && old_row->seen_connected)
{
new_row.seen_connected = true;
}
}
// Row complete, add it to the array.
slave_status_new.push_back(new_row);
}
sstatus_array_set_conn_status(&slave_status_new);
// Compare the previous array to the new one.
if (!sstatus_array_topology_equal(slave_status_new))
{
m_topology_changed = true;
@ -231,14 +248,6 @@ bool MariaDBServer::do_show_slave_status(string* errmsg_out)
// Always write to m_slave_status. Even if the new status is equal by topology,
// gtid:s etc may have changed.
m_slave_status = std::move(slave_status_new);
if (m_slave_status.empty())
{
/** Query returned no rows, replication is not configured */
m_n_slave_heartbeats = 0;
}
m_n_slaves_running = nrunning;
return true;
}
@ -916,71 +925,41 @@ bool MariaDBServer::sstatus_array_topology_equal(const SlaveStatusArray& new_sla
}
/**
* Compare the new slave status info to the old and set connection status.
* Check the slave status array stored in the MariaDBServer and find the row matching the connection in
* 'search_row'.
*
* @param new_slave_status The slave status just queried from the server.
* @param search_row What connection to search for
* @param guess_ind Index where the search row could be located at. If incorrect, the array is searched.
* @return The found row or NULL if not found
*/
void MariaDBServer::sstatus_array_set_conn_status(SlaveStatusArray* new_slave_status)
const SlaveStatus* MariaDBServer::sstatus_find_previous_row(const SlaveStatus& search_row, size_t guess_ind)
{
// Helper function. Checks if the connection in the new row is to the same server than in the old row
// sets connection status if yes.
auto compare_and_set = [](const SlaveStatus& old_row, SlaveStatus& new_row) -> bool
// Helper function. Checks if the connection in the new row is to the same server than in the old row.
auto compare_rows = [](const SlaveStatus& lhs, const SlaveStatus& rhs) -> bool
{
if (new_row.master_host == old_row.master_host && new_row.master_port == old_row.master_port)
{
// Was the correct connection. Even in this case the server id:s could be wrong if the
// slave connection was cleared and remade between monitor loops.
if (new_row.master_server_id == old_row.master_server_id && old_row.seen_connected)
{
new_row.seen_connected = true;
}
return true;
}
return false;
return (rhs.master_host == lhs.master_host && rhs.master_port == lhs.master_port);
};
SlaveStatusArray& new_sstatus = *new_slave_status;
// Iterate through the new slave status array. If IO thread is connected, master server id is ok.
// If IO thread is "Connecting" check the previous slave status array for the connection status.
for (size_t i = 0; i < new_sstatus.size(); i++)
// Usually the same slave connection can be found from the same index than in the previous slave
// status array, but this is not 100% (e.g. dba has just added a new connection).
const SlaveStatus* rval = NULL;
if (guess_ind < m_slave_status.size() && compare_rows(m_slave_status[guess_ind], search_row))
{
SlaveStatus& new_row = new_sstatus[i];
if (new_row.slave_io_running == SlaveStatus::SLAVE_IO_YES)
rval = &m_slave_status[guess_ind];
}
else
{
// The correct connection was not found where it should have been. Try looping.
for (const SlaveStatus& old_row : m_slave_status)
{
mxb_assert(new_row.master_server_id > 0);
new_row.seen_connected = true;
}
else if (new_row.slave_io_running == SlaveStatus::SLAVE_IO_CONNECTING)
{
// Usually the same slave connection can be found from the same index in the previous slave
// status array, but this is not 100% (e.g. dba has just added a new connection).
bool need_search = false;
if (i < m_slave_status.size())
if (compare_rows(old_row, search_row))
{
SlaveStatus& old_row = m_slave_status[i];
if (!compare_and_set(old_row, new_row))
{
need_search = true;
}
}
else
{
need_search = true;
}
if (need_search)
{
// The correct connection was not found where it should have been. Try looping.
for (const SlaveStatus& old_row : m_slave_status)
{
if (compare_and_set(old_row, new_row))
{
break;
}
}
rval = &old_row;
break;
}
}
}
return rval;
}
bool MariaDBServer::can_be_demoted_switchover(string* reason_out)

View File

@ -12,6 +12,7 @@
*/
#pragma once
#include "mariadbmon_common.hh"
#include <chrono>
#include <string>
#include <memory>
#include <maxscale/monitor.h>
@ -47,6 +48,11 @@ public:
GtidList gtid_io_pos; /* Gtid I/O position of the slave thread. */
std::string last_error; /* Last IO or SQL error encountered. */
int seconds_behind_master = MXS_RLAG_UNDEFINED; /* How much behind the slave is. */
int64_t received_heartbeats = 0; /* How many heartbeats the connection has received */
/* Time of the latest gtid event or heartbeat the slave connection has received, timed by the monitor. */
std::chrono::steady_clock::time_point last_data_time = std::chrono::steady_clock::now();
std::string to_string() const;
json_t* to_json() const;
@ -135,10 +141,6 @@ public:
version m_version; /**< Server version/type. */
int64_t m_server_id; /**< Value of @@server_id. Valid values are 32bit unsigned. */
bool m_read_only; /**< Value of @@read_only */
size_t m_n_slaves_running; /**< Number of running slave connections */
int m_n_slave_heartbeats; /**< Number of received heartbeats */
double m_heartbeat_period; /**< The time interval between heartbeats */
time_t m_latest_event; /**< Time when latest event was received from the master */
int64_t m_gtid_domain_id; /**< The value of gtid_domain_id, the domain which is used for
* new non-replicated events. */
GtidList m_gtid_current_pos; /**< Gtid of latest event. */
@ -459,7 +461,7 @@ public:
private:
bool update_slave_status(std::string* errmsg_out = NULL);
bool sstatus_array_topology_equal(const SlaveStatusArray& new_slave_status);
void sstatus_array_set_conn_status(SlaveStatusArray* new_slave_status);
const SlaveStatus* sstatus_find_previous_row(const SlaveStatus& new_row, size_t guess);
};
/**