MXS-3800: Explain lost_slave events
Currently the state change explanations are only added to mariadbmon. They are less relevant for Galera clusters as they themselves explain why they change their states but should still be added to make them easier to analyze. The event that isn't explained and is most often encountered is the loss of a Slave status. Most often the loss of a Slave status happens because either the IO thread or the SQL thread has stopped. Printing the states of the threads as well as the latest error should hint at what caused the outage. The information can be added to the REST API in 2.5 where the monitors can add extra information to the server JSON.
This commit is contained in:
@ -212,7 +212,7 @@ public:
|
||||
*/
|
||||
mxs_monitor_event_t get_event_type() const;
|
||||
|
||||
void log_state_change();
|
||||
void log_state_change(const std::string& reason);
|
||||
|
||||
/**
|
||||
* Is this server ok to update disk space status. Only checks if the server knows of valid disk space
|
||||
@ -516,6 +516,19 @@ protected:
|
||||
|
||||
bool server_status_request_waiting() const;
|
||||
|
||||
/**
|
||||
* Returns the human-readable reason why the server changed state
|
||||
*
|
||||
* @param server The server that changed state
|
||||
*
|
||||
* @return The human-readable reason why the state change occurred or
|
||||
* an empty string if no information is available
|
||||
*/
|
||||
virtual std::string annotate_state_change(mxs::MonitorServer* server)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains monitor base class settings. Since monitors are stopped before a setting change,
|
||||
* the items cannot be modified while a monitor is running. No locking required.
|
||||
|
||||
@ -1238,14 +1238,14 @@ void MonitorServer::log_connect_error(ConnectResult rval)
|
||||
latest_error.c_str());
|
||||
}
|
||||
|
||||
void MonitorServer::log_state_change()
|
||||
void MonitorServer::log_state_change(const std::string& reason)
|
||||
{
|
||||
string prev = SERVER::status_to_string(mon_prev_status);
|
||||
string next = server->status_string();
|
||||
MXS_NOTICE("Server changed state: %s[%s:%u]: %s. [%s] -> [%s]",
|
||||
MXS_NOTICE("Server changed state: %s[%s:%u]: %s. [%s] -> [%s]%s%s",
|
||||
server->name(), server->address, server->port,
|
||||
get_event_name(),
|
||||
prev.c_str(), next.c_str());
|
||||
get_event_name(), prev.c_str(), next.c_str(),
|
||||
reason.empty() ? "" : ": ", reason.c_str());
|
||||
}
|
||||
|
||||
void Monitor::hangup_failed_servers()
|
||||
@ -1332,7 +1332,7 @@ void Monitor::detect_handle_state_changes()
|
||||
mxs_monitor_event_t event = ptr->get_event_type();
|
||||
ptr->server->last_event = event;
|
||||
ptr->server->triggered_at = mxs_clock();
|
||||
ptr->log_state_change();
|
||||
ptr->log_state_change(annotate_state_change(ptr));
|
||||
|
||||
if (event == MASTER_DOWN_EVENT)
|
||||
{
|
||||
|
||||
@ -580,6 +580,19 @@ void MariaDBMonitor::process_state_changes()
|
||||
}
|
||||
}
|
||||
|
||||
std::string MariaDBMonitor::annotate_state_change(MonitorServer* server)
|
||||
{
|
||||
std::string rval;
|
||||
|
||||
if (server->get_event_type() == LOST_SLAVE_EVENT)
|
||||
{
|
||||
MariaDBServer* srv = get_server(server);
|
||||
rval = srv->print_changed_slave_connections();
|
||||
}
|
||||
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
|
||||
* Save info on the master server's multimaster group, if any. This is required when checking for changes
|
||||
* in the topology.
|
||||
|
||||
@ -110,6 +110,7 @@ protected:
|
||||
void pre_loop() override;
|
||||
void tick() override;
|
||||
void process_state_changes() override;
|
||||
std::string annotate_state_change(mxs::MonitorServer* server) override final;
|
||||
|
||||
private:
|
||||
// Some methods need a log on/off setting.
|
||||
|
||||
@ -418,6 +418,7 @@ bool MariaDBServer::do_show_slave_status(string* errmsg_out)
|
||||
// Always write to m_slave_status. Even if the new status is equal by topology,
|
||||
// gtid:s etc may have changed.
|
||||
Guard guard(m_arraylock);
|
||||
m_old_slave_status = std::move(m_slave_status);
|
||||
m_slave_status = std::move(slave_status_new);
|
||||
}
|
||||
|
||||
@ -724,6 +725,35 @@ string MariaDBServer::diagnostics() const
|
||||
return rval;
|
||||
}
|
||||
|
||||
std::string MariaDBServer::print_changed_slave_connections()
|
||||
{
|
||||
std::stringstream ss;
|
||||
const char* separator = "";
|
||||
|
||||
for (size_t i = 0; i < m_old_slave_status.size(); i++)
|
||||
{
|
||||
const auto& old_row = m_old_slave_status[i];
|
||||
const auto* new_row = sstatus_find_previous_row(old_row, i);
|
||||
|
||||
if (new_row && !new_row->equal(old_row))
|
||||
{
|
||||
ss << "Host: " << new_row->settings.master_endpoint.to_string()
|
||||
<< ", IO Running: " << SlaveStatus::slave_io_to_string(new_row->slave_io_running)
|
||||
<< ", SQL Running: " << (new_row->slave_sql_running ? "Yes" : "No");
|
||||
|
||||
if (!new_row->last_error.empty())
|
||||
{
|
||||
ss << ", Error: " << new_row->last_error;
|
||||
}
|
||||
|
||||
ss << separator;
|
||||
separator = "; ";
|
||||
}
|
||||
}
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
json_t* MariaDBServer::to_json() const
|
||||
{
|
||||
json_t* result = json_object();
|
||||
@ -1015,13 +1045,8 @@ bool MariaDBServer::sstatus_array_topology_equal(const SlaveStatusArray& new_sla
|
||||
{
|
||||
const auto new_row = new_slave_status[i];
|
||||
const auto old_row = old_slave_status[i];
|
||||
// Strictly speaking, the following should depend on the 'assume_unique_hostnames',
|
||||
// but the situations it would make a difference are so rare they can be ignored.
|
||||
if (new_row.slave_io_running != old_row.slave_io_running
|
||||
|| new_row.slave_sql_running != old_row.slave_sql_running
|
||||
|| new_row.settings.master_endpoint != old_row.settings.master_endpoint
|
||||
|| new_row.settings.name != old_row.settings.name
|
||||
|| new_row.master_server_id != old_row.master_server_id)
|
||||
|
||||
if (!new_row.equal(old_row))
|
||||
{
|
||||
rval = false;
|
||||
break;
|
||||
|
||||
@ -136,6 +136,7 @@ public:
|
||||
GtidList m_gtid_current_pos; /* Gtid of latest event. */
|
||||
GtidList m_gtid_binlog_pos; /* Gtid of latest event written to binlog. */
|
||||
SlaveStatusArray m_slave_status; /* Data returned from SHOW (ALL) SLAVE(S) STATUS */
|
||||
SlaveStatusArray m_old_slave_status; /* Data from the previous loop */
|
||||
NodeData m_node; /* Replication topology data */
|
||||
|
||||
/* Replication lag of the server. Used during calculation so that the actual SERVER struct is
|
||||
@ -169,6 +170,8 @@ public:
|
||||
*/
|
||||
std::string diagnostics() const;
|
||||
|
||||
std::string print_changed_slave_connections();
|
||||
|
||||
void update_server(bool time_to_update_disk_space,
|
||||
const mxs::MonitorServer::ConnectionSettings& conn_settings);
|
||||
|
||||
|
||||
@ -87,6 +87,17 @@ json_t* SlaveStatus::to_json() const
|
||||
return result;
|
||||
}
|
||||
|
||||
bool SlaveStatus::equal(const SlaveStatus& rhs) const
|
||||
{
|
||||
// Strictly speaking, the following should depend on the 'assume_unique_hostnames',
|
||||
// but the situations it would make a difference are so rare they can be ignored.
|
||||
return slave_io_running == rhs.slave_io_running
|
||||
&& slave_sql_running == rhs.slave_sql_running
|
||||
&& settings.master_endpoint == rhs.settings.master_endpoint
|
||||
&& settings.name == rhs.settings.name
|
||||
&& master_server_id == rhs.master_server_id;
|
||||
}
|
||||
|
||||
SlaveStatus::slave_io_running_t SlaveStatus::slave_io_from_string(const std::string& str)
|
||||
{
|
||||
slave_io_running_t rval = SLAVE_IO_NO;
|
||||
|
||||
@ -262,6 +262,8 @@ public:
|
||||
std::string to_string() const;
|
||||
json_t* to_json() const;
|
||||
|
||||
bool equal(const SlaveStatus& rhs) const;
|
||||
|
||||
static slave_io_running_t slave_io_from_string(const std::string& str);
|
||||
static std::string slave_io_to_string(slave_io_running_t slave_io);
|
||||
bool should_be_copied(std::string* ignore_reason_out) const;
|
||||
|
||||
Reference in New Issue
Block a user