MXS-3800: Explain lost_slave events

Currently the state change explanations are only added to mariadbmon. They
are less relevant for Galera clusters, since Galera nodes themselves explain
why they change state, but the explanations should still be added there to
make those clusters easier to analyze.

The event that isn't explained and is most often encountered is the loss
of a Slave status. Most often the loss of a Slave status happens because
either the IO thread or the SQL thread has stopped. Printing the states of
the threads as well as the latest error should hint at what caused the
outage.

The information can be added to the REST API in 2.5 where the monitors can
add extra information to the server JSON.
This commit is contained in:
Markus Mäkelä
2021-10-18 08:59:14 +03:00
parent 136d0271df
commit 0bf5641d80
8 changed files with 84 additions and 16 deletions

View File

@ -418,6 +418,7 @@ bool MariaDBServer::do_show_slave_status(string* errmsg_out)
// Always write to m_slave_status. Even if the new status is equal by topology,
// gtid:s etc may have changed.
Guard guard(m_arraylock);
m_old_slave_status = std::move(m_slave_status);
m_slave_status = std::move(slave_status_new);
}
@ -724,6 +725,35 @@ string MariaDBServer::diagnostics() const
return rval;
}
/**
 * Build a human-readable summary of the slave connections that have changed.
 *
 * Every row of m_old_slave_status is paired with the row returned by
 * sstatus_find_previous_row(). Pairs for which equal() reports a difference
 * are printed as
 * "Host: <endpoint>, IO Running: <state>, SQL Running: <Yes|No>[, Error: <msg>]".
 * Multiple entries are joined with "; ".
 *
 * @return The summary, or an empty string if no matching row has changed.
 */
std::string MariaDBServer::print_changed_slave_connections()
{
    std::stringstream ss;
    // Empty for the first printed entry, "; " for every following one.
    const char* separator = "";

    for (size_t i = 0; i < m_old_slave_status.size(); i++)
    {
        const auto& old_row = m_old_slave_status[i];
        // NOTE(review): presumably this looks up the row in the current m_slave_status that
        // corresponds to old_row, using i as the guess for its position. Confirm against the helper.
        const auto* new_row = sstatus_find_previous_row(old_row, i);

        if (new_row && !new_row->equal(old_row))
        {
            // The separator goes in front of the entry. Writing it after the entry would glue
            // the first two entries together and leave a dangling "; " at the end.
            ss << separator
               << "Host: " << new_row->settings.master_endpoint.to_string()
               << ", IO Running: " << SlaveStatus::slave_io_to_string(new_row->slave_io_running)
               << ", SQL Running: " << (new_row->slave_sql_running ? "Yes" : "No");

            if (!new_row->last_error.empty())
            {
                ss << ", Error: " << new_row->last_error;
            }

            separator = "; ";
        }
    }

    return ss.str();
}
json_t* MariaDBServer::to_json() const
{
json_t* result = json_object();
@ -1015,13 +1045,8 @@ bool MariaDBServer::sstatus_array_topology_equal(const SlaveStatusArray& new_sla
{
const auto new_row = new_slave_status[i];
const auto old_row = old_slave_status[i];
// Strictly speaking, the following should depend on the 'assume_unique_hostnames',
// but the situations it would make a difference are so rare they can be ignored.
if (new_row.slave_io_running != old_row.slave_io_running
|| new_row.slave_sql_running != old_row.slave_sql_running
|| new_row.settings.master_endpoint != old_row.settings.master_endpoint
|| new_row.settings.name != old_row.settings.name
|| new_row.master_server_id != old_row.master_server_id)
if (!new_row.equal(old_row))
{
rval = false;
break;