diff --git a/maxscale-system-test/verify_master_failure.cpp b/maxscale-system-test/verify_master_failure.cpp index aaf4e1d09..56685f683 100644 --- a/maxscale-system-test/verify_master_failure.cpp +++ b/maxscale-system-test/verify_master_failure.cpp @@ -14,7 +14,7 @@ int main(int argc, char* argv[]) test.tprintf("Blocking master and checking that master failure is delayed at least once."); test.repl->block_node(0); sleep(5); - test.log_includes(0, "delaying.*failover"); + test.log_includes(0, "Delaying failover"); test.tprintf("Waiting to see if failover is performed."); sleep(10); diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index a62f4ac0e..db66693e5 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -1340,7 +1340,8 @@ void MariaDBMonitor::handle_auto_failover() int master_down_count = m_master->m_server_base->mon_err_count; const MariaDBServer* connected_slave = NULL; - maxbase::Duration event_age; + Duration event_age; + Duration delay_time; if (m_failcount > 1 && m_warn_master_down) { @@ -1352,11 +1353,11 @@ void MariaDBMonitor::handle_auto_failover() } // If master seems to be down, check if slaves are receiving events. else if (m_verify_master_failure - && (connected_slave = slave_receiving_events(m_master, &event_age)) != NULL) + && (connected_slave = slave_receiving_events(m_master, &event_age, &delay_time)) != NULL) { - MXS_NOTICE("Slave '%s' is still connected to '%s' and received a new gtid or heartbeat event %.1f " - "seconds ago. Delaying failover.", - connected_slave->name(), m_master->name(), event_age.secs()); + MXS_NOTICE("Slave %s is still connected to %s and received a new gtid or heartbeat event %.1f " + "seconds ago. Delaying failover for at least %.1f seconds.", + connected_slave->name(), m_master->name(), event_age.secs(), delay_time.secs()); } else if (master_down_count >= m_failcount) { @@ -1468,10 +1469,11 @@ void MariaDBMonitor::check_cluster_operations_support() * @return The first connected slave or NULL if none found */ const MariaDBServer* MariaDBMonitor::slave_receiving_events(const MariaDBServer* demotion_target, - maxbase::Duration* event_age_out) + Duration* event_age_out, Duration* delay_out) { - auto time_now = maxbase::Clock::now(); - maxbase::Clock::time_point alive_after = time_now - std::chrono::seconds(m_master_failure_timeout); + Duration event_timeout(static_cast(m_master_failure_timeout)); + auto current_time = maxbase::Clock::now(); + maxbase::Clock::time_point recent_event_time = current_time - event_timeout; const MariaDBServer* connected_slave = NULL; for (MariaDBServer* slave : demotion_target->m_node.children) @@ -1480,12 +1482,14 @@ const MariaDBServer* MariaDBMonitor::slave_receiving_events(const MariaDBServer* if (slave->is_running() && (slave_conn = slave->slave_connection_status(demotion_target)) != NULL && slave_conn->slave_io_running == SlaveStatus::SLAVE_IO_YES - && slave_conn->last_data_time >= alive_after) + && slave_conn->last_data_time >= recent_event_time) { // The slave is still connected to the correct master and has received events. This means that // while MaxScale can't connect to the master, it's probably still alive. connected_slave = slave; - *event_age_out = time_now - slave_conn->last_data_time; + auto latest_event_age = current_time - slave_conn->last_data_time; + *event_age_out = latest_event_age; + *delay_out = event_timeout - latest_event_age; break; } } diff --git a/server/modules/monitor/mariadbmon/mariadbmon.hh b/server/modules/monitor/mariadbmon/mariadbmon.hh index 193ab7cf4..a6e355aa5 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon.hh @@ -241,7 +241,8 @@ private: std::unique_ptr failover_prepare(Log log_mode, json_t** error_out); bool failover_perform(ClusterOperation& operation); const MariaDBServer* slave_receiving_events(const MariaDBServer* demotion_target, - maxbase::Duration* event_age_out); + maxbase::Duration* event_age_out, + maxbase::Duration* delay_out); bool manual_failover(json_t** output); void handle_auto_failover();