From 715eaf6760c82143e2653ea7f15e36c3c2c5e08d Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Thu, 15 Nov 2018 14:45:12 +0200 Subject: [PATCH 1/4] MXS-1598 Explain how monitor measures replication lag --- Documentation/Monitors/MariaDB-Monitor.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/Monitors/MariaDB-Monitor.md b/Documentation/Monitors/MariaDB-Monitor.md index 1847375c4..887d60fba 100644 --- a/Documentation/Monitors/MariaDB-Monitor.md +++ b/Documentation/Monitors/MariaDB-Monitor.md @@ -141,6 +141,9 @@ switchover and rejoin-specific parameters are listed in their own Deprecated and unused as of MaxScale 2.3. Can be defined but is ignored. +Is effectively always on. The monitor uses the "Seconds_Behind_Master"-field of +"SHOW SLAVE STATUS" to get the replication lag. + ### `detect_stale_master` Allow previous master to be available even in case of stopped or misconfigured From 6a1cfddb43280319d7e0951877e605f41a041ee0 Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Thu, 15 Nov 2018 12:22:01 +0200 Subject: [PATCH 2/4] MXS-2158 Clean up gtid updating during rejoin Error messages from update_gtids() are now printed. can_replicate_from() no longer updates gtid:s. 
--- .../mariadbmon/cluster_manipulation.cc | 24 +++++------ .../monitor/mariadbmon/mariadbserver.cc | 43 +++++++++---------- .../monitor/mariadbmon/mariadbserver.hh | 10 +++-- 3 files changed, 37 insertions(+), 40 deletions(-) diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index 9f6257be6..e376a404e 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -118,7 +118,8 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) if (server_is_rejoin_suspect(slave_cand, output)) { - if (m_master->update_gtids()) + string gtid_update_error; + if (m_master->update_gtids(&gtid_update_error)) { string no_rejoin_reason; if (slave_cand->can_replicate_from(m_master, &no_rejoin_reason)) @@ -138,18 +139,15 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) else { PRINT_MXS_JSON_ERROR(output, - "Server '%s' cannot replicate from cluster master '%s': " - "%s.", - rejoin_serv_name, - m_master->name(), - no_rejoin_reason.c_str()); + "%s cannot replicate from cluster master %s: %s.", + rejoin_serv_name, m_master->name(), no_rejoin_reason.c_str()); } } else { PRINT_MXS_JSON_ERROR(output, - "Cluster master '%s' gtid info could not be updated.", - m_master->name()); + "The GTIDs of master server %s could not be updated: %s", + m_master->name(), gtid_update_error.c_str()); } } // server_is_rejoin_suspect has added any error messages to the output, no need to print here } @@ -687,7 +685,8 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output) bool comm_ok = true; if (!suspects.empty()) { - if (m_master->update_gtids()) + string gtid_update_error; + if (m_master->update_gtids(&gtid_update_error)) { for (size_t i = 0; i < suspects.size(); i++) { @@ -710,6 +709,8 @@ bool MariaDBMonitor::get_joinable_servers(ServerArray* output) } else { + MXS_ERROR("The GTIDs of master server 
%s could not be updated while attempting an automatic " + "rejoin: %s", m_master->name(), gtid_update_error.c_str()); comm_ok = false; } } @@ -1760,10 +1761,7 @@ void MariaDBMonitor::handle_auto_rejoin() MXS_NOTICE("%d server(s) redirected or rejoined the cluster.", joins); } } - else - { - MXS_ERROR("Query error to master '%s' prevented a possible rejoin operation.", m_master->name()); - } + // get_joinable_servers prints an error if master is unresponsive } void MariaDBMonitor::report_and_disable(const string& operation, const string& setting_name, diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index 0ef954c2b..c91e62374 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -693,35 +693,32 @@ json_t* MariaDBServer::to_json() const return result; } -bool MariaDBServer::can_replicate_from(MariaDBServer* master, string* error_out) +bool MariaDBServer::can_replicate_from(MariaDBServer* master, string* reason_out) { - bool rval = false; - if (update_gtids()) + mxb_assert(reason_out); + mxb_assert(is_usable()); // The server must be running. 
+ + bool can_replicate = false; + if (m_gtid_current_pos.empty()) { - if (m_gtid_current_pos.empty()) - { - *error_out = string("'") + name() + "' does not have a valid 'gtid_current_pos'."; - } - else if (master->m_gtid_binlog_pos.empty()) - { - *error_out = string("'") + master->name() + "' does not have a valid 'gtid_binlog_pos'."; - } - else - { - rval = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos); - if (!rval) - { - *error_out = string("gtid_current_pos of '") + name() + "' (" - + m_gtid_current_pos.to_string() + ") is incompatible with gtid_binlog_pos of '" - + master->name() + "' (" + master->m_gtid_binlog_pos.to_string() + ")."; - } - } + *reason_out = string_printf("%s does not have a valid gtid_current_pos.", name()); + } + else if (master->m_gtid_binlog_pos.empty()) + { + *reason_out = string_printf("%s does not have a valid gtid_binlog_pos.", master->name()); } else { - *error_out = string("Server '") + name() + "' could not be queried."; + can_replicate = m_gtid_current_pos.can_replicate_from(master->m_gtid_binlog_pos); + if (!can_replicate) + { + *reason_out = string_printf("gtid_current_pos of %s (%s) is incompatible with " + "gtid_binlog_pos of %s (%s).", + name(), m_gtid_current_pos.to_string().c_str(), + master->name(), master->m_gtid_binlog_pos.to_string().c_str()); + } } - return rval; + return can_replicate; } bool MariaDBServer::redirect_one_slave(const string& change_cmd) diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index c35d4b016..e13c0cba5 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -261,14 +261,16 @@ public: const SlaveStatus* slave_connection_status_host_port(const MariaDBServer* target) const; /** - * Checks if this server can replicate from master. Only considers gtid:s and only detects obvious errors. 
- * The non-detected errors will mostly be detected once the slave tries to start replicating. + * Checks if this server can replicate from master. Only considers gtid:s and only detects obvious + * errors. The non-detected errors will mostly be detected once the slave tries to start replicating. + * Before calling this, update the gtid:s of the master so that the gtid:s of the master are more + * recent than those of this server. * * @param master_info Master server - * @param error_out Details the reason for a negative result + * @param reason_out Details the reason for a negative result * @return True if slave can replicate from master */ - bool can_replicate_from(MariaDBServer* master, std::string* error_out); + bool can_replicate_from(MariaDBServer* master, std::string* reason_out); /** * Redirect one slave server to another master From bba0bc0f3147de80306db5a302d3d9bd708fc1e2 Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Mon, 12 Nov 2018 12:53:54 +0200 Subject: [PATCH 3/4] MXS-2158 Relax requirements for manual rejoin The operation is now allowed even if the rejoining server has empty gtid:s. Auto-rejoin keeps the safeties on. --- Documentation/Monitors/MariaDB-Monitor.md | 17 ++++--- .../mariadbmon/cluster_manipulation.cc | 44 +++++++++++++------ 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/Documentation/Monitors/MariaDB-Monitor.md b/Documentation/Monitors/MariaDB-Monitor.md index 887d60fba..78eb4510c 100644 --- a/Documentation/Monitors/MariaDB-Monitor.md +++ b/Documentation/Monitors/MariaDB-Monitor.md @@ -368,12 +368,17 @@ operations. ### Manual activation Cluster operations can be activated manually through the REST API, MaxCtrl or -MaxAdmin. The commands are only performed when MaxScale is in active mode. All -commands require the monitor instance name as the first parameter. Failover +MaxAdmin. The commands are only performed when MaxScale is in active mode. The +commands generally match their automatic versions. 
The exception is _rejoin_, in +which the manual command allows rejoining even when the joining server has empty +gtid:s. This rule allows the user to force a rejoin on a server without binary +logs. + +All commands require the monitor instance name as the first parameter. Failover selects the new master server automatically and does not require additional parameters. Rejoin requires the name of the joining server as second parameter. -Replication reset accepts the name of the new master server as second -parameter. If not given, the current master is selected. +Replication reset accepts the name of the new master server as second parameter. +If not given, the current master is selected. Switchover takes one to three parameters. If only the monitor name is given, switchover will autoselect both the slave to promote and the current master as @@ -401,8 +406,8 @@ to demote (OldMasterServ). For rejoin, the server to join (OldMasterServ) is required. Replication reset requires the server to promote (NewMasterServ). It is safe to perform manual operations even with automatic failover, switchover -or rejoin enabled since the automatic operations cannot happen simultaneously -with the manual one. +or rejoin enabled since automatic operations cannot happen simultaneously +with manual ones. If a switchover or failover fails, automatic failover is disabled to prevent master changes to a possibly malfunctioning cluster. 
Automatic failover can be diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index e376a404e..c18220705 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -115,17 +115,42 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) if (mon_slave_cand) { MariaDBServer* slave_cand = get_server_info(mon_slave_cand); - if (server_is_rejoin_suspect(slave_cand, output)) { string gtid_update_error; if (m_master->update_gtids(&gtid_update_error)) { + // The manual version of rejoin does not need to be as careful as the automatic one. + // The rules are mostly the same, the only difference is that a server with empty gtid:s + // can be rejoined manually. + // TODO: Add the warning to JSON output. string no_rejoin_reason; - if (slave_cand->can_replicate_from(m_master, &no_rejoin_reason)) + bool safe_rejoin = slave_cand->can_replicate_from(m_master, &no_rejoin_reason); + bool empty_gtid = slave_cand->m_gtid_current_pos.empty(); + bool rejoin_allowed = false; + if (safe_rejoin) { - ServerArray joinable_server; - joinable_server.push_back(slave_cand); + rejoin_allowed = true; + } + else + { + if (empty_gtid) + { + rejoin_allowed = true; + MXB_WARNING("gtid_curren_pos of %s is empty. 
Manual rejoin is unsafe " + "but allowed.", rejoin_serv_name); + } + else + { + PRINT_MXS_JSON_ERROR(output, "%s cannot replicate from master server %s: %s", + rejoin_serv_name, m_master->name(), + no_rejoin_reason.c_str()); + } + } + + if (rejoin_allowed) + { + ServerArray joinable_server = {slave_cand}; if (do_rejoin(joinable_server, output) == 1) { rval = true; @@ -136,12 +161,6 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) PRINT_MXS_JSON_ERROR(output, "Rejoin attempted but failed."); } } - else - { - PRINT_MXS_JSON_ERROR(output, - "%s cannot replicate from cluster master %s: %s.", - rejoin_serv_name, m_master->name(), no_rejoin_reason.c_str()); - } } else { @@ -153,8 +172,7 @@ bool MariaDBMonitor::manual_rejoin(SERVER* rejoin_server, json_t** output) } else { - PRINT_MXS_JSON_ERROR(output, - "The given server '%s' is not monitored by this monitor.", + PRINT_MXS_JSON_ERROR(output, "The given server '%s' is not monitored by this monitor.", rejoin_serv_name); } } @@ -1862,4 +1880,4 @@ MariaDBMonitor::FailoverParams::FailoverParams(const ServerOperation& promotion, , demotion_target(demotion_target) , general(general) { -} \ No newline at end of file +} From 35b3dd4dd8d0e3e00c1520b97ff92a33b00ab9de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Fri, 16 Nov 2018 15:45:05 +0200 Subject: [PATCH 4/4] Install systemd libraries Install systemd libraries in the install script. 
--- BUILD/install_build_deps.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/BUILD/install_build_deps.sh b/BUILD/install_build_deps.sh index 64d54048e..4bd055e60 100755 --- a/BUILD/install_build_deps.sh +++ b/BUILD/install_build_deps.sh @@ -16,7 +16,7 @@ then build-essential libssl-dev ncurses-dev bison flex \ perl libtool libpcre3-dev tcl tcl-dev uuid \ uuid-dev libsqlite3-dev liblzma-dev libpam0g-dev pkg-config \ - libedit-dev + libedit-dev libsystemd-dev ## separatelibgnutls installation process for Ubuntu Trusty cat /etc/*release | grep -E "Trusty|wheezy" @@ -71,6 +71,9 @@ else # Attempt to install libasan, it'll only work on CentOS 7 sudo yum install -y --nogpgcheck libasan + # Attempt to install systemd-devel, doesn't work on CentOS 6 + sudo yum install -y systemd-devel + grep "release 6" /etc/redhat-release if [ $? == 0 ] then