From dd9ff277430a389003ea4d659c31874027f5514c Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Fri, 14 Sep 2018 15:32:31 +0300 Subject: [PATCH] MXS-1845 Rewrite server promotion code In progress, does not yet overwrite existing code. The new promotion mechanism automatically retries queries which timed out. It also handles multimaster situations correctly. --- include/maxscale/mysql_utils.h | 20 +- server/core/mysql_utils.cc | 15 +- .../mariadbmon/cluster_manipulation.cc | 2 + .../monitor/mariadbmon/mariadbmon_common.cc | 3 + .../monitor/mariadbmon/mariadbmon_common.hh | 3 + .../monitor/mariadbmon/mariadbserver.cc | 261 +++++++++++++++++- .../monitor/mariadbmon/mariadbserver.hh | 50 +++- 7 files changed, 331 insertions(+), 23 deletions(-) diff --git a/include/maxscale/mysql_utils.h b/include/maxscale/mysql_utils.h index 24203072f..c24d98704 100644 --- a/include/maxscale/mysql_utils.h +++ b/include/maxscale/mysql_utils.h @@ -42,16 +42,15 @@ char* mxs_lestr_consume(uint8_t** c, size_t* size); MYSQL* mxs_mysql_real_connect(MYSQL* mysql, SERVER* server, const char* user, const char* passwd); /** - * Check if the MYSQL error number is a connection error + * Check if the MYSQL error number is a connection error. * + * @param Error code * @return True if the MYSQL error number is a connection error */ -bool mxs_mysql_is_net_error(int errcode); +bool mxs_mysql_is_net_error(unsigned int errcode); /** - * Execute a query - * - * This function wraps mysql_query in a way that automatic query retry is possible. + * Execute a query using global query retry settings. * * @param conn MySQL connection * @param query Query to execute @@ -60,6 +59,17 @@ bool mxs_mysql_is_net_error(int errcode); */ int mxs_mysql_query(MYSQL* conn, const char* query); +/** + * Execute a query, manually defining retry limits. + * + * @param conn MySQL connection + * @param query Query to execute + * @param query_retries Maximum number of retries + * @param query_retry_timeout Maximum time to spend retrying, in seconds + * @return return value of mysql_query + */ +int mxs_mysql_query_ex(MYSQL* conn, const char* query, int query_retries, time_t query_retry_timeout); + /** * Trim MySQL quote characters surrounding a string. * diff --git a/server/core/mysql_utils.cc b/server/core/mysql_utils.cc index a20a949e0..cd0ff41d7 100644 --- a/server/core/mysql_utils.cc +++ b/server/core/mysql_utils.cc @@ -216,7 +216,7 @@ MYSQL* mxs_mysql_real_connect(MYSQL* con, SERVER* server, const char* user, cons return mysql; } -bool mxs_mysql_is_net_error(int errcode) +bool mxs_mysql_is_net_error(unsigned int errcode) { switch (errcode) { @@ -234,15 +234,14 @@ bool mxs_mysql_is_net_error(int errcode) } } -int mxs_mysql_query(MYSQL* conn, const char* query) +int mxs_mysql_query_ex(MYSQL* conn, const char* query, int query_retries, time_t query_retry_timeout) { - MXS_CONFIG* cnf = config_get_global_options(); time_t start = time(NULL); int rc = mysql_query(conn, query); - for (int n = 0; rc != 0 && n < cnf->query_retries + for (int n = 0; rc != 0 && n < query_retries && mxs_mysql_is_net_error(mysql_errno(conn)) - && time(NULL) - start < cnf->query_retry_timeout; n++) + && time(NULL) - start < query_retry_timeout; n++) { rc = mysql_query(conn, query); } @@ -260,6 +259,12 @@ int mxs_mysql_query(MYSQL* conn, const char* query) return rc; } +int mxs_mysql_query(MYSQL* conn, const char* query) +{ + MXS_CONFIG* cnf = config_get_global_options(); + return mxs_mysql_query_ex(conn, query, cnf->query_retries, cnf->query_retry_timeout); +} + const char* mxs_mysql_get_value(MYSQL_RES* result, MYSQL_ROW row, const char* key) { MYSQL_FIELD* f = mysql_fetch_fields(result); diff --git a/server/modules/monitor/mariadbmon/cluster_manipulation.cc b/server/modules/monitor/mariadbmon/cluster_manipulation.cc index 1dee2191c..b484a830e 100644 --- a/server/modules/monitor/mariadbmon/cluster_manipulation.cc +++ b/server/modules/monitor/mariadbmon/cluster_manipulation.cc @@ -1510,6 +1510,7 @@ unique_ptr MariaDBMonitor::failover_prepare(Log log_mode, json rval.reset(new ClusterOperation(OperationType::FAILOVER, promotion_target, demotion_target, demotion_target == m_master, m_handle_event_scheduler, + m_promote_sql_file, m_demote_sql_file, error_out, time_limit)); } } @@ -1851,6 +1852,7 @@ unique_ptr MariaDBMonitor::switchover_prepare(SERVER* promotio rval.reset(new ClusterOperation(op_type, promotion_target, demotion_target, demotion_target == m_master, m_handle_event_scheduler, + m_promote_sql_file, m_demote_sql_file, error_out, time_limit)); } return rval; diff --git a/server/modules/monitor/mariadbmon/mariadbmon_common.cc b/server/modules/monitor/mariadbmon/mariadbmon_common.cc index 6ef2d5953..03197385b 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon_common.cc +++ b/server/modules/monitor/mariadbmon/mariadbmon_common.cc @@ -36,12 +36,15 @@ void DelimitedPrinter::cat(string& target, const string& addition) ClusterOperation::ClusterOperation(OperationType type, MariaDBServer* promotion_target, MariaDBServer* demotion_target, bool demo_target_is_master, bool handle_events, + string& promotion_sql_file, string& demotion_sql_file, json_t** error, maxbase::Duration time_remaining) : type(type) , promotion_target(promotion_target) , demotion_target(demotion_target) , demotion_target_is_master(demo_target_is_master) , handle_events(handle_events) + , promotion_sql_file(promotion_sql_file) + , demotion_sql_file(demotion_sql_file) , error_out(error) , time_remaining(time_remaining) {} diff --git a/server/modules/monitor/mariadbmon/mariadbmon_common.hh b/server/modules/monitor/mariadbmon/mariadbmon_common.hh index 24566dbc0..61f9758e2 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon_common.hh +++ b/server/modules/monitor/mariadbmon/mariadbmon_common.hh @@ -94,11 +94,14 @@ public: MariaDBServer* const demotion_target; // Which server will be demoted const bool demotion_target_is_master; // Was the demotion target the master? const bool handle_events; // Should scheduled server events be disabled/enabled? + const std::string promotion_sql_file; // SQL commands ran on a server promoted to master + const std::string demotion_sql_file; // SQL commands ran on a server demoted from master json_t** const error_out; // Json error output maxbase::Duration time_remaining; // How much time remains to complete the operation ClusterOperation(OperationType type, MariaDBServer* promotion_target, MariaDBServer* demotion_target, bool demo_target_is_master, bool handle_events, + std::string& promotion_sql_file, std::string& demotion_sql_file, json_t** error, maxbase::Duration time_remaining); }; diff --git a/server/modules/monitor/mariadbmon/mariadbserver.cc b/server/modules/monitor/mariadbmon/mariadbserver.cc index 89cf7046e..7c6612254 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.cc +++ b/server/modules/monitor/mariadbmon/mariadbserver.cc @@ -12,6 +12,7 @@ */ #include "mariadbserver.hh" +#include "maxbase/stopwatch.hh" #include #include @@ -25,6 +26,8 @@ using std::string; using std::chrono::steady_clock; using maxscale::string_printf; +using maxbase::Duration; +using maxbase::StopWatch; namespace { @@ -108,11 +111,30 @@ std::unique_ptr MariaDBServer::execute_query(const string& query, s return rval; } -bool MariaDBServer::execute_cmd(const string& cmd, std::string* errmsg_out) +/** + * Execute a query which does not return data. If the query returns data, an error is returned. + * + * @param cmd The query + * @param mode Retry a failed query using the global query retry settings or not + * @param errmsg_out Error output. + * @return True on success, false on error or if query returned data + */ +bool MariaDBServer::execute_cmd_ex(const string& cmd, QueryRetryMode mode, + std::string* errmsg_out, unsigned int* errno_out) { - bool rval = false; auto conn = m_server_base->con; - if (mxs_mysql_query(conn, cmd.c_str()) == 0) + bool query_success = false; + if (mode == QueryRetryMode::ENABLED) + { + query_success = (mxs_mysql_query(conn, cmd.c_str()) == 0); + } + else + { + query_success = (mxs_mysql_query_ex(conn, cmd.c_str(), 0, 0) == 0); + } + + bool rval = false; + if (query_success) { MYSQL_RES* result = mysql_store_result(conn); if (result == NULL) @@ -125,18 +147,77 @@ bool MariaDBServer::execute_cmd(const string& cmd, std::string* errmsg_out) int rows = mysql_num_rows(result); *errmsg_out = string_printf("Query '%s' returned %d columns and %d rows of data when none " "was expected.", - cmd.c_str(), - cols, - rows); + cmd.c_str(), cols, rows); } } - else if (errmsg_out) + else { - *errmsg_out = string_printf("Query '%s' failed: '%s'.", cmd.c_str(), mysql_error(conn)); + if (errmsg_out) + { + *errmsg_out = string_printf("Query '%s' failed: '%s'.", cmd.c_str(), mysql_error(conn)); + } + if (errno_out) + { + *errno_out = mysql_errno(conn); + } } return rval; } +bool MariaDBServer::execute_cmd(const std::string& cmd, std::string* errmsg_out) +{ + return execute_cmd_ex(cmd, QueryRetryMode::ENABLED, errmsg_out); +} + +bool MariaDBServer::execute_cmd_no_retry(const std::string& cmd, + std::string* errmsg_out, unsigned int* errno_out) +{ + return execute_cmd_ex(cmd, QueryRetryMode::DISABLED, errmsg_out, errno_out); +} + +/** + * Execute a query which does not return data. If the query fails because of a network error + * (e.g. Connector-C timeout), automatically retry the query until time is up. + * + * @param cmd The query to execute. Should be a query with a predictable effect even when retried or + * ran several times. + * @param time_limit How long to retry + * @param errmsg_out Error output + * @return True, if successful. + */ +bool MariaDBServer::execute_cmd_time_limit(const std::string& cmd, maxbase::Duration time_limit, + std::string* errmsg_out) +{ + StopWatch timer; + // Even if time is up, try at least once. + bool cmd_success = false; + bool keep_trying = true; + while (!cmd_success && keep_trying) + { + string error_msg; + unsigned int errornum = 0; + cmd_success = execute_cmd_no_retry(cmd, &error_msg, &errornum); + + // Check if there is time to retry. + Duration time_remaining = time_limit - timer.lap(); + keep_trying = (mxs_mysql_is_net_error(errornum) && (time_remaining.secs() > 0)); + if (!cmd_success) + { + if (keep_trying) + { + MXS_WARNING("Query '%s' failed on '%s': %s Retrying with %.1f seconds left.", + cmd.c_str(), name(), error_msg.c_str(), time_remaining.secs()); + } + else if (errmsg_out) + { + *errmsg_out = string_printf("Query '%s' failed on '%s': %s", + cmd.c_str(), name(), error_msg.c_str()); + } + } + } + return cmd_success; +} + bool MariaDBServer::do_show_slave_status(string* errmsg_out) { unsigned int columns = 0; @@ -1335,6 +1416,170 @@ bool MariaDBServer::reset_all_slave_conns(json_t** error_out) return !error; } +bool MariaDBServer::promote_v2(ClusterOperation* op) +{ + bool success = false; + json_t** const error_out = op->error_out; + + // Function should only be called for a master-slave pair. + auto master_conn = slave_connection_status(op->demotion_target); + mxb_assert(master_conn); + if (master_conn == NULL) + { + PRINT_MXS_JSON_ERROR(error_out, + "'%s' is not a slave of '%s' and cannot be promoted to its place.", + name(), op->demotion_target->name()); + return false; + } + + StopWatch timer; + + // Step 1: Stop & reset slave connections. If doing a failover, only remove the connection to demotion + // target. In case of switchover, remove other slave connections as well since the demotion target + // will take them over. + bool stop_slave_error = false; + + // Helper function for stopping a slave connection and setting error. + auto stop_slave_helper = [this, &timer, &stop_slave_error, op, error_out](const string& conn_name) { + if (!stop_slave_conn(conn_name, StopMode::RESET_ALL, op->time_remaining, error_out)) + { + stop_slave_error = true; + } + op->time_remaining -= timer.restart(); + }; + + if (op->type == OperationType::SWITCHOVER) + { + for (size_t i = 0; !stop_slave_error && i < m_slave_status.size(); i++) + { + stop_slave_helper(m_slave_status[i].name); + } + } + else + { + stop_slave_helper(master_conn->name); + } + + if (!stop_slave_error) + { + // Step 2: If demotion target is master, meaning this server will become the master, + // enable writing and scheduled events. Also, run promotion_sql_file. + bool promotion_error = false; + if (op->demotion_target_is_master) + { + // Disabling read-only should be quick. + bool ro_disabled = set_read_only(ReadOnlySetting::DISABLE, op->time_remaining, error_out); + op->time_remaining -= timer.restart(); + if (!ro_disabled) + { + promotion_error = true; + } + else + { + if (op->handle_events) + { + // TODO: Add query replying to enable_events + bool events_enabled = enable_events(error_out); + op->time_remaining -= timer.restart(); + if (!events_enabled) + { + promotion_error = true; + PRINT_MXS_JSON_ERROR(error_out, "Failed to enable events on '%s'.", name()); + } + } + + // Run promotion_sql_file if no errors so far. + if (!promotion_error && !op->promotion_sql_file.empty()) + { + bool file_ran_ok = run_sql_from_file(op->promotion_sql_file, error_out); + op->time_remaining -= timer.restart(); + if (!file_ran_ok) + { + promotion_error = true; + PRINT_MXS_JSON_ERROR(error_out, + "Execution of file '%s' failed during promotion of server '%s'.", + op->promotion_sql_file.c_str(), name()); + } + } + } + } + + // Step 3: Copy slave connections from demotion target. If demotion target was replicating from + // this server (circular topology), the connection should be ignored. Also, connections which + // already exist on this server need not be regenerated. + if (!promotion_error) + { + // TODO + } + } + return success; +} + +bool MariaDBServer::stop_slave_conn(const string& conn_name, StopMode mode, Duration time_limit, + json_t** error_out) +{ + /* STOP SLAVE is a bit problematic, since sometimes it seems to take several seconds to complete. + * If this time is greater than the connection read timeout, connector-c will cut the connection/ + * query. The query is likely completed afterwards by the server. To prevent false errors, + * try the query repeatedly until time is up. Fortunately, the server doesn't consider stopping + * an already stopped slave connection an error. */ + Duration time_left = time_limit; + StopWatch timer; + string stop = string_printf("STOP SLAVE '%s';", conn_name.c_str()); + string error_msg; + bool stop_success = execute_cmd_time_limit(stop, time_left, &error_msg); + time_left -= timer.restart(); + + bool rval = false; + if (stop_success) + { + // The RESET SLAVE-query can also take a while if there is lots of relay log to delete. + // Very rare, though. + if (mode == StopMode::RESET || mode == StopMode::RESET_ALL) + { + string reset = string_printf("RESET SLAVE '%s'%s;", + conn_name.c_str(), (mode == StopMode::RESET_ALL) ? " ALL" : ""); + if (execute_cmd_time_limit(reset, time_left, &error_msg)) + { + rval = true; + } + else + { + PRINT_MXS_JSON_ERROR(error_out, + "Failed to reset slave connection on '%s': %s", + name(), error_msg.c_str()); + } + } + else + { + rval = true; + } + } + else + { + PRINT_MXS_JSON_ERROR(error_out, + "Failed to stop slave connection on '%s': %s", + name(), error_msg.c_str()); + } + return rval; +} + +bool MariaDBServer::set_read_only(ReadOnlySetting setting, maxbase::Duration time_limit, json_t** error_out) +{ + int new_val = (setting == ReadOnlySetting::ENABLE) ? 1 : 0; + string cmd = string_printf("SET GLOBAL read_only=%i;", new_val); + string error_msg; + bool success = execute_cmd_time_limit(cmd, time_limit, &error_msg); + if (!success) + { + string target_str = (setting == ReadOnlySetting::ENABLE) ? "enable" : "disable"; + PRINT_MXS_JSON_ERROR(error_out, + "Failed to %s read_only on '%s': %s", + target_str.c_str(), name(), error_msg.c_str()); + } + return success; +} + string SlaveStatus::to_string() const { // Print all of this on the same line to make things compact. Are the widths reasonable? The format is diff --git a/server/modules/monitor/mariadbmon/mariadbserver.hh b/server/modules/monitor/mariadbmon/mariadbserver.hh index 4e4cb2aeb..15447bc7d 100644 --- a/server/modules/monitor/mariadbmon/mariadbserver.hh +++ b/server/modules/monitor/mariadbmon/mariadbserver.hh @@ -17,6 +17,7 @@ #include #include #include +#include #include "gtid.hh" class QueryResult; @@ -207,14 +208,16 @@ public: std::unique_ptr execute_query(const std::string& query, std::string* errmsg_out = NULL); /** - * Execute a query which does not return data. If the query returns data, an error is returned. - * - * @param cmd The query - * @param errmsg_out Error output. Can be null. - * @return True on success, false on error or if query returned data + * execute_cmd_ex with query retry ON. */ bool execute_cmd(const std::string& cmd, std::string* errmsg_out = NULL); + /** + * execute_cmd_ex with query retry OFF. + */ + bool execute_cmd_no_retry(const std::string& cmd, + std::string* errmsg_out = NULL, unsigned int* errno_out = NULL); + /** * Update server slave connection information. * @@ -488,10 +491,36 @@ public: */ bool reset_all_slave_conns(json_t** error_out); + /** + * Promote this server to take role of demotion target. Remove slave connections from this server. + * If target is/was a master, set read-only to OFF. Copy slave connections from target. + * + * @param op Cluster operation descriptor + * @return True if successful + */ + bool promote_v2(ClusterOperation* operation); + private: class EventInfo; typedef std::function ManipulatorFunc; + enum class StopMode + { + STOP_ONLY, + RESET, + RESET_ALL + }; + enum class QueryRetryMode + { + ENABLED, + DISABLED + }; + enum class ReadOnlySetting + { + ENABLE, + DISABLE + }; + bool update_slave_status(std::string* errmsg_out = NULL); bool sstatus_array_topology_equal(const SlaveStatusArray& new_slave_status); const SlaveStatus* sstatus_find_previous_row(const SlaveStatus& new_row, size_t guess); @@ -499,6 +528,17 @@ private: bool events_foreach(ManipulatorFunc& func, json_t** error_out); bool alter_event(const EventInfo& event, const std::string& target_status, json_t** error_out); + + bool stop_slave_conn(const std::string& conn_name, StopMode mode, maxbase::Duration time_limit, + json_t** error_out); + + bool execute_cmd_ex(const std::string& cmd, QueryRetryMode mode, + std::string* errmsg_out = NULL, unsigned int* errno_out = NULL); + + bool execute_cmd_time_limit(const std::string& cmd, maxbase::Duration time_limit, + std::string* errmsg_out); + + bool set_read_only(ReadOnlySetting value, maxbase::Duration time_limit, json_t** error_out); }; /**