MXS-1490: Cleanup error printing, add json error

Slave redirection is a special case, as there the total failure is only known after all redirects have been attempted. In the failure case, all errors from connections are gathered to one message.
2017-11-16 16:39:34 +02:00
parent 204fa17322
commit 59616b5f3e
1 changed files with 121 additions and 53 deletions
--- a/server/modules/monitor/mysqlmon/mysql_mon.cc
+++ b/server/modules/monitor/mysqlmon/mysql_mon.cc
@ -56,6 +56,16 @@
 #define SLAVE_HOSTS_HOSTNAME 1
 #define SLAVE_HOSTS_PORT 2

+/** Utility macro for printing both MXS_ERROR and json error */
+#define PRINT_MXS_JSON_ERROR(err_out, format, ...)\
+    do {\
+       MXS_ERROR(format, ##__VA_ARGS__);\
+       if (err_out)\
+       {\
+            *err_out = mxs_json_error_append(*err_out, format, ##__VA_ARGS__);\
+       }\
+    } while (false)
+
 using std::string;
 typedef std::vector<MXS_MONITORED_SERVER*> ServerVector;
 typedef std::vector<string> StringVector;
@ -77,7 +87,7 @@ static bool isMySQLEvent(mxs_monitor_event_t event);
 void check_maxscale_schema_replication(MXS_MONITOR *monitor);
 static bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout);
 static bool do_failover(MYSQL_MONITOR* mon);
-static bool do_switchover(MYSQL_MONITOR* mon);
+static bool do_switchover(MYSQL_MONITOR* mon, json_t** err_out);
 static bool update_gtids(MXS_MONITORED_SERVER *database, int64_t domain, MySqlServerInfo* info);
 static bool update_replication_settings(MXS_MONITORED_SERVER *database, MySqlServerInfo* info);

@ -294,7 +304,7 @@ bool mysql_switchover(MXS_MONITOR* mon, SERVER* new_master, SERVER* current_mast
    if (rv)
    {
        bool failover = config_get_bool(mon->parameters, CN_FAILOVER);
-        rv = do_switchover(handle);
+        rv = do_switchover(handle, output);

        if (rv)
        {
@ -2943,9 +2953,12 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, uint32_t failover_timeout)
 *
 * @param mon The monitor
 * @param out_slaves Vector for storing slave servers, can be NULL
+ * @param err_out json object for error printing. Can be NULL.
 * @return The found master, or NULL if not found
 */
-MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon, ServerVector* out_slaves)
+MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon,
+                                                 ServerVector* out_slaves,
+                                                 json_t** err_out)
 {
    /* Select a new master candidate. Selects the one with the latest event in relay log.
     * If multiple slaves have same number of events, select the one with most processed events. */
@ -2965,24 +2978,27 @@ MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon, ServerVecto
            {
                out_slaves->push_back(mon_server);
            }
+            char* servername = mon_server->server->unique_name;
            if (cand_info->rpl_settings.log_bin == false)
            {
-                MXS_WARNING("Failover: Slave '%s' has binary log disabled and is not a valid promotion "
-                        "candidate.", mon_server->server->unique_name);
+                const char NO_BINLOG[] = "Slave '%s' has binary log disabled and is not a valid promotion "
+                                         "candidate.";
+                MXS_WARNING(NO_BINLOG, servername);
                continue;
            }
            if (cand_info->rpl_settings.gtid_strict_mode == false)
            {
-                MXS_WARNING("Failover: Slave '%s' has gtid_strict_mode disabled. Enabling this setting is "
+                const char NO_STRICT[] = "Slave '%s' has gtid_strict_mode disabled. Enabling this setting is "
                                         "recommended. For more information, see "
-                        "https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode",
-                        mon_server->server->unique_name);
+                                         "https://mariadb.com/kb/en/library/gtid/#gtid_strict_mode";
+                MXS_WARNING(NO_STRICT, servername);
            }
            if (cand_info->rpl_settings.log_slave_updates == false)
            {
-                MXS_WARNING("Failover: Slave '%s' has log_slave_updates disabled. It is a valid candidate "
-                        "but replication will break for lagging slaves if '%s' is promoted.",
-                        mon_server->server->unique_name, mon_server->server->unique_name);
+                const char NO_SLAVE_UPDATES[] = "Slave '%s' has log_slave_updates disabled. It is a valid "
+                                                "candidate but replication will break for lagging slaves if "
+                                                "'%s' is promoted.";
+                MXS_WARNING(NO_SLAVE_UPDATES, servername, servername);
            }
            bool select_this = false;
            // If no candidate yet, accept any.
@ -3028,6 +3044,10 @@ MXS_MONITORED_SERVER* failover_select_new_master(MYSQL_MONITOR* mon, ServerVecto
        remove_this += master_vector_index;
        out_slaves->erase(remove_this);
    }
+    if (new_master == NULL)
+    {
+        PRINT_MXS_JSON_ERROR(err_out, "No suitable promotion candidate found.");
+    }
    return new_master;
 }

@ -3087,9 +3107,10 @@ bool failover_wait_relay_log(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_maste
 *
 * @param mon The monitor
 * @param new_master The new master server
+ * @param err_out json object for error printing. Can be NULL.
 * @return True if successful
 */
-bool failover_promote_new_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master)
+bool failover_promote_new_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_master, json_t** err_out)
 {
    MXS_NOTICE("Failover: Promoting server '%s' to master.", new_master->server->unique_name);
    if (mxs_mysql_query(new_master->con, "STOP SLAVE;") == 0 &&
@ -3100,7 +3121,7 @@ bool failover_promote_new_master(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_m
    }
    else
    {
-        MXS_WARNING("Failover: Promotion failed: '%s'.", mysql_error(new_master->con));
+        PRINT_MXS_JSON_ERROR(err_out, "Promotion failed: '%s'.", mysql_error(new_master->con));
        return false;
    }
 }
@ -3118,7 +3139,7 @@ string generate_change_master_cmd(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_
    std::stringstream change_cmd_nopw;
    change_cmd_nopw << change_cmd.str();
    change_cmd_nopw << MASTER_PW << "******" << END;;
-    MXS_DEBUG("Failover: Change master command is '%s'.", change_cmd_nopw.str().c_str());
+    MXS_DEBUG("Change master command is '%s'.", change_cmd_nopw.str().c_str());
 #endif
    change_cmd << MASTER_PW << mon->replication_password << END;
    return change_cmd.str();
@ -3130,13 +3151,12 @@ string generate_change_master_cmd(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* new_
 * @param mon The monitor
 * @param slaves An array of slaves
 * @param new_master The replication master
- * @return True, if even one slave was redirected, or if there was none to redirect
+ * @return The number of slaves successfully redirected.
 */
-bool failover_redirect_slaves(MYSQL_MONITOR* mon, ServerVector& slaves, MXS_MONITORED_SERVER* new_master)
+int failover_redirect_slaves(MYSQL_MONITOR* mon, ServerVector& slaves, MXS_MONITORED_SERVER* new_master)
 {
-    MXS_NOTICE("Failover: Redirecting slaves to new master.");
+    MXS_NOTICE("Redirecting slaves to new master.");
    std::string change_cmd = generate_change_master_cmd(mon, new_master, false);
-    int fails = 0;
    int successes = 0;
    for (ServerVector::const_iterator iter = slaves.begin(); iter != slaves.end(); iter++)
    {
@ -3146,16 +3166,15 @@ bool failover_redirect_slaves(MYSQL_MONITOR* mon, ServerVector& slaves, MXS_MONI
            mxs_mysql_query(mon_server->con, "START SLAVE;") == 0)
        {
            successes++;
-            MXS_NOTICE("Failover: Slave '%s' redirected to new master.", mon_server->server->unique_name);
+            MXS_NOTICE("Slave '%s' redirected to new master.", mon_server->server->unique_name);
        }
        else
        {
-            fails++;
-            MXS_ERROR("Failover: Slave '%s' redirection failed: '%s'.", mon_server->server->unique_name,
+            MXS_WARNING("Slave '%s' redirection failed: '%s'.", mon_server->server->unique_name,
                    mysql_error(mon_server->con));
        }
    }
-    return (successes + fails) == 0 || successes > 0;
+    return successes;
 }

 /**
@ -3168,23 +3187,23 @@ static bool do_failover(MYSQL_MONITOR* mon)
 {
    // Topology has already been tested to be simple.
    // Step 1: Select new master. Also populate a vector with all slaves not the selected master.
-    ServerVector redirect_slaves;
-    MXS_MONITORED_SERVER* new_master = failover_select_new_master(mon, &redirect_slaves);
+    ServerVector slaves;
+    MXS_MONITORED_SERVER* new_master = failover_select_new_master(mon, &slaves, NULL);
    if (new_master == NULL)
    {
-        MXS_ERROR("Failover: No suitable promotion candidate found, cancelling.");
        return false;
    }
+    bool rval = false;
    // Step 2: Wait until relay log consumed.
    if (failover_wait_relay_log(mon, new_master) &&
        // Step 3: Stop and reset slave, set read-only to 0.
-        failover_promote_new_master(mon, new_master) &&
-        // Step 4: Redirect slaves.
-        failover_redirect_slaves(mon, redirect_slaves, new_master))
+        failover_promote_new_master(mon, new_master, NULL))
    {
-        return true;
+        // Step 4: Redirect slaves.
+        int redirects = failover_redirect_slaves(mon, slaves, new_master);
+        rval = slaves.empty() ? true : redirects > 0;
    }
-    return false;
+    return rval;
 }

 /**
@ -3282,13 +3301,14 @@ static bool update_gtids(MXS_MONITORED_SERVER *database, int64_t domain, MySqlSe
 * @param current_master Server to demote
 * @param info Current master info. Will be written to.
 * @param domain Replication gtid domain.
+ * @param err_out json object for error printing. Can be NULL.
 * @return True if successful.
 */
 static bool switchover_demote_master(MXS_MONITORED_SERVER* current_master,
                                     MySqlServerInfo* info,
-                                     int32_t domain)
+                                     int32_t domain, json_t** err_out)
 {
-    MXS_NOTICE("Switchover: Demoting server '%s'.", current_master->server->unique_name);
+    MXS_NOTICE("Demoting server '%s'.", current_master->server->unique_name);
    string error;
    bool rval = false;
    if (mxs_mysql_query(current_master->con, "SET GLOBAL read_only=1;") == 0)
@ -3314,14 +3334,13 @@ static bool switchover_demote_master(MXS_MONITORED_SERVER* current_master,

    if (rval == false)
    {
-        const char* MSG = "Switchover: Demotion failed";
-        if (!error.empty())
+        if (error.empty())
        {
-            MXS_ERROR("%s: '%s'.", MSG, error.c_str());
+            PRINT_MXS_JSON_ERROR(err_out, "Demotion failed due to an error in updating gtid:s.");
        }
        else
        {
-            MXS_ERROR("%s.", MSG);
+            PRINT_MXS_JSON_ERROR(err_out, "Demotion failed due to a query error: '%s'.", error.c_str());
        }
    }
    return rval;
@ -3333,10 +3352,12 @@ static bool switchover_demote_master(MXS_MONITORED_SERVER* current_master,
 * @param slave Slave to wait on
 * @param master_binlog_pos Which gtid must be reached
 * @param timeout Maximum wait time in seconds
+ * @param err_out json object for error printing. Can be NULL.
 * @return True, if target gtid was reached within allotted time
 */
 static bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave,
-                                          const Gtid& master_binlog_pos, int timeout)
+                                          const Gtid& master_binlog_pos, int timeout,
+                                          json_t** err_out)
 {
    /*
     * TODO: The MASTER_GTID_WAIT()-call is currently buggy, as the connection read timeout prematurely
@ -3358,10 +3379,15 @@ static bool switchover_wait_slave_catchup(MXS_MONITORED_SERVER* slave,
        }
        else
        {
-            MXS_ERROR("Timeout reached when waiting for slave '%s' to catch up with master.",
+            PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() timed out on slave '%s'.",
                                 slave->server->unique_name);
 	}
    }
+    else
+    {
+        PRINT_MXS_JSON_ERROR(err_out, "MASTER_GTID_WAIT() query error on slave '%s'.",
+                             slave->server->unique_name);
+    }
    return rval;
 }

@ -3393,46 +3419,88 @@ static bool switchover_start_slave(MYSQL_MONITOR* mon, MXS_MONITORED_SERVER* old
    }
    return rval;
 }
+
+/**
+ * Get error strings from all MySQL connections, form one string.
+ *
+ * @param slaves Slave servers
+ * @param old_master Old master server
+ * @return Concatenated string.
+ */
+static string get_connection_errors(ServerVector& slaves, MXS_MONITORED_SERVER* old_master)
+{
+    // Get errors from all connections, form a string.
+    ss_dassert(old_master);
+    ServerVector servers;
+    servers.reserve(1 + slaves.size());
+    servers.push_back(old_master);
+    servers.insert(servers.end(), slaves.begin(), slaves.end());
+    std::stringstream ss;
+    for (ServerVector::const_iterator iter = servers.begin(); iter != servers.end(); iter++)
+    {
+        const char* error = mysql_error((*iter)->con);
+        ss_dassert(strlen(error) > 0); // Every connection should have an error.
+        ss << (*iter)->server->unique_name << ": '" << error << "'";
+        if (iter + 1 != servers.end())
+        {
+            ss << ", ";
+        }
+    }
+    return ss.str();
+}
+
 /**
 * Performs switchover for a simple topology (1 master, N slaves, no intermediate masters). If an intermediate
 * step fails, the cluster may be left without a master.
 *
 * @param mon Server cluster monitor
+ * @param err_out json object for error printing. Can be NULL.
 * @return True if successful. If false, the cluster can be in various situations depending on which step
 * failed. In practice, manual intervention is usually required on failure.
 */
-static bool do_switchover(MYSQL_MONITOR* mon)
+static bool do_switchover(MYSQL_MONITOR* mon, json_t** err_out)
 {
    MXS_MONITORED_SERVER* demotion_target = mon->master;
    if (demotion_target == NULL)
    {
-        MXS_ERROR("Switchover: Cluster does not have a running master. Run failover instead.");
+        PRINT_MXS_JSON_ERROR(err_out, "Cluster does not have a running master. Run failover instead.");
        return false;
    }
    ServerVector slaves;
    // Step 1: Select promotion candidate, save all slaves except promotion target to an array.
-    MXS_MONITORED_SERVER* promotion_target = failover_select_new_master(mon, &slaves);
+    MXS_MONITORED_SERVER* promotion_target = failover_select_new_master(mon, &slaves, err_out);
    if (promotion_target == NULL)
    {
-        MXS_ERROR("Switchover: No suitable promotion candidate found.");
        return false;
    }
    bool rval = false;
    MySqlServerInfo* curr_master_info = get_server_info(mon, demotion_target);
    uint32_t repl_domain = get_server_info(mon, promotion_target)->slave_status.gtid_io_pos.domain;
    // Step 2: Set read-only to 1, flush logs.
-    if (switchover_demote_master(demotion_target, curr_master_info, repl_domain) &&
+    if (switchover_demote_master(demotion_target, curr_master_info, repl_domain, err_out) &&
        // Step 3: Wait for the selected slave to catch up with master.
        switchover_wait_slave_catchup(promotion_target, curr_master_info->gtid_binlog_pos,
-            mon->switchover_timeout) &&
+                                      mon->switchover_timeout, err_out) &&
        // Step 4: Stop and reset slave, set read-only to 0.
-            failover_promote_new_master(mon, promotion_target))
+        failover_promote_new_master(mon, promotion_target, err_out))
    {
        // Step 5: Redirect slaves.
-        bool redirect_ok = failover_redirect_slaves(mon, slaves, promotion_target);
+        int redirects = failover_redirect_slaves(mon, slaves, promotion_target);
        // Step 6: Set the old master to replicate from the new.
        bool start_ok = switchover_start_slave(mon, demotion_target, promotion_target);
-        rval = redirect_ok || start_ok;
+        rval = slaves.empty() ? start_ok : start_ok || redirects > 0;
+        if (rval == false)
+        {
+            // This is a special case. Individual server errors have already been printed to the log.
+            // For JSON, gather the errors again.
+            const char MSG[] = "Could not redirect any slaves to the new master.";
+            MXS_ERROR(MSG);
+            if (err_out)
+            {
+                string combined_error = get_connection_errors(slaves, demotion_target);
+                *err_out = mxs_json_error_append(*err_out, "%s Errors: %s.", MSG, combined_error.c_str());
+            }
+        }
    }
    return rval;
 }