MXS-2175: Fix available_when_donor

If a Galera cluster dropped down to a single node, the last node would not be considered valid. During the failure of the second-to-last node, the master would also temporarily lose its master status. The behavior was changed to always keep the cluster UUID until the cluster size drops to zero. This guarantees that the same cluster is used for as long as possible.
2018-11-22 13:24:21 +02:00
parent 2ff95216a9
commit f41caae5c7
1 changed files with 29 additions and 72 deletions
--- a/server/modules/monitor/galeramon/galeramon.c
+++ b/server/modules/monitor/galeramon/galeramon.c
@ -424,24 +424,9 @@ monitorDatabase(MXS_MONITOR *mon, MXS_MONITORED_SERVER *database)
                info.local_state = atoi(row[1]);
            }

-            /* We can check:
-             * wsrep_local_state == 0
-             * wsrep_cluster_size == 0
-             * wsrep_cluster_state_uuid == ""
-             */
            if (strcmp(row[0], "wsrep_cluster_state_uuid") == 0)
            {
-                if (row[1] == NULL || !strlen(row[1]))
-                {
-                    MXS_DEBUG("Node %s is not running Galera Cluster",
-                              database->server->unique_name);
-                    info.cluster_uuid = NULL;
-                    info.joined = 0;
-                }
-                else
-                {
-                    info.cluster_uuid = MXS_STRDUP(row[1]);
-                }
+                info.cluster_uuid = row[1] && *row[1] ? MXS_STRDUP(row[1]) : NULL;
            }
        }

@ -1141,28 +1126,20 @@ static void set_galera_cluster(MXS_MONITOR *mon)
        {
            /* fetch the Value for the Key */
            value = hashtable_fetch(table, key);
-            if (value)
+            ss_dassert(value);
+
+            if (SERVER_IS_RUNNING(value->node) && value->joined && value->cluster_size)
            {
-                if (!SERVER_IN_MAINT(value->node) &&
-                    SERVER_IS_RUNNING(value->node) &&
-                    value->joined)
+                /* This server is part of a cluster */
+                n_nodes++;
+
+                /* Pick the cluster UUID of the largest cluster if we have no previous cluster. */
+                if ((handle->cluster_info.c_uuid == NULL && value->cluster_size > cluster_size) ||
+                    (handle->cluster_info.c_uuid &&
+                     strcmp(handle->cluster_info.c_uuid, value->cluster_uuid) == 0))
                {
-                    /* This server can be part of a cluster */
-                    n_nodes++;
-
-                    /* Set cluster_uuid for nodes that report
-                     * highest value of cluster_size
-                     */
-                    if (value->cluster_size > cluster_size)
-                    {
-                        cluster_size = value->cluster_size;
-                        cluster_uuid = value->cluster_uuid;
-                    }
-
-                    MXS_DEBUG("Candidate cluster member %s: UUID %s, joined nodes %d",
-                              value->node->unique_name,
-                              value->cluster_uuid,
-                              value->cluster_size);
+                    cluster_size = value->cluster_size;
+                    cluster_uuid = value->cluster_uuid;
                }
            }
        }
@ -1181,17 +1158,22 @@ static void set_galera_cluster(MXS_MONITOR *mon)
                              n_nodes,
                              cluster_uuid,
                              cluster_size);
-    /**
-     * Free && set the new cluster_uuid:
-     * Handling the special case n_nodes == 1
-     */
-    if (ret || (!ret && n_nodes != 1))
+
+    if (ret)
    {
-        /* Set the new cluster_uuid */
+        // We have a working cluster
+        ss_dassert(cluster_uuid);
        MXS_FREE(handle->cluster_info.c_uuid);
-        handle->cluster_info.c_uuid = ret ? MXS_STRDUP(cluster_uuid) : NULL;
+        handle->cluster_info.c_uuid = MXS_STRDUP(cluster_uuid);
        handle->cluster_info.c_size = cluster_size;
    }
+    else if (n_nodes == 0)
+    {
+        // No live nodes, clear the stored cluster UUID
+        MXS_FREE(handle->cluster_info.c_uuid);
+        handle->cluster_info.c_uuid = NULL;
+        handle->cluster_info.c_size = 0;
+    }

    /**
     * Set the JOINED status in cluster members only, if any.
@ -1295,44 +1277,19 @@ static bool detect_cluster_size(const GALERA_MONITOR *handle,
    }
    else if (n_nodes == 1)
    {
-        char *msg = "Galera cluster with 1 node only";
-
-        /* If 1 node only:
-         * ifc_uuid is not set, return value will be true.
-         * if c_uuid is equal to candidate_uuid, return value will be true.
-         */
-        if (c_uuid == NULL ||
-            (c_uuid && strcmp(c_uuid, candidate_uuid) == 0))
+        if (c_uuid == NULL || strcmp(c_uuid, candidate_uuid) == 0)
        {
+            // No previous cluster or this node is in the correct cluster
            ret = true;
        }
-
-        /* Log change if no previous UUID was set */
-        if (c_uuid == NULL)
-        {
-            if (ret)
-            {
-                MXS_INFO("%s has UUID %s: continue", msg, candidate_uuid);
-            }
-        }
-        else
-        {
-            if (strcmp(c_uuid, candidate_uuid) && c_size != 1)
-            {
-                /* This error should be logged once */
-                MXS_ERROR("%s and its UUID %s is different from previous set one %s: aborting",
-                          msg,
-                          candidate_uuid,
-                          c_uuid);
-            }
-        }
    }
    else
    {
        int min_cluster_size = (n_nodes / 2) + 1;

        /* Return true if there are enough members */
-        if (candidate_size >= min_cluster_size)
+        if ((c_uuid && strcmp(c_uuid, candidate_uuid) == 0) ||
+            (c_uuid == NULL && candidate_size >= min_cluster_size))
        {
            ret = true;
            /* Log the successful change once */