From 3fefe557c0ce0d1f2ee8f1ea1682ff8e7a35a66b Mon Sep 17 00:00:00 2001 From: Johan Wikman Date: Thu, 18 Feb 2021 15:12:20 +0200 Subject: [PATCH] MXS-3410 Make xpand system test setup more resilient Xpand nodes must be added one by one to the cluster as an attempt to add more nodes will fail, if any of the nodes to be added are already in the cluster. Further, the adding of a node may also fail if, when the addition is made, the addition of the previous node is still in progress. Now it will be attempted at most 5 times, while sleeping as many seconds between each attempt as there have been attempts. Some additional logging was also added. --- system-test/maxtest/src/xpand_nodes.cpp | 89 ++++++++++++++++++++----- 1 file changed, 72 insertions(+), 17 deletions(-) diff --git a/system-test/maxtest/src/xpand_nodes.cpp b/system-test/maxtest/src/xpand_nodes.cpp index 7502008e0..129568169 100644 --- a/system-test/maxtest/src/xpand_nodes.cpp +++ b/system-test/maxtest/src/xpand_nodes.cpp @@ -131,26 +131,78 @@ int Xpand_nodes::prepare_server(int m) int Xpand_nodes::start_replication() { - int rv = 1; + int rv = 0; - connect(); - - // The nodes must be added one by one to the cluster. An attempt to add them - // all with one ALTER command will fail, if one or more of them already are in - // the cluster. - - for (int i = 1; i < N; ++i) + if (connect() == 0) { - std::string cluster_setup_sql = std::string("ALTER CLUSTER ADD '") - + std::string(IP_private[i]) - + std::string("'"); + // The nodes must be added one by one to the cluster. An attempt to add them + // all with one ALTER command will fail, if one or more of them already are in + // the cluster. 
- execute_query(nodes[0], "%s", cluster_setup_sql.c_str()); + for (int i = 1; i < N; ++i) + { + std::string cluster_setup_sql = std::string("ALTER CLUSTER ADD '") + + std::string(IP_private[i]) + + std::string("'"); + + bool retry = false; + int attempts = 0; + + do + { + ++attempts; + + rv = execute_query(nodes[0], "%s", cluster_setup_sql.c_str()); + + if (rv != 0) + { + std::string error(mysql_error(nodes[0])); + + if (error.find("already in cluster") != std::string::npos) + { + // E.g. '[25609] Bad parameter.: Host "10.166.0.171" already in cluster' + // That's ok and can be ignored. + rv = 0; + } + else if (error.find("addition is pending") != std::string::npos) + { + // E.g. '[50180] Multiple nodes cannot be added when an existing addition is pending' + // Sleep and retry. + + if (attempts < 5) + { + printf("Retrying after %d seconds.", attempts); + sleep(attempts); + retry = true; + } + else + { + printf("After %d attempts, still could not add node to cluster, bailing out.", + attempts); + retry = false; + } + } + else + { + printf("Fatal error when setting up xpand: %s", error.c_str()); + retry = false; + } + } + } + while (rv != 0 && retry); + + if (rv != 0) + { + break; + } + } + + close_connections(); + } + else + { + rv = 1; } - - close_connections(); - - rv = 0; return rv; } @@ -179,8 +231,11 @@ int Xpand_nodes::check_replication() { for (int i = 0; i < N; i++) { - if (execute_query_count_rows(nodes[i], "select * from system.nodeinfo") != N) + int n = execute_query_count_rows(nodes[i], "select * from system.nodeinfo"); + + if (n != N) { + printf("Expected %d nodes configured at node %d, found %d", N, i, n); res = 1; } }