Fix replication in parallel

If replication between the nodes is broken, it is now fixed on all
nodes in parallel instead of one server at a time.

This reduces the time needed to fix replication from about 120 seconds
to about 13 seconds. The time was measured by running the check_backend
test first with all backends broken and then with the backends fixed,
and subtracting the duration of the latter run from that of the former.
This commit is contained in:
Markus Mäkelä
2018-11-02 15:59:02 +02:00
parent 04e4f17618
commit ccec2a387a
2 changed files with 122 additions and 154 deletions

View File

@ -202,7 +202,8 @@ void Mariadb_nodes::read_env()
} }
else else
{ {
sprintf(cleanup_db_command[i], "rm -rf /var/lib/mysql/*; killall -9 mysqld"); sprintf(cleanup_db_command[i],
"service mysql stop; killall -9 mysqld; rm -rf /var/lib/mysql/*");
} }
} }
} }
@ -310,17 +311,22 @@ int Mariadb_nodes::start_node(int node, const char* param)
int Mariadb_nodes::stop_nodes() int Mariadb_nodes::stop_nodes()
{ {
int i; std::vector<std::thread> workers;
int local_result = 0; int local_result = 0;
connect(); connect();
for (i = 0; i < N; i++)
for (int i = 0; i < N; i++)
{ {
printf("Stopping node %d\n", i); workers.emplace_back([&, i]() {
fflush(stdout);
local_result += execute_query(nodes[i], "stop slave;");
local_result += stop_node(i); local_result += stop_node(i);
local_result += ssh_node_f(i, true, "rm -f /var/lib/mysql/*master*.info"); });
} }
for (auto& a : workers)
{
a.join();
}
return local_result; return local_result;
} }
@ -359,10 +365,21 @@ int Mariadb_nodes::cleanup_db_nodes()
return local_result; return local_result;
} }
int Mariadb_nodes::start_replication() void Mariadb_nodes::create_users(int node)
{ {
char str[1024]; char str[1024];
char dtr[1024]; char dtr[1024];
// Create users for replication as well as the users that are used by the tests
sprintf(str, "%s/create_user.sh", test_dir);
sprintf(dtr, "%s", access_homedir[node]);
copy_to_node(node, str, dtr);
ssh_node_f(node, false,
"export node_user=\"%s\"; export node_password=\"%s\"; %s/create_user.sh %s",
user_name, password, access_homedir[0], socket_cmd[0]);
}
int Mariadb_nodes::start_replication()
{
int local_result = 0; int local_result = 0;
// Start all nodes // Start all nodes
@ -370,25 +387,11 @@ int Mariadb_nodes::start_replication()
{ {
if (start_node(i, (char*) "")) if (start_node(i, (char*) ""))
{ {
printf("Start of node %d failed, trying to cleanup and re-initialize node\n", i); printf("Start of node %d failed\n", i);
cleanup_db_node(i); return 1;
prepare_server(i);
local_result += start_node(i, (char*) "");
} }
ssh_node_f(i, true, "sudo rm -f /etc/my.cnf.d/kerb.cnf"); create_users(i);
// Create users for replication as well as the users that are used by the tests
sprintf(str, "%s/create_user.sh", test_dir);
sprintf(dtr, "%s", access_homedir[i]);
copy_to_node(i, str, dtr);
ssh_node_f(i,
false,
"export node_user=\"%s\"; export node_password=\"%s\"; %s/create_user.sh %s",
user_name,
password,
access_homedir[0],
socket_cmd[0]);
} }
connect(); connect();
@ -421,6 +424,8 @@ int Mariadb_nodes::start_replication()
} }
} }
disconnect();
return local_result; return local_result;
} }
@ -444,7 +449,6 @@ int Galera_nodes::start_galera()
if (start_node(0, (char*) " --wsrep-cluster-address=gcomm://") != 0) if (start_node(0, (char*) " --wsrep-cluster-address=gcomm://") != 0)
{ {
cleanup_db_node(0);
prepare_server(0); prepare_server(0);
local_result += start_node(0, (char*) " --wsrep-cluster-address=gcomm://"); local_result += start_node(0, (char*) " --wsrep-cluster-address=gcomm://");
} }
@ -559,10 +563,20 @@ int Mariadb_nodes::unblock_node(int node)
int Mariadb_nodes::unblock_all_nodes() int Mariadb_nodes::unblock_all_nodes()
{ {
int rval = 0; int rval = 0;
std::vector<std::thread> threads;
for (int i = 0; i < this->N; i++) for (int i = 0; i < this->N; i++)
{ {
threads.emplace_back([&, i]() {
rval += this->unblock_node(i); rval += this->unblock_node(i);
});
} }
for (auto& a : threads)
{
a.join();
}
return rval; return rval;
} }
@ -584,31 +598,6 @@ bool Mariadb_nodes::check_master_node(MYSQL* conn)
{ {
bool rval = true; bool rval = true;
if (mysql_query(conn, "SHOW SLAVE HOSTS"))
{
printf("%s\n", mysql_error(conn));
rval = false;
}
else
{
MYSQL_RES* res = mysql_store_result(conn);
if (res)
{
int rows = mysql_num_rows(res);
if (rows != N - 1)
{
if (!v51)
{
printf("Number of slave hosts is %d when it should be %d\n", rows, N - 1);
rval = false;
}
}
}
mysql_free_result(res);
}
if (mysql_query(conn, "SHOW SLAVE STATUS")) if (mysql_query(conn, "SHOW SLAVE STATUS"))
{ {
printf("%s\n", mysql_error(conn)); printf("%s\n", mysql_error(conn));
@ -758,8 +747,9 @@ int Mariadb_nodes::check_replication()
fflush(stdout); fflush(stdout);
} }
if (this->connect()) if (connect())
{ {
printf("Failed to connect to all servers\n");
return 1; return 1;
} }
@ -802,69 +792,38 @@ int Mariadb_nodes::check_replication()
bool Mariadb_nodes::fix_replication() bool Mariadb_nodes::fix_replication()
{ {
bool rval = true;
if (check_replication()) if (check_replication())
{ {
unblock_all_nodes(); printf("Replication is broken, fixing...\n");
rval = false;
if (check_nodes()) if (unblock_all_nodes() == 0)
{ {
printf("****** VMS ARE BROKEN! Exiting *****\n"); printf("Prepare nodes\n");
return false;
}
int attempts = 2;
int attempts_with_cleanup = 1;
int attempts_with_revert = 1;
while (check_replication() && attempts > 0)
{
if (attempts != 2)
{
stop_nodes();
}
start_replication();
close_connections();
check_replication();
attempts--;
if (attempts == 0 && check_replication())
{
if (attempts_with_cleanup > 0)
{
printf("****** BACKEND IS STILL BROKEN! Trying to cleanup all nodes *****\n");
stop_nodes();
cleanup_db_nodes();
prepare_servers(); prepare_servers();
attempts_with_cleanup--; printf("Starting replication\n");
attempts = 2;
sleep(10);
start_replication(); start_replication();
sleep(10);
} if (check_replication() == 0)
else
{ {
if (attempts_with_revert > 0) printf("Replication is fixed\n");
{
printf(
"****** BACKEND IS STILL BROKEN! Trying to revert all nodes from snapshot *****\n");
revert_nodes_snapshot();
attempts_with_cleanup = 1;
attempts = 2;
}
else
{
printf("****** BACKEND IS STILL BROKEN! Exiting *****\n");
return false;
}
}
}
}
flush_hosts(); flush_hosts();
rval = true;
}
else
{
printf("FATAL ERROR: Replication is still broken\n");
}
}
else
{
printf("SSH access to nodes doesn't work\n");
}
} }
return true; return rval;
} }
bool Mariadb_nodes::revert_nodes_snapshot() bool Mariadb_nodes::revert_nodes_snapshot()
@ -1391,22 +1350,36 @@ void Mariadb_nodes::add_server_setting(int node, const char* setting)
ssh_node_f(node, true, "sudo sed -i '$a %s' /etc/my.cnf.d/server*.cnf", setting); ssh_node_f(node, true, "sudo sed -i '$a %s' /etc/my.cnf.d/server*.cnf", setting);
} }
void Mariadb_nodes::reset_server_settings(int node) std::string Mariadb_nodes::get_config_name(int node)
{ {
std::stringstream ss; std::stringstream ss;
ss << test_dir << "/mdbci/cnf/server" << node + 1 << ".cnf"; ss << "server" << node + 1 << ".cnf";
return ss.str();
}
std::string Galera_nodes::get_config_name(int node)
{
std::stringstream ss;
ss << "galera_server" << node + 1 << ".cnf";
return ss.str();
}
void Mariadb_nodes::reset_server_settings(int node)
{
std::string cnfdir = std::string(test_dir) + "/mdbci/cnf/";
std::string cnf = get_config_name(node);
// Note: This is a CentOS specific path // Note: This is a CentOS specific path
ssh_node(node, "sudo rm -rf /etc/my.cnf.d/*", true); ssh_node(node, "sudo rm -rf /etc/my.cnf.d/*", true);
copy_to_node(node, ss.str().c_str(), "~/"); copy_to_node(node, (cnfdir + cnf).c_str(), "~/");
ssh_node_f(node, false, "sudo mv ~/server%d.cnf /etc/my.cnf.d/", node + 1); ssh_node_f(node, false, "sudo mv ~/%s /etc/my.cnf.d/", cnf.c_str());
} }
void Mariadb_nodes::reset_server_settings() void Mariadb_nodes::reset_server_settings()
{ {
for (int i = 0; i < N; i++) for (int node = 0; node < N; node++)
{ {
reset_server_settings(i); reset_server_settings(node);
} }
} }
@ -1437,23 +1410,21 @@ char* extract_version_from_string(char* version)
int Mariadb_nodes::prepare_server(int i) int Mariadb_nodes::prepare_server(int i)
{ {
cleanup_db_node(i);
reset_server_settings(i);
// Note: These should be done by MDBCI
ssh_node(i, "test -d /etc/apparmor.d/ && "
"ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/usr.sbin.mysqld && "
"sudo service apparmor restart", true);
int ec; int ec;
char* version = ssh_node_output(i, "/usr/sbin/mysqld --version", false, &ec);
char* version;
char* version_digits;
char* tmp_pass;
char str1[1024];
char str2[1024];
ssh_node_f(i, true, "%s", stop_db_command[i]);
sleep(5);
ssh_node(i, "sed -i \"s/bind-address/#bind-address/g\" /etc/mysql/my.cnf.d/*.cnf", true);
ssh_node(i,
"ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/usr.sbin.mysqld; sudo service apparmor restart",
true);
version = ssh_node_output(i, "/usr/sbin/mysqld --version", false, &ec);
if (ec == 0) if (ec == 0)
{ {
char* version_digits;
char* tmp_pass;
version_digits = extract_version_from_string(version); version_digits = extract_version_from_string(version);
printf("Detected server version on node %d is %s\n", i, version_digits); printf("Detected server version on node %d is %s\n", i, version_digits);
@ -1490,34 +1461,29 @@ int Mariadb_nodes::prepare_server(int i)
printf("Server start on node %d failed\n", i); printf("Server start on node %d failed\n", i);
} }
} }
sleep(15); }
sprintf(str1, "%s/mdbci/backend/create_*_user.sql", test_dir);
sprintf(str2, "%s/", access_homedir[i]);
copy_to_node(i, str1, str2);
sprintf(str1, "mysql < %s/create_repl_user.sql", access_homedir[i]);
ssh_node(i, str1, true);
sprintf(str1, "mysql < %s/create_skysql_user.sql", access_homedir[i]);
ssh_node(i, str1, true);
free(version); free(version);
return 0; return ec;
}
else
{
return 1;
}
} }
int Mariadb_nodes::prepare_servers() int Mariadb_nodes::prepare_servers()
{ {
int rval = 0; int rval = 0;
for (int i; i < N; i++) std::vector<std::thread> threads;
for (int i = 0; i < N; i++)
{ {
if (prepare_server(i)) threads.emplace_back([&, i]() {
rval += prepare_server(i);
});
}
for (auto& a : threads)
{ {
rval = 1; a.join();
}
} }
return rval; return rval;
} }

View File

@ -223,6 +223,9 @@ public:
*/ */
virtual int start_replication(); virtual int start_replication();
// Create the default users used by all tests
void create_users(int node);
/** /**
* @brif BlockNode setup firewall on a backend node to block MariaDB port * @brif BlockNode setup firewall on a backend node to block MariaDB port
* @param node Index of node to block * @param node Index of node to block
@ -403,16 +406,20 @@ public:
void add_server_setting(int node, const char* setting); void add_server_setting(int node, const char* setting);
/** /**
* Restore the original configuration for this server * Get the configuration file name for a particular node
* *
* @param node Node to restore * @param node Node number for which the configuration is requested
*
* @return The name of the configuration file
*/ */
void reset_server_settings(int node); virtual std::string get_config_name(int node);
/** /**
* Restore the original configuration for all servers * Restore the original configuration for all servers
*/ */
void reset_server_settings(); void reset_server_settings();
// Same but for an individual server
void reset_server_settings(int node);
/** /**
* @brief revert_nodes_snapshot Execute MDBCI snapshot revert command for all nodes * @brief revert_nodes_snapshot Execute MDBCI snapshot revert command for all nodes
@ -477,12 +484,7 @@ public:
return check_galera(); return check_galera();
} }
// int prepare_galera_server(int i); std::string get_config_name(int node) override;
// virtual int prepare_server(int i)
// {
// return prepare_galera_server(i);
// }
}; };
#endif // MARIADB_NODES_H #endif // MARIADB_NODES_H