From e2400f5799e1b7d6832733102bb248a517754ae4 Mon Sep 17 00:00:00 2001 From: Timofey Turenko Date: Wed, 7 Aug 2019 14:46:02 +0300 Subject: [PATCH 1/8] Add scripts to create repository with all minor versions Some users need repository with all versions of Maxscale for easy version selection and downgrating --- BUILD/mdbci/create_full_repo.sh | 106 ++++++++++++++++++++++++++++++ BUILD/mdbci/create_remote_repo.sh | 7 +- 2 files changed, 112 insertions(+), 1 deletion(-) create mode 100755 BUILD/mdbci/create_full_repo.sh diff --git a/BUILD/mdbci/create_full_repo.sh b/BUILD/mdbci/create_full_repo.sh new file mode 100755 index 000000000..ec7ef9c44 --- /dev/null +++ b/BUILD/mdbci/create_full_repo.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# $box - Vagrant box to be used for build + +# $target - name of repository to put results + +# $cmake_flags - cmake flags to be used in the build + +# $MDBCI_VM_PATH - path to the MDBCI virtual machies directory + +# $source - reference to the point in the source code repository + +# $do_not_destroy_vm - if "yes" VM stays alive after the build + +# $try_already_running - if "yes" already running VM will be used for build + +# $gpg_keys_path - path to the directory containing GPG keys for repo signing +# directory have to contain only one file *.public and only one *.private + +set -x + +# read the name of build scripts directory +export script_dir="$(dirname $(readlink -f $0))" + +# load all needed variables +. ${script_dir}/set_build_variables.sh + +export platform=`${mdbci_dir}/mdbci show boxinfo --box-name=$box --field='platform' --silent` +export platform_version=`${mdbci_dir}/mdbci show boxinfo --box-name=$box --field='platform_version' --silent` +export dist_sfx="$platform"."$platform_version" + +# prerare VM +export provider=`${mdbci_dir}/mdbci show provider $box --silent 2> /dev/null` +export name="$box-${JOB_NAME}-${BUILD_NUMBER}" +export name=`echo $name | sed "s|/|-|g"` + + +# destroying existing box +if [ -d "$MDBCI_VM_PATH/${name}" ]; then + ${mdbci_dir}/mdbci destroy $name +fi + + eval "cat < /dev/null > $MDBCI_VM_PATH/${name}.json + +# starting VM for build +echo "Generating build VM template" +${mdbci_dir}/mdbci --override --template $MDBCI_VM_PATH/$name.json generate $name +echo "starting VM for build" +${mdbci_dir}/mdbci up --attempts=1 $name +if [ $? != 0 ] ; then + echo "Error starting VM" + exit 1 +fi +echo "copying public keys to VM" +cp ~/build-scripts/team_keys . +${mdbci_dir}/mdbci public_keys --key team_keys --silent $name + + +echo "Get VM info" +export sshuser=`${mdbci_dir}/mdbci ssh --command 'whoami' --silent $name/build 2> /dev/null | tr -d '\r'` +export IP=`${mdbci_dir}/mdbci show network $name/build --silent 2> /dev/null` +export sshkey=`${mdbci_dir}/mdbci show keyfile $name/build --silent 2> /dev/null | sed 's/"//g'` +export scpopt="-i $sshkey -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=120 " +export sshopt="$scpopt $sshuser@$IP" + +rm -rf $pre_repo_dir/$target/$box +mkdir -p $pre_repo_dir/$target/SRC +mkdir -p $pre_repo_dir/$target/$box + +export work_dir="MaxScale" +export orig_image=$box + +ssh $sshopt "sudo rm -rf $work_dir" +echo "copying stuff to $image machine" +ssh $sshopt "mkdir -p $work_dir" + +rsync -avz --delete -e "ssh $scpopt" ${script_dir}/../../ $sshuser@$IP:./$work_dir/ +if [ $? -ne 0 ] ; then + echo "Error copying stuff to $box machine" + exit 2 +fi + +ssh $sshopt ./MaxScale/BUILD/install_build_deps.sh +${script_dir}/create_remote_repo.sh full_repo +export build_result=$? + +if [ ${build_result} -eq 0 ]; then + ${script_dir}/copy_repos.sh + export build_result=$? +fi + +echo "Removing locks and destroying VM" + +if [[ "$do_not_destroy_vm" != "yes" && "$try_already_running" != "yes" ]] ; then + echo "Destroying VM" + ${mdbci_dir}/mdbci destroy $name +fi +cd $dir + +if [ $build_result -ne 0 ] ; then + echo "Build FAILED!" + exit $build_result +fi + diff --git a/BUILD/mdbci/create_remote_repo.sh b/BUILD/mdbci/create_remote_repo.sh index b3b1e710f..335197d13 100755 --- a/BUILD/mdbci/create_remote_repo.sh +++ b/BUILD/mdbci/create_remote_repo.sh @@ -16,7 +16,12 @@ echo " creating dirs on VM" ssh $sshopt "mkdir -p dest ; mkdir -p src; mkdir gpg_keys" echo "copying stuff to VM" -scp $scpopt $pre_repo_dir/$target/$box/* $sshuser@$IP:src/ +if [ $1 == "full_repo" ] ; then + find ${repo_path}/maxscale-${major_ver}.*-release/mariadb-maxscale/${platform}/${platform_version}/* -name "*.rpm" -exec scp $scpopt {} $sshuser@$IP:src/ \; + find ${repo_path}/maxscale-${major_ver}.*-release/mariadb-maxscale/${platform}/dists/${platform_version}/* -name "*.deb" -exec scp $scpopt {} $sshuser@$IP:src/ \; +else + scp $scpopt $pre_repo_dir/$target/$box/* $sshuser@$IP:src/ +fi scp $scpopt -r ${gpg_keys_path}/* $sshuser@$IP:./gpg_keys/ ssh $sshopt "key=\`ls ~/gpg_keys/*.public -1\` ; gpg --import \$key" From 910990115c1ef3ce3f28954766c0d11fc4aa0050 Mon Sep 17 00:00:00 2001 From: Timofey Turenko Date: Thu, 8 Aug 2019 16:58:07 +0300 Subject: [PATCH 2/8] add Docker to build VM temlates Now Docker is installed to VM which are used to build Maxscale. It allows to run tests which require Docker without installing it during build process. --- BUILD/mdbci/templates/build.json.template | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/BUILD/mdbci/templates/build.json.template b/BUILD/mdbci/templates/build.json.template index 08f93905a..9f3a21b6e 100644 --- a/BUILD/mdbci/templates/build.json.template +++ b/BUILD/mdbci/templates/build.json.template @@ -3,7 +3,11 @@ { "hostname" : "default", "memory_size" : "4096", - "box" : "$box" + "box" : "$box", + "product" : { + "name": "docker" + } + } } From 9f6efef67a6b68875241dea08f491311bf31f1dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Mon, 5 Aug 2019 13:09:09 +0300 Subject: [PATCH 3/8] MXS-2576: Update states atomically in csmon This prevents false transient states from occurring. --- server/modules/monitor/csmon/csmon.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/server/modules/monitor/csmon/csmon.cc b/server/modules/monitor/csmon/csmon.cc index d57896644..a13e390e7 100644 --- a/server/modules/monitor/csmon/csmon.cc +++ b/server/modules/monitor/csmon/csmon.cc @@ -56,7 +56,7 @@ static std::string do_query(MXS_MONITORED_SERVER* srv, const char* query) // Returns a numeric version similar to mysql_get_server_version int get_cs_version(MXS_MONITORED_SERVER* srv) { - int rval = 0; + int rval = -1; std::string prefix = "Columnstore "; std::string result = do_query(srv, "SELECT @@version_comment"); auto pos = result.find(prefix); @@ -106,16 +106,21 @@ void CsMonitor::update_server_status(MXS_MONITORED_SERVER* srv) if (do_query(srv, alive_query) == "1") { - status |= SERVER_RUNNING; + auto version = get_cs_version(srv); - if (get_cs_version(srv) >= 10200) + if (version >= 0) { - // 1.2 supports the mcsSystemPrimary function - status |= do_query(srv, role_query) == "1" ? SERVER_MASTER : SERVER_SLAVE; - } - else - { - status |= srv->server == m_primary ? SERVER_MASTER : SERVER_SLAVE; + status |= SERVER_RUNNING; + + if (version >= 10200) + { + // 1.2 supports the mcsSystemPrimary function + status |= do_query(srv, role_query) == "1" ? SERVER_MASTER : SERVER_SLAVE; + } + else + { + status |= srv->server == m_primary ? SERVER_MASTER : SERVER_SLAVE; + } } } From bb6f9213d4b762a47a8eb6a4fcd42fcb4236a87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Mon, 5 Aug 2019 16:08:25 +0300 Subject: [PATCH 4/8] Fix debug assert on master reconnection If a master reconnection occurred after the session command history was disabled due to the limit being exceeded, a debug assertion would be hit in prepare_target. This assert makes sure that a connection can be safely created to the server which means that in release mode builds the session state would be inconsistent on the new master. As this is an unrecoverable situation, the session should stop immediately even if delayed_retry is enabled. Currently the session will continue until the delayed retry timeout is hit. This happens due to the fact that the delayed retry mechanism handles all errors in a similar way. --- server/modules/routing/readwritesplit/rwsplit_route_stmt.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc index 4ac85bed1..28b4d11b8 100644 --- a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc +++ b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc @@ -721,7 +721,8 @@ SRWBackend RWSplitSession::get_master_backend() if (master) { - if (master->in_use() || (m_config.master_reconnection && master->can_connect())) + if (master->in_use() + || (m_config.master_reconnection && master->can_connect() && can_recover_servers())) { if (master->is_master()) { @@ -736,7 +737,8 @@ SRWBackend RWSplitSession::get_master_backend() } else { - MXS_ERROR("Server '%s' is not in use and can't be chosen as the master.", + MXS_ERROR("Cannot choose server '%s' as the master because it is not " + "in use and a new connection to it cannot be created.", master->name()); } } From 8bc4e42f2df62deed6b34d40299112690cf20753 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Tue, 6 Aug 2019 11:17:15 +0300 Subject: [PATCH 5/8] Fix query queuing on session command execution If session command execution during server reconnection caused a query to be queued, the query would be put on the tail end of the queue. This would cause queries to be reordered if the queue wasn't empty. The correct thing to do would be to put the next pending query back at the front of the queue. --- server/modules/routing/readwritesplit/rwsplit_route_stmt.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc index 28b4d11b8..7919d5a0a 100644 --- a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc +++ b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc @@ -336,7 +336,7 @@ bool RWSplitSession::route_single_stmt(GWBUF* querybuf) else if (target->has_session_commands()) { // We need to wait until the session commands are executed - m_query_queue.emplace_back(gwbuf_clone(querybuf)); + m_query_queue.emplace_front(gwbuf_clone(querybuf)); MXS_INFO("Queuing query until '%s' completes session command", target->name()); } else From 547236b7a4c46d7d30bc7119154b5b14e1779be4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Wed, 7 Aug 2019 13:23:11 +0300 Subject: [PATCH 6/8] MXS-2609: Store history size in Backend When a connection is created, the size of the history that is about to be replayed is known. Storing this and decrementing it each time a session command is completed tells us when the Backend has finished replaying the session command history. This can then be used to distinguish whether a session command executed on a master should be retried or whether to simply discard the connection. --- include/maxscale/backend.hh | 13 ++++++++++++- server/core/backend.cc | 11 ++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/include/maxscale/backend.hh b/include/maxscale/backend.hh index a71074878..ab8416317 100644 --- a/include/maxscale/backend.hh +++ b/include/maxscale/backend.hh @@ -303,6 +303,16 @@ public: return m_state & FATAL_FAILURE; } + /** + * Is the backend replaying session command history + * + * @return If a list of session commands was provided at connect time, the function returns true as long + * as the backend has not completed those session commands. + */ + inline bool is_replaying_history() const + { + return m_history_size > 0; + } /** * @brief Get the object name of this server @@ -389,7 +399,8 @@ private: maxbase::StopWatch m_session_timer; maxbase::IntervalTimer m_select_timer; - int64_t m_num_selects = 0; + int64_t m_num_selects {0}; + int64_t m_history_size {0}; }; typedef std::shared_ptr SBackend; diff --git a/server/core/backend.cc b/server/core/backend.cc index 201ae5c6d..f837f7957 100644 --- a/server/core/backend.cc +++ b/server/core/backend.cc @@ -54,6 +54,7 @@ void Backend::close(close_type type) m_closed = true; m_closed_at = time(NULL); m_session_commands.clear(); + m_history_size = 0; if (in_use()) { @@ -139,6 +140,12 @@ uint64_t Backend::complete_session_command() { uint64_t rval = m_session_commands.front()->get_position(); m_session_commands.pop_front(); + + if (m_history_size > 0) + { + --m_history_size; + } + return rval; } @@ -192,10 +199,12 @@ bool Backend::connect(MXS_SESSION* session, SessionCommandList* sescmd) m_state = IN_USE; mxb::atomic::add(&m_backend->connections, 1, mxb::atomic::RELAXED); rval = true; + m_history_size = 0; - if (sescmd && sescmd->size()) + if (sescmd && !sescmd->empty()) { append_session_command(*sescmd); + m_history_size = sescmd->size(); if (!execute_session_command()) { From 1748e6599d0a7f3a244feef709d8002eb01f414f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Wed, 7 Aug 2019 14:04:49 +0300 Subject: [PATCH 7/8] MXS-2609: Fix session command mixup on master failure If a master failed during an ongoing session command history replay, it would be treated as if a normal session command failed which would result in the already executed session command being re-executed on all servers at the wrong logical position. To fix this, the history replay must be distinguished from normal session command execution. When a connection replaying the history fails, the query routing simply needs to be attempted again. --- .../routing/readwritesplit/rwsplitsession.cc | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/server/modules/routing/readwritesplit/rwsplitsession.cc b/server/modules/routing/readwritesplit/rwsplitsession.cc index c3693465a..aad2e8a20 100644 --- a/server/modules/routing/readwritesplit/rwsplitsession.cc +++ b/server/modules/routing/readwritesplit/rwsplitsession.cc @@ -879,17 +879,40 @@ bool RWSplitSession::retry_master_query(SRWBackend& backend) { bool can_continue = false; - if (backend->has_session_commands()) + if (backend->is_replaying_history()) { - // Try to route the session command again. If the master is not available, the response will be - // returned from one of the slaves. + // Master failed while it was replaying the session command history + mxb_assert(m_config.master_reconnection); + mxb_assert(!m_query_queue.empty()); + retry_query(m_query_queue.front().release()); + m_query_queue.pop_front(); + can_continue = true; + } + else if (backend->has_session_commands()) + { + // We were routing a session command to all servers but the master server from which the response + // was expected failed: try to route the session command again. If the master is not available, + // the response will be returned from one of the slaves if the configuration allows it. + + mxb_assert(backend->next_session_command()->get_position() == m_recv_sescmd + 1); + mxb_assert(m_qc.current_route_info().target() == TARGET_ALL); mxb_assert(!m_current_query.get()); mxb_assert(!m_sescmd_list.empty()); mxb_assert(m_sescmd_count >= 2); MXS_INFO("Retrying session command due to master failure: %s", backend->next_session_command()->to_string().c_str()); + // MXS-2609: Maxscale crash in RWSplitSession::retry_master_query() + // To prevent a crash from happening, we make sure the session command list is not empty before + // we touch it. This should be converted into a debug assertion once the true root cause of the + // problem is found. + if (m_sescmd_count < 2 || m_sescmd_list.empty()) + { + MXS_WARNING("Session command list was empty when it should not be"); + return false; + } + // Before routing it, pop the failed session command off the list and decrement the number of // executed session commands. This "overwrites" the existing command and prevents history duplication. m_sescmd_list.pop_back(); @@ -900,6 +923,8 @@ bool RWSplitSession::retry_master_query(SRWBackend& backend) } else if (m_current_query.get()) { + // A query was in progress, try to route it again + mxb_assert(m_prev_target == backend); retry_query(m_current_query.release()); can_continue = true; } From eda830c9f3cc52f9d0216da686a0d81565caedf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Wed, 7 Aug 2019 19:31:04 +0300 Subject: [PATCH 8/8] MXS-2609: Add master reconnection test case The test case covers a few bugs that were fixed by the previous commits. The first part of the test covers the case when master reconnection fails while session command history is being executed. The second part of the test makes sure exceeding the session command history will prevent master reconnections from taking place. --- maxscale-system-test/CMakeLists.txt | 3 + ...xscale.cnf.template.mxs2609_history_replay | 27 ++++++++ .../mxs2609_history_replay.cpp | 69 +++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100755 maxscale-system-test/cnf/maxscale.cnf.template.mxs2609_history_replay create mode 100644 maxscale-system-test/mxs2609_history_replay.cpp diff --git a/maxscale-system-test/CMakeLists.txt b/maxscale-system-test/CMakeLists.txt index 0dabd0bbc..f8160fed3 100644 --- a/maxscale-system-test/CMakeLists.txt +++ b/maxscale-system-test/CMakeLists.txt @@ -976,6 +976,9 @@ add_test_executable(mxs2521_double_exec.cpp mxs2521_double_exec mxs2521_double_e # MXS-2490: Direct execution doesn't work with MaxScale add_test_executable(mxs2490_ps_execute_direct.cpp mxs2490_ps_execute_direct replication LABELS REPL_BACKEND readwritesplit) +# MXS-2609: Maxscale crash in RWSplitSession::retry_master_query() +add_test_executable(mxs2609_history_replay.cpp mxs2609_history_replay mxs2609_history_replay LABELS readwritesplit REPL_BACKEND) + # MXS-2621: Incorrect SQL if lower_case_table_names is used. add_test_executable(mxs2621_lower_case_tables.cpp mxs2621_lower_case_tables mxs2621_lower_case_tables LABELS REPL_BACKEND) diff --git a/maxscale-system-test/cnf/maxscale.cnf.template.mxs2609_history_replay b/maxscale-system-test/cnf/maxscale.cnf.template.mxs2609_history_replay new file mode 100755 index 000000000..bf4cc230c --- /dev/null +++ b/maxscale-system-test/cnf/maxscale.cnf.template.mxs2609_history_replay @@ -0,0 +1,27 @@ +[maxscale] +threads=###threads### + +###server### + +[MySQL-Monitor] +type=monitor +module=mariadbmon +servers=###server_line### +user=maxskysql +password=skysql +monitor_interval=1000 + +[RW-Split-Router] +type=service +router=readwritesplit +servers=###server_line### +user=maxskysql +password=skysql +transaction_replay=true +master_accept_reads=true + +[RW-Split-Listener] +type=listener +service=RW-Split-Router +protocol=MySQLClient +port=4006 diff --git a/maxscale-system-test/mxs2609_history_replay.cpp b/maxscale-system-test/mxs2609_history_replay.cpp new file mode 100644 index 000000000..add9b50e2 --- /dev/null +++ b/maxscale-system-test/mxs2609_history_replay.cpp @@ -0,0 +1,69 @@ +/** + * MXS-2609: Maxscale crash in RWSplitSession::retry_master_query() + * + * https://jira.mariadb.org/browse/MXS-2609 + * + * This test attempts to reproduce the crash described in MXS-2609 which + * occurred during a retrying attempt of a session command that failed on + * the master. + */ + + +#include "testconnections.h" + +int main(int argc, char** argv) +{ + TestConnections test(argc, argv); + + auto block = [&test](int n){ + test.repl->block_node(n); + test.maxscales->wait_for_monitor(); + test.repl->unblock_node(n); + test.maxscales->wait_for_monitor(); + }; + auto conn = test.maxscales->rwsplit(); + + // + // Test 1: Master failure mid-reconnect should trigger query replay + // + + test.expect(conn.connect(), "First connect should work: %s", conn.error()); + + // Queue up session commands so that the history replay takes some time + for (int i = 0; i < 10; i++) + { + conn.query("SET @a = (SLEEP 1)"); + } + + block(0); + + test.set_timeout(90); + + std::thread([&](){ + sleep(5); + block(0); + }).detach(); + + test.expect(conn.query("SELECT @@last_insert_id"), "Query should work: %s", conn.error()); + + test.stop_timeout(); + conn.disconnect(); + + // + // Test 2: Exceed history limit and trigger a master reconnection + // + + test.maxctrl("alter service RW-Split-Router max_sescmd_history 2"); + test.expect(conn.connect(), "Second should work: %s", conn.error()); + + for (int i = 0; i < 5; i++) + { + conn.query("SET @a = (SLEEP 1)"); + } + + block(0); + + test.expect(!conn.query("SELECT @@last_insert_id"), "Query should fail"); + + return test.global_result; +}