diff --git a/BUILD/mdbci/create_full_repo.sh b/BUILD/mdbci/create_full_repo.sh new file mode 100755 index 000000000..ec7ef9c44 --- /dev/null +++ b/BUILD/mdbci/create_full_repo.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# $box - Vagrant box to be used for build + +# $target - name of repository to put results + +# $cmake_flags - cmake flags to be used in the build + +# $MDBCI_VM_PATH - path to the MDBCI virtual machies directory + +# $source - reference to the point in the source code repository + +# $do_not_destroy_vm - if "yes" VM stays alive after the build + +# $try_already_running - if "yes" already running VM will be used for build + +# $gpg_keys_path - path to the directory containing GPG keys for repo signing +# directory have to contain only one file *.public and only one *.private + +set -x + +# read the name of build scripts directory +export script_dir="$(dirname $(readlink -f $0))" + +# load all needed variables +. ${script_dir}/set_build_variables.sh + +export platform=`${mdbci_dir}/mdbci show boxinfo --box-name=$box --field='platform' --silent` +export platform_version=`${mdbci_dir}/mdbci show boxinfo --box-name=$box --field='platform_version' --silent` +export dist_sfx="$platform"."$platform_version" + +# prerare VM +export provider=`${mdbci_dir}/mdbci show provider $box --silent 2> /dev/null` +export name="$box-${JOB_NAME}-${BUILD_NUMBER}" +export name=`echo $name | sed "s|/|-|g"` + + +# destroying existing box +if [ -d "$MDBCI_VM_PATH/${name}" ]; then + ${mdbci_dir}/mdbci destroy $name +fi + + eval "cat < /dev/null > $MDBCI_VM_PATH/${name}.json + +# starting VM for build +echo "Generating build VM template" +${mdbci_dir}/mdbci --override --template $MDBCI_VM_PATH/$name.json generate $name +echo "starting VM for build" +${mdbci_dir}/mdbci up --attempts=1 $name +if [ $? != 0 ] ; then + echo "Error starting VM" + exit 1 +fi +echo "copying public keys to VM" +cp ~/build-scripts/team_keys . +${mdbci_dir}/mdbci public_keys --key team_keys --silent $name + + +echo "Get VM info" +export sshuser=`${mdbci_dir}/mdbci ssh --command 'whoami' --silent $name/build 2> /dev/null | tr -d '\r'` +export IP=`${mdbci_dir}/mdbci show network $name/build --silent 2> /dev/null` +export sshkey=`${mdbci_dir}/mdbci show keyfile $name/build --silent 2> /dev/null | sed 's/"//g'` +export scpopt="-i $sshkey -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -o ConnectTimeout=120 " +export sshopt="$scpopt $sshuser@$IP" + +rm -rf $pre_repo_dir/$target/$box +mkdir -p $pre_repo_dir/$target/SRC +mkdir -p $pre_repo_dir/$target/$box + +export work_dir="MaxScale" +export orig_image=$box + +ssh $sshopt "sudo rm -rf $work_dir" +echo "copying stuff to $image machine" +ssh $sshopt "mkdir -p $work_dir" + +rsync -avz --delete -e "ssh $scpopt" ${script_dir}/../../ $sshuser@$IP:./$work_dir/ +if [ $? -ne 0 ] ; then + echo "Error copying stuff to $box machine" + exit 2 +fi + +ssh $sshopt ./MaxScale/BUILD/install_build_deps.sh +${script_dir}/create_remote_repo.sh full_repo +export build_result=$? + +if [ ${build_result} -eq 0 ]; then + ${script_dir}/copy_repos.sh + export build_result=$? +fi + +echo "Removing locks and destroying VM" + +if [[ "$do_not_destroy_vm" != "yes" && "$try_already_running" != "yes" ]] ; then + echo "Destroying VM" + ${mdbci_dir}/mdbci destroy $name +fi +cd $dir + +if [ $build_result -ne 0 ] ; then + echo "Build FAILED!" + exit $build_result +fi + diff --git a/BUILD/mdbci/create_remote_repo.sh b/BUILD/mdbci/create_remote_repo.sh index b3b1e710f..335197d13 100755 --- a/BUILD/mdbci/create_remote_repo.sh +++ b/BUILD/mdbci/create_remote_repo.sh @@ -16,7 +16,12 @@ echo " creating dirs on VM" ssh $sshopt "mkdir -p dest ; mkdir -p src; mkdir gpg_keys" echo "copying stuff to VM" -scp $scpopt $pre_repo_dir/$target/$box/* $sshuser@$IP:src/ +if [ $1 == "full_repo" ] ; then + find ${repo_path}/maxscale-${major_ver}.*-release/mariadb-maxscale/${platform}/${platform_version}/* -name "*.rpm" -exec scp $scpopt {} $sshuser@$IP:src/ \; + find ${repo_path}/maxscale-${major_ver}.*-release/mariadb-maxscale/${platform}/dists/${platform_version}/* -name "*.deb" -exec scp $scpopt {} $sshuser@$IP:src/ \; +else + scp $scpopt $pre_repo_dir/$target/$box/* $sshuser@$IP:src/ +fi scp $scpopt -r ${gpg_keys_path}/* $sshuser@$IP:./gpg_keys/ ssh $sshopt "key=\`ls ~/gpg_keys/*.public -1\` ; gpg --import \$key" diff --git a/BUILD/mdbci/templates/build.json.template b/BUILD/mdbci/templates/build.json.template index 08f93905a..9f3a21b6e 100644 --- a/BUILD/mdbci/templates/build.json.template +++ b/BUILD/mdbci/templates/build.json.template @@ -3,7 +3,11 @@ { "hostname" : "default", "memory_size" : "4096", - "box" : "$box" + "box" : "$box", + "product" : { + "name": "docker" + } + } } diff --git a/include/maxscale/backend.hh b/include/maxscale/backend.hh index 4c8c50168..33d22defe 100644 --- a/include/maxscale/backend.hh +++ b/include/maxscale/backend.hh @@ -303,6 +303,16 @@ public: return m_state & FATAL_FAILURE; } + /** + * Is the backend replaying session command history + * + * @return If a list of session commands was provided at connect time, the function returns true as long + * as the backend has not completed those session commands. + */ + inline bool is_replaying_history() const + { + return m_history_size > 0; + } /** * @brief Get the object name of this server @@ -389,7 +399,8 @@ private: maxbase::StopWatch m_session_timer; maxbase::IntervalTimer m_select_timer; - int64_t m_num_selects = 0; + int64_t m_num_selects {0}; + int64_t m_history_size {0}; }; typedef std::shared_ptr SBackend; diff --git a/maxscale-system-test/CMakeLists.txt b/maxscale-system-test/CMakeLists.txt index 27eab7685..f4045c71b 100644 --- a/maxscale-system-test/CMakeLists.txt +++ b/maxscale-system-test/CMakeLists.txt @@ -1001,6 +1001,9 @@ add_test_executable(sr_basics.cpp sr_basics sr_basics LABELS REPL_BACKEND) # MXS-2490: Direct execution doesn't work with MaxScale add_test_executable(mxs2490_ps_execute_direct.cpp mxs2490_ps_execute_direct replication LABELS REPL_BACKEND readwritesplit) +# MXS-2609: Maxscale crash in RWSplitSession::retry_master_query() +add_test_executable(mxs2609_history_replay.cpp mxs2609_history_replay mxs2609_history_replay LABELS readwritesplit REPL_BACKEND) + # MXS-2621: Incorrect SQL if lower_case_table_names is used. add_test_executable(mxs2621_lower_case_tables.cpp mxs2621_lower_case_tables mxs2621_lower_case_tables LABELS REPL_BACKEND) diff --git a/maxscale-system-test/cnf/maxscale.cnf.template.mxs2609_history_replay b/maxscale-system-test/cnf/maxscale.cnf.template.mxs2609_history_replay new file mode 100755 index 000000000..bf4cc230c --- /dev/null +++ b/maxscale-system-test/cnf/maxscale.cnf.template.mxs2609_history_replay @@ -0,0 +1,27 @@ +[maxscale] +threads=###threads### + +###server### + +[MySQL-Monitor] +type=monitor +module=mariadbmon +servers=###server_line### +user=maxskysql +password=skysql +monitor_interval=1000 + +[RW-Split-Router] +type=service +router=readwritesplit +servers=###server_line### +user=maxskysql +password=skysql +transaction_replay=true +master_accept_reads=true + +[RW-Split-Listener] +type=listener +service=RW-Split-Router +protocol=MySQLClient +port=4006 diff --git a/maxscale-system-test/mxs2609_history_replay.cpp b/maxscale-system-test/mxs2609_history_replay.cpp new file mode 100644 index 000000000..add9b50e2 --- /dev/null +++ b/maxscale-system-test/mxs2609_history_replay.cpp @@ -0,0 +1,69 @@ +/** + * MXS-2609: Maxscale crash in RWSplitSession::retry_master_query() + * + * https://jira.mariadb.org/browse/MXS-2609 + * + * This test attempts to reproduce the crash described in MXS-2609 which + * occurred during a retrying attempt of a session command that failed on + * the master. + */ + + +#include "testconnections.h" + +int main(int argc, char** argv) +{ + TestConnections test(argc, argv); + + auto block = [&test](int n){ + test.repl->block_node(n); + test.maxscales->wait_for_monitor(); + test.repl->unblock_node(n); + test.maxscales->wait_for_monitor(); + }; + auto conn = test.maxscales->rwsplit(); + + // + // Test 1: Master failure mid-reconnect should trigger query replay + // + + test.expect(conn.connect(), "First connect should work: %s", conn.error()); + + // Queue up session commands so that the history replay takes some time + for (int i = 0; i < 10; i++) + { + conn.query("SET @a = (SLEEP 1)"); + } + + block(0); + + test.set_timeout(90); + + std::thread([&](){ + sleep(5); + block(0); + }).detach(); + + test.expect(conn.query("SELECT @@last_insert_id"), "Query should work: %s", conn.error()); + + test.stop_timeout(); + conn.disconnect(); + + // + // Test 2: Exceed history limit and trigger a master reconnection + // + + test.maxctrl("alter service RW-Split-Router max_sescmd_history 2"); + test.expect(conn.connect(), "Second should work: %s", conn.error()); + + for (int i = 0; i < 5; i++) + { + conn.query("SET @a = (SLEEP 1)"); + } + + block(0); + + test.expect(!conn.query("SELECT @@last_insert_id"), "Query should fail"); + + return test.global_result; +} diff --git a/server/core/backend.cc b/server/core/backend.cc index 23c1bb961..ba458eed3 100644 --- a/server/core/backend.cc +++ b/server/core/backend.cc @@ -54,6 +54,7 @@ void Backend::close(close_type type) m_closed = true; m_closed_at = time(NULL); m_session_commands.clear(); + m_history_size = 0; if (in_use()) { @@ -139,6 +140,12 @@ uint64_t Backend::complete_session_command() { uint64_t rval = m_session_commands.front()->get_position(); m_session_commands.pop_front(); + + if (m_history_size > 0) + { + --m_history_size; + } + return rval; } @@ -192,10 +199,12 @@ bool Backend::connect(MXS_SESSION* session, SessionCommandList* sescmd) m_state = IN_USE; mxb::atomic::add(&m_backend->connections, 1, mxb::atomic::RELAXED); rval = true; + m_history_size = 0; - if (sescmd && sescmd->size()) + if (sescmd && !sescmd->empty()) { append_session_command(*sescmd); + m_history_size = sescmd->size(); if (!execute_session_command()) { diff --git a/server/modules/monitor/csmon/csmon.cc b/server/modules/monitor/csmon/csmon.cc index d2420ae66..1d82f7d1b 100644 --- a/server/modules/monitor/csmon/csmon.cc +++ b/server/modules/monitor/csmon/csmon.cc @@ -58,7 +58,7 @@ static std::string do_query(MonitorServer* srv, const char* query) // Returns a numeric version similar to mysql_get_server_version int get_cs_version(MonitorServer* srv) { - int rval = 0; + int rval = -1; std::string prefix = "Columnstore "; std::string result = do_query(srv, "SELECT @@version_comment"); auto pos = result.find(prefix); @@ -107,16 +107,21 @@ void CsMonitor::update_server_status(MonitorServer* srv) if (do_query(srv, alive_query) == "1") { - status |= SERVER_RUNNING; + auto version = get_cs_version(srv); - if (get_cs_version(srv) >= 10200) + if (version >= 0) { - // 1.2 supports the mcsSystemPrimary function - status |= do_query(srv, role_query) == "1" ? SERVER_MASTER : SERVER_SLAVE; - } - else - { - status |= srv->server == m_primary ? SERVER_MASTER : SERVER_SLAVE; + status |= SERVER_RUNNING; + + if (version >= 10200) + { + // 1.2 supports the mcsSystemPrimary function + status |= do_query(srv, role_query) == "1" ? SERVER_MASTER : SERVER_SLAVE; + } + else + { + status |= srv->server == m_primary ? SERVER_MASTER : SERVER_SLAVE; + } } } diff --git a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc index 69dd07f80..93e70a2f5 100644 --- a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc +++ b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc @@ -355,7 +355,7 @@ bool RWSplitSession::route_single_stmt(GWBUF* querybuf) else if (target->has_session_commands()) { // We need to wait until the session commands are executed - m_query_queue.emplace_back(gwbuf_clone(querybuf)); + m_query_queue.emplace_front(gwbuf_clone(querybuf)); MXS_INFO("Queuing query until '%s' completes session command", target->name()); } else @@ -652,7 +652,8 @@ RWBackend* RWSplitSession::get_master_backend() if (master) { - if (master->in_use() || (m_config.master_reconnection && master->can_connect())) + if (master->in_use() + || (m_config.master_reconnection && master->can_connect() && can_recover_servers())) { if (can_continue_using_master(master)) { @@ -667,7 +668,8 @@ RWBackend* RWSplitSession::get_master_backend() } else { - MXS_ERROR("Server '%s' is not in use and can't be chosen as the master.", + MXS_ERROR("Cannot choose server '%s' as the master because it is not " + "in use and a new connection to it cannot be created.", master->name()); } } diff --git a/server/modules/routing/readwritesplit/rwsplitsession.cc b/server/modules/routing/readwritesplit/rwsplitsession.cc index bac38aaad..767981308 100644 --- a/server/modules/routing/readwritesplit/rwsplitsession.cc +++ b/server/modules/routing/readwritesplit/rwsplitsession.cc @@ -957,17 +957,40 @@ bool RWSplitSession::retry_master_query(RWBackend* backend) { bool can_continue = false; - if (backend->has_session_commands()) + if (backend->is_replaying_history()) { - // Try to route the session command again. If the master is not available, the response will be - // returned from one of the slaves. + // Master failed while it was replaying the session command history + mxb_assert(m_config.master_reconnection); + mxb_assert(!m_query_queue.empty()); + retry_query(m_query_queue.front().release()); + m_query_queue.pop_front(); + can_continue = true; + } + else if (backend->has_session_commands()) + { + // We were routing a session command to all servers but the master server from which the response + // was expected failed: try to route the session command again. If the master is not available, + // the response will be returned from one of the slaves if the configuration allows it. + + mxb_assert(backend->next_session_command()->get_position() == m_recv_sescmd + 1); + mxb_assert(m_qc.current_route_info().target() == TARGET_ALL); mxb_assert(!m_current_query.get()); mxb_assert(!m_sescmd_list.empty()); mxb_assert(m_sescmd_count >= 2); MXS_INFO("Retrying session command due to master failure: %s", backend->next_session_command()->to_string().c_str()); + // MXS-2609: Maxscale crash in RWSplitSession::retry_master_query() + // To prevent a crash from happening, we make sure the session command list is not empty before + // we touch it. This should be converted into a debug assertion once the true root cause of the + // problem is found. + if (m_sescmd_count < 2 || m_sescmd_list.empty()) + { + MXS_WARNING("Session command list was empty when it should not be"); + return false; + } + // Before routing it, pop the failed session command off the list and decrement the number of // executed session commands. This "overwrites" the existing command and prevents history duplication. m_sescmd_list.pop_back(); @@ -978,6 +1001,8 @@ bool RWSplitSession::retry_master_query(RWBackend* backend) } else if (m_current_query.get()) { + // A query was in progress, try to route it again + mxb_assert(m_prev_target == backend); retry_query(m_current_query.release()); can_continue = true; }