Files
oceanbase/src/storage/tx/ob_tx_2pc_ctx_impl.cpp
2024-02-07 14:20:47 +00:00

544 lines
19 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#include "storage/tx/ob_trans_part_ctx.h"
#include "storage/tx/ob_trans_service.h"
#include "storage/tx/ob_committer_define.h"
namespace oceanbase
{
using namespace common;
using namespace share;
namespace transaction
{
// get_2pc_role depends on the current state: a participant that is a leaf now
// may change into an internal node later. The role is therefore only
// meaningful when decided under lock at a single point in time.
Ob2PCRole ObPartTransCtx::get_2pc_role() const
{
  Ob2PCRole ret_role = Ob2PCRole::UNKNOWN;
  if (!exec_info_.upstream_.is_valid()) {
    // upstream not decided yet, keep UNKNOWN
  } else if (exec_info_.upstream_ == ls_id_) {
    ret_role = Ob2PCRole::ROOT;
  } else if (exec_info_.participants_.empty()) {
    // not root and downstream is empty, so it is a leaf. The root can never
    // be a leaf, because a distributed txn is composed of more than one
    // participant.
    ret_role = Ob2PCRole::LEAF;
  } else {
    ret_role = Ob2PCRole::INTERNAL;
  }
  return ret_role;
}
// Number of direct downstream participants recorded in the exec info.
int64_t ObPartTransCtx::get_downstream_size() const
{
  const int64_t downstream_cnt = exec_info_.participants_.count();
  return downstream_cnt;
}
// Resolve (and cache) this participant's id via find_participant_id_.
// Returns -1 while the id cannot be resolved yet (non-root whose participant
// list may still be incomplete); subsequent calls retry the lookup.
int64_t ObPartTransCtx::get_self_id()
{
  int ret = OB_SUCCESS;
  if (self_id_ == -1) {
    // -1 is the "not resolved yet" sentinel; only look up once per ctx.
    if (OB_FAIL(find_participant_id_(ls_id_, self_id_))) {
      if (is_root()) {
        // The root is expected to always find itself in its own list.
        TRANS_LOG(ERROR, "find participant id failed", K(ret), K(*this));
      } else {
        // Restore the sentinel so a later call retries the lookup.
        self_id_ = -1;
      }
    }
  }
  return self_id_;
}
int ObPartTransCtx::restart_2pc_trans_timer_()
{
int ret = OB_SUCCESS;
trans_2pc_timeout_ = ObServerConfig::get_instance().trx_2pc_retry_interval;
(void)unregister_timeout_task_();
if (OB_FAIL(register_timeout_task_(trans_2pc_timeout_))) {
TRANS_LOG(WARN, "register timeout handler error", KR(ret), "context", *this);
}
return ret;
}
/*
* If no_need_submit_log is true, it will not submit prepare log after do_prepare.
* XA and dup_table will submit commit info log in do_prepare and drive to submit prepare log after the conditions are met.
* When prepare log on_succ, 2PC will enter into next phase.
* */
// Execute the prepare phase on this participant.
// @param [out] no_need_submit_log set true when the prepare log must NOT be
//        submitted right after do_prepare (dup-table redo sync still pending);
//        see the comment above for the XA / dup-table driven cases.
// @return OB_SUCCESS, OB_TRANS_KILLED when the txn was force-aborted, or the
//         error of the first failing step.
int ObPartTransCtx::do_prepare(bool &no_need_submit_log)
{
  int ret = OB_SUCCESS;
  no_need_submit_log = false;
  if (OB_SUCC(ret)) {
    if (sub_state_.is_force_abort()) {
      // Force-aborted before prepare: write the compensating abort log and
      // terminate the commit attempt with OB_TRANS_KILLED.
      if (OB_FAIL(compensate_abort_log_())) {
        TRANS_LOG(WARN, "compensate abort log failed", K(ret), K(ls_id_), K(trans_id_),
                  K(get_downstream_state()), K(get_upstream_state()), K(sub_state_));
      } else {
        ret = OB_TRANS_KILLED;
      }
    }
  }
  if (OB_SUCC(ret)) {
    // Make sure no dup-table redo is left unsubmitted before preparing.
    if (OB_FAIL(search_unsubmitted_dup_table_redo_())) {
      TRANS_LOG(WARN, "search unsubmitted dup table redo", K(ret), KPC(this));
    }
  }
  if (OB_SUCC(ret)) {
    if (exec_info_.is_dup_tx_ && !is_dup_table_redo_sync_completed_()) {
      // Redo sync with dup-table replicas is not finished: hold the prepare
      // log back and drive the redo sync instead.
      no_need_submit_log = true;
      if (OB_FAIL(dup_table_tx_redo_sync_())) {
        TRANS_LOG(WARN, "dup table tx redo sync failed", K(ret));
      }
    } else if (OB_FAIL(generate_prepare_version_())) {
      TRANS_LOG(WARN, "generate prepare version failed", K(ret), K(*this));
    }
  }
  if (exec_info_.is_dup_tx_) {
    TRANS_LOG(INFO, "do prepare for dup table", K(ret), K(dup_table_follower_max_read_version_),
              K(is_dup_table_redo_sync_completed_()), KPC(this));
  }
  if (OB_SUCC(ret)) {
    // Notify multi-data-source participants that the txn is about to end.
    if (OB_FAIL(prepare_mul_data_source_tx_end_(true))) {
      TRANS_LOG(WARN, "trans commit need retry", K(ret), K(trans_id_), K(ls_id_));
    }
  }
  if (OB_SUCC(ret) && OB_FAIL(restart_2pc_trans_timer_())) {
    TRANS_LOG(WARN, "restart_2pc_trans_timer_ error", KR(ret), KPC(this));
  }
  return ret;
}
// Callback once the prepare log takes effect; intentionally a no-op for now.
int ObPartTransCtx::on_prepare()
{
  int ret = OB_SUCCESS;
  return ret;
}
/* Why do we need to execute wait_gts_elapse_commit_version_ successfully before post
* the pre-commit message;
*
* The pre-commit request does not write the palf log; then when the leader replica is switched,
* the information of the received pre-commit message will be lost, which is equivalent to not
* executing the pre-commit;
* Exec wait_gts_elapse_commit_version_ before the pre-commit message is sent, which is equivalent
* to adding a limit, the max_commit_ts timestamp of pre-commit promotion will not exceed gts;
* the leader replica switch is completed and then max_commit_ts is updated to the gts value;
* equivalent to executing pre-commit ;
*
* Scenarios that trigger the problem:
*
* 1. LS A as coordinator, LS B as participant;
* 2. LS A sends a pre-commit message to LS B, the pre-commit message (version = 105) will push the
* max_commit_ts of each node higher than gts (version = 100);
*
* 3. The leader replica of LS B responded to the pre-commit message successfully;
* 4. The part_ctx on LS A runs to the commit state and executes wait_gts_elapse_commit_version_;
* 5. The leader replica switch of LS B occurs. After the leader replica switch is completed,
* the max_commit_ts of the leader replica of LS B is updated to the gts value (version = 100);
* 6. LS A wait_gts_elapse_commit_version_ is executed successfully and returns success to the
* client, let the client know that value (version=105) has been successfully committed;
 *    The OB_MSG_TX_COMMIT_REQ message sent by LS A to LS B is still in the network;
* 7. After the client receives the successful commit, it sends a read request req_1 to the LS B;
* the thread thread_1 that processes the req_1 obtains the max_commit_ts of the LS B leader
* replica as snapshot ( version = 100 ); then this thread_1 is suspended for scheduling reasons;
* 8. Then LS B receives the OB_MSG_TX_COMMIT_REQ sent by LS A, updates max_commit_ts, and unlocks
* the row;
* 9. The thread_1 continues to execute, and will read the value of version = 100;
*/
// Execute the pre-commit phase.
// @param [out] need_wait true when GTS has not yet elapsed the commit version
//        and the caller must wait before posting the pre-commit message (see
//        the scenario described in the comment above).
int ObPartTransCtx::do_pre_commit(bool &need_wait)
{
  int ret = OB_SUCCESS;
  int tmp_ret = OB_SUCCESS;
  need_wait = false;
  if (is_root()) {
    // The root fixes the commit version to the decided prepare version, and
    // must wait for GTS to elapse it before the pre-commit msg is posted.
    if (OB_FAIL(ctx_tx_data_.set_commit_version(exec_info_.prepare_version_))) {
      TRANS_LOG(ERROR, "set tx data commit version", K(ret), K(ctx_tx_data_));
    } else if (OB_FAIL(wait_gts_elapse_commit_version_(need_wait))) {
      TRANS_LOG(ERROR, "wait gts elapse commit version failed", KR(ret), KPC(this));
    } else {
    }
  }
  // try to post ts sync request at first; OB_EAGAIN only means the dup-table
  // pre-commit is not finished yet and will be retried, so tmp_ret never
  // pollutes ret.
  if (exec_info_.is_dup_tx_ && (OB_TMP_FAIL(dup_table_tx_pre_commit_()))) {
    if (OB_EAGAIN == tmp_ret) {
      TRANS_LOG(INFO, "dup table pre_commit is not finish", K(tmp_ret));
    } else {
      TRANS_LOG(WARN, "dup table pre_commit error", K(tmp_ret));
    }
  }
  // Only publish the commit version locally once the GTS wait is over.
  if (!need_wait && OB_FAIL(update_local_max_commit_version_(ctx_tx_data_.get_commit_version()))) {
    TRANS_LOG(ERROR, "update publish version failed", KR(ret), KPC(this));
  }
  if (OB_SUCC(ret) && OB_FAIL(restart_2pc_trans_timer_())) {
    TRANS_LOG(ERROR, "restart_2pc_trans_timer_ error", KR(ret), KPC(this));
  }
  return ret;
}
// Drive the commit phase.
// Local (single-LS) txn: commit version = max(gts, local max read version);
// OB_EAGAIN from get_gts_ is swallowed so the phase is retried later.
// Distributed txn: the version was decided at prepare, so only publish the
// local max commit version and (on the root) respond to the scheduler.
int ObPartTransCtx::do_commit()
{
  int ret = OB_SUCCESS;
  SCN gts, local_max_read_version;
  if (is_local_tx_()) {
    if (OB_FAIL(get_gts_(gts))) {
      if (OB_EAGAIN == ret) {
        // GTS not available yet; reset ret and wait for the retry.
        TRANS_LOG(INFO, "get gts eagain", KR(ret), KPC(this));
        ret = OB_SUCCESS;
      } else {
        TRANS_LOG(ERROR, "get gts failed", KR(ret), K(*this));
      }
    } else {
      // To order around the read-write conflict(anti dependency), we need push
      // the txn version upper than all previous read version. So we record all
      // read version each access begins and get the max read version to handle
      // the dependency conflict
      mt_ctx_.before_prepare(gts);
      if (OB_FAIL(get_local_max_read_version_(local_max_read_version))) {
        TRANS_LOG(ERROR, "get local max read version failed", KR(ret), K(*this));
      } else if (OB_FAIL(ctx_tx_data_.set_commit_version(SCN::max(gts, local_max_read_version)))) {
        TRANS_LOG(ERROR, "set tx data commit version failed", K(ret));
      }
    }
  } else {
    // Because Oceanbase two phase commit delay is optimized through distributed
    // prepare state, we can guarantee commit state even commit log has not been
    // synchronized.
    //
    // We cannot release row lock until the commit has been submitted(Note the
    // difference between 'synchronized' and 'submitted'), because the other txn
    // that touches the same row and commits after the txn may submit its own log
    // about the row or even the commit info before this txn's.
    //
    // TODO(handora.qc): add ls info arr to prepare_log_info_arr
    // ObLSLogInfoArray ls_info_arr;
    //
    // We already update the local max commit version in precommit phase,
    // while we optimize the phase and skip some participants' request,
    // so we need always update local max commit version
    if (OB_FAIL(update_local_max_commit_version_(ctx_tx_data_.get_commit_version()))) {
      TRANS_LOG(ERROR, "update local max commit version failed", K(ret), KPC(this));
    } else if (is_root() && OB_FAIL(coord_prepare_info_arr_.assign(exec_info_.prepare_log_info_arr_))) {
      TRANS_LOG(WARN, "assign coord_prepare_info_arr_ for root failed", K(ret));
    } else if (is_root()) {
      // Best-effort response: the return code is not folded into ret here.
      check_and_response_scheduler_(ObTxState::COMMIT, OB_SUCCESS);
    }
  }
  if (OB_SUCC(ret) && OB_FAIL(restart_2pc_trans_timer_())) {
    TRANS_LOG(ERROR, "restart_2pc_trans_timer_ error", KR(ret), KPC(this));
  }
  return ret;
}
// Respond the txn result to the scheduler when this state allows it.
// Under the EN_EARLY_RESPONSE_SCHEDULER error injection, the response of a
// plain (non-sub2pc) txn is delayed until the CLEAR state is reached.
// @param next_phase the 2PC state this ctx is entering
// @param result     the result code to report to the scheduler
int ObPartTransCtx::check_and_response_scheduler_(ObTxState next_phase, int result)
{
  int ret = OB_SUCCESS;
  // when error inject, response scheduler delayed to CLEAR state
  int inject_err = OB_E(EventTable::EN_EARLY_RESPONSE_SCHEDULER) OB_SUCCESS;
  if (!is_sub2pc()
      && inject_err != OB_SUCCESS
      && next_phase != ObTxState::CLEAR
      && !callback_scheduler_on_clear_) {
    // remember to answer the scheduler once CLEAR is reached
    callback_scheduler_on_clear_ = true;
    return OB_SUCCESS;
  }
  if (callback_scheduler_on_clear_ && ObTxState::CLEAR != next_phase) {
    // delayed, skip other state
    return OB_SUCCESS;
  }
  if (is_sub2pc()) {
    // TODO, according to part trans action
    if (ObTxState::COMMIT == next_phase) {
      if (OB_FAIL(reply_to_scheduler_for_sub2pc(SUBCOMMIT_RESP))) {
        TRANS_LOG(ERROR, "fail to reply sub commit", K(ret), K(*this));
      }
    } else if (ObTxState::ABORT == next_phase || ObTxState::ABORT == get_downstream_state()) {
      if (OB_FAIL(reply_to_scheduler_for_sub2pc(SUBROLLBACK_RESP))) {
        TRANS_LOG(ERROR, "fail to reply sub rollback", K(ret), K(*this));
      }
    }
  } else {
    // Best-effort: a failed post is swallowed (ret reset to OB_SUCCESS).
    if (OB_FAIL(post_tx_commit_resp_(result))) {
      TRANS_LOG(WARN, "post commit response failed", KR(ret), K(*this));
      ret = OB_SUCCESS;
    }
  }
  if (OB_FAIL(ret)) {
    TRANS_LOG(INFO, "response scheduler in 2pc", K(ret), K(result), KPC(this));
  }
  return ret;
}
// Push the given commit version into this server's max commit timestamp via
// the tx version manager; always reports success.
int ObPartTransCtx::update_local_max_commit_version_(const SCN &commit_version)
{
  int ret = OB_SUCCESS;
  (void)trans_service_->get_tx_version_mgr().update_max_commit_ts(commit_version, false);
  return ret;
}
// Callback once the commit takes effect. Only the distributed path performs
// end-of-txn processing here; the local path is a TODO for sp commit.
int ObPartTransCtx::on_commit()
{
  int ret = OB_SUCCESS;
  if (!is_local_tx_() && OB_FAIL(on_dist_end_(true /*commit*/))) {
    TRANS_LOG(WARN, "transaciton end error", KR(ret), "context", *this);
  }
  return ret;
}
// Abort phase entry. A sub2pc (XA) txn answers the scheduler in on_abort
// instead, so only a plain root responds here.
int ObPartTransCtx::do_abort()
{
  int ret = OB_SUCCESS;
  if (!is_sub2pc() && is_root()) {
    // best-effort response; its result does not affect the abort itself
    (void)check_and_response_scheduler_(ObTxState::ABORT, OB_TRANS_KILLED);
  }
  return ret;
}
// Callback once the abort takes effect: answer the scheduler for a sub2pc
// root, then run the distributed end processing and clear the txn state.
int ObPartTransCtx::on_abort()
{
  int ret = OB_SUCCESS;
  if (is_sub2pc() && is_root()) {
    // best-effort rollback response to the scheduler
    (void)check_and_response_scheduler_(ObTxState::ABORT, OB_TRANS_KILLED);
  }
  if (OB_FAIL(on_dist_end_(false /*commit*/))) {
    TRANS_LOG(WARN, "transaciton end error", KR(ret), "context", *this);
  } else {
    if (OB_FAIL(trans_clear_())) {
      TRANS_LOG(WARN, "transaciton clear error", KR(ret), "context", *this);
    }
  }
  return ret;
}
// Clear phase entry: only the root has work to do here.
int ObPartTransCtx::do_clear()
{
  int ret = OB_SUCCESS;
  if (is_root()) {
    // response scheduler after all participant commit log sycned
    (void)check_and_response_scheduler_(ObTxState::CLEAR, OB_SUCCESS);
  }
  return ret;
}
// Final teardown on this participant: stop the retry timer, clear the txn
// state and mark the ctx as exiting. Every step is best-effort — results are
// deliberately discarded.
int ObPartTransCtx::on_clear()
{
  int ret = OB_SUCCESS;
  (void)unregister_timeout_task_();
  (void)trans_clear_();
  (void)set_exiting_();
  return ret;
}
// Reply the sub2pc (XA) result to the scheduler. Only a distributed root may
// answer; a failed post is swallowed (ret reset) and the reply is logged.
// @param msg_type one of SUBPREPARE_RESP / SUBCOMMIT_RESP / SUBROLLBACK_RESP
int ObPartTransCtx::reply_to_scheduler_for_sub2pc(int64_t msg_type)
{
  int ret = OB_SUCCESS;
  if (is_local_tx_()) {
    ret = OB_ERR_UNEXPECTED;
    TRANS_LOG(WARN, "unexpected trans type", KR(ret), K(*this));
  } else if (!is_root()) {
    ret = OB_ERR_UNEXPECTED;
    TRANS_LOG(WARN, "not root, unexpected", KR(ret), K(*this));
  } else if (SUBPREPARE_RESP == msg_type) {
    if (OB_FAIL(post_tx_sub_prepare_resp_(OB_SUCCESS /*commit*/))) {
      TRANS_LOG(WARN, "fail to post sub prepare response", KR(ret), K(*this));
      ret = OB_SUCCESS;
    }
    TRANS_LOG(INFO, "reply to scheduler for sub prepare", KR(ret), K(*this));
  } else if (SUBCOMMIT_RESP == msg_type) {
    if (OB_FAIL(post_tx_sub_commit_resp_(OB_SUCCESS))) {
      TRANS_LOG(WARN, "fail to post sub commit response", KR(ret), K(*this));
      ret = OB_SUCCESS;
    }
    TRANS_LOG(INFO, "reply to scheduler for sub commit", KR(ret), K(*this));
  } else if (SUBROLLBACK_RESP == msg_type) {
    if (OB_FAIL(post_tx_sub_rollback_resp_(OB_SUCCESS))) {
      TRANS_LOG(WARN, "fail to post sub rollback response", KR(ret), K(*this));
      ret = OB_SUCCESS;
    }
    TRANS_LOG(INFO, "reply to scheduler for sub rollback", KR(ret), K(*this));
  }
  return ret;
}
// When to merge the intermediate_participants into the participants in a two
// phase commit needs careful consideration. One of the most critical factors is
// how to deal with concurrency with the transfer out logs.
//
// The primary rule we need to follow is that "the two phase commit state before
// the transfer out log will be relocated to the dest, and the two phase commit
// state after this log will participant into the src participants, progressing
// through a tree-style two phase commit." Therefore, before committing the
// transfer out log, we will first block the advancement of the two phase commit
// protocol for all txns which is required by the transfer (by blocking the
// advancement of the in-memory state through "drive_self_2pc_phase" and the
// advancement of the persistent state machine through "submit_log_if_allow"),
// ensuring the integrity of the two phase commit state. Simultaneously, we will
// add intermediate_participants for these blocked txns.
//
// The second rule is that we need to adhere to the principle that "when a
// participant of a txn enters a certain two phase commit state with a log, all
// transfer out logs before this log need to be included in the participants."
// Therefore, we must ensure that the transfer out logs before the writing of
// this two phase commit log will definitely be included in the temporary
// participants(because the transfer out logs are barrier logs), while the
// transfer out logs after that will not be included in the intermediate
// participants(because the transfer out logs block the advancement of the txn's
// state machine before being written to paxos, including both of the in-memory
// state and the persistent state, as explained above).
//
// Hence, with the protection of the blocking capability of the state machine in
// the in-memory state advancement("drive_self_2pc_phase") and the advancement
// of the persistent state machine("submit_log_if_allow"), we can safely proceed
// with the action of merging the intermediate_participant into the participants.
// Merge the intermediate (transfer-added) participants into the formal
// participant / commit-part arrays; see the comment above for when this is
// safe. Duplicate LS entries only update the commit part's transfer epoch
// (first epoch wins); new entries are appended to both arrays, and the
// intermediate list is reused (emptied) on success.
int ObPartTransCtx::merge_intermediate_participants()
{
  int ret = OB_SUCCESS;
  bool exist = false;
  const int64_t participants_size = exec_info_.participants_.count();
  const int64_t increase_size = exec_info_.intermediate_participants_.count();
  if (increase_size > 0) {
    // participants_ and commit_parts_ must stay index-aligned.
    if (participants_size != exec_info_.commit_parts_.count()) {
      ret = OB_ERR_UNEXPECTED;
      TRANS_LOG(WARN, "part size not match", KR(ret), KPC(this));
    } else if (OB_FAIL(exec_info_.participants_.reserve(participants_size + increase_size))) {
      TRANS_LOG(WARN, "part reserve failed", KR(ret), KPC(this));
    } else if (OB_FAIL(exec_info_.commit_parts_.reserve(participants_size + increase_size))) {
      TRANS_LOG(WARN, "part reserve failed", KR(ret), KPC(this));
    }
    for (int64_t i = 0; OB_SUCC(ret) && i < increase_size; i++) {
      exist = false;
      // Only scan the pre-merge prefix: entries appended below come from the
      // intermediate list itself and need no dedup against each other.
      for (int64_t j = 0; OB_SUCC(ret) && !exist && j < participants_size; j++) {
        if (exec_info_.participants_[j] == exec_info_.intermediate_participants_[i].ls_id_) {
          if (exec_info_.commit_parts_.at(j).ls_id_ != exec_info_.participants_[j]) {
            ret = OB_ERR_UNEXPECTED;
            TRANS_LOG(WARN, "commit part ls_id not match", KR(ret), KPC(this));
          } else if (exec_info_.commit_parts_.at(j).transfer_epoch_ > 0) {
            // do nothing
            // use first transfer_epoch to drive
          } else {
            exec_info_.commit_parts_.at(j).transfer_epoch_ = exec_info_.intermediate_participants_[i].transfer_epoch_;
          }
          exist = true;
        }
      }
      if (OB_SUCC(ret) && !exist) {
        if (OB_FAIL(exec_info_.participants_.push_back(exec_info_.intermediate_participants_[i].ls_id_))) {
          TRANS_LOG(WARN, "fail to push back incremental participants", KR(ret), KPC(this));
        } else if (OB_FAIL(exec_info_.commit_parts_.push_back(exec_info_.intermediate_participants_[i]))) {
          TRANS_LOG(WARN, "fail to push back incremental participants", KR(ret), KPC(this));
        }
      }
    }
    TRANS_LOG(INFO, "merge participant", KR(ret),
              K(trans_id_),
              K(ls_id_),
              KP(this),
              K(exec_info_.participants_),
              K(exec_info_.intermediate_participants_));
    if (OB_SUCC(ret)) {
      // Emptied only on success so a failed merge can be retried.
      (void)exec_info_.intermediate_participants_.reuse();
    }
  }
  return ret;
}
// Check whether the given LS is the recorded (real) upstream of this ctx.
bool ObPartTransCtx::is_real_upstream_(const ObLSID upstream)
{
  const bool matched = (upstream == exec_info_.upstream_);
  return matched;
}
// Whether the current driver of this ctx is the real upstream.
bool ObPartTransCtx::is_real_upstream()
{
  // A null msg cache means the call comes from handle_timeout, which only
  // ever targets the real upstream, so it is treated as a match.
  return OB_ISNULL(msg_2pc_cache_) ? true
                                   : is_real_upstream_(msg_2pc_cache_->sender_);
}
// Record an LS as an intermediate (transfer-added) participant; duplicates
// (same ls_id) are silently ignored.
int ObPartTransCtx::add_intermediate_participants(const share::ObLSID ls_id, int64_t transfer_epoch)
{
  int ret = OB_SUCCESS;
  bool already_recorded = false;
  const int64_t cnt = exec_info_.intermediate_participants_.count();
  for (int64_t idx = 0; !already_recorded && idx < cnt; idx++) {
    already_recorded = (ls_id == exec_info_.intermediate_participants_[idx].ls_id_);
  }
  if (!already_recorded) {
    // epoch -1 placeholder matches the ObTxExecPart construction convention
    if (OB_FAIL(exec_info_.intermediate_participants_.push_back(ObTxExecPart(ls_id, -1, transfer_epoch)))) {
      TRANS_LOG(WARN, "fail to push back participant into intermediate participants", KR(ret), KPC(this));
    }
  }
  return ret;
}
} // end namespace transaction
} // end namespace oceanbase