Files
oceanbase/src/storage/ob_partition_group.cpp

6140 lines
224 KiB
C++

/**
* Copyright (c) 2021 OceanBase
* OceanBase CE is licensed under Mulan PubL v2.
* You can use this software according to the terms and conditions of the Mulan PubL v2.
* You may obtain a copy of Mulan PubL v2 at:
* http://license.coscl.org.cn/MulanPubL-2.0
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PubL v2 for more details.
*/
#define USING_LOG_PREFIX STORAGE
#include "clog/ob_clog_history_reporter.h"
#include "storage/ob_partition_group.h"
#include "lib/lock/ob_spin_rwlock.h"
#include "lib/allocator/ob_malloc.h"
#include "lib/utility/ob_print_utils.h"
#include "lib/utility/ob_tracepoint.h"
#include "common/ob_partition_key.h"
#include "storage/blocksstable/ob_block_sstable_struct.h"
#include "storage/ob_partition_storage.h"
#include "storage/ob_partition_service_rpc.h"
#include "storage/ob_partition_service.h"
#include "storage/ob_partition_log.h"
#include "storage/ob_saved_storage_info_v2.h"
#include "common/storage/ob_freeze_define.h"
#include "storage/memtable/ob_memtable.h"
#include "storage/transaction/ob_trans_service.h"
#include "storage/ob_partition_scheduler.h"
#include "observer/ob_server_struct.h"
#include "storage/ob_partition_split_worker.h"
#include "storage/transaction/ob_ts_mgr.h"
#include "storage/ob_partition_log.h"
#include "storage/ob_partition_checkpoint.h"
#include "share/ob_partition_modify.h"
#include "storage/ob_partition_migrator.h"
#include "storage/ob_pg_log.h"
#include "share/schema/ob_part_mgr_util.h"
#include "storage/ob_file_system_util.h"
#include "share/backup/ob_backup_info_mgr.h"
#include "storage/transaction/ob_trans_split_adapter.h"
#include "share/ob_cluster_version.h"
namespace oceanbase {
using namespace common;
using namespace rootserver;
using namespace blocksstable;
using namespace memtable;
using namespace transaction;
using namespace clog;
using namespace share::schema;
using namespace share;
using namespace obrpc;
namespace storage {
const char* OB_PARTITION_STATE_STR[INVALID_STATE + 1] = {"INIT",
"F_WORKING",
"F_MINOR",
"L_TAKEOVER",
"L_TAKEOVERED",
"L_CANCELED",
"L_WORKING",
"L_FROZEN",
"L_MINOR",
"L_REVOKE",
"OFFLINING",
"OFFLINE",
"REMOVE",
"INVALID_STATE"};
ObPartitionGroup::ObPartitionGroup()
: is_inited_(false),
pkey_(),
replay_status_(NULL),
cp_fty_(NULL),
pls_(NULL),
txs_(NULL),
rp_eg_(NULL),
ps_(NULL),
pg_index_(nullptr),
pg_partition_map_(nullptr),
lock_(),
partition_state_lock_(),
partition_state_(INIT),
schema_version_container_(),
schema_service_(NULL),
split_state_(),
split_info_(),
split_trans_clear_ts_(0),
is_split_blocked_by_mc_(false),
max_passed_trans_version_(0),
freeze_record_(),
trans_version_lock_(),
gc_schema_drop_ts_(OB_INVALID_TIMESTAMP),
gc_seq_check_valid_member_(-1),
offline_log_id_(OB_INVALID_ID),
migrate_retry_flag_(NO_NEED_RETRY),
need_gc_(false),
restore_task_cnt_(0),
restore_task_ts_(0),
has_clear_trans_after_restore_(false)
{
REF_KEEPER.reg(safe_ref_, (void*)this);
}
ObPartitionGroup::~ObPartitionGroup()
{
FLOG_INFO("deconstruct ObPartitionGroup", K(this), K_(pkey));
destroy();
}
int ObPartitionGroup::check_init_(void* cp, const char* cp_name) const
{
int ret = OB_SUCCESS;
if (!is_inited_ || NULL == cp || NULL == cp_name) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "component does not exist", "component name", cp_name);
}
return ret;
}
const ObPartitionKey& ObPartitionGroup::get_partition_key() const
{
return pkey_;
}
int ObPartitionGroup::get_pg_partition(const common::ObPartitionKey& pkey, ObPGPartitionGuard& guard)
{
return pg_storage_.get_pg_partition(pkey, guard);
}
int ObPartitionGroup::set_valid()
{
return try_switch_partition_state(F_WORKING);
}
void ObPartitionGroup::set_invalid()
{
STORAGE_LOG(ERROR, "set partition to INVALID_STATE state");
ATOMIC_STORE(&partition_state_, INVALID_STATE);
while (1) {
;
}
}
int ObPartitionGroup::table_scan(ObTableScanParam& param, ObNewRowIterator*& result)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
} else if (OB_FAIL(pg_storage_.table_scan(param, result))) {
if (NULL != param.trans_desc_) {
LOG_WARN("failed to do table scan", K(ret), "this", *this, "trans_id", param.trans_desc_->get_trans_id());
} else {
LOG_WARN("failed to do table scan", K(ret), "this", *this);
}
} else {
// do nothing
}
return ret;
}
/*
The description of access replica_type and splite_state without lock at DML interfaces such as table_scan and
insert/update.
1. TableScan
a) There is no guarantee that read does not report errors if we check replica type without lock at PG layer, for
example read and replica type transforming occur concurrently. This situation should be dealt with at
ObPartitionStore::get_read_tables. b) PG split The origin partition can be choosed as the read partition, if table scan
starts before PG split; The read operation at dest partition needs to retry (transmit it maybe better), if the
partition is at splitting sate which means that the logical split is not completed.
2. Insert/Update/Delete/Lock
a) The replica type change is made on the follower. Even if there is no defense here, the transaction layer will
still report an error. b) During the operation of DML, it is necessary to ensure that the memstore that has been frozen
can no longer be written. This is guaranteed at check_split_state.
In conclusion, the above interfaces do not need to be modified to access replica_type and split_state with lock.
*/
int ObPartitionGroup::table_scan(ObTableScanParam& param, ObNewIterIterator*& result)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
} else if (OB_FAIL(pg_storage_.table_scan(param, result))) {
if (NULL != param.trans_desc_) {
LOG_WARN("failed to do table scan", K(ret), "this", *this, "trans_id", param.trans_desc_->get_trans_id());
} else {
LOG_WARN("failed to do table scan", K(ret), "this", *this);
}
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::join_mv_scan(ObTableScanParam& left_param, ObTableScanParam& right_param,
ObIPartitionGroup& right_partition, common::ObNewRowIterator*& result)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
} else if (OB_FAIL(pg_storage_.join_mv_scan(left_param, right_param, right_partition, result))) {
LOG_WARN("join mv scan failed", K(ret), K(left_param), K(right_param));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::delete_rows(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param,
const ObIArray<uint64_t>& column_ids, ObNewRowIterator* row_iter, int64_t& affected_rows)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.delete_rows(ctx, dml_param, column_ids, row_iter, affected_rows))) {
LOG_WARN("failed to delete row", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::delete_row(
const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param, const ObIArray<uint64_t>& column_ids, const ObNewRow& row)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.delete_row(ctx, dml_param, column_ids, row))) {
LOG_WARN("failed to delete row", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::put_rows(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param,
const ObIArray<uint64_t>& column_ids, ObNewRowIterator* row_iter, int64_t& affected_rows)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K(pkey_));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.put_rows(ctx, dml_param, column_ids, row_iter, affected_rows))) {
LOG_WARN("failed to put rows", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::insert_rows(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param,
const ObIArray<uint64_t>& column_ids, ObNewRowIterator* row_iter, int64_t& affected_rows)
// ObPartitionKey &pkey
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.insert_rows(ctx, dml_param, column_ids, row_iter, affected_rows))) {
LOG_WARN("failed to insert rows", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::insert_row(
const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param, const ObIArray<uint64_t>& column_ids, const ObNewRow& row)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
} else if (OB_FAIL(pg_storage_.insert_row(ctx, dml_param, column_ids, row))) {
LOG_WARN("failed to insert rows", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::insert_row(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param,
const common::ObIArray<uint64_t>& column_ids, const common::ObIArray<uint64_t>& duplicated_column_ids,
const common::ObNewRow& row, const ObInsertFlag flag, int64_t& affected_rows,
common::ObNewRowIterator*& duplicated_rows)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.insert_row(
ctx, dml_param, column_ids, duplicated_column_ids, row, flag, affected_rows, duplicated_rows))) {
if (OB_ERR_PRIMARY_KEY_DUPLICATE != ret) {
LOG_WARN("failed to insert rows", K(ret), "this", *this, K(ctx));
}
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::fetch_conflict_rows(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param,
const ObIArray<uint64_t>& in_column_ids, const ObIArray<uint64_t>& out_column_ids, ObNewRowIterator& check_row_iter,
ObIArray<ObNewRowIterator*>& dup_row_iters)
{
int ret = OB_SUCCESS;
// For DML, it should check if the replica is writable and be able to read the data in memtable
// even if there is no sstable.
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.fetch_conflict_rows(
ctx, dml_param, in_column_ids, out_column_ids, check_row_iter, dup_row_iters))) {
LOG_WARN("failed to fetch conflict rows", K(ret), KPC(this), K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::revert_insert_iter(const common::ObPartitionKey& pkey, ObNewRowIterator* iter)
{
return pg_storage_.revert_insert_iter(pkey, iter);
}
int ObPartitionGroup::update_rows(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param,
const ObIArray<uint64_t>& column_ids, const ObIArray<uint64_t>& updated_column_ids, ObNewRowIterator* row_iter,
int64_t& affected_rows)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(
pg_storage_.update_rows(ctx, dml_param, column_ids, updated_column_ids, row_iter, affected_rows))) {
LOG_WARN("failed to update rows", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::update_row(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param,
const ObIArray<uint64_t>& column_ids, const ObIArray<uint64_t>& updated_column_ids, const ObNewRow& old_row,
const ObNewRow& new_row)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.update_row(ctx, dml_param, column_ids, updated_column_ids, old_row, new_row))) {
LOG_WARN("failed to update row", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::lock_rows(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param, const int64_t abs_lock_timeout,
ObNewRowIterator* row_iter, const ObLockFlag lock_flag, int64_t& affected_rows)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.lock_rows(ctx, dml_param, abs_lock_timeout, row_iter, lock_flag, affected_rows))) {
LOG_WARN("failed to lock rows", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::lock_rows(const ObStoreCtx& ctx, const ObDMLBaseParam& dml_param, const int64_t abs_lock_timeout,
const ObNewRow& row, const ObLockFlag lock_flag)
{
int ret = OB_SUCCESS;
if (OB_FAIL(check_split_state_())) {
LOG_WARN("check split state failed", K(ret), K_(pkey));
// } else if (OB_FAIL(schema_recorder_.try_update_table_schema(dml_param.schema_version_))) {
// LOG_WARN("fail to record table schema", K(ret), K(pkey_), K(dml_param.schema_version_));
} else if (OB_FAIL(pg_storage_.lock_rows(ctx, dml_param, abs_lock_timeout, row, lock_flag))) {
LOG_WARN("failed to lock rows", K(ret), "this", *this, K(ctx));
} else {
// do nothing
}
return ret;
}
const clog::ObIPartitionLogService* ObPartitionGroup::get_log_service() const
{
return pls_;
}
clog::ObIPartitionLogService* ObPartitionGroup::get_log_service()
{
return pls_;
}
ObPartitionService* ObPartitionGroup::get_partition_service()
{
return ps_;
}
ObTransService* ObPartitionGroup::get_trans_service()
{
return txs_;
}
int ObPartitionGroup::get_role(common::ObRole& role) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_role(role);
}
return ret;
}
int ObPartitionGroup::get_role_unsafe(common::ObRole &role) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_role(role);
}
return ret;
}
int ObPartitionGroup::get_leader(common::ObAddr& leader) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_leader(leader);
}
return ret;
}
int ObPartitionGroup::get_leader_curr_member_list(ObMemberList& member_list) const
{
return get_leader_curr_member_list_(member_list);
}
int ObPartitionGroup::get_leader_curr_member_list_(ObMemberList& member_list) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_leader_curr_member_list(member_list);
}
return ret;
}
int ObPartitionGroup::get_curr_member_list(ObMemberList& member_list) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_curr_member_list(member_list);
}
return ret;
}
int ObPartitionGroup::get_curr_member_list_for_report(ObMemberList& member_list) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_curr_member_list_for_report(member_list);
}
return ret;
}
int ObPartitionGroup::get_curr_leader_and_memberlist(common::ObAddr& leader, common::ObRole& role,
common::ObMemberList& curr_member_list, common::ObChildReplicaList& children_list) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_curr_leader_and_memberlist(leader, role, curr_member_list, children_list);
}
return ret;
}
int ObPartitionGroup::get_dst_leader_candidate(ObMemberList& member_list) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_dst_leader_candidate(member_list);
}
return ret;
}
int ObPartitionGroup::get_log_archive_status(ObPGLogArchiveStatus& status) const
{
int ret = OB_SUCCESS;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
ret = pls_->get_log_archive_status(status);
}
return ret;
}
int ObPartitionGroup::change_leader(const common::ObAddr& server)
{
int ret = OB_SUCCESS;
ObTsWindows changing_leader_windows;
if (OB_SUCCESS == (ret = check_init_(pls_, "partition log service"))) {
if (OB_FAIL(pls_->change_leader(server, changing_leader_windows))) {
STORAGE_LOG(WARN, "change leader failed", K(ret), K(server));
} else if (GET_MIN_CLUSTER_VERSION() < CLUSTER_VERSION_2100 || !GCONF.enable_smooth_leader_switch) {
STORAGE_LOG(INFO, "do not register prepare changing leader task", K_(pkey));
} else if (changing_leader_windows.get_start() == changing_leader_windows.get_end()) {
STORAGE_LOG(WARN, "invalid change leader windows", K_(pkey), K(changing_leader_windows));
} else if (OB_FAIL(register_txs_change_leader(server, changing_leader_windows))) {
STORAGE_LOG(WARN, "register txs prepare change leader task failed", K(ret), K_(pkey), K(changing_leader_windows));
}
}
return ret;
}
int ObPartitionGroup::register_txs_change_leader(const common::ObAddr& server, ObTsWindows& changing_leader_windows)
{
int ret = OB_SUCCESS;
const int64_t ts = changing_leader_windows.get_start() + changing_leader_windows.get_left_size() / 2;
if (OB_FAIL(txs_->register_prepare_changing_leader_task(pkey_, server, ts))) {
STORAGE_LOG(
WARN, "register prepare changing leader task failed", K(ret), K_(pkey), K(ts), K(changing_leader_windows));
} else {
STORAGE_LOG(INFO, "register prepare changing leader task success", K_(pkey), K(ts), K(changing_leader_windows));
}
return ret;
}
int ObPartitionGroup::check_physical_split(bool& finished)
{
int ret = OB_SUCCESS;
ret = check_physical_split_(finished);
return ret;
}
int ObPartitionGroup::wait_replay_()
{
int ret = OB_SUCCESS;
restore_task_cnt_ = 0;
while (true) {
int64_t task_cnt = replay_status_->get_pending_task_count();
int64_t curr = ObTimeUtility::current_time();
if (restore_task_cnt_ && curr - restore_task_ts_ > 1000L * 1000L) {
STORAGE_LOG(INFO, "wait replay", K(pkey_), K(task_cnt));
}
if (0 == task_cnt) {
break;
} else if (task_cnt != restore_task_cnt_) {
restore_task_cnt_ = task_cnt;
restore_task_ts_ = curr;
} else if (curr - restore_task_ts_ > 180L * 1000L * 1000L) {
// It is regarded as timeout if there is no replay action for 3 minutes.
ret = OB_TIMEOUT;
break;
}
usleep(20 * 1000); // 20ms
}
return ret;
}
int ObPartitionGroup::leader_takeover()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(pls_) || OB_ISNULL(txs_) || OB_ISNULL(replay_status_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K(ret), K(pkey_));
} else {
const bool in_slog_trans = false;
const bool is_replay = false;
ObPartitionGroupLockGuard guard(lock_, PGLOCKREPLAY | PGLOCKCLOG | PGLOCKTRANS, PGLOCKSTORAGE);
SpinRLockGuard split_guard(split_lock_);
ObBaseStorageInfo clog_info;
if (0 == get_replica_property().get_memstore_percent()) {
if (OB_FAIL(pg_storage_.get_saved_clog_info(clog_info))) {
STORAGE_LOG(WARN, "fail to get saved info", K(ret), K(pkey_));
} else if (OB_FAIL(clear_non_reused_stores_(pkey_))) {
STORAGE_LOG(WARN, "fail to clear non-reused stores", K(ret), K(pkey_));
// D replica will create memtable at leader takeover process.
} else if (OB_FAIL(create_memtable_(in_slog_trans, is_replay, true))) {
STORAGE_LOG(WARN, "fail to create memtable", K(ret), K(pkey_));
} else if (FALSE_IT(guard.unlock(PGLOCKSTORAGE))) {
} else if (OB_FAIL(pls_->restore_replayed_log(clog_info))) {
STORAGE_LOG(WARN, "fail to restore replayed log", K(ret), K(pkey_), K(clog_info));
} else if (OB_FAIL(wait_replay_())) {
STORAGE_LOG(WARN, "fail to wait replay", K(ret), K(pkey_));
}
}
}
if (OB_FAIL(ret)) {
// rollback
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKTRANS | PGLOCKREPLAY | PGLOCKSTORAGE);
if (0 == get_replica_property().get_memstore_percent()) {
int64_t tmp_ret = OB_SUCCESS;
if (OB_SUCCESS != (tmp_ret = replay_status_->set_need_filter_trans_log(pkey_, true /*need filter*/))) {
STORAGE_LOG(ERROR, "fail to set_need_filter_trans_log", K(tmp_ret), K_(pkey));
} else if (OB_SUCCESS != (tmp_ret = txs_->clear_all_ctx(pkey_))) {
STORAGE_LOG(ERROR, "fail to clear all trans ctx", K(tmp_ret), K_(pkey));
// PG or stand alone partition
} else if (OB_SUCCESS != (tmp_ret = clear_non_reused_stores_(pkey_))) {
STORAGE_LOG(WARN, "fail to clear non reused store", K(ret), K(pkey_));
}
}
}
return ret;
}
int ObPartitionGroup::leader_revoke()
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
// TODO: comebine the two locks
ObPartitionGroupLockGuard pg_guard(lock_, 0, PGLOCKTRANS | PGLOCKREPLAY | PGLOCKSTORAGE);
SpinWLockGuard guard(split_lock_);
tg.click();
if (OB_FAIL(split_state_.switch_state(LEADER_REVOKE))) {
STORAGE_LOG(WARN, "switch state failed", K(ret));
}
is_split_blocked_by_mc_ = false;
replica_split_progress_array_.reset();
// Checkpoint can be written continuously if the partition is selected as leader again.
partition_loop_worker_.reset_last_checkpoint();
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "leader revoke failed", K(ret), K_(pkey));
} else {
STORAGE_LOG(INFO, "leader revoke success", K_(pkey));
}
if (0 == get_replica_property().get_memstore_percent()) {
int64_t tmp_ret = OB_SUCCESS;
if (OB_SUCCESS != (tmp_ret = replay_status_->set_need_filter_trans_log(pkey_, true /*need filter*/))) {
STORAGE_LOG(ERROR, "fail to set_need_filter_trans_log", K(tmp_ret), K_(pkey));
} else if (OB_SUCCESS != (tmp_ret = txs_->clear_all_ctx(pkey_))) {
STORAGE_LOG(ERROR, "fail to clear all trans ctx", K(tmp_ret), K_(pkey));
// PG or stand alone partition
} else if (OB_SUCCESS != (tmp_ret = clear_non_reused_stores_(pkey_))) {
STORAGE_LOG(ERROR, "fail to clear non reused store", K(ret), K(pkey_));
}
}
return ret;
}
int ObPartitionGroup::leader_active()
{
int ret = OB_SUCCESS;
bool is_all_trans_clear = false;
int64_t cur_checkpoint = 0;
if (OB_FAIL(pg_storage_.get_weak_read_timestamp(cur_checkpoint))) {
STORAGE_LOG(WARN, "get readable ts error", K(ret), K_(pkey));
} else {
partition_loop_worker_.update_last_checkpoint(cur_checkpoint);
}
if (OB_SUCC(ret)) {
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (OB_FAIL(split_state_.switch_state(LEADER_TAKEOVER))) {
STORAGE_LOG(WARN, "switch state failed", K(ret));
} else {
if (in_source_splitting(split_state_.get_state())) {
if (OB_FAIL(txs_->block_partition(pkey_, is_all_trans_clear))) {
STORAGE_LOG(WARN, "block partition failed", K(ret), K_(pkey));
} else if (OB_FAIL(push_split_task_(split_info_.get_schema_version(), split_info_.get_spp()))) {
STORAGE_LOG(WARN, "push split task failed", K(ret), K_(split_state));
} else {
STORAGE_LOG(INFO, "push split task success", K_(split_state));
}
}
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "leader active failed", K(ret), K_(pkey));
} else {
STORAGE_LOG(INFO, "leader active success", K_(pkey));
}
return ret;
}
ObReplayStatus* ObPartitionGroup::get_replay_status()
{
return replay_status_;
}
int ObPartitionGroup::serialize(ObArenaAllocator& allocator, char*& new_buf, int64_t& serialize_size)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.serialize(allocator, new_buf, serialize_size))) {
STORAGE_LOG(WARN, "Fail to serialize storage, ", K(ret));
}
return ret;
}
int ObPartitionGroup::deserialize(const char* buf, const int64_t buf_len, int64_t& pos)
{
int ret = OB_SUCCESS;
int64_t state = static_cast<ObPartitionSplitStateEnum>(UNKNOWN_SPLIT_STATE);
ObPartitionSplitInfo split_info;
if (NULL == buf || buf_len <= 0 || pos < 0 || pos >= buf_len) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(buf), K(buf_len), K(pos), K(ret));
} else {
if (OB_FAIL(pg_storage_.init(pkey_, cp_fty_, schema_service_, txs_, pls_, this))) {
STORAGE_LOG(
WARN, "partition info init error", K(ret), K_(pkey), KP_(cp_fty), KP_(schema_service), KP_(pls), KP_(txs));
} else if (OB_FAIL(pg_storage_.deserialize(buf, buf_len, pos, state, split_info))) {
STORAGE_LOG(WARN, "Fail to deserialize pg partition info, ", K(ret));
} else if (OB_FAIL(split_state_.set_partition_key(pg_storage_.get_partition_key()))) {
STORAGE_LOG(WARN, "Fail to set partition key", K(ret));
} else if (UNKNOWN_SPLIT_STATE != state &&
OB_FAIL(split_state_.set_state(static_cast<ObPartitionSplitStateEnum>(state)))) {
STORAGE_LOG(WARN, "set split state failed", K(ret), K(state));
} else if (OB_FAIL(split_info_.assign(split_info))) {
STORAGE_LOG(WARN, "failed to assign split info", K(ret), K(split_info));
} else {
// The update of pkey_ should be before the partition_loop_worker init.
pkey_ = pg_storage_.get_partition_key();
if (nullptr != ps_ && OB_FAIL(partition_loop_worker_.init(this))) {
STORAGE_LOG(WARN, "partition worker init error", K(ret), K_(pkey));
} else if (OB_ISNULL(replay_status_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "replay status is NULL", K(ret), K(pkey_));
} else if (OB_FAIL(replay_status_->init(pkey_.get_tenant_id(), rp_eg_, safe_ref_))) {
STORAGE_LOG(WARN, "failed to init replay_status", K(ret), K(pkey_));
}
}
}
STORAGE_LOG(INFO, "deserialize partition", K(ret), K_(pkey));
return ret;
}
bool ObPartitionGroup::is_valid() const
{
ObPartitionState state = get_partition_state();
return state > INIT && state < INVALID_STATE;
}
int ObPartitionGroup::init(const ObPartitionKey& key, ObIPartitionComponentFactory* cp_fty,
share::schema::ObMultiVersionSchemaService* schema_service, transaction::ObTransService* txs,
replayengine::ObILogReplayEngine* rp_eg, ObPartitionService* ps,
ObPartitionGroupIndex* pg_index /*only assign in replay process*/,
ObPGPartitionMap* pg_partition_map /*only assign in replay process*/)
{
// WARNING: the parameter of pkey is fake, when the server restarts.
int ret = OB_SUCCESS;
if (!key.is_valid() || NULL == cp_fty || NULL == schema_service || NULL == txs || NULL == rp_eg ||
(NULL == ps && (NULL == pg_index || NULL == pg_partition_map))) {
STORAGE_LOG(WARN,
"invalid arguments.",
K(key),
KP(cp_fty),
KP(schema_service),
KP(txs),
KP(rp_eg),
KP(ps),
KP(pg_index),
KP(pg_partition_map));
ret = OB_INVALID_ARGUMENT;
} else if (is_inited_) {
ret = OB_INIT_TWICE;
STORAGE_LOG(WARN, "partition is already initialized", K(key), K(ret));
// FIXME
} else {
// The assignment of cp_fty_ must be first, otherwise there will be a memory leak in the case of init failure.
cp_fty_ = cp_fty;
uint64_t tenant_id = key.get_tenant_id();
if (tenant_id <= 0) {
tenant_id = OB_SERVER_TENANT_ID;
}
if (NULL == (pls_ = cp_fty->get_log_service(tenant_id))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "create partition log service failed, ", K(ret));
} else if (NULL == (replay_status_ = cp_fty->get_replay_status(tenant_id))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "create replay status object failed.", K(ret));
} else if (OB_FAIL(split_state_.init(key))) {
STORAGE_LOG(WARN, "init split state failed", K(ret));
} else if (OB_FAIL(split_state_.set_state(FOLLOWER_INIT))) {
STORAGE_LOG(WARN, "set split state failed", K(ret));
} else {
is_inited_ = true;
pkey_ = key;
next_ = NULL;
txs_ = txs;
rp_eg_ = rp_eg;
ps_ = ps;
pg_index_ = (nullptr == ps) ? pg_index : &ps->get_pg_index();
pg_partition_map_ = (nullptr == ps) ? pg_partition_map : &ps->get_partition_map();
// create_timestamp_ = ObTimeUtility::current_time();
schema_service_ = schema_service;
if (key.get_table_id() != OB_MIN_USER_TABLE_ID) {
// The key may be invalid at this time when the machine is down and restarted
if (OB_FAIL(pg_storage_.init(key, cp_fty_, schema_service, txs, pls_, this))) {
STORAGE_LOG(WARN, "pg storage init error", K(ret), K(key));
} else if (nullptr != ps_ && OB_FAIL(partition_loop_worker_.init(this))) {
STORAGE_LOG(WARN, "partition worker init error", K(ret), K_(pkey));
} else if (OB_FAIL(replay_status_->init(tenant_id, rp_eg, safe_ref_))) {
STORAGE_LOG(WARN, "replay status init failed", K(ret), K(key));
} else {
// do nothing
}
}
}
if (NULL != replay_status_) {
// You need to add the reference count whether init succeeds or fails to make sure that
// destroy will not make mistakes in case of init failure.
replay_status_->inc_ref();
}
if (!is_inited_) {
destroy();
}
}
STORAGE_LOG(INFO, "partition init", KP(replay_status_), K(ret), K(key), KP(this));
return ret;
}
// Write meta slog and create PG memstore while creating partition group.
int ObPartitionGroup::create_partition_group(const ObCreatePGParam& param)
{
int ret = OB_SUCCESS;
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE);
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else if (!param.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid arguments", K(ret), K(pkey_), K(param));
} else if (OB_FAIL(pg_storage_.create_partition_group(param))) {
STORAGE_LOG(WARN, "pg storage update pg meta error", K(ret), K(param));
} else if (OB_FAIL(split_info_.assign(param.split_info_))) {
STORAGE_LOG(WARN, "failed to assign split info", K(ret), K(param));
} else if (OB_FAIL(split_state_.set_state(static_cast<ObPartitionSplitStateEnum>(param.split_state_)))) {
STORAGE_LOG(WARN, "failed to set split state", K(ret), K(param));
} else {
STORAGE_LOG(INFO, "create partition group success", K(param));
}
return ret;
}
int ObPartitionGroup::replay_pg_partition(const common::ObPartitionKey& pkey, const uint64_t log_id)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else if (!pkey.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(pkey), K(log_id));
} else {
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE, true /* trylock */);
if (!guard.locked()) {
ret = OB_EAGAIN;
} else if (OB_FAIL(pg_storage_.replay_pg_partition(pkey, log_id))) {
STORAGE_LOG(WARN, "create pg partition error", K(ret), "pkey", pkey_, K(pkey), K(log_id));
} else {
STORAGE_LOG(INFO, "create pg partition success", "pkey", pkey_, K(pkey), K(log_id));
}
}
return ret;
}
int ObPartitionGroup::create_pg_partition(const common::ObPartitionKey& pkey, const int64_t multi_version_start,
const uint64_t data_table_id, const obrpc::ObCreatePartitionArg& arg, const bool in_slog_trans,
const bool is_replay, const uint64_t log_id, ObTablesHandle& sstables_handle)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else {
ObCreatePartitionParam param;
if (OB_FAIL(param.extract_from(arg))) {
STORAGE_LOG(WARN, "failed to extract create_param", KR(ret), K(pkey), K(arg));
} else {
ret = create_pg_partition(
pkey, multi_version_start, data_table_id, param, in_slog_trans, is_replay, log_id, sstables_handle);
}
}
return ret;
}
int ObPartitionGroup::create_pg_partition(const common::ObPartitionKey& pkey, const int64_t multi_version_start,
const uint64_t data_table_id, const ObCreatePartitionParam& arg, const bool in_slog_trans, const bool is_replay,
const uint64_t log_id, ObTablesHandle& sstables_handle)
{
ObTimeGuard tg(__func__, 100L * 1000L);
int ret = OB_SUCCESS;
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE, is_replay /* trylock */);
if (!guard.locked()) {
ret = OB_EAGAIN;
}
tg.click();
if (OB_FAIL(ret)) {
} else if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else if (OB_INVALID_ID == data_table_id) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid arguments", K(ret), K(data_table_id));
// Add Partition To PG
} else if (OB_FAIL(pg_storage_.create_pg_partition(
pkey, multi_version_start, data_table_id, arg, in_slog_trans, is_replay, log_id, sstables_handle))) {
STORAGE_LOG(WARN, "failed to create partition store", K(ret));
} else {
// do nothing
}
tg.click();
return ret;
}
void ObPartitionGroup::destroy()
{
ObTimeGuard timeguard("destroy partition", 3 * 1000 * 1000);
STORAGE_LOG(INFO, "destroy partition", K(*this), KP(this), K(lbt()));
// It must be placed in the header to ensure that all members of the partition are valid during the reference process
REF_KEEPER.unreg(safe_ref_);
if (NULL != cp_fty_) {
timeguard.click();
if (NULL != pls_) {
cp_fty_->free(pls_);
pls_ = NULL;
}
timeguard.click();
if (NULL != replay_status_) {
if (0 == replay_status_->dec_ref()) {
cp_fty_->free(replay_status_);
}
replay_status_ = nullptr;
}
}
timeguard.click();
cp_fty_ = NULL;
partition_state_ = INIT;
migrate_retry_flag_ = NO_NEED_RETRY;
need_gc_ = false;
is_inited_ = false;
}
void ObPartitionGroup::clear()
{
destroy();
pg_storage_.clear();
}
int ObPartitionGroup::get_replica_state(ObPartitionReplicaState& state)
{
int ret = OB_SUCCESS;
state = OB_UNKNOWN_REPLICA;
bool disable_replay_log = false;
if (!is_inited_ || NULL == pls_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized.", K(ret), K_(pls));
} else if (OB_SUCCESS != (ret = pls_->is_offline(disable_replay_log))) {
STORAGE_LOG(WARN, "get temporary state from clog error.", K(ret), K_(pkey));
} else if (OB_FAIL(pg_storage_.get_replica_state(disable_replay_log, state))) {
STORAGE_LOG(WARN, "get replica state error", K(ret), K_(pkey), K(state));
} else {
// do nothing
}
return ret;
}
void ObPartitionGroup::replay_status_revoke()
{
// replay_status_.leader_revoke(pkey_);
}
int ObPartitionGroup::report_clog_history_online()
{
int ret = OB_SUCCESS;
uint64_t start_log_id = 0;
int64_t start_tstamp = 0;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (OB_FAIL(get_saved_last_log_info_(start_log_id, start_tstamp))) {
STORAGE_LOG(WARN, "fail to get last log info", K(ret), K(pkey_), K(start_log_id), K(start_tstamp));
} else if (ObClogHistoryReporter::is_related_table(pkey_.get_tenant_id(), pkey_.get_table_id())) {
STORAGE_LOG(INFO, "update clog history info", K(pkey_));
(void)ObClogHistoryReporter::get_instance().online(pkey_, start_log_id, start_tstamp);
}
return ret;
}
int ObPartitionGroup::report_clog_history_offline()
{
int ret = OB_SUCCESS;
uint64_t start_log_id = OB_INVALID_ID;
int64_t start_log_timestamp = OB_INVALID_TIMESTAMP;
uint64_t end_log_id = OB_INVALID_ID;
int64_t end_log_timestamp = OB_INVALID_TIMESTAMP;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (ObClogHistoryReporter::is_related_table(pkey_.get_tenant_id(), pkey_.get_table_id())) {
if (OB_FAIL(get_clog_service_range_for_clog_history_info_(
start_log_id, start_log_timestamp, end_log_id, end_log_timestamp))) {
STORAGE_LOG(WARN, "fail to get clog service range", K(ret), K(pkey_));
} else {
STORAGE_LOG(INFO,
"update clog history info on offline",
K(pkey_),
K(start_log_id),
K(start_log_timestamp),
K(end_log_id),
K(end_log_timestamp));
(void)clog::ObClogHistoryReporter::get_instance().offline(
pkey_, start_log_id, start_log_timestamp, end_log_id, end_log_timestamp);
}
}
return ret;
}
int ObPartitionGroup::get_clog_service_range_for_clog_history_info_(
uint64_t& start_log_id, int64_t& start_log_timestamp, uint64_t& end_log_id, int64_t& end_log_timestamp)
{
int ret = OB_SUCCESS;
clog::ObIPartitionLogService* pls = NULL;
uint64_t max_log_id = OB_INVALID_ID;
int64_t max_log_ts = OB_INVALID_TIMESTAMP;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (NULL == (pls = get_log_service())) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "pls should not be NULL", K(ret));
} else if (OB_SUCCESS !=
(ret = pls->get_log_id_range(start_log_id, start_log_timestamp, end_log_id, end_log_timestamp))) {
STORAGE_LOG(WARN, "fail to get log_id_range", K(ret));
}
// For clog history info, the maximum log ID can be larger, but not smaller.
// The maximum value returned by get_log_id_range() is the maximum log ID
// that has been replayed. Some logs may not be replayed in the sliding window.
// These logs should also be the logs of the server service and be recorded at
// clog history. For this purpose, the right boundary of the sliding window is
// taken as the maximum log of the service
else if (OB_FAIL(pls->get_sw_max_log_id_info(max_log_id, max_log_ts))) {
STORAGE_LOG(WARN, "get_sw_max_log_id_info fail", K(ret), K(pkey_));
} else {
end_log_id = max_log_id;
end_log_timestamp = max_log_ts;
STORAGE_LOG(INFO,
"get clog service range",
K_(pkey),
K(start_log_id),
K(start_log_timestamp),
K(end_log_id),
K(start_log_timestamp));
}
return ret;
}
void ObPartitionGroup::erase_pg_from_clog()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(pls_)) {
} else if (OB_FAIL(pls_->set_offline())) {
STORAGE_LOG(WARN, "partition log service set offline failed", K(ret), K(pkey_));
}
}
int ObPartitionGroup::remove_election_from_mgr()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(pls_)) {
// do nothing
} else if (OB_FAIL(pls_->remove_election())) {
STORAGE_LOG(ERROR, "remove election failed", K(ret), K(pkey_));
} else {
STORAGE_LOG(INFO, "remove election success", K(ret), K(pkey_));
}
return ret;
}
void ObPartitionGroup::erase_pg_from_election()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(pls_)) {
// do nothing
} else if (OB_FAIL(pls_->stop_election())) {
STORAGE_LOG(ERROR, "stop election failed", K(ret), K(pkey_));
} else {
// do nothing
}
}
void ObPartitionGroup::erase_pg_from_replay_engine()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(rp_eg_)) {
} else if (OB_FAIL(rp_eg_->remove_partition(pkey_, this))) {
// The partition may be not exist after offline.
if (OB_ENTRY_NOT_EXIST != ret && OB_PARTITION_NOT_EXIST != ret) {
STORAGE_LOG(WARN, "replay engine remove partition failed", K(ret), K(pkey_));
}
}
}
void ObPartitionGroup::erase_pg_from_trans()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(txs_)) {
} else if (OB_FAIL(txs_->remove_partition(pkey_, false))) {
if (OB_ENTRY_NOT_EXIST != ret && OB_PARTITION_NOT_EXIST != ret) {
STORAGE_LOG(WARN, "fail to remove partition from transaction service", K(ret), K(pkey_));
}
}
}
void ObPartitionGroup::erase_pg_from_stat_cache()
{
int ret = OB_SUCCESS;
if (!is_pg()) {
ObPGPartitionGuard pg_partition_guard;
ObPGPartition* pg_partition = nullptr;
ObPartitionStorage* storage = nullptr;
if (OB_FAIL(get_pg_storage().get_pg_partition(pkey_, pg_partition_guard))) {
STORAGE_LOG(WARN, "failed to get pg partition", K(ret), K(pkey_));
} else if (OB_ISNULL(pg_partition = pg_partition_guard.get_pg_partition())) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "pg partition is null", K(ret), K(pkey_));
} else if (OB_ISNULL(storage = static_cast<ObPartitionStorage*>(pg_partition->get_storage()))) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "partition storage is null", K(ret), K(pkey_));
} else if (OB_FAIL(storage->erase_stat_cache())) {
if (OB_ENTRY_NOT_EXIST != ret) {
STORAGE_LOG(WARN, "failed to erase stat cache", K(ret), K(pkey_));
}
}
}
}
int ObPartitionGroup::pause()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(replay_status_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "replay_status is NULL", K(ret), K(pkey_));
} else {
ObPGLockWithPendingReplayGuard guard(
lock_, *replay_status_, pkey_, 0, PGLOCKREPLAY | PGLOCKCLOG | PGLOCKTRANS | PGLOCKSTORAGE);
ret = pause_();
}
return ret;
}
int ObPartitionGroup::pause_()
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else {
pg_storage_.pause();
erase_pg_from_clog();
erase_pg_from_replay_engine();
erase_pg_from_trans();
int tmp_ret = OB_SUCCESS;
if (OB_UNLIKELY(OB_SUCCESS != (tmp_ret = report_clog_history_offline()))) {
STORAGE_LOG(WARN, "fail to report offline to clog hisory", K(ret), K(pkey_));
}
if (OB_FAIL(clear_non_reused_stores_(pkey_))) {
STORAGE_LOG(WARN, "failed to clear pg stores", K(ret), K(pkey_));
}
}
return ret;
}
int ObPartitionGroup::stop()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(replay_status_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "replay_status is NULL", K(ret), K(pkey_));
} else {
ObPGLockWithPendingReplayGuard guard(
lock_, *replay_status_, pkey_, 0, PGLOCKREPLAY | PGLOCKCLOG | PGLOCKTRANS | PGLOCKSTORAGE);
ret = stop_();
}
return ret;
}
int ObPartitionGroup::stop_()
{
int ret = OB_SUCCESS;
DEBUG_SYNC(SYNC_PG_AND_REPLAY_ENGINE_DEADLOCK);
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (OB_FAIL(pg_storage_.set_pg_removed())) {
STORAGE_LOG(WARN, "failed to set removed", K(ret), K(pkey_));
} else if (OB_FAIL(try_switch_partition_state(REMOVE))) {
STORAGE_LOG(WARN, "switch partition state to REMOVE failed", K(ret), K(pkey_));
} else if (OB_FAIL(pause_())) {
STORAGE_LOG(WARN, "fail to pause partition", K(ret), K(pkey_));
} else {
ATOMIC_SET(&has_clear_trans_after_restore_, false);
erase_pg_from_election();
erase_pg_from_stat_cache();
}
return ret;
}
// TODO: refactor is needed later.
int ObPartitionGroup::offline_()
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (OB_FAIL(try_switch_partition_state(OFFLINING))) {
STORAGE_LOG(WARN, "partition is doing other task, sleep and retry", K(ret), K(pkey_));
} else if (OB_FAIL(pause_())) {
STORAGE_LOG(WARN, "fail to pause partition", K(ret), K(pkey_));
} else {
erase_pg_from_election();
erase_pg_from_stat_cache();
if (OB_FAIL(try_switch_partition_state(OFFLINE))) {
STORAGE_LOG(WARN, "switch partition state to OFFLINE failed", K(ret), K(pkey_));
}
}
return ret;
}
int ObPartitionGroup::offline()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(replay_status_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "replay_status is NULL", K(ret), K(pkey_));
} else {
ObPGLockWithPendingReplayGuard guard(
lock_, *replay_status_, pkey_, 0, PGLOCKREPLAY | PGLOCKCLOG | PGLOCKTRANS | PGLOCKSTORAGE);
ret = offline_();
}
return ret;
}
int ObPartitionGroup::schema_drop(const bool for_replay, const uint64_t log_id, const bool is_physical_drop)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else {
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKTRANS | PGLOCKSTORAGE, for_replay /* trylock */);
if (!guard.locked()) {
ret = OB_EAGAIN;
} else if (for_replay) {
// Make a mark at ReplayStatus when the task of OFFLINE_PARTITION is submitted to
// temporarily solve the deadlock of asynchronous OFFLINE_PARTITION task and migration task.
replay_status_->offline_partition_task_submitted(log_id);
}
if (OB_SUCC(ret)) {
const KillTransArg arg(true, false);
bool is_all_trans_clear = false;
if (OB_FAIL(set_offline_log_id(log_id))) {
STORAGE_LOG(WARN, "fail to set_offline_log_id", K(ret), K(pkey_));
} else if (pg_storage_.has_memstore()) {
// The reason for calling the block partition interface of transaction when the offline partition
// log is replayed at follower are as follows:
// 1. to avoid read at follower.
// 2. both the leader and follower need to call block partition.
// 3. bugfix (12293135).
if (OB_FAIL(txs_->block_partition(pkey_, is_all_trans_clear))) {
STORAGE_LOG(ERROR, "block partition error in follower state", K(ret), K(pkey_));
} else if (!is_all_trans_clear && OB_FAIL(txs_->kill_all_trans(pkey_, arg, is_all_trans_clear))) {
STORAGE_LOG(WARN, "kill all trans failed", K(ret), K(pkey_));
} else if (!is_all_trans_clear && OB_FAIL(txs_->wait_all_trans_clear(pkey_))) {
if (OB_EAGAIN != ret) {
STORAGE_LOG(WARN, "wait all trans clear failed", K(ret), K(pkey_));
}
} else if (is_physical_drop && OB_FAIL(clear_non_reused_stores_(pkey_))) {
STORAGE_LOG(WARN, "failed to clear pg stores", K(ret), K(pkey_));
} else if (OB_FAIL(report_clog_history_offline())) {
STORAGE_LOG(WARN, "fail to report offline to clog hisory", K(ret), K(pkey_));
}
}
}
}
if (OB_FAIL(ret)) {
// Override the error code and let the upper layer try again
ret = OB_EAGAIN;
}
return ret;
}
int ObPartitionGroup::get_create_ts(int64_t& create_ts)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_pg_create_ts(create_ts))) {
STORAGE_LOG(WARN, "get pg create ts fail", K(ret), K(pkey_));
}
return ret;
}
int ObPartitionGroup::submit_add_partition_to_pg_log(const obrpc::ObCreatePartitionArg& arg, ObPartitionService* ps,
uint64_t& log_id, int64_t& log_ts, ObAddPartitionToPGLogCb*& out_cb)
{
int ret = OB_SUCCESS;
ObStorageLogType log_type = OB_LOG_ADD_PARTITION_TO_PG;
ObAddPartitionToPGLogCb* cb = NULL;
ObPartitionState partition_state;
ObAddPartitionToPGLog log;
char* buf = NULL;
// 64K
const int64_t MAX_LOG_SIZE = (common::OB_MAX_LOG_ALLOWED_SIZE >> 5);
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (!ObStorageLogTypeChecker::is_add_partition_to_pg_log(log_type) || !arg.is_valid() || OB_ISNULL(ps)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(log_type), K(arg), KP(ps));
} else if (OFFLINE == (partition_state = get_partition_state()) || OFFLINING == partition_state ||
REMOVE == partition_state) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "state not match", K(ret), K(partition_state));
} else if (!pkey_.is_valid() || !pkey_.is_pg()) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "unexpected pkey", K(ret), K_(pkey));
} else if (!share::ObMultiClusterUtil::is_cluster_allow_submit_log(pkey_.get_table_id())) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "is_cluster_allow_submit_log return false", K(ret), K(pkey_));
} else if (OB_ISNULL(cb = op_alloc(ObAddPartitionToPGLogCb))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(WARN, "alloc memory fail", K(ret), KP(cb));
} else if (NULL == (buf = reinterpret_cast<char*>(ob_malloc(MAX_LOG_SIZE, ObModIds::OB_PARTITION_LOG_BUF)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "allocate memory failed", K(ret));
} else {
if (OB_FAIL(cb->init(log_type, pkey_, arg.partition_key_))) {
STORAGE_LOG(WARN, "ObAddPartitionToPGLogCb init error", K(ret), K(log_type), K(arg));
} else if (OB_FAIL(log.init(log_type, arg))) {
STORAGE_LOG(WARN, "ObPGPartitionChangeLog init error", K(ret), K(log_type), K(arg));
} else {
int64_t pos = 0;
int64_t log_timestamp = 0;
const int64_t base_timestamp = 0;
const bool is_trans_log = false;
if (OB_SUCCESS != (ret = serialization::encode_i64(buf, MAX_LOG_SIZE, pos, log_type))) {
STORAGE_LOG(WARN, "serialize log_type error", K(ret), K(log_type));
} else if (OB_FAIL(log.serialize(buf, MAX_LOG_SIZE, pos))) {
STORAGE_LOG(WARN, "serialize log failed", K(ret));
} else if (OB_FAIL(pls_->submit_log(buf, pos, base_timestamp, cb, is_trans_log, log_id, log_timestamp))) {
STORAGE_LOG(WARN, "submit add partition to pg log error", K(ret), K(log_type), K_(pkey));
} else {
out_cb = cb;
log_ts = log_timestamp;
STORAGE_LOG(INFO, "submit add partition to pg log success", K(log_type), K(log_ts), K(arg));
}
}
if (NULL != buf) {
ob_free(buf);
buf = NULL;
}
}
if (OB_FAIL(ret) && NULL != cb) {
op_free(cb);
cb = NULL;
}
return ret;
}
int ObPartitionGroup::submit_partition_schema_change_log(const common::ObPartitionKey& pkey,
const int64_t schema_version, const uint64_t index_id, ObPartitionService* ps, uint64_t& log_id, int64_t& log_ts,
ObSchemaChangeClogCb*& out_cb)
{
int ret = OB_SUCCESS;
ObStorageLogType log_type = OB_PARTITION_SCHEMA_VERSION_CHANGE_LOG;
ObSchemaChangeClogCb* cb = NULL;
ObPartitionState partition_state;
ObPGSchemaChangeLog log;
char* buf = NULL;
// 64K
const int64_t MAX_LOG_SIZE = (common::OB_MAX_LOG_ALLOWED_SIZE >> 5);
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (!ObStorageLogTypeChecker::is_schema_version_change_log(log_type) || !pkey.is_valid() ||
schema_version < 0 || OB_ISNULL(ps)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(log_type), K(pkey_), K(pkey), K(schema_version), KP(ps));
} else if (OFFLINE == (partition_state = get_partition_state()) || OFFLINING == partition_state ||
REMOVE == partition_state) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "state not match", K(ret), K(partition_state));
} else if (!share::ObMultiClusterUtil::is_cluster_allow_submit_log(pkey_.get_table_id())) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "is_cluster_allow_submit_log return false", K(ret), K(pkey_));
} else if (OB_ISNULL(cb = op_alloc(ObSchemaChangeClogCb))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(WARN, "alloc memory fail", K(ret), KP(cb));
} else if (NULL == (buf = reinterpret_cast<char*>(ob_malloc(MAX_LOG_SIZE, ObModIds::OB_PARTITION_LOG_BUF)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "allocate memory failed", K(ret));
} else {
if (OB_FAIL(cb->init(log_type, pkey_, pkey))) {
STORAGE_LOG(WARN, "ObSchemaChangeClogCb init error", K(ret), K(log_type), K(pkey_), K(pkey));
} else if (OB_FAIL(log.init(log_type, pkey_, pkey, schema_version, index_id))) {
STORAGE_LOG(WARN,
"ObPGPartitionChangeLog init error",
K(ret),
K(log_type),
K(pkey_),
K(pkey),
K(schema_version),
K(index_id));
} else {
int64_t pos = 0;
const int64_t base_timestamp = 0;
const bool is_trans_log = false;
if (OB_FAIL(serialization::encode_i64(buf, MAX_LOG_SIZE, pos, log_type))) {
STORAGE_LOG(WARN, "serialize log_type error", K(ret), K(log_type));
} else if (OB_FAIL(log.serialize(buf, MAX_LOG_SIZE, pos))) {
STORAGE_LOG(WARN, "serialize log failed", K(ret));
} else if (OB_FAIL(pls_->submit_log(buf, pos, base_timestamp, cb, is_trans_log, log_id, log_ts))) {
STORAGE_LOG(
WARN, "submit partition schema version change log error", K(ret), K(log_type), K_(pkey), K(schema_version));
} else {
out_cb = cb;
STORAGE_LOG(
WARN, "submit partition schema version change log success", K(log_type), K_(pkey), K(schema_version));
}
}
if (NULL != buf) {
ob_free(buf);
buf = NULL;
}
}
if (OB_FAIL(ret) && NULL != cb) {
op_free(cb);
cb = NULL;
}
return ret;
}
int ObPartitionGroup::remove_partition_from_pg(
const bool for_replay, const ObPartitionKey& pkey, const bool write_slog_trans, const uint64_t log_id)
{
int ret = OB_SUCCESS;
ObPartitionState partition_state;
ObPGPartitionGuard partition_guard;
ObPartitionKey trans_table_pkey;
ObPGPartitionGuard trans_partition_guard;
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE, for_replay /* trylock */);
if (!guard.locked()) {
ret = OB_EAGAIN;
} else if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (!pkey.is_valid() || pkey.is_pg()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(pkey));
} else if (OFFLINE == (partition_state = get_partition_state()) || OFFLINING == partition_state ||
REMOVE == partition_state) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "state not match", K(ret), K(partition_state));
// pg partition
} else if (OB_FAIL(get_pg_partition(pkey, partition_guard))) {
if (OB_ENTRY_NOT_EXIST == ret) {
LOG_WARN("pg partition is already removed, no need to remove again", K(ret), K(pkey), "pg_key", pkey_);
ret = OB_SUCCESS;
} else {
LOG_WARN("failed to get_pg_partition", K(ret), K(pkey), "pg key", pkey_);
}
} else if (OB_FAIL(clear_non_reused_stores_(pkey))) {
STORAGE_LOG(WARN, "clear pg partition non reused stores error", K(ret), K(pkey), K(pkey_));
} else if (OB_FAIL(pg_storage_.remove_pg_partition_from_pg(pkey, write_slog_trans, log_id))) {
STORAGE_LOG(WARN, "remove pg partition from pg error", K(ret), K(pkey), K(pkey_));
} else {
STORAGE_LOG(INFO, "remove pg partition from pg success", K(pkey), K(pkey_), K(log_id));
}
return ret;
}
int ObPartitionGroup::submit_remove_partition_from_pg_log(const ObPartitionKey& pkey)
{
int ret = OB_SUCCESS;
ObStorageLogType log_type = OB_LOG_REMOVE_PARTITION_FROM_PG;
const int64_t REMOVE_PARTITION_FROM_PG_BUF_LEN = 256;
ObRemovePartitionFromPGLogCb* cb = NULL;
ObPartitionState partition_state;
ObRemovePartitionFromPGLog log;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (!pkey.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(pkey));
} else if (!share::ObMultiClusterUtil::is_cluster_allow_submit_log(pkey_.get_table_id())) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "is_cluster_allow_submit_log return false", K(ret), K(pkey_));
} else if (OFFLINE == (partition_state = get_partition_state()) || OFFLINING == partition_state ||
REMOVE == partition_state) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "state not match", K(ret), K(partition_state));
} else if (OB_ISNULL(cb = op_alloc(ObRemovePartitionFromPGLogCb))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(WARN, "alloc memory fail", K(ret), KP(cb));
} else if (OB_FAIL(cb->init(log_type, pkey_, pkey, &(ps_->get_callback_async_worker())))) {
STORAGE_LOG(WARN, "ObRemovePartitionFromPGLogCb init error", K(ret), K(log_type), K(pkey));
} else if (OB_FAIL(log.init(log_type, pkey_, pkey))) {
STORAGE_LOG(WARN, "ObRemovePartitionFromPGLog init error", K(ret), K(log_type), K(pkey), K(pkey_));
} else {
char buf[REMOVE_PARTITION_FROM_PG_BUF_LEN];
int64_t pos = 0;
uint64_t log_id = 0;
int64_t log_timestamp = 0;
const int64_t base_timestamp = 0;
const bool is_trans_log = false;
if (OB_FAIL(serialization::encode_i64(buf, REMOVE_PARTITION_FROM_PG_BUF_LEN, pos, log_type))) {
STORAGE_LOG(WARN, "serialize log_type error", K(ret), K(log_type));
} else if (OB_FAIL(log.serialize(buf, REMOVE_PARTITION_FROM_PG_BUF_LEN, pos))) {
STORAGE_LOG(WARN, "serialize log failed", K(ret));
} else if (OB_SUCCESS !=
(ret = pls_->submit_log(buf, pos, base_timestamp, cb, is_trans_log, log_id, log_timestamp))) {
STORAGE_LOG(WARN, "submit remove partition from pg error", K(ret), K(log_type), K_(pkey));
} else {
// do nothing
}
}
STORAGE_LOG(INFO, "submit remove partition from pg log", K(ret), K(pkey), K(pkey_));
if (OB_FAIL(ret) && NULL != cb) {
op_free(cb);
cb = NULL;
}
return ret;
}
bool ObPartitionGroup::is_removed() const
{
return (REMOVE == get_partition_state());
}
int ObPartitionGroup::check_is_in_member_list(bool& is_in_member_list) const
{
int ret = OB_SUCCESS;
is_in_member_list = false;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (OB_ISNULL(pls_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "partition log service is empty", K_(pkey), K(ret));
} else {
is_in_member_list = pls_->is_svr_in_member_list(GCTX.self_addr_);
}
return ret;
}
int ObPartitionGroup::offline_itself(const bool is_physical_drop)
{
int ret = OB_SUCCESS;
const int64_t OFFLINE_PARTITION_LOG_BUF_LEN = 256;
char buf[OFFLINE_PARTITION_LOG_BUF_LEN] = "";
int64_t pos = 0;
uint64_t log_id = 0;
int64_t log_timestamp = 0;
const int64_t base_timestamp = 0;
const bool is_trans_log = false;
ObPartitionState partition_state;
ObOfflinePartitionCb* cb = NULL;
// New log types will be written after 226 version
ObStorageLogType log_type =
GET_MIN_CLUSTER_VERSION() < CLUSTER_VERSION_2260 ? OB_LOG_OFFLINE_PARTITION : OB_LOG_OFFLINE_PARTITION_V2;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (!share::ObMultiClusterUtil::is_cluster_allow_submit_log(pkey_.get_table_id())) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "is_cluster_allow_submit_log return false", K(ret), K(pkey_));
} else if (OFFLINE == (partition_state = get_partition_state()) || OFFLINING == partition_state ||
REMOVE == partition_state) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "state not match", K(ret), K(partition_state));
} else if (OB_ISNULL(cb = op_alloc(ObOfflinePartitionCb))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(WARN, "alloc memory fail", K(ret), KP(cb));
} else if (cb->init(&(ps_->get_callback_async_worker()), is_physical_drop)) {
STORAGE_LOG(WARN, "ObOfflinePartitionCb init error", K(ret), K(pkey_));
} else if (OB_SUCCESS != (ret = serialization::encode_i64(buf, OFFLINE_PARTITION_LOG_BUF_LEN, pos, log_type))) {
STORAGE_LOG(WARN, "serialize log_type error", K(ret), K(log_type));
} else {
if (OB_LOG_OFFLINE_PARTITION == log_type) {
// do nothing
} else {
ObOfflinePartitionLog log;
if (OB_FAIL(log.init(log_type, is_physical_drop))) {
STORAGE_LOG(WARN, "failed to init offline log", KR(ret), K(log_type), K(is_physical_drop));
} else if (OB_FAIL(log.serialize(buf, OFFLINE_PARTITION_LOG_BUF_LEN, pos))) {
STORAGE_LOG(WARN, "serialize offline log error", KR(ret), K(log_type), K(is_physical_drop));
} else { /*do nothing*/
}
}
if (OB_SUCC(ret)) {
if (OB_FAIL(pls_->submit_log(buf, pos, base_timestamp, cb, is_trans_log, log_id, log_timestamp))) {
STORAGE_LOG(WARN, "submit offline log error", K(ret), K(log_type), K_(pkey));
}
}
}
if (OB_FAIL(ret) && NULL != cb) {
op_free(cb);
cb = NULL;
}
STORAGE_LOG(INFO, "offline_itself", K(ret), K_(pkey), K(log_type), K(is_physical_drop));
return ret;
}
int ObPartitionGroup::replay_partition_meta_log(
const ObStorageLogType log_type, const int64_t log_id, const char* buf, const int64_t size)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not inited", K(ret), K(pkey_));
} else if (OB_LOG_PARTITION_SCHEMA == log_type) {
if (OB_FAIL(pg_storage_.replay_schema_log(buf, size, log_id))) {
STORAGE_LOG(WARN, "fail to replay schema log", K(ret), K(pkey_));
}
} else {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(ERROR, "unknown meta log type", K(ret), K(pkey_), K(log_type));
}
return ret;
}
int ObPartitionGroup::set_wait_split()
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else if (OB_FAIL(split_state_.switch_state(WAIT_SPLIT))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret), K_(pkey));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "set wait split failed", K(ret), K_(pkey));
} else {
STORAGE_LOG(INFO, "set wait split success", K_(pkey));
}
return ret;
}
int ObPartitionGroup::save_split_state(const bool write_slog)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else {
ret = save_split_state_(split_state_.get_persistent_state(), write_slog);
}
return ret;
}
int ObPartitionGroup::restore_split_state(const int state)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
if (OB_FAIL(split_state_.set_state(static_cast<ObPartitionSplitStateEnum>(state)))) {
STORAGE_LOG(WARN,
"restore split state failed",
K(ret),
K_(pkey),
"state",
to_state_str(static_cast<ObPartitionSplitStateEnum>(state)));
} else {
STORAGE_LOG(INFO,
"restore split state success",
K_(pkey),
"state",
to_state_str(static_cast<ObPartitionSplitStateEnum>(state)));
}
return ret;
}
int ObPartitionGroup::restore_split_info(const ObPartitionSplitInfo& split_info)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
if (OB_FAIL(split_info_.assign(split_info))) {
STORAGE_LOG(WARN, "restore split info failed", K(ret), K_(pkey), K(split_info));
} else {
STORAGE_LOG(INFO, "restore split info success", K_(pkey), K(split_info));
}
return ret;
}
int ObPartitionGroup::replay_split_source_log(
const ObPartitionSplitSourceLog& log, const uint64_t log_id, const int64_t log_ts)
{
int ret = OB_SUCCESS;
bool write_slog = false;
const int64_t start_ts = ObTimeUtility::current_time();
bool is_dest_partition_ready = false;
ObTimeGuard tg(__func__, 1000000);
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE, true /* trylock */);
SpinWLockGuard split_guard(split_lock_);
tg.click();
if (!guard.locked()) {
ret = OB_EAGAIN;
} else if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition storage is not initialized", K_(pkey), K(ret));
} else if (!log.is_valid() || !is_valid_log_id(log_id) || 0 >= log_ts) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(log), K(log_id), K(log_ts));
} else if (OB_FAIL(check_if_dest_pg_ready_(log.get_spp().get_dest_array(), is_dest_partition_ready))) {
STORAGE_LOG(WARN, "check if dest partition group ready failed", K(ret), K(log));
} else if (!is_dest_partition_ready) {
ret = OB_EAGAIN;
STORAGE_LOG(WARN, "dest partition group is not ready, need retry", K(ret), K(log));
} else if (OB_FAIL(txs_->checkpoint(pkey_, log.get_slave_read_ts(), NULL))) {
STORAGE_LOG(WARN, "split srouce log replay to checkpoint error", K(ret), K(log), K_(pkey), K(log_id), K(log_ts));
} else if (is_dest_split(split_state_.get_state())) {
ret = OB_EAGAIN;
STORAGE_LOG(WARN, "dest partition is splitting", K(ret), K_(split_state));
int tmp_ret = OB_SUCCESS;
int unused_progress = UNKNOWN_SPLIT_PROGRESS;
// drive dest partition split state
if (OB_SUCCESS != (tmp_ret = get_dest_split_progress_(unused_progress))) {
STORAGE_LOG(WARN, "get dest split progress failed", K(tmp_ret), K_(pkey));
}
} else if (!partition_split_progress_array_.is_inited() &&
OB_FAIL(partition_split_progress_array_.init(log.get_spp().get_dest_array()))) {
STORAGE_LOG(WARN, "init partition split progress array failed", K(ret));
} else {
int64_t split_version = log_ts + SPLIT_FREEZE_WAIT_TS;
bool log_success = is_split_source_log_success(split_state_.get_state());
// check if need write slog before switch state
write_slog = (log_success ? false : true);
if (OB_FAIL(split_state_.switch_state(REPLAY_SOURCE_SPLIT_LOG))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret), K_(pkey));
} else {
split_info_.set_split_version(split_version);
split_info_.set_source_log_id(log_id);
split_info_.set_source_log_ts(log_ts);
if (OB_FAIL(
split_info_.set(log.get_schema_version(), log.get_spp(), ObPartitionSplitInfo::SPLIT_SOURCE_PARTITION))) {
} else if (OB_FAIL(shutdown_(split_version, log_id, log.get_schema_version()))) {
STORAGE_LOG(WARN, "shutdown source partition failed", K(ret));
} else if (OB_FAIL(E(EventTable::EN_REPLAY_SOURCE_SPLIT_LOG_FAILED) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_REPLAY_SOURCE_SPLIT_LOG_FAILED", K(ret), K_(pkey));
} else if (OB_FAIL(push_reference_tables_(log.get_spp().get_dest_array(), split_version))) {
STORAGE_LOG(WARN, "push reference tables failed", K(ret));
} else if (OB_FAIL(save_split_info_(split_info_, write_slog))) {
STORAGE_LOG(WARN, "save split info failed", K(ret), K_(pkey));
} else if (OB_FAIL(save_split_state_(split_state_.get_persistent_state(), write_slog))) {
STORAGE_LOG(WARN, "save split state failed", K(ret), K_(pkey));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
}
const int64_t end_ts = ObTimeUtility::current_time();
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN,
"replay split source log failed",
K(ret),
K_(pkey),
K(log_id),
K(log_ts),
"used_time",
end_ts - start_ts);
} else {
STORAGE_LOG(INFO,
"replay split source log success",
K_(pkey),
K(log_id),
K(log_ts),
K(write_slog),
"used_time",
end_ts - start_ts);
}
return ret;
}
int ObPartitionGroup::replay_split_dest_log(const ObPartitionSplitDestLog& log)
{
int ret = OB_SUCCESS;
bool write_slog = false;
bool is_empty_pg = pg_storage_.is_empty_pg();
const int64_t start_ts = ObTimeUtility::current_time();
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition storage is not initialized", K_(pkey), K(ret));
} else if (OB_FAIL(E(EventTable::EN_REPLAY_SPLIT_DEST_LOG_FAILED) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_REPLAY_SPLIT_DEST_LOG_FAILED", K(ret), K_(pkey));
} else if (OB_UNLIKELY(!log.is_valid())) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(log));
} else if (log.get_schema_version() < split_info_.get_schema_version()) {
STORAGE_LOG(INFO, "ignore replay split dest log", K(log), K_(split_info));
} else {
write_slog = (is_split_dest_log_success(split_state_.get_state()) ? false : true);
if (OB_FAIL(split_state_.switch_state(REPLAY_DEST_SPLIT_LOG))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret), K_(pkey));
} else {
int64_t split_state = split_state_.get_persistent_state();
if (is_empty_pg) {
split_state = FOLLOWER_INIT;
}
split_info_.set_split_version(log.get_split_version());
split_info_.set_source_log_id(log.get_source_log_id());
split_info_.set_source_log_ts(log.get_source_log_ts());
if (OB_FAIL(
split_info_.set(log.get_schema_version(), log.get_spp(), ObPartitionSplitInfo::SPLIT_DEST_PARTITION))) {
} else if (OB_FAIL(save_split_info_(split_info_, write_slog))) {
STORAGE_LOG(WARN, "save split info failed", K(ret), K_(pkey));
} else if (OB_FAIL(save_split_state_(split_state, write_slog))) {
STORAGE_LOG(WARN, "save split state failed", K(ret), K_(pkey));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
}
const int64_t end_ts = ObTimeUtility::current_time();
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "replay split dest log failed", K(ret), K_(pkey), K(log), "used_time", end_ts - start_ts);
} else {
STORAGE_LOG(INFO, "replay split dest log success", K_(pkey), K(log), K(write_slog), "used_time", end_ts - start_ts);
}
return ret;
}
int ObPartitionGroup::sync_split_source_log_success(const int64_t log_id, const int64_t log_ts)
{
int ret = OB_SUCCESS;
const bool write_slog = true;
const int64_t start_ts = ObTimeUtility::current_time();
ObTimeGuard tg(__func__, 1000000);
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE);
SpinWLockGuard split_guard(split_lock_);
tg.click();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition storage is not initialized", K_(pkey), K(ret));
} else if (!is_valid_log_id(log_id) || 0 >= log_ts) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(log_id), K(log_ts));
} else if (OB_FAIL(split_state_.switch_state(SOURCE_SPLIT_LOG_REACH_MAJORITY))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
split_info_.set_split_version(log_ts + SPLIT_FREEZE_WAIT_TS);
split_info_.set_source_log_id(log_id);
split_info_.set_source_log_ts(log_ts);
do {
if (OB_FAIL(E(EventTable::EN_SAVE_SPLIT_STATE_FAILED) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_PREPARE_SPLIT_FAILED", K(ret), K_(pkey));
} else if (OB_FAIL(save_split_info_(split_info_, write_slog))) {
STORAGE_LOG(WARN, "save split info failed", K(ret), K_(pkey));
}
} while (OB_CS_OUTOF_DISK_SPACE == ret || OB_SLOG_REACH_MAX_CONCURRENCY == ret || OB_ALLOCATE_MEMORY_FAILED == ret);
do {
if (OB_FAIL(save_split_state_(split_state_.get_persistent_state(), write_slog))) {
STORAGE_LOG(WARN, "save split state failed", K(ret), K_(pkey));
}
} while (OB_CS_OUTOF_DISK_SPACE == ret || OB_SLOG_REACH_MAX_CONCURRENCY == ret || OB_ALLOCATE_MEMORY_FAILED == ret);
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
const int64_t end_ts = ObTimeUtility::current_time();
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN,
"sync split source log failed",
K(ret),
K_(pkey),
K_(split_info),
K(log_id),
K(log_ts),
"used_time",
end_ts - start_ts);
} else {
STORAGE_LOG(INFO,
"sync split source log success",
K_(pkey),
K_(split_info),
K(log_id),
K(log_ts),
"used_time",
end_ts - start_ts);
}
return ret;
}
int ObPartitionGroup::sync_split_dest_log_success()
{
int ret = OB_SUCCESS;
const bool write_slog = true;
const int64_t start_ts = ObTimeUtility::current_time();
bool is_empty_pg = pg_storage_.is_empty_pg();
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition storage is not initialized", K_(pkey), K(ret));
} else if (OB_FAIL(split_state_.switch_state(DEST_SPLIT_LOG_REACH_MAJORITY))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
int64_t split_state = split_state_.get_persistent_state();
if (is_empty_pg) {
split_state = FOLLOWER_INIT;
}
do {
if (OB_FAIL(E(EventTable::EN_SAVE_SPLIT_STATE_FAILED) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_PREPARE_SPLIT_FAILED", K(ret), K_(pkey));
} else if (OB_FAIL(save_split_info_(split_info_, write_slog))) {
STORAGE_LOG(WARN, "save split info failed", K(ret), K_(pkey));
}
} while (OB_CS_OUTOF_DISK_SPACE == ret || OB_SLOG_REACH_MAX_CONCURRENCY == ret || OB_ALLOCATE_MEMORY_FAILED == ret);
do {
if (OB_FAIL(save_split_state_(split_state, write_slog))) {
STORAGE_LOG(WARN, "save split state failed", K(ret), K_(pkey));
}
} while (OB_CS_OUTOF_DISK_SPACE == ret || OB_SLOG_REACH_MAX_CONCURRENCY == ret || OB_ALLOCATE_MEMORY_FAILED == ret);
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
const int64_t end_ts = ObTimeUtility::current_time();
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "sync split dest log failed", K(ret), K_(pkey), "used_time", end_ts - start_ts);
} else {
STORAGE_LOG(INFO, "sync split dest log success", K_(pkey), "used_time", end_ts - start_ts);
}
return ret;
}
int ObPartitionGroup::prepare_splitting(
const ObPartitionSplitInfo& split_info, const ObMemberList& mlist, const ObAddr& leader)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else if (OB_UNLIKELY(!split_info.is_valid()) || OB_UNLIKELY(!mlist.is_valid()) || OB_UNLIKELY(!leader.is_valid())) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(split_info), K(mlist), K(leader));
} else if (is_split_blocked_by_mc_) {
ret = OB_EAGAIN;
STORAGE_LOG(WARN, "partition is changing member list", K(ret), K_(pkey));
} else if (ObReplicaTypeCheck::is_readonly_replica(get_replica_type())) {
STORAGE_LOG(INFO, "readonly replica do nothing", K_(pkey));
} else if (!is_physical_split_finished_()) {
ret = OB_EAGAIN;
STORAGE_LOG(WARN, "source partition physical split not finished", K(ret), K_(pkey));
} else if (ps_->get_self_addr() == leader) {
ObMemberList leader_mlist;
if (!is_leader_state(get_partition_state())) {
ret = OB_NOT_MASTER;
STORAGE_LOG(WARN, "not master", K(ret), K_(pkey));
} else if (OB_FAIL(get_leader_curr_member_list_(leader_mlist))) {
STORAGE_LOG(WARN, "get leader current member list failed", K(ret), K_(pkey));
} else if (!mlist.member_addr_equal(leader_mlist)) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "member list not match", K(ret), K_(pkey), K(leader_mlist), K(mlist));
} else {
// do nothing
}
} else {
// do nothing
}
// Avoid writing multiple slogs
if (OB_SUCC(ret) && split_info.get_schema_version() > split_info_.get_schema_version()) {
const bool write_slog = true;
split_info_.reset();
if (OB_FAIL(split_info_.assign(split_info))) {
STORAGE_LOG(WARN, "assign split info failed", K(ret), K_(pkey));
} else if (OB_FAIL(save_split_info_(split_info_, write_slog))) {
STORAGE_LOG(WARN, "save split info failed", K(ret), K_(pkey));
} else {
STORAGE_LOG(INFO, "save split info success", K_(pkey), K_(split_info));
}
}
if (OB_SUCC(ret)) {
if (OB_FAIL(E(EventTable::EN_PREPARE_SPLIT_FAILED) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_PREPARE_SPLIT_FAILED", K(ret), K_(pkey));
}
}
STORAGE_LOG(INFO, "prepare splitting", K(ret), K(split_info), K(mlist), K(leader));
return ret;
}
/**
Description:
1. The partition will trigger splitting after setting split_info. The transaction layer judges whether current
partition is splitting according to the value of split_info.
Concurrent situation:
1. split_info should be set first and then check transactions before logical split;
2. It means the partition will be split as long as the split_info is valid in the process of transaction and
there will be a list of target partitions.
3. The transaction layer needs to check the split_info with split_lock.
WARNING:
1. The transaction layer may have add the target partition into the participant list after the split_info set.
The transaction commit process will be affected if the creating of split target partition is failed finally.
*/
int ObPartitionGroup::check_cur_partition_split(bool& is_split_partition)
{
int ret = OB_SUCCESS;
const bool split_kill_trans = true;
// SpinWLockGuard guard(split_lock_);
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else if (split_info_.is_valid()) {
is_split_partition = true;
} else {
is_split_partition = false;
}
if (split_kill_trans || !(GET_MIN_CLUSTER_VERSION() > CLUSTER_VERSION_3100)) {
is_split_partition = false;
}
return ret;
}
int ObPartitionGroup::split_source_partition(
const int64_t schema_version, const ObSplitPartitionPair& spp, enum ObSplitProgress& partition_progress)
{
int ret = OB_SUCCESS;
ObMemberList member_list;
ObSArray<ObAddr> member_array;
ObTimeGuard tg(__func__, 1000000);
bool is_all_dest_finished = false;
SpinWLockGuard guard(split_lock_);
tg.click();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition storage is not initialized", K_(pkey), K(ret));
} else if (OB_UNLIKELY(0 >= schema_version) || OB_UNLIKELY(!spp.is_valid())) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(schema_version), K(spp));
} else if (!spp.is_source_partition(pkey_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "self is not source partition", K(ret), K_(pkey));
} else if (spp.is_dest_partition(pkey_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "source partition is in dest partition array", K(ret), K_(pkey));
} else if (schema_version < split_info_.get_schema_version()) {
ret = OB_OLD_SCHEMA_VERSION;
STORAGE_LOG(WARN, "obsolete partition-splitting request", K(ret), K(schema_version), K_(split_info));
} else {
if (!partition_split_progress_array_.is_inited()) {
if (OB_FAIL(partition_split_progress_array_.init(spp.get_dest_array()))) {
STORAGE_LOG(WARN, "init partition split progress array failed", K(ret));
} else {
partition_progress = IN_SPLITTING;
}
} else {
int tmp_progress = UNKNOWN_SPLIT_PROGRESS;
if (OB_FAIL(partition_split_progress_array_.get_min_progress(tmp_progress))) {
STORAGE_LOG(WARN, "get min split progress failed", K(ret));
} else {
partition_progress = static_cast<enum ObSplitProgress>(tmp_progress);
}
}
}
tg.click();
if (OB_SUCCESS == ret) {
bool log_finished = false;
bool is_1pc_trx_end = false;
switch (split_state_.get_state()) {
case LEADER_INIT: {
if (OB_FAIL(split_state_.switch_state(GET_RS_SPLIT_REQUEST))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
if (OB_FAIL(txs_->block_partition(pkey_, is_1pc_trx_end))) {
STORAGE_LOG(WARN, "block partition failed", K(ret));
} else if (OB_FAIL(E(EventTable::EN_PUSH_TASK_FAILED) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_PUSH_TASK_FAILED", K(ret), K_(pkey));
} else if (OB_FAIL(push_split_task_(schema_version, spp))) {
STORAGE_LOG(WARN, "push split task failed", K(ret), K(schema_version), K(spp));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
if (OB_SUCCESS != ret) {
break;
} else {
// go through
}
}
case SPLIT_START: {
if (!is_1pc_trx_end) {
const bool split_kill_trans = true;
if (!split_kill_trans && GET_MIN_CLUSTER_VERSION() > CLUSTER_VERSION_3100) {
if (OB_FAIL(txs_->submit_log_for_split(pkey_, log_finished))) {
STORAGE_LOG(WARN, "submit_log_for_split failed", K(ret));
} else if (log_finished) {
if (OB_FAIL(txs_->wait_1pc_trx_end(pkey_))) {
if (OB_EAGAIN != ret) {
STORAGE_LOG(WARN, "wait 1pc trx end failed", K(ret));
} else {
if (EXECUTE_COUNT_PER_SEC(1)) {
STORAGE_LOG(WARN, "wait 1pc trx end failed", K(ret));
}
}
} else {
is_1pc_trx_end = true;
}
}
} else {
const KillTransArg arg(true, true, false);
if (OB_FAIL(txs_->kill_all_trans(pkey_, arg, is_1pc_trx_end))) {
STORAGE_LOG(WARN, "kill all trans failed", K(ret));
}
}
}
if (OB_SUCCESS == ret && is_1pc_trx_end) {
split_trans_clear_ts_ = MonotonicTs::current_time();
if (OB_FAIL(split_state_.switch_state(ALL_TRANS_CLEAR))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
}
}
if (OB_SUCCESS != ret || !is_1pc_trx_end) {
break;
} else {
// go through
}
}
case SPLIT_TRANS_CLEAR: {
ObPartitionSplitSourceLog log;
int64_t gts = 0;
int64_t base_ts = 0;
MonotonicTs unused_ts = MonotonicTs::current_time();
const transaction::MonotonicTs split_trans_clear_ts =
split_trans_clear_ts_ + transaction::MonotonicTs(SPLIT_FREEZE_WAIT_TS);
if (OB_FAIL(OB_TS_MGR.get_gts(pkey_.get_tenant_id(), split_trans_clear_ts, NULL, gts, unused_ts))) {
if (OB_EAGAIN != ret) {
STORAGE_LOG(WARN, "get gts failed", K(ret));
} else {
// rewrite ret
ret = OB_SUCCESS;
}
} else if (OB_FAIL(decide_split_version_(gts, base_ts))) {
STORAGE_LOG(WARN, "fail to decide split version", K(ret), K_(pkey));
} else if (OB_FAIL(split_state_.switch_state(SUBMIT_SOURCE_SPLIT_LOG_SUCCESS))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
int64_t timestamp = 0;
if (OB_FAIL(E(EventTable::EN_BLOCK_SUBMIT_SPLIT_SOURCE_LOG) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_BLOCK_SUBMIT_SPLIT_SOURCE_LOG", K(ret), K_(pkey));
} else if (OB_FAIL(pg_storage_.get_weak_read_timestamp(timestamp))) {
STORAGE_LOG(WARN, "pg storage get weak read timestamp error", KR(ret), K_(pkey));
} else if (OB_FAIL(log.init(schema_version, spp, timestamp))) {
STORAGE_LOG(WARN, "split source log init failed", K(ret));
} else if (OB_FAIL(submit_split_source_log_(log, base_ts))) {
STORAGE_LOG(WARN, "submit split source log failed", K(ret));
} else {
STORAGE_LOG(INFO, "submit split source log success", K_(pkey));
}
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
break;
}
case SPLIT_SOURCE_LOGGING: {
STORAGE_LOG(INFO, "split source partition is logging");
break;
}
case LEADER_SPLIT_SOURCE_LOG:
// go through
case FOLLOWER_SPLIT_SOURCE_LOG: {
if (OB_FAIL(split_state_.switch_state(SOURCE_SHUTDOWN_SUCCESS))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
const int64_t split_version = split_info_.get_split_version();
const int64_t log_id = split_info_.get_source_log_id();
const int64_t schema_version = split_info_.get_schema_version();
if (OB_FAIL(E(EventTable::EN_BLOCK_SHUTDOWN_PARTITION) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_PUSH_REFERENCE_TABLE_FAIL", K(ret), K_(pkey));
} else if (OB_FAIL(shutdown_(split_version, log_id, schema_version))) {
STORAGE_LOG(WARN, "shutdown source partition failed", K(ret));
}
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
break;
}
case SHUTDOWN_SUCCESS: {
if (OB_FAIL(split_state_.switch_state(SET_REFERENCE_TABLE_SUCCESS))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
const int64_t split_version = split_info_.get_split_version();
bool is_dest_partition_ready = false;
if (OB_FAIL(E(EventTable::EN_PUSH_REFERENCE_TABLE_FAIL) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_PUSH_REFERENCE_TABLE_FAIL", K(ret), K_(pkey));
} else if (OB_FAIL(check_if_dest_pg_ready_(spp.get_dest_array(), is_dest_partition_ready))) {
STORAGE_LOG(WARN, "check if dest partition group ready failed", K(ret), K_(pkey));
} else if (!is_dest_partition_ready) {
ret = OB_EAGAIN;
STORAGE_LOG(WARN, "dest partition group is not ready, need retry", K(ret), K_(pkey));
} else if (OB_FAIL(push_reference_tables_(split_info_.get_dest_partitions(), split_version))) {
STORAGE_LOG(WARN, "push reference tables failed", K(ret), K(split_version));
}
if (OB_SUCCESS != ret) {
(void)split_state_.restore_state();
}
}
break;
}
case TABLE_REFERENCE_SUCCESS: {
if (!is_all_dest_finished) {
if (OB_FAIL(E(EventTable::EN_BLOCK_SPLIT_DEST_PARTITION) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_BLOCK_SPLIT_DEST_PARTITION", K(ret), K_(pkey));
} else if (OB_FAIL(split_dest_partitions_(is_all_dest_finished))) {
STORAGE_LOG(WARN, "split dest partitions failed", K(ret));
} else {
// do nothing
}
}
break;
}
case FOLLOWER_INIT:
// go through
break;
default: {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "unexpected split state", K(ret), K_(split_state));
}
}
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN,
"split source partition failed",
K(ret),
K_(pkey),
K(schema_version),
K(partition_progress),
K(tg),
K_(partition_split_progress_array));
} else if (PHYSICAL_SPLIT_FINISH != partition_progress) {
STORAGE_LOG(INFO,
"receive split source partition request",
K_(pkey),
K(schema_version),
K(partition_progress),
K(tg),
K_(partition_split_progress_array));
} else {
STORAGE_LOG(INFO,
"split source partition success",
K_(pkey),
K(schema_version),
K(partition_progress),
K(tg),
K_(partition_split_progress_array));
}
return ret;
}
int ObPartitionGroup::split_dest_partition(const ObPartitionSplitInfo& split_info, enum ObSplitProgress& progress)
{
int ret = OB_SUCCESS;
int tmp_progress = UNKNOWN_SPLIT_PROGRESS;
ObTimeGuard tg(__func__, 1000000);
ObMemberList mlist;
ObSArray<ObAddr> marray;
SpinWLockGuard guard(split_lock_);
tg.click();
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition storage is not initialized", K_(pkey), K(ret));
} else if (0 >= split_info.get_split_version() || 0 > split_info.get_schema_version() ||
!is_valid_log_id(split_info.get_source_log_id()) || !split_info.get_spp().is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(split_info));
} else if (OB_FAIL(get_leader_curr_member_list_(mlist))) {
STORAGE_LOG(WARN, "get leader curr member list failed", K(ret));
} else if (split_info.get_schema_version() < split_info_.get_schema_version()) {
progress = PHYSICAL_SPLIT_FINISH;
STORAGE_LOG(WARN, "receive obsolete split dest request", K_(pkey), K(split_info), K(split_info));
} else if (!replica_split_progress_array_.is_inited() || !mlist.member_addr_equal(saved_member_list_)) {
replica_split_progress_array_.reset();
if (OB_FAIL(mlist.get_addr_array(marray))) {
STORAGE_LOG(WARN, "get addr array failed", K(ret), K(mlist));
} else if (OB_FAIL(replica_split_progress_array_.init(marray))) {
STORAGE_LOG(WARN, "init replica split progress array failed", K(ret), K(marray));
} else {
progress = IN_SPLITTING;
saved_member_list_ = mlist;
}
} else if (OB_FAIL(replica_split_progress_array_.get_min_progress(tmp_progress))) {
STORAGE_LOG(WARN, "get min progress failed", K(ret), K_(replica_split_progress_array));
} else {
progress = static_cast<ObSplitProgress>(tmp_progress);
}
if (OB_SUCCESS == ret && PHYSICAL_SPLIT_FINISH > progress) {
if (OB_FAIL(query_replica_split_progress_(split_info.get_schema_version()))) {
STORAGE_LOG(WARN, "query replica split progress failed", K(ret));
}
}
bool is_complete = false;
const int64_t split_version = split_info.get_split_version();
if (OB_SUCCESS == ret && PHYSICAL_SPLIT_FINISH > progress) {
switch (split_state_.get_state()) {
case LEADER_INIT: {
break;
}
case LEADER_WAIT_SPLIT: {
if (OB_FAIL(check_complete(is_complete))) {
STORAGE_LOG(WARN, "failed to check complete", K(ret), K(pkey_));
} else if (!is_complete) {
ret = OB_EAGAIN;
if (REACH_TIME_INTERVAL(5 * 1000 * 1000)) {
REPLAY_LOG(WARN, "dest partition not complete, should retry", K(ret), K(pkey_));
}
} else if (OB_FAIL(OB_TS_MGR.wait_gts_elapse(pkey_.get_tenant_id(), split_version))) {
STORAGE_LOG(WARN, "failed to wait gts elapse", K(ret), K(split_info));
} else if (OB_FAIL(split_state_.switch_state(GET_SOURCE_SPLIT_REQUEST))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
ObPartitionSplitDestLog log;
if (OB_FAIL(split_info_.init(
split_info.get_schema_version(), split_info.get_spp(), ObPartitionSplitInfo::SPLIT_DEST_PARTITION))) {
STORAGE_LOG(WARN, "partition split info init failed", K(ret));
} else {
split_info_.set_split_version(split_version);
split_info_.set_source_log_id(split_info.get_source_log_id());
split_info_.set_source_log_ts(split_info.get_source_log_ts());
if (OB_FAIL(log.init(split_version,
split_info.get_schema_version(),
split_info.get_source_log_id(),
split_info.get_source_log_ts(),
split_info.get_spp()))) {
STORAGE_LOG(WARN, "split dest log init failed", K(ret));
} else if (OB_FAIL(submit_split_dest_log_(log))) {
STORAGE_LOG(WARN, "submit split dest log failed", K(ret));
} else {
// do nothing
}
}
if (OB_SUCCESS != ret) {
split_info_.reset();
(void)split_state_.restore_state();
}
}
break;
}
case SPLIT_DEST_LOGGING: {
STORAGE_LOG(INFO, "split dest partition is logging");
break;
}
case LEADER_LOGICAL_SPLIT_SUCCESS: {
bool is_physical_split_finished = false;
if (OB_FAIL(check_physical_split_(is_physical_split_finished))) {
STORAGE_LOG(WARN, "check physical split failed", K(ret));
} else if (is_physical_split_finished) {
if (OB_FAIL(split_state_.switch_state(PHYSICAL_SPLIT_SUCCESS))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
}
} else {
// do nothing
}
break;
}
case FOLLOWER_INIT:
// go through
case FOLLOWER_WAIT_SPLIT:
// go through
case FOLLOWER_LOGICAL_SPLIT_SUCCESS: {
ret = OB_NOT_MASTER;
STORAGE_LOG(WARN, "dest partition is not master", K(ret), K_(split_state));
break;
}
default: {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "unexpected split state", K(ret), K_(split_state));
}
}
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN,
"split dest partition failed",
K(ret),
K_(pkey),
K(tg),
K(split_info),
K(progress),
K_(replica_split_progress_array));
} else if (PHYSICAL_SPLIT_FINISH != progress) {
STORAGE_LOG(INFO,
"receive split dest partition request",
K_(pkey),
K(tg),
K(split_info),
K(progress),
K_(replica_split_progress_array));
} else {
STORAGE_LOG(INFO, "split dest partition success", K_(pkey), K(tg), K(split_info));
}
return ret;
}
int ObPartitionGroup::push_reference_tables(const ObIArray<ObPartitionKey>& dest_array, const int64_t split_version)
{
int ret = OB_SUCCESS;
bool is_dest_partition_ready = false;
SpinWLockGuard guard(split_lock_);
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (OB_FAIL(check_if_dest_pg_ready_(dest_array, is_dest_partition_ready))) {
STORAGE_LOG(WARN, "check if dest partition group ready failed", K(ret));
} else if (!is_dest_partition_ready) {
ret = OB_EAGAIN;
STORAGE_LOG(WARN, "dest partition group is not ready, need retry", K(ret));
} else if (OB_FAIL(push_reference_tables_(dest_array, split_version))) {
STORAGE_LOG(WARN, "failed to push reference tables", K(ret), K(dest_array), K(split_version));
}
return ret;
}
int ObPartitionGroup::replay_split_state_slog(const ObSplitPartitionStateLogEntry& log_entry)
{
int ret = OB_SUCCESS;
const bool write_slog = false;
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (!log_entry.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(log_entry));
} else if (OB_FAIL(split_state_.set_state(log_entry.get_state()))) {
STORAGE_LOG(WARN, "set split state failed", K(ret));
} else if (OB_FAIL(save_split_state_(log_entry.get_state(), write_slog))) {
STORAGE_LOG(WARN, "save split state failed", K(ret), K(log_entry));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "replay split state slog failed", K(ret), K_(pkey), K(log_entry));
} else {
STORAGE_LOG(INFO, "replay split state slog success", K_(pkey), K(log_entry));
}
return ret;
}
int ObPartitionGroup::replay_split_info_slog(const ObSplitPartitionInfoLogEntry& log_entry)
{
int ret = OB_SUCCESS;
const bool write_slog = false;
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
split_info_.set_split_version(log_entry.get_split_info().get_split_version());
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (!log_entry.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(log_entry));
} else if (OB_FAIL(split_info_.set(log_entry.get_split_info().get_schema_version(),
log_entry.get_split_info().get_spp(),
log_entry.get_split_info().get_split_type()))) {
STORAGE_LOG(WARN, "set split info failed", K(ret));
} else if (OB_FAIL(save_split_info_(split_info_, write_slog))) {
STORAGE_LOG(WARN, "save split info failed", K(ret), K(log_entry));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "replay split info slog failed", K(ret), K_(pkey), K(log_entry));
} else {
STORAGE_LOG(INFO, "replay split info slog success", K_(pkey), K(log_entry));
}
return ret;
}
int ObPartitionGroup::set_dest_partition_split_progress(
const int64_t schema_version, const ObPartitionKey& pkey, const int progress)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (0 > schema_version || !pkey.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(schema_version), K_(pkey));
// ignore warning
} else if (schema_version < split_info_.get_schema_version()) {
STORAGE_LOG(WARN, "schema version not match, ignore it", K(schema_version), K_(split_info));
} else if (schema_version > split_info_.get_schema_version()) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "unexpected schema version", K(ret), K(schema_version), K_(split_info));
} else if (OB_FAIL(partition_split_progress_array_.set_progress(pkey, progress))) {
STORAGE_LOG(WARN, "set progress failed", K(ret), K(schema_version), K(pkey), K(progress));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::get_reference_tables(const ObPartitionKey& pkey, const int64_t index_id, ObTablesHandle& handle)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (0 >= index_id) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(index_id));
} else {
ret = get_reference_tables_(pkey, index_id, handle);
}
return ret;
}
int ObPartitionGroup::get_reference_memtables(ObTablesHandle& handle)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else {
ret = pg_storage_.get_reference_memtables(handle);
}
return ret;
}
int ObPartitionGroup::get_all_table_ids(const ObPartitionKey& pkey, ObIArray<uint64_t>& index_tables)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else {
ret = get_all_table_ids_(pkey, index_tables);
}
return ret;
}
int ObPartitionGroup::set_reference_tables(const ObPartitionKey& pkey, const int64_t index_id, ObTablesHandle& handle)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (0 >= index_id) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(index_id));
} else if (OB_FAIL(set_reference_tables_(pkey, index_id, handle))) {
STORAGE_LOG(WARN, "set reference tables failed", K(ret), K(index_id));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "set reference tables failed", K(ret), K_(pkey), K(index_id));
} else {
STORAGE_LOG(INFO, "set reference tables success", K_(pkey), K(index_id));
}
return ret;
}
int ObPartitionGroup::set_split_version(const int64_t split_version)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinWLockGuard guard(split_lock_);
tg.click();
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (0 >= split_version) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(split_version));
} else {
ret = set_split_version_(split_version);
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "set split version failed", K(ret), K_(pkey), K(split_version));
} else {
STORAGE_LOG(INFO, "set split version success", K_(pkey), K(split_version));
}
return ret;
}
int ObPartitionGroup::check_can_migrate(bool& can_migrate)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.check_can_migrate(can_migrate))) {
STORAGE_LOG(WARN, "failed to check can migrate", K(ret));
}
return ret;
}
bool ObPartitionGroup::is_splitting() const
{
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
return in_splitting(split_state_.get_state());
}
bool ObPartitionGroup::is_split_source_partition() const
{
return split_info_.get_src_partition() == pkey_;
}
bool ObPartitionGroup::is_in_dest_split() const
{
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
return is_dest_split(split_state_.get_state());
}
bool ObPartitionGroup::is_dest_logical_split_finish() const
{
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
return is_logical_split_dest_finish(split_state_.get_state());
}
int ObPartitionGroup::check_split_state_() const
{
int ret = OB_SUCCESS;
if (is_splitting_()) {
ret = OB_PARTITION_IS_SPLITTING;
}
return ret;
}
int ObPartitionGroup::check_split_state() const
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
return check_split_state_();
}
int ObPartitionGroup::get_split_progress(const int64_t schema_version, int& progress)
{
int ret = OB_SUCCESS;
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
if (schema_version <= 0) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(schema_version));
} else if (ObReplicaTypeCheck::is_log_replica(get_replica_type())) {
progress = NEED_NOT_SPLIT;
} else if (schema_version < split_info_.get_schema_version()) {
progress = PHYSICAL_SPLIT_FINISH;
} else {
ret = get_dest_split_progress_(progress);
}
TRANS_LOG(INFO, "get split progress", K(ret), K_(pkey), K(progress), K_(split_state));
return ret;
}
int ObPartitionGroup::set_split_progress(const ObAddr& replica, const int progress)
{
int ret = OB_SUCCESS;
if (!is_leader_state(get_partition_state())) {
ret = OB_NOT_MASTER;
STORAGE_LOG(WARN, "not master", K(ret), K(replica), K(progress));
} else if (OB_FAIL(replica_split_progress_array_.set_progress(replica, progress))) {
STORAGE_LOG(WARN, "set split progress failed", K(ret), K(replica), K(progress));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::block_partition_split_by_mc()
{
int ret = OB_SUCCESS;
SpinWLockGuard guard(split_lock_);
const ObPartitionKey& src_partition = split_info_.get_src_partition();
if (src_partition == pkey_) {
ret = OB_PARTITION_IS_SPLITTING;
STORAGE_LOG(INFO, "source partition is splitting or splitted", K(ret), K_(pkey), K_(split_info));
} else {
if (is_splitting_()) {
ret = OB_PARTITION_IS_SPLITTING;
STORAGE_LOG(INFO, "partition is splitting", K(ret), K_(pkey));
}
}
if (OB_SUCCESS == ret) {
is_split_blocked_by_mc_ = true;
}
return ret;
}
int ObPartitionGroup::unblock_partition_split_by_mc()
{
int ret = OB_SUCCESS;
SpinWLockGuard guard(split_lock_);
is_split_blocked_by_mc_ = false;
return ret;
}
int64_t ObPartitionGroup::get_freeze_snapshot_ts() const
{
int ret = OB_SUCCESS;
int64_t snapshot_version = 0;
if (OB_FAIL(freeze_record_.get_snapshot_version(snapshot_version))) {
snapshot_version = -1;
}
return snapshot_version;
}
ObPartitionState ObPartitionGroup::get_partition_state() const
{
return ATOMIC_LOAD(&partition_state_);
}
// return value : OB_SUCCESS, abbr. 'Y'
// return value : OB_STATE_NOT_MATCH, abbr. 'N'
// return value : OB_EAGAIN, abbr. 'A'
// return value : OB_NOT_MASTER, abbr. 'NM'
// -----------------------------------------------------------------------
// |\ | I | F | F | L | L | L | L | L | L | L | O | O | R | I |
// | \ new | N | _ | _ | _ | _ | _ | _ | _ | _ | _ | F | F | E | N |
// | \ state | I | W | M | T | T | C | W | F | M | R | F | F | M | V |
// | \ | T | O | I | A | A | A | O | R | I | E | L | L | O | A |
// | \ | | R | N | K | K | N | R | O | N | V | I | I | V | L |
// | \ | | K | O | E | E | C | K | Z | O | O | N | N | E | I |
// | \ | | I | R | O | O | E | I | E | R | K | I | E | | D |
// | \ | | N | | V | V | L | N | N | | E | N | | | _ |
// | \ | | G | | E | E | E | G | | | | G | | | S |
// | \ | | | | R | R | D | | | | | | | | T |
// | old \ | | | | | E | | | | | | | | | A |
// | state \ | | | | | D | | | | | | | | | T |
// | \| | | | | | | | | | | | | | E |
// -----------------------------------------------------------------------
// |INIT | N | Y | N | N | N | N | N | N | N | N | N | N | Y | Y |
// -----------------------------------------------------------------------
// |F_WORKING | N | N | Y | Y | N | N | N | NM| N | N | Y | N | Y | N |
// -----------------------------------------------------------------------
// |F_MINOR | N | Y | N | N | N | N | N | NM| N | N | A | N | A | N |
// -----------------------------------------------------------------------
// |L_TAKEOVER | N | N | N | N | Y | N | N | A | A | A | N | N | N | N |
// -----------------------------------------------------------------------
// |L_TAKEOVERED | N | N | N | N | N | Y | Y | A | A | A | N | N | N | N |
// -----------------------------------------------------------------------
// |L_CANCELED | N | N | N | N | N | N | N | NM| N | Y | N | N | N | N |
// -----------------------------------------------------------------------
// |L_WORKING | N | N | N | N | N | N | N | Y | Y | Y | N | N | Y | N |
// -----------------------------------------------------------------------
// |L_FROZEN | N | N | N | N | N | N | Y | A | A | A | N | N | N | N |
// -----------------------------------------------------------------------
// |L_MINOR | N | N | N | N | N | N | Y | A | A | A | N | N | N | N |
// -----------------------------------------------------------------------
// |L_REVOKE | N | Y | N | N | N | N | N | NM| N | N | A | N | A | N |
// -----------------------------------------------------------------------
// |OFFLINING | N | N | N | N | N | N | N | N | N | N | N | Y | Y | N |
// -----------------------------------------------------------------------
// |OFFLINE | N | Y | N | N | N | N | N | N | N | N | N | N | Y | N |
// -----------------------------------------------------------------------
// |REMOVE | N | N | N | N | N | N | N | N | N | N | N | N | N | Y |
// -----------------------------------------------------------------------
// |INVALID_STATE| N | N | N | N | N | N | N | N | N | N | N | N | Y | N |
// -----------------------------------------------------------------------
//
// TODO Manage the state of partition and PG level together
int ObPartitionGroup::try_switch_partition_state(const ObPartitionState state)
{
int ret = OB_SUCCESS;
static const int N = OB_STATE_NOT_MATCH;
static const int Y = OB_SUCCESS;
static const int A = OB_EAGAIN;
static const int NM = OB_NOT_MASTER;
static const int64_t STATE_NUM = INVALID_STATE + 1;
static const int STATE_MATRIX[STATE_NUM][STATE_NUM] = {{N, Y, N, N, N, N, N, N, N, N, N, N, Y, Y},
{N, N, Y, Y, N, N, N, NM, N, N, Y, N, Y, N},
{N, Y, N, N, N, N, N, NM, N, N, A, N, A, N},
{N, N, N, N, Y, N, N, A, A, A, N, N, N, N},
{N, N, N, N, N, Y, Y, A, A, A, N, N, N, N},
{N, N, N, N, N, N, N, NM, N, Y, N, N, N, N},
{N, N, N, N, N, N, N, Y, Y, Y, N, N, Y, N},
{N, N, N, N, N, N, Y, A, A, A, N, N, N, N},
{N, N, N, N, N, N, Y, A, A, A, N, N, N, N},
{N, Y, N, N, N, N, N, NM, N, N, A, N, A, N},
{N, N, N, N, N, N, N, N, N, N, N, Y, Y, N},
{N, Y, N, N, N, N, N, N, N, N, N, N, Y, N},
{N, N, N, N, N, N, N, N, N, N, N, N, N, Y},
{N, N, N, N, N, N, N, N, N, N, N, N, Y, N}};
ObTimeGuard time_guard("try_switch_partition_state", 100 * 1000);
lib::ObMutexGuard guard(partition_state_lock_);
time_guard.click();
ObPartitionState cur_state = get_partition_state();
ret = STATE_MATRIX[cur_state][state];
if (OB_SUCC(ret)) {
ATOMIC_STORE(&partition_state_, state);
}
if (OB_FAIL(ret)) {
STORAGE_LOG(WARN, "switch partition state failed", K(pkey_), K(cur_state), K(state), K(ret));
} else {
STORAGE_LOG(INFO, "switch partition state successfully", K(pkey_), K(cur_state), K(state));
}
return ret;
}
int ObPartitionGroup::switch_partition_state(const ObPartitionState state)
{
int ret = OB_SUCCESS;
int64_t start_ts = ObTimeUtility::current_time();
int64_t current_ts = 0;
const int64_t PRINT_INTERVAL = 10000;
const int64_t SLEEP_US = 1000;
do {
if (OB_FAIL(try_switch_partition_state(state))) {
current_ts = ObTimeUtility::current_time();
if (current_ts - start_ts > PRINT_INTERVAL) {
start_ts = current_ts;
}
if (OB_EAGAIN == ret) {
ObTransCond::usleep(SLEEP_US);
}
}
} while (OB_EAGAIN == ret);
return ret;
}
// Only check the schema_version of user table.
// System tables and dummy do not need to check, just skip it.
int ObPartitionGroup::check_schema_version(share::schema::ObMultiVersionSchemaService* schema_service)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (OB_UNLIKELY(NULL == schema_service)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), KP(schema_service));
} else if (extract_pure_id(pkey_.get_table_id()) < OB_MIN_USER_TABLE_ID) {
// skip it,
} else if (OB_FAIL(schema_version_container_.check_base_schema_version(schema_service, pkey_))) {
STORAGE_LOG(WARN, "fail to check base schema_version", K(ret), K(pkey_));
} else {
} // do nothing
return ret;
}
int ObPartitionGroup::set_base_schema_version(int64_t base_schema_version)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init", K(ret));
} else if (base_schema_version < 0) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(base_schema_version));
} else if (OB_FAIL(schema_version_container_.update_and_leader_takeover(base_schema_version))) {
STORAGE_LOG(WARN, "fail to update and leader_takeover", K(ret), K(base_schema_version));
} else {
} // do nothing
return ret;
}
int ObPartitionGroup::SchemaVersionContainer::update_and_leader_takeover(int64_t base_schema_version)
{
int ret = OB_SUCCESS;
if (base_schema_version < 0) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(base_schema_version));
} else {
base_schema_version_ = base_schema_version;
// mem barrier to make sure execute sequence
ATOMIC_STORE(&schema_version_checked_, false);
}
return ret;
}
int ObPartitionGroup::get_latest_schema_version(share::schema::ObMultiVersionSchemaService* schema_service,
const common::ObPartitionKey& pkey, int64_t& latest_schema_version)
{
int ret = OB_SUCCESS;
const uint64_t tenant_id = is_inner_table(pkey.get_table_id()) ? OB_SYS_TENANT_ID : pkey.get_tenant_id();
if (OB_UNLIKELY(NULL == schema_service) || !pkey.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(pkey), KP(schema_service));
} else if (OB_FAIL(schema_service->get_tenant_refreshed_schema_version(tenant_id, latest_schema_version))) {
LOG_WARN("fail to get tenant refresh schema version", K(ret), K(pkey));
}
return ret;
}
int ObPartitionGroup::SchemaVersionContainer::check_base_schema_version(
share::schema::ObMultiVersionSchemaService* schema_service, common::ObPartitionKey& pkey)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(NULL == schema_service) || OB_UNLIKELY(!pkey.is_valid())) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), KP(schema_service), K(pkey));
} else {
bool is_schema_valid = false;
int64_t retry_cnt = 0;
int64_t latest_schema_version = 0;
const int64_t SLEEP_ON_NEED_RETRY = 15 * 1000; // 15ms
const uint64_t tenant_id = is_inner_table(pkey.get_table_id()) ? OB_SYS_TENANT_ID : pkey.get_tenant_id();
while (!is_schema_valid && OB_SUCCESS == ret) {
bool schema_version_checked = ATOMIC_LOAD(&schema_version_checked_);
if (schema_version_checked) {
// follower or leader already checked, go on break
break;
} else {
if (OB_FAIL(schema_service->get_tenant_refreshed_schema_version(tenant_id, latest_schema_version))) {
STORAGE_LOG(
WARN, "fail to get tenant refreshed schema version", K(ret), K(tenant_id), K(latest_schema_version));
} else if (latest_schema_version >= base_schema_version_) {
is_schema_valid = true;
}
}
retry_cnt++;
if (is_schema_valid) {
(void)ATOMIC_BCAS(&schema_version_checked_, schema_version_checked, true);
} else if (retry_cnt >= INVALID_SCHEMA_RETRY_CNT || THIS_WORKER.is_timeout()) {
const bool is_timeout = THIS_WORKER.is_timeout();
ret = OB_SCHEMA_ERROR;
STORAGE_LOG(WARN,
"schema version error",
K(ret),
K(base_schema_version_),
K(latest_schema_version),
K(retry_cnt),
K(is_timeout));
} else {
usleep(SLEEP_ON_NEED_RETRY); // retry
STORAGE_LOG(WARN, "retry to get schema", K(retry_cnt), K(base_schema_version_), K(latest_schema_version));
}
}
}
return ret;
}
int ObPartitionGroup::do_warm_up_request(const ObIWarmUpRequest* request)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not inited", K(ret));
} else if (OB_FAIL(pg_storage_.do_warm_up_request(request))) {
STORAGE_LOG(WARN, "failed to do warm up request", K(ret));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::check_can_do_merge(bool& can_merge, bool& need_merge)
{
int ret = OB_SUCCESS;
can_merge = false;
need_merge = true;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else if (OB_ISNULL(pls_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "partition log service is empty", K_(pkey), K(ret));
} else {
ObPartitionState state = get_partition_state();
if (!is_leader_state(state) && !is_follower_state(state)) {
need_merge = false;
STORAGE_LOG(WARN,
"current partition is not normal replica, "
"no need to merge",
K_(pkey),
K(state),
K(need_merge));
} else {
can_merge = true;
}
}
if (OB_SUCC(ret)) {
if (OB_FAIL(E(EventTable::EN_CHECK_CAN_DO_MERGE) OB_SUCCESS)) {
STORAGE_LOG(WARN, "ERRSIM: EN_CHECK_CAN_DO_MERGE", K(ret), K_(pkey));
if (OB_EAGAIN == ret) {
ret = OB_SUCCESS;
need_merge = true;
can_merge = false;
}
}
}
return ret;
}
ObReplicaType ObPartitionGroup::get_replica_type() const
{
int tmp_ret = OB_SUCCESS;
ObReplicaType replica_type = ObReplicaType::REPLICA_TYPE_MAX;
if (OB_SUCCESS != (tmp_ret = pg_storage_.get_replica_type(replica_type))) {
STORAGE_LOG(WARN, "get replica_type error", K(tmp_ret), K_(pkey), K(replica_type));
}
return replica_type;
}
ObReplicaProperty ObPartitionGroup::get_replica_property() const
{
int ret = OB_SUCCESS;
ObReplicaProperty replica_property;
if (OB_FAIL(pg_storage_.get_replica_property(replica_property))) {
STORAGE_LOG(WARN, "get replica property error", K(ret), K(pkey_), K(replica_property));
}
return replica_property;
}
int ObPartitionGroup::set_replica_type_(const ObReplicaType& replica_type, const bool write_redo_log)
{
int ret = OB_SUCCESS;
ObReplicaType origin_replica_type;
ObPartitionReplicaState state = OB_UNKNOWN_REPLICA;
if (OB_FAIL(pg_storage_.get_replica_type(origin_replica_type))) {
STORAGE_LOG(WARN, "get replica type error", K(ret), K_(pkey));
} else if (replica_type != origin_replica_type) {
if (OB_FAIL(pg_storage_.get_replica_state(false, state))) {
STORAGE_LOG(WARN, "get replica state error", K_(pkey), K(ret));
} else if (OB_FAIL(pls_->set_replica_type(replica_type))) {
STORAGE_LOG(WARN, "set replica_type to partition log service failed", K_(pkey), K(replica_type), K(ret));
} else {
int tmp_ret = OB_SUCCESS;
if (!ObReplicaTypeCheck::is_replica_with_memstore(replica_type) && OB_NORMAL_REPLICA == state) {
STORAGE_LOG(INFO, "clear pending log task", K_(pkey), K(ret));
if (OB_SUCCESS != (tmp_ret = replay_status_->set_need_filter_trans_log(pkey_, true /*need filter*/))) {
STORAGE_LOG(ERROR,
"fail to set_need_filter_trans_log",
K(tmp_ret),
K_(pkey),
K(origin_replica_type),
K(replica_type));
} else if (OB_SUCCESS != (tmp_ret = txs_->clear_all_ctx(pkey_))) {
STORAGE_LOG(ERROR,
"fail to clear all trans ctx after set replica type",
K(tmp_ret),
K_(pkey),
K(origin_replica_type),
K(replica_type));
}
}
if (OB_FAIL(pg_storage_.set_pg_replica_type(replica_type, write_redo_log))) {
STORAGE_LOG(WARN, "set replica_type to storage failed", K_(pkey), K(replica_type), K(ret));
} else {
if (ObReplicaTypeCheck::is_log_replica(replica_type)) {
if (OB_FAIL(save_split_state(static_cast<int64_t>(ObPartitionSplitStateEnum::FOLLOWER_INIT)))) {
STORAGE_LOG(WARN, "failed to save split state", K(ret), K_(pkey));
} else {
STORAGE_LOG(INFO, "set replica_type successfully", K_(pkey), K(replica_type));
}
}
}
}
} else {
STORAGE_LOG(INFO, "no need to set replica_type", K_(pkey), K(replica_type));
}
return ret;
}
int ObPartitionGroup::set_replica_type(const ObReplicaType& replica_type, const bool write_redo_log)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else if (OB_ISNULL(replay_status_) || OB_ISNULL(pls_) || OB_ISNULL(txs_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "invalid members", KP(replay_status_), KP(pls_), KP(txs_), K_(pkey), K(ret));
} else {
ObPGLockWithPendingReplayGuard guard(
lock_, *replay_status_, pkey_, 0, PGLOCKREPLAY | PGLOCKCLOG | PGLOCKTRANS | PGLOCKSTORAGE);
ret = set_replica_type_(replica_type, write_redo_log);
}
return ret;
}
// Get the weak read timestamp from cache.
// The cache is refreshed by trans_view thread.
int ObPartitionGroup::get_weak_read_timestamp(int64_t& timestamp)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
} else if (OB_FAIL(pg_storage_.get_weak_read_timestamp(timestamp))) {
STORAGE_LOG(WARN, "pg storage get weak read timestamp error", KR(ret), K_(pkey));
} else {
// do nothing
}
return ret;
}
int64_t ObPartitionGroup::get_cur_min_log_service_ts()
{
return partition_loop_worker_.get_cur_min_log_service_ts();
}
int64_t ObPartitionGroup::get_cur_min_trans_service_ts()
{
return partition_loop_worker_.get_cur_min_trans_service_ts();
}
int64_t ObPartitionGroup::get_cur_min_replay_engine_ts()
{
return partition_loop_worker_.get_cur_min_replay_engine_ts();
}
int ObPartitionGroup::update_last_checkpoint(const int64_t checkpoint)
{
return partition_loop_worker_.update_last_checkpoint(checkpoint);
}
int ObPartitionGroup::set_replay_checkpoint(const int64_t checkpoint)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret), K_(pkey));
} else if (OB_FAIL(partition_loop_worker_.set_replay_checkpoint(checkpoint))) {
STORAGE_LOG(WARN, "partition loop worker set replay checkpoint error", K(ret), K_(pkey));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::get_replay_checkpoint(int64_t& checkpoint)
{
return partition_loop_worker_.get_replay_checkpoint(checkpoint);
}
int ObPartitionGroup::generate_weak_read_timestamp(const int64_t max_stale_time, int64_t& timestamp)
{
return partition_loop_worker_.generate_weak_read_timestamp(max_stale_time, timestamp);
}
int ObPartitionGroup::do_partition_loop_work()
{
return partition_loop_worker_.do_partition_loop_work();
}
int ObPartitionGroup::need_minor_freeze(const uint64_t& log_id, bool& need_freeze)
{
int ret = OB_SUCCESS;
need_freeze = false;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else if (OB_UNLIKELY(OB_INVALID_ID == log_id)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(log_id), K(ret));
} else if (0 == log_id) {
// need_freeze = false;
} else if (OB_FAIL(pg_storage_.check_need_minor_freeze(log_id, need_freeze))) {
STORAGE_LOG(WARN, "check need minor freeze error", K(ret), K_(pkey));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::set_emergency_release()
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K_(pkey), K(ret));
} else if (OB_FAIL(pg_storage_.set_emergency_release())) {
STORAGE_LOG(WARN, "pg storage set emergency release error", K(ret), K_(pkey));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::get_freeze_log_(const bool is_leader, uint64_t& log_id, int64_t& log_ts)
{
int ret = OB_SUCCESS;
log_id = OB_INVALID_ID;
log_ts = OB_INVALID_TIMESTAMP;
uint64_t max_majority_log_id = OB_INVALID_ID;
int64_t max_majority_log_ts = OB_INVALID_TIMESTAMP;
uint64_t next_replay_log_id = OB_INVALID_ID;
int64_t next_replay_log_ts = OB_INVALID_TIMESTAMP;
if (is_leader) {
(void)pls_->get_max_majority_log(max_majority_log_id, max_majority_log_ts);
if (OB_FAIL(pls_->get_next_replay_log_info(next_replay_log_id, next_replay_log_ts))) {
STORAGE_LOG(WARN, "get next replay log info failed", K(ret), K(*this));
} else {
log_id = MAX(max_majority_log_id, next_replay_log_id - 1);
log_ts = MAX(max_majority_log_ts, next_replay_log_ts - 1);
}
} else {
if (OB_FAIL(get_min_replayed_log_with_keepalive(log_id, log_ts))) {
STORAGE_LOG(WARN, "get min replay log with keepalive failed", K(ret), K(*this));
}
}
return ret;
}
// We guarantee that the data before freeze_id and the transactions before snapshot_version
// must be on the frozen memstore.
int ObPartitionGroup::get_freeze_cut_(ObMemtable& frozen_memtable, const bool is_leader, int64_t& snapshot_version,
uint64_t& freeze_id, int64_t& freeze_ts)
{
int ret = OB_SUCCESS;
int64_t start_log_ts = frozen_memtable.get_start_log_ts();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K(ret), K_(pkey));
} else if (OB_ISNULL(pls_) || OB_ISNULL(replay_status_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition member is NULL", K(ret), K_(pkey), KP(pls_), KP(replay_status_));
} else {
if (is_leader) {
// 1. The freeze_id of leader is the maximum log id that is submitted in sliding window.
// This is mutually exclusive with transaction submission to ensure that transactions across
// frozen point will be marked dirty
SpinWLockGuard guard(freeze_lock_);
if (OB_FAIL(get_and_submit_freeze_(frozen_memtable, true /*is_leader*/, freeze_id, freeze_ts))) {
STORAGE_LOG(WARN,
"get and submit freeze id failed",
K(frozen_memtable),
K(is_leader),
K(freeze_id),
K(freeze_ts),
K(*this));
}
} else {
// 2. The freeze_id of follower is the the maximum log id of the right
// boundary of replay queue and the max majoritied log id
// The follower will block the replay, wait it to be empty and then get the freeze_id.
if (OB_FAIL(wait_follower_no_pending_task_())) {
STORAGE_LOG(WARN, "wait follower no pending task failed", K(is_leader), K(freeze_id), K(*this));
} else if (OB_FAIL(get_and_submit_freeze_(frozen_memtable, false /*is_leader*/, freeze_id, freeze_ts))) {
STORAGE_LOG(WARN,
"get and submit freeze id failed",
K(frozen_memtable),
K(is_leader),
K(freeze_id),
K(freeze_ts),
K(*this));
} else {
// The logic below is sophistic:
//
// If you remember the semantic of end_log_ts and max_log_ts belong to
// the memstore, you will know that all data belong to the log before
// end_log_ts is within the memstore, and the data may or maynot exist
// in the memstore if the log creates the data is between end_log_ts and
// max_log_ts
//
// In terms of the minor freeze, follower needs to wait until replaying
// to a continuous log point and fetch the freeze point. While follower
// cannot use the min replayed log ts both as the end_log_ts and
// max_log_ts.
//
// To see why the more sophistic max_log_ts calculation is required,
// consider the following example:
// 1. Leader submits the log 5,6,7 and only log 7 is in quorum using
// paxos and its data is already filled in the memstore
// 2. Leader switches to the follower and the min replayed log ts is
// smaller than the log 5's log_ts
// 3. If we just use the min replayed log ts as both the end_log_ts and
// max_log_ts the semantic specified above is broken
//
// So we need maintain the max_log_ts using the log 7's timestamp, in
// terms of the implementation, we use the max_majority_log_ts which is
// updated after each log's synchronization of leader.
//
// What's more, we need mark all data whose log is between end_log_ts to
// max_log_ts as overflow(the requirement from the storage layer). while
// the data may already synced and we have no chance to mark the data
// except traversing all data in the memtable. So we choose to mark the
// end_log_ts as the max_majority_log_ts as well.
//
// NB: we never maintain the max_mjority_log_ts for follower, so we just
// use the variable for the corner case of leader transfer.
uint64_t max_majority_log_id = OB_INVALID_ID;
int64_t max_majority_log_ts = OB_INVALID_TIMESTAMP;
(void)pls_->get_max_majority_log(max_majority_log_id, max_majority_log_ts);
if (max_majority_log_ts > freeze_ts) {
TRANS_LOG(WARN,
"max majority log ts is larger than freeze timestamp",
K(max_majority_log_ts),
K(freeze_ts),
K(*this));
ret = OB_EAGAIN;
}
}
}
if (OB_FAIL(ret)) {
// do nothing
} else if (OB_FAIL(get_freeze_snapshot_version_(freeze_ts, snapshot_version))) {
STORAGE_LOG(WARN, "get freeze snapshot version error", K(ret), K_(pkey), K(freeze_ts));
}
if (OB_SUCC(ret)) {
if (start_log_ts == freeze_ts) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR,
"start log ts equal end log ts",
K(ret),
K(pkey_),
K(freeze_id),
K(freeze_ts),
K(snapshot_version),
K(is_leader),
K(frozen_memtable),
K(start_log_ts));
} else {
STORAGE_LOG(INFO,
"freeze cut",
K(ret),
K(pkey_),
K(freeze_id),
K(freeze_ts),
K(snapshot_version),
K(is_leader),
K(frozen_memtable),
K(start_log_ts));
}
}
}
return ret;
}
int ObPartitionGroup::get_freeze_snapshot_version_(const int64_t freeze_ts, int64_t& snapshot_version)
{
int ret = OB_SUCCESS;
int64_t min_prepare_version = INT64_MAX;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
} else if (OB_FAIL(txs_->get_min_prepare_version(pkey_, freeze_ts, min_prepare_version))) {
STORAGE_LOG(ERROR, "get min prepare version failed", K(ret), K_(pkey), K(freeze_ts));
} else if (INT64_MAX == min_prepare_version) {
// If there is no prepared transaction across the frozen point,
// the transactions with version smaller than the frozen point are all on the memtable
snapshot_version = freeze_ts;
} else {
snapshot_version = min_prepare_version - 1;
}
return ret;
}
int ObPartitionGroup::get_and_submit_freeze_(
ObMemtable& frozen_memtable, bool is_leader, uint64_t& freeze_id, int64_t& freeze_ts)
{
int ret = OB_SUCCESS;
int64_t start_log_ts = frozen_memtable.get_start_log_ts();
if (OB_FAIL(get_freeze_log_(is_leader, freeze_id, freeze_ts))) {
STORAGE_LOG(WARN, "get freeze log failed", K(ret), K(*this));
} else if (start_log_ts == freeze_ts && OB_FAIL(wait_freeze_log_elapse_(freeze_ts, freeze_ts))) {
STORAGE_LOG(WARN,
"fail to submit freeze record",
K(ret),
K(pkey_),
K(frozen_memtable),
K(freeze_id),
K(freeze_ts),
K(start_log_ts));
} else if (OB_FAIL(freeze_record_.submit_freeze(frozen_memtable, freeze_ts))) {
STORAGE_LOG(WARN,
"fail to submit freeze record",
K(ret),
K(pkey_),
K(frozen_memtable),
K(freeze_id),
K(freeze_ts),
K(start_log_ts));
}
return ret;
}
int ObPartitionGroup::wait_freeze_log_elapse_(const int64_t in_freeze_ts, int64_t& out_freeze_ts)
{
int ret = OB_SUCCESS;
int64_t cnt = 0;
int64_t clog_timestamp = OB_INVALID_TIMESTAMP;
uint64_t unused_log_id = OB_INVALID_ID;
// The leader needs to guarantee clog_timestamp > freeze_ts + 1 before returning freeze_ts + 1
bool need_retry = true;
while (need_retry && OB_SUCC(ret)) {
if (OB_FAIL(pls_->get_next_replay_log_info(unused_log_id, clog_timestamp))) {
STORAGE_LOG(WARN, "fail to get next replay log timestamp", K(ret), K(pkey_), K(in_freeze_ts));
} else if (OB_INVALID_TIMESTAMP == clog_timestamp) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "wrong clog timestamp", K(clog_timestamp), K(pkey_));
} else if (clog_timestamp <= in_freeze_ts + 1) {
cnt++;
if (cnt >= WAIT_FREEZE_LOG_ELAPSE_CNT_LIMIT) {
need_retry = false;
ret = OB_EAGAIN;
STORAGE_LOG(WARN,
"wait freeze log elapse failed, may be no leader",
K(ret),
K(cnt),
K(clog_timestamp),
K(in_freeze_ts));
} else {
need_retry = true;
}
} else {
out_freeze_ts = clog_timestamp - 1;
need_retry = false;
STORAGE_LOG(INFO,
"start log ts equal end log ts, wait end log ts + 1 elapse succcess",
K(in_freeze_ts),
K(out_freeze_ts));
}
}
return ret;
}
int ObPartitionGroup::freeze_log_(const bool force)
{
int ret = OB_SUCCESS;
ObSavedStorageInfoV2 info;
ObBaseStorageInfo& clog_info = info.get_clog_info();
bool log_changed = false;
bool unused = false;
if (OB_UNLIKELY(!is_inited_ || NULL == pls_)) {
ret = OB_NOT_INIT;
} else if (OB_FAIL(get_clog_info_for_freeze_log_(info))) {
STORAGE_LOG(WARN, "get clog info for freeze log failed", K(ret), K(pkey_));
} else if (OB_FAIL(pg_storage_.check_log_or_data_changed(clog_info, log_changed, unused))) {
STORAGE_LOG(WARN, "check log or data changed error", K(ret), K(clog_info));
} else if (!log_changed && !force) {
STORAGE_LOG(INFO, "skip freeze log", K(pkey_), K(clog_info));
} else if (OB_FAIL(pg_storage_.set_pg_clog_info(clog_info, false))) {
STORAGE_LOG(WARN, "fail to save clog info", K(ret), K(pkey_));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::get_clog_info_for_freeze_log_(ObSavedStorageInfoV2& info)
{
int ret = OB_SUCCESS;
uint64_t min_unreplay_log_id = OB_INVALID_ID;
uint64_t last_replay_log_id = OB_INVALID_ID;
ObBaseStorageInfo& clog_info = info.get_clog_info();
ObBaseStorageInfo saved_clog_info;
if (OB_FAIL(get_base_storage_info_(clog_info))) {
STORAGE_LOG(WARN, "fail to get base clog info", K(ret), K(pkey_));
} else if (OB_FAIL(get_saved_clog_info(saved_clog_info))) {
STORAGE_LOG(WARN, "fail to get saved clog info", K(ret), K(pkey_));
} else {
last_replay_log_id = clog_info.get_last_replay_log_id();
min_unreplay_log_id = replay_status_->get_min_unreplay_log_id();
if (min_unreplay_log_id - 1 < last_replay_log_id) {
clog_info.set_last_replay_log_id(min_unreplay_log_id - 1);
if (OB_FAIL(info.update_last_replay_log_info(pkey_,
false /*replica_with_data*/,
saved_clog_info,
PG_QUERY_LOG_INFO_TIMEOUT,
false /* log_info_usable*/))) {
STORAGE_LOG(WARN, "fail to get update last replay log info", K(ret), K(pkey_), K(saved_clog_info), K(info));
}
}
}
return ret;
}
int ObPartitionGroup::wait_follower_no_pending_task_()
{
int ret = OB_SUCCESS;
int64_t cnt = 0;
int64_t task_cnt = replay_status_->get_pending_task_count();
while (replay_status_->has_pending_task(pkey_) && !replay_status_->has_encount_fatal_error() && OB_SUCC(ret)) {
usleep(FREEZE_WAIT_RETRY_SLEEP_TS);
cnt++;
if (cnt >= MAX_FREEZE_WAIT_RETRY_SLEEP_CNT) {
int last_task_cnt = task_cnt;
if (last_task_cnt == (task_cnt = replay_status_->get_pending_task_count())) {
ret = OB_EAGAIN;
STORAGE_LOG(WARN, "replay too slow, retreat!", K(ret), K(pkey_));
} else {
STORAGE_LOG(WARN, "follower wait replay too long", K(ret), K(pkey_), K(task_cnt));
}
cnt = 0;
}
}
if (replay_status_->has_encount_fatal_error()) {
TRANS_LOG(ERROR, "encounter fatal error", K(*replay_status_), K(ret), K(pkey_));
ret = OB_ERR_UNEXPECTED;
}
return ret;
}
int ObPartitionGroup::check_range_changed_(ObTableHandle& handle, const bool is_leader, bool& changed)
{
int ret = OB_SUCCESS;
uint64_t unused = 0;
int64_t tmp_freeze_ts = 0;
int64_t tmp_snapshot_version = 0;
int64_t start_log_ts = 0;
int64_t base_version = 0;
ObMemtable* mt = NULL;
changed = false;
if (OB_FAIL(handle.get_memtable(mt))) {
STORAGE_LOG(WARN, "get memtable from handle fail", K(ret), K(handle));
} else if (OB_ISNULL(mt)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "memtable is NULL", K(ret));
} else if (OB_FAIL(get_freeze_log_(is_leader, unused, tmp_freeze_ts))) {
STORAGE_LOG(WARN, "get freeze log failed", K(ret), K(*this));
} else if (OB_FAIL(get_freeze_snapshot_version_(tmp_freeze_ts, tmp_snapshot_version))) {
STORAGE_LOG(WARN, "failed to get_weak_read_timestamp", K(ret), K(pkey_));
} else {
start_log_ts = mt->get_start_log_ts();
base_version = mt->get_base_version();
if (tmp_freeze_ts < start_log_ts || tmp_snapshot_version < base_version) {
STORAGE_LOG(INFO,
"skip freeze, maybe in the process of restarting",
K(ret),
K(pkey_),
K(tmp_freeze_ts),
K(tmp_snapshot_version),
K(start_log_ts),
K(base_version));
} else {
changed = tmp_freeze_ts >= start_log_ts && tmp_snapshot_version >= base_version &&
(tmp_freeze_ts > start_log_ts || tmp_snapshot_version > base_version);
}
}
if (OB_SUCC(ret) && !changed) {
STORAGE_LOG(WARN,
"log_id or version range not changed",
K(ret),
K(pkey_),
K(is_leader),
K(start_log_ts),
K(base_version),
K(tmp_freeze_ts),
K(tmp_snapshot_version));
}
return ret;
}
int ObPartitionGroup::prepare_storage_info_(
const int64_t snapshot_version, const uint64_t freeze_id, const int64_t freeze_ts, ObSavedStorageInfoV2& info)
{
int ret = OB_SUCCESS;
if (OB_FAIL(get_base_storage_info_(info.get_clog_info()))) {
STORAGE_LOG(WARN, "fail to get base storage info", K(ret), K(pkey_));
} else {
info.get_data_info().set_publish_version(snapshot_version);
info.get_data_info().set_last_replay_log_id(freeze_id);
info.get_data_info().set_last_replay_log_ts(freeze_ts);
info.get_data_info().set_created_by_new_minor_freeze();
}
return ret;
}
int ObPartitionGroup::submit_freeze_and_effect_memstore_inner_(
const bool is_leader, const bool emergency, ObMemtable& frozen_memtable, bool& effected, int64_t& snapshot_version)
{
int ret = OB_SUCCESS;
int tmp_ret = OB_SUCCESS;
uint64_t freeze_id = 0;
int64_t freeze_ts = 0;
ObSavedStorageInfoV2 info;
snapshot_version = 0;
effected = false;
if (!is_leader && OB_FAIL(replay_status_->set_pending())) {
STORAGE_LOG(WARN, "set replay status pending failed", K(replay_status_), K(ret), K(*this));
} else {
if (OB_FAIL(cut_and_submit_freeze_(is_leader, frozen_memtable, snapshot_version, freeze_id, freeze_ts))) {
STORAGE_LOG(INFO, "cut and submit freeze failed", K(pkey_));
} else if (OB_FAIL(prepare_storage_info_(snapshot_version, freeze_id, freeze_ts, info))) {
STORAGE_LOG(
WARN, "fail to prepare storage info", K(ret), K(pkey_), K(freeze_id), K(freeze_ts), K(snapshot_version));
} else if (OB_FAIL(pg_storage_.effect_new_active_memstore(info, emergency))) {
frozen_memtable.clear_freeze_log_ts();
if (OB_CANCELED == ret || OB_PARTITION_IS_REMOVED == ret || OB_NOT_INIT == ret) {
STORAGE_LOG(INFO, "freeze skiped", K(ret), K(pkey_));
ret = OB_SUCCESS;
} else {
// with resource allocated before, the failure is unexpected
STORAGE_LOG(ERROR, "effect new active memstore failed", K(ret), K(pkey_), K(info));
}
} else {
(void)frozen_memtable.update_max_log_ts(freeze_ts);
MEM_BARRIER();
frozen_memtable.set_frozen();
effected = true;
}
while (!is_leader && OB_SUCCESS != (tmp_ret = replay_status_->erase_pending(
ObTimeUtility::current_time() + OB_SET_REPLAY_ENGINE_PENDING_TIMEOUT))) {
STORAGE_LOG(WARN, "unset replay status pending failed", K(replay_status_), K(tmp_ret), K(*this));
}
}
return ret;
}
int ObPartitionGroup::cut_and_submit_freeze_(const bool is_leader, ObMemtable& frozen_memtable,
int64_t& snapshot_version, uint64_t& freeze_id, int64_t& freeze_ts)
{
int ret = OB_SUCCESS;
if (is_leader && OB_FAIL(frozen_memtable.prepare_freeze_log_ts())) {
STORAGE_LOG(ERROR, "fail to prepare freeze log id", K(ret), K(pkey_), K(frozen_memtable));
} else if (OB_FAIL(get_freeze_cut_(frozen_memtable, is_leader, snapshot_version, freeze_id, freeze_ts))) {
STORAGE_LOG(WARN, "fail to get freeze cut", K(ret), K(pkey_));
}
if (OB_FAIL(ret)) {
frozen_memtable.clear_freeze_log_ts();
}
return ret;
}
int ObPartitionGroup::freeze_log_and_data_v2_(const bool emergency, const bool force, int64_t& snapshot_version)
{
int ret = OB_SUCCESS;
ObTableHandle old_handle;
ObTableHandle new_handle;
ObBaseStorageInfo clog_info;
bool changed = true;
bool found = false;
bool log_changed = true;
bool need_freeze_data = true;
bool effected = false;
ObMemtable* frozen_memtable = NULL;
snapshot_version = 0;
bool is_leader = is_leader_state(get_partition_state());
ObTimeGuard timeguard("freeze", 10L * 1000L);
if (OB_UNLIKELY(!is_inited_ || NULL == pls_)) {
ret = OB_NOT_INIT;
} else if (OB_FAIL(has_active_memtable_(found))) {
STORAGE_LOG(INFO, "fail to check active memtable", K(ret), K(pkey_));
} else if (!found) {
STORAGE_LOG(INFO, "partition has no active memstore, skip freeze", K(ret), K(pkey_));
} else if (!freeze_record_.available()) {
// consider this as freeze success
STORAGE_LOG(INFO, "freeze already in progress", K(ret), K(pkey_));
} else if (OB_FAIL(pg_storage_.get_active_memtable(old_handle))) {
STORAGE_LOG(WARN, "fail to get old active memtable", K(ret), K(pkey_));
} else if (OB_FAIL(old_handle.get_memtable(frozen_memtable))) {
STORAGE_LOG(WARN, "get memtable from handle fail", K(ret), K(old_handle));
} else if (OB_ISNULL(frozen_memtable)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "memtable is NULL", K(ret));
} else if (OB_FAIL(get_base_storage_info_(clog_info))) {
STORAGE_LOG(WARN, "fail to get base storage info", K(ret), K(pkey_));
} else if (!force && OB_FAIL(pg_storage_.check_log_or_data_changed(clog_info, log_changed, need_freeze_data))) {
STORAGE_LOG(WARN, "fail to check log data changed", K(ret), K(pkey_));
} else if (!need_freeze_data) {
if (!log_changed) {
STORAGE_LOG(INFO, "skip freeze log and data", K(pkey_));
} else if (OB_FAIL(pg_storage_.set_pg_clog_info(clog_info, true))) {
STORAGE_LOG(WARN, "fail to freeze log", K(ret), K(pkey_));
}
} else if (OB_FAIL(check_range_changed_(old_handle, is_leader, changed))) {
if (OB_STATE_NOT_MATCH == ret) {
STORAGE_LOG(INFO, "skip freeze due to clog state", K(ret), K(pkey_));
ret = OB_SUCCESS;
} else {
STORAGE_LOG(WARN, "failed to check log_id or version range changed", K(ret), K(old_handle));
}
} else if (!changed) {
// skip
} else {
if (OB_FAIL(
submit_freeze_and_effect_memstore_(is_leader, emergency, *frozen_memtable, effected, snapshot_version))) {
STORAGE_LOG(WARN, "submit freeze and prepare memstore", K(ret), K(pkey_), K(*frozen_memtable));
} else if (effected) {
if (OB_FAIL(pg_storage_.get_active_memtable(new_handle))) {
STORAGE_LOG(WARN, "fail to get new active memtable", K(ret), K(pkey_));
} else if (OB_FAIL(freeze_record_.submit_new_active_memtable(new_handle))) {
// Submit a new memtable. Allow async_freeze threads to scan and synchronize log.
STORAGE_LOG(ERROR, "fail to submit freeze record", K(ret), K(pkey_));
} else {
STORAGE_LOG(INFO, "submit_new_active_memtable success", K(ret), K(pkey_));
}
}
if (OB_FAIL(ret) || !effected) {
TRANS_LOG(INFO, "clear the record when failed", K(*this));
freeze_record_.clear();
}
}
return ret;
}
int ObPartitionGroup::submit_freeze_and_effect_memstore_(
const bool is_leader, const bool emergency, ObMemtable& frozen_memtable, bool& effected, int64_t& snapshot_version)
{
int ret = OB_SUCCESS;
uint64_t freeze_id = 0;
int64_t freeze_ts = 0;
int64_t unused = 0;
ObSavedStorageInfoV2 info;
snapshot_version = 0;
effected = false;
// allocate the resource in advance
if (OB_FAIL(pg_storage_.new_active_memstore(unused))) {
STORAGE_LOG(WARN, "fail to new active memstore", K(ret), K(pkey_));
} else {
if (OB_FAIL(submit_freeze_and_effect_memstore_inner_(
is_leader, emergency, frozen_memtable, effected, snapshot_version))) {
STORAGE_LOG(
WARN, "submit freeze and effect memstore inner failed", K(ret), K(emergency), K(frozen_memtable), K(*this));
}
if (OB_FAIL(ret) || !effected) {
// clean the active memstore if it's not effected
int64_t tmp_ret = OB_SUCCESS;
if (OB_UNLIKELY(OB_SUCCESS != (tmp_ret = pg_storage_.clean_new_active_memstore()))) {
STORAGE_LOG(ERROR, "rollback error", K(tmp_ret), K(pkey_));
}
} else {
pg_storage_.set_freeze_ts(ObTimeUtility::current_time());
STORAGE_LOG(INFO, "effect new active memstore success", K(ret), K(pkey_), K(frozen_memtable));
}
}
return ret;
}
int ObPartitionGroup::freeze(const bool emergency, const bool force, int64_t& freeze_snapshot)
{
int ret = OB_SUCCESS;
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE);
if (with_data_()) {
ret = freeze_log_and_data_v2_(emergency, force, freeze_snapshot);
} else {
ret = freeze_log_(force);
}
return ret;
}
int ObPartitionGroup::set_max_passed_trans_version_(const int64_t trans_version)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(trans_version < 0)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(pkey_));
} else {
max_passed_trans_version_ = trans_version;
// Free record strongly depends on max_passed_trans_version if the latter is reset, the former must be reset too.
freeze_record_.clear();
partition_loop_worker_.reset_memstore_info_record();
STORAGE_LOG(INFO, "set max passed trans version success", K_(pkey), K(trans_version));
}
return ret;
}
int ObPartitionGroup::raise_memstore_if_needed_(const int64_t trans_version)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret), K(pkey_));
} else {
int64_t tmp_ret = OB_SUCCESS;
while (freeze_record_.need_raise_memstore(trans_version)) { // Check in advance to reduce lock calls
ObPartitionGroupLockGuard guard(lock_, 0, PGLOCKSTORAGE, true /* trylock */);
if (guard.locked()) {
if (freeze_record_.need_raise_memstore(trans_version)) {
if (OB_FAIL(pg_storage_.effect_new_active_memstore(
freeze_record_.get_saved_storage_info(), freeze_record_.is_emergency()))) {
if (OB_CANCELED == ret || OB_PARTITION_IS_REMOVED == ret || OB_NOT_INIT == ret) {
STORAGE_LOG(INFO, "freeze skiped", K(ret), K(pkey_));
ret = OB_SUCCESS;
} else {
// The resource has been reserved, expect no error
STORAGE_LOG(ERROR, "effect new active memstore failed", K(ret), K(pkey_));
}
if (OB_UNLIKELY(OB_SUCCESS != (tmp_ret = pg_storage_.clean_new_active_memstore()))) {
STORAGE_LOG(ERROR, "memstore rollback error", K(tmp_ret), K(pkey_));
}
} else {
STORAGE_LOG(INFO, "raise memstore success", K(ret), K(pkey_), K(freeze_record_));
}
freeze_record_.clear(); // No matter success or not, this freezing point should be cleared.
}
} else {
usleep(100);
}
}
}
return ret;
}
int ObPartitionGroup::mark_dirty_trans(bool& cleared)
{
int ret = OB_SUCCESS;
cleared = false;
if (OB_ISNULL(pls_) || OB_ISNULL(txs_) || OB_ISNULL(replay_status_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K(ret), K(pkey_));
} else {
ObPartitionGroupLockGuard guard(lock_, PGLOCKTRANS | PGLOCKREPLAY | PGLOCKCLOG | PGLOCKSTORAGE, 0);
ObMemtable* frozen_memtable = NULL;
ObMemtable* active_memtable = NULL;
if (OB_FAIL(freeze_record_.get_pending_frozen_memtable(frozen_memtable, active_memtable))) {
STORAGE_LOG(ERROR, "fail to get pending frozen memtable", K(ret), K(pkey_));
} else if (OB_NOT_NULL(frozen_memtable) && OB_NOT_NULL(active_memtable)) {
int64_t cb_cnt = 0;
int64_t applied_log_ts = INT64_MAX;
uint64_t unused = 0;
int64_t replayed_ts = OB_INVALID_TIMESTAMP;
if (OB_FAIL(get_min_replayed_log_with_keepalive(unused, replayed_ts))) {
STORAGE_LOG(WARN, "get min replay log with keepalive failed", K(ret), K(*this));
} else {
if (OB_FAIL(txs_->mark_dirty_trans(pkey_, frozen_memtable, active_memtable, cb_cnt, applied_log_ts))) {
STORAGE_LOG(WARN, "fail to mark dirty trans", K(ret), K(pkey_), KP(frozen_memtable), KP(active_memtable));
}
if (OB_SUCC(ret) || cb_cnt > 0) {
STORAGE_LOG(INFO, "mark callback", K(ret), K(pkey_), K(cb_cnt), K(replayed_ts), K(applied_log_ts));
}
applied_log_ts = MIN(replayed_ts, applied_log_ts);
freeze_record_.dirty_trans_marked(frozen_memtable, cb_cnt, OB_SUCC(ret), applied_log_ts, cleared);
}
} else {
// no need freeze
cleared = true;
}
}
return ret;
}
int ObPartitionGroup::get_curr_data_info_(
const bool use_slave_safe_read_ts, const ObDataStorageInfo& saved_data_info, ObDataStorageInfo& data_info)
{
int ret = OB_SUCCESS;
if (use_slave_safe_read_ts) {
ret = OB_NOT_SUPPORTED;
TRANS_LOG(WARN, "saved storage info is not supported in 3.x version");
} else {
data_info = saved_data_info;
}
return ret;
}
// Check if it is a replica type with data
bool ObPartitionGroup::with_data_()
{
const common::ObReplicaType replica_type = get_replica_type();
bool with_data = (ObReplicaTypeCheck::is_replica_with_memstore(replica_type) ||
ObReplicaTypeCheck::is_replica_with_ssstore(replica_type));
return with_data;
}
int ObPartitionGroup::get_curr_clog_info_(
const int64_t src_cluster_id, ObBaseStorageInfo& clog_info, bool& log_info_usable)
{
int ret = OB_SUCCESS;
bool is_sys_tenant = OB_SYS_TENANT_ID == pkey_.get_tenant_id();
const int64_t self_cluster_id = obrpc::ObRpcNetHandler::CLUSTER_ID;
const bool is_restore = pg_storage_.is_restore();
if (OB_FAIL(get_base_storage_info_(clog_info))) {
STORAGE_LOG(WARN, "fail to get base storage info", K(ret), K(pkey_));
} else if (0 == clog_info.get_last_replay_log_id()) {
// do nothing
} else if (ObServerConfig::get_instance().enable_log_archive && !is_sys_tenant && !is_restore &&
src_cluster_id == self_cluster_id) {
// Only the requests from the database itself need to obtain the archive point if the archive is enabled.
// To solve the circular dependency that the leader of primary database can not take over due to the restore
// failure of the backup database replica here in the maximum protection mode (bug#30365449).
ObLogArchiveBackupInfo info;
if (OB_FAIL(ObBackupInfoMgr::get_instance().get_log_archive_backup_info(info))) {
CLOG_LOG(WARN, "failed to get_log_archive_backup_info", K(pkey_), KR(ret));
} else if (ObLogArchiveStatus::STATUS::BEGINNING == info.status_.status_ ||
ObLogArchiveStatus::STATUS::DOING == info.status_.status_) {
ObPGLogArchiveStatus log_archive_status;
if (OB_FAIL(pls_->get_log_archive_status(log_archive_status))) {
STORAGE_LOG(WARN, "failed to get_last_archived_log_info", KR(ret), K(pkey_), K(info));
} else if (info.status_.incarnation_ != log_archive_status.archive_incarnation_ ||
info.status_.round_ != log_archive_status.log_archive_round_) {
ret = OB_EAGAIN;
STORAGE_LOG(
WARN, "not the same round, try later", KR(ret), K(pkey_), K(info), K(clog_info), K(log_archive_status));
} else if (ObLogArchiveStatus::INTERRUPTED == log_archive_status.status_ ||
ObLogArchiveStatus::STOP == log_archive_status.status_) {
// just skip
STORAGE_LOG(INFO, "log archive is not doing, just skip", K(pkey_), K(info), K(log_archive_status));
} else if (!log_archive_status.is_valid_for_clog_info()) {
ret = OB_EAGAIN;
// attention: can not merge with the forword OB_EAGAIN branch
STORAGE_LOG(WARN,
"log_archive_status is not valid, try later",
KR(ret),
K(pkey_),
K(info),
K(clog_info),
K(log_archive_status));
} else {
bool clog_exist = true;
const uint64_t last_archived_log_id = log_archive_status.last_archived_log_id_;
const bool is_mandatory = ObServerConfig::get_instance().backup_log_archive_option.is_mandatory();
if (! is_mandatory && last_archived_log_id < clog_info.get_last_replay_log_id()) {
// check next archive log exist, return clog_exist = true only on situation of log exist
if (OB_FAIL(pls_->check_log_exist(last_archived_log_id + 1, clog_exist))) {
STORAGE_LOG(WARN, "failed to check log exist", KR(ret), K(pkey_), K(log_archive_status));
}
}
if (OB_FAIL(ret)) {
}
// if archive not in madatory mode and next archive log not exist, omit archive
else if (! is_mandatory && ! clog_exist) {
STORAGE_LOG(WARN, "attent!! archive had been omitted due to log based on archive not exist, maybe some error occur",
KR(ret), K(pkey_), K(log_archive_status));
} else {
STORAGE_LOG(INFO, "update with archived clog info", K(pkey_), K(clog_info), K(log_archive_status));
if (last_archived_log_id < clog_info.get_last_replay_log_id()) {
log_info_usable = true;
clog_info.set_last_replay_log_id(last_archived_log_id);
clog_info.set_submit_timestamp(log_archive_status.last_archived_log_submit_ts_);
clog_info.set_epoch_id(log_archive_status.clog_epoch_id_);
clog_info.set_accumulate_checksum(log_archive_status.accum_checksum_);
}
}
}
} else { /*do nothing*/
}
} else { /*do nothing*/
}
return ret;
}
int ObPartitionGroup::get_curr_storage_info_for_migrate(const bool use_slave_safe_read_ts,
const common::ObReplicaType replica_type, const int64_t src_cluster_id, ObSavedStorageInfoV2& info)
{
int ret = OB_SUCCESS;
static const int64_t LOG_INFO_QUERY_TIMEOUT = 5L * 1000L * 1000L;
ObSavedStorageInfoV2 saved_info;
info.reset();
ObDataStorageInfo& data_info = info.get_data_info();
ObBaseStorageInfo& clog_info = info.get_clog_info();
bool remote_with_data = ObReplicaTypeCheck::is_replica_with_memstore(replica_type);
bool with_data = false;
bool log_info_usable = false;
if (OB_UNLIKELY(!is_inited_ || NULL == pls_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K(ret), KP(pls_), K(pkey_));
} else if (OB_INVALID_CLUSTER_ID == src_cluster_id) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid arguments", K(ret), K(src_cluster_id), K(pkey_));
} else if (OB_FAIL(pg_storage_.get_all_saved_info(saved_info))) {
STORAGE_LOG(WARN, "fail to get saved info", K(ret), K(pkey_));
} else if (FALSE_IT(with_data = with_data_() && remote_with_data)) {
} else if (OB_FAIL(get_curr_data_info_(use_slave_safe_read_ts, saved_info.get_data_info(), data_info))) {
STORAGE_LOG(WARN, "fail to get curr data info", K(ret), K(pkey_));
} else if (OB_FAIL(get_curr_clog_info_(src_cluster_id, clog_info, log_info_usable))) {
STORAGE_LOG(WARN, "fail to get curr clog info", K(ret), K(pkey_));
} else if (OB_FAIL(info.update_and_fetch_log_info(
pkey_, with_data, saved_info.get_clog_info(), LOG_INFO_QUERY_TIMEOUT, log_info_usable))) {
STORAGE_LOG(WARN, "failed to update_info_and_fetch_checksum", K(ret), K_(pkey), K(saved_info), K(info));
}
STORAGE_LOG(INFO, "get curr storage info", K(ret), K(pkey_), K(replica_type), K(saved_info), K(info));
return ret;
}
int ObPartitionGroup::shutdown(
const int64_t snapshot_version, const uint64_t replay_log_id, const int64_t schema_version)
{
int ret = OB_SUCCESS;
SpinWLockGuard guard(split_lock_);
if (OB_FAIL(shutdown_(snapshot_version, replay_log_id, schema_version))) {
STORAGE_LOG(WARN, "failed to shutdown", K(ret), K(pkey_), K(snapshot_version), K(replay_log_id), K(schema_version));
}
return ret;
}
int ObPartitionGroup::shutdown_(
const int64_t snapshot_version, const uint64_t replay_log_id, const int64_t schema_version)
{
int ret = OB_SUCCESS;
ObSavedStorageInfoV2 info;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret), K(pkey_));
} else if (OB_FAIL(get_base_storage_info_(info.get_clog_info()))) {
STORAGE_LOG(WARN, "fail to get base storage info", K(ret), K(pkey_));
} else {
// This frozen memtable will be dumped into the ssstore directly, the log_id
// is meaningless at the target partition and the checksum and timestamp will
// not be checked.
ObDataStorageInfo& data_info = info.get_data_info();
data_info.set_last_replay_log_id(replay_log_id);
data_info.set_last_replay_log_ts(snapshot_version);
data_info.set_publish_version(snapshot_version);
data_info.set_schema_version(schema_version);
data_info.set_created_by_new_minor_freeze();
if (OB_FAIL(pg_storage_.complete_active_memstore(info))) {
STORAGE_LOG(WARN, "complete active memstore failed", K(ret), K(pkey_));
} else {
freeze_record_.clear();
STORAGE_LOG(INFO, "complete active memstore successfully", K(pkey_));
}
}
return ret;
}
int ObPartitionGroup::check_is_from_restore(bool& is_from_restore) const
{
int ret = OB_SUCCESS;
uint64_t last_restore_log_id = OB_INVALID_ID;
int64_t last_restore_log_ts = OB_INVALID_TIMESTAMP;
int64_t restore_snapshot_version = OB_INVALID_TIMESTAMP;
if (OB_FAIL(pg_storage_.get_restore_replay_info(last_restore_log_id,
last_restore_log_ts, restore_snapshot_version))) {
STORAGE_LOG(WARN, "fail to get_restore_replay_info", K(ret), K(pkey_));
} else {
is_from_restore = (OB_INVALID_ID != last_restore_log_id);
}
return ret;
}
int ObPartitionGroup::get_all_saved_info(ObSavedStorageInfoV2& info) const
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_all_saved_info(info))) {
STORAGE_LOG(WARN, "fail to get saved clog info", K(ret), K(pkey_));
}
return ret;
}
int ObPartitionGroup::get_saved_clog_info(common::ObBaseStorageInfo& clog_info) const
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_saved_clog_info(clog_info))) {
STORAGE_LOG(WARN, "fail to get saved clog info", K(ret), K(pkey_));
}
return ret;
}
int ObPartitionGroup::get_saved_data_info(ObDataStorageInfo& data_info) const
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_saved_data_info(data_info))) {
STORAGE_LOG(WARN, "fail to get saved clog info", K(ret), K(pkey_));
}
return ret;
}
int ObPartitionGroup::get_saved_last_log_info(uint64_t& log_id, int64_t& submit_timestamp) const
{
return get_saved_last_log_info_(log_id, submit_timestamp);
}
int ObPartitionGroup::get_saved_last_log_info_(uint64_t& log_id, int64_t& submit_timestamp) const
{
int ret = OB_SUCCESS;
// The clog_info and data_info is separate in design, but in order to reduce the performance
// overhead the checkpoint of clog_info is the smaller one between clog_info and data_info in
// fact. So we only take clog_info here.
ObBaseStorageInfo clog_info;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition not init", K(ret));
} else if (OB_FAIL(get_saved_clog_info(clog_info))) {
STORAGE_LOG(WARN, "get clog info failed", K(ret), K(pkey_));
} else {
log_id = clog_info.get_last_replay_log_id();
submit_timestamp = clog_info.get_submit_timestamp();
}
return ret;
}
int ObPartitionGroup::append_local_sort_data(
const ObPartitionKey& pkey, const share::ObBuildIndexAppendLocalDataParam& param, ObNewRowIterator& iter)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.append_local_sort_data(pkey, param, iter))) {
STORAGE_LOG(WARN, "fail to append local sort data", K(ret), K(param));
}
return ret;
}
int ObPartitionGroup::append_sstable(
const ObPartitionKey& pkey, const share::ObBuildIndexAppendSSTableParam& param, common::ObNewRowIterator& iter)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.append_sstable(pkey, param, iter))) {
STORAGE_LOG(WARN, "fail to append sstable", K(ret), K(param));
}
return ret;
}
int ObPartitionGroup::check_single_replica_major_sstable_exist(
const ObPartitionKey& pkey, const uint64_t index_table_id)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.check_single_replica_major_sstable_exist(pkey, index_table_id))) {
if (OB_ENTRY_NOT_EXIST != ret) {
STORAGE_LOG(WARN, "fail to check single replica major sstable exist", K(ret));
}
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::get_table_stat(const common::ObPartitionKey& pkey, ObTableStat& stat)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_table_stat(pkey, stat))) {
if (OB_ENTRY_NOT_EXIST != ret) {
STORAGE_LOG(WARN, "failed to get table stat", K(ret), K(pkey));
}
}
return ret;
}
int ObPartitionGroup::create_memtable(
const bool in_slog_trans, const bool is_replay, const bool ignore_memstore_percent)
{
int ret = OB_SUCCESS;
SpinRLockGuard guard(split_lock_);
if (OB_FAIL(create_memtable_(in_slog_trans, is_replay, ignore_memstore_percent))) {
STORAGE_LOG(WARN, "create memtable erorr", K(ret), K(ignore_memstore_percent), K_(pkey));
}
return ret;
}
// The times to be called:
// 1. When the partition group is created at leader.
// 2. During the process of leader takeover after the D replica is selected as leader
// 3. During the migrator process
int ObPartitionGroup::create_memtable_(
const bool in_slog_trans, const bool is_replay, const bool ignore_memstore_percent)
{
int ret = OB_SUCCESS;
ObDataStorageInfo data_info;
if (OB_FAIL(pg_storage_.get_saved_data_info(data_info))) {
STORAGE_LOG(WARN, "failed to get data info", K(ret), K(pkey_));
} else if (OB_FAIL(set_max_passed_trans_version_(data_info.get_publish_version() + 1))) {
STORAGE_LOG(WARN, "failed to set max passed trans version", K(ret), K(pkey_));
} else if (!pg_storage_.need_create_memtable()) {
STORAGE_LOG(INFO, "no need to create memtable", K(pkey_), K(split_info_), K(data_info));
} else if (OB_FAIL(pg_storage_.create_memtable(in_slog_trans, is_replay, ignore_memstore_percent))) {
STORAGE_LOG(WARN, "failed to create_memtable", K(ret), K(pkey_));
} else {
// do nothing
}
return ret;
}
int ObPartitionGroup::decide_split_version_(const int64_t base_ts, int64_t& split_version)
{
int ret = OB_SUCCESS;
bool need_retry = true;
int64_t tmp_split_version = base_ts;
while (OB_SUCCESS == ret && need_retry) {
bool safe = false;
need_retry = false;
if (OB_FAIL(freeze_record_.set_freeze_upper_limit(tmp_split_version))) {
// set limit
if (OB_EAGAIN == ret) {
// rewrite ret
ret = OB_SUCCESS;
need_retry = true;
}
} else if (OB_FAIL(is_split_version_safe_(tmp_split_version, safe))) {
// check safe
STORAGE_LOG(WARN, "fail to check split version safety", K(ret), K(pkey_), K(split_version));
} else if (!safe) {
need_retry = true;
} else {
split_version = tmp_split_version;
break;
}
if (need_retry) {
// Add 1ms if the split version is too small.
tmp_split_version = tmp_split_version + SPLIT_FREEZE_WAIT_TS;
}
}
if (OB_FAIL(ret)) {
freeze_record_.clear_freeze_upper_limit();
}
return ret;
}
int ObPartitionGroup::is_split_version_safe_(const int64_t split_version, bool& safe)
{
int ret = OB_SUCCESS;
int64_t base_version = INT64_MAX;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
} else if (0 == pg_storage_.get_partition_cnt()) {
safe = true;
} else if (OB_FAIL(pg_storage_.get_active_memtable_base_version(base_version))) {
STORAGE_LOG(WARN, "check split version safe or not", K(split_version), K(safe), K_(pkey));
} else {
safe = (base_version < split_version);
}
return ret;
}
int ObPartitionGroup::submit_split_source_log_(const ObPartitionSplitSourceLog& log, const int64_t base_ts)
{
int ret = OB_SUCCESS;
uint64_t log_id = 0;
int64_t log_ts = 0;
ObSplitLogCb* log_cb = NULL;
const bool is_trans_log = false;
if (!share::ObMultiClusterUtil::is_cluster_allow_submit_log(pkey_.get_table_id())) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "is_cluster_allow_submit_log return false", K(ret), K(pkey_));
} else if (NULL == (log_cb = ObSplitLogCbFactory::alloc())) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "allocate memory failed", K(ret));
} else {
int64_t pos = 0;
char* buf = NULL;
if (OB_FAIL(log_cb->init(ps_, OB_LOG_SPLIT_SOURCE_PARTITION))) {
STORAGE_LOG(WARN, "log callback init failed", K(ret));
} else if (NULL ==
(buf = reinterpret_cast<char*>(ob_malloc(OB_MAX_LOG_ALLOWED_SIZE, ObModIds::OB_SPLIT_LOG_BUF)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "allocate memory failed", K(ret));
} else if (OB_FAIL(serialization::encode_i64(buf, OB_MAX_LOG_ALLOWED_SIZE, pos, OB_LOG_SPLIT_SOURCE_PARTITION))) {
} else if (OB_FAIL(log.serialize(buf, OB_MAX_LOG_ALLOWED_SIZE, pos))) {
STORAGE_LOG(WARN, "serialize log failed", K(ret));
} else if (OB_FAIL(pls_->submit_log(buf, pos, base_ts, log_cb, is_trans_log, log_id, log_ts))) {
STORAGE_LOG(WARN, "submit split source log failed", K(ret));
} else {
// do nothing
}
if (OB_SUCCESS != ret) {
ObSplitLogCbFactory::release(log_cb);
log_cb = NULL;
}
if (NULL != buf) {
ob_free(buf);
buf = NULL;
}
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "submit split source log failed", K(ret), K_(pkey), K(base_ts), K(log), K(log_id), K(log_ts));
} else {
STORAGE_LOG(INFO, "submit split source log success", K_(pkey), K(base_ts), K(log), K(log_id), K(log_ts));
}
return ret;
}
int ObPartitionGroup::submit_split_dest_log_(const ObPartitionSplitDestLog& log)
{
int ret = OB_SUCCESS;
ObSplitLogCb* log_cb = NULL;
uint64_t log_id = 0;
int64_t log_timestamp = 0;
const int64_t base_timestamp = 0;
const bool is_trans_log = false;
if (!share::ObMultiClusterUtil::is_cluster_allow_submit_log(pkey_.get_table_id())) {
ret = OB_STATE_NOT_MATCH;
STORAGE_LOG(WARN, "is_cluster_allow_submit_log return false", K(ret), K(pkey_));
} else if (NULL == (log_cb = ObSplitLogCbFactory::alloc())) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "allocate memory failed", K(ret));
} else {
int64_t pos = 0;
char* buf = NULL;
if (OB_FAIL(log_cb->init(ps_, OB_LOG_SPLIT_DEST_PARTITION))) {
STORAGE_LOG(WARN, "log callback init failed", K(ret));
} else if (NULL ==
(buf = reinterpret_cast<char*>(ob_malloc(OB_MAX_LOG_ALLOWED_SIZE, ObModIds::OB_SPLIT_LOG_BUF)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(ERROR, "allocate memory failed", K(ret));
} else if (OB_FAIL(serialization::encode_i64(buf, OB_MAX_LOG_ALLOWED_SIZE, pos, OB_LOG_SPLIT_DEST_PARTITION))) {
} else if (OB_FAIL(log.serialize(buf, OB_MAX_LOG_ALLOWED_SIZE, pos))) {
STORAGE_LOG(WARN, "serialize log failed", K(ret));
} else if (OB_FAIL(pls_->submit_log(buf, pos, base_timestamp, log_cb, is_trans_log, log_id, log_timestamp))) {
STORAGE_LOG(WARN, "submit split dest log failed", K(ret));
} else {
STORAGE_LOG(INFO, "submit split dest log success", K(log), KP(log_cb));
}
if (OB_SUCCESS != ret) {
ObSplitLogCbFactory::release(log_cb);
log_cb = NULL;
}
if (NULL != buf) {
ob_free(buf);
buf = NULL;
}
}
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN, "submit split dest log failed", K(ret), K_(pkey), K(log));
} else {
STORAGE_LOG(INFO, "submit split dest log success", K_(pkey), K(log));
}
return ret;
}
int ObPartitionGroup::set_split_version_(const int64_t split_version)
{
int ret = OB_SUCCESS;
int64_t base_version = INT64_MAX;
if (OB_FAIL(pg_storage_.get_active_memtable_base_version(base_version))) {
STORAGE_LOG(WARN, "failed to get active memtable", K(ret), K_(pkey));
} else if (base_version >= split_version) {
// base version has been set
STORAGE_LOG(INFO, "split version has been set", K_(pkey), K(base_version), K(split_version));
} else if (OB_FAIL(pg_storage_.set_publish_version_after_create(split_version))) {
STORAGE_LOG(WARN, "set publish version after create failed", K(ret), K_(pkey), K(split_version));
} else if (OB_FAIL(set_max_passed_trans_version_(split_version + 1))) {
STORAGE_LOG(ERROR, "set max passed trans version failed", K(ret), K_(pkey), K(split_version));
} else {
STORAGE_LOG(INFO, "set publish version after create success", K_(pkey), K(split_version));
}
return ret;
}
int ObPartitionGroup::push_split_task_(const int64_t schema_version, const ObSplitPartitionPair& info)
{
int ret = OB_SUCCESS;
ObPartitionSplitTask* task = NULL;
if (NULL == (task = ObPartitionSplitTaskFactory::alloc())) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(WARN, "allocate memory failed", K(ret));
} else {
if (OB_FAIL(task->init(schema_version, info))) {
STORAGE_LOG(WARN, "split task init failed", K(ret));
} else {
ObPartitionSplitWorker* split_worker = ps_->get_split_worker();
if (NULL == split_worker) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "split worker is null", K(ret), KP(split_worker));
} else if (OB_FAIL(split_worker->push(task))) {
STORAGE_LOG(WARN, "push split worker failed", K(ret), K(*task));
} else {
// do nothing
}
}
if (OB_SUCCESS != ret) {
ObPartitionSplitTaskFactory::release(task);
task = NULL;
}
}
return ret;
}
int ObPartitionGroup::save_split_state_(const int64_t state, const bool write_slog)
{
int ret = OB_SUCCESS;
const int64_t start_ts = ObTimeUtility::current_time();
ret = pg_storage_.save_split_state(state, write_slog);
const int64_t end_ts = ObTimeUtility::current_time();
if (OB_SUCCESS != ret) {
STORAGE_LOG(WARN,
"save split state failed",
K(ret),
K_(pkey),
"state",
to_state_str(static_cast<ObPartitionSplitStateEnum>(state)),
K(write_slog),
"used_time",
end_ts - start_ts);
} else {
STORAGE_LOG(INFO,
"save split state success",
K_(pkey),
"state",
to_state_str(static_cast<ObPartitionSplitStateEnum>(state)),
K(write_slog),
"used_time",
end_ts - start_ts);
}
return ret;
}
int ObPartitionGroup::save_split_info(const ObPartitionSplitInfo& split_info)
{
int ret = OB_SUCCESS;
SpinWLockGuard guard(split_lock_);
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else if (!split_info.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "split info is not valid", K(ret), K(split_info));
} else if (OB_FAIL(pg_storage_.save_split_info(split_info, true /*write_slog*/))) {
STORAGE_LOG(WARN, "pg storage save split info and state", K(ret), K(pkey_), K(split_info));
} else if (OB_FAIL(split_info_.assign(split_info))) {
STORAGE_LOG(WARN, "failed to assign split info", K(ret), K_(pkey), K(split_info));
}
return ret;
}
int ObPartitionGroup::save_split_state(const int64_t split_state)
{
int ret = OB_SUCCESS;
SpinWLockGuard guard(split_lock_);
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else if (!is_valid_split_state(static_cast<ObPartitionSplitStateEnum>(split_state))) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid arguments", K(ret), K(pkey_), K(split_state));
} else if (OB_FAIL(pg_storage_.save_split_state(split_state, true /*wrtie_slog*/))) {
STORAGE_LOG(WARN, "pg storage save split info and state", K(ret), K(pkey_), K(split_state));
} else if (OB_FAIL(split_state_.set_state(static_cast<ObPartitionSplitStateEnum>(split_state)))) {
STORAGE_LOG(WARN, "failed to set split state", K(ret), K_(pkey), K(split_state));
}
return ret;
}
int ObPartitionGroup::save_split_info_(const ObPartitionSplitInfo& split_info, const bool write_slog)
{
int ret = OB_SUCCESS;
const int64_t start_ts = ObTimeUtility::current_time();
ret = pg_storage_.save_split_info(split_info, write_slog);
const int64_t end_ts = ObTimeUtility::current_time();
if (OB_SUCCESS != ret) {
STORAGE_LOG(
WARN, "save split info failed", K(ret), K_(pkey), K(split_info), K(write_slog), "used_time", end_ts - start_ts);
} else {
STORAGE_LOG(
INFO, "save split info success", K_(pkey), K(split_info), K(write_slog), "used_time", end_ts - start_ts);
}
return ret;
}
int ObPartitionGroup::split_dest_partitions_(bool& is_all_finished)
{
int ret = OB_SUCCESS;
// must be inited true
bool tmp_all_finished = true;
ObSArray<ObPartitionKey> partitions;
if (OB_FAIL(partition_split_progress_array_.get_not_finished(partitions))) {
STORAGE_LOG(WARN, "get not mask failed", K(ret));
} else {
for (int64_t i = 0; i < partitions.count(); i++) {
enum ObSplitProgress progress = UNKNOWN_SPLIT_PROGRESS;
const ObPartitionKey& dest_partition = partitions.at(i);
ObAddr leader;
// ignore tmp_ret, continue
if (OB_FAIL(ps_->nonblock_get_strong_leader_from_loc_cache(dest_partition, leader))) {
STORAGE_LOG(WARN, "get leader from loc cache failed", K(ret), K(dest_partition));
} else if (ps_->get_self_addr() != leader) {
tmp_all_finished = false;
ObIPartitionServiceRpc& rpc = ps_->get_pts_rpc();
ObSplitDestPartitionRequestArg arg;
if (OB_FAIL(arg.init(dest_partition, split_info_))) {
STORAGE_LOG(WARN, "init split dest partition request failed", K(ret), K(dest_partition));
} else if (OB_FAIL(rpc.post_split_dest_partition_request(leader, dest_partition.get_tenant_id(), arg))) {
STORAGE_LOG(WARN, "post split dest partition request failed", K(ret));
} else {
STORAGE_LOG(INFO, "post split dest partition request success", K(dest_partition), K(leader));
}
} else {
if (OB_FAIL(ps_->split_dest_partition(dest_partition, split_info_, progress))) {
STORAGE_LOG(WARN, "split dest partition failed", K(ret), K(dest_partition));
(void)ps_->nonblock_get_strong_leader_from_loc_cache(dest_partition, leader, true);
} else if (OB_FAIL(partition_split_progress_array_.set_progress(dest_partition, progress))) {
STORAGE_LOG(WARN, "set progress failed", K(ret), K(dest_partition), K(progress));
} else if (PHYSICAL_SPLIT_FINISH > progress) {
tmp_all_finished = false;
} else {
// do nothing
}
}
if (OB_SUCCESS != ret) {
// rewrite ret
ret = OB_SUCCESS;
tmp_all_finished = false;
}
}
}
if (OB_SUCCESS == ret) {
is_all_finished = tmp_all_finished;
}
return ret;
}
int ObPartitionGroup::query_replica_split_progress_(const int64_t schema_version)
{
int ret = OB_SUCCESS;
const ObIArray<ObReplicaSplitProgress>& progress_array = replica_split_progress_array_.get_progress_array();
for (int64_t i = 0; OB_SUCC(ret) && i < progress_array.count(); i++) {
const ObReplicaSplitProgress& split_progress = progress_array.at(i);
if (PHYSICAL_SPLIT_FINISH != split_progress.progress_) {
ObReplicaSplitProgressRequest arg;
arg.schema_version_ = schema_version;
arg.pkey_ = pkey_;
arg.addr_ = split_progress.obj_;
ObIPartitionServiceRpc& rpc = ps_->get_pts_rpc();
if (OB_FAIL(rpc.post_replica_split_progress_request(split_progress.obj_, pkey_.get_tenant_id(), arg))) {
STORAGE_LOG(WARN, "post replica split progress request failed", K(ret), K(arg));
} else {
STORAGE_LOG(INFO, "post replica split progress request success", K(arg));
}
}
}
return ret;
}
int ObPartitionGroup::get_split_partition_member_list_(
const ObIArray<ObPartitionKey>& pkey_array, ObMemberList& member_list)
{
int ret = OB_SUCCESS;
ObMemberList tmp_member_list;
ObMemberList out_member_list;
for (int64_t i = 0; OB_SUCC(ret) && i < pkey_array.count(); i++) {
const ObPartitionKey& pkey = pkey_array.at(i);
tmp_member_list.reset();
if (OB_FAIL(ps_->get_curr_member_list(pkey, tmp_member_list))) {
if (OB_PARTITION_NOT_EXIST == ret) {
// rewrite ret
ret = OB_SUCCESS;
} else {
STORAGE_LOG(WARN, "get curr member list failed", K(ret), K(pkey));
}
} else if (out_member_list.get_member_number() < tmp_member_list.get_member_number()) {
out_member_list = tmp_member_list;
} else {
// do nothing
}
}
if (OB_SUCC(ret)) {
if (out_member_list.get_member_number() <= 0) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "unexpected split partition member count", K(ret), K(out_member_list));
} else {
member_list = out_member_list;
}
}
return ret;
}
int ObPartitionGroup::get_all_table_ids_(const ObPartitionKey& pkey, ObIArray<uint64_t>& index_tables)
{
int ret = OB_SUCCESS;
index_tables.reset();
if (OB_FAIL(pg_storage_.get_all_table_ids(pkey, index_tables))) {
STORAGE_LOG(WARN, "get all table ids failed", K(ret));
}
return ret;
}
int ObPartitionGroup::get_reference_tables_(const ObPartitionKey& pkey, const int64_t index_id, ObTablesHandle& handle)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_reference_tables(pkey, index_id, handle))) {
STORAGE_LOG(WARN, "get reference tables failed", K(ret), K(index_id));
}
return ret;
}
int ObPartitionGroup::set_reference_tables_(const ObPartitionKey& pkey, const int64_t index_id, ObTablesHandle& handle)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.set_reference_tables(pkey, index_id, handle))) {
STORAGE_LOG(WARN, "set reference tables failed", K(ret), K(index_id));
if (OB_ENTRY_NOT_EXIST == ret) {
// rewrite ret
ret = OB_SUCCESS;
}
}
return ret;
}
int ObPartitionGroup::push_reference_tables_(const ObIArray<ObPartitionKey>& dest_array, const int64_t split_version)
{
int ret = OB_SUCCESS;
ObArray<uint64_t> index_tables;
ObIPartitionGroupGuard dest_guard;
ObIPartitionGroup* dest_partition_group = NULL;
ObPartitionArray src_pkeys;
ObPartitionArray dest_pkeys;
ObTablesHandle handle;
ObTablesHandle memtables;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("parttion group is not inited", K(ret));
} else if (OB_FAIL(pg_storage_.get_reference_memtables(memtables))) {
LOG_WARN("failed to get reference memtables", K(ret), K_(pkey));
} else if (OB_FAIL(pg_storage_.get_all_pg_partition_keys(src_pkeys))) {
STORAGE_LOG(WARN, "failed to get all partition keys", K(ret), K_(pkey));
} else if (0 == src_pkeys.count()) {
// do nothing
} else if (OB_FAIL(txs_->copy_trans_table(pkey_, dest_array))) {
STORAGE_LOG(WARN, "failed to copy trans table", K(ret), K_(pkey), K(dest_array));
} else {
for (int64_t i = 0; OB_SUCC(ret) && i < dest_array.count(); i++) {
const ObPartitionKey& dest_pgkey = dest_array.at(i);
bool is_complete = false;
dest_pkeys.reset();
dest_partition_group = NULL;
if (OB_FAIL(ps_->get_partition(dest_pgkey, dest_guard))) {
if (OB_PARTITION_NOT_EXIST == ret) {
ret = OB_SUCCESS;
STORAGE_LOG(INFO, "partition not exist, should have finished logical split", K(ret), K(dest_pgkey));
} else {
STORAGE_LOG(WARN, "get dest partition failed", K(ret), K(dest_pgkey), K_(pkey));
}
// rewrite ret
} else if (OB_ISNULL(dest_partition_group = dest_guard.get_partition_group())) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "get dest partition failed, do not push reference tables", K(ret), K(dest_pgkey), K_(pkey));
} else if (OB_FAIL(dest_partition_group->get_pg_storage().check_complete(is_complete))) {
STORAGE_LOG(WARN, "failed to check complete", K(ret), K(dest_pgkey), K_(pkey));
} else if (is_complete) {
STORAGE_LOG(INFO, "dest pg is complete, no need to push reference tables", K(dest_pgkey), K_(pkey));
} else if (OB_FAIL(dest_partition_group->set_split_version(split_version))) {
STORAGE_LOG(WARN, "set split version failed", K(ret), K(split_version), K(dest_pgkey), K_(pkey));
} else if (OB_FAIL(dest_partition_group->get_pg_storage().set_reference_memtables(memtables))) {
STORAGE_LOG(WARN, "failed to set reference memtables", K(ret), K(dest_pgkey), K_(pkey));
} else if (OB_FAIL(dest_partition_group->get_pg_storage().get_all_pg_partition_keys(dest_pkeys))) {
STORAGE_LOG(WARN, "failed to get all partition keys", K(ret), K(dest_pgkey), K_(pkey));
} else {
for (int64_t j = 0; OB_SUCC(ret) && j < dest_pkeys.count(); j++) {
bool found = false;
const ObPartitionKey& dest_pkey = dest_pkeys.at(j);
for (int64_t m = 0; OB_SUCC(ret) && m < src_pkeys.count(); m++) {
if (src_pkeys.at(m).get_table_id() == dest_pkey.get_table_id()) {
found = true;
const ObPartitionKey& src_pkey = src_pkeys.at(m);
if (OB_FAIL(get_all_table_ids_(src_pkey, index_tables))) {
STORAGE_LOG(WARN, "failed to get all table ids", K(ret), K_(pkey), K(src_pkey), K(dest_pkey));
} else {
for (int64_t k = 0; OB_SUCC(ret) && k < index_tables.count(); k++) {
int64_t table_count = 0;
const int64_t index_id = index_tables.at(k);
if (OB_FAIL(pg_storage_.get_latest_table_count(src_pkey, index_id, table_count))) {
STORAGE_LOG(WARN, "failed to get latest table count", K(src_pkey), K(index_id));
} else if (0 == table_count) {
STORAGE_LOG(INFO, "no sstables, no need to set reference table", K(src_pkey), K(index_id));
} else if (OB_FAIL(get_reference_tables_(src_pkey, index_id, handle))) {
STORAGE_LOG(WARN, "get reference tables failed", K(ret), K(index_id), K(src_pkey));
} else if (OB_FAIL(dest_partition_group->set_reference_tables(dest_pkey, index_id, handle))) {
STORAGE_LOG(WARN, "set reference tables failed", K(ret), K(index_id), K(dest_pkey), K(dest_pgkey));
} else {
STORAGE_LOG(INFO, "set reference tables success", K(index_id), K(dest_pkey), K(dest_pgkey));
}
}
}
break;
}
}
if (!found) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "cannot find split source partition", K(ret), K(dest_pkey));
}
}
}
// The target partition is triggered to dump the transaction state table immediately
// after the completion of logical splitting.
if (OB_SUCC(ret)) {
bool is_merged = false;
ObPartitionScheduler::get_instance().schedule_pg(
MINI_MERGE, *dest_partition_group, ObVersion::MIN_VERSION, is_merged);
}
}
}
return ret;
}
int ObPartitionGroup::get_max_decided_trans_version(int64_t& max_decided_trans_version) const
{
int ret = OB_SUCCESS;
int64_t unused_epoch = OB_INVALID_TIMESTAMP;
ObTsWindows unused_windows;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartition is not inited", K(ret), K(pkey_));
} else if (OB_FAIL(pls_->get_role_unlock(unused_epoch, unused_windows))) {
STORAGE_LOG(WARN, "get_role_unlock failed", K(ret), K(pkey_));
} else {
const int64_t last_checkpoint = partition_loop_worker_.get_last_checkpoint();
if (0 < last_checkpoint) {
if (ObTimeUtility::current_time() - last_checkpoint > 120000000) {
if (EXECUTE_COUNT_PER_SEC(16)) {
TRANS_LOG(WARN, "checkpoint is too old", K_(pkey), K(last_checkpoint), K_(partition_loop_worker));
}
}
max_decided_trans_version = last_checkpoint;
} else {
ret = OB_EAGAIN;
}
}
return ret;
}
int ObPartitionGroup::allow_gc(bool& allow_gc)
{
int ret = OB_SUCCESS;
bool has_migrate_task = false;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartition is not inited", K(ret), K(pkey_));
} else if (OB_FAIL(pls_->allow_gc(allow_gc))) {
STORAGE_LOG(WARN, "ObPartitionLogService allow_gc failed", K(ret), K(pkey_));
} else if (!allow_gc) {
STORAGE_LOG(INFO, "ObPartitionLogService is not allowed to gc", K(ret), K(pkey_));
} else if (OB_FAIL(pg_storage_.allow_gc(allow_gc))) {
STORAGE_LOG(WARN, "fail to check if pg_storage allow gc", K(ret), K(pkey_));
} else if (!allow_gc) {
STORAGE_LOG(INFO, "pg storage is not allowed to gc", K(ret), K(pkey_));
}
if (OB_SUCC(ret) && allow_gc) {
if (OB_FAIL(ObPartGroupMigrator::get_instance().has_task(pkey_, has_migrate_task))) {
STORAGE_LOG(WARN, "failed to check has task", K(ret), K_(pkey));
} else if (has_migrate_task) {
allow_gc = false;
STORAGE_LOG(INFO, "this partition is migrating in, not allowed to gc", K(pkey_));
}
}
return ret;
}
int ObPartitionGroup::gc_check_valid_member(const bool is_valid, const int64_t gc_seq, bool& need_gc)
{
int ret = OB_SUCCESS;
need_gc = false;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartition is not inited", K(ret));
} else if (gc_seq <= 0) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(ERROR, "invalid arguments", K(ret), K(gc_seq));
} else if (!is_valid) {
STORAGE_LOG(INFO, "gc_check_valid_member, invalid member", K(is_valid), K(gc_seq), K(gc_seq_check_valid_member_));
if (gc_seq == gc_seq_check_valid_member_ + 1) {
need_gc = true;
}
gc_seq_check_valid_member_ = gc_seq;
}
return ret;
}
bool ObPartitionGroup::check_pg_partition_offline(const ObPartitionKey& pkey)
{
int ret = OB_SUCCESS;
bool is_offline = false;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(ERROR, "ObPartition is not inited", K(ret));
} else if (!pkey.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(pkey));
// Both the stand alone partition and PG only need to check the offline_log_id.
} else if (!pkey_.is_pg() || pkey_ == pkey) {
is_offline = OB_INVALID_ID != offline_log_id_;
// The partition in PG need check the tags at partition store.
} else if (OB_FAIL(pg_storage_.check_pg_partition_offline(pkey, is_offline))) {
STORAGE_LOG(WARN, "check pg partition offline error", K(ret), K(pkey), K(pkey_));
} else {
// do nothing
}
return is_offline;
}
int ObPartitionGroup::check_offline_log_archived(
const ObPartitionKey& pkey, const int64_t incarnation, const int64_t archive_round, bool& has_archived) const
{
int ret = OB_SUCCESS;
has_archived = false;
uint64_t last_archived_log_id = OB_INVALID_ID;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(ERROR, "ObPartition is not inited", K(ret));
} else if (!pkey.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(pkey));
// Both the stand alone partition and PG only need to check the offline_log_id.
} else if (pkey_.is_pg() && pkey_ != pkey) {
has_archived = true;
} else if (OB_INVALID_ID == offline_log_id_) {
has_archived = false;
} else if (OB_ISNULL(pls_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "NULL ptr", K(ret));
} else if (OB_FAIL(pls_->get_last_archived_log_id(incarnation, archive_round, last_archived_log_id))) {
STORAGE_LOG(
WARN, "failed to get_last_archived_log_id", K(incarnation), K(archive_round), K(last_archived_log_id), K(ret));
} else {
has_archived = (last_archived_log_id >= offline_log_id_);
}
return ret;
}
int ObPartitionGroup::get_leader_epoch(int64_t& leader_epoch) const
{
int ret = OB_SUCCESS;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartition is not inited", K(ret));
} else if (OB_ISNULL(pls_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "NULL ptr", K(ret));
} else {
ObTsWindows unused_windows;
ret = pls_->get_role_unlock(leader_epoch, unused_windows);
if (OB_FAIL(ret)) {
STORAGE_LOG(WARN, "get role unlock failed", K(ret));
}
}
return ret;
}
// TODO reuse sstable in split dest partition
int ObPartitionGroup::get_replica_status(share::ObReplicaStatus& status)
{
int ret = OB_SUCCESS;
bool can_migrate = false;
status = REPLICA_STATUS_MAX;
ObMigrateStatus migrate_status;
if (OB_FAIL(pg_storage_.get_pg_migrate_status(migrate_status))) {
STORAGE_LOG(WARN, "failed to get migrate status", K(ret), K_(pkey));
} else if (OB_FAIL(check_can_migrate(can_migrate))) {
STORAGE_LOG(WARN, "failed to check can migrate", K(ret), K_(pkey));
} else {
ObPartitionState state = get_partition_state();
if (!is_working_state(state)) {
status = REPLICA_STATUS_OFFLINE;
} else if (OB_MIGRATE_STATUS_MIGRATE_FAIL == migrate_status || OB_MIGRATE_STATUS_ADD_FAIL == migrate_status) {
status = REPLICA_STATUS_OFFLINE;
} else if (OB_MIGRATE_STATUS_ADD == migrate_status || OB_MIGRATE_STATUS_MIGRATE == migrate_status) {
status = REPLICA_STATUS_FLAG;
} else if (!can_migrate) {
status = REPLICA_STATUS_UNMERGED;
} else {
status = REPLICA_STATUS_NORMAL;
}
}
return ret;
}
// Check if the target replica need gc from replica_status.
// Target replica: R(read only) replica, F(full) replica of backup database (not in member list)
int ObPartitionGroup::is_replica_need_gc(bool& is_offline)
{
int ret = OB_SUCCESS;
share::ObReplicaStatus status;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartition is not inited", K(ret), K(pkey_));
} else if (OB_FAIL(get_replica_status(status))) {
STORAGE_LOG(WARN, "get_replica_status failed", K(ret), K(pkey_));
} else {
is_offline = (REPLICA_STATUS_OFFLINE == status);
}
return ret;
}
// Destroy the memtable and sstable of PG or pg partition
int ObPartitionGroup::clear_non_reused_stores_(const ObPartitionKey& pkey)
{
int ret = OB_SUCCESS;
bool cleared_memstore = false;
if (!pkey.is_valid()) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "invalid argument", K(ret), K(pkey), K(pkey_));
} else if (OB_FAIL(pg_storage_.clear_non_reused_stores(pkey, cleared_memstore))) {
STORAGE_LOG(WARN, "clear non-base store(s) failed.", K(pkey_), K(ret));
} else if (cleared_memstore) {
if (OB_FAIL(set_max_passed_trans_version_(0))) {
STORAGE_LOG(WARN, "set max passed trans version failed", K(ret), K(pkey_));
} else {
STORAGE_LOG(INFO, "reset max passed trans version success", K(pkey_));
}
} else {
// do nothing
}
if (OB_SUCC(ret)) {
partition_loop_worker_.set_migrating_flag(false);
STORAGE_LOG(INFO, "clear pg/partition non-reused store(s) success", K(pkey), K(cleared_memstore), K(pkey_));
}
return ret;
}
int ObPartitionGroup::set_storage_info(const ObSavedStorageInfoV2& info)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.set_pg_storage_info(info))) {
STORAGE_LOG(WARN, "failed to set_storage_info", K(ret), K(pkey_));
} else {
STORAGE_LOG(INFO, "succeed to set_storage_info", K(pkey_));
}
return ret;
}
int ObPartitionGroup::fill_pg_partition_replica(
const ObPartitionKey& pkey, ObReplicaStatus& replica_status, ObReportStatus& report_status)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.fill_pg_partition_replica(pkey, report_status))) {
STORAGE_LOG(WARN, "failed to fill pg partition replica", K(ret), K(pkey));
} else if (OB_FAIL(get_replica_status(replica_status))) {
STORAGE_LOG(WARN, "failed to get replica status", K(ret), K(pkey));
} else {
STORAGE_LOG(DEBUG, "succeed to get pg partition replica success", K(pkey), K(replica_status), K(report_status));
}
return ret;
}
int ObPartitionGroup::fill_replica(share::ObPartitionReplica& replica)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.fill_replica(replica))) {
STORAGE_LOG(WARN, "failed to fill replica", K(ret));
} else if (OB_FAIL(get_replica_status(replica.status_))) {
STORAGE_LOG(WARN, "failed to get replica status", K(ret));
} else {
STORAGE_LOG(TRACE, "succeed to get replica status", K(replica));
}
return ret;
}
int ObPartitionGroup::get_merge_priority_info(ObMergePriorityInfo& merge_priority_info) const
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_merge_priority_info(merge_priority_info))) {
STORAGE_LOG(WARN, "failed to get first frozen memtable", K(ret), K_(pkey));
}
return ret;
}
// It will only be called in a single thread
int64_t ObPartitionGroup::get_gc_schema_drop_ts()
{
if (OB_INVALID_TIMESTAMP == gc_schema_drop_ts_) {
gc_schema_drop_ts_ = ObTimeUtility::current_time();
}
return gc_schema_drop_ts_;
}
bool ObPartitionGroup::is_splitting_() const
{
return in_splitting(split_state_.get_state());
}
bool ObPartitionGroup::is_split_source_partition_() const
{
return is_source_split(split_state_.get_state());
}
bool ObPartitionGroup::is_split_dest_partition_() const
{
return is_dest_split(split_state_.get_state());
}
bool ObPartitionGroup::is_dest_splitting_() const
{
return in_dest_splitting(split_state_.get_state());
}
bool ObPartitionGroup::is_physical_split_finished_() const
{
return is_physical_split_finished(split_state_.get_state());
}
int ObPartitionGroup::check_physical_split_(bool& finished) const
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.check_physical_split(finished))) {
STORAGE_LOG(WARN, "check physical split failed", K(ret));
}
return ret;
}
int ObPartitionGroup::get_dest_split_progress_(int& progress)
{
int ret = OB_SUCCESS;
switch (split_state_.get_state()) {
case LEADER_INIT:
case FOLLOWER_INIT: {
progress = PHYSICAL_SPLIT_FINISH;
break;
}
case LEADER_WAIT_SPLIT:
case FOLLOWER_WAIT_SPLIT:
case SPLIT_DEST_LOGGING: {
progress = IN_SPLITTING;
break;
}
case LEADER_LOGICAL_SPLIT_SUCCESS:
case FOLLOWER_LOGICAL_SPLIT_SUCCESS: {
progress = LOGICAL_SPLIT_FINISH;
bool is_physical_split_finished = false;
if (OB_FAIL(check_physical_split_(is_physical_split_finished))) {
STORAGE_LOG(WARN, "check physical split failed", K(ret));
} else if (is_physical_split_finished) {
if (OB_FAIL(split_state_.switch_state(PHYSICAL_SPLIT_SUCCESS))) {
STORAGE_LOG(WARN, "switch split state failed", K(ret));
} else {
progress = PHYSICAL_SPLIT_FINISH;
}
} else {
// do nothing
}
break;
}
default: {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "unexpected split state", K(ret), K_(split_state));
}
}
return ret;
}
int ObPartitionGroup::check_if_dest_pg_ready_(const common::ObIArray<common::ObPartitionKey>& dest_pgs, bool& is_ready)
{
int ret = OB_SUCCESS;
ObPartitionArray src_pkeys;
if (OB_FAIL(get_all_pg_partition_keys(src_pkeys))) {
STORAGE_LOG(WARN, "get source partition group partition failed", K(ret));
} else {
bool tmp_ready = true;
ObPartitionArray dest_pkeys;
for (int64_t i = 0; OB_SUCC(ret) && tmp_ready && i < dest_pgs.count(); i++) {
dest_pkeys.reset();
ObIPartitionGroupGuard pg_guard;
storage::ObIPartitionGroup* pg = NULL;
if (OB_FAIL(ps_->get_partition(dest_pgs.at(i), pg_guard))) {
if (OB_PARTITION_NOT_EXIST == ret) {
is_ready = true;
ret = OB_SUCCESS;
STORAGE_LOG(INFO, "partition not exist, possible for read only partition", K(ret), K(dest_pgs.at(i)));
break;
} else {
STORAGE_LOG(WARN, "get dest partition failed", K(ret), K(dest_pgs.at(i)));
}
} else if (OB_ISNULL(pg = pg_guard.get_partition_group())) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "partition should not be null", K(ret), K(dest_pgs.at(i)));
} else if (OB_FAIL(pg->get_all_pg_partition_keys(dest_pkeys))) {
STORAGE_LOG(WARN, "get dest partition group partition failed", K(ret), K(dest_pgs.at(i)));
} else {
for (int64_t j = 0; j < src_pkeys.count(); j++) {
bool found = false;
for (int64_t k = 0; !found && k < dest_pkeys.count(); k++) {
if (src_pkeys.at(j).get_table_id() == dest_pkeys.at(k).get_table_id()) {
found = true;
}
}
if (!found) {
tmp_ready = false;
STORAGE_LOG(INFO, "some partitions not found", K(src_pkeys), K(dest_pkeys));
break;
}
}
}
}
if (OB_SUCC(ret)) {
is_ready = tmp_ready;
}
}
STORAGE_LOG(INFO, "check if dest partition group ready", K(ret), K(dest_pgs), K(is_ready));
return ret;
}
int ObPartitionGroup::has_active_memtable(bool& found)
{
ObTimeGuard tg(__func__, 1000000);
SpinRLockGuard guard(split_lock_);
tg.click();
return has_active_memtable_(found);
}
int ObPartitionGroup::has_active_memtable_(bool& found)
{
int ret = OB_SUCCESS;
found = true;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "partition is not initialized", K(ret), K(pkey_));
} else if (is_dest_splitting_()) {
found = false;
} else {
if (!pg_storage_.has_active_memtable()) {
found = false;
if (EXECUTE_COUNT_PER_SEC(16)) {
STORAGE_LOG(INFO, "the partition has no active memtable", K(ret), K(pkey_));
}
}
}
return ret;
}
int ObPartitionGroup::retire_warmup_store(const bool is_disk_full)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.retire_warmup_store(is_disk_full))) {
STORAGE_LOG(WARN, "retire warmup store error", K(ret), K_(pkey));
}
return ret;
}
int ObPartitionGroup::enable_write_log(const bool is_replay_old)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.enable_write_log(is_replay_old))) {
STORAGE_LOG(WARN, "enable write log error", K(ret), K_(pkey));
}
return ret;
}
uint64_t ObPartitionGroup::get_min_replayed_log_id()
{
uint64_t min_replay_log_id = UINT64_MAX;
int64_t unused = 0;
get_min_replayed_log_with_keepalive(min_replay_log_id, unused);
return min_replay_log_id;
}
int ObPartitionGroup::get_min_replayed_log_with_keepalive(uint64_t &min_replay_log_id, int64_t &min_replay_log_ts)
{
int ret = OB_SUCCESS;
uint64_t unreplay_log_id = UINT64_MAX;
int64_t unreplay_log_ts = 0;
uint64_t next_replay_log_id = UINT64_MAX;
int64_t next_replay_log_ts = 0;
// 1. The left boundary of sliding window(including keepalive log).
if (OB_FAIL(pls_->get_next_replay_log_info(next_replay_log_id, next_replay_log_ts))) {
STORAGE_LOG(WARN, "get next replay log info failed", K(ret), K(*this));
} else {
// 2. The minimum continuously replayed log of replay engine.
replay_status_->get_min_unreplay_log(unreplay_log_id, unreplay_log_ts);
if (unreplay_log_id == next_replay_log_id) {
// cold partition, return next_replay_log_ts instead of unreplay_log_ts, unreplay_log_ts may be too small.
min_replay_log_id = next_replay_log_id - 1;
min_replay_log_ts = next_replay_log_ts - 1;
} else {
min_replay_log_id = unreplay_log_id - 1;
min_replay_log_ts = unreplay_log_ts - 1;
}
STORAGE_LOG(INFO,
"min replayed log with keepalive",
K(pkey_),
K(min_replay_log_ts),
K(min_replay_log_id),
K(unreplay_log_ts),
K(unreplay_log_id),
K(next_replay_log_ts),
K(next_replay_log_id));
}
return ret;
}
int ObPartitionGroup::get_all_pg_partition_keys(ObPartitionArray& pkeys, const bool include_trans_table)
{
return pg_storage_.get_all_pg_partition_keys(pkeys, include_trans_table);
}
int ObPartitionGroup::add_sstable_for_merge(const ObPartitionKey& pkey, storage::ObSSTable* sstable,
const int64_t max_kept_major_version_number, ObIPartitionReport& report, ObSSTable* complement_minor_sstable)
{
bool need_report = false;
int ret = OB_SUCCESS;
if (OB_ISNULL(sstable) && OB_ISNULL(complement_minor_sstable)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "both sstable and complement sstable is null", K(ret));
} else {
ObPartitionKey dest_pkey;
SpinRLockGuard guard(split_lock_);
// synchronized with push_reference_tables
if (OB_NOT_NULL(sstable) &&
(is_split_source_partition_() || (is_split_dest_partition_() && sstable->get_partition_key() != pkey))) {
ObIPartitionGroupGuard dest_guard;
storage::ObIPartitionGroup* dest_partition = NULL;
const common::ObIArray<common::ObPartitionKey>& dest_pgkeys = split_info_.get_dest_partitions();
bool is_complete = true;
for (int64_t i = 0; OB_SUCC(ret) && i < dest_pgkeys.count(); i++) {
const ObPartitionKey dest_pgkey = dest_pgkeys.at(i);
if (OB_FAIL(ps_->get_partition(dest_pgkey, dest_guard))) {
if (OB_PARTITION_NOT_EXIST == ret) {
ret = OB_SUCCESS;
// do nothing
} else {
STORAGE_LOG(WARN, "Fail to get partition", K(ret));
}
} else if (OB_ISNULL(dest_partition = dest_guard.get_partition_group())) {
ret = OB_ERR_SYS;
LOG_WARN("partition must not null", K(ret), K(dest_pgkey));
} else if (OB_FAIL(dest_partition->check_complete(is_complete))) {
LOG_WARN("failed to check complete", K(ret), K(dest_pgkey));
} else if (!is_complete) {
LOG_INFO("logical split has not finished, no need to update", K(ret));
break;
} else if (OB_FAIL(dest_partition->get_pg_storage().get_pkey_for_table(pkey.get_table_id(), dest_pkey))) {
LOG_WARN("failed to get pkey for table", K(ret), K(pkey));
} else if (OB_FAIL(dest_partition->get_pg_storage().add_sstable_for_merge(
dest_pkey, sstable, 1 /*max_kept_major_version_number*/, complement_minor_sstable))) {
LOG_WARN("failed to add sstable", K(ret), K(dest_pkey));
}
}
if (OB_SUCC(ret)) {
need_report = true;
}
}
if (OB_SUCC(ret)) {
if ((OB_ISNULL(sstable) && OB_NOT_NULL(complement_minor_sstable)) || sstable->get_partition_key() == pkey ||
sstable->is_trans_sstable()) {
if (OB_FAIL(pg_storage_.add_sstable_for_merge(
pkey, sstable, max_kept_major_version_number, complement_minor_sstable))) {
LOG_WARN("failed to add table", K(ret), K_(pkey), K(pkey), K(sstable));
}
}
}
}
if (OB_SUCC(ret) && OB_NOT_NULL(sstable) && sstable->is_minor_sstable()) {
int tmp_ret = OB_SUCCESS;
if (OB_UNLIKELY(OB_SUCCESS != (tmp_ret = pg_storage_.clear_unused_trans_status()))) {
LOG_WARN("failed to check release memtable", K(tmp_ret), K(pkey));
}
}
if (need_report) {
int tmp_ret = OB_SUCCESS;
bool is_merged = false;
const common::ObIArray<common::ObPartitionKey>& dest_pgkeys = split_info_.get_dest_partitions();
for (int64_t i = 0; i < dest_pgkeys.count(); i++) {
const ObPartitionKey dest_pgkey = dest_pgkeys.at(i);
if (OB_SUCCESS != (tmp_ret = report.submit_pt_update_task(dest_pgkey))) {
STORAGE_LOG(ERROR, "fail to submit pt update task", K(tmp_ret));
}
// Trigger target partition physical split immediately.
ObPartitionScheduler::get_instance().schedule_merge(dest_pgkey, is_merged);
}
}
return ret;
}
// Check if the replica is readable or not. The read snapshot version should
// be smaller than or equal to weak read timestamp.
int ObPartitionGroup::check_replica_ready_for_bounded_staleness_read(const int64_t snapshot_version)
{
int ret = OB_SUCCESS;
int64_t base_snapshot_version = 0;
if (OB_FAIL(get_weak_read_timestamp(base_snapshot_version))) {
TRANS_LOG(WARN, "get min safe slave read timestamp error", K(ret), K(base_snapshot_version), "context", *this);
} else if (snapshot_version > base_snapshot_version) {
TRANS_LOG(WARN,
"read snapshot version is greater than replica slave read timestamp, "
"replica not readable",
K(snapshot_version),
K(base_snapshot_version),
"delta",
snapshot_version - base_snapshot_version);
ret = OB_REPLICA_NOT_READABLE;
} else {
// readable
}
return ret;
}
int ObPartitionGroup::set_offline_log_id(const uint64_t log_id)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup not inited", K(pkey_), K(ret));
} else {
ATOMIC_CAS(&offline_log_id_, common::OB_INVALID_ID, log_id);
// Can only change offline_log_id smaller, due to the GC limitation of archiving
while (true) {
uint64_t old_offlinie_log_id = ATOMIC_LOAD(&offline_log_id_);
if ((common::OB_INVALID_ID == old_offlinie_log_id) || log_id < old_offlinie_log_id) {
if (ATOMIC_BCAS(&offline_log_id_, old_offlinie_log_id, log_id)) {
break;
}
} else {
break;
}
}
STORAGE_LOG(INFO, "set_offline_log_id", K(pkey_), K(log_id), K(offline_log_id_));
}
return ret;
}
int ObPartitionGroup::feedback_scan_access_stat(const ObTableScanParam& param)
{
return pg_storage_.feedback_scan_access_stat(param);
}
int ObPartitionGroup::create_sstable(
const ObPGCreateSSTableParam& param, ObTableHandle& table_handle, const bool in_slog_trans)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup has not been inited", K(ret));
} else if (OB_FAIL(pg_storage_.create_sstable(param, in_slog_trans, table_handle))) {
STORAGE_LOG(WARN, "fail to create sstable", K(ret));
}
return ret;
}
int ObPartitionGroup::create_sstables(const common::ObIArray<ObPGCreateSSTableParam>& create_sstable_params,
ObTablesHandle& tables_handle, const bool in_slog_trans)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup has not been inited", K(ret));
} else if (OB_FAIL(pg_storage_.create_sstables(create_sstable_params, in_slog_trans, tables_handle))) {
STORAGE_LOG(WARN, "fail to create sstables", K(ret));
}
return ret;
}
int ObPartitionGroup::get_checkpoint_info(common::ObArenaAllocator& allocator, ObPGCheckpointInfo& pg_checkpoint_info)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ObPartitionGroup has not been inited", K(ret));
} else if (OB_FAIL(pg_storage_.get_checkpoint_info(allocator, pg_checkpoint_info))) {
LOG_WARN("fail to write pg checkpoint entry", K(ret));
}
return ret;
}
int ObPartitionGroup::acquire_sstable(const ObITable::TableKey& table_key, ObTableHandle& table_handle)
{
int ret = OB_SUCCESS;
if (OB_FAIL(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ObPartitionGroup has not been inited", K(ret));
} else if (OB_FAIL(pg_storage_.acquire_sstable(table_key, table_handle))) {
if (OB_ENTRY_NOT_EXIST != ret) {
LOG_WARN("fail to acquire sstable", K(ret));
}
}
return ret;
}
bool ObPartitionGroup::need_replay_redo() const
{
return pg_storage_.need_create_memtable();
}
// Only deal with the situation that both the split info exists and the origin partition is in schema.
// The situation of tenant deletion or table deletion will not be handled.
int ObPartitionGroup::try_clear_split_info()
{
int ret = OB_SUCCESS;
bool is_exist = false;
int64_t local_schema_version = OB_INVALID_VERSION;
if (!is_inited_) {
ret = OB_NOT_INIT;
LOG_WARN("not init", K(ret));
} else if (!split_info_.is_valid()) {
// nothing todo
} else if (pkey_ != split_info_.get_src_partition()) {
// not the origin partition
} else {
ObSchemaGetterGuard guard;
const ObPartitionSchema* partition_schema = NULL;
if (OB_FAIL(schema_service_->get_tenant_schema_guard(pkey_.get_tenant_id(), guard))) {
LOG_WARN("failed to get tenant schema guard", K(ret), K_(pkey));
} else if (pkey_.is_pg()) {
const ObTablegroupSchema* tablegroup_schema = NULL;
if (OB_FAIL(guard.get_tablegroup_schema(pkey_.get_tablegroup_id(), tablegroup_schema))) {
LOG_WARN("failed to get tablegroup schema", K(ret), K_(pkey));
} else if (OB_ISNULL(tablegroup_schema)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("tablegroup schema is null", K(ret), K_(pkey));
} else {
partition_schema = tablegroup_schema;
}
} else {
const ObSimpleTableSchemaV2* table_schema = NULL;
if (OB_FAIL(guard.get_table_schema(pkey_.get_table_id(), table_schema))) {
LOG_WARN("failed to get table schema guard", K(ret), K_(pkey));
} else if (OB_ISNULL(table_schema)) {
ret = OB_ERR_UNEXPECTED;
LOG_WARN("failed to get table schema", K(ret), K_(pkey));
} else {
partition_schema = table_schema;
}
}
if (OB_FAIL(ret)) {
} else if (OB_FAIL(ObPartMgrUtils::check_part_exist(
*partition_schema, pkey_.get_partition_id(), false /*check_dropped_partition*/, is_exist))) {
LOG_WARN("failed to check part exist", K(ret), K_(pkey), "schema", *partition_schema);
} else if (!is_exist) {
// Does not exist in the schema table, no need to consider it.
} else if (OB_FAIL(guard.get_schema_version(pkey_.get_tenant_id(), local_schema_version))) {
STORAGE_LOG(WARN, "fail to get schema guard version", K(ret), K_(pkey));
} else {
// It is the origin partition of split and is in the schema table, we need check whether split info
// is valid.
if (split_info_.get_schema_version() < local_schema_version) {
// It means that the split request with a split_info is failed, if the local refreshed schema
// version is greater than that in split_info, and the origin partition is still in schema table.
// The split info can be deleted to prevent misjudgment.
LOG_INFO("need to clear split info", K(ret), K(local_schema_version), K_(split_info));
ObPartitionSplitInfo new_split_info;
if (OB_FAIL(pg_storage_.clear_split_info())) {
LOG_WARN("failed to save split info", K(ret), K_(split_info));
} else {
split_info_.reset();
LOG_INFO("succeed to clear split info", K(ret), K_(pkey), K_(split_info));
}
}
}
}
return OB_SUCCESS;
}
// If the log id of the transaction crosses the freeze_id, it means that it
// is dirty.
int ObPartitionGroup::check_dirty_txn(
const int64_t min_log_ts, const int64_t max_log_ts, int64_t& freeze_ts, bool& is_dirty)
{
int ret = OB_SUCCESS;
const ObMemtable* mt = NULL;
is_dirty = false;
// Mutex with freeze point select
SpinRLockGuard guard(freeze_lock_);
freeze_ts = freeze_record_.get_freeze_ts();
if (ObFreezeRecord::OB_INVALID_FREEZE_TS != freeze_ts) {
is_dirty = min_log_ts <= freeze_ts && freeze_ts < max_log_ts;
}
return ret;
}
int ObPartitionGroup::try_update_clog_member_list(const uint64_t ms_log_id, const int64_t mc_timestamp,
const int64_t replica_num, const ObMemberList& mlist, const common::ObProposalID& ms_proposal_id)
{
int ret = OB_SUCCESS;
ObBaseStorageInfo curr_clog_info;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup not inited", K(ret), K(pkey_));
} else if (OB_FAIL(get_saved_clog_info(curr_clog_info))) {
STORAGE_LOG(WARN, "fail to get saved clog info", K(ret), K(pkey_));
} else if (curr_clog_info.get_membership_timestamp() >= mc_timestamp) {
// curr_clog_info is new enough, no need to update
STORAGE_LOG(WARN,
"curr_clog_info no need update",
K(ret),
K(pkey_),
K(curr_clog_info),
K(ms_log_id),
K(mc_timestamp),
K(replica_num),
K(mlist),
K(ms_proposal_id));
} else if (OB_FAIL(pg_storage_.try_update_member_list(ms_log_id, mc_timestamp, replica_num, mlist, ms_proposal_id))) {
STORAGE_LOG(WARN,
"fail to update member list",
K(ret),
K(pkey_),
K(ms_log_id),
K(mc_timestamp),
K(replica_num),
K(mlist),
K(ms_proposal_id));
} else {
STORAGE_LOG(INFO, "update member list success", K(pkey_), K(ms_log_id), K(mc_timestamp), K(replica_num), K(mlist));
}
return ret;
}
int ObPartitionGroup::check_complete(bool& is_complete)
{
return pg_storage_.check_complete(is_complete);
}
int ObPartitionGroup::check_physical_flashback_succ(const obrpc::ObCheckPhysicalFlashbackArg& arg,
const int64_t max_version, obrpc::ObPhysicalFlashbackResultArg& result)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup not inited", K(pkey_), K(ret));
} else if (ObMultiClusterUtil::is_cluster_private_table(pkey_.get_table_id())) {
if (OB_FAIL(check_private_table_flashback_result_(max_version, result))) {
LOG_WARN("failed to check private table flashback succ", K(ret), K(arg));
}
} else {
if (OB_FAIL(check_non_private_table_flashback_result_(arg.flashback_scn_, result.enable_result_))) {
LOG_WARN("failed to check non private table flashback result", K(arg), K(result));
}
}
return ret;
}
int ObPartitionGroup::check_private_table_flashback_result_(
const int64_t max_version, obrpc::ObPhysicalFlashbackResultArg& result)
{
int ret = OB_SUCCESS;
bool is_empty_pg = pg_storage_.is_empty_pg();
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup not inited", K(pkey_), K(ret));
} else if (!ObMultiClusterUtil::is_cluster_private_table(pkey_.table_id_)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("non private table is invalid", K(ret), K_(pkey));
} else if (is_pg() && is_empty_pg) {
result.enable_result_ = true;
LOG_INFO("empty pg", K(ret), K_(pkey));
} else if (!common::ObReplicaTypeCheck::is_replica_with_ssstore(get_replica_type())) {
result.enable_result_ = true;
LOG_INFO(
"without sstable replica physical flashback success", K(ret), K_(pkey), "replica_type", get_replica_type());
} else if (OB_FAIL(get_pg_storage().get_min_max_major_version(result.min_version_, result.max_version_))) {
LOG_WARN("failed to get min and max major version", K(ret), K_(pkey));
} else if (INT64_MIN == max_version || max_version == result.max_version_) {
result.enable_result_ = true;
LOG_INFO("private table is valid", K(ret), K_(pkey), K(result));
} else if (max_version != result.max_version_) {
result.enable_result_ = false;
LOG_WARN("private table not valid", K(ret), K_(pkey), K(max_version), K(result));
}
return ret;
}
int ObPartitionGroup::check_non_private_table_flashback_result_(const int64_t flashback_scn, bool& result)
{
int ret = OB_SUCCESS;
result = false;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup not inited", K(pkey_), K(ret));
} else if (ObMultiClusterUtil::is_cluster_private_table(pkey_.table_id_)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("private table is invalid", K(ret), K_(pkey));
} else {
ObIPartitionLogService* pls = NULL;
int64_t sstable_ts = 0;
if (OB_ISNULL(pls = get_log_service())) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "get_log_service failed", K(ret), K_(pkey));
} else if (flashback_scn < pls->get_last_submit_timestamp()) {
result = false;
LOG_WARN("failed to do physical flashback",
K(ret),
"start_id",
pls->get_last_submit_timestamp(),
K_(pkey),
K(flashback_scn));
} else if (OB_FAIL(get_pg_storage().get_max_major_sstable_snapshot(sstable_ts))) {
STORAGE_LOG(WARN, "failed to get max sstable snapshop", K(ret), K(sstable_ts));
} else if (sstable_ts > flashback_scn) {
result = false;
LOG_WARN("physical flashback failed", K(ret), K(sstable_ts), K(flashback_scn), K_(pkey));
} else {
result = true;
STORAGE_LOG(INFO, "check physical flashback succ", K(ret), K(sstable_ts), K_(pkey), K(result));
}
}
return ret;
}
int ObPartitionGroup::get_merge_log_ts(int64_t& merge_ts)
{
int ret = OB_SUCCESS;
int64_t applied_log_ts = OB_INVALID_TIMESTAMP;
ObPartitionGroupLockGuard guard(lock_, PGLOCKTRANS | PGLOCKREPLAY | PGLOCKCLOG, 0);
uint64_t unused = 0;
get_min_replayed_log_with_keepalive(unused, merge_ts);
if (OB_ISNULL(txs_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "txs_ is NULL", K(ret), K(pkey_));
} else if (OB_FAIL(txs_->get_applied_log_ts(pkey_, applied_log_ts))) {
STORAGE_LOG(WARN, "fail to get applied log id", K(ret), K(pkey_));
} else {
STORAGE_LOG(INFO, "get merge log id", K(merge_ts), K(applied_log_ts));
merge_ts = MIN(merge_ts, applied_log_ts);
}
return ret;
}
int ObPartitionGroup::recycle_unused_sstables(const int64_t max_recycle_cnt, int64_t& recycled_cnt)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.recycle_unused_sstables(max_recycle_cnt, recycled_cnt))) {
LOG_WARN("fail to recycle unused sstables", K(ret));
}
return ret;
}
int ObPartitionGroup::recycle_sstable(const ObITable::TableKey &table_key)
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.recycle_sstable(table_key))) {
STORAGE_LOG(WARN, "fail to recycle sstable", K(ret), K(table_key));
}
return ret;
}
int ObPartitionGroup::set_meta_block_list(const common::ObIArray<blocksstable::MacroBlockId> &meta_block_list)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ObPartitionGroup has not been inited", K(ret));
} else if (OB_FAIL(pg_storage_.set_meta_block_list(meta_block_list))) {
LOG_WARN("fail to set meta block list", K(ret));
}
return ret;
}
int ObPartitionGroup::physical_flashback(const int64_t flashback_scn)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup not inited", K(pkey_), K(ret));
} else if (flashback_scn <= 0) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "physical flashback get invalid argument", K(ret), K(flashback_scn));
} else if (OB_FAIL(pg_storage_.physical_flashback(flashback_scn))) {
STORAGE_LOG(WARN, "failed to do physical flashback", K(ret), K(pkey_));
}
return ret;
}
int ObPartitionGroup::get_meta_block_list(common::ObIArray<blocksstable::MacroBlockId>& meta_block_list) const
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ObPartitionGroup has not been inited", K(ret));
} else if (OB_FAIL(pg_storage_.get_meta_block_list(meta_block_list))) {
LOG_WARN("fail to get meta block list", K(ret));
}
return ret;
}
int ObPartitionGroup::set_storage_file(ObStorageFileHandle& file_handle)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ObPartitionGroup has not been inited", K(ret));
} else if (OB_FAIL(pg_storage_.set_storage_file(file_handle))) {
LOG_WARN("fail to set storage file", K(ret));
}
return ret;
}
int ObPartitionGroup::get_all_tables(ObTablesHandle& tables_handle)
{
int ret = OB_SUCCESS;
tables_handle.reset();
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
LOG_WARN("ObPartitionGroup has not been inited", K(ret));
} else {
ObPGPartition* pg_partition = nullptr;
ObSinglePGPartitionIterator iter;
ObTablesHandle tmp_tables_handle;
const bool need_trans_table = true;
if (OB_FAIL(iter.init(this, need_trans_table))) {
LOG_WARN("fail to init single pg partition iter", K(ret));
} else {
while (OB_SUCC(ret)) {
if (OB_FAIL(iter.get_next(pg_partition))) {
if (OB_ITER_END == ret) {
ret = OB_SUCCESS;
break;
} else {
LOG_WARN("fail to get next partition", K(ret));
}
} else if (OB_ISNULL(pg_partition)) {
ret = OB_ERR_SYS;
LOG_WARN("error sys, pg partition must not be null", K(ret));
} else if (OB_ISNULL(pg_partition->get_storage())) {
ret = OB_ERR_SYS;
LOG_WARN("error sys, storage must not be null", K(ret));
} else if (OB_FAIL(pg_partition->get_storage()->get_all_tables(tmp_tables_handle))) {
LOG_WARN("fail to get all tables", K(ret));
} else if (OB_FAIL(tables_handle.add_tables(tmp_tables_handle))) {
LOG_WARN("fail to add tables", K(ret));
}
}
}
}
return ret;
}
int ObPartitionGroup::get_table_store_cnt(int64_t& table_cnt) const
{
int ret = OB_SUCCESS;
if (OB_FAIL(pg_storage_.get_table_store_cnt(table_cnt))) {
STORAGE_LOG(WARN, "fail to get table store cnt", K(ret), K(table_cnt));
}
return ret;
}
int ObPartitionGroup::check_can_free(bool& can_free)
{
can_free = false;
return pg_storage_.check_can_free(can_free);
}
int ObPartitionGroup::check_can_physical_flashback(const int64_t flashback_scn)
{
int ret = OB_SUCCESS;
if (!is_inited_) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "ObPartitionGroup not inited", K(pkey_), K(ret));
} else if (flashback_scn <= 0) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(WARN, "physical flashback get invalid argument", K(ret), K(flashback_scn));
} else if (OB_FAIL(pg_storage_.check_can_physical_flashback(flashback_scn))) {
STORAGE_LOG(WARN, "[PHY_FLASHBACK]check_can_physical_flashback failed", K(ret), K(pkey_), K(flashback_scn));
} else {
STORAGE_LOG(INFO, "[PHY_FLASHBACK]check_can_physical_flashback success", K(ret), K(pkey_), K(flashback_scn));
}
return ret;
}
int ObPartitionGroup::clear_trans_after_restore_log(const uint64_t last_restore_log_id,
const int64_t last_restore_log_ts)
{
int ret = OB_SUCCESS;
ObPartitionGroupLockGuard guard(lock_, PGLOCKTRANS | PGLOCKSTORAGE, 0);
if (OB_SYS_TENANT_ID == pkey_.get_tenant_id()) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(ERROR, "sys partitions do not do physical restore", K(ret), K(pkey_));
} else if (OB_ISNULL(txs_)) {
ret = OB_ERR_UNEXPECTED;
STORAGE_LOG(WARN, "txs_ is NULL", KR(ret), K_(pkey));
} else if (OB_UNLIKELY(OB_INVALID_ID == last_restore_log_id)
|| OB_UNLIKELY(OB_INVALID_TIMESTAMP == last_restore_log_ts)) {
ret = OB_INVALID_ARGUMENT;
STORAGE_LOG(
WARN, "invalid last_restore_log_info", KR(ret), K_(pkey), K(last_restore_log_id), K(last_restore_log_ts));
} else if (OB_FAIL(pg_storage_.set_last_restore_log_info(last_restore_log_id, last_restore_log_ts))) {
STORAGE_LOG(
WARN, "failed to set_last_restore_log_info", K(ret), K_(pkey), K(last_restore_log_id), K(last_restore_log_ts));
} else if (OB_FAIL(txs_->set_last_restore_log_info(pkey_, last_restore_log_id, last_restore_log_ts))) {
STORAGE_LOG(
WARN, "failed to set_last_restore_log_info", KR(ret), K_(pkey), K(last_restore_log_id), K(last_restore_log_ts));
} else {
ATOMIC_SET(&has_clear_trans_after_restore_, true);
}
return ret;
}
int ObPartitionGroup::get_base_storage_info_(common::ObBaseStorageInfo& info)
{
int ret = OB_SUCCESS;
uint64_t sw_last_replay_log_id = OB_INVALID_ID;
if (OB_FAIL(pls_->get_base_storage_info(info, sw_last_replay_log_id))) {
STORAGE_LOG(WARN, "fail to get base clog info", K(ret), K(pkey_));
} else {
int64_t restore_snapshot_version = OB_INVALID_TIMESTAMP;
uint64_t last_restore_log_id = OB_INVALID_ID;
int64_t last_restore_log_ts = OB_INVALID_TIMESTAMP;
if (OB_FAIL(pg_storage_.get_restore_replay_info(last_restore_log_id,
last_restore_log_ts, restore_snapshot_version))) {
STORAGE_LOG(WARN, "failed to get_restore_replay_info", KR(ret), K(pkey_));
} else if (OB_INVALID_TIMESTAMP != restore_snapshot_version) {
// The last_replay_log_id of recovered partition needs to be adjusted.
if (OB_INVALID_ID == last_restore_log_id) {
// It is still pulling log or before that, ajust info's last replay log id,
// max_flush_ilog_id does not need consider.
if (info.get_last_replay_log_id() < sw_last_replay_log_id) {
info.set_last_replay_log_id(sw_last_replay_log_id);
}
} else if (info.get_last_replay_log_id() < last_restore_log_id) {
info.set_last_replay_log_id(std::min(sw_last_replay_log_id, last_restore_log_id));
}
}
}
return ret;
}
int ObPartitionGroup::reset_for_replay()
{
int ret = OB_SUCCESS;
if (OB_ISNULL(rp_eg_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "not init, rp_eg_ is NULL", K(ret), K(pkey_));
} else if (OB_FAIL(rp_eg_->reset_partition(pkey_))) {
STORAGE_LOG(WARN, "failed to reset_partition", K(ret), K(pkey_));
} else {
STORAGE_LOG(INFO, "success to reset_for_replay", K(ret), K(pkey_));
}
return ret;
}
int ObPartitionGroup::get_trans_split_info(ObTransSplitInfo& split_info)
{
int ret = OB_SUCCESS;
SpinWLockGuard guard(split_lock_);
split_info.reset();
const share::ObSplitPartitionPair& spp = split_info_.get_spp();
if (OB_FAIL(split_info.init(spp.get_source_pkey(), spp.get_dest_array()))) {
TRANS_LOG(WARN, "init trans split info failed", KR(ret));
}
return ret;
}
int ObPartitionGroup::inc_pending_batch_commit_count(memtable::ObMemtableCtx& mt_ctx, const int64_t log_ts)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else {
if (OB_FAIL(pg_storage_.inc_pending_batch_commit_count(mt_ctx, log_ts))) {
STORAGE_LOG(WARN, "failed to inc_pending_batch_commit_count", K(ret), K_(pkey), K(log_ts));
}
}
return ret;
}
int ObPartitionGroup::inc_pending_elr_count(memtable::ObMemtableCtx& mt_ctx, const int64_t log_ts)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else {
if (OB_FAIL(pg_storage_.inc_pending_elr_count(mt_ctx, log_ts))) {
STORAGE_LOG(WARN, "failed to inc_pending_elr_count", K(ret), K_(pkey), K(log_ts));
}
}
return ret;
}
int ObPartitionGroup::update_max_majority_log(const uint64_t log_id, const int64_t log_ts)
{
int ret = OB_SUCCESS;
if (OB_UNLIKELY(!is_inited_)) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "Partition object not initialized", K(ret), K(is_inited_));
} else {
pls_->try_update_max_majority_log(log_id, log_ts);
}
return ret;
}
} // namespace storage
} // end of namespace oceanbase