From fd4ddaac3d5dafd53de128c777c8e0f64a7b8698 Mon Sep 17 00:00:00 2001 From: BinChenn Date: Wed, 17 May 2023 12:41:16 +0000 Subject: [PATCH] [PALF] add `enable_sync` and `enable_vote` columns in __all_virtual_ha_diagnose --- mittest/logservice/test_ob_simple_log_arb.cpp | 2 +- .../logservice/test_ob_simple_log_rebuild.cpp | 27 +++++++++-- src/logservice/ob_log_service.cpp | 1 + src/logservice/palf/log_sliding_window.cpp | 10 +++++ src/logservice/palf/log_sliding_window.h | 1 + src/logservice/palf/palf_handle_impl.cpp | 9 +++- src/logservice/palf/palf_handle_impl.h | 8 +++- .../ob_all_virtual_ha_diagnose.cpp | 11 +++++ .../ob_all_virtual_ha_diagnose.h | 3 ++ .../ob_inner_table_schema.12301_12350.cpp | 45 +++++++++++++++++++ .../inner_table/ob_inner_table_schema_def.py | 5 ++- src/storage/ls/ob_ls.h | 10 +++-- 12 files changed, 121 insertions(+), 11 deletions(-) diff --git a/mittest/logservice/test_ob_simple_log_arb.cpp b/mittest/logservice/test_ob_simple_log_arb.cpp index cbc622e56..e4228224d 100644 --- a/mittest/logservice/test_ob_simple_log_arb.cpp +++ b/mittest/logservice/test_ob_simple_log_arb.cpp @@ -81,7 +81,7 @@ TEST_F(TestObSimpleLogClusterArbService, test_2f1a_degrade_upgrade) { oceanbase::common::ObClusterVersion::get_instance().cluster_version_ = CLUSTER_VERSION_4_1_0_0; SET_CASE_LOG_FILE(TEST_NAME, "arb_2f1a_degrade_upgrade"); - OB_LOGGER.set_log_level("DEBUG"); + OB_LOGGER.set_log_level("TRACE"); MockLocCB loc_cb; int ret = OB_SUCCESS; PALF_LOG(INFO, "begin test_2f1a_degrade_upgrade"); diff --git a/mittest/logservice/test_ob_simple_log_rebuild.cpp b/mittest/logservice/test_ob_simple_log_rebuild.cpp index 62f8de8b8..ca799e95b 100644 --- a/mittest/logservice/test_ob_simple_log_rebuild.cpp +++ b/mittest/logservice/test_ob_simple_log_rebuild.cpp @@ -42,6 +42,7 @@ public: server_idx_(-1), rebuild_palf_id_(-1), rebuild_lsn_(), + allow_rebuild_(false), is_inited_(false) {} virtual ~TestRebuildCbImpl() { destroy(); } public: @@ -80,6 +81,7 @@ public: server_idx_ = -1; rebuild_palf_id_ = -1; rebuild_lsn_.reset(); + allow_rebuild_ = false; test_base_ = NULL; } @@ -106,7 +108,7 @@ public: ObTenantEnv::set_tenant(&tenant_base); lib::set_thread_name("RebuildCB"); while (!has_set_stop()) { - if (rebuild_palf_id_ != -1 && rebuild_lsn_.is_valid()) { + if (true == allow_rebuild_ && rebuild_palf_id_ != -1 && rebuild_lsn_.is_valid()) { PalfHandleImplGuard leader; PalfHandleImplGuard *rebuild_palf; int64_t leader_idx; @@ -133,6 +135,7 @@ public: int64_t server_idx_; int64_t rebuild_palf_id_; LSN rebuild_lsn_; + bool allow_rebuild_; bool is_inited_; }; }; @@ -170,10 +173,9 @@ TEST_F(TestObSimpleLogClusterRebuild, test_old_leader_rebuild) PALF_LOG(INFO, "begin block net", K(id), K(leader_idx), K(follower_idx1), K(follower_idx2)); block_net(leader_idx, follower_idx1); block_net(leader_idx, follower_idx2); + submit_log(leader, 100, id, 6 * KB); PALF_LOG(INFO, "begin submit logs", K(id), K(leader_idx), K(follower_idx1), K(follower_idx2)); (void) submit_log(leader, 1000, leader_idx, MB); - // sleep to wait leader switching - sleep(16); PALF_LOG(INFO, "after sleep 16s, begin get_leader", K(id), K(leader_idx), K(follower_idx1), K(follower_idx2)); int64_t new_leader_idx = 0; unittest::PalfHandleImplGuard new_leader; @@ -206,10 +208,26 @@ TEST_F(TestObSimpleLogClusterRebuild, test_old_leader_rebuild) EXPECT_EQ(OB_SUCCESS, rebuild_server->palf_handle_impl_->log_engine_.get_block_id_range(min_block_id, max_block_id)); PALF_LOG_RET(ERROR, OB_ERR_UNEXPECTED, "runlin trace get_block_id_range", K(min_block_id), K(max_block_id)); + // submit a cond task before unblocking net to stop truncating task + IOTaskCond cond(id, rebuild_server->palf_env_impl_->last_palf_epoch_); + LogIOWorker *io_worker = &rebuild_server->palf_env_impl_->log_io_worker_wrapper_.user_log_io_worker_; + io_worker->submit_io_task(&cond); + // after unblocking net, old leader will do rebuild unblock_net(leader_idx, follower_idx1); unblock_net(leader_idx, follower_idx2); - sleep(10); + sleep(5); + + // is truncating, can not rebuild + if (rebuild_server->palf_handle_impl_->sw_.is_truncating_) { + PalfBaseInfo rebuild_base_info; + EXPECT_EQ(OB_SUCCESS, new_leader.palf_handle_impl_->get_base_info(rebuild_cb.rebuild_lsn_, rebuild_base_info)); + EXPECT_EQ(OB_SUCCESS, rebuild_server->palf_handle_impl_->disable_sync()); + EXPECT_EQ(OB_EAGAIN, rebuild_server->palf_handle_impl_->advance_base_info(rebuild_base_info, true)); + } + cond.cond_.signal(); + sleep(5); + rebuild_cb.allow_rebuild_ = true; PalfBaseInfo base_info_in_leader; PalfBaseInfo base_info_after_rebuild; @@ -255,6 +273,7 @@ TEST_F(TestObSimpleLogClusterRebuild, test_follower_rebuild) EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list)); follower_idx = (leader_idx + 1) % 3; TestRebuildCbImpl rebuild_cb; + rebuild_cb.allow_rebuild_ = true; PalfRebuildCbNode rebuild_node(&rebuild_cb); EXPECT_EQ(OB_SUCCESS, rebuild_cb.init(this, follower_idx)); EXPECT_EQ(OB_SUCCESS, rebuild_cb.start()); diff --git a/src/logservice/ob_log_service.cpp b/src/logservice/ob_log_service.cpp index 3b89a0a9c..5c2578fe9 100644 --- a/src/logservice/ob_log_service.cpp +++ b/src/logservice/ob_log_service.cpp @@ -706,6 +706,7 @@ int ObLogService::diagnose_apply(const share::ObLSID &id, return ret; } + int ObLogService::get_io_start_time(int64_t &last_working_time) { int ret = OB_SUCCESS; diff --git a/src/logservice/palf/log_sliding_window.cpp b/src/logservice/palf/log_sliding_window.cpp index b4182a9ab..b7285e446 100644 --- a/src/logservice/palf/log_sliding_window.cpp +++ b/src/logservice/palf/log_sliding_window.cpp @@ -2774,6 +2774,16 @@ int LogSlidingWindow::get_majority_lsn_(const ObMemberList &member_list, return ret; } +bool LogSlidingWindow::is_allow_rebuild() const +{ + // Caller holds palf_handle_impl's rlock. + bool bool_ret = false; + if (IS_INIT) { + bool_ret = !is_truncating_; + } + return bool_ret; +} + int LogSlidingWindow::truncate_for_rebuild(const PalfBaseInfo &palf_base_info) { // Caller holds palf_handle_impl's wrlock. diff --git a/src/logservice/palf/log_sliding_window.h b/src/logservice/palf/log_sliding_window.h index 0dc2b1dd1..be9c76112 100755 --- a/src/logservice/palf/log_sliding_window.h +++ b/src/logservice/palf/log_sliding_window.h @@ -203,6 +203,7 @@ public: virtual int ack_log(const common::ObAddr &src_server, const LSN &end_lsn); virtual int truncate(const TruncateLogInfo &truncate_log_info, const LSN &expected_prev_lsn, const int64_t expected_prev_log_pid); + virtual bool is_allow_rebuild() const; virtual int truncate_for_rebuild(const PalfBaseInfo &palf_base_info); virtual bool is_prev_log_pid_match(const int64_t log_id, const LSN &lsn, diff --git a/src/logservice/palf/palf_handle_impl.cpp b/src/logservice/palf/palf_handle_impl.cpp index 46038dfe1..a8eb490f6 100644 --- a/src/logservice/palf/palf_handle_impl.cpp +++ b/src/logservice/palf/palf_handle_impl.cpp @@ -1467,7 +1467,12 @@ int PalfHandleImpl::advance_base_info(const PalfBaseInfo &palf_base_info, const TruncatePrefixBlocksCbCtx truncate_prefix_cb_ctx(new_base_lsn); flush_meta_cb_ctx.type_ = SNAPSHOT_META; flush_meta_cb_ctx.base_lsn_ = new_base_lsn; - if (OB_FAIL(check_need_advance_base_info_(new_base_lsn, prev_log_info, is_rebuild))) { + // Note: can not rebuild while a truncate operation is doing, because group_buffer may be + // truncated by LogCallback again after it has been advanced by rebuild operation. + if (false == sw_.is_allow_rebuild()) { + ret = OB_EAGAIN; + PALF_LOG(WARN, "can not advance_base_info for now, try again failed", K(ret), KPC(this), K(palf_base_info), K(is_rebuild)); + } else if (OB_FAIL(check_need_advance_base_info_(new_base_lsn, prev_log_info, is_rebuild))) { PALF_LOG(WARN, "check_need_advance_base_info failed", K(ret), KPC(this), K(palf_base_info), K(is_rebuild)); } else if (OB_FAIL(log_snapshot_meta.generate(new_base_lsn, prev_log_info))) { PALF_LOG(WARN, "LogSnapshotMeta generate failed", K(ret), KPC(this), K(palf_base_info)); @@ -3873,6 +3878,8 @@ int PalfHandleImpl::diagnose(PalfDiagnoseInfo &diagnose_info) const state_mgr_.get_role_and_state(diagnose_info.palf_role_, diagnose_info.palf_state_); diagnose_info.palf_proposal_id_ = state_mgr_.get_proposal_id(); state_mgr_.get_election_role(diagnose_info.election_role_, diagnose_info.election_epoch_); + diagnose_info.enable_sync_ = state_mgr_.is_sync_enabled(); + diagnose_info.enable_vote_ = state_mgr_.is_allow_vote(); return ret; } diff --git a/src/logservice/palf/palf_handle_impl.h b/src/logservice/palf/palf_handle_impl.h index 2bfde27bc..abac51711 100755 --- a/src/logservice/palf/palf_handle_impl.h +++ b/src/logservice/palf/palf_handle_impl.h @@ -108,18 +108,24 @@ struct PalfDiagnoseInfo { common::ObRole palf_role_; palf::ObReplicaState palf_state_; int64_t palf_proposal_id_; + bool enable_sync_; + bool enable_vote_; void reset() { election_role_ = FOLLOWER; election_epoch_ = 0; palf_role_ = FOLLOWER; palf_state_ = ObReplicaState::INVALID_STATE; palf_proposal_id_ = INVALID_PROPOSAL_ID; + enable_sync_ = false; + enable_vote_ = false; } TO_STRING_KV(K(election_role_), K(election_epoch_), K(palf_role_), K(palf_state_), - K(palf_proposal_id_)); + K(palf_proposal_id_), + K(enable_sync_), + K(enable_vote_)); }; struct FetchLogStat { diff --git a/src/observer/virtual_table/ob_all_virtual_ha_diagnose.cpp b/src/observer/virtual_table/ob_all_virtual_ha_diagnose.cpp index 8d3727003..322e3ef0d 100644 --- a/src/observer/virtual_table/ob_all_virtual_ha_diagnose.cpp +++ b/src/observer/virtual_table/ob_all_virtual_ha_diagnose.cpp @@ -236,6 +236,17 @@ int ObAllVirtualHADiagnose::insert_stat_(storage::DiagnoseInfo &diagnose_info) cur_row_.cells_[i].set_collation_type(ObCharset::get_default_collation( ObCharset::get_default_charset())); break; + case ENABLE_SYNC: + cur_row_.cells_[i].set_bool(diagnose_info.palf_diagnose_info_.enable_sync_); + break; + case ENABLE_VOTE: + cur_row_.cells_[i].set_bool(diagnose_info.palf_diagnose_info_.enable_vote_); + break; + case ARB_SRV_INFO: + cur_row_.cells_[i].set_varchar(ObString("")); + cur_row_.cells_[i].set_collation_type(ObCharset::get_default_collation( + ObCharset::get_default_charset())); + break; default: ret = OB_ERR_UNEXPECTED; SERVER_LOG(WARN, "unkown column"); diff --git a/src/observer/virtual_table/ob_all_virtual_ha_diagnose.h b/src/observer/virtual_table/ob_all_virtual_ha_diagnose.h index 8f92465c0..521ca4118 100644 --- a/src/observer/virtual_table/ob_all_virtual_ha_diagnose.h +++ b/src/observer/virtual_table/ob_all_virtual_ha_diagnose.h @@ -52,6 +52,9 @@ enum IOStatColumn RESTORE_HANDLER_PROPOSAL_ID, RESTORE_CONTEXT_INFO, RESTORE_ERR_CONTEXT_INFO, + ENABLE_SYNC, + ENABLE_VOTE, + ARB_SRV_INFO, }; class ObAllVirtualHADiagnose : public common::ObVirtualTableScannerIterator diff --git a/src/share/inner_table/ob_inner_table_schema.12301_12350.cpp b/src/share/inner_table/ob_inner_table_schema.12301_12350.cpp index a1bbd51d3..9d96b0aa0 100644 --- a/src/share/inner_table/ob_inner_table_schema.12301_12350.cpp +++ b/src/share/inner_table/ob_inner_table_schema.12301_12350.cpp @@ -8449,6 +8449,51 @@ int ObInnerTableSchema::all_virtual_ha_diagnose_schema(ObTableSchema &table_sche false, //is_nullable false); //is_autoincrement } + + if (OB_SUCC(ret)) { + ADD_COLUMN_SCHEMA("enable_sync", //column_name + ++column_id, //column_id + 0, //rowkey_id + 0, //index_id + 0, //part_key_pos + ObTinyIntType, //column_type + CS_TYPE_INVALID, //column_collation_type + 1, //column_length + -1, //column_precision + -1, //column_scale + false, //is_nullable + false); //is_autoincrement + } + + if (OB_SUCC(ret)) { + ADD_COLUMN_SCHEMA("enable_vote", //column_name + ++column_id, //column_id + 0, //rowkey_id + 0, //index_id + 0, //part_key_pos + ObTinyIntType, //column_type + CS_TYPE_INVALID, //column_collation_type + 1, //column_length + -1, //column_precision + -1, //column_scale + false, //is_nullable + false); //is_autoincrement + } + + if (OB_SUCC(ret)) { + ADD_COLUMN_SCHEMA("arb_srv_info", //column_name + ++column_id, //column_id + 0, //rowkey_id + 0, //index_id + 0, //part_key_pos + ObVarcharType, //column_type + CS_TYPE_INVALID, //column_collation_type + 1024, //column_length + -1, //column_precision + -1, //column_scale + false, //is_nullable + false); //is_autoincrement + } if (OB_SUCC(ret)) { table_schema.get_part_option().set_part_num(1); table_schema.set_part_level(PARTITION_LEVEL_ONE); diff --git a/src/share/inner_table/ob_inner_table_schema_def.py b/src/share/inner_table/ob_inner_table_schema_def.py index b52e2faa8..0bb20a4e3 100644 --- a/src/share/inner_table/ob_inner_table_schema_def.py +++ b/src/share/inner_table/ob_inner_table_schema_def.py @@ -11344,7 +11344,10 @@ def_table_schema( ('restore_handler_role', 'varchar:32'), ('restore_proposal_id', 'int'), ('restore_context_info', 'varchar:1024'), - ('restore_err_context_info', 'varchar:1024') + ('restore_err_context_info', 'varchar:1024'), + ('enable_sync', 'bool'), + ('enable_vote', 'bool'), + ('arb_srv_info', 'varchar:1024') ], partition_columns = ['svr_ip', 'svr_port'], diff --git a/src/storage/ls/ob_ls.h b/src/storage/ls/ob_ls.h index cb9b1efcd..2c0525776 100644 --- a/src/storage/ls/ob_ls.h +++ b/src/storage/ls/ob_ls.h @@ -97,8 +97,11 @@ struct DiagnoseInfo DiagnoseInfo() { reset(); } ~DiagnoseInfo() { reset(); } bool is_role_sync() { - return ((palf_diagnose_info_.election_role_ == palf_diagnose_info_.palf_role_) - && (palf_diagnose_info_.palf_role_ == log_handler_diagnose_info_.log_handler_role_)); + return ((palf_diagnose_info_.election_role_ == palf_diagnose_info_.palf_role_) && + ((palf_diagnose_info_.palf_role_ == log_handler_diagnose_info_.log_handler_role_ && + palf_diagnose_info_.palf_proposal_id_ == log_handler_diagnose_info_.log_handler_proposal_id_) || + (palf_diagnose_info_.palf_role_ == restore_diagnose_info_.restore_role_ && + palf_diagnose_info_.palf_proposal_id_ == restore_diagnose_info_.restore_proposal_id_))); } int64_t ls_id_; logservice::LogHandlerDiagnoseInfo log_handler_diagnose_info_; @@ -117,7 +120,8 @@ struct DiagnoseInfo K(replay_diagnose_info_), K(gc_diagnose_info_), K(checkpoint_diagnose_info_), - K(restore_diagnose_info_)); + K(restore_diagnose_info_) + ); void reset() { ls_id_ = -1; log_handler_diagnose_info_.reset();