diff --git a/deps/oblib/src/common/ob_learner_list.ipp b/deps/oblib/src/common/ob_learner_list.ipp index 1c8472edc..db70d134c 100644 --- a/deps/oblib/src/common/ob_learner_list.ipp +++ b/deps/oblib/src/common/ob_learner_list.ipp @@ -140,7 +140,7 @@ int BaseLearnerList::add_learner(const T &learner) ret = OB_INVALID_ARGUMENT; } else if (learner_array_.count() >= MAX_SIZE) { ret = OB_SIZE_OVERFLOW; - } else if (OB_UNLIKELY(contains(learner.get_server()))) { + } else if (OB_UNLIKELY(contains(learner))) { ret = OB_ENTRY_EXIST; } else if (OB_FAIL(learner_array_.push_back(learner))) { COMMON_LOG(ERROR, "learner_array_ push back failed", K(ret), K(learner)); @@ -356,7 +356,7 @@ int BaseLearnerList::transform_to_string( COMMON_LOG(WARN, "convert server to string failed", K(ret), K(learner)); } else if (need_comma && OB_FAIL(output_string.append(","))) { COMMON_LOG(WARN, "failed to append comma to string", K(ret)); - } else if (OB_FAIL(output_string.append_fmt("%.*s:%ld", static_cast(sizeof(ip_port)), ip_port, learner.get_timestamp()))) { + } else if (OB_FAIL(output_string.append_fmt("%.*s:%ld:%ld", static_cast(sizeof(ip_port)), ip_port, learner.get_timestamp(), learner.get_flag()))) { COMMON_LOG(WARN, "failed to append ip_port to string", K(ret), K(learner)); } else { need_comma = true; diff --git a/deps/oblib/src/common/ob_member.cpp b/deps/oblib/src/common/ob_member.cpp index ca73c827d..39499ce39 100644 --- a/deps/oblib/src/common/ob_member.cpp +++ b/deps/oblib/src/common/ob_member.cpp @@ -71,6 +71,21 @@ bool ObMember::is_valid() const return server_.is_valid(); } +bool ObMember::is_migrating() const +{ + return (flag_ >> MIGRATING_FLAG_BIT) & 1U; +} + +void ObMember::set_migrating() +{ + flag_ |= (1UL << MIGRATING_FLAG_BIT); +} + +void ObMember::reset_migrating() +{ + flag_ &= ~(1UL << MIGRATING_FLAG_BIT); +} + OB_SERIALIZE_MEMBER(ObMember, server_, timestamp_, flag_); bool ObReplicaMember::is_readonly_replica() const diff --git a/deps/oblib/src/common/ob_member.h 
b/deps/oblib/src/common/ob_member.h index 70fba22b9..25b8d005f 100644 --- a/deps/oblib/src/common/ob_member.h +++ b/deps/oblib/src/common/ob_member.h @@ -36,19 +36,24 @@ public: const common::ObAddr &get_server() const; int64_t get_timestamp() const; int64_t get_flag() const; + void set_flag(const int64_t &flag) { flag_ = flag; } /* overwrites the whole flag word, including the migrating bit */ virtual void reset(); virtual bool is_valid() const; - virtual bool need_encrypt() const { return false; /* modify by yaoying */} friend bool operator==(const ObMember &lhs, const ObMember &rhs); friend bool operator<(const ObMember &lhs, const ObMember &rhs); ObMember &operator=(const ObMember &rhs); int assign(const ObMember &other); + bool is_migrating() const; /* true when bit MIGRATING_FLAG_BIT of flag_ is set */ + void set_migrating(); /* sets bit MIGRATING_FLAG_BIT of flag_ */ + void reset_migrating(); /* clears bit MIGRATING_FLAG_BIT of flag_ */ + TO_STRING_KV(K_(server), K_(timestamp), K_(flag)); TO_YSON_KV(OB_Y_(server), OB_ID(t), timestamp_, OB_Y_(flag)); OB_UNIS_VERSION(1); protected: + static const int64_t MIGRATING_FLAG_BIT = 1; /* bit position inside flag_ (a shift count, not a mask) used by the *_migrating() helpers */ common::ObAddr server_; int64_t timestamp_; int64_t flag_; diff --git a/mittest/logservice/test_ob_simple_log_arb_mock_ele.cpp b/mittest/logservice/test_ob_simple_log_arb_mock_ele.cpp index b7d8b7740..e81565f0a 100755 --- a/mittest/logservice/test_ob_simple_log_arb_mock_ele.cpp +++ b/mittest/logservice/test_ob_simple_log_arb_mock_ele.cpp @@ -639,6 +639,59 @@ TEST_F(TestObSimpleLogClusterArbMockEleService, test_add_remove_lose_logs) PALF_LOG(INFO, "end test test_add_remove_lose_logs", K(id)); } +// 1. 2F1A, the leader degraded B +// 2. 
migrate B to D +TEST_F(TestObSimpleLogClusterArbMockEleService, test_2f1a_degrade_migrate) +{ + int ret = OB_SUCCESS; + const int64_t id = ATOMIC_AAF(&palf_id_, 1); + const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; + OB_LOGGER.set_log_level("TRACE"); + SET_CASE_LOG_FILE(TEST_NAME, "test_2f1a_degrade_migrate"); + PALF_LOG(INFO, "begin test_2f1a_degrade_migrate", K(id)); + { + int64_t leader_idx = 0; + int64_t arb_replica_idx = 0; + PalfHandleImplGuard leader; + std::vector palf_list; + EXPECT_EQ(OB_SUCCESS, create_paxos_group_with_arb_mock_election(id, arb_replica_idx, leader_idx, leader)); + EXPECT_EQ(OB_SUCCESS, submit_log(leader, 200, id)); + EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list)); + dynamic_cast(get_cluster()[leader_idx])->log_service_.get_arbitration_service()->start(); + + const int64_t b_idx = (leader_idx + 1) % 4; + const int64_t c_idx = (leader_idx + 2) % 4; + const int64_t d_idx = (leader_idx + 3) % 4; + const common::ObAddr a_addr = get_cluster()[leader_idx]->get_addr(); + const common::ObAddr b_addr = get_cluster()[b_idx]->get_addr(); + const common::ObAddr c_addr = get_cluster()[c_idx]->get_addr(); + const common::ObAddr d_addr = get_cluster()[d_idx]->get_addr(); + PalfHandleImplGuard *a_handle = palf_list[leader_idx]; + PalfHandleImplGuard *b_handle = palf_list[b_idx]; + + // 1. degrade B + block_net(leader_idx, b_idx); + EXPECT_TRUE(is_degraded(leader, b_idx)); + + // 2. 
migrate B to D + common::ObMember b_member = common::ObMember(b_addr, 1); + common::ObMember migrating_d = common::ObMember(d_addr, 1); + migrating_d.set_migrating(); + + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(migrating_d, CONFIG_CHANGE_TIMEOUT)); + LogConfigVersion config_version; + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_member_with_learner(migrating_d, b_member, config_version, CONFIG_CHANGE_TIMEOUT)); + EXPECT_EQ(0, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.degraded_learnerlist_.get_member_number()); + EXPECT_EQ(2, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number()); + EXPECT_EQ(2, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + unblock_net(leader_idx, b_idx); + + revert_cluster_palf_handle_guard(palf_list); + } + delete_paxos_group(id); + PALF_LOG(INFO, "end test_2f1a_degrade_migrate", K(id)); +} } // end unittest } // end oceanbase diff --git a/mittest/logservice/test_ob_simple_log_config_change.cpp b/mittest/logservice/test_ob_simple_log_config_change.cpp index 5b99a48ca..edcf1a116 100644 --- a/mittest/logservice/test_ob_simple_log_config_change.cpp +++ b/mittest/logservice/test_ob_simple_log_config_change.cpp @@ -379,6 +379,230 @@ TEST_F(TestObSimpleLogClusterConfigChange, test_basic_config_change) PALF_LOG(INFO, "end test config change", K(id)); } +TEST_F(TestObSimpleLogClusterConfigChange, test_basic_config_change_for_migration) +{ + SET_CASE_LOG_FILE(TEST_NAME, "config_change_for_migration"); + int ret = OB_SUCCESS; + const int64_t id = ATOMIC_AAF(&palf_id_, 1); + PALF_LOG(INFO, "begin test config change", K(id)); + { + int64_t leader_idx = 0; + std::vector palf_list; + const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s + PalfHandleImplGuard leader; + EXPECT_EQ(OB_SUCCESS, create_paxos_group(id, 
&loc_cb, leader_idx, leader)); + PalfHandleImplGuard new_leader; + int64_t new_leader_idx; + EXPECT_EQ(OB_SUCCESS, get_leader(id, new_leader, new_leader_idx)); + loc_cb.leader_ = get_cluster()[new_leader_idx]->get_addr(); + PALF_LOG(INFO, "set leader for loc_cb", "leader", get_cluster()[new_leader_idx]->get_addr()); + EXPECT_EQ(OB_SUCCESS, get_cluster_palf_handle_guard(id, palf_list)); + EXPECT_EQ(OB_SUCCESS, submit_log(leader, 100, id)); + const common::ObAddr &addr2 = get_cluster()[(leader_idx+2)%3]->get_addr(); + const common::ObAddr &addr3 = get_cluster()[3]->get_addr(); + const common::ObAddr &addr4 = get_cluster()[4]->get_addr(); + const common::ObAddr &addr5 = get_cluster()[5]->get_addr(); + // 1. replicate an FULL replica + { + PALF_LOG(INFO, "CASE1: replicate an FULL replica", K(id)); + common::ObMember added_member = ObMember(addr3, 1); + added_member.set_migrating(); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(added_member, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + EXPECT_EQ(3, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + + // clean + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_learner(added_member, CONFIG_CHANGE_TIMEOUT)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + // add again + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(added_member, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + + LogConfigVersion config_version; + ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->switch_learner_to_acceptor(added_member, 4, config_version, CONFIG_CHANGE_TIMEOUT)); + 
EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + // member with flag do not exist + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member.get_server())); + EXPECT_EQ(4, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + // reentrant, do not get config_version again + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->switch_learner_to_acceptor(added_member, 4, config_version, CONFIG_CHANGE_TIMEOUT)); + // reset environment + added_member.reset_migrating(); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_member(added_member, 3, CONFIG_CHANGE_TIMEOUT)); + } + + // 2. migrate an FULL replica (addr2 -> addr3) + { + PALF_LOG(INFO, "CASE2: migrate an FULL replica", K(id)); + common::ObMember added_member = ObMember(addr3, 1); + added_member.set_migrating(); + common::ObMember replaced_member = ObMember(addr2, 1); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(added_member, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + EXPECT_EQ(3, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + LogConfigVersion config_version; + ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version)); + LogConfigChangeArgs args(added_member, 0, config_version, SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->one_stage_config_change_(args, CONFIG_CHANGE_TIMEOUT)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + 
EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member.get_server())); + EXPECT_EQ(4, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + + ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_member_with_learner(added_member, replaced_member, config_version, CONFIG_CHANGE_TIMEOUT)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + // member with flag do not exist + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member.get_server())); + EXPECT_EQ(3, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(replaced_member.get_server())); + // reentrant and check + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_member_with_learner(added_member, replaced_member, config_version, CONFIG_CHANGE_TIMEOUT)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_member)); + // member with flag do not exist + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member.get_server())); + EXPECT_EQ(3, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + 
EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(replaced_member.get_server())); + // reset environment + added_member.reset_migrating(); + ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_member(replaced_member, added_member, config_version, CONFIG_CHANGE_TIMEOUT)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(added_member.get_server())); + EXPECT_EQ(3, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_memberlist_.contains(replaced_member.get_server())); + } + + // 3. replicate an READONLY replica + { + PALF_LOG(INFO, "CASE3: replicate an READONLY replica", K(id)); + // learner's addr must be different from members' + common::ObMember migrating_member = ObMember(addr2, 1); + EXPECT_EQ(OB_INVALID_ARGUMENT, leader.palf_handle_impl_->add_learner(migrating_member, CONFIG_CHANGE_TIMEOUT)); + + common::ObMember added_migrating_learner = ObMember(addr3, 1); + common::ObMember added_learner = ObMember(addr3, 1); + added_migrating_learner.set_migrating(); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(added_migrating_learner, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_migrating_learner)); + EXPECT_EQ(3, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + ObMemberList added_learners, removed_learners; + EXPECT_EQ(OB_SUCCESS, added_learners.add_member(added_learner)); + EXPECT_EQ(OB_SUCCESS, removed_learners.add_member(added_migrating_learner)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_learners(added_learners, removed_learners, CONFIG_CHANGE_TIMEOUT)); + 
EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_learner)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_migrating_learner)); + // reentrant and check + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_learners(added_learners, removed_learners, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_learner)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_migrating_learner)); + // reset environment + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_learner(added_learner, CONFIG_CHANGE_TIMEOUT)); + } + + // 4. migrate an READONLY replica, addr4 -> addr3 + { + PALF_LOG(INFO, "CASE4: migrate an READONLY replica", K(id)); + common::ObMember removed_learner = ObMember(addr4, 1); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(removed_learner, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(removed_learner)); + + common::ObMember added_migrating_learner = ObMember(addr3, 1); + common::ObMember added_learner = ObMember(addr3, 1); + added_migrating_learner.set_migrating(); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(added_migrating_learner, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_migrating_learner)); + EXPECT_EQ(3, leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.log_sync_replica_num_); + ObMemberList added_learners, removed_learners; + EXPECT_EQ(OB_SUCCESS, added_learners.add_member(added_learner)); + EXPECT_EQ(OB_SUCCESS, removed_learners.add_member(added_migrating_learner)); + EXPECT_EQ(OB_SUCCESS, removed_learners.add_member(removed_learner)); + EXPECT_EQ(OB_SUCCESS, 
leader.palf_handle_impl_->replace_learners(added_learners, removed_learners, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_learner)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_migrating_learner)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(removed_learner)); + // reentrant and check + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_learners(added_learners, removed_learners, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_learner)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(added_migrating_learner)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(removed_learner)); + // reset environment + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_learner(added_learner, CONFIG_CHANGE_TIMEOUT)); + } + // 5. 
replace_learners (addr3, addr4) -> (addr3, addr5) + { + PALF_LOG(INFO, "CASE5: replace_learners", K(id)); + const common::ObMember member2 = ObMember(addr2, 1); + const common::ObMember member3 = ObMember(addr3, 1); + const common::ObMember member4 = ObMember(addr4, 1); + const common::ObMember member5 = ObMember(addr5, 1); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(member3, CONFIG_CHANGE_TIMEOUT)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(member4, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member3)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member4)); + + ObMemberList added_learners, removed_learners; + EXPECT_EQ(OB_SUCCESS, added_learners.add_member(member3)); + EXPECT_EQ(OB_SUCCESS, added_learners.add_member(member5)); + EXPECT_EQ(OB_SUCCESS, removed_learners.add_member(member2)); + EXPECT_EQ(OB_SUCCESS, removed_learners.add_member(member4)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_learners(added_learners, removed_learners, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member3)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member5)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member2)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member4)); + // reentrant and check + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->replace_learners(added_learners, removed_learners, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member3)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member5)); + 
EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member2)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member4)); + // reset environment + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_learner(member3, CONFIG_CHANGE_TIMEOUT)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_learner(member5, CONFIG_CHANGE_TIMEOUT)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member3)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(member5)); + } + // 6. defensive + { + PALF_LOG(INFO, "CASE6: defensive", K(id)); + const common::ObMember member2 = ObMember(addr2, 1); + const common::ObMember member3 = ObMember(addr3, 1); + const common::ObMember member4 = ObMember(addr4, 1); + const common::ObMember member5 = ObMember(addr5, 1); + common::ObMember migrating_member3 = member3; + migrating_member3.set_migrating(); + + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->add_learner(migrating_member3, CONFIG_CHANGE_TIMEOUT)); + EXPECT_EQ(OB_INVALID_ARGUMENT, leader.palf_handle_impl_->add_learner(member3, CONFIG_CHANGE_TIMEOUT)); + LogConfigVersion config_version; + ASSERT_EQ(OB_SUCCESS, leader.palf_handle_impl_->get_config_version(config_version)); + EXPECT_EQ(OB_INVALID_ARGUMENT, leader.palf_handle_impl_->add_member(member3, 4, config_version, CONFIG_CHANGE_TIMEOUT)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_learner(member3, CONFIG_CHANGE_TIMEOUT)); + EXPECT_TRUE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(migrating_member3)); + EXPECT_EQ(OB_SUCCESS, leader.palf_handle_impl_->remove_learner(migrating_member3, CONFIG_CHANGE_TIMEOUT)); + EXPECT_FALSE(leader.palf_handle_impl_->config_mgr_.log_ms_meta_.curr_.config_.learnerlist_.contains(migrating_member3)); + + EXPECT_EQ(OB_SUCCESS, 
leader.palf_handle_impl_->add_learner(migrating_member3, CONFIG_CHANGE_TIMEOUT)); + ObMemberList added_learners, removed_learners; + EXPECT_EQ(OB_SUCCESS, added_learners.add_member(member3)); + EXPECT_EQ(OB_SUCCESS, removed_learners.add_member(member4)); + EXPECT_EQ(OB_INVALID_ARGUMENT, leader.palf_handle_impl_->replace_learners(added_learners, removed_learners, CONFIG_CHANGE_TIMEOUT)); + + common::ObMember migrating_member2 = member2; + migrating_member2.set_migrating(); + EXPECT_EQ(OB_INVALID_ARGUMENT, leader.palf_handle_impl_->remove_member(migrating_member2, 2, CONFIG_CHANGE_TIMEOUT)); + } + EXPECT_EQ(OB_SUCCESS, submit_log(leader, 100, id)); + revert_cluster_palf_handle_guard(palf_list); + } + delete_paxos_group(id); + PALF_LOG(INFO, "end test config change", K(id)); +} + TEST_F(TestObSimpleLogClusterConfigChange, test_replace_member) { SET_CASE_LOG_FILE(TEST_NAME, "replace_member"); diff --git a/mittest/simple_server/CMakeLists.txt b/mittest/simple_server/CMakeLists.txt index 61f23c2f7..30e65a74a 100644 --- a/mittest/simple_server/CMakeLists.txt +++ b/mittest/simple_server/CMakeLists.txt @@ -75,6 +75,7 @@ ob_unittest_observer(test_ob_detect_manager_in_simple_server test_ob_detect_mana ob_unittest_observer(test_transfer_lock_info_operator storage_ha/test_transfer_lock_info_operator.cpp) ob_unittest_observer(test_mds_recover test_mds_recover.cpp) ob_unittest_observer(test_keep_alive_min_start_scn test_keep_alive_min_start_scn.cpp) +ob_unittest_observer(test_ls_replica test_ls_replica.cpp) # TODO(muwei.ym): open later #ob_ha_unittest_observer(test_transfer_handler storage_ha/test_transfer_handler.cpp) #ob_ha_unittest_observer(test_transfer_and_restart_basic storage_ha/test_transfer_and_restart_basic.cpp) diff --git a/mittest/simple_server/test_ls_replica.cpp b/mittest/simple_server/test_ls_replica.cpp new file mode 100644 index 000000000..a8f1301f8 --- /dev/null +++ b/mittest/simple_server/test_ls_replica.cpp @@ -0,0 +1,95 @@ +/** + * Copyright (c) 2021 
OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ + +#define USING_LOG_PREFIX SHARE + +#include +#include +#include "share/ls/ob_ls_info.h" +#include "env/ob_simple_cluster_test_base.h" +#include "lib/ob_errno.h" + + + +namespace oceanbase +{ +using namespace unittest; +namespace share +{ +using ::testing::_; +using ::testing::Invoke; +using ::testing::Return; + +using namespace schema; +using namespace common; + +class TestLSReplica : public unittest::ObSimpleClusterTestBase +{ +public: + TestLSReplica() : unittest::ObSimpleClusterTestBase("test_ls_replica") {} +protected: + ObLSReplica ls_replica_; +}; + +TEST_F(TestLSReplica, test_text2learnerlist) +{ + int ret = OB_SUCCESS; + GlobalLearnerList learner_list; + bool learner_exists = false; + + const ObAddr addr1(ObAddr::IPV4, "127.0.0.1", 1000); + const ObAddr addr2(ObAddr::IPV4, "127.0.0.1", 1001); + const ObAddr addr3(ObAddr::IPV4, "127.0.0.2", 1000); + const ObAddr addr4(ObAddr::IPV4, "127.0.0.2", 1001); + + ObString string_to_parse = "127.0.0.1:1000:0:1,127.0.0.1:1001:0:1,127.0.0.2:1000:1:0"; + ret = ls_replica_.text2learner_list(to_cstring(string_to_parse), learner_list); + ASSERT_EQ(OB_SUCCESS, ret); + + ObMember member1(addr1, 0); + learner_exists = learner_list.contains(member1); + ASSERT_EQ(false, learner_exists); + + member1.set_flag(-1); + learner_exists = learner_list.contains(member1); + ASSERT_EQ(false, learner_exists); + + member1.set_flag(1); + learner_exists = learner_list.contains(member1); + ASSERT_EQ(true, learner_exists); + + ObMember 
member2(addr2, 0); + member2.set_flag(10); + learner_exists = learner_list.contains(member2); + ASSERT_EQ(false, learner_exists); + + ObMember member3(addr3, 0); + member3.set_flag(0); + learner_exists = learner_list.contains(member3); + ASSERT_EQ(false, learner_exists); + + ObMember member4(addr4, 0); + learner_exists = learner_list.contains(member4); + ASSERT_EQ(false, learner_exists); + +} + +} // namespace share +} // namespace oceanbase + +int main(int argc, char **argv) +{ + init_log_and_gtest(argc, argv); + OB_LOGGER.set_log_level("INFO"); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/logservice/logrpc/ob_log_request_handler.cpp b/src/logservice/logrpc/ob_log_request_handler.cpp index 2fa476638..554498f96 100755 --- a/src/logservice/logrpc/ob_log_request_handler.cpp +++ b/src/logservice/logrpc/ob_log_request_handler.cpp @@ -260,6 +260,12 @@ int ConfigChangeCmdHandler::handle_config_change_cmd(const LogConfigChangeCmd &r case GET_CONFIG_CHANGE_LOCK_STAT_CMD: ret = palf_handle_->get_config_change_lock_stat(resp.lock_owner_, resp.is_locked_); break; + case REPLACE_LEARNERS_CMD: + ret = palf_handle_->replace_learners(req.added_list_, req.removed_list_, req.timeout_us_); + break; + case REPLACE_MEMBER_WITH_LEARNER_CMD: + ret = palf_handle_->replace_member_with_learner(req.added_member_, req.removed_member_, req.config_version_, req.timeout_us_); + break; default: break; } diff --git a/src/logservice/logrpc/ob_log_rpc_req.cpp b/src/logservice/logrpc/ob_log_rpc_req.cpp index 75f9bc769..cf3210f7e 100644 --- a/src/logservice/logrpc/ob_log_rpc_req.cpp +++ b/src/logservice/logrpc/ob_log_rpc_req.cpp @@ -31,7 +31,9 @@ LogConfigChangeCmd::LogConfigChangeCmd() cmd_type_(INVALID_CONFIG_CHANGE_CMD), timeout_us_(0), lock_owner_(palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), - config_version_() { } + config_version_(), + added_list_(), + removed_list_() { } LogConfigChangeCmd::LogConfigChangeCmd( const common::ObAddr &src, @@ -51,7 +53,9 
@@ LogConfigChangeCmd::LogConfigChangeCmd( cmd_type_(cmd_type), timeout_us_(timeout_us), lock_owner_(palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), - config_version_() { } + config_version_(), + added_list_(), + removed_list_() { } LogConfigChangeCmd::LogConfigChangeCmd( const common::ObAddr &src, @@ -71,7 +75,9 @@ LogConfigChangeCmd::LogConfigChangeCmd( cmd_type_(cmd_type), timeout_us_(timeout_us), lock_owner_(palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), - config_version_() { } + config_version_(), + added_list_(), + removed_list_() { } LogConfigChangeCmd::LogConfigChangeCmd(const common::ObAddr &src, const int64_t palf_id, @@ -79,6 +85,27 @@ LogConfigChangeCmd::LogConfigChangeCmd(const common::ObAddr &src, const LogConfigChangeCmdType cmd_type, const int64_t timeout_us) : src_(src), + palf_id_(palf_id), + added_member_(), + removed_member_(), + curr_member_list_(), + curr_replica_num_(0), + new_replica_num_(0), + cmd_type_(cmd_type), + timeout_us_(timeout_us), + lock_owner_(lock_owner), + config_version_(), + added_list_(), + removed_list_() { } + +LogConfigChangeCmd::LogConfigChangeCmd( + const common::ObAddr &src, + const int64_t palf_id, + const common::ObMemberList &added_list, + const common::ObMemberList &removed_list, + const LogConfigChangeCmdType cmd_type, + const int64_t timeout_us) + : src_(src), palf_id_(palf_id), added_member_(), removed_member_(), @@ -87,7 +114,10 @@ LogConfigChangeCmd::LogConfigChangeCmd(const common::ObAddr &src, new_replica_num_(0), cmd_type_(cmd_type), timeout_us_(timeout_us), - lock_owner_(lock_owner){} + lock_owner_(palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), + config_version_(), + added_list_(added_list), + removed_list_(removed_list) { } LogConfigChangeCmd::~LogConfigChangeCmd() { @@ -113,6 +143,8 @@ bool LogConfigChangeCmd::is_valid() const && is_valid_replica_num(curr_replica_num_) && is_valid_replica_num(new_replica_num_): true);\ bool_ret = bool_ret && ((TRY_LOCK_CONFIG_CHANGE_CMD == cmd_type_ || UNLOCK_CONFIG_CHANGE_CMD == 
cmd_type_) ? \ (palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER != lock_owner_) : true); + bool_ret = bool_ret && ((REPLACE_LEARNERS_CMD == cmd_type_)? (added_list_.is_valid() \ + && removed_list_.is_valid()): true); return bool_ret; } @@ -120,14 +152,16 @@ bool LogConfigChangeCmd::is_remove_member_list() const { return REMOVE_MEMBER_CMD == cmd_type_ || REPLACE_MEMBER_CMD == cmd_type_ - || SWITCH_TO_LEARNER_CMD == cmd_type_; + || SWITCH_TO_LEARNER_CMD == cmd_type_ + || REPLACE_MEMBER_WITH_LEARNER_CMD == cmd_type_; } bool LogConfigChangeCmd::is_add_member_list() const { return ADD_MEMBER_CMD == cmd_type_ || REPLACE_MEMBER_CMD == cmd_type_ - || SWITCH_TO_ACCEPTOR_CMD == cmd_type_; + || SWITCH_TO_ACCEPTOR_CMD == cmd_type_ + || REPLACE_MEMBER_WITH_LEARNER_CMD == cmd_type_; } bool LogConfigChangeCmd::is_set_new_replica_num() const @@ -151,10 +185,13 @@ void LogConfigChangeCmd::reset() timeout_us_ = 0; lock_owner_ = palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER; config_version_.reset(); + added_list_.reset(); + removed_list_.reset(); } OB_SERIALIZE_MEMBER(LogConfigChangeCmd, src_, palf_id_, added_member_, removed_member_, -curr_member_list_, curr_replica_num_, new_replica_num_, cmd_type_, timeout_us_, lock_owner_, config_version_); +curr_member_list_, curr_replica_num_, new_replica_num_, cmd_type_, timeout_us_, lock_owner_, +config_version_, added_list_, removed_list_); // ============= LogConfigChangeCmd end ============= // ============= LogConfigChangeCmdResp begin =========== diff --git a/src/logservice/logrpc/ob_log_rpc_req.h b/src/logservice/logrpc/ob_log_rpc_req.h index c7fdc1379..c516f6858 100644 --- a/src/logservice/logrpc/ob_log_rpc_req.h +++ b/src/logservice/logrpc/ob_log_rpc_req.h @@ -39,6 +39,8 @@ enum LogConfigChangeCmdType { TRY_LOCK_CONFIG_CHANGE_CMD, UNLOCK_CONFIG_CHANGE_CMD, GET_CONFIG_CHANGE_LOCK_STAT_CMD, + REPLACE_LEARNERS_CMD, + REPLACE_MEMBER_WITH_LEARNER_CMD, }; inline const char *log_config_change_cmd2str(const LogConfigChangeCmdType state) @@ -57,6 +59,8 
@@ inline const char *log_config_change_cmd2str(const LogConfigChangeCmdType state) CHECK_CMD_TYPE_STR(TRY_LOCK_CONFIG_CHANGE_CMD); CHECK_CMD_TYPE_STR(UNLOCK_CONFIG_CHANGE_CMD); CHECK_CMD_TYPE_STR(GET_CONFIG_CHANGE_LOCK_STAT_CMD); + CHECK_CMD_TYPE_STR(REPLACE_LEARNERS_CMD); + CHECK_CMD_TYPE_STR(REPLACE_MEMBER_WITH_LEARNER_CMD); default: return "Invalid"; } @@ -86,6 +90,12 @@ public: const int64_t lock_owner, const LogConfigChangeCmdType cmd_type, const int64_t timeout_us); + LogConfigChangeCmd(const common::ObAddr &src, + const int64_t palf_id, + const common::ObMemberList &added_list, + const common::ObMemberList &removed_list, + const LogConfigChangeCmdType cmd_type, + const int64_t timeout_us); ~LogConfigChangeCmd(); bool is_valid() const; void reset(); @@ -94,8 +104,9 @@ public: void in_leader(const palf::LogConfigVersion &config_version); bool is_set_new_replica_num() const; TO_STRING_KV("cmd_type", log_config_change_cmd2str(cmd_type_), K_(src), K_(palf_id), \ - K_(added_member), K_(removed_member), K_(curr_member_list), K_(curr_replica_num), \ - K_(new_replica_num), K_(timeout_us), K_(lock_owner), K_(config_version)); + K_(added_member), K_(removed_member), K_(curr_member_list), K_(curr_replica_num), \ + K_(new_replica_num), K_(timeout_us), K_(lock_owner), K_(config_version), \ + K_(added_list), K_(removed_list)); common::ObAddr src_; int64_t palf_id_; common::ObMember added_member_; @@ -107,6 +118,8 @@ public: int64_t timeout_us_; int64_t lock_owner_; palf::LogConfigVersion config_version_; + common::ObMemberList added_list_; + common::ObMemberList removed_list_; }; struct LogConfigChangeCmdResp { diff --git a/src/logservice/ob_garbage_collector.cpp b/src/logservice/ob_garbage_collector.cpp index 9e2764ef3..7f33f75a6 100644 --- a/src/logservice/ob_garbage_collector.cpp +++ b/src/logservice/ob_garbage_collector.cpp @@ -69,11 +69,12 @@ class ObGarbageCollector::QueryLSIsValidMemberFunctor { public: QueryLSIsValidMemberFunctor(obrpc::ObSrvRpcProxy *rpc_proxy, + 
obrpc::ObLogServiceRpcProxy *log_rpc_proxy, ObLSService *ls_service, const common::ObAddr &self_addr, const int64_t gc_seq, ObGCCandidateArray &gc_candidates) - : rpc_proxy_(rpc_proxy), ls_service_(ls_service), self_addr_(self_addr), + : rpc_proxy_(rpc_proxy), log_rpc_proxy_(log_rpc_proxy), ls_service_(ls_service), self_addr_(self_addr), gc_seq_(gc_seq), gc_candidates_(gc_candidates), ret_value_(common::OB_SUCCESS) {} ~QueryLSIsValidMemberFunctor() {} public: @@ -85,7 +86,9 @@ public: return common::OB_SUCCESS == ret_value_; } int get_ret_value() const { return ret_value_; } + TO_STRING_KV(K(self_addr_), K(gc_seq_)); private: + int remove_self_from_learnerlist_(const ObAddr &leader, ObLS *ls); int handle_ls_array_(const ObAddr &leader, const ObLSArray &ls_array); int handle_rpc_response_(const ObAddr &leader, @@ -93,6 +96,7 @@ private: int try_renew_location_(const ObLSArray &ls_array); private: obrpc::ObSrvRpcProxy *rpc_proxy_; + obrpc::ObLogServiceRpcProxy *log_rpc_proxy_; ObLSService *ls_service_; common::ObAddr self_addr_; int64_t gc_seq_; @@ -102,6 +106,38 @@ private: DISALLOW_COPY_AND_ASSIGN(QueryLSIsValidMemberFunctor); }; +int ObGarbageCollector::QueryLSIsValidMemberFunctor::remove_self_from_learnerlist_(const ObAddr &leader, ObLS *ls) +{ + int ret = OB_SUCCESS; + const ObLSID &ls_id = ls->get_ls_id(); + const int64_t TIMEOUT_US = 10 * 1000 * 1000L; + LogGetPalfStatReq get_palf_stat_req(self_addr_, ls_id.id(), true/*is_to_leader*/); + LogGetPalfStatResp get_palf_stat_resp; + if (OB_FAIL(log_rpc_proxy_->to(leader) + .by(MTL_ID()) + .timeout(TIMEOUT_US) + .max_process_handler_time(TIMEOUT_US) + .get_palf_stat(get_palf_stat_req, get_palf_stat_resp))) { + CLOG_LOG(WARN, "get_palf_stat failed", K(ls_id), K(leader), K(get_palf_stat_req)); + } else { + const common::GlobalLearnerList &learner_list = get_palf_stat_resp.palf_stat_.learner_list_; + ObMember member; + if (OB_FAIL(learner_list.get_learner_by_addr(self_addr_, member))) { + if (OB_ENTRY_NOT_EXIST == ret) 
{ + ret = OB_SUCCESS; + CLOG_LOG(INFO, "self is not in learnerlist", KPC(this), K(leader), K(learner_list), K(ls_id)); + } else { + CLOG_LOG(WARN, "failed to get_learner_by_addr", KPC(this), K(leader), K(learner_list), K(ls_id)); + } + } else if (OB_FAIL(ls->remove_learner(member, TIMEOUT_US))) { + CLOG_LOG(WARN, "failed to remove_learner", KPC(this), K(leader), K(learner_list), K(ls_id), K(member)); + } else { + CLOG_LOG(INFO, "learner is removed from leader", KPC(this), K(leader), K(learner_list), K(ls_id), K(member)); + } + } + return ret; +} + int ObGarbageCollector::QueryLSIsValidMemberFunctor::handle_ls_array_(const ObAddr &leader, const ObLSArray &ls_array) { @@ -161,11 +197,13 @@ int ObGarbageCollector::QueryLSIsValidMemberFunctor::handle_rpc_response_(const const ObLSArray &ls_array = response.ls_array_; const common::ObSEArray &candidates_status = response.candidates_status_; const common::ObSEArray &ret_array = response.ret_array_; + const common::ObSEArray &gc_stat_array = response.gc_stat_array_; if (ls_array.count() != candidates_status.count() - || ls_array.count() != ret_array.count()) { + || ls_array.count() != ret_array.count() + || ((gc_stat_array.count() > 0) && (gc_stat_array.count() != ls_array.count()))) { ret = OB_ERR_UNEXPECTED; - CLOG_LOG(ERROR, "response count not match, unexpected", K(ret), K(leader)); + CLOG_LOG(ERROR, "response count not match, unexpected", K(ret), K(leader), K(response)); } else { for (int64_t index = 0; OB_SUCC(ret) && index < ls_array.count(); index++) { ObLSHandle handle; @@ -173,6 +211,8 @@ int ObGarbageCollector::QueryLSIsValidMemberFunctor::handle_rpc_response_(const ObGCHandler *gc_handler = NULL; const ObLSID &id = ls_array[index]; const bool is_valid_member = candidates_status[index]; + const obrpc::LogMemberGCStat member_gc_stat = gc_stat_array.count() > 0 ? 
+ gc_stat_array[index] : obrpc::LogMemberGCStat::LOG_MEMBER_NORMAL_GC_STAT; bool need_gc = false; if (OB_SUCCESS != ret_array[index]) { CLOG_LOG(INFO, "remote_ret_code is not success, need renew location", K(id), K(leader), @@ -200,7 +240,6 @@ int ObGarbageCollector::QueryLSIsValidMemberFunctor::handle_rpc_response_(const candidate.ls_id_ = id; candidate.ls_status_ = LSStatus::LS_NEED_GC; candidate.gc_reason_ = NOT_IN_LEADER_MEMBER_LIST; - if (OB_FAIL(gc_candidates_.push_back(candidate))) { CLOG_LOG(WARN, "gc_candidates push_back failed", K(ret), K(id), K(leader)); } else { @@ -210,7 +249,16 @@ int ObGarbageCollector::QueryLSIsValidMemberFunctor::handle_rpc_response_(const CLOG_LOG(INFO, "gc_check_invalid_member_seq set seq", K(tmp_ret), K(id), K(leader), K(gc_seq_), K(need_gc)); } } else { - CLOG_LOG(INFO, "GC check ls in member list, skip it", K(id), K(leader)); + //is valid member, check member_gc_stat + if (obrpc::LogMemberGCStat::LOG_MEMBER_NORMAL_GC_STAT == member_gc_stat) { + CLOG_LOG(INFO, "GC check ls in member list, skip it", K(id), K(leader)); + } else if (obrpc::LogMemberGCStat::LOG_LEARNER_IN_MIGRATING == member_gc_stat) { + if (OB_SUCCESS != (tmp_ret = remove_self_from_learnerlist_(leader, ls))) { + CLOG_LOG(WARN, "failed to remove self from learnerlist", K(tmp_ret), K(id), K(leader)); + } + } else { + CLOG_LOG(ERROR, "invalid member_gc_stat,", K(id), K(leader), K(member_gc_stat)); + } } } } @@ -1010,6 +1058,7 @@ ObGarbageCollector::ObGarbageCollector() : is_inited_(false), ls_service_(NULL), rpc_proxy_(NULL), sql_proxy_(NULL), + log_rpc_proxy_(NULL), self_addr_(), seq_(1), safe_destroy_handler_(), @@ -1024,16 +1073,25 @@ ObGarbageCollector::~ObGarbageCollector() int ObGarbageCollector::mtl_init(ObGarbageCollector* &gc_service) { + int ret = OB_SUCCESS; ObLSService *ls_service = MTL(ObLSService*); + ObLogService *log_service = MTL(ObLogService*); obrpc::ObSrvRpcProxy *rpc_proxy = GCTX.srv_rpc_proxy_; common::ObMySQLProxy *sql_proxy = GCTX.sql_proxy_; 
const common::ObAddr self_addr = GCTX.self_addr(); - return gc_service->init(ls_service, rpc_proxy, sql_proxy, self_addr); + if (OB_ISNULL(log_service)) { + ret = OB_ERR_UNEXPECTED; + CLOG_LOG(ERROR, "ObLogService is NULL"); + } else{ + ret = gc_service->init(ls_service, rpc_proxy, sql_proxy,log_service->get_rpc_proxy(), self_addr); + } + return ret; } int ObGarbageCollector::init(ObLSService *ls_service, obrpc::ObSrvRpcProxy *rpc_proxy, common::ObMySQLProxy *sql_proxy, + obrpc::ObLogServiceRpcProxy *log_rpc_proxy, const common::ObAddr &self_addr) { int ret = OB_SUCCESS; @@ -1041,15 +1099,17 @@ int ObGarbageCollector::init(ObLSService *ls_service, ret = OB_INIT_TWICE; CLOG_LOG(WARN, "ObGarbageCollector is inited twice"); } else if (OB_ISNULL(ls_service) || OB_ISNULL(rpc_proxy) - || OB_ISNULL(sql_proxy) || !self_addr.is_valid()) { + || OB_ISNULL(sql_proxy) || OB_ISNULL(log_rpc_proxy)|| !self_addr.is_valid()) { ret = OB_INVALID_ARGUMENT; - CLOG_LOG(WARN, "invalid arguments", K(ret), KP(ls_service), KP(rpc_proxy), KP(sql_proxy), K(self_addr)); + CLOG_LOG(WARN, "invalid arguments", K(ret), KP(ls_service), KP(rpc_proxy), KP(sql_proxy), + KP(log_rpc_proxy), K(self_addr)); } else if (OB_FAIL(safe_destroy_handler_.init())) { CLOG_LOG(WARN, "safe destroy handler init failed", K(ret)); } else { ls_service_ = ls_service; rpc_proxy_ = rpc_proxy; sql_proxy_ = sql_proxy; + log_rpc_proxy_ = log_rpc_proxy; self_addr_ = self_addr; seq_ = 1; is_inited_ = true; @@ -1104,6 +1164,7 @@ void ObGarbageCollector::destroy() ls_service_ = NULL; rpc_proxy_ = NULL; sql_proxy_ = NULL; + log_rpc_proxy_ = NULL; self_addr_.reset(); safe_destroy_handler_.destroy(); } @@ -1297,7 +1358,7 @@ int ObGarbageCollector::handle_each_ls_for_member_list_(ServerLSMap &server_ls_m ObGCCandidateArray &gc_candidates) { int ret = OB_SUCCESS; - QueryLSIsValidMemberFunctor functor(rpc_proxy_, ls_service_, self_addr_, seq_, gc_candidates); + QueryLSIsValidMemberFunctor functor(rpc_proxy_, log_rpc_proxy_, ls_service_, 
self_addr_, seq_, gc_candidates); if (OB_SUCCESS != server_ls_map.for_each(functor)) { ret = functor.get_ret_value(); CLOG_LOG(WARN, "handle_each_ls_for_member_list_ failed", K(ret)); diff --git a/src/logservice/ob_garbage_collector.h b/src/logservice/ob_garbage_collector.h index dfbd8f95d..bf81ad662 100644 --- a/src/logservice/ob_garbage_collector.h +++ b/src/logservice/ob_garbage_collector.h @@ -24,6 +24,7 @@ #include "storage/tx_storage/ob_safe_destroy_handler.h" #include "logservice/ob_log_base_header.h" #include "logservice/ob_append_callback.h" +#include "logservice/logrpc/ob_log_rpc_proxy.h" namespace oceanbase { @@ -140,6 +141,7 @@ public: int init(storage::ObLSService *ls_service, obrpc::ObSrvRpcProxy *rpc_proxy, common::ObMySQLProxy *sql_proxy, + obrpc::ObLogServiceRpcProxy *log_rpc_proxy, const common::ObAddr &self_addr); int start(); void stop(); @@ -221,6 +223,7 @@ private: storage::ObLSService *ls_service_; obrpc::ObSrvRpcProxy *rpc_proxy_; common::ObMySQLProxy *sql_proxy_; + obrpc::ObLogServiceRpcProxy *log_rpc_proxy_; common::ObAddr self_addr_; int64_t seq_; storage::ObSafeDestroyHandler safe_destroy_handler_; diff --git a/src/logservice/ob_log_handler.cpp b/src/logservice/ob_log_handler.cpp index 673818d91..dbabe2e54 100755 --- a/src/logservice/ob_log_handler.cpp +++ b/src/logservice/ob_log_handler.cpp @@ -33,6 +33,7 @@ namespace oceanbase { using namespace share; +using namespace obrpc; namespace logservice { using namespace palf; @@ -128,7 +129,7 @@ int ObLogHandler::stop() tg.click("wrlock succ"); if (IS_INIT) { is_in_stop_state_ = true; - common::ObSpinLockGuard deps_guard(deps_lock_); + common::TCWLockGuard deps_guard(deps_lock_); //unregister_file_size_cb不能在apply status锁内, 可能会导致死锁 apply_status_->unregister_file_size_cb(); tg.click("unreg cb end"); @@ -184,7 +185,7 @@ void ObLogHandler::destroy() is_inited_ = false; is_offline_ = false; is_in_stop_state_ = true; - common::ObSpinLockGuard deps_guard(deps_lock_); + common::TCWLockGuard 
deps_guard(deps_lock_); apply_service_->revert_apply_status(apply_status_); apply_status_ = NULL; apply_service_ = NULL; @@ -560,9 +561,13 @@ int ObLogHandler::change_replica_num(const common::ObMemberList &member_list, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(member_list), K(curr_replica_num), + K(new_replica_num), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!member_list.is_valid() || @@ -594,13 +599,12 @@ int ObLogHandler::change_replica_num(const common::ObMemberList &member_list, int ObLogHandler::force_set_as_single_replica() { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + common::TCWLockGuard deps_guard(deps_lock_); if (IS_NOT_INIT) { ret = OB_NOT_INIT; } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else { - common::ObMember dummy_member; common::ObMemberList dummy_member_list; int64_t dummy_replica_num = -1, new_replica_num = 1; const int64_t timeout_us = 10 * 1000 * 1000L; @@ -610,6 +614,8 @@ int ObLogHandler::force_set_as_single_replica() LogConfigChangeCmdResp resp; if (OB_FAIL(cmd_handler.handle_config_change_cmd(req, resp))) { CLOG_LOG(WARN, "handle_config_change_cmd failed", KR(ret), K_(id)); + } else { + CLOG_LOG(INFO, "force_set_as_single_replica success", KR(ret), K_(id), K_(self)); } } return ret; @@ -627,9 +633,13 @@ int ObLogHandler::add_member(const common::ObMember &added_member, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if 
(IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(added_member), K(new_replica_num), + K(config_version), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!added_member.is_valid() || @@ -663,9 +673,13 @@ int ObLogHandler::remove_member(const common::ObMember &removed_member, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(removed_member), + K(new_replica_num), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!removed_member.is_valid() || @@ -698,9 +712,13 @@ int ObLogHandler::replace_member(const common::ObMember &added_member, const palf::LogConfigVersion &config_version, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(added_member), K(removed_member), + K(config_version), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!added_member.is_valid() || @@ -721,6 +739,42 @@ int ObLogHandler::replace_member(const common::ObMember &added_member, return ret; } +int ObLogHandler::replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const palf::LogConfigVersion &config_version, + const int64_t timeout_us) +{ + int ret = OB_SUCCESS; + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + 
timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(added_member), K(removed_member), + K(config_version), K(timeout_us)); + } else if (is_in_stop_state_) { + ret = OB_NOT_RUNNING; + } else if (!added_member.is_valid() || + !removed_member.is_valid() || + false == config_version.is_valid() || + timeout_us <= 0) { + ret = OB_INVALID_ARGUMENT; + CLOG_LOG(WARN, "invalid argument", KR(ret), K_(id), K(added_member), + K(removed_member), K(config_version), K(timeout_us)); + } else { + LogConfigChangeCmd replace_req(self_, id_, added_member, removed_member, 0, + REPLACE_MEMBER_WITH_LEARNER_CMD, timeout_us); + replace_req.in_leader(config_version); + if (OB_FAIL(submit_config_change_cmd_(replace_req))) { + CLOG_LOG(WARN, " submit_config_change_cmd failed", KR(ret), K_(id), K(replace_req), K(timeout_us)); + } else { + CLOG_LOG(INFO, "replace_member_with_learner success", KR(ret), K_(id), K(added_member), + K(removed_member), K(config_version), K(timeout_us)); + } + } + return ret; +} + // @desc: add_learner interface // | 1.add_learner() // V @@ -731,9 +785,12 @@ int ObLogHandler::add_learner(const common::ObMember &added_learner, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(added_learner), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!added_learner.is_valid() || @@ -746,7 +803,7 @@ int ObLogHandler::add_learner(const common::ObMember &added_learner, if (OB_FAIL(submit_config_change_cmd_(req))) { CLOG_LOG(WARN, " submit_config_change_cmd failed", 
KR(ret), K_(id), K(req), K(timeout_us)); } else { - CLOG_LOG(INFO, "add_member success", KR(ret), K_(id), K(added_learner)); + CLOG_LOG(INFO, "add_learner success", KR(ret), K_(id), K(added_learner)); } } return ret; @@ -762,9 +819,12 @@ int ObLogHandler::remove_learner(const common::ObMember &removed_learner, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(removed_learner), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!removed_learner.is_valid() || @@ -777,38 +837,38 @@ int ObLogHandler::remove_learner(const common::ObMember &removed_learner, if (OB_FAIL(submit_config_change_cmd_(req))) { CLOG_LOG(WARN, " submit_config_change_cmd failed", KR(ret), K_(id), K(req), K(timeout_us)); } else { - CLOG_LOG(INFO, "add_member success", KR(ret), K_(id), K(removed_learner)); + CLOG_LOG(INFO, "remove_learner success", KR(ret), K_(id), K(removed_learner)); } } return ret; } -int ObLogHandler::replace_learner(const common::ObMember &added_learner, - const common::ObMember &removed_learner, - const int64_t timeout_us) +int ObLogHandler::replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(added_learners), + K(removed_learners), K(timeout_us)); } else if (is_in_stop_state_) { ret = 
OB_NOT_RUNNING; - } else if (!added_learner.is_valid() || - !removed_learner.is_valid() || + } else if (!added_learners.is_valid() || + !removed_learners.is_valid() || timeout_us <= 0) { ret = OB_INVALID_ARGUMENT; - CLOG_LOG(WARN, "invalid argument", KR(ret), K_(id), K(added_learner), K(removed_learner), K(timeout_us)); + CLOG_LOG(WARN, "invalid argument", KR(ret), K_(id), K(added_learners), K(removed_learners), K(timeout_us)); } else { - common::ObMember dummy_member; - LogConfigChangeCmd add_req(self_, id_, added_learner, dummy_member, 0, ADD_LEARNER_CMD, timeout_us); - LogConfigChangeCmd remove_req(self_, id_, dummy_member, removed_learner, 0, REMOVE_LEARNER_CMD, timeout_us); - if (OB_FAIL(submit_config_change_cmd_(add_req))) { - CLOG_LOG(WARN, " submit_config_change_cmd failed", KR(ret), K_(id), K(add_req), K(timeout_us)); - } else if (OB_FAIL(submit_config_change_cmd_(remove_req))) { - CLOG_LOG(WARN, " submit_config_change_cmd failed", KR(ret), K_(id), K(remove_req), K(timeout_us)); + LogConfigChangeCmd replace_req(self_, id_, added_learners, removed_learners, REPLACE_LEARNERS_CMD, timeout_us); + if (OB_FAIL(submit_config_change_cmd_(replace_req))) { + CLOG_LOG(WARN, " submit_config_change_cmd failed", KR(ret), K_(id), K(replace_req), K(timeout_us)); } else { - CLOG_LOG(INFO, "replace_learner success", KR(ret), K_(id), K(added_learner), K(removed_learner), K(timeout_us)); + CLOG_LOG(INFO, "replace_learners success", KR(ret), K_(id), K(added_learners), K(removed_learners), K(timeout_us)); } } return ret; @@ -826,9 +886,13 @@ int ObLogHandler::switch_learner_to_acceptor(const common::ObMember &learner, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", 
KR(ret), K_(id), K(learner), K(new_replica_num), + K(config_version), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!learner.is_valid() || @@ -860,9 +924,13 @@ int ObLogHandler::switch_acceptor_to_learner(const common::ObMember &member, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(member), + K(new_replica_num), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (!member.is_valid() || @@ -875,7 +943,7 @@ int ObLogHandler::switch_acceptor_to_learner(const common::ObMember &member, if (OB_FAIL(submit_config_change_cmd_(req))) { CLOG_LOG(WARN, " submit_config_change_cmd failed", KR(ret), K_(id), K(req), K(timeout_us)); } else { - CLOG_LOG(INFO, "add_member success", KR(ret), K_(id), K(member), K(new_replica_num)); + CLOG_LOG(INFO, "switch_acceptor_to_learner success", KR(ret), K_(id), K(member), K(new_replica_num)); } } return ret; @@ -886,9 +954,12 @@ int ObLogHandler::try_lock_config_change(const int64_t lock_owner, const int64_t { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(lock_owner), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER == lock_owner || timeout_us <= 0) { @@ -908,9 +979,12 @@ int ObLogHandler::try_lock_config_change(const int64_t lock_owner, const int64_t int 
ObLogHandler::unlock_config_change(const int64_t lock_owner, const int64_t timeout_us) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + timeout_us / 2; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(lock_owner), K(timeout_us)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else if (palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER == lock_owner || timeout_us <= 0) { @@ -930,13 +1004,16 @@ int ObLogHandler::unlock_config_change(const int64_t lock_owner, const int64_t t int ObLogHandler::get_config_change_lock_stat(int64_t &lock_owner, bool &is_locked) { int ret = OB_SUCCESS; - common::ObSpinLockGuard deps_guard(deps_lock_); + const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s + const int64_t abs_timeout_us = common::ObTimeUtility::current_time() + CONFIG_CHANGE_TIMEOUT; + WLockGuardWithTimeout deps_guard(deps_lock_, abs_timeout_us, ret); if (IS_NOT_INIT) { ret = OB_NOT_INIT; + } else if (OB_FAIL(ret)) { + CLOG_LOG(WARN, "get_lock failed", KR(ret), K_(id), K(lock_owner), K(is_locked)); } else if (is_in_stop_state_) { ret = OB_NOT_RUNNING; } else { - const int64_t CONFIG_CHANGE_TIMEOUT = 10 * 1000 * 1000L; // 10s LogConfigChangeCmd req(self_, id_, palf::OB_INVALID_CONFIG_CHANGE_LOCK_OWNER/*unused*/, GET_CONFIG_CHANGE_LOCK_STAT_CMD, CONFIG_CHANGE_TIMEOUT/*timeout_us*/); LogConfigChangeCmdResp resp; @@ -1058,8 +1135,9 @@ int ObLogHandler::submit_config_change_cmd_(const LogConfigChangeCmd &req, return ret; } -int ObLogHandler::is_valid_member(const common::ObAddr &addr, - bool &is_valid) const +int ObLogHandler::get_member_gc_stat(const common::ObAddr &addr, + bool &is_valid_member, + LogMemberGCStat &stat) const { int ret = OB_SUCCESS; common::ObRole role; @@ -1070,6 +1148,8 @@ int ObLogHandler::is_valid_member(const 
common::ObAddr &addr, int64_t paxos_replica_num = 0; GlobalLearnerList learner_list; bool is_pending_state = false; + is_valid_member = true; + stat = LogMemberGCStat::LOG_MEMBER_NORMAL_GC_STAT; RLockGuard guard(lock_); if (IS_NOT_INIT) { ret = OB_NOT_INIT; @@ -1094,12 +1174,21 @@ int ObLogHandler::is_valid_member(const common::ObAddr &addr, CLOG_LOG(ERROR, "get_role failed", K(ret), KPC(this)); } else { if (role == new_role && proposal_id == new_proposal_id) { - is_valid = member_list.contains(addr) || learner_list.contains(addr); + is_valid_member = member_list.contains(addr) || learner_list.contains(addr); + if (learner_list.contains(addr)) { + ObMember member; + if (OB_FAIL(learner_list.get_learner_by_addr(addr, member))) { + CLOG_LOG(ERROR, "failed to get_learner_by_addr", K(learner_list), K(addr), KPC(this)); + } else { + stat = member.is_migrating() ? LogMemberGCStat::LOG_LEARNER_IN_MIGRATING : LogMemberGCStat::LOG_MEMBER_NORMAL_GC_STAT; + } + } } else { ret = OB_NOT_MASTER; CLOG_LOG(INFO, "role changed during is_valid_member", K(ret), KPC(this), K(role), K(new_role), K(proposal_id), K(new_proposal_id)); } + CLOG_LOG(INFO, "get_member_gc_stat", K(is_valid_member), K(stat), K(member_list), K(learner_list), K(addr), KPC(this)); } return ret; } diff --git a/src/logservice/ob_log_handler.h b/src/logservice/ob_log_handler.h index 09bb9e77b..df8965837 100755 --- a/src/logservice/ob_log_handler.h +++ b/src/logservice/ob_log_handler.h @@ -127,6 +127,13 @@ public: const int64_t timeout_us) = 0; virtual int add_learner(const common::ObMember &added_learner, const int64_t timeout_us) = 0; virtual int remove_learner(const common::ObMember &removed_learner, const int64_t timeout_us) = 0; + virtual int replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) = 0; + virtual int replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const 
palf::LogConfigVersion &config_version, + const int64_t timeout_us) = 0; virtual int switch_learner_to_acceptor(const common::ObMember &learner, const int64_t paxos_replica_num, const palf::LogConfigVersion &config_version, @@ -134,9 +141,6 @@ public: virtual int switch_acceptor_to_learner(const common::ObMember &member, const int64_t paxos_replica_num, const int64_t timeout_us) = 0; - virtual int replace_learner(const common::ObMember &added_learner, - const common::ObMember &removed_learner, - const int64_t timeout_us) = 0; virtual int try_lock_config_change(const int64_t lock_owner, const int64_t timeout_us) = 0; virtual int unlock_config_change(const int64_t lock_owner, const int64_t timeout_us) = 0; virtual int get_config_change_lock_stat(int64_t &lock_owner, bool &is_locked) = 0; @@ -146,7 +150,7 @@ public: virtual int disable_sync() = 0; virtual bool is_sync_enabled() const = 0; virtual int advance_base_info(const palf::PalfBaseInfo &palf_base_info, const bool is_rebuild) = 0; - virtual int is_valid_member(const common::ObAddr &addr, bool &is_valid) const = 0; + virtual int get_member_gc_stat(const common::ObAddr &addr, bool &is_valid_member, obrpc::LogMemberGCStat &stat) const = 0; virtual void wait_append_sync() = 0; virtual int enable_replay(const palf::LSN &initial_lsn, const share::SCN &initial_scn) = 0; virtual int disable_replay() = 0; @@ -424,7 +428,7 @@ public: const int64_t timeout_us) override final; // @brief, replace old_member with new_member, can be called in any member - // @param[in] const common::ObMember &added_member: member wil be added + // @param[in] const common::ObMember &added_member: member will be added // @param[in] const common::ObMember &removed_member: member will be removed // @param[in] const palf::LogConfigVersion &config_version: config_version for leader checking // @param[in] const int64_t timeout_us @@ -448,18 +452,33 @@ public: int add_learner(const common::ObMember &added_learner, const int64_t timeout_us) override 
final; - // @brief, replace removed_learner with added_learner, can be called in any member - // @param[in] const common::ObMember &added_learner: learner wil be added - // @param[in] const common::ObMember &removed_learner: learner will be removed + // @brief, replace removed_learners with added_learners, can be called in any member + // @param[in] const common::ObMemberList &added_learners: learners will be added + // @param[in] const common::ObMemberList &removed_learners: learners will be removed // @param[in] const int64_t timeout_us // @return // - OB_SUCCESS: replace learner successfully // - OB_INVALID_ARGUMENT: invalid argumemt or not supported config change // - OB_TIMEOUT: replace learner timeout // - other: bug - int replace_learner(const common::ObMember &added_learner, - const common::ObMember &removed_learner, - const int64_t timeout_us) override final; + int replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) override final; + + // @brief, replace removed_member with learner, can be called in any member + // @param[in] const common::ObMember &added_member: member will be added + // @param[in] const common::ObMember &removed_member: member will be removed + // @param[in] const palf::LogConfigVersion &config_version: config_version for leader checking + // @param[in] const int64_t timeout_us + // @return + // - OB_SUCCESS: replace member successfully + // - OB_INVALID_ARGUMENT: invalid argumemt or not supported config change + // - OB_TIMEOUT: replace member timeout + // - other: bug + int replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const palf::LogConfigVersion &config_version, + const int64_t timeout_us) override final; // @brief: remove a learner(read only replica) in this cluster // @param[in] const common::ObMember &removed_learner: learner will be removed @@ -539,8 +558,9 @@ public: // @breif, 
check request server is in self member list // @param[in] const common::ObAddr, request server. - // @param[out] bool&, whether in self member list. - int is_valid_member(const common::ObAddr &addr, bool &is_valid) const override final; + // @param[out] is_valid_member&, whether in member list or learner list. + // @param[out] obrpc::LogMemberGCStat&, gc stat like learner in migrating. + int get_member_gc_stat(const common::ObAddr &addr, bool &is_valid_member, obrpc::LogMemberGCStat &stat) const override final; // @breif, wait cb append onto apply service Qsync // protect submit log and push cb in Qsync guard void wait_append_sync() override final; @@ -580,12 +600,13 @@ public: int diagnose(LogHandlerDiagnoseInfo &diagnose_info) const; int diagnose_palf(palf::PalfDiagnoseInfo &diagnose_info) const; - TO_STRING_KV(K_(role), K_(proposal_id), KP(palf_env_), K(is_in_stop_state_), K(is_inited_)); + TO_STRING_KV(K_(role), K_(proposal_id), KP(palf_env_), K(is_in_stop_state_), K(is_inited_), K(id_)); int offline() override final; int online(const palf::LSN &lsn, const share::SCN &scn) override final; bool is_offline() const override final; private: static constexpr int64_t MIN_CONN_TIMEOUT_US = 5 * 1000 * 1000; // 5s + typedef common::TCRWLock::WLockGuardWithTimeout WLockGuardWithTimeout; private: int submit_config_change_cmd_(const LogConfigChangeCmd &req); int submit_config_change_cmd_(const LogConfigChangeCmd &req, @@ -598,7 +619,8 @@ private: ObLogApplyService *apply_service_; ObLogReplayService *replay_service_; ObRoleChangeService *rc_service_; - ObSpinLock deps_lock_; + // Note: using TCRWLock for using WLockGuardWithTimeout + common::TCRWLock deps_lock_; mutable palf::PalfLocationCacheCb *lc_cb_; mutable obrpc::ObLogServiceRpcProxy *rpc_proxy_; common::ObQSync ls_qs_; diff --git a/src/logservice/palf/log_config_mgr.cpp b/src/logservice/palf/log_config_mgr.cpp index 6922d60a6..675e15a40 100755 --- a/src/logservice/palf/log_config_mgr.cpp +++ 
b/src/logservice/palf/log_config_mgr.cpp @@ -1090,8 +1090,12 @@ int LogConfigMgr::check_config_change_args_by_type_(const LogConfigChangeArgs &a const common::GlobalLearnerList &curr_learner_list = log_ms_meta_.curr_.config_.learnerlist_; const common::GlobalLearnerList &degraded_learnerlist = log_ms_meta_.curr_.config_.degraded_learnerlist_; const common::ObMember &member = args.server_; + const common::ObMember member_wo_flag = common::ObMember(args.server_.get_server(), \ + args.server_.get_timestamp()); const int64_t new_replica_num = args.new_replica_num_; - const bool is_in_log_sync_memberlist = log_sync_member_list.contains(member); + // Note: for reentrancy of SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM, we check if the member + // without the flag is in the log_sync_memberlist + const bool is_in_log_sync_memberlist = log_sync_member_list.contains(member_wo_flag); const bool is_in_degraded_learnerlist = degraded_learnerlist.contains(member); const bool is_in_learnerlist = curr_learner_list.contains(member); const bool is_arb_replica = (log_ms_meta_.curr_.config_.arbitration_member_ == member); @@ -1239,6 +1243,7 @@ int LogConfigMgr::check_config_change_args_by_type_(const LogConfigChangeArgs &a break; } case SWITCH_LEARNER_TO_ACCEPTOR: + case SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM: case UPGRADE_LEARNER_TO_ACCEPTOR: { if ((is_in_degraded_learnerlist || is_in_learnerlist) && is_in_log_sync_memberlist) { @@ -1248,12 +1253,12 @@ int LogConfigMgr::check_config_change_args_by_type_(const LogConfigChangeArgs &a if (UPGRADE_LEARNER_TO_ACCEPTOR == args.type_ && true == is_in_learnerlist) { ret = OB_INVALID_ARGUMENT; PALF_LOG(WARN, "can not upgrade a normal learner", KR(ret), K_(palf_id), K_(self), K_(log_ms_meta), K(args)); - } else if (SWITCH_LEARNER_TO_ACCEPTOR == args.type_ && true == is_in_degraded_learnerlist) { + } else if (UPGRADE_LEARNER_TO_ACCEPTOR != args.type_ && true == is_in_degraded_learnerlist) { ret = OB_INVALID_ARGUMENT; - PALF_LOG(WARN, "can not upgrade a normal 
learner", KR(ret), K_(palf_id), K_(self), K_(log_ms_meta), K(args)); + PALF_LOG(WARN, "can not switch a degraded learner to member", KR(ret), K_(palf_id), K_(self), K_(log_ms_meta), K(args)); } } else if (!is_in_learnerlist && !is_in_degraded_learnerlist && is_in_log_sync_memberlist) { - if (args.type_ == UPGRADE_LEARNER_TO_ACCEPTOR || new_replica_num == curr_replica_num) { + if (args.type_ != SWITCH_LEARNER_TO_ACCEPTOR || new_replica_num == curr_replica_num) { is_already_finished = true; PALF_LOG(INFO, "learner_to_acceptor is already finished", KR(ret), K_(palf_id), K_(self), K_(log_ms_meta), K(member)); } else { @@ -1352,6 +1357,38 @@ int LogConfigMgr::check_config_change_args_by_type_(const LogConfigChangeArgs &a } break; } + case REPLACE_LEARNERS: + { + bool all_added_in_learnerlist = true; + bool all_removed_not_in_learnerlist = true; + for (int i = 0; OB_SUCC(ret) && i < args.added_list_.get_member_number(); i++) { + common::ObMember member; + if (OB_FAIL(args.added_list_.get_member_by_index(i, member))) { + PALF_LOG(WARN, "get_member_by_index failed", KR(ret), K_(palf_id), K_(self), K(member), K(args)); + } else if (true == curr_member_list.contains(member.get_server())) { + ret = OB_INVALID_ARGUMENT; + PALF_LOG(WARN, "server is already in memberlist, can not replace_learners", KR(ret), + K_(palf_id), K_(self), K_(log_ms_meta), K(args)); + } else if (false == curr_learner_list.contains(member)) { + all_added_in_learnerlist = false; + break; + } + } + for (int i = 0; OB_SUCC(ret) && i < args.removed_list_.get_member_number(); i++) { + common::ObMember member; + if (OB_FAIL(args.removed_list_.get_member_by_index(i, member))) { + PALF_LOG(WARN, "get_member_by_index failed", KR(ret), K_(palf_id), K_(self), K(member), K(args)); + } else if (true == curr_learner_list.contains(member)) { + all_removed_not_in_learnerlist = false; + break; + } + } + is_already_finished = OB_SUCC(ret) && all_added_in_learnerlist && all_removed_not_in_learnerlist; + if 
(is_already_finished) { + PALF_LOG(INFO, "replace_learners is already finished", KR(ret), K_(palf_id), K_(self), K_(log_ms_meta), K(args)); + } + break; + } default: { ret = OB_INVALID_ARGUMENT; @@ -1660,7 +1697,10 @@ int LogConfigMgr::generate_new_config_info_(const int64_t proposal_id, if (OB_SUCC(ret) && is_add_member_list(cc_type)) { // update log_sync_member_list or arb_member if (is_add_log_sync_member_list(args.type_)) { - if (OB_FAIL(new_config_info.config_.log_sync_memberlist_.add_member(member))) { + // Note: all members in log_sync_member_list must not be migrating status + common::ObMember added_log_sync_member = member; + added_log_sync_member.reset_migrating(); + if (OB_FAIL(new_config_info.config_.log_sync_memberlist_.add_member(added_log_sync_member))) { PALF_LOG(WARN, "new_member_list add_member failed", KR(ret), K_(palf_id), K_(self), K(args), K(new_config_info)); } } else { @@ -1693,6 +1733,16 @@ int LogConfigMgr::generate_new_config_info_(const int64_t proposal_id, if (OB_FAIL(new_config_info.config_.degraded_learnerlist_.add_learner(member))) { PALF_LOG(WARN, "new_learner_list add_learner failed", KR(ret), K_(palf_id), K_(self), K(args), K(new_config_info)); } + } else if (is_use_added_list(cc_type)) { + for (int i = 0; OB_SUCC(ret) && i < args.added_list_.get_member_number(); i++) { + common::ObMember added_learner; + if (OB_FAIL(args.added_list_.get_member_by_index(i, added_learner))) { + } else if (OB_FAIL(new_config_info.config_.learnerlist_.add_learner(added_learner)) && OB_ENTRY_EXIST != ret) { + PALF_LOG(WARN, "new_learner_list add_learner failed", KR(ret), K_(palf_id), K_(self), K(args), K(new_config_info)); + } else { + ret = OB_SUCCESS; + } + } } else { if (OB_FAIL(new_config_info.config_.learnerlist_.add_learner(member))) { PALF_LOG(WARN, "new_learner_list add_learner failed", KR(ret), K_(palf_id), K_(self), K(args), K(new_config_info)); @@ -1702,11 +1752,22 @@ int LogConfigMgr::generate_new_config_info_(const int64_t proposal_id, // 
learnerlist remove if (OB_SUCC(ret) && is_remove_learner_list(cc_type)) { if (UPGRADE_LEARNER_TO_ACCEPTOR == cc_type) { - if (OB_FAIL(new_config_info.config_.degraded_learnerlist_.remove_learner(member.get_server()))) { + if (OB_FAIL(new_config_info.config_.degraded_learnerlist_.remove_learner(member))) { PALF_LOG(WARN, "new_learner_list add_learner failed", KR(ret), K_(palf_id), K_(self), K(args), K(new_config_info)); } + } else if (is_use_removed_list(cc_type)) { + for (int i = 0; OB_SUCC(ret) && i < args.removed_list_.get_member_number(); i++) { + common::ObMember removed_learner; + if (OB_FAIL(args.removed_list_.get_member_by_index(i, removed_learner))) { + } else if (OB_FAIL(new_config_info.config_.learnerlist_.remove_learner(removed_learner)) && + OB_ENTRY_NOT_EXIST != ret) { + PALF_LOG(WARN, "new_learner_list remove_learner failed", KR(ret), K_(palf_id), K_(self), K(args), K(new_config_info)); + } else { + ret = OB_SUCCESS; + } + } } else { - if (OB_FAIL(new_config_info.config_.learnerlist_.remove_learner(member.get_server()))) { + if (OB_FAIL(new_config_info.config_.learnerlist_.remove_learner(member))) { PALF_LOG(WARN, "new_learner_list add_learner failed", KR(ret), K_(palf_id), K_(self), K(args), K(new_config_info)); } } @@ -1737,6 +1798,12 @@ int LogConfigMgr::generate_new_config_info_(const int64_t proposal_id, new_config_info.config_.log_sync_memberlist_.add_member(member); new_config_info.config_.log_sync_replica_num_ = args.new_replica_num_; } + // check if the new_config_info is valid + if (OB_SUCC(ret) && false == new_config_info.is_valid()) { + ret = OB_INVALID_ARGUMENT; + PALF_LOG(WARN, "generate_new_config_info_ failed", KR(ret), K_(palf_id), K_(self), K_(log_ms_meta), + K(proposal_id), K(args), K(new_config_info)); + } } return ret; } @@ -2041,7 +2108,8 @@ bool LogConfigChangeArgs::is_valid() const bool bool_ret = true; bool_ret = bool_ret && (type_ != INVALID_LOG_CONFIG_CHANGE_TYPE); bool_ret = bool_ret && ((is_add_member_list(type_) || 
is_remove_member_list(type_) || - is_add_learner_list(type_) || is_remove_learner_list(type_))? server_.is_valid(): true); + is_add_learner_list(type_) || is_remove_learner_list(type_))? \ + (server_.is_valid() || (added_list_.is_valid() && removed_list_.is_valid())): true); bool_ret = bool_ret && ((is_use_replica_num_args(type_))? is_valid_replica_num(new_replica_num_): true); bool_ret = bool_ret && ((type_ == CHANGE_REPLICA_NUM)? \ (curr_member_list_.is_valid() && is_valid_replica_num(curr_replica_num_) && is_valid_replica_num(new_replica_num_)): true); @@ -2063,6 +2131,8 @@ void LogConfigChangeArgs::reset() lock_owner_ = OB_INVALID_CONFIG_CHANGE_LOCK_OWNER; lock_type_ = ConfigChangeLockType::LOCK_NOTHING; type_ = INVALID_LOG_CONFIG_CHANGE_TYPE; + added_list_.reset(); + removed_list_.reset(); } int LogConfigMgr::check_follower_sync_status(const LogConfigChangeArgs &args, @@ -2641,14 +2711,18 @@ int LogConfigMgr::handle_register_parent_req(const LogLearner &child, const bool LogCandidateList candidate_list; LogLearnerList retired_children; RegisterReturn reg_ret = INVALID_REG_RET; + common::ObMember learner_in_list; + const int in_list_ret = all_learnerlist_.get_learner_by_addr(child.get_server(), learner_in_list); if (IS_NOT_INIT) { ret = OB_NOT_INIT; } else if (!child.is_valid() || child.register_time_us_ <= 0) { ret = OB_INVALID_ARGUMENT; PALF_LOG(WARN, "invalid argument", KR(ret), K_(palf_id), K_(self), K(child)); - } else if (is_to_leader && !all_learnerlist_.contains(child.get_server())) { + } else if (is_to_leader && (OB_SUCCESS != in_list_ret || learner_in_list.is_migrating())) { ret = OB_INVALID_ARGUMENT; - PALF_LOG(WARN, "registering child is not in global learner list", K_(palf_id), K_(self), K(child)); + // Note: do not register parent for migrating learners, because their logs may lag behind its parent + PALF_LOG(WARN, "registering child is not in learner list or is migrating", K_(palf_id), + K_(self), K(child), K(in_list_ret), K(learner_in_list)); } 
else { SpinLockGuard guard(child_lock_); int64_t idx = -1; diff --git a/src/logservice/palf/log_config_mgr.h b/src/logservice/palf/log_config_mgr.h index 76847a3dc..0209c9ef3 100755 --- a/src/logservice/palf/log_config_mgr.h +++ b/src/logservice/palf/log_config_mgr.h @@ -70,6 +70,8 @@ enum LogConfigChangeType FORCE_SINGLE_MEMBER, TRY_LOCK_CONFIG_CHANGE, UNLOCK_CONFIG_CHANGE, + REPLACE_LEARNERS, + SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM, }; inline const char *LogConfigChangeType2Str(const LogConfigChangeType state) @@ -92,6 +94,10 @@ inline const char *LogConfigChangeType2Str(const LogConfigChangeType state) CHECK_LOG_CONFIG_TYPE_STR(UPGRADE_LEARNER_TO_ACCEPTOR); CHECK_LOG_CONFIG_TYPE_STR(STARTWORKING); CHECK_LOG_CONFIG_TYPE_STR(FORCE_SINGLE_MEMBER); + CHECK_LOG_CONFIG_TYPE_STR(TRY_LOCK_CONFIG_CHANGE); + CHECK_LOG_CONFIG_TYPE_STR(UNLOCK_CONFIG_CHANGE); + CHECK_LOG_CONFIG_TYPE_STR(REPLACE_LEARNERS); + CHECK_LOG_CONFIG_TYPE_STR(SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM); default: return "Invalid"; } @@ -109,13 +115,14 @@ inline bool need_check_config_version(const LogConfigChangeType type) const bool is_cluster_already_4200 = GET_MIN_CLUSTER_VERSION() >= CLUSTER_VERSION_4_2_0_0; return (is_cluster_already_4200) && (ADD_MEMBER == type || ADD_MEMBER_AND_NUM == type || - SWITCH_LEARNER_TO_ACCEPTOR == type); + SWITCH_LEARNER_TO_ACCEPTOR == type || SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM == type); } inline bool is_add_log_sync_member_list(const LogConfigChangeType type) { return ADD_MEMBER == type || ADD_MEMBER_AND_NUM == type || - SWITCH_LEARNER_TO_ACCEPTOR == type || UPGRADE_LEARNER_TO_ACCEPTOR == type; + SWITCH_LEARNER_TO_ACCEPTOR == type || UPGRADE_LEARNER_TO_ACCEPTOR == type || + SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM == type; } inline bool is_remove_log_sync_member_list(const LogConfigChangeType type) @@ -141,12 +148,15 @@ inline bool is_arb_member_change_type(const LogConfigChangeType type) inline bool is_add_learner_list(const LogConfigChangeType type) { - return ADD_LEARNER == type || 
SWITCH_ACCEPTOR_TO_LEARNER == type || DEGRADE_ACCEPTOR_TO_LEARNER == type; + return ADD_LEARNER == type || SWITCH_ACCEPTOR_TO_LEARNER == type || + DEGRADE_ACCEPTOR_TO_LEARNER == type || REPLACE_LEARNERS == type; } inline bool is_remove_learner_list(const LogConfigChangeType type) { - return REMOVE_LEARNER == type || SWITCH_LEARNER_TO_ACCEPTOR == type || UPGRADE_LEARNER_TO_ACCEPTOR == type; + return REMOVE_LEARNER == type || SWITCH_LEARNER_TO_ACCEPTOR == type || + UPGRADE_LEARNER_TO_ACCEPTOR == type || SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM == type || + REPLACE_LEARNERS == type; } inline bool is_upgrade_or_degrade(const LogConfigChangeType type) @@ -188,6 +198,16 @@ inline bool is_unlock_config_change(const LogConfigChangeType type) return UNLOCK_CONFIG_CHANGE == type; } +inline bool is_use_added_list(const LogConfigChangeType type) +{ + return REPLACE_LEARNERS == type; +} + +inline bool is_use_removed_list(const LogConfigChangeType type) +{ + return REPLACE_LEARNERS == type; +} + struct LogConfigChangeArgs { public: @@ -200,7 +220,9 @@ public: ref_scn_(), lock_owner_(OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), lock_type_(ConfigChangeLockType::LOCK_NOTHING), - type_(INVALID_LOG_CONFIG_CHANGE_TYPE) { } + type_(INVALID_LOG_CONFIG_CHANGE_TYPE), + added_list_(), + removed_list_() { } LogConfigChangeArgs(const common::ObMember &server, const int64_t new_replica_num, @@ -208,14 +230,15 @@ public: const LogConfigChangeType type) : server_(server), curr_member_list_(), curr_replica_num_(0), new_replica_num_(new_replica_num), config_version_(config_version), ref_scn_(), lock_owner_(OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), - lock_type_(ConfigChangeLockType::LOCK_NOTHING), type_(type) { } + lock_type_(ConfigChangeLockType::LOCK_NOTHING), type_(type), + added_list_(), removed_list_() { } LogConfigChangeArgs(const common::ObMember &server, const int64_t new_replica_num, const LogConfigChangeType type) : server_(server), curr_member_list_(), curr_replica_num_(0), 
new_replica_num_(new_replica_num), config_version_(), ref_scn_(), lock_owner_(OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), - lock_type_(ConfigChangeLockType::LOCK_NOTHING), type_(type) { } + lock_type_(ConfigChangeLockType::LOCK_NOTHING), type_(type), added_list_(), removed_list_() { } LogConfigChangeArgs(const common::ObMemberList &member_list, const int64_t curr_replica_num, @@ -223,14 +246,23 @@ public: const LogConfigChangeType type) : server_(), curr_member_list_(member_list), curr_replica_num_(curr_replica_num), new_replica_num_(new_replica_num), config_version_(), ref_scn_(), lock_owner_(OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), - lock_type_(ConfigChangeLockType::LOCK_NOTHING), type_(type) { } + lock_type_(ConfigChangeLockType::LOCK_NOTHING), type_(type), added_list_(), removed_list_() { } LogConfigChangeArgs(const int64_t lock_owner, const int64_t lock_type, const LogConfigChangeType type) : server_(), curr_member_list_(), curr_replica_num_(0), new_replica_num_(), config_version_(), ref_scn_(), lock_owner_(lock_owner), - lock_type_(lock_type), type_(type) { } + lock_type_(lock_type), type_(type), added_list_(), removed_list_() { } + + LogConfigChangeArgs(const common::ObMemberList &added_list, + const common::ObMemberList &removed_list, + const LogConfigChangeType type) + : server_(), curr_member_list_(), curr_replica_num_(0), + new_replica_num_(0), config_version_(), ref_scn_(), + lock_owner_(OB_INVALID_CONFIG_CHANGE_LOCK_OWNER), + lock_type_(ConfigChangeLockType::LOCK_NOTHING), type_(type), + added_list_(added_list), removed_list_(removed_list) { } ~LogConfigChangeArgs() { @@ -239,34 +271,9 @@ public: bool is_valid() const; void reset(); - const char *Type2Str(const LogConfigChangeType state) const - { - #define CHECK_LOG_CONFIG_TYPE_STR(x) case(LogConfigChangeType::x): return #x - switch(state) - { - CHECK_LOG_CONFIG_TYPE_STR(CHANGE_REPLICA_NUM); - CHECK_LOG_CONFIG_TYPE_STR(ADD_MEMBER); - CHECK_LOG_CONFIG_TYPE_STR(ADD_ARB_MEMBER); - 
CHECK_LOG_CONFIG_TYPE_STR(REMOVE_MEMBER); - CHECK_LOG_CONFIG_TYPE_STR(REMOVE_ARB_MEMBER); - CHECK_LOG_CONFIG_TYPE_STR(ADD_MEMBER_AND_NUM); - CHECK_LOG_CONFIG_TYPE_STR(REMOVE_MEMBER_AND_NUM); - CHECK_LOG_CONFIG_TYPE_STR(ADD_LEARNER); - CHECK_LOG_CONFIG_TYPE_STR(REMOVE_LEARNER); - CHECK_LOG_CONFIG_TYPE_STR(SWITCH_LEARNER_TO_ACCEPTOR); - CHECK_LOG_CONFIG_TYPE_STR(SWITCH_ACCEPTOR_TO_LEARNER); - CHECK_LOG_CONFIG_TYPE_STR(DEGRADE_ACCEPTOR_TO_LEARNER); - CHECK_LOG_CONFIG_TYPE_STR(UPGRADE_LEARNER_TO_ACCEPTOR); - CHECK_LOG_CONFIG_TYPE_STR(STARTWORKING); - CHECK_LOG_CONFIG_TYPE_STR(TRY_LOCK_CONFIG_CHANGE); - CHECK_LOG_CONFIG_TYPE_STR(UNLOCK_CONFIG_CHANGE); - default: - return "Invalid"; - } - #undef CHECK_LOG_CONFIG_TYPE_STR - } TO_STRING_KV(K_(server), K_(curr_member_list), K_(curr_replica_num), K_(new_replica_num), - K_(config_version), K_(ref_scn), K_(lock_owner), K_(lock_type), "type", LogConfigChangeType2Str(type_)); + K_(config_version), K_(ref_scn), K_(lock_owner), K_(lock_type), K_(added_list), + K_(removed_list), "type", LogConfigChangeType2Str(type_)); common::ObMember server_; common::ObMemberList curr_member_list_; int64_t curr_replica_num_; @@ -276,6 +283,8 @@ public: int64_t lock_owner_; int64_t lock_type_; LogConfigChangeType type_; + common::ObMemberList added_list_; + common::ObMemberList removed_list_; }; struct LogReconfigBarrier diff --git a/src/logservice/palf/log_meta_info.cpp b/src/logservice/palf/log_meta_info.cpp index 6a3a506dd..738262f03 100755 --- a/src/logservice/palf/log_meta_info.cpp +++ b/src/logservice/palf/log_meta_info.cpp @@ -392,33 +392,26 @@ bool LogConfigInfo::is_valid() const config_version_.is_valid(); } +// IP addresses of all members and learners should be different bool LogConfigInfo::is_all_list_unique() const { int ret = OB_SUCCESS; - bool is_all_list_unique = true; + bool is_all_list_unique = false; GlobalLearnerList server_list; - server_list = learnerlist_; - if (OB_ENTRY_EXIST == (ret = 
server_list.append(degraded_learnerlist_))) { - is_all_list_unique = false; - PALF_LOG(WARN, "learnerlist_ should not overlap with degraded_learnerlist_", - K_(learnerlist), K_(degraded_learnerlist)); - } else if (arbitration_member_.is_valid() && - OB_ENTRY_EXIST == (ret = server_list.add_learner(arbitration_member_))) { - is_all_list_unique = false; - PALF_LOG(WARN, "learnerlist should not overlap with arb_member", - K_(learnerlist), K_(degraded_learnerlist), K_(arbitration_member)); + if (arbitration_member_.is_valid() && + OB_FAIL(server_list.add_learner(arbitration_member_))) { + PALF_LOG(WARN, "add_learner failed", K(server_list), K_(arbitration_member)); + } else if (OB_FAIL(check_list_unique(server_list, log_sync_memberlist_))) { + PALF_LOG(WARN, "serverlist should not overlap with log_sync_memberlist", + K_(arbitration_member), K_(log_sync_memberlist)); + } else if (OB_FAIL(check_list_unique(server_list, degraded_learnerlist_))) { + PALF_LOG(WARN, "serverlist should not overlap with log_sync_memberlist", + K_(arbitration_member), K_(log_sync_memberlist), K_(degraded_learnerlist)); + } else if (OB_FAIL(check_list_unique(server_list, learnerlist_))) { + PALF_LOG(WARN, "serverlist should not overlap with log_sync_memberlist", + K_(arbitration_member), K_(log_sync_memberlist), K_(degraded_learnerlist), K_(learnerlist)); } else { - for (int i = 0; i < log_sync_memberlist_.get_member_number(); i++) { - common::ObMember member; - if (OB_FAIL(log_sync_memberlist_.get_member_by_index(i, member))) { - PALF_LOG(WARN, "get_server_by_index failed", K_(log_sync_memberlist)); - } else if (OB_ENTRY_EXIST == (ret = server_list.add_learner(member))) { - is_all_list_unique = false; - PALF_LOG(WARN, "serverlist should not overlap with log_sync_member_list", K_(learnerlist), - K_(degraded_learnerlist), K_(log_sync_memberlist), K_(arbitration_member)); - break; - } - } + is_all_list_unique = true; } return is_all_list_unique; } diff --git a/src/logservice/palf/log_meta_info.h 
b/src/logservice/palf/log_meta_info.h index 45f8519b3..db0ca05eb 100644 --- a/src/logservice/palf/log_meta_info.h +++ b/src/logservice/palf/log_meta_info.h @@ -155,6 +155,25 @@ public: // learners which have been degraded from members common::GlobalLearnerList degraded_learnerlist_; LogConfigVersion config_version_; +private: + template <typename LIST> + int check_list_unique(GlobalLearnerList &server_list, + const LIST &list) const + { + int ret = OB_SUCCESS; + for (int i = 0; OB_SUCC(ret) && i < list.get_member_number(); i++) { + common::ObMember member; + if (OB_FAIL(list.get_member_by_index(i, member))) { + PALF_LOG(WARN, "get_server_by_index failed", K(list)); + } else if (server_list.contains(member.get_server())) { + ret = OB_INVALID_ARGUMENT; + PALF_LOG(WARN, "serverlist should not overlap with list", K(server_list), K(list)); + } else if (OB_FAIL(server_list.add_learner(member))) { + PALF_LOG(WARN, "add_learner failed", K(server_list), K(list)); + } + } + return ret; + } }; enum ConfigChangeLockType diff --git a/src/logservice/palf/palf_handle.cpp b/src/logservice/palf/palf_handle.cpp index 76dd5e141..cd40d1efb 100755 --- a/src/logservice/palf/palf_handle.cpp +++ b/src/logservice/palf/palf_handle.cpp @@ -391,6 +391,23 @@ int PalfHandle::switch_acceptor_to_learner(const common::ObMember &member, return palf_handle_impl_->switch_acceptor_to_learner(member, new_replica_num, timeout_us); } +int PalfHandle::replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) +{ + CHECK_VALID; + return palf_handle_impl_->replace_learners(added_learners, removed_learners, timeout_us); +} + +int PalfHandle::replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const LogConfigVersion &config_version, + const int64_t timeout_us) +{ + CHECK_VALID; + return palf_handle_impl_->replace_member_with_learner(added_member, removed_member, config_version, timeout_us); 
+} + int PalfHandle::change_leader_to(const common::ObAddr &dst_addr) { diff --git a/src/logservice/palf/palf_handle.h b/src/logservice/palf/palf_handle.h index f3432307a..7af18e1e5 100755 --- a/src/logservice/palf/palf_handle.h +++ b/src/logservice/palf/palf_handle.h @@ -240,7 +240,7 @@ public: const int64_t timeout_us); // @brief, replace old_member with new_member, can be called only in leader - // @param[in] const common::ObMember &added_member: member wil be added + // @param[in] const common::ObMember &added_member: member will be added // @param[in] const common::ObMember &removed_member: member will be removed // @param[in] const LogConfigVersion &config_version: config_version for leader checking // @param[in] const int64_t timeout_us @@ -306,6 +306,36 @@ public: int switch_acceptor_to_learner(const common::ObMember &member, const int64_t new_replica_num, const int64_t timeout_us); + + // @brief, replace removed_learners with added_learners + // @param[in] const common::ObMemberList &added_learners: learners will be added + // @param[in] const common::ObMemberList &removed_learners: learners will be removed + // @param[in] const int64_t timeout_us + // @return + // - OB_SUCCESS: replace learner successfully + // - OB_INVALID_ARGUMENT: invalid argument or not supported config change + // - OB_TIMEOUT: replace learner timeout + // - OB_NOT_MASTER: not leader or rolechange during membership changing + // - other: bug + int replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us); + + // @brief, replace removed_member with learner + // @param[in] const common::ObMember &added_member: member will be added + // @param[in] const common::ObMember &removed_member: member will be removed + // @param[in] const LogConfigVersion &config_version: config_version for leader checking + // @param[in] const int64_t timeout_us + // @return + // - OB_SUCCESS: replace member successfully + // - 
OB_INVALID_ARGUMENT: invalid argument or not supported config change + // - OB_TIMEOUT: replace member timeout + // - OB_NOT_MASTER: not leader or rolechange during membership changing + // - other: bug + int replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const LogConfigVersion &config_version, + const int64_t timeout_us); int revoke_leader(const int64_t proposal_id); int change_leader_to(const common::ObAddr &dst_addr); // @brief: change AccessMode of palf. diff --git a/src/logservice/palf/palf_handle_impl.cpp b/src/logservice/palf/palf_handle_impl.cpp index 9cd8142cb..2f26c7f5a 100755 --- a/src/logservice/palf/palf_handle_impl.cpp +++ b/src/logservice/palf/palf_handle_impl.cpp @@ -767,7 +767,7 @@ int PalfHandleImpl::replace_member( K(removed_member), K(timeout_us), K(old_member_list), K(old_replica_num), K(curr_member_list), K(curr_replica_num), "leader replace_member cost time(us)", common::ObTimeUtility::current_time() - begin_time_us); - report_replace_member_(added_member, removed_member, curr_member_list); + report_replace_member_(added_member, removed_member, curr_member_list, "REPLACE_MEMBER"); } } return ret; @@ -820,6 +820,7 @@ int PalfHandleImpl::switch_learner_to_acceptor(const common::ObMember &learner, if (IS_NOT_INIT) { ret = OB_NOT_INIT; } else if (!learner.is_valid() || + !is_valid_replica_num(new_replica_num) || timeout_us <= 0) { ret = OB_INVALID_ARGUMENT; PALF_LOG(WARN, "invalid argument", KPC(this), K(learner), K(timeout_us)); @@ -842,7 +843,9 @@ int PalfHandleImpl::switch_acceptor_to_learner(const common::ObMember &member, int ret = OB_SUCCESS; if (IS_NOT_INIT) { ret = OB_NOT_INIT; - } else if (!member.is_valid() || timeout_us <= 0) { + } else if (!member.is_valid() || + !is_valid_replica_num(new_replica_num) || + timeout_us <= 0) { ret = OB_INVALID_ARGUMENT; } else { LogConfigChangeArgs args(member, new_replica_num, SWITCH_ACCEPTOR_TO_LEARNER); @@ -856,6 +859,72 @@ int 
PalfHandleImpl::switch_acceptor_to_learner(const common::ObMember &member, return ret; } +int PalfHandleImpl::replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) +{ + int ret = OB_SUCCESS; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + } else if (!added_learners.is_valid() || !removed_learners.is_valid() || timeout_us <= 0) { + ret = OB_INVALID_ARGUMENT; + PALF_LOG(WARN, "invalid argument", KR(ret), KPC(this), K(added_learners), K(removed_learners), K(timeout_us)); + } else { + LogConfigChangeArgs args(added_learners, removed_learners, REPLACE_LEARNERS); + if (OB_FAIL(one_stage_config_change_(args, timeout_us))) { + PALF_LOG(WARN, "replace_learners failed", KR(ret), KPC(this), K(args), K(timeout_us)); + } else { + PALF_EVENT("replace_learners success", palf_id_, K(ret), KPC(this), K(args), K(timeout_us)); + report_replace_learners_(added_learners, removed_learners); + } + } + return ret; +} + +int PalfHandleImpl::replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const palf::LogConfigVersion &config_version, + const int64_t timeout_us) +{ + int ret = OB_SUCCESS; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + PALF_LOG(WARN, "PalfHandleImpl not init", KR(ret), KPC(this)); + } else if (!added_member.is_valid() || + !removed_member.is_valid() || + false == config_version.is_valid() || + timeout_us <= 0) { + ret = OB_INVALID_ARGUMENT; + PALF_LOG(WARN, "invalid argument", KR(ret), KPC(this), K(added_member), + K(removed_member), K(config_version), K(timeout_us)); + } else { + ObMemberList old_member_list, curr_member_list; + int64_t old_replica_num = -1, curr_replica_num = -1; + LogConfigChangeArgs args(added_member, 0, config_version, SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM); + const int64_t begin_time_us = common::ObTimeUtility::current_time(); + if (OB_FAIL(config_mgr_.get_curr_member_list(old_member_list, old_replica_num))) { + 
PALF_LOG(WARN, "get_curr_member_list failed", KR(ret), KPC(this)); + } else if (OB_FAIL(one_stage_config_change_(args, timeout_us))) { + PALF_LOG(WARN, "add_member in replace_member_with_learner failed", KR(ret), KPC(this), K(args)); + } else if (FALSE_IT(args.server_ = removed_member)) { + } else if (FALSE_IT(args.type_ = REMOVE_MEMBER_AND_NUM)) { + } else if (OB_FAIL(one_stage_config_change_(args, timeout_us + begin_time_us - common::ObTimeUtility::current_time()))) { + if (palf_reach_time_interval(100 * 1000, replace_member_print_time_us_)) { + PALF_LOG(WARN, "remove_member in replace_member_with_learner failed", KR(ret), K(args), KPC(this)); + } + } else if (OB_FAIL(config_mgr_.get_curr_member_list(curr_member_list, curr_replica_num))) { + PALF_LOG(WARN, "get_curr_member_list failed", KR(ret), KPC(this)); + } else { + PALF_EVENT("replace_member_with_learner success", palf_id_, KR(ret), KPC(this), K(added_member), + K(removed_member), K(config_version), K(timeout_us), K(old_member_list), K(old_replica_num), + K(curr_member_list), K(curr_replica_num), + "leader replace_member_with_learner cost time(us)", common::ObTimeUtility::current_time() - begin_time_us); + report_replace_member_(added_member, removed_member, curr_member_list, "REPLACE_MEMBER_WITH_LEARNER"); + } + } + return ret; +} + int PalfHandleImpl::change_access_mode(const int64_t proposal_id, const int64_t mode_version, @@ -4903,7 +4972,10 @@ void PalfHandleImpl::report_remove_member_(const int64_t prev_replica_num, const plugins_.record_reconfiguration_event(LogConfigChangeType2Str(LogConfigChangeType::REMOVE_MEMBER), palf_id_, config_version, prev_replica_num, curr_replica_num, EXTRA_INFOS); } -void PalfHandleImpl::report_replace_member_(const common::ObMember &added_member, const common::ObMember &removed_member, const common::ObMemberList &member_list) +void PalfHandleImpl::report_replace_member_(const common::ObMember &added_member, + const common::ObMember &removed_member, + const 
common::ObMemberList &member_list, + const char *event_name) { LogConfigVersion config_version; (void) config_mgr_.get_config_version(config_version); @@ -4919,7 +4991,7 @@ void PalfHandleImpl::report_replace_member_(const common::ObMember &added_member "member_list", member_list_buf); int64_t curr_replica_num; (void) config_mgr_.get_replica_num(curr_replica_num); - plugins_.record_reconfiguration_event("REPLACE_MEMBER", palf_id_, config_version, curr_replica_num, curr_replica_num, EXTRA_INFOS); + plugins_.record_reconfiguration_event(event_name, palf_id_, config_version, curr_replica_num, curr_replica_num, EXTRA_INFOS); } void PalfHandleImpl::report_add_learner_(const common::ObMember &added_learner) { @@ -5035,6 +5107,24 @@ void PalfHandleImpl::report_switch_acceptor_to_learner_(const common::ObMember & plugins_.record_replica_type_change_event(palf_id_, config_version, replica_full_name_, replica_readonly_name_, EXTRA_INFOS); } +void PalfHandleImpl::report_replace_learners_(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners) +{ + LogConfigVersion config_version; + (void) config_mgr_.get_config_version(config_version); + common::ObMemberList curr_member_list; + int64_t curr_replica_num; + (void) config_mgr_.get_curr_member_list(curr_member_list, curr_replica_num); + ObSqlString added_learners_buf, removed_learners_buf; + member_list_to_string(added_learners, added_learners_buf); + member_list_to_string(removed_learners, removed_learners_buf); + PALF_REPORT_INFO_KV( + "added_learners", added_learners_buf, + "removed_learners", removed_learners_buf); + plugins_.record_reconfiguration_event(LogConfigChangeType2Str(LogConfigChangeType::REPLACE_LEARNERS), + palf_id_, config_version, curr_replica_num, curr_replica_num, EXTRA_INFOS); +} + bool PalfHandleImpl::check_need_hook_fetch_log_(const FetchLogType fetch_type, const LSN &start_lsn) { bool bool_ret = false; diff --git a/src/logservice/palf/palf_handle_impl.h 
b/src/logservice/palf/palf_handle_impl.h index eae96ca81..fc2cbbc05 100755 --- a/src/logservice/palf/palf_handle_impl.h +++ b/src/logservice/palf/palf_handle_impl.h @@ -397,7 +397,7 @@ public: const int64_t timeout_us) = 0; // @brief, replace old_member with new_member - // @param[in] const common::ObMember &added_member: member wil be added + // @param[in] const common::ObMember &added_member: member will be added // @param[in] const common::ObMember &removed_member: member will be removed // @param[in] const LogConfigVersion &config_version: config_version for leader checking // @param[in] const int64_t timeout_us @@ -462,6 +462,35 @@ public: const int64_t new_replica_num, const int64_t timeout_us) = 0; + // @brief, replace removed_learners with added_learners + // @param[in] const common::ObMemberList &added_learners: learners will be added + // @param[in] const common::ObMemberList &removed_learners: learners will be removed + // @param[in] const int64_t timeout_us + // @return + // - OB_SUCCESS: replace learner successfully + // - OB_INVALID_ARGUMENT: invalid argument or not supported config change + // - OB_TIMEOUT: replace learner timeout + // - OB_NOT_MASTER: not leader or rolechange during membership changing + // - other: bug + virtual int replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) = 0; + + // @brief, replace removed_member with learner + // @param[in] const common::ObMember &added_member: member will be added + // @param[in] const common::ObMember &removed_member: member will be removed + // @param[in] const LogConfigVersion &config_version: config_version for leader checking + // @param[in] const int64_t timeout_us + // @return + // - OB_SUCCESS: replace member successfully + // - OB_INVALID_ARGUMENT: invalid argument or not supported config change + // - OB_TIMEOUT: replace member timeout + // - OB_NOT_MASTER: not leader or rolechange during membership 
changing + // - other: bug + virtual int replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const LogConfigVersion &config_version, + const int64_t timeout_us) = 0; // 设置日志文件的可回收位点,小于等于lsn的日志文件均可以安全回收 // @@ -825,6 +854,13 @@ public: int switch_acceptor_to_learner(const common::ObMember &member, const int64_t new_replica_num, const int64_t timeout_us) override final; + int replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) override final; + int replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const LogConfigVersion &config_version, + const int64_t timeout_us) override final; int set_base_lsn(const LSN &lsn) override final; int enable_sync() override final; int disable_sync() override final; @@ -1185,13 +1221,18 @@ private: void report_change_replica_num_(const int64_t prev_replica_num, const int64_t curr_replica_num, const common::ObMemberList &member_list); void report_add_member_(const int64_t prev_replica_num, const int64_t curr_replica_num, const common::ObMember &added_member); void report_remove_member_(const int64_t prev_replica_num, const int64_t curr_replica_num, const common::ObMember &removed_member); - void report_replace_member_(const common::ObMember &added_member, const common::ObMember &removed_member, const common::ObMemberList &member_list); + void report_replace_member_(const common::ObMember &added_member, + const common::ObMember &removed_member, + const common::ObMemberList &member_list, + const char *event_name); void report_add_learner_(const common::ObMember &added_learner); void report_remove_learner_(const common::ObMember &removed_learner); void report_add_arb_member_(const common::ObMember &added_arb_member); void report_remove_arb_member_(const common::ObMember &removed_arb_member); void report_switch_learner_to_acceptor_(const 
common::ObMember &learner); void report_switch_acceptor_to_learner_(const common::ObMember &acceptor); + void report_replace_learners_(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners); // ======================= report event end ======================================= bool check_need_hook_fetch_log_(const FetchLogType fetch_type, const LSN &start_lsn); private: diff --git a/src/logservice/palf_handle_guard.h b/src/logservice/palf_handle_guard.h index 79669cd31..5bc649737 100644 --- a/src/logservice/palf_handle_guard.h +++ b/src/logservice/palf_handle_guard.h @@ -208,7 +208,7 @@ public: // @brief, replace old_member with new_member, can be called only in leader // @param[in] const common::ObMember &removed_member: member will be removed - // @param[in] const common::ObMember &added_member: member wil be added + // @param[in] const common::ObMember &added_member: member will be added // @param[in] const int64_t timeout_us // @return // - OB_SUCCESS: replace member successfully diff --git a/src/observer/ob_rpc_processor_simple.cpp b/src/observer/ob_rpc_processor_simple.cpp index d88f8ae0c..4f86e3fa1 100644 --- a/src/observer/ob_rpc_processor_simple.cpp +++ b/src/observer/ob_rpc_processor_simple.cpp @@ -2206,6 +2206,7 @@ int ObQueryLSIsValidMemberP::process() for (int64_t index = 0; OB_SUCC(ret) && index < arg_.ls_array_.count(); index++) { const share::ObLSID &id = arg_.ls_array_[index]; bool is_valid_member = true; + obrpc::LogMemberGCStat stat = obrpc::LogMemberGCStat::LOG_MEMBER_NORMAL_GC_STAT; ObLSHandle handle; if (OB_SUCCESS != (tmp_ret = ls_service->get_ls(id, handle, ObLSGetMod::OBSERVER_MOD))) { if (OB_LS_NOT_EXIST == tmp_ret || OB_NOT_RUNNING == tmp_ret) { @@ -2216,18 +2217,20 @@ int ObQueryLSIsValidMemberP::process() } else if (OB_ISNULL(ls = handle.get_ls())) { tmp_ret = OB_ERR_UNEXPECTED; COMMON_LOG(ERROR, " log stream not exist", K(id), K(tmp_ret)); - } else if (OB_SUCCESS != (tmp_ret = ls->is_valid_member(addr, 
is_valid_member))) { + } else if (OB_SUCCESS != (tmp_ret = ls->get_member_gc_stat(addr, is_valid_member, stat))) { if (REACH_TIME_INTERVAL(100 * 1000)) { - COMMON_LOG(WARN, "is_valid_member failed", K(tmp_ret), K(id), K(addr)); + COMMON_LOG(WARN, "get_member_gc_stat failed", K(tmp_ret), K(id), K(addr)); } } else {} if (OB_FAIL(response.ls_array_.push_back(id))) { - COMMON_LOG(WARN, "response partition_array_ push_back failed", K(ret), K(id)); + COMMON_LOG(WARN, "response partition_array_ push_back failed", K(addr), K(id)); } else if (OB_FAIL(response.ret_array_.push_back(tmp_ret))) { - COMMON_LOG(WARN, "response ret_array push_back failed", K(ret), K(id), K(tmp_ret)); + COMMON_LOG(WARN, "response ret_array push_back failed", K(addr), K(id), K(tmp_ret)); } else if (OB_FAIL(response.candidates_status_.push_back(is_valid_member))) { - COMMON_LOG(WARN, "response candidates_status_ push_back failed", K(ret), K(id), K(is_valid_member)); + COMMON_LOG(WARN, "response candidates_status_ push_back failed", K(addr), K(id), K(is_valid_member)); + } else if (OB_FAIL(response.gc_stat_array_.push_back(stat))) { + COMMON_LOG(WARN, "response gc_stat_array_ push_back failed", K(addr), K(id), K(stat)); } else { // do nothing } diff --git a/src/rootserver/ob_disaster_recovery_info.cpp b/src/rootserver/ob_disaster_recovery_info.cpp index 08a991905..7d0ea670f 100644 --- a/src/rootserver/ob_disaster_recovery_info.cpp +++ b/src/rootserver/ob_disaster_recovery_info.cpp @@ -379,10 +379,12 @@ void DRLSInfo::reset_last_disaster_recovery_ls() int DRLSInfo::construct_filtered_ls_info_to_use_( const share::ObLSInfo &input_ls_info, - share::ObLSInfo &output_ls_info) + share::ObLSInfo &output_ls_info, + const bool &filter_readonly_replicas_with_flag) { int ret = OB_SUCCESS; output_ls_info.reset(); + const ObLSReplica *leader_replica = NULL; if (OB_UNLIKELY(!input_ls_info.is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(input_ls_info)); @@ -390,15 +392,41 @@ int 
DRLSInfo::construct_filtered_ls_info_to_use_( input_ls_info.get_tenant_id(), input_ls_info.get_ls_id()))) { LOG_WARN("fail to init ls info", KR(ret), K(input_ls_info)); + } else if (OB_FAIL(input_ls_info.find_leader(leader_replica))) { + LOG_WARN("fail to find leader replica in input_ls_info", KR(ret), K(input_ls_info)); + } else if (OB_ISNULL(leader_replica)) { + ret = OB_LEADER_NOT_EXIST; + LOG_WARN("no leader in input_ls_info", KR(ret), K(input_ls_info)); } else { uint64_t tenant_id = input_ls_info.get_tenant_id(); ObLSID ls_id = input_ls_info.get_ls_id(); for (int64_t i = 0; OB_SUCC(ret) && i < input_ls_info.get_replicas().count(); i++) { const ObLSReplica &ls_replica = input_ls_info.get_replicas().at(i); - if (ls_replica.get_in_member_list() || ls_replica.get_in_learner_list()) { + // filter replicas not in member/learner list + if (ls_replica.get_in_member_list()) { if (OB_FAIL(output_ls_info.add_replica(ls_replica))) { - LOG_WARN("fail to add replica to new ls_info", KR(ret), K(ls_replica)); + LOG_WARN("fail to add full replica to new ls_info", KR(ret), K(ls_replica)); } + } else if (ls_replica.get_in_learner_list()) { + if (!filter_readonly_replicas_with_flag) { + if (OB_FAIL(output_ls_info.add_replica(ls_replica))) { + LOG_WARN("fail to add read only replica to new ls_info", KR(ret), K(ls_replica)); + } + } else { + // filter learner replicas with flag + common::ObMember learner_in_learner_list; + if (OB_FAIL(leader_replica->get_learner_list().get_learner_by_addr(ls_replica.get_server(), learner_in_learner_list))) { + LOG_WARN("fail to get learner from leader learner_list", KR(ret), KPC(leader_replica), K(ls_replica)); + } else if (learner_in_learner_list.is_migrating()) { + LOG_TRACE("replica is filtered because it is a garbage replica generated by migrating", + K(input_ls_info), K(ls_replica), KPC(leader_replica), K(learner_in_learner_list)); + } else if (OB_FAIL(output_ls_info.add_replica(ls_replica))) { + LOG_WARN("fail to add read only replica to new 
ls_info", KR(ret), K(ls_replica)); + } + } + } else { + LOG_TRACE("replica is filtered because it is not in member/learner list", + K(input_ls_info), K(ls_replica), KPC(leader_replica)); } } } @@ -407,7 +435,8 @@ int DRLSInfo::construct_filtered_ls_info_to_use_( int DRLSInfo::build_disaster_ls_info( const share::ObLSInfo &ls_info, - const share::ObLSStatusInfo &ls_status_info) + const share::ObLSStatusInfo &ls_status_info, + const bool &filter_readonly_replicas_with_flag) { int ret = OB_SUCCESS; @@ -423,8 +452,9 @@ int DRLSInfo::build_disaster_ls_info( ret = OB_INVALID_ARGUMENT; LOG_WARN("tenant id not match", KR(ret), K(resource_tenant_id_), "ls_tenant_id", ls_info.get_tenant_id()); - } else if (OB_FAIL(construct_filtered_ls_info_to_use_(ls_info, inner_ls_info_))) { - LOG_WARN("fail to filter replicas not in both member_list and learner_list", KR(ret), K(ls_info)); + } else if (OB_FAIL(construct_filtered_ls_info_to_use_(ls_info, inner_ls_info_, filter_readonly_replicas_with_flag))) { + LOG_WARN("fail to filter not in member/learner list replicas and learner_with_flag replicas", + KR(ret), K(ls_info), K(filter_readonly_replicas_with_flag)); } else if (OB_FAIL(ls_status_info_.assign(ls_status_info))) { LOG_WARN("fail to assign ls_status_info", KR(ret)); } else if (OB_FAIL(sys_schema_guard_.get_tenant_info( diff --git a/src/rootserver/ob_disaster_recovery_info.h b/src/rootserver/ob_disaster_recovery_info.h index dfd68a670..83c4370e3 100644 --- a/src/rootserver/ob_disaster_recovery_info.h +++ b/src/rootserver/ob_disaster_recovery_info.h @@ -167,7 +167,8 @@ public: int init(); int build_disaster_ls_info( const share::ObLSInfo &ls_info, - const share::ObLSStatusInfo &ls_status_info); + const share::ObLSStatusInfo &ls_status_info, + const bool &filter_readonly_replicas_with_flag); public: const common::ObIArray &get_locality() const { return zone_locality_array_; @@ -209,7 +210,8 @@ public: private: int construct_filtered_ls_info_to_use_( const share::ObLSInfo 
&input_ls_info, - share::ObLSInfo &output_ls_info); + share::ObLSInfo &output_ls_info, + const bool &filter_readonly_replicas_with_flag); // init related private func int gather_server_unit_stat(); int fill_servers(); diff --git a/src/rootserver/ob_disaster_recovery_worker.cpp b/src/rootserver/ob_disaster_recovery_worker.cpp index fcb2c18fe..612b81a73 100755 --- a/src/rootserver/ob_disaster_recovery_worker.cpp +++ b/src/rootserver/ob_disaster_recovery_worker.cpp @@ -1802,6 +1802,7 @@ int ObDRWorker::check_tenant_locality_match( for (int64_t i = 0; OB_SUCC(ret) && i < ls_status_info_array.count() && locality_is_matched; ++i) { share::ObLSInfo ls_info; share::ObLSStatusInfo &ls_status_info = ls_status_info_array.at(i); + bool filter_readonly_replicas_with_flag = true; DRLSInfo dr_ls_info(gen_user_tenant_id(tenant_id), &unit_mgr, &zone_mgr, @@ -1818,8 +1819,11 @@ int ObDRWorker::check_tenant_locality_match( } else if (OB_FAIL(dr_ls_info.init())) { LOG_WARN("fail to init dr log stream info", KR(ret)); } else if (OB_FAIL(dr_ls_info.build_disaster_ls_info( - ls_info, ls_status_info))) { - LOG_WARN("fail to generate dr log stream info", KR(ret)); + ls_info, + ls_status_info, + filter_readonly_replicas_with_flag))) { + LOG_WARN("fail to generate dr log stream info", KR(ret), K(ls_info), + K(ls_status_info), K(filter_readonly_replicas_with_flag)); } else if (OB_FAIL(check_ls_locality_match_( dr_ls_info, unit_mgr, zone_mgr, locality_is_matched))) { LOG_WARN("fail to try log stream disaster recovery", KR(ret)); @@ -1896,10 +1900,8 @@ int ObDRWorker::try_disaster_recovery() } acc_dr_task += tenant_acc_dr_task; } - statistic_total_dr_task(acc_dr_task); } - return ret; } @@ -1931,10 +1933,16 @@ int ObDRWorker::try_tenant_disaster_recovery( for (int64_t i = 0; OB_SUCC(ret) && i < ls_status_info_array.count(); ++i) { share::ObLSInfo ls_info; share::ObLSStatusInfo &ls_status_info = ls_status_info_array.at(i); - DRLSInfo dr_ls_info(gen_user_tenant_id(tenant_id), - unit_mgr_, - 
zone_mgr_, - schema_service_); + // this structure is used to generate migrtion/locality alignment/shrink unit tasks + DRLSInfo dr_ls_info_without_flag(gen_user_tenant_id(tenant_id), + unit_mgr_, + zone_mgr_, + schema_service_); + // this structure is used to generate permanent offline tasks + DRLSInfo dr_ls_info_with_flag(gen_user_tenant_id(tenant_id), + unit_mgr_, + zone_mgr_, + schema_service_); int64_t ls_acc_dr_task = 0; int tmp_ret = OB_SUCCESS; // ignore ret for different ls LOG_INFO("start try ls disaster recovery", K(ls_status_info)); @@ -1950,13 +1958,18 @@ int ObDRWorker::try_tenant_disaster_recovery( share::ObLSTable::COMPOSITE_MODE, ls_info))) { LOG_WARN("fail to get log stream info", KR(tmp_ret)); - } else if (OB_SUCCESS != (tmp_ret = dr_ls_info.init())) { + } else if (OB_SUCCESS != (tmp_ret = dr_ls_info_without_flag.init())) { LOG_WARN("fail to init dr log stream info", KR(tmp_ret)); - } else if (OB_SUCCESS != (tmp_ret = dr_ls_info.build_disaster_ls_info( - ls_info, ls_status_info))) { + } else if (OB_SUCCESS != (tmp_ret = dr_ls_info_without_flag.build_disaster_ls_info( + ls_info, ls_status_info, true/*filter_readonly_replica_with_flag*/))) { LOG_WARN("fail to generate dr log stream info", KR(tmp_ret)); + } else if (OB_SUCCESS != (tmp_ret = dr_ls_info_with_flag.init())) { + LOG_WARN("fail to init dr log stream info with flag", KR(tmp_ret)); + } else if (OB_SUCCESS != (tmp_ret = dr_ls_info_with_flag.build_disaster_ls_info( + ls_info, ls_status_info, false/*filter_readonly_replica_with_flag*/))) { + LOG_WARN("fail to generate dr log stream info with flag", KR(tmp_ret)); } else if (OB_SUCCESS != (tmp_ret = try_ls_disaster_recovery( - only_for_display, dr_ls_info, ls_acc_dr_task))) { + only_for_display, dr_ls_info_without_flag, ls_acc_dr_task, dr_ls_info_with_flag))) { LOG_WARN("fail to try log stream disaster recovery", KR(tmp_ret), K(only_for_display)); } } @@ -1981,7 +1994,8 @@ int ObDRWorker::try_assign_unit( int ObDRWorker::try_ls_disaster_recovery( 
const bool only_for_display, DRLSInfo &dr_ls_info, - int64_t &acc_dr_task_cnt) + int64_t &acc_dr_task_cnt, + DRLSInfo &dr_ls_info_with_flag) { int ret = OB_SUCCESS; ObRootBalanceHelp::BalanceController controller; @@ -2004,9 +2018,9 @@ int ObDRWorker::try_ls_disaster_recovery( if (OB_SUCC(ret) && acc_dr_task_cnt <= 0) { if (OB_FAIL(try_remove_permanent_offline_replicas( - only_for_display, dr_ls_info, acc_dr_task_cnt))) { + only_for_display, dr_ls_info_with_flag, acc_dr_task_cnt))) { LOG_WARN("fail to try remove permanent offline replicas", - KR(ret), K(dr_ls_info)); + KR(ret), K(dr_ls_info_with_flag)); } } // ATTENTION!!! @@ -2016,12 +2030,15 @@ int ObDRWorker::try_ls_disaster_recovery( // regards these replicas as abnormal replicas and can not generate any task // for this log stream(locality alignment, replica migration etc.). // So we make sure log stream does not have replicas only in member_list AFTER try_remove_permanent_offline - // Please DO NOT change the order of try_remove_permanent_offline, check_ls_only_in_member_list_ and other operations. + // + // Also we DO NOT want to see replicas created during migration or rebuilding. + // So we have to make sure those replicas with flag in learner_list not exists. + // Please DO NOT change the order of try_remove_permanent_offline, filter_learner_with_flag, check_ls_only_in_member_list_or_with_flag_ and other operations. 
if (OB_SUCC(ret) && acc_dr_task_cnt <= 0) { bool is_only_in_member_list = true; - if (OB_FAIL(check_ls_only_in_member_list_(dr_ls_info))) { - LOG_WARN("only_in_memberlist check is failed", KR(ret), K(dr_ls_info)); + if (OB_FAIL(check_ls_only_in_member_list_or_with_flag_(dr_ls_info))) { + LOG_WARN("only_in_memberlist and flag replica check is failed", KR(ret), K(dr_ls_info)); } } // step2: replicate to unit @@ -2438,14 +2455,17 @@ int ObDRWorker::do_single_replica_permanent_offline_( const int64_t memstore_percent = 100; ObDRTaskKey task_key; bool can_generate = false; - ObReplicaMember remove_member(member_to_remove.get_server(), - member_to_remove.get_timestamp(), - replica_type, - memstore_percent); + ObReplicaMember remove_member(member_to_remove); + //ObReplicaMember remove_member(member_to_remove.get_server(), + // member_to_remove.get_timestamp(), + // replica_type, + // memstore_percent); ObDRTaskType task_type = ObReplicaTypeCheck::is_paxos_replica_V2(replica_type) ? ObDRTaskType::LS_REMOVE_PAXOS_REPLICA : ObDRTaskType::LS_REMOVE_NON_PAXOS_REPLICA; - if (OB_FAIL(construct_extra_infos_to_build_remove_replica_task( + if (OB_FAIL(remove_member.set_replica_type(replica_type))) { + LOG_WARN("fail to set replica type", KR(ret), K(replica_type), K(remove_member)); + } else if (OB_FAIL(construct_extra_infos_to_build_remove_replica_task( dr_ls_info, task_id, new_paxos_replica_number, @@ -4878,7 +4898,7 @@ int ObDRWorker::choose_disaster_recovery_data_source( return ret; } -int ObDRWorker::check_ls_only_in_member_list_( +int ObDRWorker::check_ls_only_in_member_list_or_with_flag_( const DRLSInfo &dr_ls_info) { int ret = OB_SUCCESS; @@ -4909,12 +4929,24 @@ int ObDRWorker::check_ls_only_in_member_list_( } // check learner list for (int64_t index = 0; OB_SUCC(ret) && index < leader_replica->get_learner_list().get_member_number(); ++index) { - common::ObAddr server_to_check; + common::ObMember learner_to_check; const share::ObLSReplica *replica = nullptr; - if 
(OB_FAIL(leader_replica->get_learner_list().get_server_by_index(index,server_to_check))) { + if (OB_FAIL(leader_replica->get_learner_list().get_member_by_index(index, learner_to_check))) { LOG_WARN("fail to get learner by index", KR(ret), K(index)); - } else if (OB_FAIL(inner_ls_info.find(server_to_check, replica))) { - LOG_WARN("fail to find read only replica", KR(ret), K(inner_ls_info), K(server_to_check)); + } else if (learner_to_check.is_migrating()) { + if (OB_FAIL(inner_ls_info.find(learner_to_check.get_server(), replica))) { + if (OB_ENTRY_NOT_EXIST == ret) { + // good, learner with flag should not appear in inner_ls_info + ret = OB_SUCCESS; + } else { + LOG_WARN("fail to find replica", KR(ret), K(inner_ls_info), K(learner_to_check)); + } + } else { + ret = OB_STATE_NOT_MATCH; + LOG_WARN("read only replica with flag should not appear in inner_ls_info", KR(ret), K(learner_to_check), K(inner_ls_info)); + } + } else if (OB_FAIL(inner_ls_info.find(learner_to_check.get_server(), replica))) { + LOG_WARN("fail to find read only replica", KR(ret), K(inner_ls_info), K(learner_to_check)); } } } diff --git a/src/rootserver/ob_disaster_recovery_worker.h b/src/rootserver/ob_disaster_recovery_worker.h index c7aa3fe7a..22c804783 100755 --- a/src/rootserver/ob_disaster_recovery_worker.h +++ b/src/rootserver/ob_disaster_recovery_worker.h @@ -666,7 +666,8 @@ private: int try_ls_disaster_recovery( const bool only_for_display, DRLSInfo &dr_ls_info, - int64_t &acc_dr_task); + int64_t &acc_dr_task, + DRLSInfo &dr_ls_info_with_flag); int check_has_leader_while_remove_replica( const common::ObAddr &server, @@ -696,7 +697,7 @@ private: const ObMember &member_to_remove, int64_t &acc_dr_task); - int check_ls_only_in_member_list_( + int check_ls_only_in_member_list_or_with_flag_( const DRLSInfo &dr_ls_info); int check_can_generate_task( diff --git a/src/rootserver/ob_migrate_unit_finish_checker.cpp b/src/rootserver/ob_migrate_unit_finish_checker.cpp index ff8c0d2d8..e90d88782 100644 
--- a/src/rootserver/ob_migrate_unit_finish_checker.cpp +++ b/src/rootserver/ob_migrate_unit_finish_checker.cpp @@ -196,7 +196,8 @@ int ObMigrateUnitFinishChecker::try_check_migrate_unit_finish_by_tenant( LOG_WARN("fail to get log stream info", KR(ret)); } else if (OB_FAIL(dr_ls_info.build_disaster_ls_info( ls_info, - ls_status_info))) { + ls_status_info, + true/*filter_readonly_replicas_with_flag*/))) { LOG_WARN("fail to generate dr log stream info", KR(ret)); } else if (OB_FAIL(statistic_migrate_unit_by_ls( dr_ls_info, diff --git a/src/share/ls/ob_ls_info.cpp b/src/share/ls/ob_ls_info.cpp index 14a8ba0be..69c656bc7 100644 --- a/src/share/ls/ob_ls_info.cpp +++ b/src/share/ls/ob_ls_info.cpp @@ -305,15 +305,17 @@ bool ObLSReplica::learner_list_is_equal(const common::GlobalLearnerList &a, cons { bool is_equal = true; if (a.get_member_number() != b.get_member_number()) { + // ObMember with flag is considered. is_equal = false; } else { for (int i = 0; is_equal && i < a.get_member_number(); ++i) { - ObAddr learner; + ObMember learner; int ret = OB_SUCCESS; - if (OB_FAIL(a.get_server_by_index(i, learner))) { + if (OB_FAIL(a.get_member_by_index(i, learner))) { is_equal = false; LOG_WARN("failed to get server by index", KR(ret), K(i), K(a), K(b)); } else { + // flag of learner is considered is_equal = b.contains(learner); } } @@ -470,30 +472,47 @@ int ObLSReplica::text2learner_list(const char *text, GlobalLearnerList &learner_ char *learner_text = nullptr; char *save_ptr1 = nullptr; learner_list.reset(); - if (nullptr == text) { + if (OB_ISNULL(text)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), KP(text)); } while (OB_SUCC(ret)) { learner_text = strtok_r((nullptr == learner_text ? const_cast(text) : nullptr), ",", &save_ptr1); /* - * ipv4 format: a.b.c.d:port:timestamp,... - * ipv6 format: [a:b:c:d:e:f:g:h]:port:timestamp,... + * ipv4 format: a.b.c.d:port:timestamp:flag,... + * ipv6 format: [a:b:c:d:e:f:g:h]:port:timestamp:flag,... 
*/ - if (nullptr != learner_text) { + if (OB_NOT_NULL(learner_text)) { + char *flag_str = nullptr; char *timestamp_str = nullptr; char *end_ptr = nullptr; ObAddr learner_addr; - if (OB_NOT_NULL(timestamp_str = strrchr(learner_text, ':'))) { - *timestamp_str++ = '\0'; - int64_t timestamp_val = strtoll(timestamp_str, &end_ptr, 10); - if (end_ptr == timestamp_str || *end_ptr != '\0') { + if (OB_NOT_NULL(flag_str = strrchr(learner_text, ':'))) { + // strrchar will return substring after last ':' + *flag_str++ = '\0'; + int64_t flag_value = strtoll(flag_str, &end_ptr, 10); + //end_ptr is the location of the first character can not translated into digital number + if (end_ptr == flag_str || *end_ptr != '\0') { ret = OB_ERR_UNEXPECTED; LOG_WARN("strtoll failed", KR(ret)); - } else if (OB_FAIL(learner_addr.parse_from_cstring(learner_text))) { - LOG_ERROR("parse from cstring failed", KR(ret), K(learner_text)); - } else if (OB_FAIL(learner_list.add_learner(ObMember(learner_addr, timestamp_val)))) { - LOG_WARN("push back failed", KR(ret), K(learner_addr), K(timestamp_val)); + } else if (OB_NOT_NULL(timestamp_str = strrchr(learner_text, ':'))) { + *timestamp_str++ = '\0'; + int64_t timestamp_val = strtoll(timestamp_str, &end_ptr, 10); + if (end_ptr == timestamp_str || *end_ptr != '\0') { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("strtoll failed", KR(ret)); + } else if (OB_FAIL(learner_addr.parse_from_cstring(learner_text))) { + LOG_ERROR("parse from cstring failed", KR(ret), K(learner_text)); + } else { + ObMember member_to_add(learner_addr, timestamp_val); + member_to_add.set_flag(flag_value); + if (OB_FAIL(learner_list.add_learner(member_to_add))) { + LOG_WARN("push back learner failed", KR(ret), K(member_to_add)); + } + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("parse learner text failed", KR(ret), K(learner_text)); } } else { ret = OB_ERR_UNEXPECTED; diff --git a/src/share/ob_debug_sync_point.h b/src/share/ob_debug_sync_point.h index d8a27078c..89a0b2a0f 100755 --- 
a/src/share/ob_debug_sync_point.h +++ b/src/share/ob_debug_sync_point.h @@ -543,6 +543,7 @@ class ObString; ACT(STOP_TRANSFER_LS_LOGICAL_TABLE_REPLACED,)\ ACT(BEFORE_TRANSFER_DOING,)\ ACT(BEFORE_BUILD_LS_MIGRATION_DAG_NET,)\ + ACT(AFTER_JOIN_LEARNER_LIST,)\ ACT(MAX_DEBUG_SYNC_POINT,) DECLARE_ENUM(ObDebugSyncPoint, debug_sync_point, OB_DEBUG_SYNC_POINT_DEF); diff --git a/src/share/ob_rpc_struct.cpp b/src/share/ob_rpc_struct.cpp index 0608a6aa4..360619487 100755 --- a/src/share/ob_rpc_struct.cpp +++ b/src/share/ob_rpc_struct.cpp @@ -8005,7 +8005,7 @@ int ObRemoveSysLsArg::assign(const ObRemoveSysLsArg &other) OB_SERIALIZE_MEMBER(ObRemoveSysLsArg, server_); OB_SERIALIZE_MEMBER(ObQueryLSIsValidMemberRequest, tenant_id_, self_addr_, ls_array_); -OB_SERIALIZE_MEMBER(ObQueryLSIsValidMemberResponse, ret_value_, ls_array_, candidates_status_, ret_array_); +OB_SERIALIZE_MEMBER(ObQueryLSIsValidMemberResponse, ret_value_, ls_array_, candidates_status_, ret_array_, gc_stat_array_); OB_SERIALIZE_MEMBER(ObSwitchSchemaResult, ret_); int ObTenantConfigArg::assign(const ObTenantConfigArg &other) diff --git a/src/share/ob_rpc_struct.h b/src/share/ob_rpc_struct.h index abf5509a9..233f7566c 100755 --- a/src/share/ob_rpc_struct.h +++ b/src/share/ob_rpc_struct.h @@ -8581,12 +8581,20 @@ public: TO_STRING_KV(K(tenant_id_), K(self_addr_), K(ls_array_)); }; +enum LogMemberGCStat +{ + LOG_MEMBER_GC_STAT_INVALID = 0, + LOG_MEMBER_NORMAL_GC_STAT = 1, + LOG_LEARNER_IN_MIGRATING = 2, + LOG_MEMBER_GC_STAT_MAX = 256 +}; + struct ObQueryLSIsValidMemberResponse { OB_UNIS_VERSION(1); public: ObQueryLSIsValidMemberResponse() : ret_value_(common::OB_SUCCESS), ls_array_(), - candidates_status_(), ret_array_() {} + candidates_status_(), ret_array_(), gc_stat_array_() {} ~ObQueryLSIsValidMemberResponse() {} void reset() { @@ -8594,13 +8602,15 @@ public: ls_array_.reset(); candidates_status_.reset(); ret_array_.reset(); + gc_stat_array_.reset(); } int ret_value_; share::ObLSArray ls_array_; 
common::ObSEArray candidates_status_; common::ObSEArray ret_array_; - TO_STRING_KV(K(ret_value_), K(ls_array_), K(candidates_status_), K(ret_array_)); + common::ObSEArray gc_stat_array_; + TO_STRING_KV(K(ret_value_), K(ls_array_), K(candidates_status_), K(ret_array_), K(gc_stat_array_)); }; struct ObSwitchSchemaResult diff --git a/src/storage/high_availability/ob_ls_complete_migration.cpp b/src/storage/high_availability/ob_ls_complete_migration.cpp index a14a680c0..b96f271e2 100644 --- a/src/storage/high_availability/ob_ls_complete_migration.cpp +++ b/src/storage/high_availability/ob_ls_complete_migration.cpp @@ -1240,55 +1240,147 @@ int ObStartCompleteMigrationTask::change_member_list_() LOG_WARN("failed to get ls leader", K(ret), KPC(ctx_)); } else if (OB_FAIL(fake_config_version.generate(0, 0))) { LOG_WARN("failed to generate config version", K(ret)); - } else { + } else if (cluster_version < CLUSTER_VERSION_4_2_0_0) { const int64_t change_member_list_timeout_us = GCONF.sys_bkgd_migration_change_member_list_timeout; if (ObMigrationOpType::ADD_LS_OP == ctx_->arg_.type_) { - if (cluster_version < CLUSTER_VERSION_4_2_0_0) { + if (REPLICA_TYPE_FULL == ctx_->arg_.dst_.get_replica_type()) { if (OB_FAIL(ls->get_log_handler()->add_member(ctx_->arg_.dst_, ctx_->arg_.paxos_replica_number_, fake_config_version, change_member_list_timeout_us))) { LOG_WARN("failed to add member", K(ret), KPC(ctx_)); } - } else if (REPLICA_TYPE_FULL == ctx_->arg_.dst_.get_replica_type()) { - if (OB_FAIL(ls->add_member(ctx_->arg_.dst_, - ctx_->arg_.paxos_replica_number_, - change_member_list_timeout_us))) { - LOG_WARN("failed to add member", K(ret)); - } - } else { - // R-replica - if (OB_FAIL(ls->add_learner(ctx_->arg_.dst_, change_member_list_timeout_us))) { - LOG_WARN("failed to add learner", K(ret), KPC(ctx_)); - } } } else if (ObMigrationOpType::MIGRATE_LS_OP == ctx_->arg_.type_) { - if (cluster_version < CLUSTER_VERSION_4_2_0_0) { + if (REPLICA_TYPE_FULL == 
ctx_->arg_.dst_.get_replica_type()) { if (OB_FAIL(ls->get_log_handler()->replace_member(ctx_->arg_.dst_, ctx_->arg_.src_, fake_config_version, change_member_list_timeout_us))) { LOG_WARN("failed to repalce member", K(ret), KPC(ctx_)); } - } else if (REPLICA_TYPE_FULL == ctx_->arg_.dst_.get_replica_type()) { - if (OB_FAIL(ls->replace_member(ctx_->arg_.dst_, - ctx_->arg_.src_, - change_member_list_timeout_us))) { - LOG_WARN("failed to replace member", K(ret)); - } - } else { - // R-replica - if (OB_FAIL(ls->replace_learner(ctx_->arg_.dst_, ctx_->arg_.src_, - change_member_list_timeout_us))) { - LOG_WARN("failed to replace_learner", K(ret), KPC(ctx_)); - } } } else { ret = OB_ERR_UNEXPECTED; LOG_WARN("change member list get invalid type", K(ret), KPC(ctx_)); } - - if (OB_SUCC(ret)) { - const int64_t cost_ts = ObTimeUtility::current_time() - start_ts; - LOG_INFO("succeed change member list", "cost", cost_ts, "tenant_id", ctx_->tenant_id_, "ls_id", ctx_->arg_.ls_id_); + } else { + const int64_t change_member_list_timeout_us = GCONF.sys_bkgd_migration_change_member_list_timeout; + if (ObMigrationOpType::ADD_LS_OP == ctx_->arg_.type_) { + if (REPLICA_TYPE_FULL == ctx_->arg_.dst_.get_replica_type()) { + if (OB_FAIL(switch_learner_to_acceptor_(ls))) { + LOG_WARN("failed to switch learner to acceptor", K(ret), K(leader_addr), K(ls_transfer_scn)); + } + } else { + // R-replica + if (OB_FAIL(replace_learners_for_add_(ls))) { + LOG_WARN("failed to replace learners for add", K(ret), K(leader_addr), K(ls_transfer_scn)); + } + } + } else if (ObMigrationOpType::MIGRATE_LS_OP == ctx_->arg_.type_) { + if (REPLICA_TYPE_FULL == ctx_->arg_.dst_.get_replica_type()) { + if (OB_FAIL(replace_member_with_learner_(ls))) { + LOG_WARN("failed to replace member with learner", K(ret), K(leader_addr), K(ls_transfer_scn)); + } + } else { + // R-replica + if (OB_FAIL(replace_learners_for_migration_(ls))) { + LOG_WARN("failed to replace learners for migration", K(ret), K(leader_addr), 
K(ls_transfer_scn)); + } + } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("change member list get invalid type", K(ret), KPC(ctx_)); } } + if (OB_SUCC(ret)) { + const int64_t cost_ts = ObTimeUtility::current_time() - start_ts; + LOG_INFO("succeed change member list", "cost", cost_ts, "tenant_id", ctx_->tenant_id_, "ls_id", ctx_->arg_.ls_id_); + } + return ret; +} + +int ObStartCompleteMigrationTask::get_ls_transfer_scn_(ObLS *ls, share::SCN &transfer_scn) +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(ls)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls should not be NULL", K(ret)); + } else if (OB_FAIL(ls->get_transfer_scn(transfer_scn))) { + LOG_WARN("failed to get transfer scn", K(ret), KP(ls)); + } + return ret; +} + +int ObStartCompleteMigrationTask::switch_learner_to_acceptor_(ObLS *ls) +{ + int ret = OB_SUCCESS; + const int64_t timeout = GCONF.sys_bkgd_migration_change_member_list_timeout; + ObMember dst = ctx_->arg_.dst_; + dst.set_migrating(); + if (OB_ISNULL(ls)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls should not be NULL", K(ret), KP(ls)); + } else if (OB_FAIL(ls->switch_learner_to_acceptor(dst, ctx_->arg_.paxos_replica_number_, timeout))) { + LOG_WARN("failed to switch_learner_to_acceptor", K(ret), KPC(ctx_)); + } + return ret; +} + +int ObStartCompleteMigrationTask::replace_member_with_learner_(ObLS *ls) +{ + int ret = OB_SUCCESS; + const int64_t timeout = GCONF.sys_bkgd_migration_change_member_list_timeout; + ObMember dst = ctx_->arg_.dst_; + dst.set_migrating(); + if (OB_ISNULL(ls)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls should not be NULL", K(ret), KP(ls)); + } else if (OB_FAIL(ls->replace_member_with_learner(dst, ctx_->arg_.src_, timeout))) { + LOG_WARN("failed to replace_member_with_learner", K(ret), KPC(ctx_)); + } + return ret; +} + +int ObStartCompleteMigrationTask::replace_learners_for_add_(ObLS *ls) +{ + int ret = OB_SUCCESS; + ObMemberList added_learners, removed_learners; + ObMember new_dst = ctx_->arg_.dst_; + 
new_dst.reset_migrating(); + ObMember old_dst = ctx_->arg_.dst_; + old_dst.set_migrating(); + const int64_t change_member_list_timeout_us = GCONF.sys_bkgd_migration_change_member_list_timeout; + if (OB_FAIL(added_learners.add_member(new_dst))) { + LOG_WARN("failed to add member", K(ret), KPC(ctx_)); + } else if (OB_FAIL(removed_learners.add_member(old_dst))) { + LOG_WARN("failed to add member", K(ret), KPC(ctx_)); + } else if (OB_FAIL(ls->replace_learners(added_learners, removed_learners, change_member_list_timeout_us))) { + LOG_WARN("failed to replace learners", K(ret), KPC(ctx_)); + } else { + LOG_INFO("replace learners for add", K(added_learners), K(removed_learners)); + } + return ret; +} + +int ObStartCompleteMigrationTask::replace_learners_for_migration_(ObLS *ls) +{ + int ret = OB_SUCCESS; + ObMemberList added_learners, removed_learners; + const int64_t change_member_list_timeout_us = GCONF.sys_bkgd_migration_change_member_list_timeout; + ObMember new_dst = ctx_->arg_.dst_; + new_dst.reset_migrating(); + ObMember old_dst = ctx_->arg_.dst_; + old_dst.set_migrating(); + ObMember src = ctx_->arg_.src_; + src.reset_migrating(); + if (OB_FAIL(added_learners.add_member(new_dst))) { + LOG_WARN("failed to add member", K(ret), KPC(ctx_)); + } else if (OB_FAIL(removed_learners.add_member(old_dst))) { + LOG_WARN("failed to add member", K(ret), KPC(ctx_)); + } else if (OB_FAIL(removed_learners.add_member(src))) { + LOG_WARN("failed to add member", K(ret), KPC(ctx_)); + } else if (OB_FAIL(ls->replace_learners(added_learners, removed_learners, change_member_list_timeout_us))) { + LOG_WARN("failed to replace learners", K(ret), KPC(ctx_)); + } else { + LOG_INFO("replace members for migration", K(added_learners), K(removed_learners)); + } return ret; } diff --git a/src/storage/high_availability/ob_ls_complete_migration.h b/src/storage/high_availability/ob_ls_complete_migration.h index 40f5efcf6..2bee9b1fd 100644 --- a/src/storage/high_availability/ob_ls_complete_migration.h 
+++ b/src/storage/high_availability/ob_ls_complete_migration.h @@ -178,6 +178,13 @@ private: int wait_trans_tablet_explain_data_(); int change_member_list_with_retry_(); int change_member_list_(); + int get_ls_transfer_scn_( + ObLS *ls, + share::SCN &transfer_scn); + int switch_learner_to_acceptor_(ObLS *ls); + int replace_member_with_learner_(ObLS *ls); + int replace_learners_for_add_(ObLS *ls); + int replace_learners_for_migration_(ObLS *ls); int check_need_wait_( ObLS *ls, bool &need_wait); diff --git a/src/storage/high_availability/ob_ls_member_list_service.cpp b/src/storage/high_availability/ob_ls_member_list_service.cpp index 7d347de10..582675629 100644 --- a/src/storage/high_availability/ob_ls_member_list_service.cpp +++ b/src/storage/high_availability/ob_ls_member_list_service.cpp @@ -125,6 +125,34 @@ int ObLSMemberListService::replace_member( return ret; } +// Fetches the leader config version and transfer scn, runs check_ls_transfer_scn_ on that scn, then forwards (added_member, removed_member, leader config version, timeout) to log_handler_->replace_member_with_learner; returns OB_NOT_INIT when the service is not initialized. TODO(yangyi.yyy) :replace member with learner +int ObLSMemberListService::replace_member_with_learner( + const common::ObMember &added_member, + const common::ObMember &removed_member, + const int64_t replace_member_timeout_us) +{ + int ret = OB_SUCCESS; + palf::LogConfigVersion leader_config_version; + share::SCN leader_transfer_scn; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + STORAGE_LOG(WARN, "ls is not inited", K(ret)); + } else if (OB_FAIL(get_leader_config_version_and_transfer_scn_( + leader_config_version, leader_transfer_scn))) { + STORAGE_LOG(WARN, "failed to get leader config version and transfer scn", K(ret)); + } else if (OB_FAIL(check_ls_transfer_scn_(leader_transfer_scn))) { + STORAGE_LOG(WARN, "failed to check ls transfer scn", K(ret)); + } else if (OB_FAIL(log_handler_->replace_member_with_learner(added_member, + removed_member, + leader_config_version, + replace_member_timeout_us))) { + STORAGE_LOG(WARN, "failed to replace member with learner", K(ret), K(added_member), K(removed_member)); + } else { + STORAGE_LOG(INFO, "replace member with learner success", K(ret)); + } + return ret; +} + int 
ObLSMemberListService::switch_learner_to_acceptor( const common::ObMember &learner, const int64_t paxos_replica_num, diff --git a/src/storage/high_availability/ob_ls_member_list_service.h b/src/storage/high_availability/ob_ls_member_list_service.h index ff4fc7681..13c4cfb42 100644 --- a/src/storage/high_availability/ob_ls_member_list_service.h +++ b/src/storage/high_availability/ob_ls_member_list_service.h @@ -42,6 +42,9 @@ public: int switch_learner_to_acceptor(const common::ObMember &learner, const int64_t paxos_replica_num, const int64_t timeout); + int replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const int64_t timeout); private: int get_leader_config_version_and_transfer_scn_( diff --git a/src/storage/high_availability/ob_ls_migration.cpp b/src/storage/high_availability/ob_ls_migration.cpp index 7e5768d5a..5ef5f5a41 100644 --- a/src/storage/high_availability/ob_ls_migration.cpp +++ b/src/storage/high_availability/ob_ls_migration.cpp @@ -28,6 +28,7 @@ #include "storage/tablet/ob_tablet.h" #include "share/ls/ob_ls_table_operator.h" #include "ob_rebuild_service.h" +#include "share/ob_cluster_version.h" namespace oceanbase { @@ -37,6 +38,7 @@ namespace storage ERRSIM_POINT_DEF(EN_BUILD_SYS_TABLETS_DAG_FAILED); ERRSIM_POINT_DEF(EN_UPDATE_LS_MIGRATION_STATUS_FAILED); +ERRSIM_POINT_DEF(EN_JOIN_LEARNER_LIST_FAILED); /******************ObMigrationCtx*********************/ ObMigrationCtx::ObMigrationCtx() @@ -3226,6 +3228,8 @@ int ObDataTabletsMigrationTask::process() //do nothing } else if (OB_FAIL(try_remove_unneeded_tablets_())) { LOG_WARN("failed to try remove unneeded tablets", K(ret), KPC(ctx_)); + } else if (OB_FAIL(join_learner_list_())) { + LOG_WARN("failed to add to learner list", K(ret)); } else if (OB_FAIL(ls_online_())) { LOG_WARN("failed to start replay log", K(ret), K(*ctx_)); } else if (OB_FAIL(build_tablet_group_info_())) { @@ -3273,6 +3277,53 @@ int ObDataTabletsMigrationTask::process() 
return ret; } +int ObDataTabletsMigrationTask::join_learner_list_() +{ + int ret = OB_SUCCESS; + ObLS *ls = nullptr; + const int64_t timeout = GCONF.sys_bkgd_migration_change_member_list_timeout; + if (!is_inited_) { + ret = OB_NOT_INIT; + LOG_WARN("data tablets migration task do not init", K(ret)); + } else if (GET_MIN_CLUSTER_VERSION() < CLUSTER_VERSION_4_2_0_0) { + // do nothing + } else if (ObMigrationOpType::ADD_LS_OP != ctx_->arg_.type_ + && ObMigrationOpType::MIGRATE_LS_OP != ctx_->arg_.type_) { + // only join learner list when migration and copy + } else if (OB_ISNULL(ls = ls_handle_.get_ls())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ls should not be NULL", K(ret), KP(ls)); + } else { + ctx_->arg_.dst_.set_migrating(); + const ObMember &dst_member = ctx_->arg_.dst_; + if (OB_FAIL(ls->add_learner(dst_member, timeout))) { + LOG_WARN("failed to add learner", K(ret), K(dst_member)); + } else { + LOG_INFO("add to learner list succ", KPC(ctx_)); + } + } +#ifdef ERRSIM + if (OB_SUCC(ret)) { + ret = EN_JOIN_LEARNER_LIST_FAILED ? 
: OB_SUCCESS; + if (OB_FAIL(ret)) { + STORAGE_LOG(ERROR, "fake EN_JOIN_LEARNER_LIST_FAILED", K(ret)); + SERVER_EVENT_SYNC_ADD("storage_ha", "join_learner_list_failed", + "tenant_id", ctx_->tenant_id_, + "ls_id", ctx_->arg_.ls_id_.id(), + "src", ctx_->arg_.src_.get_server(), + "dst", ctx_->arg_.dst_.get_server()); + } + } +#endif + SERVER_EVENT_SYNC_ADD("storage_ha", "after_join_learner_list", + "tenant_id", ctx_->tenant_id_, + "ls_id", ctx_->arg_.ls_id_.id(), + "src", ctx_->arg_.src_.get_server(), + "dst", ctx_->arg_.dst_.get_server()); + DEBUG_SYNC(AFTER_JOIN_LEARNER_LIST); + return ret; +} + int ObDataTabletsMigrationTask::ls_online_() { int ret = OB_SUCCESS; diff --git a/src/storage/high_availability/ob_ls_migration.h b/src/storage/high_availability/ob_ls_migration.h index 4f0630483..054b9be03 100644 --- a/src/storage/high_availability/ob_ls_migration.h +++ b/src/storage/high_availability/ob_ls_migration.h @@ -435,6 +435,7 @@ public: virtual int process() override; VIRTUAL_TO_STRING_KV(K("ObDataTabletsMigrationTask"), KP(this), KPC(ctx_)); private: + int join_learner_list_(); int ls_online_(); int generate_tablet_group_migration_dag_(); int generate_tablet_group_dag_( diff --git a/src/storage/ls/ob_ls.h b/src/storage/ls/ob_ls.h index dcc524c79..367133e2e 100755 --- a/src/storage/ls/ob_ls.h +++ b/src/storage/ls/ob_ls.h @@ -600,10 +600,11 @@ public: // @brief, get max decided log scn considering both apply and replay. // @param[out] share::SCN&, max decided log scn. DELEGATE_WITH_RET(log_handler_, get_max_decided_scn, int); - // @breif, check request server is in self member list + // @brief, get member stat: whether in paxos member list or learner list and whether it is migrating // @param[in] const common::ObAddr, request server. - // @param[out] bool&, whether in self member list. 
- DELEGATE_WITH_RET(log_handler_, is_valid_member, int); + // @param[out] bool &(is_valid_member), + // @param[out] LogMemberGCStat&, + DELEGATE_WITH_RET(log_handler_, get_member_gc_stat, int); // @brief append count bytes from the buffer starting at buf to the palf handle, return the LSN and timestamp // @param[in] const void *, the data buffer. // @param[in] const uint64_t, the length of data buffer. @@ -639,9 +640,10 @@ public: DELEGATE_WITH_RET(member_list_service_, switch_learner_to_acceptor, int); DELEGATE_WITH_RET(member_list_service_, add_member, int); DELEGATE_WITH_RET(member_list_service_, replace_member, int); + DELEGATE_WITH_RET(member_list_service_, replace_member_with_learner, int); DELEGATE_WITH_RET(member_list_service_, get_config_version_and_transfer_scn, int); DELEGATE_WITH_RET(log_handler_, add_learner, int); //TODO(yanfeng): fix it - DELEGATE_WITH_RET(log_handler_, replace_learner, int); //TODO(yanfeng): fix it + DELEGATE_WITH_RET(log_handler_, replace_learners, int); DELEGATE_WITH_RET(block_tx_service_, ha_block_tx, int); DELEGATE_WITH_RET(block_tx_service_, ha_kill_tx, int); DELEGATE_WITH_RET(block_tx_service_, ha_unblock_tx, int); diff --git a/unittest/logservice/test_log_config_mgr.cpp b/unittest/logservice/test_log_config_mgr.cpp index 6ac642d76..f5a47693f 100755 --- a/unittest/logservice/test_log_config_mgr.cpp +++ b/unittest/logservice/test_log_config_mgr.cpp @@ -831,7 +831,61 @@ TEST_F(TestLogConfigMgr, test_apply_config_meta) expect_finished_list.push_back(false); expect_member_list.push_back(init_member_list); expect_member_list.back().add_server(addr4); - + // 54. SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM, not learner + config_info_list.push_back(default_config_info); + arg_list.push_back(LogConfigChangeArgs(ObMember(addr4, -1), 0, SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM)); + expect_ret_list.push_back(OB_INVALID_ARGUMENT); + expect_finished_list.push_back(false); + expect_member_list.push_back(init_member_list); + // 55. 
SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM, learner + config_info_list.push_back(three_f_one_learner_config_info); + arg_list.push_back(LogConfigChangeArgs(ObMember(addr4, -1), 0, SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM)); + expect_ret_list.push_back(OB_SUCCESS); + expect_finished_list.push_back(false); + expect_member_list.push_back(init_member_list); + expect_member_list.back().add_server(addr4); + // 56. SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM, already finish + config_info_list.push_back(default_config_info); + common::ObMember migrating_learner = ObMember(addr3, -1); + migrating_learner.set_migrating(); + arg_list.push_back(LogConfigChangeArgs(migrating_learner, 0, SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM)); + expect_ret_list.push_back(OB_SUCCESS); + expect_finished_list.push_back(true); + expect_member_list.push_back(init_member_list); + // 57. SWITCH_LEARNER_TO_ACCEPTOR, already finish + config_info_list.push_back(default_config_info); + arg_list.push_back(LogConfigChangeArgs(migrating_learner, 3, SWITCH_LEARNER_TO_ACCEPTOR)); + expect_ret_list.push_back(OB_SUCCESS); + expect_finished_list.push_back(true); + expect_member_list.push_back(init_member_list); + // 58. SWITCH_LEARNER_TO_ACCEPTOR, already finish + config_info_list.push_back(default_config_info); + common::ObMember migrated_member = migrating_learner; + migrated_member.reset_migrating(); + arg_list.push_back(LogConfigChangeArgs(migrated_member, 3, SWITCH_LEARNER_TO_ACCEPTOR)); + expect_ret_list.push_back(OB_SUCCESS); + expect_finished_list.push_back(true); + expect_member_list.push_back(init_member_list); + // 59. 
REPLACE_LEARNERS, already finish + config_info_list.push_back(three_f_one_learner_config_info); + common::ObMemberList added_learners; + common::ObMemberList removed_learners; + added_learners.add_server(addr4); + removed_learners.add_server(addr5); + arg_list.push_back(LogConfigChangeArgs(added_learners, removed_learners, REPLACE_LEARNERS)); + expect_ret_list.push_back(OB_SUCCESS); + expect_finished_list.push_back(true); + expect_member_list.push_back(init_member_list); + // 60. REPLACE_LEARNERS, INVALID ARGUMENT + config_info_list.push_back(three_f_one_learner_config_info); + added_learners.remove_server(addr4); + common::ObMember added_learner(addr3, -1); + added_learner.set_migrating(); + added_learners.add_member(added_learner); + arg_list.push_back(LogConfigChangeArgs(added_learners, removed_learners, REPLACE_LEARNERS)); + expect_ret_list.push_back(OB_INVALID_ARGUMENT); + expect_finished_list.push_back(false); + expect_member_list.push_back(init_member_list); for (int i = 0; i < arg_list.size(); ++i) { PALF_LOG(INFO, "test_check_config_change_args begin case", K(i+1)); LogConfigMgr cm; @@ -1124,6 +1178,93 @@ TEST_F(TestLogConfigMgr, test_apply_config_meta) EXPECT_EQ(tmp_ret, OB_INVALID_ARGUMENT) << "remove(C, 5)"; PALF_LOG(INFO, "test_check_config_change_args end case 32"); } + { + // 33. 2F1A - abc, degrade b, migrate b to d: add d as learner, remove degraded b. 
+ PALF_LOG(INFO, "test_check_config_change_args begin case 33"); + LogConfigMgr cm; + init_test_log_config_env(addr1, two_f_one_a_config_info, cm); + bool already_finished = false; + ObMemberList expect_member_list; + int64_t expect_replica_num = 0; + // degrade B + LogConfigChangeArgs degrade_b_arg(ObMember(addr2, -1), 0, palf::DEGRADE_ACCEPTOR_TO_LEARNER); + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, degrade_b_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 1); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 1); + // add d as learner + LogConfigVersion config_version; + common::ObMember migrating_d = common::ObMember(addr4, -1); + migrating_d.set_migrating(); + LogConfigChangeArgs add_d_arg(migrating_d, 0, palf::ADD_LEARNER); + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, add_d_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 1); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 1); + // add d as learner, reentrant + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, add_d_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 1); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 1); + // switch d to acceptor + EXPECT_EQ(OB_SUCCESS, cm.get_config_version(config_version)); + LogConfigChangeArgs switch_d_arg(migrating_d, 2, config_version, palf::SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM); + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, switch_d_arg, already_finished)); + 
EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 3); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + // switch d to acceptor, reentrant + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, switch_d_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 3); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + // remove B and num + LogConfigChangeArgs remove_b_arg(ObMember(addr2, -1), 0, palf::REMOVE_MEMBER_AND_NUM); + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, remove_b_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + // remove B and num, reentrant + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, remove_b_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + PALF_LOG(INFO, "test_check_config_change_args end case 33"); + } + { + // 32. 
4F1A, remove(D, 5), remove(C, 5), + PALF_LOG(INFO, "test_check_config_change_args begin case 32"); + LogConfigMgr cm; + init_test_log_config_env(addr1, four_f_one_a_config_info, cm); + bool already_finished = false; + int tmp_ret = OB_SUCCESS; + LSN prev_lsn; + prev_lsn.val_ = PALF_INITIAL_LSN_VAL; + // remove(D, 5) + LogConfigChangeArgs remove_d_arg(ObMember(addr4, -1), 5, palf::REMOVE_MEMBER); + tmp_ret = cm.append_config_meta_(1, remove_d_arg, already_finished); + EXPECT_EQ(tmp_ret, OB_SUCCESS) << "remove(D, 5)"; + EXPECT_EQ(already_finished, false) << "remove(D, 5)"; + ObMemberList expect_member_list = four_f_member_list; + expect_member_list.remove_server(addr4); + expect_member_list.add_server(addr5); + // memberlist will not be applied right now when there is arb member, so use alive_paxos_memberlist_ + bool member_equal = (cm.alive_paxos_memberlist_.member_addr_equal(expect_member_list)); + EXPECT_TRUE(member_equal); + // remove(C, 5) + cm.reset_status(); + LogConfigChangeArgs remove_c_arg(ObMember(addr3, -1), 5, palf::REMOVE_MEMBER); + tmp_ret = cm.append_config_meta_(1, remove_c_arg, already_finished); + EXPECT_EQ(tmp_ret, OB_INVALID_ARGUMENT) << "remove(C, 5)"; + PALF_LOG(INFO, "test_check_config_change_args end case 32"); + } { // 33. 2F1A - abc, degrade b, migrate b to d: add d as learner, remove degraded b. 
PALF_LOG(INFO, "test_check_config_change_args begin case 33"); @@ -1156,32 +1297,32 @@ TEST_F(TestLogConfigMgr, test_apply_config_meta) EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 1); EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 1); // switch d to acceptor - // EXPECT_EQ(OB_SUCCESS, cm.get_config_version(config_version)); - // LogConfigChangeArgs switch_d_arg(migrating_d, 2, config_version, palf::SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM); - // EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, switch_d_arg, already_finished)); - // EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); - // EXPECT_EQ(expect_replica_num, 3); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); - // // switch d to acceptor, reentrant - // EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, switch_d_arg, already_finished)); - // EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); - // EXPECT_EQ(expect_replica_num, 3); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + EXPECT_EQ(OB_SUCCESS, cm.get_config_version(config_version)); + LogConfigChangeArgs switch_d_arg(migrating_d, 2, config_version, palf::SWITCH_LEARNER_TO_ACCEPTOR_AND_NUM); + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, switch_d_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 3); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + // switch d to acceptor, reentrant + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, switch_d_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, 
cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 3); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); // remove B and num - // LogConfigChangeArgs remove_b_arg(ObMember(addr2, -1), 0, palf::REMOVE_MEMBER_AND_NUM); - // EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, remove_b_arg, already_finished)); - // EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); - // EXPECT_EQ(expect_replica_num, 2); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); - // // remove B and num, reentrant - // EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, remove_b_arg, already_finished)); - // EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); - // EXPECT_EQ(expect_replica_num, 2); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); - // EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + LogConfigChangeArgs remove_b_arg(ObMember(addr2, -1), 0, palf::REMOVE_MEMBER_AND_NUM); + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, remove_b_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); + // remove B and num, reentrant + EXPECT_EQ(OB_SUCCESS, cm.append_config_meta_(1, remove_b_arg, already_finished)); + EXPECT_EQ(OB_SUCCESS, cm.get_curr_member_list(expect_member_list, expect_replica_num)); + EXPECT_EQ(expect_replica_num, 2); + EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_replica_num_, 2); + 
EXPECT_EQ(cm.log_ms_meta_.curr_.config_.log_sync_memberlist_.get_member_number(), 2); PALF_LOG(INFO, "test_check_config_change_args end case 31"); } } diff --git a/unittest/storage/mock_ob_log_handler.h b/unittest/storage/mock_ob_log_handler.h index 1962e070e..dc44b4903 100644 --- a/unittest/storage/mock_ob_log_handler.h +++ b/unittest/storage/mock_ob_log_handler.h @@ -333,15 +333,26 @@ public: return ret; } - int replace_learner(const common::ObMember &added_learner, - const common::ObMember &removed_learner, - const int64_t timeout_us) + int replace_learners(const common::ObMemberList &added_learners, + const common::ObMemberList &removed_learners, + const int64_t timeout_us) { int ret = OB_SUCCESS; - UNUSEDx(added_learner, removed_learner, timeout_us); + UNUSEDx(added_learners, removed_learners, timeout_us); return ret; } + int replace_member_with_learner(const common::ObMember &added_member, + const common::ObMember &removed_member, + const palf::LogConfigVersion &config_version, + const int64_t timeout_us) + { + int ret = OB_SUCCESS; + UNUSEDx(added_member, removed_member, timeout_us); + return ret; + } + + int switch_learner_to_acceptor(const common::ObMember &learner, const int64_t new_replica_num, const palf::LogConfigVersion &config_version, @@ -382,10 +393,9 @@ public: UNUSEDx(upgrade_servers, timeout_us); return OB_SUCCESS; } - int is_valid_member(const common::ObAddr &addr, bool &is_valid) const + int get_member_gc_stat(const common::ObAddr &addr, bool &is_valid_member, obrpc::LogMemberGCStat &stat) const { - UNUSED(addr); - UNUSED(is_valid); + UNUSEDx(addr, is_valid_member, stat); return OB_SUCCESS; } int set_region(const common::ObRegion &region)