diff --git a/deps/oblib/src/lib/stat/ob_latch_define.h b/deps/oblib/src/lib/stat/ob_latch_define.h index cce27ef19..b8c7348f8 100644 --- a/deps/oblib/src/lib/stat/ob_latch_define.h +++ b/deps/oblib/src/lib/stat/ob_latch_define.h @@ -309,6 +309,8 @@ LATCH_DEF(SQL_WF_PARTICIPATOR_COND_LOCK, 296, "window function participator lock LATCH_DEF(ARB_SERVER_CONFIG_LOCK, 297, "arbserver config lock", LATCH_FIFO, 2000, 0, ARB_SERVER_CONFIG_WAIT, "arbserver config lock") LATCH_DEF(CDC_SERVICE_LS_CTX_LOCK, 298, "cdcservice clientlsctx lock", LATCH_FIFO, 2000, 0, CDC_SERVICE_LS_CTX_LOCK_WAIT, "cdcservice clientlsctx lock") LATCH_DEF(MAJOR_FREEZE_DIAGNOSE_LOCK, 299, "major freeze diagnose lock", LATCH_READ_PREFER, 2000, 0, MAJOR_FREEZE_DIAGNOSE_LOCK_WAIT, "major freeze diagnose lock") +LATCH_DEF(HB_RESPONSES_LOCK, 300, "hb responses lock", LATCH_READ_PREFER, 2000, 0, HB_RESPONSES_LOCK_WAIT, "hb responses lock") +LATCH_DEF(ALL_SERVERS_INFO_IN_TABLE_LOCK, 301, "all servers info in table lock", LATCH_READ_PREFER, 2000, 0, ALL_SERVERS_INFO_IN_TABLE_LOCK_WAIT, "all servers info in table lock") LATCH_DEF(LATCH_END, 99999, "latch end", LATCH_FIFO, 2000, 0, WAIT_EVENT_END, "latch end") #endif diff --git a/deps/oblib/src/lib/wait_event/ob_wait_event.h b/deps/oblib/src/lib/wait_event/ob_wait_event.h index fd0a2c232..dbd0e5f06 100644 --- a/deps/oblib/src/lib/wait_event/ob_wait_event.h +++ b/deps/oblib/src/lib/wait_event/ob_wait_event.h @@ -300,6 +300,8 @@ WAIT_EVENT_DEF(TENANT_IO_CONFIG_WAIT, 15254, "rwlock: tenant io config wait", "a WAIT_EVENT_DEF(SQL_WF_PARTICIPATOR_LOCK_WAIT, 15255, "latch: window function participator cond lock wait", "address", "", "", CONCURRENCY, "window function participator cond lock wait", true) WAIT_EVENT_DEF(SQL_WF_PARTICIPATOR_COND_WAIT, 15256, "mutex: window function participator cond wait", "address", "", "", CONCURRENCY, "window function participator cond wait", true) WAIT_EVENT_DEF(MAJOR_FREEZE_DIAGNOSE_LOCK_WAIT, 15257, "latch: major_freeze diagnose lock 
wait", "address", "number", "tries", CONCURRENCY, "latch: major_freeze diagnose lock wait", true) +WAIT_EVENT_DEF(HB_RESPONSES_LOCK_WAIT, 15258, "latch: hb responses lock wait", "address", "number", "tries", CONCURRENCY, "latch: hb responses lock wait", true) +WAIT_EVENT_DEF(ALL_SERVERS_INFO_IN_TABLE_LOCK_WAIT, 15259, "latch: all servers info in table lock wait", "address", "number", "tries", CONCURRENCY, "latch: all servers info in table lock wait", true) //transaction WAIT_EVENT_DEF(END_TRANS_WAIT, 16001, "wait end trans", "rollback", "trans_hash_value", "participant_count", COMMIT,"wait end trans", false) diff --git a/deps/oblib/src/lib/wait_event/ob_wait_event_desc.md b/deps/oblib/src/lib/wait_event/ob_wait_event_desc.md index 68017533a..600b4e6e5 100644 --- a/deps/oblib/src/lib/wait_event/ob_wait_event_desc.md +++ b/deps/oblib/src/lib/wait_event/ob_wait_event_desc.md @@ -315,3 +315,9 @@ The read and write operation on configs in gc handler should be mutually exclusive. ## latch: cdcservice clientlsctx lock wait The read and write operation on source(RemoteLogParent) in ClientLSCtx should be mutually exclusive. +## latch: hb responses lock wait + +The read and write operation on hb_responses_ should be mutually exclusive. +## latch: all servers info in table lock wait + +The read and write operation on all_servers_info_in_table_ should be mutually exclusive. 
diff --git a/deps/oblib/src/rpc/obrpc/ob_rpc_packet_list.h b/deps/oblib/src/rpc/obrpc/ob_rpc_packet_list.h index fef61f449..c02f53eff 100644 --- a/deps/oblib/src/rpc/obrpc/ob_rpc_packet_list.h +++ b/deps/oblib/src/rpc/obrpc/ob_rpc_packet_list.h @@ -53,7 +53,7 @@ PCODE_DEF(OB_START_ZONE, 0x110) PCODE_DEF(OB_STOP_ZONE, 0x111) PCODE_DEF(OB_TENANT_MGR, 0x113) -PCODE_DEF(OB_MERGE_FINISH, 0x114) +// PCODE_DEF(OB_MERGE_FINISH, 0x114) //PCODE_DEF(OB_MERGE_ERROR, 0x115) // 4.0 not supported PCODE_DEF(OB_START_SERVER, 0x116) PCODE_DEF(OB_STOP_SERVER, 0x117) @@ -85,7 +85,7 @@ PCODE_DEF(OB_CHECK_DEPLOYMENT_MODE, 0x143) //PCODE_DEF(OB_GET_CLUSTER_STATS, 0x146)// 4.0 not supported PCODE_DEF(OB_WAIT_MASTER_KEY_IN_SYNC, 0x147) PCODE_DEF(OB_GET_REGION_BW, 0x148) -PCODE_DEF(OB_FETCH_ACTIVE_SERVER_STATUS, 0x149) +// PCODE_DEF(OB_FETCH_ACTIVE_SERVER_STATUS, 0x149) PCODE_DEF(OB_DETECT_MASTER_RS_LS, 0x14A) //PCODE_DEF(OB_DETECT_MASTER_RS_LOG_STREAM, 0x14B) // for rpc_log_stream_table's get PCODE_DEF(OB_BATCH_BROADCAST_SCHEMA, 0x14C) @@ -185,7 +185,7 @@ PCODE_DEF(OB_DO_SEQUENCE_DDL, 0x245) PCODE_DEF(OB_CREATE_TENANT_END, 0x248) //PCODE_DEF(OB_REACH_PARTITION_LIMIT, 0x24A) //PCODE_DEF(OB_ALTER_CLUSTER_INFO, 0x24B)// 4.0 not supported -PCODE_DEF(OB_CHECK_MERGE_FINISH, 0x24C) +//PCODE_DEF(OB_CHECK_MERGE_FINISH, 0x24C) //PCODE_DEF(OB_CHECK_CLUSTER_VALID_TO_ADD, 0x24D)// 4.0 not supported PCODE_DEF(OB_FLASHBACK_TABLE_TO_SCN, 0x24E) //PCODE_DEF(OB_GET_STANDBY_CLUSTER_STATISTIC, 0x24F)// 4.0 not supported diff --git a/mittest/simple_server/CMakeLists.txt b/mittest/simple_server/CMakeLists.txt index 0376b9873..6482714f8 100644 --- a/mittest/simple_server/CMakeLists.txt +++ b/mittest/simple_server/CMakeLists.txt @@ -46,4 +46,5 @@ ob_unittest_observer(test_fast_commit_report fast_commit_report.cpp) ob_unittest_observer(test_mvcc_gc test_mvcc_gc.cpp) ob_unittest_observer(test_ob_simple_rto test_ob_simple_rto.cpp) ob_unittest_observer(test_all_virtual_proxy_partition_info_default_value 
test_all_virtual_proxy_partition_info_default_value.cpp) +ob_unittest_observer(test_get_stopped_zone_list test_get_stopped_zone_list.cpp) ob_unittest_observer(test_lock_table_with_tx test_lock_table_with_tx.cpp) diff --git a/mittest/simple_server/test_get_stopped_zone_list.cpp b/mittest/simple_server/test_get_stopped_zone_list.cpp new file mode 100644 index 000000000..57e7d39f5 --- /dev/null +++ b/mittest/simple_server/test_get_stopped_zone_list.cpp @@ -0,0 +1,102 @@ +/** + * Copyright (c) 2021 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ +#define USING_LOG_PREFIX SHARE +#include +#include +#include "lib/string/ob_sql_string.h" // ObSqlString +#include "lib/mysqlclient/ob_mysql_proxy.h" // ObISqlClient, SMART_VAR +#include "observer/ob_sql_client_decorator.h" // ObSQLClientRetryWeak +#include "env/ob_simple_cluster_test_base.h" +#include "lib/ob_errno.h" +#include "lib/oblog/ob_log.h" +#include "rootserver/ob_root_utils.h" +#include "share/ob_server_table_operator.h" +#include "share/ob_zone_table_operation.h" +#define SQL_PROXY (get_curr_simple_server().get_observer().get_mysql_proxy()) +namespace oceanbase +{ +using namespace unittest; +namespace share +{ +using ::testing::_; +using ::testing::Invoke; +using ::testing::Return; +using namespace schema; +using namespace common; +class TestGetStoppedZoneList : public unittest::ObSimpleClusterTestBase +{ +public: + TestGetStoppedZoneList() : unittest::ObSimpleClusterTestBase("test_get_stopped_zone_list") {} +}; +TEST_F(TestGetStoppedZoneList, GetStoppedZoneList) +{ 
+ // empty zone z3 is stopped + // server2 in z2 is stopped + // stopped_zone_list should be z2, z3, stopped_server_list should be server2 + // have_other_stop_task is also tested + ObServerInfoInTable server_info_in_table; + ObAddr server2; + ObServerTableOperator st_operator; + int64_t affected_rows = 0; + ObZone z2("z2"); + ObZone z3("z3"); + ObSqlString sql; + ASSERT_EQ(OB_SUCCESS, st_operator.init(&SQL_PROXY)); + ASSERT_TRUE(server2.set_ip_addr("127.0.0.1", 11111)); + + ASSERT_FALSE(rootserver::ObRootUtils::have_other_stop_task(GCONF.zone.str())); + + ASSERT_EQ(OB_SUCCESS, sql.assign_fmt("alter system add zone z2")); + ASSERT_EQ(OB_SUCCESS, SQL_PROXY.write(OB_SYS_TENANT_ID, sql.ptr(), affected_rows)); + sql.reset(); + + ASSERT_TRUE(rootserver::ObRootUtils::have_other_stop_task(GCONF.zone.str())); + + ASSERT_EQ(OB_SUCCESS, sql.assign_fmt("alter system start zone z2")); + ASSERT_EQ(OB_SUCCESS, SQL_PROXY.write(OB_SYS_TENANT_ID, sql.ptr(), affected_rows)); + sql.reset(); + ASSERT_FALSE(rootserver::ObRootUtils::have_other_stop_task(GCONF.zone.str())); + int ret = server_info_in_table.init(server2, 2, "z2", 15432, false, ObServerStatus::OB_SERVER_ACTIVE, "test_version", 5558888, 55555, 0); + ASSERT_EQ(OB_SUCCESS, ret); + ret = st_operator.insert(SQL_PROXY, server_info_in_table); + ASSERT_EQ(OB_SUCCESS, ret); + ASSERT_TRUE(rootserver::ObRootUtils::have_other_stop_task(GCONF.zone.str())); + + ASSERT_EQ(OB_SUCCESS, sql.assign_fmt("alter system add zone z3")); + ASSERT_EQ(OB_SUCCESS, SQL_PROXY.write(OB_SYS_TENANT_ID, sql.ptr(), affected_rows)); + + ObArray active_zone_list; + ObArray inactive_zone_list; + ASSERT_EQ(OB_SUCCESS, ObZoneTableOperation::get_active_zone_list(SQL_PROXY, active_zone_list)); + ASSERT_EQ(OB_SUCCESS, ObZoneTableOperation::get_inactive_zone_list(SQL_PROXY, inactive_zone_list)); + ASSERT_EQ(z3, inactive_zone_list.at(0)); + ASSERT_EQ(2, active_zone_list.count()); + + ObArray stopped_zone_list; + ObArray stopped_server_list; + ret = 
rootserver::ObRootUtils::get_stopped_zone_list(stopped_zone_list, stopped_server_list); + ASSERT_EQ(OB_SUCCESS, ret); + ASSERT_EQ(1, stopped_server_list.count()); + ASSERT_EQ(server2, stopped_server_list.at(0)); + ASSERT_EQ(2, stopped_zone_list.count()); + ASSERT_TRUE(has_exist_in_array(stopped_zone_list, z2)); + ASSERT_TRUE(has_exist_in_array(stopped_zone_list, z3)); +} +} // share +} // oceanbase +int main(int argc, char **argv) +{ + init_log_and_gtest(argc, argv); + OB_LOGGER.set_log_level("INFO"); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/src/logservice/ob_log_base_type.h b/src/logservice/ob_log_base_type.h index 67983083e..84a667417 100644 --- a/src/logservice/ob_log_base_type.h +++ b/src/logservice/ob_log_base_type.h @@ -75,6 +75,8 @@ enum ObLogBaseType // for arbitration service ARBITRATION_SERVICE_LOG_BASE_TYPE = 21, + + HEARTBEAT_SERVICE_LOG_BASE_TYPE = 22, // pay attention!!! // add log type in log_base_type_to_string // max value @@ -133,6 +135,8 @@ int log_base_type_to_string(const ObLogBaseType log_type, strncpy(str ,"DATA_DICTIONARY_SERVICE", str_len); } else if (log_type == ARBITRATION_SERVICE_LOG_BASE_TYPE) { strncpy(str ,"ARBITRATION_SERVICE", str_len); + } else if (log_type == HEARTBEAT_SERVICE_LOG_BASE_TYPE) { + strncpy(str ,"HEARTBEAT_SERVICE", str_len); } else { ret = OB_INVALID_ARGUMENT; } diff --git a/src/observer/CMakeLists.txt b/src/observer/CMakeLists.txt index 8d06a21ea..6b2afb279 100644 --- a/src/observer/CMakeLists.txt +++ b/src/observer/CMakeLists.txt @@ -6,6 +6,7 @@ ob_set_subtarget(ob_server ALONE ob_set_subtarget(ob_server common ob_dump_task_generator.cpp ob_heartbeat.cpp + ob_heartbeat_handler.cpp ob_inner_sql_rpc_proxy.cpp ob_inner_sql_rpc_processor.cpp ob_inner_sql_connection.cpp diff --git a/src/observer/dbms_job/ob_dbms_job_master.cpp b/src/observer/dbms_job/ob_dbms_job_master.cpp index ae6a7930a..3a48d30b7 100644 --- 
a/src/observer/dbms_job/ob_dbms_job_master.cpp +++ b/src/observer/dbms_job/ob_dbms_job_master.cpp @@ -25,6 +25,7 @@ #include "lib/profile/ob_trace_id.h" #include "share/partition_table/ob_partition_location.h" +#include "share/ob_all_server_tracer.h" #include "observer/ob_server_struct.h" namespace oceanbase @@ -235,8 +236,7 @@ ObDBMSJobMaster &ObDBMSJobMaster::get_instance() return master_; } -int ObDBMSJobMaster::init(ObServerManager *server_mgr, - ObISQLClient *sql_client, +int ObDBMSJobMaster::init(ObISQLClient *sql_client, ObMultiVersionSchemaService *schema_service) { int ret = OB_SUCCESS; @@ -247,16 +247,12 @@ int ObDBMSJobMaster::init(ObServerManager *server_mgr, if (inited_) { ret = OB_INIT_TWICE; LOG_WARN("dbms job master already inited", K(ret), K(inited_)); - } else if (OB_ISNULL(server_mgr) - || OB_ISNULL(sql_client) + } else if (OB_ISNULL(sql_client) || OB_ISNULL(schema_service) || OB_ISNULL(GCTX.dbms_job_rpc_proxy_) ) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("null ptr", K(ret), K(server_mgr), K(sql_client), K(schema_service)); - } else if (!server_mgr->is_inited()) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not init yet", K(ret)); + LOG_WARN("null ptr", K(ret), K(sql_client), K(schema_service)); } else if (OB_FAIL(ready_queue_.init(ready_queue_size))) { LOG_WARN("fail to init ready job queue for all jobs", K(ret)); } else if (OB_FAIL(scheduler_task_.init(&ready_queue_))) { @@ -273,7 +269,6 @@ int ObDBMSJobMaster::init(ObServerManager *server_mgr, } else { trace_id_ = ObCurTraceId::get(); self_addr_ = GCONF.self_addr_; - server_mgr_ = server_mgr; schema_service_ = schema_service; job_rpc_proxy_ = GCTX.dbms_job_rpc_proxy_; inited_ = true; @@ -481,9 +476,6 @@ int ObDBMSJobMaster::get_all_servers(int64_t tenant_id, ObString &pick_zone, ObI } else if (OB_INVALID_ID == tenant_id) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid job id", K(ret), K(tenant_id)); - } else if (!server_mgr_->is_inited()) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not init 
yet!", K(ret)); } else if (OB_FAIL(schema_service_->get_tenant_schema_guard(OB_SYS_TENANT_ID, schema_guard))) { LOG_WARN("fail get schema guard", K(ret)); } else if (OB_FAIL(schema_guard.get_tenant_info(tenant_id, tenant_info))) { @@ -500,8 +492,8 @@ int ObDBMSJobMaster::get_all_servers(int64_t tenant_id, ObString &pick_zone, ObI if (pick_zone.empty() || 0 == pick_zone.case_compare(dbms_job::ObDBMSJobInfo::__ALL_SERVER_BC) || 0 == pick_zone.case_compare(zone.str())) { - if (OB_FAIL(server_mgr_->get_alive_servers(zone, server_list))) { - LOG_WARN("fail to get zone server list", K(ret)); + if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, server_list))) { + LOG_WARN("fail to get zone server list", KR(ret), K(zone)); } else { for (int64_t j = 0; OB_SUCC(ret) && j < server_list.count(); j++) { if (common::is_contain(servers, server_list.at(j))) { @@ -537,12 +529,16 @@ int ObDBMSJobMaster::server_random_pick(int64_t tenant_id, ObString &pick_zone, while (OB_SUCC(ret) && cnt < total_server.count()) { pos = (pos + 1) % total_server.count(); pick = total_server.at(pos); - server_mgr_->check_server_alive(pick, is_alive); - server_mgr_->check_server_active(pick, is_active); - if (is_alive && is_active) { - break; + if (OB_FAIL(SVR_TRACER.check_server_alive(pick, is_alive))) { + LOG_WARN("fail to check server alive", KR(ret), K(pick)); + } else if (OB_FAIL(SVR_TRACER.check_server_active(pick, is_active))) { + LOG_WARN("fail to check server active", KR(ret), K(pick)); + } else { + if (is_alive && is_active) { + break; + } + cnt++; } - cnt++; } if (OB_FAIL(ret)) { } else if (cnt >= total_server.count()) { diff --git a/src/observer/dbms_job/ob_dbms_job_master.h b/src/observer/dbms_job/ob_dbms_job_master.h index 3c6d43623..99efaddd8 100644 --- a/src/observer/dbms_job/ob_dbms_job_master.h +++ b/src/observer/dbms_job/ob_dbms_job_master.h @@ -29,7 +29,6 @@ #include "share/schema/ob_schema_service.h" #include "share/schema/ob_multi_version_schema_service.h" -#include 
"rootserver/ob_server_manager.h" #include "rootserver/ob_ddl_service.h" @@ -159,7 +158,6 @@ public: running_(false), trace_id_(NULL), rand_(), - server_mgr_(NULL), schema_service_(NULL), job_rpc_proxy_(NULL), self_addr_(), @@ -172,8 +170,7 @@ public: bool is_inited() { return inited_; } - int init(rootserver::ObServerManager *server_mgr, - common::ObISQLClient *sql_client, + int init(common::ObISQLClient *sql_client, share::schema::ObMultiVersionSchemaService *schema_service); int start(); @@ -211,7 +208,6 @@ private: const uint64_t *trace_id_; common::ObRandom rand_; // for random pick server - rootserver::ObServerManager *server_mgr_; share::schema::ObMultiVersionSchemaService *schema_service_; // for got all tenant info obrpc::ObDBMSJobRpcProxy *job_rpc_proxy_; diff --git a/src/observer/dbms_scheduler/ob_dbms_sched_job_master.cpp b/src/observer/dbms_scheduler/ob_dbms_sched_job_master.cpp index 151c97897..e66ce007e 100644 --- a/src/observer/dbms_scheduler/ob_dbms_sched_job_master.cpp +++ b/src/observer/dbms_scheduler/ob_dbms_sched_job_master.cpp @@ -25,6 +25,7 @@ #include "lib/profile/ob_trace_id.h" #include "share/partition_table/ob_partition_location.h" +#include "share/ob_all_server_tracer.h" #include "observer/ob_server_struct.h" #include "rootserver/ob_root_service.h" @@ -242,8 +243,7 @@ ObDBMSSchedJobMaster &ObDBMSSchedJobMaster::get_instance() return master_; } -int ObDBMSSchedJobMaster::init(ObServerManager *server_mgr, - ObUnitManager *unit_mgr, +int ObDBMSSchedJobMaster::init(ObUnitManager *unit_mgr, ObISQLClient *sql_client, ObMultiVersionSchemaService *schema_service) { @@ -251,17 +251,13 @@ int ObDBMSSchedJobMaster::init(ObServerManager *server_mgr, if (inited_) { ret = OB_INIT_TWICE; LOG_WARN("dbms sched job master already inited", K(ret), K(inited_)); - } else if (OB_ISNULL(server_mgr) - || OB_ISNULL(unit_mgr) + } else if (OB_ISNULL(unit_mgr) || OB_ISNULL(sql_client) || OB_ISNULL(schema_service) || OB_ISNULL(GCTX.dbms_sched_job_rpc_proxy_) ) { ret 
= OB_ERR_UNEXPECTED; - LOG_WARN("null ptr", K(ret), K(server_mgr), K(unit_mgr), K(sql_client), K(schema_service)); - } else if (!server_mgr->is_inited()) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not init yet", K(ret)); + LOG_WARN("null ptr", K(ret), K(unit_mgr), K(sql_client), K(schema_service)); } else if (OB_FAIL(ready_queue_.init(MAX_READY_JOBS_CAPACITY))) { LOG_WARN("fail to init ready job queue for all jobs", K(ret)); } else if (OB_FAIL(scheduler_task_.init())) { @@ -278,7 +274,6 @@ int ObDBMSSchedJobMaster::init(ObServerManager *server_mgr, } else { trace_id_ = ObCurTraceId::get(); self_addr_ = GCONF.self_addr_; - server_mgr_ = server_mgr; unit_mgr_ = unit_mgr; schema_service_ = schema_service; job_rpc_proxy_ = GCTX.dbms_sched_job_rpc_proxy_; @@ -486,16 +481,16 @@ int ObDBMSSchedJobMaster::server_random_pick(int64_t tenant_id, ObString &pick_z } else if (OB_INVALID_ID == tenant_id) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid job id", K(ret), K(tenant_id)); - } else if (!server_mgr_->is_inited()) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not init yet!", K(ret)); + } else if (OB_ISNULL(schema_service_) || OB_ISNULL(unit_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("schema_service_ or unit_mgr_ is null", KR(ret), KP(schema_service_), KP(unit_mgr_)); } else if (OB_FAIL(schema_service_->get_tenant_schema_guard(OB_SYS_TENANT_ID, schema_guard))) { LOG_WARN("fail get schema guard", K(ret)); } else if (OB_FAIL(schema_guard.get_tenant_info(tenant_id, tenant_info))) { LOG_WARN("fail to get tenant info", K(ret), K(tenant_id)); } else if (OB_ISNULL(tenant_info)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("null ptr", K(ret), K(tenant_info)); + LOG_WARN("null ptr", K(ret), KP(tenant_info)); } else if (OB_FAIL(tenant_info->get_zone_list(zone_list))) { LOG_WARN("fail to get zone list", K(ret)); } else { @@ -503,8 +498,8 @@ int ObDBMSSchedJobMaster::server_random_pick(int64_t tenant_id, ObString &pick_z common::ObZone zone = zone_list.at(i); common::ObArray 
server_list; if (pick_zone.empty() || 0 == pick_zone.case_compare(zone.str())) { - if (OB_FAIL(server_mgr_->get_alive_servers(zone, server_list))) { - LOG_WARN("fail to get zone server list", K(ret)); + if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, server_list))) { + LOG_WARN("fail to get zone server list", KR(ret), K(zone)); } else { for (int64_t j = 0; OB_SUCC(ret) && j < server_list.count(); j++) { if (OB_FAIL(total_server.push_back(server_list.at(j)))) { @@ -529,13 +524,18 @@ int ObDBMSSchedJobMaster::server_random_pick(int64_t tenant_id, ObString &pick_z do { pos = (pos + 1) % total_server.count(); pick = total_server.at(pos); - server_mgr_->check_server_alive(pick, is_alive); - server_mgr_->check_server_active(pick, is_active); - unit_mgr_->check_tenant_on_server(tenant_id, pick, on_server); - if (is_alive && is_active && on_server) { - break; + if (OB_FAIL(SVR_TRACER.check_server_alive(pick, is_alive))) { + LOG_WARN("fail to check server alive", KR(ret), K(pick)); + } else if (OB_FAIL(SVR_TRACER.check_server_active(pick, is_active))) { + LOG_WARN("fail to check server active", KR(ret), K(pick)); + } else if (OB_FAIL(unit_mgr_->check_tenant_on_server(tenant_id, pick, on_server))) { + LOG_WARN("fail to check tenant on server", KR(ret), K(tenant_id), K(pick)); + } else { + if (is_alive && is_active && on_server) { + break; + } + cnt++; } - cnt++; } while (cnt < total_server.count()); if (cnt >= total_server.count()) { ret = OB_ERR_UNEXPECTED; diff --git a/src/observer/dbms_scheduler/ob_dbms_sched_job_master.h b/src/observer/dbms_scheduler/ob_dbms_sched_job_master.h index 1cc5ad95e..58252edcf 100644 --- a/src/observer/dbms_scheduler/ob_dbms_sched_job_master.h +++ b/src/observer/dbms_scheduler/ob_dbms_sched_job_master.h @@ -30,7 +30,6 @@ #include "share/schema/ob_schema_service.h" #include "share/schema/ob_multi_version_schema_service.h" -#include "rootserver/ob_server_manager.h" #include "rootserver/ob_ddl_service.h" @@ -168,7 +167,6 @@ public: running_(false), 
trace_id_(NULL), rand_(), - server_mgr_(NULL), schema_service_(NULL), job_rpc_proxy_(NULL), self_addr_(), @@ -181,8 +179,7 @@ public: bool is_inited() { return inited_; } - int init(rootserver::ObServerManager *server_mgr, - rootserver::ObUnitManager *unit_mgr, + int init(rootserver::ObUnitManager *unit_mgr, common::ObISQLClient *sql_client, share::schema::ObMultiVersionSchemaService *schema_service); @@ -224,7 +221,6 @@ private: const uint64_t *trace_id_; common::ObRandom rand_; // for random pick server - rootserver::ObServerManager *server_mgr_; rootserver::ObUnitManager *unit_mgr_; share::schema::ObMultiVersionSchemaService *schema_service_; // for got all tenant info obrpc::ObDBMSSchedJobRpcProxy *job_rpc_proxy_; diff --git a/src/observer/ob_heartbeat.cpp b/src/observer/ob_heartbeat.cpp index 4ba5314cd..eb334d325 100644 --- a/src/observer/ob_heartbeat.cpp +++ b/src/observer/ob_heartbeat.cpp @@ -27,6 +27,7 @@ #include "observer/ob_server_schema_updater.h" #include "observer/ob_server.h" #include "observer/omt/ob_tenant_config_mgr.h" +#include "observer/ob_heartbeat_handler.h" #include "common/ob_timeout_ctx.h" #include "storage/slog/ob_storage_logger_manager.h" @@ -84,40 +85,22 @@ int ObHeartBeatProcess::init() int ObHeartBeatProcess::init_lease_request(ObLeaseRequest &lease_request) { int ret = OB_SUCCESS; - omt::ObTenantNodeBalancer::ServerResource svr_res_assigned; common::ObArray > max_stored_versions; - int64_t clog_free_size_byte = 0; - int64_t clog_total_size_byte = 0; - logservice::ObServerLogBlockMgr *log_block_mgr = GCTX.log_block_mgr_; - - if (!inited_ || OB_ISNULL(log_block_mgr)) { + if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; - LOG_WARN("not init or log_block_mgr is null", KR(ret), K(inited_), K(GCTX.log_block_mgr_)); - } else if (OB_FAIL(omt::ObTenantNodeBalancer::get_instance().get_server_allocated_resource(svr_res_assigned))) { - LOG_WARN("fail to get server allocated resource", KR(ret)); - } else if 
(OB_FAIL(log_block_mgr->get_disk_usage(clog_free_size_byte, clog_total_size_byte))) { - LOG_WARN("Failed to get clog stat ", KR(ret)); + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (OB_ISNULL(GCTX.ob_service_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.ob_service_ is null", KR(ret), KP(GCTX.ob_service_)); + } else if (OB_FAIL((GCTX.ob_service_->get_server_resource_info(lease_request.resource_info_)))) { + LOG_WARN("fail to get server resource info", KR(ret)); } else { - int64_t reserved_size = 4 * 1024 * 1024 * 1024L; // default RESERVED_DISK_SIZE -> 4G - (void) SLOGGERMGR.get_reserved_size(reserved_size); lease_request.request_lease_time_ = 0; // this is not a valid member lease_request.version_ = ObLeaseRequest::LEASE_VERSION; lease_request.zone_ = gctx_.config_->zone.str(); lease_request.server_ = gctx_.self_addr(); lease_request.sql_port_ = gctx_.config_->mysql_port; - lease_request.resource_info_.cpu_ = get_cpu_count(); - lease_request.resource_info_.report_cpu_assigned_ = svr_res_assigned.min_cpu_; - lease_request.resource_info_.report_cpu_max_assigned_ = svr_res_assigned.max_cpu_; - lease_request.resource_info_.report_mem_assigned_ = svr_res_assigned.memory_size_; - lease_request.resource_info_.mem_in_use_ = 0; - lease_request.resource_info_.mem_total_ = GMEMCONF.get_server_memory_avail(); - lease_request.resource_info_.disk_total_ - = OB_SERVER_BLOCK_MGR.get_max_macro_block_count(reserved_size) * OB_SERVER_BLOCK_MGR.get_macro_block_size(); - lease_request.resource_info_.disk_in_use_ - = OB_SERVER_BLOCK_MGR.get_used_macro_block_count() * OB_SERVER_BLOCK_MGR.get_macro_block_size(); - lease_request.resource_info_.log_disk_total_ = clog_total_size_byte; - lease_request.resource_info_.report_log_disk_assigned_ = svr_res_assigned.log_disk_size_; get_package_and_svn(lease_request.build_version_, sizeof(lease_request.build_version_)); OTC_MGR.get_lease_request(lease_request); lease_request.start_service_time_ = gctx_.start_service_time_; @@ -174,8 
+157,8 @@ int ObHeartBeatProcess::do_heartbeat_event(const ObLeaseResponse &lease_response if (OB_INVALID_ID != lease_response.server_id_) { if (GCTX.server_id_ != lease_response.server_id_) { LOG_INFO("receive new server id", - "old_id", GCTX.server_id_, - "new_id", lease_response.server_id_); + "old_id", GCTX.server_id_, + "new_id", lease_response.server_id_); GCTX.server_id_ = lease_response.server_id_; GCONF.server_id = lease_response.server_id_; const int64_t delay = 0; @@ -189,13 +172,16 @@ int ObHeartBeatProcess::do_heartbeat_event(const ObLeaseResponse &lease_response } } - // update server status if needed - if (RSS_INVALID != lease_response.rs_server_status_) { - if (GCTX.rs_server_status_ != lease_response.rs_server_status_) { - LOG_INFO("receive new server status recorded in rs", - "old_status", GCTX.rs_server_status_, - "new_status", lease_response.rs_server_status_); - GCTX.rs_server_status_ = lease_response.rs_server_status_; + if (!ObHeartbeatHandler::is_rs_epoch_id_valid()) { + ///// if the new heartbeat service has not started, this heartbeat is responsible for + //// update server_id_ and rs_server_status_ + if (RSS_INVALID != lease_response.rs_server_status_) { + if (GCTX.rs_server_status_ != lease_response.rs_server_status_) { + LOG_INFO("receive new server status recorded in rs", + "old_status", GCTX.rs_server_status_, + "new_status", lease_response.rs_server_status_); + GCTX.rs_server_status_ = lease_response.rs_server_status_; + } } } // even try reload schema failed, we should continue do following things @@ -203,10 +189,10 @@ int ObHeartBeatProcess::do_heartbeat_event(const ObLeaseResponse &lease_response if (OB_SUCCESS != schema_ret) { LOG_WARN("try reload schema failed", "schema_version", lease_response.schema_version_, - "refresh_schema_info", lease_response.refresh_schema_info_, K(schema_ret)); + "refresh_schema_info", lease_response.refresh_schema_info_, K(schema_ret)); } else { LOG_INFO("try reload schema success", "schema_version", 
lease_response.schema_version_, - "refresh_schema_info", lease_response.refresh_schema_info_, K(schema_ret)); + "refresh_schema_info", lease_response.refresh_schema_info_, K(schema_ret)); } const int64_t delay = 0; diff --git a/src/observer/ob_heartbeat_handler.cpp b/src/observer/ob_heartbeat_handler.cpp new file mode 100644 index 000000000..3efe8ad5f --- /dev/null +++ b/src/observer/ob_heartbeat_handler.cpp @@ -0,0 +1,193 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ +#define USING_LOG_PREFIX SERVER +#include "observer/ob_heartbeat_handler.h" + +#include "observer/ob_server.h" +#include "share/ob_version.h" +#include "observer/ob_service.h" + +namespace oceanbase +{ +namespace observer +{ +static const char *OB_DATA_DISK_STATUS_STR[] = {"INVALID", "NORMAL", "ERROR"}; +OB_SERIALIZE_MEMBER( + ObServerHealthStatus, + data_disk_status_ +) +ObServerHealthStatus::ObServerHealthStatus() + : data_disk_status_(ObDataDiskStatus::DATA_DISK_STATUS_INVALID) +{ +} +ObServerHealthStatus::~ObServerHealthStatus() +{ +} +int ObServerHealthStatus::init(ObDataDiskStatus data_disk_status) +{ + int ret = OB_SUCCESS; + if (data_disk_status <= DATA_DISK_STATUS_INVALID || data_disk_status >= DATA_DISK_STATUS_MAX) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(data_disk_status)); + } else { + data_disk_status_ = data_disk_status; + } + return ret; +} +int ObServerHealthStatus::assign(const ObServerHealthStatus server_health_status) +{ + int ret = OB_SUCCESS; + data_disk_status_ = 
server_health_status.data_disk_status_; + return ret; +} +void ObServerHealthStatus::reset() +{ + data_disk_status_ = ObDataDiskStatus::DATA_DISK_STATUS_INVALID; +} +bool ObServerHealthStatus::is_valid() const +{ + return data_disk_status_ > ObDataDiskStatus::DATA_DISK_STATUS_INVALID + && data_disk_status_ < ObDataDiskStatus::DATA_DISK_STATUS_MAX; +} +bool ObServerHealthStatus::is_healthy() const +{ + return ObDataDiskStatus::DATA_DISK_STATUS_NORMAL == data_disk_status_; +} +const char *ObServerHealthStatus::data_disk_status_to_str(const ObDataDiskStatus data_disk_status) +{ + STATIC_ASSERT(ARRAYSIZEOF(OB_DATA_DISK_STATUS_STR) == DATA_DISK_STATUS_MAX, "array size mismatch"); + const char *str = "UNKNOWN"; + if (OB_UNLIKELY(data_disk_status >= ARRAYSIZEOF(OB_DATA_DISK_STATUS_STR) + || data_disk_status < DATA_DISK_STATUS_INVALID)) { + LOG_ERROR_RET(OB_ERR_UNEXPECTED, "fatal error, unknown data disk status", K(data_disk_status)); + } else { + str = OB_DATA_DISK_STATUS_STR[data_disk_status]; + } + return str; +} +ObHeartbeatHandler::ObHeartbeatHandler() +{ +} +ObHeartbeatHandler::~ObHeartbeatHandler() +{ +} +int64_t ObHeartbeatHandler::rs_epoch_id_ = palf::INVALID_PROPOSAL_ID; +bool ObHeartbeatHandler::is_rs_epoch_id_valid() +{ + return palf::INVALID_PROPOSAL_ID != ATOMIC_LOAD(&rs_epoch_id_); +} +int ObHeartbeatHandler::handle_heartbeat( + const share::ObHBRequest &hb_request, + share::ObHBResponse &hb_response) +{ + int ret = OB_SUCCESS; + hb_response.reset(); + int64_t rs_epoch_id = ATOMIC_LOAD(&rs_epoch_id_); + if (OB_UNLIKELY(!hb_request.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("receive an invalid heartbeat request", KR(ret), K(hb_request)); + } else if (OB_ISNULL(GCTX.rs_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("rs manager is null", KR(ret), KP(GCTX.rs_mgr_)); + } else { + const int64_t epoch_id = hb_request.get_epoch_id(); + if (rs_epoch_id < epoch_id || palf::INVALID_PROPOSAL_ID == rs_epoch_id) { + LOG_INFO("receive new rs epoch", "old 
rs_epoch_id", rs_epoch_id, "new rs_epoch_id", epoch_id); + int64_t current_epoch_id = ATOMIC_CAS(&rs_epoch_id_, rs_epoch_id, epoch_id); + if (rs_epoch_id != current_epoch_id) { + ret = OB_NEED_RETRY; + LOG_WARN("set rs_epoch_id_failed", KR(ret), K(rs_epoch_id), K(epoch_id), K(current_epoch_id)); + } + } else if (rs_epoch_id > epoch_id) { + ret = OB_RS_NOT_MASTER; + LOG_WARN("this rs is not the newest leader", KR(ret), K(rs_epoch_id), K(epoch_id)); + } + } + if (FAILEDx(GCTX.rs_mgr_->force_set_master_rs(hb_request.get_rs_addr()))) { + LOG_WARN("fail to set master rs", KR(ret), K(hb_request.get_rs_addr())); + } else if (OB_FAIL(init_hb_response_(hb_response))) { + LOG_WARN("fail to init hb response", KR(ret)); + } else { + // const uint64_t server_id = hb_request.get_server_id(); + const share::RSServerStatus rs_server_status = hb_request.get_rs_server_status(); + // if (GCTX.server_id_ != server_id) { + // LOG_INFO("receive new server id", "old server_id_", GCTX.server_id_, "new server_id_", server_id); + // GCTX.server_id_ = server_id; + // } + if (GCTX.rs_server_status_ != rs_server_status) { + LOG_INFO("receive new server status recorded in rs", + "old_status", GCTX.rs_server_status_, + "new_status", rs_server_status); + GCTX.rs_server_status_ = rs_server_status; + } + } + return ret; +} +int ObHeartbeatHandler::check_disk_status_(ObServerHealthStatus &server_health_status) +{ + int ret = OB_SUCCESS; + int tmp_ret = OB_SUCCESS; + ObDeviceHealthStatus dhs = DEVICE_HEALTH_NORMAL; + int64_t abnormal_time = 0; + server_health_status.reset(); + if (OB_TMP_FAIL(ObIOManager::get_instance().get_device_health_status(dhs, abnormal_time))) { + LOG_WARN("fail to get device health status", KR(ret), KR(tmp_ret)); + } else if (OB_UNLIKELY(DEVICE_HEALTH_ERROR == dhs)) { + const int64_t PRINT_LOG_INTERVAL_IN_US = 60 * 1000 * 1000; // 1min + if (REACH_TIME_INTERVAL(PRINT_LOG_INTERVAL_IN_US)) { + LOG_WARN("error occurs on data disk, ", + "data_disk_health_status", 
device_health_status_to_str(dhs), K(abnormal_time)); + } + } + const bool is_data_disk_error = (DEVICE_HEALTH_ERROR == dhs); + if (is_data_disk_error) { + server_health_status.init(ObServerHealthStatus::DATA_DISK_STATUS_ERROR); + } else { + server_health_status.init(ObServerHealthStatus::DATA_DISK_STATUS_NORMAL); + } + return ret; +} +ERRSIM_POINT_DEF(ERRSIM_DISK_ERROR); +int ObHeartbeatHandler::init_hb_response_(share::ObHBResponse &hb_response) +{ + int ret = OB_SUCCESS; + ObServerHealthStatus server_health_status; + if (OB_FAIL(check_disk_status_(server_health_status))) { + LOG_WARN("fail to check disk status", KR(ret)); + } else { + int64_t sql_port = GCONF.mysql_port; + share::ObServerInfoInTable::ObBuildVersion build_version; + common::ObZone zone; + int64_t test_id = ERRSIM_DISK_ERROR ? 2 : OB_INVALID_ID; + if (test_id == GCTX.server_id_) { + server_health_status.reset(); + server_health_status.init(ObServerHealthStatus::DATA_DISK_STATUS_ERROR); + } + if (OB_FAIL(zone.assign(GCONF.zone.str()))) { + LOG_WARN("fail to assign zone", KR(ret), K(GCONF.zone.str())); + } else if (OB_FAIL(ObService::get_build_version(build_version))) { + LOG_WARN("fail to get build_version", KR(ret), K(build_version)); + } else if (OB_FAIL(hb_response.init( + zone, + GCTX.self_addr(), + sql_port, + build_version, + GCTX.start_service_time_, + server_health_status))) { + LOG_WARN("fail to init the heartbeat response", KR(ret), K(zone), K(GCTX.self_addr()), + K(sql_port), K(build_version), K(GCTX.start_service_time_), K(server_health_status)); + } else {} + } + return ret; +} +} // observer +} // oceanbase \ No newline at end of file diff --git a/src/observer/ob_heartbeat_handler.h b/src/observer/ob_heartbeat_handler.h new file mode 100644 index 000000000..ffdd54a27 --- /dev/null +++ b/src/observer/ob_heartbeat_handler.h @@ -0,0 +1,75 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. 
+ * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ +#ifndef OCEANBASE_OBSERVER_OB_HEARTBEAT_HANDLER_H_ +#define OCEANBASE_OBSERVER_OB_HEARTBEAT_HANDLER_H_ +#include "lib/utility/ob_print_utils.h" +#include "lib/utility/ob_unify_serialize.h" +namespace oceanbase +{ +namespace share +{ + struct ObHBRequest; + struct ObHBResponse; +} +namespace observer +{ +// currently, server health status only covers data disk status. +struct ObServerHealthStatus +{ + OB_UNIS_VERSION(1); +public: + enum ObDataDiskStatus + { + DATA_DISK_STATUS_INVALID = 0, + DATA_DISK_STATUS_NORMAL = 1, + DATA_DISK_STATUS_ERROR = 2, + DATA_DISK_STATUS_MAX = 3 + }; + explicit ObServerHealthStatus(); + virtual ~ObServerHealthStatus(); + int init(ObDataDiskStatus data_disk_status); + int assign(const ObServerHealthStatus server_health_status); + void reset(); + bool is_valid() const; + bool is_healthy() const; + static const char *data_disk_status_to_str(const ObDataDiskStatus data_disk_status); + inline bool operator ==(const ObServerHealthStatus &other) const + { + return data_disk_status_ == other.data_disk_status_; + } + inline bool operator !=(const ObServerHealthStatus &other) const + { + return data_disk_status_ != other.data_disk_status_; + } + TO_STRING_KV(K(data_disk_status_), "data_disk_status", data_disk_status_to_str(data_disk_status_)); +private: + ObDataDiskStatus data_disk_status_; +}; +class ObHeartbeatHandler +{ +public: + explicit ObHeartbeatHandler(); + virtual ~ObHeartbeatHandler(); + static int handle_heartbeat( + const share::ObHBRequest &hb_request, + share::ObHBResponse &hb_response); + 
static bool is_rs_epoch_id_valid(); +private: + static int check_disk_status_(ObServerHealthStatus &server_health_status); + static int init_hb_response_(share::ObHBResponse &hb_response); + static int64_t rs_epoch_id_; +private: + DISALLOW_COPY_AND_ASSIGN(ObHeartbeatHandler); +}; +} // observer +} // oceanbase +#endif \ No newline at end of file diff --git a/src/observer/ob_rpc_processor_simple.cpp b/src/observer/ob_rpc_processor_simple.cpp index 2415ab4fc..9c29a50ac 100644 --- a/src/observer/ob_rpc_processor_simple.cpp +++ b/src/observer/ob_rpc_processor_simple.cpp @@ -811,6 +811,18 @@ int ObRpcIsEmptyServerP::process() return ret; } +int ObRpcCheckServerForAddingServerP::process() +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(gctx_.ob_service_)) { + ret = OB_INVALID_ARGUMENT; + LOG_ERROR("invalid argument", KR(ret), KP(gctx_.ob_service_)); + } else if (OB_FAIL(gctx_.ob_service_->check_server_for_adding_server(arg_, result_))) { + LOG_WARN("fail to call check_server_for_adding_server", KR(ret), K(arg_)); + } else {} + return ret; +} + int ObRpcCheckDeploymentModeP::process() { int ret = OB_SUCCESS; @@ -2334,9 +2346,33 @@ int ObSyncRewriteRulesP::process() } else if (OB_FAIL(rule_mgr->sync_rule_from_inner_table())) { LOG_WARN("failed to sync rewrite rules from inner table", K(ret)); } + } + return ret; +} + +int ObRpcSendHeartbeatP::process() +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(gctx_.ob_service_)) { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("invalid argument", KR(ret), KP(gctx_.ob_service_)); + } else if (OB_FAIL(gctx_.ob_service_->handle_heartbeat(arg_, result_))) { + LOG_WARN("fail to call handle_heartbeat in ob service", KR(ret), K(arg_)); } return ret; } +int ObRpcGetServerResourceInfoP::process() +{ + int ret = OB_SUCCESS; + if (OB_ISNULL(gctx_.ob_service_)) { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("invalid argument", KR(ret), KP(gctx_.ob_service_)); + } else if (OB_FAIL(gctx_.ob_service_->get_server_resource_info(arg_, result_))) { + LOG_WARN("fail to call 
get_server_resource_info in ob service", KR(ret), K(arg_)); + } else {} + return ret; +} + } // end of namespace observer } // end of namespace oceanbase diff --git a/src/observer/ob_rpc_processor_simple.h b/src/observer/ob_rpc_processor_simple.h index 5d6a3c14d..3b48686f2 100644 --- a/src/observer/ob_rpc_processor_simple.h +++ b/src/observer/ob_rpc_processor_simple.h @@ -132,6 +132,7 @@ OB_DEFINE_PROCESSOR_S(Srv, OB_REFRESH_MEMORY_STAT, ObRpcRefreshMemStatP); OB_DEFINE_PROCESSOR_S(Srv, OB_WASH_MEMORY_FRAGMENTATION, ObRpcWashMemFragmentationP); OB_DEFINE_PROCESSOR_S(Srv, OB_BOOTSTRAP, ObRpcBootstrapP); OB_DEFINE_PROCESSOR_S(Srv, OB_IS_EMPTY_SERVER, ObRpcIsEmptyServerP); +OB_DEFINE_PROCESSOR_S(Srv, OB_CHECK_SERVER_FOR_ADDING_SERVER, ObRpcCheckServerForAddingServerP); OB_DEFINE_PROCESSOR_S(Srv, OB_CHECK_DEPLOYMENT_MODE, ObRpcCheckDeploymentModeP); OB_DEFINE_PROCESSOR_S(Srv, OB_REFRESH_SYNC_VALUE, ObRpcSyncAutoincValueP); OB_DEFINE_PROCESSOR_S(Srv, OB_CLEAR_AUTOINC_CACHE, ObRpcClearAutoincCacheP); @@ -217,6 +218,8 @@ OB_DEFINE_PROCESSOR_S(Srv, OB_ESTIMATE_TABLET_BLOCK_COUNT, ObEstimateTabletBlock OB_DEFINE_PROCESSOR_S(Srv, OB_DDL_CHECK_TABLET_MERGE_STATUS, ObRpcDDLCheckTabletMergeStatusP); OB_DEFINE_PROCESSOR_S(Srv, OB_REFRESH_TENANT_INFO, ObRefreshTenantInfoP); OB_DEFINE_PROCESSOR_S(Srv, OB_SYNC_REWRITE_RULES, ObSyncRewriteRulesP); +OB_DEFINE_PROCESSOR_S(Srv, OB_SEND_HEARTBEAT, ObRpcSendHeartbeatP); +OB_DEFINE_PROCESSOR_S(Srv, OB_GET_SERVER_RESOURCE_INFO, ObRpcGetServerResourceInfoP); OB_DEFINE_PROCESSOR_S(Srv, OB_UPDATE_TENANT_INFO_CACHE, ObUpdateTenantInfoCacheP); } // end of namespace observer } // end of namespace oceanbase diff --git a/src/observer/ob_service.cpp b/src/observer/ob_service.cpp index af122e286..25e05ba76 100644 --- a/src/observer/ob_service.cpp +++ b/src/observer/ob_service.cpp @@ -73,6 +73,8 @@ #include "storage/compaction/ob_tenant_tablet_scheduler.h" #include "share/ob_cluster_event_history_table_operator.h"//CLUSTER_EVENT_INSTANCE #include 
"storage/ddl/ob_tablet_ddl_kv_mgr.h" +#include "observer/ob_heartbeat_handler.h" +#include "storage/slog/ob_storage_logger_manager.h" namespace oceanbase { @@ -1484,8 +1486,124 @@ int ObService::is_empty_server(const obrpc::ObCheckServerEmptyArg &arg, obrpc::B } return ret; } +int ObService::check_server_for_adding_server( + const obrpc::ObCheckServerForAddingServerArg &arg, + obrpc::ObCheckServerForAddingServerResult &result) +{ + int ret = OB_SUCCESS; + uint64_t sys_tenant_data_version = 0; + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (OB_FAIL(GET_MIN_DATA_VERSION(OB_SYS_TENANT_ID, sys_tenant_data_version))) { + LOG_WARN("fail to get sys tenant data version", KR(ret)); + } else if (arg.get_sys_tenant_data_version() > 0 + && sys_tenant_data_version > arg.get_sys_tenant_data_version()) { + ret = OB_NOT_SUPPORTED; + LOG_WARN("adding server with larger sys tenant data version is not supported", + KR(ret), K(arg), K(sys_tenant_data_version), K(arg.get_sys_tenant_data_version())); + } else { + bool server_empty = false; + ObCheckServerEmptyArg check_server_empty_arg; + check_server_empty_arg.mode_ = ObCheckServerEmptyArg::ADD_SERVER; + const bool wait_log_scan = ObCheckServerEmptyArg::BOOTSTRAP == check_server_empty_arg.mode_; + if (OB_FAIL(check_server_empty(check_server_empty_arg, wait_log_scan, server_empty))) { + LOG_WARN("check_server_empty failed", KR(ret), K(check_server_empty_arg), K(wait_log_scan)); + } else { + char build_version[common::OB_SERVER_VERSION_LENGTH] = {0}; + ObServerInfoInTable::ObBuildVersion build_version_string; + ObZone zone; + int64_t sql_port = GCONF.mysql_port; + get_package_and_svn(build_version, sizeof(build_version)); + if (OB_FAIL(zone.assign(GCONF.zone.str()))) { + LOG_WARN("fail to assign zone", KR(ret), K(GCONF.zone.str())); + } else if (OB_FAIL(build_version_string.assign(build_version))) { + LOG_WARN("fail to assign build version", KR(ret), K(build_version)); + } 
else if (OB_FAIL(result.init( + server_empty, + zone, + sql_port, + build_version_string))) { + LOG_WARN("fail to init result", KR(ret), K(server_empty), K(zone), K(sql_port), + K(build_version_string)); + } else {} + } + } + LOG_INFO("generate result", KR(ret), K(arg), K(result)); + return ret; +} +int ObService::get_server_resource_info( + const obrpc::ObGetServerResourceInfoArg &arg, + obrpc::ObGetServerResourceInfoResult &result) +{ + int ret = OB_SUCCESS; + const ObAddr &my_addr = GCONF.self_addr_; + share::ObServerResourceInfo resource_info; + result.reset(); + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (OB_UNLIKELY(!arg.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(arg)); + } else if (OB_FAIL(get_server_resource_info(resource_info))) { + LOG_WARN("fail to get server resource info", KR(ret)); + } else if (OB_FAIL(result.init(my_addr, resource_info))) { + LOG_WARN("fail to init result", KR(ret), K(my_addr), K(resource_info)); + } + FLOG_INFO("get server resource info", KR(ret), K(arg), K(result)); + return ret; +} +int ObService::get_server_resource_info(share::ObServerResourceInfo &resource_info) +{ + int ret = OB_SUCCESS; + omt::ObTenantNodeBalancer::ServerResource svr_res_assigned; + int64_t clog_free_size_byte = 0; + int64_t clog_total_size_byte = 0; + logservice::ObServerLogBlockMgr *log_block_mgr = GCTX.log_block_mgr_; + resource_info.reset(); + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (OB_ISNULL(log_block_mgr)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("log_block_mgr is null", KR(ret), K(GCTX.log_block_mgr_)); + } else if (OB_FAIL(omt::ObTenantNodeBalancer::get_instance().get_server_allocated_resource(svr_res_assigned))) { + LOG_WARN("fail to get server allocated resource", KR(ret)); + } else if (OB_FAIL(log_block_mgr->get_disk_usage(clog_free_size_byte, clog_total_size_byte))) { + 
LOG_WARN("Failed to get clog stat ", KR(ret)); + } else { + int64_t reserved_size = 4 * 1024 * 1024 * 1024L; // default RESERVED_DISK_SIZE -> 4G + (void) SLOGGERMGR.get_reserved_size(reserved_size); + resource_info.cpu_ = get_cpu_count(); + resource_info.report_cpu_assigned_ = svr_res_assigned.min_cpu_; + resource_info.report_cpu_max_assigned_ = svr_res_assigned.max_cpu_; + resource_info.report_mem_assigned_ = svr_res_assigned.memory_size_; + resource_info.mem_in_use_ = 0; + resource_info.mem_total_ = GMEMCONF.get_server_memory_avail(); + resource_info.disk_total_ + = OB_SERVER_BLOCK_MGR.get_max_macro_block_count(reserved_size) * OB_SERVER_BLOCK_MGR.get_macro_block_size(); + resource_info.disk_in_use_ + = OB_SERVER_BLOCK_MGR.get_used_macro_block_count() * OB_SERVER_BLOCK_MGR.get_macro_block_size(); + resource_info.log_disk_total_ = clog_total_size_byte; + resource_info.report_log_disk_assigned_ = svr_res_assigned.log_disk_size_; + + } + return ret; +} +int ObService::get_build_version(share::ObServerInfoInTable::ObBuildVersion &build_version) +{ + int ret = OB_SUCCESS; + char build_version_char_array[common::OB_SERVER_VERSION_LENGTH] = {0}; + build_version.reset(); + get_package_and_svn(build_version_char_array, sizeof(build_version)); + if (OB_FAIL(build_version.assign(build_version_char_array))) { + LOG_WARN("fail to assign build_version", KR(ret), K(build_version_char_array)); + } + return ret; +} int ObService::get_partition_count(obrpc::ObGetPartitionCountResult &result) { UNUSEDx(result); @@ -1504,6 +1622,7 @@ int ObService::get_partition_count(obrpc::ObGetPartitionCountResult &result) int ObService::check_server_empty(const ObCheckServerEmptyArg &arg, const bool wait_log_scan, bool &is_empty) { + // **TODO (linqiucen.lqc): if rs_epoch has been already valid, this server is not empty int ret = OB_SUCCESS; is_empty = true; UNUSED(wait_log_scan); @@ -1735,18 +1854,6 @@ int ObService::sync_partition_table(const obrpc::Int64 &arg) return OB_NOT_SUPPORTED; } -int 
ObService::get_server_heartbeat_expire_time(int64_t &lease_expire_time) -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else { - lease_expire_time = lease_state_mgr_.get_heartbeat_expire_time(); - } - return ret; -} - int ObService::set_tracepoint(const obrpc::ObAdminSetTPArg &arg) { int ret = OB_SUCCESS; @@ -2680,6 +2787,23 @@ int ObService::init_tenant_config( return OB_SUCCESS; } +int ObService::handle_heartbeat( + const share::ObHBRequest &hb_request, + share::ObHBResponse &hb_response) +{ + int ret = OB_SUCCESS; + LOG_TRACE("receive a heartbeat request from heartbeat service", K(hb_request)); + const int64_t now = ::oceanbase::common::ObTimeUtility::current_time(); + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("ObService is not inited", KR(ret), K(inited_)); + } else if (OB_FAIL(ObHeartbeatHandler::handle_heartbeat(hb_request, hb_response))) { + LOG_WARN("fail to handle heartbeat", KR(ret), K(hb_request)); + } + const int64_t time_cost = ::oceanbase::common::ObTimeUtility::current_time() - now; + FLOG_INFO("handle_heartbeat", KR(ret), K(hb_request), K(hb_response), K(time_cost)); + return ret; +} int ObService::update_tenant_info_cache( const ObUpdateTenantInfoCacheArg &arg, ObUpdateTenantInfoCacheRes &result) diff --git a/src/observer/ob_service.h b/src/observer/ob_service.h index 63fdd0ad8..ea53c4bbd 100644 --- a/src/observer/ob_service.h +++ b/src/observer/ob_service.h @@ -181,6 +181,14 @@ public: //////////////////////////////////////////////////////////////// // ObRpcBootstrapP @RS bootstrap int bootstrap(const obrpc::ObBootstrapArg &arg); + // ObRpcCheckServerForAddingServerP @RS add server + int check_server_for_adding_server( + const obrpc::ObCheckServerForAddingServerArg &arg, + obrpc::ObCheckServerForAddingServerResult &result); + // ObRpcGetServerStatusP @RS + int get_server_resource_info(const obrpc::ObGetServerResourceInfoArg &arg, 
obrpc::ObGetServerResourceInfoResult &result); + int get_server_resource_info(share::ObServerResourceInfo &resource_info); + static int get_build_version(share::ObServerInfoInTable::ObBuildVersion &build_version); // ObRpcIsEmptyServerP @RS bootstrap int is_empty_server(const obrpc::ObCheckServerEmptyArg &arg, obrpc::Bool &is_empty); // ObRpcCheckDeploymentModeP @@ -209,8 +217,6 @@ public: int sync_partition_table(const obrpc::Int64 &arg); // ObRpcSetTPP @RS::admin to set tracepoint int set_tracepoint(const obrpc::ObAdminSetTPArg &arg); - // for ObPartitionService::check_mc_allowed_by_server_lease - int get_server_heartbeat_expire_time(int64_t &lease_expire_time); int cancel_sys_task(const share::ObTaskId &task_id); int refresh_memory_stat(); int wash_memory_fragmentation(); @@ -231,6 +237,9 @@ public: int init_tenant_config( const obrpc::ObInitTenantConfigArg &arg, obrpc::ObInitTenantConfigRes &result); + int handle_heartbeat( + const share::ObHBRequest &hb_request, + share::ObHBResponse &hb_response); private: int get_role_from_palf_( logservice::ObLogService &log_service, diff --git a/src/observer/ob_srv_xlator_primary.cpp b/src/observer/ob_srv_xlator_primary.cpp index d2b93f227..0134d56e4 100644 --- a/src/observer/ob_srv_xlator_primary.cpp +++ b/src/observer/ob_srv_xlator_primary.cpp @@ -72,6 +72,7 @@ void oceanbase::observer::init_srv_xlator_for_sys(ObSrvRpcXlator *xlator) { RPC_PROCESSOR(ObInitTenantConfigP, gctx_); RPC_PROCESSOR(ObGetLeaderLocationsP, gctx_); RPC_PROCESSOR(ObBatchBroadcastSchemaP, gctx_); + RPC_PROCESSOR(ObRpcSendHeartbeatP, gctx_); RPC_PROCESSOR(ObRpcNotifySwitchLeaderP, gctx_); // interrupt @@ -109,6 +110,8 @@ void oceanbase::observer::init_srv_xlator_for_sys(ObSrvRpcXlator *xlator) { //dbms_scheduler RPC_PROCESSOR(ObRpcRunDBMSSchedJobP, gctx_); + + RPC_PROCESSOR(ObRpcGetServerResourceInfoP, gctx_); } void oceanbase::observer::init_srv_xlator_for_schema_test(ObSrvRpcXlator *xlator) { diff --git a/src/observer/ob_srv_xlator_rootserver.cpp 
b/src/observer/ob_srv_xlator_rootserver.cpp index 2895a9ad5..5eb1af9c7 100644 --- a/src/observer/ob_srv_xlator_rootserver.cpp +++ b/src/observer/ob_srv_xlator_rootserver.cpp @@ -60,11 +60,11 @@ void oceanbase::observer::init_srv_xlator_for_rootserver(ObSrvRpcXlator *xlator) RPC_PROCESSOR(rootserver::ObRpcReportSysLSP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObRpcRemoveSysLSP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObRpcFetchLocationP, *gctx_.root_service_); - RPC_PROCESSOR(rootserver::ObRpcMergeFinishP, *gctx_.root_service_); + // RPC_PROCESSOR(rootserver::ObRpcMergeFinishP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObBroadcastDSActionP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObRpcFetchAliveServerP, *gctx_.root_service_); - RPC_PROCESSOR(rootserver::ObRpcFetchActiveServerStatusP, *gctx_.root_service_); + // RPC_PROCESSOR(rootserver::ObRpcFetchActiveServerStatusP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObRpcRefreshTimeZoneInfoP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObRpcRequestTimeZoneInfoP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObCheckDanglingReplicaFinishP, *gctx_.root_service_); @@ -238,7 +238,6 @@ void oceanbase::observer::init_srv_xlator_for_rootserver(ObSrvRpcXlator *xlator) //for upgrade RPC_PROCESSOR(ObGetTenantSchemaVersionP, gctx_); - RPC_PROCESSOR(rootserver::ObCheckMergeFinishP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObRpcFlashBackTableToScnP, *gctx_.root_service_); RPC_PROCESSOR(rootserver::ObRpcCreateRestorePointP, *gctx_.root_service_); diff --git a/src/observer/ob_srv_xlator_storage.cpp b/src/observer/ob_srv_xlator_storage.cpp index 9b62d2672..aae50b5cb 100644 --- a/src/observer/ob_srv_xlator_storage.cpp +++ b/src/observer/ob_srv_xlator_storage.cpp @@ -68,6 +68,7 @@ void oceanbase::observer::init_srv_xlator_for_storage(ObSrvRpcXlator *xlator) { RPC_PROCESSOR(ObRpcWashMemFragmentationP, gctx_); RPC_PROCESSOR(ObRpcBootstrapP, gctx_); RPC_PROCESSOR(ObRpcIsEmptyServerP, 
gctx_); + RPC_PROCESSOR(ObRpcCheckServerForAddingServerP, gctx_); RPC_PROCESSOR(ObRpcCheckDeploymentModeP, gctx_); RPC_PROCESSOR(ObRpcSyncAutoincValueP, gctx_); RPC_PROCESSOR(ObRpcClearAutoincCacheP, gctx_); diff --git a/src/observer/omt/ob_multi_tenant.cpp b/src/observer/omt/ob_multi_tenant.cpp index 0ccd79fc6..093d7dcda 100644 --- a/src/observer/omt/ob_multi_tenant.cpp +++ b/src/observer/omt/ob_multi_tenant.cpp @@ -102,6 +102,7 @@ #include "observer/table_load/ob_table_load_service.h" #include "sql/plan_cache/ob_plan_cache.h" #include "sql/plan_cache/ob_ps_cache.h" +#include "rootserver/ob_heartbeat_service.h" using namespace oceanbase; using namespace oceanbase::lib; @@ -452,6 +453,7 @@ int ObMultiTenant::init(ObAddr myaddr, // MTL_BIND2(nullptr, nullptr, start_sql_nio_server, mtl_stop_default, // mtl_wait_default, mtl_destroy_default); } + MTL_BIND2(mtl_new_default, rootserver::ObHeartbeatService::mtl_init, nullptr, rootserver::ObHeartbeatService::mtl_stop, rootserver::ObHeartbeatService::mtl_wait, mtl_destroy_default); } if (OB_SUCC(ret)) { diff --git a/src/rootserver/CMakeLists.txt b/src/rootserver/CMakeLists.txt index 2d7fc13db..b3183ec5c 100644 --- a/src/rootserver/CMakeLists.txt +++ b/src/rootserver/CMakeLists.txt @@ -3,6 +3,7 @@ ob_set_subtarget(ob_rootserver ALONE ob_root_service.cpp ddl_task/ob_ddl_redefinition_task.cpp ob_unit_manager.cpp + ob_heartbeat_service.cpp ) ob_set_subtarget(ob_rootserver backup @@ -54,6 +55,7 @@ ob_set_subtarget(ob_rootserver common ob_empty_server_checker.cpp ob_lost_replica_checker.cpp ob_server_manager.cpp + ob_server_zone_op_service.cpp ob_snapshot_info_manager.cpp ob_tablet_creator.cpp ob_tablet_drop.cpp diff --git a/src/rootserver/backup/ob_archive_scheduler_service.cpp b/src/rootserver/backup/ob_archive_scheduler_service.cpp index 56b470cfa..f1386f3a8 100644 --- a/src/rootserver/backup/ob_archive_scheduler_service.cpp +++ b/src/rootserver/backup/ob_archive_scheduler_service.cpp @@ -13,7 +13,6 @@ #define 
USING_LOG_PREFIX ARCHIVE #include "rootserver/backup/ob_archive_scheduler_service.h" #include "rootserver/backup/ob_tenant_archive_scheduler.h" -#include "rootserver/ob_server_manager.h" #include "rootserver/ob_rs_event_history_table_operator.h" #include "rootserver/ob_unit_manager.h" #include "storage/tx/ob_ts_mgr.h" @@ -73,14 +72,13 @@ void ObArchiveThreadIdling::set_checkpoint_interval(const int64_t interval_us) */ ObArchiveSchedulerService::ObArchiveSchedulerService() : is_inited_(false), is_working_(false), idling_(stop_), - server_mgr_(nullptr), zone_mgr_(nullptr), unit_mgr_(nullptr), + zone_mgr_(nullptr), unit_mgr_(nullptr), rpc_proxy_(nullptr), sql_proxy_(nullptr), schema_service_(nullptr), backup_lease_service_(nullptr) { } int ObArchiveSchedulerService::init( - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObUnitManager &unit_manager, share::schema::ObMultiVersionSchemaService *schema_service, @@ -100,7 +98,6 @@ int ObArchiveSchedulerService::init( } else if (OB_FAIL(create(thread_cnt, "LOG_ARCHIVE_SERVICE"))) { LOG_WARN("failed to create log archive thread", K(ret)); } else { - server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; unit_mgr_ = &unit_manager; schema_service_ = schema_service; @@ -311,7 +308,7 @@ int ObArchiveSchedulerService::start_tenant_archive_(const uint64_t tenant_id) ObArchiveHandler archive_handler; // Only one dest is supported. 
const int64_t dest_no = 0; - if (OB_FAIL(archive_handler.init(tenant_id, *server_mgr_, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { + if (OB_FAIL(archive_handler.init(tenant_id, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { LOG_WARN("failed to init archive_handler", K(ret)); } else if (OB_FAIL(archive_handler.enable_archive(dest_no))) { LOG_WARN("failed to enable archive tenant", K(ret), K(tenant_id), K(dest_no)); @@ -328,7 +325,7 @@ int ObArchiveSchedulerService::stop_tenant_archive_(const uint64_t tenant_id) ObArchiveHandler archive_handler; // Only one dest is supported. const int64_t dest_no = 0; - if (OB_FAIL(archive_handler.init(tenant_id, *server_mgr_, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { + if (OB_FAIL(archive_handler.init(tenant_id, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { LOG_WARN("failed to init archive_handler", K(ret), K(tenant_id)); } else if (OB_FAIL(archive_handler.disable_archive(dest_no))) { LOG_WARN("failed to disable tenant archive", K(ret), K(tenant_id), K(dest_no)); @@ -375,7 +372,7 @@ int ObArchiveSchedulerService::inner_process_(const uint64_t tenant_id) bool no_round = false; ObArchiveHandler tenant_scheduler; - if (OB_FAIL(tenant_scheduler.init(tenant_id, *server_mgr_, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { + if (OB_FAIL(tenant_scheduler.init(tenant_id, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { LOG_WARN("failed to init tenant archive scheduler", K(ret), K(tenant_id)); } else if (OB_TMP_FAIL(tenant_scheduler.checkpoint())) { LOG_WARN("failed to checkpoint", K(tmp_ret), K(tenant_id)); @@ -526,7 +523,7 @@ int ObArchiveSchedulerService::open_tenant_archive_mode_(const uint64_t tenant_i { int ret = OB_SUCCESS; ObArchiveHandler tenant_scheduler; - if (OB_FAIL(tenant_scheduler.init(tenant_id, *server_mgr_, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) 
{ + if (OB_FAIL(tenant_scheduler.init(tenant_id, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { LOG_WARN("failed to init tenant archive scheduler", K(ret), K(tenant_id)); } else if (OB_FAIL(tenant_scheduler.open_archive_mode())) { LOG_WARN("failed to open archive mode", K(ret), K(tenant_id)); @@ -588,7 +585,7 @@ int ObArchiveSchedulerService::close_tenant_archive_mode_(const uint64_t tenant_ { int ret = OB_SUCCESS; ObArchiveHandler tenant_scheduler; - if (OB_FAIL(tenant_scheduler.init(tenant_id, *server_mgr_, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { + if (OB_FAIL(tenant_scheduler.init(tenant_id, *zone_mgr_, *unit_mgr_, schema_service_, *rpc_proxy_, *sql_proxy_))) { LOG_WARN("failed to init tenant archive scheduler", K(ret), K(tenant_id)); } else if (OB_FAIL(tenant_scheduler.close_archive_mode())) { LOG_WARN("failed to close archive mode", K(ret), K(tenant_id)); diff --git a/src/rootserver/backup/ob_archive_scheduler_service.h b/src/rootserver/backup/ob_archive_scheduler_service.h index 516869b5d..a7e2c53c4 100644 --- a/src/rootserver/backup/ob_archive_scheduler_service.h +++ b/src/rootserver/backup/ob_archive_scheduler_service.h @@ -38,7 +38,6 @@ namespace share { namespace rootserver { -class ObServerManager; class ObZoneManager; class ObUnitManager; @@ -66,7 +65,6 @@ public: ~ObArchiveSchedulerService() {} int init( - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObUnitManager &unit_manager, share::schema::ObMultiVersionSchemaService *schema_service, @@ -129,7 +127,6 @@ private: bool is_inited_; bool is_working_; mutable ObArchiveThreadIdling idling_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; ObUnitManager *unit_mgr_; obrpc::ObSrvRpcProxy *rpc_proxy_; diff --git a/src/rootserver/backup/ob_backup_clean_scheduler.cpp b/src/rootserver/backup/ob_backup_clean_scheduler.cpp index e78c9ba45..4109be3bf 100644 --- a/src/rootserver/backup/ob_backup_clean_scheduler.cpp +++ 
b/src/rootserver/backup/ob_backup_clean_scheduler.cpp @@ -32,7 +32,6 @@ namespace rootserver ObBackupCleanScheduler::ObBackupCleanScheduler() : ObIBackupJobScheduler(BackupJobType::BACKUP_CLEAN_JOB), is_inited_(false), - server_mgr_(nullptr), sql_proxy_(nullptr), rpc_proxy_(nullptr), schema_service_(nullptr), @@ -43,7 +42,6 @@ ObBackupCleanScheduler::ObBackupCleanScheduler() } int ObBackupCleanScheduler::init( - ObServerManager &server_mgr, common::ObMySQLProxy &sql_proxy, obrpc::ObSrvRpcProxy &rpc_proxy, share::schema::ObMultiVersionSchemaService &schema_service, @@ -56,7 +54,6 @@ int ObBackupCleanScheduler::init( ret = OB_INIT_TWICE; LOG_WARN("init twice", K(ret)); } else { - server_mgr_ = &server_mgr; sql_proxy_ = &sql_proxy; rpc_proxy_ = &rpc_proxy; schema_service_ = &schema_service; @@ -1914,7 +1911,6 @@ int ObSysTenantBackupDeleteMgr::advance_status_( ObBackupAutoObsoleteDeleteTrigger::ObBackupAutoObsoleteDeleteTrigger() : ObIBackupTrigger(BackupTriggerType::BACKUP_AUTO_DELETE_TRIGGER), is_inited_(false), - server_mgr_(nullptr), sql_proxy_(nullptr), rpc_proxy_(nullptr), schema_service_(nullptr), @@ -1925,7 +1921,6 @@ ObBackupAutoObsoleteDeleteTrigger::ObBackupAutoObsoleteDeleteTrigger() } int ObBackupAutoObsoleteDeleteTrigger::init( - ObServerManager &server_mgr, common::ObMySQLProxy &sql_proxy, obrpc::ObSrvRpcProxy &rpc_proxy, share::schema::ObMultiVersionSchemaService &schema_service, @@ -1938,7 +1933,6 @@ int ObBackupAutoObsoleteDeleteTrigger::init( ret = OB_INIT_TWICE; LOG_WARN("init twice", K(ret)); } else { - server_mgr_ = &server_mgr; sql_proxy_ = &sql_proxy; rpc_proxy_ = &rpc_proxy; schema_service_ = &schema_service; diff --git a/src/rootserver/backup/ob_backup_clean_scheduler.h b/src/rootserver/backup/ob_backup_clean_scheduler.h index 572fa1a56..20d5ed284 100644 --- a/src/rootserver/backup/ob_backup_clean_scheduler.h +++ b/src/rootserver/backup/ob_backup_clean_scheduler.h @@ -25,7 +25,6 @@ class ObISQLClient; } namespace rootserver { -class 
ObServerManager; class ObIBackupDeleteMgr; class ObBackupCleanScheduler : public ObIBackupJobScheduler { @@ -43,7 +42,6 @@ public: virtual int get_need_reload_task(common::ObIAllocator &allocator, common::ObIArray &tasks) override; // reload tasks after switch master happend public: int init( - ObServerManager &server_mgr, common::ObMySQLProxy &sql_proxy, obrpc::ObSrvRpcProxy &rpc_proxy, share::schema::ObMultiVersionSchemaService &schema_service, @@ -94,7 +92,6 @@ private: int handle_failed_job_(const uint64_t tenant_id, const int64_t result, ObIBackupDeleteMgr &job_mgr, share::ObBackupCleanJobAttr &job_attr); private: bool is_inited_; - ObServerManager *server_mgr_; common::ObMySQLProxy *sql_proxy_; obrpc::ObSrvRpcProxy *rpc_proxy_; share::schema::ObMultiVersionSchemaService *schema_service_; @@ -259,7 +256,6 @@ public: virtual int process() override; public: int init( - ObServerManager &server_mgr, common::ObMySQLProxy &sql_proxy, obrpc::ObSrvRpcProxy &rpc_proxy, share::schema::ObMultiVersionSchemaService &schema_service, @@ -274,7 +270,6 @@ private: int parse_time_interval_(const char *str, int64_t &val); private: bool is_inited_; - ObServerManager *server_mgr_; common::ObMySQLProxy *sql_proxy_; obrpc::ObSrvRpcProxy *rpc_proxy_; share::schema::ObMultiVersionSchemaService *schema_service_; diff --git a/src/rootserver/backup/ob_backup_data_scheduler.cpp b/src/rootserver/backup/ob_backup_data_scheduler.cpp index 787d2fa00..52dd920cd 100644 --- a/src/rootserver/backup/ob_backup_data_scheduler.cpp +++ b/src/rootserver/backup/ob_backup_data_scheduler.cpp @@ -20,7 +20,6 @@ #include "rootserver/ob_root_utils.h" #include "share/backup/ob_tenant_archive_mgr.h" #include "share/backup/ob_backup_helper.h" -#include "rootserver/ob_server_manager.h" #include "observer/ob_sql_client_decorator.h" #include "share/ob_tenant_info_proxy.h" #include "share/backup/ob_backup_connectivity.h" diff --git a/src/rootserver/backup/ob_backup_data_scheduler.h 
b/src/rootserver/backup/ob_backup_data_scheduler.h index b10d7fb57..cec478c22 100644 --- a/src/rootserver/backup/ob_backup_data_scheduler.h +++ b/src/rootserver/backup/ob_backup_data_scheduler.h @@ -31,8 +31,6 @@ class ObISQLClient; namespace rootserver { - -class ObServerManager; class ObSysTenantBackupJobMgr; class ObBackupSetTaskMgr; class ObIBackupJobMgr; diff --git a/src/rootserver/backup/ob_backup_data_set_task_mgr.cpp b/src/rootserver/backup/ob_backup_data_set_task_mgr.cpp index 1f93786d4..e31d63c9d 100644 --- a/src/rootserver/backup/ob_backup_data_set_task_mgr.cpp +++ b/src/rootserver/backup/ob_backup_data_set_task_mgr.cpp @@ -19,7 +19,6 @@ #include "rootserver/ob_root_utils.h" #include "observer/omt/ob_tenant_config_mgr.h" #include "share/backup/ob_tenant_archive_mgr.h" -#include "rootserver/ob_server_manager.h" #include "observer/ob_sql_client_decorator.h" #include "storage/ls/ob_ls.h" #include "share/ls/ob_ls_operator.h" diff --git a/src/rootserver/backup/ob_backup_service.cpp b/src/rootserver/backup/ob_backup_service.cpp index 6183b823d..2b1f5ab5e 100644 --- a/src/rootserver/backup/ob_backup_service.cpp +++ b/src/rootserver/backup/ob_backup_service.cpp @@ -16,7 +16,6 @@ #include "ob_backup_schedule_task.h" #include "ob_backup_task_scheduler.h" #include "rootserver/ob_root_utils.h" -#include "rootserver/ob_server_manager.h" namespace oceanbase { @@ -50,7 +49,6 @@ ObBackupService::ObBackupService() } int ObBackupService::init( - ObServerManager &server_mgr, common::ObMySQLProxy &sql_proxy, obrpc::ObSrvRpcProxy &rpc_proxy, schema::ObMultiVersionSchemaService &schema_service, @@ -68,12 +66,12 @@ int ObBackupService::init( } else if (OB_FAIL(register_job_(&backup_data_scheduler_))) { LOG_WARN("fail to regist job", K(ret), "job_type", backup_data_scheduler_.get_job_type()); } else if (OB_FAIL(backup_clean_scheduler_.init( - server_mgr, sql_proxy, rpc_proxy, schema_service, lease_service, task_scheduler, *this))) { + sql_proxy, rpc_proxy, schema_service, 
lease_service, task_scheduler, *this))) { LOG_WARN("fail to init backup clean scheduler", K(ret)); } else if (OB_FAIL(register_job_(&backup_clean_scheduler_))) { LOG_WARN("fail to regist job", K(ret), "job_type", backup_clean_scheduler_.get_job_type()); } else if (OB_FAIL(backup_auto_obsolete_delete_trigger_.init( - server_mgr, sql_proxy, rpc_proxy, schema_service, lease_service, task_scheduler, *this))) { + sql_proxy, rpc_proxy, schema_service, lease_service, task_scheduler, *this))) { LOG_WARN("fail to init backup auto obsolete delete trigger", K(ret)); } else if (OB_FAIL(register_trigger_(&backup_auto_obsolete_delete_trigger_))) { LOG_WARN("fail to regist job", K(ret), "job_type", backup_auto_obsolete_delete_trigger_.get_trigger_type()); diff --git a/src/rootserver/backup/ob_backup_service.h b/src/rootserver/backup/ob_backup_service.h index d4ca344d2..ca5c86a96 100644 --- a/src/rootserver/backup/ob_backup_service.h +++ b/src/rootserver/backup/ob_backup_service.h @@ -35,7 +35,7 @@ class ObBackupService : public ObRsReentrantThread public: ObBackupService(); virtual ~ObBackupService() {}; - int init(ObServerManager &server_mgr, common::ObMySQLProxy &sql_proxy, obrpc::ObSrvRpcProxy &rpc_proxy, + int init(common::ObMySQLProxy &sql_proxy, obrpc::ObSrvRpcProxy &rpc_proxy, share::schema::ObMultiVersionSchemaService &schema_service, ObBackupLeaseService &lease_service, ObBackupTaskScheduler &task_scheduler); virtual void run3() override; diff --git a/src/rootserver/backup/ob_backup_task_scheduler.cpp b/src/rootserver/backup/ob_backup_task_scheduler.cpp index ecb5cf145..f455978d5 100644 --- a/src/rootserver/backup/ob_backup_task_scheduler.cpp +++ b/src/rootserver/backup/ob_backup_task_scheduler.cpp @@ -17,12 +17,12 @@ #include "lib/lock/ob_mutex.h" #include "lib/stat/ob_diagnose_info.h" #include "lib/profile/ob_trace_id.h" -#include "rootserver/ob_server_manager.h" #include "lib/alloc/ob_malloc_allocator.h" #include "lib/oblog/ob_log_module.h" #include 
"share/ob_rpc_struct.h" #include "rootserver/ob_rs_event_history_table_operator.h" #include "share/ob_srv_rpc_proxy.h" +#include "share/ob_all_server_tracer.h" namespace oceanbase { using namespace common; @@ -43,7 +43,6 @@ ObBackupTaskSchedulerQueue::ObBackupTaskSchedulerQueue() task_map_(), rpc_proxy_(nullptr), task_scheduler_(nullptr), - server_mgr_(nullptr), zone_mgr_(nullptr), backup_service_(nullptr), sql_proxy_(nullptr), @@ -97,7 +96,6 @@ void ObBackupTaskSchedulerQueue::reset() int ObBackupTaskSchedulerQueue::init( ObTenantBackupScheduleTaskStatMap &tenant_stat_map, ObServerBackupScheduleTaskStatMap &server_stat_map, - ObServerManager &server_manager, ObZoneManager &zone_manager, ObBackupService &backup_service, const int64_t bucket_num, @@ -124,7 +122,6 @@ int ObBackupTaskSchedulerQueue::init( max_size_ = max_size; tenant_stat_map_ = &tenant_stat_map; server_stat_map_ = &server_stat_map; - server_mgr_ = &server_manager; zone_mgr_ = &zone_manager; rpc_proxy_ = rpc_proxy; task_scheduler_ = task_scheduler; @@ -406,7 +403,8 @@ int ObBackupTaskSchedulerQueue::get_all_servers_( tmp_server_list.reuse(); const ObZone &zone = all_zones.at(i).zone_; const int64_t priority = all_zones.at(i).priority_; - if (OB_FAIL(server_mgr_->get_alive_servers(zone, tmp_server_list))) { + // **FIXME (linqiucen.lqc): temp. 
solution, this will be replaced when transfer branch is merged + if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, tmp_server_list))) { LOG_WARN("failed to get alive servers", KR(ret), K(zone)); } else { for (int64_t j = 0; OB_SUCC(ret) && j < tmp_server_list.count(); ++j) { @@ -1084,7 +1082,6 @@ ObBackupTaskScheduler::ObBackupTaskScheduler() server_stat_map_(), queue_(), self_(), - server_mgr_(nullptr), zone_mgr_(nullptr), rpc_proxy_(nullptr), backup_service_(nullptr), @@ -1094,7 +1091,6 @@ ObBackupTaskScheduler::ObBackupTaskScheduler() } int ObBackupTaskScheduler::init( - ObServerManager *server_mgr, ObZoneManager *zone_mgr, obrpc::ObSrvRpcProxy *rpc_proxy, ObBackupService *backup_mgr, @@ -1108,13 +1104,12 @@ int ObBackupTaskScheduler::init( if (IS_INIT) { ret = OB_INIT_TWICE; LOG_WARN("init twice", K(ret)); - } else if (OB_UNLIKELY(nullptr == server_mgr || nullptr == rpc_proxy || nullptr == zone_mgr || nullptr == service)) { + } else if (OB_UNLIKELY(nullptr == rpc_proxy || nullptr == zone_mgr || nullptr == service)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), K(server_mgr), K(rpc_proxy), K(zone_mgr)); + LOG_WARN("invalid argument", K(ret), K(rpc_proxy), K(zone_mgr)); } else if (OB_FAIL(create(backup_task_scheduler_thread_cnt, BACKUPTASKSCHEDULER))) { LOG_WARN("create backup task scheduler thread failed", K(ret), K(backup_task_scheduler_thread_cnt)); } else { - server_mgr_ = server_mgr; zone_mgr_ = zone_mgr; rpc_proxy_ = rpc_proxy; backup_service_ = backup_mgr; @@ -1124,7 +1119,7 @@ int ObBackupTaskScheduler::init( LOG_WARN("init tenant stat failed", K(ret), LITERAL_K(MAX_BACKUP_TASK_QUEUE_LIMIT)); } else if (OB_FAIL(server_stat_map_.init(MAX_BACKUP_TASK_QUEUE_LIMIT))) { LOG_WARN("init server stat failed", K(ret), LITERAL_K(MAX_BACKUP_TASK_QUEUE_LIMIT)); - } else if (OB_FAIL(queue_.init(tenant_stat_map_, server_stat_map_, *server_mgr, + } else if (OB_FAIL(queue_.init(tenant_stat_map_, server_stat_map_, *zone_mgr, *backup_mgr, 
MAX_BACKUP_TASK_QUEUE_LIMIT, rpc_proxy_, this, MAX_BACKUP_TASK_QUEUE_LIMIT, sql_proxy, lease_service))) { LOG_WARN("init rebalance task queue failed", K(ret), LITERAL_K(MAX_BACKUP_TASK_QUEUE_LIMIT)); } else { @@ -1289,19 +1284,21 @@ int ObBackupTaskScheduler::check_alive_(int64_t &last_check_task_on_server_ts, b bool is_exist = true; ObBackupScheduleTask *task = schedule_tasks.at(i); const ObAddr dst = task->get_dst(); - share::ObServerStatus server_status; + share::ObServerInfoInTable server_info; obrpc::ObBackupCheckTaskArg check_task_arg; check_task_arg.tenant_id_ = task->get_tenant_id(); check_task_arg.trace_id_ = task->get_trace_id(); if ((now - task->get_generate_time() < backup_task_keep_alive_interval) && !reload_flag) { // no need to check alive, wait next turn - } else if (OB_FAIL(server_mgr_->is_server_exist(dst, is_exist))) { + // **FIXME (linqiucen.lqc): temp. solution, this will be replaced when transfer branch is merged + } else if (OB_FAIL(SVR_TRACER.is_server_exist(dst, is_exist))) { LOG_WARN("fail to check server exist", K(ret), K(dst)); } else if (!is_exist) { LOG_WARN("backup dest server is not exist", K(ret), K(dst)); - } else if (OB_FAIL(server_mgr_->get_server_status(dst, server_status))) { - LOG_WARN("fail to get server status", K(ret), K(dst)); - } else if (!server_status.is_active() || !server_status.in_service()) { + // **FIXME (linqiucen.lqc): temp. 
solution, this will be replaced when transfer branch is merged + } else if (OB_FAIL(SVR_TRACER.get_server_info(dst, server_info))) { + LOG_WARN("fail to get server_info", K(ret), K(dst)); + } else if (!server_info.is_active() || !server_info.in_service()) { is_exist = false; LOG_WARN("server status may not active or in service", K(ret), K(dst)); } else if (OB_FAIL(rpc_proxy_->to(dst).check_backup_task_exist(check_task_arg, res))) { @@ -1383,12 +1380,14 @@ int ObBackupTaskScheduler::do_execute_(const ObBackupScheduleTask &task) bool is_alive = false; bool in_service = false; common::ObAddr leader; - if (OB_FAIL(server_mgr_->check_server_alive(online_server, is_alive))) { + // **FIXME (linqiucen.lqc): temp. solution, this will be replaced when transfer branch is merged + if (OB_FAIL(SVR_TRACER.check_server_alive(online_server, is_alive))) { LOG_WARN("check server alive failed", K(ret), K(online_server)); } else if (!is_alive) { ret = OB_REBALANCE_TASK_CANT_EXEC; LOG_WARN("dst server not alive", K(ret), K(online_server)); - } else if (OB_FAIL(server_mgr_->check_in_service(online_server, in_service))) { + // **FIXME (linqiucen.lqc): temp. 
solution, this will be replaced when transfer branch is merged + } else if (OB_FAIL(SVR_TRACER.check_in_service(online_server, in_service))) { LOG_WARN("check in service failed", K(ret), K(online_server)); } else if (!in_service) { ret = OB_REBALANCE_TASK_CANT_EXEC; diff --git a/src/rootserver/backup/ob_backup_task_scheduler.h b/src/rootserver/backup/ob_backup_task_scheduler.h index 12bfc15a4..7422fd9a8 100644 --- a/src/rootserver/backup/ob_backup_task_scheduler.h +++ b/src/rootserver/backup/ob_backup_task_scheduler.h @@ -27,8 +27,6 @@ class ObMutex; } namespace rootserver { - -class ObServerManager; class ObZoneManager; class ObBackupTaskScheduler; class ObBackupService; @@ -43,8 +41,7 @@ public: virtual ~ObBackupTaskSchedulerQueue(); int init(ObTenantBackupScheduleTaskStatMap &tenant_stat_map, - ObServerBackupScheduleTaskStatMap &server_stat_map, - ObServerManager &server_manager, + ObServerBackupScheduleTaskStatMap &server_stat_map, ObZoneManager &zone_manager, ObBackupService &backup_mgr, const int64_t bucket_num, @@ -129,7 +126,6 @@ private: TaskMap task_map_; obrpc::ObSrvRpcProxy *rpc_proxy_; ObBackupTaskScheduler *task_scheduler_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; ObBackupService *backup_service_; common::ObMySQLProxy *sql_proxy_; @@ -155,8 +151,7 @@ public: public: ObBackupTaskScheduler(); - int init(ObServerManager *server_mgr, - ObZoneManager *zone_mgr_, + int init(ObZoneManager *zone_mgr_, obrpc::ObSrvRpcProxy *rpc_proxy, ObBackupService *backup_mgr, common::ObMySQLProxy &sql_proxy, @@ -202,7 +197,6 @@ private: ObBackupTaskSchedulerQueue queue_; // scheduler's self server addr common::ObAddr self_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; obrpc::ObSrvRpcProxy *rpc_proxy_; ObBackupService *backup_service_; diff --git a/src/rootserver/backup/ob_tenant_archive_scheduler.cpp b/src/rootserver/backup/ob_tenant_archive_scheduler.cpp index 5ab258918..2afe7d8cd 100644 --- 
a/src/rootserver/backup/ob_tenant_archive_scheduler.cpp +++ b/src/rootserver/backup/ob_tenant_archive_scheduler.cpp @@ -12,7 +12,6 @@ #define USING_LOG_PREFIX ARCHIVE #include "rootserver/backup/ob_tenant_archive_scheduler.h" -#include "rootserver/ob_server_manager.h" #include "rootserver/ob_rs_event_history_table_operator.h" #include "rootserver/ob_unit_manager.h" #include "storage/tx/ob_ts_mgr.h" @@ -385,7 +384,7 @@ static int round_checkpoint_cb( */ ObArchiveHandler::ObArchiveHandler() : is_inited_(false), tenant_id_(OB_INVALID_TENANT_ID), - server_mgr_(nullptr), zone_mgr_(nullptr), unit_mgr_(nullptr), rpc_proxy_(nullptr), + zone_mgr_(nullptr), unit_mgr_(nullptr), rpc_proxy_(nullptr), sql_proxy_(nullptr), schema_service_(nullptr), round_handler_(), archive_table_op_() { @@ -394,7 +393,6 @@ ObArchiveHandler::ObArchiveHandler() int ObArchiveHandler::init( const uint64_t tenant_id, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObUnitManager &unit_manager, share::schema::ObMultiVersionSchemaService *schema_service, @@ -415,7 +413,6 @@ int ObArchiveHandler::init( LOG_WARN("failed to init archive round", K(ret), K(tenant_id)); } else { tenant_id_ = tenant_id; - server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; unit_mgr_ = &unit_manager; schema_service_ = schema_service; diff --git a/src/rootserver/backup/ob_tenant_archive_scheduler.h b/src/rootserver/backup/ob_tenant_archive_scheduler.h index 60e40d0d2..27e0193b8 100644 --- a/src/rootserver/backup/ob_tenant_archive_scheduler.h +++ b/src/rootserver/backup/ob_tenant_archive_scheduler.h @@ -32,7 +32,6 @@ namespace common { namespace rootserver { -class ObServerManager; class ObZoneManager; class ObUnitManager; @@ -44,7 +43,6 @@ public: int init( const uint64_t tenant_id, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObUnitManager &unit_manager, share::schema::ObMultiVersionSchemaService *schema_service, @@ -77,7 +75,6 @@ private: private: bool is_inited_; uint64_t tenant_id_; // user tenant id - 
ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; ObUnitManager *unit_mgr_; obrpc::ObSrvRpcProxy *rpc_proxy_; diff --git a/src/rootserver/ddl_task/ob_ddl_task.cpp b/src/rootserver/ddl_task/ob_ddl_task.cpp index ff2a6d54d..29fc85d03 100644 --- a/src/rootserver/ddl_task/ob_ddl_task.cpp +++ b/src/rootserver/ddl_task/ob_ddl_task.cpp @@ -34,7 +34,6 @@ #include "share/ob_max_id_fetcher.h" #include "share/ob_freeze_info_proxy.h" #include "share/scheduler/ob_sys_task_stat.h" -#include "rootserver/ob_server_manager.h" #include "rootserver/ob_zone_manager.h" #include "rootserver/ob_ddl_service.h" #include "rootserver/ob_root_service.h" diff --git a/src/rootserver/freeze/ob_checksum_validator.h b/src/rootserver/freeze/ob_checksum_validator.h index cb1982638..3c9adeb56 100644 --- a/src/rootserver/freeze/ob_checksum_validator.h +++ b/src/rootserver/freeze/ob_checksum_validator.h @@ -29,7 +29,6 @@ namespace rootserver { class ObZoneMergeManager; class ObFreezeInfoManager; -class ObServerManager; struct ObMergeTimeStatistics; class ObMergeErrorCallback diff --git a/src/rootserver/ob_all_server_checker.cpp b/src/rootserver/ob_all_server_checker.cpp index 39423d73a..bc01bdb4f 100644 --- a/src/rootserver/ob_all_server_checker.cpp +++ b/src/rootserver/ob_all_server_checker.cpp @@ -16,6 +16,7 @@ #include "share/config/ob_server_config.h" #include "rootserver/ob_server_manager.h" +#include "rootserver/ob_heartbeat_service.h" namespace oceanbase { @@ -149,8 +150,12 @@ ObCheckServerTask::ObCheckServerTask(common::ObWorkQueue &work_queue, int ObCheckServerTask::process() { int ret = OB_SUCCESS; - if (OB_FAIL(checker_.check_all_server())) { - LOG_WARN("checker all server failed", K(ret)); + if (!ObHeartbeatService::is_service_enabled()) { + if (OB_FAIL(checker_.check_all_server())) { + LOG_WARN("checker all server failed", K(ret)); + } + } else { + LOG_TRACE("no need to do ObCheckServerTask in version >= 4.2"); } return ret; } diff --git a/src/rootserver/ob_all_server_task.cpp 
b/src/rootserver/ob_all_server_task.cpp index 665ae1e6c..b35c9e734 100644 --- a/src/rootserver/ob_all_server_task.cpp +++ b/src/rootserver/ob_all_server_task.cpp @@ -21,6 +21,8 @@ #include "rootserver/ob_disaster_recovery_task_mgr.h" #include "rootserver/ob_root_utils.h" #include "observer/ob_server_struct.h" +#include "share/ob_all_server_tracer.h" +#include "ob_heartbeat_service.h" namespace oceanbase { @@ -47,18 +49,25 @@ ObAllServerTask::~ObAllServerTask() int ObAllServerTask::process() { int ret = OB_SUCCESS; - if (OB_ISNULL(ObCurTraceId::get_trace_id())) { - //Prevent the current trace_id from being overwritten - ObCurTraceId::init(GCONF.self_addr_); - } - - THIS_WORKER.set_timeout_ts(INT64_MAX); - if (!ObRootServiceRoleChecker::is_rootserver()) { - ret = OB_NOT_MASTER; - LOG_WARN("not master", K(ret)); - } else if (OB_FAIL(server_manager_.adjust_server_status( - server_, disaster_recovery_task_mgr_, with_rootserver_))) { - LOG_WARN("fail to adjust server status", K(ret), K(server_)); + int tmp_ret = OB_SUCCESS; + if (!ObHeartbeatService::is_service_enabled()) { + if (OB_ISNULL(ObCurTraceId::get_trace_id())) { + //Prevent the current trace_id from being overwritten + ObCurTraceId::init(GCONF.self_addr_); + } + THIS_WORKER.set_timeout_ts(INT64_MAX); + if (!ObRootServiceRoleChecker::is_rootserver()) { + ret = OB_NOT_MASTER; + LOG_WARN("not master", K(ret)); + } else if (OB_FAIL(server_manager_.adjust_server_status( + server_, disaster_recovery_task_mgr_, with_rootserver_))) { + LOG_WARN("fail to adjust server status", K(ret), K(server_)); + } + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh all server tracer", KR(ret), KR(tmp_ret)); + } + } else { + LOG_TRACE("no need to do ObAllServerTask in version >= 4.2"); } return ret; } diff --git a/src/rootserver/ob_alter_locality_finish_checker.cpp b/src/rootserver/ob_alter_locality_finish_checker.cpp index 081073ecb..19b8f91fb 100644 --- a/src/rootserver/ob_alter_locality_finish_checker.cpp +++ 
b/src/rootserver/ob_alter_locality_finish_checker.cpp @@ -34,7 +34,6 @@ ObAlterLocalityFinishChecker::ObAlterLocalityFinishChecker(volatile bool &stop) common_rpc_proxy_(NULL), self_(), unit_mgr_(NULL), - server_mgr_(NULL), zone_mgr_(NULL), sql_proxy_(NULL), stop_(stop) @@ -50,7 +49,6 @@ int ObAlterLocalityFinishChecker::init( obrpc::ObCommonRpcProxy &common_rpc_proxy, common::ObAddr &addr, ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, common::ObMySQLProxy &sql_proxy, share::ObLSTableOperator &lst_operator) @@ -67,7 +65,6 @@ int ObAlterLocalityFinishChecker::init( common_rpc_proxy_ = &common_rpc_proxy; self_ = addr; unit_mgr_ = &unit_mgr; - server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; sql_proxy_ = &sql_proxy; lst_operator_ = &lst_operator; @@ -89,11 +86,9 @@ int ObAlterLocalityFinishChecker::check() } else if (OB_ISNULL(schema_service_) || OB_ISNULL(unit_mgr_) || OB_ISNULL(zone_mgr_) - || OB_ISNULL(server_mgr_) || !self_.is_valid()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), KP_(schema_service), KP_(unit_mgr), - KP_(zone_mgr), KP_(server_mgr), K_(self)); + LOG_WARN("invalid argument", KR(ret), KP_(schema_service), KP_(unit_mgr), KP_(zone_mgr), K_(self)); } else if (OB_FAIL(check_stop())) { LOG_WARN("ObAlterLocalityFinishChecker stopped", KR(ret)); } else if (OB_FAIL(schema_service_->get_tenant_schema_guard(OB_SYS_TENANT_ID, schema_guard))) { @@ -123,7 +118,6 @@ int ObAlterLocalityFinishChecker::check() } else if (OB_SUCCESS != (tmp_ret = ObDRWorker::check_tenant_locality_match( tenant_id, *unit_mgr_, - *server_mgr_, *zone_mgr_, alter_locality_finish))){ LOG_WARN("fail to check tenant locality match", KR(tmp_ret), K(tenant_id), K(alter_locality_finish)); @@ -131,7 +125,6 @@ int ObAlterLocalityFinishChecker::check() && OB_SUCCESS != (tmp_ret = ObDRWorker::check_tenant_locality_match( gen_meta_tenant_id(tenant_id), *unit_mgr_, - *server_mgr_, *zone_mgr_, meta_alter_locality_finish))){ LOG_WARN("fail to 
check tenant locality match", KR(tmp_ret), "meta_tenant_id", diff --git a/src/rootserver/ob_alter_locality_finish_checker.h b/src/rootserver/ob_alter_locality_finish_checker.h index 84aff62e0..8ccfd2450 100644 --- a/src/rootserver/ob_alter_locality_finish_checker.h +++ b/src/rootserver/ob_alter_locality_finish_checker.h @@ -33,7 +33,6 @@ class ObLSTableOperator; namespace rootserver { class ObUnitManager; -class ObServerManager; class ObZoneManager; class DRLSInfo; class LocalityMap; @@ -60,7 +59,6 @@ public: obrpc::ObCommonRpcProxy &common_rpc_proxy, common::ObAddr &self, ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, common::ObMySQLProxy &sql_proxy, share::ObLSTableOperator &lst_operator); @@ -76,7 +74,6 @@ private: obrpc::ObCommonRpcProxy *common_rpc_proxy_; //use GCTX.rs_rpc_proxy_ common::ObAddr self_; ObUnitManager *unit_mgr_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; common::ObMySQLProxy *sql_proxy_; share::ObLSTableOperator *lst_operator_; diff --git a/src/rootserver/ob_balance_info.cpp b/src/rootserver/ob_balance_info.cpp index dd5b151aa..0f130c8a5 100644 --- a/src/rootserver/ob_balance_info.cpp +++ b/src/rootserver/ob_balance_info.cpp @@ -23,7 +23,6 @@ #include "share/schema/ob_table_schema.h" #include "share/schema/ob_schema_getter_guard.h" #include "share/schema/ob_part_mgr_util.h" -#include "ob_server_manager.h" #include "ob_unit_manager.h" #include "ob_zone_manager.h" #include "ob_root_utils.h" diff --git a/src/rootserver/ob_balance_info.h b/src/rootserver/ob_balance_info.h index 3d34042a0..6c531f92a 100644 --- a/src/rootserver/ob_balance_info.h +++ b/src/rootserver/ob_balance_info.h @@ -41,7 +41,7 @@ class ObSchemaGetterGuard; namespace rootserver { class ObUnitManager; -class ObServerManager; + class ObZoneManager; class ObDataSourceCandidateChecker diff --git a/src/rootserver/ob_bootstrap.cpp b/src/rootserver/ob_bootstrap.cpp index 5e143d2c0..a2e92714b 100644 --- a/src/rootserver/ob_bootstrap.cpp 
+++ b/src/rootserver/ob_bootstrap.cpp @@ -44,15 +44,19 @@ #include "storage/ob_file_system_router.h" #include "share/ls/ob_ls_creator.h"//ObLSCreator #include "share/ls/ob_ls_life_manager.h"//ObLSLifeAgentManager +#include "share/ob_all_server_tracer.h" #include "rootserver/ob_rs_event_history_table_operator.h" #include "rootserver/ob_rs_async_rpc_proxy.h" #include "rootserver/ob_ddl_operator.h" #include "rootserver/ob_locality_util.h" #include "rootserver/ob_rs_async_rpc_proxy.h" +#include "rootserver/ob_server_zone_op_service.h" #include "observer/ob_server_struct.h" #include "rootserver/freeze/ob_freeze_info_manager.h" #include "rootserver/ob_table_creator.h" #include "share/scn.h" +#include "rootserver/ob_heartbeat_service.h" +#include "rootserver/ob_root_service.h" namespace oceanbase { @@ -503,11 +507,10 @@ ObBootstrap::ObBootstrap( { } -int ObBootstrap::execute_bootstrap() +int ObBootstrap::execute_bootstrap(rootserver::ObServerZoneOpService &server_zone_op_service) { int ret = OB_SUCCESS; bool already_bootstrap = true; - uint64_t server_id = OB_INIT_SERVER_ID; ObSArray table_schemas; begin_ts_ = ObTimeUtility::current_time(); @@ -522,12 +525,8 @@ int ObBootstrap::execute_bootstrap() LOG_WARN("ob system is already bootstrap, cannot bootstrap again", K(ret)); } else if (OB_FAIL(check_bootstrap_rs_list(rs_list_))) { LOG_WARN("failed to check_bootstrap_rs_list", K_(rs_list), K(ret)); - } else if (OB_FAIL(add_rs_list(server_id))) { - LOG_WARN("fail to add rs list to server manager", K(ret)); } else if (OB_FAIL(create_all_core_table_partition())) { LOG_WARN("fail to create all core_table partition", KR(ret)); - } else if (OB_FAIL(wait_all_rs_online())) { - LOG_WARN("failed to wait all rs online", K(ret)); } else if (OB_FAIL(set_in_bootstrap())) { LOG_WARN("failed to set in bootstrap", K(ret)); } else if (OB_FAIL(init_global_stat())) { @@ -545,24 +544,23 @@ int ObBootstrap::execute_bootstrap() ObMultiVersionSchemaService &schema_service = 
ddl_service_.get_schema_service(); if (OB_SUCC(ret)) { - if (OB_FAIL(init_system_data(server_id))) { - LOG_WARN("failed to init system data", K(server_id), K(ret)); + if (OB_FAIL(init_system_data())) { + LOG_WARN("failed to init system data", KR(ret)); } else if (OB_FAIL(ddl_service_.refresh_schema(OB_SYS_TENANT_ID))) { LOG_WARN("failed to refresh_schema", K(ret)); } } BOOTSTRAP_CHECK_SUCCESS_V2("refresh_schema"); - - if (OB_SUCC(ret)) { - if (OB_FAIL(wait_all_rs_in_service())) { - LOG_WARN("failed to wait all rs in service", KR(ret)); - } else if (OB_FAIL(init_backup_inner_table())) { - LOG_WARN("failed to init backup inner table", KR(ret)); - } else if (OB_FAIL(init_backup_data())) { - LOG_WARN("failed to init backup inner table version", KR(ret)); - } else { - ROOTSERVICE_EVENT_ADD("bootstrap", "bootstrap_succeed"); - } + if (FAILEDx(add_servers_in_rs_list(server_zone_op_service))) { + LOG_WARN("fail to add servers in rs_list_", KR(ret)); + } else if (OB_FAIL(wait_all_rs_in_service())) { + LOG_WARN("failed to wait all rs in service", KR(ret)); + } else if (OB_FAIL(init_backup_inner_table())) { + LOG_WARN("failed to init backup inner table", KR(ret)); + } else if (OB_FAIL(init_backup_data())) { + LOG_WARN("failed to init backup inner table version", KR(ret)); + } else { + ROOTSERVICE_EVENT_ADD("bootstrap", "bootstrap_succeed"); } BOOTSTRAP_CHECK_SUCCESS(); @@ -884,12 +882,7 @@ int ObBootstrap::broadcast_sys_schema(const ObSArray &table_schem if (INT64_MAX != THIS_WORKER.get_timeout_ts()) { rpc_timeout = max(rpc_timeout, THIS_WORKER.get_timeout_remain()); } - if (OB_FAIL(ddl_service_.get_server_manager().check_server_active(rs->server_, is_active))) { - LOG_WARN("check_server_active failed", KR(ret), "server", rs->server_); - } else if (!is_active) { - ret = OB_SERVER_NOT_ACTIVE; - LOG_WARN("server not active", KR(ret), "server", rs->server_); - } else if (OB_FAIL(proxy.call(rs->server_, rpc_timeout, arg))) { + if (OB_FAIL(proxy.call(rs->server_, rpc_timeout, 
arg))) { LOG_WARN("broadcast_sys_schema failed", KR(ret), K(rpc_timeout), "server", rs->server_); } @@ -1045,60 +1038,39 @@ int ObBootstrap::construct_schema( return ret; } -int ObBootstrap::add_rs_list(uint64_t &server_id) -{ +int ObBootstrap::add_servers_in_rs_list(rootserver::ObServerZoneOpService &server_zone_op_service) { int ret = OB_SUCCESS; - ObServerManager &server_mgr = ddl_service_.get_server_manager(); - if (OB_FAIL(check_inner_stat())) { - LOG_WARN("check_inner_stat failed", K(ret)); - } else if (OB_FAIL(server_mgr.add_server_list(rs_list_, server_id))) { - LOG_WARN("add_server_list failed", K_(rs_list), K(ret)); - } - BOOTSTRAP_CHECK_SUCCESS(); - return ret; -} - -int ObBootstrap::wait_all_rs_online() -{ - int ret = OB_SUCCESS; - int64_t left_time_can_sleep = 0; - ObServerManager &server_mgr = ddl_service_.get_server_manager(); - if (OB_FAIL(check_inner_stat())) { - LOG_WARN("check_inner_stat failed", K(ret)); - } else if (OB_FAIL(server_mgr.get_lease_duration(left_time_can_sleep))) { - LOG_WARN("get_lease_duration failed", K(ret)); + ObArray servers; + if (OB_ISNULL(GCTX.root_service_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.root_service_ is null", KR(ret), KP(GCTX.root_service_)); } else { - while (OB_SUCC(ret)) { - if (!ObRootServiceRoleChecker::is_rootserver()) { - ret = OB_RS_SHUTDOWN; - LOG_WARN("wait all rs online fail, self is not master rootservice any more, check SYS LS leader revoke infos", - KR(ret), K(left_time_can_sleep)); - break; - } - - bool all_alive = true; - if (INT64_MAX != THIS_WORKER.get_timeout_ts()) { - left_time_can_sleep = max(left_time_can_sleep, THIS_WORKER.get_timeout_remain()); - } - for (int64_t i = 0; OB_SUCC(ret) && i < rs_list_.count(); ++i) { - bool is_alive = false; - if (OB_FAIL(server_mgr.check_server_alive(rs_list_.at(i).server_, is_alive))) { - LOG_WARN("check_server_alive failed", "server", rs_list_.at(i).server_, K(ret)); - } else if (!is_alive) { - LOG_WARN("server is not alive", "server", 
rs_list_.at(i).server_, K(is_alive)); - all_alive = false; - break; + if (!ObHeartbeatService::is_service_enabled()) { + for (int64_t i = 0; OB_SUCC(ret) && i < rs_list_.count(); i++) { + const ObAddr &server = rs_list_.at(i).server_; + const ObZone &zone = rs_list_.at(i).zone_; + if (OB_FAIL(GCTX.root_service_->add_server_for_bootstrap_in_version_smaller_than_4_2_0(server, zone))) { + LOG_WARN("fail to add server in version < 4.2", KR(ret), K(server), K(zone)); } + FLOG_INFO("add servers in rs_list_ in version < 4.2", KR(ret), K(server), K(zone)); } - if (OB_FAIL(ret)) { - } else if (all_alive) { - break; - } else if (left_time_can_sleep > 0) { - USLEEP(min(left_time_can_sleep, 200 * 1000)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < rs_list_.count(); i++) { + servers.reuse(); + const ObAddr &server = rs_list_.at(i).server_; + const ObZone &zone = rs_list_.at(i).zone_; + if (OB_FAIL(servers.push_back(server))) { + LOG_WARN("fail to push an element into servers", KR(ret), K(server)); + } else if (OB_FAIL(server_zone_op_service.add_servers(servers, zone, true /* is_bootstrap */))) { + LOG_WARN("fail to add servers", KR(ret), K(servers), K(zone)); + } + FLOG_INFO("add servers in rs_list_ in version >= 4.2", KR(ret), K(servers), K(zone)); + } + if (FAILEDx(GCTX.root_service_->load_server_manager())) { + LOG_WARN("fail to load server manager", KR(ret), KP(GCTX.root_service_)); } } } - BOOTSTRAP_CHECK_SUCCESS(); return ret; } @@ -1124,8 +1096,13 @@ int ObBootstrap::wait_all_rs_in_service() if (INT64_MAX != THIS_WORKER.get_timeout_ts()) { left_time_can_sleep = max(left_time_can_sleep, THIS_WORKER.get_timeout_remain()); } - if (OB_FAIL(ddl_service_.get_server_manager().check_in_service(rs->server_, in_service))) { + // mark + if (OB_FAIL(SVR_TRACER.check_in_service(rs->server_, in_service))) { LOG_WARN("check_in_service failed", "server", rs->server_, K(ret)); + if (OB_ENTRY_NOT_EXIST == ret) { + ret = OB_SUCCESS; + all_in_service = false; + } } else if 
(!in_service) { LOG_WARN("server is not in_service ", "server", rs->server_); all_in_service = false; @@ -1405,14 +1382,11 @@ int ObBootstrap::insert_sys_ls_(const share::schema::ObTenantSchema &tenant_sche } -int ObBootstrap::init_system_data(const uint64_t server_id) +int ObBootstrap::init_system_data() { int ret = OB_SUCCESS; if (OB_FAIL(check_inner_stat())) { LOG_WARN("check_inner_stat failed", KR(ret)); - } else if (OB_INVALID_ID == server_id) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server_id", K(server_id), KR(ret)); } else if (OB_FAIL(unit_mgr_.load())) { LOG_WARN("unit_mgr load failed", KR(ret)); } else if (OB_FAIL(create_sys_unit_config())) { @@ -1421,8 +1395,6 @@ int ObBootstrap::init_system_data(const uint64_t server_id) LOG_WARN("create sys resource pool failed", KR(ret)); } else if (OB_FAIL(create_sys_tenant())) { LOG_WARN("create system tenant failed", KR(ret)); - } else if (OB_FAIL(init_server_id(server_id))) { - LOG_WARN("init server id failed", K(server_id), KR(ret)); } else if (OB_FAIL(init_all_zone_table())) { LOG_WARN("failed to init all zone table", KR(ret)); } @@ -1461,7 +1433,9 @@ int ObBootstrap::init_backup_data() LOG_WARN("failed to init backup inner table version", K(ret)); } else if (OB_FAIL(ObBackupInfoOperator::set_backup_leader_epoch(ddl_service_.get_sql_proxy(), 1))) { LOG_WARN("failed to init backup leader epoch", K(ret)); - } else if (OB_FAIL(ObBackupInfoOperator::set_backup_leader(ddl_service_.get_sql_proxy(), ddl_service_.get_server_manager().get_rs_addr()))) { + // mark + } else if (OB_FAIL(ObBackupInfoOperator::set_backup_leader(ddl_service_.get_sql_proxy(), GCTX.self_addr()))) { + // LOG_WARN("failed to init backup leader", K(ret)); } @@ -1631,24 +1605,6 @@ int ObBootstrap::init_all_zone_table() return ret; } -//FIXME:it need to write in new table, if table name changes after splitting -int ObBootstrap::init_server_id(const uint64_t server_id) -{ - int ret = OB_SUCCESS; - ObMaxIdFetcher 
fetcher(ddl_service_.get_sql_proxy()); - if (OB_FAIL(check_inner_stat())) { - LOG_WARN("check_inner_stat failed", K(ret)); - } else if (OB_INVALID_ID == server_id) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server_id", K(server_id), K(ret)); - } else if (OB_FAIL(fetcher.update_max_id(ddl_service_.get_sql_proxy(), - OB_SYS_TENANT_ID, OB_MAX_USED_SERVER_ID_TYPE, server_id))) { - LOG_WARN("update max used server id failed", K(server_id), K(ret)); - } - BOOTSTRAP_CHECK_SUCCESS(); - return ret; -} - template int ObBootstrap::set_replica_options(SCHEMA &schema) { diff --git a/src/rootserver/ob_bootstrap.h b/src/rootserver/ob_bootstrap.h index 09de8bc7d..c2d87de6a 100644 --- a/src/rootserver/ob_bootstrap.h +++ b/src/rootserver/ob_bootstrap.h @@ -52,6 +52,7 @@ namespace rootserver class ObRsGtsManager; struct ObSysStat; class ObTableCreator; +class ObServerZoneOpService; class ObBaseBootstrap { @@ -138,7 +139,7 @@ public: obrpc::ObCommonRpcProxy &rs_rpc_proxy); virtual ~ObBootstrap() {} - virtual int execute_bootstrap(); + virtual int execute_bootstrap(rootserver::ObServerZoneOpService &server_zone_op_service); static int create_all_schema( ObDDLService &ddl_service, common::ObIArray &table_schemas); @@ -170,12 +171,10 @@ private: virtual int check_is_already_bootstrap(bool &is_bootstrap); virtual int init_global_stat(); virtual int init_sequence_id(); - virtual int init_system_data(const uint64_t server_id); + virtual int init_system_data(); virtual int init_all_zone_table(); virtual int init_multiple_zone_deployment_table(common::ObISQLClient &sql_client); - virtual int init_server_id(const uint64_t server_id); - virtual int add_rs_list(uint64_t &server_id); - virtual int wait_all_rs_online(); + virtual int add_servers_in_rs_list(rootserver::ObServerZoneOpService &server_zone_op_service); virtual int wait_all_rs_in_service(); int init_backup_inner_table(); int init_backup_data(); diff --git a/src/rootserver/ob_ddl_operator.cpp b/src/rootserver/ob_ddl_operator.cpp 
index 7d0f071e1..82440d86d 100644 --- a/src/rootserver/ob_ddl_operator.cpp +++ b/src/rootserver/ob_ddl_operator.cpp @@ -126,7 +126,7 @@ int ObSysStat::set_initial_values(const uint64_t tenant_id) ob_max_used_unit_config_id_.value_.set_int(OB_USER_UNIT_CONFIG_ID); ob_max_used_resource_pool_id_.value_.set_int(OB_USER_RESOURCE_POOL_ID); ob_max_used_unit_id_.value_.set_int(OB_USER_UNIT_ID); - ob_max_used_server_id_.value_.set_int(OB_INIT_SERVER_ID); + ob_max_used_server_id_.value_.set_int(OB_INIT_SERVER_ID - 1); ob_max_used_ddl_task_id_.value_.set_int(OB_INIT_DDL_TASK_ID); ob_max_used_unit_group_id_.value_.set_int(OB_USER_UNIT_GROUP_ID); } else { diff --git a/src/rootserver/ob_ddl_service.cpp b/src/rootserver/ob_ddl_service.cpp index 56784ba73..90e7c20fc 100644 --- a/src/rootserver/ob_ddl_service.cpp +++ b/src/rootserver/ob_ddl_service.cpp @@ -55,7 +55,7 @@ #include "sql/resolver/ddl/ob_ddl_resolver.h" #include "sql/resolver/expr/ob_raw_expr_modify_column_name.h" #include "sql/resolver/expr/ob_raw_expr_printer.h" -#include "ob_server_manager.h" +#include "share/ob_all_server_tracer.h" #include "ob_zone_manager.h" #include "rootserver/ob_schema2ddl_sql.h" #include "rootserver/ob_unit_manager.h" @@ -161,7 +161,6 @@ ObDDLService::ObDDLService() sql_proxy_(NULL), schema_service_(NULL), lst_operator_(NULL), - server_mgr_(NULL), zone_mgr_(NULL), unit_mgr_(NULL), snapshot_mgr_(NULL) @@ -173,7 +172,6 @@ int ObDDLService::init(obrpc::ObSrvRpcProxy &rpc_proxy, common::ObMySQLProxy &sql_proxy, share::schema::ObMultiVersionSchemaService &schema_service, share::ObLSTableOperator &lst_operator, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObUnitManager &unit_mgr, ObSnapshotInfoManager &snapshot_mgr) @@ -188,7 +186,6 @@ int ObDDLService::init(obrpc::ObSrvRpcProxy &rpc_proxy, sql_proxy_ = &sql_proxy; schema_service_ = &schema_service; lst_operator_ = &lst_operator; - server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; unit_mgr_ = &unit_mgr; snapshot_mgr_ = &snapshot_mgr; @@ 
-1494,9 +1491,9 @@ int ObDDLService::check_inner_stat() const || OB_ISNULL(rpc_proxy_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("schema_service_,sql_proxy_ or rpc_proxy_ is null", K(ret)); - } else if (OB_ISNULL(server_mgr_) || OB_ISNULL(lst_operator_)) { + } else if (OB_ISNULL(lst_operator_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_mgr_ or pt_operator_ or lst_operator_ is null", KR(ret)); + LOG_WARN("lst_operator_ is null", KR(ret)); } else if (OB_ISNULL(unit_mgr_) || OB_ISNULL(zone_mgr_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unit_mgr_ or zone_mgr_ is null", K(ret)); @@ -11274,50 +11271,6 @@ int ObDDLService::check_restore_point_allow(const int64_t tenant_id, const ObTab return ret; } -int ObDDLService::check_all_server_frozen_scn(const SCN &frozen_scn) -{ - int ret = OB_SUCCESS; - if (OB_ISNULL(rpc_proxy_) || OB_ISNULL(server_mgr_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("ptr is null", K(ret), KP_(rpc_proxy), KP_(server_mgr)); - } else { - ObCheckFrozenScnProxy check_frozen_scn_proxy(*rpc_proxy_, &obrpc::ObSrvRpcProxy::check_frozen_scn); - ObZone zone; - ObArray server_statuses; - ObCheckFrozenScnArg arg; - arg.frozen_scn_ = frozen_scn; - if (OB_FAIL(server_mgr_->get_server_statuses(zone, server_statuses))) { - LOG_WARN("fail to get server statuses", K(ret)); - } else if (server_statuses.count() <= 0) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid server cnt", K(ret)); - } - // check server alive - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses.count(); i++) { - if (!server_statuses[i].is_alive()) { - ret = OB_SERVER_NOT_ALIVE; - LOG_WARN("server not alive", K(ret), "server", server_statuses[i]); - } - } - if (OB_SUCC(ret)) { - // send async rpc - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses.count(); i++) { - const int64_t rpc_timeout_us = THIS_WORKER.get_timeout_remain(); - const ObAddr &addr = server_statuses[i].server_; - if (OB_FAIL(check_frozen_scn_proxy.call(addr, rpc_timeout_us, arg))) { - LOG_WARN("fail to check frozen version", 
K(ret), K(addr), K(rpc_timeout_us)); - } - } - int tmp_ret = OB_SUCCESS; - // all server should success; - if (OB_SUCCESS != (tmp_ret = check_frozen_scn_proxy.wait())) { - LOG_WARN("fail to execute rpc", K(tmp_ret)); - } - ret = OB_SUCC(ret) ? tmp_ret : ret; - } - } - return ret; -} // This code will be used for partition operations of table and tablegroup // 1. for table, parameter is_drop_truncate_and_alter_index parameter avoids the drop/truncate partition @@ -20129,8 +20082,13 @@ int ObDDLService::create_sys_tenant( LOG_WARN("init tenant env failed", K(tenant_schema), K(ret)); } else if (OB_FAIL(ddl_operator.insert_tenant_merge_info(OB_DDL_ADD_TENANT, tenant_schema, trans))) { LOG_WARN("fail to insert tenant merge info", KR(ret)); - } else if (OB_FAIL(ObServiceEpochProxy::init_service_epoch(trans, OB_SYS_TENANT_ID, - 0/*freeze_service_epoch*/ ,0/*arbitration_service_epoch*/))) { + } else if (OB_FAIL(ObServiceEpochProxy::init_service_epoch( + trans, + OB_SYS_TENANT_ID, + 0, /*freeze_service_epoch*/ + 0, /*arbitration_service_epoch*/ + 0, /*server_zone_op_service_epoch*/ + 0 /*heartbeat_service_epoch*/))) { LOG_WARN("fail to init service epoch", KR(ret)); } if (trans.is_started()) { @@ -21188,9 +21146,14 @@ int ObDDLService::init_tenant_schema( LOG_WARN("init tenant env failed", KR(ret), K(tenant_role), K(recovery_until_scn), K(tenant_schema)); } else if (OB_FAIL(ddl_operator.insert_tenant_merge_info(OB_DDL_ADD_TENANT_START, tenant_schema, trans))) { LOG_WARN("fail to insert tenant merge info", KR(ret), K(tenant_schema)); - } else if (is_meta_tenant(tenant_id) && OB_FAIL(ObServiceEpochProxy::init_service_epoch(trans, tenant_id, - 0/*freeze_service_epoch*/, 0/*arbitration_service_epoch*/))) { - LOG_WARN("fail to init service epoch", KR(ret), K(tenant_id)); + } else if (is_meta_tenant(tenant_id) && OB_FAIL(ObServiceEpochProxy::init_service_epoch( + trans, + tenant_id, + 0, /*freeze_service_epoch*/ + 0, /*arbitration_service_epoch*/ + 0, 
/*server_zone_op_service_epoch*/ + 0 /*heartbeat_service_epoch*/))) { + LOG_WARN("fail to init service epoch", KR(ret)); } if (trans.is_started()) { @@ -24331,19 +24294,25 @@ int ObDDLService::notify_refresh_schema(const ObAddrIArray &addrs) { int ret = OB_SUCCESS; const ObZone zone; - ObServerManager::ObServerArray server_list; + ObArray server_list; ObSwitchSchemaProxy proxy(*rpc_proxy_, &ObSrvRpcProxy::switch_schema); ObSwitchSchemaArg arg; ObRefreshSchemaInfo local_schema_info; ObRefreshSchemaInfo &schema_info = arg.schema_info_; int64_t schema_version = OB_INVALID_VERSION; + ObAddr rs_addr; if (OB_FAIL(check_inner_stat())) { LOG_WARN("variable is not init"); - } else if (OB_FAIL(server_mgr_->get_alive_servers(zone, server_list))) { - LOG_WARN("get alive server failed", KR(ret)); - } else if (OB_ISNULL(schema_service_)) { + } else if (OB_ISNULL(GCTX.rs_mgr_) || OB_ISNULL(schema_service_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("schema_service is null", KR(ret)); + LOG_WARN("GCTX.rs_mgr_ or schema_service_ is null", KR(ret), KP(GCTX.rs_mgr_), KP(schema_service_)); + } else if (OB_FAIL(GCTX.rs_mgr_->get_master_root_server(rs_addr))) { + LOG_WARN("fail to get master root servcer", KR(ret)); + } else if (OB_UNLIKELY(!rs_addr.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("rs_addr is invalid", KR(ret), K(rs_addr)); + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, server_list))) { + LOG_WARN("get alive server failed", KR(ret), K(zone)); } else if (OB_FAIL(schema_service_->get_refresh_schema_info(local_schema_info))) { LOG_WARN("fail to get schema info", KR(ret)); } else if (OB_FAIL(schema_service_->get_tenant_schema_version(OB_SYS_TENANT_ID, schema_version))) { @@ -24366,7 +24335,7 @@ int ObDDLService::notify_refresh_schema(const ObAddrIArray &addrs) if (OB_ISNULL(s)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("s is null", K(ret)); - } else if (server_mgr_->get_rs_addr() == *s) { + } else if (rs_addr == *s) { continue; } else { bool found = false; diff --git 
a/src/rootserver/ob_ddl_service.h b/src/rootserver/ob_ddl_service.h index c4e4ff92b..268176c15 100644 --- a/src/rootserver/ob_ddl_service.h +++ b/src/rootserver/ob_ddl_service.h @@ -83,7 +83,6 @@ namespace palf namespace rootserver { class ObDDLOperator; -class ObServerManager; class ObZoneManager; class ObUnitManager; class ObCommitAlterTenantLocalityArg; @@ -131,7 +130,6 @@ public: common::ObMySQLProxy &sql_proxy, share::schema::ObMultiVersionSchemaService &schema_service, share::ObLSTableOperator &lst_operator, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObUnitManager &unit_mgr, ObSnapshotInfoManager &snapshot_mgr); @@ -142,7 +140,6 @@ public: // these functions should be called after ddl_service has been inited share::schema::ObMultiVersionSchemaService &get_schema_service() { return *schema_service_; } common::ObMySQLProxy &get_sql_proxy() { return *sql_proxy_; } - ObServerManager &get_server_manager() { return *server_mgr_; } ObZoneManager &get_zone_mgr() { return *zone_mgr_; } ObSnapshotInfoManager &get_snapshot_mgr() { return *snapshot_mgr_; } share::ObLSTableOperator &get_lst_operator() { return *lst_operator_; } @@ -1781,7 +1778,6 @@ public: common::ObIArray &zone_region_list, const common::ObIArray &zone_list); - int check_all_server_frozen_scn(const share::SCN &frozen_scn); int handle_security_audit(const obrpc::ObSecurityAuditArg &arg); static int check_and_get_object_name(share::schema::ObSchemaGetterGuard &schema_guard, @@ -2300,7 +2296,6 @@ private: share::schema::ObMultiVersionSchemaService *schema_service_; share::ObLSTableOperator *lst_operator_; //TODO(jingqian): used to choose partition server, use load balancer finnally - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; ObUnitManager *unit_mgr_; ObSnapshotInfoManager *snapshot_mgr_; diff --git a/src/rootserver/ob_disaster_recovery_info.cpp b/src/rootserver/ob_disaster_recovery_info.cpp index cd2a49be1..99ec9adc5 100644 --- a/src/rootserver/ob_disaster_recovery_info.cpp +++ 
b/src/rootserver/ob_disaster_recovery_info.cpp @@ -17,8 +17,8 @@ #include "lib/container/ob_se_array.h" #include "lib/container/ob_se_array_iterator.h" #include "ob_unit_manager.h" -#include "ob_server_manager.h" #include "ob_zone_manager.h" +#include "share/ob_all_server_tracer.h" using namespace oceanbase::common; using namespace oceanbase::share; @@ -272,35 +272,39 @@ int DRLSInfo::fill_servers() { int ret = OB_SUCCESS; common::ObZone zone; - ObServerManager::ObServerStatusArray server_status_array; - if (OB_UNLIKELY(nullptr == server_mgr_)) { + ObArray servers_info; + if (OB_FAIL(SVR_TRACER.get_servers_info(zone, servers_info))) { + LOG_WARN("fail to get all servers_info", KR(ret)); + } else if (OB_ISNULL(zone_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("server mgr ptr is null", KR(ret), KP(server_mgr_)); - } else if (OB_FAIL(server_mgr_->get_server_statuses(zone, server_status_array))) { - LOG_WARN("fail to get all server status", KR(ret)); + LOG_WARN("zone_mgr_ is null", KR(ret), KP(zone_mgr_)); } else { server_stat_info_map_.reuse(); - FOREACH_X(s, server_status_array, OB_SUCC(ret)) { + FOREACH_X(s, servers_info, OB_SUCC(ret)) { ServerStatInfoMap::Item *item = nullptr; bool zone_active = false; - if (OB_UNLIKELY(nullptr == s)) { + if (OB_ISNULL(s)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("server ptr is null", KR(ret)); - } else if (OB_FAIL(zone_mgr_->check_zone_active(s->zone_, zone_active))) { - LOG_WARN("fail to check zone active", KR(ret), "zone", s->zone_); - } else if (OB_FAIL(server_stat_info_map_.locate(s->server_, item))) { - LOG_WARN("fail to locate server status", KR(ret), "server", s->server_); - } else if (OB_UNLIKELY(nullptr == item)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("item ptr is null", KR(ret), "server", s->server_); - } else if (OB_FAIL(item->v_.init( - s->server_, - s->is_alive(), - s->is_active(), - s->is_permanent_offline(), - s->is_migrate_in_blocked(), - (s->is_stopped() || !zone_active)))) { - LOG_WARN("fail to init server item", 
KR(ret)); + } else { + const ObAddr &server = s->get_server(); + const ObZone &zone = s->get_zone(); + if (OB_FAIL(zone_mgr_->check_zone_active(zone, zone_active))) { + LOG_WARN("fail to check zone active", KR(ret), "zone", zone); + } else if (OB_FAIL(server_stat_info_map_.locate(server, item))) { + LOG_WARN("fail to locate server status", KR(ret), "server", server); + } else if (OB_UNLIKELY(nullptr == item)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("item ptr is null", KR(ret), "server", server); + } else if (OB_FAIL(item->v_.init( + server, + s->is_alive(), + s->is_active(), + s->is_permanent_offline(), + s->is_migrate_in_blocked(), + (s->is_stopped() || !zone_active)))) { + LOG_WARN("fail to init server item", KR(ret)); + } } } } diff --git a/src/rootserver/ob_disaster_recovery_info.h b/src/rootserver/ob_disaster_recovery_info.h index 7f03fb332..5c9cdfc88 100644 --- a/src/rootserver/ob_disaster_recovery_info.h +++ b/src/rootserver/ob_disaster_recovery_info.h @@ -37,7 +37,6 @@ namespace rootserver { class ObUnitManager; -class ObServerManager; class ObZoneManager; struct DRServerStatInfo @@ -141,13 +140,11 @@ class DRLSInfo public: DRLSInfo(const uint64_t resource_tenant_id, ObUnitManager *unit_mgr, - ObServerManager *server_mgr, ObZoneManager *zone_mgr, share::schema::ObMultiVersionSchemaService *schema_service) : resource_tenant_id_(resource_tenant_id), sys_schema_guard_(), unit_mgr_(unit_mgr), - server_mgr_(server_mgr), zone_mgr_(zone_mgr), schema_service_(schema_service), unit_stat_info_map_(), @@ -238,7 +235,6 @@ private: uint64_t resource_tenant_id_; share::schema::ObSchemaGetterGuard sys_schema_guard_; ObUnitManager *unit_mgr_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; share::schema::ObMultiVersionSchemaService *schema_service_; UnitStatInfoMap unit_stat_info_map_; diff --git a/src/rootserver/ob_disaster_recovery_task.cpp b/src/rootserver/ob_disaster_recovery_task.cpp index 504d41f3c..3909f0aa3 100644 --- 
a/src/rootserver/ob_disaster_recovery_task.cpp +++ b/src/rootserver/ob_disaster_recovery_task.cpp @@ -24,7 +24,6 @@ #include "rootserver/ob_root_balancer.h" #include "rootserver/ob_root_service.h" #include "ob_rs_event_history_table_operator.h" -#include "ob_server_manager.h" #include "share/ob_rpc_struct.h" #include "observer/ob_server_struct.h" #include "observer/ob_server.h" @@ -331,22 +330,6 @@ int ObDRTask::set_task_key( return ret; } -int ObDRTask::update_with_partition( - const common::ObAddr &dst_server) const -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(!dst_server.is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), K(dst_server)); - } else if (OB_UNLIKELY(nullptr == GCTX.root_service_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("rootservice ptr is null", KR(ret)); - } else if (OB_FAIL(GCTX.root_service_->get_server_mgr().set_with_partition(dst_server))) { - LOG_WARN("fail to set with partition", KR(ret), K(dst_server)); - } - return ret; -} - void ObDRTask::set_schedule() { set_schedule_time(ObTimeUtility::current_time()); @@ -579,9 +562,6 @@ int ObMigrateLSReplicaTask::check_before_execute( LOG_WARN("fail to check paxos replica number", KR(ret), K(ls_info)); } else if (OB_FAIL(check_online(ls_info, ret_comment))) { LOG_WARN("fail to check online", KR(ret), K(ls_info)); - } else if (OB_FAIL(update_with_partition(dst_replica_.get_server()))) { - LOG_WARN("fail to update with partition", KR(ret), - "server", dst_replica_.get_server()); } return ret; } @@ -1032,9 +1012,6 @@ int ObAddLSReplicaTask::check_before_execute( LOG_WARN("fail to check online", KR(ret), K(ls_info)); } else if (OB_FAIL(check_paxos_member(ls_info, ret_comment))) { LOG_WARN("fail to check paxos member", KR(ret), K(ls_info)); - } else if (OB_FAIL(update_with_partition(dst_replica_.get_server()))) { - LOG_WARN("fail to update with partition", KR(ret), - "server", dst_replica_.get_server()); } return ret; } @@ -1505,9 +1482,6 @@ int 
ObLSTypeTransformTask::check_before_execute( LOG_WARN("fail to check online", KR(ret), K(ls_info)); } else if (OB_FAIL(check_paxos_member(ls_info, ret_comment))) { LOG_WARN("fail to check paxos member", KR(ret), K(ls_info)); - } else if (OB_FAIL(update_with_partition(dst_replica_.get_server()))) { - LOG_WARN("fail to update with partition", KR(ret), - "server", dst_replica_.get_server()); } return ret; } diff --git a/src/rootserver/ob_disaster_recovery_task.h b/src/rootserver/ob_disaster_recovery_task.h index c85f7c5f3..0b179405d 100644 --- a/src/rootserver/ob_disaster_recovery_task.h +++ b/src/rootserver/ob_disaster_recovery_task.h @@ -331,8 +331,6 @@ public: virtual int clone(void *input_ptr, ObDRTask *&output_task) const = 0; int deep_copy(const ObDRTask &that); public: - int update_with_partition( - const common::ObAddr &dst_server) const; void set_schedule(); bool is_manual_task() const { return obrpc::ObAdminClearDRTaskArg::TaskType::MANUAL == invoked_source_; } public: diff --git a/src/rootserver/ob_disaster_recovery_task_executor.cpp b/src/rootserver/ob_disaster_recovery_task_executor.cpp index 6eacd8f82..868c84354 100644 --- a/src/rootserver/ob_disaster_recovery_task_executor.cpp +++ b/src/rootserver/ob_disaster_recovery_task_executor.cpp @@ -20,11 +20,11 @@ #include "share/ls/ob_ls_table_operator.h" #include "share/ob_cluster_version.h" #include "ob_rs_event_history_table_operator.h" -#include "ob_server_manager.h" #include "ob_disaster_recovery_task_mgr.h" #include "ob_disaster_recovery_task.h" #include "observer/ob_server.h" #include "lib/utility/ob_tracepoint.h" +#include "share/ob_all_server_tracer.h" namespace oceanbase { @@ -35,8 +35,7 @@ namespace rootserver int ObDRTaskExecutor::init( share::ObLSTableOperator &lst_operator, - obrpc::ObSrvRpcProxy &rpc_proxy, - ObServerManager &server_mgr) + obrpc::ObSrvRpcProxy &rpc_proxy) { int ret = OB_SUCCESS; if (OB_UNLIKELY(inited_)) { @@ -45,7 +44,6 @@ int ObDRTaskExecutor::init( } else { lst_operator_ = 
&lst_operator; rpc_proxy_ = &rpc_proxy; - server_mgr_ = &server_mgr; inited_ = true; } return ret; @@ -57,18 +55,16 @@ int ObDRTaskExecutor::execute( ObDRTaskRetComment &ret_comment) const { int ret = OB_SUCCESS; + const ObAddr &dst_server = task.get_dst_server(); + ObServerInfoInTable server_info; if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); - } else if (OB_ISNULL(server_mgr_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server mgr ptr is null", KR(ret), KP(server_mgr_)); + } else if (OB_FAIL(SVR_TRACER.get_server_info(dst_server, server_info))) { + LOG_WARN("fail to get server_info", KR(ret), K(dst_server)); } else { - const ObAddr &dst_server = task.get_dst_server(); - bool is_dst_server_alive = false; - if (OB_FAIL(server_mgr_->check_server_alive(dst_server, is_dst_server_alive))) { - LOG_WARN("fail to check server alive", KR(ret), K(dst_server)); - } else if (!is_dst_server_alive) { + const bool is_dst_server_alive = server_info.is_alive(); + if (!is_dst_server_alive) { ret = OB_REBALANCE_TASK_CANT_EXEC; ret_comment = ObDRTaskRetComment::CANNOT_EXECUTE_DUE_TO_SERVER_NOT_ALIVE; LOG_WARN("dst server not alive", KR(ret), K(dst_server)); diff --git a/src/rootserver/ob_disaster_recovery_task_executor.h b/src/rootserver/ob_disaster_recovery_task_executor.h index 0c11aa220..b0963e154 100644 --- a/src/rootserver/ob_disaster_recovery_task_executor.h +++ b/src/rootserver/ob_disaster_recovery_task_executor.h @@ -14,7 +14,6 @@ #define OCEANBASE_ROOTSERVER_OB_DISASTER_RECOVERY_TASK_EXECUTOR_H_ #include "rootserver/ob_disaster_recovery_task_mgr.h" -#include "rootserver/ob_server_manager.h" namespace oceanbase { @@ -35,7 +34,6 @@ class ObMultiVersionSchemaService; namespace rootserver { -class ObServerManager; class ObDRTask; class ObDRTaskExecutor @@ -44,18 +42,15 @@ public: ObDRTaskExecutor() : inited_(false), lst_operator_(nullptr), - rpc_proxy_(nullptr), - server_mgr_(nullptr) {} + rpc_proxy_(nullptr) {} virtual ~ObDRTaskExecutor() {} 
public: // init a ObDRTaskExecutor // param [in] lst_operator, to check task // param [in] rpc_proxy, to send task execution to dst server - // param [in] server_mgr, to check whether dst server alive int init( share::ObLSTableOperator &lst_operator, - obrpc::ObSrvRpcProxy &rpc_proxy, - ObServerManager &server_mgr); + obrpc::ObSrvRpcProxy &rpc_proxy); // do previous check and execute a task // @param [in] task, the task to execute @@ -68,7 +63,6 @@ private: bool inited_; share::ObLSTableOperator *lst_operator_; obrpc::ObSrvRpcProxy *rpc_proxy_; - ObServerManager *server_mgr_; private: DISALLOW_COPY_AND_ASSIGN(ObDRTaskExecutor); }; diff --git a/src/rootserver/ob_disaster_recovery_task_mgr.cpp b/src/rootserver/ob_disaster_recovery_task_mgr.cpp index bfc1a6348..aa54f0874 100644 --- a/src/rootserver/ob_disaster_recovery_task_mgr.cpp +++ b/src/rootserver/ob_disaster_recovery_task_mgr.cpp @@ -24,16 +24,15 @@ #include "ob_disaster_recovery_task_executor.h" #include "rootserver/ob_root_balancer.h" #include "ob_rs_event_history_table_operator.h" -#include "ob_server_manager.h" #include "share/ob_rpc_struct.h" #include "observer/ob_server_struct.h" -#include "share/ob_server_status.h" #include "sql/executor/ob_executor_rpc_proxy.h" #include "rootserver/ob_disaster_recovery_task.h" // for ObDRTaskType #include "share/ob_share_util.h" // for ObShareUtil #include "lib/lock/ob_tc_rwlock.h" // for common::RWLock #include "rootserver/ob_disaster_recovery_task.h" #include "share/inner_table/ob_inner_table_schema_constants.h" +#include "share/ob_all_server_tracer.h" namespace oceanbase { @@ -49,7 +48,6 @@ ObDRTaskQueue::ObDRTaskQueue() : inited_(false), schedule_list_(), task_map_(), rpc_proxy_(nullptr), - server_mgr_(nullptr), priority_(ObDRTaskPriority::MAX_PRI) { } @@ -101,7 +99,6 @@ int ObDRTaskQueue::init( common::ObServerConfig &config, const int64_t bucket_num, obrpc::ObSrvRpcProxy *rpc_proxy, - ObServerManager *server_mgr, ObDRTaskPriority priority) { int ret = OB_SUCCESS; 
@@ -110,10 +107,9 @@ int ObDRTaskQueue::init( LOG_WARN("init twice", KR(ret)); } else if (OB_UNLIKELY(bucket_num <= 0) || OB_ISNULL(rpc_proxy) - || OB_ISNULL(server_mgr) || (ObDRTaskPriority::LOW_PRI != priority && ObDRTaskPriority::HIGH_PRI != priority)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), K(bucket_num), KP(rpc_proxy), KP(server_mgr), K(priority)); + LOG_WARN("invalid argument", KR(ret), K(bucket_num), KP(rpc_proxy), K(priority)); } else if (OB_FAIL(task_map_.create(bucket_num, "DRTaskMap"))) { LOG_WARN("fail to create task map", KR(ret), K(bucket_num)); } else if (OB_FAIL(task_alloc_.init( @@ -123,7 +119,6 @@ int ObDRTaskQueue::init( } else { config_ = &config; rpc_proxy_ = rpc_proxy; - server_mgr_ = server_mgr; priority_ = priority; inited_ = true; } @@ -414,32 +409,30 @@ int ObDRTaskQueue::check_task_need_cleaning_( // (3) rpc ls_check_dr_task_exist successfully told us task not exist // (4) task is timeout while any failure during whole procedure need_cleanning = false; - share::ObServerStatus server_status; Bool task_exist = false; - bool server_exist = true; - + const ObAddr &dst_server = task.get_dst_server(); + share::ObServerInfoInTable server_info; if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("task queue not init", KR(ret)); - } else if (OB_ISNULL(server_mgr_) || OB_ISNULL(rpc_proxy_)) { + } else if (OB_ISNULL(rpc_proxy_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("some ptr is null", KR(ret), KP(server_mgr_), KP(rpc_proxy_)); - } else if (OB_FAIL(server_mgr_->is_server_exist(task.get_dst_server(), server_exist))) { - LOG_WARN("fail to check is server exist", KR(ret), "server", task.get_dst_server()); - } else if (!server_exist) { - // case 1. 
server not exist - FLOG_INFO("the reason to clean this task: server not exist", K(task)); - need_cleanning = true; - ret_comment = ObDRTaskRetComment::CLEAN_TASK_DUE_TO_SERVER_NOT_EXIST; - } else if (OB_FAIL(server_mgr_->get_server_status(task.get_dst_server(), server_status))) { - // we only care about HeartBeatStatus in server_status - LOG_WARN("fail to get server status", KR(ret), "server", task.get_dst_server()); - } else if (server_status.is_permanent_offline()) { - // case 2. server status is permanant offline - FLOG_INFO("the reason to clean this task: server permanent offline", K(task), K(server_status)); + LOG_WARN("some ptr is null", KR(ret), KP(rpc_proxy_)); + } else if (OB_FAIL(SVR_TRACER.get_server_info(dst_server, server_info))) { + LOG_WARN("fail to get server_info", KR(ret), "server", dst_server); + // case 1. server not exist + if (OB_ENTRY_NOT_EXIST == ret) { + ret = OB_SUCCESS; + FLOG_INFO("the reason to clean this task: server not exist", K(task)); + need_cleanning = true; + ret_comment = ObDRTaskRetComment::CLEAN_TASK_DUE_TO_SERVER_NOT_EXIST; + } + } else if (server_info.is_permanent_offline()) { + // case 2. 
server is permanant offline + FLOG_INFO("the reason to clean this task: server permanent offline", K(task), K(server_info)); need_cleanning = true; ret_comment = ObDRTaskRetComment::CLEAN_TASK_DUE_TO_SERVER_PERMANENT_OFFLINE; - } else if (server_status.is_alive()) { + } else if (server_info.is_alive()) { ObDRTaskExistArg arg; arg.task_id_ = task.get_task_id(); arg.tenant_id_ = task.get_tenant_id(); @@ -454,12 +447,12 @@ int ObDRTaskQueue::check_task_need_cleaning_( need_cleanning = true; ret_comment = ObDRTaskRetComment::CLEAN_TASK_DUE_TO_TASK_NOT_RUNNING; } - } else if (server_status.is_temporary_offline()) { + } else if (server_info.is_temporary_offline()) { ret = OB_SERVER_NOT_ALIVE; - LOG_WARN("server status is not alive, task may be cleanned later", KR(ret), "server", task.get_dst_server(), K(server_status), K(task)); + LOG_WARN("server status is not alive, task may be cleanned later", KR(ret), "server", task.get_dst_server(), K(server_info), K(task)); } else { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected server status", KR(ret), "server", task.get_dst_server(), K(server_status), K(task)); + LOG_WARN("unexpected server status", KR(ret), "server", task.get_dst_server(), K(server_info), K(task)); } // case 4. 
task is timeout while any OB_FAIL occurs @@ -608,7 +601,6 @@ int ObDRTaskMgr::init( const common::ObAddr &server, common::ObServerConfig &config, ObDRTaskExecutor &task_executor, - ObServerManager *server_mgr, obrpc::ObSrvRpcProxy *rpc_proxy, common::ObMySQLProxy *sql_proxy, share::schema::ObMultiVersionSchemaService *schema_service) @@ -619,12 +611,11 @@ int ObDRTaskMgr::init( ret = OB_INIT_TWICE; LOG_WARN("init twice", KR(ret), K(inited_), K_(stopped)); } else if (OB_UNLIKELY(!server.is_valid()) - || OB_ISNULL(server_mgr) || OB_ISNULL(rpc_proxy) || OB_ISNULL(sql_proxy) || OB_ISNULL(schema_service)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), K(server), KP(server_mgr), KP(rpc_proxy), + LOG_WARN("invalid argument", KR(ret), K(server), KP(rpc_proxy), KP(sql_proxy), KP(schema_service)); } else if (OB_FAIL(cond_.init(ObWaitEventIds::REBALANCE_TASK_MGR_COND_WAIT))) { LOG_WARN("fail to init cond", KR(ret)); @@ -634,15 +625,14 @@ int ObDRTaskMgr::init( config_ = &config; self_ = server; task_executor_ = &task_executor; - server_mgr_ = server_mgr; rpc_proxy_ = rpc_proxy; sql_proxy_ = sql_proxy; schema_service_ = schema_service; if (OB_FAIL(high_task_queue_.init( - config, TASK_QUEUE_LIMIT, rpc_proxy_, server_mgr_, ObDRTaskPriority::HIGH_PRI))) { + config, TASK_QUEUE_LIMIT, rpc_proxy_, ObDRTaskPriority::HIGH_PRI))) { LOG_WARN("fail to init high priority task queue", KR(ret)); } else if (OB_FAIL(low_task_queue_.init( - config, TASK_QUEUE_LIMIT, rpc_proxy_, server_mgr_, ObDRTaskPriority::LOW_PRI))) { + config, TASK_QUEUE_LIMIT, rpc_proxy_, ObDRTaskPriority::LOW_PRI))) { LOG_WARN("fail to init low priority task queue", KR(ret)); } else if (OB_FAIL(disaster_recovery_task_table_updater_.init(sql_proxy, this))) { LOG_WARN("fail to init a ObDRTaskTableUpdater", KR(ret)); diff --git a/src/rootserver/ob_disaster_recovery_task_mgr.h b/src/rootserver/ob_disaster_recovery_task_mgr.h index 69cf26435..3981ac421 100644 --- 
a/src/rootserver/ob_disaster_recovery_task_mgr.h +++ b/src/rootserver/ob_disaster_recovery_task_mgr.h @@ -33,7 +33,6 @@ struct ObDRTaskReplyResult; namespace rootserver { class ObDRTaskExecutor; -class ObServerManager; class ObDRTaskMgr; class ObDRTaskQueue @@ -53,12 +52,10 @@ public: // @param [in] config, server config // @param [in] bucket_num, the size of task_map // @param [in] rpc_proxy, to send rpc - // @param [in] server_mgr, server manager to get server infos int init( common::ObServerConfig &config, const int64_t bucket_num, obrpc::ObSrvRpcProxy *rpc_proxy, - ObServerManager *server_mgr, ObDRTaskPriority priority); public: @@ -190,7 +187,6 @@ private: TaskList schedule_list_; TaskMap task_map_; obrpc::ObSrvRpcProxy *rpc_proxy_; - ObServerManager *server_mgr_; ObDRTaskPriority priority_; private: DISALLOW_COPY_AND_ASSIGN(ObDRTaskQueue); @@ -218,7 +214,6 @@ public: low_task_queue_(queues_[1]), self_(), task_executor_(nullptr), - server_mgr_(nullptr), rpc_proxy_(nullptr), sql_proxy_(nullptr), schema_service_(nullptr) {} @@ -229,7 +224,6 @@ public: // @param [in] server, local server address // @param [in] config, local server config // @param [in] task_executor, to execute a task - // @param [in] server_mgr, to check server status for task queue // @param [in] rpc_proxy, to send rpc for task queue // @param [in] sql_proxy, to send sql for updater // @param [in] schema_service, to get infos about objects @@ -237,7 +231,6 @@ public: const common::ObAddr &server, common::ObServerConfig &config, ObDRTaskExecutor &task_executor, - ObServerManager *server_mgr, obrpc::ObSrvRpcProxy *rpc_proxy, common::ObMySQLProxy *sql_proxy, share::schema::ObMultiVersionSchemaService *schema_service); @@ -427,7 +420,6 @@ private: ObDRTaskQueue &low_task_queue_; // queues_[1] common::ObAddr self_; ObDRTaskExecutor *task_executor_; - ObServerManager *server_mgr_; obrpc::ObSrvRpcProxy *rpc_proxy_; common::ObMySQLProxy *sql_proxy_; share::schema::ObMultiVersionSchemaService 
*schema_service_; diff --git a/src/rootserver/ob_disaster_recovery_worker.cpp b/src/rootserver/ob_disaster_recovery_worker.cpp index 2c093152c..8bb327091 100644 --- a/src/rootserver/ob_disaster_recovery_worker.cpp +++ b/src/rootserver/ob_disaster_recovery_worker.cpp @@ -222,13 +222,11 @@ int ObDRWorker::LocalityAlignment::locate_zone_locality( } ObDRWorker::LocalityAlignment::LocalityAlignment(ObUnitManager *unit_mgr, - ObServerManager *server_mgr, ObZoneManager *zone_mgr, DRLSInfo &dr_ls_info) : task_idx_(0), add_replica_task_(), unit_mgr_(unit_mgr), - server_mgr_(server_mgr), zone_mgr_(zone_mgr), dr_ls_info_(dr_ls_info), task_array_(), @@ -1077,11 +1075,9 @@ int ObDRWorker::LocalityAlignment::build() int ret = OB_SUCCESS; uint64_t tenant_id = OB_INVALID_ID; if (OB_UNLIKELY(nullptr == unit_mgr_ - || nullptr == server_mgr_ || nullptr == zone_mgr_)) { ret = OB_NOT_INIT; - LOG_WARN("LocalityAlignment not init", - KR(ret), KP(unit_mgr_), KP(server_mgr_), KP(zone_mgr_)); + LOG_WARN("LocalityAlignment not init", KR(ret), KP(unit_mgr_), KP(zone_mgr_)); } else if (OB_FAIL(locality_map_.create(LOCALITY_MAP_BUCKET_NUM, "LocAlign"))) { LOG_WARN("fail to create locality map", KR(ret)); } else if (OB_FAIL(generate_paxos_replica_number())) { @@ -1098,7 +1094,7 @@ int ObDRWorker::LocalityAlignment::build() LOG_WARN("fail to create unit set", KR(ret)); } else if (OB_FAIL(init_unit_set(unit_set_))) { LOG_WARN("fail to init unit set", KR(ret)); - } else if (OB_FAIL(unit_provider_.init(gen_user_tenant_id(tenant_id), unit_mgr_, server_mgr_))) { + } else if (OB_FAIL(unit_provider_.init(gen_user_tenant_id(tenant_id), unit_mgr_))) { LOG_WARN("fail to init unit provider", KR(ret), K(tenant_id)); } return ret; @@ -1521,25 +1517,18 @@ int ObDRWorker::LocalityAlignment::get_next_locality_alignment_task( int ObDRWorker::UnitProvider::init( const uint64_t tenant_id, - ObUnitManager *unit_mgr, - ObServerManager *server_mgr) + ObUnitManager *unit_mgr) { int ret = OB_SUCCESS; if 
(OB_UNLIKELY(inited_)) { ret = OB_INIT_TWICE; LOG_WARN("init twice", KR(ret)); - } else if (OB_UNLIKELY(OB_INVALID_ID == tenant_id - || nullptr == unit_mgr - || nullptr == server_mgr)) { + } else if (OB_UNLIKELY(OB_INVALID_ID == tenant_id || nullptr == unit_mgr)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), - K(tenant_id), - KP(unit_mgr), - KP(server_mgr)); + LOG_WARN("invalid argument", KR(ret), K(tenant_id), KP(unit_mgr)); } else { tenant_id_ = tenant_id; unit_mgr_ = unit_mgr; - server_mgr_ = server_mgr; inited_ = true; } return ret; @@ -1554,9 +1543,9 @@ int ObDRWorker::UnitProvider::get_unit( if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); - } else if (OB_ISNULL(unit_mgr_) || OB_ISNULL(server_mgr_)) { + } else if (OB_ISNULL(unit_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit mgr ptr is null", KR(ret), KP(unit_mgr_), KP(server_mgr_)); + LOG_WARN("unit_mgr is null", KR(ret), KP(unit_mgr_)); } else { common::ObArray unit_array; bool found = false; @@ -1571,7 +1560,7 @@ int ObDRWorker::UnitProvider::get_unit( int hash_ret = OB_SUCCESS; if (this_info.unit_.zone_ != zone) { // bypass, because we only support migrate in same zone - } else if (OB_FAIL(server_mgr_->check_server_active(this_info.unit_.server_, is_active))) { + } else if (OB_FAIL(SVR_TRACER.check_server_active(this_info.unit_.server_, is_active))) { LOG_WARN("fail to check server active", KR(ret), "server", this_info.unit_.server_); } else if (!is_active) { LOG_INFO("server is not active", "server", this_info.unit_.server_, K(is_active)); @@ -1603,7 +1592,7 @@ int ObDRWorker::UnitProvider::get_unit( int hash_ret = OB_SUCCESS; if (this_info.unit_.zone_ != zone) { // bypass, because only support migrate in same zone - } else if (OB_FAIL(server_mgr_->check_server_active(this_info.unit_.server_, is_active))) { + } else if (OB_FAIL(SVR_TRACER.check_server_active(this_info.unit_.server_, is_active))) { LOG_WARN("fail to check server active", KR(ret), 
"server", this_info.unit_.server_); } else if (!is_active) { LOG_INFO("server is not active", "server", this_info.unit_.server_, K(is_active)); @@ -1655,7 +1644,6 @@ ObDRWorker::ObDRWorker(volatile bool &stop) self_addr_(), config_(nullptr), unit_mgr_(nullptr), - server_mgr_(nullptr), zone_mgr_(nullptr), disaster_recovery_task_mgr_(nullptr), lst_operator_(nullptr), @@ -1675,7 +1663,6 @@ int ObDRWorker::init( common::ObAddr &self_addr, common::ObServerConfig &config, ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObDRTaskMgr &task_mgr, share::ObLSTableOperator &lst_operator, @@ -1694,7 +1681,6 @@ int ObDRWorker::init( self_addr_ = self_addr; config_ = &config; unit_mgr_ = &unit_mgr; - server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; disaster_recovery_task_mgr_ = &task_mgr; lst_operator_ = &lst_operator; @@ -1789,7 +1775,6 @@ void ObDRWorker::statistic_total_dr_task(const int64_t task_cnt) int ObDRWorker::check_tenant_locality_match( const uint64_t tenant_id, ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, bool &locality_is_matched) { @@ -1816,7 +1801,6 @@ int ObDRWorker::check_tenant_locality_match( share::ObLSStatusInfo &ls_status_info = ls_status_info_array.at(i); DRLSInfo dr_ls_info(gen_user_tenant_id(tenant_id), &unit_mgr, - &server_mgr, &zone_mgr, GCTX.schema_service_); if (ls_status_info.ls_is_creating()) { @@ -1834,7 +1818,7 @@ int ObDRWorker::check_tenant_locality_match( ls_info, ls_status_info))) { LOG_WARN("fail to generate dr log stream info", KR(ret)); } else if (OB_FAIL(check_ls_locality_match_( - dr_ls_info, unit_mgr, server_mgr, zone_mgr, locality_is_matched))) { + dr_ls_info, unit_mgr, zone_mgr, locality_is_matched))) { LOG_WARN("fail to try log stream disaster recovery", KR(ret)); } } @@ -1846,7 +1830,6 @@ int ObDRWorker::check_tenant_locality_match( int ObDRWorker::check_ls_locality_match_( DRLSInfo &dr_ls_info, ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager 
&zone_mgr, bool &locality_is_matched) { @@ -1855,7 +1838,6 @@ int ObDRWorker::check_ls_locality_match_( locality_is_matched = false; LOG_INFO("start to check ls locality match", K(dr_ls_info)); LocalityAlignment locality_alignment(&unit_mgr, - &server_mgr, &zone_mgr, dr_ls_info); if (!dr_ls_info.has_leader()) { @@ -1948,7 +1930,6 @@ int ObDRWorker::try_tenant_disaster_recovery( share::ObLSStatusInfo &ls_status_info = ls_status_info_array.at(i); DRLSInfo dr_ls_info(gen_user_tenant_id(tenant_id), unit_mgr_, - server_mgr_, zone_mgr_, schema_service_); int64_t ls_acc_dr_task = 0; @@ -2409,13 +2390,12 @@ int ObDRWorker::do_single_replica_permanent_offline_( if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); - } else if (OB_ISNULL(server_mgr_) - || OB_UNLIKELY(!member_to_remove.is_valid() + } else if (OB_UNLIKELY(!member_to_remove.is_valid() || OB_INVALID_TENANT_ID == tenant_id || !ls_id.is_valid_with_tenant(tenant_id))) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), K(member_to_remove), K(tenant_id), K(ls_id), KP(server_mgr_)); - } else if (OB_FAIL(server_mgr_->check_server_permanent_offline(member_to_remove.get_server(), is_offline))) { + LOG_WARN("invalid argument", KR(ret), K(member_to_remove), K(tenant_id), K(ls_id)); + } else if (OB_FAIL(SVR_TRACER.check_server_permanent_offline(member_to_remove.get_server(), is_offline))) { LOG_WARN("fail to check server permanent offline", KR(ret), K(member_to_remove)); } else if (is_offline) { FLOG_INFO("found ls replica need to permanent offline", K(member_to_remove)); @@ -2565,7 +2545,6 @@ int ObDRWorker::construct_extra_infos_to_build_migrate_task( //shall never be here } else if (OB_FAIL(choose_disaster_recovery_data_source( zone_mgr_, - server_mgr_, dr_ls_info, dst_member, src_member, @@ -2907,7 +2886,6 @@ int ObDRWorker::try_generate_add_replica_locality_alignment_task( //shall never be here } else if (OB_FAIL(choose_disaster_recovery_data_source( zone_mgr_, - server_mgr_, 
dr_ls_info, dst_member, ObReplicaMember(),/*empty*/ @@ -2992,7 +2970,6 @@ int ObDRWorker::try_generate_type_transform_locality_alignment_task( LOG_INFO("may has no leader while member change", K(dr_ls_info)); } else if (OB_FAIL(choose_disaster_recovery_data_source( zone_mgr_, - server_mgr_, dr_ls_info, dst_member, src_member, @@ -3242,7 +3219,6 @@ int ObDRWorker::record_task_plan_for_locality_alignment( my_task->memstore_percent_); if (OB_FAIL(choose_disaster_recovery_data_source( zone_mgr_, - server_mgr_, dr_ls_info, dst_member, ObReplicaMember(),/*empty*/ @@ -3275,7 +3251,6 @@ int ObDRWorker::record_task_plan_for_locality_alignment( my_task->dst_memstore_percent_); if (OB_FAIL(choose_disaster_recovery_data_source( zone_mgr_, - server_mgr_, dr_ls_info, dst_member, ObReplicaMember(),/*empty*/ @@ -3347,10 +3322,7 @@ int ObDRWorker::try_locality_alignment( int ret = OB_SUCCESS; DEBUG_SYNC(BEFORE_TRY_LOCALITY_ALIGNMENT); LOG_INFO("try locality alignment", K(dr_ls_info), K(only_for_display)); - LocalityAlignment locality_alignment(unit_mgr_, - server_mgr_, - zone_mgr_, - dr_ls_info); + LocalityAlignment locality_alignment(unit_mgr_, zone_mgr_, dr_ls_info); const LATask *task = nullptr; if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; @@ -3784,7 +3756,6 @@ int ObDRWorker::construct_extra_infos_for_generate_migrate_to_unit_task( //shall never be here } else if (OB_FAIL(choose_disaster_recovery_data_source( zone_mgr_, - server_mgr_, dr_ls_info, dst_member, src_member, @@ -4131,7 +4102,6 @@ int ObDRWorker::generate_disaster_recovery_paxos_replica_number( int ObDRWorker::choose_disaster_recovery_data_source( ObZoneManager *zone_mgr, - ObServerManager *server_mgr, DRLSInfo &dr_ls_info, const ObReplicaMember &dst_member, const ObReplicaMember &src_member, @@ -4139,7 +4109,7 @@ int ObDRWorker::choose_disaster_recovery_data_source( int64_t &data_size) { int ret = OB_SUCCESS; - ObZone dst_zone; + ObServerInfoInTable server_info; ObRegion dst_region; ObDataSourceCandidateChecker 
type_checker(dst_member.get_replica_type()); int64_t replica_cnt = 0; @@ -4147,12 +4117,13 @@ int ObDRWorker::choose_disaster_recovery_data_source( DRServerStatInfo *server_stat_info = nullptr; DRUnitStatInfo *unit_stat_info = nullptr; DRUnitStatInfo *unit_in_group_stat_info = nullptr; + ObZone dst_zone; - if (OB_ISNULL(zone_mgr) || OB_ISNULL(server_mgr)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), KP(zone_mgr), KP(server_mgr)); - } else if (OB_FAIL(server_mgr->get_server_zone(dst_member.get_server(), dst_zone))) { - LOG_WARN("fail to get server zone", KR(ret), "server", dst_member.get_server()); + if (OB_ISNULL(zone_mgr)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("zone_mgr is null", KR(ret), KP(zone_mgr)); + } else if (OB_FAIL(SVR_TRACER.get_server_zone(dst_member.get_server(), dst_zone))) { + LOG_WARN("fail to get server zone", KR(ret), K(dst_member.get_server())); } else if (OB_FAIL(zone_mgr->get_region(dst_zone, dst_region))) { LOG_WARN("fail to get region", KR(ret), K(dst_zone)); } else if (OB_FAIL(dr_ls_info.get_replica_cnt(replica_cnt))) { @@ -4161,8 +4132,8 @@ int ObDRWorker::choose_disaster_recovery_data_source( ObLSReplica *src_replica = nullptr; // try task offline src for (int64_t i = 0; - OB_SUCC(ret) && i < replica_cnt && src_member.is_valid() && nullptr == src_replica; - ++i) { + OB_SUCC(ret) && i < replica_cnt && src_member.is_valid() && nullptr == src_replica; + ++i) { if (OB_FAIL(dr_ls_info.get_replica_stat( i, ls_replica, diff --git a/src/rootserver/ob_disaster_recovery_worker.h b/src/rootserver/ob_disaster_recovery_worker.h index 6f8a41eb5..ddd1f3fec 100644 --- a/src/rootserver/ob_disaster_recovery_worker.h +++ b/src/rootserver/ob_disaster_recovery_worker.h @@ -36,7 +36,6 @@ class ObLSReplica; namespace rootserver { class ObUnitManager; -class ObServerManager; class ObZoneManager; class ObDRTaskMgr; class DRLSInfo; @@ -111,14 +110,12 @@ public: common::ObAddr &self_addr, common::ObServerConfig &cfg, ObUnitManager 
&unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, ObDRTaskMgr &task_mgr, share::ObLSTableOperator &lst_operator, share::schema::ObMultiVersionSchemaService &schema_service, obrpc::ObSrvRpcProxy &rpc_proxy, common::ObMySQLProxy &sql_proxy); - int try_disaster_recovery(); int try_tenant_disaster_recovery( const uint64_t tenant_id, @@ -127,7 +124,6 @@ public: static int check_tenant_locality_match( const uint64_t tenant_id, ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, bool &locality_is_matched); @@ -172,7 +168,6 @@ private: static int choose_disaster_recovery_data_source( ObZoneManager *zone_mgr, - ObServerManager *server_mgr, DRLSInfo &dr_ls_info, const ObReplicaMember &dst_member, const ObReplicaMember &src_member, @@ -556,12 +551,8 @@ private: : inited_(false), tenant_id_(OB_INVALID_ID), unit_mgr_(nullptr), - server_mgr_(nullptr), unit_set_(unit_set) {} - int init( - const uint64_t tenant_id, - ObUnitManager *unit_mgr, - ObServerManager *server_mgr); + int init(const uint64_t tenant_id, ObUnitManager *unit_mgr); int get_unit( const common::ObZone &zone, const uint64_t unit_group_id, @@ -570,7 +561,6 @@ private: bool inited_; uint64_t tenant_id_; ObUnitManager *unit_mgr_; - ObServerManager *server_mgr_; common::hash::ObHashSet &unit_set_; }; @@ -583,10 +573,7 @@ private: class LocalityAlignment { public: - LocalityAlignment(ObUnitManager *unit_mgr, - ObServerManager *server_mgr, - ObZoneManager *zone_mgr, - DRLSInfo &dr_ls_info); + LocalityAlignment(ObUnitManager *unit_mgr, ObZoneManager *zone_mgr, DRLSInfo &dr_ls_info); virtual ~LocalityAlignment(); int build(); int get_next_locality_alignment_task( @@ -678,7 +665,6 @@ private: int64_t task_idx_; AddReplicaLATask add_replica_task_; ObUnitManager *unit_mgr_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; DRLSInfo &dr_ls_info_; common::ObArray task_array_; @@ -696,7 +682,6 @@ private: static int check_ls_locality_match_( DRLSInfo &dr_ls_info, 
ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, bool &locality_is_matched); @@ -960,7 +945,6 @@ private: common::ObAddr self_addr_; common::ObServerConfig *config_; ObUnitManager *unit_mgr_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; ObDRTaskMgr *disaster_recovery_task_mgr_; share::ObLSTableOperator *lst_operator_; diff --git a/src/rootserver/ob_empty_server_checker.cpp b/src/rootserver/ob_empty_server_checker.cpp index b2951a4d0..bf7fb0dba 100644 --- a/src/rootserver/ob_empty_server_checker.cpp +++ b/src/rootserver/ob_empty_server_checker.cpp @@ -24,12 +24,15 @@ #include "share/ob_rpc_struct.h"//GetLSReportCnt #include "share/ls/ob_ls_table_iterator.h"//ObAllLSTableIterator #include "share/ls/ob_ls_info.h"//ObLSInfo +#include "share/ob_all_server_tracer.h" #include "observer/ob_server_struct.h" #include "ob_server_manager.h" #include "ob_unit_manager.h"//ObUnitManager +#include "ob_server_zone_op_service.h" #include "rootserver/ob_rs_async_rpc_proxy.h"//ObGetLSReportCntProxy +#include "rootserver/ob_heartbeat_service.h" namespace oceanbase { @@ -43,7 +46,8 @@ int ObEmptyServerChecker::init( ObServerManager &server_mgr, ObUnitManager &unit_mgr, share::ObLSTableOperator &lst_operator, - schema::ObMultiVersionSchemaService &schema_service) + schema::ObMultiVersionSchemaService &schema_service, + ObServerZoneOpService &server_zone_op_service) { int ret = OB_SUCCESS; const int64_t empty_server_checker_thread_cnt = 1; @@ -60,6 +64,7 @@ int ObEmptyServerChecker::init( lst_operator_ = &lst_operator; schema_service_ = &schema_service; unit_mgr_ = &unit_mgr; + server_zone_op_service_ = &server_zone_op_service; empty_servers_.reset(); need_check_ = true; inited_ = true; @@ -98,47 +103,31 @@ int ObEmptyServerChecker::try_delete_server_() { int ret = OB_SUCCESS; ObZone zone; // empty means all zones - ObArray statuses; + ObArray servers_info; if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); - } else 
if (OB_ISNULL(server_mgr_) || OB_ISNULL(unit_mgr_)) { + } else if (OB_ISNULL(server_mgr_) || OB_ISNULL(unit_mgr_) || OB_ISNULL(server_zone_op_service_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unexpected error", KR(ret), KP(server_mgr_), KP(unit_mgr_)); - } else if (OB_FAIL(server_mgr_->get_server_statuses(zone, statuses))) { - LOG_WARN("get_server_statuses failed", K(zone), KR(ret)); + LOG_WARN("unexpected error", KR(ret), KP(server_mgr_), KP(unit_mgr_), KP(server_zone_op_service_)); + } else if (OB_FAIL(SVR_TRACER.get_servers_info(zone, servers_info))) { + LOG_WARN("get_servers_info failed", KR(ret), K(zone)); } else { - int first_error_ret = OB_SUCCESS; need_check_ = false; empty_servers_.reset(); - FOREACH_CNT_X(status, statuses, OB_SUCCESS == ret) { - if (ObServerStatus::OB_SERVER_ADMIN_DELETING == status->admin_status_) { + FOREACH_CNT_X(server_info, servers_info, OB_SUCC(ret)) { + if (server_info->is_deleting()) { need_check_ = true; bool server_empty = false; - if (OB_FAIL(unit_mgr_->check_server_empty(status->server_, server_empty))) { - LOG_WARN("check_server_empty failed", "server", status->server_, K(ret)); - } else if (server_empty && !(status->force_stop_hb_)) { - // stop server's heartbeat - bool force_stop_hb = true; - if (OB_FAIL(server_mgr_->set_force_stop_hb(status->server_, force_stop_hb))) { - LOG_WARN("set force stop hb failed", K(status->server_), K(ret)); - } else { - LOG_INFO("force set stop hb", KR(ret), K(status->server_)); - } - DEBUG_SYNC(SET_FORCE_STOP_HB_DONE); + const ObAddr &addr= server_info->get_server(); + if (OB_FAIL(unit_mgr_->check_server_empty(addr, server_empty))) { + LOG_WARN("check_server_empty failed", "server", addr, KR(ret)); + } else if (server_empty && OB_FAIL(empty_servers_.push_back(addr))) { + LOG_WARN("failed to push back empty server", KR(ret), KPC(server_info)); } - if (OB_FAIL(ret)) { - } else if (server_empty && OB_FAIL(empty_servers_.push_back(status->server_))) { - LOG_WARN("failed to push back empty server", 
KR(ret), KPC(status)); - } - } - // ignore single server error - if (OB_FAIL(ret)) { - first_error_ret = OB_SUCC(first_error_ret) ? ret : first_error_ret; - ret = OB_SUCCESS; } } - ret = OB_SUCC(first_error_ret) ? ret : first_error_ret; + DEBUG_SYNC(END_DELETE_SERVER_BEFORE_CHECK_META_TABLE); if (OB_SUCC(ret) && empty_servers_.count() > 0) { //need check empty if (OB_FAIL(check_server_empty_())) { @@ -149,10 +138,20 @@ int ObEmptyServerChecker::try_delete_server_() const bool commit = true; for (int64_t i = 0; OB_SUCC(ret) && i < empty_servers_.count(); ++i) { const ObAddr &addr = empty_servers_.at(i); - if (OB_FAIL(server_mgr_->end_delete_server(addr, zone, commit))) { - LOG_WARN("server_mgr end_delete_server failed", KR(ret), K(addr), K(zone)); + if (!ObHeartbeatService::is_service_enabled()) { // the old logic + LOG_INFO("sys tenant data version < 4.2, server manager executes end_delete_server"); + if (OB_FAIL(server_mgr_->end_delete_server(addr, zone, commit))) { + LOG_WARN("server_mgr end_delete_server failed", KR(ret), K(addr), K(zone)); + } + } else { + LOG_INFO("sys tenant data version >= 4.2, server zone op service executes finish_delete_server"); + if (OB_FAIL(server_zone_op_service_->finish_delete_server(addr, zone))) { + LOG_WARN("server_zone_op_service finish_delete_server failed", KR(ret), K(addr), K(zone)); + } else if (OB_FAIL(server_mgr_->load_server_manager())) { + LOG_WARN("fail to load server manager", KR(ret)); + } } - } + } } } return ret; diff --git a/src/rootserver/ob_empty_server_checker.h b/src/rootserver/ob_empty_server_checker.h index 2205e0d2e..cecf989ff 100644 --- a/src/rootserver/ob_empty_server_checker.h +++ b/src/rootserver/ob_empty_server_checker.h @@ -38,7 +38,7 @@ namespace rootserver { class ObServerManager; class ObUnitManager; - +class ObServerZoneOpService; /// Empty server checker thread. 
class ObEmptyServerChecker : public ObRsReentrantThread { @@ -50,7 +50,8 @@ public: server_mgr_(NULL), unit_mgr_(NULL), lst_operator_(NULL), - schema_service_(NULL) {}; + schema_service_(NULL), + server_zone_op_service_(NULL) {}; virtual ~ObEmptyServerChecker() {}; virtual void run3() override; @@ -59,7 +60,8 @@ public: int init(ObServerManager &server_mgr, ObUnitManager &unit_mgr, share::ObLSTableOperator &lst_operator, - share::schema::ObMultiVersionSchemaService &schema_service); + share::schema::ObMultiVersionSchemaService &schema_service, + ObServerZoneOpService &server_zone_op_service); virtual void wakeup(); virtual void stop(); @@ -79,6 +81,7 @@ private: ObUnitManager *unit_mgr_; share::ObLSTableOperator *lst_operator_; share::schema::ObMultiVersionSchemaService *schema_service_; + ObServerZoneOpService *server_zone_op_service_; DISALLOW_COPY_AND_ASSIGN(ObEmptyServerChecker); }; diff --git a/src/rootserver/ob_heartbeat_service.cpp b/src/rootserver/ob_heartbeat_service.cpp new file mode 100644 index 000000000..39e987764 --- /dev/null +++ b/src/rootserver/ob_heartbeat_service.cpp @@ -0,0 +1,1100 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. 
+ */ + +#define USING_LOG_PREFIX RS +#include "ob_heartbeat_service.h" + +#include "share/ob_define.h" +#include "share/ob_service_epoch_proxy.h" +#include "share/ob_version.h" +#include "share/ob_zone_table_operation.h" +#include "lib/thread/threads.h" // set_run_wrapper +#include "lib/mysqlclient/ob_mysql_transaction.h" // ObMySQLTransaction +#include "lib/utility/ob_unify_serialize.h" +#include "lib/time/ob_time_utility.h" +#include "observer/ob_server_struct.h" // GCTX +#include "logservice/ob_log_base_header.h" // ObLogBaseHeader +#include "logservice/ob_log_handler.h" +#include "storage/tx_storage/ob_ls_service.h" +#include "storage/tx_storage/ob_ls_handle.h" +#include "rootserver/ob_root_utils.h" // get_proposal_id_from_sys_ls +#include "rootserver/ob_rs_event_history_table_operator.h" // ROOTSERVICE_EVENT_ADD +#include "rootserver/ob_root_service.h" +namespace oceanbase +{ +using namespace common; +using namespace share; +using observer::ObServerHealthStatus; +namespace rootserver +{ +#define HBS_LOG_INFO(fmt, args...) FLOG_INFO("[HEARTBEAT_SERVICE] " fmt, ##args) +#define HBS_LOG_WARN(fmt, args...) FLOG_WARN("[HEARTBEAT_SERVICE] " fmt, ##args) +#define HBS_LOG_ERROR(fmt, args...) 
FLOG_ERROR("[HEARTBEAT_SERVICE] " fmt, ##args) +ObHeartbeatService::ObHeartbeatService() + : is_inited_(false), + sql_proxy_(NULL), + srv_rpc_proxy_(NULL), + epoch_id_(palf::INVALID_PROPOSAL_ID), + whitelist_epoch_id_(palf::INVALID_PROPOSAL_ID), + hb_responses_epoch_id_(palf::INVALID_PROPOSAL_ID), + hb_responses_rwlock_(ObLatchIds::HB_RESPONSES_LOCK), + all_servers_info_in_table_rwlock_(ObLatchIds::ALL_SERVERS_INFO_IN_TABLE_LOCK), + all_servers_hb_info_(), + all_servers_info_in_table_(), + inactive_zone_list_(), + hb_responses_(), + need_process_hb_responses_(false), + need_update_server_tracer_(false), + is_rs_server_info_updated_(false) +{ +} +ObHeartbeatService::~ObHeartbeatService() +{ +} +bool ObHeartbeatService::is_service_enabled_ = false; +int ObHeartbeatService::init() +{ + int ret = OB_SUCCESS; + int BUCKET_NUM = 1024; // ** FIXME: (linqiucen.lqc) temp. value + sql_proxy_ = GCTX.sql_proxy_; + srv_rpc_proxy_ = GCTX.srv_rpc_proxy_; + lib::ObMemAttr attr(MTL_ID(), "HB_SERVICE"); + if (OB_UNLIKELY(is_inited_)) { + ret = OB_INIT_TWICE; + LOG_WARN("has already inited", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(srv_rpc_proxy_)) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_ERROR("srv_rpc_proxy_ is null", KR(ret), KP(srv_rpc_proxy_)); + } else if (OB_FAIL(ObTenantThreadHelper::create( + "HBService", + lib::TGDefIDs::HeartbeatService, + *this))) { + LOG_WARN("fail to create thread", KR(ret)); + } else if (OB_FAIL(ObTenantThreadHelper::start())) { + LOG_WARN("failed to start thread", KR(ret)); + } else if (OB_FAIL(all_servers_hb_info_.create(BUCKET_NUM, attr))) { + LOG_WARN("fail to create all_servers_hb_info_", KR(ret)); + } else { + { + SpinWLockGuard guard_for_hb_responses(hb_responses_rwlock_); + hb_responses_.reset(); + hb_responses_epoch_id_ = palf::INVALID_PROPOSAL_ID; + need_process_hb_responses_ = false; + } + { + SpinWLockGuard guard_for_servers_info(all_servers_info_in_table_rwlock_); + all_servers_info_in_table_.reset(); + inactive_zone_list_.reset(); + 
whitelist_epoch_id_ = palf::INVALID_PROPOSAL_ID; + } + all_servers_hb_info_.clear(); + all_servers_info_in_table_.set_attr(attr); + inactive_zone_list_.set_attr(attr); + hb_responses_.set_attr(attr); + set_epoch_id_(palf::INVALID_PROPOSAL_ID); + need_update_server_tracer_ = false; + is_rs_server_info_updated_ = false; + is_inited_ = true; + HBS_LOG_INFO("ObHeartbeatService is inited"); + } + // we do not need the returned error code when init + // only try to confirm whether the heartbeat service is enabled as early as possible, + (void) check_is_service_enabled_(); + return ret; +} +int ObHeartbeatService::check_is_service_enabled_() +{ + int ret = OB_SUCCESS; + uint64_t sys_tenant_data_version = 0; + if (OB_FAIL(GET_MIN_DATA_VERSION(OB_SYS_TENANT_ID, sys_tenant_data_version))) { + LOG_WARN("fail to get sys tenant's min data version", KR(ret)); + } else if (sys_tenant_data_version >= DATA_VERSION_4_2_0_0) { + is_service_enabled_ = true; + HBS_LOG_INFO("the heartbeart service is enabled now", K(sys_tenant_data_version), K(is_service_enabled_)); + } + return ret; +} +void ObHeartbeatService::destroy() +{ + { + SpinWLockGuard guard_for_hb_responses(hb_responses_rwlock_); + hb_responses_.reset(); + hb_responses_epoch_id_ = palf::INVALID_PROPOSAL_ID; + need_process_hb_responses_ = false; + } + { + SpinWLockGuard guard_for_servers_info(all_servers_info_in_table_rwlock_); + all_servers_info_in_table_.reset(); + inactive_zone_list_.reset(); + whitelist_epoch_id_ = palf::INVALID_PROPOSAL_ID; + } + is_inited_ = false; + sql_proxy_ = NULL; + srv_rpc_proxy_ = NULL; + need_update_server_tracer_ = false; + is_rs_server_info_updated_ = false; + set_epoch_id_(palf::INVALID_PROPOSAL_ID); + all_servers_hb_info_.destroy(); + HBS_LOG_INFO("ObHeartbeatService is destroyed"); + ObTenantThreadHelper::destroy(); +} + +int ObHeartbeatService::switch_to_leader() +{ + int ret = OB_SUCCESS; + int64_t epoch_id = palf::INVALID_PROPOSAL_ID; + ObRole role; + if 
(OB_FAIL(ObRootUtils::get_proposal_id_from_sys_ls(epoch_id, role))) { + LOG_WARN("fail to get proposal id from sys ls", KR(ret)); + } else if (ObRole::LEADER != role) { + ret = OB_NOT_MASTER; + HBS_LOG_WARN("not master ls", KR(ret), K(epoch_id), K(role)); + } else { + if (OB_LIKELY((palf::INVALID_PROPOSAL_ID == epoch_id_ || epoch_id_ < epoch_id) + && palf::INVALID_PROPOSAL_ID != epoch_id)) { + set_epoch_id_(epoch_id); + } else { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid epoch id", KR(ret), K(epoch_id), K(epoch_id_)); + } + } + if (FAILEDx(ObTenantThreadHelper::switch_to_leader())) { + HBS_LOG_WARN("fail to switch to leader", KR(ret)); + } else { + HBS_LOG_INFO("switch to leader", KR(ret), K(epoch_id_)); + } + return ret; +} +void ObHeartbeatService::do_work() +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_FAIL(check_upgrade_compat_())) { + LOG_WARN("fail to check upgrade compatibility", KR(ret)); + } else { + while (!has_set_stop()) { + uint64_t thread_idx = get_thread_idx(); + int64_t thread_cnt = THREAD_COUNT; + if (OB_UNLIKELY(thread_idx >= thread_cnt)) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_ERROR("unexpected thread_idx", KR(ret), K(thread_idx), K(thread_cnt)); + } else { + if (0 == thread_idx) { + if (OB_FAIL(send_heartbeat_())) { + LOG_WARN("fail to send heartbeat", KR(ret)); + } + } else { // 1 == thread_idx + if (OB_FAIL(manage_heartbeat_())) { + LOG_WARN("fail to manage heartbeat", KR(ret)); + } + } + if(OB_FAIL(ret)) { + idle(HB_FAILED_IDLE_TIME_US); + } else { + idle(HB_IDLE_TIME_US); + } + } + } // end while + } +} +int ObHeartbeatService::check_upgrade_compat_() +{ + int ret = OB_SUCCESS; + while (!is_service_enabled_ && !has_set_stop()) { + if (OB_FAIL(check_is_service_enabled_())) { + LOG_WARN("fail to check whether the heartbeat service is enabled", KR(ret)); + } + idle(HB_IDLE_TIME_US); + } + if (has_set_stop()) { + ret = OB_NOT_MASTER; + 
LOG_WARN("not leader", KR(ret)); + } + return ret; +} +int ObHeartbeatService::send_heartbeat_() +{ + int ret = OB_SUCCESS; + ObHBRequestArray hb_requests; + int64_t tmp_whitelist_epoch_id = palf::INVALID_PROPOSAL_ID; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(srv_rpc_proxy_)) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_ERROR("srv_rpc_proxy_ is null", KR(ret), KP(srv_rpc_proxy_)); + } else { + // step 1: prepare hb_requests based on the whitelist + if (OB_FAIL(prepare_hb_requests_(hb_requests, tmp_whitelist_epoch_id))) { + LOG_WARN("fail to prepare heartbeat requests", KR(ret)); + } else if (hb_requests.count() <= 0) { + LOG_INFO("no heartbeat request needs to be sent"); + } else { + ObSendHeartbeatProxy proxy(*srv_rpc_proxy_, &obrpc::ObSrvRpcProxy::handle_heartbeat); + int64_t timeout = GCONF.rpc_timeout; // default value is 2s + int tmp_ret = OB_SUCCESS; + ObArray<int> return_ret_array; + // step 2: send hb_requests to all servers in the whitelist + for (int64_t i = 0; i < hb_requests.count(); i++) { + if (OB_TMP_FAIL(proxy.call( + hb_requests.at(i).get_server(), + timeout, + GCONF.cluster_id, + OB_SYS_TENANT_ID, + hb_requests.at(i)))) { + // error code will be ignored here. + // send rpc to some offline servers will return error, however, it's acceptable + LOG_WARN("fail to send heartbeat rpc", KR(ret), KR(tmp_ret), K(hb_requests.at(i))); + } + } + // step 3: wait hb_responses + if (OB_TMP_FAIL(proxy.wait_all(return_ret_array))) { + LOG_WARN("fail to wait all batch result", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? 
tmp_ret : ret; + } + // step 4: save hb_responses + if (FAILEDx(set_hb_responses_(tmp_whitelist_epoch_id, &proxy))) { + LOG_WARN("fail to set hb_responses", KR(ret)); + } + } + } + return ret; +} +int ObHeartbeatService::set_hb_responses_(const int64_t whitelist_epoch_id, ObSendHeartbeatProxy *proxy) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(proxy)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("proxy is null", KR(ret), KP(proxy)); + } else { + int tmp_ret = OB_SUCCESS; + SpinWLockGuard guard_for_hb_responses(hb_responses_rwlock_); + need_process_hb_responses_ = true; + hb_responses_epoch_id_ = whitelist_epoch_id; + hb_responses_.reset(); + ARRAY_FOREACH_X(proxy->get_results(), idx, cnt, OB_SUCC(ret)) { + const ObHBResponse *hb_response = proxy->get_results().at(idx); + if (OB_ISNULL(hb_response)) { + tmp_ret = OB_ERR_UNEXPECTED; + LOG_WARN("hb_response is null", KR(ret), KR(tmp_ret), KP(hb_response)); + } else if (OB_UNLIKELY(!hb_response->is_valid())) { + // if an observer does not reply the rpc, we will get an invalid hb_response. 
+ tmp_ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid hb_response", KR(ret), KR(tmp_ret), KPC(hb_response)); + } else if (OB_FAIL(hb_responses_.push_back(*hb_response))) { + LOG_WARN("fail to push an element into hb_responses_", KR(ret), KPC(hb_response)); + } else { + LOG_DEBUG("receive a heartbeat response", KPC(hb_response)); + } + } + } + return ret; +} +int ObHeartbeatService::get_and_reset_hb_responses_( + ObHBResponseArray &hb_responses, + int64_t &hb_responses_epoch_id) +{ + int ret = OB_SUCCESS; + // set hb_responses = hb_responses_ + // locking hb_responses_ too long will block send_heartbeat() + // therefore we process hb_responses rather than hb_responses_ + hb_responses.reset(); + hb_responses_epoch_id = palf::INVALID_PROPOSAL_ID; + SpinWLockGuard guard_for_hb_responses(hb_responses_rwlock_); + if (need_process_hb_responses_) { + if (OB_FAIL(hb_responses.assign(hb_responses_))) { + LOG_WARN("fail to assign tmp_hb_responses", KR(ret), K(hb_responses_)); + } else { + need_process_hb_responses_ = false; + hb_responses_epoch_id = hb_responses_epoch_id_; + hb_responses_epoch_id_ = palf::INVALID_PROPOSAL_ID; + hb_responses_.reset(); + } + } else { + ret = OB_NEED_WAIT; + LOG_WARN("currently there are no hb_responses need to be proccessed", KR(ret)); + } + return ret; +} +int ObHeartbeatService::prepare_hb_requests_(ObHBRequestArray &hb_requests, int64_t &whitelist_epoch_id) +{ + int ret = OB_SUCCESS; + hb_requests.reset(); + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else { + // ensure when we prepare hb_requests, + // we should mark these hb_requests are based on which whitelist. + // In other words, we should mark the whitelist's corresponding whitelist_epoch_id_. 
+ SpinRLockGuard guard_for_servers_info(all_servers_info_in_table_rwlock_); + ObHBRequest hb_request; + whitelist_epoch_id = whitelist_epoch_id_; + ARRAY_FOREACH_X(all_servers_info_in_table_, idx, cnt, OB_SUCC(ret)) { + const ObServerInfoInTable &server_info = all_servers_info_in_table_.at(idx); + bool is_stopped = false; + if (OB_UNLIKELY(!server_info.is_valid())) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_WARN("invalid server info in table", KR(ret), K(server_info)); + } else { + if (server_info.is_stopped() || has_exist_in_array(inactive_zone_list_, server_info.get_zone())) { + is_stopped = true; + } + } + if (OB_SUCC(ret)) { + hb_request.reset(); + if (OB_FAIL(hb_request.init( + server_info.get_server(), + server_info.get_server_id(), + GCTX.self_addr(), + is_stopped ? RSS_IS_STOPPED : RSS_IS_WORKING, + whitelist_epoch_id))) { + LOG_WARN("fail to init hb_request", KR(ret), K(server_info), K(is_stopped), + K(GCTX.self_addr()), K(whitelist_epoch_id)); + } else if (OB_FAIL(hb_requests.push_back(hb_request))) { + LOG_WARN("fail to push an element into hb_requests", KR(ret), K(hb_request)); + } else {} + } + } + } + return ret; +} +int ObHeartbeatService::manage_heartbeat_() +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else { + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(prepare_whitelist_())) { + ret = OB_SUCC(ret) ? tmp_ret : ret; + LOG_WARN("fail to prepare whitelist", KR(ret), KR(tmp_ret)); + } + if (OB_TMP_FAIL(process_hb_responses_())) { + ret = OB_SUCC(ret) ? 
tmp_ret : ret; + LOG_WARN("fail to prepare heartbeat response", KR(ret), KR(tmp_ret)); + } + if (need_update_server_tracer_) { + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh server tracer", KR(ret), KR(tmp_ret)); + } else { + need_update_server_tracer_ = false; + } + } + if (is_rs_server_info_updated_) { + if (OB_ISNULL(GCTX.root_service_)) { + tmp_ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.root_service_ is null", KR(ret), KR(tmp_ret), KP(GCTX.root_service_)); + } else if (OB_TMP_FAIL(GCTX.root_service_->get_status_change_cb().on_server_status_change(GCTX.self_addr()))) { + LOG_WARN("fail to execute on_server_status_change", KR(ret), KR(tmp_ret), K(GCTX.self_addr())); + } else { + is_rs_server_info_updated_ = false; + } + } + } + return ret; +} +int ObHeartbeatService::prepare_whitelist_() +{ + int ret = OB_SUCCESS; + int64_t epoch_id = get_epoch_id_(); + int64_t persistent_epoch_id = palf::INVALID_PROPOSAL_ID; + ObServerInfoInTableArray tmp_all_servers_info_in_table; + ObArray<ObZone> tmp_inactive_zone_list; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_FAIL(check_or_update_service_epoch_(epoch_id))) { + LOG_WARN("fail to check or update service epoch", KR(ret), K(epoch_id)); + } else if (OB_FAIL(ObServerTableOperator::get(*sql_proxy_, tmp_all_servers_info_in_table))) { + // It is possible that heartbeat_service_epoch is changed while we are reading __all_server table + // It's acceptable, since we cannot update __all_server table when we hold the old heartbeat_service_epoch + LOG_WARN("fail to read __all_server table", KR(ret), KP(sql_proxy_)); + } else if (OB_FAIL(ObZoneTableOperation::get_inactive_zone_list(*sql_proxy_, tmp_inactive_zone_list))) { + LOG_WARN("fail to get inactive zone list", KR(ret), KP(sql_proxy_)); + } else { + SpinWLockGuard 
guard_for_servers_info(all_servers_info_in_table_rwlock_); + whitelist_epoch_id_ = epoch_id; + if (OB_FAIL(all_servers_info_in_table_.assign(tmp_all_servers_info_in_table))) { + all_servers_info_in_table_.reset(); + whitelist_epoch_id_ = palf::INVALID_PROPOSAL_ID; + LOG_WARN("fail to assign all_servers_info_in_table_", KR(ret), K(tmp_all_servers_info_in_table)); + } else if (OB_FAIL(inactive_zone_list_.assign(tmp_inactive_zone_list))) { + LOG_WARN("fail to assign inactive_zone_list_",KR(ret), K(tmp_inactive_zone_list)); + } + } + return ret; +} +int ObHeartbeatService::check_or_update_service_epoch_(const int64_t epoch_id) +{ + // if persistent_epoch_id == epoch_id: check ok. + // if persistent_epoch_id < epoch_id: update heartbeat_service_epoch in __all_service_epoch table + // if the updation is successful, check ok. + // if persistent_epoch_id > epoch_id: return error OB_NOT_MASTER + int ret = OB_SUCCESS; + int64_t persistent_epoch_id = palf::INVALID_PROPOSAL_ID; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_FAIL(ObServiceEpochProxy::get_service_epoch( + *sql_proxy_, + OB_SYS_TENANT_ID, + ObServiceEpochProxy::HEARTBEAT_SERVICE_EPOCH, + persistent_epoch_id))) { + LOG_WARN("fail to get heartbeat service epoch", KR(ret),KP(sql_proxy_)); + } else if (palf::INVALID_PROPOSAL_ID == persistent_epoch_id || palf::INVALID_PROPOSAL_ID == epoch_id) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("epoch id is unexpectedly invalid", KR(ret), K(persistent_epoch_id), K(epoch_id)); + } else if (persistent_epoch_id > epoch_id) { + ret = OB_NOT_MASTER; + HBS_LOG_WARN("persistent_epoch_id is greater than epoch_id, which means this server is not leader", + KR(ret), K(persistent_epoch_id), K(epoch_id)); + } else if (persistent_epoch_id < epoch_id) { + HBS_LOG_INFO("persistent_epoch_id is smaller 
than epoch_id", K(persistent_epoch_id), K(epoch_id)); + common::ObMySQLTransaction trans; + if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if (OB_FAIL(ObServiceEpochProxy::check_and_update_service_epoch( + trans, + OB_SYS_TENANT_ID, + ObServiceEpochProxy::HEARTBEAT_SERVICE_EPOCH, + epoch_id))) { + LOG_WARN("fail to check and update service epoch", KR(ret), KP(sql_proxy_), K(epoch_id)); + } + if (OB_UNLIKELY(!trans.is_started())) { + LOG_WARN("the transaction is not started"); + } else { + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(trans.end(OB_SUCC(ret)))) { + LOG_WARN("fail to commit the transaction", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } + if (OB_FAIL(ret)) { + LOG_WARN("fail to update __all_service_epoch table", KR(ret)); + } + } + // we do not care whether the table is updated successfully + // we always reset all_servers_info_in_table_ and all_servers_hb_info_ + SpinWLockGuard guard_for_servers_info(all_servers_info_in_table_rwlock_); + all_servers_info_in_table_.reset(); + whitelist_epoch_id_ = palf::INVALID_PROPOSAL_ID; + all_servers_hb_info_.clear(); + } else {} // persistent_epoch_id = epoch_id, do nothing. 
+ return ret; +} +int ObHeartbeatService::process_hb_responses_() +{ + int ret = OB_SUCCESS; + ObHBResponseArray tmp_hb_responses; + const int64_t now = ObTimeUtility::current_time(); + int64_t tmp_hb_responses_epoch_id = palf::INVALID_PROPOSAL_ID; + common::ObArray zone_list; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_ERROR("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_FAIL(get_and_reset_hb_responses_(tmp_hb_responses, tmp_hb_responses_epoch_id))) { + LOG_WARN("fail to get and reset hb_responses", KR(ret)); + } else if (OB_FAIL(ObZoneTableOperation::get_zone_list(*sql_proxy_, zone_list))) { + LOG_WARN("fail to get zone list", KR(ret)); + } else { + // Here we do not need to lock all_servers_info_in_table_. + // There are two threads in heartbeat service. + // Prepare_whitelist() will modify all_servers_info_in_table_, + // But prepare_whitelist() and this func. are in the same thread. + // In another thread, send_heartbeat() only reads server_ and server_id_ in all_servers_info_in_table_ + int tmp_ret = OB_SUCCESS; + for (int64_t i = 0; i < all_servers_info_in_table_.count(); i++) { + // note: we can only update __all_server table successfully when hb_responses_epoch_id is + // equal to current heartbeat_service_epoch in __all_service_epoch table. + // It means that our whitelist (all_servers_info_in_table_) is not outdated. 
+ if (OB_TMP_FAIL(check_server_( + tmp_hb_responses, + all_servers_info_in_table_.at(i), + zone_list, + now, + tmp_hb_responses_epoch_id))) { + LOG_WARN("fail to check server", KR(ret), KR(tmp_ret), + K(all_servers_info_in_table_.at(i)), K(now), K(tmp_hb_responses_epoch_id)); + } + } + if (FAILEDx(clear_deleted_servers_in_all_servers_hb_info_())) { + LOG_WARN("fail to clear deleted servers in all_servers_hb_info_", KR(ret)); + } + } + return ret; +} +int ObHeartbeatService::check_server_( + const ObHBResponseArray &hb_responses, + const share::ObServerInfoInTable &server_info_in_table, + const common::ObArray &zone_list, + const int64_t now, + const int64_t hb_responses_epoch_id) +{ + int ret = OB_SUCCESS; + ObServerHBInfo server_hb_info; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(!server_info_in_table.is_valid() + || now <= 0 + || palf::INVALID_PROPOSAL_ID == hb_responses_epoch_id)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalied argument", KR(ret), K(server_info_in_table), K(now), K(hb_responses_epoch_id)); + } else if (OB_FAIL(all_servers_hb_info_.get_refactored(server_info_in_table.get_server(), server_hb_info))) { + LOG_WARN("fail to get server_hb_info, or get an old server_hb_info", KR(ret), + K(server_info_in_table.get_server()), K(server_hb_info)); + if (OB_HASH_NOT_EXIST == ret) { + if (OB_FAIL(init_server_hb_info_(now, server_info_in_table, server_hb_info))) { + LOG_WARN("fail to init server_hb_info", KR(ret), K(server_info_in_table), K(now)); + } else if (OB_FAIL(all_servers_hb_info_.set_refactored( + server_hb_info.get_server(), + server_hb_info, + 0 /* flag: 0 shows that not cover existing object. 
*/))) { + LOG_WARN("fail to push an element into all_servers_hb_info_", KR(ret), K(server_hb_info)); + } else {} + } + } + if (OB_SUCC(ret)) { + // check whether the heartbeat response from server_info_in_table.get_server() is received + int64_t idx = OB_INVALID_INDEX_INT64; + if (!has_server_exist_in_array_(hb_responses, server_info_in_table.get_server(), idx)) { + // heartbeat response is not received + if (OB_FAIL(check_server_without_hb_response_( + now, + server_info_in_table, + hb_responses_epoch_id, + server_hb_info))) { + LOG_WARN("fail to check the server without heartbeat response", KR(ret), + K(server_info_in_table), K(now), K(hb_responses_epoch_id)); + } + } else if (OB_UNLIKELY(!hb_responses.at(idx).is_valid())) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_WARN("there exists an invalid element in hb_responses", KR(ret), + K(hb_responses.at(idx))); + } else if (OB_FAIL(check_server_with_hb_response_( + hb_responses.at(idx), + server_info_in_table, + zone_list, + now, + hb_responses_epoch_id, + server_hb_info))) { // heartbeat response is received + LOG_WARN("fail to check the server with heartbeat response", KR(ret), + K(hb_responses.at(idx)), K(server_info_in_table), K(now), K(hb_responses_epoch_id)); + } + } + return ret; +} +int ObHeartbeatService::check_server_without_hb_response_( + const int64_t now, + const share::ObServerInfoInTable &server_info_in_table, + const int64_t hb_responses_epoch_id, + ObServerHBInfo &server_hb_info) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(now <= 0 + || !server_info_in_table.is_valid() + || !server_hb_info.is_valid() + || palf::INVALID_PROPOSAL_ID == hb_responses_epoch_id)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(now), K(server_info_in_table), + K(server_hb_info), K(hb_responses_epoch_id)); + } else if (OB_FAIL(update_server_hb_info_( + now, + false, /* hb_response_exists */ + 
server_hb_info))) { + LOG_WARN("fail to update server_hb_info", KR(ret), K(now), K(server_info_in_table), + K(server_hb_info)); + } else if ((now - server_hb_info.get_last_hb_time() > GCONF.lease_time + && 0 == server_info_in_table.get_last_offline_time())) { + need_update_server_tracer_ = true; + if (GCTX.self_addr() == server_info_in_table.get_server()) { + is_rs_server_info_updated_ = true; + } + if (OB_FAIL(update_table_for_online_to_offline_server_( + server_info_in_table, + now, + hb_responses_epoch_id))) { + LOG_WARN("fail to update table for online to offline server", + KR(ret), K(server_info_in_table), K(now), K(hb_responses_epoch_id)); + } else { + const ObAddr &server = server_info_in_table.get_server(); + ROOTSERVICE_EVENT_ADD("server", "last_offline_time set", "server", server); + } + } else {} + return ret; +} +int ObHeartbeatService::update_table_for_online_to_offline_server_( + const share::ObServerInfoInTable &server_info_in_table, + const int64_t now, + const int64_t hb_responses_epoch_id) +{ + int ret = OB_SUCCESS; + common::ObMySQLTransaction trans; + bool is_match = false; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_UNLIKELY(!server_info_in_table.is_valid() + || now <= 0 + || palf::INVALID_PROPOSAL_ID == hb_responses_epoch_id)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server_info_in_table), K(now), K(hb_responses_epoch_id)); + } else if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if (OB_FAIL(ObServiceEpochProxy::check_service_epoch_with_trans( + trans, + OB_SYS_TENANT_ID, + ObServiceEpochProxy::HEARTBEAT_SERVICE_EPOCH, + hb_responses_epoch_id, + is_match))) { + LOG_WARN("fail to check and update service epoch", KR(ret), K(hb_responses_epoch_id)); + } 
else if (OB_UNLIKELY(!is_match)) { + ret = OB_NOT_MASTER; + LOG_WARN("hb_responses_epoch_id is not the same as persistent heartbeat service epoch id", KR(ret)); + } else { + if (OB_FAIL(ObServerTableOperator::update_table_for_online_to_offline_server( + trans, + server_info_in_table.get_server(), + ObServerStatus::OB_SERVER_DELETING == server_info_in_table.get_status(), /* is_deleting */ + now /*last_offline_time */))) { + LOG_WARN("fail to update __all_server table for online to offline server", KR(ret), + K(server_info_in_table), K(now)); + } + } + if (OB_UNLIKELY(!trans.is_started())) { + LOG_WARN("the transaction is not started"); + } else { + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(trans.end(OB_SUCC(ret)))) { + HBS_LOG_WARN("fail to commit the transaction", KR(ret), KR(tmp_ret), + K(server_info_in_table.get_server())); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } + if (OB_FAIL(ret)) { + LOG_WARN("fail to update __all_server table", KR(ret)); + } + } + return ret; +} +int ObHeartbeatService::init_server_hb_info_( + const int64_t now, + const share::ObServerInfoInTable &server_info_in_table, + ObServerHBInfo &server_hb_info) +{ + int ret = OB_SUCCESS; + const ObServerStatus::DisplayStatus &display_status = server_info_in_table.get_status(); + const int64_t last_offline_time = server_info_in_table.get_last_offline_time(); + const ObAddr &server = server_info_in_table.get_server(); + int64_t last_hb_time = 0; + ObServerStatus::HeartBeatStatus hb_status = ObServerStatus::OB_HEARTBEAT_MAX; + + server_hb_info.reset(); + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(!server_info_in_table.is_valid() + || now - last_offline_time < 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(now), K(server_info_in_table)); + } else { + if (0 == last_offline_time) { // online, the status is active or deleting + last_hb_time = now; + hb_status = 
ObServerStatus::OB_HEARTBEAT_ALIVE; + } else { // last_offline_time > 0, offline, the status is inactive or deleting + last_hb_time = last_offline_time - GCONF.lease_time; + hb_status = ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED; + if (now - last_hb_time >= GCONF.server_permanent_offline_time) { + hb_status = ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE; + } + } + if (FAILEDx(server_hb_info.init(server, last_hb_time, hb_status))) { + LOG_WARN("fail to init server_hb_info", KR(ret), K(server), K(last_hb_time), K(hb_status)); + } else { + LOG_INFO("new server_hb_info is generated", K(server_hb_info)); + } + } + return ret; +} +int ObHeartbeatService::check_server_with_hb_response_( + const ObHBResponse &hb_response, + const share::ObServerInfoInTable &server_info_in_table, + const common::ObArray &zone_list, + const int64_t now, + const int64_t hb_responses_epoch_id, + ObServerHBInfo &server_hb_info) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(now <= 0 + || !server_hb_info.is_valid()) + || palf::INVALID_PROPOSAL_ID == hb_responses_epoch_id) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server_info_in_table), K(hb_response), + K(now), K(server_hb_info), K(hb_responses_epoch_id)); + } else if (OB_FAIL(check_if_hb_response_can_be_processed_( + hb_response, + server_info_in_table, + zone_list))) { + // the validity of hb_response and server_info_in_table is also checked here + LOG_WARN("hb_response cannot be processed", KR(ret), K(hb_response), + K(server_info_in_table), K(zone_list)); + } + if (OB_SUCC(ret)) { + if ((!server_info_in_table.get_with_rootserver() && hb_response.get_server() == GCTX.self_addr()) + || 0 != server_info_in_table.get_last_offline_time() + || server_info_in_table.get_build_version() != hb_response.get_build_version() + || server_info_in_table.get_start_service_time() != hb_response.get_start_service_time()) 
{ + need_update_server_tracer_ = true; + if (GCTX.self_addr() == server_info_in_table.get_server()) { + is_rs_server_info_updated_ = true; + } + if (OB_FAIL(update_table_for_server_with_hb_response_( + hb_response, + server_info_in_table, + hb_responses_epoch_id))) { + LOG_WARN("fail to update table for server with hb_response", KR(ret), K(hb_response), + K(server_info_in_table), K(hb_responses_epoch_id)); + } + } + if (FAILEDx(check_and_execute_start_or_stop_server_( + hb_response, + server_hb_info, + server_info_in_table))) { + LOG_WARN("fail to check and execute start or stop server", KR(ret), + K(hb_response), K(server_info_in_table)); + } + if (FAILEDx(server_hb_info.set_server_health_status(hb_response.get_server_health_status()))) { + LOG_WARN("fail to set server_health_status", KR(ret), K(hb_response.get_server_health_status())); + } else if (OB_FAIL(update_server_hb_info_( + now, + true, /* hb_response_exists*/ + server_hb_info))) { + LOG_WARN("fail to get and update server_hb_info", KR(ret), K(hb_response), + K(server_info_in_table), K(now)); + } + } + return ret; +} +int ObHeartbeatService::check_if_hb_response_can_be_processed_( + const ObHBResponse &hb_response, + const share::ObServerInfoInTable &server_info_in_table, + const common::ObArray &zone_list) const +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(!server_info_in_table.is_valid() + || !hb_response.is_valid() + || server_info_in_table.get_server() != hb_response.get_server())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server_info_in_table), K(hb_response)); + } else if (server_info_in_table.get_zone() != hb_response.get_zone()) { + ret = OB_SERVER_ZONE_NOT_MATCH; + HBS_LOG_ERROR("server's zone does not match", KR(ret), K(server_info_in_table.get_zone()), + K(hb_response.get_zone())); + } else if (server_info_in_table.get_sql_port() != 
hb_response.get_sql_port()) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_ERROR("unexpected error: server's sql port has changed!", KR(ret), + K(server_info_in_table), K(hb_response)); + } else { + bool zone_exists = false; + for (int64_t i = 0; !zone_exists && i < zone_list.count(); i++) { + if (zone_list.at(i) == hb_response.get_zone()) { + zone_exists = true; + } + } + if (OB_UNLIKELY(!zone_exists)) { + ret = OB_ZONE_INFO_NOT_EXIST; + HBS_LOG_ERROR("zone info not exist", KR(ret), K(hb_response.get_zone()), K(zone_list)); + } + } + return ret; +} +int ObHeartbeatService::check_and_execute_start_or_stop_server_( + const ObHBResponse &hb_response, + const ObServerHBInfo &server_hb_info, + const share::ObServerInfoInTable &server_info_in_table) +{ + int ret = OB_SUCCESS; + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + const ObAddr &server = hb_response.get_server(); + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_ERROR("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_UNLIKELY(!hb_response.is_valid() + || !server_info_in_table.is_valid() + || hb_response.get_server() != server_info_in_table.get_server() + || !server.is_valid() + || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(hb_response), K(server_info_in_table)); + } else { + const ObServerHealthStatus &health_status = hb_response.get_server_health_status(); + bool need_start_or_stop_server = false; + bool is_start = false; + int64_t affected_rows = 0; + ObSqlString sql; + if (server_hb_info.get_server_health_status() != hb_response.get_server_health_status()) { + if (0 == server_info_in_table.get_stop_time() && !health_status.is_healthy()) { + is_start = false; + need_start_or_stop_server = true; + } + if (0 != server_info_in_table.get_stop_time() && health_status.is_healthy()) { + is_start = true; + 
need_start_or_stop_server = true; + } + } + if (OB_SUCC(ret) && need_start_or_stop_server) { + if (is_start) { + ROOTSERVICE_EVENT_ADD("server", "disk error repaired, start server", "server", server); + HBS_LOG_INFO("disk error repaired, try to start server", K(server), K(health_status)); + ret = sql.assign_fmt("ALTER SYSTEM START SERVER '%s:%d'", ip, server.get_port()); + } else { + ROOTSERVICE_EVENT_ADD("server", "disk error, stop server", "server", server); + HBS_LOG_INFO("disk error, try to stop server", K(server), K(health_status)); + ret = sql.assign_fmt("ALTER SYSTEM STOP SERVER '%s:%d'", ip, server.get_port()); + } + if (OB_FAIL(ret)) { + LOG_WARN("fail to assign fmt", KR(ret), K(server), K(is_start)); + } else if (OB_FAIL(sql_proxy_->write(sql.ptr(), affected_rows))) { + LOG_WARN("fail to write sql", KR(ret),K(server), K(sql)); + } else { + HBS_LOG_INFO("start or stop server successfully", K(server), K(is_start)); + } + } + } + return ret; +} +int ObHeartbeatService::update_server_hb_info_( + const int64_t now, + const bool hb_response_exists, + ObServerHBInfo &server_hb_info) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(now <= 0 + || !server_hb_info.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(now), K(server_hb_info)); + } else { + const ObServerStatus::HeartBeatStatus& hb_status = server_hb_info.get_hb_status(); + const ObAddr& server = server_hb_info.get_server(); + // step 1: update last_hb_time + if (hb_response_exists) { + server_hb_info.set_last_hb_time(now); + } + int64_t time_diff = now - server_hb_info.get_last_hb_time(); + // step 2: update hb_status + if (time_diff >= GCONF.server_permanent_offline_time + && ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE != hb_status) { + server_hb_info.set_hb_status(ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE); + ROOTSERVICE_EVENT_ADD("server", 
"permanent_offline", "server", server); + HBS_LOG_INFO("the server becomes permanent offline", K(server), K(time_diff)); + } else if (time_diff >= GCONF.lease_time + && ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED != hb_status + && ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE != hb_status) { + server_hb_info.set_hb_status(ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED); + ROOTSERVICE_EVENT_ADD("server", "lease_expire", "server", server); + HBS_LOG_INFO("the server's lease becomes expired", K(server), K(time_diff)); + } else if (time_diff < GCONF.lease_time + && ObServerStatus::OB_HEARTBEAT_ALIVE != hb_status) { + server_hb_info.set_hb_status(ObServerStatus::OB_HEARTBEAT_ALIVE); + ROOTSERVICE_EVENT_ADD("server", "online", "server", server); + HBS_LOG_INFO("the server's lease becomes online", K(server), K(time_diff)); + } else {} + // step 3: update server_hb_info + if (FAILEDx(all_servers_hb_info_.set_refactored( + server_hb_info.get_server(), + server_hb_info, + 1 /* flag: 1 means overwrite the existing object. 
*/))) { + LOG_WARN("fail to push an element into all_servers_hb_info_", KR(ret), K(server_hb_info)); + } + } + return ret; +} + +int ObHeartbeatService::clear_deleted_servers_in_all_servers_hb_info_() +{ + int ret = OB_SUCCESS; + ObAddr server; + hash::ObHashMap::iterator iter = all_servers_hb_info_.begin(); + if (OB_UNLIKELY(!is_inited_)) { // return false + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else { + while (OB_SUCC(ret) && iter != all_servers_hb_info_.end()) { + int64_t idx = OB_INVALID_INDEX_INT64; + server.reset(); + server = iter->first; + iter++; + if (!has_server_exist_in_array_(all_servers_info_in_table_, server, idx)) { + HBS_LOG_INFO("the server is deleted, it can be removed from all_servers_hb_info", K(server)); + if (OB_FAIL(all_servers_hb_info_.erase_refactored(server))) { + LOG_WARN("fail to remove the server from all_servers_hb_info", KR(ret), K(server)); + } + } + } + } + return ret; +} + +int ObHeartbeatService::update_table_for_server_with_hb_response_( + const ObHBResponse &hb_response, + const share::ObServerInfoInTable &server_info_in_table, + const int64_t hb_responses_epoch_id) +{ + int ret = OB_SUCCESS; + common::ObMySQLTransaction trans; + bool is_match = false; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + HBS_LOG_ERROR("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_UNLIKELY(!hb_response.is_valid() + || !server_info_in_table.is_valid() + || hb_response.get_server() != server_info_in_table.get_server() + || palf::INVALID_PROPOSAL_ID == hb_responses_epoch_id)) { + ret = OB_INVALID_ARGUMENT; + // return false + LOG_WARN("invalid argument", KR(ret), K(hb_response), K(server_info_in_table), K(hb_responses_epoch_id)); + } else if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if 
(OB_FAIL(ObServiceEpochProxy::check_service_epoch_with_trans( + trans, + OB_SYS_TENANT_ID, + ObServiceEpochProxy::HEARTBEAT_SERVICE_EPOCH, + hb_responses_epoch_id, + is_match))) { + LOG_WARN("fail to check heartbeat service epoch", KR(ret), K(hb_responses_epoch_id)); + } else if (OB_UNLIKELY(!is_match)) { + ret = OB_NOT_MASTER; + LOG_WARN("hb_responses_epoch_id is not the same as persistent heartbeat service epoch id", KR(ret)); + } else { + const ObAddr &server = server_info_in_table.get_server(); + // ********* check with_rootserver ********* // + if (OB_SUCC(ret) + && !server_info_in_table.get_with_rootserver() && server == GCTX.self_addr()) { + if (OB_FAIL(ObServerTableOperator::update_with_rootserver(trans, server))) { + HBS_LOG_WARN("fail to update_with_rootserver", KR(ret), K(server)); + } else { + ROOTSERVICE_EVENT_ADD("server", "rootserver", "server", server); + HBS_LOG_INFO("server becomes rootserver", K(server)); + } + } + + // ********* check if offline to online, then update last_offline_time and status ********* // + if (OB_SUCC(ret) && 0 != server_info_in_table.get_last_offline_time()) { + if (OB_FAIL(ObServerTableOperator::update_table_for_offline_to_online_server( + trans, + ObServerStatus::OB_SERVER_DELETING == server_info_in_table.get_status(), /* is_deleting */ + server))) { + HBS_LOG_WARN("fail to reset last_offline_time", KR(ret), K(server)); + } else { + ROOTSERVICE_EVENT_ADD("server", "last_offline_time reset", "server", server); + HBS_LOG_INFO("server becomes online", K(server)); + } + } + // ********* check build_version ********* // + if (OB_SUCC(ret) && server_info_in_table.get_build_version() != hb_response.get_build_version()) { + if (OB_FAIL(ObServerTableOperator::update_build_version( + trans, + server, + server_info_in_table.get_build_version(), // old value + hb_response.get_build_version()))) { + HBS_LOG_WARN("fail to update build_version", KR(ret), + K(server_info_in_table), K(hb_response)); + } else { + 
ROOTSERVICE_EVENT_ADD("server", hb_response.get_build_version().ptr(), "server", server); + HBS_LOG_INFO("build_version is updated", K(server), + K(server_info_in_table.get_build_version()), K(hb_response.get_build_version())); + } + } + // ********* check start_service_time ********* // + if (OB_SUCC(ret) && server_info_in_table.get_start_service_time() != hb_response.get_start_service_time()) { + if (OB_FAIL(ObServerTableOperator::update_start_service_time( + trans, + server, + server_info_in_table.get_start_service_time(), // old value + hb_response.get_start_service_time()))) { + HBS_LOG_WARN("fail to update start service time", KR(ret), K(server), + K(server_info_in_table.get_start_service_time()), K(hb_response.get_start_service_time())); + } else { + ROOTSERVICE_EVENT_ADD("server", "start_service", "server", server); + HBS_LOG_INFO("start service time is updated", K(server), + K(server_info_in_table.get_start_service_time()), K(hb_response.get_start_service_time())); + } + } + } + if (OB_UNLIKELY(!trans.is_started())) { + LOG_WARN("the transaction is not started"); + } else { + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(trans.end(OB_SUCC(ret)))) { + LOG_WARN("fail to commit the transaction", KR(ret), KR(tmp_ret), + K(server_info_in_table), K(hb_response)); + ret = OB_SUCC(ret) ? 
tmp_ret : ret; + } + if (OB_FAIL(ret)) { + LOG_WARN("fail to update __all_server table", KR(ret)); + } + } + return ret; +} +template +bool ObHeartbeatService::has_server_exist_in_array_( + const ObIArray &array, + const common::ObAddr &server, + int64_t &idx) +{ + bool bret = false; + idx = OB_INVALID_INDEX_INT64; + for (int64_t i = 0; i < array.count(); i++) { + if (server == array.at(i).get_server()) { + bret = true; + idx = i; + break; + } + } + return bret; +} +} +} \ No newline at end of file diff --git a/src/rootserver/ob_heartbeat_service.h b/src/rootserver/ob_heartbeat_service.h new file mode 100644 index 000000000..1fccc36db --- /dev/null +++ b/src/rootserver/ob_heartbeat_service.h @@ -0,0 +1,148 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. 
+ */ + +#ifndef OCEANBASE_ROOTSERVER_OB_HEARTBEAT_SERVICE_H +#define OCEANBASE_ROOTSERVER_OB_HEARTBEAT_SERVICE_H + +#include "lib/container/ob_array.h" // ObArray +#include "lib/net/ob_addr.h" // ObZone +#include "lib/utility/ob_unify_serialize.h" +#include "common/ob_zone.h" // ObAddr +#include "share/ob_server_table_operator.h" // ObServerTableOperator +#include "share/ob_heartbeat_struct.h" +#include "rootserver/ob_primary_ls_service.h" // ObTenantThreadHelper +#include "rootserver/ob_rs_async_rpc_proxy.h" // ObSendHeartbeatProxy +#include "observer/ob_heartbeat_handler.h" // ObServerHealthStatus +namespace oceanbase +{ +namespace rootserver +{ +class ObHeartbeatService : public ObTenantThreadHelper, + public logservice::ObICheckpointSubHandler, + public logservice::ObIReplaySubHandler +{ +public: + typedef common::ObArray ObHBRequestArray; + typedef common::ObArray ObHBResponseArray; + typedef common::hash::ObHashMap ObServerHBInfoMap; + typedef common::ObArray ObServerInfoInTableArray; + static const int64_t THREAD_COUNT = 2; + ObHeartbeatService(); + virtual ~ObHeartbeatService(); + int init(); + void destroy(); + virtual void do_work() override; + virtual int switch_to_leader() override; + virtual share::SCN get_rec_scn() override { return share::SCN::max_scn();} + virtual int flush(share::SCN &rec_scn) override { return OB_SUCCESS; } + int replay(const void *buffer, const int64_t nbytes, const palf::LSN &lsn, const share::SCN &scn) override + { + int ret = OB_SUCCESS; + UNUSEDx(buffer, nbytes, lsn, scn); + return ret; + } + static bool is_service_enabled() {return is_service_enabled_; } + DEFINE_MTL_FUNC(ObHeartbeatService) +private: + static const int64_t HB_IDLE_TIME_US = 2 * 1000 * 1000L; // 2s + static const int64_t HB_FAILED_IDLE_TIME_US = 0.5 * 1000 * 1000L; // 0.5s + // (based on the whitelist) generate & send hb_requests and receive hb_responses + int check_is_service_enabled_(); + int64_t get_epoch_id_() const { return ATOMIC_LOAD(&epoch_id_); } + 
void set_epoch_id_(int64_t epoch_id) { ATOMIC_SET(&epoch_id_, epoch_id); } + int check_upgrade_compat_(); + int send_heartbeat_(); + int prepare_hb_requests_(ObHBRequestArray &hb_requests, int64_t &whitelist_epoch_id); + // generate the whitelist and process received hb_responses + int set_hb_responses_(const int64_t whitelist_epoch_id, ObSendHeartbeatProxy *proxy); + int get_and_reset_hb_responses_(ObHBResponseArray &hb_responses, int64_t &hb_responses_epoch_id); + int manage_heartbeat_(); + // read __all_server table and generate whitelist + int prepare_whitelist_(); + int check_or_update_service_epoch_(const int64_t epoch_id); + int process_hb_responses_(); + int check_server_( + const ObHBResponseArray &hb_responses, + const share::ObServerInfoInTable &server_info_in_table, + const common::ObArray &zone_list, + const int64_t now, + const int64_t hb_responses_epoch_id); + int init_server_hb_info_( + const int64_t now, + const share::ObServerInfoInTable &server_info_in_table, + share::ObServerHBInfo &server_hb_info); + int check_server_without_hb_response_( + const int64_t now, + const share::ObServerInfoInTable &server_info_in_table, + const int64_t hb_responses_epoch_id, + share::ObServerHBInfo &server_hb_info); + int update_table_for_online_to_offline_server_( + const share::ObServerInfoInTable &server_info_in_table, + const int64_t now, + const int64_t hb_responses_epoch_id); + int check_server_with_hb_response_( + const share::ObHBResponse &hb_response, + const share::ObServerInfoInTable &server_info_in_table, + const common::ObArray &zone_list, + const int64_t now, + const int64_t hb_responses_epoch_id, + share::ObServerHBInfo &server_hb_info); + int check_if_hb_response_can_be_processed_( + const share::ObHBResponse &hb_response, + const share::ObServerInfoInTable &server_info_in_table, + const common::ObArray &zone_list) const; + // a common func. 
for all servers to update server_hb_info + // if a server has hb_response, server_hb_info.server_health_status_ will be updated at + // check_server_with_hb_response() + int update_server_hb_info_( + const int64_t now, + const bool hb_response_exists, + share::ObServerHBInfo &server_hb_info); + int check_and_execute_start_or_stop_server_( + const share::ObHBResponse &hb_response, + const share::ObServerHBInfo &server_hb_info, + const share::ObServerInfoInTable &server_info_in_table); + int clear_deleted_servers_in_all_servers_hb_info_(); + + int update_table_for_server_with_hb_response_( + const share::ObHBResponse &hb_response, + const share::ObServerInfoInTable &server_info_in_table, + const int64_t hb_responses_epoch_id); + template + bool has_server_exist_in_array_( + const ObIArray &array, + const common::ObAddr &server, + int64_t &idx); + bool is_inited_; + common::ObMySQLProxy *sql_proxy_; + obrpc::ObSrvRpcProxy *srv_rpc_proxy_; + int64_t epoch_id_; // the leader epoch, only be updated when the ls becomes leader + int64_t whitelist_epoch_id_; // the leader epoch when we read prepare whitelist + int64_t hb_responses_epoch_id_; // It indicates that current hb_responses are based on which epoch of the whitelist + common::SpinRWLock hb_responses_rwlock_; // when we read/write hb_responses_, need_process_hb_responses_ + // and hb_responses_epoch_id_, we should use this lock + common::SpinRWLock all_servers_info_in_table_rwlock_; // when we read/write all_servers_info_in_table_ + // and whitelist_epoch_id_, we should use this lock + ObServerHBInfoMap all_servers_hb_info_; // only used in manage_heartbeat() + ObServerInfoInTableArray all_servers_info_in_table_; // whitelist, send_heartbeat() read it and manage_heartbeat() write it + common::ObArray inactive_zone_list_; + ObHBResponseArray hb_responses_; // send_heartbeat() write it and manage_heartbeat() read it + bool need_process_hb_responses_; // true if send rpc, and will be reset if responses are processed + 
bool need_update_server_tracer_; + bool is_rs_server_info_updated_; + static bool is_service_enabled_; +private: + DISALLOW_COPY_AND_ASSIGN(ObHeartbeatService); +}; // end class ObHeartbeatService +} // end namespace rootserver +} // end namespace oceanbase +#endif \ No newline at end of file diff --git a/src/rootserver/ob_index_builder.cpp b/src/rootserver/ob_index_builder.cpp index 5ce8c3c75..8c5398e91 100644 --- a/src/rootserver/ob_index_builder.cpp +++ b/src/rootserver/ob_index_builder.cpp @@ -30,7 +30,6 @@ #include "share/ob_index_builder_util.h" #include "observer/ob_server_struct.h" #include "sql/resolver/ddl/ob_ddl_resolver.h" -#include "ob_server_manager.h" #include "ob_zone_manager.h" #include "ob_ddl_service.h" #include "ob_root_service.h" diff --git a/src/rootserver/ob_index_builder.h b/src/rootserver/ob_index_builder.h index 4b4f3f136..92e18e387 100644 --- a/src/rootserver/ob_index_builder.h +++ b/src/rootserver/ob_index_builder.h @@ -44,7 +44,6 @@ class ObSrvRpcProxy; namespace rootserver { -class ObServerManager; class ObZoneManager; class ObDDLService; class ObDDLTaskRecord; diff --git a/src/rootserver/ob_lost_replica_checker.cpp b/src/rootserver/ob_lost_replica_checker.cpp index f17842773..1fc20ed0c 100644 --- a/src/rootserver/ob_lost_replica_checker.cpp +++ b/src/rootserver/ob_lost_replica_checker.cpp @@ -23,7 +23,7 @@ #include "share/schema/ob_schema_getter_guard.h" #include "share/ls/ob_ls_table_iterator.h"//ObTenantLSTableIterator #include "share/ls/ob_ls_info.h"//ObLSInfo -#include "rootserver/ob_server_manager.h" +#include "share/ob_all_server_tracer.h" #include "observer/ob_server_struct.h" #include "rootserver/ob_root_service.h" namespace oceanbase @@ -37,7 +37,6 @@ namespace rootserver ObLostReplicaChecker::ObLostReplicaChecker() : inited_(false), cond_(), - server_manager_(NULL), lst_operator_(NULL), schema_service_(NULL) { @@ -65,9 +64,7 @@ int ObLostReplicaChecker::check_cancel_() return ret; } -int 
ObLostReplicaChecker::init(ObServerManager &server_manager, - ObLSTableOperator &lst_operator, - ObMultiVersionSchemaService &schema_service) +int ObLostReplicaChecker::init(ObLSTableOperator &lst_operator, ObMultiVersionSchemaService &schema_service) { int ret = OB_SUCCESS; const int64_t thread_cnt = 1; @@ -79,7 +76,6 @@ int ObLostReplicaChecker::init(ObServerManager &server_manager, } else if (OB_FAIL(create(thread_cnt, "LostRepCheck"))) { LOG_WARN("create empty server checker thread failed", K(ret), K(thread_cnt)); } else { - server_manager_ = &server_manager; lst_operator_ = &lst_operator; schema_service_ = &schema_service; inited_ = true; @@ -293,21 +289,18 @@ int ObLostReplicaChecker::check_lost_server_(const ObAddr &server, bool &is_lost } else if (!server.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid server", K(server), K(ret)); - } else if (OB_ISNULL(server_manager_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server mgr is null", KR(ret), KP(server_manager_)); - } else if (!server_manager_->has_build()) { + } else if (!SVR_TRACER.has_build()) { is_lost_server = false; } else { - ObServerStatus status; - ret = server_manager_->get_server_status(server, status); + ObServerInfoInTable server_info; + ret = SVR_TRACER.get_server_info(server, server_info); if (OB_ENTRY_NOT_EXIST != ret && OB_SUCCESS != ret) { LOG_WARN("get_server_status failed", K(server), K(ret)); } else if (OB_ENTRY_NOT_EXIST == ret) { ret = OB_SUCCESS; is_lost_server = true; LOG_INFO("server not exist", K(server)); - } else if (status.is_permanent_offline()) { + } else if (server_info.is_permanent_offline()) { is_lost_server = true; } } diff --git a/src/rootserver/ob_lost_replica_checker.h b/src/rootserver/ob_lost_replica_checker.h index 3fb3d7fb8..8e3ffa2e7 100644 --- a/src/rootserver/ob_lost_replica_checker.h +++ b/src/rootserver/ob_lost_replica_checker.h @@ -34,7 +34,6 @@ class ObMultiVersionSchemaService; namespace rootserver { -class ObServerManager; class 
ObLostReplicaChecker : public ObRsReentrantThread { @@ -42,9 +41,7 @@ public: ObLostReplicaChecker(); virtual ~ObLostReplicaChecker(); - int init(ObServerManager &server_manager, - share::ObLSTableOperator &lst_operator, - share::schema::ObMultiVersionSchemaService &schema_service); + int init(share::ObLSTableOperator &lst_operator, share::schema::ObMultiVersionSchemaService &schema_service); int check_lost_replicas(); virtual void run3() override; virtual int blocking_run() { @@ -66,7 +63,6 @@ private: private: bool inited_; common::ObThreadCond cond_; - ObServerManager *server_manager_; share::ObLSTableOperator *lst_operator_; share::schema::ObMultiVersionSchemaService *schema_service_; private: diff --git a/src/rootserver/ob_migrate_unit_finish_checker.cpp b/src/rootserver/ob_migrate_unit_finish_checker.cpp index c88efe492..c1cf94ff8 100644 --- a/src/rootserver/ob_migrate_unit_finish_checker.cpp +++ b/src/rootserver/ob_migrate_unit_finish_checker.cpp @@ -19,7 +19,6 @@ #include "share/ls/ob_ls_status_operator.h" #include "share/ls/ob_ls_table_operator.h" #include "ob_unit_manager.h" -#include "ob_server_manager.h" #include "ob_zone_manager.h" using namespace oceanbase::common; @@ -29,7 +28,6 @@ using namespace oceanbase::rootserver; ObMigrateUnitFinishChecker::ObMigrateUnitFinishChecker(volatile bool &stop) : inited_(false), unit_mgr_(nullptr), - server_mgr_(nullptr), zone_mgr_(nullptr), schema_service_(nullptr), sql_proxy_(nullptr), @@ -54,7 +52,6 @@ int ObMigrateUnitFinishChecker::check_stop() const int ObMigrateUnitFinishChecker::init( ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, share::schema::ObMultiVersionSchemaService &schema_service, common::ObMySQLProxy &sql_proxy, @@ -66,7 +63,6 @@ int ObMigrateUnitFinishChecker::init( LOG_WARN("init twice", KR(ret)); } else { unit_mgr_ = &unit_mgr; - server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; schema_service_ = &schema_service; sql_proxy_ = &sql_proxy; @@ -211,7 +207,6 @@ int 
ObMigrateUnitFinishChecker::try_check_migrate_unit_finish_by_tenant( LOG_INFO("try check migrate unit finish by tenant", K(tenant_id)); DRLSInfo dr_ls_info(gen_user_tenant_id(tenant_id), unit_mgr_, - server_mgr_, zone_mgr_, schema_service_); common::ObArray ls_status_info_array; diff --git a/src/rootserver/ob_migrate_unit_finish_checker.h b/src/rootserver/ob_migrate_unit_finish_checker.h index b1da832e8..cf9a98170 100644 --- a/src/rootserver/ob_migrate_unit_finish_checker.h +++ b/src/rootserver/ob_migrate_unit_finish_checker.h @@ -33,7 +33,6 @@ class ObLSTableOperator; namespace rootserver { class ObUnitManager; -class ObServerManager; class ObZoneManager; class DRLSInfo; @@ -45,7 +44,6 @@ public: public: int init( ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, share::schema::ObMultiVersionSchemaService &schema_service, common::ObMySQLProxy &sql_proxy, @@ -72,7 +70,6 @@ private: // data members bool inited_; ObUnitManager *unit_mgr_; - ObServerManager *server_mgr_; ObZoneManager *zone_mgr_; share::schema::ObMultiVersionSchemaService *schema_service_; common::ObMySQLProxy *sql_proxy_; diff --git a/src/rootserver/ob_root_balancer.cpp b/src/rootserver/ob_root_balancer.cpp index a934c6955..393656167 100644 --- a/src/rootserver/ob_root_balancer.cpp +++ b/src/rootserver/ob_root_balancer.cpp @@ -81,11 +81,11 @@ int ObRootBalancer::init(common::ObServerConfig &cfg, } else if (OB_FAIL(create(root_balancer_thread_cnt, "RootBalance"))) { LOG_WARN("create root balancer thread failed", K(ret), K(root_balancer_thread_cnt)); } else if (OB_FAIL(disaster_recovery_worker_.init( - self_addr, cfg, unit_mgr, server_mgr, zone_mgr, + self_addr, cfg, unit_mgr, zone_mgr, dr_task_mgr, *GCTX.lst_operator_, schema_service, rpc_proxy, sql_proxy))) { LOG_WARN("fail to init disaster recovery worker", KR(ret)); } else if (OB_FAIL(rootservice_util_checker_.init( - unit_mgr, server_mgr, zone_mgr, *GCTX.rs_rpc_proxy_, self_addr, schema_service, sql_proxy, 
*GCTX.lst_operator_))) { + unit_mgr, zone_mgr, *GCTX.rs_rpc_proxy_, self_addr, schema_service, sql_proxy, *GCTX.lst_operator_))) { LOG_WARN("fail to init rootservice util checker", KR(ret)); } else { inited_ = true; diff --git a/src/rootserver/ob_root_inspection.h b/src/rootserver/ob_root_inspection.h index 50def8fb6..7ee78c037 100644 --- a/src/rootserver/ob_root_inspection.h +++ b/src/rootserver/ob_root_inspection.h @@ -47,7 +47,6 @@ namespace rootserver class ObZoneManager; class ObRootService; class ObFetchPrimaryDDLOperator; -class ObServerManager; // Interface of all the inspection task class ObInspectionTask diff --git a/src/rootserver/ob_root_minor_freeze.cpp b/src/rootserver/ob_root_minor_freeze.cpp index bbc25edf7..bf5a0442e 100644 --- a/src/rootserver/ob_root_minor_freeze.cpp +++ b/src/rootserver/ob_root_minor_freeze.cpp @@ -16,9 +16,9 @@ #include "share/ob_srv_rpc_proxy.h" #include "share/location_cache/ob_location_service.h" +#include "share/ob_all_server_tracer.h" #include "lib/container/ob_se_array.h" #include "rootserver/ddl_task/ob_ddl_scheduler.h" -#include "rootserver/ob_server_manager.h" #include "rootserver/ob_unit_manager.h" #include "rootserver/ob_rs_async_rpc_proxy.h" @@ -35,7 +35,6 @@ ObRootMinorFreeze::ObRootMinorFreeze() :inited_(false), stopped_(false), rpc_proxy_(NULL), - server_manager_(NULL), unit_manager_(NULL) { } @@ -49,7 +48,6 @@ ObRootMinorFreeze::~ObRootMinorFreeze() } int ObRootMinorFreeze::init(ObSrvRpcProxy &rpc_proxy, - ObServerManager &server_manager, ObUnitManager &unit_manager) { int ret = OB_SUCCESS; @@ -58,7 +56,6 @@ int ObRootMinorFreeze::init(ObSrvRpcProxy &rpc_proxy, LOG_WARN("init twice", K(ret)); } else { rpc_proxy_ = &rpc_proxy; - server_manager_ = &server_manager; unit_manager_ = &unit_manager; stopped_ = false; inited_ = true; @@ -105,7 +102,7 @@ bool ObRootMinorFreeze::is_server_alive(const ObAddr &server) const bool is_alive = false; if (OB_LIKELY(server.is_valid())) { - if 
(OB_FAIL(server_manager_->check_server_alive(server, is_alive))) { + if (OB_FAIL(SVR_TRACER.check_server_alive(server, is_alive))) { LOG_WARN("fail to check whether server is alive, ", K(server), K(ret)); is_alive = false; } @@ -242,13 +239,10 @@ int ObRootMinorFreeze::is_server_belongs_to_zone(const ObAddr &addr, int ret = OB_SUCCESS; ObZone server_zone; - if (OB_ISNULL(server_manager_)) { - ret = OB_NOT_INIT; - LOG_WARN("server_manager_ is NULL", K(ret)); - } else if (0 == zone.size()) { + if (0 == zone.size()) { server_in_zone = true; - } else if (OB_FAIL(server_manager_->get_server_zone(addr, server_zone))) { - LOG_WARN("fail to get server zone", K(ret)); + } else if (OB_FAIL(SVR_TRACER.get_server_zone(addr, server_zone))) { + LOG_WARN("fail to get server zone", KR(ret), K(addr)); } else if (server_zone == zone) { server_in_zone = true; } else { @@ -362,8 +356,8 @@ int ObRootMinorFreeze::init_params_by_zone(const ObZone &zone, if (OB_UNLIKELY(0 == zone.size())) { ret = OB_ERR_UNEXPECTED; } else { - if (OB_FAIL(server_manager_->get_servers_of_zone(zone, target_server_list))) { - LOG_WARN("fail to get tenant server list, ", K(ret)); + if (OB_FAIL(SVR_TRACER.get_servers_of_zone(zone, target_server_list))) { + LOG_WARN("fail to get tenant server list, ", KR(ret), K(zone)); } else if (0 == target_server_list.count()) { ret = OB_ZONE_NOT_ACTIVE; LOG_WARN("empty zone or invalid", K(zone), K(ret)); @@ -398,8 +392,8 @@ int ObRootMinorFreeze::init_params_by_server(const ObIArray &server_list ObSEArray target_server_list; // get all alive server - if (OB_FAIL(server_manager_->get_alive_servers(zone, target_server_list))) { - LOG_WARN("fail to get alive servers, ", K(ret)); + if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, target_server_list))) { + LOG_WARN("fail to get alive servers, ", KR(ret), K(zone)); } else { for (int i = 0; i < target_server_list.count() && OB_SUCC(ret); ++i) { if (OB_FAIL(params.push_back_param(target_server_list.at(i)))) { diff --git 
a/src/rootserver/ob_root_minor_freeze.h b/src/rootserver/ob_root_minor_freeze.h index c05360f29..7934b396d 100644 --- a/src/rootserver/ob_root_minor_freeze.h +++ b/src/rootserver/ob_root_minor_freeze.h @@ -31,7 +31,6 @@ class ObSrvRpcProxy; namespace rootserver { -class ObServerManager; class ObUnitManager; class ObRootMinorFreeze @@ -41,7 +40,6 @@ public: virtual ~ObRootMinorFreeze(); int init(obrpc::ObSrvRpcProxy &rpc_proxy, - ObServerManager &server_manager, ObUnitManager &unit_manager); void start(); void stop(); @@ -106,7 +104,6 @@ private: bool inited_; bool stopped_; obrpc::ObSrvRpcProxy *rpc_proxy_; - ObServerManager *server_manager_; ObUnitManager *unit_manager_; }; diff --git a/src/rootserver/ob_root_service.cpp b/src/rootserver/ob_root_service.cpp index c59e9aa48..c26a1e400 100644 --- a/src/rootserver/ob_root_service.cpp +++ b/src/rootserver/ob_root_service.cpp @@ -99,6 +99,7 @@ #include "share/scn.h" #include "logservice/palf_handle_guard.h" #include "logservice/ob_log_service.h" +#include "rootserver/ob_heartbeat_service.h" namespace oceanbase { @@ -233,6 +234,7 @@ int ObRootService::ObStartStopServerTask::process() if (!server_.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(server_)); + // still use server manager here, since this func. 
will be called only in version < 4.2 } else if (OB_FAIL(root_service_.get_server_mgr().is_server_exist(server_, exist))) { LOG_WARN("fail to check server exist", KR(ret), K(server_)); } else if (!exist) { @@ -643,11 +645,12 @@ ObRootService::ObRootService() server_manager_(), hb_checker_(), server_checker_(), rs_list_change_cb_(*this), + server_zone_op_service_(), root_minor_freeze_(), lst_operator_(NULL), zone_manager_(), ddl_service_(), unit_manager_(server_manager_, zone_manager_), root_balancer_(), empty_server_checker_(), lost_replica_checker_(), thread_checker_(), - vtable_location_getter_(server_manager_, unit_manager_), + vtable_location_getter_(unit_manager_), addr_agent_(NULL), root_inspection_(), upgrade_executor_(), upgrade_storage_format_executor_(), create_inner_schema_executor_(), @@ -754,34 +757,34 @@ int ObRootService::fake_init(ObServerConfig &config, // init ddl service if (OB_SUCC(ret)) { if (OB_FAIL(ddl_service_.init(rpc_proxy_, common_proxy_, sql_proxy_, *schema_service, - lst_operator, server_manager_, zone_manager_, unit_manager_, + lst_operator, zone_manager_, unit_manager_, snapshot_manager_))) { LOG_WARN("ddl_service_ init failed", K(ret)); } } if (OB_SUCC(ret)) { - if (OB_FAIL(backup_task_scheduler_.init(&server_manager_, &zone_manager_, &rpc_proxy_, &backup_service_, sql_proxy_, backup_lease_service_))) { + if (OB_FAIL(backup_task_scheduler_.init(&zone_manager_, &rpc_proxy_, &backup_service_, sql_proxy_, backup_lease_service_))) { LOG_WARN("fail to init backup task scheduler", K(ret)); } } if (OB_SUCC(ret)) { - if (OB_FAIL(backup_service_.init(server_manager_, sql_proxy_, rpc_proxy_, *schema_service_, backup_lease_service_, backup_task_scheduler_))) { + if (OB_FAIL(backup_service_.init(sql_proxy_, rpc_proxy_, *schema_service_, backup_lease_service_, backup_task_scheduler_))) { LOG_WARN("fail to init backup service", K(ret)); } } if (OB_SUCC(ret)) { if (OB_FAIL(dbms_job::ObDBMSJobMaster::get_instance() - .init(&server_manager_, 
&sql_proxy_, schema_service_))) { + .init(&sql_proxy_, schema_service_))) { LOG_WARN("failed to init ObDBMSJobMaster", K(ret)); } } if (OB_SUCC(ret)) { if (OB_FAIL(dbms_scheduler::ObDBMSSchedJobMaster::get_instance() - .init(&server_manager_, &unit_manager_, &sql_proxy_, schema_service_))) { + .init(&unit_manager_, &sql_proxy_, schema_service_))) { LOG_WARN("failed to init ObDBMSSchedJobMaster", K(ret)); } } @@ -866,15 +869,23 @@ int ObRootService::init(ObServerConfig &config, unit_manager_, zone_manager_, config, self, rpc_proxy_))) { // init server management related FLOG_WARN("init server manager failed", KR(ret)); + } else if (OB_FAIL(server_zone_op_service_.init( + server_change_callback_, + rpc_proxy_, + lst_operator, + unit_manager_, + sql_proxy_ + ))) { + FLOG_WARN("init server zone op service failed", KR(ret)); } else if (OB_FAIL(hb_checker_.init(server_manager_))) { FLOG_WARN("init heartbeat checker failed", KR(ret)); } else if (OB_FAIL(server_checker_.init(server_manager_, self))) { FLOG_WARN("init server checker failed", KR(ret), K(self)); - } else if (OB_FAIL(root_minor_freeze_.init(rpc_proxy_, server_manager_, unit_manager_))) { + } else if (OB_FAIL(root_minor_freeze_.init(rpc_proxy_, unit_manager_))) { // init root minor freeze FLOG_WARN("init root_minor_freeze_ failed", KR(ret)); } else if (OB_FAIL(ddl_service_.init(rpc_proxy_, common_proxy_, sql_proxy_, *schema_service, - lst_operator, server_manager_, zone_manager_, unit_manager_, + lst_operator, zone_manager_, unit_manager_, snapshot_manager_))) { // init ddl service FLOG_WARN("init ddl_service_ failed", KR(ret)); @@ -895,9 +906,14 @@ int ObRootService::init(ObServerConfig &config, FLOG_WARN("init create inner role executor failed", KR(ret)); } else if (OB_FAIL(thread_checker_.init())) { FLOG_WARN("init thread checker failed", KR(ret)); - } else if (OB_FAIL(empty_server_checker_.init(server_manager_, unit_manager_, *lst_operator_, *schema_service_))) { + } else if 
(OB_FAIL(empty_server_checker_.init( + server_manager_, + unit_manager_, + *lst_operator_, + *schema_service_, + server_zone_op_service_))) { FLOG_WARN("init empty server checker failed", KR(ret)); - } else if (OB_FAIL(lost_replica_checker_.init(server_manager_, *lst_operator_, *schema_service_))) { + } else if (OB_FAIL(lost_replica_checker_.init(*lst_operator_, *schema_service_))) { FLOG_WARN("init empty server checker failed", KR(ret)); } else if (OB_FAIL(root_balancer_.init(*config_, *schema_service_, unit_manager_, server_manager_, zone_manager_, rpc_proxy_, @@ -914,43 +930,38 @@ int ObRootService::init(ObServerConfig &config, } else if (OB_FAIL(backup_lease_service_.init(self_addr_, sql_proxy))) { // backup lease service需要rs切走以后将lease的leader addr重置调,所以不能用sql_proxy_ FLOG_WARN("init backup lease service failed", KR(ret)); - } else if (OB_FAIL(backup_task_scheduler_.init(&server_manager_, &zone_manager_, &rpc_proxy_, + } else if (OB_FAIL(backup_task_scheduler_.init(&zone_manager_, &rpc_proxy_, &backup_service_, sql_proxy_, backup_lease_service_))) { FLOG_WARN("init backup task scheduler failed", KR(ret)); - } else if (OB_FAIL(backup_service_.init(server_manager_, sql_proxy_, rpc_proxy_, *schema_service_, + } else if (OB_FAIL(backup_service_.init(sql_proxy_, rpc_proxy_, *schema_service_, backup_lease_service_, backup_task_scheduler_))) { FLOG_WARN("init backup service failed", KR(ret)); } else if (OB_FAIL(backup_lease_service_.register_mgr(backup_service_))) { FLOG_WARN("register log backup mgr failed", KR(ret)); - } else if (OB_FAIL(archive_service_.init(server_manager_, zone_manager_, unit_manager_, schema_service_, + } else if (OB_FAIL(archive_service_.init(zone_manager_, unit_manager_, schema_service_, rpc_proxy_, sql_proxy_, backup_lease_service_))) { FLOG_WARN("init archive_service_ failed", KR(ret)); } else if (OB_FAIL(backup_lease_service_.register_scheduler(archive_service_))) { FLOG_WARN("register archive_service_ failed", KR(ret)); } else if 
(OB_FAIL(schema_history_recycler_.init(*schema_service_, zone_manager_, - sql_proxy_, - server_manager_))) { + sql_proxy_))) { FLOG_WARN("fail to init schema history recycler failed", KR(ret)); } else if (OB_FAIL(backup_lease_service_.start())) { FLOG_WARN("start backup lease task failed", KR(ret)); - } else if (OB_FAIL(dbms_job::ObDBMSJobMaster::get_instance().init(&server_manager_, - &sql_proxy_, + } else if (OB_FAIL(dbms_job::ObDBMSJobMaster::get_instance().init(&sql_proxy_, schema_service_))) { FLOG_WARN("init ObDBMSJobMaster failed", KR(ret)); - } else if (OB_FAIL(dbms_scheduler::ObDBMSSchedJobMaster::get_instance().init(&server_manager_, - &unit_manager_, + } else if (OB_FAIL(dbms_scheduler::ObDBMSSchedJobMaster::get_instance().init(&unit_manager_, &sql_proxy_, schema_service_))) { FLOG_WARN("init ObDBMSSchedJobMaster failed", KR(ret)); } else if (OB_FAIL(disaster_recovery_task_executor_.init(lst_operator, - rpc_proxy_, - server_manager_))) { + rpc_proxy_))) { FLOG_WARN("init disaster recovery task executor failed", KR(ret)); } else if (OB_FAIL(disaster_recovery_task_mgr_.init(self, *config_, disaster_recovery_task_executor_, - &server_manager_, &rpc_proxy_, &sql_proxy_, schema_service_))) { @@ -1389,9 +1400,11 @@ int ObRootService::submit_update_all_server_task(const ObAddr &server) LOG_WARN("invalid server", K(server), K(ret)); } else { const bool with_rootserver = (server == self_addr_); - ObAllServerTask task(server_manager_, disaster_recovery_task_mgr_, server, with_rootserver); - if (OB_FAIL(task_queue_.add_async_task(task))) { - LOG_WARN("inner queue push task failed", K(ret)); + if (!ObHeartbeatService::is_service_enabled()) { + ObAllServerTask task(server_manager_, disaster_recovery_task_mgr_, server, with_rootserver); + if (OB_FAIL(task_queue_.add_async_task(task))) { + LOG_WARN("inner queue push task failed", K(ret)); + } } } @@ -1529,10 +1542,13 @@ int ObRootService::schedule_check_server_timer_task() if (!inited_) { ret = OB_NOT_INIT; 
LOG_WARN("not init", K(ret)); - } else if (OB_FAIL(task_queue_.add_timer_task(check_server_task_, - config_->server_check_interval, true))) { - LOG_WARN("failed to add check_server task", K(ret)); - } else {} + } else if (!ObHeartbeatService::is_service_enabled()) { + if (OB_FAIL(task_queue_.add_timer_task(check_server_task_, config_->server_check_interval, true))) { + LOG_WARN("failed to add check_server task", K(ret)); + } + } else { + LOG_TRACE("no need to schedule ObCheckServerTask in version >= 4.2"); + } return ret; } @@ -1673,8 +1689,7 @@ int ObRootService::submit_update_rslist_task(const bool force_update) if (ObUpdateRsListTask::try_lock()) { bool task_added = false; ObUpdateRsListTask task; - if (OB_FAIL(task.init(*lst_operator_, addr_agent_, - server_manager_, zone_manager_, + if (OB_FAIL(task.init(*lst_operator_, addr_agent_, zone_manager_, broadcast_rs_list_lock_, force_update, self_addr_))) { LOG_WARN("task init failed", KR(ret)); @@ -1769,7 +1784,7 @@ int ObRootService::update_rslist() ObTimeoutCtx ctx; ctx.set_timeout(config_->rpc_timeout); const bool force_update = true; - if (OB_FAIL(task.init(*lst_operator_, addr_agent_, server_manager_, + if (OB_FAIL(task.init(*lst_operator_, addr_agent_, zone_manager_, broadcast_rs_list_lock_, force_update, self_addr_))) { LOG_WARN("task init failed", K(ret), K(force_update)); } else if (OB_FAIL(task.process_without_lock())) { @@ -1799,7 +1814,7 @@ int ObRootService::update_all_server_and_rslist() if (OB_SUCC(ret)) { ObArray servers; ObZone empty_zone; // empty zone for all servers - if (OB_FAIL(server_manager_.get_servers_of_zone(empty_zone, servers))) { + if (OB_FAIL(SVR_TRACER.get_servers_of_zone(empty_zone, servers))) { LOG_WARN("get server list failed", K(ret)); } else { FOREACH_X(s, servers, OB_SUCC(ret)) { @@ -1987,7 +2002,7 @@ int ObRootService::execute_bootstrap(const obrpc::ObBootstrapArg &arg) FLOG_INFO("[ROOTSERVICE_NOTICE] success to get lock for bootstrap in execute_bootstrap"); ObBootstrap 
bootstrap(rpc_proxy_, *lst_operator_, ddl_service_, unit_manager_, *config_, arg, common_proxy_); - if (OB_FAIL(bootstrap.execute_bootstrap())) { + if (OB_FAIL(bootstrap.execute_bootstrap(server_zone_op_service_))) { LOG_ERROR("failed to execute_bootstrap", K(server_list), K(ret)); } @@ -2194,8 +2209,7 @@ void ObRootService::construct_lease_expire_time( lease_response.lease_expire_time_ = lease_response.heartbeat_expire_time_; } -int ObRootService::renew_lease(const ObLeaseRequest &lease_request, - ObLeaseResponse &lease_response) +int ObRootService::renew_lease(const ObLeaseRequest &lease_request, ObLeaseResponse &lease_response) { int ret = OB_SUCCESS; ObServerStatus server_stat; @@ -2222,13 +2236,16 @@ int ObRootService::renew_lease(const ObLeaseRequest &lease_request, if (OB_FAIL(zone_manager_.get_lease_info_version(lease_info_version))) { LOG_WARN("get_lease_info_version failed", K(ret)); } else if (OB_FAIL(server_manager_.get_server_status( - lease_request.server_, server_stat))) { + lease_request.server_, server_stat))) { + // get server_stat for construct_lease_expire_time only! 
LOG_WARN("get server status failed", K(ret), "server", lease_request.server_); - } else if (OB_FAIL(server_manager_.is_server_stopped(lease_request.server_, is_stopped))) { - LOG_WARN("check_server_stopped failed", K(ret), "server", lease_request.server_); + } + if (!ObHeartbeatService::is_service_enabled()) { + if (FAILEDx(server_manager_.is_server_stopped(lease_request.server_, is_stopped))) { + LOG_WARN("check_server_stopped failed", KR(ret), "server", lease_request.server_); + } } } - if (OB_SUCC(ret)) { lease_response.version_ = ObLeaseResponse::LEASE_VERSION; construct_lease_expire_time(lease_request, lease_response, server_stat); @@ -2339,25 +2356,6 @@ int ObRootService::fetch_location( return ret; } -int ObRootService::try_block_server(int rc, const common::ObAddr &server) -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (rc > 0 || rc <= -OB_MAX_ERROR_CODE || !server.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(rc), K(server), K(ret)); - } else if (OB_SERVER_MIGRATE_IN_DENIED == rc - || OB_TOO_MANY_PARTITIONS_ERROR == rc) { - LOG_INFO("receive server deny migrate in, try to block server migrate in", K(server)); - if (OB_FAIL(server_manager_.block_migrate_in(server))) { - LOG_WARN("block migrate in failed", K(ret), K(server)); - } - } - return ret; -} - //////////////////////////////////////////////////////////////// int ObRootService::create_resource_unit(const obrpc::ObCreateResourceUnitArg &arg) @@ -6899,43 +6897,54 @@ int ObRootService::do_tablespace_ddl(const obrpc::ObTablespaceDDLArg &arg) return ret; } - -int ObRootService::construct_rs_list_arg( - ObRsListArg &rs_list_arg) -{ - int ret = OB_SUCCESS; - ObLSInfo ls_info; - int64_t cluster_id = GCONF.cluster_id; - uint64_t tenant_id = OB_SYS_TENANT_ID; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("not init", KR(ret)); - } else if (OB_FAIL(get_lst_operator().get( - GCONF.cluster_id, - tenant_id, 
- SYS_LS, - share::ObLSTable::DEFAULT_MODE, - ls_info))) { - LOG_WARN("fail to get", KR(ret)); - } else { - rs_list_arg.master_rs_ = GCONF.self_addr_; - FOREACH_CNT_X(replica, ls_info.get_replicas(), OB_SUCCESS == ret) { - if (replica->get_server() == GCONF.self_addr_ - || (replica->is_in_service() - && ObReplicaTypeCheck::is_paxos_replica_V2(replica->get_replica_type()))) { - if (OB_FAIL(rs_list_arg.rs_list_.push_back(replica->get_server()))) { - LOG_WARN("fail to push back", KR(ret)); - } - } - } - } - return ret; -} - //////////////////////////////////////////////////////////////// // server & zone management //////////////////////////////////////////////////////////////// +int ObRootService::add_server_for_bootstrap_in_version_smaller_than_4_2_0( + const common::ObAddr &server, + const common::ObZone &zone) +{ + return server_manager_.add_server(server, zone); +} int ObRootService::add_server(const obrpc::ObAdminServerArg &arg) +{ + int ret = OB_SUCCESS; + ObTimeoutCtx ctx; + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (!arg.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arg", KR(ret), K(arg)); + } else if (OB_FAIL(rootserver::ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else {} + if (OB_SUCC(ret)) { + if (!ObHeartbeatService::is_service_enabled()) { // the old logic + LOG_INFO("sys tenant data version < 4.2, add_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(old_add_server(arg))) { + LOG_WARN("fail to add server by using old logic", KR(ret), K(arg)); + } + } else { // the new logic + LOG_INFO("sys tenant data version >= 4.2, add_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(server_zone_op_service_.add_servers(arg.servers_, arg.zone_))) { + LOG_WARN("fail to add servers", KR(ret), K(arg)); + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(load_server_manager())) { + // ** FIXME 
(linqiucen.lqc): temp. solution. + // ** This will be removed if we do not need whitelist in server_manager + LOG_WARN("fail to load server_manager, please try 'ALTER SYSTEM RELOAD SERVER;'", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } + } + } + FLOG_INFO("add server", KR(ret), K(arg)); + return ret; +} +int ObRootService::old_add_server(const obrpc::ObAdminServerArg &arg) { int ret = OB_SUCCESS; uint64_t sys_data_version = 0; @@ -6956,7 +6965,6 @@ int ObRootService::add_server(const obrpc::ObAdminServerArg &arg) dp_arg.single_zone_deployment_on_ = OB_FILE_SYSTEM_ROUTER.is_single_zone_deployment_on(); for (int64_t i = 0; OB_SUCC(ret) && i < arg.servers_.count(); ++i) { - const int64_t rpc_timeout = THIS_WORKER.get_timeout_ts() - ObTimeUtility::current_time(); const ObAddr &addr = arg.servers_[i]; Bool is_empty(false); Bool is_deployment_mode_match(false); @@ -6987,8 +6995,50 @@ int ObRootService::add_server(const obrpc::ObAdminServerArg &arg) ROOTSERVICE_EVENT_ADD("root_service", "add_server", K(ret), K(arg)); return ret; } - int ObRootService::delete_server(const obrpc::ObAdminServerArg &arg) +{ + int ret = OB_SUCCESS; + ObTimeoutCtx ctx; + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (!arg.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arg", KR(ret), K(arg)); + } else if (OB_FAIL(rootserver::ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else {} + if (OB_SUCC(ret)) { + if (!ObHeartbeatService::is_service_enabled()) { // the old logic + LOG_INFO("sys tenant data version < 4.2, delete_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(old_delete_server(arg))) { + LOG_WARN("fail to delete server by using the old logic", KR(ret), K(arg)); + } + } else { // the new logic + LOG_INFO("sys tenant data version >= 4.2, delete_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if 
(OB_FAIL(server_zone_op_service_.delete_servers(arg.servers_, arg.zone_))) { + LOG_WARN("fail to delete servers", KR(ret), K(arg)); + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(load_server_manager())) { + // ** FIXME (linqiucen.lqc): temp. solution. + // ** This will be removed if we do not need whitelist in server_manager + LOG_WARN("fail to load server_manager, please try 'ALTER SYSTEM RELOAD SERVER;'", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } else { + root_balancer_.wakeup(); + empty_server_checker_.wakeup(); + lost_replica_checker_.wakeup(); + LOG_INFO("delete server and load server manager successfully", K(arg)); + } + } + } + FLOG_INFO("delete server", KR(ret), K(arg)); + return ret; +} +int ObRootService::old_delete_server(const obrpc::ObAdminServerArg &arg) { int ret = OB_SUCCESS; bool has_enough = false; @@ -7000,7 +7050,7 @@ int ObRootService::delete_server(const obrpc::ObAdminServerArg &arg) LOG_WARN("invalid arg", K(arg), K(ret)); } else if (OB_FAIL(check_server_have_enough_resource_for_delete_server(arg.servers_, arg.zone_))) { LOG_WARN("not enough resource, cannot delete servers", K(ret), K(arg)); - } else if (OB_FAIL(check_all_ls_has_leader_("delete server"))) { + } else if (OB_FAIL(check_all_ls_has_leader("delete server"))) { LOG_WARN("fail to check all ls has leader", KR(ret), K(arg)); } else if (OB_FAIL(server_manager_.delete_server(arg.servers_, arg.zone_))) { LOG_WARN("delete_server failed", "servers", arg.servers_, "zone", arg.zone_, K(ret)); @@ -7025,6 +7075,7 @@ int ObRootService::check_server_have_enough_resource_for_delete_server( } else { FOREACH_CNT_X(server, servers, OB_SUCC(ret)) { if (zone.is_empty()) { + // still use server manager here, since this func. 
will be called only in version < 4.2 if (OB_FAIL(server_manager_.get_server_zone(*server, tmp_zone))) { LOG_WARN("fail to get server zone", K(ret)); } @@ -7047,8 +7098,48 @@ int ObRootService::check_server_have_enough_resource_for_delete_server( return ret; } - int ObRootService::cancel_delete_server(const obrpc::ObAdminServerArg &arg) +{ + int ret = OB_SUCCESS; + ObTimeoutCtx ctx; + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (!arg.is_valid()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arg", KR(ret), K(arg)); + } else if (OB_FAIL(rootserver::ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else {} + if (OB_SUCC(ret)) { + if (!ObHeartbeatService::is_service_enabled()) { // the old logic + LOG_INFO("sys tenant data version < 4.2, cancel_delete_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(old_cancel_delete_server(arg))) { + LOG_WARN("fail to cancel delete server by using the old logic", KR(ret), K(arg)); + } + } else { // the new logic + LOG_INFO("sys tenant data version >= 4.2, cancel_delete_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(server_zone_op_service_.cancel_delete_servers(arg.servers_, arg.zone_))) { + LOG_WARN("fail to cancel delete servers", KR(ret), K(arg)); + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(load_server_manager())) { + // ** FIXME (linqiucen.lqc): temp. solution. + // ** This will be removed if we do not need whitelist in server_manager + LOG_WARN("fail to load server_manager, please try 'ALTER SYSTEM RELOAD SERVER;'", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? 
tmp_ret : ret; + } else { + root_balancer_.wakeup(); + } + } + } + FLOG_INFO("cancel delete server", KR(ret), K(arg)); + return ret; +} + +int ObRootService::old_cancel_delete_server(const obrpc::ObAdminServerArg &arg) { int ret = OB_SUCCESS; if (!inited_) { @@ -7060,15 +7151,6 @@ int ObRootService::cancel_delete_server(const obrpc::ObAdminServerArg &arg) } else { for (int64_t i = 0; OB_SUCC(ret) && i < arg.servers_.count(); ++i) { const bool commit = false; - const bool force_stop_hb = false; - int tmp_ret = OB_SUCCESS; - // resume heardbeat - if (OB_FAIL(server_manager_.set_force_stop_hb(arg.servers_[i], force_stop_hb))) { - LOG_WARN("set force stop hb failed", K(ret), "server", arg.servers_[i], K(force_stop_hb)); - } else if (OB_SUCCESS != (tmp_ret = request_heartbeats())) { - LOG_WARN("request heartbeats failed", K(ret)); - } - if (OB_FAIL(ret)) { } else if (OB_FAIL(server_manager_.end_delete_server(arg.servers_[i], arg.zone_, commit))) { LOG_WARN("delete_server failed", "server", arg.servers_[i], @@ -7086,140 +7168,82 @@ int ObRootService::cancel_delete_server(const obrpc::ObAdminServerArg &arg) int ObRootService::start_server(const obrpc::ObAdminServerArg &arg) { int ret = OB_SUCCESS; - if (!inited_) { + ObTimeoutCtx ctx; + if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); + LOG_WARN("not init", KR(ret), K(inited_)); } else if (!arg.is_valid()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arg", K(arg), K(ret)); - } else if (OB_FAIL(server_manager_.start_server_list(arg.servers_, arg.zone_))) { - LOG_WARN("start servers failed", "server", arg.servers_, "zone", arg.zone_, K(ret)); - } - ROOTSERVICE_EVENT_ADD("root_service", "start_server", K(ret), K(arg)); - return ret; -} - -int ObRootService::get_readwrite_servers( - const common::ObIArray &input_servers, - common::ObIArray &readwrite_servers) -{ - int ret = OB_SUCCESS; - readwrite_servers.reset(); - for (int64_t i = 0; OB_SUCC(ret) && i < input_servers.count(); ++i) { - 
const ObAddr &server = input_servers.at(i); - HEAP_VAR(ObZoneInfo, zone_info) { - if (OB_FAIL(server_manager_.get_server_zone(server, zone_info.zone_))) { - LOG_WARN("fail to get server zone", K(ret)); - } else if (OB_FAIL(zone_manager_.get_zone(zone_info))) { - LOG_WARN("fail to get zone", K(ret)); - } else { - ObZoneType zone_type = static_cast(zone_info.zone_type_.value_); - if (common::ZONE_TYPE_READWRITE == zone_type - || common::ZONE_TYPE_ENCRYPTION == zone_type) { - if (OB_FAIL(readwrite_servers.push_back(server))) { - LOG_WARN("fail to push back", K(ret)); - } else {} // no more to do - } else if (common::ZONE_TYPE_READONLY == zone_type) { - // ignore read-only zone - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid zone type", K(ret), K(zone_type), K(server), "zone", zone_info.zone_); - } + LOG_WARN("invalid arg", KR(ret), K(arg)); + } else if (OB_FAIL(rootserver::ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else {} + if (OB_SUCC(ret)) { + if (!ObHeartbeatService::is_service_enabled()) { // the old logic + LOG_INFO("sys tenant data version < 4.2, start_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(server_manager_.start_server_list(arg.servers_, arg.zone_))) { + LOG_WARN("fail to start server by using old logic", KR(ret), K(arg)); + } + } else { // the new logic + LOG_INFO("sys tenant data version >= 4.2, start_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(server_zone_op_service_.start_servers(arg.servers_, arg.zone_))) { + LOG_WARN("fail to start servers", KR(ret), K(arg)); + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(load_server_manager())) { + // ** FIXME (linqiucen.lqc): temp. solution. + // ** This will be removed if we do not need whitelist in server_manager + LOG_WARN("fail to load server_manager, please try 'ALTER SYSTEM RELOAD SERVER;'", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? 
tmp_ret : ret; } } } - return ret; -} - -int ObRootService::check_zone_and_server(const ObIArray &servers, - bool &is_same_zone, - bool &is_all_stopped) -{ - int ret = OB_SUCCESS; - ObZone zone; - is_same_zone = true; - is_all_stopped = true; - ObServerStatus server_status; - for (int64_t i = 0; i < servers.count() && OB_SUCC(ret) && is_same_zone && is_all_stopped; i++) { - if (OB_FAIL(server_manager_.get_server_status(servers.at(i), server_status))) { - LOG_WARN("fail to get server zone", K(ret), K(servers), K(i)); - } else if (i == 0) { - zone = server_status.zone_; - } else if (zone != server_status.zone_) { - is_same_zone = false; - LOG_WARN("server zone not same", K(zone), K(server_status), K(servers)); - } - if (OB_FAIL(ret)) { - } else if (server_status.is_stopped()) { - //nothing todo - } else { - is_all_stopped = false; - } - } + FLOG_INFO("start server", KR(ret), K(arg)); return ret; } int ObRootService::stop_server(const obrpc::ObAdminServerArg &arg) { int ret = OB_SUCCESS; - common::ObArray readwrite_servers; - ObZone zone; - bool is_same_zone = false; - bool is_all_stopped = false; - if (!inited_) { + ObTimeoutCtx ctx; + if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (!arg.is_valid()) { + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (OB_UNLIKELY(!arg.is_valid())) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arg", K(arg), K(ret)); - } else if (OB_FAIL(get_readwrite_servers(arg.servers_, readwrite_servers))) { - LOG_WARN("fail to get readwrite servers", K(ret)); - } else if (readwrite_servers.count() <= 0) { - // no need to do check, stop servers directly - } else if (OB_FAIL(check_zone_and_server(arg.servers_, is_same_zone, is_all_stopped))) { - LOG_WARN("fail to check stop server zone", K(ret), K(arg.servers_)); - } else if (is_all_stopped) { - //nothing todo - } else if (!is_same_zone) { - ret = OB_STOP_SERVER_IN_MULTIPLE_ZONES; - LOG_WARN("can not stop servers in multiple zones", K(ret)); - } 
else if (OB_FAIL(server_manager_.get_server_zone(readwrite_servers.at(0), zone))) { - LOG_WARN("fail to get server zone", K(ret), K(readwrite_servers)); + LOG_WARN("invalid arg", KR(ret), K(arg)); + } else if (OB_FAIL(rootserver::ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); } else { - if (ObAdminServerArg::ISOLATE == arg.op_) { - //"Isolate server" does not need to check the total number and status of replicas; it cannot be restarted later; - if (OB_FAIL(check_can_stop(zone, arg.servers_, false /*is_stop_zone*/))) { - LOG_WARN("fail to check can stop", KR(ret), K(zone), K(arg)); - if (OB_OP_NOT_ALLOW == ret) { - LOG_USER_ERROR(OB_OP_NOT_ALLOW, "Stop all servers in primary region is"); - } + if (!ObHeartbeatService::is_service_enabled()) { // the old logic + LOG_INFO("sys tenant data version < 4.2, stop_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(server_zone_op_service_.stop_server_precheck(arg.servers_, arg.op_))) { + LOG_WARN("fail to precheck stop server", KR(ret), K(arg)); + } else if (OB_FAIL(server_manager_.stop_server_list(arg.servers_, arg.zone_))) { + LOG_WARN("stop server failed", "server", arg.servers_, "zone", arg.zone_, KR(ret)); } } else { - if (have_other_stop_task(zone)) { - ret = OB_STOP_SERVER_IN_MULTIPLE_ZONES; - LOG_WARN("can not stop servers in multiple zones", KR(ret), K(arg), K(zone)); - LOG_USER_ERROR(OB_STOP_SERVER_IN_MULTIPLE_ZONES, - "cannot stop server or stop zone in multiple zones"); - } else if (OB_FAIL(check_majority_and_log_in_sync_( - readwrite_servers, - arg.force_stop_,/*skip_log_sync_check*/ - "stop server"))) { - LOG_WARN("fail to check majority and log in-sync", KR(ret), K(arg)); + LOG_INFO("sys tenant data version >= 4.2, stop_server", K(arg), + "timeout_ts", ctx.get_timeout()); + if (OB_FAIL(server_zone_op_service_.stop_servers(arg.servers_, arg.zone_, arg.op_))) { + LOG_WARN("stop server failed", KR(ret), K(arg)); } - } - if (OB_SUCC(ret)) { - 
if (OB_FAIL(server_manager_.stop_server_list(arg.servers_, arg.zone_))) { - LOG_WARN("stop server failed", "server", arg.servers_, "zone", arg.zone_, K(ret)); - } else { - LOG_INFO("stop server ok", K(arg)); - int tmp_ret = OB_SUCCESS; - if (OB_TMP_FAIL(try_notify_switch_leader(obrpc::ObNotifySwitchLeaderArg::STOP_SERVER))) { - LOG_WARN("failed to notify switch leader", KR(ret), KR(tmp_ret)); - } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(load_server_manager())) { + // ** FIXME (linqiucen.lqc): temp. solution. + // ** This will be removed if we do not need whitelist in server_manager + LOG_WARN("fail to load server_manager, please try 'ALTER SYSTEM RELOAD SERVER;'", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } + if (OB_TMP_FAIL(try_notify_switch_leader(obrpc::ObNotifySwitchLeaderArg::STOP_SERVER))) { + LOG_WARN("failed to notify switch leader", KR(ret), KR(tmp_ret)); } } } - ROOTSERVICE_EVENT_ADD("root_service", "stop_server", K(ret), K(arg)); + FLOG_INFO("stop server", KR(ret), K(arg)); return ret; } @@ -7256,8 +7280,14 @@ int ObRootService::delete_zone(const obrpc::ObAdminZoneArg &arg) // it does not matter while add server after check. 
int64_t alive_count = 0; int64_t not_alive_count = 0; - if (OB_FAIL(server_manager_.get_server_count(arg.zone_, alive_count, not_alive_count))) { - LOG_WARN("failed to get server count of the zone", K(ret), "zone", arg.zone_); + ObArray servers_info; + if (OB_ISNULL(GCTX.sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.sql_proxy_ is null", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObServerTableOperator::get(*GCTX.sql_proxy_, servers_info))) { + LOG_WARN("fail to get servers_info", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObRootUtils::get_server_count(servers_info, arg.zone_, alive_count, not_alive_count))) { + LOG_WARN("failed to get server count of the zone", KR(ret), K(arg.zone_), K(servers_info)); } else { LOG_INFO("current server count of zone", "zone", arg.zone_, K(alive_count), K(not_alive_count)); @@ -7309,14 +7339,12 @@ int ObRootService::check_can_stop(const ObZone &zone, ObArray to_stop_list; ObArray stopped_zone_list; ObArray stopped_server_list; - + ObArray servers_info_in_table; if ((!is_stop_zone && (0 == servers.count() || zone.is_empty())) || (is_stop_zone && zone.is_empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(servers), K(zone)); - } else if (OB_FAIL(ObRootUtils::get_stopped_zone_list(zone_manager_, - server_manager_, - stopped_zone_list, + } else if (OB_FAIL(ObRootUtils::get_stopped_zone_list(stopped_zone_list, stopped_server_list))) { LOG_WARN("fail to get stopped zone list", KR(ret)); } else if (0 >= stopped_server_list.count()) { @@ -7326,7 +7354,12 @@ int ObRootService::check_can_stop(const ObZone &zone, if (OB_FAIL(to_stop_list.assign(servers))) { LOG_WARN("fail to push back", KR(ret), K(servers)); } - } else if (OB_FAIL(server_manager_.get_servers_of_zone(zone, to_stop_list))) { + } else if (OB_ISNULL(GCTX.sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.sql_proxy_ is null", KR(ret), KP(GCTX.sql_proxy_)); + } else if 
(OB_FAIL(ObServerTableOperator::get(*GCTX.sql_proxy_, servers_info_in_table))) { + LOG_WARN("fail to get servers_info_in_table", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObRootUtils::get_servers_of_zone(servers_info_in_table, zone, to_stop_list))) { LOG_WARN("fail to get servers of zone", KR(ret), K(zone)); } ObArray tenant_ids; @@ -7360,41 +7393,6 @@ int ObRootService::check_can_stop(const ObZone &zone, return ret; } -//Multiple stop tasks are allowed on a zone; they cannot cross zones; -bool ObRootService::have_other_stop_task(const ObZone &zone) -{ - bool bret = false; - int ret = OB_SUCCESS; - ObArray zone_infos; - bool stopped = false; - //Check if there are other servers in the stopped state on other zones - if (OB_FAIL(server_manager_.check_other_zone_stopped(zone, stopped))) { - LOG_WARN("fail to check other zone stopped", KR(ret), K(zone)); - bret = true; - } else if (stopped) { - bret = true; - LOG_WARN("have other server stop in other zone", K(bret), K(zone)); - } else if (OB_FAIL(zone_manager_.get_zone(zone_infos))) { - LOG_WARN("fail to get zone", KR(ret), K(zone)); - bret = true; - } else { - //Check whether other zones are in the stopped state - FOREACH_CNT_X(zone_info, zone_infos, OB_SUCC(ret) && !bret) { - if (OB_ISNULL(zone_info)) { - ret = OB_ERR_UNEXPECTED; - bret = true; - LOG_WARN("zone info is null", KR(ret), K(zone_infos)); - } else if (zone_info->status_ != ObZoneStatus::ACTIVE - && zone != zone_info->zone_) { - bret = true; - LOG_WARN("have other zone in inactive status", K(bret), K(zone), - "other_zone", zone_info->zone_); - } - } - } - return bret; -} - int ObRootService::stop_zone(const obrpc::ObAdminZoneArg &arg) { int ret = OB_SUCCESS; @@ -7427,7 +7425,7 @@ int ObRootService::stop_zone(const obrpc::ObAdminZoneArg &arg) } } else { //stop zone/force stop zone - if (have_other_stop_task(arg.zone_)) { + if (ObRootUtils::have_other_stop_task(arg.zone_)) { ret = OB_STOP_SERVER_IN_MULTIPLE_ZONES; LOG_WARN("cannot stop zone when 
other stop task already exist", KR(ret), K(arg)); LOG_USER_ERROR(OB_STOP_SERVER_IN_MULTIPLE_ZONES, @@ -7447,11 +7445,17 @@ int ObRootService::stop_zone(const obrpc::ObAdminZoneArg &arg) } else if (common::ZONE_TYPE_READWRITE == zone_type || common::ZONE_TYPE_ENCRYPTION == zone_type) { ObArray server_list; - if (OB_FAIL(server_manager_.get_servers_of_zone(arg.zone_, server_list))) { - LOG_WARN("get servers of zone failed", K(ret), "zone", arg.zone_); + ObArray servers_info; + if (OB_ISNULL(GCTX.sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.sql_proxy_ is null", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObServerTableOperator::get(*GCTX.sql_proxy_, servers_info))) { + LOG_WARN("fail to get servers_info", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObRootUtils::get_servers_of_zone(servers_info, arg.zone_, server_list))) { + LOG_WARN("get servers of zone failed", KR(ret), K(arg.zone_), K(servers_info)); } else if (server_list.count() <= 0) { //do not need to check anyting while zone is empty - } else if (OB_FAIL(check_majority_and_log_in_sync_( + } else if (OB_FAIL(check_majority_and_log_in_sync( server_list, arg.force_stop_,/*skip_log_sync_check*/ "stop zone"))) { @@ -7533,7 +7537,7 @@ int ObRootService::try_notify_switch_leader(const obrpc::ObNotifySwitchLeaderArg if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_FAIL(server_manager_.get_alive_servers(zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, server_list))) { LOG_WARN("failed to get server list", KR(ret), K(zone)); } else if (OB_FAIL(arg.init(OB_INVALID_TENANT_ID, ObLSID(), ObAddr(), comment))) { LOG_WARN("failed to init switch leader arg", KR(ret), K(comment)); @@ -8314,28 +8318,6 @@ int ObRootService::upgrade_table_schema(const obrpc::ObUpgradeTableSchemaArg &ar return ret; } -int ObRootService::merge_finish(const obrpc::ObMergeFinishArg &arg) -{ - int ret = OB_SUCCESS; - LOG_INFO("receive merge finish", 
K(arg)); - bool zone_merged = false; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (!arg.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid arg", K(arg), K(ret)); - } else if (OB_FAIL(server_manager_.update_merged_version( - arg.server_, arg.frozen_version_, zone_merged))) { - } else { - if (zone_merged) { - LOG_INFO("zone merged, wakeup daily merge thread"); - // daily_merge_scheduler_.wakeup(); - } - } - return ret; -} - int ObRootService::broadcast_ds_action(const obrpc::ObDebugSyncActionArg &arg) { LOG_INFO("receive broadcast debug sync actions", K(arg)); @@ -8348,7 +8330,7 @@ int ObRootService::broadcast_ds_action(const obrpc::ObDebugSyncActionArg &arg) } else if (!arg.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid arg", K(arg), K(ret)); - } else if (OB_FAIL(server_manager_.get_alive_servers(all_zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(all_zone, server_list))) { LOG_WARN("get all alive servers failed", K(all_zone), K(ret)); } else { FOREACH_X(s, server_list, OB_SUCCESS == ret) { @@ -8385,52 +8367,13 @@ int ObRootService::fetch_alive_server(const ObFetchAliveServerArg &arg, ret = OB_ERR_UNEXPECTED; LOG_WARN("cluster id mismatch", K(ret), K(arg), "cluster_id", static_cast(config_->cluster_id)); - } else if (OB_FAIL(server_manager_.get_servers_by_status(empty_zone, result.active_server_list_, + } else if (OB_FAIL(SVR_TRACER.get_servers_by_status(empty_zone, result.active_server_list_, result.inactive_server_list_))) { LOG_WARN("get alive servers failed", K(ret)); } return ret; } -int ObRootService::fetch_active_server_status(const ObFetchAliveServerArg &arg, - ObFetchActiveServerAddrResult &result) -{ - LOG_DEBUG("receive fetch alive server request"); - ObZone empty_zone; // for all server - ObArray server_status_list; - int ret = OB_SUCCESS; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("not init", KR(ret)); - } else if (OB_UNLIKELY(!arg.is_valid()) - || 
OB_ISNULL(config_)) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), K(arg), KP(config_)); - } else if (arg.cluster_id_ != config_->cluster_id) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("cluster id mismatch", - K(ret), K(arg), "cluster_id", static_cast(config_->cluster_id)); - } else if (OB_FAIL(server_manager_.get_server_statuses( - empty_zone, server_status_list))) { - LOG_WARN("get alive servers failed", KR(ret)); - } else { - share::ObAliveServerTracer::ServerAddr addr; - for (int64_t i = 0; OB_SUCC(ret) && i < server_status_list.count(); ++i) { - share::ObServerStatus &status = server_status_list.at(i); - if (status.is_active()) { - //get active server - addr.reset(); - if (OB_FAIL(addr.init(status.server_, status.sql_port_))) { - LOG_WARN("failed to init server addr", KR(ret), K(status)); - } else if (OB_FAIL(result.server_addr_list_.push_back(addr))) { - LOG_WARN("failed to push back", KR(ret), K(i), K(addr), K(server_status_list)); - } - } - } - } - return ret; -} - int ObRootService::refresh_server(const bool load_frozen_status, const bool need_retry) { int ret = OB_SUCCESS; @@ -8450,9 +8393,12 @@ int ObRootService::refresh_server(const bool load_frozen_status, const bool need } else { LOG_INFO("build server manager succeed", K(load_frozen_status)); } + if (FAILEDx(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh all server tracer", KR(ret)); + } } // request heartbeats from observers - if (OB_SUCC(ret)) { + if (OB_SUCC(ret) && !ObHeartbeatService::is_service_enabled()) { int temp_ret = OB_SUCCESS; if (OB_SUCCESS != (temp_ret = request_heartbeats())) { LOG_WARN("request heartbeats failed", K(temp_ret)); @@ -8623,7 +8569,7 @@ int ObRootService::report_replica() if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_FAIL(server_manager_.get_alive_servers(null_zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(null_zone, server_list))) { LOG_WARN("fail to get alive server", K(ret)); } 
else { FOREACH_CNT(server, server_list) { @@ -8650,7 +8596,7 @@ int ObRootService::report_single_replica( if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); - } else if (OB_FAIL(server_manager_.get_alive_servers(null_zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(null_zone, server_list))) { LOG_WARN("fail to get alive server", KR(ret)); } else if (OB_INVALID_TENANT_ID == tenant_id || !ls_id.is_valid()) { ret = OB_INVALID_ARGUMENT; @@ -8682,7 +8628,7 @@ int ObRootService::update_all_server_config() if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_FAIL(server_manager_.get_servers_of_zone(empty_zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_servers_of_zone(empty_zone, server_list))) { LOG_WARN("fail to get server", K(ret)); } else if (OB_FAIL(all_server_config.name_.assign(config_->all_server_list.name()))) { LOG_WARN("fail to assign name", K(ret)); @@ -8965,7 +8911,7 @@ int ObRootService::update_stat_cache(const obrpc::ObUpdateStatCacheArg &arg) if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_FAIL(server_manager_.get_alive_servers(null_zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(null_zone, server_list))) { LOG_WARN("fail to get alive server", K(ret)); } else { for (int64_t i = 0; OB_SUCC(ret) && i < server_list.count(); i++) { @@ -9403,38 +9349,6 @@ int ObRootService::generate_user(const ObClusterRole &cluster_role, return ret; } - - -int ObRootService::check_merge_finish(const obrpc::ObCheckMergeFinishArg &arg) -{ - int ret = OB_SUCCESS; - LOG_INFO("receive check_merge_finish request", K(arg)); - SCN last_merged_scn = SCN::min_scn(); - share::ObSimpleFrozenStatus frozen_status; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (!arg.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("arg is invalid", K(ret), K(arg)); - } else if 
(/*OB_FAIL(zone_manager_.get_global_last_merged_scn(last_merged_scn))*/false) { - LOG_WARN("fail to get last merged version", K(ret)); - } /*else if (OB_FAIL(freeze_info_manager_.get_freeze_info(0, frozen_status))) { - LOG_WARN("fail to get freeze info", K(ret)); - } */ else if (frozen_status.frozen_scn_ != last_merged_scn) { - ret = OB_OP_NOT_ALLOW; - LOG_WARN("can't alter column when major freeze is not finished", K(ret)); - } else if (arg.frozen_scn_ != last_merged_scn) { - ret = OB_OP_NOT_ALLOW; - LOG_WARN("frozen_version is not new enough", K(ret), K(arg), - K(last_merged_scn), K(frozen_status)); - } else if (OB_FAIL(ddl_service_.check_all_server_frozen_scn(last_merged_scn))) { - LOG_WARN("fail to check all servers's frozen version", K(ret), K(last_merged_scn)); - } - LOG_INFO("check_merge_finish finish", K(ret), K(last_merged_scn), K(frozen_status), K(arg)); - return ret; -} - int ObRootService::get_recycle_schema_versions( const obrpc::ObGetRecycleSchemaVersionsArg &arg, obrpc::ObGetRecycleSchemaVersionsResult &result) @@ -9891,8 +9805,8 @@ int ObRootService::flush_opt_stat_monitoring_info(const obrpc::ObFlushOptStatArg if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_FAIL(server_manager_.get_alive_servers(empty_zone, server_list))) { - LOG_WARN("fail to get alive server", K(ret)); + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(empty_zone, server_list))) { + LOG_WARN("fail to get alive server", KR(ret)); } else { for (int64_t i = 0; OB_SUCC(ret) && i < server_list.count(); ++i) { if (OB_FAIL(rpc_proxy_.to(server_list.at(i)).flush_local_opt_stat_monitoring_info(arg))) { @@ -10036,7 +9950,7 @@ int ObRootService::cancel_ddl_task(const ObCancelDDLTaskArg &arg) return ret; } -int ObRootService::check_majority_and_log_in_sync_( +int ObRootService::check_majority_and_log_in_sync( const ObIArray &to_stop_servers, const bool skip_log_sync_check, const char *print_str) @@ -10071,8 +9985,6 @@ int 
ObRootService::check_majority_and_log_in_sync_( ob_usleep(CHECK_RETRY_INTERVAL); } if (OB_FAIL(ls_status_op.check_all_ls_has_majority_and_log_sync( - zone_manager_, - server_manager_, to_stop_servers, skip_log_sync_check, print_str, @@ -10090,7 +10002,7 @@ int ObRootService::check_majority_and_log_in_sync_( return ret; } -int ObRootService::check_all_ls_has_leader_(const char *print_str) +int ObRootService::check_all_ls_has_leader(const char *print_str) { int ret = OB_SUCCESS; ObLSStatusOperator ls_status_op; diff --git a/src/rootserver/ob_root_service.h b/src/rootserver/ob_root_service.h index 0038e7100..3c768b1c7 100644 --- a/src/rootserver/ob_root_service.h +++ b/src/rootserver/ob_root_service.h @@ -60,6 +60,7 @@ #include "rootserver/ob_disaster_recovery_task_executor.h" #include "rootserver/ob_empty_server_checker.h" #include "rootserver/ob_lost_replica_checker.h" +#include "rootserver/ob_server_zone_op_service.h" namespace oceanbase { @@ -455,9 +456,6 @@ public: int fetch_location(const obrpc::ObFetchLocationArg &arg, obrpc::ObFetchLocationResult &res); int merge_finish(const obrpc::ObMergeFinishArg &arg); - - int try_block_server(int rc, const common::ObAddr &server); - // 4.0 backup // balance over int receive_backup_over(const obrpc::ObBackupTaskRes &res); @@ -467,8 +465,6 @@ public: int check_dangling_replica_finish(const obrpc::ObCheckDanglingReplicaFinishArg &arg); int fetch_alive_server(const obrpc::ObFetchAliveServerArg &arg, obrpc::ObFetchAliveServerResult &result); - int fetch_active_server_status(const obrpc::ObFetchAliveServerArg &arg, - obrpc::ObFetchActiveServerAddrResult &result); int get_tenant_schema_versions(const obrpc::ObGetSchemaArg &arg, obrpc::ObTenantSchemaVersions &tenant_schema_versions); @@ -650,18 +646,39 @@ public: //----End of functions for managing row level security---- // server related + int load_server_manager(); + ObStatusChangeCallback &get_status_change_cb() { return status_change_cb_; } int add_server(const 
obrpc::ObAdminServerArg &arg); + int add_server_for_bootstrap_in_version_smaller_than_4_2_0( + const common::ObAddr &server, + const common::ObZone &zone); int delete_server(const obrpc::ObAdminServerArg &arg); int cancel_delete_server(const obrpc::ObAdminServerArg &arg); int start_server(const obrpc::ObAdminServerArg &arg); int stop_server(const obrpc::ObAdminServerArg &arg); - + // Check if all ls has leader + // @param [in] print_str: string of operation. Used to print LOG_USER_ERROR "'print_str' not allowed". + int check_all_ls_has_leader(const char *print_str); // zone related int add_zone(const obrpc::ObAdminZoneArg &arg); int delete_zone(const obrpc::ObAdminZoneArg &arg); int start_zone(const obrpc::ObAdminZoneArg &arg); int stop_zone(const obrpc::ObAdminZoneArg &arg); int alter_zone(const obrpc::ObAdminZoneArg &arg); + int check_can_stop( + const common::ObZone &zone, + const common::ObIArray &servers, + const bool is_stop_zone); + // Check if all ls has leader, enough member and if log is in sync. + // @param [in] to_stop_servers: server_list to be stopped. + // @param [in] skip_log_sync_check: whether skip log_sync check. + // @param [in] print_str: string of operation. Used to print LOG_USER_ERROR "'print_str' not allowed". + // @return: OB_SUCCESS if all check is passed. + // OB_OP_NOT_ALLOW if ls doesn't have leader/enough member or ls' log is not in sync. + int check_majority_and_log_in_sync( + const ObIArray &to_stop_servers, + const bool skip_log_sync_check, + const char *print_str); // system admin command (alter system ...) 
int admin_switch_replica_role(const obrpc::ObAdminSwitchReplicaRoleArg &arg); @@ -760,8 +777,6 @@ public: int broadcast_schema(const obrpc::ObBroadcastSchemaArg &arg); ObDDLService &get_ddl_service() { return ddl_service_; } ObDDLScheduler &get_ddl_scheduler() { return ddl_scheduler_; } - - int check_merge_finish(const obrpc::ObCheckMergeFinishArg &arg); int get_recycle_schema_versions( const obrpc::ObGetRecycleSchemaVersionsArg &arg, obrpc::ObGetRecycleSchemaVersionsResult &result); @@ -794,23 +809,15 @@ private: int refresh_server(const bool fast_recover, const bool need_retry); int refresh_schema(const bool fast_recover); int init_sequence_id(); - int load_server_manager(); int start_timer_tasks(); int stop_timer_tasks(); int request_heartbeats(); int self_check(); int update_all_server_and_rslist(); - int check_zone_and_server(const ObIArray &servers, bool &is_same_zone, bool &is_all_stopped); - int check_can_stop(const common::ObZone &zone, - const common::ObIArray &servers, - const bool is_stop_zone); - bool have_other_stop_task(const ObZone &zone); int init_sys_admin_ctx(ObSystemAdminCtx &ctx); int set_cluster_version(); bool is_replica_count_reach_rs_limit(int64_t replica_count) { return replica_count > OB_MAX_CLUSTER_REPLICA_COUNT; } int update_all_server_config(); - int get_readwrite_servers(const common::ObIArray &input_servers, - common::ObIArray &readwrite_servers); int generate_table_schema_in_tenant_space( const obrpc::ObCreateTableArg &arg, share::schema::ObTableSchema &table_schema); @@ -856,27 +863,14 @@ private: const share::ObLeaseRequest &lease_request, share::ObLeaseResponse &lease_response, const share::ObServerStatus &server_status); - - // Check if all ls has leader, enough member and if log is in sync. - // @param [in] to_stop_servers: server_list to be stopped. - // @param [in] skip_log_sync_check: whether skip log_sync check. - // @param [in] print_str: string of operation. Used to print LOG_USER_ERROR "'print_str' not allowed". 
- // @return: OB_SUCCESS if all check is passed. - // OB_OP_NOT_ALLOW if ls doesn't have leader/enough member or ls' log is not in sync. - int check_majority_and_log_in_sync_( - const ObIArray &to_stop_servers, - const bool skip_log_sync_check, - const char *print_str); - // Check if all ls has leader - // @param [in] print_str: string of operation. Used to print LOG_USER_ERROR "'print_str' not allowed". - int check_all_ls_has_leader_(const char *print_str); void update_cpu_quota_concurrency_in_memory_(); int set_cpu_quota_concurrency_config_(); int try_notify_switch_leader(const obrpc::ObNotifySwitchLeaderArg::SwitchLeaderComment &comment); private: - int construct_rs_list_arg(obrpc::ObRsListArg &rs_list_arg); int precheck_interval_part(const obrpc::ObAlterTableArg &arg); - + int old_add_server(const obrpc::ObAdminServerArg &arg); + int old_delete_server(const obrpc::ObAdminServerArg &arg); + int old_cancel_delete_server(const obrpc::ObAdminServerArg &arg); private: static const int64_t OB_MAX_CLUSTER_REPLICA_COUNT = 10000000; static const int64_t OB_ROOT_SERVICE_START_FAIL_COUNT_UPPER_LIMIT = 5; @@ -904,6 +898,7 @@ private: ObHeartbeatChecker hb_checker_; ObAllServerChecker server_checker_; RsListChangeCb rs_list_change_cb_; + ObServerZoneOpService server_zone_op_service_; // minor freeze ObRootMinorFreeze root_minor_freeze_; diff --git a/src/rootserver/ob_root_utils.cpp b/src/rootserver/ob_root_utils.cpp index 12be6e55a..c3d2c4d48 100644 --- a/src/rootserver/ob_root_utils.cpp +++ b/src/rootserver/ob_root_utils.cpp @@ -18,6 +18,7 @@ #include "lib/json/ob_json.h" #include "lib/string/ob_sql_string.h" #include "lib/hash/ob_hashset.h" +#include "lib/mysqlclient/ob_mysql_result.h" #include "share/ob_rpc_struct.h" #include "share/ob_share_util.h" #include "share/ob_common_rpc_proxy.h" @@ -35,6 +36,8 @@ #include "logservice/ob_log_service.h" #include "share/system_variable/ob_system_variable_alias.h" #include "share/ob_primary_zone_util.h" // ObPrimaryZoneUtil 
+#include "share/ob_server_table_operator.h" +#include "share/ob_zone_table_operation.h" using namespace oceanbase::rootserver; using namespace oceanbase::share; @@ -1629,30 +1632,26 @@ int ObRootUtils::get_rs_default_timeout_ctx(ObTimeoutCtx &ctx) } //get all observer that is stopeed, start_service_time<=0 and lease expire -int ObRootUtils::get_invalid_server_list(const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, - ObIArray &invalid_server_list) +int ObRootUtils::get_invalid_server_list( + const ObIArray &servers_info, + ObIArray &invalid_server_list) { int ret = OB_SUCCESS; invalid_server_list.reset(); ObArray stopped_server_list; ObArray stopped_zone_list; - ObArray server_list; ObZone empty_zone; - if (OB_FAIL(get_stopped_zone_list(zone_mgr, server_mgr, stopped_zone_list, - stopped_server_list))) { + if (OB_FAIL(get_stopped_zone_list(stopped_zone_list, stopped_server_list))) { LOG_WARN("fail to get stopped zone list", KR(ret)); } else if (OB_FAIL(invalid_server_list.assign(stopped_server_list))) { LOG_WARN("fail to assign array", KR(ret), K(stopped_zone_list)); - } else if (OB_FAIL(server_mgr.get_server_statuses(empty_zone, server_list))) { - LOG_WARN("fail to get servers of zone", KR(ret)); } else { - for (int64_t i = 0; i < server_list.count() && OB_SUCC(ret); i++) { - const ObServerStatus &status = server_list.at(i); - if ((!status.is_alive() || !status.in_service()) - && !has_exist_in_array(invalid_server_list, status.server_)) { - if (OB_FAIL(invalid_server_list.push_back(status.server_))) { - LOG_WARN("fail to push back", KR(ret), K(status)); + for (int64_t i = 0; i < servers_info.count() && OB_SUCC(ret); i++) { + const ObServerInfoInTable &server_info = servers_info.at(i); + if ((!server_info.is_alive() || !server_info.in_service()) + && !has_exist_in_array(invalid_server_list, server_info.get_server())) { + if (OB_FAIL(invalid_server_list.push_back(server_info.get_server()))) { + LOG_WARN("fail to push back", KR(ret), K(server_info)); 
} } } @@ -1660,72 +1659,248 @@ int ObRootUtils::get_invalid_server_list(const ObZoneManager &zone_mgr, return ret; } -int ObRootUtils::get_stopped_zone_list(const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, - ObIArray &stopped_zone_list, - ObIArray &stopped_server_list) +int ObRootUtils::find_server_info( + const ObIArray &servers_info, + const common::ObAddr &server, + share::ObServerInfoInTable &server_info) { int ret = OB_SUCCESS; - ObServerManager::ObServerStatusArray server_array; - ObZone empty_zone; - if (OB_FAIL(server_mgr.get_server_statuses(empty_zone, server_array))) { - LOG_WARN("fail to get server status", KR(ret)); + bool server_exists = false; + server_info.reset(); + if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server", KR(ret), K(server)); } else { - for (int64_t i = 0; i < server_array.count() && OB_SUCC(ret); i++) { - if (!server_array.at(i).is_stopped()) { - //nothing todo - } else { - if (has_exist_in_array(stopped_zone_list, server_array.at(i).zone_)) { - //nothing todo - } else if (OB_FAIL(stopped_zone_list.push_back(server_array.at(i).zone_))) { - LOG_WARN("fail to push back", KR(ret), "zone", server_array.at(i).zone_); - } - if (OB_FAIL(ret)) { - } else if (has_exist_in_array(stopped_server_list, server_array.at(i).server_)) { - //nothing todo - } else if (OB_FAIL(stopped_server_list.push_back(server_array.at(i).server_))) { - LOG_WARN("fail to push back", KR(ret), "server", server_array.at(i).server_); + for (int64_t i = 0; OB_SUCC(ret) && !server_exists && i < servers_info.count(); i++) { + const ObServerInfoInTable & server_info_i = servers_info.at(i); + if (OB_UNLIKELY(!server_info_i.is_valid())) { + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("server_info_i is not valid", KR(ret), K(server_info_i)); + } else if (server == server_info_i.get_server()) { + server_exists = true; + if (OB_FAIL(server_info.assign(server_info_i))) { + LOG_WARN("fail to assign server_info", KR(ret), 
K(server_info_i)); } } } } - LOG_INFO("get stop observer", KR(ret), K(stopped_zone_list), K(stopped_server_list)); - //get stopped zone; - ObArray zone_infos; - if (OB_FAIL(ret)) { - } else if (OB_FAIL(zone_mgr.get_zone(zone_infos))) { - LOG_WARN("fail to get zone", K(ret)); + if (OB_SUCC(ret) && !server_exists) { + ret = OB_ENTRY_NOT_EXIST; + LOG_WARN("server not exists", KR(ret), K(server)); + } + return ret; +} + +int ObRootUtils::get_servers_of_zone( + const ObIArray &servers_info, + const common::ObZone &zone, + ObIArray &servers, + bool only_active_servers) +{ + int ret = OB_SUCCESS; + servers.reset(); + if (OB_UNLIKELY(zone.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid zone", KR(ret), K(zone)); } else { - for (int64_t i = 0; i < zone_infos.count() && OB_SUCC(ret); i++) { - if (ObZoneStatus::ACTIVE == zone_infos.at(i).status_) { - //nothing todo + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); i++) { + const ObServerInfoInTable &server_info = servers_info.at(i); + if (OB_UNLIKELY(!server_info.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server_info", KR(ret), K(server_info)); + } else if (zone != server_info.get_zone() || (only_active_servers && !server_info.is_active())) { + // do nothing + } else if (OB_FAIL(servers.push_back(server_info.get_server()))) { + LOG_WARN("fail to push an element into servers", KR(ret), K(server_info)); + } + } + } + return ret; +} +int ObRootUtils::get_server_count( + const ObIArray &servers_info, + const ObZone &zone, + int64_t &alive_count, + int64_t ¬_alive_count) +{ + int ret = OB_SUCCESS; + alive_count = 0; + not_alive_count = 0; + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); ++i) { + const ObServerInfoInTable &server_info = servers_info.at(i); + if (server_info.get_zone() == zone || zone.is_empty()) { + if (server_info.is_alive()) { + ++alive_count; } else { - if (has_exist_in_array(stopped_zone_list, zone_infos.at(i).zone_)) { - //nothing todo - } else 
if (OB_FAIL(stopped_zone_list.push_back(zone_infos.at(i).zone_))) { - LOG_WARN("fail to push back", KR(ret)); - } - ObArray server_list; - if (OB_FAIL(ret)) { - } else if (OB_FAIL(server_mgr.get_servers_of_zone(zone_infos.at(i).zone_, - server_list))) { - LOG_WARN("fail to get server of zone", KR(ret), K(i), "zone", zone_infos.at(i).zone_); - } else { - for (int64_t j = 0; j < server_list.count() && OB_SUCC(ret); j++) { - if (has_exist_in_array(stopped_server_list, server_list.at(j))) { - //nothing todo - } else if (OB_FAIL(stopped_server_list.push_back(server_list.at(j)))) { - LOG_WARN("fail to push back", KR(ret), K(j)); + ++not_alive_count; + } + } + } + return ret; +} +int ObRootUtils::check_server_alive( + const ObIArray &servers_info, + const ObAddr &server, + bool &is_alive) +{ + int ret = OB_SUCCESS; + is_alive = false; + ObServerInfoInTable server_info; + if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server", KR(ret), K(server)); + } else if (OB_FAIL(find_server_info(servers_info, server, server_info))) { + LOG_WARN("fail to find server_info", KR(ret), K(servers_info), K(server)); + } else { + is_alive = server_info.is_alive(); + } + return ret; +} +int ObRootUtils::get_server_resource_info( + const ObIArray &server_resources_info, + const ObAddr &server, + share::ObServerResourceInfo &resource_info) +{ + int ret = OB_SUCCESS; + bool server_exists = false; + resource_info.reset(); + if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server", KR(ret), K(server)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && !server_exists && i < server_resources_info.count(); i++) { + const obrpc::ObGetServerResourceInfoResult &server_resource_info_i = server_resources_info.at(i); + if (OB_UNLIKELY(!server_resource_info_i.is_valid())){ + ret = OB_ERR_UNEXPECTED; + LOG_ERROR("server_resource_info_i is not valid", KR(ret), K(server_resource_info_i)); + } else if (server == 
server_resource_info_i.get_server()) { + server_exists = true; + resource_info = server_resource_info_i.get_resource_info(); + } + } + } + if (OB_SUCC(ret) && !server_exists) { + ret = OB_ENTRY_NOT_EXIST; + LOG_WARN("server not exists", KR(ret), K(server)); + } + return ret; +} + +int ObRootUtils::get_stopped_zone_list( + ObIArray &stopped_zone_list, + ObIArray &stopped_server_list) +{ + int ret = OB_SUCCESS; + ObSqlString sql; + ObTimeoutCtx ctx; + stopped_zone_list.reset(); + stopped_server_list.reset(); + if (OB_ISNULL(GCTX.sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.sql_proxy_ is null", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", K(ret), K(ctx)); + } else if (OB_FAIL(sql.assign_fmt("SELECT s.svr_ip, s.svr_port, s.zone " + "FROM %s AS s JOIN (SELECT zone, info FROM %s WHERE name = 'status') AS z " + "ON s.zone = z.zone WHERE s.stop_time > 0 OR z.info = 'INACTIVE'", + OB_ALL_SERVER_TNAME, OB_ALL_ZONE_TNAME))) { + LOG_WARN("fail to append sql", KR(ret)); + } else if (OB_FAIL(ObZoneTableOperation::get_inactive_zone_list(*GCTX.sql_proxy_, stopped_zone_list))) { + LOG_WARN("fail to get inactive zone_list", KR(ret), KP(GCTX.sql_proxy_)); + } else { + SMART_VAR(ObMySQLProxy::MySQLResult, res) { + int tmp_ret = OB_SUCCESS; + ObMySQLResult *result = NULL; + if (OB_FAIL(GCTX.sql_proxy_->read(res, OB_SYS_TENANT_ID, sql.ptr()))) { + LOG_WARN("fail to execute sql", KR(ret), K(sql)); + } else if (OB_ISNULL(result = res.get_result())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("fail to get sql result", KR(ret), K(sql)); + } else { + ObZone zone; + ObAddr server; + ObString tmp_zone; + ObString svr_ip; + while (OB_SUCC(ret)) { + if (OB_FAIL(result->next())) { + if (OB_ITER_END != ret) { + LOG_WARN("result next failed", KR(ret)); + } else { + ret = OB_SUCCESS; + break; + } + } else { + int64_t svr_port = 0; + server.reset(); + zone.reset(); + svr_ip.reset(); + 
tmp_zone.reset(); + EXTRACT_VARCHAR_FIELD_MYSQL(*result, "svr_ip", svr_ip); + EXTRACT_INT_FIELD_MYSQL(*result, "svr_port", svr_port, int64_t); + EXTRACT_VARCHAR_FIELD_MYSQL(*result, "zone", tmp_zone); + if (OB_UNLIKELY(!server.set_ip_addr(svr_ip, static_cast(svr_port)))) { + ret = OB_INVALID_DATA; + LOG_WARN("fail to set ip addr", KR(ret), K(svr_ip), K(svr_port)); + } else if (OB_FAIL(zone.assign(tmp_zone))) { + LOG_WARN("fail to assign zone", KR(ret), K(tmp_zone)); + } else if (OB_FAIL(stopped_server_list.push_back(server))) { + LOG_WARN("fail to push an element into stopped_server_list", KR(ret), K(server)); + } else if (has_exist_in_array(stopped_zone_list, zone)) { + // do nothing + } else if (OB_FAIL(stopped_zone_list.push_back(zone))) { + LOG_WARN("fail to push an element into stopped_zone_list", KR(ret), K(zone)); } } } - } //end else ACTIVE - } //end for zone_infos + } + } } LOG_INFO("get stopped zone list", KR(ret), K(stopped_server_list), K(stopped_zone_list)); return ret; } - +bool ObRootUtils::have_other_stop_task(const ObZone &zone) +{ + int ret = OB_SUCCESS; + bool bret = true; + int64_t cnt = 0; + ObSqlString sql; + ObTimeoutCtx ctx; + if (OB_ISNULL(GCTX.sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.sql_proxy_ is null", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else if (OB_FAIL(sql.assign_fmt("SELECT COUNT(*) AS cnt FROM " + "(SELECT zone FROM %s WHERE stop_time > 0 AND zone != '%s' UNION " + "SELECT zone FROM %s WHERE name = 'status' AND info = 'INACTIVE' AND zone != '%s')", + OB_ALL_SERVER_TNAME, zone.ptr(), OB_ALL_ZONE_TNAME, zone.ptr()))) { + LOG_WARN("fail to append sql", KR(ret), K(zone)); + } else { + SMART_VAR(ObMySQLProxy::MySQLResult, res) { + int tmp_ret = OB_SUCCESS; + ObMySQLResult *result = NULL; + if (OB_FAIL(GCTX.sql_proxy_->read(res, OB_SYS_TENANT_ID, sql.ptr()))) { + LOG_WARN("fail to execute sql", 
KR(ret), K(sql)); + } else if (OB_ISNULL(result = res.get_result())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("fail to get sql result", KR(ret), K(sql)); + } else if (OB_FAIL(result->next())) { + LOG_WARN("fail to get next", KR(ret), K(sql));; + } else { + EXTRACT_INT_FIELD_MYSQL(*result, "cnt", cnt, int64_t); + } + if (OB_SUCC(ret) && (OB_ITER_END != (tmp_ret = result->next()))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get more row than one", KR(ret), KR(tmp_ret), K(sql)); + } + } + } + if (OB_SUCC(ret) && 0 == cnt) { + bret = false; + } + LOG_INFO("have other stop task", KR(ret), K(bret), K(zone), K(cnt)); + return bret; +} int ObRootUtils::get_tenant_intersection(ObUnitManager &unit_mgr, ObIArray &this_server_list, ObIArray &other_server_list, @@ -1961,6 +2136,27 @@ int ObRootUtils::check_left_f_in_primary_zone(ObZoneManager &zone_mgr, return ret; } +int ObRootUtils::get_proposal_id_from_sys_ls(int64_t &proposal_id, ObRole &role) +{ + int ret = OB_SUCCESS; + storage::ObLSHandle ls_handle; + logservice::ObLogHandler *handler = nullptr; + MTL_SWITCH(OB_SYS_TENANT_ID) { + if (OB_FAIL(MTL(ObLSService*)->get_ls(SYS_LS, ls_handle, ObLSGetMod::RS_MOD))) { + LOG_WARN("fail to get ls", KR(ret)); + } else if (OB_UNLIKELY(!ls_handle.is_valid()) + || OB_ISNULL(ls_handle.get_ls()) + || OB_ISNULL(handler = ls_handle.get_ls()->get_log_handler())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected error", KR(ret), KP(ls_handle.get_ls()), + KP(ls_handle.get_ls()->get_log_handler())); + } else if (OB_FAIL(handler->get_role(role, proposal_id))) { + LOG_WARN("fail to get role", KR(ret)); + } + } + return ret; +} + int ObRootUtils::try_notify_switch_ls_leader( obrpc::ObSrvRpcProxy *rpc_proxy, const share::ObLSInfo &ls_info, diff --git a/src/rootserver/ob_root_utils.h b/src/rootserver/ob_root_utils.h index 88c138853..4eaef7c1f 100644 --- a/src/rootserver/ob_root_utils.h +++ b/src/rootserver/ob_root_utils.h @@ -46,7 +46,6 @@ namespace rootserver { class ObDDLService; class 
ObUnitManager; -class ObServerManager; class ObZoneManager; class ObLocalityDistribution; template @@ -604,14 +603,34 @@ public: virtual ~ObRootUtils() {} static int get_rs_default_timeout_ctx(ObTimeoutCtx &ctx); - static int get_invalid_server_list(const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, - common::ObIArray &invalid_server_list); - - static int get_stopped_zone_list(const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, - common::ObIArray &stopped_zone_list, + static int get_invalid_server_list( + const ObIArray &servers_info, + common::ObIArray &invalid_server_list); + static int find_server_info( + const ObIArray &servers_info, + const common::ObAddr &server, + share::ObServerInfoInTable &server_info); + static int get_servers_of_zone( + const ObIArray &servers_info, + const common::ObZone &zone, + ObIArray &servers, + bool only_active_servers = false); + static int get_server_count( + const ObIArray &servers_info, + const ObZone &zone, + int64_t &alive_count, + int64_t ¬_alive_count); + static int check_server_alive( + const ObIArray &servers_info, + const common::ObAddr &server, + bool &is_alive); + static int get_server_resource_info( + const ObIArray &server_resources_info, + const ObAddr &server, + share::ObServerResourceInfo &resource_info); + static int get_stopped_zone_list(common::ObIArray &stopped_zone_list, common::ObIArray &stopped_server_list); + static bool have_other_stop_task(const ObZone &zone); static int check_primary_region_in_zonelist(share::schema::ObMultiVersionSchemaService *schema_service, ObDDLService *ddl_service, ObUnitManager &unit_mgr, @@ -643,6 +662,7 @@ public: common::ObIArray &this_server_list, common::ObIArray &other_server_list, common::ObIArray &tenant_ids); + static int get_proposal_id_from_sys_ls(int64_t &proposal_id, ObRole &role); static int notify_switch_leader( obrpc::ObSrvRpcProxy *rpc_proxy, diff --git a/src/rootserver/ob_rootservice_util_checker.cpp 
b/src/rootserver/ob_rootservice_util_checker.cpp index ecc3e2c26..c184d446b 100644 --- a/src/rootserver/ob_rootservice_util_checker.cpp +++ b/src/rootserver/ob_rootservice_util_checker.cpp @@ -30,7 +30,6 @@ ObRootServiceUtilChecker::~ObRootServiceUtilChecker() int ObRootServiceUtilChecker::init( ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, obrpc::ObCommonRpcProxy &common_rpc_proxy, common::ObAddr &self, @@ -44,7 +43,6 @@ int ObRootServiceUtilChecker::init( LOG_WARN("init twice", KR(ret)); } else if (OB_FAIL(migrate_unit_finish_checker_.init( unit_mgr, - server_mgr, zone_mgr, schema_service, sql_proxy, @@ -55,7 +53,6 @@ int ObRootServiceUtilChecker::init( common_rpc_proxy, self, unit_mgr, - server_mgr, zone_mgr, sql_proxy, lst_operator))) { diff --git a/src/rootserver/ob_rootservice_util_checker.h b/src/rootserver/ob_rootservice_util_checker.h index 3d312268c..e659393b2 100644 --- a/src/rootserver/ob_rootservice_util_checker.h +++ b/src/rootserver/ob_rootservice_util_checker.h @@ -28,7 +28,6 @@ public: public: int init( ObUnitManager &unit_mgr, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, obrpc::ObCommonRpcProxy &common_rpc_proxy, common::ObAddr &self, diff --git a/src/rootserver/ob_rs_async_rpc_proxy.h b/src/rootserver/ob_rs_async_rpc_proxy.h index ce4ba320f..fe922ccf3 100644 --- a/src/rootserver/ob_rs_async_rpc_proxy.h +++ b/src/rootserver/ob_rs_async_rpc_proxy.h @@ -19,6 +19,7 @@ #include "rpc/obrpc/ob_rpc_packet.h" #include "rpc/obrpc/ob_rpc_result_code.h" #include "rpc/obrpc/ob_rpc_proxy.h" +#include "share/ob_heartbeat_struct.h" namespace oceanbase { @@ -69,6 +70,8 @@ RPC_F(obrpc::OB_GET_LEADER_LOCATIONS, obrpc::ObGetLeaderLocationsArg, RPC_F(obrpc::OB_DDL_CHECK_TABLET_MERGE_STATUS, obrpc::ObDDLCheckTabletMergeStatusArg, obrpc::ObDDLCheckTabletMergeStatusResult, ObCheckTabletMergeStatusProxy); RPC_F(obrpc::OB_REFRESH_TENANT_INFO, obrpc::ObRefreshTenantInfoArg, obrpc::ObRefreshTenantInfoRes, 
ObRefreshTenantInfoProxy); +RPC_F(obrpc::OB_SEND_HEARTBEAT, share::ObHBRequest, share::ObHBResponse, ObSendHeartbeatProxy); +RPC_F(obrpc::OB_GET_SERVER_RESOURCE_INFO, obrpc::ObGetServerResourceInfoArg, obrpc::ObGetServerResourceInfoResult, ObGetServerResourceInfoProxy); RPC_F(obrpc::OB_NOTIFY_SWITCH_LEADER, obrpc::ObNotifySwitchLeaderArg, obrpc::ObSrvRpcProxy::ObRpc::Response, ObNotifySwitchLeaderProxy); RPC_F(obrpc::OB_UPDATE_TENANT_INFO_CACHE, obrpc::ObUpdateTenantInfoCacheArg, obrpc::ObUpdateTenantInfoCacheRes, ObUpdateTenantInfoCacheProxy); diff --git a/src/rootserver/ob_rs_reentrant_thread.h b/src/rootserver/ob_rs_reentrant_thread.h index 2d5542562..e5f5a99b7 100644 --- a/src/rootserver/ob_rs_reentrant_thread.h +++ b/src/rootserver/ob_rs_reentrant_thread.h @@ -60,6 +60,7 @@ public: int start(); void stop(); void wait(); + void reset_last_run_timestamp() { ATOMIC_STORE(&last_run_timestamp_, 0); } TO_STRING_KV("name", get_thread_name()); private: diff --git a/src/rootserver/ob_rs_rpc_processor.h b/src/rootserver/ob_rs_rpc_processor.h index 450246d96..4cb95828b 100644 --- a/src/rootserver/ob_rs_rpc_processor.h +++ b/src/rootserver/ob_rs_rpc_processor.h @@ -297,8 +297,8 @@ DEFINE_LIMITED_RS_RPC_PROCESSOR(obrpc::OB_EXECUTE_BOOTSTRAP, ObRpcExecuteBootstr // check server_refreshed_ flag in rootservice DEFINE_LIMITED_RS_RPC_PROCESSOR(obrpc::OB_FETCH_ALIVE_SERVER, ObRpcFetchAliveServerP, fetch_alive_server(arg_, result_)); -DEFINE_RS_RPC_PROCESSOR(obrpc::OB_MERGE_FINISH, ObRpcMergeFinishP, merge_finish(arg_)); -DEFINE_RS_RPC_PROCESSOR(obrpc::OB_FETCH_ACTIVE_SERVER_STATUS, ObRpcFetchActiveServerStatusP, fetch_active_server_status(arg_, result_)); +// DEFINE_RS_RPC_PROCESSOR(obrpc::OB_MERGE_FINISH, ObRpcMergeFinishP, merge_finish(arg_)); +// DEFINE_RS_RPC_PROCESSOR(obrpc::OB_FETCH_ACTIVE_SERVER_STATUS, ObRpcFetchActiveServerStatusP, fetch_active_server_status(arg_, result_)); DEFINE_RS_RPC_PROCESSOR(obrpc::OB_DISASTER_RECOVERY_TASK_REPLY, ObRpcDisasterRecoveryTaskReplyP, 
disaster_recovery_task_reply(arg_)); DEFINE_RS_RPC_PROCESSOR(obrpc::OB_BACKUP_LS_DATA_RES, ObRpcBackupDataResP, receive_backup_over(arg_)); @@ -492,7 +492,6 @@ protected: DEFINE_DDL_RS_RPC_PROCESSOR(obrpc::OB_DO_SEQUENCE_DDL, ObRpcDoSequenceDDLP, do_sequence_ddl(arg_)); DEFINE_RS_RPC_PROCESSOR(obrpc::OB_BROADCAST_SCHEMA, ObBroadcastSchemaP, broadcast_schema(arg_)); // only for upgrade -DEFINE_RS_RPC_PROCESSOR(obrpc::OB_CHECK_MERGE_FINISH, ObCheckMergeFinishP, check_merge_finish(arg_)); DEFINE_RS_RPC_PROCESSOR(obrpc::OB_GET_RECYCLE_SCHEMA_VERSIONS, ObGetRecycleSchemaVersionsP, get_recycle_schema_versions(arg_, result_)); DEFINE_DDL_RS_RPC_PROCESSOR(obrpc::OB_UPGRADE_TABLE_SCHEMA, ObRpcUpgradeTableSchemaP, upgrade_table_schema(arg_)); //label security ddl diff --git a/src/rootserver/ob_schema_history_recycler.cpp b/src/rootserver/ob_schema_history_recycler.cpp index eea700c22..57a647a4f 100644 --- a/src/rootserver/ob_schema_history_recycler.cpp +++ b/src/rootserver/ob_schema_history_recycler.cpp @@ -24,6 +24,7 @@ #include "share/ob_freeze_info_proxy.h" #include "share/ob_global_merge_table_operator.h" #include "share/ob_zone_merge_info.h" +#include "share/ob_all_server_tracer.h" namespace oceanbase { @@ -87,8 +88,7 @@ int64_t ObSchemaHistoryRecyclerIdling::get_idle_interval_us() ObSchemaHistoryRecycler::ObSchemaHistoryRecycler() : inited_(false), idling_(stop_), schema_service_(NULL), - /*freeze_info_mgr_(NULL),*/ zone_mgr_(NULL), sql_proxy_(NULL), - server_mgr_(NULL), recycle_schema_versions_() + /*freeze_info_mgr_(NULL),*/ zone_mgr_(NULL), sql_proxy_(NULL), recycle_schema_versions_() { } @@ -104,8 +104,7 @@ int ObSchemaHistoryRecycler::init( ObMultiVersionSchemaService &schema_service, //ObFreezeInfoManager &freeze_info_manager, ObZoneManager &zone_manager, - ObMySQLProxy &sql_proxy, - ObServerManager &server_mgr) + ObMySQLProxy &sql_proxy) { int ret = OB_SUCCESS; const int schema_history_recycler_thread_cnt = 1; @@ -122,7 +121,6 @@ int 
ObSchemaHistoryRecycler::init( //freeze_info_mgr_ = &freeze_info_manager; zone_mgr_ = &zone_manager; sql_proxy_ = &sql_proxy; - server_mgr_ = &server_mgr; inited_ = true; } return ret; @@ -359,15 +357,13 @@ int ObSchemaHistoryRecycler::get_recycle_schema_version_by_server( int ret = OB_SUCCESS; ObArray server_list; obrpc::ObGetMinSSTableSchemaVersionArg arg; + ObZone zone; if (OB_FAIL(check_inner_stat())) { LOG_WARN("fail to check inner stat", KR(ret)); - } else if (OB_ISNULL(server_mgr_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("ptr is null", KR(ret), KP_(server_mgr)); } else if (OB_FAIL(arg.tenant_id_arg_list_.assign(tenant_ids))) { LOG_WARN("fail to assign arg", KR(ret)); - } else if (OB_FAIL(server_mgr_->get_all_server_list(server_list))) { - LOG_WARN("fail to get all server list", KR(ret)); + } else if (OB_FAIL(SVR_TRACER.get_servers_of_zone(zone, server_list))) { + LOG_WARN("fail to get server_list", KR(ret)); } else { rootserver::ObGetMinSSTableSchemaVersionProxy proxy_batch( *(GCTX.srv_rpc_proxy_), &obrpc::ObSrvRpcProxy::get_min_sstable_schema_version); diff --git a/src/rootserver/ob_schema_history_recycler.h b/src/rootserver/ob_schema_history_recycler.h index 3db05b039..d7a00f509 100644 --- a/src/rootserver/ob_schema_history_recycler.h +++ b/src/rootserver/ob_schema_history_recycler.h @@ -18,7 +18,6 @@ #include "rootserver/ob_thread_idling.h" //#include "rootserver/ob_freeze_info_manager.h" #include "rootserver/ob_zone_manager.h" -#include "rootserver/ob_server_manager.h" #include "share/schema/ob_multi_version_schema_service.h" #include "share/config/ob_server_config.h" @@ -162,8 +161,7 @@ public: int init(share::schema::ObMultiVersionSchemaService &schema_service, //ObFreezeInfoManager &freeze_info_manager, ObZoneManager &zone_manager, - common::ObMySQLProxy &sql_proxy, - ObServerManager &server_mgr); + common::ObMySQLProxy &sql_proxy); virtual void run3() override; void wakeup(); void stop(); @@ -211,7 +209,6 @@ private: //ObFreezeInfoManager 
*freeze_info_mgr_; ObZoneManager *zone_mgr_; common::ObMySQLProxy *sql_proxy_; - ObServerManager *server_mgr_; common::hash::ObHashMap recycle_schema_versions_; DISALLOW_COPY_AND_ASSIGN(ObSchemaHistoryRecycler); }; diff --git a/src/rootserver/ob_server_balancer.cpp b/src/rootserver/ob_server_balancer.cpp index 7ee565d04..ecb3a8cb3 100644 --- a/src/rootserver/ob_server_balancer.cpp +++ b/src/rootserver/ob_server_balancer.cpp @@ -21,6 +21,9 @@ #include "rootserver/ob_root_utils.h" #include "rootserver/ob_root_service.h" #include "storage/ob_file_system_router.h" +#include "share/ob_all_server_tracer.h" +#include "share/ob_server_table_operator.h" +#include "rootserver/ob_heartbeat_service.h" using namespace oceanbase::common; using namespace oceanbase::common::hash; @@ -62,6 +65,41 @@ int ObServerBalancer::init( return ret; } +int ObServerBalancer::get_active_servers_info_and_resource_info_of_zone( + const ObZone &zone, + ObIArray &servers_info, + ObIArray &server_resources_info) +{ + int ret = OB_SUCCESS; + servers_info.reset(); + server_resources_info.reset(); + ObServerResourceInfo resource_info_in_server_status; + obrpc::ObGetServerResourceInfoResult resource_info_result; + if (OB_FAIL(SVR_TRACER.get_active_servers_info(zone, servers_info))) { + LOG_WARN("fail to execute get_active_servers_info", KR(ret), K(zone)); + } else if (OB_ISNULL(server_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("server_mgr_ is null", KR(ret), KP(server_mgr_)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); i++) { + const ObAddr &server = servers_info.at(i).get_server(); + resource_info_result.reset(); + resource_info_in_server_status.reset(); + if (OB_FAIL(server_mgr_->get_server_resource_info(server, resource_info_in_server_status))) { + LOG_WARN("fail to get resource_info_in_server_status", KR(ret), K(server)); + } else if (OB_UNLIKELY(!resource_info_in_server_status.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid 
resource_info_in_server_status", KR(ret), K(server), K(resource_info_in_server_status)); + } else if (OB_FAIL(resource_info_result.init(server,resource_info_in_server_status))) { + LOG_WARN("fail to init", KR(ret), K(server), K(resource_info_in_server_status)); + } else if (OB_FAIL(server_resources_info.push_back(resource_info_result))) { + LOG_WARN("fail to push an element into server_resources_info", KR(ret), K(resource_info_result)); + } + } + } + return ret; +} + int ObServerBalancer::tenant_group_balance() { int ret = OB_SUCCESS; @@ -99,82 +137,6 @@ int ObServerBalancer::tenant_group_balance() return ret; } -int ObServerBalancer::check_if_ofs_rs_without_sys_unit( - const share::ObServerStatus &status, - const share::ObUnitInfo &unit_info, - bool &ofs_rs_without_sys_unit) -{ - int ret = OB_SUCCESS; - UNUSED(status); - common::ObArray *unit_load_array; - if (!check_inner_stat()) { - ret = OB_INNER_STAT_ERROR; - LOG_WARN("fail to check inner stat", KR(ret)); - } else if (OB_UNLIKELY(nullptr == unit_mgr_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit_mgr ptr is null", KR(ret), KP(unit_mgr_)); - } else { - ofs_rs_without_sys_unit = true; - if (OB_SYS_TENANT_ID != unit_info.pool_.tenant_id_) { - ofs_rs_without_sys_unit = false; - } else if (OB_FAIL(unit_mgr_->get_loads_by_server(GCTX.self_addr(), unit_load_array))) { - if (OB_ENTRY_NOT_EXIST == ret) { - ret = OB_SUCCESS; // server load empty, no need to distribute - } else { - LOG_WARN("fail to get loads by server", KR(ret)); - } - } else if (OB_UNLIKELY(nullptr == unit_load_array)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit load array ptr is null", KR(ret)); - } else { - for (int64_t i = 0; - OB_SUCC(ret) && i < unit_load_array->count() && ofs_rs_without_sys_unit; - ++i) { - const ObUnit *unit = unit_load_array->at(i).unit_; - const share::ObResourcePool *pool = unit_load_array->at(i).pool_; - // some certain unit exists on observer with rs - if (OB_UNLIKELY(nullptr == unit || nullptr == pool)) { - ret = 
OB_ERR_UNEXPECTED; - LOG_WARN("unit ptr is null", KR(ret), KP(unit), K(pool)); - } else if (OB_SYS_TENANT_ID != pool->tenant_id_) { - // bypass - } else if (unit->server_ == GCTX.self_addr() - || unit->migrate_from_server_ == GCTX.self_addr()) { - ofs_rs_without_sys_unit = false; - } - } - } - } - return ret; -} - -int ObServerBalancer::distribute_for_ofs_sys_unit( - const share::ObServerStatus &status, - const share::ObUnitInfo &unit_info) -{ - int ret = OB_SUCCESS; - if (!check_inner_stat()) { - ret = OB_INNER_STAT_ERROR; - LOG_WARN("fail to check inner stat", KR(ret)); - } else if (!status.is_valid() || !unit_info.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), K(status), K(unit_info)); - } else if (OB_UNLIKELY(nullptr == unit_mgr_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit mgr ptr is null", KR(ret)); - } else if (OB_SYS_TENANT_ID != unit_info.pool_.tenant_id_) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit info tenant unexpected", KR(ret), "tenant_id", unit_info.pool_.tenant_id_); - } else if (!status.is_taken_over_by_rs()) { - ret = OB_STATE_NOT_MATCH; - LOG_WARN("server not taken over by rs", KR(ret), K(status)); - } else if (OB_FAIL(unit_mgr_->migrate_unit( - unit_info.unit_.unit_id_, GCTX.self_addr(), false/*not manual*/))) { - LOG_WARN("fail to migrate unit", KR(ret), K(unit_info)); - } - return ret; -} - int ObServerBalancer::check_has_unit_in_migration( const common::ObIArray *unit_load_array, bool &has_unit_in_migration) @@ -268,28 +230,34 @@ int ObServerBalancer::distribute_pool_for_standalone_sys_unit( ret = OB_ERR_UNEXPECTED; LOG_WARN("unit config ptr is null", K(ret), K(pool)); } else { - ObServerStatus status; + ObServerInfoInTable server_info; ObUnitStat unit_stat; ObArray in_migrate_unit_stat; common::ObArray excluded_servers; common::ObAddr migrate_server; std::string resource_not_enough_reason; + ObArray servers_info_of_zone; + ObArray active_servers_info_of_zone; + ObArray 
active_servers_resource_info_of_zone; for (int64_t i = 0; OB_SUCC(ret) && i < pool_unit_array->count(); ++i) { excluded_servers.reset(); - status.reset(); + server_info.reset(); unit_stat.reset(); migrate_server.reset(); + servers_info_of_zone.reset(); + active_servers_resource_info_of_zone.reset(); + active_servers_info_of_zone.reset(); share::ObUnit *unit = pool_unit_array->at(i); if (OB_UNLIKELY(nullptr == unit)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("unit ptr is null", K(ret)); } else if (unit->migrate_from_server_.is_valid()) { // unit in migrate, bypass - } else if (OB_FAIL(server_mgr_->get_server_status(unit->server_, status))) { + } else if (OB_FAIL(SVR_TRACER.get_server_info(unit->server_, server_info))) { LOG_WARN("fail to get server status", K(ret), "server", unit->server_); - } else if (!status.is_active()) { + } else if (!server_info.is_active()) { // Only process servers that are active, skip non-active servers - LOG_INFO("unit server status not active", K(ret), K(status), K(*unit)); + LOG_INFO("unit server status not active", K(ret), K(server_info), K(*unit)); } else if (!has_exist_in_array(sys_unit_server_array, unit->server_)) { // bypass } else if (OB_FAIL(unit_stat_mgr_->get_unit_stat( @@ -297,14 +265,32 @@ int ObServerBalancer::distribute_pool_for_standalone_sys_unit( unit->zone_, unit_stat))) { LOG_WARN("fail to locate unit", K(ret), "unit", *unit); - } else if (OB_FAIL(unit_mgr_->get_excluded_servers(*unit, unit_stat, module, excluded_servers))) { - LOG_WARN("fail to get exclude servers", K(ret), "unit", *unit); + } else if (OB_FAIL(SVR_TRACER.get_servers_info(unit->zone_, servers_info_of_zone))) { + LOG_WARN("fail to servers_info_of_zone", KR(ret), K(unit->zone_)); + } else if (OB_FAIL(get_active_servers_info_and_resource_info_of_zone( + unit->zone_, + active_servers_info_of_zone, + active_servers_resource_info_of_zone))) { + LOG_WARN("fail to execute get_active_servers_info_and_resource_info_of_zone", KR(ret), K(unit->zone_)); + } else if 
(OB_FAIL(unit_mgr_->get_excluded_servers( + *unit, + unit_stat, + module, + servers_info_of_zone, + active_servers_resource_info_of_zone, + excluded_servers))) { + LOG_WARN("fail to get exclude servers", K(ret), KPC(unit), K(servers_info_of_zone), + K(active_servers_resource_info_of_zone)); } else if (OB_FAIL(append(excluded_servers, sys_unit_server_array))) { LOG_WARN("fail tp append sys unit server array", K(ret)); - } else if (OB_FAIL(unit_mgr_->choose_server_for_unit(unit_config->unit_resource(), unit->zone_, - excluded_servers, module, - migrate_server, - resource_not_enough_reason))) { + } else if (OB_FAIL(unit_mgr_->choose_server_for_unit( + unit_config->unit_resource(), unit->zone_, + excluded_servers, + module, + active_servers_info_of_zone, + active_servers_resource_info_of_zone, + migrate_server, + resource_not_enough_reason))) { if (OB_ZONE_RESOURCE_NOT_ENOUGH == ret || OB_ZONE_SERVER_NOT_ENOUGH == ret) { LOG_WARN("has no place to migrate unit", K(module), KR(ret), "unit", *unit, K(excluded_servers), "resource_not_enough_reason", resource_not_enough_reason.c_str()); @@ -479,34 +465,21 @@ int ObServerBalancer::distribute_zone_unit(const ObUnitManager::ZoneUnit &zone_u ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(zone_unit), K(ret)); } else { - ObServerStatus status; + ObServerInfoInTable server_info; FOREACH_CNT_X(unit_info, zone_unit.unit_infos_, OB_SUCCESS == ret) { - status.reset(); + server_info.reset(); if (ObUnit::UNIT_STATUS_ACTIVE != unit_info->unit_.status_) { // ignore the unit that is in deleting - } else if (OB_FAIL(server_mgr_->get_server_status(unit_info->unit_.server_, status))) { - LOG_WARN("get_server_status failed", "server", unit_info->unit_.server_, K(ret)); - } else if (status.is_active()) { - if (OB_FAIL(distribute_for_active(status, *unit_info))) { - LOG_WARN("distribute_for_active failed", K(status), "unit_info", *unit_info, K(ret)); + } else if (OB_FAIL(SVR_TRACER.get_server_info(unit_info->unit_.server_, 
server_info))) { + LOG_WARN("get_server_info failed", "server", unit_info->unit_.server_, KR(ret)); + } else if (server_info.is_active()) { + if (OB_FAIL(distribute_for_active(server_info, *unit_info))) { + LOG_WARN("distribute_for_active failed", K(server_info), "unit_info", *unit_info, K(ret)); } - } else if (status.is_permanent_offline() - || status.is_deleting() - || status.is_taken_over_by_rs()) { - bool ofs_rs_without_sys_unit = false; - if (OB_FAIL(check_if_ofs_rs_without_sys_unit( - status, *unit_info, ofs_rs_without_sys_unit))) { - LOG_WARN("fail to check if rs without sys unit", KR(ret)); - } else if (ofs_rs_without_sys_unit) { - if (OB_FAIL(distribute_for_ofs_sys_unit(status, *unit_info))) { - LOG_WARN("distribute for ofs sys unit", KR(ret), - K(status), "unit_info", *unit_info); - } - } else { - if (OB_FAIL(distribute_for_permanent_offline_or_delete(status, *unit_info))) { - LOG_WARN("distribute for permanent offline or delete failed", - K(status), "unit_info", *unit_info, K(ret)); - } + } else if (server_info.is_permanent_offline() || server_info.is_deleting()) { + if (OB_FAIL(distribute_for_permanent_offline_or_delete(server_info, *unit_info))) { + LOG_WARN("distribute for permanent offline or delete failed", + K(server_info), "unit_info", *unit_info, KR(ret)); } } } @@ -514,20 +487,24 @@ int ObServerBalancer::distribute_zone_unit(const ObUnitManager::ZoneUnit &zone_u return ret; } -int ObServerBalancer::distribute_for_active(const ObServerStatus &status, - const ObUnitInfo &unit_info) +int ObServerBalancer::distribute_for_active( + const ObServerInfoInTable &server_info, + const ObUnitInfo &unit_info) { int ret = OB_SUCCESS; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("check inner stat failed", K_(inited), K(ret)); - } else if (!status.is_valid() || !status.is_active() || !unit_info.is_valid()) { + } else if (!server_info.is_valid() + || !server_info.is_active() + || !unit_info.is_valid() + || unit_info.unit_.server_ != 
server_info.get_server()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(status), K(unit_info), K(ret)); + LOG_WARN("invalid argument", K(server_info), K(unit_info), K(ret)); } else { //When the destination is blocked, cancel this migration //Temporary offline does not cancel the task, need to wait for permanent offline - if ((status.is_migrate_in_blocked()) + if ((server_info.is_migrate_in_blocked()) && unit_info.unit_.migrate_from_server_.is_valid()) { LOG_INFO("find unit server active but can't migrate in, " "migrate_from_server is set", "unit", unit_info.unit_); @@ -539,55 +516,34 @@ int ObServerBalancer::distribute_for_active(const ObServerStatus &status, return ret; } -int ObServerBalancer::check_is_ofs_zone_zombie_unit( - const ObUnitInfo &unit_info, - bool &is_ofs_zone_zombie_unit) -{ - int ret = OB_SUCCESS; - if (!check_inner_stat()) { - ret = OB_INNER_STAT_ERROR; - LOG_WARN("fail to check inner stat", K(ret), K_(inited)); - } else if (OB_UNLIKELY(!unit_info.is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), K(unit_info)); - } else { - is_ofs_zone_zombie_unit = false; - const common::ObZone &zone = unit_info.unit_.zone_; - const common::ObAddr &dst_server = unit_info.unit_.server_; - const common::ObAddr &src_server = unit_info.unit_.migrate_from_server_; - if (OB_UNLIKELY(zone.is_empty())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), K(zone)); - } - } - return ret; -} - //When the migration destination is permanently offline, //need to change to another destination //Need to make sure that the member has been kicked out after being permanently offline int ObServerBalancer::distribute_for_permanent_offline_or_delete( - const ObServerStatus &status, + const share::ObServerInfoInTable &server_info, const ObUnitInfo &unit_info) { int ret = OB_SUCCESS; const char *module = "UNIT_BALANCE_FOR_SERVER_PERMANENT_OFFLINE_OR_DELETE"; LOG_INFO("find unit server permanent offline or delete, need 
distribute unit", - K(module), "unit", unit_info.unit_, "server", status.server_); + K(module), "unit", unit_info.unit_, K(server_info)); const bool enable_sys_unit_standalone = GCONF.enable_sys_unit_standalone; bool need_migrate_unit = false; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("check inner stat failed", K_(inited), K(ret)); - } else if (!status.is_valid() || !unit_info.is_valid()) { + } else if (!server_info.is_valid() + || !unit_info.is_valid() + || unit_info.unit_.server_ != server_info.get_server() + || (!server_info.is_deleting() && !server_info.is_permanent_offline())) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(status), K(unit_info), K(ret)); + LOG_WARN("invalid argument", K(server_info), K(unit_info), KR(ret)); } else if (!unit_info.unit_.migrate_from_server_.is_valid()) { //The current unit is in a stable state, move it out need_migrate_unit = true; LOG_INFO("server is permanent offline or in deleting status, need migrate unit", - K(unit_info), K(status)); + K(unit_info), K(server_info)); } else { //Currently moving in, try to cancel bool is_canceled = false; @@ -597,19 +553,21 @@ int ObServerBalancer::distribute_for_permanent_offline_or_delete( //If cancel fails, wait for the result of the check-in process //If the move-in process cannot be ended, //the delete server lasts for too long, and manual intervention should be required - if (!status.is_with_partition()) { - //If there is no local replica, cancel this migration directly - const ObUnitManager::EndMigrateOp op = ObUnitManager::ABORT; - if (OB_FAIL(unit_mgr_->end_migrate_unit(unit_info.unit_.unit_id_, op))) { - LOG_WARN("end_migrate_unit failed", "unit_id", unit_info.unit_.unit_id_, K(op), K(ret)); - } else { - need_migrate_unit = true; - LOG_INFO("unit has no partition, abort the migration", - K(ret), K(unit_info), K(op), K(status)); - } - } + // ** FIXME (linqiucen): now we do not do the following commented process due to the deprecated variable 
with_partition + // ** FIXME (linqiucen): in the future, we can do this process again by directly looking up the related table + // if (!status.is_with_partition()) { + // //If there is no local replica, cancel this migration directly + // const ObUnitManager::EndMigrateOp op = ObUnitManager::ABORT; + // if (OB_FAIL(unit_mgr_->end_migrate_unit(unit_info.unit_.unit_id_, op))) { + // LOG_WARN("end_migrate_unit failed", "unit_id", unit_info.unit_.unit_id_, K(op), K(ret)); + // } else { + // need_migrate_unit = true; + // LOG_INFO("unit has no partition, abort the migration", + // K(ret), K(unit_info), K(op), K(status)); + // } + // } } else { - LOG_INFO("revert migrate unit success", K(ret), K(unit_info), K(status)); + LOG_INFO("revert migrate unit success", K(ret), K(unit_info), K(server_info)); } } ObUnitStat unit_stat; @@ -617,17 +575,44 @@ int ObServerBalancer::distribute_for_permanent_offline_or_delete( const ObZone zone = unit_info.unit_.zone_; ObAddr migrate_server; std::string resource_not_enough_reason; + ObArray servers_info_of_zone; + ObArray active_servers_info_of_zone; + ObArray active_servers_resource_info_of_zone; if (OB_FAIL(ret) || !need_migrate_unit) { //nothing todo + } else if (OB_ISNULL(unit_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unit_mgr_ is null", KR(ret), KP(unit_mgr_)); } else if (OB_FAIL(unit_stat_mgr_->get_unit_stat( unit_info.unit_.unit_id_, unit_info.unit_.zone_, unit_stat))) { LOG_WARN("fail to locate unit", K(ret), "unit", unit_info.unit_); - } else if (OB_FAIL(unit_mgr_->get_excluded_servers(unit_info.unit_, unit_stat, module, excluded_servers))) { - LOG_WARN("get_excluded_servers failed", "unit", unit_info.unit_, K(ret)); - } else if (OB_FAIL(unit_mgr_->choose_server_for_unit(unit_info.config_.unit_resource(), zone, excluded_servers, module, - migrate_server, resource_not_enough_reason))) { + } else if (OB_FAIL(SVR_TRACER.get_servers_info(unit_info.unit_.zone_, servers_info_of_zone))) { + LOG_WARN("fail to 
servers_info_of_zone", KR(ret), K(unit_info.unit_.zone_)); + } else if (OB_FAIL(get_active_servers_info_and_resource_info_of_zone( + unit_info.unit_.zone_, + active_servers_info_of_zone, + active_servers_resource_info_of_zone))) { + LOG_WARN("fail to execute get_active_servers_info_and_resource_info_of_zone", KR(ret), K(unit_info.unit_.zone_)); + } else if (OB_FAIL(unit_mgr_->get_excluded_servers( + unit_info.unit_, + unit_stat, + module, + servers_info_of_zone, + active_servers_resource_info_of_zone, + excluded_servers))) { + LOG_WARN("get_excluded_servers failed", "unit", unit_info.unit_, KR(ret), K(servers_info_of_zone), + K(active_servers_resource_info_of_zone)); + } else if (OB_FAIL(unit_mgr_->choose_server_for_unit( + unit_info.config_.unit_resource(), + zone, + excluded_servers, + module, + active_servers_info_of_zone, + active_servers_resource_info_of_zone, + migrate_server, + resource_not_enough_reason))) { if (OB_ZONE_RESOURCE_NOT_ENOUGH == ret || OB_ZONE_SERVER_NOT_ENOUGH == ret) { LOG_WARN("has no place to migrate unit", K(module), KR(ret), K(zone), K(excluded_servers), K(unit_info), "resource_not_enough_reason", resource_not_enough_reason.c_str()); @@ -650,7 +635,7 @@ int ObServerBalancer::distribute_for_permanent_offline_or_delete( migrate_server))) { LOG_WARN("fail to try migrate unit", "unit", unit_info.unit_, K(migrate_server), K(ret)); } else { - LOG_INFO("migrate unit success", K(module), K(unit_info), K(status), "dest_server", migrate_server); + LOG_INFO("migrate unit success", K(module), K(unit_info), K(server_info), "dest_server", migrate_server); } } return ret; @@ -664,21 +649,21 @@ int ObServerBalancer::distribute_for_permanent_offline_or_delete( int ObServerBalancer::distribute_for_migrate_in_blocked(const ObUnitInfo &unit_info) { int ret = OB_SUCCESS; - ObServerStatus status; + ObServerInfoInTable server_info; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("check inner stat failed", K_(inited), K(ret)); } else if 
(!unit_info.is_valid() || !unit_info.unit_.migrate_from_server_.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(unit_info), K(ret)); - } else if (OB_FAIL(server_mgr_->get_server_status( - unit_info.unit_.migrate_from_server_, status))) { + } else if (OB_FAIL(SVR_TRACER.get_server_info( + unit_info.unit_.migrate_from_server_, server_info))) { LOG_WARN("get_server_status failed", "server", unit_info.unit_.migrate_from_server_, K(ret)); } else if (ObUnit::UNIT_STATUS_ACTIVE != unit_info.unit_.status_) { // ignore the unit which is in deleting } else { - if (status.can_migrate_in()) { + if (server_info.can_migrate_in()) { LOG_INFO("unit migrate_from_server can migrate in, " "migrate unit back to migrate_from_server", "unit", unit_info.unit_); const ObUnitManager::EndMigrateOp op = ObUnitManager::REVERSE; @@ -690,7 +675,7 @@ int ObServerBalancer::distribute_for_migrate_in_blocked(const ObUnitInfo &unit_i //nothing todo LOG_WARN("NOTICE: unit migration is hung. dest server is blocked " "and source server can not migrate in. 
NEED to be involved manually.", - "unit", unit_info.unit_, "migrate_from_server", status); + "unit", unit_info.unit_, "migrate_from_server", server_info); } /* @@ -781,11 +766,23 @@ int ObServerBalancer::try_migrate_unit(const uint64_t unit_id, const ObAddr &dst) { int ret = OB_SUCCESS; + ObServerResourceInfo dst_resource_info; if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("server balancer not init", K_(inited), K(ret)); + } else if (OB_ISNULL(server_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("server_mgr_ is null", KR(ret), KP(server_mgr_)); + } else if (OB_FAIL(server_mgr_->get_server_resource_info(dst, dst_resource_info))) { + LOG_WARN("fail to get dst_resource_info", KR(ret), K(dst)); } else { - ret = unit_mgr_->try_migrate_unit(unit_id, tenant_id, unit_stat, migrating_unit_stat, dst); + ret = unit_mgr_->try_migrate_unit( + unit_id, + tenant_id, + unit_stat, + migrating_unit_stat, + dst, + dst_resource_info); unit_migrated_ = true; } return ret; @@ -843,17 +840,15 @@ int ObServerBalancer::check_can_execute_rebalance( } else if (OB_UNLIKELY(zone.is_empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(zone)); - } else if (OB_UNLIKELY(NULL == server_mgr_ - || NULL == unit_mgr_ - || NULL == zone_mgr_)) { + } else if (OB_ISNULL(unit_mgr_) || OB_ISNULL(zone_mgr_) || OB_ISNULL(server_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_mgr_ or unit_mgr_ ptr is null", K(ret), KP(server_mgr_), KP(unit_mgr_)); + LOG_WARN("unit_mgr_, zone_mgr_ or server_mgr_ is null", KR(ret), KP(unit_mgr_), KP(zone_mgr_), KP(server_mgr_)); } else if (OB_FAIL(zone_mgr_->get_zone(zone, zone_info))) { LOG_WARN("fail to get zone info", K(ret), K(zone)); } else if (ObZoneStatus::ACTIVE != zone_info.status_) { can_execute_rebalance = false; LOG_INFO("cannot execute server rebalance since zone inactive", K(zone)); - } else if (OB_FAIL(server_mgr_->get_servers_of_zone(zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_servers_of_zone(zone, server_list))) { 
LOG_WARN("fail to get servers of zone", K(ret), K(zone)); } else if (OB_FAIL(unit_mgr_->inner_get_unit_ids(unit_ids))) { LOG_WARN("fail to get unit ids", K(ret)); @@ -862,7 +857,8 @@ int ObServerBalancer::check_can_execute_rebalance( share::ObUnitConfig sum_load; for (int64_t i = 0; can_execute_rebalance && OB_SUCC(ret) && i < server_list.count(); ++i) { const common::ObAddr &server = server_list.at(i); - share::ObServerStatus server_status; + ObServerInfoInTable server_info; + ObServerResourceInfo resource_info; ObArray *unit_loads = nullptr; sum_load.reset(); if (OB_FAIL(unit_mgr_->get_loads_by_server(server, unit_loads))) { @@ -879,18 +875,18 @@ int ObServerBalancer::check_can_execute_rebalance( } if (OB_FAIL(ret)) { // failed - } else if (OB_FAIL(server_mgr_->get_server_status(server, server_status))) { + } else if (OB_FAIL(SVR_TRACER.get_server_info(server, server_info))) { LOG_WARN("fail to get server status", K(ret)); - } else if (server_status.is_temporary_offline() - || server_status.is_stopped() - || ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS == server_status.admin_status_) { + } else if (server_info.is_temporary_offline() || server_info.is_stopped()) { can_execute_rebalance = false; - LOG_INFO("cannot execute server rebalance", K(server_status)); - } else if (fabs(server_status.resource_info_.report_cpu_assigned_ - sum_load.min_cpu()) > CPU_EPSILON - || fabs(server_status.resource_info_.report_cpu_max_assigned_ - sum_load.max_cpu()) > CPU_EPSILON - || server_status.resource_info_.report_mem_assigned_ != sum_load.memory_size()) { + LOG_INFO("cannot execute server rebalance", K(server_info)); + } else if (OB_FAIL(server_mgr_->get_server_resource_info(server_info.get_server(), resource_info))) { + LOG_WARN("fail to execute get_server_resource_info", KR(ret), K(server_info.get_server())); + } else if (fabs(resource_info.report_cpu_assigned_ - sum_load.min_cpu()) > CPU_EPSILON + || fabs(resource_info.report_cpu_max_assigned_ - sum_load.max_cpu()) > 
CPU_EPSILON + || resource_info.report_mem_assigned_ != sum_load.memory_size()) { can_execute_rebalance = false; - LOG_INFO("cannot execute server rebalance", K(server_status), K(sum_load)); + LOG_INFO("cannot execute server rebalance", K(resource_info), K(sum_load)); } else {} // no more to do } for (int64_t j = 0; can_execute_rebalance && OB_SUCC(ret) && j < unit_ids.count(); ++j) { @@ -1225,15 +1221,15 @@ int ObServerBalancer::generate_available_servers( } else if (OB_UNLIKELY(zone.is_empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(zone)); - } else if (OB_UNLIKELY(NULL == server_mgr_ || NULL == zone_mgr_ || NULL == unit_mgr_)) { + } else if (OB_ISNULL(zone_mgr_) || OB_ISNULL(unit_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_mgr_ or zone_mgr_ ptr is null", K(ret), KP(server_mgr_), KP(zone_mgr_)); + LOG_WARN("zone_mgr_ or unit_mgr_ is null", K(ret), KP(unit_mgr_), KP(zone_mgr_)); } else if (OB_FAIL(zone_mgr_->get_zone(zone, zone_info))) { LOG_WARN("fail to get zone info", K(ret), K(zone)); } else if (ObZoneStatus::ACTIVE != zone_info.status_) { ret = OB_STATE_NOT_MATCH; LOG_WARN("zone is not in active", K(ret), K(zone_info)); - } else if (OB_FAIL(server_mgr_->get_servers_of_zone(zone, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_servers_of_zone(zone, server_list))) { LOG_WARN("fail to get servers of zone", K(ret), K(zone)); } else if (OB_FAIL(unit_mgr_->get_tenant_unit_servers( OB_SYS_TENANT_ID, zone, sys_unit_server_array))) { @@ -1241,19 +1237,18 @@ int ObServerBalancer::generate_available_servers( } else { available_servers.reset(); for (int64_t i = 0; OB_SUCC(ret) && i < server_list.count(); ++i) { - share::ObServerStatus server_status; - if (OB_FAIL(server_mgr_->get_server_status(server_list.at(i), server_status))) { + share::ObServerInfoInTable server_info; + if (OB_FAIL(SVR_TRACER.get_server_info(server_list.at(i), server_info))) { LOG_WARN("fail to get server status", K(ret)); - } else if 
(server_status.is_temporary_offline() - || server_status.is_stopped()) { + } else if (server_info.is_temporary_offline() || server_info.is_stopped()) { ret = OB_STATE_NOT_MATCH; - LOG_WARN("server in zone is not stable, stop balance servers", K(ret), K(server_status), - "is_temporary_offline", server_status.is_temporary_offline(), - "is_stopped", server_status.is_stopped()); + LOG_WARN("server in zone is not stable, stop balance servers", K(ret), K(server_info), + "is_temporary_offline", server_info.is_temporary_offline(), + "is_stopped", server_info.is_stopped()); } else if (excluded_sys_unit_server && has_exist_in_array(sys_unit_server_array, server_list.at(i))) { // bypass - } else if (server_status.is_active()) { + } else if (server_info.is_active()) { if (OB_FAIL(available_servers.push_back(server_list.at(i)))) { LOG_WARN("fail to push back", K(ret)); } @@ -2698,9 +2693,9 @@ int ObServerBalancer::calc_inter_ttg_weights( || NULL == info_need_amend)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), KP(info_need_amend)); - } else if (OB_UNLIKELY(NULL == server_mgr_)) { + } else if (OB_ISNULL(server_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_mgr_ ptr is null", K(ret)); + LOG_WARN("server_mgr_ is null", KR(ret), KP(server_mgr_)); } else { LoadSum load_sum; for (int64_t i = 0; @@ -2747,10 +2742,10 @@ int ObServerBalancer::calc_inter_ttg_weights( ResourceSum resource_sum; for (int64_t i = 0; OB_SUCC(ret) && i < available_servers.count(); ++i) { const common::ObAddr &server = available_servers.at(i); - share::ObServerStatus server_status; - if (OB_FAIL(server_mgr_->get_server_status(server, server_status))) { - LOG_WARN("fail to get server status", K(ret), K(server)); - } else if (OB_FAIL(resource_sum.append_resource(server_status.resource_info_))) { + share::ObServerResourceInfo resource_info; + if (OB_FAIL(server_mgr_->get_server_resource_info(server, resource_info))) { + LOG_WARN("fail to get server resource_info", KR(ret), K(server)); + 
} else if (OB_FAIL(resource_sum.append_resource(resource_info))) { LOG_WARN("fail to append resource", K(ret)); } else {} // no more to do } @@ -3352,7 +3347,7 @@ int ObServerBalancer::do_migrate_unit_task( if (!unit_migrate_stat.unit_load_.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid argument", K(ret), "unit_load", unit_migrate_stat.unit_load_); - } else if (OB_FAIL(unit_mgr_->check_can_migrate_in( + } else if (OB_FAIL(SVR_TRACER.check_server_can_migrate_in( unit_migrate_stat.arranged_pos_, can_migrate_in))) { LOG_WARN("fail to check can migrate in", K(ret)); } else if (!can_migrate_in) { @@ -3408,8 +3403,9 @@ int ObServerBalancer::do_migrate_unit_task( } else if (!unit_migrate_stat->unit_load_.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("invalid argument", K(ret), "unit_load", unit_migrate_stat->unit_load_); - } else if (OB_FAIL(unit_mgr_->check_can_migrate_in( - unit_migrate_stat->arranged_pos_, can_migrate_in))) { + } else if (OB_FAIL(SVR_TRACER.check_server_can_migrate_in( + unit_migrate_stat->arranged_pos_, + can_migrate_in))) { LOG_WARN("fail to check can migrate in", K(ret)); } else if (!can_migrate_in) { // bypass @@ -3574,9 +3570,9 @@ int ObServerBalancer::check_servers_resource_enough( if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_UNLIKELY(NULL == unit_mgr_ || NULL == server_mgr_)) { + } else if (OB_ISNULL(unit_mgr_) || OB_ISNULL(server_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit_mgr_ or server_mgr_ or unit_stat_mgr_ is null", K(ret)); + LOG_WARN("unit_mgr_ or server_mgr_ is null", KR(ret), KP(unit_mgr_), KP(server_mgr_)); } else if (OB_FAIL(unit_mgr_->get_hard_limit(hard_limit))) { LOG_WARN("fail to hard limit", K(ret)); } else if (OB_FAIL(get_server_balance_critical_disk_waterlevel(disk_waterlevel))) { @@ -3585,15 +3581,16 @@ int ObServerBalancer::check_servers_resource_enough( enough = true; for (int64_t i = 0; OB_SUCC(ret) && enough && i < server_load_sums.count(); ++i) { ObArray 
*unit_loads = NULL; - share::ObServerStatus server_status; + share::ObServerResourceInfo server_resource_info; const common::ObAddr &server = server_load_sums.at(i).server_; LoadSum load_sum = server_load_sums.at(i).load_sum_; int64_t disk_in_use = server_load_sums.at(i).disk_in_use_; ServerDiskStatistic disk_statistic; if (OB_FAIL(zone_disk_statistic_.get_server_disk_statistic(server, disk_statistic))) { LOG_WARN("fail to get disk statistic", K(ret), K(server)); - } else if (OB_FAIL(server_mgr_->get_server_status(server, server_status))) { - LOG_WARN("fail to get server status", K(ret)); + } else if (OB_FAIL(server_mgr_->get_server_resource_info(server, server_resource_info))) { + // **TODO (linqiucen.lqc): temp.solution + LOG_WARN("fail to get server resource info", KR(ret), K(server)); } else if (OB_FAIL(unit_mgr_->get_loads_by_server(server, unit_loads))) { if (OB_ENTRY_NOT_EXIST != ret) { LOG_WARN("fail to get loads by server", K(ret)); @@ -3617,13 +3614,13 @@ int ObServerBalancer::check_servers_resource_enough( } if (OB_SUCC(ret)) { if (load_sum.load_sum_.max_cpu() - > server_status.resource_info_.cpu_ * hard_limit + > server_resource_info.cpu_ * hard_limit || load_sum.load_sum_.min_cpu() - > server_status.resource_info_.cpu_ + > server_resource_info.cpu_ || static_cast(load_sum.load_sum_.memory_size()) - > static_cast(server_status.resource_info_.mem_total_) + > static_cast(server_resource_info.mem_total_) || static_cast(load_sum.load_sum_.log_disk_size()) - > static_cast(server_status.resource_info_.log_disk_total_) + > static_cast(server_resource_info.log_disk_total_) || static_cast(disk_in_use + disk_statistic.disk_in_use_) > static_cast(disk_statistic.disk_total_) * disk_waterlevel) { enough = false; @@ -4433,25 +4430,22 @@ int ObServerBalancer::generate_complete_server_loads( || zone.is_empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), KP(resource_weights), K(weights_count)); - } else if (OB_UNLIKELY(NULL == unit_mgr_ || NULL 
== server_mgr_)) { + } else if (OB_ISNULL(unit_mgr_) || OB_ISNULL(server_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit_mgr_ or server_mgr_ ptr is null", K(ret), KP(unit_mgr_), KP(server_mgr_)); - } else if (OB_FAIL(server_mgr_->get_servers_of_zone(zone, zone_servers))) { + LOG_WARN("unit_mgr_ or server_mgr_ is null", K(ret), KP(unit_mgr_), KP(server_mgr_)); + } else if (OB_FAIL(SVR_TRACER.get_servers_of_zone(zone, zone_servers))) { LOG_WARN("fail to get servers of zone", K(ret), K(zone)); } else { for (int64_t i = 0; OB_SUCC(ret) && i < zone_servers.count(); ++i) { const common::ObAddr &server = zone_servers.at(i); ServerTotalLoad server_load; server_load.server_ = server; - share::ObServerStatus server_status; + share::ObServerResourceInfo server_resource_info; ObArray *unit_loads = NULL; LoadSum load_sum; - ResourceSum resource_sum; server_load.wild_server_ = !has_exist_in_array(available_servers, server); - if (OB_FAIL(server_mgr_->get_server_status(server, server_status))) { + if (OB_FAIL(server_mgr_->get_server_resource_info(server, server_resource_info))) { LOG_WARN("fail to get server status", K(ret), K(server)); - } else if (OB_FAIL(resource_sum.append_resource(server_status.resource_info_))) { - LOG_WARN("fail to append resource", K(ret)); } else if (OB_FAIL(unit_mgr_->get_loads_by_server(server, unit_loads))) { if (OB_ENTRY_NOT_EXIST != ret) { LOG_WARN("get loads by server failed", K(ret), K(server)); @@ -4469,7 +4463,7 @@ int ObServerBalancer::generate_complete_server_loads( server_load.resource_weights_[i] = resource_weights[i]; } server_load.load_sum_ = load_sum; - server_load.resource_info_ = server_status.resource_info_; + server_load.resource_info_ = server_resource_info; if (OB_FAIL(server_load.update_load_value())) { LOG_WARN("fail to update load value", K(ret)); } else if (OB_FAIL(server_loads.push_back(server_load))) { @@ -5585,10 +5579,10 @@ int ObServerBalancer::calc_global_balance_resource_weights( || RES_MAX != weights_count)) { ret = 
OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(zone)); - } else if (OB_UNLIKELY(NULL == unit_mgr_ || NULL == server_mgr_)) { + } else if (OB_ISNULL(unit_mgr_) || OB_ISNULL(server_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit_mgr_ or server_mgr_ is null", K(ret)); - } else if (OB_FAIL(server_mgr_->get_servers_of_zone(zone, zone_servers))) { + LOG_WARN("unit_mgr_ or server_mgr_ is null", KR(ret), KP(unit_mgr_), KP(server_mgr_)); + } else if (OB_FAIL(SVR_TRACER.get_servers_of_zone(zone, zone_servers))) { LOG_WARN("fail to get zone servers", K(ret), K(zone)); } else { LoadSum load_sum; @@ -5610,10 +5604,10 @@ int ObServerBalancer::calc_global_balance_resource_weights( ResourceSum resource_sum; for (int64_t i = 0; OB_SUCC(ret) && i < available_servers.count(); ++i) { const common::ObAddr &server = available_servers.at(i); - share::ObServerStatus server_status; - if (OB_FAIL(server_mgr_->get_server_status(server, server_status))) { - LOG_WARN("fail to get server status", K(ret), K(server)); - } else if (OB_FAIL(resource_sum.append_resource(server_status.resource_info_))) { + share::ObServerResourceInfo resource_info; + if (OB_FAIL(server_mgr_->get_server_resource_info(server, resource_info))) { + LOG_WARN("fail to get resource_info", KR(ret), K(server)); + } else if (OB_FAIL(resource_sum.append_resource(resource_info))) { LOG_WARN("fail to append resource", K(ret)); } else {} // no more to do } @@ -6262,30 +6256,31 @@ int ObServerBalancer::generate_server_load( if (OB_UNLIKELY(!inited_)) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_UNLIKELY(NULL == server_mgr_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_mgr_ ptr is null", K(ret), KP(server_mgr_)); } else if (available_servers.count() <= 0) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(available_servers)); + } else if (OB_ISNULL(server_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("server_mgr_ is null", KR(ret), KP(server_mgr_)); } else { // Place the 
generated unitgroup load into the corresponding server load server_loads.reset(); ServerLoad server_load; - share::ObServerStatus server_status; + share::ObServerResourceInfo resource_info; share::ObServerResourceInfo intra_ttg_resource_info; // Pre-fill the server first, and fill in the server resource info for (int64_t i = 0; OB_SUCC(ret) && i < available_servers.count(); ++i) { server_load.reset(); - server_status.reset(); + resource_info.reset(); server_load.server_ = available_servers.at(i); - if (OB_FAIL(server_mgr_->get_server_status(server_load.server_, server_status))) { - LOG_WARN("fail to get server status", K(ret)); + if (OB_FAIL(server_mgr_->get_server_resource_info(server_load.server_, resource_info))) { + LOG_WARN("fail to get server status", KR(ret), K(server_load.server_)); } else if (OB_FAIL(try_regulate_intra_ttg_resource_info( - server_status.resource_info_, intra_ttg_resource_info))) { + resource_info, + intra_ttg_resource_info))) { LOG_WARN("fail to try regulate intra resource info", K(ret)); } else { - server_load.resource_info_ = server_status.resource_info_; + server_load.resource_info_ = resource_info; if (OB_FAIL(server_loads.push_back(server_load))) { LOG_WARN("fail to push back", K(ret)); } else {} // no more to do @@ -7866,11 +7861,11 @@ int ObServerBalancer::generate_zone_server_disk_statistic( } else if (OB_UNLIKELY(zone.is_empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(ret), K(zone)); - } else if (OB_UNLIKELY(NULL == server_mgr_)) { + } else if (OB_ISNULL(server_mgr_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_mgr_ ptr is null", K(ret)); - } else if (OB_FAIL(server_mgr_->get_servers_of_zone(zone, server_list))) { - LOG_WARN("fail to get servers of zone", K(ret)); + LOG_WARN("server_mgr_ ptr is null", KR(ret), KP(server_mgr_)); + } else if (OB_FAIL(SVR_TRACER.get_servers_of_zone(zone, server_list))) { + LOG_WARN("fail to get servers of zone", KR(ret), K(zone)); } else if 
(OB_FAIL(get_server_balance_critical_disk_waterlevel(disk_waterlevel))) { LOG_WARN("fail to get server balance disk water level", K(ret)); } else { @@ -7878,36 +7873,36 @@ int ObServerBalancer::generate_zone_server_disk_statistic( zone_disk_statistic_.zone_ = zone; for (int64_t i = 0; OB_SUCC(ret) && i < server_list.count(); ++i) { const common::ObAddr &server = server_list.at(i); - share::ObServerStatus server_status; + share::ObServerResourceInfo server_resource_info; + share::ObServerInfoInTable server_info; ServerDiskStatistic disk_statistic; - if (OB_FAIL(server_mgr_->get_server_status(server, server_status))) { - LOG_WARN("fail to get server status", K(ret)); - } else if (server_status.is_temporary_offline() - || server_status.is_stopped()) { + if (OB_FAIL(SVR_TRACER.get_server_info(server, server_info))) { + LOG_WARN("fail to get server info", KR(ret), K(server)); + } else if (server_info.is_temporary_offline() || server_info.is_stopped()) { ret = OB_STATE_NOT_MATCH; LOG_WARN("server is not stable, stop balance servers in this zone", - K(ret), K(server), K(zone), - "is_temporary_offline", server_status.is_temporary_offline(), - "is_stopped", server_status.is_stopped()); - } else if (server_status.is_active()) { + KR(ret), K(server), K(zone), + "is_temporary_offline", server_info.is_temporary_offline(), + "is_stopped", server_info.is_stopped()); + } else if (OB_FAIL(server_mgr_->get_server_resource_info(server, server_resource_info))) { + LOG_WARN("fail to get server resource info", KR(ret), K(server)); + } else if (server_info.is_active()) { disk_statistic.server_ = server; disk_statistic.wild_server_ = false; - disk_statistic.disk_in_use_ = server_status.resource_info_.disk_in_use_; - disk_statistic.disk_total_ = server_status.resource_info_.disk_total_; + disk_statistic.disk_in_use_ = server_resource_info.disk_in_use_; + disk_statistic.disk_total_ = server_resource_info.disk_total_; if (static_cast(disk_statistic.disk_in_use_) > disk_waterlevel * 
static_cast(disk_statistic.disk_total_)) { zone_disk_statistic_.over_disk_waterlevel_ = true; } - } else if (ObServerStatus::OB_SERVER_ADMIN_DELETING == server_status.admin_status_ - || ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS == server_status.admin_status_ - || server_status.is_permanent_offline()) { + } else if (server_info.is_deleting() || server_info.is_permanent_offline()) { disk_statistic.server_ = server; disk_statistic.wild_server_ = true; - disk_statistic.disk_in_use_ = server_status.resource_info_.disk_in_use_; - disk_statistic.disk_total_ = server_status.resource_info_.disk_total_; + disk_statistic.disk_in_use_ = server_resource_info.disk_in_use_; + disk_statistic.disk_total_ = server_resource_info.disk_total_; } else { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unknow server status", K(ret), K(server_status)); + LOG_WARN("unknow server_info", K(ret), K(server_info)); } if (OB_FAIL(ret)) { } else if (OB_FAIL(zone_disk_statistic_.append(disk_statistic))) { diff --git a/src/rootserver/ob_server_balancer.h b/src/rootserver/ob_server_balancer.h index 2d02f7b13..dac8cf693 100644 --- a/src/rootserver/ob_server_balancer.h +++ b/src/rootserver/ob_server_balancer.h @@ -268,6 +268,7 @@ public: ObZoneManager &zone_mgr, ObServerManager &server_mgr, ObUnitStatManager &unit_stat_mgr); + int build_active_servers_resource_info(); // 1. migrate units to balance the load // 2. 
migrate units from offline servers int balance_servers(); @@ -285,19 +286,9 @@ public: common::ObIArray &tenant_groups, bool &legal); private: - int check_is_ofs_zone_zombie_unit( - const share::ObUnitInfo &unit_info, - bool &is_ofs_zone_zombie_unit); bool check_inner_stat() const { return inited_; } // distribute for server online/permanent_offline/migrate_in_blocked int distribute_for_server_status_change(); - int check_if_ofs_rs_without_sys_unit( - const share::ObServerStatus &status, - const share::ObUnitInfo &unit_info, - bool &ofs_rs_without_sys_unit); - int distribute_for_ofs_sys_unit( - const share::ObServerStatus &status, - const share::ObUnitInfo &unit_info); int check_has_unit_in_migration( const common::ObIArray *unit_load_array, bool &has_unit_in_migration); @@ -307,10 +298,11 @@ private: int distribute_by_pool(share::ObResourcePool *pool); int distribute_for_migrate_in_blocked(const share::ObUnitInfo &unit_info); int distribute_zone_unit(const ObUnitManager::ZoneUnit &zone_unit); - int distribute_for_active(const share::ObServerStatus &status, - const share::ObUnitInfo &unit_info); + int distribute_for_active( + const share::ObServerInfoInTable &server_info, + const share::ObUnitInfo &unit_info); int distribute_for_permanent_offline_or_delete( - const share::ObServerStatus &status, + const share::ObServerInfoInTable &server_info, const share::ObUnitInfo &unit_info); int distribute_for_standalone_sys_unit(); @@ -330,6 +322,10 @@ private: const common::ObAddr &dst); int try_cancel_migrate_unit(const share::ObUnit &unit, bool &is_canceled); + int get_active_servers_info_and_resource_info_of_zone( + const ObZone &zone, + ObIArray &servers_info, + ObIArray &server_resources_info); // the new version server balance private: @@ -1421,7 +1417,7 @@ protected: ObUnitStatManager *unit_stat_mgr_; CountBalanceStrategy count_balance_strategy_; InnerTenantGroupBalanceStrategy &inner_ttg_balance_strategy_; - // Each time the unit balance between servers is executed, 
+ // Each time the unit balance between servers is executed, // the disk information of each server in the zone is calculated ZoneServerDiskStatistic zone_disk_statistic_; diff --git a/src/rootserver/ob_server_manager.cpp b/src/rootserver/ob_server_manager.cpp index 50206739d..452e4fcdd 100644 --- a/src/rootserver/ob_server_manager.cpp +++ b/src/rootserver/ob_server_manager.cpp @@ -30,6 +30,7 @@ #include "observer/ob_server.h" #include "rootserver/ob_root_service.h" #include "storage/ob_file_system_router.h" +#include "rootserver/ob_heartbeat_service.h" namespace oceanbase { using namespace common; @@ -159,106 +160,47 @@ int ObServerManager::add_server(const common::ObAddr &server, const ObZone &zone if (OB_SUCC(ret)) { ROOTSERVICE_EVENT_ADD("server", "add_server", K(server)); LOG_INFO("add new server", K(server), K(zone)); - int tmp = server_change_callback_->on_server_change(); - if (OB_SUCCESS != tmp) { - LOG_WARN("fail to callback on server change", K(ret)); + int tmp_ret = server_change_callback_->on_server_change(); + if (OB_SUCCESS != tmp_ret) { + LOG_WARN("fail to callback on server change", KR(ret), K(tmp_ret)); } else { LOG_WARN("callback on add server success"); } + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh all server tracer", KR(ret), KR(tmp_ret)); + } } return ret; } + int ObServerManager::try_delete_server_working_dir( const common::ObZone &zone, const common::ObAddr &server, const int64_t svr_seq) { int ret = OB_SUCCESS; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else { - // not shared storage, no need to delete server working dir, only to clean disk usage table. - ObDiskUsageTableOperator disk_usage_table_operator; - ObSEArray tenant_ids; - char ip_str[MAX_IP_ADDR_LENGTH] = { '\0' }; + // not shared storage, no need to delete server working dir, only to clean disk usage table. 
+ ObDiskUsageTableOperator disk_usage_table_operator; + ObSEArray tenant_ids; + char ip_str[MAX_IP_ADDR_LENGTH] = { '\0' }; - if (OB_FAIL(disk_usage_table_operator.init(OBSERVER.get_mysql_proxy()))) { - LOG_WARN("fail to init disk usage table operator", K(ret)); - } else if (OB_UNLIKELY(!server.ip_to_string(ip_str, MAX_IP_ADDR_LENGTH))) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("fail to get server ip cstring", K(ret), K(server), K(svr_seq)); - } else if (OB_FAIL(disk_usage_table_operator.get_all_tenant_ids(ip_str, - server.get_port(), svr_seq, tenant_ids))) { - LOG_WARN("fail to get all tenant ids", K(ret), K(server), K(svr_seq)); - } else { - for (int64_t i = 0; OB_SUCC(ret) && i < tenant_ids.count(); i++) { - const uint64_t tenant_id = tenant_ids.at(i); - if (OB_FAIL(disk_usage_table_operator.delete_tenant_all(tenant_id, - ip_str, - server.get_port(), - svr_seq))) { - STORAGE_LOG(WARN, "failed to delete tenant all", K(ret), K(tenant_id), K(server)); - } - } - } - } - return ret; -} - -int ObServerManager::finish_server_recovery( - const common::ObAddr &server) -{ - int ret = OB_SUCCESS; - // avoid maintain operation run concurrently - SpinWLockGuard guard(maintaince_lock_); - ObServerStatus *server_status = nullptr; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (OB_UNLIKELY(!server.is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), K(server)); + if (OB_FAIL(disk_usage_table_operator.init(OBSERVER.get_mysql_proxy()))) { + LOG_WARN("fail to init disk usage table operator", K(ret)); + } else if (OB_UNLIKELY(!server.ip_to_string(ip_str, MAX_IP_ADDR_LENGTH))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("fail to get server ip cstring", K(ret), K(server), K(svr_seq)); + } else if (OB_FAIL(disk_usage_table_operator.get_all_tenant_ids(ip_str, + server.get_port(), svr_seq, tenant_ids))) { + LOG_WARN("fail to get all tenant ids", K(ret), K(server), K(svr_seq)); } else { - { - SpinRLockGuard 
guard(server_status_rwlock_); - if (OB_FAIL(find(server, server_status))) { - LOG_WARN("fail to find server status", K(ret), K(server)); - } else if (nullptr == server_status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server status is null", K(ret), K(server)); - } else if (ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS != server_status->admin_status_) { - ret = OB_STATE_NOT_MATCH; - LOG_WARN("server admin status not match", K(ret), - "admin_status", server_status->admin_status_); - } - } - if (OB_SUCC(ret)) { - common::ObMySQLTransaction trans; - if (OB_FAIL(trans.start(&st_operator_.get_proxy(), OB_SYS_TENANT_ID))) { - LOG_WARN("fail to start trans", K(ret)); - } else if (OB_FAIL(st_operator_.remove(server, trans))) { - LOG_WARN("fail to remove", K(ret), K(server)); - } else if (OB_FAIL(update_admin_status( - server, server_status->admin_status_, true/* remove */))) { - LOG_WARN("fail to update admin status", K(ret)); - } - if (trans.is_started()) { - int tmp_ret = OB_SUCCESS; - if (OB_SUCCESS != (tmp_ret = trans.end(OB_SUCC(ret)))) { - LOG_WARN("fail to end trans", K(tmp_ret)); - ret = OB_SUCC(ret) ? tmp_ret : ret; - } - } - if (OB_SUCC(ret)) { - // delete associated records from __all_clog_history_info_v2 about this server - LOG_INFO("finish server recovery", K(server)); - if (OB_SUCCESS != server_change_callback_->on_server_change()) { - LOG_WARN("fail to callback on server change", K(server)); - } else { - LOG_INFO("callback on server change succeed", K(server)); - } + for (int64_t i = 0; OB_SUCC(ret) && i < tenant_ids.count(); i++) { + const uint64_t tenant_id = tenant_ids.at(i); + if (OB_FAIL(disk_usage_table_operator.delete_tenant_all(tenant_id, + ip_str, + server.get_port(), + svr_seq))) { + STORAGE_LOG(WARN, "failed to delete tenant all", K(ret), K(tenant_id), K(server)); } } } @@ -340,6 +282,9 @@ int ObServerManager::delete_server(const ObIArray &servers, const ObZone LOG_WARN("trans end failed", K(tmp_ret), K(commit)); ret = OB_SUCC(ret) ? 
tmp_ret : ret; } + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh all server tracer", KR(ret), KR(tmp_ret)); + } } } return ret; @@ -448,12 +393,15 @@ int ObServerManager::end_delete_server(const ObAddr &server, const ObZone &zone, ROOTSERVICE_EVENT_ADD("server", "cancel_delete_server", K(server)); } LOG_INFO("end delete server", K(server), K(commit)); - int tmp = server_change_callback_->on_server_change(); - if (OB_SUCCESS != tmp) { - LOG_WARN("fail to callback on server change", K(ret)); + int tmp_ret = server_change_callback_->on_server_change(); + if (OB_SUCCESS != tmp_ret) { + LOG_WARN("fail to callback on server change", KR(ret), KR(tmp_ret)); } else { LOG_WARN("callback on server change success", K(server)); } + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh all server tracer", KR(ret), KR(tmp_ret)); + } } } return ret; @@ -481,73 +429,6 @@ int ObServerManager::get_server_id( return ret; } -// only add to memory and only used by bootstrap -int ObServerManager::add_server_list(const ObServerInfoList &server_list, uint64_t &server_id) -{ - int ret = OB_SUCCESS; - SpinWLockGuard guard(server_status_rwlock_); - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (server_list.count() <= 0) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("server_list is empty", K(server_list), K(ret)); - } else { - const int64_t now = ObTimeUtility::current_time(); - for (int64_t i = 0; OB_SUCC(ret) && i < server_list.count(); ++i) { - server_id = OB_INIT_SERVER_ID; - ObServerStatus *server_status = NULL; - if (OB_SUCC(find(server_list.at(i).server_, server_status))) { - if (NULL == server_status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_status is null", "server_status ptr", OB_P(server_status), K(ret)); - } else if (server_status->id_ > server_id) { - server_id = server_status->id_; - } - } else if (OB_ENTRY_NOT_EXIST != ret) { - LOG_WARN("find failed", "server", server_list.at(i).server_, K(ret)); - } 
else { - ret = OB_SUCCESS; - bool server_id_used = false; - while (OB_SUCC(ret)) { - if (OB_FAIL(check_server_id_used(server_id, server_id_used))) { - LOG_WARN("check server_id_used failed", K(server_id), K(ret)); - } else if (server_id_used) { - ++server_id; - } else { - break; - } - } - if (OB_SUCC(ret)) { - ObServerStatus new_server_status; - new_server_status.id_ = server_id; - new_server_status.server_ = server_list.at(i).server_; - new_server_status.zone_ = server_list.at(i).zone_; - new_server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_NORMAL; - new_server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED; - new_server_status.lease_expire_time_ = now + ObLeaseRequest::SERVICE_LEASE; - new_server_status.last_hb_time_ = now - config_->lease_time; - new_server_status.with_partition_ = true; - if (OB_FAIL(server_statuses_.push_back(new_server_status))) { - BOOTSTRAP_LOG(WARN, "fail to push back server status", K(ret), K(new_server_status)); - } else { - BOOTSTRAP_LOG(INFO, "add server list success", K(new_server_status)); - ROOTSERVICE_EVENT_ADD("server", "add_server", - "server", new_server_status.server_); - int tmp = server_change_callback_->on_server_change(); - if (OB_SUCCESS != tmp) { - LOG_WARN("fail to callback on server change", K(ret)); - } else { - LOG_WARN("callback on add server success"); - } - } - } - } - } - } - return ret; -} - int ObServerManager::start_server_list(const ObServerList &server_list, const ObZone &zone) { int ret = OB_SUCCESS; @@ -658,30 +539,6 @@ int ObServerManager::is_server_stopped(const ObAddr &server, bool &is_stopped) c return ret; } -int ObServerManager::get_server_leader_cnt(const ObAddr &server, int64_t &leader_cnt) const -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not init", K(ret)); - } else if (!server.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), K(server)); - } else { - SpinRLockGuard 
guard(server_status_rwlock_); - const ObServerStatus *status_ptr = NULL; - if (OB_FAIL(find(server, status_ptr))) { - LOG_WARN("find failed", K(ret), K(server)); - } else if (NULL == status_ptr) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status ptr is null", K(ret), KP(status_ptr)); - } else { - leader_cnt = status_ptr->leader_cnt_; - } - } - return ret; -} - int ObServerManager::stop_server(const ObAddr &server, const ObZone &zone) { int ret = OB_SUCCESS; @@ -747,75 +604,12 @@ int ObServerManager::start_or_stop_server(const ObAddr &server, const ObZone &zo SpinWLockGuard guard(server_status_rwlock_); server_status->stop_time_ = stop_time; } - } - - return ret; -} - -int ObServerManager::expend_server_lease( - const common::ObAddr &server, - const int64_t new_lease_end) -{ - int ret = OB_SUCCESS; - SpinWLockGuard guard(server_status_rwlock_); - ObServerStatus *status_ptr = NULL; - if (OB_UNLIKELY(!server.is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), K(server)); - } else if (OB_FAIL(find(server, status_ptr))) { - LOG_WARN("fail to find server", K(ret)); - } else if (OB_UNLIKELY(NULL == status_ptr)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status ptr is null", K(ret), K(server)); - } else if (status_ptr->lease_expire_time_ <= new_lease_end) { - status_ptr->lease_expire_time_ = new_lease_end; - } else {} // no need to update lease_expire_time - return ret; -} - -void ObServerManager::clear_in_recovery_server_takenover_by_rs( - const common::ObAddr &server) -{ - SpinWLockGuard guard(server_status_rwlock_); - ObServerStatus *status_ptr = NULL; - int tmp_ret = find(server, status_ptr); - if (OB_SUCCESS == tmp_ret) { - status_ptr->in_recovery_for_takenover_by_rs_ = false;; - } else if (OB_ENTRY_NOT_EXIST != tmp_ret) { - LOG_WARN_RET(tmp_ret, "find failed", K(server), K(tmp_ret)); - } else { - LOG_WARN_RET(tmp_ret, "fail to find server", K(server), K(tmp_ret)); - } -} - -int ObServerManager::try_modify_recovery_server_takenover_by_rs( - 
const common::ObAddr &server, - const common::ObZone &zone) -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else if (OB_UNLIKELY(!server.is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ret), K(server)); - } else { - SpinWLockGuard guard(server_status_rwlock_); - ObServerStatus *status_ptr = nullptr; - if (OB_FAIL(find(server, status_ptr))) { - LOG_WARN("fail to find server status", K(ret), K(server)); - } else if (nullptr == status_ptr) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status ptr is null", K(ret), KP(status_ptr)); - } else { - status_ptr->hb_status_ = ObServerStatus::OB_HEARTBEAT_ALIVE; - status_ptr->last_hb_time_ = common::ObTimeUtility::current_time(); - status_ptr->admin_status_ = share::ObServerStatus::OB_SERVER_ADMIN_NORMAL; - if (OB_FAIL(status_change_callback_->on_server_status_change(server))) { - LOG_WARN("fail to submit server status change task", K(ret), K(server)); - } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh all server tracer", KR(ret), KR(tmp_ret)); } } + return ret; } @@ -862,53 +656,105 @@ int ObServerManager::receive_hb( } else if (NULL == status_ptr) { ret = OB_ERR_UNEXPECTED; LOG_WARN("status_ptr is null", "status_ptr", OB_P(status_ptr), K(ret)); - } else if (ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS == status_ptr->admin_status_) { - ret = OB_STATE_NOT_MATCH; - LOG_WARN("server taken over by rs, state not match", K(ret), "server_status", *status_ptr); - } else if (!status_ptr->zone_.is_empty() && status_ptr->zone_ != lease_request.zone_) { - ret = OB_SERVER_ZONE_NOT_MATCH; - LOG_WARN("server zone not match", "zone", status_ptr->zone_, - "lease zone", lease_request.zone_, K(ret)); - } else { - // if force_stop_hb is true then won't extend server's last_hb_time_ - const int64_t now = status_ptr->force_stop_hb_ ? 
- status_ptr->last_hb_time_ : ::oceanbase::common::ObTimeUtility::current_time(); - ObServerStatus::HeartBeatStatus old_hb_status = status_ptr->hb_status_; + } else if (ObHeartbeatService::is_service_enabled()) { // the new logic + if (status_ptr->resource_info_ != lease_request.resource_info_) { + LOG_INFO("server resource changed", "old_resource_info", status_ptr->resource_info_, + "new_resource_info", lease_request.resource_info_); + status_ptr->resource_info_ = lease_request.resource_info_; + } + status_ptr->last_hb_time_ = ::oceanbase::common::ObTimeUtility::current_time(); server_id = status_ptr->id_; - if (status_ptr->with_rootserver_ != with_rootserver) { - LOG_INFO("server change with_rootserver", "old with_rootserver", - status_ptr->with_rootserver_, "new with_rootserver", with_rootserver); - if (with_rootserver) { - if (OB_FAIL(reset_existing_rootserver())) { - LOG_WARN("reset_existing_rootserver failed", K(ret)); + } else { + if (!status_ptr->zone_.is_empty() && status_ptr->zone_ != lease_request.zone_) { + ret = OB_SERVER_ZONE_NOT_MATCH; + LOG_WARN("server zone not match", "zone", status_ptr->zone_, + "lease zone", lease_request.zone_, K(ret)); + } else { + // if force_stop_hb is true then won't extend server's last_hb_time_ + const int64_t now = status_ptr->force_stop_hb_ ? 
+ status_ptr->last_hb_time_ : ::oceanbase::common::ObTimeUtility::current_time(); + ObServerStatus::HeartBeatStatus old_hb_status = status_ptr->hb_status_; + server_id = status_ptr->id_; + if (status_ptr->with_rootserver_ != with_rootserver) { + LOG_INFO("server change with_rootserver", "old with_rootserver", + status_ptr->with_rootserver_, "new with_rootserver", with_rootserver); + if (with_rootserver) { + if (OB_FAIL(reset_existing_rootserver())) { + LOG_WARN("reset_existing_rootserver failed", K(ret)); + } } - } - if (OB_FAIL(ret)) { - } else if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { - LOG_WARN("set server status failed", K(lease_request), - K(now), K(with_rootserver), K(ret)); - } else if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { - LOG_WARN("commit server change with_rootserver task failed", "server", - status_ptr->server_, K(ret)); - } - } else if (ObServerStatus::OB_HEARTBEAT_ALIVE != status_ptr->hb_status_ - && !status_ptr->force_stop_hb_) { - const char *hb_status_str = NULL; - // for logging, ignore result and do not check NULL string. 
- int tmp_ret = ObServerStatus::heartbeat_status_str( - status_ptr->hb_status_, hb_status_str); - if (OB_SUCCESS != tmp_ret) { - LOG_WARN("heartbeat status to string failed", K(tmp_ret), - "hb_status", status_ptr->hb_status_); - } else { - if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { - LOG_WARN("set server status failed", K(lease_request), K(now), K(with_rootserver), K(ret)); + if (OB_FAIL(ret)) { + } else if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { + LOG_WARN("set server status failed", K(lease_request), + K(now), K(with_rootserver), K(ret)); + } else if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { + LOG_WARN("commit server change with_rootserver task failed", "server", + status_ptr->server_, K(ret)); + } + } else if (ObServerStatus::OB_HEARTBEAT_ALIVE != status_ptr->hb_status_ + && !status_ptr->force_stop_hb_) { + const char *hb_status_str = NULL; + // for logging, ignore result and do not check NULL string. 
+ int tmp_ret = ObServerStatus::heartbeat_status_str( + status_ptr->hb_status_, hb_status_str); + if (OB_SUCCESS != tmp_ret) { + LOG_WARN("heartbeat status to string failed", K(tmp_ret), + "hb_status", status_ptr->hb_status_); + } else { + if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { + LOG_WARN("set server status failed", K(lease_request), K(now), K(with_rootserver), K(ret)); + } else { + LOG_INFO("server alive again", "server", status_ptr->server_, + "last hb status", hb_status_str, + "last hb time", status_ptr->last_hb_time_); + status_ptr->register_time_ = now; + //commit server online task + // ignore wakeup balancer and wakeup daily merger failed + int temp_ret = OB_SUCCESS; + if (OB_SUCCESS != (temp_ret = status_change_callback_->wakeup_balancer())) { + LOG_WARN("wakeup_balancer failed", K(temp_ret)); + } else if (OB_SUCCESS != (temp_ret = status_change_callback_->wakeup_daily_merger())) { + LOG_WARN("wakeup_daily_merger failed", K(temp_ret)); + } + if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { + LOG_WARN("commit new server online task failed", "server", + status_ptr->server_, K(ret)); + } else { + ROOTSERVICE_EVENT_ADD("server", "online", "server", status_ptr->server_); + } + } + } + } else if (0 != MEMCMP(status_ptr->build_version_, lease_request.build_version_, + OB_SERVER_VERSION_LENGTH)) { + LOG_INFO("server change build_version", "old svn version", status_ptr->build_version_, + "new svn version", lease_request.build_version_); + //commit server svn version change + if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { + LOG_WARN("set server status failed", K(lease_request), + K(now), K(with_rootserver), K(ret)); + } else if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { + LOG_WARN("commit server change build_version task failed", "server", + status_ptr->server_, K(ret)); + } + } else if 
(status_ptr->ssl_key_expired_time_ != lease_request.ssl_key_expired_time_) { + LOG_INFO("server ssl_key_expired_time changed", "old ssl_key_expired_time", status_ptr->ssl_key_expired_time_, + "new ssl_key_expired_time", lease_request.ssl_key_expired_time_); + //commit server svn version change + if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { + LOG_WARN("set server status failed", K(lease_request), + K(now), K(with_rootserver), K(ret)); + } else if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { + LOG_WARN("commit server change build_version task failed", "server", + status_ptr->server_, K(ret)); + } + } else if (status_ptr->start_service_time_ != lease_request.start_service_time_) { + LOG_INFO("server change start_service_time", "old start_service_time", + status_ptr->start_service_time_, "new start_service_time", + lease_request.start_service_time_); + if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { + LOG_WARN("set server status failed", K(lease_request), + K(now), K(with_rootserver), K(ret)); } else { - LOG_INFO("server alive again", "server", status_ptr->server_, - "last hb status", hb_status_str, - "last hb time", status_ptr->last_hb_time_); - status_ptr->register_time_ = now; - //commit server online task // ignore wakeup balancer and wakeup daily merger failed int temp_ret = OB_SUCCESS; if (OB_SUCCESS != (temp_ret = status_change_callback_->wakeup_balancer())) { @@ -917,87 +763,42 @@ int ObServerManager::receive_hb( LOG_WARN("wakeup_daily_merger failed", K(temp_ret)); } if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { - LOG_WARN("commit new server online task failed", "server", - status_ptr->server_, K(ret)); + LOG_WARN("commit server change start_service_time task failed", "server", + status_ptr->server_, K(ret)); } else { - ROOTSERVICE_EVENT_ADD("server", "online", "server", status_ptr->server_); + if (0 == 
status_ptr->start_service_time_ && 0 != lease_request.start_service_time_) { + ROOTSERVICE_EVENT_ADD("server", "start_service", + "server", status_ptr->server_); + } } } - } - } else if (0 != MEMCMP(status_ptr->build_version_, lease_request.build_version_, - OB_SERVER_VERSION_LENGTH)) { - LOG_INFO("server change build_version", "old svn version", status_ptr->build_version_, - "new svn version", lease_request.build_version_); - //commit server svn version change - if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { - LOG_WARN("set server status failed", K(lease_request), - K(now), K(with_rootserver), K(ret)); - } else if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { - LOG_WARN("commit server change build_version task failed", "server", - status_ptr->server_, K(ret)); - } - } else if (status_ptr->ssl_key_expired_time_ != lease_request.ssl_key_expired_time_) { - LOG_INFO("server ssl_key_expired_time changed", "old ssl_key_expired_time", status_ptr->ssl_key_expired_time_, - "new ssl_key_expired_time", lease_request.ssl_key_expired_time_); - //commit server svn version change - if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { - LOG_WARN("set server status failed", K(lease_request), - K(now), K(with_rootserver), K(ret)); - } else if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { - LOG_WARN("commit server change build_version task failed", "server", - status_ptr->server_, K(ret)); - } - } else if (status_ptr->start_service_time_ != lease_request.start_service_time_) { - LOG_INFO("server change start_service_time", "old start_service_time", - status_ptr->start_service_time_, "new start_service_time", - lease_request.start_service_time_); - if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { - LOG_WARN("set server status failed", K(lease_request), - K(now), K(with_rootserver), K(ret)); - } else { - // 
ignore wakeup balancer and wakeup daily merger failed - int temp_ret = OB_SUCCESS; - if (OB_SUCCESS != (temp_ret = status_change_callback_->wakeup_balancer())) { - LOG_WARN("wakeup_balancer failed", K(temp_ret)); - } else if (OB_SUCCESS != (temp_ret = status_change_callback_->wakeup_daily_merger())) { - LOG_WARN("wakeup_daily_merger failed", K(temp_ret)); - } - if (OB_FAIL(status_change_callback_->on_server_status_change(status_ptr->server_))) { - LOG_WARN("commit server change start_service_time task failed", "server", - status_ptr->server_, K(ret)); + } else if (status_ptr->server_report_status_ != lease_request.server_status_) { + LOG_INFO("server report status change", + "old report status", status_ptr->server_report_status_, + "new report status", lease_request.server_status_); + if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { + LOG_WARN("set server status failed", K(ret), + K(lease_request), K(now), K(with_rootserver)); + } else if (OB_FAIL(process_report_status_change(lease_request, *status_ptr))) { + LOG_WARN("fail to proc report status change", K(ret)); } else { - if (0 == status_ptr->start_service_time_ && 0 != lease_request.start_service_time_) { - ROOTSERVICE_EVENT_ADD("server", "start_service", - "server", status_ptr->server_); - } + status_ptr->server_report_status_ = lease_request.server_status_; + } + } else if (status_ptr->resource_info_ != lease_request.resource_info_) { + LOG_INFO("server resource changed", "old_resource_info", status_ptr->resource_info_, + "new_resource_info", lease_request.resource_info_); + if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { + LOG_WARN("set server status failed", K(lease_request), + K(now), K(with_rootserver), K(ret)); } - } - } else if (status_ptr->server_report_status_ != lease_request.server_status_) { - LOG_INFO("server report status change", - "old report status", status_ptr->server_report_status_, - "new report status", 
lease_request.server_status_); - if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { - LOG_WARN("set server status failed", K(ret), - K(lease_request), K(now), K(with_rootserver)); - } else if (OB_FAIL(process_report_status_change(lease_request, *status_ptr))) { - LOG_WARN("fail to proc report status change", K(ret)); } else { - status_ptr->server_report_status_ = lease_request.server_status_; + status_ptr->last_hb_time_ = now; } - } else if (status_ptr->resource_info_ != lease_request.resource_info_) { - LOG_INFO("server resource changed", "old_resource_info", status_ptr->resource_info_, - "new_resource_info", lease_request.resource_info_); - if (OB_FAIL(set_server_status(lease_request, now, with_rootserver, *status_ptr))) { - LOG_WARN("set server status failed", K(lease_request), - K(now), K(with_rootserver), K(ret)); - } - } else { - status_ptr->last_hb_time_ = now; - } - if (OB_SUCC(ret)) { - if (ObServerStatus::OB_HEARTBEAT_ALIVE == status_ptr->hb_status_ - && ObServerStatus::OB_HEARTBEAT_ALIVE != old_hb_status) { - to_alive = true; + if (OB_SUCC(ret)) { + if (ObServerStatus::OB_HEARTBEAT_ALIVE == status_ptr->hb_status_ + && ObServerStatus::OB_HEARTBEAT_ALIVE != old_hb_status) { + to_alive = true; + } } } } @@ -1156,40 +957,6 @@ int ObServerManager::check_server_permanent_offline(const ObAddr &server, bool & return ret; } -int ObServerManager::check_server_with_id_exist( - const ObAddr &server, - const uint64_t server_id, - bool &exist) const -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", KR(ret)); - } else if (!server.is_valid() || OB_INVALID_ID == server_id) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(server), K(server_id), KR(ret)); - } else { - SpinRLockGuard guard(server_status_rwlock_); - exist = false; - const ObServerStatus *status_ptr = NULL; - if (OB_FAIL(find(server, status_ptr))) { - if (OB_ENTRY_NOT_EXIST != ret) { - LOG_WARN("find 
failed", K(server), K(ret)); - } else { - ret = OB_SUCCESS; - exist = false; - LOG_DEBUG("treat not exist server as not alive", K(server)); - } - } else if (NULL == status_ptr) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status_ptr is null", "status_ptr", OB_P(status_ptr), K(ret)); - } else { - exist = (status_ptr->get_server_id() == server_id); - } - } - return ret; -} - int ObServerManager::check_server_alive(const ObAddr &server, bool &is_alive) const { int ret = OB_SUCCESS; @@ -1221,39 +988,6 @@ int ObServerManager::check_server_alive(const ObAddr &server, bool &is_alive) co return ret; } -int ObServerManager::check_server_takenover_by_rs( - const common::ObAddr &server, - bool &taken_over_by_rs) const -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", KR(ret)); - } else if (OB_UNLIKELY(!server.is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", KR(ret), K(server)); - } else { - SpinRLockGuard guard(server_status_rwlock_); - const ObServerStatus *status_ptr = nullptr; - int tmp_ret = find(server, status_ptr); - if (OB_SUCCESS == tmp_ret) { - if (OB_UNLIKELY(nullptr == status_ptr)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status_ptr is null", KR(ret), "status_ptr", OB_P(status_ptr), K(server)); - } else { - taken_over_by_rs = - (status_ptr->admin_status_ == ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS); - } - } else if (OB_ENTRY_NOT_EXIST == tmp_ret) { - ret = OB_ENTRY_NOT_EXIST; - } else { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get server status unexpected", KR(ret), K(server)); - } - } - return ret; -} - int ObServerManager::check_server_active(const ObAddr &server, bool &is_active) const { int ret = OB_SUCCESS; @@ -1316,188 +1050,6 @@ int ObServerManager::check_server_stopped(const common::ObAddr &server, bool &is return ret; } -int ObServerManager::check_server_valid_for_partition(const common::ObAddr &server, bool &is_valid) const -{ - int ret = OB_SUCCESS; - if 
(OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", KR(ret)); - } else if (OB_UNLIKELY(!server.is_valid())) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server",KR(ret), K(server)); - } else { - SpinRLockGuard guard(server_status_rwlock_); - is_valid = false; - bool zone_active = false; - const ObServerStatus *status_ptr = NULL; - if (OB_FAIL(find(server, status_ptr))) { - if (OB_ENTRY_NOT_EXIST != ret) { - LOG_WARN("failed to find server", KR(ret), K(server)); - } else { - ret = OB_SUCCESS; - LOG_INFO("server not exist, not valid for partition", K(server)); - } - } else if (OB_ISNULL(status_ptr)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status_ptr is null", KR(ret), "status_ptr", OB_P(status_ptr)); - } else if (OB_FAIL(zone_mgr_->check_zone_active(status_ptr->zone_, zone_active))) { - LOG_WARN("fail to check zone active", K(ret), K(server)); - } else { - is_valid = status_ptr->can_migrate_in() - && !status_ptr->is_stopped() - && zone_active; - } - } - return ret; -} - -int ObServerManager::get_servers_by_status(ObIServerArray &active_server_list, - ObIServerArray &inactive_server_list) const -{ - ObZone empty_zone; - return get_servers_by_status(empty_zone, active_server_list, inactive_server_list); -} - - -int ObServerManager::get_servers_by_status(const ObZone &zone, - ObIServerArray &active_server_list, - ObIServerArray &inactive_server_list) const -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else { - active_server_list.reuse(); - SpinRLockGuard guard(server_status_rwlock_); - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses_.count(); ++i) { - if (server_statuses_[i].zone_ == zone || zone.is_empty()) { - if (server_statuses_[i].is_alive()) { - ret = active_server_list.push_back(server_statuses_[i].server_); - if (OB_FAIL(ret)) { - LOG_WARN("push back to active_server_list failed", K(ret)); - } - } else if 
(OB_FAIL(inactive_server_list.push_back(server_statuses_[i].server_))) { - LOG_WARN("fail to push back to inactive_server_list", KR(ret)); - } - } - } - } - return ret; -} - -int ObServerManager::get_servers_takenover_by_rs( - const ObZone &zone, - ObIServerArray &server_list) const -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else { - server_list.reuse(); - SpinRLockGuard guard(server_status_rwlock_); - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses_.count(); ++i) { - if (zone.is_empty() || server_statuses_[i].zone_ == zone) { - if (ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS != server_statuses_[i].admin_status_) { - // bypass since this server not taken over by rs - } else if (OB_FAIL(server_list.push_back(server_statuses_[i].server_))) { - LOG_WARN("fail to push back to server list", K(ret)); - } - } else {} // zone not match - } - } - return ret; -} - -int ObServerManager::get_alive_servers(const ObZone &zone, - ObIServerArray &server_list) const -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else { - server_list.reuse(); - SpinRLockGuard guard(server_status_rwlock_); - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses_.count(); ++i) { - if ((server_statuses_[i].zone_ == zone || zone.is_empty()) - && server_statuses_[i].is_alive()) { - ret = server_list.push_back(server_statuses_[i].server_); - if (OB_FAIL(ret)) { - LOG_WARN("push back to server_list failed", K(ret)); - } - } - } - } - return ret; -} - -int ObServerManager::get_alive_server_count(const ObZone &zone, int64_t &count) const -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else { - count = 0; - SpinRLockGuard guard(server_status_rwlock_); - for (int64_t i = 0; i < server_statuses_.count(); ++i) { - if ((server_statuses_[i].zone_ == zone || zone.is_empty()) 
- && server_statuses_[i].is_alive()) { - ++count; - } - } - } - return ret; -} - -int ObServerManager::get_active_server_array( - const common::ObZone &zone, - ObIServerArray &active_server_array) const -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not inited", K(ret)); - } else { - active_server_array.reset(); - SpinRLockGuard guard(server_status_rwlock_); - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses_.count(); ++i) { - if (!zone.is_empty() && zone != server_statuses_.at(i).zone_) { - // zone not match, bypass - } else if (server_statuses_.at(i).is_active()) { - if (OB_FAIL(active_server_array.push_back(server_statuses_.at(i).server_))) { - LOG_WARN("fail to push back", K(ret)); - } - } - } - } - return ret; -} - -// We want count in active servers defined by struct ObServerStatus, -// and to avoid server is temporary offline when counting server’s number, -// we also count in the server which is temporary offline. 
-int ObServerManager::get_zone_max_unit_num(const ObZone &zone, int64_t &count) const -{ - int ret = OB_SUCCESS; - if (OB_UNLIKELY(!inited_)) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", KR(ret)); - } else { - count = 0; - SpinRLockGuard guard(server_status_rwlock_); - for (int64_t i = 0; i < server_statuses_.count(); ++i) { - if ((server_statuses_[i].zone_ == zone || zone.is_empty()) - && (server_statuses_[i].is_active() || server_statuses_[i].is_temporary_offline())) { - ++count; - } - } - } - return ret; -} - int ObServerManager::get_server_count(const ObZone &zone, int64_t &alive_count, int64_t ¬_alive_count) const { int ret = OB_SUCCESS; @@ -1797,6 +1349,23 @@ int ObServerManager::get_server_status(const ObAddr &server, return ret; } +int ObServerManager::get_server_resource_info( + const common::ObAddr &server, + share::ObServerResourceInfo &resource_info) +{ + int ret = OB_SUCCESS; + ObServerStatus server_status; + if (!inited_) { + ret = OB_NOT_INIT; + LOG_WARN("not init", K(ret)); + } else if (OB_FAIL(get_server_status(server, server_status))) { + LOG_WARN("fail to get server status", KR(ret), K(server)); + } else { + resource_info = server_status.resource_info_; + } + return ret; +} + int ObServerManager::update_server_status(const ObServerStatus &server_status) { int ret = OB_SUCCESS; @@ -1917,18 +1486,6 @@ bool ObServerManager::has_build() const return has_build_; } -int ObServerManager::get_lease_duration(int64_t &lease_time) const -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else { - lease_time = config_->lease_time; - } - return ret; -} - int ObServerManager::get_server_zone(const ObAddr &addr, ObZone &zone) const { int ret = OB_SUCCESS; @@ -1949,29 +1506,6 @@ int ObServerManager::get_server_zone(const ObAddr &addr, ObZone &zone) const return ret; } -int ObServerManager::get_all_server_list( - common::ObIArray &server_list) -{ - int ret = OB_SUCCESS; - if (!inited_) { - 
ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else { - server_list.reset(); - SpinRLockGuard guard(server_status_rwlock_); - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses_.count(); ++i) { - ObAddr &addr = server_statuses_[i].server_; - if (!addr.is_valid()) { - ret = OB_INVALID_SERVER_STATUS; - LOG_WARN("invalid addr", K(ret), K(addr)); - } else if (OB_FAIL(server_list.push_back(addr))) { - LOG_WARN("fail to push back addr", K(ret), K(addr)); - } - } - } - return ret; -} - int ObServerManager::get_server_statuses(const ObZone &zone, ObServerStatusIArray &server_statuses, bool include_permanent_offline) const @@ -1999,6 +1533,37 @@ int ObServerManager::get_server_statuses(const ObZone &zone, return ret; } +int ObServerManager::build_server_resource_info_result( + const common::ObZone &zone, + ObIArray &active_servers_resource_info) +{ + // empty zone means that get all + int ret = OB_SUCCESS; + active_servers_resource_info.reset(); + obrpc::ObGetServerResourceInfoResult result; + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("server manager has not inited", KR(ret), K(inited_)); + } else { + SpinRLockGuard guard(server_status_rwlock_); + for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses_.count(); ++i) { + const ObServerStatus &status = server_statuses_.at(i); + result.reset(); + if (OB_UNLIKELY(!status.is_valid())) { + ret = OB_INVALID_SERVER_STATUS; + LOG_WARN("server status is not valid", KR(ret), K(status)); + } else if (status.is_active() && (status.zone_ == zone || zone.is_empty())) { + if (OB_FAIL(result.init(status.server_, status.resource_info_))) { + LOG_WARN("fail to init", KR(ret), K(status.server_), K(status.resource_info_)); + } else if (OB_FAIL(active_servers_resource_info.push_back(result))) { + LOG_WARN("push back to active_servers_resource_info failed", KR(ret), K(result)); + } + } + } + } + return ret; +} + int ObServerManager::get_server_statuses(const ObServerArray &servers, 
ObServerStatusArray &server_statuses) const { @@ -2102,55 +1667,6 @@ void ObServerManager::reset() server_statuses_.reset(); } -int ObServerManager::have_server_stopped(const common::ObZone &zone, bool &stopped) const -{ - int ret = OB_SUCCESS; - stopped = false; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (zone.is_empty()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("zone is empty", K(zone), K(ret)); - } else { - for (int64_t i = 0; i < server_statuses_.count(); ++i) { - if (server_statuses_[i].is_stopped() && - server_statuses_[i].zone_ == zone) - { - stopped = true; - LOG_DEBUG("have server in stopped status", K(zone), "other_server", server_statuses_[i].server_, "other_zone", server_statuses_[i].zone_); - break; - } - } - } - return ret; -} - -int ObServerManager::check_other_zone_stopped(const common::ObZone &zone, - bool &stopped) -{ - int ret = OB_SUCCESS; - stopped = false; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (zone.is_empty()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("zone is empty", K(zone), K(ret)); - } else { - for (int64_t i = 0; i < server_statuses_.count(); ++i) { - if (server_statuses_[i].is_stopped() && - server_statuses_[i].zone_ != zone) - { - stopped = true; - LOG_WARN("have other server in stopped status", K(zone), "other_server", server_statuses_[i].server_, "other_zone", server_statuses_[i].zone_); - break; - } - } - } - return ret; -} - int64_t ObServerManager::to_string(char *buf, const int64_t buf_len) const { int64_t pos = 0; @@ -2236,27 +1752,6 @@ int ObServerManager::fetch_new_server_id(uint64_t &server_id) return ret; } -int ObServerManager::check_server_id_used(const uint64_t server_id, bool &server_id_used) -{ - int ret = OB_SUCCESS; - server_id_used = false; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (OB_INVALID_ID == server_id) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server_id", K(server_id), 
K(ret)); - } else { - for (int64_t i = 0; i < server_statuses_.count(); ++i) { - if (server_statuses_[i].id_ == server_id) { - server_id_used = true; - break; - } - } - } - return ret; -} - int ObServerManager::get_server_id(const ObZone &zone, const ObAddr &server, uint64_t &server_id) const { int ret = OB_SUCCESS; @@ -2283,148 +1778,6 @@ int ObServerManager::get_server_id(const ObZone &zone, const ObAddr &server, uin return ret; } -int ObServerManager::update_merged_version(const ObAddr &addr, - int64_t frozen_version, bool &zone_merged) -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not inited", K(ret)); - } else if (!addr.is_valid() || frozen_version <= 0) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(addr), K(frozen_version), K(ret)); - } else { - SpinWLockGuard guard(server_status_rwlock_); - ObServerStatus *status = NULL; - if (OB_FAIL(find(addr, status))) { - LOG_WARN("find failed", K(addr), K(ret)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status is null", "status ptr", OB_P(status), K(ret)); - } else if (status->merged_version_ > frozen_version) { - LOG_WARN("receive obs report merge finish, version is invallid", K(frozen_version), "server version", status->merged_version_); - } else { - status->merged_version_ = frozen_version; - zone_merged = true; - FOREACH_X(s, server_statuses_, zone_merged) { - if (s->zone_ == status->zone_ - && s->is_alive() - && s->merged_version_ < frozen_version) { - zone_merged = false; - } - } - } - } - return ret; -} - -int ObServerManager::get_merged_version(const common::ObAddr &addr, - int64_t &merged_version) const -{ - int ret = OB_SUCCESS; - merged_version = 0; - SpinRLockGuard guard(server_status_rwlock_); - const ObServerStatus *status = NULL; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (!addr.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid addr", K(addr), K(ret)); - } else 
if (OB_FAIL(find(addr, status))) { - LOG_WARN("find failed", K(addr), K(ret)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("status is null", "status ptr", OB_P(status), K(ret)); - } else { - merged_version = status->merged_version_; - } - return ret; -} - -int ObServerManager::block_migrate_in(const ObAddr &addr) -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else if (!addr.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid addr", K(addr), K(ret)); - } else { - const bool blocked = true; - if (OB_FAIL(set_migrate_in_blocked(addr, blocked))) { - LOG_WARN("set_migrate_in_blocked failed", K(addr), K(blocked), K(ret)); - } else { - ROOTSERVICE_EVENT_ADD("server", "block_migrate_in", - "server", addr); - } - } - return ret; -} - -int ObServerManager::unblock_migrate_in(const ObAddr &addr) -{ - int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else if (!addr.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid addr", K(addr), K(ret)); - } else { - const bool blocked = false; - if (OB_FAIL(set_migrate_in_blocked(addr, blocked))) { - LOG_WARN("set_migrate_in_blocked failed", K(addr), K(blocked), K(ret)); - } else { - ROOTSERVICE_EVENT_ADD("server", "unblock_migrate_in", - "server", addr); - } - } - return ret; -} - -int ObServerManager::set_migrate_in_blocked(const common::ObAddr &addr, const bool blocked) -{ - int ret = OB_SUCCESS; - SpinWLockGuard guard(server_status_rwlock_); - ObServerStatus *status_ptr = NULL; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager has not inited", K(ret)); - } else if (!addr.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid addr", K(addr), K(ret)); - } else if (OB_FAIL(find(addr, status_ptr))) { - LOG_WARN("find failed", K(addr), K(ret)); - } else if (NULL == status_ptr) { - ret = OB_ERR_UNEXPECTED; - LOG_INFO("status_ptr 
is null", "status_ptr", OB_P(status_ptr), K(ret)); - } else { - if (!blocked) { - status_ptr->unblock_migrate_in(); - } else { - status_ptr->block_migrate_in(); - } - // ignore wakeup balancer and wakeup daily merger failed - int temp_ret = OB_SUCCESS; - if (OB_SUCCESS != (temp_ret = status_change_callback_->wakeup_balancer())) { - LOG_WARN("wakeup_balancer failed", K(temp_ret)); - } else if (OB_SUCCESS != (temp_ret = status_change_callback_->wakeup_daily_merger())) { - LOG_WARN("wakeup_daily_merger failed", K(temp_ret)); - } - ret = status_change_callback_->on_server_status_change(addr); - if (OB_FAIL(ret)) { - LOG_WARN("commit block migrate in status change task failed", K(addr), - K(blocked), "block_migrate_in_time", status_ptr->block_migrate_in_time_, K(ret)); - } else { - LOG_INFO("commit block migrate in status change task succeed", K(addr), - K(blocked), "block_migrate_in_time", status_ptr->block_migrate_in_time_, K(ret)); - } - } - return ret; -} - int ObServerManager::check_migrate_in_blocked(const common::ObAddr &addr, bool &blocked) const { int ret = OB_SUCCESS; @@ -2481,265 +1834,6 @@ int ObServerManager::check_in_service(const common::ObAddr &addr, bool &in_servi return ret; } -int ObServerManager::set_with_partition(const common::ObAddr &server) -{ - int ret = OB_SUCCESS; - bool need_update = false; - ObServerStatus *status = NULL; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not inited", K(ret)); - } else if (!server.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(ret), K(server)); - } else { - SpinRLockGuard guard(server_status_rwlock_); - if (OB_FAIL(find(server, status))) { - LOG_WARN("find server failed", K(ret), K(server)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("NULL server status", K(ret)); - } else if (!status->is_alive()) { - ret = OB_SERVER_NOT_ALIVE; - LOG_WARN("server not alive", K(ret)); - } else { - need_update = !status->with_partition_; - } - status = NULL; - 
} - - if (OB_SUCC(ret) && need_update) { - need_update = false; - SpinWLockGuard guard(maintaince_lock_); - { - SpinRLockGuard guard(server_status_rwlock_); - // check status again - if (OB_FAIL(find(server, status))) { - LOG_WARN("find server failed", K(ret), K(server)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("NULL server status", K(ret)); - } else if (!status->is_alive()) { - ret = OB_SERVER_NOT_ALIVE; - LOG_WARN("server not alive", K(ret)); - } else if (!status->with_partition_) { - status = NULL; - need_update = true; - } - } - if (OB_FAIL(ret)) { - } else if (need_update) { - bool with_partition = true; - if (OB_FAIL(st_operator_.update_with_partition(server, with_partition))) { - LOG_WARN("update with partition failed", K(ret), K(server), K(with_partition)); - } else { - ROOTSERVICE_EVENT_ADD("server", "set_with_partition", K(server)); - LOG_INFO("set with partition", K(server)); - } - - if (OB_SUCC(ret)) { - SpinWLockGuard guard(server_status_rwlock_); - if (OB_FAIL(find(server, status))) { - LOG_WARN("find server failed", K(ret), K(server)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("NULL server status", K(ret)); - } else { - // It's safe to update with partition flag here because we hold %maintaince_lock_, - // flag will not be modified after previous check. 
- status->with_partition_ = true; - } - } - } - } - return ret; -} - -int ObServerManager::clear_with_partiton(const common::ObAddr &server, - const int64_t last_hb_time) -{ - int ret = OB_SUCCESS; - ObServerStatus *status = NULL; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not inited", K(ret)); - } else if (!server.is_valid() || last_hb_time < 0) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(ret), K(server), K(last_hb_time)); - } else { - SpinWLockGuard guard(maintaince_lock_); - bool need_update = false; - { - SpinRLockGuard guard(server_status_rwlock_); - if (OB_FAIL(find(server, status))) { - LOG_WARN("find server failed", K(ret), K(server)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("NULL server status", K(ret)); - } else if (!status->with_partition_) { - // with partition flag not set, do nothing - } else if (last_hb_time != status->last_hb_time_) { - // last heartbeat time mismatch, do nothing, return success. - LOG_WARN("last hb time mismatch, do not clear with partition flag", - K(server), K(last_hb_time), "status", *status); - } else { - need_update = true; - status = NULL; - } - } - if (OB_FAIL(ret)) { - } else if (!need_update) { - } else { - bool with_partition = false; - if (OB_FAIL(st_operator_.update_with_partition(server, with_partition))) { - LOG_WARN("update with partition failed", K(ret), K(server), K(with_partition)); - } else { - ROOTSERVICE_EVENT_ADD("server", "clear_with_partition", K(server)); - LOG_INFO("clear with partition", K(server)); - } - - if (OB_SUCC(ret)) { - SpinWLockGuard guard(server_status_rwlock_); - if (OB_FAIL(find(server, status))) { - LOG_WARN("find server failed", K(ret), K(server)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("NULL server status", K(ret)); - } else { - // It's safe to update with partition flag here because we hold %maintaince_lock_, - // flag will not be modified after previous check. 
- status->with_partition_ = false; - } - } - } - } - return ret; -} - -int ObServerManager::set_force_stop_hb(const ObAddr &server, - const bool &force_stop_hb) -{ - int ret = OB_SUCCESS; - ObServerStatus *status = NULL; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not inited", K(ret)); - } else if (!server.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(ret)); - } else { - // force_stop_hb_ is only in memory - SpinWLockGuard guard(server_status_rwlock_); - if (OB_FAIL(find(server, status))) { - LOG_WARN("find server failed", K(ret), K(server)); - } else if (NULL == status) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server status is NULL", K(ret)); - } else if (ObServerStatus::OB_SERVER_ADMIN_DELETING != status->admin_status_ - && ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS != status->admin_status_) { - ret = OB_SERVER_NOT_DELETING; - LOG_WARN("server not in deleting status, cannot set force stop hb", K(ret), K(status)); - } else { - status->force_stop_hb_ = force_stop_hb; - LOG_INFO("success to set force stop hb!", K(force_stop_hb)); - } - } - return ret; -} - -int ObServerManager::get_min_server_version(char min_server_version[OB_SERVER_VERSION_LENGTH]) -{ - int ret = OB_SUCCESS; - ObZone zone; // empty zone, get all server statuses - ObArray server_statuses; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("server manager not inited", K(ret)); - } else { - // no need to add lock - // check all servers' build versions are identical - if (OB_FAIL(get_server_statuses(zone, server_statuses))) { - LOG_WARN("get all server statuses failed", K(ret)); - } else if (OB_UNLIKELY(true == server_statuses.empty())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_statuses is empty", K(ret)); - } else { - ObClusterVersion version_parser; - uint64_t cur_min_version = UINT64_MAX; - FOREACH_CNT_X(status, server_statuses, OB_SUCC(ret)) { - char *saveptr = NULL; - char *version = STRTOK_R(status->build_version_, "_", &saveptr); - if 
(NULL == version || strlen(version) + 1 > OB_SERVER_VERSION_LENGTH) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid build version format", "build_version", status->build_version_); - } else if (OB_FAIL(version_parser.refresh_cluster_version(version))) { - LOG_WARN("failed to parse version", "version", version); - } else { - if (version_parser.get_cluster_version() < cur_min_version) { - size_t len = strlen(version); - MEMCPY(min_server_version, version, len); - min_server_version[len] = '\0'; - cur_min_version = version_parser.get_cluster_version(); - } - } - } - if (OB_SUCC(ret) && UINT64_MAX == cur_min_version) { - ret = OB_ENTRY_NOT_EXIST; - LOG_WARN("no valid server version found", K(ret)); - } - } - } - - return ret; -} - -bool ObServerManager::have_server_deleting() const -{ - bool bret = false; - int tmp_ret = OB_SUCCESS; - ObZone zone; - ObArray server_statuses; - if (!inited_) { - tmp_ret = OB_NOT_INIT; - LOG_WARN_RET(tmp_ret, "server manager not inited", K(tmp_ret)); - } else if (OB_SUCCESS != (tmp_ret = get_server_statuses(zone, server_statuses))) { - LOG_WARN_RET(tmp_ret, "fail to get server status", K(zone), K(tmp_ret)); - } else { - FOREACH_CNT_X(status, server_statuses, OB_SUCCESS == tmp_ret) { - if (OB_ISNULL(status)) { - tmp_ret = OB_ERR_UNEXPECTED; - LOG_WARN_RET(tmp_ret, "get invalid status", K(tmp_ret)); - } else if (ObServerStatus::OB_SERVER_ADMIN_DELETING == status->admin_status_) { - bret = true; - break; - } - } - } - return bret; -} -int ObServerManager::check_all_server_active(bool &all_active) const -{ - int ret = OB_SUCCESS; - all_active = true; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not inited", K(ret)); - } else { - SpinRLockGuard guard(server_status_rwlock_); - FOREACH_CNT_X(status, server_statuses_, all_active && OB_SUCC(ret)) { - if (OB_ISNULL(status)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("get invalid status", K(ret)); - } else if (status->in_service() && status->is_active()) { - } else { - all_active = false; - 
LOG_WARN("server status not valid", "server status", *status); - } - } - } - return ret; -} - ////////////////////////////////////// ////////////////////////////////////// ObHeartbeatChecker::ObHeartbeatChecker() @@ -2782,22 +1876,28 @@ void ObHeartbeatChecker::run3() int64_t last_renew_rs_time = 0; int64_t RENEW_INTERVAL = 3 * 1000 * 1000; //3s while (!stop_) { - update_last_run_timestamp(); - LOG_TRACE("begin check all server heartbeat"); - ret = server_manager_->check_servers(); - if (OB_FAIL(ret)) { - LOG_WARN("server managers check servers failed", K(ret)); - } - //ignore ret - now = ObTimeUtility::current_time(); - if (now - last_renew_rs_time > RENEW_INTERVAL) { - last_renew_rs_time = now; - if (OB_FAIL(server_manager_->try_renew_rs_list())) { - LOG_WARN("fail to try renew rs list", KR(ret)); + if (ObHeartbeatService::is_service_enabled()) { // the new logic + reset_last_run_timestamp(); + stop_ = true; + LOG_INFO("no need to run thread ObHeartbeatChecker in version >= 4.2"); + } else { + update_last_run_timestamp(); + LOG_TRACE("begin check all server heartbeat"); + ret = server_manager_->check_servers(); + if (OB_FAIL(ret)) { + LOG_WARN("server managers check servers failed", K(ret)); } + // //ignore ret + now = ObTimeUtility::current_time(); + if (now - last_renew_rs_time > RENEW_INTERVAL) { + last_renew_rs_time = now; + if (OB_FAIL(server_manager_->try_renew_rs_list())) { + LOG_WARN("fail to try renew rs list", KR(ret)); + } + } + DEBUG_SYNC(HUNG_HEARTBEAT_CHECK); + ob_usleep(CHECK_INTERVAL_US); } - DEBUG_SYNC(HUNG_HEARTBEAT_CHECK); - ob_usleep(CHECK_INTERVAL_US); } LOG_INFO("heartbeat checker stop"); } diff --git a/src/rootserver/ob_server_manager.h b/src/rootserver/ob_server_manager.h index 7dfa549a6..136837f28 100644 --- a/src/rootserver/ob_server_manager.h +++ b/src/rootserver/ob_server_manager.h @@ -82,9 +82,6 @@ public: const bool commit = true); virtual int start_server_list(const obrpc::ObServerList &server_list, const common::ObZone &zone); 
virtual int stop_server_list(const obrpc::ObServerList &server_list, const common::ObZone &zone); - // only add to memory, only used by bootstrap - int add_server_list(const obrpc::ObServerInfoList &server_list, - uint64_t &server_id); // server_id is OB_INVALID_ID before build server manager from __all_server int receive_hb(const share::ObLeaseRequest &lease_request, @@ -94,10 +91,6 @@ public: const common::ObAddr &server, uint64_t &server_id) const; - int expend_server_lease( - const common::ObAddr &server, - const int64_t new_lease_end); - // if server not exist or server's status is not serving, return false // otherwise, return true virtual int check_server_alive(const common::ObAddr &server, bool &is_alive) const; @@ -106,27 +99,6 @@ public: virtual int check_server_stopped(const common::ObAddr &server, bool &is_stopped) const; virtual int check_server_permanent_offline(const common::ObAddr &server, bool &is_offline) const; virtual int check_migrate_in_blocked(const common::ObAddr &addr, bool &blocked) const; - virtual int check_server_takenover_by_rs(const common::ObAddr &addr, bool &taken_over_by_rs) const; - virtual int check_server_valid_for_partition(const common::ObAddr &server, bool &is_valid) const; - virtual int check_server_with_id_exist( - const common::ObAddr &server, - const uint64_t server_id, - bool &exist) const; - - virtual int get_alive_servers(const common::ObZone &zone, ObIServerArray &server_list) const; - virtual int get_servers_by_status(ObIServerArray &active_server_list, - ObIServerArray &inactive_server_list) const; - virtual int get_servers_by_status(const common::ObZone &zone, - ObIServerArray &active_server_list, - ObIServerArray &inactive_server_list) const; - virtual int get_alive_server_count(const common::ObZone &zone, int64_t &count) const; - virtual int get_zone_max_unit_num(const common::ObZone &zone, int64_t &count) const; - virtual int get_active_server_array(const common::ObZone &zone, ObIServerArray &server_list) const; 
- virtual int get_servers_takenover_by_rs( - const common::ObZone &zone, - ObIServerArray &server_list) const; - virtual int finish_server_recovery(const common::ObAddr &server); - void clear_in_recovery_server_takenover_by_rs(const common::ObAddr &server); virtual int get_servers_of_zone( const common::ObZone &zone, ObServerArray &server_list) const; @@ -143,16 +115,20 @@ public: // get ObServerStatus through server addr, return OB_ENTRY_NOT_EXIST if not exist virtual int get_server_status(const common::ObAddr &server, share::ObServerStatus &server_status) const; + int get_server_resource_info( + const common::ObAddr &server, + share::ObServerResourceInfo &resource_info); int update_server_status(const share::ObServerStatus &server_status); // build ObServerManager from __all_server table int load_server_manager(); int load_server_statuses(const ObServerStatusArray &server_status); virtual bool has_build() const; - virtual int get_all_server_list(common::ObIArray &server_list); - // get server infos of zone, if zone is empty, get all server_infos virtual int get_server_statuses(const common::ObZone &zone, ObServerStatusIArray &server_statuses, bool include_permanent_offline = true) const; + virtual int build_server_resource_info_result( + const common::ObZone &zone, + ObIArray &active_servers_resource_info); virtual int get_server_statuses(const ObServerArray &servers, ObServerStatusArray &server_statuses) const; int get_persist_server_statuses(ObServerStatusArray &server_statuses); @@ -160,36 +136,18 @@ public: const ObAddr &server, ObDRTaskMgr &disaster_recovery_task_mgr, const bool with_rootserver); - int get_lease_duration(int64_t &lease_duration_time) const; virtual int get_server_zone(const common::ObAddr &addr, common::ObZone &zone) const; inline ObIStatusChangeCallback &get_status_change_callback() const; inline const common::ObAddr &get_rs_addr() const { return rs_addr_; } void reset(); - // set %zone_merged to true if servers in the same zone of %addr 
merged to %frozen_version - virtual int update_merged_version( - const common::ObAddr &addr, int64_t frozen_version, bool &zone_merged); - int get_merged_version(const common::ObAddr &addr, int64_t &merged_version) const; - - int block_migrate_in(const common::ObAddr &addr); - int unblock_migrate_in(const common::ObAddr &addr); - int64_t to_string(char *buf, const int64_t buf_len) const; - - virtual int set_with_partition(const common::ObAddr &server); - virtual int clear_with_partiton(const common::ObAddr &server, const int64_t last_hb_time); - virtual int set_force_stop_hb(const common::ObAddr &server, const bool &force_stop_hb); virtual int is_server_stopped(const common::ObAddr &server, bool &is_stopped) const; - virtual int get_server_leader_cnt(const common::ObAddr &server, int64_t &leader_cnt) const; - int check_other_zone_stopped(const common::ObZone &zone, bool &stopped); - int have_server_stopped(const common::ObZone &zone, bool &is_stopped) const; - int get_min_server_version(char min_server_version[OB_SERVER_VERSION_LENGTH]); - bool have_server_deleting() const; - int check_all_server_active(bool &all_active) const; - int try_modify_recovery_server_takenover_by_rs( - const common::ObAddr &server, - const common::ObZone &zone); int get_server_id(const ObZone &zone, const common::ObAddr &server, uint64_t &server_id) const; + static int try_delete_server_working_dir( + const common::ObZone &zone, + const common::ObAddr &server, + const int64_t svr_seq); protected: int construct_not_empty_server_set( common::hash::ObHashSet ¬_empty_server_set); @@ -201,21 +159,13 @@ protected: const bool with_rootserver, share::ObServerStatus &server_status); int reset_existing_rootserver(); - int try_delete_server_working_dir( - const common::ObZone &zone, - const common::ObAddr &server, - const int64_t svr_seq); - int update_admin_status(const common::ObAddr &server, const share::ObServerStatus::ServerAdminStatus status, const bool remove); - int 
set_migrate_in_blocked(const common::ObAddr &addr, const bool block); - int find(const common::ObAddr &server, const share::ObServerStatus *&status) const; int find(const common::ObAddr &server, share::ObServerStatus *&status); int fetch_new_server_id(uint64_t &server_id); - int check_server_id_used(const uint64_t server_id, bool &server_id_used); int start_or_stop_server(const common::ObAddr &server, const common::ObZone &zone, const bool is_start); virtual int start_server(const common::ObAddr &server, const common::ObZone &zone); diff --git a/src/rootserver/ob_server_zone_op_service.cpp b/src/rootserver/ob_server_zone_op_service.cpp new file mode 100644 index 000000000..a7f5c97ef --- /dev/null +++ b/src/rootserver/ob_server_zone_op_service.cpp @@ -0,0 +1,759 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. 
+ */ + +#define USING_LOG_PREFIX RS + +#include "ob_server_zone_op_service.h" + +#include "share/ob_zone_table_operation.h" +#include "share/ob_service_epoch_proxy.h" +#include "share/ob_max_id_fetcher.h" +#include "lib/mysqlclient/ob_mysql_transaction.h" // ObMySQLTransaction +#include "rootserver/ob_root_service.h" // callback +#include "share/ob_all_server_tracer.h" +#include "rootserver/ob_server_manager.h" + +namespace oceanbase +{ +using namespace common; +using namespace share; +using namespace obrpc; +namespace rootserver +{ +ObServerZoneOpService::ObServerZoneOpService() + : is_inited_(false), + server_change_callback_(NULL), + rpc_proxy_(NULL), + sql_proxy_(NULL), + lst_operator_(NULL), + unit_manager_(NULL) +{ +} +ObServerZoneOpService::~ObServerZoneOpService() +{ +} +int ObServerZoneOpService::init( + ObIServerChangeCallback &server_change_callback, + ObSrvRpcProxy &rpc_proxy, + ObLSTableOperator &lst_operator, + ObUnitManager &unit_manager, + ObMySQLProxy &sql_proxy +) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(is_inited_)) { + ret = OB_INIT_TWICE; + LOG_WARN("server zone operation service has been inited already", KR(ret), K(is_inited_)); + } else if (OB_FAIL(st_operator_.init(&sql_proxy))) { + LOG_WARN("fail to init server table operator", KR(ret)); + } else { + server_change_callback_ = &server_change_callback; + rpc_proxy_ = &rpc_proxy; + sql_proxy_ = &sql_proxy; + lst_operator_ = &lst_operator; + unit_manager_ = &unit_manager; + is_inited_ = true; + } + return ret; +} +int ObServerZoneOpService::add_servers(const ObIArray &servers, const ObZone &zone, bool is_bootstrap) +{ + int ret = OB_SUCCESS; + uint64_t sys_tenant_data_version = 0; + ObCheckServerForAddingServerArg rpc_arg; + ObCheckServerForAddingServerResult rpc_result; + ObZone picked_zone; + ObTimeoutCtx ctx; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_FAIL(GET_MIN_DATA_VERSION(OB_SYS_TENANT_ID, 
sys_tenant_data_version))) { + LOG_WARN("fail to get sys tenant's min data version", KR(ret)); + } else if (OB_ISNULL(rpc_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("rpc_proxy_ is null", KR(ret), KP(rpc_proxy_)); + } else if (OB_FAIL(rootserver::ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else if (OB_FAIL(rpc_arg.init( + ObCheckServerForAddingServerArg::ADD_SERVER, + sys_tenant_data_version))) { + LOG_WARN("fail to init rpc arg", KR(ret), K(sys_tenant_data_version)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < servers.count(); ++i) { + const ObAddr &addr = servers.at(i); + int64_t timeout = ctx.get_timeout(); + if (OB_UNLIKELY(timeout <= 0)) { + ret = OB_TIMEOUT; + LOG_WARN("ctx time out", KR(ret), K(timeout)); + } else if (OB_FAIL(rpc_proxy_->to(addr) + .timeout(timeout) + .check_server_for_adding_server(rpc_arg, rpc_result))) { + LOG_WARN("fail to check whether the server is empty", KR(ret), K(addr)); + } else if (!rpc_result.get_is_server_empty()) { + ret = OB_OP_NOT_ALLOW; + LOG_WARN("adding non-empty server is not allowed", KR(ret)); + LOG_USER_ERROR(OB_OP_NOT_ALLOW, "add non-empty server"); + } else if (OB_FAIL(zone_checking_for_adding_server_(zone, rpc_result.get_zone(), picked_zone))) { + LOG_WARN("zone checking for adding server is failed", KR(ret), K(zone), K(rpc_result.get_zone())); + } else if (OB_FAIL(add_server_( + addr, + picked_zone, + rpc_result.get_sql_port(), + rpc_result.get_build_version()))) { + LOG_WARN("add_server failed", "server", addr, "zone", picked_zone, "sql_port", + rpc_result.get_sql_port(), "build_version", rpc_result.get_build_version(), KR(ret)); + } else {} + } + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh server tracer", KR(ret), KR(tmp_ret)); + } + return ret; +} +int ObServerZoneOpService::delete_servers( + const ObIArray &servers, + const ObZone &zone) +{ + int ret = OB_SUCCESS; + if 
(OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(GCTX.root_service_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("root_service_ is null", KR(ret), KP(GCTX.root_service_)); + } else if (OB_UNLIKELY(servers.count() <= 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(servers)); + } else if (OB_FAIL(check_server_have_enough_resource_for_delete_server_(servers, zone))) { + LOG_WARN("not enough resource, cannot delete servers", KR(ret), K(servers), K(zone)); + } else if (OB_FAIL(GCTX.root_service_->check_all_ls_has_leader("delete server"))) { + LOG_WARN("fail to check all ls has leader", KR(ret)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < servers.count(); ++i) { + if (OB_FAIL(delete_server_(servers.at(i), zone))) { + LOG_WARN("delete_server failed", "server", servers.at(i), "zone", zone, KR(ret)); + } + } + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh server tracer", KR(ret), KR(tmp_ret)); + } + return ret; +} +int ObServerZoneOpService::cancel_delete_servers( + const ObIArray &servers, + const ObZone &zone) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(unit_manager_) || OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unit_manager_ or sql_proxy_ or server_change_callback_ is null", KR(ret), + KP(unit_manager_), KP(sql_proxy_)); + } else { + ObServerInfoInTable server_info_in_table; + for (int64_t i = 0; OB_SUCC(ret) && i < servers.count(); ++i) { + const ObAddr &server = servers.at(i); + const int64_t now = ObTimeUtility::current_time(); + ObMySQLTransaction trans; + server_info_in_table.reset(); + if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if (OB_FAIL(check_and_end_delete_server_(trans, server, zone, true /* is_cancel */, 
server_info_in_table))) { + LOG_WARN("fail to check and end delete server", KR(ret), K(server), K(zone)); + } else if (OB_FAIL(ObServerTableOperator::update_status( + trans, + server, + ObServerStatus::OB_SERVER_DELETING, + server_info_in_table.is_alive() ? ObServerStatus::OB_SERVER_ACTIVE : ObServerStatus::OB_SERVER_INACTIVE))) { + LOG_WARN("fail to update status in __all_server table", KR(ret), + K(server), K(server_info_in_table)); + } else if (OB_FAIL(unit_manager_->cancel_migrate_out_units(server))) { + LOG_WARN("unit_manager_ cancel_migrate_out_units failed", KR(ret), K(server)); + } + (void) end_trans_and_on_server_change_(ret, trans, "cancel_delete_server", server, server_info_in_table.get_zone(), now); + } + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh server tracer", KR(ret), KR(tmp_ret)); + } + return ret; +} +int ObServerZoneOpService::finish_delete_server( + const ObAddr &server, + const ObZone &zone) +{ + int ret = OB_SUCCESS; + ObServerInfoInTable server_info_in_table; + const int64_t now = ObTimeUtility::current_time(); + ObMySQLTransaction trans; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if (OB_FAIL(check_and_end_delete_server_(trans, server, zone, false /* is_cancel */, server_info_in_table))) { + LOG_WARN("fail to check and end delete server", KR(ret), K(server), K(zone)); + } else if (OB_FAIL(ObServerManager::try_delete_server_working_dir( + server_info_in_table.get_zone(), + server, + server_info_in_table.get_server_id()))) { + LOG_WARN("fail to delete server working dir", KR(ret), K(server_info_in_table)); + } else if (OB_FAIL(st_operator_.remove(server, trans))) { + LOG_WARN("fail to 
remove this server from __all_server table", KR(ret), K(server)); + } + (void) end_trans_and_on_server_change_(ret, trans, "finish_delete_server", server, server_info_in_table.get_zone(), now); + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh server tracer", KR(ret), KR(tmp_ret)); + } + return ret; +} +int ObServerZoneOpService::stop_servers( + const ObIArray &servers, + const ObZone &zone, + const obrpc::ObAdminServerArg::AdminServerOp &op) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_FAIL(stop_server_precheck(servers, op))) { + LOG_WARN("fail to precheck stop server", KR(ret), K(servers), K(zone)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < servers.count(); i++) { + const ObAddr &server = servers.at(i); + if (OB_FAIL(start_or_stop_server_(server, zone, op))) { + LOG_WARN("fail to stop server", KR(ret), K(server), K(zone)); + } + } + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh server tracer", KR(ret), KR(tmp_ret)); + } + return ret; +} +int ObServerZoneOpService::start_servers( + const ObIArray &servers, + const ObZone &zone) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(servers.count() <= 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("servers' count is zero", KR(ret), K(servers)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < servers.count(); ++i) { + const ObAddr &server = servers.at(i); + if (OB_FAIL(start_or_stop_server_(server, zone, ObAdminServerArg::START))) { + LOG_WARN("fail to start server", KR(ret), K(server), K(zone)); + } + } + } + int tmp_ret = OB_SUCCESS; + if (OB_TMP_FAIL(SVR_TRACER.refresh())) { + LOG_WARN("fail to refresh server tracer", KR(ret), KR(tmp_ret)); + } + return ret; +} +int 
ObServerZoneOpService::stop_server_precheck( + const ObIArray &servers, + const obrpc::ObAdminServerArg::AdminServerOp &op) +{ + int ret = OB_SUCCESS; + ObZone zone; + bool is_same_zone = false; + bool is_all_stopped = false; + ObArray all_servers_info_in_table; + ObServerInfoInTable server_info; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(servers.count() <= 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("servers' count is zero", KR(ret), K(servers)); + } else if (OB_ISNULL(GCTX.root_service_) || OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.root_service_ or sql_proxy_ is null", KR(ret), KP(GCTX.root_service_), KP(sql_proxy_)); + } else if (OB_FAIL(ObServerTableOperator::get(*sql_proxy_, all_servers_info_in_table))) { + LOG_WARN("fail to read __all_server table", KR(ret), KP(sql_proxy_)); + } else if (OB_FAIL(check_zone_and_server_( + all_servers_info_in_table, + servers, + is_same_zone, + is_all_stopped))) { + LOG_WARN("fail to check zone and server", KR(ret), K(all_servers_info_in_table), K(servers)); + } else if (is_all_stopped) { + //nothing todo + } else if (!is_same_zone) { + ret = OB_STOP_SERVER_IN_MULTIPLE_ZONES; + LOG_WARN("can not stop servers in multiple zones", KR(ret), K(server_info), K(servers)); + } else if (OB_FAIL((ObRootUtils::find_server_info(all_servers_info_in_table, servers.at(0), server_info)))) { + LOG_WARN("fail to find server info", KR(ret), K(all_servers_info_in_table), K(servers.at(0))); + } else { + const ObZone &zone = server_info.get_zone(); + if (ObAdminServerArg::ISOLATE == op) { + //"Isolate server" does not need to check the total number and status of replicas; it cannot be restarted later; + if (OB_FAIL(GCTX.root_service_->check_can_stop(zone, servers, false /*is_stop_zone*/))) { + LOG_WARN("fail to check can stop", KR(ret), K(zone), K(servers), K(op)); + if (OB_OP_NOT_ALLOW == ret) { + LOG_USER_ERROR(OB_OP_NOT_ALLOW, "Stop 
all servers in primary region is"); + } + } + } else { + if (ObRootUtils::have_other_stop_task(zone)) { + ret = OB_STOP_SERVER_IN_MULTIPLE_ZONES; + LOG_WARN("can not stop servers in multiple zones", KR(ret), K(zone), K(servers), K(op)); + LOG_USER_ERROR(OB_STOP_SERVER_IN_MULTIPLE_ZONES, + "cannot stop server or stop zone in multiple zones"); + } else if (OB_FAIL(GCTX.root_service_->check_majority_and_log_in_sync( + servers, + ObAdminServerArg::FORCE_STOP == op,/*skip_log_sync_check*/ + "stop server"))) { + LOG_WARN("fail to check majority and log in-sync", KR(ret), K(zone), K(servers), K(op)); + } + } + } + return ret; +} +int ObServerZoneOpService::zone_checking_for_adding_server_( + const ObZone &command_zone, + const ObZone &rpc_zone, + ObZone &picked_zone) +{ + int ret = OB_SUCCESS; + // command_zone: the zone specified in the system command ADD SERVER + // rpc_zone: the zone specified in the server's local config and send to rs via rpc + // picked_zone: the zone we will use in add_server + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(rpc_zone.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("rpc_zone cannot be empty. 
It implies that server's local config zone is empty.", + KR(ret), K(rpc_zone)); + } else if (!command_zone.is_empty() && command_zone != rpc_zone) { + ret = OB_SERVER_ZONE_NOT_MATCH; + LOG_WARN("the zone specified in the server's local config is not the same as" + " the zone specified in the command", KR(ret), K(command_zone), K(rpc_zone)); + } else if (OB_FAIL(picked_zone.assign(rpc_zone))) { + LOG_WARN("fail to assign picked_zone", KR(ret), K(rpc_zone)); + } else {} + return ret; +} +int ObServerZoneOpService::add_server_( + const ObAddr &server, + const ObZone &zone, + const int64_t sql_port, + const ObServerInfoInTable::ObBuildVersion &build_version) +{ + int ret = OB_SUCCESS; + bool is_active = false; + uint64_t server_id = OB_INVALID_ID; + const int64_t now = ObTimeUtility::current_time(); + ObServerInfoInTable server_info_in_table; + ObMySQLTransaction trans; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(!server.is_valid() + || zone.is_empty() + || sql_port <= 0 + || build_version.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(zone), + K(sql_port), K(build_version)); + } else if (OB_ISNULL(sql_proxy_) || OB_ISNULL(server_change_callback_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("sql_proxy_ or server_change_callback_ is null", KR(ret), + KP(sql_proxy_), KP(server_change_callback_)); + } else if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if (OB_FAIL(check_and_update_service_epoch_(trans))) { + LOG_WARN("fail to check and update service epoch", KR(ret)); + } else if (OB_FAIL(ObZoneTableOperation::check_zone_active(trans, zone, is_active))){ + // we do not need to lock the zone info in __all_zone table + // all server/zone operations are mutually exclusive since we locked the service epoch + LOG_WARN("fail to check whether the zone is active", KR(ret), K(zone)); + } 
else if (OB_UNLIKELY(!is_active)) { + ret = OB_ZONE_NOT_ACTIVE; + LOG_WARN("the zone is not active", KR(ret), K(zone), K(is_active)); + } else if (OB_FAIL(ObServerTableOperator::get(trans, server, server_info_in_table))) { + if (OB_SERVER_NOT_IN_WHITE_LIST == ret) { + ret = OB_SUCCESS; + } else { + LOG_WARN("fail to get server_info in table", KR(ret), K(server)); + } + } else { + ret = OB_ENTRY_EXIST; + LOG_WARN("server exists", KR(ret), K(server_info_in_table)); + } + if (FAILEDx(fetch_new_server_id_(server_id))) { + // fetch a new server id and insert the server into __all_server table + LOG_WARN("fail to fetch new server id", KR(ret)); + } else if (OB_UNLIKELY(OB_INVALID_ID == server_id)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("server id is invalid", KR(ret), K(server_id)); + } else if (OB_FAIL(server_info_in_table.init( + server, + server_id, + zone, + sql_port, + false, /* with_rootserver */ + ObServerStatus::OB_SERVER_ACTIVE, + build_version, + 0, /* stop_time */ + 0, /* start_service_time */ + 0 /* last_offline_time */))) { + LOG_WARN("fail to init server info in table", KR(ret), K(server), K(server_id), K(zone), + K(sql_port), K(build_version), K(now)); + } else if (OB_FAIL(ObServerTableOperator::insert(trans, server_info_in_table))) { + LOG_WARN("fail to insert server info into __all_server table", KR(ret), K(server_info_in_table)); + } + (void) end_trans_and_on_server_change_(ret, trans, "add_server", server, zone, now); + return ret; +} +int ObServerZoneOpService::delete_server_( + const common::ObAddr &server, + const ObZone &zone) +{ + int ret = OB_SUCCESS; + ObServerInfoInTable server_info_in_table; + const int64_t now = ObTimeUtility::current_time(); + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + ObMySQLTransaction trans; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(!server.is_valid() || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + 
LOG_WARN("invalid argument", KR(ret), K(server)); + } else if (OB_ISNULL(sql_proxy_) || OB_ISNULL(server_change_callback_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("sql_proxy_ or server_change_callback_ is null", KR(ret), + KP(sql_proxy_), KP(server_change_callback_)); + } else if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if (OB_FAIL(check_and_update_service_epoch_(trans))) { + LOG_WARN("fail to check and update service epoch", KR(ret)); + } else if (OB_FAIL(ObServerTableOperator::get(trans, server, server_info_in_table))) { + LOG_WARN("fail to get server_info in table", KR(ret), K(server)); + } else if (!zone.is_empty() && zone != server_info_in_table.get_zone()) { + ret = OB_SERVER_ZONE_NOT_MATCH; + LOG_WARN("zone not matches", KR(ret), K(server), K(zone), K(server_info_in_table)); + } else if (OB_UNLIKELY(server_info_in_table.is_deleting())) { + ret = OB_SERVER_ALREADY_DELETED; + LOG_WARN("the server has been deleted", KR(ret), K(server_info_in_table)); + } else { + int64_t job_id = RS_JOB_CREATE(DELETE_SERVER, trans, "svr_ip", ip, "svr_port", server.get_port()); + if (job_id < 1) { + ret = OB_SQL_OPT_ERROR; + LOG_WARN("insert into all_rootservice_job failed ", K(ret)); + } else if (OB_FAIL(ObServerTableOperator::update_status( + trans, + server, + server_info_in_table.get_status(), + ObServerStatus::OB_SERVER_DELETING))) { + LOG_WARN("fail to update status", KR(ret), K(server), K(server_info_in_table)); + } + } + (void) end_trans_and_on_server_change_(ret, trans, "delete_server", server, server_info_in_table.get_zone(), now); + return ret; +} +int ObServerZoneOpService::check_and_end_delete_server_( + common::ObMySQLTransaction &trans, + const common::ObAddr &server, + const ObZone &zone, + const bool is_cancel, + share::ObServerInfoInTable &server_info) +{ + int ret = OB_SUCCESS; + server_info.reset(); + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + if (OB_UNLIKELY(!is_inited_)) { + ret = 
OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(!server.is_valid() || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server)); + } else if (OB_FAIL(check_and_update_service_epoch_(trans))) { + LOG_WARN("fail to check and update service epoch", KR(ret)); + } else if (OB_FAIL(ObServerTableOperator::get(trans, server, server_info))) { + LOG_WARN("fail to get server_info in table", KR(ret), K(server)); + } else if (!zone.is_empty() && zone != server_info.get_zone()) { + ret = OB_SERVER_ZONE_NOT_MATCH; + LOG_WARN("zone not matches", KR(ret), K(server), K(zone), K(server_info)); + } else if (OB_UNLIKELY(!server_info.is_deleting())) { + ret = OB_SERVER_NOT_DELETING; + LOG_ERROR("server is not in deleting status, cannot be removed from __all_server table", + KR(ret), K(server_info)); + } else { + ObRsJobInfo job_info; + ret = RS_JOB_FIND(job_info, trans, "job_type", "DELETE_SERVER", + "job_status", "INPROGRESS", + "svr_ip", ip, "svr_port", server.get_port()); + if (OB_SUCC(ret) && job_info.job_id_ > 0) { + int tmp_ret = is_cancel ? 
OB_CANCELED : OB_SUCCESS; + if (OB_FAIL(RS_JOB_COMPLETE(job_info.job_id_, tmp_ret, trans))) { + LOG_WARN("fail to all_rootservice_job" , KR(ret), K(server)); + } + } else { + LOG_WARN("failed to find job", KR(ret), K(server)); + } + } + return ret; +} +int ObServerZoneOpService::start_or_stop_server_( + const common::ObAddr &server, + const ObZone &zone, + const obrpc::ObAdminServerArg::AdminServerOp &op) +{ + int ret = OB_SUCCESS; + const int64_t now = ObTimeUtility::current_time(); + ObServerInfoInTable server_info; + ObMySQLTransaction trans; + bool is_start = (ObAdminServerArg::START == op); + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("sql_proxy_ is null", KR(ret), KP(sql_proxy_)); + } else if (OB_FAIL(trans.start(sql_proxy_, OB_SYS_TENANT_ID))) { + LOG_WARN("fail to start trans", KR(ret)); + } else if (OB_FAIL(check_and_update_service_epoch_(trans))) { + LOG_WARN("fail to check and update service epoch", KR(ret)); + } else if (OB_FAIL(ObServerTableOperator::get(trans, server, server_info))) { + LOG_WARN("fail to get server_info", KR(ret), K(server)); + } else if (!zone.is_empty() && zone != server_info.get_zone()) { + ret = OB_SERVER_ZONE_NOT_MATCH; + LOG_WARN("zone not matches", KR(ret), K(server), K(zone), K(server_info)); + } else if (ObAdminServerArg::STOP == op || ObAdminServerArg::FORCE_STOP == op) { + // check again, if there exists stopped servers in other zones + if (ObRootUtils::have_other_stop_task(server_info.get_zone())) { + ret = OB_STOP_SERVER_IN_MULTIPLE_ZONES; + LOG_WARN("can not stop servers in multiple zones", KR(ret), K(server_info.get_zone())); + LOG_USER_ERROR(OB_STOP_SERVER_IN_MULTIPLE_ZONES, + "cannot stop server or stop zone in multiple zones"); + } + } + if (OB_SUCC(ret)) { + 
int64_t new_stop_time = is_start ? 0 : now; + int64_t old_stop_time = server_info.get_stop_time(); + if ((is_start && 0 != old_stop_time) || (!is_start && 0 == old_stop_time)) { + if (OB_FAIL(ObServerTableOperator::update_stop_time( + trans, + server, + old_stop_time, + new_stop_time))) { + LOG_WARN("fail to update stop_time", KR(ret), K(server), K(old_stop_time), K(new_stop_time)); + } + } + LOG_INFO("update stop time", KR(ret), K(server_info), + K(old_stop_time), K(new_stop_time), K(op), K(is_start)); + } + const char *op_print_str = is_start ? "start_server" : "stop_server"; + (void) end_trans_and_on_server_change_(ret, trans, op_print_str, server, server_info.get_zone(), now); + return ret; +} + +int ObServerZoneOpService::construct_rs_list_arg(ObRsListArg &rs_list_arg) +{ + int ret = OB_SUCCESS; + ObLSInfo ls_info; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(lst_operator_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("lst operator is null", KR(ret), KP(lst_operator_)); + } else if (OB_FAIL(lst_operator_->get( + GCONF.cluster_id, + OB_SYS_TENANT_ID, + SYS_LS, + share::ObLSTable::DEFAULT_MODE, + ls_info))) { + LOG_WARN("fail to get ls info", KR(ret)); + } else { + rs_list_arg.master_rs_ = GCONF.self_addr_; + FOREACH_CNT_X(replica, ls_info.get_replicas(), OB_SUCC(ret)) { + if (replica->get_server() == GCONF.self_addr_ + || (replica->is_in_service() + && ObReplicaTypeCheck::is_paxos_replica_V2(replica->get_replica_type()))) { + if (OB_FAIL(rs_list_arg.rs_list_.push_back(replica->get_server()))) { + LOG_WARN("fail to push a server into rs list", KR(ret), K(replica->get_server())); + } + } + } + } + return ret; +} +int ObServerZoneOpService::check_and_update_service_epoch_(ObMySQLTransaction &trans) +{ + int ret = OB_SUCCESS; + int64_t service_epoch_in_table = palf::INVALID_PROPOSAL_ID; + int64_t proposal_id = palf::INVALID_PROPOSAL_ID; + ObRole role; + if (OB_UNLIKELY(!is_inited_)) { + 
ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_FAIL(ObRootUtils::get_proposal_id_from_sys_ls(proposal_id, role))) { + LOG_WARN("fail to get proposal id from sys ls", KR(ret)); + } else if (ObRole::LEADER != role) { + ret = OB_NOT_MASTER; + LOG_WARN("not leader ls", KR(ret), K(proposal_id), K(service_epoch_in_table), K(role)); + } else if (palf::INVALID_PROPOSAL_ID == proposal_id) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid proposal id", KR(ret), K(proposal_id)); + } else if (OB_FAIL(ObServiceEpochProxy::check_and_update_service_epoch( + trans, + OB_SYS_TENANT_ID, + ObServiceEpochProxy::SERVER_ZONE_OP_SERVICE_EPOCH, + proposal_id))) { + LOG_WARN("fail to check and update server zone op service epoch", KR(ret), K(proposal_id)); + } else {} + return ret; +} +int ObServerZoneOpService::fetch_new_server_id_(uint64_t &server_id) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("invalid sql proxy", KR(ret), KP(sql_proxy_)); + } else { + uint64_t new_max_id = OB_INVALID_ID; + ObMaxIdFetcher id_fetcher(*sql_proxy_); + if (OB_FAIL(id_fetcher.fetch_new_max_id( + OB_SYS_TENANT_ID, + OB_MAX_USED_SERVER_ID_TYPE, + new_max_id))) { + LOG_WARN("fetch_new_max_id failed", KR(ret)); + } else { + server_id = new_max_id; + } + } + return ret; +} +int ObServerZoneOpService::check_server_have_enough_resource_for_delete_server_( + const ObIArray &servers, + const ObZone &zone) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(unit_manager_) || OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unit_manager_ or sql_proxy_ is null", KR(ret), KP(unit_manager_), KP(sql_proxy_)); + } else { + ObServerInfoInTable server_info; + FOREACH_CNT_X(server, servers, OB_SUCC(ret)) { + 
server_info.reset(); + if (OB_FAIL(ObServerTableOperator::get(*sql_proxy_, *server, server_info))) { + LOG_WARN("fail to get server_info in table", KR(ret), KP(sql_proxy_), KPC(server)); + } else if (!zone.is_empty() && server_info.get_zone() != zone) { + ret = OB_SERVER_ZONE_NOT_MATCH; + LOG_WARN("the arg zone is not the same as the server's zone in __all_server table", KR(ret), + K(zone), K(server_info)); + } else if (OB_FAIL(unit_manager_->check_enough_resource_for_delete_server( + *server, server_info.get_zone()))) { + LOG_WARN("fail to check enouch resource", KR(ret), KPC(server), K(server_info)); + } + }//end for each + } + return ret; +} +int ObServerZoneOpService::check_zone_and_server_( + const ObIArray &servers_info, + const ObIArray &servers, + bool &is_same_zone, + bool &is_all_stopped) +{ + int ret = OB_SUCCESS; + is_same_zone = true; + is_all_stopped = true; + if (OB_UNLIKELY(!is_inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(is_inited_)); + } else { + ObServerInfoInTable server_info; + ObZone zone; + for (int64_t i = 0; i < servers.count() && OB_SUCC(ret) && (is_same_zone || is_all_stopped); i++) { + const ObAddr &server = servers.at(i); + server_info.reset(); + if (OB_FAIL(ObRootUtils::find_server_info(servers_info, server, server_info))) { + LOG_WARN("fail to get server info", KR(ret), K(servers_info), K(server)); + } else if (0 == i) { + if (OB_FAIL(zone.assign(server_info.get_zone()))) { + LOG_WARN("fail to assign zone", KR(ret), K(server_info.get_zone())); + } + } else if (zone != server_info.get_zone()) { + is_same_zone = false; + LOG_WARN("server zone not same", K(zone), K(server_info), K(servers)); + } + if (OB_FAIL(ret)) { + } else if (!server_info.is_stopped()) { + is_all_stopped = false; + } + } + } + return ret; +} +void ObServerZoneOpService::end_trans_and_on_server_change_( + int &ret, + common::ObMySQLTransaction &trans, + const char *op_print_str, + const common::ObAddr &server, + const ObZone &zone, + const 
int64_t start_time) +{ + int tmp_ret = OB_SUCCESS; + LOG_INFO("start execute end_trans_and_on_server_change_", KR(ret), + K(op_print_str), K(server), K(zone), K(start_time)); + if (OB_UNLIKELY(!trans.is_started())) { + LOG_WARN("the transaction is not started"); + } else { + if (OB_TMP_FAIL(trans.end(OB_SUCC(ret)))) { + LOG_WARN("fail to commit the transaction", KR(ret), KR(tmp_ret), K(server), K(zone)); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } + } + if (OB_ISNULL(server_change_callback_)) { + tmp_ret = OB_ERR_UNEXPECTED; + LOG_WARN("server_change_callback_ is null", KR(ret), KR(tmp_ret), KP(server_change_callback_)); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } else if (OB_TMP_FAIL(server_change_callback_->on_server_change())) { + LOG_WARN("fail to callback on server change", KR(ret), KR(tmp_ret)); + } + int64_t time_cost = ::oceanbase::common::ObTimeUtility::current_time() - start_time; + FLOG_INFO(op_print_str, K(server), K(zone), "time cost", time_cost, KR(ret)); + ROOTSERVICE_EVENT_ADD("server", op_print_str, K(server), K(ret)); +} +} +} diff --git a/src/rootserver/ob_server_zone_op_service.h b/src/rootserver/ob_server_zone_op_service.h new file mode 100644 index 000000000..5959dae90 --- /dev/null +++ b/src/rootserver/ob_server_zone_op_service.h @@ -0,0 +1,216 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. 
+ */ + +#ifndef OCEANBASE_ROOTSERVER_OB_SERVER_ZONE_OP_SERVICE_H +#define OCEANBASE_ROOTSERVER_OB_SERVER_ZONE_OP_SERVICE_H + +#include "share/ob_server_table_operator.h" +#include "share/ob_rpc_struct.h" + +namespace oceanbase +{ +namespace obrpc +{ +class ObSrvRpcProxy; +struct ObRsListArg; +// struct ObAdminServerArg; +} +namespace share +{ +class ObLSTableOperator; +class ObAllServerTracer; +} +namespace rootserver +{ +class ObIServerChangeCallback; +class ObUnitManager; +class ObServerZoneOpService +{ +public: + ObServerZoneOpService(); + virtual ~ObServerZoneOpService(); + int init( + ObIServerChangeCallback &server_change_callback, + obrpc::ObSrvRpcProxy &rpc_proxy, + share::ObLSTableOperator &lst_operator, + ObUnitManager &unit_manager, + ObMySQLProxy &sql_proxy + ); + // Add new servers to a specified(optional) zone in the cluster. + // The servers should be empty and the zone should be active. + // This operation is successful + // if the servers' info are inserted into __all_server table successfully. + // + // @param[in] servers the servers which we want to add + // @param[in] zone the zone in which the servers will be located. If it's empty, + // the zone specified in the servers' local config will be picked + // + // @ret OB_SUCCESS add successfully + // @ret OB_ZONE_NOT_ACTIVE the specified zone is not active + // @ret OB_SERVER_ZONE_NOT_MATCH the zone specified in the server's local config is not the same + // as the zone specified in the system command ADD SERVER + // or both are empty + // @ret OB_ENTRY_EXIST there exists servers which are already added + // + // @ret other error code failure + int add_servers(const ObIArray &servers, const ObZone &zone, bool is_bootstrap = false); + int construct_rs_list_arg(obrpc::ObRsListArg &rs_list_arg); + // Try to delete the given servers from the cluster (logically). + // In this func, we only set their statuses in __all_server table be OB_SERVER_DELETING. 
+ // Root balancer will detect servers with such statuses + // and start to migrate units on these servers to other servers. + // Once a server with status OB_SERVER_DELETING has no units and no records in __all_ls_meta_table, + // this server will be deleted from __all_server table, which means this server is no longer in the cluster + // (see the related mechanism in ObEmptyServerChecker). + // + // @param[in] servers the servers which we try to delete + // @param[in] zone the zone in which the servers are located + // + // @ret OB_SUCCESS set status be OB_SERVER_DELETING in __all_server table successfully + // @ret OB_SERVER_ZONE_NOT_MATCH the arg zone is not the same as the server's zone in __all_server table + // @ret OB_SERVER_ALREADY_DELETED the server's status has been OB_SERVER_DELETING already + // @ret OB_SERVER_NOT_IN_WHITE_LIST the server is not in the cluster + // @ret OB_NOT_MASTER not rs leader, cannot execute the command + // + // @ret other error code failure + int delete_servers( + const ObIArray &servers, + const common::ObZone &zone); + // Revoke the delete operation for the given server from the cluster (logically). + // What we do in this func is to set servers' status be OB_SERVER_ACTIVE + // or OB_SERVER_INACTIVE in __all_server table + // and prevent units on this server from being migrated to other servers.
+ // + // @param[in] server the server for which we want to revoke the delete operation + // @param[in] zone the zone in which the server is located + // + // @ret OB_SUCCESS set status be OB_SERVER_ACTIVE or OB_SERVER_INACTIVE in __all_server table successfully + // @ret OB_SERVER_ZONE_NOT_MATCH the arg zone is not the same as the server's zone in __all_server table + // @ret OB_SERVER_NOT_DELETING the server's status is not OB_SERVER_DELETING, we cannot cancel delete + // @ret OB_SERVER_NOT_IN_WHITE_LIST the server is not in the cluster + // @ret OB_NOT_MASTER not rs leader, cannot execute the command + +// @ret other error code failure + int cancel_delete_servers( + const ObIArray &servers, + const common::ObZone &zone); + // Delete the given server from the cluster + // In this func, we delete the server from __all_server table. + + // @param[in] server the server which we want to delete + // @param[in] zone the zone in which the server is located + + // @ret OB_SUCCESS delete the server from __all_server table successfully + // @ret OB_SERVER_NOT_DELETING the server's status is not OB_SERVER_DELETING, we cannot remove it + // @ret OB_SERVER_NOT_IN_WHITE_LIST the server is not in the cluster + // @ret OB_NOT_MASTER not rs leader, cannot execute the command + + // @ret other error code failure + int finish_delete_server( + const common::ObAddr &server, + const common::ObZone &zone); + // stop the given server + // In this func, we set the server's stop_time be now in __all_server table + // Stopping server should guarantee that there is no other zone's server is stopped. + // Isolating server should guarantee that there still exists started server in primary region after isolating + // In addition, stop server will check majority and log sync. 
+ // + // @param[in] servers the servers which we want to stop + // @param[in] zone the zone in which the servers are located + // @param[in] op the operation type: STOP, FORCE_STOP or ISOLATE + // + // @ret OB_SUCCESS stop the server successfully + // @ret OB_INVALID_ARGUMENT an invalid server + // @ret OB_SERVER_ZONE_NOT_MATCH the arg zone is not the same as the server's zone in __all_server table + // @ret OB_NOT_MASTER not rs leader, cannot execute the command + // @ret OB_SERVER_NOT_IN_WHITE_LIST the server is not in the cluster + + // @ret other error code failure + int stop_servers( + const ObIArray &servers, + const ObZone &zone, + const obrpc::ObAdminServerArg::AdminServerOp &op); + // start the given server + // In this func, we set the server's stop_time be zero in __all_server table + // + // @param[in] servers the servers which we want to start + // @param[in] zone the zone in which the servers are located + // + // @ret OB_SUCCESS start the server successfully + // @ret OB_INVALID_ARGUMENT an invalid server + // @ret OB_SERVER_ZONE_NOT_MATCH the arg zone is not the same as the server's zone in __all_server table + // @ret OB_NOT_MASTER not rs leader, cannot execute the command + // @ret OB_SERVER_NOT_IN_WHITE_LIST the server is not in the cluster + + // @ret other error code failure + int start_servers( + const ObIArray &servers, + const ObZone &zone); + int stop_server_precheck( + const ObIArray &servers, + const obrpc::ObAdminServerArg::AdminServerOp &op); +private: + int zone_checking_for_adding_server_( + const common::ObZone &command_zone, + const common::ObZone &rpc_zone, + ObZone &picked_zone); + int add_server_( + const common::ObAddr &server, + const common::ObZone &zone, + const int64_t sql_port, + const share::ObServerInfoInTable::ObBuildVersion &build_version); + int delete_server_( + const common::ObAddr &server, + const common::ObZone &zone); + int check_and_end_delete_server_( + common::ObMySQLTransaction
&trans, + const common::ObAddr &server, + const common::ObZone &zone, + const bool is_cancel, + share::ObServerInfoInTable &server_info); + int start_or_stop_server_( + const common::ObAddr &server, + const ObZone &zone, + const obrpc::ObAdminServerArg::AdminServerOp &op); + int check_and_update_service_epoch_(common::ObMySQLTransaction &trans); + int fetch_new_server_id_(uint64_t &server_id); + int check_server_have_enough_resource_for_delete_server_( + const ObIArray &servers, + const common::ObZone &zone); + int check_zone_and_server_( + const ObIArray &servers_info, + const ObIArray &servers, + bool &is_same_zone, + bool &is_all_stopped); + void end_trans_and_on_server_change_( + int &ret, + common::ObMySQLTransaction &trans, + const char *op_print_str, + const common::ObAddr &server, + const common::ObZone &zone, + const int64_t start_time); + bool is_inited_; + ObIServerChangeCallback *server_change_callback_; + obrpc::ObSrvRpcProxy *rpc_proxy_; + ObMySQLProxy *sql_proxy_; + share::ObLSTableOperator *lst_operator_; + share::ObServerTableOperator st_operator_; + ObUnitManager *unit_manager_; + +private: + DISALLOW_COPY_AND_ASSIGN(ObServerZoneOpService); +}; +} // rootserver +} // oceanbase + +#endif \ No newline at end of file diff --git a/src/rootserver/ob_system_admin_util.cpp b/src/rootserver/ob_system_admin_util.cpp index fd36c882d..5a44c76b9 100644 --- a/src/rootserver/ob_system_admin_util.cpp +++ b/src/rootserver/ob_system_admin_util.cpp @@ -255,7 +255,7 @@ int ObAdminSwitchReplicaRole::get_tenants_of_zone(const ObZone &zone, ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", K(zone), "tenant_id_set created", tenant_id_set.created(), KR(ret)); - } else if (OB_FAIL(ctx_.server_mgr_->get_alive_servers(zone, server_array))) { + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, server_array))) { LOG_WARN("get alive servers failed", K(zone), KR(ret)); } else { FOREACH_CNT_X(server, server_array, OB_SUCCESS == ret) { @@ -320,7 +320,7 @@ int 
ObAdminCallServer::get_server_list(const ObServerZoneArg &arg, ObIArraycheck_server_alive(arg.server_, is_alive))) { + if (OB_FAIL(SVR_TRACER.check_server_alive(arg.server_, is_alive))) { LOG_WARN("fail to check server alive", KR(ret), "server", arg.server_); } else if (!is_alive) { ret = OB_INVALID_ARGUMENT; @@ -335,7 +335,7 @@ int ObAdminCallServer::get_server_list(const ObServerZoneArg &arg, ObIArrayget_alive_servers(arg.zone_, server_list))) { + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(arg.zone_, server_list))) { LOG_WARN("get alive servers failed", KR(ret), K(arg)); } } @@ -474,6 +474,9 @@ int ObAdminReloadServer::execute() if (!ctx_.is_inited()) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); + } else if (OB_ISNULL(ctx_.server_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("ctx_.server_mgr_ is null", KR(ret), KP(ctx_.server_mgr_)); } else if (OB_FAIL(ctx_.server_mgr_->load_server_manager())) { LOG_WARN("build server status failed", KR(ret)); } @@ -932,7 +935,7 @@ int ObAdminSetConfig::update_config(obrpc::ObAdminSetConfigArg &arg, int64_t new if (false == addr.set_ip_addr(svr_ip, static_cast(svr_port))){ ret = OB_ERR_UNEXPECTED; LOG_WARN("set addr fail", KR(ret), "svr_ip", svr_ip, K(svr_port)); - } else if (OB_FAIL(ctx_.server_mgr_->is_server_exist(addr, is_server_exist))) { + } else if (OB_FAIL(SVR_TRACER.is_server_exist(addr, is_server_exist))) { LOG_WARN("check server exist fail", K(addr)); } else if (!is_server_exist) { ret = OB_INVALID_ARGUMENT; @@ -1495,10 +1498,7 @@ int ObAdminRollingUpgradeCmd::execute(const obrpc::ObAdminRollingUpgradeArg &arg } else if (obrpc::OB_UPGRADE_STAGE_POSTUPGRADE == arg.stage_) { // end rolling upgrade, should raise min_observer_version const char *min_obs_version_name = "min_observer_version"; - if (OB_ISNULL(ctx_.server_mgr_)) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("server_mgr is null", KR(ret)); - } else if (OB_FAIL(ctx_.server_mgr_->get_min_server_version(min_server_version))) { + if 
(OB_FAIL(SVR_TRACER.get_min_server_version(min_server_version))) { LOG_WARN("failed to get the min server version", KR(ret)); } else if (OB_FAIL(item.name_.assign(min_obs_version_name))) { LOG_WARN("assign min_observer_version config name failed", @@ -1751,6 +1751,7 @@ int ObAdminRootInspection::execute(const obrpc::ObRunJobArg &arg) { int ret = OB_SUCCESS; LOG_INFO("execute root inspection request", K(arg)); + ObAddr rs_addr; if (!ctx_.is_inited()) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); @@ -1760,19 +1761,24 @@ int ObAdminRootInspection::execute(const obrpc::ObRunJobArg &arg) } else if (ROOT_INSPECTION != get_inner_job_value(arg.job_)) { ret = OB_ERR_UNEXPECTED; LOG_WARN("job to run not root inspection", K(arg), KR(ret)); - } else if (!ctx_.server_mgr_->is_inited()) { - ret = OB_INNER_STAT_ERROR; - LOG_WARN("server_mgr_ not inited", KR(ret)); + } else if (OB_ISNULL(GCTX.rs_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.rs_mgr_ is null", KR(ret), KP(GCTX.rs_mgr_)); + } else if (OB_FAIL(GCTX.rs_mgr_->get_master_root_server(rs_addr))) { + LOG_WARN("fail to get master root server", KR(ret)); + } else if (OB_UNLIKELY(!rs_addr.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("rs_addr is invalid", KR(ret), K(rs_addr)); } else if (!ctx_.root_inspection_->is_inited()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("root_inspection not inited", KR(ret)); } else if (!arg.zone_.is_empty()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("root inspection can't execute by zone", K(arg), KR(ret)); - } else if (arg.server_.is_valid() && arg.server_ != ctx_.server_mgr_->get_rs_addr()) { + } else if (arg.server_.is_valid() && arg.server_ != rs_addr) { ret = OB_INVALID_ARGUMENT; LOG_WARN("only rs can execute root inspection", K(arg), - "rs", ctx_.server_mgr_->get_rs_addr(), KR(ret)); + "rs", rs_addr, KR(ret)); } else if (OB_FAIL(ctx_.root_inspection_->check_all())) { LOG_WARN("root_inspection check_all failed", KR(ret)); } @@ -1890,13 +1896,12 @@ int 
ObTenantServerAdminUtil::get_tenant_servers(const uint64_t tenant_id, common } } else { ObArray pool_ids; - if (OB_ISNULL(ctx_.server_mgr_) || OB_ISNULL(ctx_.unit_mgr_)) { + if (OB_ISNULL(ctx_.unit_mgr_)) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ctx_.server_mgr_), K(ctx_.unit_mgr_), KR(ret)); - } else if (!ctx_.server_mgr_->has_build() || !ctx_.unit_mgr_->check_inner_stat()) { + LOG_WARN("invalid argument", K(ctx_.unit_mgr_), KR(ret)); + } else if (!SVR_TRACER.has_build() || !ctx_.unit_mgr_->check_inner_stat()) { ret = OB_SERVER_IS_INIT; LOG_WARN("server manager or unit manager hasn't built", - "server_mgr built", ctx_.server_mgr_->has_build(), "unit_mgr built", ctx_.unit_mgr_->check_inner_stat(), KR(ret)); } else if (OB_FAIL(ctx_.unit_mgr_->get_pool_ids_of_tenant(tenant_id, pool_ids))) { LOG_WARN("get_pool_ids_of_tenant failed", K(tenant_id), KR(ret)); @@ -1910,7 +1915,7 @@ int ObTenantServerAdminUtil::get_tenant_servers(const uint64_t tenant_id, common for (int64_t j = 0; OB_SUCC(ret) && j < unit_infos.count(); ++j) { bool is_alive = false; const ObUnit &unit = unit_infos.at(j).unit_; - if (OB_FAIL(ctx_.server_mgr_->check_server_alive(unit.server_, is_alive))) { + if (OB_FAIL(SVR_TRACER.check_server_alive(unit.server_, is_alive))) { LOG_WARN("check_server_alive failed", "server", unit.server_, KR(ret)); } else if (is_alive) { if (OB_FAIL(servers.push_back(unit.server_))) { @@ -1919,7 +1924,7 @@ int ObTenantServerAdminUtil::get_tenant_servers(const uint64_t tenant_id, common } if (OB_SUCC(ret)) { if (unit.migrate_from_server_.is_valid()) { - if (OB_FAIL(ctx_.server_mgr_->check_server_alive( + if (OB_FAIL(SVR_TRACER.check_server_alive( unit.migrate_from_server_, is_alive))) { LOG_WARN("check_server_alive failed", "server", unit.migrate_from_server_, KR(ret)); @@ -1943,10 +1948,7 @@ int ObTenantServerAdminUtil::get_all_servers(common::ObIArray &servers) { int ret = OB_SUCCESS; ObZone empty_zone; - if (OB_ISNULL(ctx_.server_mgr_)) { - ret = 
OB_INVALID_ARGUMENT; - LOG_WARN("invalid argument", K(ctx_.server_mgr_), KR(ret)); - } else if (OB_FAIL(ctx_.server_mgr_->get_alive_servers(empty_zone, servers))){ + if (OB_FAIL(SVR_TRACER.get_alive_servers(empty_zone, servers))) { //if zone is empty, get all servers LOG_WARN("fail to get all servers", KR(ret)); } diff --git a/src/rootserver/ob_table_creator.h b/src/rootserver/ob_table_creator.h index 44085152f..46c3f3953 100644 --- a/src/rootserver/ob_table_creator.h +++ b/src/rootserver/ob_table_creator.h @@ -26,7 +26,6 @@ class SCN; } namespace rootserver { -class ObServerManager; class ObTableCreator { public: diff --git a/src/rootserver/ob_tablet_creator.h b/src/rootserver/ob_tablet_creator.h index 241f2ad23..a71f4ba2d 100644 --- a/src/rootserver/ob_tablet_creator.h +++ b/src/rootserver/ob_tablet_creator.h @@ -33,7 +33,6 @@ class ObLSTableOperator; } namespace rootserver { -class ObServerManager; struct ObTabletCreatorArg { public: diff --git a/src/rootserver/ob_tenant_info_loader.cpp b/src/rootserver/ob_tenant_info_loader.cpp index f06dc3027..6056edcb6 100644 --- a/src/rootserver/ob_tenant_info_loader.cpp +++ b/src/rootserver/ob_tenant_info_loader.cpp @@ -304,26 +304,26 @@ void ObTenantInfoLoader::broadcast_tenant_info_content_() if (OB_FAIL(tenant_info_cache_.get_tenant_info(tenant_info, last_sql_update_time, ora_rowscn))) { LOG_WARN("failed to get tenant info", KR(ret)); - } else if (OB_FAIL(share::ObAllServerTracer::get_instance().for_each_server_status( - [&rpc_count, &tenant_info, &proxy, ora_rowscn](const share::ObServerStatus &status) -> int { + } else if (OB_FAIL(share::ObAllServerTracer::get_instance().for_each_server_info( + [&rpc_count, &tenant_info, &proxy, ora_rowscn](const share::ObServerInfoInTable &server_info) -> int { int ret = OB_SUCCESS; obrpc::ObUpdateTenantInfoCacheArg arg; - if (!status.is_valid()) { - LOG_WARN("skip invalid status", KR(ret), K(status)); - } else if (!status.is_alive()) { + if (!server_info.is_valid()) { + 
LOG_WARN("skip invalid server_info", KR(ret), K(server_info)); + } else if (!server_info.is_alive()) { //not send to alive } else if (OB_FAIL(arg.init(tenant_info.get_tenant_id(), tenant_info, ora_rowscn))) { LOG_WARN("failed to init arg", KR(ret), K(tenant_info), K(ora_rowscn)); // use meta rpc process thread - } else if (OB_FAIL(proxy.call(status.server_, DEFAULT_TIMEOUT_US, gen_meta_tenant_id(tenant_info.get_tenant_id()), arg))) { - LOG_WARN("failed to send rpc", KR(ret), K(status), K(tenant_info), K(arg)); + } else if (OB_FAIL(proxy.call(server_info.get_server(), DEFAULT_TIMEOUT_US, gen_meta_tenant_id(tenant_info.get_tenant_id()), arg))) { + LOG_WARN("failed to send rpc", KR(ret), K(server_info), K(tenant_info), K(arg)); } else { rpc_count++; } return ret; }))) { - LOG_WARN("for each server status failed", KR(ret)); + LOG_WARN("for each server_info failed", KR(ret)); } int tmp_ret = OB_SUCCESS; diff --git a/src/rootserver/ob_unit_manager.cpp b/src/rootserver/ob_unit_manager.cpp index f525e1579..50809c41e 100644 --- a/src/rootserver/ob_unit_manager.cpp +++ b/src/rootserver/ob_unit_manager.cpp @@ -33,6 +33,7 @@ #include "share/ob_max_id_fetcher.h" #include "share/inner_table/ob_inner_table_schema.h" #include "share/ob_tenant_memstore_info_operator.h" +#include "share/ob_rpc_struct.h" #include "storage/ob_file_system_router.h" #include "observer/ob_server_struct.h" #include "rootserver/ob_balance_info.h" @@ -43,6 +44,8 @@ #include "rootserver/ob_root_service.h" #include "rootserver/ob_root_balancer.h" #include "storage/ob_file_system_router.h" +#include "share/ob_all_server_tracer.h" +#include "rootserver/ob_heartbeat_service.h" namespace oceanbase { @@ -3051,9 +3054,7 @@ int ObUnitManager::check_server_enough(const uint64_t tenant_id, } // end for } //Count the number of existing units - if (OB_FAIL(ret)) { - //nothing todo - } else if (OB_FAIL(get_pools_by_tenant(tenant_id, pools))) { + if (FAILEDx(get_pools_by_tenant(tenant_id, pools))) { if (OB_ENTRY_NOT_EXIST 
== ret) { // a new tenant, without resource pool already granted ret = OB_SUCCESS; @@ -3084,13 +3085,12 @@ int ObUnitManager::check_server_enough(const uint64_t tenant_id, } } ObArray zone_infos; - if (OB_FAIL(ret)) { - } else if (OB_FAIL(zone_mgr_.get_zone(zone_infos))) { + if (FAILEDx(zone_mgr_.get_zone(zone_infos))) { LOG_WARN("fail to get zone infos", K(ret)); } else { //Count the number of units in zone for (int64_t i = 0; i < zone_infos.count() && OB_SUCC(ret) && enough; i++) { - ObZone zone = zone_infos.at(i).zone_; + const ObZone &zone = zone_infos.at(i).zone_; int64_t unit_count = 0; int64_t alive_server_count = 0; for (int64_t j = 0; j < total_unit_infos.count() && OB_SUCC(ret); j++) { @@ -3099,8 +3099,8 @@ int ObUnitManager::check_server_enough(const uint64_t tenant_id, } } if (unit_count > 0) { - if (OB_FAIL(server_mgr_.get_alive_server_count(zone, alive_server_count))) { - LOG_WARN("fail to get alive server count", K(ret), K(zone)); + if (OB_FAIL(SVR_TRACER.get_alive_servers_count(zone, alive_server_count))) { + LOG_WARN("fail to get alive server count", KR(ret), K(zone)); } else if (alive_server_count < unit_count) { //ret = OB_UNIT_NUM_OVER_SERVER_COUNT; enough = false; @@ -4429,9 +4429,9 @@ int ObUnitManager::inner_get_zone_alive_unit_infos_by_tenant( LOG_WARN("unit is empty", K(ret)); } else if (zone != u->unit_.zone_) { // do not belong to this zone - } else if (OB_FAIL(server_mgr_.check_server_alive(u->unit_.server_, is_alive))) { + } else if (OB_FAIL(SVR_TRACER.check_server_alive(u->unit_.server_, is_alive))) { LOG_WARN("check_server_alive failed", "server", u->unit_.server_, K(ret)); - } else if (OB_FAIL(server_mgr_.check_in_service(u->unit_.server_, is_in_service))) { + } else if (OB_FAIL(SVR_TRACER.check_in_service(u->unit_.server_, is_in_service))) { LOG_WARN("check server in service failed", "server", u->unit_.server_, K(ret)); } else if (!is_alive || !is_in_service) { // ignore unit on not-alive server @@ -5055,7 +5055,7 @@ int 
ObUnitManager::calc_sum_load(const ObArray *unit_loads, return ret; } -int ObUnitManager::check_resource_pool(share::ObResourcePool &resource_pool) const +int ObUnitManager::check_resource_pool(share::ObResourcePool &resource_pool) const { int ret = OB_SUCCESS; if (!check_inner_stat()) { @@ -5110,8 +5110,8 @@ int ObUnitManager::check_resource_pool(share::ObResourcePool &resource_pool) co } FOREACH_CNT_X(zone, resource_pool.zone_list_, OB_SUCCESS == ret) { int64_t alive_server_count = 0; - if (OB_FAIL(server_mgr_.get_alive_server_count(*zone, alive_server_count))) { - LOG_WARN("get_alive_servers failed", "zone", *zone, K(ret)); + if (OB_FAIL(SVR_TRACER.get_alive_servers_count(*zone, alive_server_count))) { + LOG_WARN("get_alive_servers failed", KR(ret), KPC(zone)); } else if (alive_server_count < resource_pool.unit_count_) { ret = OB_UNIT_NUM_OVER_SERVER_COUNT; LOG_WARN("resource pool unit num over zone server count", "unit_count", @@ -5157,19 +5157,17 @@ int ObUnitManager::try_notify_tenant_server_unit_resource( const bool skip_offline_server) { int ret = OB_SUCCESS; - bool is_server_alive = false; + bool is_alive = false; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("check_inner_stat failed", K(ret), K(inited_), K(loaded_)); - } else if (OB_UNLIKELY(nullptr == srv_rpc_proxy_)) { + } else if (OB_ISNULL(srv_rpc_proxy_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("srv_rpc_proxy_ ptr is null", K(ret)); - } else if (OB_FAIL(server_mgr_.check_server_alive(unit.server_, is_server_alive))) { - LOG_WARN("fail to check server alive", K(ret), "server", unit.server_); - } else if (!is_server_alive && (is_delete || skip_offline_server)) { + LOG_WARN("srv_rpc_proxy_ is null", KR(ret), KP(srv_rpc_proxy_)); + } else if (OB_FAIL(SVR_TRACER.check_server_alive(unit.server_, is_alive))) { + LOG_WARN("fail to get server_info", KR(ret), K(unit.server_)); + } else if (!is_alive && (is_delete || skip_offline_server)) { LOG_INFO("server not alive when delete unit, ignore", 
"server", unit.server_); - } else if (!is_delete && OB_FAIL(server_mgr_.set_with_partition(unit.server_))) { - LOG_WARN("failed to set with partition", KR(ret), K(unit)); } else { share::ObUnitConfig *unit_config = nullptr; if (!is_valid_tenant_id(new_pool.tenant_id_) && !is_delete) { @@ -5317,6 +5315,9 @@ int ObUnitManager::allocate_pool_units_( int ret = OB_SUCCESS; ObUnitConfig *config = NULL; lib::Worker::CompatMode compat_mode = lib::Worker::CompatMode::INVALID; + ObArray servers_info; + ObArray active_servers_info_of_zone; + ObArray active_servers_resource_info_of_zone; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; @@ -5331,10 +5332,10 @@ int ObUnitManager::allocate_pool_units_( && unit_group_id_array->count() != increase_delta_unit_num) { ret = OB_INVALID_ARGUMENT; LOG_WARN("new unit group id array status not match", - KR(ret), K(increase_delta_unit_num), KP(unit_group_id_array)); - } else if (OB_UNLIKELY(nullptr == srv_rpc_proxy_)) { + KR(ret), K(increase_delta_unit_num), KP(unit_group_id_array)); + } else if (OB_ISNULL(srv_rpc_proxy_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("srv_rpc_proxy_ ptr is null", K(ret)); + LOG_WARN("srv_rpc_proxy_ is null", KR(ret), KP(srv_rpc_proxy_)); } else if (is_valid_tenant_id(pool.tenant_id_) && OB_FAIL(ObCompatModeGetter::get_tenant_mode(pool.tenant_id_, compat_mode))) { LOG_WARN("fail to get tenant compat mode", KR(ret), K(pool.tenant_id_)); @@ -5353,18 +5354,33 @@ int ObUnitManager::allocate_pool_units_( for (int64_t i = 0; OB_SUCC(ret) && i < zones.count(); ++i) { // for each zone const ObZone &zone = zones.at(i); excluded_servers.reuse(); + active_servers_info_of_zone.reuse(); + active_servers_resource_info_of_zone.reuse(); if (FAILEDx(get_excluded_servers(pool.resource_pool_id_, zone, module, new_allocate_pool, excluded_servers))) { LOG_WARN("get excluded servers fail", KR(ret), K(pool.resource_pool_id_), K(zone), K(module), K(new_allocate_pool)); + } else if (OB_FAIL(SVR_TRACER.get_active_servers_info(zone, 
active_servers_info_of_zone))) { + LOG_WARN("fail to get active_servers_info_of_zone", KR(ret), K(servers_info), K(zone)); + } else if (OB_FAIL(get_servers_resource_info_via_rpc( + active_servers_info_of_zone, + active_servers_resource_info_of_zone))) { + LOG_WARN("fail to get active_servers_resource_info_of_zone", KR(ret), K(active_servers_info_of_zone)); } for (int64_t j = 0; OB_SUCC(ret) && j < increase_delta_unit_num; ++j) { uint64_t unit_id = OB_INVALID_ID; std::string resource_not_enough_reason; ObAddr server; - if (OB_FAIL(choose_server_for_unit(config->unit_resource(), zone, excluded_servers, module, server, + if (OB_FAIL(choose_server_for_unit( + config->unit_resource(), + zone, + excluded_servers, + module, + active_servers_info_of_zone, + active_servers_resource_info_of_zone, + server, resource_not_enough_reason))) { LOG_WARN("choose server for unit failed", K(module), KR(ret), "unit_idx", j, K(increase_delta_unit_num), K(zone), K(excluded_servers), KPC(config)); @@ -5431,34 +5447,44 @@ int ObUnitManager::allocate_pool_units_( return ret; } -int ObUnitManager::get_excluded_servers(const ObUnit &unit, - const ObUnitStat &unit_stat, - const char *module, - ObIArray &servers) const +int ObUnitManager::get_excluded_servers( + const ObUnit &unit, + const ObUnitStat &unit_stat, + const char *module, + const ObIArray &servers_info, // servers info in unit.zone_ + const ObIArray &report_servers_resource_info, // active servers' resource info in unit.zone_ + ObIArray &servers) const { int ret = OB_SUCCESS; //Add all OBS whose disks do not meet the requirements ObArray server_list; + ObServerResourceInfo server_resource_info; const bool new_allocate_pool = false; if (OB_FAIL(get_excluded_servers(unit.resource_pool_id_, unit.zone_, module, new_allocate_pool, servers))) { LOG_WARN("fail to get excluded_servers", K(ret), K(unit), K(new_allocate_pool)); - } else if (OB_FAIL(server_mgr_.get_server_statuses(unit.zone_, server_list))) { - LOG_WARN("fail to get server of 
zone", K(ret), K(unit)); } else { - for (int64_t i = 0; i < server_list.count() && OB_SUCC(ret); i++) { - ObServerStatus &status = server_list.at(i); + for (int64_t i = 0; i < servers_info.count() && OB_SUCC(ret); i++) { + const ObServerInfoInTable &server_info = servers_info.at(i); + const ObAddr &server = server_info.get_server(); bool is_exclude = false; - if (!status.can_migrate_in()) { + server_resource_info.reset(); + if (!server_info.can_migrate_in()) { is_exclude = true; - LOG_INFO("server can't migrate in, push into excluded_array", K(status), K(module)); + LOG_INFO("server can't migrate in, push into excluded_array", K(server_info), K(module)); + } else if (OB_FAIL(ObRootUtils::get_server_resource_info( + report_servers_resource_info, + server, + server_resource_info))) { + // server which can be migrated in must have its resource_info + LOG_WARN("fail to get server_resource_info", KR(ret), K(report_servers_resource_info), K(server)); } else { - int64_t required_size = unit_stat.required_size_ + status.resource_info_.disk_in_use_; - int64_t total_size = status.resource_info_.disk_total_; + int64_t required_size = unit_stat.required_size_ + server_resource_info.disk_in_use_; + int64_t total_size = server_resource_info.disk_total_; if (total_size <= required_size || total_size <= 0) { is_exclude = true; - LOG_INFO("server total size no bigger than required size", K(module), K(required_size), K(total_size), - K(unit_stat), K(status.resource_info_)); + LOG_INFO("server total size no bigger than required size", K(module), K(required_size), + K(total_size), K(unit_stat), K(server_resource_info)); } else if (required_size <= 0) { //nothing todo } else { @@ -5473,10 +5499,10 @@ int ObUnitManager::get_excluded_servers(const ObUnit &unit, } if (!is_exclude) { //nothing todo - } else if (has_exist_in_array(servers, status.server_)) { + } else if (has_exist_in_array(servers, server)) { //nothing todo - } else if (OB_FAIL(servers.push_back(status.server_))) { - 
LOG_WARN("fail to push back", K(ret), K(status)); + } else if (OB_FAIL(servers.push_back(server))) { + LOG_WARN("fail to push back", KR(ret), K(server)); } } } @@ -5658,32 +5684,34 @@ int ObUnitManager::get_pool_servers(const uint64_t resource_pool_id, // @ret OB_SUCCESS on success // @ret OB_ZONE_RESOURCE_NOT_ENOUGH zone resource not enough to hold new unit // @ret OB_ZONE_SERVER_NOT_ENOUGH all valid servers are excluded, no server to hold new unit -int ObUnitManager::choose_server_for_unit(const ObUnitResource &config, - const ObZone &zone, - const ObArray &excluded_servers, - const char *module, - ObAddr &choosed_server, - std::string &resource_not_enough_reason) const +int ObUnitManager::choose_server_for_unit( + const ObUnitResource &config, + const ObZone &zone, + const ObArray &excluded_servers, + const char *module, + const ObIArray &active_servers_info, // active_servers_info of the give zone, + const ObIArray &active_servers_resource_info, // active_servers_resource_info of the give zone + ObAddr &choosed_server, + std::string &resource_not_enough_reason) const { int ret = OB_SUCCESS; - ObArray server_statuses; + ObArray statuses; ObArray server_resources; - + ObArray servers_info; + ObArray report_servers_resource_info; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("check_inner_stat failed", K(inited_), K(loaded_), K(ret)); } else if (!config.is_valid() || zone.is_empty()) { // excluded_servers can be empty ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid config", K(config), K(zone), K(ret)); - } else if (OB_FAIL(server_mgr_.get_server_statuses(zone, server_statuses))) { - LOG_WARN("get_server_statuses failed", K(zone), K(ret)); - } else if (OB_FAIL(build_server_resources_(server_statuses, server_resources))) { - LOG_WARN("build server resources fail", KR(ret), K(server_statuses)); - } else if (OB_FAIL(do_choose_server_for_unit_(config, zone, excluded_servers, server_statuses, + LOG_WARN("invalid config", KR(ret), K(config), K(zone)); + } 
else if (OB_FAIL(build_server_resources_(active_servers_resource_info, server_resources))) { + LOG_WARN("fail to build server resources", KR(ret), K(active_servers_resource_info)); + } else if (OB_FAIL(do_choose_server_for_unit_(config, zone, excluded_servers, active_servers_info, server_resources, module, choosed_server, resource_not_enough_reason))) { - LOG_WARN("fail to choose server for unit", K(module), KR(ret), K(config), K(zone), K(excluded_servers), - K(server_statuses), K(server_resources), + LOG_WARN("fail to choose server for unit", K(module), KR(ret), K(config), K(zone), + K(excluded_servers), K(servers_info), K(server_resources), "resource_not_enough_reason", resource_not_enough_reason.c_str()); } return ret; @@ -5693,7 +5721,7 @@ int ObUnitManager::choose_server_for_unit(const ObUnitResource &config, int ObUnitManager::do_choose_server_for_unit_(const ObUnitResource &config, const ObZone &zone, const ObArray &excluded_servers, - const ObIArray &statuses, + const ObIArray &servers_info, const ObIArray &server_resources, const char *module, ObAddr &choosed_server, @@ -5710,36 +5738,37 @@ int ObUnitManager::do_choose_server_for_unit_(const ObUnitResource &config, if (OB_UNLIKELY(zone.is_empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("zone is empty, unexpected", KR(ret), K(zone)); - } else if (statuses.count() != server_resources.count()) { + } else if (servers_info.count() != server_resources.count()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server statuses and server resources array", KR(ret), K(statuses), K(server_resources)); + LOG_WARN("invalid servers_info and server_resources array", KR(ret), K(servers_info), K(server_resources)); } else if (OB_FAIL(get_hard_limit(hard_limit))) { LOG_WARN("get_hard_limit failed", K(ret)); } else { int64_t not_excluded_server_count = 0; // 1. 
construct valid servers resource - for (int64_t i = 0; OB_SUCC(ret) && i < statuses.count(); ++i) { // for each active servers + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); ++i) { // for each active servers ObResourceType not_enough_resource = RES_MAX; AlterResourceErr not_enough_resource_config = ALT_ERR; - const ObServerStatus &server_status = statuses.at(i); + const ObServerInfoInTable &server_info = servers_info.at(i); + const ObAddr &server = server_info.get_server(); const ObUnitPlacementStrategy::ObServerResource &server_resource = server_resources.at(i); - if (has_exist_in_array(excluded_servers, server_status.server_)) { + if (has_exist_in_array(excluded_servers, server)) { // excluded servers are expected, need not show in reason continue; } else { not_excluded_server_count++; - if (!server_status.can_migrate_in()) { - if (! server_status.is_active()) { + if (!server_info.can_migrate_in()) { + if (!server_info.is_active()) { resource_not_enough_reason = - resource_not_enough_reason + "server '" + to_cstring(server_status.server_) + "' is not active\n"; + resource_not_enough_reason + "server '" + to_cstring(server) + "' is not active\n"; } else { // server is block-migrate-in resource_not_enough_reason = - resource_not_enough_reason + "server '" + to_cstring(server_status.server_) + "' is blocked migrate-in\n"; + resource_not_enough_reason + "server '" + to_cstring(server) + "' is blocked migrate-in\n"; } - LOG_WARN("[CHOOSE_SERVER_FOR_UNIT] server can not migrate in", K(module), K(i), "server", server_status); + LOG_WARN("[CHOOSE_SERVER_FOR_UNIT] server can not migrate in", K(module), K(i), "server", server_info); continue; } else { bool is_resource_enough = @@ -5759,7 +5788,7 @@ int ObUnitManager::do_choose_server_for_unit_(const ObUnitResource &config, "not_enough_resource_config", alter_resource_err_to_str(not_enough_resource_config), K(server_resource), "request_unit_config", config); resource_not_enough_reason = - 
resource_not_enough_reason + "server '" + to_cstring(server_status.server_) + "' " + resource_not_enough_reason + "server '" + to_cstring(server) + "' " + resource_type_to_str(not_enough_resource) + " resource not enough\n"; } } @@ -5770,7 +5799,7 @@ int ObUnitManager::do_choose_server_for_unit_(const ObUnitResource &config, if (0 == not_excluded_server_count) { ret = OB_ZONE_SERVER_NOT_ENOUGH; LOG_WARN("zone server not enough to hold all units", K(module), KR(ret), K(zone), K(excluded_servers), - K(statuses)); + K(servers_info)); } else if (valid_server_resources.count() <= 0) { ret = OB_ZONE_RESOURCE_NOT_ENOUGH; LOG_WARN("zone resource is not enough to hold a new unit", K(module), KR(ret), K(zone), @@ -5791,27 +5820,29 @@ int ObUnitManager::do_choose_server_for_unit_(const ObUnitResource &config, return ret; } -int ObUnitManager::compute_server_resource_(const ObServerStatus &server_status, +int ObUnitManager::compute_server_resource_( + const obrpc::ObGetServerResourceInfoResult &report_server_resource_info, ObUnitPlacementStrategy::ObServerResource &server_resource) const { int ret = OB_SUCCESS; ObUnitConfig sum_load; ObArray *unit_loads = NULL; - - if (OB_UNLIKELY(!server_status.is_valid())) { + const ObAddr &server = report_server_resource_info.get_server(); + const ObServerResourceInfo &report_resource = report_server_resource_info.get_resource_info(); + if (OB_UNLIKELY(!report_server_resource_info.is_valid())) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid argument", K(server_status), KR(ret)); - } else if (OB_FAIL(get_loads_by_server(server_status.server_, unit_loads))) { + LOG_WARN("invalid argument", KR(ret), K(report_server_resource_info)); + } else if (OB_FAIL(get_loads_by_server(server, unit_loads))) { if (OB_ENTRY_NOT_EXIST != ret) { - LOG_WARN("get_loads_by_server failed", "server", server_status.server_, K(ret)); + LOG_WARN("get_loads_by_server failed", "server", server, KR(ret)); } else { ret = OB_SUCCESS; } - } else if (NULL == unit_loads) { + } 
else if (OB_ISNULL(unit_loads)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit_loads is null", KP(unit_loads), K(ret)); + LOG_WARN("unit_loads is null", KR(ret), KP(unit_loads)); } else if (OB_FAIL(calc_sum_load(unit_loads, sum_load))) { - LOG_WARN("calc_sum_load failed", KP(unit_loads), K(ret)); + LOG_WARN("calc_sum_load failed", KR(ret), KP(unit_loads)); } if (OB_SUCC(ret)) { @@ -5822,28 +5853,26 @@ int ObUnitManager::compute_server_resource_(const ObServerStatus &server_status, // The persistent information of the unit on the observer side is regularly reported to rs by the observer through the heartbeat. // When performing allocation, rs reports the maximum value of resource information from its own resource view // and observer side as a reference for unit resource allocation - - const ObServerResourceInfo &report_resource = server_status.resource_info_; - server_resource.addr_ = server_status.server_; + server_resource.addr_ = server; server_resource.assigned_[RES_CPU] = sum_load.min_cpu() > report_resource.report_cpu_assigned_ ? sum_load.min_cpu() : report_resource.report_cpu_assigned_; server_resource.max_assigned_[RES_CPU] = sum_load.max_cpu() > report_resource.report_cpu_max_assigned_ ? sum_load.max_cpu() : report_resource.report_cpu_max_assigned_; - server_resource.capacity_[RES_CPU] = server_status.resource_info_.cpu_; + server_resource.capacity_[RES_CPU] = report_resource.cpu_; server_resource.assigned_[RES_MEM] = sum_load.memory_size() > report_resource.report_mem_assigned_ ? 
static_cast(sum_load.memory_size()) : static_cast(report_resource.report_mem_assigned_); server_resource.max_assigned_[RES_MEM] = server_resource.assigned_[RES_MEM]; - server_resource.capacity_[RES_MEM] = static_cast(server_status.resource_info_.mem_total_); + server_resource.capacity_[RES_MEM] = static_cast(report_resource.mem_total_); server_resource.assigned_[RES_LOG_DISK] = static_cast(sum_load.log_disk_size()); server_resource.max_assigned_[RES_LOG_DISK] = static_cast(sum_load.log_disk_size()); - server_resource.capacity_[RES_LOG_DISK] = static_cast(server_status.resource_info_.log_disk_total_); + server_resource.capacity_[RES_LOG_DISK] = static_cast(report_resource.log_disk_total_); } LOG_INFO("compute server resource", KR(ret), - "server", server_status.server_, + "server", server, K(server_resource), - "report_resource_info", server_status.resource_info_, + "report_resource_info", report_resource, "valid_unit_sum", sum_load, "valid_unit_count", unit_loads != NULL ? unit_loads->count(): 0); return ret; @@ -5899,7 +5928,7 @@ bool ObUnitManager::check_resource_enough_for_unit_( // demand_resource may have some invalid items, need not check valid for demand_resource -int ObUnitManager::have_enough_resource(const ObServerStatus &server_status, +int ObUnitManager::have_enough_resource(const obrpc::ObGetServerResourceInfoResult &report_server_resource_info, const ObUnitResource &demand_resource, const double hard_limit, bool &is_enough, @@ -5908,32 +5937,211 @@ int ObUnitManager::have_enough_resource(const ObServerStatus &server_status, int ret = OB_SUCCESS; ObResourceType not_enough_resource = RES_MAX; ObUnitPlacementStrategy::ObServerResource server_resource; - err_index = ALT_ERR; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("check_inner_stat failed", K(inited_), K(loaded_), K(ret)); - } else if (!server_status.is_valid() || hard_limit <= 0) { + } else if (!report_server_resource_info.is_valid() || hard_limit <= 0) { ret = OB_ERR_UNEXPECTED; - 
LOG_WARN("invalid argument", K(server_status), K(hard_limit), K(ret)); - } else if (OB_FAIL(compute_server_resource_(server_status, server_resource))) { - LOG_WARN("compute server resource fail", KR(ret), K(server_status)); + LOG_WARN("invalid argument", K(report_server_resource_info), K(hard_limit), K(ret)); + } else if (OB_FAIL(compute_server_resource_(report_server_resource_info, server_resource))) { + LOG_WARN("compute server resource fail", KR(ret), K(report_server_resource_info)); } else { is_enough = check_resource_enough_for_unit_(server_resource, demand_resource, hard_limit, not_enough_resource, err_index); } return ret; } - int ObUnitManager::check_enough_resource_for_delete_server( const ObAddr &server, const ObZone &zone) +{ + int ret = OB_SUCCESS; + // get_servers_of_zone + ObArray report_servers_resource_info; + ObArray servers_info; + ObArray servers_info_of_zone; + bool empty = false; + if (OB_UNLIKELY(!server.is_valid() || zone.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server or zone", KR(ret), K(server), K(zone)); + } else if (OB_FAIL(check_server_empty(server, empty))) { + // the validity of the server is checked here + LOG_WARN("fail to check whether the server is empty", KR(ret)); + } else if (empty) { + //nothing todo + } else if (OB_FAIL(SVR_TRACER.get_active_servers_info(zone, servers_info_of_zone))) { + LOG_WARN("fail to get servers_info_of_zone", KR(ret), K(servers_info), K(zone)); + } else if (OB_FAIL(get_servers_resource_info_via_rpc(servers_info_of_zone, report_servers_resource_info))) { + LOG_WARN("fail to get servers_resouce_info via rpc", KR(ret), K(servers_info_of_zone), K(report_servers_resource_info)); + } else if (OB_FAIL(check_enough_resource_for_delete_server_( + server, + zone, + servers_info_of_zone, + report_servers_resource_info))) { + LOG_WARN("fail to check enough resource for delete server", KR(ret), K(server), K(zone), + K(servers_info_of_zone), K(report_servers_resource_info)); + } else {} + 
return ret; +} +int ObUnitManager::get_servers_resource_info_via_rpc( + const ObIArray &servers_info, + ObIArray &report_servers_resource_info) const +{ + int ret = OB_SUCCESS; + int tmp_ret = OB_SUCCESS; + ObTimeoutCtx ctx; + obrpc::ObGetServerResourceInfoArg arg; + ObArray tmp_report_servers_resource_info; + report_servers_resource_info.reset(); + if (OB_UNLIKELY(servers_info.count() <= 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("servers_info.count() should be greater than zero", KR(ret), K(servers_info.count())); + } else if (!ObHeartbeatService::is_service_enabled()) { // old logic + ObServerResourceInfo resource_info; + obrpc::ObGetServerResourceInfoResult result; + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); i++) { + const ObAddr &server = servers_info.at(i).get_server(); + resource_info.reset(); + result.reset(); + if (OB_FAIL(server_mgr_.get_server_resource_info(server, resource_info))) { + LOG_WARN("fail to get server resource info", KR(ret), K(server)); + } else if (OB_FAIL(result.init(server, resource_info))) { + LOG_WARN("fail to init", KR(ret), K(server)); + } else if (OB_FAIL(report_servers_resource_info.push_back(result))) { + LOG_WARN("fail to push an element into report_servers_resource_info", KR(ret), K(result)); + } + } + } else { // new logic + if (OB_ISNULL(srv_rpc_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("srv_rpc_proxy_ is null", KR(ret), KP(srv_rpc_proxy_)); + } else if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else { + ObGetServerResourceInfoProxy proxy(*srv_rpc_proxy_, &obrpc::ObSrvRpcProxy::get_server_resource_info); + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); i++) { + const ObServerInfoInTable & server_info = servers_info.at(i); + arg.reset(); + if (OB_UNLIKELY(!server_info.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server_info", KR(ret), K(server_info)); + } else { + const ObAddr &server = 
server_info.get_server(); + const int64_t time_out = ctx.get_timeout(); + if (OB_FAIL(arg.init(GCTX.self_addr()))) { + LOG_WARN("fail to init arg", KR(ret), K(GCTX.self_addr())); + } else if (OB_FAIL(proxy.call( + server, + time_out, + GCONF.cluster_id, + OB_SYS_TENANT_ID, + arg))) { + LOG_WARN("fail to send get_server_resource_info rpc", KR(ret), KR(tmp_ret), K(server), + K(time_out), K(arg)); + } + } + } + if (OB_TMP_FAIL(proxy.wait())) { + LOG_WARN("fail to wait all batch result", KR(ret), KR(tmp_ret)); + ret = OB_SUCC(ret) ? tmp_ret : ret; + } + tmp_report_servers_resource_info.reset(); + ARRAY_FOREACH_X(proxy.get_results(), idx, cnt, OB_SUCC(ret)) { + const obrpc::ObGetServerResourceInfoResult *rpc_result = proxy.get_results().at(idx); + if (OB_ISNULL(rpc_result)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("rpc_result is null", KR(ret), KP(rpc_result)); + } else if (OB_UNLIKELY(!rpc_result->is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("rpc_result is invalid", KR(ret), KPC(rpc_result)); + } else if (OB_FAIL(tmp_report_servers_resource_info.push_back(*rpc_result))) { + LOG_WARN("fail to push an element into tmp_report_servers_resource_info", KR(ret), KPC(rpc_result)); + } + } + } + // get ordered report_servers_resource_info: since when processing resource_info, + // we assume servers_info.at(i).get_server() = report_servers_resource_info.at(i).get_server() + if (FAILEDx(order_report_servers_resource_info_( + servers_info, + tmp_report_servers_resource_info, + report_servers_resource_info ))) { + LOG_WARN("fail to order report_servers_resource_info", KR(ret), + K(servers_info.count()), K(tmp_report_servers_resource_info.count()), + K(servers_info), K(tmp_report_servers_resource_info)); + } + } + return ret; +} + +int ObUnitManager::order_report_servers_resource_info_( + const ObIArray &servers_info, + const ObIArray &report_servers_resource_info, + ObIArray &ordered_report_servers_resource_info) +{ + // target: servers_info.at(i).get_server() = 
ordered_report_servers_resource_info.at(i).get_server() + int ret = OB_SUCCESS; + ordered_report_servers_resource_info.reset(); + if (servers_info.count() != report_servers_resource_info.count()) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("the size of servers_info should be equal to the size of report_servers_resource_info", + KR(ret), K(servers_info.count()), K(report_servers_resource_info.count()), + K(servers_info), K(report_servers_resource_info)); + } else { + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); i++) { + bool find_server = false; + for (int64_t j = 0; OB_SUCC(ret) && !find_server && j < report_servers_resource_info.count(); j++) { + const obrpc::ObGetServerResourceInfoResult &server_resource_info = report_servers_resource_info.at(j); + if (servers_info.at(i).get_server() == server_resource_info.get_server()) { + find_server = true; + if (OB_FAIL(ordered_report_servers_resource_info.push_back(server_resource_info))) { + LOG_WARN("fail to push an element into ordered_report_servers_resource_info", + KR(ret), K(server_resource_info)); + } + } + } + if (OB_SUCC(ret) && !find_server) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("server not exists in report_servers_resource_info", + KR(ret), K(servers_info.at(i)), K(report_servers_resource_info)); + } + } + } + return ret; +} +int ObUnitManager::get_server_resource_info_via_rpc( + const share::ObServerInfoInTable &server_info, + obrpc::ObGetServerResourceInfoResult &report_server_resource_info) const +{ + int ret = OB_SUCCESS; + ObArray<share::ObServerInfoInTable> servers_info; + ObArray<obrpc::ObGetServerResourceInfoResult> report_resource_info_array; + report_server_resource_info.reset(); + if (OB_UNLIKELY(!server_info.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("server_info is invalid", KR(ret), K(server_info)); + } else if (OB_FAIL(servers_info.push_back(server_info))) { + LOG_WARN("fail to push an element into servers_info", KR(ret), K(server_info)); + } else if (OB_FAIL(get_servers_resource_info_via_rpc(servers_info, report_resource_info_array))) { + 
LOG_WARN("fail to execute get_servers_resource_info_via_rpc", KR(ret), K(servers_info)); + } else if (OB_UNLIKELY(1 != report_resource_info_array.count())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("report_resource_info_array.count() should be one", KR(ret), K(report_resource_info_array.count())); + } else if (OB_FAIL(report_server_resource_info.assign(report_resource_info_array.at(0)))) { + LOG_WARN("fail to assign report_server_resource_info", KR(ret), K(report_resource_info_array.at(0))); + } + return ret; +} +int ObUnitManager::check_enough_resource_for_delete_server_( + const ObAddr &server, + const ObZone &zone, + const ObIArray<share::ObServerInfoInTable> &servers_info, + const ObIArray<obrpc::ObGetServerResourceInfoResult> &report_servers_resource_info) { int ret = OB_SUCCESS; ObArray<ObUnitLoad> *unit_loads = NULL; ObArray<ObUnitPlacementStrategy::ObServerResource> initial_servers_resources; - ObArray<ObServerStatus> statuses; SpinRLockGuard guard(lock_); bool empty = false; if (OB_UNLIKELY(zone.is_empty())) { @@ -5946,19 +6154,17 @@ int ObUnitManager::check_enough_resource_for_delete_server( } else { if (OB_FAIL(get_loads_by_server(server, unit_loads))) { LOG_WARN("fail to get loads by server", K(ret)); - } else if (OB_FAIL(server_mgr_.get_server_statuses(zone, statuses))) { - LOG_WARN("get_server_statuses failed", K(zone), K(ret)); - } else if (OB_FAIL(build_server_resources_(statuses, initial_servers_resources))) { - LOG_WARN("fail to build server resources", KR(ret), K(statuses)); + } else if (OB_FAIL(build_server_resources_(report_servers_resource_info, initial_servers_resources))) { + LOG_WARN("fail to build server resources", KR(ret), K(report_servers_resource_info)); } else { for (int64_t i = 0; i < unit_loads->count() && OB_SUCC(ret); ++i) { std::string resource_not_enough_reason; if (OB_FAIL(check_server_have_enough_resource_for_delete_server_( - unit_loads->at(i), - zone, - statuses, - initial_servers_resources, - resource_not_enough_reason))) { + unit_loads->at(i), + zone, + servers_info, + initial_servers_resources, + resource_not_enough_reason))) { LOG_WARN("fail to check server 
have enough resource for delete server", K(ret), K(zone), @@ -5994,11 +6200,10 @@ int ObUnitManager::check_enough_resource_for_delete_server( } return ret; } - int ObUnitManager::check_server_have_enough_resource_for_delete_server_( const ObUnitLoad &unit_load, const common::ObZone &zone, - const ObIArray &statuses, + const ObIArray &servers_info, ObIArray &initial_servers_resources, std::string &resource_not_enough_reason) { @@ -6020,9 +6225,9 @@ int ObUnitManager::check_server_have_enough_resource_for_delete_server_( } else if (OB_UNLIKELY(zone.is_empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("zone is empty, unexpected", KR(ret), K(zone)); - } else if (statuses.count() != initial_servers_resources.count()) { + } else if (servers_info.count() != initial_servers_resources.count()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("server statuses does not match initial_servers_resources", KR(ret), K(statuses), + LOG_WARN("servers_info does not match initial_servers_resources", KR(ret), K(servers_info), K(initial_servers_resources)); } else if (OB_ISNULL(unit_load.unit_)) { ret = OB_ERR_UNEXPECTED; @@ -6032,13 +6237,12 @@ int ObUnitManager::check_server_have_enough_resource_for_delete_server_( } else if (OB_FAIL(get_excluded_servers(unit_load.unit_->resource_pool_id_, unit_load.unit_->zone_, module, new_allocate_pool, excluded_servers))) { LOG_WARN("fail to get excluded server", K(module), KR(ret), KPC(unit_load.unit_), K(new_allocate_pool)); - } - // choose right server for target unit - else if (OB_FAIL(do_choose_server_for_unit_(config->unit_resource(), zone, - excluded_servers, statuses, initial_servers_resources, + } else if (OB_FAIL(do_choose_server_for_unit_(config->unit_resource(), zone, + excluded_servers, servers_info, initial_servers_resources, module, choosed_server, resource_not_enough_reason))) { + // choose right server for target unit LOG_WARN("choose server for unit fail", K(module), KR(ret), K(zone), KPC(config), - K(excluded_servers), K(statuses), 
K(initial_servers_resources), + K(excluded_servers), K(servers_info), K(initial_servers_resources), "resource_not_enough_reason", resource_not_enough_reason.c_str()); } else { // sum target unit resource config on choosed server resource @@ -6062,16 +6266,16 @@ int ObUnitManager::check_server_have_enough_resource_for_delete_server_( } int ObUnitManager::build_server_resources_( - const ObIArray &statuses, - ObIArray &servers_resources) const + const ObIArray &report_servers_resource_info, + ObIArray &servers_resources) const { int ret = OB_SUCCESS; - for (int64_t i = 0; OB_SUCC(ret) && i < statuses.count(); ++i) { - const ObServerStatus &server_status = statuses.at(i); + for (int64_t i = 0; OB_SUCC(ret) && i < report_servers_resource_info.count(); ++i) { + const obrpc::ObGetServerResourceInfoResult &report_resource_info = report_servers_resource_info.at(i); ObUnitPlacementStrategy::ObServerResource server_resource; - if (OB_FAIL(compute_server_resource_(server_status, server_resource))) { - LOG_WARN("compute server resource fail", KR(ret), K(server_status)); + if (OB_FAIL(compute_server_resource_(report_resource_info, server_resource))) { + LOG_WARN("compute server resource fail", KR(ret), K(report_resource_info)); } else if (OB_FAIL(servers_resources.push_back(server_resource))) { LOG_WARN("fail to push back", KR(ret), K(server_resource)); } @@ -7986,7 +8190,6 @@ int ObUnitManager::check_expand_resource_( { int ret = OB_SUCCESS; common::hash::ObHashMap server_ref_count_map; - common::ObZone zone; ObString err_str; AlterResourceErr err_index = ALT_ERR; int temp_ret = OB_SUCCESS; @@ -8009,59 +8212,67 @@ int ObUnitManager::check_expand_resource_( bool can_expand = true; const ObUnitResource delta = new_resource - old_resource; ObUnitResource expand_resource; - + ObServerInfoInTable server_info; _LOG_INFO("[%s] check_expand_resource begin. 
old=%s, new=%s, delta=%s", module, to_cstring(old_resource), to_cstring(new_resource), to_cstring(delta)); FOREACH_X(iter, server_ref_count_map, OB_SUCCESS == ret) { expand_resource = delta * (iter->second); - + const ObAddr &server = iter->first; + server_info.reset(); _LOG_INFO("[%s] check_expand_resource. svr=%s, pools=%ld, expand_resource=%s", module, - to_cstring(iter->first), iter->second, to_cstring(expand_resource)); - - if (OB_FAIL(check_expand_resource_(iter->first, expand_resource, can_expand, err_index))) { - LOG_WARN("check expand resource failed", K(ret)); + to_cstring(server), iter->second, to_cstring(expand_resource)); + if (OB_FAIL(SVR_TRACER.get_server_info(server, server_info))) { + LOG_WARN("fail to get server_info", KR(ret), K(server)); + } else if (OB_FAIL(check_expand_resource_(server_info, expand_resource, can_expand, err_index))) { + LOG_WARN("check expand resource failed", KR(ret), K(server_info)); } else if (!can_expand) { - if (OB_FAIL(server_mgr_.get_server_zone(iter->first, zone))) { - LOG_WARN("get_server_zone failed", K(iter->first), K(ret)); - } else { - LOG_USER_ERROR(OB_MACHINE_RESOURCE_NOT_ENOUGH, to_cstring(zone), to_cstring(iter->first), - alter_resource_err_to_str(err_index)); - - // return ERROR - ret = OB_MACHINE_RESOURCE_NOT_ENOUGH; - } + const ObZone &zone = server_info.get_zone(); + LOG_USER_ERROR(OB_MACHINE_RESOURCE_NOT_ENOUGH, to_cstring(zone), to_cstring(server), + alter_resource_err_to_str(err_index)); + // return ERROR + ret = OB_MACHINE_RESOURCE_NOT_ENOUGH; } } } return ret; } -int ObUnitManager::check_expand_resource_(const ObAddr &server, - const ObUnitResource &expand_resource, bool &can_expand, AlterResourceErr &err_index) const +int ObUnitManager::check_expand_resource_( + const share::ObServerInfoInTable &server_info, + const ObUnitResource &expand_resource, + bool &can_expand, + AlterResourceErr &err_index) const { int ret = OB_SUCCESS; - ObServerStatus status; double hard_limit = 0; bool can_hold_unit = 
false; can_expand = true; + obrpc::ObGetServerResourceInfoResult report_server_resource_info; // some item of expand_resource may be negative, so we don't check expand_resource here if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; LOG_WARN("check_inner_stat failed", K_(inited), K_(loaded), K(ret)); - } else if (!server.is_valid()) { + } else if (!server_info.is_valid()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(server), K(ret)); + LOG_WARN("invalid server_info", KR(ret), K(server_info)); } else if (OB_FAIL(get_hard_limit(hard_limit))) { LOG_WARN("get_hard_limit failed", K(ret)); - } else if (OB_FAIL(server_mgr_.get_server_status(server, status))) { - LOG_WARN("get_server_status failed", K(server), K(ret)); - } else if (OB_FAIL(have_enough_resource(status, expand_resource, hard_limit, can_hold_unit, err_index))) { - LOG_WARN("fail to check have enough resource", K(status), K(hard_limit), K(ret)); + } else if (OB_FAIL(get_server_resource_info_via_rpc(server_info, report_server_resource_info))) { + LOG_WARN("get_server_resource_info_via_rpc failed", KR(ret), K(server_info)); + } else if (OB_FAIL(have_enough_resource( + report_server_resource_info, + expand_resource, + hard_limit, + can_hold_unit, + err_index))) { + LOG_WARN("fail to check have enough resource", KR(ret), K(hard_limit), + K(report_server_resource_info), K(expand_resource)); } else if (!can_hold_unit) { can_expand = false; // don't need to set ret - LOG_WARN("find server can't hold expanded resource", K(server), K(status), K(expand_resource)); + LOG_WARN("find server can't hold expanded resource", KR(ret), K(server_info), + K(report_server_resource_info), K(expand_resource)); } else { can_expand = true; } @@ -8594,7 +8805,7 @@ int ObUnitManager::check_tenant_on_server(const uint64_t tenant_id, ObArray servers; if (OB_FAIL(get_pool_ids_of_tenant(tenant_id, pool_ids))) { LOG_WARN("get_pool_ids_of_tenant failed", K(tenant_id), K(ret)); - } else if 
(OB_FAIL(server_mgr_.get_server_zone(server, zone))) { + } else if (OB_FAIL(SVR_TRACER.get_server_zone(server, zone))) { LOG_WARN("get_server_zone failed", K(server), K(ret)); } else { SpinRLockGuard guard(lock_); @@ -8618,7 +8829,9 @@ int ObUnitManager::admin_migrate_unit( int ret = OB_SUCCESS; ObUnitInfo unit_info; ObArray excluded_servers; - ObServerStatus status; + ObArray servers_info; + ObServerInfoInTable dst_server_info; + obrpc::ObGetServerResourceInfoResult report_dst_server_resource_info; ObUnitConfig left_resource; ObZone src_zone; ObZone dst_zone; @@ -8646,19 +8859,25 @@ int ObUnitManager::admin_migrate_unit( ret = OB_OP_NOT_ALLOW; LOG_WARN("migrate a unit which is in deleting status", KR(ret), K(unit_id)); LOG_USER_ERROR(OB_OP_NOT_ALLOW, "migrate a unit which is in deleting status"); - } else if (OB_FAIL(server_mgr_.get_server_zone(unit_info.unit_.server_, src_zone))) { + } else if (OB_FAIL(SVR_TRACER.get_server_zone(unit_info.unit_.server_, src_zone))) { LOG_WARN("get server zone failed", "server", unit_info.unit_.server_, KR(ret)); } else if (dst == unit_info.unit_.migrate_from_server_ || is_cancel) { // cancel migrate unit + bool can_migrate_in = false; if (is_cancel && !unit_info.unit_.migrate_from_server_.is_valid()) { ret = OB_ERR_UNEXPECTED; LOG_WARN("failed to cancel migrate unit, may be no migrate task", KR(ret), K(unit_info)); LOG_USER_ERROR(OB_ERR_UNEXPECTED,"no migrate task to cancel"); - } else if (OB_FAIL(cancel_migrate_unit( - unit_info.unit_, unit_info.pool_.tenant_id_ == OB_GTS_TENANT_ID))) { - LOG_WARN("failed to cancel migrate unit", KR(ret), K(unit_info)); + } else if (OB_FAIL(SVR_TRACER.check_server_can_migrate_in( + unit_info.unit_.migrate_from_server_, + can_migrate_in))) { + LOG_WARN("fail to check server can_migrate_in", KR(ret), K(servers_info), + K(unit_info.unit_.migrate_from_server_)); + } else if (OB_FAIL(cancel_migrate_unit( + unit_info.unit_, can_migrate_in, unit_info.pool_.tenant_id_ == OB_GTS_TENANT_ID))) { + 
LOG_WARN("failed to cancel migrate unit", KR(ret), K(unit_info), K(can_migrate_in)); } - } else if (OB_FAIL(server_mgr_.get_server_zone(dst, dst_zone))) { + } else if (OB_FAIL(SVR_TRACER.get_server_zone(dst, dst_zone))) { LOG_WARN("get server zone failed", "server", dst, KR(ret)); } else if (src_zone != dst_zone) { ret = OB_NOT_SUPPORTED; @@ -8671,13 +8890,21 @@ int ObUnitManager::admin_migrate_unit( ret = OB_NOT_SUPPORTED; LOG_USER_ERROR(OB_NOT_SUPPORTED,"hold two units of a tenant in the same server"); LOG_WARN("hold two units of a tenant in the same server is not supported", KR(ret)); - } else if (OB_FAIL(server_mgr_.get_server_status(dst, status))) { - LOG_WARN("get server status failed", "server", dst, KR(ret)); - } else if (!status.can_migrate_in()) { + } else if (OB_FAIL(SVR_TRACER.get_server_info(dst, dst_server_info))) { + LOG_WARN("get dst_server_info failed", KR(ret), K(dst), K(servers_info)); + } else if (!dst_server_info.can_migrate_in()) { ret = OB_SERVER_MIGRATE_IN_DENIED; - LOG_WARN("server can not migrate in", K(dst), K(status), KR(ret)); - } else if (OB_FAIL(have_enough_resource(status, unit_info.config_.unit_resource(), hard_limit, can_hold_unit, err_index))) { - LOG_WARN("calculate_left_resource failed", "status", status, K(hard_limit), K(err_index), KR(ret)); + LOG_WARN("server can not migrate in", K(dst), K(dst_server_info), KR(ret)); + } else if (OB_FAIL(get_server_resource_info_via_rpc(dst_server_info, report_dst_server_resource_info))) { + LOG_WARN("fail to execute get_server_resource_info_via_rpc", KR(ret), K(dst_server_info)); + } else if (OB_FAIL(have_enough_resource( + report_dst_server_resource_info, + unit_info.config_.unit_resource(), + hard_limit, + can_hold_unit, + err_index))) { + LOG_WARN("calculate_left_resource failed", KR(ret), K(report_dst_server_resource_info), + K(hard_limit), K(err_index)); } else if (!can_hold_unit) { ret = OB_MACHINE_RESOURCE_NOT_ENOUGH; LOG_WARN("left resource can't hold unit", "server", dst, @@ 
-8691,15 +8918,13 @@ int ObUnitManager::admin_migrate_unit( int ObUnitManager::cancel_migrate_unit( const share::ObUnit &unit, + const bool migrate_from_server_can_migrate_in, const bool is_gts_unit) { int ret = OB_SUCCESS; - ObServerStatus status; - if (OB_FAIL(server_mgr_.get_server_status(unit.migrate_from_server_, status))) { - LOG_WARN("get_server_status failed", "server", unit.migrate_from_server_, K(ret)); - } else if (!status.can_migrate_in() && !is_gts_unit) { + if (!migrate_from_server_can_migrate_in && !is_gts_unit) { ret = OB_SERVER_MIGRATE_IN_DENIED; - LOG_WARN("server can not migrate in", "server", unit.migrate_from_server_, K(status), K(ret)); + LOG_WARN("server can not migrate in", K(unit.migrate_from_server_), K(migrate_from_server_can_migrate_in), KR(ret)); } else { const EndMigrateOp op = REVERSE; if (OB_FAIL(end_migrate_unit(unit.unit_id_, op))) { @@ -8714,8 +8939,8 @@ int ObUnitManager::cancel_migrate_unit( int ObUnitManager::try_cancel_migrate_unit(const share::ObUnit &unit, bool &is_canceled) { int ret = OB_SUCCESS; - ObServerStatus status; - bool can_migrate_in = false; + bool migrate_from_server_can_migrate_in = false; + bool server_can_migrate_in = false; is_canceled = false; if (!check_inner_stat()) { ret = OB_INNER_STAT_ERROR; @@ -8723,17 +8948,17 @@ int ObUnitManager::try_cancel_migrate_unit(const share::ObUnit &unit, bool &is_c } else if (!unit.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid unit", K(unit), K(ret)); - } else if (OB_FAIL(check_can_migrate_in(unit.server_, can_migrate_in))) { - LOG_WARN("check_can_migrate_in failed", "server", unit.server_, K(ret)); - } else if (can_migrate_in) { + } else if (OB_FAIL(SVR_TRACER.check_server_can_migrate_in(unit.server_, server_can_migrate_in))) { + LOG_WARN("check_server_can_migrate_in failed", "server", unit.server_, K(ret)); + } else if (server_can_migrate_in) { // ignore, do nothing - } else if (!unit.migrate_from_server_.is_valid()) { - // ignore, do nothing - } else if 
(OB_FAIL(server_mgr_.get_server_status(unit.migrate_from_server_, status))) { + } else if (OB_FAIL(SVR_TRACER.check_server_can_migrate_in( + unit.migrate_from_server_, + migrate_from_server_can_migrate_in))) { LOG_WARN("get_server_status failed", "server", unit.migrate_from_server_, K(ret)); - } else if (status.can_migrate_in()) { + } else if (migrate_from_server_can_migrate_in) { LOG_INFO("unit migrate_from_server can migrate in, " - "migrate unit back to migrate_from_server", K(unit), K(status)); + "migrate unit back to migrate_from_server", K(unit), K(migrate_from_server_can_migrate_in)); const EndMigrateOp op = REVERSE; if (OB_FAIL(end_migrate_unit(unit.unit_id_, op))) { LOG_WARN("end_migrate_unit failed", "unit_id", unit.unit_id_, K(op), K(ret)); @@ -8745,97 +8970,6 @@ int ObUnitManager::try_cancel_migrate_unit(const share::ObUnit &unit, bool &is_c return ret; } -int ObUnitManager::get_server_loads_internal(const ObZone &zone, - const bool only_active, - ObArray &server_loads, - double &sum_load, - int64_t &alive_server_count, - double *weights, int64_t weights_count) -{ - int ret = OB_SUCCESS; - ObServerManager::ObServerStatusArray server_statuses; - // zone can be empty, don't check it - if (!check_inner_stat()) { - ret = OB_INNER_STAT_ERROR; - LOG_WARN("check inner stat failed", K_(inited), K_(loaded), K(ret)); - } else if (OB_FAIL(server_mgr_.get_server_statuses(zone, server_statuses))) { - LOG_WARN("get_servers_of_zone failed", K(zone), K(ret)); - } else { - alive_server_count = 0; - sum_load = 0; - } - - ObServerLoad server_load; - for (int64_t i = 0; OB_SUCC(ret) && i < server_statuses.count(); ++i) { - ObArray *unit_loads = NULL; - ObServerStatus &status = server_statuses.at(i); - server_load.reset(); - if (only_active && !status.is_active()) { - // filter not active server - } else { - if (status.is_active()) { - ++alive_server_count; - } - if (OB_FAIL(get_loads_by_server(status.server_, unit_loads))) { - if (OB_ENTRY_NOT_EXIST != ret) { - 
LOG_WARN("get_loads_by_server failed", "server", status.server_, K(ret)); - } else { - ret = OB_SUCCESS; - LOG_DEBUG("server is empty, no unit on it", "server", status.server_); - } - } else if (NULL == unit_loads) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("unit_loads is null", KP(unit_loads), K(ret)); - } - - // unit_loads is null if no unit on it - if (OB_FAIL(ret)) { - } else if (OB_FAIL(server_load.build(unit_loads, status))) { - LOG_WARN("server_load build failed", K(status), K(ret)); - } else if (OB_FAIL(server_loads.push_back(server_load))) { - LOG_WARN("push_back failed", K(ret)); - } else {} - } - } // end for - if (OB_SUCC(ret) && server_loads.count() > 0) { - if (OB_FAIL(ObResourceUtils::calc_server_resource_weight(server_loads, weights, weights_count))) { - LOG_WARN("failed to calc resource weight", K(ret)); - } else { - double load = 0; - ARRAY_FOREACH(server_loads, i) { - ObServerLoad &server_load = server_loads.at(i); - if (OB_FAIL(server_load.get_load(weights, weights_count, load))) { - LOG_WARN("get_load_percentage failed", K(ret)); - } else { - sum_load += load; - } - } // end for - } - } - return ret; -} - -int ObUnitManager::get_server_loads(const ObZone &zone, - ObArray &server_loads, - double *weights, int64_t weights_count) -{ - int ret = OB_SUCCESS; - double sum_load = 0; - int64_t alive_server_count = 0; - const bool only_active = false; - SpinRLockGuard guard(lock_); - // zone can be empty, don't check it - if (!check_inner_stat()) { - ret = OB_INNER_STAT_ERROR; - LOG_WARN("check inner stat failed", K_(inited), K_(loaded), K(ret)); - } else if (OB_FAIL(get_server_loads_internal(zone, only_active, - server_loads, sum_load, alive_server_count, weights, weights_count))) { - LOG_WARN("fail to get server loads internal", K(zone), K(only_active), K(ret)); - } - - return ret; -} - int ObUnitManager::get_hard_limit(double &hard_limit) const { int ret = OB_SUCCESS; @@ -8907,39 +9041,18 @@ int ObUnitManager::check_has_intersect_pg(const share::ObUnit 
&a, return OB_SUCCESS; } -int ObUnitManager::check_can_migrate_in(const ObAddr &server, bool &can_migrate_in) const -{ - int ret = OB_SUCCESS; - ObServerStatus status; - can_migrate_in = false; - if (!check_inner_stat()) { - ret = OB_INNER_STAT_ERROR; - LOG_WARN("check inner stat failed", K_(inited), K_(loaded), K(ret)); - } else if (!server.is_valid()) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(server), K(ret)); - } else if (OB_FAIL(server_mgr_.get_server_status(server, status))) { - LOG_WARN("get_server_status failed", K(server), K(ret)); - } else { - can_migrate_in = status.can_migrate_in(); - } - return ret; -} - int ObUnitManager::try_migrate_unit(const uint64_t unit_id, const uint64_t tenant_id, const ObUnitStat &unit_stat, const ObIArray &migrating_unit_stat, const ObAddr &dst, + const ObServerResourceInfo &dst_resource_info, const bool is_manual) { int ret = OB_SUCCESS; - ObServerStatus server_status; if (unit_id != unit_stat.unit_id_) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid unit stat", K(unit_id), K(unit_stat), K(ret)); - } else if (OB_FAIL(server_mgr_.get_server_status(dst, server_status))) { - LOG_WARN("fail get server status", K(dst), K(ret)); } else { int64_t mig_partition_cnt = 0; int64_t mig_required_size = 0; @@ -8952,8 +9065,8 @@ int ObUnitManager::try_migrate_unit(const uint64_t unit_id, // mig_partition_cnt + unit_stat.partition_cnt_ + server_status.resource_info_.partition_cnt_; // sstable Space constraints int64_t required_size = - mig_required_size + unit_stat.required_size_ + server_status.resource_info_.disk_in_use_; - int64_t total_size = server_status.resource_info_.disk_total_; + mig_required_size + unit_stat.required_size_ + dst_resource_info.disk_in_use_; + int64_t total_size = dst_resource_info.disk_total_; int64_t required_percent = (100 * required_size) / total_size; int64_t limit_percent = GCONF.data_disk_usage_limit_percentage; // 4.0 does not restrict OB_MAX_PARTITION_NUM_PER_SERVER @@ -9028,8 +9141,8 @@ 
int ObUnitManager::migrate_unit(const uint64_t unit_id, const ObAddr &dst, const ret = OB_OP_NOT_ALLOW; LOG_WARN("migrate not grant unit not valid", K(ret)); LOG_USER_ERROR(OB_OP_NOT_ALLOW, "migrate unit which has not been granted"); - } else if (OB_FAIL(server_mgr_.get_server_zone(dst, zone))) { - LOG_WARN("server_mgr_ get_server_zone failed", K(dst), K(ret)); + } else if (OB_FAIL(SVR_TRACER.get_server_zone(dst, zone))) { + LOG_WARN("get_server_zone failed", KR(ret), K(dst)); } else if (unit->server_ == dst) { ret = OB_INVALID_ARGUMENT; LOG_WARN("unit->server same as migrate destination server", @@ -9182,7 +9295,7 @@ int ObUnitManager::inner_try_delete_migrate_unit_resource( LOG_WARN("unit ptr is null", K(ret), KP(unit)); } else if (!migrate_from_server.is_valid()) { LOG_INFO("unit not in migrating, no need to delete src resource", K(unit_id)); - } else if (OB_FAIL(server_mgr_.check_server_alive(migrate_from_server, is_alive))) { + } else if (OB_FAIL(SVR_TRACER.check_server_alive(migrate_from_server, is_alive))) { LOG_WARN("fail to check server alive", K(ret), "server", migrate_from_server); } else if (!is_alive) { LOG_INFO("src server not alive, ignore notify", diff --git a/src/rootserver/ob_unit_manager.h b/src/rootserver/ob_unit_manager.h index bb55b1533..c32df503a 100644 --- a/src/rootserver/ob_unit_manager.h +++ b/src/rootserver/ob_unit_manager.h @@ -414,10 +414,6 @@ public: const common::ObIArray &schema_zone_list, const common::ObIArray &zone_locality, bool &is_legal); - // get all server loads - int get_server_loads(const common::ObZone &zone, - common::ObArray &server_loads, - double *weights, int64_t weights_count); static int calc_sum_load(const common::ObArray *unit_loads, share::ObUnitConfig &sum_load); // get hard limit @@ -528,25 +524,32 @@ protected: const share::ObUnitStat &unit_stat, const common::ObIArray &migrating_unit_stat, const common::ObAddr &dst, + const share::ObServerResourceInfo &dst_resource_info, const bool is_manual = false); int 
get_zone_units(const common::ObArray &pools, common::ObArray &zone_units) const; virtual int end_migrate_unit(const uint64_t unit_id, const EndMigrateOp end_migrate_op = COMMIT); - int get_excluded_servers(const share::ObUnit &unit, - const share::ObUnitStat &unit_stat, - const char *module, - common::ObIArray &servers) const; + int get_excluded_servers( + const share::ObUnit &unit, + const share::ObUnitStat &unit_stat, + const char *module, + const ObIArray &servers_info, // servers info in unit.zone_ + const ObIArray &report_servers_resource_info, // active servers' resource info in unit.zone_ + common::ObIArray &servers) const; int get_excluded_servers(const uint64_t resource_pool_id, const common::ObZone &zone, const char *module, const bool new_allocate_pool, common::ObIArray &excluded_servers) const; - int choose_server_for_unit(const share::ObUnitResource &config, - const common::ObZone &zone, - const common::ObArray &excluded_servers, - const char *module, - common::ObAddr &server, - std::string &resource_not_enough_reason) const; + int choose_server_for_unit( + const share::ObUnitResource &config, + const common::ObZone &zone, + const common::ObArray &excluded_servers, + const char *module, + const ObIArray &active_servers_info, // active_servers_info of the give zone, + const ObIArray &active_servers_resource_info, // active_servers_resource_info of the give zone + common::ObAddr &server, + std::string &resource_not_enough_reason) const; int inner_choose_server_for_unit(const share::ObUnitConfig &config, const common::ObZone &zone, const common::ObArray &excluded_servers, @@ -569,14 +572,7 @@ protected: const uint64_t tenant_id, const int64_t unit_group_num, common::ObIArray &new_unit_group_id_array); - int get_server_loads_internal(const common::ObZone &zone, - const bool only_active, - common::ObArray &server_loads, - double &sum_load, - int64_t &alive_server_count, - double *weights, int64_t weights_count); int check_unit_group_normal(const 
share::ObUnit &unit, bool &normal); - int check_can_migrate_in(const common::ObAddr &server, bool &can_migrate_in) const; int get_migrate_units_by_server(const ObAddr &server, common::ObIArray &migrate_units) const; int try_cancel_migrate_unit(const share::ObUnit &unit, bool &is_canceled); @@ -590,7 +586,7 @@ protected: int check_has_intersect_pg(const share::ObUnit &a, const share::ObUnit &b, bool &intersect); - int have_enough_resource(const share::ObServerStatus &server_status, + int have_enough_resource(const obrpc::ObGetServerResourceInfoResult &report_server_resource_info, const share::ObUnitResource &unit_resource, const double limit, bool &is_enough, @@ -1047,7 +1043,10 @@ protected: common::ObPooledAllocator > &allocator, const uint64_t id, share::ObResourcePool *resource_pool); - int cancel_migrate_unit(const share::ObUnit &unit, const bool is_gts_unit); + int cancel_migrate_unit( + const share::ObUnit &unit, + const bool migrate_from_server_can_migrate_in, + const bool is_gts_unit); int check_split_pool_name_condition( const common::ObIArray &split_pool_name_list); int check_split_pool_zone_condition( @@ -1133,6 +1132,12 @@ protected: const uint64_t tenant_id, const bool is_active, common::ObIArray &unit_group_id_array); + int get_servers_resource_info_via_rpc( + const ObIArray &servers_info, + ObIArray &report_server_resource_info) const; + int get_server_resource_info_via_rpc( + const share::ObServerInfoInTable &server_inzfo, + obrpc::ObGetServerResourceInfoResult &report_servers_resource_info) const ; private: int check_shrink_resource_(const common::ObIArray &pools, @@ -1146,7 +1151,8 @@ private: const common::ObIArray &pools, const share::ObUnitResource &old_resource, const share::ObUnitResource &new_resource) const; - int check_expand_resource_(const common::ObAddr &server, + int check_expand_resource_( + const share::ObServerInfoInTable &server_info, const share::ObUnitResource &expand_resource, bool &can_expand, AlterResourceErr &err_index) 
const; @@ -1170,21 +1176,35 @@ private: int expand_pool_unit_num_( share::ObResourcePool *pool, const int64_t unit_num); + int check_enough_resource_for_delete_server_( + const ObAddr &server, + const ObZone &zone, + const ObIArray &servers_info, + const ObIArray &report_servers_resource_info); + int get_servers_resource_info_via_rpc_( + const ObIArray &servers_info, + ObIArray &report_servers_resource_info); + static int order_report_servers_resource_info_( + const ObIArray &servers_info, + const ObIArray &report_servers_resource_info, + ObIArray &ordered_report_servers_resource_info); + int check_server_have_enough_resource_for_delete_server_( const ObUnitLoad &unit_load, const common::ObZone &zone, - const ObIArray &statuses, + const ObIArray &servers_info, ObIArray &initial_servers_resource, std::string &resource_not_enough_reason); - int compute_server_resource_(const share::ObServerStatus &server_status, - ObUnitPlacementStrategy::ObServerResource &server_resource) const; + int compute_server_resource_( + const obrpc::ObGetServerResourceInfoResult &report_server_resource_info, + ObUnitPlacementStrategy::ObServerResource &server_resource) const; int build_server_resources_( - const ObIArray &statuses, + const ObIArray &report_servers_resource_info, ObIArray &initial_server_resource) const; int do_choose_server_for_unit_(const share::ObUnitResource &config, const ObZone &zone, const ObArray &excluded_servers, - const ObIArray &statuses, + const ObIArray &servers_info, const ObIArray &server_resources, const char *module, ObAddr &server, diff --git a/src/rootserver/ob_update_rs_list_task.cpp b/src/rootserver/ob_update_rs_list_task.cpp index 51e4906c6..0586175ea 100644 --- a/src/rootserver/ob_update_rs_list_task.cpp +++ b/src/rootserver/ob_update_rs_list_task.cpp @@ -19,7 +19,7 @@ #include "share/ls/ob_ls_table_operator.h" #include "share/ob_root_addr_agent.h" #include "share/ob_debug_sync.h" -#include "rootserver/ob_server_manager.h" +#include 
"share/ob_all_server_tracer.h" #include "rootserver/ob_root_utils.h" #include "rootserver/ob_root_service.h" #include "observer/ob_server_struct.h" @@ -33,7 +33,7 @@ using namespace share; ObUpdateRsListTask::ObUpdateRsListTask() : inited_(false), lst_operator_(NULL), - root_addr_agent_(NULL), server_mgr_(NULL), zone_mgr_(NULL), + root_addr_agent_(NULL), zone_mgr_(NULL), lock_(NULL), force_update_(false), self_addr_() { } @@ -71,7 +71,6 @@ void ObUpdateRsListTask::clear_lock() int ObUpdateRsListTask::init(ObLSTableOperator &lst_operator, ObRootAddrAgent *agent, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, SpinRWLock &lock, const bool force_update, @@ -90,7 +89,6 @@ int ObUpdateRsListTask::init(ObLSTableOperator &lst_operator, } else { lst_operator_ = &lst_operator; root_addr_agent_ = agent; - server_mgr_ = &server_mgr; zone_mgr_ = &zone_mgr; lock_ = &lock; force_update_ = force_update; @@ -115,11 +113,10 @@ int ObUpdateRsListTask::process_without_lock() if (!inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); - } else if (OB_ISNULL(lst_operator_) - || OB_ISNULL(server_mgr_)) { + } else if (OB_ISNULL(lst_operator_)) { ret = OB_ERR_UNEXPECTED; - LOG_WARN("is null", KP(lst_operator_), KP(server_mgr_)); - } else if (OB_FAIL(get_rs_list(*lst_operator_, *server_mgr_, self_addr_, + LOG_WARN("lst_operator_ is null", KP(lst_operator_)); + } else if (OB_FAIL(get_rs_list(*lst_operator_, self_addr_, new_rs_list, new_readonly_rs_list, rs_list_diff_member_list))) { LOG_WARN("get_rs_list failed", K(ret)); } else if (common::INVALID_CLUSTER_ROLE == cluster_role) { @@ -208,10 +205,10 @@ ObAsyncTask *ObUpdateRsListTask::deep_copy(char *buf, const int64_t buf_size) co } else { task = new(buf) ObUpdateRsListTask(); if (OB_FAIL(static_cast(task)->init( - *lst_operator_, root_addr_agent_, *server_mgr_, + *lst_operator_, root_addr_agent_, *zone_mgr_, *lock_, force_update_, self_addr_))) { LOG_WARN("init task failed", KP(lst_operator_), KP(root_addr_agent_), - 
KP(server_mgr_), KP(zone_mgr_), KP(lock_), K(ret)); + KP(zone_mgr_), KP(lock_), K(ret)); } if (OB_FAIL(ret)) { @@ -224,7 +221,6 @@ ObAsyncTask *ObUpdateRsListTask::deep_copy(char *buf, const int64_t buf_size) co int ObUpdateRsListTask::get_rs_list( ObLSTableOperator &lst, - ObServerManager &server_mgr, const ObAddr &self_addr, share::ObIAddrList &rs_list, share::ObIAddrList &readonly_rs_list, @@ -241,8 +237,8 @@ int ObUpdateRsListTask::get_rs_list( ObRootAddr rs; FOREACH_CNT_X(replica, ls_info.get_replicas(), OB_SUCCESS == ret) { bool is_server_alive = false; - if (server_mgr.has_build()) { - if (OB_FAIL(server_mgr.check_server_alive(replica->get_server(), is_server_alive))) { + if (SVR_TRACER.has_build()) { + if (OB_FAIL(SVR_TRACER.check_server_alive(replica->get_server(), is_server_alive))) { LOG_WARN("check_server_alive failed", "server", replica->get_server(), KR(ret)); } } else { diff --git a/src/rootserver/ob_update_rs_list_task.h b/src/rootserver/ob_update_rs_list_task.h index 28a06085a..db0f2d335 100644 --- a/src/rootserver/ob_update_rs_list_task.h +++ b/src/rootserver/ob_update_rs_list_task.h @@ -29,7 +29,6 @@ class ObLSTableOperator; namespace rootserver { class ObRootService; -class ObServerManager; class ObZoneManager; class ObUpdateRsListTask : public share::ObAsyncTask { @@ -39,7 +38,6 @@ public: int init(share::ObLSTableOperator &lst_operator, share::ObRootAddrAgent *addr_agent_, - ObServerManager &server_mgr, ObZoneManager &zone_mgr, common::SpinRWLock &lock, const bool force_update, @@ -49,7 +47,6 @@ public: int64_t get_deep_copy_size() const; share::ObAsyncTask *deep_copy(char *buf, const int64_t buf_size) const; static int get_rs_list(share::ObLSTableOperator &lst, - ObServerManager &server_mgr, const common::ObAddr &self_addr, share::ObIAddrList &rs_list, share::ObIAddrList &readonly_rs_list, @@ -81,7 +78,6 @@ private: bool inited_; share::ObLSTableOperator *lst_operator_; share::ObRootAddrAgent *root_addr_agent_; - ObServerManager *server_mgr_; 
ObZoneManager *zone_mgr_; common::SpinRWLock *lock_; bool force_update_; diff --git a/src/rootserver/ob_vtable_location_getter.cpp b/src/rootserver/ob_vtable_location_getter.cpp index 442d87269..2638a9051 100644 --- a/src/rootserver/ob_vtable_location_getter.cpp +++ b/src/rootserver/ob_vtable_location_getter.cpp @@ -16,8 +16,9 @@ #include "lib/container/ob_array_serialization.h" #include "lib/container/ob_array_iterator.h" -#include "rootserver/ob_server_manager.h" #include "rootserver/ob_unit_manager.h" +#include "share/ob_all_server_tracer.h" +#include "rootserver/ob_root_utils.h" namespace oceanbase { @@ -25,10 +26,7 @@ using namespace common; using namespace share; namespace rootserver { -ObVTableLocationGetter::ObVTableLocationGetter(ObServerManager &server_mgr, - ObUnitManager &unit_mgr) - : server_mgr_(server_mgr), - unit_mgr_(unit_mgr) +ObVTableLocationGetter::ObVTableLocationGetter(ObUnitManager &unit_mgr) : unit_mgr_(unit_mgr) { } @@ -37,6 +35,8 @@ ObVTableLocationGetter::~ObVTableLocationGetter() { } +// **FIXME (linqiucen.lqc): in the future, we can remove unit_mgr_, +// ** then this func can be executed locally on observers int ObVTableLocationGetter::get(const ObVtableLocationType &vtable_type, ObSArray &servers) { @@ -64,7 +64,7 @@ int ObVTableLocationGetter::get(const ObVtableLocationType &vtable_type, } if (OB_SUCC(ret) && OB_UNLIKELY(servers.count() <= 0)) { ret = OB_LOCATION_NOT_EXIST; - LOG_WARN("servers from server_mgr_ are empty", KR(ret), K(vtable_type), K(servers)); + LOG_WARN("servers are empty", KR(ret), K(vtable_type), K(servers)); } return ret; } @@ -75,11 +75,20 @@ int ObVTableLocationGetter::get_only_rs_vtable_location_( { int ret = OB_SUCCESS; servers.reuse(); + ObAddr rs_addr; if (OB_UNLIKELY(!vtable_type.is_only_rs())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("vtable_type is invalid", K(vtable_type), KR(ret)); - } else if (OB_FAIL(servers.push_back(server_mgr_.get_rs_addr()))) { - LOG_WARN("push_back failed", KR(ret)); + } else if 
(OB_ISNULL(GCTX.rs_mgr_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.rs_mgr_ is null", KP(GCTX.rs_mgr_)); + } else if (OB_FAIL(GCTX.rs_mgr_->get_master_root_server(rs_addr))) { + LOG_WARN("fail to get master root server", KR(ret), KP(GCTX.rs_mgr_)); + } else if (OB_UNLIKELY(!rs_addr.is_valid() || rs_addr != GCTX.self_addr())) { + ret = OB_ENTRY_NOT_EXIST; + LOG_WARN("rs_addr is invalid or not equal to self_addr", KR(ret), K(rs_addr), K(GCTX.self_addr())); + } else if (OB_FAIL(servers.push_back(rs_addr))) { + LOG_WARN("push_back failed", KR(ret), K(rs_addr)); } return ret; } @@ -94,12 +103,8 @@ int ObVTableLocationGetter::get_global_vtable_location_( if (OB_UNLIKELY(!(vtable_type.is_cluster_distributed()))) { ret = OB_INVALID_ARGUMENT; LOG_WARN("vtable_type is invalid", K(vtable_type), KR(ret)); - } else if (!server_mgr_.has_build()) { - ret = OB_SERVER_IS_INIT; - LOG_WARN("server manager hasn't built", - "server_mgr built", server_mgr_.has_build(), KR(ret)); - } else if (OB_FAIL(server_mgr_.get_alive_servers(zone, servers))) { - LOG_WARN("get_alive_servers failed", KR(ret)); + } else if (OB_FAIL(SVR_TRACER.get_alive_servers(zone, servers))) { + LOG_WARN("get_alive_servers failed", KR(ret), KP(GCTX.sql_proxy_)); } return ret; } @@ -112,16 +117,15 @@ int ObVTableLocationGetter::get_tenant_vtable_location_( servers.reuse(); ObArray unit_servers; ObArray pool_ids; + bool unit_mgr_check = unit_mgr_.check_inner_stat(); if (OB_UNLIKELY(!vtable_type.is_valid() || !vtable_type.is_tenant_distributed() || is_sys_tenant(vtable_type.get_tenant_id()))) { // sys_tenant should get cluster location ret = OB_INVALID_ARGUMENT; LOG_WARN("vtable_type is invalid", KR(ret), K(vtable_type)); - } else if (!server_mgr_.has_build() || !unit_mgr_.check_inner_stat()) { + } else if (OB_UNLIKELY(!unit_mgr_check)) { ret = OB_SERVER_IS_INIT; - LOG_WARN("server manager or unit manager hasn't built", - "server_mgr built", server_mgr_.has_build(), - "unit_mgr built", unit_mgr_.check_inner_stat(), 
KR(ret)); + LOG_WARN("unit manager hasn't built", "unit_mgr built", unit_mgr_check, KR(ret)); } else if (OB_FAIL(unit_mgr_.get_pool_ids_of_tenant(vtable_type.get_tenant_id(), pool_ids))) { LOG_WARN("get_pool_ids_of_tenant failed", KR(ret), K(vtable_type)); } else { @@ -134,8 +138,8 @@ int ObVTableLocationGetter::get_tenant_vtable_location_( for (int64_t j = 0; OB_SUCC(ret) && j < unit_infos.count(); ++j) { bool is_alive = false; const ObUnit &unit = unit_infos.at(j).unit_; - if (OB_FAIL(server_mgr_.check_server_alive(unit.server_, is_alive))) { - LOG_WARN("check_server_alive failed", "server", unit.server_, KR(ret)); + if (OB_FAIL(SVR_TRACER.check_server_alive(unit.server_, is_alive))) { + LOG_WARN("check_server_alive failed", KR(ret), K(unit.server_)); } else if (is_alive) { if (OB_FAIL(unit_servers.push_back(unit.server_))) { LOG_WARN("push_back failed", KR(ret)); @@ -144,10 +148,8 @@ int ObVTableLocationGetter::get_tenant_vtable_location_( if (OB_SUCC(ret)) { if (unit.migrate_from_server_.is_valid()) { - if (OB_FAIL(server_mgr_.check_server_alive( - unit.migrate_from_server_, is_alive))) { - LOG_WARN("check_server_alive failed", "server", - unit.migrate_from_server_, KR(ret)); + if (OB_FAIL(SVR_TRACER.check_server_alive(unit.migrate_from_server_, is_alive))) { + LOG_WARN("check_server_alive failed", KR(ret), K(unit.migrate_from_server_)); } else if (is_alive) { if (OB_FAIL(unit_servers.push_back(unit.migrate_from_server_))) { LOG_WARN("push_back failed", KR(ret)); diff --git a/src/rootserver/ob_vtable_location_getter.h b/src/rootserver/ob_vtable_location_getter.h index e5173d218..d860a07ea 100644 --- a/src/rootserver/ob_vtable_location_getter.h +++ b/src/rootserver/ob_vtable_location_getter.h @@ -24,14 +24,12 @@ namespace oceanbase { namespace rootserver { -class ObServerManager; class ObUnitManager; class ObVTableLocationGetter { public: - ObVTableLocationGetter(ObServerManager &server_mgr, - ObUnitManager &unit_mgr); + ObVTableLocationGetter(ObUnitManager 
&unit_mgr); virtual ~ObVTableLocationGetter(); int get(const share::ObVtableLocationType &vtable_type, common::ObSArray &servers); @@ -44,7 +42,6 @@ private: int get_tenant_vtable_location_(const share::ObVtableLocationType &vtable_type, common::ObSArray &servers); - ObServerManager &server_mgr_; ObUnitManager &unit_mgr_; private: DISALLOW_COPY_AND_ASSIGN(ObVTableLocationGetter); diff --git a/src/rootserver/virtual_table/ob_all_virtual_ls_replica_task_plan.cpp b/src/rootserver/virtual_table/ob_all_virtual_ls_replica_task_plan.cpp index 48ccbc4fb..e4e3f0883 100644 --- a/src/rootserver/virtual_table/ob_all_virtual_ls_replica_task_plan.cpp +++ b/src/rootserver/virtual_table/ob_all_virtual_ls_replica_task_plan.cpp @@ -13,6 +13,7 @@ #define USING_LOG_PREFIX RS #include "ob_all_virtual_ls_replica_task_plan.h" #include "rootserver/ob_disaster_recovery_worker.h" +#include "share/ob_all_server_tracer.h" namespace oceanbase { diff --git a/src/share/CMakeLists.txt b/src/share/CMakeLists.txt index 333d0e355..5c450cc64 100644 --- a/src/share/CMakeLists.txt +++ b/src/share/CMakeLists.txt @@ -117,6 +117,7 @@ ob_set_subtarget(ob_share common ob_label_security_os.cpp ob_leader_election_waiter.cpp ob_lease_struct.cpp + ob_heartbeat_struct.cpp ob_list_parser.cpp ob_local_device.cpp ob_locality_info.cpp diff --git a/src/share/backup/ob_backup_connectivity.h b/src/share/backup/ob_backup_connectivity.h index e003a91a4..d7a2e63ce 100644 --- a/src/share/backup/ob_backup_connectivity.h +++ b/src/share/backup/ob_backup_connectivity.h @@ -15,7 +15,6 @@ #include "ob_backup_struct.h" #include "lib/mysqlclient/ob_mysql_proxy.h" -#include "rootserver/ob_server_manager.h" #include "share/backup/ob_backup_store.h" namespace oceanbase { diff --git a/src/share/config/ob_server_config.cpp b/src/share/config/ob_server_config.cpp index a1af03692..e3f560a27 100644 --- a/src/share/config/ob_server_config.cpp +++ b/src/share/config/ob_server_config.cpp @@ -141,6 +141,9 @@ int 
ObServerConfig::strict_check_special() const if (!cluster_id.check()) { ret = OB_INVALID_CONFIG; SHARE_LOG(WARN, "invalid cluster id", K(ret), K(cluster_id.str())); + } else if (strlen(zone.str()) <= 0) { + ret = OB_INVALID_CONFIG; + SHARE_LOG(WARN, "config zone cannot be empty", KR(ret), K(zone.str())); } } return ret; diff --git a/src/share/inner_table/ob_inner_table_schema.101_150.cpp b/src/share/inner_table/ob_inner_table_schema.101_150.cpp index 3e822d23a..d791283ea 100644 --- a/src/share/inner_table/ob_inner_table_schema.101_150.cpp +++ b/src/share/inner_table/ob_inner_table_schema.101_150.cpp @@ -7369,6 +7369,25 @@ int ObInnerTableSchema::all_server_schema(ObTableSchema &table_schema) with_partition_default, with_partition_default); //default_value } + + if (OB_SUCC(ret)) { + ObObj last_offline_time_default; + last_offline_time_default.set_int(0); + ADD_COLUMN_SCHEMA_T("last_offline_time", //column_name + ++column_id, //column_id + 0, //rowkey_id + 0, //index_id + 0, //part_key_pos + ObIntType, //column_type + CS_TYPE_INVALID, //column_collation_type + sizeof(int64_t), //column_length + -1, //column_precision + -1, //column_scale + false, //is_nullable + false, //is_autoincrement + last_offline_time_default, + last_offline_time_default); //default_value + } table_schema.set_index_using_type(USING_BTREE); table_schema.set_row_store_type(ENCODING_ROW_STORE); table_schema.set_store_format(OB_STORE_FORMAT_DYNAMIC_MYSQL); diff --git a/src/share/inner_table/ob_inner_table_schema.15001_15050.cpp b/src/share/inner_table/ob_inner_table_schema.15001_15050.cpp index b7755554a..07f583ccd 100644 --- a/src/share/inner_table/ob_inner_table_schema.15001_15050.cpp +++ b/src/share/inner_table/ob_inner_table_schema.15001_15050.cpp @@ -6232,6 +6232,21 @@ int ObInnerTableSchema::all_virtual_server_agent_schema(ObTableSchema &table_sch false); //is_autoincrement } + if (OB_SUCC(ret)) { + ADD_COLUMN_SCHEMA("LAST_OFFLINE_TIME", //column_name + ++column_id, //column_id + 0, 
//rowkey_id + 0, //index_id + 0, //part_key_pos + ObNumberType, //column_type + CS_TYPE_INVALID, //column_collation_type + 38, //column_length + 38, //column_precision + 0, //column_scale + false, //is_nullable + false); //is_autoincrement + } + if (OB_SUCC(ret)) { ADD_COLUMN_SCHEMA("GMT_CREATE", //column_name ++column_id, //column_id diff --git a/src/share/inner_table/ob_inner_table_schema.21151_21200.cpp b/src/share/inner_table/ob_inner_table_schema.21151_21200.cpp index 160a45577..5c2d5bc93 100644 --- a/src/share/inner_table/ob_inner_table_schema.21151_21200.cpp +++ b/src/share/inner_table/ob_inner_table_schema.21151_21200.cpp @@ -610,7 +610,7 @@ int ObInnerTableSchema::dba_ob_servers_schema(ObTableSchema &table_schema) table_schema.set_collation_type(ObCharset::get_default_collation(ObCharset::get_default_charset())); if (OB_SUCC(ret)) { - if (OB_FAIL(table_schema.set_view_definition(R"__( SELECT SVR_IP, SVR_PORT, ID, ZONE, inner_port AS SQL_PORT, CASE with_rootserver WHEN 1 THEN 'YES' ELSE 'NO' END AS WITH_ROOTSERVER, STATUS, CASE start_service_time WHEN 0 THEN NULL ELSE usec_to_time(start_service_time) END AS START_SERVICE_TIME, CASE stop_time WHEN 0 THEN NULL ELSE usec_to_time(stop_time) END AS STOP_TIME, CASE block_migrate_in_time WHEN 0 THEN NULL ELSE usec_to_time(block_migrate_in_time) END AS BLOCK_MIGRATE_IN_TIME, gmt_create AS CREATE_TIME, gmt_modified AS MODIFY_TIME, BUILD_VERSION FROM oceanbase.__all_server )__"))) { + if (OB_FAIL(table_schema.set_view_definition(R"__( SELECT SVR_IP, SVR_PORT, ID, ZONE, inner_port AS SQL_PORT, CASE with_rootserver WHEN 1 THEN 'YES' ELSE 'NO' END AS WITH_ROOTSERVER, STATUS, CASE start_service_time WHEN 0 THEN NULL ELSE usec_to_time(start_service_time) END AS START_SERVICE_TIME, CASE stop_time WHEN 0 THEN NULL ELSE usec_to_time(stop_time) END AS STOP_TIME, CASE block_migrate_in_time WHEN 0 THEN NULL ELSE usec_to_time(block_migrate_in_time) END AS BLOCK_MIGRATE_IN_TIME, gmt_create AS CREATE_TIME, gmt_modified AS 
MODIFY_TIME, BUILD_VERSION, CASE last_offline_time WHEN 0 THEN NULL ELSE usec_to_time(last_offline_time) END AS LAST_OFFLINE_TIME FROM oceanbase.__all_server )__"))) { LOG_ERROR("fail to set view_definition", K(ret)); } } diff --git a/src/share/inner_table/ob_inner_table_schema_def.py b/src/share/inner_table/ob_inner_table_schema_def.py index 2c2fdbe97..a4077655b 100644 --- a/src/share/inner_table/ob_inner_table_schema_def.py +++ b/src/share/inner_table/ob_inner_table_schema_def.py @@ -608,6 +608,7 @@ def_table_schema( ('start_service_time', 'int'), ('first_sessid', 'int', 'false', '0'), ('with_partition', 'int', 'false', '0'), + ('last_offline_time', 'int', 'false', '0'), ], ) @@ -15819,7 +15820,12 @@ SELECT SVR_IP, gmt_create AS CREATE_TIME, gmt_modified AS MODIFY_TIME, - BUILD_VERSION + BUILD_VERSION, + + CASE last_offline_time + WHEN 0 THEN NULL + ELSE usec_to_time(last_offline_time) END + AS LAST_OFFLINE_TIME FROM oceanbase.__all_server """.replace("\n", " ") ) diff --git a/src/share/ls/ob_ls_creator.h b/src/share/ls/ob_ls_creator.h index e142be946..a3c6ba133 100644 --- a/src/share/ls/ob_ls_creator.h +++ b/src/share/ls/ob_ls_creator.h @@ -45,7 +45,6 @@ namespace share { class SCN; -class ObServerManager; struct ObLSReplicaAddr { common::ObAddr addr_; diff --git a/src/share/ls/ob_ls_status_operator.cpp b/src/share/ls/ob_ls_status_operator.cpp index 0e957f83f..6d52bdd12 100644 --- a/src/share/ls/ob_ls_status_operator.cpp +++ b/src/share/ls/ob_ls_status_operator.cpp @@ -22,7 +22,7 @@ #include "share/ob_share_util.h" #include "lib/mysqlclient/ob_mysql_transaction.h" #include "share/ls/ob_ls_log_stat_info.h" // ObLSLogStatInfo -#include "rootserver/ob_server_manager.h" // ObServerManager +#include "share/ob_server_table_operator.h" #include "rootserver/ob_zone_manager.h" // ObZoneManager #include "rootserver/ob_root_utils.h" // majority #include "logservice/palf/log_define.h" // INVALID_PROPOSAL_ID @@ -891,8 +891,6 @@ int 
ObLSStatusOperator::construct_ls_log_stat_info_sql_(common::ObSqlString &sql } int ObLSStatusOperator::check_all_ls_has_majority_and_log_sync( - const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, const bool skip_log_sync_check, const char *print_str, @@ -915,8 +913,6 @@ int ObLSStatusOperator::check_all_ls_has_majority_and_log_sync( } else if (OB_FAIL(parse_result_and_check_paxos_( *result, schema_service, - zone_mgr, - server_mgr, to_stop_servers, skip_log_sync_check, print_str, @@ -932,8 +928,6 @@ int ObLSStatusOperator::check_all_ls_has_majority_and_log_sync( int ObLSStatusOperator::parse_result_and_check_paxos_( common::sqlclient::ObMySQLResult &result, schema::ObMultiVersionSchemaService &schema_service, - const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, const bool skip_log_sync_check, const char *print_str, @@ -974,10 +968,8 @@ int ObLSStatusOperator::parse_result_and_check_paxos_( if (OB_FAIL(check_ls_log_stat_info_( schema_service, ls_log_stat_info, - zone_mgr, - server_mgr, to_stop_servers, - skip_log_sync_check, + skip_log_sync_check, print_str, need_retry))) { LOG_WARN("fail to check ls paxos info", KR(ret), K(ls_log_stat_info)); @@ -1003,11 +995,9 @@ int ObLSStatusOperator::parse_result_and_check_paxos_( if (OB_FAIL(check_ls_log_stat_info_( schema_service, ls_log_stat_info, - zone_mgr, - server_mgr, to_stop_servers, skip_log_sync_check, - print_str, + print_str, need_retry))) { LOG_WARN("fail to check ls paxos info", KR(ret), K(ls_log_stat_info)); } @@ -1087,8 +1077,6 @@ int ObLSStatusOperator::construct_ls_log_stat_replica_( int ObLSStatusOperator::check_ls_log_stat_info_( schema::ObMultiVersionSchemaService &schema_service, const ObLSLogStatInfo &ls_log_stat_info, - const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, const bool skip_log_sync_check, const char *print_str, @@ 
-1145,8 +1133,6 @@ int ObLSStatusOperator::check_ls_log_stat_info_( LOG_USER_ERROR(OB_OP_NOT_ALLOW, err_msg); } else if (OB_FAIL(generate_valid_servers_( leader.get_member_list(), - zone_mgr, - server_mgr, to_stop_servers, valid_servers))) { LOG_WARN("fail to generate valid member_list", KR(ret), @@ -1196,30 +1182,32 @@ int ObLSStatusOperator::check_ls_log_stat_info_( // (skip_servers include to_stop_servers, servers_in_stopped_zone, stopped_servers, not_alive_servers, not_in_service_servers) int ObLSStatusOperator::generate_valid_servers_( const ObLSReplica::MemberList &member_list, - const ObZoneManager &zone_mgr, - const ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, common::ObIArray &valid_servers) { int ret = OB_SUCCESS; valid_servers.reset(); ObArray invalid_servers; - if (OB_UNLIKELY(member_list.empty())) { + ObArray servers_info; + if (OB_ISNULL(GCTX.sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.sql_proxy_ is null", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObServerTableOperator::get(*GCTX.sql_proxy_, servers_info))) { + LOG_WARN("fail to get servers_info in table", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_UNLIKELY(member_list.empty())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("member_list is empty", KR(ret), K(member_list), K(to_stop_servers)); - } else if (OB_FAIL(ObRootUtils::get_invalid_server_list( - zone_mgr, - server_mgr, - invalid_servers))) { - LOG_WARN("fail to get invalid server list", KR(ret)); + } else if (OB_FAIL(ObRootUtils::get_invalid_server_list(servers_info, invalid_servers))) { + LOG_WARN("fail to get invalid server list", KR(ret), K(servers_info)); } else { ARRAY_FOREACH_N(member_list, idx, cnt) { const ObAddr &server = member_list.at(idx).get_server(); bool is_alive = false; - if (OB_FAIL(server_mgr.check_server_alive(server, is_alive))) { // filter deleted server which is only in member_list - LOG_WARN("fail to check is server alive", KR(ret), K(server)); + // filter deleted server 
which is only in member_list + if (OB_FAIL(ObRootUtils::check_server_alive(servers_info, server, is_alive))) { + LOG_WARN("fail to check is server alive", KR(ret), K(servers_info), K(server)); } else if (!is_alive) { - LOG_INFO("find not alive server in member_list", K(server), K(member_list)); + LOG_INFO("find not alive server in member_list", K(servers_info), K(server), K(member_list)); } else if (!common::has_exist_in_array(invalid_servers, server) && !common::has_exist_in_array(to_stop_servers, server)) { if (OB_FAIL(valid_servers.push_back(server))) { diff --git a/src/share/ls/ob_ls_status_operator.h b/src/share/ls/ob_ls_status_operator.h index d932403c8..aecfcc686 100644 --- a/src/share/ls/ob_ls_status_operator.h +++ b/src/share/ls/ob_ls_status_operator.h @@ -47,7 +47,6 @@ class SCN; namespace rootserver { class ObZoneManager; -class ObServerManager; } namespace share { @@ -349,7 +348,6 @@ public: // those whose status is OB_LS_CREATE_ABORT. And then, check majority and log_in_sync. // // @param [in] zone_mgr: zone manager from rs - // @param [in] server_mgr: server manager from rs // @param [in] to_stop_servers: servers to be stopped // @param [in] skip_log_sync_check: whether skip log_sync check // @param [in] print_str: string of operation. Used to print LOG_USER_ERROR "'print_str' not allowed" @@ -359,8 +357,6 @@ public: // @return: OB_SUCCESS if all check is passed. // OB_OP_NOT_ALLOW if ls doesn't have leader/enough member or ls' log is not in sync. 
int check_all_ls_has_majority_and_log_sync( - const rootserver::ObZoneManager &zone_mgr, - const rootserver::ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, const bool skip_log_sync_check, const char *print_str, @@ -438,8 +434,6 @@ private: int parse_result_and_check_paxos_( common::sqlclient::ObMySQLResult &result, schema::ObMultiVersionSchemaService &schema_service, - const rootserver::ObZoneManager &zone_mgr, - const rootserver::ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, const bool skip_log_sync_check, const char *print_str, @@ -454,16 +448,12 @@ private: int check_ls_log_stat_info_( schema::ObMultiVersionSchemaService &schema_service, const ObLSLogStatInfo &ls_log_stat_info, - const rootserver::ObZoneManager &zone_mgr, - const rootserver::ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, const bool skip_log_sync_check, const char *print_str, bool &need_retry); int generate_valid_servers_( const ObLSReplica::MemberList &member_list, - const rootserver::ObZoneManager &zone_mgr, - const rootserver::ObServerManager &server_mgr, const common::ObIArray &to_stop_servers, common::ObIArray &valid_servers); int construct_ls_leader_info_sql_(common::ObSqlString &sql); diff --git a/src/share/ob_alive_server_tracer.cpp b/src/share/ob_alive_server_tracer.cpp index 7f70b949e..9861f0fd7 100644 --- a/src/share/ob_alive_server_tracer.cpp +++ b/src/share/ob_alive_server_tracer.cpp @@ -19,7 +19,7 @@ #include "share/ob_web_service_root_addr.h" #include "share/ob_thread_mgr.h" #include "observer/ob_server_struct.h" - +#include "share/ob_all_server_tracer.h" namespace oceanbase { namespace share @@ -122,13 +122,17 @@ int ObAliveServerMap::get_server_status(const ObAddr &addr, bool &alive, return ret; } -int ObAliveServerMap::refresh(common::ObIArray &active_server_list, - common::ObIArray &inactive_server_list) +int ObAliveServerMap::refresh() { int ret = OB_SUCCESS; + common::ObArray active_server_list; + 
common::ObArray inactive_server_list; + ObZone empty_zone; if (!is_inited_) { ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); + } else if (OB_FAIL(SVR_TRACER.get_servers_by_status(empty_zone, active_server_list, inactive_server_list))) { + LOG_WARN("fail to get servers by status", KR(ret)); } else if (active_server_list.empty()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument, empty server list", K(ret)); @@ -372,16 +376,7 @@ int ObAliveServerTracer::refresh() ret = OB_NOT_INIT; LOG_WARN("not init", K(ret)); } else { - obrpc::ObFetchAliveServerArg arg; - obrpc::ObFetchAliveServerResult result; - arg.cluster_id_ = GCONF.cluster_id; - if (OB_FAIL(rpc_proxy_->fetch_alive_server(arg, result))) { - LOG_WARN("fetch alive server failed", K(ret)); - } else if (!result.is_valid()) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("invalid alive server list", K(ret), K(result)); - } else if (OB_FAIL(last_map_->refresh(result.active_server_list_, - result.inactive_server_list_))) { + if (OB_FAIL(last_map_->refresh())) { LOG_WARN("refresh sever list failed", K(ret)); } else { ObAliveServerMap *volatile map = cur_map_; diff --git a/src/share/ob_alive_server_tracer.h b/src/share/ob_alive_server_tracer.h index b6b8cd016..730bf7764 100644 --- a/src/share/ob_alive_server_tracer.h +++ b/src/share/ob_alive_server_tracer.h @@ -61,8 +61,7 @@ public: virtual int is_alive(const common::ObAddr &addr, bool &alive, int64_t &trace_time) const; virtual int get_server_status(const common::ObAddr &addr, bool &alive, bool &is_server_exist, int64_t &trace_time) const; - virtual int refresh(common::ObIArray &active_server_list, - common::ObIArray &inactive_server_list); + virtual int refresh(); virtual int get_active_server_list(common::ObIArray &addrs) const; private: diff --git a/src/share/ob_all_server_tracer.cpp b/src/share/ob_all_server_tracer.cpp index 5d3c9fc31..4ab5af7ff 100644 --- a/src/share/ob_all_server_tracer.cpp +++ b/src/share/ob_all_server_tracer.cpp @@ -13,13 +13,14 @@ #define 
USING_LOG_PREFIX SHARE #include "share/ob_all_server_tracer.h" #include "lib/thread/thread_mgr.h" +#include "lib/alloc/alloc_assist.h" #include "observer/ob_server_struct.h" using namespace oceanbase::common; using namespace oceanbase::share; ObServerTraceMap::ObServerTraceMap() - : is_inited_(false), lock_(ObLatchIds::ALL_SERVER_TRACER_LOCK), server_status_arr_() + : is_inited_(false), has_build_(false), lock_(ObLatchIds::ALL_SERVER_TRACER_LOCK), server_info_arr_() { } @@ -35,13 +36,12 @@ int ObServerTraceMap::init() LOG_WARN("ObServerTraceMap has already been inited", K(ret)); } else { SpinWLockGuard guard(lock_); - int64_t block_size = (DEFAULT_SERVER_COUNT * sizeof(ObServerStatus)); - server_status_arr_.set_block_size(block_size); - if (OB_FAIL(server_status_arr_.reserve(DEFAULT_SERVER_COUNT))) { - LOG_WARN("fail to reserve server status array", KR(ret)); - } else if (OB_FAIL(server_table_operator_.init(GCTX.sql_proxy_))) { - LOG_WARN("fail to init server table operator", KR(ret)); + int64_t block_size = (DEFAULT_SERVER_COUNT * sizeof(ObServerInfoInTable)); + server_info_arr_.set_block_size(block_size); + if (OB_FAIL(server_info_arr_.reserve(DEFAULT_SERVER_COUNT))) { + LOG_WARN("fail to reserve server info array", KR(ret)); } else { + has_build_ = false; is_inited_ = true; } } @@ -54,18 +54,18 @@ int ObServerTraceMap::is_server_exist(const common::ObAddr &server, bool &exist) if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("server trace map has not inited", KR(ret)); - } else if (!server.is_valid()) { + } else if (OB_UNLIKELY(!server.is_valid())) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid server", K(server), K(ret)); } else { SpinRLockGuard guard(lock_); - ObServerStatus status; - if (OB_FAIL(find_server_status(server, status))) { + ObServerInfoInTable server_info; + if (OB_FAIL(find_server_info(server, server_info))) { if (OB_ENTRY_NOT_EXIST == ret) { ret = OB_SUCCESS; exist = false; } else { - LOG_WARN("fail to find server status", K(server), 
K_(server_status_arr), KR(ret)); + LOG_WARN("fail to find server info", K(server), K_(server_info_arr), KR(ret)); } } else { exist = true; @@ -77,46 +77,42 @@ int ObServerTraceMap::is_server_exist(const common::ObAddr &server, bool &exist) int ObServerTraceMap::check_server_alive(const ObAddr &server, bool &is_alive) const { int ret = OB_SUCCESS; - if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("server trace map has not inited", KR(ret)); - } else if (!server.is_valid()) { + } else if (OB_UNLIKELY(!server.is_valid())) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(server), K(ret)); + LOG_WARN("invalid server", KR(ret), K(server)); } else { SpinRLockGuard guard(lock_); - ObServerStatus status; - if (OB_FAIL(find_server_status(server, status))) { - LOG_WARN("fail to find server status", K(server), K_(server_status_arr), KR(ret)); + ObServerInfoInTable server_info; + if (OB_FAIL(find_server_info(server, server_info))) { + LOG_WARN("fail to find server info", KR(ret), K(server), K_(server_info_arr)); } else { - is_alive = status.is_alive(); + is_alive = server_info.is_alive(); } } - return ret; } int ObServerTraceMap::check_in_service(const ObAddr &server, bool &service_started) const { int ret = OB_SUCCESS; - if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("server trace map has not inited", KR(ret)); - } else if (!server.is_valid()) { + } else if (OB_UNLIKELY(!server.is_valid())) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server", K(server), K(ret)); + LOG_WARN("invalid server", KR(ret), K(server)); } else { SpinRLockGuard guard(lock_); - ObServerStatus status; - if (OB_FAIL(find_server_status(server, status))) { - LOG_WARN("fail to find server status", K(server), K_(server_status_arr), KR(ret)); + ObServerInfoInTable server_info; + if (OB_FAIL(find_server_info(server, server_info))) { + LOG_WARN("fail to find server info", KR(ret), K(server), K_(server_info_arr)); } else { - service_started = status.in_service(); + service_started = 
server_info.in_service(); } } - return ret; } @@ -124,66 +120,64 @@ int ObServerTraceMap::check_server_permanent_offline(const ObAddr &addr, bool &i { int ret = OB_SUCCESS; SpinRLockGuard guard(lock_); - - ObServerStatus status; + ObServerInfoInTable server_info; if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); - } else if (OB_FAIL(find_server_status(addr, status))) { - LOG_WARN("fail to find server status", K(addr), K_(server_status_arr), KR(ret)); + } else if (OB_FAIL(find_server_info(addr, server_info))) { + LOG_WARN("fail to find server info", K(addr), K_(server_info_arr), KR(ret)); } else { - is_offline = status.is_permanent_offline(); + is_offline = server_info.is_permanent_offline(); } - if ((OB_ENTRY_NOT_EXIST == ret) && server_status_arr_.empty()) { + if ((OB_ENTRY_NOT_EXIST == ret) && server_info_arr_.empty()) { // if server list is empty, treat as not offline ret = OB_SUCCESS; is_offline = false; } - return ret; } -int ObServerTraceMap::for_each_server_status( - const ObFunction &functor) +int ObServerTraceMap::for_each_server_info( + const ObFunction &functor) { int ret = OB_SUCCESS; SpinRLockGuard guard(lock_); - - ObServerStatus status; + ObServerInfoInTable server_info; if (IS_NOT_INIT) { ret = OB_NOT_INIT; LOG_WARN("not init", KR(ret)); } else { - for (int64_t i = 0; OB_SUCC(ret) && i < server_status_arr_.count(); i++) { - ObServerStatus &status = server_status_arr_[i]; + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count(); i++) { + const ObServerInfoInTable &server_info = server_info_arr_.at(i); if (OB_UNLIKELY(!functor.is_valid())) { ret = OB_EAGAIN; LOG_WARN("functor is invalid"); - } else if (OB_FAIL(functor(status))) { - LOG_WARN("invoke functor failed", K(ret), K(status)); + } else if (OB_FAIL(functor(server_info))) { + LOG_WARN("invoke functor failed", K(ret), K(server_info)); } } } - return ret; } -int ObServerTraceMap::find_server_status(const ObAddr &addr, ObServerStatus &status) const +int 
ObServerTraceMap::find_server_info(const ObAddr &addr, ObServerInfoInTable &server_info) const { int ret = OB_SUCCESS; bool found = false; - for (int64_t i = 0; (i < server_status_arr_.count()) && !found; i++) { - if (server_status_arr_[i].server_ == addr) { - status = server_status_arr_[i]; - found = true; + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count() && !found; i++) { + const ObServerInfoInTable &server_info_i = server_info_arr_.at(i); + if (server_info_i.get_server() == addr) { + if (OB_FAIL(server_info.assign(server_info_i))) { + LOG_WARN("fail to assign server_info", KR(ret), K(server_info_i)); + } else { + found = true; + } } } - - if (!found) { + if (OB_SUCC(ret) && !found) { ret = OB_ENTRY_NOT_EXIST; } - return ret; } @@ -203,26 +197,328 @@ int ObServerTraceMap::check_migrate_in_blocked(const ObAddr &addr, bool &is_bloc return ret; } +int ObServerTraceMap::get_server_zone(const ObAddr &server, ObZone &zone) const +{ + int ret = OB_SUCCESS; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server", KR(ret), K(server)); + } else { + SpinRLockGuard guard(lock_); + ObServerInfoInTable server_info; + if (OB_FAIL(find_server_info(server, server_info))) { + LOG_WARN("fail to find server info", KR(ret), K(server)); + } else if (OB_FAIL(zone.assign(server_info.get_zone()))) { + LOG_WARN("fail to assign zone", KR(ret), K(server_info)); + } + } + return ret; +} +int ObServerTraceMap::get_servers_of_zone( + const ObZone &zone, + ObIArray &servers) const +{ + int ret = OB_SUCCESS; + servers.reset(); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else { + SpinRLockGuard guard(lock_); + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count(); i++) { + const ObAddr& server = server_info_arr_.at(i).get_server(); + const ObZone& server_zone = 
server_info_arr_.at(i).get_zone(); + if ((server_zone == zone || zone.is_empty())) { + if (OB_FAIL(servers.push_back(server))) { + LOG_WARN("fail to push an element into servers", KR(ret), + K(server), K(zone), K(server_zone)); + } + } + } + } + return ret; +} +int ObServerTraceMap::get_servers_of_zone( + const common::ObZone &zone, + common::ObIArray &servers, + common::ObIArray &server_id_list) const +{ + int ret = OB_SUCCESS; + servers.reset(); + server_id_list.reset(); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else { + SpinRLockGuard guard(lock_); + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count(); i++) { + const ObServerInfoInTable &server_info = server_info_arr_.at(i); + const ObAddr& server = server_info.get_server(); + const ObZone& server_zone = server_info.get_zone(); + const uint64_t server_id = server_info.get_server_id(); + if ((server_zone == zone || zone.is_empty())) { + if (OB_FAIL(servers.push_back(server))) { + LOG_WARN("fail to push an element into servers", KR(ret), + K(server), K(zone), K(server_zone)); + } else if (OB_FAIL(server_id_list.push_back(server_id))) { + LOG_WARN("fail to push an element into server_id_list", KR(ret), + K(server), K(zone), K(server_zone), K(server_id)); + } + } + } + } + return ret; +} +int ObServerTraceMap::get_server_info(const common::ObAddr &server, ObServerInfoInTable &server_info) const +{ + int ret = OB_SUCCESS; + server_info.reset(); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server", KR(ret), K(server)); + } else { + SpinRLockGuard guard(lock_); + if (OB_FAIL(find_server_info(server, server_info))) { + LOG_WARN("fail to find server info", KR(ret), K(server)); + } + } + return ret; +} +int ObServerTraceMap::get_servers_info( + const common::ObZone &zone, + common::ObIArray 
&servers_info, + bool include_permanent_offline) const +{ + int ret = OB_SUCCESS; + servers_info.reset(); + SpinRLockGuard guard(lock_); + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count(); ++i) { + const ObServerInfoInTable &server_info = server_info_arr_.at(i); + if (server_info.get_zone() == zone || zone.is_empty()) { + if (include_permanent_offline || !server_info.is_permanent_offline()) { + if (OB_UNLIKELY(!server_info.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("server info is not valid", "server info", server_info, KR(ret)); + } else if (OB_FAIL(servers_info.push_back(server_info))) { + LOG_WARN("push back to servers_info failed", KR(ret), K(server_info)); + } + } + } + } + return ret; +} +int ObServerTraceMap::get_active_servers_info( + const common::ObZone &zone, + common::ObIArray &active_servers_info) const +{ + int ret = OB_SUCCESS; + active_servers_info.reset(); + SpinRLockGuard guard(lock_); + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count(); ++i) { + const ObServerInfoInTable &server_info = server_info_arr_.at(i); + if ((server_info.get_zone() == zone || zone.is_empty()) && server_info.is_active()) { + if (OB_UNLIKELY(!server_info.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("server info is not valid", "server info", server_info, KR(ret)); + } else if (OB_FAIL(active_servers_info.push_back(server_info))) { + LOG_WARN("push back to active_servers_info failed", KR(ret), K(server_info)); + } + } + } + return ret; +} +int ObServerTraceMap::get_alive_servers(const ObZone &zone, ObIArray &server_list) const +{ + int ret = OB_SUCCESS; + + server_list.reset(); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else { + SpinRLockGuard guard(lock_); + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count(); ++i) { + const ObServerInfoInTable &server_info = server_info_arr_.at(i); + const ObAddr &server = server_info.get_server(); + if 
((server_info.get_zone() == zone || zone.is_empty()) && server_info.is_alive()) { + if (OB_FAIL(server_list.push_back(server))) { + LOG_WARN("fail to push an element into server_list", KR(ret), K(server)); + } + } + } + } + + return ret; +} +int ObServerTraceMap::check_server_active(const ObAddr &server, bool &is_active) const +{ + int ret = OB_SUCCESS; + + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server", KR(ret), K(server)); + } else { + SpinRLockGuard guard(lock_); + ObServerInfoInTable server_info; + if (OB_FAIL(find_server_info(server, server_info))) { + LOG_WARN("fail to find server info", KR(ret), K(server), K_(server_info_arr)); + } else { + is_active = server_info.is_active(); + } + } + + return ret; +} + +int ObServerTraceMap::check_server_can_migrate_in(const ObAddr &server, bool &can_migrate_in) const +{ + int ret = OB_SUCCESS; + + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else if (OB_UNLIKELY(!server.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server", KR(ret), K(server)); + } else { + SpinRLockGuard guard(lock_); + ObServerInfoInTable server_info; + if (OB_FAIL(find_server_info(server, server_info))) { + LOG_WARN("fail to find server info", KR(ret), K(server), K_(server_info_arr)); + } else { + can_migrate_in = server_info.can_migrate_in(); + } + } + + return ret; +} + +int ObServerTraceMap::get_alive_servers_count(const common::ObZone &zone, int64_t &count) const +{ + // empty zone means check if there exists stopped servers in the whole cluster + int ret = OB_SUCCESS; + ObArray server_list; + count = 0; + if (OB_FAIL(get_alive_servers(zone, server_list))) { + LOG_WARN("fail to get alive servers", KR(ret), K(zone)); + } else { + count = server_list.count(); + } + return ret; +} +int ObServerTraceMap::get_servers_by_status( + 
const ObZone &zone, + common::ObIArray &alive_server_list, + common::ObIArray ¬_alive_server_list) const +{ + int ret = OB_SUCCESS; + alive_server_list.reset(); + not_alive_server_list.reset(); + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else { + SpinRLockGuard guard(lock_); + for (int64_t i = 0; OB_SUCC(ret) && i < server_info_arr_.count(); ++i) { + const ObServerInfoInTable &server_info = server_info_arr_.at(i); + if (server_info.get_zone() == zone || zone.is_empty()) { + const ObAddr &server = server_info.get_server(); + if (server_info.is_alive()) { + if (OB_FAIL(alive_server_list.push_back(server))) { + LOG_WARN("fail to push back to alive_server_list", KR(ret), K(server)); + } + } else if (OB_FAIL(not_alive_server_list.push_back(server))) { + LOG_WARN("fail to push back to not_alive_server_list", KR(ret), K(server)); + } + } + } + } + return ret; +} + +int ObServerTraceMap::get_min_server_version(char min_server_version[OB_SERVER_VERSION_LENGTH]) +{ + int ret = OB_SUCCESS; + ObZone zone; // empty zone, get all server statuses + ObArray servers_info; + if (IS_NOT_INIT) { + ret = OB_NOT_INIT; + LOG_WARN("server trace map has not inited", KR(ret)); + } else { + SpinRLockGuard guard(lock_); + if (OB_FAIL(servers_info.assign(server_info_arr_))) { + LOG_WARN("fail to assign servers_info", KR(ret), K(server_info_arr_)); + } else if (OB_UNLIKELY(servers_info.empty())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("servers_info is empty", KR(ret), K(servers_info), K(server_info_arr_)); + } + } + if (OB_SUCC(ret)) { + ObClusterVersion version_parser; + uint64_t cur_min_version = UINT64_MAX; + for (int64_t i = 0; OB_SUCC(ret) && i < servers_info.count(); i++) { + const ObServerInfoInTable::ObBuildVersion &build_version = servers_info.at(i).get_build_version(); + char *saveptr = NULL; + char build_version_ptr[common::OB_SERVER_VERSION_LENGTH] = {0}; + MEMCPY(build_version_ptr, build_version.ptr(), 
OB_SERVER_VERSION_LENGTH); + char *version = STRTOK_R(build_version_ptr, "_", &saveptr); + if (OB_ISNULL(version) || OB_UNLIKELY(strlen(version) + 1 > OB_SERVER_VERSION_LENGTH)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid build version format", KR(ret), K(build_version_ptr)); + } else if (OB_FAIL(version_parser.refresh_cluster_version(version))) { + LOG_WARN("failed to parse version", KR(ret), K(version)); + } else { + if (version_parser.get_cluster_version() < cur_min_version) { + size_t len = strlen(version); + MEMCPY(min_server_version, version, len); + min_server_version[len] = '\0'; + cur_min_version = version_parser.get_cluster_version(); + } + } + if (OB_SUCC(ret) && UINT64_MAX == cur_min_version) { + ret = OB_ENTRY_NOT_EXIST; + LOG_WARN("no valid server version found", KR(ret)); + } + } + } + return ret; +} + int ObServerTraceMap::refresh() { int ret = OB_SUCCESS; - ObArray server_statuses; + ObArray servers_info; if (OB_UNLIKELY(!is_inited_)) { ret = OB_NOT_INIT; - LOG_WARN("ObServerTraceMap has not been inited", K(ret)); - } else if (OB_FAIL(server_table_operator_.get(server_statuses))) { - LOG_WARN("fail to get server status", K(ret)); + LOG_WARN("ObServerTraceMap has not been inited", KR(ret), K(is_inited_)); + } else if (OB_ISNULL(GCTX.sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("GCTX.sql_proxy_ is null", KR(ret), KP(GCTX.sql_proxy_)); + } else if (OB_FAIL(ObServerTableOperator::get(*GCTX.sql_proxy_, servers_info))) { + LOG_WARN("fail to get servers_info", KR(ret), KP(GCTX.sql_proxy_)); } else { SpinWLockGuard guard(lock_); // reuse memory - server_status_arr_.reuse(); - + server_info_arr_.reuse(); // can not use ObArray's assign, which will reallocate memory - for (int64_t i = 0; (i < server_statuses.count()) && OB_SUCC(ret); ++i) { - if (OB_FAIL(server_status_arr_.push_back(server_statuses[i]))) { - LOG_WARN("fail to push back", K(server_statuses[i]), K(i), KR(ret)); + for (int64_t i = 0; i < servers_info.count() && OB_SUCC(ret); ++i) 
{ + const ObServerInfoInTable &server_info_i = servers_info.at(i); + if (OB_FAIL(server_info_arr_.push_back(server_info_i))) { + LOG_WARN("fail to push back", K(server_info_i), KR(ret)); } } + if (OB_SUCC(ret)) { + has_build_ = true; + } } return ret; } @@ -324,8 +620,95 @@ int ObAllServerTracer::check_migrate_in_blocked(const ObAddr &addr, bool &is_blo return trace_map_.check_migrate_in_blocked(addr, is_block); } -int ObAllServerTracer::for_each_server_status( - const ObFunction &functor) +int ObAllServerTracer::for_each_server_info( + const ObFunction &functor) { - return trace_map_.for_each_server_status(functor); + return trace_map_.for_each_server_info(functor); +} + +int ObAllServerTracer::get_server_zone(const ObAddr &server, ObZone &zone) const +{ + return trace_map_.get_server_zone(server, zone); +} + +int ObAllServerTracer::get_servers_of_zone( + const ObZone &zone, + ObIArray &servers) const +{ + // empty zone means that get all servers + return trace_map_.get_servers_of_zone(zone, servers); +} + +int ObAllServerTracer::get_servers_of_zone( + const common::ObZone &zone, + common::ObIArray &servers, + common::ObIArray &server_id_list) const +{ + // empty zone means that get all servers + return trace_map_.get_servers_of_zone(zone, servers, server_id_list); +} + +int ObAllServerTracer::get_server_info( + const common::ObAddr &server, + ObServerInfoInTable &server_info) const +{ + return trace_map_.get_server_info(server, server_info); +} + +int ObAllServerTracer::get_servers_info( + const common::ObZone &zone, + common::ObIArray &servers_info, + bool include_permanent_offline) const +{ + return trace_map_.get_servers_info(zone, servers_info, include_permanent_offline); +} + +int ObAllServerTracer::get_active_servers_info( + const common::ObZone &zone, + common::ObIArray &active_servers_info) const +{ + return trace_map_.get_active_servers_info(zone, active_servers_info); +} + +int ObAllServerTracer::get_alive_servers(const ObZone &zone, ObIArray 
&server_list) const +{ + return trace_map_.get_alive_servers(zone, server_list); +} + +int ObAllServerTracer::check_server_active(const ObAddr &server, bool &is_active) const +{ + return trace_map_.check_server_active(server, is_active); +} + +int ObAllServerTracer::check_server_can_migrate_in(const ObAddr &server, bool &can_migrate_in) const +{ + return trace_map_.check_server_can_migrate_in(server, can_migrate_in); +} + +int ObAllServerTracer::get_alive_servers_count(const common::ObZone &zone, int64_t &count) const +{ + return trace_map_.get_alive_servers_count(zone, count); +} + +int ObAllServerTracer::refresh() +{ + return trace_map_.refresh(); +} + +int ObAllServerTracer::get_servers_by_status( + const ObZone &zone, + common::ObIArray &alive_server_list, + common::ObIArray ¬_alive_server_list) const +{ + return trace_map_.get_servers_by_status(zone, alive_server_list, not_alive_server_list); +} + +int ObAllServerTracer::get_min_server_version(char min_server_version[OB_SERVER_VERSION_LENGTH]) +{ + return trace_map_.get_min_server_version(min_server_version); +} + +bool ObAllServerTracer::has_build() const +{ + return trace_map_.has_build(); } diff --git a/src/share/ob_all_server_tracer.h b/src/share/ob_all_server_tracer.h index 9b1aea0d1..007da63cd 100644 --- a/src/share/ob_all_server_tracer.h +++ b/src/share/ob_all_server_tracer.h @@ -35,19 +35,45 @@ public: virtual int check_in_service(const common::ObAddr &addr, bool &service_started) const; virtual int check_migrate_in_blocked(const common::ObAddr &addr, bool &is_block) const; virtual int check_server_permanent_offline(const common::ObAddr &server, bool &is_offline) const; + virtual int check_server_active(const common::ObAddr &server, bool &is_active) const; + virtual int check_server_can_migrate_in(const common::ObAddr &server, bool &can_migrate_in) const; virtual int is_server_stopped(const common::ObAddr &server, bool &is_stopped) const; + virtual int get_server_zone(const common::ObAddr &server, 
common::ObZone &zone) const; + virtual int get_servers_of_zone( + const common::ObZone &zone, + common::ObIArray &servers) const; + virtual int get_servers_of_zone( + const common::ObZone &zone, + common::ObIArray &servers, + common::ObIArray &server_id_list) const; + virtual int get_server_info(const common::ObAddr &server, ObServerInfoInTable &server_info) const; + virtual int get_servers_info( + const common::ObZone &zone, + common::ObIArray &servers_info, + bool include_permanent_offline) const; + virtual int get_active_servers_info( + const common::ObZone &zone, + common::ObIArray &active_servers_info) const; + virtual int get_alive_servers(const common::ObZone &zone, common::ObIArray &server_list) const; + virtual int get_alive_servers_count(const common::ObZone &zone, int64_t &count) const; + virtual int get_servers_by_status( + const ObZone &zone, + common::ObIArray &alive_server_list, + common::ObIArray ¬_alive_server_list) const; + virtual int get_min_server_version(char min_server_version[OB_SERVER_VERSION_LENGTH]); + bool has_build() const {return has_build_; }; int refresh(); - int for_each_server_status(const ObFunction &functor); + int for_each_server_info(const ObFunction &functor); private: - int find_server_status(const ObAddr &addr, ObServerStatus &status) const; + int find_server_info(const ObAddr &addr, ObServerInfoInTable &server_info) const; private: static const int64_t DEFAULT_SERVER_COUNT = 2048; bool is_inited_; + bool has_build_; mutable common::SpinRWLock lock_; - common::ObArray server_status_arr_; - ObServerTableOperator server_table_operator_; + common::ObArray server_info_arr_; }; class ObServerTraceTask : public common::ObTimerTask @@ -69,13 +95,44 @@ class ObAllServerTracer : public share::ObIServerTrace public: static ObAllServerTracer &get_instance(); int init(int tg_id, ObServerTraceTask &trace_task); - int for_each_server_status(const ObFunction &functor); + int for_each_server_info(const ObFunction &functor); virtual int 
is_server_exist(const common::ObAddr &server, bool &exist) const; virtual int check_server_alive(const common::ObAddr &server, bool &is_alive) const; virtual int check_in_service(const common::ObAddr &addr, bool &service_started) const; virtual int check_server_permanent_offline(const common::ObAddr &server, bool &is_offline) const; virtual int is_server_stopped(const common::ObAddr &server, bool &is_stopped) const; virtual int check_migrate_in_blocked(const common::ObAddr &addr, bool &is_block) const; + virtual int get_server_zone(const common::ObAddr &server, common::ObZone &zone) const; + // empty zone means that get all servers + virtual int get_servers_of_zone( + const common::ObZone &zone, + common::ObIArray &servers) const; + // empty zone means that get all servers + virtual int get_servers_of_zone( + const common::ObZone &zone, + common::ObIArray &servers, + common::ObIArray &server_id_list) const; + virtual int get_server_info( + const common::ObAddr &server, + ObServerInfoInTable &server_info) const; + virtual int get_servers_info( + const common::ObZone &zone, + common::ObIArray &servers_info, + bool include_permanent_offline = true) const; + virtual int get_active_servers_info( + const common::ObZone &zone, + common::ObIArray &active_servers_info) const; + virtual int get_alive_servers(const common::ObZone &zone, common::ObIArray &server_list) const; + virtual int check_server_active(const common::ObAddr &server, bool &is_active) const; + virtual int refresh(); + virtual int check_server_can_migrate_in(const common::ObAddr &server, bool &can_migrate_in) const; + virtual int get_alive_servers_count(const common::ObZone &zone, int64_t &count) const; + virtual int get_servers_by_status( + const ObZone &zone, + common::ObIArray &alive_server_list, + common::ObIArray ¬_alive_server_list) const; + virtual int get_min_server_version(char min_server_version[OB_SERVER_VERSION_LENGTH]); + bool has_build() const; private: ObAllServerTracer(); virtual 
~ObAllServerTracer(); @@ -87,4 +144,6 @@ private: } // end namespace share } // end namespace oceanbase -#endif // OCEANBASE_SHARE_OB_ALL_SERVER_TRACER_H_ +#define SVR_TRACER (::oceanbase::share::ObAllServerTracer::get_instance()) + +#endif // OCEANBASE_SHARE_OB_ALL_SERVER_TRACER_H_ \ No newline at end of file diff --git a/src/share/ob_common_rpc_proxy.h b/src/share/ob_common_rpc_proxy.h index 6abea37c6..8b1bc1828 100644 --- a/src/share/ob_common_rpc_proxy.h +++ b/src/share/ob_common_rpc_proxy.h @@ -47,12 +47,12 @@ public: RPC_S(PR5 report_sys_ls, obrpc::OB_REPORT_SYS_LS, (share::ObLSReplica)); RPC_S(PR5 remove_sys_ls, obrpc::OB_REMOVE_SYS_LS, (obrpc::ObRemoveSysLsArg)); RPC_S(PR5 fetch_location, obrpc::OB_FETCH_LOCATION, (obrpc::ObFetchLocationArg), ObFetchLocationResult); - RPC_S(PR5 merge_finish, obrpc::OB_MERGE_FINISH, (ObMergeFinishArg)); + // RPC_S(PR5 merge_finish, obrpc::OB_MERGE_FINISH, (ObMergeFinishArg)); RPC_S(PR5 broadcast_ds_action, obrpc::OB_BROADCAST_DS_ACTION, (ObDebugSyncActionArg)); RPC_S(PR5 check_dangling_replica_finish, obrpc::OB_CHECK_DANGLING_REPLICA_FINISH, (ObCheckDanglingReplicaFinishArg)); // high priority for fetch_alive_server, make sure it will not be blocked while network partition RPC_S(PR3 fetch_alive_server, obrpc::OB_FETCH_ALIVE_SERVER, (ObFetchAliveServerArg), ObFetchAliveServerResult); - RPC_S(PR5 fetch_active_server_status, obrpc::OB_FETCH_ACTIVE_SERVER_STATUS, (ObFetchAliveServerArg), ObFetchActiveServerAddrResult); + // RPC_S(PR5 fetch_active_server_status, obrpc::OB_FETCH_ACTIVE_SERVER_STATUS, (ObFetchAliveServerArg), ObFetchActiveServerAddrResult); RPC_S(PRD create_tenant, obrpc::OB_CREATE_TENANT, (ObCreateTenantArg), UInt64); RPC_S(PRD create_tenant_end, obrpc::OB_CREATE_TENANT_END, (ObCreateTenantEndArg)); @@ -262,7 +262,6 @@ public: //RPC_S(PRD log_nop_operation, obrpc::OB_LOG_DDL_NOP_OPERATOR, (obrpc::ObDDLNopOpreatorArg)); RPC_S(PRD broadcast_schema, OB_BROADCAST_SCHEMA, (obrpc::ObBroadcastSchemaArg)); //RPC_S(PR5 
get_switchover_status, OB_GET_SWITCHOVER_STATUS, obrpc::ObGetSwitchoverStatusRes); - RPC_S(PR5 check_merge_finish, OB_CHECK_MERGE_FINISH, (obrpc::ObCheckMergeFinishArg)); RPC_S(PR5 get_recycle_schema_versions, OB_GET_RECYCLE_SCHEMA_VERSIONS, (obrpc::ObGetRecycleSchemaVersionsArg), obrpc::ObGetRecycleSchemaVersionsResult); // backup and restore diff --git a/src/share/ob_debug_sync_point.h b/src/share/ob_debug_sync_point.h index 84e1b6b14..117846bc6 100644 --- a/src/share/ob_debug_sync_point.h +++ b/src/share/ob_debug_sync_point.h @@ -46,7 +46,6 @@ class ObString; ACT(OBSERVICE_GET_LEADER_CANDIDATES,) \ ACT(CHECK_NEW_TENANT,) \ ACT(BEFORE_CHECK_MAJOR_FREEZE_DONE,) \ - ACT(SET_FORCE_STOP_HB_DONE,) \ ACT(UPDATE_WITH_PARTITION_FLAG_DONE,) \ ACT(MAJOR_FREEZE_AFTER_SYS_COMMIT,) \ ACT(MAJOR_FREEZE_AFTER_ROOTSERVER_COMMIT,) \ @@ -454,6 +453,7 @@ class ObString; ACT(AFTER_LS_GC_DELETE_ALL_TABLETS,)\ ACT(BEFORE_ARCHIVE_ADD_LS_TASK,)\ ACT(AFTER_UPDATE_INDEX_STATUS,)\ + ACT(END_DELETE_SERVER_BEFORE_CHECK_META_TABLE,)\ ACT(BEFORE_MIGRATION_DISABLE_VOTE,)\ ACT(MEMBERLIST_CHANGE_MEMBER,)\ ACT(BEFORE_CHECK_CLEAN_DRTASK,)\ diff --git a/src/share/ob_heartbeat_struct.cpp b/src/share/ob_heartbeat_struct.cpp new file mode 100644 index 000000000..f535234da --- /dev/null +++ b/src/share/ob_heartbeat_struct.cpp @@ -0,0 +1,232 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. 
+ */ +#define USING_LOG_PREFIX SHARE +#include "share/ob_heartbeat_struct.h" +namespace oceanbase +{ +using namespace common; +namespace share +{ +OB_SERIALIZE_MEMBER( + ObHBRequest, + server_, + server_id_, + rs_addr_, + rs_server_status_, + epoch_id_); +OB_SERIALIZE_MEMBER( + ObHBResponse, + zone_, + server_, + sql_port_, + build_version_, + start_service_time_, + server_health_status_); +ObServerHBInfo::ObServerHBInfo () + : server_(), + last_hb_time_(0), + server_health_status_(), + hb_status_(ObServerStatus::OB_HEARTBEAT_MAX) +{ +} +ObServerHBInfo::~ObServerHBInfo() +{ +} +int ObServerHBInfo::init( + const common::ObAddr &server, + const int64_t last_hb_time, + const ObServerStatus::HeartBeatStatus hb_status) +{ + int ret = OB_SUCCESS; + server_health_status_.reset(); + if (OB_UNLIKELY(!server.is_valid() + || last_hb_time <= 0 + || hb_status >= ObServerStatus::OB_HEARTBEAT_MAX)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(last_hb_time), K(hb_status)); + } else if (OB_FAIL(server_health_status_.init(observer::ObServerHealthStatus::DATA_DISK_STATUS_NORMAL))) { + LOG_WARN("fail to init server_health_status_", KR(ret)); + } else { + server_ = server; + last_hb_time_ = last_hb_time; + hb_status_ = hb_status; + } + return ret; +} +int ObServerHBInfo::assign(const ObServerHBInfo &other) +{ + int ret = OB_SUCCESS; + if (OB_FAIL(server_health_status_.assign(other.server_health_status_))) { + LOG_WARN("fail to assign server_health_status_", KR(ret), K(other.server_health_status_)); + } else { + server_ = other.server_; + last_hb_time_ = other.last_hb_time_; + hb_status_ = other.hb_status_; + } + return ret; +} +bool ObServerHBInfo::is_valid() const +{ + return server_.is_valid() + && last_hb_time_ > 0 + && hb_status_ < ObServerStatus::OB_HEARTBEAT_MAX + && server_health_status_.is_valid(); +} +void ObServerHBInfo::reset() +{ + server_.reset(); + last_hb_time_ = 0; + server_health_status_.reset(); + hb_status_ = 
ObServerStatus::OB_HEARTBEAT_MAX; +} +ObHBRequest::ObHBRequest() + : server_(), + server_id_(OB_INVALID_ID), + rs_addr_(), + rs_server_status_(RSS_INVALID), + epoch_id_(palf::INVALID_PROPOSAL_ID) +{ +} +ObHBRequest::~ObHBRequest() +{ +} +int ObHBRequest::init( + const common::ObAddr &server, + const uint64_t server_id, + const common::ObAddr &rs_addr, + const share::RSServerStatus rs_server_status, + const int64_t epoch_id) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!server.is_valid() + || OB_INVALID_ID == server_id + || !rs_addr.is_valid() + || palf::INVALID_PROPOSAL_ID == epoch_id)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(server_id), K(rs_addr), K(epoch_id)); + } else { + server_ = server; + server_id_ = server_id; + rs_addr_ = rs_addr; + rs_server_status_ = rs_server_status; + epoch_id_ = epoch_id; + } + return ret; +} +int ObHBRequest::assign(const ObHBRequest &other) +{ + int ret = OB_SUCCESS; + server_ = other.server_; + server_id_ = other.server_id_; + rs_addr_ = other.rs_addr_; + rs_server_status_ = other.rs_server_status_; + epoch_id_ = other.epoch_id_; + return ret; +} +bool ObHBRequest::is_valid() const +{ + return server_.is_valid() + && OB_INVALID_ID != server_id_ + && rs_addr_.is_valid() + && rs_server_status_ > RSS_INVALID + && rs_server_status_ < RSS_MAX + && palf::INVALID_PROPOSAL_ID != epoch_id_; +} +void ObHBRequest::reset() +{ + server_.reset(); + server_id_ = OB_INVALID_ID; + rs_addr_.reset(); + rs_server_status_ = RSS_INVALID; + epoch_id_ = palf::INVALID_PROPOSAL_ID; +} +ObHBResponse::ObHBResponse() + : zone_(), + server_(), + sql_port_(0), + build_version_(), + start_service_time_(0), + server_health_status_() +{ +} +ObHBResponse::~ObHBResponse() +{ +} +int ObHBResponse::init( + const common::ObZone &zone, + const common::ObAddr &server, + const int64_t sql_port, + const ObServerInfoInTable::ObBuildVersion &build_version, + const int64_t start_service_time, + const observer::ObServerHealthStatus 
server_health_status) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(zone.is_empty() + || !server.is_valid() + || sql_port <= 0 + || build_version.is_empty() + || start_service_time < 0 + || !server_health_status.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(zone), K(server), K(sql_port), K(build_version), + K(start_service_time), K(server_health_status)); + } else { + if (OB_FAIL(zone_.assign(zone))) { + LOG_WARN("fail to init zone", KR(ret), K(zone)); + } else if (OB_FAIL(build_version_.assign(build_version))) { + LOG_WARN("fail to init build_version_", KR(ret), K(build_version)); + } else if (OB_FAIL(server_health_status_.assign(server_health_status))) { + LOG_WARN("fail to init server_health_status_", KR(ret), K(server_health_status)); + } else { + server_ = server; + sql_port_ = sql_port; + start_service_time_ = start_service_time; + } + } + return ret; +} +int ObHBResponse::assign(const ObHBResponse &other) +{ + int ret = OB_SUCCESS; + if (OB_FAIL(zone_.assign(other.zone_))) { + LOG_WARN("fail to assign zone", KR(ret), K(other.zone_)); + } else if (OB_FAIL(build_version_.assign(other.build_version_))) { + LOG_WARN("fail to assign build version", KR(ret), K(other.build_version_)); + } else if (OB_FAIL(server_health_status_.assign(other.server_health_status_))) { + LOG_WARN("fail to assign server_health_status_", KR(ret), K(other.server_health_status_)); + } else { + server_ = other.server_; + sql_port_ = other.sql_port_; + start_service_time_ = other.start_service_time_; + } + return ret; +} +bool ObHBResponse::is_valid() const +{ + return !zone_.is_empty() + && server_.is_valid() + && sql_port_ > 0 + && !build_version_.is_empty() + && start_service_time_ >= 0 + && server_health_status_.is_valid(); +} +void ObHBResponse::reset() +{ + zone_.reset(); + server_.reset(); + sql_port_ = 0; + build_version_.reset(); + start_service_time_ = 0; + server_health_status_.reset(); +} +} // share +} // oceanbase \ No newline at end of 
file diff --git a/src/share/ob_heartbeat_struct.h b/src/share/ob_heartbeat_struct.h new file mode 100644 index 000000000..234d54a75 --- /dev/null +++ b/src/share/ob_heartbeat_struct.h @@ -0,0 +1,189 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. + * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ +#ifndef OCEANBASE_OBSERVER_OB_HEARTBEAT_STRUCT_H_ +#define OCEANBASE_OBSERVER_OB_HEARTBEAT_STRUCT_H_ +#include "lib/net/ob_addr.h" +#include "observer/ob_heartbeat_handler.h" // ObServerHealthStatus +#include "ob_server_status.h" +#include "ob_server_table_operator.h" +namespace oceanbase +{ +namespace observer +{ + struct ObServerHealthStatus; +} +namespace share +{ +// volatile memory in heartbeat service +// last_hb_time_: distinguish which server is online/offline +// server_health_status_: decide whether we should stop/start server due to the change of server_health_status_ +// hb_status_: to mark whether the server is alive/lease_expire/permanent_offline +struct ObServerHBInfo +{ +public: + ObServerHBInfo (); + virtual ~ObServerHBInfo(); + int init( + const common::ObAddr &server, + const int64_t last_hb_time, + const ObServerStatus::HeartBeatStatus hb_status); + int assign(const ObServerHBInfo &other); + bool is_valid() const; + void reset(); + const common::ObAddr &get_server() const + { + return server_; + } + int64_t get_last_hb_time() const + { + return last_hb_time_; + } + const observer::ObServerHealthStatus &get_server_health_status() const + { + return server_health_status_; + } + + ObServerStatus::HeartBeatStatus 
get_hb_status() const + { + return hb_status_; + } + int set_server_health_status(const observer::ObServerHealthStatus &server_health_status) { + return server_health_status_.assign(server_health_status); + } + void set_last_hb_time(const int64_t last_hb_time) { + last_hb_time_ = last_hb_time; + } + void set_hb_status(ObServerStatus::HeartBeatStatus hb_status) + { + hb_status_ = hb_status; + } + TO_STRING_KV( + K_(server), + K_(last_hb_time), + K_(server_health_status), + K_(hb_status)); +private: + common::ObAddr server_; + int64_t last_hb_time_; + observer::ObServerHealthStatus server_health_status_; + ObServerStatus::HeartBeatStatus hb_status_; +}; +// heartbeat service send heartbeat requests to observers on the whitelist +// server_: the request is sent to which server +// server_id_: server_'s unique id in the cluster +// rs_addr_: the request is sent from which server. And this server is current rs leader. +// rs_server_status_: server_ is stopped or not in rs's view. +// epoch_id_: // It indicates the request is based on which whitelist_epoch_id (or which whitelist) +struct ObHBRequest +{ + OB_UNIS_VERSION(1); +public: + ObHBRequest(); + virtual ~ObHBRequest(); + int init( + const common::ObAddr &server, + const uint64_t server_id, + const common::ObAddr &rs_addr, + const share::RSServerStatus rs_server_status, + const int64_t epoch_id); + int assign(const ObHBRequest &other); + bool is_valid() const; + void reset(); + const common::ObAddr &get_server() const + { + return server_; + } + uint64_t get_server_id() const + { + return server_id_; + } + const common::ObAddr &get_rs_addr() const + { + return rs_addr_; + } + share::RSServerStatus get_rs_server_status() const + { + return rs_server_status_; + } + int64_t get_epoch_id() const + { + return epoch_id_; + } + TO_STRING_KV(K_(server), K_(server_id), K_(rs_addr), K_(rs_server_status), K_(epoch_id)); +private: + common::ObAddr server_; + uint64_t server_id_; + common::ObAddr rs_addr_; + 
share::RSServerStatus rs_server_status_; + int64_t epoch_id_; +}; +// servers send heartbeat responses back to heartbeat service +// to report their own zone, address, sql port, build version, start service time and health status. +struct ObHBResponse +{ + OB_UNIS_VERSION(1); +public: + ObHBResponse(); + virtual ~ObHBResponse(); + int init( + const common::ObZone &zone, + const common::ObAddr &server, + const int64_t sql_port, + const ObServerInfoInTable::ObBuildVersion &build_version, + const int64_t start_service_time, + const observer::ObServerHealthStatus server_health_status); + int assign(const ObHBResponse &other); + bool is_valid() const; + void reset(); + const common::ObZone &get_zone() const + { + return zone_; + } + const common::ObAddr &get_server() const + { + return server_; + } + int64_t get_sql_port() const + { + return sql_port_; + } + const share::ObServerInfoInTable::ObBuildVersion &get_build_version() const + { + return build_version_; + } + int64_t get_start_service_time() const + { + return start_service_time_; + } + const observer::ObServerHealthStatus &get_server_health_status() const + { + return server_health_status_; + } + TO_STRING_KV( + K_(zone), + K_(server), + K_(sql_port), + K_(build_version), + K_(start_service_time), + K_(server_health_status)) + +private: + common::ObZone zone_; + common::ObAddr server_; + int64_t sql_port_; // mysql listen port + share::ObServerInfoInTable::ObBuildVersion build_version_; + int64_t start_service_time_; + observer::ObServerHealthStatus server_health_status_; +}; +} // share +} // oceanbase +#endif \ No newline at end of file diff --git a/src/share/ob_rpc_struct.cpp b/src/share/ob_rpc_struct.cpp index a1933b9af..f3bd9a1e6 100644 --- a/src/share/ob_rpc_struct.cpp +++ b/src/share/ob_rpc_struct.cpp @@ -5731,12 +5731,6 @@ void ObBroadcastSchemaArg::reset() schema_version_ = OB_INVALID_VERSION; } -OB_SERIALIZE_MEMBER(ObCheckMergeFinishArg, frozen_scn_); -bool ObCheckMergeFinishArg::is_valid() const -{ - 
return frozen_scn_.is_valid(); -} - OB_SERIALIZE_MEMBER(ObGetRecycleSchemaVersionsArg, tenant_ids_); bool ObGetRecycleSchemaVersionsArg::is_valid() const { @@ -5867,6 +5861,59 @@ OB_SERIALIZE_MEMBER((ObLabelSeComponentDDLArg, ObDDLArg), ddl_type_, schema_, po OB_SERIALIZE_MEMBER((ObLabelSeLabelDDLArg, ObDDLArg), ddl_type_, schema_, policy_name_); OB_SERIALIZE_MEMBER((ObLabelSeUserLevelDDLArg, ObDDLArg), ddl_type_, level_schema_, policy_name_); OB_SERIALIZE_MEMBER(ObCheckServerEmptyArg, mode_, sys_data_version_); +OB_SERIALIZE_MEMBER(ObCheckServerForAddingServerArg, mode_, sys_tenant_data_version_); +int ObCheckServerForAddingServerArg::init(const Mode &mode, const uint64_t sys_tenant_data_version) +{ + int ret = OB_SUCCESS; + mode_ = mode; + sys_tenant_data_version_ = sys_tenant_data_version; + return ret; +} +int ObCheckServerForAddingServerArg::assign(const ObCheckServerForAddingServerArg &other) { + int ret = OB_SUCCESS; + mode_ = other.mode_; + sys_tenant_data_version_ = other.sys_tenant_data_version_; + return ret; +} +OB_SERIALIZE_MEMBER( + ObCheckServerForAddingServerResult, + is_server_empty_, + zone_, + sql_port_, + build_version_); +int ObCheckServerForAddingServerResult::init( + const bool is_server_empty, + const ObZone &zone, + const int64_t sql_port, + const share::ObServerInfoInTable::ObBuildVersion &build_version) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(zone.is_empty() || sql_port <= 0 || build_version.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid arg", KR(ret), K(zone), K(sql_port), K(build_version)); + } else if (OB_FAIL(zone_.assign(zone))) { + LOG_WARN("fail to assign zone", KR(ret), K(zone)); + } else if (OB_FAIL(build_version_.assign(build_version))) { + LOG_WARN("fail to assign build version", KR(ret), K(build_version)); + } else { + is_server_empty_ = is_server_empty; + sql_port_ = sql_port; + } + return ret; +} +int ObCheckServerForAddingServerResult::assign(const ObCheckServerForAddingServerResult &other) +{ + int 
ret = OB_SUCCESS; + if (OB_FAIL(zone_.assign(other.zone_))) { + LOG_WARN("fail to assign zone", KR(ret), K(other.zone_)); + } else if (OB_FAIL(build_version_.assign(other.build_version_))) { + LOG_WARN("fail to assign build_version", KR(ret), K(other.build_version_)); + } else { + is_server_empty_ = other.is_server_empty_; + sql_port_ = other.sql_port_; + } + return ret; +} OB_SERIALIZE_MEMBER(ObCheckDeploymentModeArg, single_zone_deployment_on_); OB_SERIALIZE_MEMBER(ObArchiveLogArg, enable_, tenant_id_, archive_tenant_ids_); @@ -8112,5 +8159,71 @@ int ObRlsContextDDLArg::assign(const ObRlsContextDDLArg &other) OB_SERIALIZE_MEMBER((ObTryAddDepInofsForSynonymBatchArg, ObDDLArg), tenant_id_, synonym_ids_); +OB_SERIALIZE_MEMBER(ObGetServerResourceInfoArg, rs_addr_); + +int ObGetServerResourceInfoArg::init(const common::ObAddr &rs_addr) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!rs_addr.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid rs_addr", KR(ret), K(rs_addr)); + } else { + rs_addr_ = rs_addr; + } + return ret; +} + +int ObGetServerResourceInfoArg::assign(const ObGetServerResourceInfoArg &other) +{ + int ret = OB_SUCCESS; + rs_addr_ = other.rs_addr_; + return ret; +} + +bool ObGetServerResourceInfoArg::is_valid() const +{ + return rs_addr_.is_valid(); +} + +void ObGetServerResourceInfoArg::reset() +{ + rs_addr_.reset(); +} + +OB_SERIALIZE_MEMBER(ObGetServerResourceInfoResult, server_, resource_info_); + +int ObGetServerResourceInfoResult::init( + const common::ObAddr &server, + const share::ObServerResourceInfo &resource_info) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!server.is_valid() || !resource_info.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server or resource_info", KR(ret), K(server), K(resource_info)); + } else { + server_ = server; + resource_info_ = resource_info; + } + return ret; +} + +int ObGetServerResourceInfoResult::assign(const ObGetServerResourceInfoResult &other) +{ + int ret = OB_SUCCESS; + server_ = 
other.server_; + resource_info_ = other.resource_info_; + return ret; +} + +bool ObGetServerResourceInfoResult::is_valid() const +{ + return server_.is_valid() && resource_info_.is_valid(); +} + +void ObGetServerResourceInfoResult::reset() +{ + server_.reset(); + resource_info_.reset(); +} }//end namespace obrpc }//end namepsace oceanbase diff --git a/src/share/ob_rpc_struct.h b/src/share/ob_rpc_struct.h index da029b832..2a485fdaf 100644 --- a/src/share/ob_rpc_struct.h +++ b/src/share/ob_rpc_struct.h @@ -68,6 +68,7 @@ #include "share/config/ob_config.h" // ObConfigArray #include "logservice/palf/log_meta_info.h"//LogConfigVersion #include "share/scn.h"//SCN +#include "share/ob_server_table_operator.h" namespace oceanbase { @@ -6839,20 +6840,6 @@ public: TO_STRING_KV(K_(tenant_id), K_(schema_version)); }; -struct ObCheckMergeFinishArg -{ - OB_UNIS_VERSION(1); -public: - ObCheckMergeFinishArg() - { - frozen_scn_.set_min(); - } - bool is_valid() const; -public: - share::SCN frozen_scn_; - TO_STRING_KV(K_(frozen_scn)); -}; - struct ObGetRecycleSchemaVersionsArg { OB_UNIS_VERSION(1); @@ -7476,6 +7463,70 @@ public: Mode mode_; uint64_t sys_data_version_; }; +struct ObCheckServerForAddingServerArg +{ + OB_UNIS_VERSION(1); +public: + enum Mode { + ADD_SERVER + }; + + ObCheckServerForAddingServerArg(): mode_(ADD_SERVER), sys_tenant_data_version_(0) {} + TO_STRING_KV(K_(mode), K_(sys_tenant_data_version)); + int init(const Mode &mode, const uint64_t sys_tenant_data_version); + int assign(const ObCheckServerForAddingServerArg &other); + Mode get_mode() const + { + return mode_; + } + uint64_t get_sys_tenant_data_version() const + { + return sys_tenant_data_version_; + } +private: + Mode mode_; + uint64_t sys_tenant_data_version_; +}; +struct ObCheckServerForAddingServerResult +{ + OB_UNIS_VERSION(1); +public: + ObCheckServerForAddingServerResult() + : is_server_empty_(false), + zone_(), + sql_port_(0), + build_version_() + { + } + TO_STRING_KV(K_(is_server_empty), K_(zone), 
K_(sql_port), K_(build_version)); + int init( + const bool is_server_empty, + const ObZone &zone, + const int64_t sql_port, + const share::ObServerInfoInTable::ObBuildVersion &build_version); + int assign(const ObCheckServerForAddingServerResult &other); + bool get_is_server_empty() const + { + return is_server_empty_; + } + const ObZone& get_zone() const + { + return zone_; + } + int64_t get_sql_port() const + { + return sql_port_; + } + const share::ObServerInfoInTable::ObBuildVersion& get_build_version() const + { + return build_version_; + } +private: + bool is_server_empty_; + ObZone zone_; + int64_t sql_port_; + share::ObServerInfoInTable::ObBuildVersion build_version_; +}; struct ObArchiveLogArg { @@ -8944,6 +8995,37 @@ public: ObSArray synonym_ids_; }; +struct ObGetServerResourceInfoArg +{ + OB_UNIS_VERSION(1); +public: + ObGetServerResourceInfoArg() : rs_addr_() {} + TO_STRING_KV(K_(rs_addr)); + int init(const common::ObAddr &rs_addr); + int assign(const ObGetServerResourceInfoArg &other); + bool is_valid() const; + void reset(); + const common::ObAddr &get_rs_addr() const { return rs_addr_; } +private: + common::ObAddr rs_addr_; +}; + +struct ObGetServerResourceInfoResult +{ + OB_UNIS_VERSION(1); +public: + ObGetServerResourceInfoResult() : server_(), resource_info_() {} + TO_STRING_KV(K_(server), K_(resource_info)); + int init(const common::ObAddr &server, const share::ObServerResourceInfo &resource_info); + int assign(const ObGetServerResourceInfoResult &other); + bool is_valid() const; + void reset(); + const common::ObAddr &get_server() const { return server_; } + const share::ObServerResourceInfo &get_resource_info() const { return resource_info_; } +private: + common::ObAddr server_; + share::ObServerResourceInfo resource_info_; +}; }//end namespace obrpc }//end namespace oceanbase #endif diff --git a/src/share/ob_server_table_operator.cpp b/src/share/ob_server_table_operator.cpp index 3357fc735..7a5ebcece 100644 --- 
a/src/share/ob_server_table_operator.cpp +++ b/src/share/ob_server_table_operator.cpp @@ -22,6 +22,7 @@ #include "share/ob_dml_sql_splicer.h" #include "common/ob_timeout_ctx.h" #include "rootserver/ob_root_utils.h" +#include "rootserver/ob_heartbeat_service.h" namespace oceanbase { @@ -32,6 +33,184 @@ using namespace rootserver; namespace share { +ObServerInfoInTable::ObServerInfoInTable() +{ + reset(); +} +ObServerInfoInTable::~ObServerInfoInTable() +{ +} +int ObServerInfoInTable::init( + const common::ObAddr &server, + const uint64_t server_id, + const common::ObZone &zone, + const int64_t sql_port, + const bool with_rootserver, + const ObServerStatus::DisplayStatus status, + const ObBuildVersion &build_version, + const int64_t stop_time, + const int64_t start_service_time, + const int64_t last_offline_time) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!server.is_valid() + || OB_INVALID_ID == server_id + || zone.is_empty() + || sql_port <= 0 + || status >= ObServerStatus::OB_DISPLAY_MAX + || build_version.is_empty() + || stop_time < 0 + || start_service_time < 0 + || (0 == last_offline_time && ObServerStatus::OB_SERVER_INACTIVE == status) + || (last_offline_time > 0 && ObServerStatus::OB_SERVER_ACTIVE == status))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(server_id), K(zone), K(sql_port), + K(with_rootserver), K(status), K(build_version), K(stop_time), K(start_service_time), + K(last_offline_time)); + } else { + if (OB_FAIL(zone_.assign(zone))) { + LOG_WARN("fail to assign zone", KR(ret), K(zone)); + } else if (OB_FAIL(build_version_.assign(build_version))) { + LOG_WARN("fail to assign build_version", KR(ret), K(build_version)); + } else { + server_ = server; + server_id_ = server_id; + sql_port_ = sql_port; + with_rootserver_ = with_rootserver; + status_ = status; + stop_time_ = stop_time; + start_service_time_ = start_service_time; + block_migrate_in_time_ = 0; + last_offline_time_ = last_offline_time; + } + } + 
return ret; +} +int ObServerInfoInTable::assign(const ObServerInfoInTable &other) +{ + int ret = OB_SUCCESS; + if (OB_FAIL(zone_.assign(other.zone_))) { + LOG_WARN("fail to assign zone", KR(ret), K(other.zone_)); + } else if (OB_FAIL(build_version_.assign(other.build_version_))) { + LOG_WARN("fail to assign build version", KR(ret), K(other.build_version_)); + } else { + server_ = other.server_; + server_id_ = other.server_id_; + sql_port_ = other.sql_port_; + with_rootserver_ = other.with_rootserver_; + status_ = other.status_; + stop_time_ = other.stop_time_; + start_service_time_ = other.start_service_time_; + block_migrate_in_time_ = other.block_migrate_in_time_; + last_offline_time_ = other.last_offline_time_; + } + return ret; +} +bool ObServerInfoInTable::is_valid() const +{ + return server_.is_valid() + && OB_INVALID_ID != server_id_ + && !zone_.is_empty() + && sql_port_ > 0 + && status_ < ObServerStatus::OB_DISPLAY_MAX + && !build_version_.is_empty() + && stop_time_ >= 0 + && start_service_time_ >= 0 + && ((0 == last_offline_time_ && ObServerStatus::OB_SERVER_INACTIVE != status_) + || (last_offline_time_ > 0 && ObServerStatus::OB_SERVER_ACTIVE != status_)); +} +void ObServerInfoInTable::reset() +{ + server_.reset(); + server_id_ = OB_INVALID_ID; + zone_.reset(); + sql_port_ = 0; + with_rootserver_ = false; + status_ = ObServerStatus::OB_DISPLAY_MAX; + build_version_.reset(); + stop_time_ = 0; + start_service_time_ = 0; + block_migrate_in_time_ = 0; + last_offline_time_ = 0; +} +int ObServerInfoInTable::build_server_info_in_table(const share::ObServerStatus &server_status) +{ + int ret = OB_SUCCESS; + ObServerInfoInTable::ObBuildVersion build_version; + reset(); + if (OB_UNLIKELY(!server_status.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server status", KR(ret), K(server_status)); + } else if (OB_FAIL(build_version.assign(server_status.build_version_))) { + LOG_WARN("fail to assign build_version", KR(ret), 
K(server_status.build_version_)); + } else if(OB_FAIL(init( + server_status.server_, + server_status.get_server_id(), + server_status.zone_, + server_status.sql_port_, + server_status.with_rootserver_, + server_status.get_display_status(), + build_version, + server_status.stop_time_, + server_status.start_service_time_, + server_status.last_offline_time_))) { + LOG_WARN("fail to build server_info_in_table", KR(ret), K(server_status), K(build_version)); + } + return ret; +} +int ObServerInfoInTable::build_server_status(share::ObServerStatus &server_status) const +{ + int ret = OB_SUCCESS; + const int64_t now = ::oceanbase::common::ObTimeUtility::current_time(); + server_status.reset(); + if (OB_UNLIKELY(!is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid server info", KR(ret), KPC(this)); + } else if (OB_FAIL(server_status.zone_.assign(zone_))) { + LOG_WARN("fail to assign zone", KR(ret), K(zone_)); + } else { + server_status.server_ = server_; + server_status.id_ = server_id_; + server_status.sql_port_ = sql_port_; + server_status.with_rootserver_ = with_rootserver_; + strncpy(server_status.build_version_, build_version_.ptr(), OB_SERVER_VERSION_LENGTH); + server_status.stop_time_ = stop_time_; + server_status.start_service_time_ = start_service_time_; + server_status.last_offline_time_ = last_offline_time_; + if (ObServerStatus::OB_SERVER_DELETING == status_) { + server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_DELETING; + } else { + server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_NORMAL; + } + if (0 == last_offline_time_) { + server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_ALIVE; + } else if (now - last_offline_time_ >= GCONF.server_permanent_offline_time - GCONF.lease_time) { + // last_offline_time = last_hb_time + GCONF.lease_time + server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE; + } else { + server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED; + } + } + return ret; +} 
+bool ObServerInfoInTable::is_permanent_offline() const +{ + const int64_t now = ::oceanbase::common::ObTimeUtility::current_time(); + bool is_permanent_offline = false; + if (last_offline_time_ > 0) { + int64_t last_hb_time = last_offline_time_ - GCONF.lease_time; + is_permanent_offline = (now - last_hb_time >= GCONF.server_permanent_offline_time); + } + return is_permanent_offline; +} +bool ObServerInfoInTable::is_temporary_offline() const +{ + bool is_temporary_offline = false; + if (last_offline_time_ > 0 && !is_permanent_offline()) { + is_temporary_offline = true; + } + return is_temporary_offline; +} ObServerTableOperator::ObServerTableOperator() : inited_(false), proxy_(NULL) @@ -47,56 +226,63 @@ int ObServerTableOperator::init(common::ObISQLClient *proxy) int ret = OB_SUCCESS; if (inited_) { ret = OB_INIT_TWICE; - LOG_WARN("init twice", K(ret)); + LOG_WARN("init twice", KR(ret)); } else { proxy_ = proxy; inited_ = true; } return ret; } - +int ObServerTableOperator::get(common::ObIArray &server_statuses) +{ + int ret = OB_SUCCESS; + if (OB_UNLIKELY(!inited_)) { + ret = OB_NOT_INIT; + LOG_WARN("not init", KR(ret), K(inited_)); + } else if (OB_ISNULL(proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("proxy_ is null", KR(ret), KP(proxy_)); + } else if (OB_FAIL(get(*proxy_, server_statuses))) { + LOG_WARN("fail to get", KR(ret), KP(proxy_)); + } + return ret; +} int ObServerTableOperator::get( + common::ObISQLClient &sql_proxy, common::ObIArray &server_statuses) { int ret = OB_SUCCESS; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); + server_statuses.reset(); + ObSqlString sql; + ObTimeoutCtx ctx; + if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else if (OB_FAIL(sql.assign_fmt("SELECT *, time_to_usec(gmt_modified) AS last_hb_time " + "FROM %s", OB_ALL_SERVER_TNAME))) { + LOG_WARN("sql assign_fmt failed", KR(ret)); } else { - server_statuses.reset(); - ObSqlString sql; 
- ObTimeoutCtx ctx; - if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { - LOG_WARN("fail to get timeout ctx", K(ret), K(ctx)); - } else if (OB_FAIL(sql.assign_fmt("SELECT time_to_usec(gmt_modified) AS last_hb_time, " - "id, zone, svr_ip, svr_port, inner_port, status, with_rootserver, " - "block_migrate_in_time, build_version, stop_time, start_service_time, with_partition " - "FROM %s", OB_ALL_SERVER_TNAME))) { - LOG_WARN("sql assign_fmt failed", K(ret)); - } else { - SMART_VAR(ObMySQLProxy::MySQLResult, res) { - ObMySQLResult *result = NULL; - if (OB_FAIL(proxy_->read(res, OB_SYS_TENANT_ID, sql.ptr()))) { - LOG_WARN("execute sql failed", K(sql), K(ret)); - } else if (NULL == (result = res.get_result())) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("execute sql failed", K(sql), K(ret)); - } else { - ObServerStatus server_status; - while (OB_SUCC(ret)) { - server_status.reset(); - if (OB_FAIL(result->next())) { - if (OB_ITER_END != ret) { - LOG_WARN("result next failed", K(ret)); - } else { - ret = OB_SUCCESS; - break; - } - } else if (OB_FAIL(build_server_status(*result, server_status))) { - LOG_WARN("build server status failed", K(ret)); - } else if (OB_FAIL(server_statuses.push_back(server_status))) { - LOG_WARN("build server_status failed", K(ret)); + SMART_VAR(ObMySQLProxy::MySQLResult, res) { + ObMySQLResult *result = NULL; + if (OB_FAIL(sql_proxy.read(res, OB_SYS_TENANT_ID, sql.ptr()))) { + LOG_WARN("execute sql failed", K(sql), KR(ret)); + } else if (NULL == (result = res.get_result())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("execute sql failed", K(sql), KR(ret)); + } else { + ObServerStatus server_status; + while (OB_SUCC(ret)) { + server_status.reset(); + if (OB_FAIL(result->next())) { + if (OB_ITER_END != ret) { + LOG_WARN("result next failed", KR(ret)); + } else { + ret = OB_SUCCESS; + break; } + } else if (OB_FAIL(build_server_status(*result, server_status))) { + LOG_WARN("build server status failed", KR(ret)); + } else if 
(OB_FAIL(server_statuses.push_back(server_status))) { + LOG_WARN("build server_status failed", KR(ret)); } } } @@ -112,10 +298,7 @@ int ObServerTableOperator::remove(const common::ObAddr &server, common::ObMySQLT int64_t affected_rows = 0; char ip_buf[OB_MAX_SERVER_ADDR_SIZE] = ""; ObTimeoutCtx ctx; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else if (!server.is_valid()) { + if (!server.is_valid()) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid server", K(server), K(ret)); } else if (false == server.ip_to_string(ip_buf, sizeof(ip_buf))) { @@ -150,44 +333,19 @@ int ObServerTableOperator::remove(const common::ObAddr &server, common::ObMySQLT int ObServerTableOperator::update(const ObServerStatus &server_status) { int ret = OB_SUCCESS; - char svr_ip[OB_MAX_SERVER_ADDR_SIZE] = ""; - const char *display_status_str = NULL; + ObDMLSqlSplicer dml; if (!inited_) { ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); + LOG_WARN("not init", KR(ret)); } else if (!server_status.is_valid()) { ret = OB_INVALID_ARGUMENT; - LOG_WARN("invalid server status", K(server_status), K(ret)); - } else if (false == server_status.server_.ip_to_string(svr_ip, sizeof(svr_ip))) { - ret = OB_INVALID_ARGUMENT; - LOG_WARN("convert server ip to string failed", "server", server_status.server_, K(ret)); - } else if (OB_FAIL(ObServerStatus::display_status_str( - server_status.get_display_status(), display_status_str))) { - LOG_WARN("get display status str failed", K(ret), - "display_status", server_status.get_display_status()); - } else if (NULL == display_status_str) { - ret = OB_ERR_UNEXPECTED; - LOG_WARN("null display status string", K(ret)); + LOG_WARN("invalid server status", KR(ret), K(server_status)); + } else if (OB_FAIL(insert_dml_builder(server_status, dml))) { + LOG_WARN("fail to build insert dml", KR(ret), K(server_status)); } else { - const int64_t modify_time = server_status.last_hb_time_; - ObDMLSqlSplicer dml; ObTimeoutCtx ctx; if 
(OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { LOG_WARN("fail to get timeout ctx", K(ret), K(ctx)); - } else if (OB_FAIL(dml.add_pk_column(K(svr_ip))) - || OB_FAIL(dml.add_pk_column("svr_port", server_status.server_.get_port())) - || OB_FAIL(dml.add_column("id", server_status.id_)) - || OB_FAIL(dml.add_column("zone", server_status.zone_.ptr())) - || OB_FAIL(dml.add_column("inner_port", server_status.sql_port_)) - || OB_FAIL(dml.add_column(OBJ_K(server_status, with_rootserver))) - || OB_FAIL(dml.add_column(OBJ_K(server_status, block_migrate_in_time))) - || OB_FAIL(dml.add_column(OBJ_K(server_status, start_service_time))) - || OB_FAIL(dml.add_column("status", display_status_str)) - || OB_FAIL(dml.add_column(OBJ_K(server_status, build_version))) - || OB_FAIL(dml.add_column(OBJ_K(server_status, stop_time))) - || OB_FAIL(dml.add_column(OBJ_K(server_status, with_partition))) - || OB_FAIL(dml.add_gmt_modified(modify_time))) { - LOG_WARN("add column failed", K(ret)); } else { ObDMLExecHelper exec(*proxy_, OB_SYS_TENANT_ID); int64_t affected_rows = 0; @@ -199,7 +357,6 @@ int ObServerTableOperator::update(const ObServerStatus &server_status) } } } - if (OB_SUCC(ret)) { LOG_INFO("update server status in all_server table succeed", K(server_status)); } @@ -242,10 +399,11 @@ int ObServerTableOperator::reset_rootserver(const ObAddr &except) return ret; } -int ObServerTableOperator::update_status(const common::ObAddr &server, - const ObServerStatus::DisplayStatus status, - const int64_t last_hb_time, - common::ObMySQLTransaction &trans) +int ObServerTableOperator::update_status( + const common::ObAddr &server, + const ObServerStatus::DisplayStatus status, + const int64_t last_hb_time, + common::ObMySQLTransaction &trans) { int ret = OB_SUCCESS; char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; @@ -333,8 +491,9 @@ int ObServerTableOperator::update_stop_time(const ObAddr &server, return ret; } -int ObServerTableOperator::build_server_status(const ObMySQLResult &res, - ObServerStatus 
&server_status) const +int ObServerTableOperator::build_server_status( + const ObMySQLResult &res, + ObServerStatus &server_status) { int ret = OB_SUCCESS; server_status.reset(); @@ -345,19 +504,16 @@ int ObServerTableOperator::build_server_status(const ObMySQLResult &res, char svr_status[OB_SERVER_STATUS_LENGTH] = ""; int64_t with_rootserver = 0; int64_t with_partition = 0; - if (!inited_) { - ret = OB_NOT_INIT; - LOG_WARN("not init", K(ret)); - } else { + if (OB_SUCC(ret)) { EXTRACT_INT_FIELD_MYSQL(res, "last_hb_time", last_hb_time, int64_t); EXTRACT_INT_FIELD_MYSQL(res, "id", server_status.id_, uint64_t); EXTRACT_STRBUF_FIELD_MYSQL(res, "zone", server_status.zone_.ptr(), - MAX_ZONE_LENGTH, tmp_real_str_len); + MAX_ZONE_LENGTH, tmp_real_str_len); EXTRACT_STRBUF_FIELD_MYSQL(res, "svr_ip", svr_ip, OB_IP_STR_BUFF, tmp_real_str_len); EXTRACT_INT_FIELD_MYSQL(res, "svr_port", svr_port, int64_t); EXTRACT_INT_FIELD_MYSQL(res, "inner_port", server_status.sql_port_, int64_t); EXTRACT_STRBUF_FIELD_MYSQL(res, "status", svr_status, - OB_SERVER_STATUS_LENGTH, tmp_real_str_len); + OB_SERVER_STATUS_LENGTH, tmp_real_str_len); EXTRACT_INT_FIELD_MYSQL(res, "with_rootserver", with_rootserver, int64_t); EXTRACT_INT_FIELD_MYSQL(res, "block_migrate_in_time", server_status.block_migrate_in_time_, int64_t); @@ -365,61 +521,66 @@ int ObServerTableOperator::build_server_status(const ObMySQLResult &res, server_status.start_service_time_, int64_t); EXTRACT_STRBUF_FIELD_MYSQL(res, "build_version", server_status.build_version_, - OB_SERVER_VERSION_LENGTH, tmp_real_str_len); + OB_SERVER_VERSION_LENGTH, tmp_real_str_len); EXTRACT_INT_FIELD_MYSQL(res, "stop_time", server_status.stop_time_, int64_t); EXTRACT_INT_FIELD_MYSQL(res, "with_partition", with_partition, int64_t); + EXTRACT_INT_FIELD_MYSQL_WITH_DEFAULT_VALUE( + res, "last_offline_time", server_status.last_offline_time_, int64_t, + false/*skip_null_error*/, true/*skip_column_error*/, 0/*invalid_default_value*/); (void) tmp_real_str_len; // 
make compiler happy server_status.server_.set_ip_addr(svr_ip, static_cast(svr_port)); server_status.with_rootserver_ = static_cast(with_rootserver); server_status.with_partition_ = static_cast(with_partition); ObServerStatus::DisplayStatus display_status = ObServerStatus::OB_DISPLAY_MAX; - if (OB_FAIL(ret)) { - } else if (OB_FAIL(ObServerStatus::str2display_status(svr_status, display_status))) { + const int64_t now = ::oceanbase::common::ObTimeUtility::current_time(); + if (FAILEDx(ObServerStatus::str2display_status(svr_status, display_status))) { LOG_WARN("string to display status failed", K(ret), K(svr_status)); } else if (display_status < 0 || display_status >= ObServerStatus::OB_DISPLAY_MAX) { ret = OB_INVALID_SERVER_STATUS; LOG_WARN("invalid display status", K(svr_status), K(ret)); } else { LOG_INFO("svr_status", K(svr_status), K(display_status)); - if (ObServerStatus::OB_SERVER_DELETING == display_status) { - server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_DELETING; - } else if (ObServerStatus::OB_SERVER_TAKENOVER_BY_RS == display_status) { - server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_TAKENOVER_BY_RS; - } else { - server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_NORMAL; - } - // set server heartbeat status if (ObServerStatus::OB_SERVER_ACTIVE == display_status) { - const int64_t now = ::oceanbase::common::ObTimeUtility::current_time(); + server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_NORMAL; server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_ALIVE; server_status.last_hb_time_ = now; server_status.lease_expire_time_ = now + ObLeaseRequest::SERVICE_LEASE; - } else if (ObServerStatus::OB_SERVER_TAKENOVER_BY_RS == display_status) { - server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE; - server_status.last_hb_time_ = last_hb_time; - server_status.lease_expire_time_ = 1; } else if (ObServerStatus::OB_SERVER_DELETING == display_status) { - const int64_t now = 
::oceanbase::common::ObTimeUtility::current_time(); + // Assumption: there is no deleting server while upgrading system to V4.2 + server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_DELETING; + if (0 == server_status.last_offline_time_) { + server_status.last_hb_time_ = now; + server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_ALIVE; + server_status.lease_expire_time_ = now + ObLeaseRequest::SERVICE_LEASE; + } else { + server_status.last_hb_time_ = server_status.last_offline_time_ - GCONF.lease_time; + server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED; + if (now - server_status.last_hb_time_ > GCONF.server_permanent_offline_time) { + server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE; + } + } + } else if (ObServerStatus::OB_SERVER_INACTIVE == display_status) { + server_status.admin_status_ = ObServerStatus::OB_SERVER_ADMIN_NORMAL; server_status.last_hb_time_ = last_hb_time; server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED; if (now - last_hb_time > GCONF.server_permanent_offline_time) { server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE; } - } else { // ObServerStatus::OB_SERVER_INACTIVE - server_status.last_hb_time_ = last_hb_time; - int64_t now = ::oceanbase::common::ObTimeUtility::current_time(); - server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_LEASE_EXPIRED; - if (now - last_hb_time > GCONF.server_permanent_offline_time) { - server_status.hb_status_ = ObServerStatus::OB_HEARTBEAT_PERMANENT_OFFLINE; + if (0 == server_status.last_offline_time_) { + server_status.last_offline_time_ = server_status.last_hb_time_ + GCONF.lease_time; } + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unknown display status", K(display_status), K(svr_status), K(server_status.server_)); } } } return ret; } -int ObServerTableOperator::update_with_partition(const common::ObAddr &server, +int ObServerTableOperator::update_with_partition( + const common::ObAddr &server, bool 
with_partition) { int ret = OB_SUCCESS; @@ -502,7 +663,391 @@ int ObServerTableOperator::get_start_service_time( } return ret; } +int ObServerTableOperator::get( + common::ObISQLClient &sql_proxy, + common::ObIArray &all_servers_info_in_table) +{ + int ret = OB_SUCCESS; + ObArray server_statuses; + ObServerInfoInTable server_info_in_table; + all_servers_info_in_table.reset(); + if (OB_FAIL(get(sql_proxy, server_statuses))) { + LOG_WARN("fail to get server status", KR(ret)); + } else { + ARRAY_FOREACH_X(server_statuses, idx, cnt, OB_SUCC(ret)) { + server_info_in_table.reset(); + if (OB_FAIL(server_info_in_table.build_server_info_in_table(server_statuses.at(idx)))) { + LOG_WARN("fail to build server info in table", KR(ret), K(server_statuses.at(idx))); + } else if (OB_FAIL(all_servers_info_in_table.push_back(server_info_in_table))) { + LOG_WARN("fail to push element into all_servers_info_in_table", KR(ret), K(server_info_in_table)); + } else {} + } + } + return ret; +} +int ObServerTableOperator::get( + common::ObISQLClient &sql_proxy, + const common::ObAddr &server, + ObServerInfoInTable &server_info_in_table) +{ + int ret = OB_SUCCESS; + ObSqlString sql; + ObTimeoutCtx ctx; + char svr_ip[OB_IP_STR_BUFF] = ""; + server_info_in_table.reset(); + if (OB_UNLIKELY(!server.is_valid() || !server.ip_to_string(svr_ip, sizeof(svr_ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server)); + } else if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else if (OB_FAIL(sql.assign_fmt("SELECT *, time_to_usec(gmt_modified) AS last_hb_time " + "FROM %s WHERE svr_ip = '%s' AND svr_port = %d", OB_ALL_SERVER_TNAME, svr_ip, server.get_port()))) { + LOG_WARN("fail to append sql", KR(ret), K(server)); + } else { + SMART_VAR(ObMySQLProxy::MySQLResult, res) { + int tmp_ret = OB_SUCCESS; + ObMySQLResult *result = NULL; + ObServerStatus server_status; + if (OB_FAIL(sql_proxy.read(res, 
OB_SYS_TENANT_ID, sql.ptr()))) { + LOG_WARN("fail to execute sql", KR(ret), K(sql)); + } else if (OB_ISNULL(result = res.get_result())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("fail to get sql result", KR(ret), K(sql)); + } else if (OB_FAIL(result->next())) { + if (OB_ITER_END == ret) { + ret = OB_SERVER_NOT_IN_WHITE_LIST; + } + LOG_WARN("fail to get next", KR(ret), K(sql));; + } else if (OB_FAIL(build_server_status(*result, server_status))) { + LOG_WARN("fail to build server_status",KR(ret)); + } else if (OB_FAIL(server_info_in_table.build_server_info_in_table(server_status))) { + LOG_WARN("fail to build server_info_in_table", KR(ret), K(server_status)); + } + if (OB_SUCC(ret) && (OB_ITER_END != (tmp_ret = result->next()))) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("get more row than one", KR(ret), KR(tmp_ret), K(sql)); + } + } + } + LOG_INFO("get server from table", KR(ret), K(server), K(server_info_in_table)); + return ret; +} +int ObServerTableOperator::insert( + common::ObISQLClient &sql_proxy, + const ObServerInfoInTable &server_info_in_table) +{ + int ret = OB_SUCCESS; + ObDMLSqlSplicer dml; + ObServerStatus server_status; + if (OB_UNLIKELY(!server_info_in_table.is_valid())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server_info_in_table)); + } else if (OB_FAIL(server_info_in_table.build_server_status(server_status))) { + LOG_WARN("fail to build server status", KR(ret), K(server_info_in_table)); + } else if (OB_FAIL(insert_dml_builder(server_status, dml))) { + LOG_WARN("fail to build insert dml", KR(ret), K(server_info_in_table)); + } else { + ObTimeoutCtx ctx; + if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx)); + } else { + ObDMLExecHelper exec(sql_proxy, OB_SYS_TENANT_ID); + int64_t affected_rows = 0; + if (OB_FAIL(exec.exec_insert(OB_ALL_SERVER_TNAME, dml, affected_rows))) { + LOG_WARN("fail to exec update", KR(ret), K(server_info_in_table)); + } else if 
(is_zero_row(affected_rows)) { + ret = OB_NEED_RETRY; + LOG_WARN("no affected rows, please retry the operation or " + "check the table to see if it has been inserted already", + KR(ret), K(affected_rows), K(server_info_in_table)); + } else if (is_single_row(affected_rows)) { + LOG_INFO("insert one row into the table successfully", KR(ret), K(affected_rows), + K(server_info_in_table)); + } else { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("unexpected error appears, more than one affected row", + KR(ret), K(affected_rows), K(server_info_in_table)); + } + } + } + return ret; +} +int ObServerTableOperator::update_status( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const ObServerStatus::DisplayStatus old_status, + const ObServerStatus::DisplayStatus new_status) +{ + int ret = OB_SUCCESS; + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + const char *old_display_status_str = NULL; + const char *new_display_status_str = NULL; + if (OB_UNLIKELY(!server.is_valid() || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server)); + } else if (OB_FAIL(ObServerStatus::display_status_str(old_status, old_display_status_str))) { + LOG_WARN("get display status string failed", KR(ret), K(old_status)); + } else if (OB_FAIL(ObServerStatus::display_status_str(new_status, new_display_status_str))) { + LOG_WARN("get display status string failed", KR(ret), K(new_status)); + } else if (OB_ISNULL(old_display_status_str) || OB_ISNULL(new_display_status_str)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("there exists null display_status_str", KR(ret), KP(old_display_status_str), KP(new_display_status_str)); + } else { + ObSqlString sql; + if (OB_FAIL(sql.assign_fmt( + "UPDATE %s SET status = '%s' WHERE svr_ip = '%s' AND svr_port = %d " + "AND status = '%s'", OB_ALL_SERVER_TNAME, new_display_status_str, ip, server.get_port(), + old_display_status_str))) { + LOG_WARN("fail to assign fmt", KR(ret), K(ip), K(server.get_port())); + } 
else if (OB_FAIL(exec_write(trans, sql, false /* is_multi_rows_affected */))) { + LOG_WARN("fail to update the table", KR(ret), K(sql)); + } else {} + } + return ret; +} +int ObServerTableOperator::update_with_rootserver( + ObMySQLTransaction &trans, + const common::ObAddr &server) +{ + int ret = OB_SUCCESS; + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + if (OB_UNLIKELY(!server.is_valid() || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server)); + } else { + ObSqlString sql; + if (OB_FAIL(sql.assign_fmt( + "UPDATE %s SET WITH_ROOTSERVER = (CASE WHEN (SVR_IP, SVR_PORT) = ('%s', %d) THEN 1 ELSE 0 END)", + OB_ALL_SERVER_TNAME, ip, server.get_port()))) { + LOG_WARN("fail to assign fmt", KR(ret)); + } else if (OB_FAIL(exec_write(trans, sql, true /* is_multi_rows_affected*/))) { + LOG_WARN("fail to update the table", KR(ret), K(sql)); + } else {} + } + return ret; +} +int ObServerTableOperator::update_build_version( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const ObServerInfoInTable::ObBuildVersion &old_build_version, + const ObServerInfoInTable::ObBuildVersion &new_build_version) +{ + int ret = OB_SUCCESS; + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + if (OB_UNLIKELY(new_build_version.is_empty() + || old_build_version.is_empty() + || !server.is_valid() + || !server.ip_to_string(ip, sizeof(ip)) + || old_build_version == new_build_version)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(new_build_version), K(old_build_version)); + } else { + ObSqlString sql; + if (OB_FAIL(sql.assign_fmt( + "UPDATE %s SET build_version = '%s' " + "WHERE svr_ip = '%s' AND svr_port = %d AND build_version = '%s'", + OB_ALL_SERVER_TNAME, new_build_version.ptr(), ip, server.get_port(), old_build_version.ptr()))) { + LOG_WARN("fail to assign fmt", KR(ret)); + } else if (OB_FAIL(exec_write(trans, sql, false /* is_multi_rows_affected*/))) { + LOG_WARN("fail to update the table", 
KR(ret), K(sql)); + } else {} + } + return ret; +} +int ObServerTableOperator::update_start_service_time( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const int64_t old_start_service_time, + const int64_t new_start_service_time) +{ + int ret = OB_SUCCESS; + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + if (OB_UNLIKELY(new_start_service_time < 0 + || old_start_service_time < 0 + || !server.is_valid() + || !server.ip_to_string(ip, sizeof(ip)) + || old_start_service_time == new_start_service_time)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(new_start_service_time), + K(old_start_service_time)); + } else { + ObSqlString sql; + if (OB_FAIL(sql.assign_fmt( + "UPDATE %s SET start_service_time = %ld " + "WHERE svr_ip = '%s' AND svr_port = %d AND start_service_time = %ld", + OB_ALL_SERVER_TNAME, new_start_service_time, ip, server.get_port(), old_start_service_time))) { + LOG_WARN("fail to assign fmt", KR(ret)); + } else if (OB_FAIL(exec_write(trans, sql, false /* is_multi_rows_affected */))) { + LOG_WARN("fail to update the table", KR(ret), K(sql)); + } else {} + } + return ret; +} +int ObServerTableOperator::update_table_for_offline_to_online_server( + ObMySQLTransaction &trans, + const bool is_deleting, + const common::ObAddr &server) +{ + int ret = OB_SUCCESS; + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + if (OB_UNLIKELY(!server.is_valid() + || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server)); + } else { + ObSqlString sql; + if (OB_FAIL(sql.assign_fmt( + "UPDATE %s SET last_offline_time = 0%s" + " WHERE svr_ip = '%s' AND svr_port = %d AND status != 'ACTIVE'", + OB_ALL_SERVER_TNAME, (is_deleting ? 
"" : ", status='ACTIVE'"), + ip, server.get_port()))) { + LOG_WARN("fail to assign fmt", KR(ret)); + } else if (OB_FAIL(exec_write(trans, sql, false /* is_multi_rows_affected */))) { + LOG_WARN("fail to update the table", KR(ret), K(sql)); + } else {} + } + return ret; +} +int ObServerTableOperator::update_table_for_online_to_offline_server( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const bool is_deleting, + int64_t last_offline_time) +{ + int ret = OB_SUCCESS; + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + if (OB_UNLIKELY(last_offline_time <= 0 + || !server.is_valid() + || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server), K(last_offline_time)); + } else { + ObSqlString sql; + if (OB_FAIL(sql.assign_fmt( + "UPDATE %s SET last_offline_time = %ld, start_service_time = 0%s" + " WHERE svr_ip = '%s' AND svr_port = %d AND status != 'INACTIVE'", + OB_ALL_SERVER_TNAME, last_offline_time, (is_deleting ? "" : ", status='INACTIVE'"), + ip, server.get_port()))) { + LOG_WARN("fail to assign fmt", KR(ret)); + } else if (OB_FAIL(exec_write(trans, sql, false /* is_multi_rows_affected */))) { + LOG_WARN("fail to update the table", KR(ret), K(sql)); + } else {} + } + return ret; +} +int ObServerTableOperator::update_stop_time( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const int64_t old_stop_time, + const int64_t new_stop_time) +{ + int ret = OB_SUCCESS; + const int64_t now = ObTimeUtility::current_time(); + char ip[OB_MAX_SERVER_ADDR_SIZE] = ""; + if (OB_UNLIKELY(!server.is_valid() + || !server.ip_to_string(ip, sizeof(ip)))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(server)); + } else if (old_stop_time == new_stop_time + || old_stop_time < 0 + || new_stop_time < 0 + || (old_stop_time > 0 && new_stop_time > 0)) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(old_stop_time), K(new_stop_time)); + } else { + ObSqlString sql; 
+    if (OB_FAIL(sql.assign_fmt(
+        "UPDATE %s SET stop_time = %ld "
+        "WHERE svr_ip = '%s' AND svr_port = %d AND stop_time = %ld",
+        OB_ALL_SERVER_TNAME, new_stop_time, ip, server.get_port(), old_stop_time))) {
+      LOG_WARN("assign fmt failed", K(ret));
+    } else if (OB_FAIL(exec_write(trans, sql, false /* is_multi_rows_affected */))) {
+      LOG_WARN("fail to update the table", KR(ret), K(sql));
+    } else {}
+  }
+  return ret;
+}
+int ObServerTableOperator::insert_dml_builder(
+    const ObServerStatus &server_status,
+    ObDMLSqlSplicer &dml)
+{
+  int ret = OB_SUCCESS;
+  char svr_ip[OB_MAX_SERVER_ADDR_SIZE] = "";
+  const char *display_status_str = NULL;
+  dml.reset();
+  if (OB_UNLIKELY(!server_status.is_valid()
+      || !server_status.server_.ip_to_string(svr_ip, sizeof(svr_ip)))) {
+    ret = OB_INVALID_ARGUMENT;
+    LOG_WARN("invalid argument", KR(ret), K(server_status));
+  } else if (OB_FAIL(ObServerStatus::display_status_str(
+      server_status.get_display_status(), display_status_str))) {
+    LOG_WARN("fail to get display status str", KR(ret),
+        "display_status", server_status.get_display_status());
+  } else if (OB_ISNULL(display_status_str)) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("null display status string", KR(ret), KP(display_status_str));
+  } else {
+    if (OB_FAIL(dml.add_pk_column(K(svr_ip)))
+        || OB_FAIL(dml.add_pk_column("svr_port", server_status.server_.get_port()))
+        || OB_FAIL(dml.add_column("id", server_status.get_server_id()))
+        || OB_FAIL(dml.add_column("zone", server_status.zone_.ptr()))
+        || OB_FAIL(dml.add_column("inner_port", server_status.sql_port_))
+        || OB_FAIL(dml.add_column("block_migrate_in_time", server_status.block_migrate_in_time_))
+        || OB_FAIL(dml.add_column("with_partition", true))
+        || OB_FAIL(dml.add_column("with_rootserver", server_status.with_rootserver_))
+        || OB_FAIL(dml.add_column("start_service_time", server_status.start_service_time_))
+        || OB_FAIL(dml.add_column("status", display_status_str))
+        || OB_FAIL(dml.add_column("build_version", server_status.build_version_))
+        || OB_FAIL(dml.add_column("stop_time", server_status.stop_time_))) {
+      LOG_WARN("fail to add column", KR(ret));
+    } else {
+      if (!ObHeartbeatService::is_service_enabled()) { // the old logic
+        if (OB_FAIL(dml.add_gmt_modified(server_status.last_hb_time_))) {
+          LOG_WARN("fail to add gmt_modified", KR(ret), K(server_status.last_hb_time_));
+        }
+      } else { // in version 4.2, we add a new column last_offline_time
+        const ObServerStatus::DisplayStatus display_status = server_status.get_display_status();
+        const int64_t last_offline_time = server_status.last_offline_time_;
+        if ((0 == last_offline_time && ObServerStatus::OB_SERVER_INACTIVE == display_status)
+            || (last_offline_time > 0 && ObServerStatus::OB_SERVER_ACTIVE == display_status)) {
+          ret = OB_INVALID_ARGUMENT;
+          LOG_WARN("invalid last_offline_time", KR(ret), K(last_offline_time), K(display_status));
+        } else if (OB_FAIL(dml.add_column("last_offline_time", server_status.last_offline_time_))) {
+          LOG_WARN("fail to add last_offline_time", KR(ret));
+        }
+      }
+    }
+  }
+  return ret;
+}
+int ObServerTableOperator::exec_write(
+    ObMySQLTransaction &trans,
+    ObSqlString &sql,
+    const bool is_multi_rows_affected)
+{
+  int ret = OB_SUCCESS;
+  int64_t affected_rows = 0;
+  ObTimeoutCtx ctx;
+  if (OB_UNLIKELY(!sql.is_valid())) {
+    ret = OB_INVALID_ARGUMENT;
+    LOG_WARN("invalid argument", KR(ret), K(sql));
+  } else if (OB_FAIL(ObRootUtils::get_rs_default_timeout_ctx(ctx))) {
+    LOG_WARN("fail to get timeout ctx", KR(ret), K(ctx));
+  } else if (OB_FAIL(trans.write(sql.ptr(), affected_rows))) {
+    LOG_WARN("fail to execute sql", KR(ret), K(sql));
+  } else if (is_zero_row(affected_rows)) {
+    ret = OB_NEED_RETRY;
+    LOG_WARN("no affected rows, please retry the operation or "
+        "check the table to see if it has been updated already",
+        KR(ret), K(affected_rows), K(sql));
+  } else if (!is_multi_rows_affected && !is_single_row(affected_rows)) {
+    ret = OB_ERR_UNEXPECTED;
+    LOG_WARN("unexpected error 
appears, more than one affected row", + KR(ret), K(affected_rows), K(sql)); + } else {} + LOG_INFO("update __all_server table", KR(ret), K(affected_rows), K(sql)); + return ret; +} }//end namespace rootserver -}//end namespace oceanbase - +}//end namespace oceanbase \ No newline at end of file diff --git a/src/share/ob_server_table_operator.h b/src/share/ob_server_table_operator.h index f93944fb1..2d4a12dc3 100644 --- a/src/share/ob_server_table_operator.h +++ b/src/share/ob_server_table_operator.h @@ -17,7 +17,6 @@ #include "lib/net/ob_addr.h" #include "lib/time/ob_time_utility.h" #include "common/ob_zone.h" -#include "share/ob_lease_struct.h" #include "share/ob_server_status.h" #include "lib/mysqlclient/ob_mysql_transaction.h" @@ -32,8 +31,79 @@ namespace sqlclient class ObMySQLResult; } } -namespace share +namespace share { +class ObDMLSqlSplicer; +struct ObServerInfoInTable +{ +public: + typedef ObFixedLengthString ObBuildVersion; + ObServerInfoInTable(); + virtual ~ObServerInfoInTable(); + int init( + const common::ObAddr &server, + const uint64_t server_id, + const common::ObZone &zone, + const int64_t sql_port, + const bool with_rootserver, + const ObServerStatus::DisplayStatus status, + const ObBuildVersion &build_version, + const int64_t stop_time, + const int64_t start_service_time, + const int64_t last_offline_time); + int assign(const ObServerInfoInTable &other); + bool is_valid() const; + void reset(); + int build_server_info_in_table(const share::ObServerStatus &server_status); + int build_server_status(share::ObServerStatus &server_status) const; + const common::ObAddr &get_server() const { return server_; } + uint64_t get_server_id() const { return server_id_; } + const common::ObZone& get_zone() const { return zone_; } + int64_t get_sql_port() const { return sql_port_; } + bool get_with_rootserver() const { return with_rootserver_; } + ObServerStatus::DisplayStatus get_status() const { return status_; } + const ObBuildVersion& get_build_version() 
const { return build_version_; } + int64_t get_stop_time() const { return stop_time_; } + int64_t get_start_service_time() const { return start_service_time_; } + int64_t get_block_migrate_in_time() const { return block_migrate_in_time_; } + int64_t get_last_offline_time() const { return last_offline_time_; } + bool is_stopped() const { return 0 != stop_time_; } + bool is_alive() const { return 0 == last_offline_time_; } + bool in_service() const { return 0 != start_service_time_; } + bool is_deleting() const { return ObServerStatus::OB_SERVER_DELETING == status_; } + bool is_active() const { return ObServerStatus::OB_SERVER_ACTIVE == status_; } + bool is_migrate_in_blocked() const { return 0 != block_migrate_in_time_; } + bool can_migrate_in() const { return is_active() && !is_migrate_in_blocked(); } + bool is_permanent_offline() const; + bool is_temporary_offline() const; + TO_STRING_KV( + K_(server), + K_(server_id), + K_(zone), + K_(sql_port), + K_(with_rootserver), + K_(status), + K_(build_version), + K_(stop_time), + K_(start_service_time), + K_(block_migrate_in_time), + K_(last_offline_time)) +private: + common::ObAddr server_; + uint64_t server_id_; + common::ObZone zone_; + int64_t sql_port_; // sql listen port + bool with_rootserver_; + ObServerStatus::DisplayStatus status_; + ObBuildVersion build_version_; + int64_t stop_time_; + int64_t start_service_time_; + // in the old log (version < 4.2, last_hb_time is weakly equivalent to gmt_modified) + // gmt_modified_ is compatible with last_hb_time + // in the new logic (version >= 4.2), we do not set gmt_modified in __all_server table explicitly + int64_t block_migrate_in_time_; + int64_t last_offline_time_; +}; class ObServerTableOperator { public: @@ -43,7 +113,10 @@ public: int init(common::ObISQLClient *proxy); common::ObISQLClient &get_proxy() const { return *proxy_; } virtual int get(common::ObIArray &server_statuses); - virtual int remove(const common::ObAddr &server, common::ObMySQLTransaction 
&trans); + static int get( + common::ObISQLClient &sql_proxy, + common::ObIArray &server_statuses); + static int remove(const common::ObAddr &server, common::ObMySQLTransaction &trans); virtual int update(const share::ObServerStatus &server_status); virtual int reset_rootserver(const common::ObAddr &except); virtual int update_status(const common::ObAddr &server, @@ -54,10 +127,164 @@ public: const int64_t stop_time); virtual int update_with_partition(const common::ObAddr &server, bool with_partition); int get_start_service_time(const common::ObAddr &server, int64_t &start_service_time) const; + // read __all_server table and return all servers' info in the table + // + // @param[out] all_servers_info_in_table an array, which represents all rows in __all_server table + // + // @ret OB_SUCCESS get servers' info from __all_server table successfully + // @ret OB_TABLE_NOT_EXIST it occurs in the bootstrap period, we need to wait for some time. + // @ret other error code failure + static int get( + common::ObISQLClient &sql_proxy, + common::ObIArray &all_servers_info_in_table); + // read the given server's corresponding row in __all_server table + // this func can be called in version >= 4.2 + // + // @param[out] all_servers_info_in_table an array, which represents all rows in __all_server table + // + // @ret OB_SUCCESS get servers' info from __all_server table successfully + // @ret OB_TABLE_NOT_EXIST it occurs in the bootstrap period, we need to wait for some time. 
+ + // @ret other error code failure + static int get( + common::ObISQLClient &sql_proxy, + const common::ObAddr &server, + ObServerInfoInTable &server_info_in_table); + // insert the new server's info into __all_server table, + // it is only called when we want to add a new server into clusters + // + // @param[in] trans transaction + // @param[in] server_info_in_table the new server's info which is expected to be inserted + // + // @ret OB_SUCCESS the insertion is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + // or check the table to see whether the server's info + // has been inserted in __all_server table already + // @ret other error code failure + static int insert( + common::ObISQLClient &sql_proxy, + const ObServerInfoInTable &server_info_in_table); + // set the given server's status be new_status in __all_server table + // the prerequisites of a successful setting is that the previous status new_status + // @ret OB_SUCCESS the setting is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + // or check the table to see whether the previous status is correct + // @ret other error code failure + static int update_status( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const ObServerStatus::DisplayStatus old_status, + const ObServerStatus::DisplayStatus new_status); + // the given server's with_rootserver will be set in __all_server table. + // other servers' with_rootserver will be reset. 
+ // + // @param[in] trans transaction + // @param[in] server the server which we want to update its info + // + // @ret OB_SUCCESS the updation is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + // or check the table to see whether the old value is correct + // @ret other error code failure + static int update_with_rootserver( + ObMySQLTransaction &trans, + const common::ObAddr &server); + + // update the build_version of a given server in __all_server table + // the prerequisites of a successful updation is that we give the right current value (old_build_version) + // + // @param[in] trans transaction + // @param[in] server the server which we want to update its info + // @param[in] old_build_version the current value of the given server's build_version + // @param[in] new_build_version the expected new value of the given server's build_version + // + // @ret OB_SUCCESS the updation is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + // or check the table to see whether the old value is correct + // @ret other error code failure + static int update_build_version( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const ObServerInfoInTable::ObBuildVersion &old_build_version, + const ObServerInfoInTable::ObBuildVersion &new_build_version); + + // update the start_service_time of a given server in __all_server table + // the prerequisites of a successful updation is that we give the right current value (old_start_service_time) + // + // @param[in] trans transaction + // @param[in] server the server which we want to update its info + // @param[in] old_start_service_time the current value of the given server's start_service_time + // @param[in] new_start_service_time the expected new value of the given server's start_service_time + // + // @ret OB_SUCCESS the updation is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + 
// or check the table to see whether the old value is correct + // @ret other error code failure + static int update_start_service_time( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const int64_t old_start_service_time, + const int64_t new_start_service_time); + // The server becomes active/online, its last_offline_time should be zero. + // In addition, if the server's status is inactive, the status should be set active. + // If the server's status is deleting, we do not need to change its status. + // + // @param[in] trans transaction + // @param[in] server the server which we find it becomes inactive + // + // @ret OB_SUCCESS the updation is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + // or check the table to see whether the old value is correct + // @ret other error code failure + static int update_table_for_offline_to_online_server( + ObMySQLTransaction &trans, + const bool is_deleting, + const common::ObAddr &server); + // The server becomes inactive/offline, its last_offline_time should be set, + // and its start_service_time should be zero. + // In addition, if the server's status is active, the status should be set inactive. + // If the server's status is deleting, we do not need to change its status. 
+ // + // @param[in] trans transaction + // @param[in] server the server which we find it becomes inactive + // + // @ret OB_SUCCESS the updation is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + // or check the table to see whether the old value is correct + // @ret other error code failure + static int update_table_for_online_to_offline_server( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const bool is_deleting, + int64_t last_offline_time); + // update the given server's stop_time, + // if is_start is true, set stop_time = 0 (where stop_time != 0), otherwise stop_time will be now + // + // @param[in] trans transaction + // @param[in] server the server which we want to update its stop_time + // @param[in] is_start if true, start server. Otherwise, stop server. + // + // @ret OB_SUCCESS the updation is successful + // @ret OB_NEED_RETRY no affected rows, probably we need to retry the operation + // or check the table to see whether stop_time has been 0 already (if is_start) + // or not 0 (if !is_start) + // @ret other error code failure + static int update_stop_time( + ObMySQLTransaction &trans, + const common::ObAddr &server, + const int64_t old_stop_time, + const int64_t new_stop_time); private: - int build_server_status(const common::sqlclient::ObMySQLResult &res, - share::ObServerStatus &server_status) const; + static int build_server_status( + const common::sqlclient::ObMySQLResult &res, + share::ObServerStatus &server_status); + static int exec_write( + ObMySQLTransaction &trans, + ObSqlString &sql, + const bool is_multi_rows_affected); + static int insert_dml_builder( + const ObServerStatus &server_status, + ObDMLSqlSplicer &dml); private: bool inited_; common::ObISQLClient *proxy_; @@ -66,4 +293,4 @@ private: } // end namespace share } // end namespace oceanbase -#endif // OCEANBASE_SHARE_OB_SERVER_TABLE_OPERATOR_H_ +#endif // OCEANBASE_SHARE_OB_SERVER_TABLE_OPERATOR_H_ \ No newline at end 
of file diff --git a/src/share/ob_service_epoch_proxy.cpp b/src/share/ob_service_epoch_proxy.cpp index 66085b656..04267c635 100644 --- a/src/share/ob_service_epoch_proxy.cpp +++ b/src/share/ob_service_epoch_proxy.cpp @@ -20,6 +20,7 @@ #include "share/inner_table/ob_inner_table_schema_constants.h" #include "share/ob_dml_sql_splicer.h" #include "share/ob_force_print_log.h" +#include "logservice/palf/log_define.h" namespace oceanbase { @@ -31,34 +32,70 @@ int ObServiceEpochProxy::init_service_epoch( ObISQLClient &sql_proxy, const int64_t tenant_id, const int64_t freeze_service_epoch, - const int64_t arbitration_service_epoch) + const int64_t arbitration_service_epoch, + const int64_t server_zone_op_service_epoch, + const int64_t heartbeat_service_epoch) { int ret = OB_SUCCESS; if (is_user_tenant(tenant_id)) { ret = OB_INVALID_ARGUMENT; LOG_WARN("invalid argument", KR(ret), K(tenant_id)); // sys/meta tenant initialized freeze_service_epoch - } else if (OB_FAIL(insert_service_epoch(sql_proxy, tenant_id, FREEZE_SERVICE_EPOCH, freeze_service_epoch))) { - LOG_WARN("fail to init freeze_service_epoch", KR(ret), K(tenant_id), K(freeze_service_epoch)); - } else if (OB_FAIL(ObServiceEpochProxy::insert_service_epoch( - sql_proxy, - tenant_id, - ARBITRATION_SERVICE_EPOCH, - arbitration_service_epoch))) { - LOG_WARN("fail to init arb service epoch", KR(ret), K(tenant_id), K(arbitration_service_epoch)); + } else if (is_sys_tenant(tenant_id)) { + if (OB_FAIL(insert_service_epoch( + sql_proxy, + tenant_id, + FREEZE_SERVICE_EPOCH, + freeze_service_epoch))) { + LOG_WARN("fail to init freeze_service_epoch", KR(ret), K(tenant_id), K(freeze_service_epoch)); + } else if (OB_FAIL(ObServiceEpochProxy::insert_service_epoch( + sql_proxy, + tenant_id, + ARBITRATION_SERVICE_EPOCH, + arbitration_service_epoch))) { + LOG_WARN("fail to init arb service epoch", KR(ret), K(tenant_id), K(arbitration_service_epoch)); + } else if (OB_FAIL(insert_service_epoch( + sql_proxy, + tenant_id, + 
SERVER_ZONE_OP_SERVICE_EPOCH, + server_zone_op_service_epoch))) { + LOG_WARN("fail to init server_zone_op_service_epoch", KR(ret), K(tenant_id), K(server_zone_op_service_epoch)); + } else if (OB_FAIL(insert_service_epoch( + sql_proxy, + tenant_id, + HEARTBEAT_SERVICE_EPOCH, + heartbeat_service_epoch))) { + LOG_WARN("fail to init heartbeat_service_epoch", KR(ret), K(tenant_id), K(heartbeat_service_epoch)); + } else {} } else if (is_meta_tenant(tenant_id)) { // user tenant initialized freeze_service_epoch const uint64_t user_tenant_id = gen_user_tenant_id(tenant_id); - if (OB_FAIL(insert_service_epoch(sql_proxy, user_tenant_id, FREEZE_SERVICE_EPOCH, freeze_service_epoch))) { + if (OB_FAIL(insert_service_epoch( + sql_proxy, + user_tenant_id, + FREEZE_SERVICE_EPOCH, + freeze_service_epoch))) { LOG_WARN("fail to init freeze_service_epoch", KR(ret), K(user_tenant_id), K(freeze_service_epoch)); + } else if (OB_FAIL(insert_service_epoch( + sql_proxy, + tenant_id, + FREEZE_SERVICE_EPOCH, + freeze_service_epoch))) { + LOG_WARN("fail to init freeze_service_epoch", KR(ret), K(tenant_id), K(freeze_service_epoch)); } else if (OB_FAIL(ObServiceEpochProxy::insert_service_epoch( - sql_proxy, - user_tenant_id, - ARBITRATION_SERVICE_EPOCH, - arbitration_service_epoch))) { + sql_proxy, + user_tenant_id, + ARBITRATION_SERVICE_EPOCH, + arbitration_service_epoch))) { LOG_WARN("fail to init arb service epoch", KR(ret), K(user_tenant_id), K(arbitration_service_epoch)); - } - } + } else if (OB_FAIL(ObServiceEpochProxy::insert_service_epoch( + sql_proxy, + tenant_id, + ARBITRATION_SERVICE_EPOCH, + arbitration_service_epoch))) { + LOG_WARN("fail to init arb service epoch", KR(ret), K(tenant_id), K(arbitration_service_epoch)); + } else {} + } else {} return ret; } @@ -227,5 +264,50 @@ int ObServiceEpochProxy::check_service_epoch_with_trans( return ret; } +int ObServiceEpochProxy::check_and_update_service_epoch( + ObMySQLTransaction &trans, + const int64_t tenant_id, + const char * const
name, + const int64_t service_epoch) +{ + int ret = OB_SUCCESS; + int64_t persistent_service_epoch = 0; + if (OB_UNLIKELY(!is_valid_tenant_id(tenant_id) + || palf::INVALID_PROPOSAL_ID == service_epoch + || OB_ISNULL(name))) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("invalid argument", KR(ret), K(tenant_id), K(service_epoch), KP(name)); + } else if (OB_FAIL(ObServiceEpochProxy::select_service_epoch_for_update( + trans, + tenant_id, + name, + persistent_service_epoch))) { + // check and update service epoch + LOG_WARN("fail to get service epoch from inner table", KR(ret), K(tenant_id), K(name)); + } else if (OB_UNLIKELY(service_epoch < persistent_service_epoch)) { + ret = OB_NOT_MASTER; + LOG_WARN("the service_epoch is smaller than the service epoch in __all_service_epoch table, " + "the service cannot be provided", KR(ret), K(tenant_id), K(name), + K(service_epoch), K(persistent_service_epoch)); + } else if (service_epoch > persistent_service_epoch) { + int64_t affected_rows = 0; + if (OB_FAIL(ObServiceEpochProxy::update_service_epoch( + trans, + tenant_id, + name, + service_epoch, + affected_rows))) { + LOG_WARN("fail to update the service epoch", KR(ret), K(tenant_id), K(name), + K(service_epoch), K(persistent_service_epoch), K(affected_rows)); + } else if (1 != affected_rows) { + ret = OB_NEED_RETRY; + LOG_WARN("fail to update service epoch, affected_rows is expected to be one", KR(ret), + K(tenant_id), K(name), K(service_epoch), K(persistent_service_epoch), K(affected_rows)); + } + } else {} + FLOG_INFO("check and update service epoch", KR(ret), K(tenant_id), K(name), + K(service_epoch), K(persistent_service_epoch)); + return ret; +} + } // end namespace share } // end namespace oceanbase diff --git a/src/share/ob_service_epoch_proxy.h b/src/share/ob_service_epoch_proxy.h index 6ebf999f6..9ebd83bce 100644 --- a/src/share/ob_service_epoch_proxy.h +++ b/src/share/ob_service_epoch_proxy.h @@ -49,7 +49,9 @@ public: static int init_service_epoch(common::ObISQLClient &sql_proxy, const
int64_t tenant_id, const int64_t freeze_service_epoch, - const int64_t arbitration_service_epoch); + const int64_t arbitration_service_epoch, + const int64_t server_zone_op_service_epoch, + const int64_t heartbeat_service_epoch); static int insert_service_epoch(common::ObISQLClient &sql_proxy, const int64_t tenant_id, @@ -78,10 +80,20 @@ public: const char *name, const int64_t expected_epoch, bool &is_match); + // if service_epoch = persistent service epoch, do nothing + // if service_epoch > persistent service epoch, update persistent service epoch + // otherwise return error code OB_NOT_MASTER; + static int check_and_update_service_epoch( + ObMySQLTransaction &trans, + const int64_t tenant_id, + const char *name, + const int64_t service_epoch); public: constexpr static const char * const FREEZE_SERVICE_EPOCH = "freeze_service_epoch"; constexpr static const char * const ARBITRATION_SERVICE_EPOCH = "arbitration_service_epoch"; + constexpr static const char * const SERVER_ZONE_OP_SERVICE_EPOCH = "server_zone_op_service_epoch"; + constexpr static const char * const HEARTBEAT_SERVICE_EPOCH = "heartbeat_service_epoch"; private: static int inner_get_service_epoch_(common::ObISQLClient &sql_proxy, diff --git a/src/share/ob_srv_rpc_proxy.h b/src/share/ob_srv_rpc_proxy.h index dc7b51911..9d243137f 100644 --- a/src/share/ob_srv_rpc_proxy.h +++ b/src/share/ob_srv_rpc_proxy.h @@ -18,6 +18,7 @@ #include "rpc/obrpc/ob_rpc_proxy.h" #include "share/ob_rpc_struct.h" #include "observer/ob_server_struct.h" +#include "share/ob_heartbeat_struct.h" namespace oceanbase { @@ -90,6 +91,7 @@ public: RPC_S(PR5 wash_memory_fragmentation, OB_WASH_MEMORY_FRAGMENTATION); RPC_S(PR5 bootstrap, OB_BOOTSTRAP, (ObBootstrapArg)); RPC_S(PR5 is_empty_server, OB_IS_EMPTY_SERVER, (ObCheckServerEmptyArg), Bool); + RPC_S(PR5 check_server_for_adding_server, OB_CHECK_SERVER_FOR_ADDING_SERVER, (ObCheckServerForAddingServerArg), ObCheckServerForAddingServerResult); RPC_S(PR5 check_deployment_mode_match, 
OB_CHECK_DEPLOYMENT_MODE, (ObCheckDeploymentModeArg), Bool); RPC_S(PR5 report_replica, OB_REPORT_REPLICA); RPC_S(PR5 recycle_replica, OB_RECYCLE_REPLICA); @@ -185,6 +187,8 @@ public: RPC_AP(PR5 refresh_tenant_info, OB_REFRESH_TENANT_INFO, (obrpc::ObRefreshTenantInfoArg), obrpc::ObRefreshTenantInfoRes); RPC_S(PR5 sync_rewrite_rules, OB_SYNC_REWRITE_RULES, (ObSyncRewriteRuleArg)); RPC_S(PR5 force_set_ls_as_single_replica, OB_LOG_FORCE_SET_LS_AS_SINGLE_REPLICA, (obrpc::ObForceSetLSAsSingleReplicaArg)); + RPC_AP(PRZ handle_heartbeat, OB_SEND_HEARTBEAT, (share::ObHBRequest), share::ObHBResponse); + RPC_AP(PR5 get_server_resource_info, OB_GET_SERVER_RESOURCE_INFO, (obrpc::ObGetServerResourceInfoArg), obrpc::ObGetServerResourceInfoResult); RPC_AP(PR5 notify_switch_leader, OB_NOTIFY_SWITCH_LEADER, (obrpc::ObNotifySwitchLeaderArg)); RPC_AP(PR5 update_tenant_info_cache, OB_UPDATE_TENANT_INFO_CACHE, (obrpc::ObUpdateTenantInfoCacheArg), obrpc::ObUpdateTenantInfoCacheRes); }; // end of class ObSrvRpcProxy diff --git a/src/share/ob_thread_define.h b/src/share/ob_thread_define.h index 73694e52c..9f695108b 100644 --- a/src/share/ob_thread_define.h +++ b/src/share/ob_thread_define.h @@ -131,4 +131,5 @@ TG_DEF(RedefHeartBeatTask, RedefHeartBeatTask, "", TG_STATIC, TIMER) TG_DEF(MemDumpTimer, MemDumpTimer, "", TG_STATIC, TIMER) TG_DEF(SSTableDefragment, SSTableDefragment, "", TG_STATIC, TIMER) TG_DEF(TenantMetaMemMgr, TenantMetaMemMgr, "", TG_STATIC, TIMER) +TG_DEF(HeartbeatService, HeartbeatService, "", TG_STATIC, REENTRANT_THREAD_POOL, ThreadCountPair(2, 2)) #endif diff --git a/src/share/ob_upgrade_utils.cpp b/src/share/ob_upgrade_utils.cpp index 5877f03fc..99d7461c7 100644 --- a/src/share/ob_upgrade_utils.cpp +++ b/src/share/ob_upgrade_utils.cpp @@ -17,6 +17,7 @@ #include "share/ob_upgrade_utils.h" #include "share/schema/ob_schema_getter_guard.h" #include "share/inner_table/ob_inner_table_schema_constants.h" +#include "share/ob_service_epoch_proxy.h" #include 
"observer/ob_server_struct.h" #include "rootserver/ob_root_service.h" #include "sql/resolver/expr/ob_raw_expr_util.h" @@ -957,6 +958,44 @@ int ObUpgradeFor4100Processor::recompile_all_views_and_synonyms(const uint64_t t } /* =========== 4100 upgrade processor end ============= */ + +int ObUpgradeFor4200Processor::post_upgrade() +{ + int ret = OB_SUCCESS; + if (OB_FAIL(check_inner_stat())) { + LOG_WARN("fail to check inner stat", KR(ret)); + } else if (OB_FAIL(post_upgrade_for_heartbeat_and_server_zone_op_service())) { + LOG_WARN("post upgrade for heartbeat and server zone op service failed", KR(ret)); + } + return ret; +} + +int ObUpgradeFor4200Processor::post_upgrade_for_heartbeat_and_server_zone_op_service() +{ + int ret = OB_SUCCESS; + int64_t affected_rows = 0; + if (OB_ISNULL(sql_proxy_)) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("error unexpected", KR(ret), KP(sql_proxy_)); + } else if (!is_sys_tenant(tenant_id_)) { + LOG_INFO("only sys tenant need heartbeat and server zone op service", K(tenant_id_)); + } else { + ObSqlString sql; + if (OB_FAIL(sql.assign_fmt("INSERT IGNORE INTO %s (tenant_id, name, value) VALUES " + "(%lu, '%s', 0), " + "(%lu, '%s', 0)", + OB_ALL_SERVICE_EPOCH_TNAME, OB_SYS_TENANT_ID, ObServiceEpochProxy::HEARTBEAT_SERVICE_EPOCH, + OB_SYS_TENANT_ID, ObServiceEpochProxy::SERVER_ZONE_OP_SERVICE_EPOCH))) { + LOG_WARN("fail to assign sql assign", KR(ret)); + } else if (OB_FAIL(sql_proxy_->write(OB_SYS_TENANT_ID, sql.ptr(), affected_rows))) { + LOG_WARN("fail to execute sql", KR(ret), K(sql)); + } else {} + } + FLOG_INFO("insert heartbeat and server zone op service", KR(ret), K(affected_rows)); + return ret; +} + +/* =========== 4200 upgrade processor end ============= */ /* =========== special upgrade processor end ============= */ } // end share } // end oceanbase diff --git a/src/share/ob_upgrade_utils.h b/src/share/ob_upgrade_utils.h index 3cff2d51d..62e4b16f9 100644 --- a/src/share/ob_upgrade_utils.h +++ b/src/share/ob_upgrade_utils.h @@ 
-189,7 +189,17 @@ private: int init_rewrite_rule_version(const uint64_t tenant_id); static int recompile_all_views_and_synonyms(const uint64_t tenant_id); }; -DEF_SIMPLE_UPGRARD_PROCESSER(4, 2, 0, 0) +// DEF_SIMPLE_UPGRARD_PROCESSER(4, 2, 0, 0) +class ObUpgradeFor4200Processor : public ObBaseUpgradeProcessor +{ +public: + ObUpgradeFor4200Processor() : ObBaseUpgradeProcessor() {} + virtual ~ObUpgradeFor4200Processor() {} + virtual int pre_upgrade() override { return common::OB_SUCCESS; } + virtual int post_upgrade() override; +private: + int post_upgrade_for_heartbeat_and_server_zone_op_service(); +}; /* =========== special upgrade processor end ============= */ /* =========== upgrade processor end ============= */ diff --git a/src/share/ob_zone_info.h b/src/share/ob_zone_info.h index c3f55b00b..755c4cbc8 100644 --- a/src/share/ob_zone_info.h +++ b/src/share/ob_zone_info.h @@ -148,6 +148,9 @@ public: bool is_encryption() const { return zone_type_.value_ == common::ObZoneType::ZONE_TYPE_ENCRYPTION; } + bool is_active() const { + return ObZoneStatus::ACTIVE == status_; + } public: common::ObZone zone_; ObZoneInfoItem::ItemList list_; diff --git a/src/share/ob_zone_table_operation.cpp b/src/share/ob_zone_table_operation.cpp index f1c6e26b6..26fad0839 100644 --- a/src/share/ob_zone_table_operation.cpp +++ b/src/share/ob_zone_table_operation.cpp @@ -63,10 +63,14 @@ int ObZoneTableOperation::set_info_item( } template -int ObZoneTableOperation::load_info(common::ObISQLClient &sql_client, T &info) +int ObZoneTableOperation::load_info( + common::ObISQLClient &sql_client, + T &info, + const bool check_zone_exists) { int ret = OB_SUCCESS; ObSqlString sql; + bool zone_exists = false; SMART_VAR(ObMySQLProxy::MySQLResult, res) { ObMySQLResult *result = NULL; ObTimeoutCtx ctx; @@ -86,6 +90,7 @@ int ObZoneTableOperation::load_info(common::ObISQLClient &sql_client, T &info) int64_t value = 0; char info_str[MAX_ZONE_INFO_LENGTH + 1] = ""; while (OB_SUCCESS == ret && OB_SUCCESS == 
(ret = result->next())) { + zone_exists = true; EXTRACT_STRBUF_FIELD_MYSQL(*result, "name", name, static_cast(sizeof(name)), tmp_real_str_len); EXTRACT_INT_FIELD_MYSQL(*result, "value", value, int64_t); @@ -100,6 +105,10 @@ int ObZoneTableOperation::load_info(common::ObISQLClient &sql_client, T &info) } if (OB_ITER_END == ret) { ret = OB_SUCCESS; + if (check_zone_exists && !zone_exists) { + ret = OB_ZONE_INFO_NOT_EXIST; + LOG_WARN("zone not exists", KR(ret), K(sql)); + } } else { LOG_WARN("get result failed", K(ret), K(sql)); } @@ -109,14 +118,20 @@ int ObZoneTableOperation::load_info(common::ObISQLClient &sql_client, T &info) return ret; } -int ObZoneTableOperation::load_global_info(ObISQLClient &sql_client, ObGlobalInfo &info) +int ObZoneTableOperation::load_global_info( + ObISQLClient &sql_client, + ObGlobalInfo &info, + const bool check_zone_exists /* = false */) { - return load_info(sql_client, info); + return load_info(sql_client, info, check_zone_exists); } -int ObZoneTableOperation::load_zone_info(ObISQLClient &sql_client, ObZoneInfo &info) +int ObZoneTableOperation::load_zone_info( + ObISQLClient &sql_client, + ObZoneInfo &info, + const bool check_zone_exists /* = false */) { - return load_info(sql_client, info); + return load_info(sql_client, info, check_zone_exists); } template @@ -406,5 +421,124 @@ int ObZoneTableOperation::get_region_list( return ret; } +int ObZoneTableOperation::check_encryption_zone( + common::ObISQLClient &sql_client, + const common::ObZone &zone, + bool &encryption) +{ + int ret = OB_SUCCESS; + encryption = false; + HEAP_VAR(ObZoneInfo, zone_info) { + if (OB_UNLIKELY(zone.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("the zone is empty", KR(ret), K(zone)); + } else if (OB_FAIL(get_zone_info(zone, sql_client, zone_info))) { + LOG_WARN("fail to get zone info", KR(ret), K(zone)); + } else { + encryption = zone_info.is_encryption(); + } + } + return ret; +} +int ObZoneTableOperation::check_zone_active( + common::ObISQLClient 
&sql_client, + const common::ObZone &zone, + bool &is_active) +{ + int ret = OB_SUCCESS; + is_active = false; + HEAP_VAR(ObZoneInfo, zone_info) { + if (OB_UNLIKELY(zone.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("the zone is empty", KR(ret), K(zone)); + } else if (OB_FAIL(get_zone_info(zone, sql_client, zone_info))) { + LOG_WARN("fail to get zone info", KR(ret), K(zone)); + } else { + is_active = zone_info.is_active(); + } + } + return ret; +} +int ObZoneTableOperation::get_inactive_zone_list( + common::ObISQLClient &sql_client, + common::ObIArray &zone_list) +{ + return get_zone_list_(sql_client, zone_list, false /* is_active */); +} +int ObZoneTableOperation::get_active_zone_list( + common::ObISQLClient &sql_client, + common::ObIArray &zone_list) +{ + return get_zone_list_(sql_client, zone_list, true /* is_active */); +} +int ObZoneTableOperation::get_zone_list_( + common::ObISQLClient &sql_client, + common::ObIArray &zone_list, + const bool is_active) +{ + int ret = OB_SUCCESS; + ObSqlString sql; + ObTimeoutCtx ctx; + zone_list.reset(); + ObZone zone; + if (OB_FAIL(rootserver::ObRootUtils::get_rs_default_timeout_ctx(ctx))) { + LOG_WARN("fail to get timeout ctx", K(ret), K(ctx)); + } else if (OB_FAIL(sql.assign_fmt("SELECT zone FROM %s WHERE name = 'status' AND info = '%s'", + OB_ALL_ZONE_TNAME, is_active ? 
"ACTIVE" : "INACTIVE"))) { + LOG_WARN("fail to append sql", KR(ret)); + } else { + SMART_VAR(ObMySQLProxy::MySQLResult, res) { + ObMySQLResult *result = NULL; + if (OB_FAIL(sql_client.read(res, OB_SYS_TENANT_ID, sql.ptr()))) { + LOG_WARN("fail to execute sql", KR(ret), K(sql)); + } else if (OB_ISNULL(result = res.get_result())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("fail to get sql result", KR(ret), K(sql)); + } else { + while (OB_SUCC(ret)) { + if (OB_FAIL(result->next())) { + if (OB_ITER_END != ret) { + LOG_WARN("result next failed", KR(ret)); + } else { + ret = OB_SUCCESS; + break; + } + } else { + int64_t tmp_real_str_len = 0; + zone.reset(); + EXTRACT_STRBUF_FIELD_MYSQL(*result, "zone", zone.ptr(), MAX_ZONE_LENGTH, tmp_real_str_len); + (void) tmp_real_str_len; // make compiler happy + if (OB_FAIL(zone_list.push_back(zone))) { + LOG_WARN("fail to push an element into zone_list", KR(ret), K(zone)); + } + } + } + } + } + } + FLOG_INFO("get zone_list", KR(ret), K(is_active), K(zone_list)); + return ret; +} +int ObZoneTableOperation::get_zone_info( + const ObZone &zone, + common::ObISQLClient &sql_client, + ObZoneInfo &zone_info) +{ + int ret = OB_SUCCESS; + zone_info.reset(); + zone_info.zone_ = zone; + bool check_zone_exists = true; + if (OB_UNLIKELY(zone.is_empty())) { + ret = OB_INVALID_ARGUMENT; + LOG_WARN("the zone is empty", KR(ret), K(zone)); + } else if (OB_FAIL(load_zone_info(sql_client, zone_info, check_zone_exists))) { + LOG_WARN("fail to load zone info", KR(ret), K(zone)); + } else if (OB_UNLIKELY(!zone_info.is_valid())) { + ret = OB_ERR_UNEXPECTED; + LOG_WARN("zone_info is unexpectedly invalid", + KR(ret), K(zone), K(zone_info)); + } else {} + return ret; +} }//end namespace share }//end namespace oceanbase diff --git a/src/share/ob_zone_table_operation.h b/src/share/ob_zone_table_operation.h index 3ac0c6b81..55cb545a8 100644 --- a/src/share/ob_zone_table_operation.h +++ b/src/share/ob_zone_table_operation.h @@ -37,9 +37,22 @@ public:
common::ObISQLClient &sql_client, common::ObIArray &zone_list); static int get_zone_lease_info(common::ObISQLClient &sql_client, ObZoneLeaseInfo &info); - - static int load_global_info(common::ObISQLClient &sql_client, ObGlobalInfo &info); - static int load_zone_info(common::ObISQLClient &sql_client, ObZoneInfo &info); + // check_zone_exists is a newly added arg. + // If the zone not exists, + // in the previous implementation, OB_SUCCESS will be returned. + // However, if check_zone_exists is set, OB_ZONE_INFO_NOT_EXIST will be returned. + static int load_global_info( + common::ObISQLClient &sql_client, + ObGlobalInfo &info, + const bool check_zone_exists = false); + // check_zone_exists is a newly added arg. + // If the zone not exists, + // in the previous implementation, OB_SUCCESS will be returned. + // However, if check_zone_exists is set, OB_ZONE_INFO_NOT_EXIST will be returned. + static int load_zone_info( + common::ObISQLClient &sql_client, + ObZoneInfo &info, + const bool check_zone_exists = false); static int insert_global_info(common::ObISQLClient &sql_client, ObGlobalInfo &info); static int insert_zone_info(common::ObISQLClient &sql_client, ObZoneInfo &info); @@ -47,15 +60,42 @@ public: static int remove_zone_info(common::ObISQLClient &sql_client, const common::ObZone &zone); static int get_region_list( common::ObISQLClient &sql_client, common::ObIArray ®ion_list); + static int check_encryption_zone( + common::ObISQLClient &sql_client, + const common::ObZone &zone, + bool &encryption); + static int check_zone_active( + common::ObISQLClient &sql_client, + const common::ObZone &zone, + bool &is_active); + static int get_inactive_zone_list( + common::ObISQLClient &sql_client, + common::ObIArray &zone_list); + static int get_active_zone_list( + common::ObISQLClient &sql_client, + common::ObIArray &zone_list); private: template static int set_info_item(const char *name, const int64_t value, const char *info_str, T &info); template - static int 
load_info(common::ObISQLClient &sql_client, T &info); + static int load_info( + common::ObISQLClient &sql_client, + T &info, + const bool check_zone_exists); template static int insert_info(common::ObISQLClient &sql_client, T &info); static int get_zone_item_count(int64_t &cnt); + static int get_zone_info( + const ObZone &zone, + common::ObISQLClient &sql_client, + ObZoneInfo &zone_info); + // if is_active, then get active zone_list + // if !is_active, then get inactive zone_list + static int get_zone_list_( + common::ObISQLClient &sql_client, + common::ObIArray &zone_list, + const bool is_active); }; }//end namespace share diff --git a/src/share/rc/ob_tenant_base.h b/src/share/rc/ob_tenant_base.h index 5649830cc..8f4ec11e6 100644 --- a/src/share/rc/ob_tenant_base.h +++ b/src/share/rc/ob_tenant_base.h @@ -129,6 +129,7 @@ namespace rootserver class ObRestoreService; class ObRecoveryLSService; class ObArbitrationService; + class ObHeartbeatService; class ObStandbySchemaRefreshTrigger; } namespace observer @@ -245,7 +246,8 @@ using ObPartTransCtxObjPool = common::ObServerObjectPool &snapshot_ser int ret = OB_SUCCESS; ObArray lost_servers; - if (OB_FAIL(share::ObAllServerTracer::get_instance().for_each_server_status( + if (OB_FAIL(share::ObAllServerTracer::get_instance().for_each_server_info( [&snapshot_servers, - &lost_servers](const share::ObServerStatus &status) -> int { + &lost_servers](const share::ObServerInfoInTable &server_info) -> int { int ret = OB_SUCCESS; bool found = false; - // find servers that recorded in the server manger while has + // find servers that recorded in __all_server table while has // not reported its timestamp. 
for (int64_t i = 0; !found && i < snapshot_servers.count(); ++i) { - if (status.server_ == snapshot_servers[i]) { + if (server_info.get_server() == snapshot_servers[i]) { found = true; } } if (!found) {// not found in __all_reserved_snapshot inner table - if (OB_FAIL(lost_servers.push_back(status.server_))) { + if (OB_FAIL(lost_servers.push_back(server_info.get_server()))) { MVCC_LOG(WARN, "lost servers push back failed", K(ret)); - } else if (!status.is_valid()) { - MVCC_LOG(ERROR, "invalid status", K(ret), K(status)); + } else if (!server_info.is_valid()) { + MVCC_LOG(ERROR, "invalid server info", K(ret), K(server_info)); // if not in service, we ignore it and report the warning - } else if (!status.in_service() || status.is_stopped()) { - MVCC_LOG(WARN, "server is not alive, we will remove soon", K(ret), K(status)); + } else if (!server_info.in_service() || server_info.is_stopped()) { + MVCC_LOG(WARN, "server is not alive, we will remove soon", K(ret), K(server_info)); // if not alive, we ignore it and report the warning - } else if (!status.is_alive()) { - MVCC_LOG(WARN, "server is not alive, please pay attention", K(ret), K(status)); + } else if (!server_info.is_alive()) { + MVCC_LOG(WARN, "server is not alive, please pay attention", K(ret), K(server_info)); } else { // may be lost or do not contain the tenant // TODO(handora.qc): make it better and more clear - MVCC_LOG(INFO, "server is alive when mointor", K(ret), K(status)); + MVCC_LOG(INFO, "server is alive when mointor", K(ret), K(server_info)); } } diff --git a/src/storage/ls/ob_ls.cpp b/src/storage/ls/ob_ls.cpp index 84bf31880..2c46c1eb2 100644 --- a/src/storage/ls/ob_ls.cpp +++ b/src/storage/ls/ob_ls.cpp @@ -40,6 +40,7 @@ #include "rootserver/ob_primary_ls_service.h" #include "rootserver/ob_recovery_ls_service.h" #include "rootserver/restore/ob_restore_scheduler.h" +#include "rootserver/ob_heartbeat_service.h" #include "sql/das/ob_das_id_service.h" #include "storage/tablet/ob_tablet.h" @@ -217,6 
+218,12 @@ int ObLS::init(const share::ObLSID &ls_id, } + if (OB_SUCC(ret) && is_sys_tenant(tenant_id) && ls_id.is_sys_ls()) { + //sys tenant + REGISTER_TO_LOGSERVICE(logservice::HEARTBEAT_SERVICE_LOG_BASE_TYPE, MTL(rootserver::ObHeartbeatService *)); + LOG_INFO("heartbeat service is registered successfully"); + } + if (OB_SUCC(ret)) { // don't delete it election_priority_.set_ls_id(ls_id); is_inited_ = true; @@ -646,6 +653,10 @@ void ObLS::destroy() rootserver::ObRestoreService * restore_service = MTL(rootserver::ObRestoreService*); UNREGISTER_FROM_LOGSERVICE(logservice::RESTORE_SERVICE_LOG_BASE_TYPE, restore_service); } + if (is_sys_tenant(MTL_ID()) && ls_meta_.ls_id_.is_sys_ls()) { + rootserver::ObHeartbeatService * heartbeat_service = MTL(rootserver::ObHeartbeatService*); + UNREGISTER_FROM_LOGSERVICE(logservice::HEARTBEAT_SERVICE_LOG_BASE_TYPE, heartbeat_service); + } tx_table_.destroy(); lock_table_.destroy(); ls_tablet_svr_.destroy(); diff --git a/unittest/rootserver/CMakeLists.txt b/unittest/rootserver/CMakeLists.txt index 9d50d46ad..68dab03d2 100644 --- a/unittest/rootserver/CMakeLists.txt +++ b/unittest/rootserver/CMakeLists.txt @@ -20,3 +20,4 @@ rs_unittest(test_primary_ls_service) #rs_unittest(test_zone_merge_manager) #rs_unittest(test_freeze_info_manager) rs_unittest(test_archive_checkpoint) +rs_unittest(test_heartbeat_service) diff --git a/unittest/rootserver/test_heartbeat_service.cpp b/unittest/rootserver/test_heartbeat_service.cpp new file mode 100644 index 000000000..e345ce291 --- /dev/null +++ b/unittest/rootserver/test_heartbeat_service.cpp @@ -0,0 +1,203 @@ +/** + * Copyright (c) 2022 OceanBase + * OceanBase CE is licensed under Mulan PubL v2. + * You can use this software according to the terms and conditions of the Mulan PubL v2. 
+ * You may obtain a copy of Mulan PubL v2 at: + * http://license.coscl.org.cn/MulanPubL-2.0 + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PubL v2 for more details. + */ +#define USING_LOG_PREFIX RS + +#include +#include +#include "lib/ob_errno.h" +#include "lib/oblog/ob_log.h" +#include "share/ob_heartbeat_struct.h" + +namespace oceanbase +{ +namespace rootserver +{ +using ::testing::_; +using ::testing::Invoke; +using ::testing::Return; +using namespace common; +class TestHeartbeatService : public ::testing::Test +{ +public: + TestHeartbeatService(){}; + virtual ~TestHeartbeatService(){}; + int clear_deleted_servers_in_all_servers_hb_info( + ObArray &all_servers_info_in_table, + hash::ObHashMap &all_servers_hb_info); + template + bool has_server_exist_in_array( + const ObIArray &array, + const common::ObAddr &server, + int64_t &idx); +}; +int TestHeartbeatService::clear_deleted_servers_in_all_servers_hb_info( + ObArray &all_servers_info_in_table, + hash::ObHashMap &all_servers_hb_info) +{ + int ret = OB_SUCCESS; + ObAddr server; + hash::ObHashMap::iterator iter = all_servers_hb_info.begin(); + + while (OB_SUCC(ret) && iter != all_servers_hb_info.end()) { + int64_t idx = OB_INVALID_INDEX_INT64; + server.reset(); + server = iter->first; + iter++; + if (!has_server_exist_in_array(all_servers_info_in_table, server, idx)) { + LOG_INFO("the server is deleted, it can be removed from all_servers_hb_info", K(server)); + if (OB_FAIL(all_servers_hb_info.erase_refactored(server))) { + LOG_WARN("fail to remove the server from all_servers_hb_info", KR(ret), K(server)); + } + } + } + + return ret; +} +template +bool TestHeartbeatService::has_server_exist_in_array( + const ObIArray &array, + const common::ObAddr &server, + int64_t &idx) +{ + bool bret = false; + idx = OB_INVALID_INDEX_INT64; + 
 for (int64_t i = 0; i < array.count(); i++) { + if (server == array.at(i).get_server()) { + bret = true; + idx = i; + break; + } + } + return bret; +} +TEST_F(TestHeartbeatService, EraseHBInfo) +{ + int ret = OB_SUCCESS; + hash::ObHashMap<ObAddr, share::ObServerHBInfo> all_servers_hb_info; + ObArray<share::ObServerInfoInTable> all_servers_info_in_table; + share::ObServerHBInfo server_hb_info; + share::ObServerInfoInTable server_info; + const int64_t now = ObTimeUtility::current_time(); + ObAddr server1(ObAddr::IPV4, "127.0.0.1", 4444); + ObAddr server2(ObAddr::IPV4, "127.0.0.1", 5555); + ObAddr server3(ObAddr::IPV4, "127.0.0.1", 6666); + ObAddr server4(ObAddr::IPV4, "127.0.0.1", 7777); + ObZone zone("z1"); + ret = all_servers_hb_info.create(1024, ObModIds::OB_HASH_BUCKET); + ASSERT_EQ(OB_SUCCESS, ret); + // **************** case 1: server1 is deleting **************** // + all_servers_info_in_table.reset(); + all_servers_hb_info.clear(); + server_info.reset(); + server_hb_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server1, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server1,1,zone, 30000, false, share::ObServerStatus::OB_SERVER_DELETING, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server1, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server2, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server2,2,zone, 30001, false, share::ObServerStatus::OB_SERVER_ACTIVE, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server2, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server3, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server3,3,zone, 30002, false, share::ObServerStatus::OB_SERVER_ACTIVE, 
"build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server3, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server4, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server4,4,zone, 30003, false, share::ObServerStatus::OB_SERVER_ACTIVE, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server4, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + + clear_deleted_servers_in_all_servers_hb_info(all_servers_info_in_table, all_servers_hb_info); + ASSERT_EQ(OB_HASH_NOT_EXIST, all_servers_hb_info.get_refactored(server1, server_hb_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server2, server_hb_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server3, server_hb_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server4, server_hb_info)); + // **************** case 2: server2 and server3 is deleting **************** // + all_servers_info_in_table.reset(); + all_servers_hb_info.clear(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server1, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server1,1,zone, 30000, false, share::ObServerStatus::OB_SERVER_ACTIVE, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server1, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server2, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server2,2,zone, 30001, false, share::ObServerStatus::OB_SERVER_DELETING, "build_version", 0, 100, 0)); + 
ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server2, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server3, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server3,3,zone, 30002, false, share::ObServerStatus::OB_SERVER_DELETING, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server3, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server4, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server4,4,zone, 30003, false, share::ObServerStatus::OB_SERVER_ACTIVE, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server4, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + + clear_deleted_servers_in_all_servers_hb_info(all_servers_info_in_table, all_servers_hb_info); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server1, server_hb_info)); + ASSERT_EQ(OB_HASH_NOT_EXIST, all_servers_hb_info.get_refactored(server2, server_hb_info)); + ASSERT_EQ(OB_HASH_NOT_EXIST, all_servers_hb_info.get_refactored(server3, server_hb_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server4, server_hb_info)); + // **************** case 3: server4 is deleting **************** // + all_servers_info_in_table.reset(); + all_servers_hb_info.clear(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server1, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server1,1,zone, 30000, false, share::ObServerStatus::OB_SERVER_ACTIVE, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server1, server_hb_info)); + server_hb_info.reset(); + 
server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server2, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server2,2,zone, 30001, false, share::ObServerStatus::OB_SERVER_ACTIVE, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server2, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server3, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server3,3,zone, 30002, false, share::ObServerStatus::OB_SERVER_ACTIVE, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_info_in_table.push_back(server_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server3, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + ASSERT_EQ(OB_SUCCESS, server_hb_info.init(server4, now, share::ObServerStatus::OB_HEARTBEAT_ALIVE)); + ASSERT_EQ(OB_SUCCESS, server_info.init(server4,4,zone, 30003, false, share:: ObServerStatus::OB_SERVER_DELETING, "build_version", 0, 100, 0)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.set_refactored(server4, server_hb_info)); + server_hb_info.reset(); + server_info.reset(); + + clear_deleted_servers_in_all_servers_hb_info(all_servers_info_in_table, all_servers_hb_info); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server1, server_hb_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server2, server_hb_info)); + ASSERT_EQ(OB_SUCCESS, all_servers_hb_info.get_refactored(server3, server_hb_info)); + ASSERT_EQ(OB_HASH_NOT_EXIST, all_servers_hb_info.get_refactored(server4, server_hb_info)); +} +} // rootservice +} // oceanbase + +int main(int argc, char **argv) +{ + oceanbase::common::ObLogger::get_logger().set_log_level("INFO"); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of 
file