diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 41278eaf9..bcb3a3779 100755 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -104,6 +104,7 @@ install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/access DESTINATION include else("${ENABLE_MULTIPLE_NODES}" STREQUAL "OFF") install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/access DESTINATION include/postgresql/server PATTERN "extreme_rto" EXCLUDE + PATTERN "ondemand_extreme_rto" EXCLUDE PATTERN "*.h") endif() install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/mb DESTINATION include/postgresql/server) diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf index 05d128ef3..6e008c4c3 100755 --- a/src/bin/gs_guc/cluster_guc.conf +++ b/src/bin/gs_guc/cluster_guc.conf @@ -719,8 +719,10 @@ ss_enable_catalog_centralized|bool|0,0|NULL|NULL| ss_enable_reform|bool|0,0|NULL|NULL| ss_enable_ssl|bool|0,0|NULL|NULL| ss_enable_aio|bool|0,0|NULL|NULL| +ss_enable_ondemand_recovery|bool|0,0|NULL|NULL| ss_interconnect_channel_count|int|1,32|NULL|NULL| ss_work_thread_count|int|16,128|NULL|NULL| +ss_ondemand_recovery_mem_size|int|1048576,104857600|kB|NULL| ss_recv_msg_pool_size|int|1024,1048576|kB|NULL| ss_interconnect_type|string|0,0|NULL|NULL| ss_log_level|int|0,887|NULL|NULL| diff --git a/src/bin/pg_controldata/pg_controldata.cpp b/src/bin/pg_controldata/pg_controldata.cpp index 845d552c7..9666a066e 100644 --- a/src/bin/pg_controldata/pg_controldata.cpp +++ b/src/bin/pg_controldata/pg_controldata.cpp @@ -84,6 +84,20 @@ static const char* dbState(DBState state) return _("unrecognized status code"); } +static const char* SSClusterState(SSGlobalClusterState state) { + switch (state) { + case CLUSTER_IN_ONDEMAND_BUILD: + return _("in on-demand build"); + case CLUSTER_IN_ONDEMAND_RECOVERY: + return _("in on-demand recovery"); + case CLUSTER_NORMAL: + return _("normal"); + default: + break; + } + return _("unrecognized status code"); +} + static const char* wal_level_str(WalLevel wal_level) { 
switch (wal_level) { @@ -244,8 +258,11 @@ static void display_last_page(ss_reformer_ctrl_t reformerCtrl, int last_page_id) "is expecting. The results below are untrustworthy.\n\n")); } printf(_("\nreformer data (last page id %d)\n\n"), last_page_id); + printf(_("Reform control version number: %u\n"), reformerCtrl.version); printf(_("Stable instances list: %lu\n"), reformerCtrl.list_stable); printf(_("Primary instance ID: %d\n"), reformerCtrl.primaryInstId); + printf(_("Recovery instance ID: %d\n"), reformerCtrl.recoveryInstId); + printf(_("Cluster status: %s\n"), SSClusterState(reformerCtrl.clusterStatus)); } int main(int argc, char* argv[]) @@ -390,7 +407,7 @@ int main(int argc, char* argv[]) exit_safely(2); } display_control_page(ControlFile, display_id, display_all); - } + } /* get the last page from the the pg_control in shared storage mode */ if (enable_dss && display_id > MAX_INSTANCEID) { diff --git a/src/common/backend/parser/analyze.cpp b/src/common/backend/parser/analyze.cpp index 626343653..e97145377 100644 --- a/src/common/backend/parser/analyze.cpp +++ b/src/common/backend/parser/analyze.cpp @@ -614,7 +614,8 @@ Query* transformStmt(ParseState* pstate, Node* parseTree, bool isFirstNode, bool if (nodeTag(parseTree) != T_InsertStmt) { result->rightRefState = nullptr; } - + + PreventCommandDuringSSOndemandRecovery(parseTree); return result; } diff --git a/src/common/backend/utils/error/elog.cpp b/src/common/backend/utils/error/elog.cpp index d9c2b8200..e20efa624 100644 --- a/src/common/backend/utils/error/elog.cpp +++ b/src/common/backend/utils/error/elog.cpp @@ -4284,6 +4284,54 @@ static void append_with_tabs(StringInfo buf, const char* str) } } +/* + * Reaper -- get current time. 
+ */ +void get_time_now(char* nowTime, int timeLen) +{ + time_t formatTime; + struct timeval current = {0}; + const int tmpBufSize = 32; + char tmpBuf[tmpBufSize] = {0}; + + if (nowTime == NULL || timeLen == 0) { + return; + } + + (void)gettimeofday(&current, NULL); + formatTime = current.tv_sec; + struct tm* pTime = localtime(&formatTime); + strftime(tmpBuf, sizeof(tmpBuf), "%Y-%m-%d %H:%M:%S", pTime); + + errno_t rc = sprintf_s(nowTime, timeLen - 1, "%s.%ld ", tmpBuf, current.tv_usec / 1000); + securec_check_ss(rc, "\0", "\0"); +} + +void write_stderr_with_prefix(const char* fmt, ...) +{ + va_list ap; + const int timeBufSize = 256; + const int bufSize = 2048; + char timeBuf[timeBufSize] = {0}; + char buf[bufSize] = {0}; + + /* syslogger thread can not write log into pipe */ + if (t_thrd.role == SYSLOGGER) { + return; + } + + get_time_now(timeBuf, timeBufSize); + + fmt = _(fmt); + va_start(ap, fmt); + errno_t rc = sprintf_s(buf, bufSize - 1, "%s[%lu] %s\n", timeBuf, t_thrd.proc_cxt.MyProcPid, fmt); + securec_check_ss(rc, "\0", "\0"); + + vfprintf(stderr, buf, ap); + fflush(stderr); + va_end(ap); +} + /* * Write errors to stderr (or by equal means when stderr is * not available). Used before ereport/elog can be used diff --git a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp index 9acbdf5b9..879821fd0 100644 --- a/src/common/backend/utils/init/globals.cpp +++ b/src/common/backend/utils/init/globals.cpp @@ -75,12 +75,13 @@ bool will_shutdown = false; * NEXT | 92899 | ? | ? * ********************************************/ -const uint32 GRAND_VERSION_NUM = 92900; +const uint32 GRAND_VERSION_NUM = 92901; /******************************************** * 2.VERSION NUM FOR EACH FEATURE * Please write indescending order. 
********************************************/ +const uint32 ONDEMAND_REDO_VERSION_NUM = 92901; const uint32 SRF_FUSION_VERSION_NUM = 92847; const uint32 INDEX_HINT_VERSION_NUM = 92845; const uint32 INNER_UNIQUE_VERSION_NUM = 92845; diff --git a/src/common/backend/utils/init/miscinit.cpp b/src/common/backend/utils/init/miscinit.cpp index 72972fed8..aa5483b8d 100644 --- a/src/common/backend/utils/init/miscinit.cpp +++ b/src/common/backend/utils/init/miscinit.cpp @@ -63,6 +63,7 @@ #include "utils/lsyscache.h" #include "gs_policy/policy_common.h" #include "storage/file/fio_device.h" +#include "ddes/dms/ss_reform_common.h" #ifdef ENABLE_MULTIPLE_NODES #include "tsdb/compaction/compaction_entry.h" @@ -2039,6 +2040,17 @@ void register_backend_version(uint32 backend_version){ } } +void SSUpgradeFileBeforeCommit() +{ + // upgrade reform control file + if (pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) { + if (SS_PRIMARY_MODE) { + SSReadControlFile(REFORM_CTRL_PAGE); + SSSaveReformerCtrl(true); + } + } +} + /* * Check whether the version contains the backend_version parameter. */ @@ -2092,14 +2104,58 @@ void ss_initdwsubdir(char *dssdir, int instance_id) g_instance.datadir_cxt.dw_subdir_cxt.dwStorageType = (uint8)DEV_TYPE_DSS; } -/* - * Check whether dss connect is successful. 
- */ +void initDssPath(char *dssdir) +{ + errno_t rc = EOK; + + rc = snprintf_s(g_instance.datadir_cxt.baseDir, MAXPGPATH, MAXPGPATH - 1, "%s/base", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.globalDir, MAXPGPATH, MAXPGPATH - 1, "%s/global", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.locationDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_location", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.tblspcDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_tblspc", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.clogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_clog", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.csnlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_csnlog", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.serialDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_serial", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.twophaseDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_twophase", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.multixactDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_multixact", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.xlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_xlog%d", dssdir, + g_instance.attr.attr_storage.dms_attr.instance_id); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.controlPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.controlBakPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control.backup", + dssdir); + securec_check_ss(rc, "", ""); + + ss_initdwsubdir(dssdir, g_instance.attr.attr_storage.dms_attr.instance_id); +} + void initDSSConf(void) { if (!ENABLE_DSS) { return; } + + // check whether dss connect is successful. 
if (!dss_exist_dir(g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name)) { ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("Could not connect dssserver, vgname: \"%s\", socketpath: \"%s\"", @@ -2107,48 +2163,12 @@ void initDSSConf(void) g_instance.attr.attr_storage.dss_attr.ss_dss_conn_path), errhint("Check vgname and socketpath and restart later."))); } else { - errno_t rc = EOK; char *dssdir = g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name; - rc = snprintf_s(g_instance.datadir_cxt.baseDir, MAXPGPATH, MAXPGPATH - 1, "%s/base", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.globalDir, MAXPGPATH, MAXPGPATH - 1, "%s/global", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.locationDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_location", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.tblspcDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_tblspc", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.clogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_clog", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.csnlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_csnlog", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.serialDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_serial", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.twophaseDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_twophase", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.multixactDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_multixact", dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.xlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_xlog%d", dssdir, - g_instance.attr.attr_storage.dms_attr.instance_id); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.controlPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control", 
dssdir); - securec_check_ss(rc, "", ""); - - rc = snprintf_s(g_instance.datadir_cxt.controlBakPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control.backup", - dssdir); - securec_check_ss(rc, "", ""); - - ss_initdwsubdir(dssdir, g_instance.attr.attr_storage.dms_attr.instance_id); + // do not overwrite + if (strncmp(g_instance.datadir_cxt.baseDir, dssdir, strlen(dssdir)) != 0) { + initDssPath(dssdir); + } } /* set xlog seg size to 1GB */ diff --git a/src/common/backend/utils/misc/guc-file.l b/src/common/backend/utils/misc/guc-file.l index 3a7096b9c..fc79beb87 100644 --- a/src/common/backend/utils/misc/guc-file.l +++ b/src/common/backend/utils/misc/guc-file.l @@ -330,6 +330,7 @@ ProcessConfigFile(GucContext context) case MASTER_THREAD: { if (strcmp(item->name, "upgrade_mode") == 0) { if (strcmp(pre_value, "0") != 0 && strcmp(post_value, "0") == 0) { + SSUpgradeFileBeforeCommit(); pg_atomic_write_u32(&WorkingGrandVersionNum, GRAND_VERSION_NUM); } } diff --git a/src/common/backend/utils/misc/guc/guc_storage.cpp b/src/common/backend/utils/misc/guc/guc_storage.cpp index 931a0044c..650561b61 100755 --- a/src/common/backend/utils/misc/guc/guc_storage.cpp +++ b/src/common/backend/utils/misc/guc/guc_storage.cpp @@ -215,6 +215,7 @@ static bool check_ss_rdma_work_config(char** newval, void** extra, GucSource sou static bool check_ss_dss_vg_name(char** newval, void** extra, GucSource source); static bool check_ss_dss_conn_path(char** newval, void** extra, GucSource source); static bool check_ss_enable_ssl(bool* newval, void** extra, GucSource source); +static bool check_ss_enable_ondemand_recovery(bool* newval, void** extra, GucSource source); static void assign_ss_enable_aio(bool newval, void *extra); #ifdef USE_ASSERT_CHECKING static void assign_ss_enable_verify_page(bool newval, void *extra); @@ -1035,6 +1036,19 @@ static void InitStorageConfigureNamesBool() NULL, NULL}, + {{"ss_enable_ondemand_recovery", + PGC_POSTMASTER, + NODE_SINGLENODE, + SHARED_STORAGE_OPTIONS, + 
gettext_noop("Whether use on-demand recovery"), + NULL, + GUC_SUPERUSER_ONLY}, + &g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery, + false, + check_ss_enable_ondemand_recovery, + NULL, + NULL}, + #ifdef USE_ASSERT_CHECKING {{"ss_enable_verify_page", PGC_SIGHUP, @@ -3608,7 +3622,21 @@ static void InitStorageConfigureNamesInt() 64, NULL, NULL, - NULL}, + NULL}, + {{"ss_ondemand_recovery_mem_size", + PGC_POSTMASTER, + NODE_ALL, + SHARED_STORAGE_OPTIONS, + gettext_noop("Sets the number of on-demand recovery memory buffers."), + NULL, + GUC_SUPERUSER_ONLY | GUC_UNIT_KB}, + &g_instance.attr.attr_storage.dms_attr.ondemand_recovery_mem_size, + 4194304, + 1048576, + 104857600, + NULL, + NULL, + NULL}, /* End-of-list marker */ {{NULL, (GucContext)0, @@ -6031,6 +6059,17 @@ static bool check_ss_enable_ssl(bool *newval, void **extra, GucSource source) return true; } +static bool check_ss_enable_ondemand_recovery(bool* newval, void** extra, GucSource source) +{ + if (*newval) { + if (pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) { + ereport(ERROR, (errmsg("Do not allow enable ondemand_recovery if openGauss run in old version."))); + return false; + } + } + return true; +} + #ifdef USE_ASSERT_CHECKING static void assign_ss_enable_verify_page(bool newval, void *extra) { diff --git a/src/common/backend/utils/misc/postgresql_single.conf.sample b/src/common/backend/utils/misc/postgresql_single.conf.sample index 60d8778f3..1194424f5 100644 --- a/src/common/backend/utils/misc/postgresql_single.conf.sample +++ b/src/common/backend/utils/misc/postgresql_single.conf.sample @@ -849,3 +849,5 @@ job_queue_processes = 10 # Number of concurrent jobs, optional: [0..1000] #ss_log_backup_file_count = 10 #ss_log_max_file_size = 10MB #ss_parallel_thread_count = 16 +#ss_enable_ondemand_recovery = off +#ss_ondemand_recovery_mem_size = 4GB # min: 1GB, max: 100GB diff --git a/src/gausskernel/CMakeLists.txt b/src/gausskernel/CMakeLists.txt index 
839f5dc79..29fa43a5c 100755 --- a/src/gausskernel/CMakeLists.txt +++ b/src/gausskernel/CMakeLists.txt @@ -170,6 +170,7 @@ list(APPEND gaussdb_objects $ $ $ + $ $ $ $ diff --git a/src/gausskernel/ddes/adapter/ss_dms.cpp b/src/gausskernel/ddes/adapter/ss_dms.cpp index 2dfdb8750..5f4304530 100644 --- a/src/gausskernel/ddes/adapter/ss_dms.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms.cpp @@ -125,6 +125,7 @@ int ss_dms_func_init() SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_init_logger)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_refresh_logger)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_validate_drc)); + SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_reform_req_opengauss_ondemand_redo_buffer)); g_ss_dms_func.inited = true; return DMS_SUCCESS; } @@ -333,4 +334,10 @@ void dms_validate_drc(dms_context_t *dms_ctx, dms_buf_ctrl_t *ctrl, unsigned lon unsigned char is_dirty) { return g_ss_dms_func.dms_validate_drc(dms_ctx, ctrl, lsn, is_dirty); +} + +int dms_reform_req_opengauss_ondemand_redo_buffer(dms_context_t *dms_ctx, void *block_key, unsigned int key_len, + int *redo_status) +{ + return g_ss_dms_func.dms_reform_req_opengauss_ondemand_redo_buffer(dms_ctx, block_key, key_len, redo_status); } \ No newline at end of file diff --git a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp index bc1700887..0cf347208 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp @@ -28,6 +28,7 @@ #include "storage/smgr/segment.h" #include "utils/resowner.h" #include "ddes/dms/ss_dms_bufmgr.h" +#include "ddes/dms/ss_reform_common.h" #include "securec_check.h" #include "miscadmin.h" #include "access/double_write.h" @@ -244,7 +245,7 @@ void SmgrNetPageCheckDiskLSN(BufferDesc *buf_desc, ReadBufferMode read_mode, con if ((lsn_on_mem != InvalidXLogRecPtr) && (lsn_on_disk > lsn_on_mem)) { RelFileNode rnode = buf_desc->tag.rnode; int elevel = WARNING; - if (!RecoveryInProgress()) { + if 
(!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY) { elevel = PANIC; } ereport(elevel, (errmsg("[%d/%d/%d/%d/%d %d-%d] memory lsn(0x%llx) is less than disk lsn(0x%llx)", @@ -302,6 +303,15 @@ Buffer TerminateReadPage(BufferDesc* buf_desc, ReadBufferMode read_mode, const X ClearReadHint(buf_desc->buf_id); TerminateBufferIO(buf_desc, false, BM_VALID); + + /* + * we need redo items to get lastest page in ondemand recovery + */ + if (t_thrd.role != PAGEREDO && SS_ONDEMAND_BUILD_DONE && SS_PRIMARY_MODE && + !LWLockHeldByMe(buf_desc->content_lock)) { + buf_desc = RedoForOndemandExtremeRTOQuery(buf_desc, RELPERSISTENCE_PERMANENT, buf_desc->tag.forkNum, + buf_desc->tag.blockNum, read_mode); + } return buffer; } @@ -472,12 +482,62 @@ Buffer DmsReadPage(Buffer buffer, LWLockMode mode, ReadBufferMode read_mode, boo return buffer; } + // standby node must notify primary node for prepare lastest page in ondemand recovery + if (SS_STANDBY_ONDEMAND_RECOVERY) { + while (!SSOndemandRequestPrimaryRedo(buf_desc->tag)) { + SSReadControlFile(REFORM_CTRL_PAGE); + if (SS_STANDBY_ONDEMAND_NORMAL) { + break; // ondemand recovery finish, skip + } else if (SS_STANDBY_ONDEMAND_BUILD) { + return 0; // in new reform + } + // still need requset page + } + } + if (!StartReadPage(buf_desc, mode)) { return 0; } return TerminateReadPage(buf_desc, read_mode, OidIsValid(buf_ctrl->pblk_relno) ? 
&pblk : NULL); } +bool SSOndemandRequestPrimaryRedo(BufferTag tag) +{ + dms_context_t dms_ctx; + int32 redo_status = ONDEMAND_REDO_INVALID; + + if (!SS_STANDBY_ONDEMAND_RECOVERY) { + return true; + } + + ereport(DEBUG1, + (errmodule(MOD_DMS), + errmsg("[On-demand] start request primary node redo page, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u", + tag.rnode.spcNode, tag.rnode.dbNode, tag.rnode.relNode, tag.rnode.bucketNode, tag.forkNum, + tag.blockNum))); + InitDmsContext(&dms_ctx); + dms_ctx.xmap_ctx.dest_id = (unsigned int)SS_PRIMARY_ID; + if (dms_reform_req_opengauss_ondemand_redo_buffer(&dms_ctx, &tag, + (unsigned int)sizeof(BufferTag), &redo_status) != DMS_SUCCESS) { + ereport(LOG, + (errmodule(MOD_DMS), + errmsg("[on-demand] request primary node redo page failed, page id [%d/%d/%d/%d/%d %d-%d], " + "redo statu %d", tag.rnode.spcNode, tag.rnode.dbNode, tag.rnode.relNode, (int)tag.rnode.bucketNode, + (int)tag.rnode.opt, tag.forkNum, tag.blockNum, redo_status))); + return false; + } + ereport(DEBUG1, + (errmodule(MOD_DMS), + errmsg("[On-demand] end request primary node redo page, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u, " + "redo status %d", tag.rnode.spcNode, tag.rnode.dbNode, tag.rnode.relNode, tag.rnode.bucketNode, + tag.forkNum, tag.blockNum, redo_status))); + + if (redo_status != ONDEMAND_REDO_DONE) { + SSReadControlFile(REFORM_CTRL_PAGE); + } + return true; +} + bool DmsReleaseOwner(BufferTag buf_tag, int buf_id) { dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_id); @@ -674,7 +734,8 @@ bool CheckPageNeedSkipInRecovery(Buffer buf) dms_session_e DMSGetProcType4RequestPage() { // proc type used in DMS request page - if (AmDmsReformProcProcess() || AmPageRedoProcess() || AmStartupProcess()) { + if (AmDmsReformProcProcess() || (AmPageRedoProcess() && !SS_ONDEMAND_BUILD_DONE) || + (AmStartupProcess() && !SS_ONDEMAND_BUILD_DONE)) { /* When xlog_file_path is not null and enable_dms is set on, main standby always is in recovery. 
* When pmState is PM_HOT_STANDBY, this case indicates main standby support to read only. So here * DMS_SESSION_RECOVER_HOT_STANDBY will be returned, it indicates that normal threads can access diff --git a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp index b61c1a970..c519479d8 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp @@ -1714,6 +1714,7 @@ static int CBReformDoneNotify(void *db_handle) g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false; g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false; g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE; + SSReadControlFile(REFORM_CTRL_PAGE); Assert(g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy == false); ereport(LOG, (errmodule(MOD_DMS), @@ -1831,6 +1832,57 @@ void DmsCallbackThreadShmemInit(unsigned char need_startup, char **reg_data) t_thrd.postgres_cxt.whereToSendOutput = (int)DestNone; } +int CBOndemandRedoPageForStandby(void *block_key, int32 *redo_status) +{ + BufferTag* tag = (BufferTag *)block_key; + + Assert(SS_PRIMARY_MODE); + // do nothing if not in ondemand recovery + if (!SS_IN_ONDEMAND_RECOVERY) { + ereport(DEBUG1, (errmsg("[On-demand] ignore redo page request, spc/db/rel/bucket " + "fork-block: %u/%u/%u/%d %d-%u", tag->rnode.spcNode, tag->rnode.dbNode, + tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum, tag->blockNum))); + *redo_status = ONDEMAND_REDO_SKIP; + return GS_SUCCESS;; + } + + Buffer buffer; + SegSpace *spc = NULL; + uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount; + *redo_status = ONDEMAND_REDO_DONE; + PG_TRY(); + { + if (IsSegmentPhysicalRelNode(tag->rnode)) { + spc = spc_open(tag->rnode.spcNode, tag->rnode.dbNode, false, false); + buffer = ReadBufferFast(spc, tag->rnode, tag->forkNum, tag->blockNum, RBM_NORMAL); + } else { + buffer = ReadBufferWithoutRelcache(tag->rnode, tag->forkNum, tag->blockNum, 
RBM_NORMAL, NULL, NULL); + } + ReleaseBuffer(buffer); + } + PG_CATCH(); + { + t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount; + /* Save error info */ + ErrorData* edata = CopyErrorData(); + FlushErrorState(); + FreeErrorData(edata); + ereport(PANIC, (errmsg("[On-demand] Error happend when primary redo page for standby, spc/db/rel/bucket " + "fork-block: %u/%u/%u/%d %d-%u", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, + tag->rnode.bucketNode, tag->forkNum, tag->blockNum))); + } + PG_END_TRY(); + + if (BufferIsInvalid(buffer)) { + *redo_status = ONDEMAND_REDO_FAIL; + } + + ereport(DEBUG1, (errmsg("[On-demand] redo page for standby done, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u, " + "redo status: %d", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, + tag->rnode.bucketNode, tag->forkNum, tag->blockNum, *redo_status))); + return GS_SUCCESS;; +} + void DmsInitCallback(dms_callback_t *callback) { // used in reform @@ -1850,6 +1902,7 @@ void DmsInitCallback(dms_callback_t *callback) callback->failover_promote_opengauss = CBFailoverPromote; callback->reform_start_notify = CBReformStartNotify; callback->reform_set_dms_role = CBReformSetDmsRole; + callback->opengauss_ondemand_redo_buffer = CBOndemandRedoPageForStandby; callback->get_page_hash_val = CBPageHashCode; callback->read_local_page4transfer = CBEnterLocalPage; diff --git a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp index e9ae32649..158d90615 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp @@ -119,7 +119,13 @@ bool SSRecoveryNodes() break; } LWLockRelease(ControlFileLock); - + + /* do not wait when on-demand HashMap build done */ + if (SS_ONDEMAND_BUILD_DONE) { + result = true; + break; + } + /* If main standby is set hot standby to on, when it reach consistency or recovery all xlogs in disk, * recovery phase could be regarded successful in 
hot_standby thus set pmState = PM_HOT_STANDBY, which * indicate database systerm is ready to accept read only connections. @@ -149,98 +155,8 @@ bool SSRecoveryApplyDelay() return true; } -void SSReadControlFile(int id, bool updateDmsCtx) -{ - pg_crc32c crc; - errno_t rc = EOK; - int fd = -1; - char *fname = NULL; - bool retry = false; - int read_size = 0; - int len = 0; - fname = XLOG_CONTROL_FILE; - -loop: - fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); - if (fd < 0) { - ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname))); - } - - off_t seekpos = (off_t)BLCKSZ * id; - - if (id == REFORM_CTRL_PAGE) { - len = sizeof(ss_reformer_ctrl_t); - } else { - len = sizeof(ControlFileData); - } - - read_size = (int)BUFFERALIGN(len); - char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER))); - if (pread(fd, buffer, read_size, seekpos) != read_size) { - ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m"))); - } - - if (id == REFORM_CTRL_PAGE) { - rc = memcpy_s(&g_instance.dms_cxt.SSReformerControl, len, buffer, len); - securec_check(rc, "", ""); - if (close(fd) < 0) { - ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m"))); - } - - /* Now check the CRC. 
*/ - INIT_CRC32C(crc); - COMP_CRC32C(crc, (char *)&g_instance.dms_cxt.SSReformerControl, offsetof(ss_reformer_ctrl_t, crc)); - FIN_CRC32C(crc); - - if (!EQ_CRC32C(crc, g_instance.dms_cxt.SSReformerControl.crc)) { - if (retry == false) { - ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname))); - fname = XLOG_CONTROL_FILE_BAK; - retry = true; - goto loop; - } else { - ereport(FATAL, (errmsg("incorrect checksum in control file"))); - } - } - } else { - ControlFileData* controlFile = NULL; - ControlFileData tempControlFile; - if (updateDmsCtx) { - controlFile = &tempControlFile; - } else { - controlFile = t_thrd.shemem_ptr_cxt.ControlFile; - } - - rc = memcpy_s(controlFile, (size_t)len, buffer, (size_t)len); - securec_check(rc, "", ""); - if (close(fd) < 0) { - ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m"))); - } - - /* Now check the CRC. */ - INIT_CRC32C(crc); - COMP_CRC32C(crc, (char *)controlFile, offsetof(ControlFileData, crc)); - FIN_CRC32C(crc); - - if (!EQ_CRC32C(crc, controlFile->crc)) { - if (retry == false) { - ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname))); - fname = XLOG_CONTROL_FILE_BAK; - retry = true; - goto loop; - } else { - ereport(FATAL, (errmsg("incorrect checksum in control file"))); - } - } - - if (XLByteLE(g_instance.dms_cxt.ckptRedo, controlFile->checkPointCopy.redo)) { - g_instance.dms_cxt.ckptRedo = controlFile->checkPointCopy.redo; - } - } -} - /* initialize reformer ctrl parameter when initdb */ -void SSWriteReformerControlPages(void) +void SSInitReformerControlPages(void) { /* * If already exists control file, reformer page must have been initialized @@ -268,6 +184,9 @@ void SSWriteReformerControlPages(void) Assert(!dss_exist_file(XLOG_CONTROL_FILE)); g_instance.dms_cxt.SSReformerControl.list_stable = 0; g_instance.dms_cxt.SSReformerControl.primaryInstId = SS_MY_INST_ID; + 
g_instance.dms_cxt.SSReformerControl.recoveryInstId = INVALID_INSTANCEID; + g_instance.dms_cxt.SSReformerControl.version = REFORM_CTRL_VERSION; + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL; (void)printf("[SS] Current node:%d initdb first, will become PRIMARY for first-time SS cluster startup.\n", SS_MY_INST_ID); @@ -370,4 +289,4 @@ void ss_switchover_promoting_dw_init() dw_init(); g_instance.dms_cxt.dw_init = true; ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS switchover] dw init finished"))); -} \ No newline at end of file +} diff --git a/src/gausskernel/ddes/adapter/ss_reform_common.cpp b/src/gausskernel/ddes/adapter/ss_reform_common.cpp index 4aa330fa4..c3f1bedfb 100644 --- a/src/gausskernel/ddes/adapter/ss_reform_common.cpp +++ b/src/gausskernel/ddes/adapter/ss_reform_common.cpp @@ -99,12 +99,16 @@ int SSXLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources, char* xlog_ return -1; } -bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf) +int SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf, + int readLen) { uint32 preReadOff; XLogRecPtr xlogFlushPtrForPerRead = xlogreader->xlogFlushPtrForPerRead; bool isReadFile = true; + Assert(readLen > 0); + Assert(readLen <= XLogPreReadSize); + do { /* * That source is XLOG_FROM_STREAM indicate that walreceiver receive xlog and walrecwriter have wrriten xlog @@ -124,7 +128,7 @@ bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, X if ((XLByteInPreReadBuf(targetPagePtr, xlogreader->preReadStartPtr) && !SS_STANDBY_CLUSTER_MAIN_STANDBY) || (!isReadFile)) { preReadOff = targetPagePtr % XLogPreReadSize; - int err = memcpy_s(buf, XLOG_BLCKSZ, xlogreader->preReadBuf + preReadOff, XLOG_BLCKSZ); + int err = memcpy_s(buf, readLen, xlogreader->preReadBuf + preReadOff, readLen); securec_check(err, "\0", "\0"); break; } else { @@ -143,7 +147,7 @@ bool 
SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, X } } while (true); - return true; + return readLen; } XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data, Size alignedSize) @@ -178,28 +182,131 @@ XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private return state; } -void SSGetXlogPath() +void SSGetRecoveryXlogPath() { - int primaryId = -1; errno_t rc = EOK; char *dssdir = g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name; - /* get primary inst id */ - primaryId = SSGetPrimaryInstId(); - rc = snprintf_s(g_instance.dms_cxt.SSRecoveryInfo.recovery_xlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_xlog%d", - dssdir, primaryId); + dssdir, g_instance.dms_cxt.SSReformerControl.recoveryInstId); securec_check_ss(rc, "", ""); } -void SSSaveReformerCtrl() +static void SSSaveOldReformerCtrl() +{ + ss_reformer_ctrl_t new_ctrl = g_instance.dms_cxt.SSReformerControl; + ss_old_reformer_ctrl_t old_ctrl = {new_ctrl.list_stable, new_ctrl.primaryInstId, new_ctrl.crc}; + + int len = sizeof(ss_old_reformer_ctrl_t); + int write_size = (int)BUFFERALIGN(len); + char buffer[write_size] __attribute__((__aligned__(ALIGNOF_BUFFER))) = { 0 }; + char *fname[2]; + int fd = -1; + + errno_t err = memcpy_s(&buffer, write_size, &old_ctrl, len); + securec_check(err, "\0", "\0"); + + INIT_CRC32C(((ss_old_reformer_ctrl_t *)buffer)->crc); + COMP_CRC32C(((ss_old_reformer_ctrl_t *)buffer)->crc, (char *)buffer, offsetof(ss_old_reformer_ctrl_t, crc)); + FIN_CRC32C(((ss_old_reformer_ctrl_t *)buffer)->crc); + + fname[0] = XLOG_CONTROL_FILE_BAK; + fname[1] = XLOG_CONTROL_FILE; + + for (int i = 0; i < BAK_CTRL_FILE_NUM; i++) { + if (i == 0) { + fd = BasicOpenFile(fname[i], O_CREAT | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + } else { + fd = BasicOpenFile(fname[i], O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + } + + if (fd < 0) { + ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname[i]))); + 
} + + SSWriteInstanceControlFile(fd, buffer, REFORM_CTRL_PAGE, write_size); + if (close(fd)) { + ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m"))); + } + } +} + +static bool SSReadOldReformerCtrl() +{ + ss_reformer_ctrl_t *new_ctrl = &g_instance.dms_cxt.SSReformerControl; + ss_old_reformer_ctrl_t old_ctrl; + pg_crc32c crc; + int fd = -1; + bool retry = false; + char *fname = XLOG_CONTROL_FILE; + +loop: + fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + if (fd < 0) { + ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname))); + } + + off_t seekpos = (off_t)BLCKSZ * REFORM_CTRL_PAGE; + int len = sizeof(ss_old_reformer_ctrl_t); + + int read_size = (int)BUFFERALIGN(len); + char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER))); + if (pread(fd, buffer, read_size, seekpos) != read_size) { + ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m"))); + } + + errno_t rc = memcpy_s(&old_ctrl, len, buffer, len); + securec_check(rc, "", ""); + if (close(fd) < 0) { + ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m"))); + } + + /* Now check the CRC. 
*/ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *)&old_ctrl, offsetof(ss_old_reformer_ctrl_t, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, old_ctrl.crc)) { + if (retry == false) { + ereport(WARNING, + (errmsg("control file \"%s\" contains incorrect checksum in upgrade mode, try backup file", fname))); + fname = XLOG_CONTROL_FILE_BAK; + retry = true; + goto loop; + } else { + ereport(WARNING, + (errmsg("backup control file \"%s\" contains incorrect checksum in upgrade mode, " + "try again in post-upgrade mode", fname))); + return false; + } + } + + // new params set to initial value + new_ctrl->version = REFORM_CTRL_VERSION;; + new_ctrl->recoveryInstId = INVALID_INSTANCEID; + new_ctrl->clusterStatus = CLUSTER_NORMAL; + + // exist param inherit + new_ctrl->primaryInstId = old_ctrl.primaryInstId; + new_ctrl->list_stable = old_ctrl.list_stable; + new_ctrl->crc = old_ctrl.crc; + + return true; +} + +void SSSaveReformerCtrl(bool force) { int fd = -1; int len; errno_t err = EOK; char *fname[2]; - len = sizeof(ss_reformer_ctrl_t); + if ((pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) && !force) { + SSSaveOldReformerCtrl(); + return; + } + + len = sizeof(ss_reformer_ctrl_t); int write_size = (int)BUFFERALIGN(len); char buffer[write_size] __attribute__((__aligned__(ALIGNOF_BUFFER))) = { 0 }; @@ -231,6 +338,110 @@ void SSSaveReformerCtrl() } } +void SSReadControlFile(int id, bool updateDmsCtx) +{ + pg_crc32c crc; + errno_t rc = EOK; + int fd = -1; + char *fname = NULL; + bool retry = false; + int read_size = 0; + int len = 0; + fname = XLOG_CONTROL_FILE; + + if ((pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) && (id == REFORM_CTRL_PAGE)) { + if (SSReadOldReformerCtrl()) { + return; + } + + // maybe primary node already upgrade pg_control file, sleep and try read in lastest mode again + if (SS_STANDBY_MODE) { + pg_usleep(5000000); /* 5 sec */ + goto loop; + } else { + ereport(PANIC, (errmsg("incorrect checksum in control 
file"))); + } + } + +loop: + fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); + if (fd < 0) { + ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname))); + } + + off_t seekpos = (off_t)BLCKSZ * id; + + if (id == REFORM_CTRL_PAGE) { + len = sizeof(ss_reformer_ctrl_t); + } else { + len = sizeof(ControlFileData); + } + + read_size = (int)BUFFERALIGN(len); + char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER))); + if (pread(fd, buffer, read_size, seekpos) != read_size) { + ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m"))); + } + + if (id == REFORM_CTRL_PAGE) { + rc = memcpy_s(&g_instance.dms_cxt.SSReformerControl, len, buffer, len); + securec_check(rc, "", ""); + if (close(fd) < 0) { + ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m"))); + } + + /* Now check the CRC. */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *)&g_instance.dms_cxt.SSReformerControl, offsetof(ss_reformer_ctrl_t, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, g_instance.dms_cxt.SSReformerControl.crc)) { + if (retry == false) { + ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname))); + fname = XLOG_CONTROL_FILE_BAK; + retry = true; + goto loop; + } else { + ereport(FATAL, (errmsg("incorrect checksum in control file"))); + } + } + } else { + ControlFileData* controlFile = NULL; + ControlFileData tempControlFile; + if (updateDmsCtx) { + controlFile = &tempControlFile; + } else { + controlFile = t_thrd.shemem_ptr_cxt.ControlFile; + } + + rc = memcpy_s(controlFile, (size_t)len, buffer, (size_t)len); + securec_check(rc, "", ""); + if (close(fd) < 0) { + ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m"))); + } + + /* Now check the CRC. 
*/ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *)controlFile, offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, controlFile->crc)) { + if (retry == false) { + ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname))); + fname = XLOG_CONTROL_FILE_BAK; + retry = true; + goto loop; + } else { + ereport(FATAL, (errmsg("incorrect checksum in control file"))); + } + } + + if (XLByteLE(g_instance.dms_cxt.ckptRedo, controlFile->checkPointCopy.redo)) { + g_instance.dms_cxt.ckptRedo = controlFile->checkPointCopy.redo; + } + } +} + void SSClearSegCache() { (void)LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE); diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index 7896e89bd..9672942fb 100644 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -3027,6 +3027,13 @@ int PostmasterMain(int argc, char* argv[]) } ereport(LOG, (errmsg("[SS reform] Success: node:%d wait for PRIMARY:%d to finish 1st reform", g_instance.attr.attr_storage.dms_attr.instance_id, src_id))); + + while (SS_OFFICIAL_RECOVERY_NODE && SS_CLUSTER_NOT_NORAML) { + pg_usleep(SLEEP_ONE_SEC); + SSReadControlFile(REFORM_CTRL_PAGE); + ereport(WARNING, (errmsg("[on-demand] node%d is last primary node, waiting for on-demand recovery done", + g_instance.attr.attr_storage.dms_attr.instance_id))); + } } } @@ -3063,8 +3070,6 @@ int PostmasterMain(int argc, char* argv[]) } } - - /* * We're ready to rock and roll... 
*/ @@ -4001,7 +4006,8 @@ static int ServerLoop(void) (AutoVacuumingActive() || t_thrd.postmaster_cxt.start_autovac_launcher) && pmState == PM_RUN && !dummyStandbyMode && u_sess->attr.attr_common.upgrade_mode != 1 && !g_instance.streaming_dr_cxt.isInSwitchover && - !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) { + !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM && + !SS_IN_ONDEMAND_RECOVERY) { g_instance.pid_cxt.AutoVacPID = initialize_util_thread(AUTOVACUUM_LAUNCHER); if (g_instance.pid_cxt.AutoVacPID != 0) @@ -6621,29 +6627,6 @@ dms_demote: PostmasterStateMachine(); } -/* - * Reaper -- get current time. - */ -static void GetTimeNowForReaperLog(char* nowTime, int timeLen) -{ - time_t formatTime; - struct timeval current = {0}; - const int tmpBufSize = 32; - char tmpBuf[tmpBufSize] = {0}; - - if (nowTime == NULL || timeLen == 0) { - return; - } - - (void)gettimeofday(¤t, NULL); - formatTime = current.tv_sec; - struct tm* pTime = localtime(&formatTime); - strftime(tmpBuf, sizeof(tmpBuf), "%Y-%m-%d %H:%M:%S", pTime); - - errno_t rc = sprintf_s(nowTime, timeLen - 1, "%s.%ld ", tmpBuf, current.tv_usec / 1000); - securec_check_ss(rc, "\0", "\0"); -} - /* * Reaper -- encap reaper prefix log. 
*/ @@ -6653,7 +6636,7 @@ static char* GetReaperLogPrefix(char* buf, int bufLen) char timeBuf[bufSize] = {0}; errno_t rc; - GetTimeNowForReaperLog(timeBuf, bufSize); + get_time_now(timeBuf, bufSize); rc = memset_s(buf, bufLen, 0, bufLen); securec_check(rc, "\0", "\0"); @@ -6859,7 +6842,8 @@ static void reaper(SIGNAL_ARGS) if (!u_sess->proc_cxt.IsBinaryUpgrade && AutoVacuumingActive() && g_instance.pid_cxt.AutoVacPID == 0 && !dummyStandbyMode && u_sess->attr.attr_common.upgrade_mode != 1 && !g_instance.streaming_dr_cxt.isInSwitchover && - !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) + !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM && + !SS_IN_ONDEMAND_RECOVERY) g_instance.pid_cxt.AutoVacPID = initialize_util_thread(AUTOVACUUM_LAUNCHER); if (SS_REFORM_PARTNER) { @@ -7013,8 +6997,10 @@ static void reaper(SIGNAL_ARGS) GetReaperLogPrefix(logBuf, ReaperLogBufSize), wal_get_role_string(get_cur_mode())); /* at this point we are really open for business */ - write_stderr("%s LOG: database system is ready to accept connections\n", - GetReaperLogPrefix(logBuf, ReaperLogBufSize)); + if (!SS_REPLAYED_BY_ONDEMAND) { + write_stderr("%s LOG: database system is ready to accept connections\n", + GetReaperLogPrefix(logBuf, ReaperLogBufSize)); + } continue; } @@ -10014,12 +10000,12 @@ static void sigusr1_handler(SIGNAL_ARGS) } if (ENABLE_DMS && (mode = CheckSwitchoverSignal())) { - if (SS_NORMAL_STANDBY && pmState == PM_RUN) { + SSReadControlFile(REFORM_CTRL_PAGE); + if (SS_NORMAL_STANDBY && pmState == PM_RUN && !SS_STANDBY_ONDEMAND_RECOVERY) { SSDoSwitchover(); } else { ereport(LOG, (errmsg("Current mode is not NORMAL STANDBY, SS switchover command ignored."))); } - } if ((mode = CheckSwitchoverSignal()) != 0 && WalRcvIsOnline() && DataRcvIsOnline() && diff --git a/src/gausskernel/process/tcop/utility.cpp b/src/gausskernel/process/tcop/utility.cpp index 306bde983..7319d69af 100755 --- 
a/src/gausskernel/process/tcop/utility.cpp +++ b/src/gausskernel/process/tcop/utility.cpp @@ -643,6 +643,30 @@ void PreventCommandDuringRecovery(const char* cmd_name) errmsg("cannot execute %s during recovery", cmd_name))); } +void PreventCommandDuringSSOndemandRecovery(Node* parseTree) +{ + switch(nodeTag(parseTree)) { + case T_InsertStmt: + case T_DeleteStmt: + case T_UpdateStmt: + case T_SelectStmt: + case T_TransactionStmt: + case T_VariableSetStmt: + case T_VariableShowStmt: + break; + default: + if (SS_IN_ONDEMAND_RECOVERY) { + ereport(ERROR, + (errcode(ERRCODE_RUN_TRANSACTION_DURING_RECOVERY), + errmsg("only support INSERT/UPDATE/DELETE/SELECT/SET/SHOW during SS on-demand recovery, " + "command %d", nodeTag(parseTree)))); + } + break; + } + + return; +} + /* * CheckRestrictedOperation: throw error for hazardous command if we're * inside a security restriction context. diff --git a/src/gausskernel/process/threadpool/knl_instance.cpp b/src/gausskernel/process/threadpool/knl_instance.cpp index 0e2373b3c..8dda0bb12 100755 --- a/src/gausskernel/process/threadpool/knl_instance.cpp +++ b/src/gausskernel/process/threadpool/knl_instance.cpp @@ -190,6 +190,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt) dms_cxt->SSRecoveryInfo.in_failover = false; dms_cxt->SSRecoveryInfo.in_flushcopy = false; dms_cxt->SSRecoveryInfo.no_backend_left = false; + dms_cxt->SSRecoveryInfo.in_ondemand_recovery = false; dms_cxt->SSRecoveryInfo.startup_need_exit_normally = false; dms_cxt->SSRecoveryInfo.recovery_trapped_in_page_request = false; dms_cxt->log_timezone = NULL; @@ -301,6 +302,8 @@ static void knl_g_parallel_redo_init(knl_g_parallel_redo_context* predo_cxt) rc = memset_s(&predo_cxt->redoCpuBindcontrl, sizeof(RedoCpuBindControl), 0, sizeof(RedoCpuBindControl)); securec_check(rc, "", ""); + + predo_cxt->redoItemHash = NULL; } static void knl_g_parallel_decode_init(knl_g_parallel_decode_context* pdecode_cxt) diff --git a/src/gausskernel/process/threadpool/knl_thread.cpp 
b/src/gausskernel/process/threadpool/knl_thread.cpp index 13d33ffe9..a4d76e0be 100755 --- a/src/gausskernel/process/threadpool/knl_thread.cpp +++ b/src/gausskernel/process/threadpool/knl_thread.cpp @@ -1707,6 +1707,14 @@ static void knl_t_dms_context_init(knl_t_dms_context *dms_cxt) securec_check(rc, "\0", "\0"); dms_cxt->flush_copy_get_page_failed = false; } + +static void knl_t_ondemand_xlog_copy_context_init(knl_t_ondemand_xlog_copy_context *ondemand_xlog_copy_cxt) +{ + ondemand_xlog_copy_cxt->openLogFile = -1; + ondemand_xlog_copy_cxt->openLogSegNo = 0; + ondemand_xlog_copy_cxt->openLogOff = 0; +} + static void knl_t_rc_init(knl_t_rc_context* rc_cxt) { errno_t rc = EOK; @@ -1889,6 +1897,7 @@ void knl_thread_init(knl_thread_role role) knl_index_advisor_init(&t_thrd.index_advisor_cxt); knl_t_sql_patch_init(&t_thrd.sql_patch_cxt); knl_t_dms_context_init(&t_thrd.dms_cxt); + knl_t_ondemand_xlog_copy_context_init(&t_thrd.ondemand_xlog_copy_cxt); KnlTApplyLauncherInit(&t_thrd.applylauncher_cxt); KnlTApplyWorkerInit(&t_thrd.applyworker_cxt); KnlTPublicationInit(&t_thrd.publication_cxt); diff --git a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp index 37207c4f0..0fb813d5b 100644 --- a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp +++ b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp @@ -913,38 +913,6 @@ void XLogRecSetSegNewPageInfo(XLogBlockSegNewPage *state, char *mainData, Size l state->dataLen = len; } - -static inline bool AtomicCompareExchangeBuffer(volatile Buffer *ptr, Buffer *expected, Buffer newval) -{ - bool ret = false; - Buffer current; - current = __sync_val_compare_and_swap(ptr, *expected, newval); - ret = (current == *expected); - *expected = current; - return ret; -} - -static inline Buffer AtomicReadBuffer(volatile Buffer *ptr) -{ - return *ptr; -} - -static inline void AtomicWriteBuffer(volatile Buffer* ptr, Buffer val) -{ - *ptr = val; -} - -static inline Buffer 
AtomicExchangeBuffer(volatile Buffer *ptr, Buffer newval) -{ - Buffer old; - while (true) { - old = AtomicReadBuffer(ptr); - if (AtomicCompareExchangeBuffer(ptr, &old, newval)) - break; - } - return old; -} - /* add for batch redo mem manager */ void *XLogMemCtlInit(RedoMemManager *memctl, Size itemsize, int itemnum) { @@ -1163,6 +1131,10 @@ void XLogRedoBufferSetState(RedoBufferManager *buffermanager, RedoMemSlot *buffe void XLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOperate *refOperate, InterruptFunc interruptOperte) { + if (SS_IN_ONDEMAND_RECOVERY) { + return OndemandXLogParseBufferInit(parsemanager, buffernum, refOperate, interruptOperte); + } + void *allocdata = NULL; allocdata = XLogMemCtlInit(&(parsemanager->memctl), (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)), buffernum); @@ -1177,6 +1149,11 @@ void XLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOpera void XLogParseBufferDestory(RedoParseManager *parsemanager) { + if (SS_IN_ONDEMAND_RECOVERY) { + OndemandXLogParseBufferDestory(parsemanager); + return; + } + g_parseManager = NULL; if (parsemanager->parsebuffers != NULL) { pfree(parsemanager->parsebuffers); @@ -1188,6 +1165,10 @@ void XLogParseBufferDestory(RedoParseManager *parsemanager) XLogRecParseState *XLogParseBufferAllocList(RedoParseManager *parsemanager, XLogRecParseState *blkstatehead, void *record) { + if (SS_IN_ONDEMAND_RECOVERY) { + return OndemandXLogParseBufferAllocList(parsemanager, blkstatehead, record); + } + RedoMemManager *memctl = &(parsemanager->memctl); RedoMemSlot *allocslot = NULL; ParseBufferDesc *descstate = NULL; @@ -1233,11 +1214,16 @@ XLogRecParseState *XLogParseBufferCopy(XLogRecParseState *srcState) securec_check(rc, "\0", "\0"); newState->isFullSync = srcState->isFullSync; + newState->distributeStatus = srcState->distributeStatus; return newState; } void XLogParseBufferRelease(XLogRecParseState *recordstate) { + if (SS_IN_ONDEMAND_RECOVERY) { + 
OndemandXLogParseBufferRelease(recordstate); + return; + } RedoMemManager *memctl = &(recordstate->manager->memctl); ParseBufferDesc *descstate = NULL; @@ -1691,7 +1677,9 @@ void ExtremeRtoFlushBuffer(RedoBufferInfo *bufferinfo, bool updateFsm) } else { if (bufferinfo->pageinfo.page != NULL) { BufferDesc *bufDesc = GetBufferDescriptor(bufferinfo->buf - 1); - if (bufferinfo->dirtyflag || XLByteLT(bufDesc->extra->lsn_on_disk, PageGetLSN(bufferinfo->pageinfo.page))) { + /* backends may mark buffer dirty already */ + if (!(bufDesc->state & BM_DIRTY) && + (bufferinfo->dirtyflag || XLByteLT(bufDesc->extra->lsn_on_disk, PageGetLSN(bufferinfo->pageinfo.page)))) { MarkBufferDirty(bufferinfo->buf); if (!bufferinfo->dirtyflag && bufferinfo->blockinfo.forknum == MAIN_FORKNUM) { int mode = WARNING; @@ -1699,8 +1687,8 @@ void ExtremeRtoFlushBuffer(RedoBufferInfo *bufferinfo, bool updateFsm) mode = PANIC; #endif const uint32 shiftSz = 32; - ereport(mode, (errmsg("extreme_rto not mark dirty:lsn %X/%X, lsn_disk %X/%X, \ - lsn_page %X/%X, page %u/%u/%u %u", + ereport(mode, (errmsg("extreme_rto not mark dirty:lsn %X/%X, lsn_disk %X/%X, " + "lsn_page %X/%X, page %u/%u/%u %u", (uint32)(bufferinfo->lsn >> shiftSz), (uint32)(bufferinfo->lsn), (uint32)(bufDesc->extra->lsn_on_disk >> shiftSz), (uint32)(bufDesc->extra->lsn_on_disk), @@ -1764,10 +1752,12 @@ bool XLogBlockRedoForExtremeRTO(XLogRecParseState *redoblocktate, RedoBufferInfo ereport(PANIC, (errmsg("XLogBlockRedoForExtremeRTO: redobuffer checkfailed"))); } if (block_valid <= BLOCK_DATA_FSM_TYPE) { - GetRedoStartTime(redoCost); - Assert(block_valid == g_xlogExtRtoRedoTable[block_valid].block_valid); - g_xlogExtRtoRedoTable[block_valid].xlog_redoextrto(blockhead, blockrecbody, bufferinfo); - CountRedoTime(redoCost); + if (redoaction != BLK_DONE) { + GetRedoStartTime(redoCost); + Assert(block_valid == g_xlogExtRtoRedoTable[block_valid].block_valid); + g_xlogExtRtoRedoTable[block_valid].xlog_redoextrto(blockhead, blockrecbody, 
bufferinfo); + CountRedoTime(redoCost); + } #ifdef USE_ASSERT_CHECKING if (block_valid != BLOCK_DATA_UNDO_TYPE && !bufferinfo->pageinfo.ignorecheck) { DoRecordCheck(redoblocktate, PageGetLSN(bufferinfo->pageinfo.page), true); @@ -1781,6 +1771,33 @@ bool XLogBlockRedoForExtremeRTO(XLogRecParseState *redoblocktate, RedoBufferInfo return false; } +void XlogBlockRedoForOndemandExtremeRTOQuery(XLogRecParseState *redoBlockState, RedoBufferInfo *bufferInfo) +{ + + XLogBlockHead *blockHead = &redoBlockState->blockparse.blockhead; + void *blockrecBody = &redoBlockState->blockparse.extra_rec; + uint16 blockValid = XLogBlockHeadGetValidInfo(blockHead); + + bool checkValid = XLogBlockRefreshRedoBufferInfo(blockHead, bufferInfo); + if (!checkValid) { + ereport(PANIC, (errmsg("XLogBlockRedoForOndemandExtremeRTOQuery: redobuffer checkfailed"))); + } + if (blockValid <= BLOCK_DATA_FSM_TYPE) { + Assert(blockValid == g_xlogExtRtoRedoTable[blockValid].block_valid); + g_xlogExtRtoRedoTable[blockValid].xlog_redoextrto(blockHead, blockrecBody, bufferInfo); +#ifdef USE_ASSERT_CHECKING + if (blockValid != BLOCK_DATA_UNDO_TYPE) { + DoRecordCheck(redoBlockState, PageGetLSN(bufferInfo->pageinfo.page), true); + } +#endif + } else { + ereport(WARNING, (errmsg("XLogBlockRedoForOndemandExtremeRTOQuery: unsuport type %u, lsn %X/%X", + (uint32)blockValid, + (uint32)(blockHead->end_ptr >> 32), + (uint32)(blockHead->end_ptr)))); + } +} + static const XLogParseBlock g_xlogParseBlockTable[RM_MAX_ID + 1] = { { xlog_redo_parse_to_block, RM_XLOG_ID }, { xact_redo_parse_to_block, RM_XACT_ID }, diff --git a/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt b/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt index 4350a807c..ccb87469a 100755 --- a/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt +++ b/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt @@ -10,7 +10,8 @@ set(TGT_extreme_rto_INC ${LIBCGROUP_INCLUDE_PATH} 
${PROJECT_SRC_DIR}/include/libcomm ${ZLIB_INCLUDE_PATH} - ${LIBCURL_INCLUDE_PATH} + ${LIBCURL_INCLUDE_PATH} + ${DCF_INCLUDE_PATH} ) set(extreme_rto_DEF_OPTIONS ${MACRO_OPTIONS}) diff --git a/src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp b/src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp index 7217f2fa5..e6e4b4da9 100644 --- a/src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp @@ -552,7 +552,7 @@ int ParallelXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, if (readSource & XLOG_FROM_STREAM) { readLen = ParallelXLogReadWorkBufRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); } else { - if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { + if (ENABLE_DMS && ENABLE_DSS) { readLen = SSXLogPageRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, xlogreader->readBuf, readTLI, NULL); } else { diff --git a/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp b/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp index 84f6d328e..12deca658 100644 --- a/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp @@ -114,14 +114,14 @@ static const f_extreme_rto_redo extreme_rto_redosw[] = { ondemand_extreme_rto::WaitAllReplayWorkerIdle, ondemand_extreme_rto::DispatchCleanInvalidPageMarkToAllRedoWorker, ondemand_extreme_rto::DispatchClosefdMarkToAllRedoWorker, - ondemand_extreme_rto::RecordBadBlockAndPushToRemote, + NULL, ondemand_extreme_rto::CheckCommittingCsnList, ondemand_extreme_rto::ReadNextXLogRecord, ondemand_extreme_rto::ExtremeRtoStopHere, ondemand_extreme_rto::WaitAllRedoWorkerQueueEmpty, ondemand_extreme_rto::GetSafeMinCheckPoint, - ondemand_extreme_rto::ClearRecoveryThreadHashTbl, - ondemand_extreme_rto::BatchClearRecoveryThreadHashTbl, + NULL, + NULL, ondemand_extreme_rto::RedoWorkerIsUndoSpaceWorker, 
ondemand_extreme_rto::StartRecoveryWorkers, ondemand_extreme_rto::DispatchRedoRecordToFile, diff --git a/src/gausskernel/storage/access/transam/multi_redo_settings.cpp b/src/gausskernel/storage/access/transam/multi_redo_settings.cpp index 6ed92595e..8cb197105 100644 --- a/src/gausskernel/storage/access/transam/multi_redo_settings.cpp +++ b/src/gausskernel/storage/access/transam/multi_redo_settings.cpp @@ -44,6 +44,8 @@ void ConfigRecoveryParallelism() if (g_instance.attr.attr_storage.recovery_parse_workers > 1) { g_instance.comm_cxt.predo_cxt.redoType = EXTREME_REDO; + g_extreme_rto_type = g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery ? + ONDEMAND_EXTREME_RTO : DEFAULT_EXTREME_RTO; g_instance.attr.attr_storage.batch_redo_num = g_instance.attr.attr_storage.recovery_parse_workers; uint32 total_recovery_parallelism = g_instance.attr.attr_storage.batch_redo_num * 2 + g_instance.attr.attr_storage.recovery_redo_workers_per_paser_worker * diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/CMakeLists.txt b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/CMakeLists.txt index fe3bf8f33..93e2fea14 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/CMakeLists.txt +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/CMakeLists.txt @@ -10,7 +10,8 @@ set(TGT_ondemand_extreme_rto_INC ${LIBCGROUP_INCLUDE_PATH} ${PROJECT_SRC_DIR}/include/libcomm ${ZLIB_INCLUDE_PATH} - ${LIBCURL_INCLUDE_PATH} + ${LIBCURL_INCLUDE_PATH} + ${DCF_INCLUDE_PATH} ) set(ondemand_extreme_rto_DEF_OPTIONS ${MACRO_OPTIONS}) diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile index d81b33523..0d3a32716 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile @@ -26,6 +26,6 @@ top_builddir = ../../../../../.. 
include $(top_builddir)/src/Makefile.global OBJS = dispatcher.o page_redo.o posix_semaphore.o redo_item.o \ - spsc_blocking_queue.o txn_redo.o batch_redo.o xlog_read.o + spsc_blocking_queue.o txn_redo.o batch_redo.o xlog_read.o redo_utils.o include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp index 69e708094..7d5856c80 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp @@ -43,6 +43,8 @@ #include "access/xlogproc.h" +extern uint32 hashquickany(uint32 seed, register const unsigned char *data, register int len); + namespace ondemand_extreme_rto { static inline void PRXLogRecGetBlockTag(XLogRecParseState *recordBlockState, RelFileNode *rnode, BlockNumber *blknum, ForkNumber *forknum) @@ -64,11 +66,17 @@ static inline void PRXLogRecGetBlockTag(XLogRecParseState *recordBlockState, Rel } } +uint32 XlogTrackTableHashCode(RedoItemTag *tagPtr) +{ + return hashquickany(0xFFFFFFFF, (unsigned char *)tagPtr, sizeof(RedoItemTag)); +} + void PRInitRedoItemEntry(RedoItemHashEntry *redoItemHashEntry) { redoItemHashEntry->redoItemNum = 0; redoItemHashEntry->head = NULL; redoItemHashEntry->tail = NULL; + redoItemHashEntry->redoDone = false; } uint32 RedoItemTagHash(const void *key, Size keysize) @@ -93,10 +101,11 @@ int RedoItemTagMatch(const void *left, const void *right, Size keysize) return 1; } -HTAB *PRRedoItemHashInitialize(MemoryContext context) +HTAB **PRRedoItemHashInitialize(MemoryContext context) { HASHCTL ctl; - HTAB *hTab = NULL; + int batchNum = get_batch_redo_num(); + HTAB **hTab = (HTAB **)MemoryContextAllocZero(context, batchNum * sizeof(HTAB *)); /* * create hashtable that indexes the redo items @@ -108,14 +117,17 @@ HTAB *PRRedoItemHashInitialize(MemoryContext context) ctl.entrysize = 
sizeof(RedoItemHashEntry); ctl.hash = RedoItemTagHash; ctl.match = RedoItemTagMatch; - hTab = hash_create("Redo item hash by relfilenode and blocknum", INITredoItemHashSIZE, &ctl, - HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_COMPARE); + for (int i = 0; i < batchNum; i++) { + hTab[i] = hash_create("Redo item hash by relfilenode and blocknum", INITredoItemHashSIZE, &ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_SHRCTX | HASH_COMPARE); + } return hTab; } void PRRegisterBlockInsertToList(RedoItemHashEntry *redoItemHashEntry, XLogRecParseState *record) { + ReferenceRecParseState(record); if (redoItemHashEntry->tail != NULL) { redoItemHashEntry->tail->nextrecord = record; redoItemHashEntry->tail = record; diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp index 076aecdbb..e50af152a 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp @@ -76,6 +76,7 @@ #include "gssignal/gs_signal.h" #include "utils/atomic.h" #include "pgstat.h" +#include "ddes/dms/ss_reform_common.h" #ifdef PGXC #include "pgxc/pgxc.h" @@ -113,7 +114,6 @@ static void **CollectStatesFromWorkers(GetStateFunc); static void GetSlotIds(XLogReaderState *record); static void GetUndoSlotIds(XLogReaderState *record); STATIC LogDispatcher *CreateDispatcher(); -static void DestroyRecoveryWorkers(); static void SSDestroyRecoveryWorkers(); static void DispatchRecordWithPages(XLogReaderState *, List *); @@ -169,10 +169,6 @@ static XLogReaderState *GetXlogReader(XLogReaderState *readerState); void CopyDataFromOldReader(XLogReaderState *newReaderState, const XLogReaderState *oldReaderState); void SendSingalToPageWorker(int signal); -#ifdef USE_ASSERT_CHECKING -bool CheckBufHasSpaceToDispatch(XLogRecPtr endRecPtr); -#endif - /* dispatchTable must consistent with RmgrTable */ static 
const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = { { DispatchXLogRecord, RmgrRecordInfoValid, RM_XLOG_ID, XLOG_CHECKPOINT_SHUTDOWN, XLOG_DELAY_XLOG_RECYCLE }, @@ -436,14 +432,20 @@ void StartRecoveryWorkers(XLogReaderState *xlogreader, uint32 privateLen) CheckAlivePageWorkers(); g_dispatcher = CreateDispatcher(); g_dispatcher->oldCtx = MemoryContextSwitchTo(g_instance.comm_cxt.predo_cxt.parallelRedoCtx); - g_dispatcher->maxItemNum = (get_batch_redo_num() + 4) * PAGE_WORK_QUEUE_SIZE * - ITEM_QUQUE_SIZE_RATIO; // 4: a startup, readmanager, txnmanager, txnworker + g_instance.comm_cxt.redoItemCtx = AllocSetContextCreate((MemoryContext)g_instance.instance_context, + "redoItemSharedMemory", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, + SHARED_CONTEXT); + g_instance.comm_cxt.predo_cxt.redoItemHash = PRRedoItemHashInitialize(g_instance.comm_cxt.redoItemCtx); + g_dispatcher->maxItemNum = ((get_batch_redo_num() + 4) * PAGE_WORK_QUEUE_SIZE) * ITEM_QUQUE_SIZE_RATIO; + uint32 maxParseBufNum = (uint32)((uint64)g_instance.attr.attr_storage.dms_attr.ondemand_recovery_mem_size * + 1024 / (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc) + sizeof(RedoMemSlot))); + g_dispatcher->maxItemNum = 4 * PAGE_WORK_QUEUE_SIZE * ITEM_QUQUE_SIZE_RATIO + maxParseBufNum; + XLogParseBufferInitFunc(&(g_dispatcher->parseManager), maxParseBufNum, &recordRefOperate, RedoInterruptCallBack); /* alloc for record readbuf */ - if (ENABLE_DMS && ENABLE_DSS) { - SSAllocRecordReadBuffer(xlogreader, privateLen); - } else { - AllocRecordReadBuffer(xlogreader, privateLen); - } + SSAllocRecordReadBuffer(xlogreader, privateLen); StartPageRedoWorkers(get_real_recovery_parallelism()); ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), @@ -663,57 +665,13 @@ static void StopRecoveryWorkers(int code, Datum arg) pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readWorkerState, WORKER_STATE_EXIT); ShutdownWalRcv(); FreeAllocatedRedoItem(); - if (ENABLE_DSS && 
ENABLE_DMS) { - SSDestroyRecoveryWorkers(); - } else { - DestroyRecoveryWorkers(); - } + SSDestroyRecoveryWorkers(); g_startupTriggerState = TRIGGER_NORMAL; g_readManagerTriggerFlag = TRIGGER_NORMAL; ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("parallel redo(startup) thread exit"))); } /* Run from the dispatcher thread. */ -static void DestroyRecoveryWorkers() -{ - if (g_dispatcher != NULL) { - SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); - for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { - DestroyPageRedoWorker(g_dispatcher->pageLines[i].batchThd); - DestroyPageRedoWorker(g_dispatcher->pageLines[i].managerThd); - for (uint32 j = 0; j < g_dispatcher->pageLines[i].redoThdNum; j++) { - DestroyPageRedoWorker(g_dispatcher->pageLines[i].redoThd[j]); - } - if (g_dispatcher->pageLines[i].chosedRTIds != NULL) { - pfree(g_dispatcher->pageLines[i].chosedRTIds); - } - } - DestroyPageRedoWorker(g_dispatcher->trxnLine.managerThd); - DestroyPageRedoWorker(g_dispatcher->trxnLine.redoThd); - - DestroyPageRedoWorker(g_dispatcher->readLine.managerThd); - DestroyPageRedoWorker(g_dispatcher->readLine.readThd); - pfree(g_dispatcher->rtoXlogBufState.readsegbuf); - pfree(g_dispatcher->rtoXlogBufState.readBuf); - pfree(g_dispatcher->rtoXlogBufState.errormsg_buf); - pfree(g_dispatcher->rtoXlogBufState.readprivate); -#ifdef USE_ASSERT_CHECKING - if (g_dispatcher->originLsnCheckAddr != NULL) { - pfree(g_dispatcher->originLsnCheckAddr); - g_dispatcher->originLsnCheckAddr = NULL; - g_dispatcher->lsnCheckCtl = NULL; - } -#endif - if (get_real_recovery_parallelism() > 1) { - (void)MemoryContextSwitchTo(g_dispatcher->oldCtx); - MemoryContextDelete(g_instance.comm_cxt.predo_cxt.parallelRedoCtx); - g_instance.comm_cxt.predo_cxt.parallelRedoCtx = NULL; - } - g_dispatcher = NULL; - SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); - } -} - static void SSDestroyRecoveryWorkers() { if (g_dispatcher != NULL) { @@ -826,18 +784,6 @@ void 
DispatchRedoRecordToFile(XLogReaderState *record, List *expectedTLIs, Times } else { fatalerror = true; } -#ifdef USE_ASSERT_CHECKING - uint64 waitCount = 0; - while (!CheckBufHasSpaceToDispatch(record->EndRecPtr)) { - RedoInterruptCallBack(); - waitCount++; - if ((waitCount & PRINT_ALL_WAIT_COUNT) == PRINT_ALL_WAIT_COUNT) { - ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("DispatchRedoRecordToFile:replayedLsn:%lu, blockcnt:%lu, readEndLSN:%lu", - GetXLogReplayRecPtr(NULL, NULL), waitCount, record->EndRecPtr))); - } - } -#endif ResetChosedPageLineList(); if (fatalerror != true) { #ifdef ENABLE_UT @@ -901,6 +847,14 @@ static void DispatchToOnePageWorker(XLogReaderState *record, const RelFileNode r AddPageRedoItem(g_dispatcher->pageLines[slotId].batchThd, item); } +static void DispatchToSpecificOnePageWorker(XLogReaderState *record, uint32 slotId, List *expectedTLIs) +{ + Assert(slotId <= GetBatchCount()); + RedoItem *item = GetRedoItemPtr(record); + ReferenceRedoItem(item); + AddPageRedoItem(g_dispatcher->pageLines[slotId].batchThd, item); +} + /** * The transaction worker waits until every page worker has replayed * all records before this. We dispatch a LSN marker to every page @@ -980,25 +934,7 @@ static bool DispatchXLogRecord(XLogReaderState *record, List *expectedTLIs, Time uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); if (IsCheckPoint(record)) { - RedoItem *item = GetRedoItemPtr(record); - item->needImmediateCheckpoint = g_dispatcher->needImmediateCheckpoint; - item->record.isFullSync = g_dispatcher->needFullSyncCheckpoint; - g_dispatcher->needImmediateCheckpoint = false; - g_dispatcher->needFullSyncCheckpoint = false; - ReferenceRedoItem(item); - for (uint32 i = 0; i < g_dispatcher->pageLineNum; ++i) { - /* - * A check point record may save a recovery restart point or - * update the timeline. 
- */ - ReferenceRedoItem(item); - AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); - } - /* ensure eyery pageworker is receive recored to update pageworker Lsn - * trxn record's recordtime must set , see SetLatestXTime - */ - AddTxnRedoItem(g_dispatcher->trxnLine.managerThd, item); - + return isNeedFullSync; } else if ((info == XLOG_FPI) || (info == XLOG_FPI_FOR_HINT)) { DispatchRecordWithPages(record, expectedTLIs); } else { @@ -1198,36 +1134,30 @@ static void DispatchRecordBySegHeadBuffer(XLogReaderState* record, List* expecte DispatchToOnePageWorker(record, rnode, expectedTLIs); } -static void DispatchNblocksRecord(XLogReaderState* record, List* expectedTLIs) -{ - XLogDataSegmentExtend *dataSegExtendInfo = (XLogDataSegmentExtend *)XLogRecGetBlockData(record, 0, NULL); - - RelFileNode rnode; - XLogRecGetBlockTag(record, 0, &rnode, NULL, NULL); - rnode.relNode = dataSegExtendInfo->main_fork_head; - rnode.bucketNode = SegmentBktId; - - DispatchToOnePageWorker(record, rnode, expectedTLIs); -} - static bool DispatchSegpageSmgrRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) { bool isNeedFullSync = false; uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); - /* Sync to all pageworkers so that ensure prepare the metadata before using the data-buffer */ - if (info == XLOG_SEG_ATOMIC_OPERATION || info == XLOG_SEG_CREATE_EXTENT_GROUP || info == XLOG_SEG_INIT_MAPPAGE || - info == XLOG_SEG_INIT_INVRSPTR_PAGE || info == XLOG_SEG_ADD_NEW_GROUP || info == XLOG_SEG_SPACE_SHRINK || - info == XLOG_SEG_SPACE_DROP || info == XLOG_SEG_NEW_PAGE) { - DispatchRecordWithoutPage(record, expectedTLIs); - } else if (info == XLOG_SEG_SEGMENT_EXTEND) { - DispatchNblocksRecord(record, expectedTLIs); - } else if (info == XLOG_SEG_TRUNCATE) { - DispatchRecordBySegHeadBuffer(record, expectedTLIs, 0); - } else { - ereport(PANIC, + switch (info) { + case XLOG_SEG_ATOMIC_OPERATION: + case XLOG_SEG_SEGMENT_EXTEND: + case XLOG_SEG_CREATE_EXTENT_GROUP: + 
case XLOG_SEG_INIT_MAPPAGE: + case XLOG_SEG_INIT_INVRSPTR_PAGE: + case XLOG_SEG_ADD_NEW_GROUP: + case XLOG_SEG_SPACE_SHRINK: + case XLOG_SEG_SPACE_DROP: + case XLOG_SEG_NEW_PAGE: + DispatchToSpecificOnePageWorker(record, 0, expectedTLIs); + break; + case XLOG_SEG_TRUNCATE: + DispatchRecordBySegHeadBuffer(record, expectedTLIs, 0); + break; + default: + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("[REDO_LOG_TRACE] xlog info %u doesn't belong to segpage.", info))); + errmsg("[SS][REDO_LOG_TRACE] xlog info %u doesn't belong to segpage.", info))); } return isNeedFullSync; @@ -1622,27 +1552,6 @@ void SetLsnCheckInfo(uint64 curPosition, XLogRecPtr curLsn) #endif /* __x86_64__ */ } -bool CheckBufHasSpaceToDispatch(XLogRecPtr endRecPtr) -{ - uint64 curPosition; - XLogRecPtr curLsn; - GetLsnCheckInfo(&curPosition, &curLsn); - - XLogRecPtr endPtr = endRecPtr; - if (endPtr % XLogSegSize == 0) { - XLByteAdvance(endPtr, SizeOfXLogLongPHD); - } else if (endPtr % XLOG_BLCKSZ == 0) { - XLByteAdvance(endPtr, SizeOfXLogShortPHD); - } - - uint32 len = (uint32)(endPtr - curLsn); - if (len < LSN_CHECK_BUF_SIZE) { - return true; - } - - return false; -} - bool PushCheckLsn() { uint64 curPosition; @@ -1937,6 +1846,96 @@ void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code) } } +void SendRecoveryEndMarkToWorkersAndWaitForReach(int code) +{ + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[SS][REDO_LOG_TRACE] On-demand recovery dispatch finish, send RecoveryEndMark to workers, code: %d", + code))); + if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) { + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readPageThd); + PageRedoPipeline *pl = g_dispatcher->pageLines; + + /* Read finish, need to check if can go to Phase two */ + XLogRecPtr lastReadEndPtr = g_dispatcher->readLine.readPageThd->lastReplayedEndRecPtr; + + /* Wait for trxn finished replay and redo hash table complete */ + while (true) { + XLogRecPtr 
trxnCompletePtr = GetCompletedRecPtr(g_dispatcher->trxnLine.redoThd); + XLogRecPtr pageMngrCompletePtr = InvalidXLogRecPtr; + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; ++i) { + if (g_dispatcher->allWorkers[i]->role == REDO_PAGE_MNG) { + XLogRecPtr tmpStart = MAX_XLOG_REC_PTR; + XLogRecPtr tmpEnd = MAX_XLOG_REC_PTR; + GetCompletedReadEndPtr(g_dispatcher->allWorkers[i], &tmpStart, &tmpEnd); + if (XLByteLT(tmpEnd, pageMngrCompletePtr) || pageMngrCompletePtr == InvalidXLogRecPtr) { + pageMngrCompletePtr = tmpEnd; + } + } + } + ereport(LOG, (errmsg("[SS][REDO_LOG_TRACE] lastReadXact: %lu, trxnComplete: %lu, pageMgrComplele: %lu", + lastReadEndPtr, trxnCompletePtr, pageMngrCompletePtr))); + if (XLByteEQ(trxnCompletePtr, lastReadEndPtr) && XLByteEQ(pageMngrCompletePtr, lastReadEndPtr)) { + break; + } + + long sleeptime = 5 * 1000; + pg_usleep(sleeptime); + } + /* we only send end mark but don't wait */ + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + SendPageRedoEndMark(pl[i].batchThd); + } + SendPageRedoEndMark(g_dispatcher->trxnLine.managerThd); + + /* Stop Read Thrd only */ + pg_atomic_write_u32(&(g_dispatcher->rtoXlogBufState.xlogReadManagerState), READ_MANAGER_STOP); + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.managerThd); + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readThd); + LsnUpdate(); + XLogRecPtr lastReplayed = GetXLogReplayRecPtr(NULL); + ereport(LOG, (errmsg("[SS][REDO_LOG_TRACE] Current LastReplayed: %lu", lastReplayed))); + (void)RegisterRedoInterruptCallBack(g_dispatcher->oldStartupIntrruptFunc); + } +} + +void WaitRedoFinish() +{ + /* make pmstate as run so db can accept service from now */ + g_instance.fatal_error = false; + g_instance.demotion = NoDemote; + pmState = PM_RUN; + write_stderr_with_prefix("[On-demand] LOG: database system is ready to accept connections"); + + SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = true; + 
SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); + + /* for other nodes in cluster */ + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_IN_ONDEMAND_RECOVERY; + SSSaveReformerCtrl(); + +#ifdef USE_ASSERT_CHECKING + XLogRecPtr minStart = MAX_XLOG_REC_PTR; + XLogRecPtr minEnd = MAX_XLOG_REC_PTR; + GetReplayedRecPtr(&minStart, &minEnd); + ereport(LOG, (errmsg("[SS][REDO_LOG_TRACE] Current LastReplayed: %lu", minEnd))); +#endif + + PageRedoPipeline *pl = g_dispatcher->pageLines; + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + WaitPageRedoWorkerReachLastMark(pl[i].batchThd); + } + WaitPageRedoWorkerReachLastMark(g_dispatcher->trxnLine.managerThd); + XLogParseBufferDestoryFunc(&(g_dispatcher->parseManager)); + LsnUpdate(); +#ifdef USE_ASSERT_CHECKING + AllItemCheck(); +#endif + SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone = true; + SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); +} + /* Run from each page worker and the txn worker thread. 
*/ int GetDispatcherExitCode() { diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp index b13034fea..4ec868350 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp @@ -42,6 +42,7 @@ #include "access/xlogproc.h" #include "access/nbtree.h" #include "catalog/storage_xlog.h" +#include "ddes/dms/ss_dms_recovery.h" #include "gssignal/gs_signal.h" #include "libpq/pqsignal.h" #include "postmaster/postmaster.h" @@ -228,8 +229,6 @@ PageRedoWorker *CreateWorker(uint32 id) #endif worker->parseManager.memctl.isInit = false; worker->parseManager.parsebuffers = NULL; - worker->remoteReadPageNum = 0; - worker->badPageHashTbl = BadBlockHashTblCreate(); return worker; } @@ -340,11 +339,6 @@ PGPROC *GetPageRedoWorkerProc(PageRedoWorker *worker) return worker->proc; } -void HandlePageRedoPageRepair(RepairBlockKey key, XLogPhyBlock pblk) -{ - RecordBadBlockAndPushToRemote(g_redoWorker->curRedoBlockState, CRC_CHECK_FAIL, InvalidXLogRecPtr, pblk); -} - void HandlePageRedoInterrupts() { if (t_thrd.page_redo_cxt.got_SIGHUP) { @@ -352,11 +346,6 @@ void HandlePageRedoInterrupts() ProcessConfigFile(PGC_SIGHUP); } - if (t_thrd.page_redo_cxt.check_repair && g_instance.pid_cxt.PageRepairPID != 0) { - SeqCheckRemoteReadAndRepairPage(); - t_thrd.page_redo_cxt.check_repair = false; - } - if (t_thrd.page_redo_cxt.shutdown_requested) { ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("page worker id %u exit for request", g_redoWorker->id))); @@ -380,6 +369,18 @@ void DereferenceRedoItem(void *item) SubRefRecord(&redoItem->record); } +void ReferenceRecParseState(XLogRecParseState *recordstate) +{ + ParseBufferDesc *descstate = (ParseBufferDesc *)((char *)recordstate - sizeof(ParseBufferDesc)); + (void)pg_atomic_fetch_add_u32(&(descstate->refcount), 1); +} + +void 
DereferenceRecParseState(XLogRecParseState *recordstate) +{ + ParseBufferDesc *descstate = (ParseBufferDesc *)((char *)recordstate - sizeof(ParseBufferDesc)); + (void)pg_atomic_fetch_sub_u32(&(descstate->refcount), 1); +} + #define STRUCT_CONTAINER(type, membername, ptr) ((type *)((char *)(ptr)-offsetof(type, membername))) #ifdef USE_ASSERT_CHECKING @@ -590,8 +591,7 @@ void BatchRedoMain() uint32 eleNum; (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); - XLogParseBufferInitFunc(&(g_redoWorker->parseManager), MAX_PARSE_BUFF_NUM, &recordRefOperate, - RedoInterruptCallBack); + g_parseManager = &(g_dispatcher->parseManager); GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); while (SPSCBlockingQueueGetAll(g_redoWorker->queue, &eleArry, &eleNum)) { CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); @@ -607,7 +607,6 @@ void BatchRedoMain() } RedoThrdWaitForExit(g_redoWorker); - XLogParseBufferDestoryFunc(&(g_redoWorker->parseManager)); } uint32 GetWorkerId(const RedoItemTag *redoItemTag, uint32 workerCount) @@ -639,8 +638,91 @@ void RedoPageManagerDistributeToAllOneBlock(XLogRecParseState *ddlParseState) } } +void ReleaseRecParseState(PageRedoPipeline *myRedoLine, HTAB *redoItemHash, RedoItemHashEntry *redoItemEntry, uint32 workId) +{ + XLogRecParseState *cur_state = redoItemEntry->head; + XLogRecParseState *releaseHeadState = redoItemEntry->head; + XLogRecParseState *releaseTailState = NULL; + unsigned int del_from_hash_item_num = 0; + unsigned int new_hash; + LWLock *xlog_partition_lock; + + /* Items that have been replayed(refcount == 0) can be released */ + while (cur_state != NULL) { + ParseBufferDesc *descstate = (ParseBufferDesc *)((char *)cur_state - sizeof(ParseBufferDesc)); + unsigned int refCount = pg_atomic_read_u32(&descstate->refcount); + + if (refCount == 0) { + releaseTailState = cur_state; + del_from_hash_item_num++; + cur_state = (XLogRecParseState 
*)(cur_state->nextrecord); + } else { + break; + } + } + + new_hash = XlogTrackTableHashCode(&redoItemEntry->redoItemTag); + xlog_partition_lock = XlogTrackMappingPartitionLock(new_hash); + + if (del_from_hash_item_num > 0) { + (void)LWLockAcquire(xlog_partition_lock, LW_EXCLUSIVE); + if (releaseTailState != NULL) { + redoItemEntry->head = (XLogRecParseState *)releaseTailState->nextrecord; + releaseTailState->nextrecord = NULL; + } else { + redoItemEntry->head = NULL; + } + XLogBlockParseStateRelease(releaseHeadState); + redoItemEntry->redoItemNum = redoItemEntry->redoItemNum - del_from_hash_item_num; + LWLockRelease(xlog_partition_lock); + } + + if (redoItemEntry->redoItemNum == 0) { + (void)LWLockAcquire(xlog_partition_lock, LW_EXCLUSIVE); + if (hash_search(redoItemHash, (void *)&redoItemEntry->redoItemTag, HASH_REMOVE, NULL) == NULL) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("redo item hash table corrupted"))); + } + LWLockRelease(xlog_partition_lock); + } + + return; +} + +void RedoPageManagerDistributeToRedoThd(PageRedoPipeline *myRedoLine, + HTAB *redoItemHash, RedoItemHashEntry *redoItemEntry, uint32 workId) +{ + XLogRecParseState *cur_state = redoItemEntry->head; + XLogRecParseState *distribute_head = NULL; + XLogRecParseState *distribute_tail = NULL; + int distribute_item_num = 0; + + while (cur_state != NULL) { + if (cur_state->distributeStatus != XLOG_NO_DISTRIBUTE) { + cur_state = (XLogRecParseState *)cur_state->nextrecord; + continue; + } + + if (distribute_head == NULL) { + distribute_head = cur_state; + } + cur_state->distributeStatus = XLOG_MID_DISTRIBUTE; + distribute_tail = cur_state; + distribute_item_num++; + cur_state = (XLogRecParseState *)cur_state->nextrecord; + } + + if (distribute_item_num > 0) { + distribute_head->distributeStatus = XLOG_HEAD_DISTRIBUTE; + distribute_tail->distributeStatus = XLOG_TAIL_DISTRIBUTE; + AddPageRedoItem(myRedoLine->redoThd[workId], distribute_head); + } + + return; +} + void 
RedoPageManagerDistributeBlockRecord(HTAB *redoItemHash, XLogRecParseState *parsestate) { + static uint32 total_count = 0; PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; HASH_SEQ_STATUS status; @@ -648,12 +730,11 @@ void RedoPageManagerDistributeBlockRecord(HTAB *redoItemHash, XLogRecParseState HTAB *curMap = redoItemHash; hash_seq_init(&status, curMap); + total_count++; while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { uint32 workId = GetWorkerId(&redoItemEntry->redoItemTag, WorkerNumPerMng); - AddPageRedoItem(myRedoLine->redoThd[workId], redoItemEntry->head); - - if (hash_search(curMap, (void *)&redoItemEntry->redoItemTag, HASH_REMOVE, NULL) == NULL) - ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("hash table corrupted"))); + ReleaseRecParseState(myRedoLine, curMap, redoItemEntry, workId); + RedoPageManagerDistributeToRedoThd(myRedoLine, curMap, redoItemEntry, workId); } if (parsestate != NULL) { @@ -673,6 +754,37 @@ void WaitCurrentPipeLineRedoWorkersQueueEmpty() } } +static void ReleaseReplayedInParse(PageRedoPipeline* myRedoLine, uint32 workerNum) +{ + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + HTAB *curMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; + hash_seq_init(&status, curMap); + + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + if (g_redoWorker->slotId == GetSlotId(redoItemEntry->redoItemTag.rNode, 0, 0, GetBatchCount())) { + uint32 workId = GetWorkerId(&redoItemEntry->redoItemTag, workerNum); + ReleaseRecParseState(myRedoLine, curMap, redoItemEntry, workId); + } + } +} + +static void WaitAndTryReleaseWorkerReplayedRec(PageRedoPipeline *myRedoLine, uint32 workerNum) +{ + bool queueIsEmpty = false; + while (!queueIsEmpty) { + queueIsEmpty = true; + for (uint32 i = 0; i < workerNum; i++) { + if (!RedoWorkerIsIdle(myRedoLine->redoThd[i])) { + 
queueIsEmpty = false; + ReleaseReplayedInParse(myRedoLine, workerNum); + pg_usleep(50000L); + break; + } + } + } +} + void DispatchEndMarkToRedoWorkerAndWait() { PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; @@ -680,9 +792,12 @@ void DispatchEndMarkToRedoWorkerAndWait() for (uint32 i = 0; i < WorkerNumPerMng; ++i) SendPageRedoEndMark(myRedoLine->redoThd[i]); + /* Need to release the item replayed in time */ + WaitAndTryReleaseWorkerReplayedRec(myRedoLine, WorkerNumPerMng); for (uint32 i = 0; i < myRedoLine->redoThdNum; i++) { WaitPageRedoWorkerReachLastMark(myRedoLine->redoThd[i]); } + ReleaseReplayedInParse(myRedoLine, WorkerNumPerMng); } void RedoPageManagerDdlAction(XLogRecParseState *parsestate) @@ -888,6 +1003,16 @@ void PageManagerProcSegFullSyncState(HTAB *hashMap, XLogRecParseState *parseStat RedoPageManagerSyncDdlAction(parseState); } +void OnDemandPageManagerProcSegFullSyncState(XLogRecParseState *parsestate) +{ + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + RedoPageManagerDdlAction(parsestate); + (void)MemoryContextSwitchTo(oldCtx); + + parsestate->nextrecord = NULL; + XLogBlockParseStateRelease(parsestate); +} + void PageManagerProcSegPipeLineSyncState(HTAB *hashMap, XLogRecParseState *parseState) { RedoPageManagerDistributeBlockRecord(hashMap, NULL); @@ -900,6 +1025,15 @@ void PageManagerProcSegPipeLineSyncState(HTAB *hashMap, XLogRecParseState *parse XLogBlockParseStateRelease(parseState); } +void OnDemandPageManagerProcSegPipeLineSyncState(XLogRecParseState *parseState) +{ + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + RedoPageManagerDdlAction(parseState); + (void)MemoryContextSwitchTo(oldCtx); + + XLogBlockParseStateRelease(parseState); +} + static void WaitNextBarrier(XLogRecParseState *parseState) { bool needWait = parseState->isFullSync; @@ -915,9 +1049,37 @@ static void WaitNextBarrier(XLogRecParseState *parseState) } } +static void 
OnDemandPageManagerRedoSegParseState(XLogRecParseState *preState) +{ + static uint32 seg_total_count = 0; + static uint32 seg_full_count = 0; + + Assert(g_redoWorker->slotId == 0); + switch (preState->blockparse.blockhead.block_valid) { + case BLOCK_DATA_SEG_EXTEND: + seg_total_count++; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + OnDemandPageManagerProcSegPipeLineSyncState(preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + break; + case BLOCK_DATA_SEG_FULL_SYNC_TYPE: + seg_full_count++; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + OnDemandPageManagerProcSegFullSyncState(preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + break; + case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: + default: + { + Assert(0); + } + break; + } +} + void PageManagerRedoParseState(XLogRecParseState *preState) { - HTAB *hashMap = g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHash; + HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; switch (preState->blockparse.blockhead.block_valid) { case BLOCK_DATA_MAIN_DATA_TYPE: @@ -926,6 +1088,8 @@ void PageManagerRedoParseState(XLogRecParseState *preState) case BLOCK_DATA_FSM_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); PRTrackAddBlock(preState, hashMap); + SetCompletedReadEndPtr(g_redoWorker, preState->blockparse.blockhead.start_ptr, + preState->blockparse.blockhead.end_ptr); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); break; case BLOCK_DATA_DDL_TYPE: @@ -935,7 +1099,7 @@ void PageManagerRedoParseState(XLogRecParseState *preState) break; case BLOCK_DATA_SEG_EXTEND: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); - PageManagerProcSegPipeLineSyncState(hashMap, preState); + OnDemandPageManagerRedoSegParseState(preState); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); break; case BLOCK_DATA_DROP_DATABASE_TYPE: @@ -950,17 +1114,12 @@ void 
PageManagerRedoParseState(XLogRecParseState *preState) case BLOCK_DATA_CREATE_DATABASE_TYPE: case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); - RedoPageManagerDistributeBlockRecord(hashMap, NULL); - /* wait until queue empty */ - WaitCurrentPipeLineRedoWorkersQueueEmpty(); - /* do atcual action */ - RedoPageManagerSyncDdlAction(preState); + OnDemandPageManagerRedoSegParseState(preState); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); break; case BLOCK_DATA_SEG_FULL_SYNC_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); - PageManagerProcSegFullSyncState(hashMap, preState); - CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + OnDemandPageManagerRedoSegParseState(preState); break; case BLOCK_DATA_CREATE_TBLSPC_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_7]); @@ -994,13 +1153,15 @@ void PageManagerRedoParseState(XLogRecParseState *preState) bool PageManagerRedoDistributeItems(void **eleArry, uint32 eleNum) { - HTAB *hashMap = g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHash; + HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; for (uint32 i = 0; i < eleNum; i++) { if (eleArry[i] == (void *)&g_redoEndMark) { RedoPageManagerDistributeBlockRecord(hashMap, NULL); return true; } else if (eleArry[i] == (void *)&g_GlobalLsnForwarder) { + SetCompletedReadEndPtr(g_redoWorker, ((RedoItem *)eleArry[i])->record.ReadRecPtr, + ((RedoItem *)eleArry[i])->record.EndRecPtr); RedoPageManagerDistributeBlockRecord(hashMap, NULL); PageManagerProcLsnForwarder((RedoItem *)eleArry[i]); continue; @@ -1032,9 +1193,20 @@ bool PageManagerRedoDistributeItems(void **eleArry, uint32 eleNum) #endif } while (nextState != NULL); } - GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_9]); - RedoPageManagerDistributeBlockRecord(hashMap, NULL); - CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_9]); + + float4 ratio = 
g_dispatcher->parseManager.memctl.usedblknum / g_dispatcher->parseManager.memctl.totalblknum; + while (ratio > ONDEMAND_DISTRIBUTE_RATIO) { + ereport(WARNING, (errcode(ERRCODE_LOG), + errmsg("[On-demand] Parse buffer num approach critical value, distribute block record by force," + " slotid %d, usedblknum %d, totalblknum %d", g_redoWorker->slotId, + g_dispatcher->parseManager.memctl.usedblknum, g_dispatcher->parseManager.memctl.totalblknum))); + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_9]); + RedoPageManagerDistributeBlockRecord(hashMap, NULL); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_9]); + pg_usleep(1000000); /* 1 sec */ + ratio = g_dispatcher->parseManager.memctl.usedblknum / g_dispatcher->parseManager.memctl.totalblknum; + } + return false; } @@ -1044,9 +1216,7 @@ void RedoPageManagerMain() uint32 eleNum; (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); - g_redoWorker->redoItemHash = PRRedoItemHashInitialize(g_redoWorker->oldCtx); - XLogParseBufferInitFunc(&(g_redoWorker->parseManager), MAX_PARSE_BUFF_NUM, &recordRefOperate, - RedoInterruptCallBack); + g_parseManager = &(g_dispatcher->parseManager); GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); while (SPSCBlockingQueueGetAll(g_redoWorker->queue, &eleArry, &eleNum)) { @@ -1063,7 +1233,6 @@ void RedoPageManagerMain() } RedoThrdWaitForExit(g_redoWorker); - XLogParseBufferDestoryFunc(&(g_redoWorker->parseManager)); } bool IsXactXlog(const XLogReaderState *record) @@ -1434,23 +1603,30 @@ void RedoPageWorkerMain() RedoBufferInfo bufferinfo = {0}; bool notfound = false; bool updateFsm = false; + bool needRelease = true; XLogRecParseState *procState = redoblockstateHead; + Assert(procState->distributeStatus != XLOG_NO_DISTRIBUTE); MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); while (procState != NULL) { XLogRecParseState *redoblockstate = procState; g_redoWorker->curRedoBlockState = 
(XLogBlockDataParse*)(&redoblockstate->blockparse.extra_rec); - procState = (XLogRecParseState *)procState->nextrecord; - + // nextrecord will be redo in backwards position + procState = (procState->distributeStatus == XLOG_TAIL_DISTRIBUTE) ? + NULL : (XLogRecParseState *)procState->nextrecord; switch (XLogBlockHeadGetValidInfo(&redoblockstate->blockparse.blockhead)) { case BLOCK_DATA_MAIN_DATA_TYPE: case BLOCK_DATA_UNDO_TYPE: case BLOCK_DATA_VM_TYPE: case BLOCK_DATA_FSM_TYPE: + needRelease = false; GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); notfound = XLogBlockRedoForExtremeRTO(redoblockstate, &bufferinfo, notfound, g_redoWorker->timeCostList[TIME_COST_STEP_4], g_redoWorker->timeCostList[TIME_COST_STEP_5]); + DereferenceRecParseState(redoblockstate); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + redoblockstate->blockparse.blockhead.end_ptr); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); break; case BLOCK_DATA_XLOG_COMMON_TYPE: @@ -1509,7 +1685,9 @@ void RedoPageWorkerMain() if (needWait) { pg_atomic_write_u32(&g_redoWorker->fullSyncFlag, 1); } - XLogBlockParseStateRelease(redoblockstateHead); + if (needRelease) { + XLogBlockParseStateRelease(redoblockstateHead); + } /* the same page */ ExtremeRtoFlushBuffer(&bufferinfo, updateFsm); SPSCBlockingQueuePop(g_redoWorker->queue); @@ -1572,9 +1750,8 @@ static inline bool ReadPageWorkerStop() return g_dispatcher->recoveryStop; } -void PushToWorkerLsn(bool force) +void PushToWorkerLsn() { - const uint32 max_record_count = PAGE_WORK_QUEUE_SIZE; static uint32 cur_recor_count = 0; cur_recor_count++; @@ -1583,24 +1760,13 @@ void PushToWorkerLsn(bool force) return; } - if (force) { - uint32 refCount; - do { - refCount = pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); - RedoInterruptCallBack(); - } while (refCount != 0 && !ReadPageWorkerStop()); - cur_recor_count = 0; - SendLsnFowarder(); - } else { - uint32 refCount = 
pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); - - if (refCount != 0 || cur_recor_count < max_record_count) { - return; - } - - SendLsnFowarder(); - cur_recor_count = 0; - } + uint32 refCount; + do { + refCount = pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); + RedoInterruptCallBack(); + } while (refCount != 0 && !ReadPageWorkerStop()); + cur_recor_count = 0; + SendLsnFowarder(); } void ResetRtoXlogReadBuf(XLogRecPtr targetPagePtr) @@ -1857,7 +2023,7 @@ void XLogForceFinish(XLogReaderState *xlogreader, TermFileData *term_file) ShutdownDataRcv(); pg_atomic_write_u32(&(g_recordbuffer->readSource), XLOG_FROM_PG_XLOG); - PushToWorkerLsn(true); + PushToWorkerLsn(); g_cleanupMark.record.isDecode = true; PutRecordToReadQueue(&g_cleanupMark.record); WaitAllRedoWorkerIdle(); @@ -1988,7 +2154,6 @@ void XLogReadPageWorkerMain() CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_5], g_redoWorker->timeCostList[TIME_COST_STEP_1]); record = XLogParallelReadNextRecord(xlogreader); CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); - PushToWorkerLsn(false); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]); RedoInterruptCallBack(); ADD_ABNORMAL_POSITION(8); @@ -2005,7 +2170,7 @@ void XLogReadPageWorkerMain() if (!ReadPageWorkerStop()) { /* notify exit */ - PushToWorkerLsn(true); + PushToWorkerLsn(); g_redoEndMark.record = *xlogreader; g_redoEndMark.record.isDecode = true; PutRecordToReadQueue((XLogReaderState *)&g_redoEndMark.record); @@ -2375,10 +2540,6 @@ int RedoMainLoop() instr_time startTime; instr_time endTime; - if (g_instance.pid_cxt.PageRepairPID != 0) { - (void)RegisterRedoPageRepairCallBack(HandlePageRedoPageRepair); - } - INSTR_TIME_SET_CURRENT(startTime); switch (g_redoWorker->role) { case REDO_BATCH: @@ -2641,6 +2802,12 @@ void RedoThrdWaitForExit(const PageRedoWorker *wk) } } +/* Run from the txn worker thread. 
*/ +XLogRecPtr GetCompletedRecPtr(PageRedoWorker *worker) +{ + return pg_atomic_read_u64(&worker->lastReplayedEndRecPtr); +} + /* Run from the worker thread. */ static void ApplySinglePageRecord(RedoItem *item) { @@ -2764,251 +2931,4 @@ bool XactHasSegpageRelFiles(XLogReaderState *record) return false; } - -void RepairPageAndRecoveryXLog(BadBlockRecEnt* page_info, const char *page) -{ - RedoBufferInfo buffer; - RedoBufferTag blockinfo; - bool updateFsm = false; - bool notfound = false; - errno_t rc; - BufferDesc *bufDesc = NULL; - RedoTimeCost timeCost1; - RedoTimeCost timeCost2; - - blockinfo.rnode = page_info->key.relfilenode; - blockinfo.forknum = page_info->key.forknum; - blockinfo.blkno = page_info->key.blocknum; - blockinfo.pblk = page_info->pblk; - - /* read page to buffer pool by RBM_ZERO_AND_LOCK mode and get buffer lock */ - (void)XLogReadBufferForRedoBlockExtend(&blockinfo, RBM_ZERO_AND_LOCK, false, &buffer, - page_info->rec_max_lsn, InvalidXLogRecPtr, false, WITH_NORMAL_CACHE); - - rc = memcpy_s(buffer.pageinfo.page, BLCKSZ, page, BLCKSZ); - securec_check(rc, "", ""); - - MarkBufferDirty(buffer.buf); - bufDesc = GetBufferDescriptor(buffer.buf - 1); - bufDesc->extra->lsn_on_disk = PageGetLSN(buffer.pageinfo.page); - UnlockReleaseBuffer(buffer.buf); - - /* recovery the page xlog */ - rc = memset_s(&buffer, sizeof(RedoBufferInfo), 0, sizeof(RedoBufferInfo)); - securec_check(rc, "", ""); - - XLogRecParseState *procState = page_info->head; - MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); - while (procState != NULL) { - XLogRecParseState *redoblockstate = procState; - procState = (XLogRecParseState *)procState->nextrecord; - (void)XLogBlockRedoForExtremeRTO(redoblockstate, &buffer, notfound, timeCost1, timeCost2); - } - (void)MemoryContextSwitchTo(oldCtx); - updateFsm = XlogNeedUpdateFsm(page_info->head, &buffer); - XLogBlockParseStateRelease(page_info->head); - ExtremeRtoFlushBuffer(&buffer, updateFsm); -} - -HTAB* 
BadBlockHashTblCreate() -{ - HASHCTL ctl; - errno_t rc; - - rc = memset_s(&ctl, sizeof(ctl), 0, sizeof(ctl)); - securec_check(rc, "", ""); - - ctl.keysize = sizeof(RepairBlockKey); - ctl.entrysize = sizeof(BadBlockRecEnt); - ctl.hash = RepairBlockKeyHash; - ctl.match = RepairBlockKeyMatch; - return hash_create("recovery thread bad block hashtbl", MAX_REMOTE_READ_INFO_NUM, &ctl, - HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); -} - - -/* ClearPageRepairHashTbl - * drop table, or truncate table, need clear the page repair hashTbl, if the - * repair page Filenode match need remove. - */ -void ClearRecoveryThreadHashTbl(const RelFileNode &node, ForkNumber forknum, BlockNumber minblkno, - bool segment_shrink) -{ - HTAB *bad_hash = g_redoWorker->badPageHashTbl; - bool found = false; - BadBlockRecEnt *entry = NULL; - HASH_SEQ_STATUS status; - - hash_seq_init(&status, bad_hash); - while ((entry = (BadBlockRecEnt *)hash_seq_search(&status)) != NULL) { - if (BlockNodeMatch(entry->key, entry->pblk, node, forknum, minblkno, segment_shrink)) { - if (hash_search(bad_hash, &(entry->key), HASH_REMOVE, &found) == NULL) { - ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("recovery thread bad page hash table corrupted"))); - } - g_redoWorker->remoteReadPageNum--; - } - } - - return; -} - -/* BatchClearPageRepairHashTbl - * drop database, or drop segmentspace, need clear the page repair hashTbl, - * if the repair page key dbNode match and spcNode match, need remove. 
- */ -void BatchClearRecoveryThreadHashTbl(Oid spcNode, Oid dbNode) -{ - HTAB *bad_hash = g_redoWorker->badPageHashTbl; - bool found = false; - BadBlockRecEnt *entry = NULL; - HASH_SEQ_STATUS status; - - hash_seq_init(&status, bad_hash); - while ((entry = (BadBlockRecEnt *)hash_seq_search(&status)) != NULL) { - if (dbNodeandSpcNodeMatch(&(entry->key.relfilenode), spcNode, dbNode)) { - if (hash_search(bad_hash, &(entry->key), HASH_REMOVE, &found) == NULL) { - ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("page repair hash table corrupted"))); - } - g_redoWorker->remoteReadPageNum--; - } - } - - return; -} - - -/* ClearSpecificsPageEntryAndMem - * If the page has been repair, need remove entry of bad page hashtable, - * and release the xlog record mem. - */ -void ClearSpecificsPageEntryAndMem(BadBlockRecEnt *entry) -{ - HTAB *bad_hash = g_redoWorker->badPageHashTbl; - bool found = false; - uint32 need_repair_num = 0; - HASH_SEQ_STATUS status; - BadBlockRecEnt *temp_entry = NULL; - - if ((BadBlockRecEnt*)hash_search(bad_hash, &(entry->key), HASH_REMOVE, &found) == NULL) { - ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("recovery thread bad block hash table corrupted"))); - } - - hash_seq_init(&status, bad_hash); - while ((temp_entry = (BadBlockRecEnt *)hash_seq_search(&status)) != NULL) { - need_repair_num++; - } - - if (need_repair_num == 0) { - XLogParseBufferDestoryFunc(&(g_redoWorker->parseManager)); - } -} - -/* RecordBadBlockAndPushToRemote - * If the bad page has been stored, record the xlog. If the bad page - * has not been stored, need push to page repair thread hash table and record to - * recovery thread hash table. 
- */ -void RecordBadBlockAndPushToRemote(XLogBlockDataParse *datadecode, PageErrorType error_type, - XLogRecPtr old_lsn, XLogPhyBlock pblk) -{ - bool found = false; - RepairBlockKey key; - gs_thread_t tid; - XLogBlockParse *block = STRUCT_CONTAINER(XLogBlockParse, extra_rec, datadecode); - XLogRecParseState *state = STRUCT_CONTAINER(XLogRecParseState, blockparse, block); - - key.relfilenode.spcNode = state->blockparse.blockhead.spcNode; - key.relfilenode.dbNode = state->blockparse.blockhead.dbNode; - key.relfilenode.relNode = state->blockparse.blockhead.relNode; - key.relfilenode.bucketNode = state->blockparse.blockhead.bucketNode; - key.relfilenode.opt = state->blockparse.blockhead.opt; - key.forknum = state->blockparse.blockhead.forknum; - key.blocknum = state->blockparse.blockhead.blkno; - - tid = gs_thread_get_cur_thread(); - found = PushBadPageToRemoteHashTbl(key, error_type, old_lsn, pblk, tid.thid); - - if (found) { - /* store the record for recovery */ - HTAB *bad_hash = g_redoWorker->badPageHashTbl; - bool thread_found = false; - BadBlockRecEnt *remoteReadInfo = (BadBlockRecEnt*)hash_search(bad_hash, &(key), HASH_FIND, &thread_found); - Assert(thread_found); - if (!thread_found) { - ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), - errmsg("recovery thread bad block hash table corrupted"))); - } - XLogRecParseState *newState = XLogParseBufferCopy(state); - newState->nextrecord = NULL; - remoteReadInfo->tail->nextrecord = newState; - remoteReadInfo->tail = newState; - remoteReadInfo->rec_max_lsn = newState->blockparse.blockhead.end_ptr; - } else { - HTAB *bad_hash = g_redoWorker->badPageHashTbl; - bool thread_found = false; - BadBlockRecEnt *remoteReadInfo = (BadBlockRecEnt*)hash_search(bad_hash, &(key), HASH_ENTER, &thread_found); - - Assert(!thread_found); - if (g_parseManager == NULL) { - XLogParseBufferInitFunc(&(g_redoWorker->parseManager), - MAX_PARSE_BUFF_NUM, &recordRefOperate, RedoInterruptCallBack); - } - XLogRecParseState *newState = 
XLogParseBufferCopy(state); - newState->nextrecord = NULL; - - remoteReadInfo->key = key; - remoteReadInfo->pblk = pblk; - remoteReadInfo->rec_min_lsn = newState->blockparse.blockhead.end_ptr; - remoteReadInfo->rec_max_lsn = newState->blockparse.blockhead.end_ptr; - remoteReadInfo->head = newState; - remoteReadInfo->tail = newState; - g_redoWorker->remoteReadPageNum++; - - if (g_redoWorker->remoteReadPageNum >= MAX_REMOTE_READ_INFO_NUM) { - ereport(WARNING, (errmsg("recovery thread found %d error block.", g_redoWorker->remoteReadPageNum))); - } - } - - return; -} - -void CheckRemoteReadAndRepairPage(BadBlockRecEnt *entry) -{ - XLogRecPtr rec_min_lsn = InvalidXLogRecPtr; - XLogRecPtr rec_max_lsn = InvalidXLogRecPtr; - bool check = false; - RepairBlockKey key; - - key = entry->key; - rec_min_lsn = entry->rec_min_lsn; - rec_max_lsn = entry->rec_max_lsn; - check = CheckRepairPage(key, rec_min_lsn, rec_max_lsn, g_redoWorker->page); - if (check) { - /* copy page to buffer pool, and recovery the stored xlog */ - RepairPageAndRecoveryXLog(entry, g_redoWorker->page); - /* clear page repair thread hash table */ - ClearSpecificsPageRepairHashTbl(key); - /* clear this thread invalid page hash table */ - forget_specified_invalid_pages(key); - /* clear thread bad block hash entry */ - ClearSpecificsPageEntryAndMem(entry); - g_redoWorker->remoteReadPageNum--; - } -} - -void SeqCheckRemoteReadAndRepairPage() -{ - BadBlockRecEnt *entry = NULL; - HASH_SEQ_STATUS status; - - HTAB *bad_hash = g_redoWorker->badPageHashTbl; - - hash_seq_init(&status, bad_hash); - while ((entry = (BadBlockRecEnt *)hash_seq_search(&status)) != NULL) { - CheckRemoteReadAndRepairPage(entry); - } -} - } // namespace ondemand_extreme_rto \ No newline at end of file diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp new file mode 100644 index 000000000..0bb706aca --- /dev/null +++ 
b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * ------------------------------------------------------------------------- + * + * redo_utils.cpp + * + * IDENTIFICATION + * src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "access/xlogproc.h" +#include "access/ondemand_extreme_rto/batch_redo.h" +#include "access/ondemand_extreme_rto/dispatcher.h" +#include "access/ondemand_extreme_rto/redo_utils.h" +#include "storage/lock/lwlock.h" + +/* add for batch redo mem manager */ +void *OndemandXLogMemCtlInit(RedoMemManager *memctl, Size itemsize, int itemnum) +{ + void *allocdata = NULL; + RedoMemSlot *nextfreeslot = NULL; + OndemandParseAllocCtrl *ctrl; + Assert(PARSEBUFFER_SIZE == itemsize); + + allocdata = (void *)palloc(sizeof(OndemandParseAllocCtrl)); + ctrl = (OndemandParseAllocCtrl *)allocdata; + ctrl->allocNum = itemnum / ONDEMAND_MAX_PARSEBUFF_PREPALLOC; + if ((int)(ctrl->allocNum * ONDEMAND_MAX_PARSEBUFF_PREPALLOC) != itemnum) { + ctrl->allocNum++; + } + ctrl->memslotEntry = (void *)palloc(sizeof(RedoMemSlot) * itemnum); + + // palloc all parse mem entry + for (int i = 0; i < ctrl->allocNum; i++) { + ctrl->allocEntry[i] = (void *)palloc(ONDEMAND_MAX_PARSESIZE_PREPALLOC); + if (ctrl->allocEntry[i] == NULL) { + ereport(PANIC, + (errmodule(MOD_REDO), 
errcode(ERRCODE_LOG), + errmsg("[SS] XLogMemCtlInit Allocated buffer failed!, totalblknum:%d, itemsize:%lu", + itemnum, itemsize))); + /* panic */ + } + errno_t rc = memset_s(ctrl->allocEntry[i], ONDEMAND_MAX_PARSESIZE_PREPALLOC, 0, + ONDEMAND_MAX_PARSESIZE_PREPALLOC); + securec_check(rc, "\0", "\0"); + } + memctl->totalblknum = itemnum; + memctl->usedblknum = 0; + memctl->itemsize = itemsize; + memctl->memslot = (RedoMemSlot *)ctrl->memslotEntry; + nextfreeslot = memctl->memslot; + for (int i = memctl->totalblknum; i > 0; --i) { + memctl->memslot[i - 1].buf_id = i; /* start from 1 , 0 is invalidbuffer */ + memctl->memslot[i - 1].freeNext = i - 1; + } + memctl->firstfreeslot = memctl->totalblknum; + memctl->firstreleaseslot = InvalidBuffer; + return allocdata; +} + +RedoMemSlot *OndemandXLogMemAlloc(RedoMemManager *memctl) +{ + RedoMemSlot *nextfreeslot = NULL; + do { + LWLockAcquire(OndemandXlogMemAllocLock, LW_EXCLUSIVE); + if (memctl->firstfreeslot == InvalidBuffer) { + memctl->firstfreeslot = AtomicExchangeBuffer(&memctl->firstreleaseslot, InvalidBuffer); + pg_read_barrier(); + } + + if (memctl->firstfreeslot != InvalidBuffer) { + nextfreeslot = &(memctl->memslot[memctl->firstfreeslot - 1]); + memctl->firstfreeslot = nextfreeslot->freeNext; + memctl->usedblknum++; + nextfreeslot->freeNext = InvalidBuffer; + } + LWLockRelease(OndemandXlogMemAllocLock); + + if (memctl->doInterrupt != NULL) { + memctl->doInterrupt(); + } + + } while (nextfreeslot == NULL); + + return nextfreeslot; +} + +void OndemandXLogMemRelease(RedoMemManager *memctl, Buffer bufferid) +{ + RedoMemSlot *bufferslot; + if (!RedoMemIsValid(memctl, bufferid)) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("XLogMemRelease failed!, taoalblknum:%u, buf_id:%u", memctl->totalblknum, bufferid))); + /* panic */ + } + bufferslot = &(memctl->memslot[bufferid - 1]); + Assert(bufferslot->freeNext == InvalidBuffer); + LWLockAcquire(OndemandXlogMemAllocLock, LW_EXCLUSIVE); + Buffer 
oldFirst = AtomicReadBuffer(&memctl->firstreleaseslot); + pg_memory_barrier(); + do { + AtomicWriteBuffer(&bufferslot->freeNext, oldFirst); + } while (!AtomicCompareExchangeBuffer(&memctl->firstreleaseslot, &oldFirst, bufferid)); + memctl->usedblknum--; + LWLockRelease(OndemandXlogMemAllocLock); +} + + +void OndemandXLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOperate *refOperate, + InterruptFunc interruptOperte) +{ + void *allocdata = NULL; + allocdata = OndemandXLogMemCtlInit(&(parsemanager->memctl), (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)), buffernum); + parsemanager->parsebuffers = allocdata; + parsemanager->refOperate = refOperate; + parsemanager->memctl.doInterrupt = interruptOperte; + parsemanager->memctl.isInit = true; + + g_parseManager = parsemanager; + return; +} + +void OndemandXLogParseBufferDestory(RedoParseManager *parsemanager) +{ + g_parseManager = NULL; + OndemandParseAllocCtrl *ctrl = (OndemandParseAllocCtrl *)parsemanager->parsebuffers; + + if (ctrl != NULL) { + for (int i = 0; i < ctrl->allocNum; i++) { + pfree(ctrl->allocEntry[i]); + } + pfree(ctrl->memslotEntry); + pfree(ctrl); + parsemanager->parsebuffers = NULL; + } + parsemanager->memctl.isInit = false; +} + +ParseBufferDesc *OndemandGetParseMemSlot(OndemandParseAllocCtrl *ctrl, int itemIndex) +{ + int entryIndex = itemIndex / ONDEMAND_MAX_PARSEBUFF_PREPALLOC; + int entryOffset = (itemIndex - (entryIndex * ONDEMAND_MAX_PARSEBUFF_PREPALLOC)) * PARSEBUFFER_SIZE; + return (ParseBufferDesc *)((char *)ctrl->allocEntry[entryIndex] + entryOffset); +} + +XLogRecParseState *OndemandXLogParseBufferAllocList(RedoParseManager *parsemanager, XLogRecParseState *blkstatehead, + void *record) +{ + RedoMemManager *memctl = &(parsemanager->memctl); + RedoMemSlot *allocslot = NULL; + ParseBufferDesc *descstate = NULL; + XLogRecParseState *recordstate = NULL; + + allocslot = OndemandXLogMemAlloc(memctl); + if (allocslot == NULL) { + ereport(PANIC, (errmodule(MOD_REDO), 
errcode(ERRCODE_LOG), + errmsg("XLogParseBufferAlloc Allocated buffer failed!, taoalblknum:%u, usedblknum:%u", + memctl->totalblknum, memctl->usedblknum))); + return NULL; + } + + pg_read_barrier(); + Assert(allocslot->buf_id != InvalidBuffer); + Assert(memctl->itemsize == (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc))); + descstate = OndemandGetParseMemSlot((OndemandParseAllocCtrl *)parsemanager->parsebuffers, allocslot->buf_id - 1); + descstate->buff_id = allocslot->buf_id; + Assert(descstate->state == 0); + descstate->state = 1; + descstate->refcount = 0; + recordstate = (XLogRecParseState *)((char *)descstate + sizeof(ParseBufferDesc)); + recordstate->nextrecord = NULL; + recordstate->manager = parsemanager; + recordstate->refrecord = record; + recordstate->isFullSync = false; + recordstate->distributeStatus = XLOG_NO_DISTRIBUTE; + if (blkstatehead != NULL) { + recordstate->nextrecord = blkstatehead->nextrecord; + blkstatehead->nextrecord = (void *)recordstate; + } + + if (parsemanager->refOperate != NULL) + parsemanager->refOperate->refCount(record); + + return recordstate; +} + +void OndemandXLogParseBufferRelease(XLogRecParseState *recordstate) +{ + RedoMemManager *memctl = &(recordstate->manager->memctl); + ParseBufferDesc *descstate = NULL; + + descstate = (ParseBufferDesc *)((char *)recordstate - sizeof(ParseBufferDesc)); + if (!RedoMemIsValid(memctl, descstate->buff_id) || descstate->state == 0) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("XLogParseBufferRelease failed!, taoalblknum:%u, buf_id:%u", memctl->totalblknum, + descstate->buff_id))); + /* panic */ + } + + descstate->state = 0; + + OndemandXLogMemRelease(memctl, descstate->buff_id); +} + +BufferDesc *RedoForOndemandExtremeRTOQuery(BufferDesc *bufHdr, char relpersistence, + ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode) +{ + bool hashFound = false; + bool needMarkDirty = false; + unsigned int new_hash; + LWLock *xlog_partition_lock; + Buffer buf 
= BufferDescriptorGetBuffer(bufHdr); + ondemand_extreme_rto::RedoItemHashEntry *redoItemEntry = NULL; + ondemand_extreme_rto::RedoItemTag redoItemTag; + XLogRecParseState *procState = NULL; + XLogBlockHead *procBlockHead = NULL; + XLogBlockHead *blockHead = NULL; + RedoBufferInfo bufferInfo; + int rc; + + INIT_REDO_ITEM_TAG(redoItemTag, bufHdr->tag.rnode, forkNum, blockNum); + + uint32 id = ondemand_extreme_rto::GetSlotId(bufHdr->tag.rnode, 0, 0, ondemand_extreme_rto::GetBatchCount()); + HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[id]; + if (hashMap == NULL) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("redo item hash table corrupted, there has invalid hashtable."))); + } + + new_hash = ondemand_extreme_rto::XlogTrackTableHashCode(&redoItemTag); + xlog_partition_lock = XlogTrackMappingPartitionLock(new_hash); + (void)LWLockAcquire(xlog_partition_lock, LW_SHARED); + redoItemEntry = (ondemand_extreme_rto::RedoItemHashEntry *)hash_search(hashMap, (void *)&redoItemTag, HASH_FIND, &hashFound); + + /* Page is already up-to-date, no need to replay. 
*/ + if (!hashFound || redoItemEntry->redoItemNum == 0 || redoItemEntry->redoDone) { + LWLockRelease(xlog_partition_lock); + return bufHdr; + } + + // switch to exclusive lock in replay + LWLockRelease(xlog_partition_lock); + (void)LWLockAcquire(xlog_partition_lock, LW_EXCLUSIVE); + + rc = memset_s(&bufferInfo, sizeof(bufferInfo), 0, sizeof(bufferInfo)); + securec_check(rc, "\0", "\0"); + if (BufferIsValid(buf)) { + bufferInfo.buf = buf; + bufferInfo.pageinfo.page = BufferGetPage(buf); + bufferInfo.pageinfo.pagesize = BufferGetPageSize(buf); + } + + procState = (XLogRecParseState *)redoItemEntry->head; + procBlockHead = &procState->blockparse.blockhead; + + XLogBlockInitRedoBlockInfo(procBlockHead, &bufferInfo.blockinfo); + + Assert(mode == RBM_NORMAL || mode == RBM_ZERO_ON_ERROR); + + /* lock the share buffer for replaying the xlog */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + while (procState != NULL) { + XLogRecParseState *redoBlockState = procState; + ondemand_extreme_rto::ReferenceRecParseState(redoBlockState); + + procState = (XLogRecParseState *)procState->nextrecord; + procBlockHead = &procState->blockparse.blockhead; + + blockHead = &redoBlockState->blockparse.blockhead; + uint16 blockValid = XLogBlockHeadGetValidInfo(blockHead); + + if (XLogRecPtrIsInvalid(bufferInfo.lsn)) { + bufferInfo.lsn = PageGetLSN(bufferInfo.pageinfo.page); + } + if (XLByteLE(XLogBlockHeadGetLSN(blockHead), PageGetLSN(bufferInfo.pageinfo.page))) { + ondemand_extreme_rto::DereferenceRecParseState(redoBlockState); + continue; + } + + switch (blockValid) { + case BLOCK_DATA_MAIN_DATA_TYPE: + case BLOCK_DATA_UNDO_TYPE: + case BLOCK_DATA_VM_TYPE: + case BLOCK_DATA_FSM_TYPE: + needMarkDirty = true; + XlogBlockRedoForOndemandExtremeRTOQuery(redoBlockState, &bufferInfo); + break; + case BLOCK_DATA_XLOG_COMMON_TYPE: + case BLOCK_DATA_DDL_TYPE: + case BLOCK_DATA_DROP_DATABASE_TYPE: + case BLOCK_DATA_NEWCU_TYPE: + default: + Assert(0); + break; + } + + 
ondemand_extreme_rto::DereferenceRecParseState(redoBlockState); + } + + /* mark the latest buffer dirty */ + if (needMarkDirty) { + MarkBufferDirty(buf); + } + + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + + redoItemEntry->redoDone = true; + LWLockRelease(xlog_partition_lock); + + return bufHdr; +} + +void OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(int code) +{ + ondemand_extreme_rto::SendRecoveryEndMarkToWorkersAndWaitForReach(code); +} + +void OnDemandWaitRedoFinish() +{ + ondemand_extreme_rto::WaitRedoFinish(); +} diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp index 12b676eba..e3c8726df 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp @@ -231,7 +231,9 @@ int ParallelXLogReadWorkBufRead(XLogReaderState *xlogreader, XLogRecPtr targetPa */ WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L); ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch); - PushToWorkerLsn(waitXLogCount == pushLsnCount); + if (waitXLogCount == pushLsnCount) { + PushToWorkerLsn(); + } ++waitXLogCount; } @@ -241,305 +243,6 @@ int ParallelXLogReadWorkBufRead(XLogReaderState *xlogreader, XLogRecPtr targetPa return -1; } -void WaitReplayFinishAfterReadXlogFileComplete(XLogRecPtr lastValidRecordLsn) -{ - Assert(t_thrd.xlog_cxt.EndRecPtr == lastValidRecordLsn); - XLogRecPtr lastReplayedLsn = GetXLogReplayRecPtr(NULL); - - while (XLByteLT(lastReplayedLsn, lastValidRecordLsn) && !DoEarlyExit()) { - RedoInterruptCallBack(); - const long sleepTime = 100; - pg_usleep(sleepTime); - lastReplayedLsn = GetXLogReplayRecPtr(NULL); - } -} - -int ParallelXLogPageReadFile(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, - TimeLineID *readTLI) -{ - bool randAccess = false; - uint32 
targetPageOff; - volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl; - XLogRecPtr RecPtr = targetPagePtr; - uint32 ret; -#ifdef USE_ASSERT_CHECKING - XLogSegNo targetSegNo; - - XLByteToSeg(targetPagePtr, targetSegNo); -#endif - targetPageOff = targetPagePtr % XLogSegSize; - - /* - * See if we need to switch to a new segment because the requested record - * is not in the currently open one. - */ - if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) { - close(t_thrd.xlog_cxt.readFile); - t_thrd.xlog_cxt.readFile = -1; - t_thrd.xlog_cxt.readSource = 0; - } - - XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo); - XLByteAdvance(RecPtr, reqLen); - -retry: - /* See if we need to retrieve more data */ - if (t_thrd.xlog_cxt.readFile < 0) { - if (t_thrd.xlog_cxt.StandbyMode) { - /* - * In standby mode, wait for the requested record to become - * available, either via restore_command succeeding to restore the - * segment, or via walreceiver having streamed the record. - */ - for (;;) { - RedoInterruptCallBack(); - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - t_thrd.xlog_cxt.readFile = -1; - } - /* Reset curFileTLI if random fetch. */ - if (randAccess) { - t_thrd.xlog_cxt.curFileTLI = 0; - } - - /* - * Try to restore the file from archive, or read an - * existing file from pg_xlog. - */ - uint32 sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG; - if (!(sources & ~t_thrd.xlog_cxt.failedSources)) { - /* - * We've exhausted all options for retrieving the - * file. Retry. - */ - t_thrd.xlog_cxt.failedSources = 0; - - /* - * Before we sleep, re-scan for possible new timelines - * if we were requested to recover to the latest - * timeline. 
- */ - if (t_thrd.xlog_cxt.recoveryTargetIsLatest) { - if (rescanLatestTimeLine()) { - continue; - } - } - - PushToWorkerLsn(true); - WaitReplayFinishAfterReadXlogFileComplete(t_thrd.xlog_cxt.EndRecPtr); - - if (!xlogctl->IsRecoveryDone) { - g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp(); - g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr; - } - - XLogRecPtr lastReplayedLsn = GetXLogReplayRecPtr(NULL); - ereport(LOG, - (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("ParallelXLogPageReadFile IsRecoveryDone is %s set true," - "ReadRecPtr:%X/%X, EndRecPtr:%X/%X, lastreplayed:%X/%X", - xlogctl->IsRecoveryDone ? "next" : "first", - (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)(t_thrd.xlog_cxt.ReadRecPtr), - (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), (uint32)(t_thrd.xlog_cxt.EndRecPtr), - (uint32)(lastReplayedLsn >> 32), (uint32)(lastReplayedLsn)))); - - /* - * signal postmaster to update local redo end - * point to gaussdb state file. - */ - if (!xlogctl->IsRecoveryDone) { - SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE); - } - - SpinLockAcquire(&xlogctl->info_lck); - xlogctl->IsRecoveryDone = true; - SpinLockRelease(&xlogctl->info_lck); - if (!(IS_SHARED_STORAGE_MODE) || - pg_atomic_read_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage)) { - knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM); - ereport(LOG, - (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("ParallelXLogPageReadFile set redo finish status," - "ReadRecPtr:%X/%X, EndRecPtr:%X/%X", - (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), - (uint32)(t_thrd.xlog_cxt.ReadRecPtr), (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), - (uint32)(t_thrd.xlog_cxt.EndRecPtr)))); - - /* - * If it hasn't been long since last attempt, sleep 1s to - * avoid busy-waiting. 
- */ - pg_usleep(150000L); - } - /* - * If primary_conninfo is set, launch walreceiver to - * try to stream the missing WAL, before retrying to - * restore from archive/pg_xlog. - * - * If fetching_ckpt is TRUE, RecPtr points to the - * initial checkpoint location. In that case, we use - * RedoStartLSN as the streaming start position - * instead of RecPtr, so that when we later jump - * backwards to start redo at RedoStartLSN, we will - * have the logs streamed already. - */ - - uint32 trigger = pg_atomic_read_u32(&g_readManagerTriggerFlag); - if (trigger > 0) { - pg_atomic_write_u32(&g_readManagerTriggerFlag, TRIGGER_NORMAL); - goto triggered; - } - - load_server_mode(); - if (t_thrd.xlog_cxt.PrimaryConnInfo || t_thrd.xlog_cxt.server_mode == STANDBY_MODE) { - t_thrd.xlog_cxt.receivedUpto = 0; - uint32 failSouce = pg_atomic_read_u32(&g_dispatcher->rtoXlogBufState.failSource); - - if (!(failSouce & XLOG_FROM_STREAM)) { - volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv; - SpinLockAcquire(&walrcv->mutex); - walrcv->receivedUpto = 0; - SpinLockRelease(&walrcv->mutex); - t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM; - t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM; - pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readSource, - XLOG_FROM_STREAM); - pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.waitRedoDone, 0); - return -1; - } - } - } - /* Don't try to read from a source that just failed */ - sources &= ~t_thrd.xlog_cxt.failedSources; - t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, DEBUG2, sources); - if (t_thrd.xlog_cxt.readFile >= 0) { - break; - } - /* - * Nope, not found in archive and/or pg_xlog.: - */ - t_thrd.xlog_cxt.failedSources |= sources; - - /* - * Check to see if the trigger file exists. Note that we - * do this only after failure, so when you create the - * trigger file, we still finish replaying as much as we - * can from archive and pg_xlog before failover. 
- */ - uint32 trigger = pg_atomic_read_u32(&g_readManagerTriggerFlag); - if (trigger > 0) { - pg_atomic_write_u32(&g_readManagerTriggerFlag, TRIGGER_NORMAL); - goto triggered; - } - } - } else { - /* In archive or crash recovery. */ - if (t_thrd.xlog_cxt.readFile < 0) { - uint32 sources; - - /* Reset curFileTLI if random fetch. */ - if (randAccess) { - t_thrd.xlog_cxt.curFileTLI = 0; - } - - sources = XLOG_FROM_PG_XLOG; - if (t_thrd.xlog_cxt.InArchiveRecovery) { - sources |= XLOG_FROM_ARCHIVE; - } - - t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, LOG, sources); - - if (t_thrd.xlog_cxt.readFile < 0) { - return -1; - } - } - } - } - - /* - * At this point, we have the right segment open and if we're streaming we - * know the requested record is in it. - */ - Assert(t_thrd.xlog_cxt.readFile != -1); - - /* - * If the current segment is being streamed from master, calculate how - * much of the current page we have received already. We know the - * requested record has been received, but this is for the benefit of - * future calls, to allow quick exit at the top of this function. 
- */ - t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ; - - /* Read the requested page */ - t_thrd.xlog_cxt.readOff = targetPageOff; - -try_again: - if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { - ereport(emode_for_corrupt_record(LOG, RecPtr), - (errcode_for_file_access(), - errmsg("could not seek in log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; - } - pgstat_report_waitevent(WAIT_EVENT_WAL_READ); - ret = read(t_thrd.xlog_cxt.readFile, xlogreader->readBuf, XLOG_BLCKSZ); - pgstat_report_waitevent(WAIT_EVENT_END); - if (ret != XLOG_BLCKSZ) { - ereport(emode_for_corrupt_record(LOG, RecPtr), - (errcode_for_file_access(), - errmsg("could not read from log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; - } - Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo); - Assert(targetPageOff == t_thrd.xlog_cxt.readOff); - Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen); - - *readTLI = t_thrd.xlog_cxt.curFileTLI; - - return t_thrd.xlog_cxt.readLen; - -next_record_is_invalid: - t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource; - - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - } - t_thrd.xlog_cxt.readFile = -1; - t_thrd.xlog_cxt.readLen = 0; - t_thrd.xlog_cxt.readSource = 0; - - /* In standby-mode, keep trying */ - if (t_thrd.xlog_cxt.StandbyMode) { - goto retry; - } else { - return -1; - } - -triggered: - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - } - t_thrd.xlog_cxt.readFile = -1; - t_thrd.xlog_cxt.readLen = 0; - t_thrd.xlog_cxt.readSource = 0; - t_thrd.xlog_cxt.recoveryTriggered = true; - - return 
-1; -} - int ParallelXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, TimeLineID *readTLI) { @@ -552,12 +255,8 @@ int ParallelXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, if (readSource & XLOG_FROM_STREAM) { readLen = ParallelXLogReadWorkBufRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); } else { - if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { - readLen = SSXLogPageRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, - xlogreader->readBuf, readTLI, NULL); - } else { - readLen = ParallelXLogPageReadFile(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); - } + readLen = SSXLogPageRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, xlogreader->readBuf, + readTLI, NULL); } if (readLen > 0 || t_thrd.xlog_cxt.recoveryTriggered || !t_thrd.xlog_cxt.StandbyMode || DoEarlyExit()) { diff --git a/src/gausskernel/storage/access/transam/xlog.cpp b/src/gausskernel/storage/access/transam/xlog.cpp index 121262b04..eb8978df3 100755 --- a/src/gausskernel/storage/access/transam/xlog.cpp +++ b/src/gausskernel/storage/access/transam/xlog.cpp @@ -360,7 +360,7 @@ static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible); static bool XLogWritePaxos(XLogRecPtr WritePaxosRqst); #endif static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance, - bool use_lock); + bool use_lock, const char *xlog_dir); static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notexistOk); static void XLogFileClose(void); static void KeepFileRestoredFromArchive(const char *path, const char *xlogfname); @@ -414,8 +414,6 @@ static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); static XLogRecPtr XLogInsertRecordSingle(XLogRecData *rdata, XLogRecPtr fpw_lsn); -static int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, - XLogRecPtr 
targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path); static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int expectReadLen, XLogRecPtr targetRecPtr, char *buf, TimeLineID *readTLI, char* xlog_path); void ArchiveXlogForForceFinishRedo(XLogReaderState *xlogreader, TermFileData *term_file); @@ -424,6 +422,7 @@ XLogRecPtr mpfl_read_max_flush_lsn(); void mpfl_new_file(); void mpfl_ulink_file(); bool mpfl_pread_file(int fd, void *buf, int32 size, int64 offset); +static void SSOndemandXlogCopy(XLogSegNo copySegNo, uint32 startOffset, char *copyBuffer, Size copyBytes); #ifdef __aarch64__ static XLogRecPtr XLogInsertRecordGroup(XLogRecData *rdata, XLogRecPtr fpw_lsn); @@ -2755,6 +2754,9 @@ static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible) t_thrd.xlog_cxt.openLogOff += nbytes; npages = 0; + // write copy to recovery dir */ + SSOndemandXlogCopy(t_thrd.xlog_cxt.openLogSegNo, startoffset, from, nbytes); + /* * If we just wrote the whole last page of a logfile segment, * fsync the segment immediately. This avoids having to go back @@ -3707,7 +3709,7 @@ bool XLogNeedsFlush(XLogRecPtr record) * take down the system on failure). They will promote to PANIC if we are * in a critical section. 
*/ -int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) +static int XLogFileInitInternal(XLogSegNo logsegno, bool *use_existent, bool use_lock, const char *xlog_dir) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; @@ -3720,7 +3722,7 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) errno_t rc = EOK; gstrace_entry(GS_TRC_ID_XLogFileInit); - rc = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, t_thrd.xlog_cxt.ThisTimeLineID, + rc = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((logsegno) / XLogSegmentsPerXLogId), (uint32)((logsegno) % XLogSegmentsPerXLogId)); securec_check_ss(rc, "", ""); @@ -3752,7 +3754,7 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) */ ereport(DEBUG2, (errmsg("creating and filling new WAL file"))); - rc = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, "%s/xlogtemp.%lu", SS_XLOGDIR, gs_thread_self()); + rc = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, "%s/xlogtemp.%lu", xlog_dir, gs_thread_self()); securec_check_ss(rc, "\0", "\0"); unlink(tmppath); @@ -3834,7 +3836,8 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) */ installed_segno = logsegno; max_advance = XLOGfileslop; - if (!InstallXLogFileSegment(&installed_segno, (const char *)tmppath, *use_existent, &max_advance, use_lock)) { + if (!InstallXLogFileSegment(&installed_segno, (const char *)tmppath, *use_existent, &max_advance, + use_lock, xlog_dir)) { /* * No need for any more future segments, or InstallXLogFileSegment() * failed to rename the file into place. 
If the rename failed, opening @@ -3860,6 +3863,11 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) return fd; } +int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) +{ + return XLogFileInitInternal(logsegno, use_existent, use_lock, SS_XLOGDIR); +} + void XLogFileCutPage(char *buffer, uint32 bufLen, uint32 cpyLen) { if (bufLen != 0) { @@ -4120,13 +4128,13 @@ static void XLogFileTruncate(char *path, XLogRecPtr RecPtr) * file into place. */ static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance, - bool use_lock) + bool use_lock, const char *xlog_dir) { char path[MAXPGPATH]; struct stat stat_buf; errno_t errorno = EOK; - errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, t_thrd.xlog_cxt.ThisTimeLineID, + errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((*segno) / XLogSegmentsPerXLogId), (uint32)((*segno) % XLogSegmentsPerXLogId)); securec_check_ss(errorno, "", ""); @@ -4152,7 +4160,7 @@ static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool f } (*segno)++; (*max_advance)--; - errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, + errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((*segno) / XLogSegmentsPerXLogId), (uint32)((*segno) % XLogSegmentsPerXLogId)); securec_check_ss(errorno, "", ""); @@ -4178,13 +4186,13 @@ static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool f /* * Open a pre-existing logfile segment for writing. 
*/ -int XLogFileOpen(XLogSegNo segno) +static int XLogFileOpenInternal(XLogSegNo segno, const char *xlog_dir) { char path[MAXPGPATH]; int fd; errno_t errorno = EOK; - errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, t_thrd.xlog_cxt.ThisTimeLineID, + errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId)); securec_check_ss(errorno, "", ""); @@ -4215,6 +4223,11 @@ void SSXLOGCopyFromOldPrimary(XLogReaderState *state, XLogRecPtr pageptr) } } +int XLogFileOpen(XLogSegNo segno) +{ + return XLogFileOpenInternal(segno, SS_XLOGDIR); +} + /* * Open a logfile segment for reading (during recovery). * @@ -5038,7 +5051,7 @@ static void RemoveXlogFile(const char *segname, XLogRecPtr endptr) * symbolic links pointing to a separate archive directory. */ if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && - InstallXLogFileSegment(&endLogSegNo, (const char *)path, true, &max_advance, true)) { + InstallXLogFileSegment(&endLogSegNo, (const char *)path, true, &max_advance, true, SS_XLOGDIR)) { ereport(DEBUG2, (errmsg("recycled transaction log file \"%s\"", segname))); t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled++; /* Needn't recheck that slot on future iterations */ @@ -6362,6 +6375,8 @@ void XLOGShmemInit(void) t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck = g_instance.attr.attr_storage.XLOGbuffers - 1; t_thrd.shemem_ptr_cxt.XLogCtl->SharedRecoveryInProgress = true; t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false; + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false; + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone = false; t_thrd.shemem_ptr_cxt.XLogCtl->SharedHotStandbyActive = false; t_thrd.shemem_ptr_cxt.XLogCtl->WalWriterSleeping = false; t_thrd.shemem_ptr_cxt.XLogCtl->xlogFlushPtrForPerRead = InvalidXLogRecPtr; @@ -6682,7 +6697,7 @@ void BootStrapXLOG(void) /* In SS, 
the first node to create control file is will be primary */ if (ENABLE_DSS) { - SSWriteReformerControlPages(); + SSInitReformerControlPages(); } /* Now create pg_control */ @@ -7039,7 +7054,7 @@ static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo) /* * Now move the segment into place with its final name. */ - if (!InstallXLogFileSegment(&endLogSegNo, (const char *)tmppath, false, NULL, false)) + if (!InstallXLogFileSegment(&endLogSegNo, (const char *)tmppath, false, NULL, false, SS_XLOGDIR)) ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("InstallXLogFileSegment should not have failed"))); if (XLogArchivingActive()) { errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", endTLI, @@ -8082,7 +8097,7 @@ void ResourceManagerStop(void) errorno = memcpy_s((_oldXlogReader)->readBuf, XLOG_BLCKSZ, (_xlogreader)->readBuf, \ (_oldXlogReader)->readLen); \ securec_check(errorno, "", ""); \ - if (ENABLE_DSS && ENABLE_DMS) { \ + if (ENABLE_DSS && ENABLE_DMS && (_xlogreader)->preReadBuf != NULL) { \ (_oldXlogReader)->preReadStartPtr = (_xlogreader)->preReadStartPtr; \ errorno = memcpy_s((_oldXlogReader)->preReadBuf, XLogPreReadSize, \ (_xlogreader)->preReadBuf, XLogPreReadSize); \ @@ -8133,6 +8148,10 @@ inline void PrintCkpXctlControlFile(XLogRecPtr oldCkpLoc, CheckPoint *oldCkp, XL void CheckForRestartPoint() { + if (SS_IN_ONDEMAND_RECOVERY) { + return; + } + XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl; if (XLByteLT(xlogctl->lastCheckPointRecPtr, g_instance.comm_cxt.predo_cxt.newestCheckpointLoc)) { @@ -8481,6 +8500,10 @@ static void XLogMakeUpRemainSegsContent(char *contentBuffer) void XLogCheckRemainSegs() { + if (SS_ONDEMAND_BUILD_DONE && !SS_ONDEMAND_RECOVERY_DONE) { + return; + } + uint32 contentLen = XLogGetRemainContentLen(); pg_crc32c crc; char* contentBuffer = (char *)palloc_huge(CurrentMemoryContext, (contentLen + sizeof(pg_crc32c))); @@ -8778,12 +8801,27 @@ void StartupXLOG(void) * Note: in most control paths, 
*ControlFile is already valid and we need * not do ReadControlFile() here, but might as well do it to be sure. */ - if (ENABLE_DMS) { - int src_id = g_instance.attr.attr_storage.dms_attr.instance_id; - if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { - src_id = SSGetPrimaryInstId(); - ereport(LOG, (errmsg("[SS Reform]: Standby:%d promoting, reading control file of original primary:%d", - g_instance.attr.attr_storage.dms_attr.instance_id, src_id))); + if (ENABLE_DMS && ENABLE_DSS) { + int src_id = INVALID_INSTANCEID; + if (SS_CLUSTER_ONDEMAND_RECOVERY && SS_PRIMARY_MODE) { + if (SS_STANDBY_PROMOTING) { + ereport(FATAL, (errmsg("Do not allow switchover if on-demand recovery is not finish"))); + } + + Assert(g_instance.dms_cxt.SSReformerControl.recoveryInstId != INVALID_INSTANCEID); + src_id = g_instance.dms_cxt.SSReformerControl.recoveryInstId; + ereport(LOG, (errmsg("[on-demand]: On-demand recovery do not finish in last reform, " + "reading control file of original primary:%d", src_id))); + } else { + if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { + src_id = SSGetPrimaryInstId(); + ereport(LOG, (errmsg("[SS Reform]: Standby:%d promoting, reading control file of original primary:%d", + g_instance.attr.attr_storage.dms_attr.instance_id, src_id))); + } else { + src_id = g_instance.attr.attr_storage.dms_attr.instance_id; + } + g_instance.dms_cxt.SSReformerControl.recoveryInstId = src_id; + SSSaveReformerCtrl(); } SSReadControlFile(src_id); } else { @@ -8982,16 +9020,14 @@ void StartupXLOG(void) securec_check(errorno, "", ""); if (ENABLE_DMS && ENABLE_DSS) { + SSGetRecoveryXlogPath(); + xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER); + close_readFile_if_open(); if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { - SSGetXlogPath(); - xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER); - close_readFile_if_open(); // init shared memory set page empty SSCSNLOGShmemClear(); SSCLOGShmemClear(); 
SSMultiXactShmemClear(); - } else { - xlogreader = SSXLogReaderAllocate(&XLogPageRead, &readprivate, ALIGNOF_BUFFER); } } else { xlogreader = XLogReaderAllocate(&XLogPageRead, &readprivate); @@ -9263,6 +9299,8 @@ void StartupXLOG(void) SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid); t_thrd.shemem_ptr_cxt.XLogCtl->ckptXid = checkPoint.oldestXid; t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false; + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false; + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone = false; latestCompletedXid = checkPoint.nextXid; TransactionIdRetreat(latestCompletedXid); @@ -9440,6 +9478,17 @@ void StartupXLOG(void) t_thrd.xlog_cxt.InRecovery = false; } + if (SS_PRIMARY_MODE) { + if (ENABLE_ONDEMAND_RECOVERY && t_thrd.xlog_cxt.InRecovery == true) { + g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = true; + /* for other nodes in cluster */ + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_IN_ONDEMAND_BUILD; + } else { + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL; + } + SSSaveReformerCtrl(); + } + ReadRemainSegsFile(); /* Determine whether it is currently in the switchover of streaming disaster recovery */ checkHadrInSwitchover(); @@ -9933,7 +9982,12 @@ void StartupXLOG(void) } CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_1]); } while (record != NULL); // end of main redo apply loop - SendRecoveryEndMarkToWorkersAndWaitForFinish(0); + + if (SS_IN_ONDEMAND_RECOVERY) { + OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(0); + } else { + SendRecoveryEndMarkToWorkersAndWaitForFinish(0); + } RecoveryXlogReader(oldXlogReader, xlogreader); if (!(IS_OBS_DISASTER_RECOVER_MODE || IS_DISASTER_RECOVER_MODE)) { @@ -9987,6 +10041,11 @@ void StartupXLOG(void) } else { /* there are no WAL records following the checkpoint */ ereport(LOG, (errmsg("redo is not required"))); + if (SS_IN_ONDEMAND_RECOVERY) { + g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false; + 
g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL; + SSSaveReformerCtrl(); + } } } /* Set undoCountThreshold as a proper value after finish recovery. */ @@ -10266,7 +10325,7 @@ void StartupXLOG(void) t_thrd.xlog_cxt.InRecovery = false; g_instance.roach_cxt.isRoachRestore = false; - if (!SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING) { + if (!SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING && !SS_IN_ONDEMAND_RECOVERY) { LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION; t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL); @@ -10383,12 +10442,14 @@ void StartupXLOG(void) xlogctl->SharedRecoveryInProgress = false; xlogctl->IsRecoveryDone = true; SpinLockRelease(&xlogctl->info_lck); - NotifyGscRecoveryFinished(); - if (ENABLE_INCRE_CKPT) { - RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state; - (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE); - state->start = state->end; - (void)LWLockRelease(state->recovery_queue_lock); + if (!SS_IN_ONDEMAND_RECOVERY) { + NotifyGscRecoveryFinished(); + if (ENABLE_INCRE_CKPT) { + RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state; + (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE); + state->start = state->end; + (void)LWLockRelease(state->recovery_queue_lock); + } } } @@ -10397,19 +10458,20 @@ void StartupXLOG(void) g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = ALLOW_CKPT; pg_memory_barrier(); } - ereport(LOG, (errmodule(MOD_DMS), - errmsg("[SS switchover/SS failover] standby promoting: start full checkpoint."))); - - RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT); - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION; - t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL); - UpdateControlFile(); - LWLockRelease(ControlFileLock); - SSRecheckBufferPool(); - ereport(LOG, 
(errmodule(MOD_DMS), - errmsg("[SS switchover/SS failover] standby promoting: finished full checkpoint" - "and update control file"))); + if (!SS_IN_ONDEMAND_RECOVERY) { + ereport(LOG, (errmodule(MOD_DMS), + errmsg("[SS switchover/SS failover] standby promoting: start full checkpoint."))); + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT); + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION; + t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL); + UpdateControlFile(); + LWLockRelease(ControlFileLock); + SSRecheckBufferPool(); + ereport(LOG, (errmodule(MOD_DMS), + errmsg("[SS switchover/SS failover] standby promoting: finished full checkpoint" + "and update control file"))); + } } NextXidAfterReovery = t_thrd.xact_cxt.ShmemVariableCache->nextXid; @@ -10444,6 +10506,44 @@ void StartupXLOG(void) g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTED; } + if (SS_IN_ONDEMAND_RECOVERY) { + /* We wait at here */ + ereport(LOG, (errmsg("[SS] On-demand redo, nextXid: " XID_FMT ", startupMaxXid: " XID_FMT + ", recentLocalXmin: " XID_FMT ", recentGlobalXmin: %lu, PendingPreparedXacts: %d" + ", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.", + NextXidAfterReovery, t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid, + t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin, + t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, PendingPreparedXactsCount, + t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo, + t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min))); + OnDemandWaitRedoFinish(); + /* to do the work we skip before */ + XLogCheckInvalidPages(); + XLogCheckRemainSegs(); + + RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT); + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION; + t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL); + UpdateControlFile(); + 
LWLockRelease(ControlFileLock); + SSRecheckBufferPool(); + ereport(LOG, (errmodule(MOD_DMS), + errmsg("[SS][on demand recovery] finished full checkpoint and update control file"))); + + NotifyGscRecoveryFinished(); + if (ENABLE_INCRE_CKPT) { + RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state; + (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE); + state->start = state->end; + (void)LWLockRelease(state->recovery_queue_lock); + } + /* for other nodes in cluster */ + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL; + SSSaveReformerCtrl(); + g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false; + } + ereport(LOG, (errmsg("redo done, nextXid: " XID_FMT ", startupMaxXid: " XID_FMT ", recentLocalXmin: " XID_FMT ", recentGlobalXmin: %lu, PendingPreparedXacts: %d" ", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.", @@ -16973,49 +17073,36 @@ retry: /* Read the requested page */ t_thrd.xlog_cxt.readOff = targetPageOff; - if (ENABLE_DSS && ENABLE_DMS) { - bool ss_ret = SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, readBuf); - if (!ss_ret) { - ereport(emode_for_corrupt_record(emode, RecPtr), +try_again: + if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { + ereport(emode_for_corrupt_record(emode, RecPtr), (errcode_for_file_access(), - errmsg("[SS] could not read from log file %s to offset %u: %m", + errmsg("could not seek in log file %s to offset %u: %m", XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), t_thrd.xlog_cxt.readOff))); - goto next_record_is_invalid; - } - } else { -try_again: - if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { - ereport(emode_for_corrupt_record(emode, RecPtr), - (errcode_for_file_access(), - errmsg("could not seek in log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 
0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; - } - pgstat_report_waitevent(WAIT_EVENT_WAL_READ); - ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ); - pgstat_report_waitevent(WAIT_EVENT_END); - if (ret != XLOG_BLCKSZ) { - ereport(emode_for_corrupt_record(emode, RecPtr), - (errcode_for_file_access(), - errmsg("could not read from log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; } + goto next_record_is_invalid; + } + pgstat_report_waitevent(WAIT_EVENT_WAL_READ); + ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ); + pgstat_report_waitevent(WAIT_EVENT_END); + if (ret != XLOG_BLCKSZ) { + ereport(emode_for_corrupt_record(emode, RecPtr), + (errcode_for_file_access(), + errmsg("could not read from log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), + t_thrd.xlog_cxt.readOff))); + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; + } + goto next_record_is_invalid; } - Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo); Assert(targetPageOff == t_thrd.xlog_cxt.readOff); Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen); @@ -18842,6 +18929,78 @@ bool SSModifySharedLunAllowed() return false; } +static int SSOndemandCopyXLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) +{ + return XLogFileInitInternal(logsegno, use_existent, use_lock, SS_XLOGRECOVERYDIR); +} + +static int SSOndemandCopyXlogFileOpen(XLogSegNo segno) +{ + return XLogFileOpenInternal(segno, SS_XLOGRECOVERYDIR); +} + +static void SSOndemandCopyXlogFileClose(void) +{ + Assert(t_thrd.ondemand_xlog_copy_cxt.openLogFile >= 0); + + if (close(t_thrd.ondemand_xlog_copy_cxt.openLogFile)) { + ereport(PANIC, 
(errcode_for_file_access(), + errmsg("could not close copy log file %s: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo)))); + } + + t_thrd.ondemand_xlog_copy_cxt.openLogFile = -1; +} + +static void SSOndemandXlogCopy(XLogSegNo copySegNo, uint32 startOffset, char *copyBuffer, Size copyBytes) +{ + // only copy when recovery node and reformer node is not same + if (!SS_IN_ONDEMAND_RECOVERY || SS_OFFICIAL_RECOVERY_NODE) { + return; + } + + if (t_thrd.ondemand_xlog_copy_cxt.openLogSegNo != copySegNo) { + if (t_thrd.ondemand_xlog_copy_cxt.openLogFile >= 0) { + SSOndemandCopyXlogFileClose(); + } + t_thrd.ondemand_xlog_copy_cxt.openLogSegNo = copySegNo; + + bool use_existent = true; + t_thrd.ondemand_xlog_copy_cxt.openLogFile = + SSOndemandCopyXLogFileInit(t_thrd.ondemand_xlog_copy_cxt.openLogSegNo, &use_existent, true); + t_thrd.ondemand_xlog_copy_cxt.openLogOff = 0; + } + + if (t_thrd.ondemand_xlog_copy_cxt.openLogFile <= 0) { + t_thrd.ondemand_xlog_copy_cxt.openLogFile = + SSOndemandCopyXlogFileOpen(t_thrd.ondemand_xlog_copy_cxt.openLogSegNo); + t_thrd.ondemand_xlog_copy_cxt.openLogOff = 0; + } + + if (t_thrd.ondemand_xlog_copy_cxt.openLogOff != startOffset) { + if (lseek(t_thrd.ondemand_xlog_copy_cxt.openLogFile, (off_t)startOffset, SEEK_SET) < 0) { + ereport(PANIC, (errcode_for_file_access(), + errmsg("could not seek in log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo), + startOffset))); + } + t_thrd.ondemand_xlog_copy_cxt.openLogOff = startOffset; + } + + Size actualBytes = write(t_thrd.ondemand_xlog_copy_cxt.openLogFile, copyBuffer, copyBytes); + if (actualBytes != copyBytes) { + /* if write didn't set errno, assume no disk space */ + if (errno == 0) { + errno = ENOSPC; + } + ereport(PANIC, (errcode_for_file_access(), + errmsg("could not write to log file %s at offset %u, length %lu: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, 
t_thrd.ondemand_xlog_copy_cxt.openLogSegNo), + t_thrd.ondemand_xlog_copy_cxt.openLogOff, (unsigned long)copyBytes))); + } + t_thrd.ondemand_xlog_copy_cxt.openLogOff += copyBytes; +} + static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int expectReadLen, XLogRecPtr targetRecPtr, char *buf, TimeLineID *readTLI, char* xlog_path) { @@ -19230,8 +19389,14 @@ retry: /* Read the requested page */ t_thrd.xlog_cxt.readOff = targetPageOff; - bool ret = SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, buf); - if (!ret) { + int actualBytes; + if (xlogreader->preReadBuf != NULL) { + actualBytes = SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, buf, XLOG_BLCKSZ); + } else { + actualBytes = (int)pread(t_thrd.xlog_cxt.readFile, buf, XLOG_BLCKSZ, t_thrd.xlog_cxt.readOff); + } + + if (actualBytes != XLOG_BLCKSZ) { ereport(LOG, (errcode_for_file_access(), errmsg("read xlog(start:%X/%X, pos:%u len:%d) failed : %m", static_cast(targetPagePtr >> BIT_NUM_INT32), static_cast(targetPagePtr), targetPageOff, diff --git a/src/gausskernel/storage/access/transam/xlogreader.cpp b/src/gausskernel/storage/access/transam/xlogreader.cpp index b79ba4b8c..73729d16e 100644 --- a/src/gausskernel/storage/access/transam/xlogreader.cpp +++ b/src/gausskernel/storage/access/transam/xlogreader.cpp @@ -105,6 +105,7 @@ XLogReaderState *XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_d state->max_block_id = -1; state->isPRProcess = false; + state->preReadBuf = NULL; /* * Permanently allocate readBuf. 
We do it this way, rather than just diff --git a/src/gausskernel/storage/access/transam/xlogutils.cpp b/src/gausskernel/storage/access/transam/xlogutils.cpp index 8d17e8e78..211d34d50 100644 --- a/src/gausskernel/storage/access/transam/xlogutils.cpp +++ b/src/gausskernel/storage/access/transam/xlogutils.cpp @@ -513,6 +513,10 @@ static void CollectInvalidPagesStates(uint32 *nstates_ptr, InvalidPagesState *** /* Complain about any remaining invalid-page entries */ void XLogCheckInvalidPages(void) { + if (SS_ONDEMAND_BUILD_DONE && !SS_ONDEMAND_RECOVERY_DONE) { + return; + } + bool foundone = false; if (t_thrd.xlog_cxt.forceFinishHappened) { ereport(WARNING, @@ -671,7 +675,7 @@ XLogRedoAction XLogReadBufferForRedoBlockExtend(RedoBufferTag *redoblock, ReadBu if (pageisvalid) { if (readmethod != WITH_LOCAL_CACHE) { if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) { - if (ENABLE_DMS) + if (ENABLE_DMS && !SS_IN_ONDEMAND_RECOVERY) LockBuffer(buf, BUFFER_LOCK_SHARE); else if (get_cleanup_lock) LockBufferForCleanup(buf); @@ -698,7 +702,7 @@ XLogRedoAction XLogReadBufferForRedoBlockExtend(RedoBufferTag *redoblock, ReadBu return BLK_DONE; } else { if (readmethod != WITH_LOCAL_CACHE && mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK && - ENABLE_DMS) { + ENABLE_DMS && !SS_IN_ONDEMAND_RECOVERY) { Assert(!CheckPageNeedSkipInRecovery(buf)); LockBuffer(buf, BUFFER_LOCK_UNLOCK); if (get_cleanup_lock) { diff --git a/src/gausskernel/storage/buffer/bufmgr.cpp b/src/gausskernel/storage/buffer/bufmgr.cpp index 14c7a6011..8e79236af 100644 --- a/src/gausskernel/storage/buffer/bufmgr.cpp +++ b/src/gausskernel/storage/buffer/bufmgr.cpp @@ -84,6 +84,7 @@ #include "tde_key_management/tde_key_storage.h" #include "ddes/dms/ss_dms_bufmgr.h" #include "ddes/dms/ss_common_attr.h" +#include "ddes/dms/ss_reform_common.h" #include "ddes/dms/ss_transaction.h" const int ONE_MILLISECOND = 1; @@ -2155,7 +2156,8 @@ Buffer ReadBuffer_common_for_dms(ReadBufferMode readmode, 
BufferDesc* buf_desc, Block bufBlock = BufHdrGetBlock(buf_desc); #ifdef USE_ASSERT_CHECKING - bool need_verify = (!RecoveryInProgress() && ((pg_atomic_read_u32(&buf_desc->state) & BM_VALID) != 0) && ENABLE_VERIFY_PAGE_VERSION); + bool need_verify = (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY && + ((pg_atomic_read_u32(&buf_desc->state) & BM_VALID) != 0) && ENABLE_VERIFY_PAGE_VERSION); char *past_image = NULL; if (need_verify) { past_image = (char *)palloc(BLCKSZ); @@ -2346,8 +2348,11 @@ found_branch: } LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); } + + if (t_thrd.role != PAGEREDO && SS_ONDEMAND_BUILD_DONE && SS_PRIMARY_MODE) { + bufHdr = RedoForOndemandExtremeRTOQuery(bufHdr, relpersistence, forkNum, blockNum, mode); + } } - return BufferDescriptorGetBuffer(bufHdr); } @@ -2409,6 +2414,18 @@ found_branch: /* DMS: Try get page remote */ if (ENABLE_DMS) { + // standby node must notify primary node for prepare lastest page in ondemand recovery + if (SS_STANDBY_ONDEMAND_RECOVERY) { + while (!SSOndemandRequestPrimaryRedo(bufHdr->tag)) { + SSReadControlFile(REFORM_CTRL_PAGE); + if (SS_STANDBY_ONDEMAND_NORMAL) { + break; // ondemand recovery finish, skip + } else if (SS_STANDBY_ONDEMAND_BUILD) { + return 0; // in new reform + } + // still need requset page + } + } MarkReadHint(bufHdr->buf_id, relpersistence, isExtend, pblk); if (mode != RBM_FOR_REMOTE && relpersistence != RELPERSISTENCE_TEMP && !isLocalBuf) { Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); diff --git a/src/gausskernel/storage/lmgr/lwlock.cpp b/src/gausskernel/storage/lmgr/lwlock.cpp index 53e00458f..d00e31e84 100644 --- a/src/gausskernel/storage/lmgr/lwlock.cpp +++ b/src/gausskernel/storage/lmgr/lwlock.cpp @@ -196,7 +196,8 @@ static const char *BuiltinTrancheNames[] = { "FileRepairHashTblLock", "ReplicationOriginLock", "AuditIndextblLock", - "PCABufferContentLock" + "PCABufferContentLock", + "XlogTrackPartLock" }; static void RegisterLWLockTranches(void); @@ -436,6 +437,9 @@ int 
NumLWLocks(void) /* for barrier preparse hashtbl */ numLocks += 1; + /* for xlog track hash table */ + numLocks += NUM_XLOG_TRACK_PARTITIONS; + /* * Add any requested by loadable modules; for backwards-compatibility * reasons, allocate at least NUM_USER_DEFINED_LWLOCKS of them even if @@ -646,6 +650,10 @@ static void InitializeLWLocks(int numLocks) LWLockInitialize(&lock->lock, LWTRANCHE_STANDBY_STMTHIST); } + for (id = 0; id < NUM_XLOG_TRACK_PARTITIONS; id++, lock++) { + LWLockInitialize(&lock->lock, LWTRANCHE_XLOG_TRACK_PARTITION); + } + Assert((lock - t_thrd.shemem_ptr_cxt.mainLWLockArray) == NumFixedLWLocks); for (id = NumFixedLWLocks; id < numLocks; id++, lock++) { diff --git a/src/gausskernel/storage/lmgr/lwlocknames.txt b/src/gausskernel/storage/lmgr/lwlocknames.txt index c6daaab09..3a3fe6b75 100755 --- a/src/gausskernel/storage/lmgr/lwlocknames.txt +++ b/src/gausskernel/storage/lmgr/lwlocknames.txt @@ -138,3 +138,4 @@ GsStackLock 128 ConfigFileLock 129 DropArchiveSlotLock 130 AboCacheLock 131 +OndemandXlogMemAllocLock 132 diff --git a/src/gausskernel/storage/lmgr/proc.cpp b/src/gausskernel/storage/lmgr/proc.cpp index 8611e39ed..1aae1a251 100755 --- a/src/gausskernel/storage/lmgr/proc.cpp +++ b/src/gausskernel/storage/lmgr/proc.cpp @@ -2448,7 +2448,7 @@ void ProcSendSignal(ThreadId pid) { PGPROC* proc = NULL; - if (RecoveryInProgress()) { + if (RecoveryInProgress() || SS_IN_ONDEMAND_RECOVERY) { ProcBaseLockAccquire(&g_instance.proc_base_mutex_lock); /* diff --git a/src/gausskernel/storage/smgr/segment/segbuffer.cpp b/src/gausskernel/storage/smgr/segment/segbuffer.cpp index cfae8e036..aeb173291 100644 --- a/src/gausskernel/storage/smgr/segment/segbuffer.cpp +++ b/src/gausskernel/storage/smgr/segment/segbuffer.cpp @@ -344,7 +344,7 @@ void SegMarkBufferDirty(Buffer buf) #ifdef USE_ASSERT_CHECKING void SegFlushCheckDiskLSN(SegSpace *spc, RelFileNode rNode, ForkNumber forknum, BlockNumber blocknum, char *buf) { - if (!RecoveryInProgress() && ENABLE_DSS && 
ENABLE_VERIFY_PAGE_VERSION) { + if (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY && ENABLE_DSS && ENABLE_VERIFY_PAGE_VERSION) { char *origin_buf = (char *)palloc(BLCKSZ + ALIGNOF_BUFFER); char *temp_buf = (char *)BUFFERALIGN(origin_buf); seg_physical_read(spc, rNode, forknum, blocknum, temp_buf); @@ -526,8 +526,9 @@ Buffer ReadSegBufferForDMS(BufferDesc* bufHdr, ReadBufferMode mode, SegSpace *sp #endif } else { #ifdef USE_ASSERT_CHECKING - bool need_verify = (!RecoveryInProgress() && ((pg_atomic_read_u32(&bufHdr->state) & BM_VALID) != 0) && - ENABLE_DSS && ENABLE_VERIFY_PAGE_VERSION); + bool need_verify = (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY && + ((pg_atomic_read_u32(&bufHdr->state) & BM_VALID) != 0) && ENABLE_DSS && + ENABLE_VERIFY_PAGE_VERSION); char *past_image = NULL; if (need_verify) { past_image = (char *)palloc(BLCKSZ); diff --git a/src/include/access/extreme_rto_redo_api.h b/src/include/access/extreme_rto_redo_api.h index d1ba7d52a..09f994d51 100644 --- a/src/include/access/extreme_rto_redo_api.h +++ b/src/include/access/extreme_rto_redo_api.h @@ -27,6 +27,7 @@ #include "access/xlogproc.h" #include "access/redo_statistic.h" +#include "access/ondemand_extreme_rto/redo_utils.h" typedef enum { DEFAULT_EXTREME_RTO, diff --git a/src/include/access/multi_redo_api.h b/src/include/access/multi_redo_api.h index 02efa7d8e..1471c37e6 100644 --- a/src/include/access/multi_redo_api.h +++ b/src/include/access/multi_redo_api.h @@ -35,6 +35,13 @@ #include "access/redo_statistic.h" #include "access/extreme_rto_redo_api.h" +#ifdef ENABLE_LITE_MODE +#define ENABLE_ONDEMAND_RECOVERY false +#else +#define ENABLE_ONDEMAND_RECOVERY (ENABLE_DMS && IsExtremeRedo() \ + && g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery) +#endif + typedef enum { NOT_PAGE_REDO_THREAD, PAGE_REDO_THREAD_EXIT_NORMAL, diff --git a/src/include/access/ondemand_extreme_rto/batch_redo.h b/src/include/access/ondemand_extreme_rto/batch_redo.h index 0f297985e..5abde5754 100644 
--- a/src/include/access/ondemand_extreme_rto/batch_redo.h +++ b/src/include/access/ondemand_extreme_rto/batch_redo.h @@ -46,6 +46,10 @@ namespace ondemand_extreme_rto { #define INIT_REDO_ITEM_TAG(a, xx_rnode, xx_forkNum, xx_blockNum) \ ((a).rNode = (xx_rnode), (a).forkNum = (xx_forkNum), (a).blockNum = (xx_blockNum)) +#define XlogTrackTableHashPartition(hashcode) ((hashcode) % NUM_XLOG_TRACK_PARTITIONS) +#define XlogTrackMappingPartitionLock(hashcode) \ + (&t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstXlogTrackLock + XlogTrackTableHashPartition(hashcode)].lock) + /* * Note: if there are any pad bytes in the struct, INIT_RedoItemTag have * to be fixed to zero them, since this struct is used as a hash key. @@ -61,12 +65,14 @@ typedef struct redoitemhashentry { XLogRecParseState *head; XLogRecParseState *tail; int redoItemNum; + bool redoDone; } RedoItemHashEntry; extern void PRPrintRedoItemHashTab(HTAB *redoItemHash); -extern HTAB *PRRedoItemHashInitialize(MemoryContext context); +extern HTAB **PRRedoItemHashInitialize(MemoryContext context); extern void PRTrackClearBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); extern void PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); +extern uint32 XlogTrackTableHashCode(RedoItemTag *tagPtr); } // namespace ondemand_extreme_rto -#endif /* BATCH_REDO_H */ +#endif /* ONDEMAND_EXTREME_RTO_BATCH_REDO_H */ diff --git a/src/include/access/ondemand_extreme_rto/dispatcher.h b/src/include/access/ondemand_extreme_rto/dispatcher.h index 09cfb3bda..17f9958cf 100644 --- a/src/include/access/ondemand_extreme_rto/dispatcher.h +++ b/src/include/access/ondemand_extreme_rto/dispatcher.h @@ -165,6 +165,7 @@ typedef struct { volatile bool recoveryStop; volatile XLogRedoNumStatics xlogStatics[RM_NEXT_ID][MAX_XLOG_INFO_NUM]; RedoTimeCost *startupTimeCost; + RedoParseManager parseManager; } LogDispatcher; typedef struct { @@ -185,6 +186,7 @@ const static uint64 PRINT_ALL_WAIT_COUNT = 0x7FFFFFFFF; extern 
RedoItem g_redoEndMark; extern RedoItem g_terminateMark; extern uint32 g_readManagerTriggerFlag; +extern RefOperate recordRefOperate; inline int get_batch_redo_num() { @@ -215,6 +217,8 @@ void FreeRedoItem(RedoItem *item); /* Dispatcher phases. */ void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code); +void SendRecoveryEndMarkToWorkersAndWaitForReach(int code); +void WaitRedoFinish(); /* Dispatcher states. */ int GetDispatcherExitCode(); diff --git a/src/include/access/ondemand_extreme_rto/page_redo.h b/src/include/access/ondemand_extreme_rto/page_redo.h index 6dbacf892..9d55e598c 100644 --- a/src/include/access/ondemand_extreme_rto/page_redo.h +++ b/src/include/access/ondemand_extreme_rto/page_redo.h @@ -40,10 +40,13 @@ namespace ondemand_extreme_rto { -static const uint32 PAGE_WORK_QUEUE_SIZE = 8192; +#define ONDEMAND_DISTRIBUTE_RATIO 0.9 + +static const uint32 PAGE_WORK_QUEUE_SIZE = 2097152; static const uint32 ONDEMAND_EXTREME_RTO_ALIGN_LEN = 16; /* need 128-bit aligned */ static const uint32 MAX_REMOTE_READ_INFO_NUM = 100; +static const uint32 ADVANCE_GLOBALLSN_INTERVAL = 1; /* unit second */ typedef enum { REDO_BATCH, @@ -181,8 +184,6 @@ struct PageRedoWorker { RedoParseManager parseManager; RedoBufferManager bufferManager; RedoTimeCost timeCostList[TIME_COST_NUM]; - uint32 remoteReadPageNum; - HTAB *badPageHashTbl; char page[BLCKSZ]; XLogBlockDataParse *curRedoBlockState; }; @@ -214,6 +215,7 @@ void WaitPageRedoWorkerReachLastMark(PageRedoWorker *worker); /* Redo processing. */ void AddPageRedoItem(PageRedoWorker *worker, void *item); +uint64 GetCompletedRecPtr(PageRedoWorker *worker); void UpdatePageRedoWorkerStandbyState(PageRedoWorker *worker, HotStandbyState newState); /* Redo end states. 
*/ @@ -225,7 +227,9 @@ PageRedoWorker *CreateWorker(uint32 id); extern void UpdateRecordGlobals(RedoItem *item, HotStandbyState standbyState); void ReferenceRedoItem(void *item); void DereferenceRedoItem(void *item); -void PushToWorkerLsn(bool force); +void ReferenceRecParseState(XLogRecParseState *recordstate); +void DereferenceRecParseState(XLogRecParseState *recordstate); +void PushToWorkerLsn(); void GetCompletedReadEndPtr(PageRedoWorker *worker, XLogRecPtr *readPtr, XLogRecPtr *endPtr); void SetReadBufferForExtRto(XLogReaderState *state, XLogRecPtr pageptr, int reqLen); void DumpExtremeRtoReadBuf(); @@ -241,18 +245,5 @@ void DispatchCleanInvalidPageMarkToAllRedoWorker(RepairFileKey key); const char *RedoWokerRole2Str(RedoRole role); - -/* block or file repair function */ -HTAB* BadBlockHashTblCreate(); -void RepairPageAndRecoveryXLog(BadBlockRecEnt *page_info, const char *page); -void CheckRemoteReadAndRepairPage(BadBlockRecEnt *entry); -void ClearSpecificsPageEntryAndMem(BadBlockRecEnt *entry); -void ClearRecoveryThreadHashTbl(const RelFileNode &node, ForkNumber forknum, BlockNumber minblkno, - bool segment_shrink); -void BatchClearRecoveryThreadHashTbl(Oid spcNode, Oid dbNode); -void RecordBadBlockAndPushToRemote(XLogBlockDataParse *datadecode, PageErrorType error_type, - XLogRecPtr old_lsn, XLogPhyBlock pblk); -void SeqCheckRemoteReadAndRepairPage(); - } // namespace ondemand_extreme_rto #endif diff --git a/src/include/access/ondemand_extreme_rto/redo_utils.h b/src/include/access/ondemand_extreme_rto/redo_utils.h new file mode 100644 index 000000000..8b2775785 --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/redo_utils.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. 
+ * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * redo_utils.h + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/redo_utils.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_REDO_UTILS_H +#define ONDEMAND_EXTREME_RTO_REDO_UTILS_H + +#include "access/xlogproc.h" + +#define PARSEBUFFER_SIZE (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)) +#define ONDEMAND_MAX_PARSEBUFF_PREPALLOC ((1024 * 1024 * 1024 - 1) / PARSEBUFFER_SIZE) +#define ONDEMAND_MAX_PARSESIZE_PREPALLOC (ONDEMAND_MAX_PARSEBUFF_PREPALLOC * PARSEBUFFER_SIZE) +#define ONDEMAND_MAX_PARSEBUFF_ALLOCSIZE 100 // 100GB + +typedef struct +{ + int allocNum; + void *allocEntry[ONDEMAND_MAX_PARSEBUFF_ALLOCSIZE]; + void *memslotEntry; +} OndemandParseAllocCtrl; + + +void OndemandXLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOperate *refOperate, + InterruptFunc interruptOperte); +void OndemandXLogParseBufferDestory(RedoParseManager *parsemanager); +XLogRecParseState *OndemandXLogParseBufferAllocList(RedoParseManager *parsemanager, XLogRecParseState *blkstatehead, + void *record); +void OndemandXLogParseBufferRelease(XLogRecParseState *recordstate); +void OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(int code); +void OnDemandWaitRedoFinish(); + +#endif /* ONDEMAND_EXTREME_RTO_REDO_UTILS_H */ \ No newline at end of file diff --git a/src/include/access/ondemand_extreme_rto/xlog_read.h b/src/include/access/ondemand_extreme_rto/xlog_read.h index 99a94f62f..6642a013e 100644 --- 
a/src/include/access/ondemand_extreme_rto/xlog_read.h +++ b/src/include/access/ondemand_extreme_rto/xlog_read.h @@ -33,4 +33,4 @@ XLogRecord* XLogParallelReadNextRecord(XLogReaderState* xlogreader); XLogRecord *ReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode); } // namespace ondemand_extreme_rto -#endif /* XLOG_READ_H */ \ No newline at end of file +#endif /* ONDEMAND_EXTREME_RTO_XLOG_READ_H */ \ No newline at end of file diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 707e4eaf0..314357fc2 100755 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -541,6 +541,8 @@ typedef struct XLogCtlData { bool SharedRecoveryInProgress; bool IsRecoveryDone; + bool IsOnDemandBuildDone; + bool IsOnDemandRecoveryDone; /* * SharedHotStandbyActive indicates if we're still in crash or archive @@ -871,6 +873,8 @@ void UpdateMinrecoveryInAchive(); bool NewDataIsInBuf(XLogRecPtr expectedRecPtr); bool rescanLatestTimeLine(void); int XLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources); +int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path); extern XLogRecPtr XlogRemoveSegPrimary; diff --git a/src/include/access/xlog_basic.h b/src/include/access/xlog_basic.h index 31b526292..e03b585bd 100644 --- a/src/include/access/xlog_basic.h +++ b/src/include/access/xlog_basic.h @@ -99,6 +99,7 @@ * The XLog directory and control file (relative to $PGDATA) */ #define SS_XLOGDIR (g_instance.datadir_cxt.xlogDir) +#define SS_XLOGRECOVERYDIR (g_instance.dms_cxt.SSRecoveryInfo.recovery_xlogDir) #define XLOGDIR "pg_xlog" #define ARCHIVEDIR "pg_xlog/archive_status" #define XLOG_CONTROL_FILE (g_instance.datadir_cxt.controlPath) diff --git a/src/include/access/xlogproc.h b/src/include/access/xlogproc.h index e144fefcc..5b5a05df9 100755 --- a/src/include/access/xlogproc.h +++ b/src/include/access/xlogproc.h @@ -111,6 +111,7 @@ typedef 
struct { typedef struct { Buffer buff_id; pg_atomic_uint32 state; + pg_atomic_uint32 refcount; } ParseBufferDesc; #define RedoBufferSlotGetBuffer(bslot) ((bslot)->buf_id) @@ -687,7 +688,12 @@ typedef struct RefOperate *refOperate; }RedoParseManager; - +typedef enum { + XLOG_NO_DISTRIBUTE, + XLOG_HEAD_DISTRIBUTE, + XLOG_MID_DISTRIBUTE, + XLOG_TAIL_DISTRIBUTE, +} XlogDistributePos; typedef struct { void* nextrecord; @@ -695,6 +701,7 @@ typedef struct { RedoParseManager* manager; void* refrecord; /* origin dataptr, for mem release */ bool isFullSync; + XlogDistributePos distributeStatus; } XLogRecParseState; typedef struct XLogBlockRedoExtreRto { @@ -908,7 +915,36 @@ extern AbnormalProcFunc g_AbFunList[ABNORMAL_NUM]; #define ADD_ABNORMAL_POSITION(pos) #endif +static inline bool AtomicCompareExchangeBuffer(volatile Buffer *ptr, Buffer *expected, Buffer newval) +{ + bool ret = false; + Buffer current; + current = __sync_val_compare_and_swap(ptr, *expected, newval); + ret = (current == *expected); + *expected = current; + return ret; +} +static inline Buffer AtomicReadBuffer(volatile Buffer *ptr) +{ + return *ptr; +} + +static inline void AtomicWriteBuffer(volatile Buffer* ptr, Buffer val) +{ + *ptr = val; +} + +static inline Buffer AtomicExchangeBuffer(volatile Buffer *ptr, Buffer newval) +{ + Buffer old; + while (true) { + old = AtomicReadBuffer(ptr); + if (AtomicCompareExchangeBuffer(ptr, &old, newval)) + break; + } + return old; +} void HeapXlogCleanOperatorPage( RedoBufferInfo* buffer, void* recorddata, void* blkdata, Size datalen, Size* freespace, bool repairFragmentation); @@ -1204,6 +1240,7 @@ extern XLogRecParseState* xact_redo_parse_to_block(XLogReaderState* record, uint extern bool XLogBlockRedoForExtremeRTO(XLogRecParseState* redoblocktate, RedoBufferInfo *bufferinfo, bool notfound, RedoTimeCost &readBufCost, RedoTimeCost &redoCost); +extern void XlogBlockRedoForOndemandExtremeRTOQuery(XLogRecParseState *redoBlockState, RedoBufferInfo *bufferInfo); void 
XLogBlockParseStateRelease_debug(XLogRecParseState* recordstate, const char *func, uint32 line); #define XLogBlockParseStateRelease(recordstate) XLogBlockParseStateRelease_debug(recordstate, __FUNCTION__, __LINE__) #ifdef USE_ASSERT_CHECKING diff --git a/src/include/ddes/dms/ss_common_attr.h b/src/include/ddes/dms/ss_common_attr.h index 687df87ec..8289a696f 100644 --- a/src/include/ddes/dms/ss_common_attr.h +++ b/src/include/ddes/dms/ss_common_attr.h @@ -140,6 +140,17 @@ #define SS_PRIMARY_STANDBY_CLUSTER_NORMAL_STANDBY \ (SS_NORMAL_STANDBY && (g_instance.attr.attr_storage.xlog_file_path != 0)) +#define SS_CLUSTER_NOT_NORAML (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus != CLUSTER_NORMAL)) +#define SS_CLUSTER_ONDEMAND_BUILD \ + (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus == CLUSTER_IN_ONDEMAND_BUILD)) +#define SS_CLUSTER_ONDEMAND_RECOVERY \ + (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus == CLUSTER_IN_ONDEMAND_RECOVERY)) +#define SS_CLUSTER_ONDEMAND_NORMAL \ + (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus == CLUSTER_NORMAL)) +#define SS_STANDBY_ONDEMAND_BUILD (SS_STANDBY_MODE && SS_CLUSTER_ONDEMAND_BUILD) +#define SS_STANDBY_ONDEMAND_RECOVERY (SS_STANDBY_MODE && SS_CLUSTER_ONDEMAND_RECOVERY) +#define SS_STANDBY_ONDEMAND_NORMAL (SS_STANDBY_MODE && SS_CLUSTER_ONDEMAND_NORMAL) + /* DMS_BUF_NEED_LOAD */ #define BUF_NEED_LOAD 0x1 /* DMS_BUF_IS_LOADED */ @@ -207,5 +218,18 @@ typedef enum SSReformType { DMS_REFORM_TYPE_FOR_MAINTAIN } SSReformType; +typedef enum SSGlobalClusterState { + CLUSTER_IN_ONDEMAND_BUILD = 0, + CLUSTER_IN_ONDEMAND_RECOVERY, + CLUSTER_NORMAL +} SSGlobalClusterState; + +typedef enum SSOndemandRequestRedoStatus { + ONDEMAND_REDO_DONE = 0, + ONDEMAND_REDO_SKIP, + ONDEMAND_REDO_FAIL, + ONDEMAND_REDO_INVALID +} SSOndemandRequestRedoStatus; + #endif diff --git a/src/include/ddes/dms/ss_dms.h b/src/include/ddes/dms/ss_dms.h index 3a1d3363d..3430f499b 100644 --- 
a/src/include/ddes/dms/ss_dms.h +++ b/src/include/ddes/dms/ss_dms.h @@ -80,6 +80,8 @@ typedef struct st_ss_dms_func { void (*dms_refresh_logger)(char *log_field, unsigned long long *value); void (*dms_validate_drc)(dms_context_t *dms_ctx, dms_buf_ctrl_t *ctrl, unsigned long long lsn, unsigned char is_dirty); + int (*dms_reform_req_opengauss_ondemand_redo_buffer)(dms_context_t *dms_ctx, void *block_key, unsigned int key_len, + int *redo_status); } ss_dms_func_t; int ss_dms_func_init(); @@ -123,6 +125,8 @@ bool dms_latch_timed_s(dms_context_t *dms_ctx, dms_drlatch_t *dlatch, unsigned i void dms_unlatch(dms_context_t *dms_ctx, dms_drlatch_t *dlatch); void dms_pre_uninit(void); void dms_validate_drc(dms_context_t *dms_ctx, dms_buf_ctrl_t *ctrl, unsigned long long lsn, unsigned char is_dirty); +int dms_reform_req_opengauss_ondemand_redo_buffer(dms_context_t *dms_ctx, void *block_key, unsigned int key_len, + int *redo_status); #ifdef __cplusplus } #endif diff --git a/src/include/ddes/dms/ss_dms_bufmgr.h b/src/include/ddes/dms/ss_dms_bufmgr.h index 9d10736f6..8807ad54b 100644 --- a/src/include/ddes/dms/ss_dms_bufmgr.h +++ b/src/include/ddes/dms/ss_dms_bufmgr.h @@ -83,4 +83,5 @@ long SSGetBufSleepTime(int retry_times); SMGR_READ_STATUS SmgrNetPageCheckRead(Oid spcNode, Oid dbNode, Oid relNode, ForkNumber forkNum, BlockNumber blockNo, char *blockbuf); void SSUnPinBuffer(BufferDesc* buf_desc); +bool SSOndemandRequestPrimaryRedo(BufferTag tag); #endif diff --git a/src/include/ddes/dms/ss_dms_recovery.h b/src/include/ddes/dms/ss_dms_recovery.h index 6affe3c2c..265362c41 100644 --- a/src/include/ddes/dms/ss_dms_recovery.h +++ b/src/include/ddes/dms/ss_dms_recovery.h @@ -32,11 +32,30 @@ #define SS_BEFORE_RECOVERY (ENABLE_DMS && g_instance.dms_cxt.SSReformInfo.in_reform == true \ && g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag == true) #define SS_IN_FAILOVER (ENABLE_DMS && g_instance.dms_cxt.SSRecoveryInfo.in_failover == true) +#define SS_IN_ONDEMAND_RECOVERY (ENABLE_DMS 
&& g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery == true) +#define SS_ONDEMAND_BUILD_DONE (ENABLE_DMS && SS_IN_ONDEMAND_RECOVERY \ + && t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone == true) +#define SS_ONDEMAND_RECOVERY_DONE (ENABLE_DMS && SS_IN_ONDEMAND_RECOVERY \ + && t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone == true) +#define SS_REPLAYED_BY_ONDEMAND (ENABLE_DMS && !SS_IN_ONDEMAND_RECOVERY && \ + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone == true && \ + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone == true) -typedef struct st_reformer_ctrl { +#define REFORM_CTRL_VERSION 1 + +typedef struct st_old_reformer_ctrl { uint64 list_stable; // stable instances list int primaryInstId; pg_crc32c crc; +} ss_old_reformer_ctrl_t; + +typedef struct st_reformer_ctrl { + uint32 version; + uint64 list_stable; // stable instances list + int primaryInstId; + int recoveryInstId; + SSGlobalClusterState clusterStatus; + pg_crc32c crc; } ss_reformer_ctrl_t; typedef struct st_reform_info { @@ -66,14 +85,14 @@ typedef struct ss_recovery_info { bool no_backend_left; bool startup_need_exit_normally; //used in alive failover bool recovery_trapped_in_page_request; //used in alive failover + bool in_ondemand_recovery; } ss_recovery_info_t; extern bool SSRecoveryNodes(); extern void SSWaitStartupExit(); extern int SSGetPrimaryInstId(); extern void SSSavePrimaryInstId(int id); -extern void SSReadControlFile(int id, bool updateDmsCtx = false); -extern void SSWriteReformerControlPages(void); +extern void SSInitReformerControlPages(void); extern bool SSRecoveryApplyDelay(); extern void SShandle_promote_signal(); extern void ss_failover_dw_init(); diff --git a/src/include/ddes/dms/ss_init.h b/src/include/ddes/dms/ss_init.h index ed83a51b6..28451a991 100644 --- a/src/include/ddes/dms/ss_init.h +++ b/src/include/ddes/dms/ss_init.h @@ -32,8 +32,10 @@ #define DMS_MAX_CONNECTIONS (int32)16000 #define SS_PRIMARY_ID 
g_instance.dms_cxt.SSReformerControl.primaryInstId // currently master ID is hardcoded as 0 +#define SS_RECOVERY_ID g_instance.dms_cxt.SSReformerControl.recoveryInstId #define SS_MY_INST_ID g_instance.attr.attr_storage.dms_attr.instance_id #define SS_OFFICIAL_PRIMARY (SS_MY_INST_ID == SS_PRIMARY_ID) +#define SS_OFFICIAL_RECOVERY_NODE (SS_MY_INST_ID == SS_RECOVERY_ID) void DMSInit(); void DMSUninit(); diff --git a/src/include/ddes/dms/ss_reform_common.h b/src/include/ddes/dms/ss_reform_common.h index 40cad1884..a934c3f37 100644 --- a/src/include/ddes/dms/ss_reform_common.h +++ b/src/include/ddes/dms/ss_reform_common.h @@ -31,14 +31,18 @@ #define REFORM_WAIT_LONG 100000 /* 0.1 sec */ #define WAIT_REFORM_CTRL_REFRESH_TRIES 1000 +#define REFORM_CTRL_VERSION 1 + typedef struct SSBroadcastCancelTrx { SSBroadcastOp type; // must be first } SSBroadcastCancelTrx; -bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf); +int SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf, + int readLen); XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data, Size alignedSize); -void SSGetXlogPath(); -void SSSaveReformerCtrl(); +void SSGetRecoveryXlogPath(); +void SSSaveReformerCtrl(bool force = false); +void SSReadControlFile(int id, bool updateDmsCtx = false); void SSClearSegCache(); int SSCancelTransactionOfAllStandby(SSBroadcastOp type); int SSProcessCancelTransaction(SSBroadcastOp type); diff --git a/src/include/knl/knl_guc/knl_instance_attr_storage.h b/src/include/knl/knl_guc/knl_instance_attr_storage.h index f113c865e..003a59bf9 100755 --- a/src/include/knl/knl_guc/knl_instance_attr_storage.h +++ b/src/include/knl/knl_guc/knl_instance_attr_storage.h @@ -100,6 +100,8 @@ typedef struct knl_instance_attr_dms { bool enable_catalog_centralized; bool enable_dss_aio; bool enable_verify_page; + bool enable_ondemand_recovery; + int 
ondemand_recovery_mem_size; int instance_id; int recv_msg_pool_size; char* interconnect_url; diff --git a/src/include/knl/knl_instance.h b/src/include/knl/knl_instance.h index c9aa5edd2..c3096bb84 100755 --- a/src/include/knl/knl_instance.h +++ b/src/include/knl/knl_instance.h @@ -741,6 +741,8 @@ typedef struct knl_g_parallel_redo_context { char* ali_buf; XLogRedoNumStatics xlogStatics[RM_NEXT_ID][MAX_XLOG_INFO_NUM]; RedoCpuBindControl redoCpuBindcontrl; + + HTAB **redoItemHash; /* used in ondemand extreme RTO */ } knl_g_parallel_redo_context; typedef struct knl_g_heartbeat_context { @@ -827,7 +829,7 @@ typedef struct knl_g_comm_context { long lastArchiveRcvTime; void* pLogCtl; bool rejectRequest; - + MemoryContext redoItemCtx; #ifdef USE_SSL libcomm_sslinfo* libcomm_data_port_list; libcomm_sslinfo* libcomm_ctrl_port_list; diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h index 979580bfb..f141b06d8 100755 --- a/src/include/knl/knl_thread.h +++ b/src/include/knl/knl_thread.h @@ -3355,6 +3355,12 @@ typedef struct knl_t_dms_context { bool flush_copy_get_page_failed; //used in flush copy } knl_t_dms_context; +typedef struct knl_t_ondemand_xlog_copy_context { + int openLogFile; + XLogSegNo openLogSegNo; + uint32 openLogOff; +} knl_t_ondemand_xlog_copy_context; + /* thread context. 
*/ typedef struct knl_thrd_context { knl_thread_role role; @@ -3503,6 +3509,7 @@ typedef struct knl_thrd_context { knl_t_cfs_shrinker_context cfs_shrinker_cxt; knl_t_sql_patch_context sql_patch_cxt; knl_t_dms_context dms_cxt; + knl_t_ondemand_xlog_copy_context ondemand_xlog_copy_cxt; knl_t_rc_context rc_cxt; } knl_thrd_context; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 1efaf07fe..47f54f789 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -37,6 +37,7 @@ /***************************************************************************** * Backend version and inplace upgrade staffs *****************************************************************************/ +extern const uint32 ONDEMAND_REDO_VERSION_NUM; extern const uint32 SRF_FUSION_VERSION_NUM; extern const uint32 INNER_UNIQUE_VERSION_NUM; extern const uint32 PARTITION_ENHANCE_VERSION_NUM; @@ -131,6 +132,7 @@ extern const uint32 CREATE_TABLE_AS_VERSION_NUM; extern void register_backend_version(uint32 backend_version); extern bool contain_backend_version(uint32 version_number); +extern void SSUpgradeFileBeforeCommit(); #define INPLACE_UPGRADE_PRECOMMIT_VERSION 1 @@ -402,6 +404,7 @@ extern bool stack_is_too_deep(void); /* in tcop/utility.c */ extern void PreventCommandIfReadOnly(const char* cmdname); extern void PreventCommandDuringRecovery(const char* cmdname); +extern void PreventCommandDuringSSOndemandRecovery(Node* parseTree); extern int trace_recovery(int trace_level); diff --git a/src/include/storage/buf/bufmgr.h b/src/include/storage/buf/bufmgr.h index 255f3d5fa..a02c2837d 100644 --- a/src/include/storage/buf/bufmgr.h +++ b/src/include/storage/buf/bufmgr.h @@ -420,5 +420,6 @@ extern bool StartBufferIO(BufferDesc* buf, bool forInput); extern Buffer ReadBuffer_common_for_dms(ReadBufferMode readmode, BufferDesc *bufDesc, const XLogPhyBlock *pblk); extern void ReadBuffer_common_for_check(ReadBufferMode readmode, BufferDesc* buf_desc, const XLogPhyBlock *pblk, Block 
bufBlock); - +extern BufferDesc *RedoForOndemandExtremeRTOQuery(BufferDesc *bufHdr, char relpersistence, + ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode); #endif diff --git a/src/include/storage/lock/lwlock.h b/src/include/storage/lock/lwlock.h index 0ab2e3a2b..92858f0dd 100644 --- a/src/include/storage/lock/lwlock.h +++ b/src/include/storage/lock/lwlock.h @@ -128,6 +128,9 @@ const struct LWLOCK_PARTITION_DESC LWLockPartInfo[] = { /* Number of partions of the segment head buffer */ #define NUM_SEGMENT_HEAD_PARTITIONS 128 +/* Number of partitions of the redo xlog track mapping hashtable */ +#define NUM_XLOG_TRACK_PARTITIONS 4096 + /* Number of partions the session roleid hashtable */ #define NUM_SESSION_ROLEID_PARTITIONS 128 @@ -190,8 +193,9 @@ const struct LWLOCK_PARTITION_DESC LWLockPartInfo[] = { #define FirstGPRCMappingLock (FirstSessRoleIdLock + NUM_SESSION_ROLEID_PARTITIONS) /* standby statement history */ #define FirstStandbyStmtHistLock (FirstGPRCMappingLock + NUM_GPRC_PARTITIONS) +#define FirstXlogTrackLock (FirstStandbyStmtHistLock + NUM_STANDBY_STMTHIST_PARTITIONS) /* must be last: */ -#define NumFixedLWLocks (FirstStandbyStmtHistLock + NUM_STANDBY_STMTHIST_PARTITIONS) +#define NumFixedLWLocks (FirstXlogTrackLock + NUM_XLOG_TRACK_PARTITIONS) /* * WARNING----Please keep BuiltinTrancheIds and BuiltinTrancheNames consistent!!! * @@ -270,6 +274,7 @@ enum BuiltinTrancheIds LWTRANCHE_REPLICATION_ORIGIN, LWTRANCHE_AUDIT_INDEX_WAIT, LWTRANCHE_PCA_BUFFER_CONTENT, + LWTRANCHE_XLOG_TRACK_PARTITION, /* * Each trancheId above should have a corresponding item in BuiltinTrancheNames; */ diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index ee324efb8..b3762f7d3 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -638,8 +638,13 @@ extern void write_stderr(const char* fmt, ...) the supplied arguments. 
*/ __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); -extern void getElevelAndSqlstate(int* eLevel, int* sqlState); +extern void write_stderr_with_prefix(const char* fmt, ...) + /* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ + __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); +extern void getElevelAndSqlstate(int* eLevel, int* sqlState); +extern void get_time_now(char* nowTime, int timeLen); void freeSecurityFuncSpace(char* charList, ...); extern void SimpleLogToServer(int elevel, bool silent, const char* fmt, ...) diff --git a/src/test/regress/output/recovery_2pc_tools.source b/src/test/regress/output/recovery_2pc_tools.source index 43742d513..3d9140e01 100644 --- a/src/test/regress/output/recovery_2pc_tools.source +++ b/src/test/regress/output/recovery_2pc_tools.source @@ -617,6 +617,7 @@ select name,vartype,unit,min_val,max_val from pg_settings where name <> 'qunit_c ss_enable_catalog_centralized | bool | | | ss_enable_dms | bool | | | ss_enable_dss | bool | | | + ss_enable_ondemand_recovery | bool | | | ss_enable_reform | bool | | | ss_enable_scrlock | bool | | | ss_enable_scrlock_sleep_mode | bool | | | @@ -638,6 +639,7 @@ select name,vartype,unit,min_val,max_val from pg_settings where name <> 'qunit_c ss_log_max_file_size | integer | kB | 1024 | 4194304 ssl_renegotiation_limit | integer | kB | 0 | 2147483647 ss_ock_log_path | string | | | + ss_ondemand_recovery_mem_size | integer | kB | 1048576 | 104857600 ss_parallel_thread_count | integer | | 0 | 64 ss_rdma_work_config | string | | | ss_recv_msg_pool_size | integer | kB | 1024 | 1048576 diff --git a/src/test/regress/pg_regress.cpp b/src/test/regress/pg_regress.cpp index 21adfa423..58510063c 100644 --- a/src/test/regress/pg_regress.cpp +++ b/src/test/regress/pg_regress.cpp @@ -5412,7 +5412,7 @@ static void CheckCleanCodeWarningInfo(const int baseNum, const int currentNum, return; } -#define BASE_GLOBAL_VARIABLE_NUM 222 +#define 
BASE_GLOBAL_VARIABLE_NUM 224 #define CMAKE_CMD_BUF_LEN 1000 @@ -5461,7 +5461,7 @@ static void check_global_variables() } } -#define BASE_PGXC_LIKE_MACRO_NUM 1391 +#define BASE_PGXC_LIKE_MACRO_NUM 1392 static void check_pgxc_like_macros() { #ifdef BUILD_BY_CMAKE