diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf index 823394594..22426f8e6 100755 --- a/src/bin/gs_guc/cluster_guc.conf +++ b/src/bin/gs_guc/cluster_guc.conf @@ -754,6 +754,7 @@ ss_enable_catalog_centralized|bool|0,0|NULL|NULL| ss_enable_reform|bool|0,0|NULL|NULL| ss_enable_ssl|bool|0,0|NULL|NULL| ss_enable_aio|bool|0,0|NULL|NULL| +ss_enable_ondemand_realtime_build|bool|0,0|NULL|NULL| ss_enable_ondemand_recovery|bool|0,0|NULL|NULL| ss_interconnect_channel_count|int|1,32|NULL|NULL| ss_work_thread_count|int|16,128|NULL|NULL| diff --git a/src/common/backend/catalog/builtin_funcs.ini b/src/common/backend/catalog/builtin_funcs.ini index 7a52f7bee..6877b223b 100644 --- a/src/common/backend/catalog/builtin_funcs.ini +++ b/src/common/backend/catalog/builtin_funcs.ini @@ -2500,6 +2500,10 @@ "dsqrt", 1, AddBuiltinFunc(_0(230), _1("dsqrt"), _2(1), _3(true), _4(false), _5(dsqrt), _6(701), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('i'), _19(0), _20(1, 701), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("dsqrt"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("square root"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)) ), + AddFuncGroup( + "ondemand_recovery_status", 1, + AddBuiltinFunc(_0(6991), _1("ondemand_recovery_status"), _2(0), _3(false), _4(false), _5(get_ondemand_recovery_status), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(0), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('i'), _19(0), _20(0), _21(10, TEXTOID, TEXTOID, OIDOID, OIDOID, OIDOID, OIDOID, BOOLOID, TEXTOID, TEXTOID, TEXTOID), _22(10, 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o'), _23(10, "primary_checkpoint_redo_lsn", "realtime_build_replayed_lsn", "hashmap_used_blocks", "hashmap_total_blocks", "trxn_queue_blocks", 
"seg_queue_blocks", "in_ondemand_recovery", "ondemand_recovery_status", "realtime_build_status", "recovery_pause_status"), _24(NULL), _25("get_ondemand_recovery_status"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)) + ), AddFuncGroup( "dss_io_stat", 1, AddBuiltinFunc(_0(6990), _1("dss_io_stat"), _2(1), _3(true), _4(false), _5(dss_io_stat), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('i'), _19(0), _20(1, INT4OID), _21(4, INT4OID, INT8OID, INT8OID, INT4OID), _22(4, 'i', 'o', 'o', 'o'), _23(4, "duration", "read_kilobyte_per_sec", "write_kilobyte_per_sec", "io_times"), _24(NULL), _25("dss_io_stat"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33(NULL), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)) diff --git a/src/common/backend/utils/adt/pgstatfuncs.cpp b/src/common/backend/utils/adt/pgstatfuncs.cpp index b9ed6cec2..f0e84655a 100644 --- a/src/common/backend/utils/adt/pgstatfuncs.cpp +++ b/src/common/backend/utils/adt/pgstatfuncs.cpp @@ -85,6 +85,7 @@ #include "ddes/dms/ss_dms_recovery.h" #include "utils/json.h" #include "utils/jsonapi.h" +#include "access/ondemand_extreme_rto/page_redo.h" #define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32*)&(var)))) #define NUM_PG_LOCKTAG_ID 12 @@ -96,6 +97,7 @@ #define DISPLACEMENTS_VALUE 32 #define MAX_DURATION_TIME 60 #define DSS_IO_STAT_COLUMN_NUM 3 +#define ONDEMAND_RECOVERY_STAT_COLUMN_NUM 10 const uint32 INDEX_STATUS_VIEW_COL_NUM = 3; @@ -14754,6 +14756,113 @@ Datum track_memory_context_detail(PG_FUNCTION_ARGS) } } +Datum get_ondemand_recovery_status(PG_FUNCTION_ARGS) +{ + if (!ENABLE_ONDEMAND_RECOVERY) { + ereport(ERROR, (errmsg("This function only supports when enable ss_enable_ondemand_recovery."))); + } + Datum result; + 
TupleDesc tupdesc; + ondemand_recovery_stat stat; + errno_t errorno = EOK; + + ondemand_extreme_rto::GetOndemandRecoveryStatus(&stat); + // tuple header + int i = 1; + tupdesc = CreateTemplateTupleDesc(ONDEMAND_RECOVERY_STAT_COLUMN_NUM, false); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "primary_checkpoint_redo_lsn", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "realtime_build_replayed_lsn", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "hashmap_used_blocks", OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "hashmap_total_blocks", OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "trxn_queue_blocks", OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "seg_queue_blocks", OIDOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "in_ondemand_recovery", BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "ondemand_recovery_status", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "realtime_build_status", TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber)i++, "recovery_pause_status", TEXTOID, -1, 0); + + tupdesc = BlessTupleDesc(tupdesc); + + // tuple body + char redoLocation[MAXFNAMELEN]; + char replayedLocation[MAXFNAMELEN]; + + errorno = snprintf_s(redoLocation, sizeof(redoLocation), sizeof(redoLocation) - 1, "%X/%X", + (uint32)(stat.checkpointPtr >> 32), (uint32)stat.checkpointPtr); + securec_check_ss(errorno, "", ""); + errorno = snprintf_s(replayedLocation, sizeof(replayedLocation), sizeof(replayedLocation) - 1, "%X/%X", + (uint32)(stat.replayedPtr >> 32), (uint32)stat.replayedPtr); + securec_check_ss(errorno, "", ""); + + Datum values[ONDEMAND_RECOVERY_STAT_COLUMN_NUM]; + bool nulls[ONDEMAND_RECOVERY_STAT_COLUMN_NUM] = {false}; + i = 0; + values[i++] = CStringGetTextDatum(redoLocation); + values[i++] = CStringGetTextDatum(replayedLocation); + values[i++] = UInt32GetDatum(stat.hmpUsedBlkNum); + values[i++] = UInt32GetDatum(stat.hmpTotalBlkNum); 
+ values[i++] = UInt32GetDatum(stat.trxnQueueNum); + values[i++] = UInt32GetDatum(stat.segQueueNum); + values[i++] = BoolGetDatum(stat.inOndemandRecovery); + + switch (stat.ondemandRecoveryStatus) { + case CLUSTER_IN_ONDEMAND_BUILD: + values[i++] = CStringGetTextDatum("ONDEMAND_RECOVERY_BUILD"); + break; + case CLUSTER_IN_ONDEMAND_REDO: + values[i++] = CStringGetTextDatum("ONDEMAND_RECOVERY_REDO"); + break; + case CLUSTER_NORMAL: + values[i++] = CStringGetTextDatum("NORMAL"); + break; + default: + ereport(ERROR, (errmsg("Invalid ondemand recovery status."))); + break; + } + + switch (stat.realtimeBuildStatus) { + case DISABLED: + values[i++] = CStringGetTextDatum("DISABLED"); + break; + case BUILD_NORMAL: + values[i++] = CStringGetTextDatum("BUILD_NORMAL"); + break; + case BUILD_TO_DISABLED: + values[i++] = CStringGetTextDatum("BUILD_TO_DISABLED"); + break; + case BUILD_TO_REDO: + values[i++] = CStringGetTextDatum("BUILD_TO_REDO"); + break; + default: + ereport(ERROR, (errmsg("Invalid realtime build status."))); + break; + } + + switch (stat.recoveryPauseStatus) { + case NOT_PAUSE: + values[i] = CStringGetTextDatum("NOT PAUSE"); + break; + case PAUSE_FOR_SYNC_REDO: + values[i] = CStringGetTextDatum("PAUSE(for sync record)"); + break; + case PAUSE_FOR_PRUNE_HASHMAP: + values[i] = CStringGetTextDatum("PAUSE(for hashmap full)"); + break; + case PAUSE_FOR_PRUNE_TRXN_QUEUE: + values[i] = CStringGetTextDatum("PAUSE(for trxn queue full)"); + break; + case PAUSE_FOR_PRUNE_SEG_QUEUE: + values[i] = CStringGetTextDatum("PAUSE(for seg queue full)"); + break; + default: + ereport(ERROR, (errmsg("Invalid recovery pause status."))); + break; + } + + HeapTuple heap_tuple = heap_form_tuple(tupdesc, values, nulls); + result = HeapTupleGetDatum(heap_tuple); + PG_RETURN_DATUM(result); +} /* * @Description : Get the statistical information for DSS IO, including read bytes, write bytes and io times. 
diff --git a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp index 8a320bfdc..c7bc15977 100644 --- a/src/common/backend/utils/init/globals.cpp +++ b/src/common/backend/utils/init/globals.cpp @@ -75,7 +75,7 @@ bool will_shutdown = false; * NEXT | 92899 | ? | ? * ********************************************/ -const uint32 GRAND_VERSION_NUM = 92924; +const uint32 GRAND_VERSION_NUM = 92925; /******************************************** * 2.VERSION NUM FOR EACH FEATURE diff --git a/src/common/backend/utils/misc/guc/guc_storage.cpp b/src/common/backend/utils/misc/guc/guc_storage.cpp index 99f532ca0..08477451b 100755 --- a/src/common/backend/utils/misc/guc/guc_storage.cpp +++ b/src/common/backend/utils/misc/guc/guc_storage.cpp @@ -263,6 +263,7 @@ static bool check_logical_decode_options_default(char** newval, void** extra, Gu static void assign_logical_decode_options_default(const char* newval, void* extra); static bool check_uwal_devices_path(char** newval, void** extra, GucSource source); static bool check_uwal_log_path(char** newval, void** extra, GucSource source); +static void assign_recovery_parallelism(int newval, void* extra); static const struct config_enum_entry resource_track_log_options[] = { {"summary", SUMMARY, false}, @@ -1080,6 +1081,19 @@ static void InitStorageConfigureNamesBool() NULL, NULL}, + {{"ss_enable_ondemand_realtime_build", + PGC_POSTMASTER, + NODE_SINGLENODE, + SHARED_STORAGE_OPTIONS, + gettext_noop("Whether use on-demand real time build"), + NULL, + GUC_SUPERUSER_ONLY}, + &g_instance.attr.attr_storage.dms_attr.enable_ondemand_realtime_build, + false, + NULL, + NULL, + NULL}, + {{"ss_enable_ondemand_recovery", PGC_POSTMASTER, NODE_SINGLENODE, @@ -3275,7 +3289,7 @@ static void InitStorageConfigureNamesInt() 1, INT_MAX, NULL, - NULL, + assign_recovery_parallelism, NULL}, {{"parallel_recovery_batch", @@ -6637,6 +6651,14 @@ static bool check_ss_txnstatus_cache_size(int* newval, void** extra, GucSource s return 
true; } +static void assign_recovery_parallelism(int newval, void* extra) +{ + if (IsUnderPostmaster && !(t_thrd.role == STARTUP && t_thrd.is_inited)) { + return; + } + g_instance.attr.attr_storage.real_recovery_parallelism = newval; +} + #ifndef ENABLE_MULTIPLE_NODES static void assign_dcf_election_timeout(int newval, void* extra) diff --git a/src/common/backend/utils/misc/postgresql_single.conf.sample b/src/common/backend/utils/misc/postgresql_single.conf.sample index 92df4843d..6d30563ac 100644 --- a/src/common/backend/utils/misc/postgresql_single.conf.sample +++ b/src/common/backend/utils/misc/postgresql_single.conf.sample @@ -852,6 +852,7 @@ job_queue_processes = 10 # Number of concurrent jobs, optional: [0..1000] #ss_parallel_thread_count = 16 #ss_enable_ondemand_recovery = off #ss_ondemand_recovery_mem_size = 4GB # min: 1GB, max: 100GB +#ss_enable_ondemand_realtime_build = off #ss_enable_dorado = off #ss_stream_cluster = off #enable_segment = off diff --git a/src/gausskernel/ddes/adapter/ss_dms.cpp b/src/gausskernel/ddes/adapter/ss_dms.cpp index 21a46187b..6fe490817 100644 --- a/src/gausskernel/ddes/adapter/ss_dms.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms.cpp @@ -134,6 +134,7 @@ int ss_dms_func_init() SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_info)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_get_buf_res)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_get_cmd_stat)); + SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_req_opengauss_immediate_ckpt)); g_ss_dms_func.inited = true; return DMS_SUCCESS; @@ -392,4 +393,9 @@ void dms_get_buf_res(unsigned long long *row_id, dv_drc_buf_info *drc_info, int void dms_get_cmd_stat(int index, wait_cmd_stat_result_t *cmd_stat_result) { g_ss_dms_func.dms_get_cmd_stat(index, cmd_stat_result); +} + +int dms_req_opengauss_immediate_checkpoint(dms_context_t *dms_ctx, unsigned long long *redo_lsn) +{ + return g_ss_dms_func.dms_req_opengauss_immediate_ckpt(dms_ctx, redo_lsn); } \ No newline at end of file diff --git 
a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp index 499a4fede..7178728c7 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp @@ -33,6 +33,7 @@ #include "securec_check.h" #include "miscadmin.h" #include "access/double_write.h" +#include "access/ondemand_extreme_rto/dispatcher.h" #include "access/multi_redo_api.h" void InitDmsBufCtrl(void) @@ -216,6 +217,11 @@ RETRY: void SmgrNetPageCheckDiskLSN(BufferDesc *buf_desc, ReadBufferMode read_mode, const XLogPhyBlock *pblk) { + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); + if (SS_ONDEMAND_REALTIME_BUILD_FAILOVER && (buf_ctrl->state & BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED)) { + return; + } + /* * prerequisite is that the page that initialized to zero in memory should be flush to disk */ @@ -262,6 +268,17 @@ void SmgrNetPageCheckDiskLSN(BufferDesc *buf_desc, ReadBufferMode read_mode, con } #endif +static Buffer ReadBufferInRealtimeBuildFailoverForDMS(BufferDesc* buf_desc, ReadBufferMode read_mode, const XLogPhyBlock *pblk) +{ + Page page = (Page)BufHdrGetBlock(buf_desc); + XLogRecPtr ckptRedoPtr = pg_atomic_read_u64(&ondemand_extreme_rto::g_dispatcher->ckptRedoPtr); + if (XLByteLT(ckptRedoPtr, PageGetLSN(page))) { + return BufferDescriptorGetBuffer(buf_desc); + } else { + return ReadBuffer_common_for_dms(read_mode, buf_desc, pblk); + } +} + Buffer TerminateReadPage(BufferDesc* buf_desc, ReadBufferMode read_mode, const XLogPhyBlock *pblk) { dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); @@ -270,7 +287,21 @@ Buffer TerminateReadPage(BufferDesc* buf_desc, ReadBufferMode read_mode, const X if (g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy && AmDmsReformProcProcess()) { ereport(PANIC, (errmsg("SS In flush copy, can't read from disk!"))); } - buffer = ReadBuffer_common_for_dms(read_mode, buf_desc, pblk); + /* + * do not allow pageredo workers read buffer from disk if standby node in 
ondemand + * realtime build status, because some buffer need init directly in recovery mode + */ + if (unlikely(AmPageRedoWorker() && (read_mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) && SS_STANDBY_FAILOVER)) { + buf_ctrl->state &= ~BUF_READ_MODE_ONDEMAND_REALTIME_BUILD; + buffer = InvalidBuffer; + } else { + if (SS_ONDEMAND_REALTIME_BUILD_FAILOVER && (read_mode == RBM_NORMAL) && + (buf_ctrl->state & BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED)) { + buffer = ReadBufferInRealtimeBuildFailoverForDMS(buf_desc, read_mode, pblk); + } else { + buffer = ReadBuffer_common_for_dms(read_mode, buf_desc, pblk); + } + } } else { #ifdef USE_ASSERT_CHECKING if (buf_ctrl->state & BUF_IS_EXTEND) { @@ -374,6 +405,11 @@ static bool DmsStartBufferIO(BufferDesc *buf_desc, LWLockMode mode) #ifdef USE_ASSERT_CHECKING void SegNetPageCheckDiskLSN(BufferDesc *buf_desc, ReadBufferMode read_mode, SegSpace *spc) { + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); + if (SS_ONDEMAND_REALTIME_BUILD_FAILOVER && (buf_ctrl->state & BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED)) { + return; + } + /* * prequisite is that the page that initialized to zero in memory should be flushed to disk, * references to seg_extend @@ -397,12 +433,39 @@ void SegNetPageCheckDiskLSN(BufferDesc *buf_desc, ReadBufferMode read_mode, SegS } #endif +static Buffer ReadSegBufferInRealtimeBuildFailoverForDMS(BufferDesc* buf_desc, ReadBufferMode read_mode, SegSpace *spc) +{ + Page page = (Page)BufHdrGetBlock(buf_desc); + XLogRecPtr ckptRedoPtr = pg_atomic_read_u64(&ondemand_extreme_rto::g_dispatcher->ckptRedoPtr); + if (XLByteLT(ckptRedoPtr, PageGetLSN(page))) { + SegTerminateBufferIO(buf_desc, false, BM_VALID); + return BufferDescriptorGetBuffer(buf_desc); + } else { + return ReadSegBufferForDMS(buf_desc, read_mode, spc); + } +} + Buffer TerminateReadSegPage(BufferDesc *buf_desc, ReadBufferMode read_mode, SegSpace *spc) { dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); Buffer buffer; if (buf_ctrl->state & 
BUF_NEED_LOAD) { - buffer = ReadSegBufferForDMS(buf_desc, read_mode, spc); + /* + * do not allow pageredo workers read buffer from disk if standby node in ondemand + * realtime build status, because some buffer need init directly in recovery mode + */ + if (unlikely(AmPageRedoWorker() && (read_mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) && SS_STANDBY_FAILOVER)) { + buf_ctrl->state &= ~BUF_READ_MODE_ONDEMAND_REALTIME_BUILD; + SegTerminateBufferIO(buf_desc, false, BM_VALID); + buffer = InvalidBuffer; + } else { + if (SS_ONDEMAND_REALTIME_BUILD_FAILOVER && (read_mode == RBM_NORMAL) && + (buf_ctrl->state & BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED)) { + buffer = ReadSegBufferInRealtimeBuildFailoverForDMS(buf_desc, read_mode, spc); + } else { + buffer = ReadSegBufferForDMS(buf_desc, read_mode, spc); + } + } } else { Page page = (Page)BufHdrGetBlock(buf_desc); PageSetChecksumInplace(page, buf_desc->tag.blockNum); @@ -436,6 +499,12 @@ Buffer DmsReadSegPage(Buffer buffer, LWLockMode mode, ReadBufferMode read_mode, return buffer; } + if (unlikely(AmPageRedoWorker() && (read_mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) && SS_STANDBY_FAILOVER)) { + buf_ctrl->state &= ~BUF_READ_MODE_ONDEMAND_REALTIME_BUILD; + *with_io = false; + return InvalidBuffer; + } + if (!DmsCheckBufAccessible()) { *with_io = false; return 0; @@ -452,6 +521,8 @@ Buffer DmsReadSegPage(Buffer buffer, LWLockMode mode, ReadBufferMode read_mode, if (!StartReadPage(buf_desc, mode)) { return 0; } + + *with_io = false; return TerminateReadSegPage(buf_desc, read_mode); } @@ -464,6 +535,12 @@ Buffer DmsReadPage(Buffer buffer, LWLockMode mode, ReadBufferMode read_mode, boo return buffer; } + if (unlikely(AmPageRedoWorker() && (read_mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) && SS_STANDBY_FAILOVER)) { + buf_ctrl->state &= ~BUF_READ_MODE_ONDEMAND_REALTIME_BUILD; + *with_io = false; + return InvalidBuffer; + } + XLogPhyBlock pblk = {0, 0, 0}; if (OidIsValid(buf_ctrl->pblk_relno)) { Assert(ExtentTypeIsValid(buf_ctrl->pblk_relno)); 
@@ -494,6 +571,8 @@ Buffer DmsReadPage(Buffer buffer, LWLockMode mode, ReadBufferMode read_mode, boo if (!StartReadPage(buf_desc, mode)) { return 0; } + + *with_io = false; return TerminateReadPage(buf_desc, read_mode, OidIsValid(buf_ctrl->pblk_relno) ? &pblk : NULL); } @@ -678,6 +757,19 @@ void SSCheckBufferIfNeedMarkDirty(Buffer buf) } } +static void SSOndemandCheckBufferState() +{ + for (int buffer = 0; buffer < TOTAL_BUFFER_NUM; buffer++) { + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buffer); + Assert(!(buf_ctrl->state & BUF_READ_MODE_ONDEMAND_REALTIME_BUILD)); + + // realtime build pinned buffer are already mark dirty in CBFlushCopy, do not need label + if (buf_ctrl->state & BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED) { + buf_ctrl->state &= ~BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED; + } + } +} + void SSRecheckBufferPool() { uint64 buf_state; @@ -708,6 +800,7 @@ void SSRecheckBufferPool() buf_desc->tag.rnode.bucketNode, buf_desc->tag.forkNum, buf_desc->tag.blockNum, (unsigned long long)pagelsn))); } } + SSOndemandCheckBufferState(); } bool CheckPageNeedSkipInRecovery(Buffer buf) @@ -1006,4 +1099,76 @@ bool SSWaitIOTimeout(BufferDesc *buf) tag->forkNum, tag->blockNum, buf->buf_id, buf->io_in_progress_lock)))); } return ret; -} \ No newline at end of file +} + +bool SSOndemandRealtimeBuildAllowFlush(BufferDesc *buf_desc) +{ + if (!ENABLE_DMS || IsInitdb) { + return true; + } + + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); + if (buf_ctrl->state & BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED) { + if (!SS_ONDEMAND_REALTIME_BUILD_DISABLED && IsExtremeRtoRunning()) { + XLogRecPtr ckptRedoPtr = pg_atomic_read_u64(&ondemand_extreme_rto::g_dispatcher->ckptRedoPtr); + XLogRecPtr bufferLsn = PageGetLSN(BufHdrGetBlock(buf_desc)); + if (XLByteLT(ckptRedoPtr, bufferLsn)) { + return false; + } + } + buf_ctrl->state &= ~BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED; + } + return true; +} + +Buffer SSReadBuffer(BufferTag *tag, ReadBufferMode mode) +{ + Buffer buffer; + if
(IsSegmentPhysicalRelNode(tag->rnode)) { + SegSpace *spc = spc_open(tag->rnode.spcNode, tag->rnode.dbNode, false, false); + buffer = ReadBufferFast(spc, tag->rnode, tag->forkNum, tag->blockNum, mode); + } else { + buffer = ReadBufferWithoutRelcache(tag->rnode, tag->forkNum, tag->blockNum, mode, NULL, NULL); + } + return buffer; +} + +void DmsReleaseBuffer(int buffer, bool is_seg) +{ + if (is_seg) { + SegReleaseBuffer(buffer); + } else { + ReleaseBuffer(buffer); + } +} + +bool SSRequestPageInOndemandRealtimeBuild(BufferTag *bufferTag, XLogRecPtr recordLsn, XLogRecPtr *pageLsn) +{ + Buffer buffer = SSReadBuffer(bufferTag, RBM_FOR_ONDEMAND_REALTIME_BUILD); + if (BufferIsInvalid(buffer)) { + WaitUntilRealtimeBuildStatusToFailoverAndUpdatePrunePtr(); + return false; + } + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buffer - 1); + if (buf_ctrl->state & BUF_READ_MODE_ONDEMAND_REALTIME_BUILD) { + buf_ctrl->state &= ~BUF_READ_MODE_ONDEMAND_REALTIME_BUILD; + buf_ctrl->state |= BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED; + + if (pageLsn != NULL) { + *pageLsn = PageGetLSN(BufferGetPage(buffer)); + } + } else { + // drop the share lock taken above before unpinning, same as the success path below + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + DmsReleaseBuffer(buffer, IsSegmentPhysicalRelNode(bufferTag->rnode)); + WaitUntilRealtimeBuildStatusToFailoverAndUpdatePrunePtr(); + return false; + } + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + DmsReleaseBuffer(buffer, IsSegmentPhysicalRelNode(bufferTag->rnode)); + return true; +} diff --git a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp index b6af66f96..ffaadaf3c 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp @@ -34,6 +34,7 @@ #include "access/transam.h" #include "access/csnlog.h" #include "access/xlog.h" +#include "access/multi_redo_api.h" #include "ddes/dms/ss_dms_bufmgr.h" #include "storage/buf/buf_internals.h" #include "ddes/dms/ss_transaction.h" @@ -551,15 +552,6 @@ static unsigned long long
CBGetGlobalLSN(void *db_handle) return GetInsertRecPtr(); } -static void DmsReleaseBuffer(int buffer, bool is_seg) -{ - if (is_seg) { - SegReleaseBuffer(buffer); - } else { - ReleaseBuffer(buffer); - } -} - static int tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_t **buf_ctrl) { bool is_seg; @@ -1639,17 +1631,11 @@ static int CBFlushCopy(void *db_handle, char *pageid) BufferTag* tag = (BufferTag*)pageid; Buffer buffer; - SegSpace *spc = NULL; uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount; PG_TRY(); { - if (IsSegmentPhysicalRelNode(tag->rnode)) { - spc = spc_open(tag->rnode.spcNode, tag->rnode.dbNode, false, false); - buffer = ReadBufferFast(spc, tag->rnode, tag->forkNum, tag->blockNum, RBM_NORMAL); - } else { - buffer = ReadBufferWithoutRelcache(tag->rnode, tag->forkNum, tag->blockNum, RBM_NORMAL, NULL, NULL); - } + buffer = SSReadBuffer(tag, RBM_NORMAL); } PG_CATCH(); { @@ -2122,18 +2108,12 @@ int CBOndemandRedoPageForStandby(void *block_key, int32 *redo_status) } Buffer buffer; - SegSpace *spc = NULL; uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount; *redo_status = ONDEMAND_REDO_DONE; smgrcloseall(); PG_TRY(); { - if (IsSegmentPhysicalRelNode(tag->rnode)) { - spc = spc_open(tag->rnode.spcNode, tag->rnode.dbNode, false, false); - buffer = ReadBufferFast(spc, tag->rnode, tag->forkNum, tag->blockNum, RBM_NORMAL); - } else { - buffer = ReadBufferWithoutRelcache(tag->rnode, tag->forkNum, tag->blockNum, RBM_NORMAL, NULL, NULL); - } + buffer = SSReadBuffer(tag, RBM_NORMAL); ReleaseBuffer(buffer); } PG_CATCH(); @@ -2177,6 +2157,15 @@ void DmsThreadDeinit() proc_exit(0); } +int CBDoCheckpointImmediately(unsigned long long *ckpt_lsn) +{ + Assert(SS_PRIMARY_MODE); + + RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT); + *ckpt_lsn = (unsigned long long)t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo; + return GS_SUCCESS; +} + void DmsInitCallback(dms_callback_t *callback) { // used in reform @@
-2246,4 +2235,5 @@ void DmsInitCallback(dms_callback_t *callback) callback->get_buf_info = CBGetBufInfo; callback->buf_ctrl_recycle = CBBufCtrlRecycle; callback->dms_thread_deinit = DmsThreadDeinit; + callback->opengauss_do_ckpt_immediate = CBDoCheckpointImmediately; } diff --git a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp index 42fb31b29..60d4738b5 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp @@ -37,7 +37,9 @@ #include "ddes/dms/ss_dms_bufmgr.h" #include "ddes/dms/ss_dms_recovery.h" #include "ddes/dms/ss_reform_common.h" +#include "ddes/dms/ss_transaction.h" #include "access/double_write.h" +#include "access/twophase.h" #include #include #include @@ -153,7 +155,7 @@ bool SSRecoveryApplyDelay() return true; } - while (g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag) { + while (g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag || SS_ONDEMAND_RECOVERY_PAUSE) { /* might change the trigger file's location */ RedoInterruptCallBack(); @@ -231,7 +233,9 @@ void SSInitReformerControlPages(void) void SShandle_promote_signal() { if (pmState == PM_WAIT_BACKENDS) { - g_instance.pid_cxt.StartupPID = initialize_util_thread(STARTUP); + if (SS_ONDEMAND_REALTIME_BUILD_DISABLED) { + g_instance.pid_cxt.StartupPID = initialize_util_thread(STARTUP); + } Assert(g_instance.pid_cxt.StartupPID != 0); pmState = PM_STARTUP; } @@ -300,3 +304,65 @@ void ss_switchover_promoting_dw_init() g_instance.dms_cxt.dw_init = true; ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS switchover] dw init finished"))); } + +XLogRecPtr SSOndemandRequestPrimaryCkptAndGetRedoLsn() +{ + XLogRecPtr primaryRedoLsn = InvalidXLogRecPtr; + + ereport(DEBUG1, (errmodule(MOD_DMS), + errmsg("[On-demand] start request primary node %d do checkpoint", SS_PRIMARY_ID))); + if (SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + dms_context_t dms_ctx; + InitDmsContext(&dms_ctx); + dms_ctx.xmap_ctx.dest_id = 
(unsigned int)SS_PRIMARY_ID; + if (dms_req_opengauss_immediate_checkpoint(&dms_ctx, (unsigned long long *)&primaryRedoLsn) == GS_SUCCESS) { + ereport(DEBUG1, (errmodule(MOD_DMS), + errmsg("[On-demand] request primary node %d checkpoint success, redoLoc %X/%X", SS_PRIMARY_ID, + (uint32)(primaryRedoLsn >> 32), (uint32)primaryRedoLsn))); + return primaryRedoLsn; + } + ereport(DEBUG1, (errmodule(MOD_DMS), + errmsg("[On-demand] request primary node %d checkpoint failed", SS_PRIMARY_ID))); + } + + // DMS request skipped or failed, so read the redo location from the control file on DSS + SSReadControlFile(SS_PRIMARY_ID, true); + primaryRedoLsn = g_instance.dms_cxt.ckptRedo; + ereport(DEBUG1, (errmodule(MOD_DMS), + errmsg("[On-demand] read primary node %d checkpoint loc in control file, redoLoc %X/%X", SS_PRIMARY_ID, + (uint32)(primaryRedoLsn >> 32), (uint32)primaryRedoLsn))); + return primaryRedoLsn; +} + +void StartupOndemandRecovery() +{ + g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = true; + g_instance.dms_cxt.SSRecoveryInfo.cluster_ondemand_status = CLUSTER_IN_ONDEMAND_BUILD; + /* for other nodes in cluster and ondemand recovery failed */ + LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_IN_ONDEMAND_BUILD; + g_instance.dms_cxt.SSReformerControl.recoveryInstId = g_instance.dms_cxt.SSRecoveryInfo.recovery_inst_id; + SSUpdateReformerCtrl(); + LWLockRelease(ControlFileLock); + SSRequestAllStandbyReloadReformCtrlPage(); + SetOndemandExtremeRtoMode(); +} + +void OndemandRealtimeBuildHandleFailover() +{ + Assert(SS_ONDEMAND_REALTIME_BUILD_NORMAL); + + SSReadControlFile(SSGetPrimaryInstId()); + ss_failover_dw_init(); + StartupOndemandRecovery(); + StartupReplicationSlots(); + restoreTwoPhaseData(); + OnDemandUpdateRealtimeBuildPrunePtr(); + SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED); + while (g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag) { + pg_usleep(REFORM_WAIT_TIME); + } +
g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = BUILD_TO_REDO; + ereport(LOG, (errmodule(MOD_DMS), errmsg("[On-demand] Node:%d receive failover signal, " + "close realtime build and start ondemand build", SS_MY_INST_ID))); +} diff --git a/src/gausskernel/ddes/ddes_commit_id b/src/gausskernel/ddes/ddes_commit_id index 5af9f31aa..556dee8b1 100644 --- a/src/gausskernel/ddes/ddes_commit_id +++ b/src/gausskernel/ddes/ddes_commit_id @@ -1,3 +1,3 @@ -dms_commit_id=036861d3b2e7484244bda56563f8fe3e49579ea6 +dms_commit_id=763fb89fcc19dd53068cd0abc2bf0ce29ab4f0dd dss_commit_id=41ddc77da33f1ff6e513bff77aaf31ff7bdcf0c6 cbb_commit_id=f0b4f881b1c957b9bfe90176c6bbe5336304ca79 \ No newline at end of file diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index 4f05c3382..b9469ea39 100644 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -3442,6 +3442,15 @@ static void CheckExtremeRtoGUCConflicts(void) errhint("Either turn off ss_enable_ondemand_recovery, or set extreme rto param."))); } } + + if (g_instance.attr.attr_storage.dms_attr.enable_ondemand_realtime_build) { + if (!g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery) { + ereport(ERROR, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("ondemand realtime build only support in ondemand recovery mode."), + errhint("Either turn on ss_enable_ondemand_recovery, or turn off ss_enable_ondemand_realtime_build."))); + } + } } static void CheckRecoveryParaConflict() { @@ -3902,6 +3911,16 @@ static int ServerLoop(void) } } + if (startup_reform_finish && ENABLE_ONDEMAND_REALTIME_BUILD && SS_ONDEMAND_REALTIME_BUILD_DISABLED && + SS_NORMAL_STANDBY && SS_CLUSTER_ONDEMAND_NORMAL) { + if (g_instance.pid_cxt.StartupPID == 0) { + g_instance.pid_cxt.StartupPID = initialize_util_thread(STARTUP); + Assert(g_instance.pid_cxt.StartupPID != 0); + 
g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = READY_TO_BUILD; + ereport(LOG, (errmsg("[On-demand] Node:%d ondemand realtime build start", SS_MY_INST_ID))); + } + } + this_start_poll_time = mc_timers_us(); if ((this_start_poll_time - last_start_loop_time) != 0) { gs_set_libcomm_used_rate( diff --git a/src/gausskernel/process/threadpool/knl_instance.cpp b/src/gausskernel/process/threadpool/knl_instance.cpp index 408973869..05d905ce5 100755 --- a/src/gausskernel/process/threadpool/knl_instance.cpp +++ b/src/gausskernel/process/threadpool/knl_instance.cpp @@ -202,9 +202,11 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt) dms_cxt->SSRecoveryInfo.in_flushcopy = false; dms_cxt->SSRecoveryInfo.no_backend_left = false; dms_cxt->SSRecoveryInfo.in_ondemand_recovery = false; + dms_cxt->SSRecoveryInfo.ondemand_realtime_build_status = DISABLED; dms_cxt->SSRecoveryInfo.startup_need_exit_normally = false; dms_cxt->SSRecoveryInfo.recovery_trapped_in_page_request = false; dms_cxt->SSRecoveryInfo.dorado_sharestorage_inited = false; + dms_cxt->SSRecoveryInfo.ondemand_recovery_pause_status = NOT_PAUSE; dms_cxt->log_timezone = NULL; pg_atomic_init_u32(&dms_cxt->inDmsThreShmemInitCnt, 0); pg_atomic_init_u32(&dms_cxt->inProcExitCnt, 0); @@ -348,7 +350,7 @@ static void knl_g_parallel_redo_init(knl_g_parallel_redo_context* predo_cxt) predo_cxt->exrto_recyle_xmin = 0; predo_cxt->exrto_snapshot = (ExrtoSnapshot)MemoryContextAllocZero( INSTANCE_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), sizeof(ExrtoSnapshotData)); - predo_cxt->redoItemHash = NULL; + predo_cxt->redoItemHashCtrl = NULL; predo_cxt->standby_read_delay_ddl_stat.delete_stat = 0; predo_cxt->standby_read_delay_ddl_stat.next_index_can_insert = 0; diff --git a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp index 73ca123d2..5f4b047d1 100644 --- a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp +++ 
b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp @@ -1141,7 +1141,7 @@ void XLogRedoBufferSetState(RedoBufferManager *buffermanager, RedoMemSlot *buffe void XLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOperate *refOperate, InterruptFunc interruptOperte) { - if (SS_IN_ONDEMAND_RECOVERY) { + if (IsExtremeRedo() && IsOndemandExtremeRtoMode()) { return OndemandXLogParseBufferInit(parsemanager, buffernum, refOperate, interruptOperte); } @@ -1159,7 +1159,7 @@ void XLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOpera void XLogParseBufferDestory(RedoParseManager *parsemanager) { - if (SS_IN_ONDEMAND_RECOVERY) { + if (IsExtremeRedo() && IsOndemandExtremeRtoMode()) { OndemandXLogParseBufferDestory(parsemanager); return; } @@ -1175,7 +1175,7 @@ void XLogParseBufferDestory(RedoParseManager *parsemanager) XLogRecParseState *XLogParseBufferAllocList(RedoParseManager *parsemanager, XLogRecParseState *blkstatehead, void *record) { - if (SS_IN_ONDEMAND_RECOVERY) { + if (IsExtremeRedo() && IsOndemandExtremeRtoMode()) { return OndemandXLogParseBufferAllocList(parsemanager, blkstatehead, record); } @@ -1230,7 +1230,7 @@ XLogRecParseState *XLogParseBufferCopy(XLogRecParseState *srcState) void XLogParseBufferRelease(XLogRecParseState *recordstate) { - if (SS_IN_ONDEMAND_RECOVERY) { + if (IsExtremeRedo() && IsOndemandExtremeRtoMode()) { OndemandXLogParseBufferRelease(recordstate); return; } diff --git a/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp b/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp index 84f6d328e..057873cd6 100644 --- a/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp @@ -134,7 +134,7 @@ static const f_extreme_rto_redo extreme_rto_redosw[] = { ondemand_extreme_rto::FreeAllocatedRedoItem, ondemand_extreme_rto::GetAllWorkerCount, ondemand_extreme_rto::GetXLogInvalidPagesFromWorkers, - 
ondemand_extreme_rto::SendRecoveryEndMarkToWorkersAndWaitForFinish, + NULL, ondemand_extreme_rto::redo_get_io_event, ondemand_extreme_rto::redo_get_worker_statistic, ondemand_extreme_rto::redo_get_worker_time_count, diff --git a/src/gausskernel/storage/access/transam/multi_redo_settings.cpp b/src/gausskernel/storage/access/transam/multi_redo_settings.cpp index f92a3fe20..a08050372 100644 --- a/src/gausskernel/storage/access/transam/multi_redo_settings.cpp +++ b/src/gausskernel/storage/access/transam/multi_redo_settings.cpp @@ -49,6 +49,10 @@ void ConfigRecoveryParallelism() g_instance.attr.attr_storage.recovery_redo_workers_per_paser_worker * g_instance.attr.attr_storage.batch_redo_num + TRXN_REDO_MANAGER_NUM + TRXN_REDO_WORKER_NUM + XLOG_READER_NUM; + if (IsOndemandExtremeRtoMode()) { + total_recovery_parallelism = total_recovery_parallelism + ONDEMAND_AUXILIARY_WORKER_NUM + + g_instance.attr.attr_storage.batch_redo_num; // hashmap prune worker of each redo pipeline + } sprintf_s(buf, sizeof(buf), "%u", total_recovery_parallelism); ereport(LOG, (errmsg("ConfigRecoveryParallelism, parse workers:%d, " diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp index 9062d110b..99e02ae6b 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp @@ -101,11 +101,11 @@ int RedoItemTagMatch(const void *left, const void *right, Size keysize) return 1; } -HTAB **PRRedoItemHashInitialize(MemoryContext context) +ondemand_htab_ctrl_t *PRRedoItemHashInitialize(MemoryContext context) { HASHCTL ctl; - int batchNum = get_batch_redo_num(); - HTAB **hTab = (HTAB **)MemoryContextAllocZero(context, batchNum * sizeof(HTAB *)); + ondemand_htab_ctrl_t *htab_ctrl = + (ondemand_htab_ctrl_t *)MemoryContextAllocZero(context, sizeof(ondemand_htab_ctrl_t)); /* * create hashtable 
that indexes the redo items @@ -117,30 +117,107 @@ HTAB **PRRedoItemHashInitialize(MemoryContext context) ctl.entrysize = sizeof(RedoItemHashEntry); ctl.hash = RedoItemTagHash; ctl.match = RedoItemTagMatch; - for (int i = 0; i < batchNum; i++) { - hTab[i] = hash_create("Redo item hash by relfilenode and blocknum", INITredoItemHashSIZE, &ctl, - HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_SHRCTX | HASH_COMPARE); - } + htab_ctrl->hTab = hash_create("Redo item hash by relfilenode and blocknum", INITredoItemHashSIZE, &ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_COMPARE); + htab_ctrl->nextHTabCtrl = NULL; + htab_ctrl->maxRedoItemPtr = InvalidXLogRecPtr; - return hTab; + return htab_ctrl; } -void PRRegisterBlockInsertToList(RedoItemHashEntry *redoItemHashEntry, XLogRecParseState *record) +ondemand_htab_ctrl_t **PRInitRedoItemHashForAllPipeline(MemoryContext context) +{ + int batchNum = get_batch_redo_num(); + ondemand_htab_ctrl_t **htab_ctrl = + (ondemand_htab_ctrl_t **)MemoryContextAllocZero(context, batchNum * sizeof(ondemand_htab_ctrl_t *)); + + for (int i = 0; i < batchNum; i++) { + htab_ctrl[i] = PRRedoItemHashInitialize(context); + } + + return htab_ctrl; +} + +void PRRegisterBlockInsertToListHead(RedoItemHashEntry *redoItemHashEntry, XLogRecParseState *record) +{ + ReferenceRecParseState(record); + if (redoItemHashEntry->head != NULL) { + Assert(XLByteLE(record->blockparse.blockhead.end_ptr, redoItemHashEntry->head->blockparse.blockhead.end_ptr)); + record->nextrecord = redoItemHashEntry->head; + redoItemHashEntry->head = record; + } else { + redoItemHashEntry->tail = record; + redoItemHashEntry->head = record; + } + redoItemHashEntry->redoItemNum++; +} + +void PRRegisterBlockInsertToListTail(RedoItemHashEntry *redoItemHashEntry, XLogRecParseState *record) { ReferenceRecParseState(record); if (redoItemHashEntry->tail != NULL) { redoItemHashEntry->tail->nextrecord = record; redoItemHashEntry->tail = record; } else { - redoItemHashEntry->tail = record; 
redoItemHashEntry->head = record; + redoItemHashEntry->tail = record; } record->nextrecord = NULL; redoItemHashEntry->redoItemNum++; } -void PRRegisterBlockChangeExtended(XLogRecParseState *recordBlockState, const RelFileNode rNode, ForkNumber forkNum, - BlockNumber blkNo, HTAB *redoItemHash) +#ifdef USE_ASSERT_CHECKING +static void OndemandCheckPRRegister(XLogRecParseState *headRecord, XLogRecParseState *tailRecord, int count) +{ + Assert(headRecord != NULL); + Assert(tailRecord != NULL); + + int checkCount = 1; + XLogRecParseState *nextRecord = headRecord; + while (nextRecord != tailRecord) { + XLogRecParseState *checkRecord = nextRecord; + nextRecord = (XLogRecParseState *)checkRecord->nextrecord; + Assert(XLByteLE(checkRecord->blockparse.blockhead.end_ptr, nextRecord->blockparse.blockhead.end_ptr)); + checkCount++; + } + Assert(nextRecord == tailRecord); + Assert(checkCount == count); +} +#endif + +void PRRegisterBatchBlockInsertToListHead(RedoItemHashEntry *redoItemHashEntry, XLogRecParseState *headRecord, + XLogRecParseState *tailRecord, int count) +{ + if (redoItemHashEntry->head != NULL) { + Assert(XLByteLE(tailRecord->blockparse.blockhead.end_ptr, + redoItemHashEntry->head->blockparse.blockhead.end_ptr)); + tailRecord->nextrecord = redoItemHashEntry->head; + redoItemHashEntry->head = headRecord; + } else { + redoItemHashEntry->head = headRecord; + redoItemHashEntry->tail = tailRecord; + } + redoItemHashEntry->redoItemNum += count; +} + +void PRRegisterBatchBlockInsertToListTail(RedoItemHashEntry *redoItemHashEntry, XLogRecParseState *headRecord, + XLogRecParseState *tailRecord, int count) +{ + if (redoItemHashEntry->tail != NULL) { + Assert(XLByteLE(redoItemHashEntry->tail->blockparse.blockhead.end_ptr, + headRecord->blockparse.blockhead.end_ptr)); + redoItemHashEntry->tail->nextrecord = headRecord; + redoItemHashEntry->tail = tailRecord; + } else { + redoItemHashEntry->head = headRecord; + redoItemHashEntry->tail = tailRecord; + } + tailRecord->nextrecord 
= NULL; + redoItemHashEntry->redoItemNum += count; +} + +static RedoItemHashEntry *PRRegisterGetHashEntry(const RelFileNode rNode, ForkNumber forkNum, BlockNumber blkNo, + HTAB *redoItemHash) { RedoItemTag redoItemTag; RedoItemHashEntry *redoItemHashEntry = NULL; @@ -162,7 +239,30 @@ void PRRegisterBlockChangeExtended(XLogRecParseState *recordBlockState, const Re if (!found) { PRInitRedoItemEntry(redoItemHashEntry); } - PRRegisterBlockInsertToList(redoItemHashEntry, recordBlockState); + return redoItemHashEntry; +} + +void PRRegisterBlockChangeExtended(XLogRecParseState *recordBlockState, const RelFileNode rNode, ForkNumber forkNum, + BlockNumber blkNo, HTAB *redoItemHash, bool isHead) +{ + RedoItemHashEntry *redoItemHashEntry = PRRegisterGetHashEntry(rNode, forkNum, blkNo, redoItemHash); + if (unlikely(isHead)) { + PRRegisterBlockInsertToListHead(redoItemHashEntry, recordBlockState); + } else { + PRRegisterBlockInsertToListTail(redoItemHashEntry, recordBlockState); + } +} + +void PRRegisterBatchBlockChangeExtended(XLogRecParseState *headBlockState, XLogRecParseState *tailBlockState, int count, + const RelFileNode rNode, ForkNumber forkNum, BlockNumber blkNo, + HTAB *redoItemHash, bool isHead) +{ + RedoItemHashEntry *redoItemHashEntry = PRRegisterGetHashEntry(rNode, forkNum, blkNo, redoItemHash); + if (unlikely(isHead)) { + PRRegisterBatchBlockInsertToListHead(redoItemHashEntry, headBlockState, tailBlockState, count); + } else { + PRRegisterBatchBlockInsertToListTail(redoItemHashEntry, headBlockState, tailBlockState, count); + } } void PRTrackRemoveEntry(HTAB *hashMap, RedoItemHashEntry *entry) @@ -252,6 +352,17 @@ void PRTrackDatabaseDrop(XLogRecParseState *recordBlockState, HTAB *hashMap) XLogBlockParseStateRelease(recordBlockState); } +void PRTrackAllClear(HTAB *redoItemHash) +{ + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + hash_seq_init(&status, redoItemHash); + + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != 
NULL) { + PRTrackRemoveEntry(redoItemHash, redoItemEntry); + } +} + void PRTrackDropFiles(HTAB *redoItemHash, XLogBlockDdlParse *ddlParse, XLogRecPtr lsn) { ColFileNodeRel *xnodes = (ColFileNodeRel *)ddlParse->mainData; @@ -307,7 +418,7 @@ void PRTrackRelStorageDrop(XLogRecParseState *recordBlockState, HTAB *redoItemHa } // Get relfile node fork num blockNum -void PRTrackRelPageModification(XLogRecParseState *recordBlockState, HTAB *redoItemHash) +void PRTrackRelPageModification(XLogRecParseState *recordBlockState, HTAB *redoItemHash, bool isHead) { RelFileNode relnode; ForkNumber forkNum; @@ -315,16 +426,31 @@ void PRTrackRelPageModification(XLogRecParseState *recordBlockState, HTAB *redoI PRXLogRecGetBlockTag(recordBlockState, &relnode, &blkNo, &forkNum); - PRRegisterBlockChangeExtended(recordBlockState, relnode, forkNum, blkNo, redoItemHash); + PRRegisterBlockChangeExtended(recordBlockState, relnode, forkNum, blkNo, redoItemHash, isHead); } /** for block state, put it in to hash */ -void PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash) +void PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash, bool isHead) { Assert(recordBlockState->blockparse.blockhead.block_valid < BLOCK_DATA_DDL_TYPE); - PRTrackRelPageModification(recordBlockState, redoItemHash); + PRTrackRelPageModification(recordBlockState, redoItemHash, isHead); +} + +void PRTrackAddBatchBlock(XLogRecParseState *headBlockState, XLogRecParseState *tailBlockState, int count, + HTAB *redoItemHash, bool isHead) +{ +#ifdef USE_ASSERT_CHECKING + OndemandCheckPRRegister(headBlockState, tailBlockState, count); +#endif + RelFileNode relnode; + ForkNumber forkNum; + BlockNumber blkNo; + + PRXLogRecGetBlockTag(headBlockState, &relnode, &blkNo, &forkNum); + PRRegisterBatchBlockChangeExtended(headBlockState, tailBlockState, count, relnode, forkNum, blkNo, redoItemHash, + isHead); } /** diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp 
b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp index b10aa36ca..e92721d37 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp @@ -100,7 +100,7 @@ static const int XLOG_INFO_SHIFT_SIZE = 4; /* xlog info flag shift size */ static const int32 MAX_PENDING = 1; static const int32 MAX_PENDING_STANDBY = 1; -static const int32 ITEM_QUQUE_SIZE_RATIO = 1; +static const int32 ITEM_QUQUE_SIZE_RATIO = 16; static const uint32 EXIT_WAIT_DELAY = 100; /* 100 us */ uint32 g_readManagerTriggerFlag = TRIGGER_NORMAL; @@ -121,7 +121,7 @@ static void SSDestroyRecoveryWorkers(); static void DispatchRecordWithPages(XLogReaderState *, List *); static void DispatchRecordWithoutPage(XLogReaderState *, List *); static void DispatchTxnRecord(XLogReaderState *, List *); -static void StartPageRedoWorkers(uint32); +static void StartPageRedoWorkers(uint32 totalThrdNum, bool inRealtimeBuild); static void StopRecoveryWorkers(int, Datum); static bool StandbyWillChangeStandbyState(const XLogReaderState *); static void DispatchToSpecPageWorker(XLogReaderState *record, List *expectedTLIs); @@ -421,6 +421,18 @@ void HandleStartupInterruptsForExtremeRto() if (t_thrd.startup_cxt.check_repair) { t_thrd.startup_cxt.check_repair = false; } + + if (SS_ONDEMAND_REALTIME_BUILD_NORMAL && SS_STANDBY_FAILOVER && pmState == PM_STARTUP) { + OndemandRealtimeBuildHandleFailover(); + } +} + +static void SetOndemandXLogParseFlagValue(uint32 maxParseBufNum) +{ + g_ondemandXLogParseMemFullValue = maxParseBufNum * ONDEMAND_FORCE_PRUNE_RATIO; + g_ondemandXLogParseMemApproachFullVaule = maxParseBufNum * ONDEMAND_DISTRIBUTE_RATIO; + + g_ondemandRealtimeBuildQueueFullValue = REALTIME_BUILD_RECORD_QUEUE_SIZE * ONDEMAND_FORCE_PRUNE_RATIO; } /* Run from the dispatcher thread. 
*/ @@ -440,15 +452,16 @@ void StartRecoveryWorkers(XLogReaderState *xlogreader, uint32 privateLen) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, SHARED_CONTEXT); - g_instance.comm_cxt.predo_cxt.redoItemHash = PRRedoItemHashInitialize(g_instance.comm_cxt.redoItemCtx); + g_instance.comm_cxt.predo_cxt.redoItemHashCtrl = PRInitRedoItemHashForAllPipeline(g_instance.comm_cxt.redoItemCtx); g_dispatcher->maxItemNum = (get_batch_redo_num() + 4) * PAGE_WORK_QUEUE_SIZE * ITEM_QUQUE_SIZE_RATIO; // 4: a startup, readmanager, txnmanager, txnworker uint32 maxParseBufNum = (uint32)((uint64)g_instance.attr.attr_storage.dms_attr.ondemand_recovery_mem_size * 1024 / (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc) + sizeof(RedoMemSlot))); XLogParseBufferInitFunc(&(g_dispatcher->parseManager), maxParseBufNum, &recordRefOperate, RedoInterruptCallBack); + SetOndemandXLogParseFlagValue(maxParseBufNum); /* alloc for record readbuf */ SSAllocRecordReadBuffer(xlogreader, privateLen); - StartPageRedoWorkers(get_real_recovery_parallelism()); + StartPageRedoWorkers(get_real_recovery_parallelism(), SS_ONDEMAND_REALTIME_BUILD_NORMAL); ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("[PR]: max=%d, thrd=%d", g_instance.attr.attr_storage.max_recovery_parallelism, @@ -480,12 +493,15 @@ void DumpDispatcher() pl = &(g_dispatcher->pageLines[i]); DumpPageRedoWorker(pl->batchThd); DumpPageRedoWorker(pl->managerThd); + DumpPageRedoWorker(pl->htabThd); for (uint32 j = 0; j < pl->redoThdNum; j++) { DumpPageRedoWorker(pl->redoThd[j]); } } DumpPageRedoWorker(g_dispatcher->trxnLine.managerThd); DumpPageRedoWorker(g_dispatcher->trxnLine.redoThd); + DumpPageRedoWorker(g_dispatcher->auxiliaryLine.segRedoThd); + DumpPageRedoWorker(g_dispatcher->auxiliaryLine.ctrlThd); DumpXlogCtl(); } } @@ -517,6 +533,8 @@ STATIC LogDispatcher *CreateDispatcher() newDispatcher->needFullSyncCheckpoint = false; newDispatcher->smartShutdown = false; newDispatcher->startupTimeCost = t_thrd.xlog_cxt.timeCost; 
+ newDispatcher->trxnQueue = SPSCBlockingQueueCreate(REALTIME_BUILD_RECORD_QUEUE_SIZE, RedoWorkerQueueCallBack); + newDispatcher->segQueue = SPSCBlockingQueueCreate(REALTIME_BUILD_RECORD_QUEUE_SIZE, RedoWorkerQueueCallBack); return newDispatcher; } @@ -530,7 +548,7 @@ void RedoRoleInit(PageRedoWorker **dstWk, PageRedoWorker *srcWk, RedoRole role, } /* Run from the dispatcher thread. */ -static void StartPageRedoWorkers(uint32 totalThrdNum) +static void StartPageRedoWorkers(uint32 totalThrdNum, bool inRealtimeBuild) { uint32 batchNum = get_batch_redo_num(); uint32 batchWorkerPerMng = get_page_redo_worker_num_per_manager(); @@ -547,7 +565,7 @@ static void StartPageRedoWorkers(uint32 totalThrdNum) g_dispatcher->pageLines = (PageRedoPipeline *)palloc(sizeof(PageRedoPipeline) * batchNum); for (started = 0; started < totalThrdNum; started++) { - g_dispatcher->allWorkers[started] = CreateWorker(started); + g_dispatcher->allWorkers[started] = CreateWorker(started, inRealtimeBuild); if (g_dispatcher->allWorkers[started] == NULL) { ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("[REDO_LOG_TRACE]StartPageRedoWorkers CreateWorker failed, started:%u", started))); @@ -562,6 +580,8 @@ static void StartPageRedoWorkers(uint32 totalThrdNum) RedoRoleInit(&(g_dispatcher->pageLines[i].batchThd), tmpWorkers[workerCnt++], REDO_BATCH, i, isUndoSpaceWorker); RedoRoleInit(&(g_dispatcher->pageLines[i].managerThd), tmpWorkers[workerCnt++], REDO_PAGE_MNG, i, isUndoSpaceWorker); + RedoRoleInit(&(g_dispatcher->pageLines[i].htabThd), tmpWorkers[workerCnt++], REDO_HTAB_MNG, i, + isUndoSpaceWorker); g_dispatcher->pageLines[i].redoThd = (PageRedoWorker **)palloc(sizeof(PageRedoWorker *) * batchWorkerPerMng); g_dispatcher->pageLines[i].chosedRTIds = (uint32 *)palloc(sizeof(uint32) * batchWorkerPerMng); g_dispatcher->pageLines[i].chosedRTCnt = 0; @@ -578,6 +598,8 @@ static void StartPageRedoWorkers(uint32 totalThrdNum) RedoRoleInit(&(g_dispatcher->readLine.managerThd), 
tmpWorkers[workerCnt++], REDO_READ_MNG, 0, false); RedoRoleInit(&(g_dispatcher->readLine.readPageThd), tmpWorkers[workerCnt++], REDO_READ_PAGE_WORKER, 0, false); RedoRoleInit(&(g_dispatcher->readLine.readThd), tmpWorkers[workerCnt++], REDO_READ_WORKER, 0, false); + RedoRoleInit(&(g_dispatcher->auxiliaryLine.segRedoThd), tmpWorkers[workerCnt++], REDO_SEG_WORKER, 0, false); + RedoRoleInit(&(g_dispatcher->auxiliaryLine.ctrlThd), tmpWorkers[workerCnt++], REDO_CTRL_WORKER, 0, false); for (started = 0; started < totalThrdNum; started++) { if (StartPageRedoWorker(g_dispatcher->allWorkers[started]) == NULL) { @@ -682,6 +704,7 @@ static void SSDestroyRecoveryWorkers() for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { DestroyPageRedoWorker(g_dispatcher->pageLines[i].batchThd); DestroyPageRedoWorker(g_dispatcher->pageLines[i].managerThd); + DestroyPageRedoWorker(g_dispatcher->pageLines[i].htabThd); for (uint32 j = 0; j < g_dispatcher->pageLines[i].redoThdNum; j++) { DestroyPageRedoWorker(g_dispatcher->pageLines[i].redoThd[j]); } @@ -694,9 +717,13 @@ static void SSDestroyRecoveryWorkers() DestroyPageRedoWorker(g_dispatcher->readLine.managerThd); DestroyPageRedoWorker(g_dispatcher->readLine.readThd); + DestroyPageRedoWorker(g_dispatcher->auxiliaryLine.segRedoThd); + DestroyPageRedoWorker(g_dispatcher->auxiliaryLine.ctrlThd); pfree(g_dispatcher->rtoXlogBufState.readBuf); pfree(g_dispatcher->rtoXlogBufState.errormsg_buf); pfree(g_dispatcher->rtoXlogBufState.readprivate); + SPSCBlockingQueueDestroy(g_dispatcher->trxnQueue); + SPSCBlockingQueueDestroy(g_dispatcher->segQueue); #ifdef USE_ASSERT_CHECKING if (g_dispatcher->originLsnCheckAddr != NULL) { pfree(g_dispatcher->originLsnCheckAddr); @@ -810,6 +837,19 @@ void DispatchRedoRecordToFile(XLogReaderState *record, List *expectedTLIs, Times } } +void UpdateCheckpointRedoPtrForPrune(XLogRecPtr prunePtr) +{ + if (SS_ONDEMAND_REALTIME_BUILD_DISABLED) { + return; + } + + XLogRecPtr ckptRedoPtr; + do { + ckptRedoPtr = 
pg_atomic_read_u64(&g_dispatcher->ckptRedoPtr); + } while (XLByteLT(ckptRedoPtr, prunePtr) && + !pg_atomic_compare_exchange_u64(&g_dispatcher->ckptRedoPtr, &ckptRedoPtr, prunePtr)); +} + /** * process record need sync with page worker and trxn thread * trxnthreadexe is true when the record need execute on trxn thread @@ -937,7 +977,11 @@ static bool DispatchXLogRecord(XLogReaderState *record, List *expectedTLIs, Time uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); if (IsCheckPoint(record)) { - return isNeedFullSync; + RedoItem *item = GetRedoItemPtr(record); + XLogRecPtr ckptRecordRedoPtr = GetRedoLocInCheckpointRecord(record); + FreeRedoItem(item); + UpdateCheckpointRedoPtrForPrune(ckptRecordRedoPtr); + AddTxnRedoItem(g_dispatcher->trxnLine.managerThd, &g_hashmapPruneMark); } else if ((info == XLOG_FPI) || (info == XLOG_FPI_FOR_HINT)) { DispatchRecordWithPages(record, expectedTLIs); } else { @@ -1815,40 +1859,6 @@ void FreeAllocatedRedoItem() } } -/* Run from the dispatcher thread. 
*/ -void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code) -{ - ereport( - LOG, - (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("[REDO_LOG_TRACE]SendRecoveryEndMarkToWorkersAndWaitForFinish, ready to stop redo workers, code: %d", - code))); - if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) { - WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readPageThd); - PageRedoPipeline *pl = g_dispatcher->pageLines; - /* send end mark */ - for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { - SendPageRedoEndMark(pl[i].batchThd); - } - SendPageRedoEndMark(g_dispatcher->trxnLine.managerThd); - - /* wait */ - for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { - WaitPageRedoWorkerReachLastMark(pl[i].batchThd); - } - pg_atomic_write_u32(&(g_dispatcher->rtoXlogBufState.xlogReadManagerState), READ_MANAGER_STOP); - - WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.managerThd); - WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readThd); - WaitPageRedoWorkerReachLastMark(g_dispatcher->trxnLine.managerThd); - LsnUpdate(); -#ifdef USE_ASSERT_CHECKING - AllItemCheck(); -#endif - (void)RegisterRedoInterruptCallBack(g_dispatcher->oldStartupIntrruptFunc); - } -} - void SendRecoveryEndMarkToWorkersAndWaitForReach(int code) { ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), @@ -1889,11 +1899,13 @@ void SendRecoveryEndMarkToWorkersAndWaitForReach(int code) SendPageRedoEndMark(pl[i].batchThd); } SendPageRedoEndMark(g_dispatcher->trxnLine.managerThd); + SendPageRedoEndMark(g_dispatcher->auxiliaryLine.ctrlThd); /* Stop Read Thrd only */ pg_atomic_write_u32(&(g_dispatcher->rtoXlogBufState.xlogReadManagerState), READ_MANAGER_STOP); WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.managerThd); WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readThd); + WaitPageRedoWorkerReachLastMark(g_dispatcher->auxiliaryLine.ctrlThd); LsnUpdate(); XLogRecPtr lastReplayed = GetXLogReplayRecPtr(NULL); ereport(LOG, 
(errmsg("[SS][REDO_LOG_TRACE] Current LastReplayed: %lu", lastReplayed))); @@ -1943,6 +1955,22 @@ void WaitRedoFinish() SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); } +void WaitRealtimeBuildShutdown() +{ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = BUILD_TO_DISABLED; + + Assert(g_instance.pid_cxt.StartupPID != 0); + SendPostmasterSignal(PMSIGNAL_DMS_TERM_STARTUP); + while (true) { + if (g_instance.pid_cxt.StartupPID == 0) { + break; + } + pg_usleep(5000L); + } + + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = DISABLED; +} + /* Run from each page worker and the txn worker thread. */ int GetDispatcherExitCode() { @@ -1990,6 +2018,7 @@ void UpdateStandbyState(HotStandbyState newState) pl = &(g_dispatcher->pageLines[i]); UpdatePageRedoWorkerStandbyState(pl->batchThd, newState); UpdatePageRedoWorkerStandbyState(pl->managerThd, newState); + UpdatePageRedoWorkerStandbyState(pl->htabThd, newState); for (uint32 j = 0; j < pl->redoThdNum; j++) { UpdatePageRedoWorkerStandbyState(pl->redoThd[j], newState); } @@ -1999,6 +2028,8 @@ void UpdateStandbyState(HotStandbyState newState) UpdatePageRedoWorkerStandbyState(g_dispatcher->readLine.managerThd, newState); UpdatePageRedoWorkerStandbyState(g_dispatcher->readLine.readPageThd, newState); UpdatePageRedoWorkerStandbyState(g_dispatcher->readLine.readThd, newState); + UpdatePageRedoWorkerStandbyState(g_dispatcher->auxiliaryLine.segRedoThd, newState); + UpdatePageRedoWorkerStandbyState(g_dispatcher->auxiliaryLine.ctrlThd, newState); pg_atomic_write_u32(&(g_dispatcher->standbyState), newState); } } @@ -2171,6 +2202,9 @@ void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, redoWorker = (g_dispatcher->pageLines[i].managerThd); make_worker_static_info(&workerList[cur_pos++], redoWorker, i, invalid_worker_id); + redoWorker = (g_dispatcher->pageLines[i].htabThd); + make_worker_static_info(&workerList[cur_pos++], redoWorker, i, invalid_worker_id); + for 
(int j = 0; j < (int)g_dispatcher->pageLines[i].redoThdNum; ++j) { redoWorker = (g_dispatcher->pageLines[i].redoThd[j]); make_worker_static_info(&workerList[cur_pos++], redoWorker, i, j); @@ -2182,6 +2216,8 @@ void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, make_worker_static_info(&workerList[cur_pos++], g_dispatcher->readLine.readPageThd, 0, invalid_worker_id); make_worker_static_info(&workerList[cur_pos++], g_dispatcher->readLine.readThd, 0, invalid_worker_id); make_worker_static_info(&workerList[cur_pos++], g_dispatcher->readLine.managerThd, 0, invalid_worker_id); + make_worker_static_info(&workerList[cur_pos++], g_dispatcher->auxiliaryLine.segRedoThd, 0, invalid_worker_id); + make_worker_static_info(&workerList[cur_pos++], g_dispatcher->auxiliaryLine.ctrlThd, 0, invalid_worker_id); const char *startupName = "startup"; allocSize = strlen(startupName) + 1; diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp index f41c13a0b..17954531f 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp @@ -43,6 +43,7 @@ #include "access/nbtree.h" #include "catalog/storage_xlog.h" #include "ddes/dms/ss_dms_recovery.h" +#include "ddes/dms/ss_dms_bufmgr.h" #include "gssignal/gs_signal.h" #include "libpq/pqsignal.h" #include "postmaster/postmaster.h" @@ -66,6 +67,7 @@ #include "access/ondemand_extreme_rto/dispatcher.h" #include "access/ondemand_extreme_rto/txn_redo.h" #include "access/ondemand_extreme_rto/xlog_read.h" +#include "access/ondemand_extreme_rto/redo_utils.h" #include "pgstat.h" #include "access/ondemand_extreme_rto/batch_redo.h" #include "access/multi_redo_api.h" @@ -110,6 +112,11 @@ RedoItem g_cleanupMark; RedoItem g_closefdMark; RedoItem g_cleanInvalidPageMark; RedoItem g_forceDistributeMark; +RedoItem 
g_hashmapPruneMark; + +uint32 g_ondemandXLogParseMemFullValue = 0; +uint32 g_ondemandXLogParseMemApproachFullVaule = 0; +uint32 g_ondemandRealtimeBuildQueueFullValue = 0; static const int PAGE_REDO_WORKER_ARG = 3; static const int REDO_SLEEP_50US = 50; @@ -132,6 +139,10 @@ void RecordBlockCheck(void *rec, XLogRecPtr curPageLsn, uint32 blockId, bool rep #endif void AddRecordReadBlocks(void *rec, uint32 readblocks); +static void AddTrxnHashmap(void *item); +static void AddSegHashmap(void *item); +static void PageManagerPruneIfRealtimeBuildFailover(); + RefOperate recordRefOperate = { AddRefRecord, SubRefRecord, @@ -141,6 +152,35 @@ RefOperate recordRefOperate = { AddRecordReadBlocks, }; +static void OndemandUpdateXLogParseMemUsedBlkNum() +{ + uint32 usedblknum = 0; + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + usedblknum += pg_atomic_read_u32(&g_dispatcher->pageLines[i].batchThd->parseManager.memctl.usedblknum); + } + pg_atomic_write_u32(&g_dispatcher->parseManager.memctl.usedblknum, usedblknum); +} + +static inline bool OndemandXLogParseMemFull() +{ + return (pg_atomic_read_u32(&g_dispatcher->parseManager.memctl.usedblknum) > g_ondemandXLogParseMemFullValue); +} + +static inline bool OndemandXLogParseMemApproachFull() +{ + return (pg_atomic_read_u32(&g_dispatcher->parseManager.memctl.usedblknum) > g_ondemandXLogParseMemApproachFullVaule); +} + +static inline bool OndemandTrxnQueueFullInRealtimeBuild() +{ + return (SPSCGetQueueCount(g_dispatcher->trxnQueue) > g_ondemandRealtimeBuildQueueFullValue); +} + +static inline bool OndemandSegQueueFullInRealtimeBuild() +{ + return (SPSCGetQueueCount(g_dispatcher->segQueue) > g_ondemandRealtimeBuildQueueFullValue); +} + void UpdateRecordGlobals(RedoItem *item, HotStandbyState standbyState) { t_thrd.xlog_cxt.ReadRecPtr = item->record.ReadRecPtr; @@ -189,7 +229,7 @@ bool RedoWorkerIsUndoSpaceWorker() } /* Run from the dispatcher thread. 
*/ -PageRedoWorker *CreateWorker(uint32 id) +PageRedoWorker *CreateWorker(uint32 id, bool inRealtimeBuild) { PageRedoWorker *tmp = (PageRedoWorker *)palloc0(sizeof(PageRedoWorker) + ONDEMAND_EXTREME_RTO_ALIGN_LEN); PageRedoWorker *worker; @@ -230,6 +270,10 @@ PageRedoWorker *CreateWorker(uint32 id) #endif worker->parseManager.memctl.isInit = false; worker->parseManager.parsebuffers = NULL; + + worker->nextPrunePtr = InvalidXLogRecPtr; + worker->inRealtimeBuild = inRealtimeBuild; + worker->currentHtabBlockNum = 0; return worker; } @@ -554,20 +598,23 @@ void WaitAllRedoWorkerQueueEmpty() } } -bool OndemandXLogParseMemApproachLimit() -{ - float4 ratio = (float4)g_dispatcher->parseManager.memctl.usedblknum / g_dispatcher->parseManager.memctl.totalblknum; - if (ratio > ONDEMAND_DISTRIBUTE_RATIO) { - return true; - } - return false; -} - -void BatchRedoSendDistributeMarkToPageRedoManager(RedoItem *distributeMark) +void BatchRedoSendMarkToPageRedoManager(RedoItem *sendMark) { PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; - AddPageRedoItem(myRedoLine->managerThd, distributeMark); - pg_usleep(1000000); // 1 sec + AddPageRedoItem(myRedoLine->managerThd, sendMark); +} + +static void BatchRedoProcIfXLogParseMemFull() +{ + while (SS_ONDEMAND_RECOVERY_HASHMAP_FULL) { + if (SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + BatchRedoSendMarkToPageRedoManager(&g_hashmapPruneMark); + } else { + BatchRedoSendMarkToPageRedoManager(&g_forceDistributeMark); + } + RedoInterruptCallBack(); + pg_usleep(100000); // 100 ms + } } bool BatchRedoDistributeItems(void **eleArry, uint32 eleNum) @@ -585,9 +632,7 @@ bool BatchRedoDistributeItems(void **eleArry, uint32 eleNum) } else if (eleArry[i] == (void *)&g_cleanInvalidPageMark) { forget_range_invalid_pages((void *)eleArry[i]); } else { - if (OndemandXLogParseMemApproachLimit()) { - BatchRedoSendDistributeMarkToPageRedoManager(&g_forceDistributeMark); - } + BatchRedoProcIfXLogParseMemFull(); 
GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); RedoItem *item = (RedoItem *)eleArry[i]; UpdateRecordGlobals(item, g_redoWorker->standbyState); @@ -605,13 +650,31 @@ bool BatchRedoDistributeItems(void **eleArry, uint32 eleNum) return false; } +void OndemandInitXLogParseBuffer(RedoParseManager *src, RefOperate *refOperate, InterruptFunc interruptOperte) +{ + g_redoWorker->parseManager.parsebuffers = src->parsebuffers; + g_redoWorker->parseManager.refOperate = refOperate; + + g_redoWorker->parseManager.memctl.totalblknum = src->memctl.totalblknum; + g_redoWorker->parseManager.memctl.usedblknum = 0; + g_redoWorker->parseManager.memctl.itemsize = src->memctl.itemsize; + g_redoWorker->parseManager.memctl.firstfreeslot = InvalidBuffer; + g_redoWorker->parseManager.memctl.firstreleaseslot = InvalidBuffer; + g_redoWorker->parseManager.memctl.memslot = src->memctl.memslot; + g_redoWorker->parseManager.memctl.doInterrupt = interruptOperte; + + g_redoWorker->parseManager.memctl.isInit = true; + + g_parseManager = &g_redoWorker->parseManager; +} + void BatchRedoMain() { void **eleArry; uint32 eleNum; (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); - g_parseManager = &(g_dispatcher->parseManager); + OndemandInitXLogParseBuffer(&g_dispatcher->parseManager, &recordRefOperate, RedoInterruptCallBack); GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); while (SPSCBlockingQueueGetAll(g_redoWorker->queue, &eleArry, &eleNum)) { CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); @@ -686,15 +749,12 @@ void ReleaseRecParseState(PageRedoPipeline *myRedoLine, HTAB *redoItemHash, Redo xlog_partition_lock = XlogTrackMappingPartitionLock(new_hash); if (del_from_hash_item_num > 0) { + Assert(releaseTailState != NULL); (void)LWLockAcquire(xlog_partition_lock, LW_EXCLUSIVE); - if (releaseTailState != NULL) { - redoItemEntry->head = (XLogRecParseState *)releaseTailState->nextrecord; - 
releaseTailState->nextrecord = NULL; - } else { - redoItemEntry->head = NULL; - } + redoItemEntry->head = (XLogRecParseState *)releaseTailState->nextrecord; + releaseTailState->nextrecord = NULL; XLogBlockParseStateRelease(releaseHeadState); - redoItemEntry->redoItemNum = redoItemEntry->redoItemNum - del_from_hash_item_num; + redoItemEntry->redoItemNum -= del_from_hash_item_num; LWLockRelease(xlog_partition_lock); } @@ -741,13 +801,14 @@ void RedoPageManagerDistributeToRedoThd(PageRedoPipeline *myRedoLine, return; } -void RedoPageManagerDistributeBlockRecord(HTAB *redoItemHash, XLogRecParseState *parsestate) +void RedoPageManagerDistributeBlockRecord(XLogRecParseState *parsestate) { + PageManagerPruneIfRealtimeBuildFailover(); PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; HASH_SEQ_STATUS status; RedoItemHashEntry *redoItemEntry = NULL; - HTAB *curMap = redoItemHash; + HTAB *curMap = g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId]->hTab; hash_seq_init(&status, curMap); while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { @@ -777,7 +838,7 @@ static void ReleaseReplayedInParse(PageRedoPipeline* myRedoLine, uint32 workerNu { HASH_SEQ_STATUS status; RedoItemHashEntry *redoItemEntry = NULL; - HTAB *curMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; + HTAB *curMap = g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId]->hTab; hash_seq_init(&status, curMap); while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { @@ -804,10 +865,15 @@ static void WaitAndTryReleaseWorkerReplayedRec(PageRedoPipeline *myRedoLine, uin } } -void DispatchEndMarkToRedoWorkerAndWait() +void PageManagerDispatchEndMarkAndWait() { PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; const uint32 WorkerNumPerMng = get_page_redo_worker_num_per_manager(); + + 
SendPageRedoEndMark(myRedoLine->htabThd); + if (g_redoWorker->slotId == SEG_PROC_PIPELINE_SLOT) { + SendPageRedoEndMark(g_dispatcher->auxiliaryLine.segRedoThd); + } for (uint32 i = 0; i < WorkerNumPerMng; ++i) SendPageRedoEndMark(myRedoLine->redoThd[i]); @@ -816,6 +882,11 @@ void DispatchEndMarkToRedoWorkerAndWait() for (uint32 i = 0; i < myRedoLine->redoThdNum; i++) { WaitPageRedoWorkerReachLastMark(myRedoLine->redoThd[i]); } + if (g_redoWorker->slotId == SEG_PROC_PIPELINE_SLOT) { + WaitPageRedoWorkerReachLastMark(g_dispatcher->auxiliaryLine.segRedoThd); + } + WaitPageRedoWorkerReachLastMark(myRedoLine->htabThd); + ReleaseReplayedInParse(myRedoLine, WorkerNumPerMng); } @@ -835,14 +906,13 @@ void RedoPageManagerDdlAction(XLogRecParseState *parsestate) xlog_drop_tblspc(parsestate->blockparse.blockhead.spcNode); break; case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: - { - Assert(0); - } - break; + /* handle by seg worker, function SSProcSegPageCommonRedo */ case BLOCK_DATA_SEG_SPACE_DROP: case BLOCK_DATA_SEG_FULL_SYNC_TYPE: case BLOCK_DATA_SEG_EXTEND: - ProcSegPageCommonRedo(parsestate); + { + Assert(0); + } break; default: break; @@ -900,7 +970,7 @@ void RedoPageManagerDoDropAction(XLogRecParseState *parsestate, HTAB *hashMap) { XLogRecParseState *newState = XLogParseBufferCopy(parsestate); PRTrackClearBlock(newState, hashMap); - RedoPageManagerDistributeBlockRecord(hashMap, parsestate); + RedoPageManagerDistributeBlockRecord(parsestate); WaitCurrentPipeLineRedoWorkersQueueEmpty(); RedoPageManagerSyncDdlAction(parsestate); } @@ -933,7 +1003,7 @@ void RedoPageManagerDoDataTypeAction(XLogRecParseState *parsestate, HTAB *hashMa ddlrecparse->blockddltype == BLOCK_DDL_TRUNCATE_RELNODE) { XLogRecParseState *newState = XLogParseBufferCopy(parsestate); PRTrackClearBlock(newState, hashMap); - RedoPageManagerDistributeBlockRecord(hashMap, parsestate); + RedoPageManagerDistributeBlockRecord(parsestate); WaitCurrentPipeLineRedoWorkersQueueEmpty(); } @@ -941,11 +1011,110 @@ void 
RedoPageManagerDoDataTypeAction(XLogRecParseState *parsestate, HTAB *hashMa } +static void RedoItemHashPruneWithoutLock(HTAB *redoItemHash, RedoItemHashEntry *redoItemEntry, XLogRecPtr pruneLsn, + bool updateStat) +{ + XLogRecParseState *cur_state = redoItemEntry->head; + XLogRecParseState *releaseHeadState = redoItemEntry->head; + XLogRecParseState *releaseTailState = NULL; + unsigned int del_from_hash_item_num = 0; + + while (cur_state != NULL) { + XLogRecPtr curRedoItemLsn = XLogBlockHeadGetLSN(&cur_state->blockparse.blockhead); + if (XLByteLT(curRedoItemLsn, pruneLsn)) { + releaseTailState = cur_state; + del_from_hash_item_num++; + cur_state = (XLogRecParseState *)(cur_state->nextrecord); + } else { + break; + } + } + + if (del_from_hash_item_num > 0) { + if (releaseTailState != NULL) { + redoItemEntry->head = (XLogRecParseState *)releaseTailState->nextrecord; + releaseTailState->nextrecord = NULL; + } else { + redoItemEntry->head = NULL; + } + XLogBlockParseStateRelease(releaseHeadState); + redoItemEntry->redoItemNum -= del_from_hash_item_num; + } + + if (redoItemEntry->redoItemNum == 0) { + if (hash_search(redoItemHash, (void *)&redoItemEntry->redoItemTag, HASH_REMOVE, NULL) == NULL) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("redo item hash table corrupted"))); + } + } + + pg_memory_barrier(); + if ((del_from_hash_item_num > 0) && updateStat) { + (void)pg_atomic_sub_fetch_u32(&g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->currentHtabBlockNum, + del_from_hash_item_num); + } +} + +static void PageManagerAddRedoItemToSegWorkers(RedoItem *item) +{ + if (g_redoWorker->slotId == SEG_PROC_PIPELINE_SLOT) { + AddPageRedoItem(g_dispatcher->auxiliaryLine.segRedoThd, item); + } +} + +static void PageManagerAddRedoItemToHashMapManager(RedoItem *item) +{ + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + AddPageRedoItem(myRedoLine->htabThd, item); +} + +void PageManagerProcHashmapPrune() +{ + XLogRecPtr ckptPtr 
= pg_atomic_read_u64(&g_dispatcher->ckptRedoPtr); + if (XLByteLE(ckptPtr, g_redoWorker->nextPrunePtr)) { + return; + } + + XLogRecPtr prunePtr = InvalidXLogRecPtr; + PageManagerAddRedoItemToHashMapManager(&g_hashmapPruneMark); + do { + prunePtr = pg_atomic_read_u64(&g_dispatcher->pageLines[g_redoWorker->slotId].htabThd->nextPrunePtr); + RedoInterruptCallBack(); + pg_usleep(100000L); /* 100 ms */ + } while (XLByteLT(prunePtr, ckptPtr)); + g_redoWorker->nextPrunePtr = ckptPtr; +} + +static void OndemandMergeHashMap(HTAB *srcHashmap, HTAB *dstHashmap) +{ + HASH_SEQ_STATUS status; + RedoItemHashEntry *srcEntry = NULL; + hash_seq_init(&status, srcHashmap); + + while ((srcEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + PRTrackAddBatchBlock(srcEntry->head, srcEntry->tail, srcEntry->redoItemNum, dstHashmap, false); + } +} + +void PageManagerMergeHashMapInRealtimeBuild() +{ + ondemand_htab_ctrl_t *procHtabCtrl = g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId]; + ondemand_htab_ctrl_t *targetHtabCtrl = g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl; + ondemand_htab_ctrl_t *nextHtabCtrl = procHtabCtrl; + g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl = + g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId]; + while (nextHtabCtrl != targetHtabCtrl) { + nextHtabCtrl = (ondemand_htab_ctrl_t *)nextHtabCtrl->nextHTabCtrl; + OndemandMergeHashMap(nextHtabCtrl->hTab, procHtabCtrl->hTab); + pfree(nextHtabCtrl); + } +} + void PageManagerProcLsnForwarder(RedoItem *lsnForwarder) { - SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); + PageManagerAddRedoItemToSegWorkers(lsnForwarder); + PageManagerAddRedoItemToHashMapManager(lsnForwarder); PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; const uint32 WorkerNumPerMng = 
myRedoLine->redoThdNum; @@ -953,11 +1122,16 @@ void PageManagerProcLsnForwarder(RedoItem *lsnForwarder) AddPageRedoItem(myRedoLine->redoThd[i], lsnForwarder); } + PageManagerPruneIfRealtimeBuildFailover(); + /* wait hashmapmng prune and segworker distribute segrecord to hashmap */ uint32 refCount; do { refCount = pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); RedoInterruptCallBack(); } while (refCount != 0); + + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); + RedoPageManagerDistributeBlockRecord(NULL); } void PageManagerDistributeBcmBlock(XLogRecParseState *preState) @@ -974,16 +1148,29 @@ void PageManagerProcCleanupMark(RedoItem *cleanupMark) PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; g_redoWorker->xlogInvalidPages = XLogGetInvalidPages(); + PageManagerAddRedoItemToSegWorkers(cleanupMark); for (uint32 i = 0; i < WorkerNumPerMng; ++i) { AddPageRedoItem(myRedoLine->redoThd[i], cleanupMark); } ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]PageManagerProcCleanupMark has cleaned InvalidPages"))); } -void PageManagerProcCheckPoint(HTAB *hashMap, XLogRecParseState *parseState) +void PageManagerProcClosefdMark(RedoItem *closefdMark) +{ + PageManagerAddRedoItemToSegWorkers(closefdMark); + smgrcloseall(); +} + +void PageManagerProcCleanInvalidPageMark(RedoItem *cleanInvalidPageMark) +{ + PageManagerAddRedoItemToSegWorkers(cleanInvalidPageMark); + forget_range_invalid_pages((void *)cleanInvalidPageMark); +} + +void PageManagerProcCheckPoint(XLogRecParseState *parseState) { Assert(IsCheckPoint(parseState)); - RedoPageManagerDistributeBlockRecord(hashMap, parseState); + RedoPageManagerDistributeBlockRecord(parseState); bool needWait = parseState->isFullSync; if (needWait) { pg_atomic_write_u32(&g_redoWorker->fullSyncFlag, 1); @@ -1006,9 +1193,9 @@ void PageManagerProcCheckPoint(HTAB *hashMap, 
XLogRecParseState *parseState) } } -void PageManagerProcCreateTableSpace(HTAB *hashMap, XLogRecParseState *parseState) +void PageManagerProcCreateTableSpace(XLogRecParseState *parseState) { - RedoPageManagerDistributeBlockRecord(hashMap, NULL); + RedoPageManagerDistributeBlockRecord(NULL); bool needWait = parseState->isFullSync; if (needWait) { pg_atomic_write_u32(&g_redoWorker->fullSyncFlag, 1); @@ -1022,39 +1209,112 @@ void PageManagerProcCreateTableSpace(HTAB *hashMap, XLogRecParseState *parseStat } } -void PageManagerProcSegFullSyncState(HTAB *hashMap, XLogRecParseState *parseState) +static void SSProcSegPageRedoInSegPageRedoChildState(XLogRecParseState *redoblockstate) { - RedoPageManagerDistributeBlockRecord(hashMap, NULL); - WaitCurrentPipeLineRedoWorkersQueueEmpty(); - RedoPageManagerSyncDdlAction(parseState); + if (SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + BufferTag bufferTag; + XLogRecPtr pageLsn = InvalidXLogRecPtr; + XLogBlockHeadGetBufferTag(&redoblockstate->blockparse.blockhead, &bufferTag); + if (SSRequestPageInOndemandRealtimeBuild(&bufferTag, redoblockstate->blockparse.blockhead.end_ptr, &pageLsn) || + !SSXLogParseRecordNeedReplayInOndemandRealtimeBuild(redoblockstate)) { +#ifdef USE_ASSERT_CHECKING + bool willinit = (XLogBlockDataGetBlockFlags((XLogBlockDataParse *)&redoblockstate->blockparse.extra_rec) & + BKPBLOCK_WILL_INIT); + DoRecordCheck(redoblockstate, pageLsn, !willinit); +#endif + ereport(DEBUG1, (errmsg("[On-demand] standby node request page success during ondemand realtime build, " + "spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u, recordlsn: %X/%X, pagelsn: %X/%X", + bufferTag.rnode.spcNode, bufferTag.rnode.dbNode, bufferTag.rnode.relNode, bufferTag.rnode.bucketNode, + bufferTag.forkNum, bufferTag.blockNum, (uint32)(redoblockstate->blockparse.blockhead.end_ptr >> 32), + (uint32)redoblockstate->blockparse.blockhead.end_ptr, (uint32)(pageLsn >> 32), (uint32)pageLsn))); + XLogBlockParseStateRelease(redoblockstate); + return; + } + 
ereport(DEBUG1, (errmsg("[On-demand] standby node request page failed during ondemand realtime build, " + "spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u, recordlsn: %X/%X, pagelsn: %X/%X", + bufferTag.rnode.spcNode, bufferTag.rnode.dbNode, bufferTag.rnode.relNode, bufferTag.rnode.bucketNode, + bufferTag.forkNum, bufferTag.blockNum, (uint32)(redoblockstate->blockparse.blockhead.end_ptr >> 32), + (uint32)redoblockstate->blockparse.blockhead.end_ptr, (uint32)(pageLsn >> 32), (uint32)pageLsn))); + } + SegPageRedoChildState(redoblockstate); } -void OnDemandPageManagerProcSegFullSyncState(XLogRecParseState *parsestate) +// for less ondmeand recovery memory consume +static void SSReleaseRefRecordWithoutReplay(XLogRecParseState *redoblockstate) +{ + RedoItem *item = GetRedoItemPtr((XLogReaderState *)redoblockstate->refrecord); +#ifdef USE_ASSERT_CHECKING + DoRecordCheck(redoblockstate, InvalidXLogRecPtr, false); +#endif + DereferenceRedoItem(item); + redoblockstate->refrecord = NULL; +} + +static void SSProcPageRedoInSegPageRedoChildState(XLogRecParseState *redoblockstate) +{ + AddSegHashmap(redoblockstate); +} + +static void SSSegPageRedoChildState(XLogRecParseState *childStateList) +{ + BufferTag bufferTag; + XLogRecParseState *procState = childStateList; + while (procState != NULL) { + XLogRecParseState *redoblockstate = procState; + procState = (XLogRecParseState *)procState->nextrecord; + redoblockstate->nextrecord = NULL; + XLogBlockHeadGetBufferTag(&redoblockstate->blockparse.blockhead, &bufferTag); + if (IsSegmentPhysicalRelNode(bufferTag.rnode)) { + SSProcSegPageRedoInSegPageRedoChildState(redoblockstate); + } else { + SSProcPageRedoInSegPageRedoChildState(redoblockstate); + } + } +} + +static void SSProcSegPageCommonRedo(XLogRecParseState *parseState) +{ + uint8 info = XLogBlockHeadGetInfo(&parseState->blockparse.blockhead) & ~XLR_INFO_MASK; + switch (info) { + // has child list + case XLOG_SEG_ATOMIC_OPERATION: + case XLOG_SEG_SEGMENT_EXTEND: + case 
XLOG_SEG_INIT_MAPPAGE: + case XLOG_SEG_INIT_INVRSPTR_PAGE: + case XLOG_SEG_ADD_NEW_GROUP: + { + XLogRecParseState *child = + (XLogRecParseState *)parseState->blockparse.extra_rec.blocksegfullsyncrec.childState; + SSSegPageRedoChildState(child); + break; + } + case XLOG_SEG_CREATE_EXTENT_GROUP: + case XLOG_SEG_SPACE_SHRINK: + case XLOG_SEG_NEW_PAGE: + case XLOG_SEG_SPACE_DROP: + Assert(!SS_ONDEMAND_REALTIME_BUILD_NORMAL); + ProcSegPageCommonRedo(parseState); + break; + default: + ereport(PANIC, (errmsg("SSProcSegPageCommonRedo: unknown op code %u", info))); + break; + } +} + +void OnDemandSegWorkerProcSegFullSyncState(XLogRecParseState *parsestate) { MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); - RedoPageManagerDdlAction(parsestate); + SSProcSegPageCommonRedo(parsestate); (void)MemoryContextSwitchTo(oldCtx); parsestate->nextrecord = NULL; XLogBlockParseStateRelease(parsestate); } -void PageManagerProcSegPipeLineSyncState(HTAB *hashMap, XLogRecParseState *parseState) -{ - RedoPageManagerDistributeBlockRecord(hashMap, NULL); - WaitCurrentPipeLineRedoWorkersQueueEmpty(); - MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); - - RedoPageManagerDdlAction(parseState); - - (void)MemoryContextSwitchTo(oldCtx); - XLogBlockParseStateRelease(parseState); -} - -void OnDemandPageManagerProcSegPipeLineSyncState(XLogRecParseState *parseState) +void OnDemandSegWorkerProcSegPipeLineSyncState(XLogRecParseState *parseState) { MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); - RedoPageManagerDdlAction(parseState); + SSProcSegPageCommonRedo(parseState); (void)MemoryContextSwitchTo(oldCtx); XLogBlockParseStateRelease(parseState); @@ -1075,15 +1335,14 @@ static void WaitNextBarrier(XLogRecParseState *parseState) } } -static void OnDemandPageManagerRedoSegParseState(XLogRecParseState *preState) +static void OnDemandSegWorkerRedoSegParseState(XLogRecParseState *preState) { - Assert(g_redoWorker->slotId == 0); switch 
(preState->blockparse.blockhead.block_valid) { case BLOCK_DATA_SEG_EXTEND: - OnDemandPageManagerProcSegPipeLineSyncState(preState); + OnDemandSegWorkerProcSegPipeLineSyncState(preState); break; case BLOCK_DATA_SEG_FULL_SYNC_TYPE: - OnDemandPageManagerProcSegFullSyncState(preState); + OnDemandSegWorkerProcSegFullSyncState(preState); break; case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: default: @@ -1094,10 +1353,98 @@ static void OnDemandPageManagerRedoSegParseState(XLogRecParseState *preState) } } +static void OnDemandDispatchSegParseStateToSegWorker(XLogRecParseState *preState) +{ + Assert(g_redoWorker->slotId == SEG_PROC_PIPELINE_SLOT); + AddPageRedoItem(g_dispatcher->auxiliaryLine.segRedoThd, preState); +} + +static bool WaitPrimaryDoCheckpointAndAllPRTrackEmpty(XLogRecParseState *preState, HTAB *redoItemHash) +{ + if (SS_ONDEMAND_REALTIME_BUILD_DISABLED) { + return false; + } + + bool notifyDone = false; + bool waitDone = false; + XLogRecPtr ddlSyncPtr = preState->blockparse.blockhead.end_ptr; + + // notify dispatcher thread and wait for primary checkpoint + XLogRecPtr syncRecordPtr; + do { + syncRecordPtr = pg_atomic_read_u64(&g_dispatcher->syncRecordPtr); + if (XLByteLT(ddlSyncPtr, syncRecordPtr)) { + break; + } + } while (!pg_atomic_compare_exchange_u64(&g_dispatcher->syncRecordPtr, &syncRecordPtr, ddlSyncPtr)); + + do { + if (pg_atomic_read_u32(&g_redoWorker->currentHtabBlockNum) == 0) { + // exit if hashmap manager already clear all hashmap + waitDone = true; + break; + } else if (SS_ONDEMAND_REALTIME_BUILD_FAILOVER) { + // exit if primary node crash + waitDone = false; + break; + } + PageManagerProcHashmapPrune(); + pg_usleep(100000L); /* 100 ms */ + } while (true); + + // clear all blocks in hashmap + g_redoWorker->nextPrunePtr = pg_atomic_read_u64(&g_dispatcher->ckptRedoPtr); + + return waitDone; +} + +static void PageManagerPruneIfRealtimeBuildFailover() +{ + if (SS_ONDEMAND_REALTIME_BUILD_FAILOVER && g_redoWorker->inRealtimeBuild) { + 
PageManagerProcHashmapPrune(); + PageManagerMergeHashMapInRealtimeBuild(); + g_redoWorker->inRealtimeBuild = false; + } +} + +void ReleaseBlockParseStateIfNotReplay(XLogRecParseState *preState) +{ +#ifdef USE_ASSERT_CHECKING + XLogRecParseState *nextBlockState = preState; + while (nextBlockState != NULL) { + DoRecordCheck(nextBlockState, InvalidXLogRecPtr, false); + nextBlockState = (XLogRecParseState *)(nextBlockState->nextrecord); + } +#endif + XLogBlockParseStateRelease(preState); +} + +static void OndemandSwitchHTABIfBlockNumUpperLimit() +{ + if (SS_ONDEMAND_REALTIME_BUILD_NORMAL && (g_redoWorker->currentHtabBlockNum > ONDEMAND_HASHTAB_SWITCH_LIMIT)) { + ondemand_htab_ctrl_t *oldHTabCtrl = g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl; + ondemand_htab_ctrl_t *newHTabCtrl = PRRedoItemHashInitialize(g_instance.comm_cxt.redoItemCtx); + + oldHTabCtrl->nextHTabCtrl = (void *)newHTabCtrl; + g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl = newHTabCtrl; + g_redoWorker->currentHtabBlockNum = 0; + } +} + void PageManagerRedoParseState(XLogRecParseState *preState) { - HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; - RedoItem *item = GetRedoItemPtr((XLogReaderState *)preState->refrecord); + PageManagerPruneIfRealtimeBuildFailover(); + if (XLByteLT(preState->blockparse.blockhead.end_ptr, g_redoWorker->nextPrunePtr)) { + ReleaseBlockParseStateIfNotReplay(preState); + return; + } + + HTAB *hashMap = g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl->hTab; + XLogRecParseType type = GetCurrentXLogRecParseType(preState); + if (type == PARSE_TYPE_DDL && WaitPrimaryDoCheckpointAndAllPRTrackEmpty(preState, hashMap)) { + ReleaseBlockParseStateIfNotReplay(preState); + return; + } switch (preState->blockparse.blockhead.block_valid) { case BLOCK_DATA_MAIN_DATA_TYPE: @@ -1105,14 +1452,12 @@ void PageManagerRedoParseState(XLogRecParseState *preState) case 
BLOCK_DATA_VM_TYPE: case BLOCK_DATA_FSM_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); - PRTrackAddBlock(preState, hashMap); + PRTrackAddBlock(preState, hashMap, false); + g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->currentHtabBlockNum++; SetCompletedReadEndPtr(g_redoWorker, preState->blockparse.blockhead.start_ptr, preState->blockparse.blockhead.end_ptr); -#ifdef USE_ASSERT_CHECKING - DoRecordCheck(preState, InvalidXLogRecPtr, false); -#endif - DereferenceRedoItem(item); // for less ondmeand recovery memory consume - preState->refrecord = NULL; + SSReleaseRefRecordWithoutReplay(preState); + g_redoWorker->redoItemHashCtrl->maxRedoItemPtr = preState->blockparse.blockhead.end_ptr; CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); break; case BLOCK_DATA_DDL_TYPE: @@ -1122,7 +1467,7 @@ void PageManagerRedoParseState(XLogRecParseState *preState) break; case BLOCK_DATA_SEG_EXTEND: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); - OnDemandPageManagerRedoSegParseState(preState); + OnDemandDispatchSegParseStateToSegWorker(preState); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); break; case BLOCK_DATA_DROP_DATABASE_TYPE: @@ -1136,7 +1481,7 @@ void PageManagerRedoParseState(XLogRecParseState *preState) break; case BLOCK_DATA_CREATE_DATABASE_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); - RedoPageManagerDistributeBlockRecord(hashMap, NULL); + RedoPageManagerDistributeBlockRecord(NULL); /* wait until queue empty */ WaitCurrentPipeLineRedoWorkersQueueEmpty(); /* do atcual action */ @@ -1146,68 +1491,97 @@ void PageManagerRedoParseState(XLogRecParseState *preState) case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: case BLOCK_DATA_SEG_FULL_SYNC_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); - OnDemandPageManagerRedoSegParseState(preState); + OnDemandDispatchSegParseStateToSegWorker(preState); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); break; case 
BLOCK_DATA_CREATE_TBLSPC_TYPE: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_7]); - PageManagerProcCreateTableSpace(hashMap, preState); + PageManagerProcCreateTableSpace(preState); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_7]); break; case BLOCK_DATA_XLOG_COMMON_TYPE: - PageManagerProcCheckPoint(hashMap, preState); + PageManagerProcCheckPoint(preState); break; case BLOCK_DATA_NEWCU_TYPE: - RedoPageManagerDistributeBlockRecord(hashMap, NULL); + RedoPageManagerDistributeBlockRecord(NULL); PageManagerDistributeBcmBlock(preState); break; case BLOCK_DATA_SEG_SPACE_DROP: case BLOCK_DATA_SEG_SPACE_SHRINK: GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); - RedoPageManagerDistributeBlockRecord(hashMap, preState); + RedoPageManagerDistributeBlockRecord(preState); WaitCurrentPipeLineRedoWorkersQueueEmpty(); RedoPageManagerSyncDdlAction(preState); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); break; case BLOCK_DATA_BARRIER_TYPE: - RedoPageManagerDistributeBlockRecord(hashMap, preState); + RedoPageManagerDistributeBlockRecord(preState); WaitNextBarrier(preState); break; default: XLogBlockParseStateRelease(preState); break; } + OndemandSwitchHTABIfBlockNumUpperLimit(); } +#ifdef USE_ASSERT_CHECKING +static void OndemandCheckHashMapDistributeDone() +{ + Assert(g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId] == + g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl); + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + HTAB *hashMap = g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl->hTab; + hash_seq_init(&status, hashMap); + + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + XLogRecParseState *procState = redoItemEntry->head; + while (procState != NULL) { + XLogRecParseState *nextState = (XLogRecParseState *)procState->nextrecord; + Assert(procState->distributeStatus != XLOG_NO_DISTRIBUTE); + if 
(nextState != NULL) { + Assert(XLByteLE(procState->blockparse.blockhead.end_ptr, nextState->blockparse.blockhead.end_ptr)); + } + procState = nextState; + } + } +} +#endif + bool PageManagerRedoDistributeItems(void **eleArry, uint32 eleNum) { - HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; - for (uint32 i = 0; i < eleNum; i++) { if (eleArry[i] == (void *)&g_redoEndMark) { - RedoPageManagerDistributeBlockRecord(hashMap, NULL); +#ifdef USE_ASSERT_CHECKING + OndemandCheckHashMapDistributeDone(); +#endif return true; } else if (eleArry[i] == (void *)&g_GlobalLsnForwarder) { PageManagerProcLsnForwarder((RedoItem *)eleArry[i]); - RedoPageManagerDistributeBlockRecord(hashMap, NULL); continue; } else if (eleArry[i] == (void *)&g_cleanupMark) { PageManagerProcCleanupMark((RedoItem *)eleArry[i]); continue; } else if (eleArry[i] == (void *)&g_closefdMark) { - smgrcloseall(); + PageManagerProcClosefdMark((RedoItem *)eleArry[i]); continue; } else if (eleArry[i] == (void *)&g_cleanInvalidPageMark) { - forget_range_invalid_pages((void *)eleArry[i]); + PageManagerProcCleanInvalidPageMark((RedoItem *)eleArry[i]); + continue; + } else if (eleArry[i] == (void *)&g_hashmapPruneMark) { + PageManagerProcHashmapPrune(); continue; } else if (eleArry[i] == (void *)&g_forceDistributeMark) { + Assert(!SS_ONDEMAND_REALTIME_BUILD_NORMAL); // double check - if (OndemandXLogParseMemApproachLimit()) { - RedoPageManagerDistributeBlockRecord(hashMap, NULL); + if (SS_ONDEMAND_RECOVERY_HASHMAP_FULL) { ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("[On-demand] Parse buffer num approach critical value, distribute block record by force," " slotid %d, usedblknum %d, totalblknum %d", g_redoWorker->slotId, - g_dispatcher->parseManager.memctl.usedblknum, g_dispatcher->parseManager.memctl.totalblknum))); + pg_atomic_read_u32(&g_dispatcher->parseManager.memctl.usedblknum), + g_dispatcher->parseManager.memctl.totalblknum))); + RedoPageManagerDistributeBlockRecord(NULL); } 
continue; } @@ -1239,7 +1613,7 @@ void RedoPageManagerMain() uint32 eleNum; (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); - g_parseManager = &(g_dispatcher->parseManager); + g_redoWorker->redoItemHashCtrl = g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId]; GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); while (SPSCBlockingQueueGetAll(g_redoWorker->queue, &eleArry, &eleNum)) { @@ -1281,12 +1655,72 @@ void TrxnManagerProcCleanupMark(RedoItem *cleanupMark) ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]TrxnManagerProcCleanupMark has cleaned InvalidPages"))); } +static void TrxnManagerProcHashMapPrune() +{ + if (SS_ONDEMAND_REALTIME_BUILD_DISABLED || !g_redoWorker->inRealtimeBuild) { + return; + } + + XLogRecPtr prunePtr = pg_atomic_read_u64(&g_dispatcher->ckptRedoPtr); + if (XLByteLT(g_redoWorker->nextPrunePtr, prunePtr)) { + while (!SPSCBlockingQueueIsEmpty(g_dispatcher->trxnQueue)) { + RedoItem *item = (RedoItem *)SPSCBlockingQueueTop(g_dispatcher->trxnQueue); + if (XLByteLT(prunePtr, item->record.EndRecPtr)) { + break; + } + DereferenceRedoItem(item); + SPSCBlockingQueuePop(g_dispatcher->trxnQueue); + } + g_redoWorker->nextPrunePtr = prunePtr; + } +} + +static void TrxnManagerPruneAndDistributeIfRealtimeBuildFailover() +{ + if (SS_ONDEMAND_REALTIME_BUILD_FAILOVER && g_redoWorker->inRealtimeBuild) { + TrxnManagerProcHashMapPrune(); + while (!SPSCBlockingQueueIsEmpty(g_dispatcher->trxnQueue)) { + RedoItem *item = (RedoItem *)SPSCBlockingQueueTop(g_dispatcher->trxnQueue); + AddPageRedoItem(g_dispatcher->trxnLine.redoThd, item); + SPSCBlockingQueuePop(g_dispatcher->trxnQueue); + } + g_redoWorker->inRealtimeBuild = false; + } +} + +static void TrxnManagerPruneIfQueueFullInRealtimeBuild() +{ + while (SS_ONDEMAND_RECOVERY_TRXN_QUEUE_FULL && SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + TrxnManagerProcHashMapPrune(); + RedoInterruptCallBack(); + } +} + +static void TrxnManagerAddTrxnRecord(RedoItem *item, bool 
syncRecord) +{ + if (g_redoWorker->inRealtimeBuild) { + if (syncRecord) { + if (XactHasSegpageRelFiles(&item->record)) { + uint32 expected = 1; + pg_atomic_compare_exchange_u32((volatile uint32 *)&(g_dispatcher->segpageXactDoneFlag), &expected, 0); + } + TrxnManagerProcHashMapPrune(); + DereferenceRedoItem(item); + } else { + AddTrxnHashmap(item); + } + } else { + AddPageRedoItem(g_dispatcher->trxnLine.redoThd, item); + } +} + bool TrxnManagerDistributeItemsBeforeEnd(RedoItem *item) { bool exitFlag = false; if (item == &g_redoEndMark) { exitFlag = true; } else if (item == (RedoItem *)&g_GlobalLsnForwarder) { + TrxnManagerPruneAndDistributeIfRealtimeBuildFailover(); TrxnManagerProcLsnForwarder(item); } else if (item == (RedoItem *)&g_cleanupMark) { TrxnManagerProcCleanupMark(item); @@ -1294,12 +1728,21 @@ bool TrxnManagerDistributeItemsBeforeEnd(RedoItem *item) smgrcloseall(); } else if (item == (void *)&g_cleanInvalidPageMark) { forget_range_invalid_pages((void *)item); + } else if (item == (void *)&g_hashmapPruneMark) { + TrxnManagerProcHashMapPrune(); } else { + if (XLByteLT(item->record.EndRecPtr, g_redoWorker->nextPrunePtr)) { + DereferenceRedoItem(item); + return exitFlag; + } + + bool syncRecord = false; GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); - if (IsCheckPoint(&item->record) || IsTableSpaceDrop(&item->record) || IsTableSpaceCreate(&item->record) || - (IsXactXlog(&item->record) && XactWillRemoveRelFiles(&item->record)) || IsBarrierRelated(&item->record)) { + if (IsTableSpaceDrop(&item->record) || IsTableSpaceCreate(&item->record) || IsBarrierRelated(&item->record) || + (IsXactXlog(&item->record) && XactWillRemoveRelFiles(&item->record))) { uint32 relCount; do { + syncRecord = true; RedoInterruptCallBack(); relCount = pg_atomic_read_u32(&item->record.refcount); } while (relCount != 1); @@ -1309,7 +1752,9 @@ bool TrxnManagerDistributeItemsBeforeEnd(RedoItem *item) TestXLogReaderProbe(UTEST_EVENT_RTO_TRXNMGR_DISTRIBUTE_ITEMS, 
__FUNCTION__, &item->record); #endif - AddPageRedoItem(g_dispatcher->trxnLine.redoThd, item); + TrxnManagerPruneIfQueueFullInRealtimeBuild(); + TrxnManagerPruneAndDistributeIfRealtimeBuildFailover(); + TrxnManagerAddTrxnRecord(item, syncRecord); CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_5]); } return exitFlag; @@ -1520,8 +1965,47 @@ void RedoPageWorkerCheckPoint(const XLogRecParseState *redoblockstate) } } +static void SegWorkerRedoAllSegBlockRecord() +{ + RedoTimeCost timeCost1; + RedoTimeCost timeCost2; + + while (!SPSCBlockingQueueIsEmpty(g_dispatcher->segQueue)) { + XLogRecParseState *segRecord = (XLogRecParseState *)SPSCBlockingQueueTop(g_dispatcher->segQueue); + RedoBufferInfo bufferinfo = {0}; + (void)XLogBlockRedoForExtremeRTO(segRecord, &bufferinfo, false, timeCost1, timeCost2); + if (bufferinfo.pageinfo.page != NULL) { + MarkSegPageRedoChildPageDirty(&bufferinfo); + } + XLogBlockParseStateRelease(segRecord); + SPSCBlockingQueuePop(g_dispatcher->segQueue); + } +} + void PageWorkerProcLsnForwarder(RedoItem *lsnForwarder) { + (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); + + /* wait all worker proc done */ + uint32 refCount; + do { + refCount = pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); + RedoInterruptCallBack(); + } while (refCount != 0); + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); +} + +void SegWorkerProcLsnForwarder(RedoItem *lsnForwarder) +{ + uint32 refCount; + do { + refCount = pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); + RedoInterruptCallBack(); + } while (refCount != 1); + + // prune done, redo all seg block record + SegWorkerRedoAllSegBlockRecord(); + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); } @@ -1592,7 +2076,7 @@ bool checkBlockRedoDoneFromHashMapAndLock(LWLock **lock, RedoItemTag redoItemTag { 
bool hashFound = false; uint32 id = GetSlotId(redoItemTag.rNode, 0, 0, GetBatchCount()); - HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[id]; + HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[id]->hTab; if (hashMap == NULL) { ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("redo item hash table corrupted, there has invalid hashtable."))); @@ -1877,6 +2361,7 @@ void StartupSendFowarder(RedoItem *item) } AddPageRedoItem(g_dispatcher->trxnLine.managerThd, item); + AddPageRedoItem(g_dispatcher->auxiliaryLine.ctrlThd, item); } void SendLsnFowarder() @@ -2266,11 +2751,6 @@ void XLogReadPageWorkerMain() InitXLogRecordReadBuffer(&xlogreader); pg_atomic_write_u32(&(g_recordbuffer->readPageWorkerState), WORKER_STATE_RUN); - if (IsRecoveryDone()) { - t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM; - t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM; - pg_atomic_write_u32(&(g_recordbuffer->readSource), XLOG_FROM_STREAM); - } XLogRecord *record = XLogParallelReadNextRecord(xlogreader); while (record != NULL) { @@ -2573,6 +3053,188 @@ void StartRequestXLogFromStream() } } +void SegWorkerMain() +{ + (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); + + XLogRecParseState *redoblockstateHead = NULL; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + while ((redoblockstateHead = (XLogRecParseState *)SPSCBlockingQueueTop(g_redoWorker->queue)) != + (XLogRecParseState *)&g_redoEndMark) { + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); + if ((void *)redoblockstateHead == (void *)&g_cleanupMark) { + g_redoWorker->xlogInvalidPages = XLogGetInvalidPages(); + SPSCBlockingQueuePop(g_redoWorker->queue); + ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]SegWorkerMain has cleaned InvalidPages"))); + continue; + } else if ((void *)redoblockstateHead == (void *)&g_closefdMark) { + smgrcloseall(); + SPSCBlockingQueuePop(g_redoWorker->queue); + continue; + 
} else if ((void *)redoblockstateHead == (void *)&g_cleanInvalidPageMark) { + forget_range_invalid_pages((void *)redoblockstateHead); + SPSCBlockingQueuePop(g_redoWorker->queue); + continue; + } else if ((void *)redoblockstateHead == (void *)&g_GlobalLsnForwarder) { + SegWorkerProcLsnForwarder((RedoItem *)redoblockstateHead); + SPSCBlockingQueuePop(g_redoWorker->queue); + continue; + } + + Assert(GetCurrentXLogRecParseType(redoblockstateHead) == PARSE_TYPE_SEG); + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + if (XLByteLT(g_redoWorker->nextPrunePtr, redoblockstateHead->blockparse.blockhead.end_ptr)) { + OnDemandSegWorkerRedoSegParseState(redoblockstateHead); + } else { + ReleaseBlockParseStateIfNotReplay(redoblockstateHead); + } + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3], g_redoWorker->timeCostList[TIME_COST_STEP_4]); + SPSCBlockingQueuePop(g_redoWorker->queue); + SetCompletedReadEndPtr(g_redoWorker, redoblockstateHead->blockparse.blockhead.start_ptr, + redoblockstateHead->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + + RedoInterruptCallBack(); + ADD_ABNORMAL_POSITION(11); + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2], g_redoWorker->timeCostList[TIME_COST_STEP_1]); + } + + SPSCBlockingQueuePop(g_redoWorker->queue); + RedoThrdWaitForExit(g_redoWorker); +} + +static void HashMapManagerProcHashmapPrune(HTAB *redoItemHash, XLogRecPtr prunePtr, bool updateStat) +{ + if (redoItemHash == NULL) { + return; + } + + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + HTAB *curMap = redoItemHash; + hash_seq_init(&status, curMap); + + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + RedoItemHashPruneWithoutLock(curMap, redoItemEntry, prunePtr, updateStat); + } +} + +static void HashMapManagerProcLsnForwarder(RedoItem *lsnForwarder) +{ + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, 
lsnForwarder->record.EndRecPtr); + (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); + uint32 refCount; + do { + refCount = pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); + RedoInterruptCallBack(); + } while (refCount != 0); +} + +void HashMapManagerMain() +{ + (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); + + do { + bool pruneMax = false; + bool updateStat = true; + if (!SPSCBlockingQueueIsEmpty(g_redoWorker->queue)) { + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + RedoItem *item = (RedoItem *)SPSCBlockingQueueTop(g_redoWorker->queue); + if (item == &g_redoEndMark) { + break; + } else if (item == &g_GlobalLsnForwarder) { + HashMapManagerProcLsnForwarder(item); + } else if (item == &g_hashmapPruneMark) { + pruneMax = true; + } + SPSCBlockingQueuePop(g_redoWorker->queue); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + } + + XLogRecPtr ckptRedoPtr = pg_atomic_read_u64(&g_dispatcher->ckptRedoPtr); + // step1: prune seg record queue + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + while ((g_redoWorker->slotId == SEG_PROC_PIPELINE_SLOT) && !SPSCBlockingQueueIsEmpty(g_dispatcher->segQueue)) { + XLogRecParseState *segRecord = (XLogRecParseState *)SPSCBlockingQueueTop(g_dispatcher->segQueue); + if (XLByteLT(ckptRedoPtr, segRecord->blockparse.blockhead.end_ptr)) { + break; + } +#ifdef USE_ASSERT_CHECKING + DoRecordCheck(segRecord, InvalidXLogRecPtr, false); +#endif + XLogBlockParseStateRelease(segRecord); + SPSCBlockingQueuePop(g_dispatcher->segQueue); + } + + // step2: prune idle hashmap + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_2]); + ondemand_htab_ctrl_t *nextHtabCtrl = g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId]; + ondemand_htab_ctrl_t *targetHtabCtrl = g_dispatcher->pageLines[g_redoWorker->slotId].managerThd->redoItemHashCtrl; + while (nextHtabCtrl != targetHtabCtrl) { + 
ondemand_htab_ctrl_t *procHtabCtrl = nextHtabCtrl; + nextHtabCtrl = (ondemand_htab_ctrl_t *)nextHtabCtrl->nextHTabCtrl; + if (XLByteLT(procHtabCtrl->maxRedoItemPtr, ckptRedoPtr)) { + PRTrackAllClear(procHtabCtrl->hTab); + pg_atomic_write_u64(&g_redoWorker->nextPrunePtr, procHtabCtrl->maxRedoItemPtr); + pfree(procHtabCtrl); + g_instance.comm_cxt.predo_cxt.redoItemHashCtrl[g_redoWorker->slotId] = nextHtabCtrl; + } else { + updateStat = false; + break; + } + } + + // step3: prune current hashmap + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2], t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_3]); + if (pruneMax) { + HashMapManagerProcHashmapPrune(nextHtabCtrl->hTab, ckptRedoPtr, updateStat); + pg_atomic_write_u64(&g_redoWorker->nextPrunePtr, ckptRedoPtr); + } + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + + RedoInterruptCallBack(); + ADD_ABNORMAL_POSITION(12); + pg_usleep(500000L); /* 500 ms */ + } while (true); + + SPSCBlockingQueuePop(g_redoWorker->queue); + RedoThrdWaitForExit(g_redoWorker); + } + +void OndemandCtrlWorkerMain() +{ + (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); + + do { + if (!SPSCBlockingQueueIsEmpty(g_redoWorker->queue)) { + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + RedoItem *item = (RedoItem *)SPSCBlockingQueueTop(g_redoWorker->queue); + if (item == &g_redoEndMark) { + break; + } else if (item == &g_GlobalLsnForwarder) { + PageWorkerProcLsnForwarder(item); + } else { + Assert(0); + } + SPSCBlockingQueuePop(g_redoWorker->queue); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + } + + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + OndemandUpdateXLogParseMemUsedBlkNum(); + + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); + OndemandRequestPrimaryDoCkptIfNeed(); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]); + + RedoInterruptCallBack(); + ADD_ABNORMAL_POSITION(13); + 
pg_usleep(500000L); /* 500 ms */ + } while (true); + + SPSCBlockingQueuePop(g_redoWorker->queue); + RedoThrdWaitForExit(g_redoWorker); +} void XLogReadManagerMain() { @@ -2705,6 +3367,15 @@ int RedoMainLoop() case REDO_READ_MNG: XLogReadManagerMain(); break; + case REDO_SEG_WORKER: + SegWorkerMain(); + break; + case REDO_HTAB_MNG: + HashMapManagerMain(); + break; + case REDO_CTRL_WORKER: + OndemandCtrlWorkerMain(); + break; default: break; } @@ -2719,10 +3390,11 @@ int RedoMainLoop() int exitCode = GetDispatcherExitCode(); g_redoWorker->xlogInvalidPages = XLogGetInvalidPages(); g_redoWorker->committingCsnList = XLogReleaseAndGetCommittingCsnList(); + const char *role_name = RedoWokerRole2Str(g_redoWorker->role); ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("worker[%d]: exitcode = %d, total elapsed = %ld", g_redoWorker->id, exitCode, - INSTR_TIME_GET_MICROSEC(endTime)))); + errmsg("worker[%d]: rolename = %s, exitcode = %d, total elapsed = %ld", g_redoWorker->id, role_name, + exitCode, INSTR_TIME_GET_MICROSEC(endTime)))); (void)MemoryContextSwitchTo(g_redoWorker->oldCtx); @@ -2773,6 +3445,15 @@ const char *RedoWokerRole2Str(RedoRole role) case REDO_READ_MNG: return "read_manager"; break; + case REDO_SEG_WORKER: + return "seg_worker"; + break; + case REDO_HTAB_MNG: + return "htab_manager"; + break; + case REDO_CTRL_WORKER: + return "redo_ctrl"; + break; default: return "unkown"; break; @@ -2925,7 +3606,7 @@ void RedoThrdWaitForExit(const PageRedoWorker *wk) WaitPageRedoWorkerReachLastMark(g_dispatcher->pageLines[sd].managerThd); break; case REDO_PAGE_MNG: - DispatchEndMarkToRedoWorkerAndWait(); + PageManagerDispatchEndMarkAndWait(); break; case REDO_PAGE_WORKER: break; /* Don't need to wait for anyone */ @@ -2936,6 +3617,12 @@ void RedoThrdWaitForExit(const PageRedoWorker *wk) break; case REDO_TRXN_WORKER: break; /* Don't need to wait for anyone */ + case REDO_SEG_WORKER: + break; /* Don't need to wait for anyone */ + case REDO_HTAB_MNG: + break; 
/* Don't need to wait for anyone */ + case REDO_CTRL_WORKER: + break; /* Don't need to wait for anyone */ default: break; } @@ -2975,6 +3662,16 @@ void AddPageRedoItem(PageRedoWorker *worker, void *item) SPSCBlockingQueuePut(worker->queue, item); } +static void AddTrxnHashmap(void *item) +{ + SPSCBlockingQueuePut(g_dispatcher->trxnQueue, item); +} + +static void AddSegHashmap(void *item) +{ + SPSCBlockingQueuePut(g_dispatcher->segQueue, item); +} + /* Run from the dispatcher thread. */ bool SendPageRedoEndMark(PageRedoWorker *worker) { @@ -3101,4 +3798,102 @@ void BatchClearRecoveryThreadHashTbl(Oid spcNode, Oid dbNode) return; } +static bool OndemandNeedHandleSyncRecord() +{ + return (XLByteLT(pg_atomic_read_u64(&g_dispatcher->ckptRedoPtr), pg_atomic_read_u64(&g_dispatcher->syncRecordPtr))); +} + +static XLogRecPtr RequestPrimaryCkptAndUpdateCkptRedoPtr() +{ + XLogRecPtr ckptRedoPtr = SSOndemandRequestPrimaryCkptAndGetRedoLsn(); + UpdateCheckpointRedoPtrForPrune(ckptRedoPtr); + return ckptRedoPtr; +} + +static void OndemandPauseRedoAndRequestPrimaryDoCkpt(OndemandCheckPauseCB activatePauseFunc, + OndemandCheckPauseCB inactivatePauseFunc, OndemandRefreshPauseStatusCB refreshPauseStatusFunc, + ondemand_recovery_pause_status_t pauseState) +{ + while (activatePauseFunc() && SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + g_instance.dms_cxt.SSRecoveryInfo.ondemand_recovery_pause_status = pauseState; + (void)RequestPrimaryCkptAndUpdateCkptRedoPtr(); + + if ((inactivatePauseFunc != NULL) && !inactivatePauseFunc()) { + break; + } + + if (refreshPauseStatusFunc != NULL) { + refreshPauseStatusFunc(); + } + + RedoInterruptCallBack(); + pg_usleep(100000L); /* 100 ms */ + } + g_instance.dms_cxt.SSRecoveryInfo.ondemand_recovery_pause_status = NOT_PAUSE; +} + +void OndemandRequestPrimaryDoCkptIfNeed() +{ + if (!SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + return; + } + + /* check whether parse mem is not enough */ + OndemandPauseRedoAndRequestPrimaryDoCkpt(&OndemandXLogParseMemApproachFull, 
&OndemandXLogParseMemFull, + &OndemandUpdateXLogParseMemUsedBlkNum, PAUSE_FOR_PRUNE_HASHMAP); + + /* check whether trxn record queue is full */ + OndemandPauseRedoAndRequestPrimaryDoCkpt(&OndemandTrxnQueueFullInRealtimeBuild, NULL, NULL, + PAUSE_FOR_PRUNE_TRXN_QUEUE); + + /* check whether seg record queue is full */ + OndemandPauseRedoAndRequestPrimaryDoCkpt(&OndemandSegQueueFullInRealtimeBuild, NULL, NULL, + PAUSE_FOR_PRUNE_SEG_QUEUE); + + /* check whether redo workers need handle sync record */ + OndemandPauseRedoAndRequestPrimaryDoCkpt(&OndemandNeedHandleSyncRecord, NULL, NULL, + PAUSE_FOR_SYNC_REDO); +} + +bool SSXLogParseRecordNeedReplayInOndemandRealtimeBuild(XLogRecParseState *redoblockstate) +{ + XLogRecPtr ckptRedoPtr = g_redoWorker->nextPrunePtr; + if (XLByteLT(redoblockstate->blockparse.blockhead.end_ptr, ckptRedoPtr) || SS_ONDEMAND_REALTIME_BUILD_SHUTDOWN) { + return false; + } + return true; +} + +void GetOndemandRecoveryStatus(ondemand_recovery_stat *stat) +{ + if (IsExtremeRtoRunning()) { + XLogRecPtr tmpStart = MAX_XLOG_REC_PTR; + XLogRecPtr tmpEnd = MAX_XLOG_REC_PTR; + OndemandUpdateXLogParseMemUsedBlkNum(); + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; ++i) { + if (g_dispatcher->allWorkers[i]->role == REDO_READ_PAGE_WORKER) { + GetCompletedReadEndPtr(g_dispatcher->allWorkers[i], &tmpStart, &tmpEnd); + break; + } + } + stat->checkpointPtr = pg_atomic_read_u64(&g_dispatcher->ckptRedoPtr); + stat->replayedPtr = tmpEnd; + stat->hmpUsedBlkNum = pg_atomic_read_u32(&g_dispatcher->parseManager.memctl.usedblknum); + stat->hmpTotalBlkNum = g_dispatcher->parseManager.memctl.totalblknum; + stat->trxnQueueNum = SPSCGetQueueCount(g_dispatcher->trxnQueue); + stat->segQueueNum = SPSCGetQueueCount(g_dispatcher->segQueue); + } else { + stat->checkpointPtr = InvalidXLogRecPtr; + stat->replayedPtr = InvalidXLogRecPtr; + stat->hmpUsedBlkNum = 0; + stat->hmpTotalBlkNum = 0; + stat->trxnQueueNum = 0; + stat->segQueueNum = 0; + } + stat->inOndemandRecovery = 
SS_IN_ONDEMAND_RECOVERY; + stat->ondemandRecoveryStatus = g_instance.dms_cxt.SSRecoveryInfo.cluster_ondemand_status; + stat->realtimeBuildStatus = g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status; + stat->recoveryPauseStatus = g_instance.dms_cxt.SSRecoveryInfo.ondemand_recovery_pause_status; +} + } // namespace ondemand_extreme_rto \ No newline at end of file diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp index c4b9bb5cd..6c3132cae 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp @@ -28,6 +28,7 @@ #include "access/ondemand_extreme_rto/redo_utils.h" #include "access/ondemand_extreme_rto/xlog_read.h" #include "storage/lock/lwlock.h" +#include "catalog/storage_xlog.h" /* * Add xlog reader private structure for page read. @@ -95,11 +96,38 @@ void *OndemandXLogMemCtlInit(RedoMemManager *memctl, Size itemsize, int itemnum) return (void *)t_thrd.storage_cxt.ondemandXLogMem; } +static RedoMemSlot *OndemandGlobalXLogMemAlloc() +{ + RedoMemManager *glbmemctl = &ondemand_extreme_rto::g_dispatcher->parseManager.memctl; + Buffer firstfreebuffer = AtomicReadBuffer(&glbmemctl->firstfreeslot); + while (firstfreebuffer != InvalidBuffer) { + RedoMemSlot *firstfreeslot = &glbmemctl->memslot[firstfreebuffer - 1]; + Buffer nextfreebuffer = firstfreeslot->freeNext; + if (AtomicCompareExchangeBuffer(&glbmemctl->firstfreeslot, &firstfreebuffer, nextfreebuffer)) { + firstfreeslot->freeNext = InvalidBuffer; + return firstfreeslot; + } + firstfreebuffer = AtomicReadBuffer(&glbmemctl->firstfreeslot); + } + return NULL; +} + +static void OndemandGlobalXLogMemReleaseIfNeed(RedoMemManager *memctl) +{ + RedoMemManager *glbmemctl = &ondemand_extreme_rto::g_dispatcher->parseManager.memctl; + if (AtomicReadBuffer(&glbmemctl->firstfreeslot) == 
InvalidBuffer) { + Buffer firstreleaseslot = AtomicExchangeBuffer(&memctl->firstreleaseslot, InvalidBuffer); + Buffer invalidbuffer = InvalidBuffer; + if (!AtomicCompareExchangeBuffer(&glbmemctl->firstfreeslot, &invalidbuffer, firstreleaseslot)) { + AtomicWriteBuffer(&memctl->firstreleaseslot, firstreleaseslot); + } + } +} + RedoMemSlot *OndemandXLogMemAlloc(RedoMemManager *memctl) { RedoMemSlot *nextfreeslot = NULL; do { - LWLockAcquire(OndemandXLogMemAllocLock, LW_EXCLUSIVE); if (memctl->firstfreeslot == InvalidBuffer) { memctl->firstfreeslot = AtomicExchangeBuffer(&memctl->firstreleaseslot, InvalidBuffer); pg_read_barrier(); @@ -108,17 +136,19 @@ RedoMemSlot *OndemandXLogMemAlloc(RedoMemManager *memctl) if (memctl->firstfreeslot != InvalidBuffer) { nextfreeslot = &(memctl->memslot[memctl->firstfreeslot - 1]); memctl->firstfreeslot = nextfreeslot->freeNext; - memctl->usedblknum++; nextfreeslot->freeNext = InvalidBuffer; } - LWLockRelease(OndemandXLogMemAllocLock); + + if (nextfreeslot == NULL) { + nextfreeslot = OndemandGlobalXLogMemAlloc(); + } if (memctl->doInterrupt != NULL) { memctl->doInterrupt(); } - } while (nextfreeslot == NULL); + pg_atomic_fetch_add_u32(&memctl->usedblknum, 1); return nextfreeslot; } @@ -132,14 +162,14 @@ void OndemandXLogMemRelease(RedoMemManager *memctl, Buffer bufferid) } bufferslot = &(memctl->memslot[bufferid - 1]); Assert(bufferslot->freeNext == InvalidBuffer); - LWLockAcquire(OndemandXLogMemAllocLock, LW_EXCLUSIVE); Buffer oldFirst = AtomicReadBuffer(&memctl->firstreleaseslot); pg_memory_barrier(); do { AtomicWriteBuffer(&bufferslot->freeNext, oldFirst); } while (!AtomicCompareExchangeBuffer(&memctl->firstreleaseslot, &oldFirst, bufferid)); - memctl->usedblknum--; - LWLockRelease(OndemandXLogMemAllocLock); + pg_atomic_fetch_sub_u32(&memctl->usedblknum, 1); + + OndemandGlobalXLogMemReleaseIfNeed(memctl); } @@ -344,6 +374,73 @@ bool IsTargetBlockState(XLogRecParseState *targetblockstate, XLogRecParseState* return true; } 
+XLogRecParseType GetCurrentXLogRecParseType(XLogRecParseState *preState) +{ + XLogRecParseType type; + switch (preState->blockparse.blockhead.block_valid) { + case BLOCK_DATA_MAIN_DATA_TYPE: + case BLOCK_DATA_UNDO_TYPE: + case BLOCK_DATA_VM_TYPE: + case BLOCK_DATA_FSM_TYPE: + type = PARSE_TYPE_DATA; + break; + case BLOCK_DATA_SEG_EXTEND: + case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: + type = PARSE_TYPE_SEG; + break; + case BLOCK_DATA_SEG_FULL_SYNC_TYPE: + { + uint8 recordType = XLogBlockHeadGetInfo(&preState->blockparse.blockhead) & ~XLR_INFO_MASK; + if (unlikely((recordType == XLOG_SEG_CREATE_EXTENT_GROUP) || (recordType == XLOG_SEG_NEW_PAGE))) { + type = PARSE_TYPE_DDL; + } else { + type = PARSE_TYPE_SEG; + } + break; + } + + default: + type = PARSE_TYPE_DDL; + break; + } + + return type; +} + +static bool IsRecParseStateHaveChildState(XLogRecParseState *checkState) +{ + if (GetCurrentXLogRecParseType(checkState) == PARSE_TYPE_SEG) { + uint8 info = XLogBlockHeadGetInfo(&checkState->blockparse.blockhead) & ~XLR_INFO_MASK; + if ((info == XLOG_SEG_ATOMIC_OPERATION) || (info == XLOG_SEG_SEGMENT_EXTEND) || + (info == XLOG_SEG_INIT_MAPPAGE) || (info == XLOG_SEG_INIT_INVRSPTR_PAGE) || + (info == XLOG_SEG_ADD_NEW_GROUP)) { + return true; + } + } + return false; +} + +static XLogRecParseState *OndemandFindTargetBlockStateInOndemandRedo(XLogRecParseState *checkState, + XLogRecParseState *srcState) +{ + Assert(!IsRecParseStateHaveChildState(checkState)); + XLogRecParseState *nextState = checkState; + XLogRecParseState *targetState = NULL; + do { + XLogRecParseState *preState = nextState; + nextState = (XLogRecParseState *)nextState->nextrecord; + preState->nextrecord = NULL; + + if (IsTargetBlockState(preState, srcState)) { + targetState = preState; + } else { + OndemandXLogParseBufferRelease(preState); + } + } while (nextState != NULL); + + return targetState; +} + // only used in ondemand redo stage XLogRecParseState *OndemandRedoReloadXLogRecord(XLogRecParseState 
*redoblockstate) { @@ -362,9 +459,15 @@ XLogRecParseState *OndemandRedoReloadXLogRecord(XLogRecParseState *redoblockstat true, g_instance.dms_cxt.SSRecoveryInfo.recovery_xlog_dir); if (record == NULL) { ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("[On-demand] reload xlog record failed at %X/%X, errormsg: %s", - (uint32)(redoblockstate->blockparse.blockhead.start_ptr >> 32), - (uint32)redoblockstate->blockparse.blockhead.start_ptr, errormsg))); + errmsg("[On-demand] reload xlog record failed at %X/%X, spc/db/rel/bucket " + "fork-block: %u/%u/%u/%d %d-%u, errormsg: %s", + (uint32)(recordBlockState->blockparse.blockhead.start_ptr >> 32), + (uint32)recordBlockState->blockparse.blockhead.start_ptr, + recordBlockState->blockparse.blockhead.spcNode, recordBlockState->blockparse.blockhead.dbNode, + recordBlockState->blockparse.blockhead.relNode, + recordBlockState->blockparse.blockhead.bucketNode, + recordBlockState->blockparse.blockhead.forknum, recordBlockState->blockparse.blockhead.blkno, + errormsg))); } // step2: parse to block @@ -377,20 +480,18 @@ XLogRecParseState *OndemandRedoReloadXLogRecord(XLogRecParseState *redoblockstat } while (true); // step3: find target parse state - XLogRecParseState *nextState = recordBlockState; - XLogRecParseState *targetState = NULL; - do { - XLogRecParseState *preState = nextState; - nextState = (XLogRecParseState *)nextState->nextrecord; - preState->nextrecord = NULL; - - if (IsTargetBlockState(preState, redoblockstate)) { - targetState = preState; - } else { - OndemandXLogParseBufferRelease(preState); - } - } while (nextState != NULL); - + XLogRecParseState *targetState = OndemandFindTargetBlockStateInOndemandRedo(recordBlockState, redoblockstate); + if (targetState == NULL) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[On-demand] reload xlog record failed at %X/%X, spc/db/rel/bucket " + "fork-block: %u/%u/%u/%d %d-%u, errormsg: can not find target block-record", + 
(uint32)(recordBlockState->blockparse.blockhead.start_ptr >> 32), + (uint32)recordBlockState->blockparse.blockhead.start_ptr, + recordBlockState->blockparse.blockhead.spcNode, recordBlockState->blockparse.blockhead.dbNode, + recordBlockState->blockparse.blockhead.relNode, + recordBlockState->blockparse.blockhead.bucketNode, + recordBlockState->blockparse.blockhead.forknum, recordBlockState->blockparse.blockhead.blkno))); + } return targetState; } @@ -410,3 +511,42 @@ void OnDemandWaitRedoFinish() { ondemand_extreme_rto::WaitRedoFinish(); } + +void OnDemandWaitRealtimeBuildShutDown() +{ + ondemand_extreme_rto::WaitRealtimeBuildShutdown(); +} + +void OnDemandUpdateRealtimeBuildPrunePtr() +{ + ondemand_extreme_rto::UpdateCheckpointRedoPtrForPrune(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo); +} + +XLogRecPtr GetRedoLocInCheckpointRecord(XLogReaderState *record) +{ + CheckPoint checkPoint; + CheckPointUndo checkPointUndo; + errno_t rc; + + Assert(IsCheckPoint(record)); + + if (XLogRecGetDataLen(record) >= sizeof(checkPoint) && XLogRecGetDataLen(record) < sizeof(checkPointUndo)) { + rc = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(record), sizeof(CheckPoint)); + securec_check(rc, "", ""); + } else if (XLogRecGetDataLen(record) >= sizeof(checkPointUndo)) { + rc = memcpy_s(&checkPointUndo, sizeof(CheckPointUndo), XLogRecGetData(record), sizeof(CheckPointUndo)); + securec_check(rc, "", ""); + checkPoint = checkPointUndo.ori_checkpoint; + } + return checkPoint.redo; +} + +void WaitUntilRealtimeBuildStatusToFailoverAndUpdatePrunePtr() +{ + while (SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + pg_usleep(100000L); /* 100 ms */ + } + Assert(SS_ONDEMAND_REALTIME_BUILD_FAILOVER); + ondemand_extreme_rto::g_redoWorker->nextPrunePtr = + pg_atomic_read_u64(&ondemand_extreme_rto::g_dispatcher->ckptRedoPtr); +} diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp 
b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp index 075e17027..321b01a54 100644 --- a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp @@ -600,6 +600,7 @@ err: XLogRecord *XLogParallelReadNextRecord(XLogReaderState *xlogreader) { XLogRecord *record = NULL; + int retry = 0; /* This is the first try to read this page. */ t_thrd.xlog_cxt.failedSources = 0; @@ -618,7 +619,7 @@ XLogRecord *XLogParallelReadNextRecord(XLogReaderState *xlogreader) * In StandbyMode that only happens if we have been triggered, so * we shouldn't loop anymore in that case. */ - if (errormsg != NULL) + if (errormsg != NULL && ++retry > 3) ereport(emode_for_corrupt_record(LOG, t_thrd.xlog_cxt.EndRecPtr), (errmsg_internal("%s", errormsg) /* already translated */)); } @@ -659,42 +660,21 @@ XLogRecord *XLogParallelReadNextRecord(XLogReaderState *xlogreader) /* No valid record available from this source */ t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource; - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - t_thrd.xlog_cxt.readFile = -1; + /* In ondemand realtime build mode, loop back to retry. Otherwise, give up. */ + if (SS_ONDEMAND_REALTIME_BUILD_NORMAL) { + xlogreader->preReadStartPtr = InvalidXlogPreReadStartPtr; + retry = 0; } - /* - * If archive recovery was requested, but we were still doing - * crash recovery, switch to archive recovery and retry using the - * offline archive. We have now replayed all the valid WAL in - * pg_xlog, so we are presumably now consistent. - * - * We require that there's at least some valid WAL present in - * pg_xlog, however (!fetch_ckpt). We could recover using the WAL - * from the archive, even if pg_xlog is completely empty, but we'd - * have no idea how far we'd have to replay to reach consistency. - * So err on the safe side and give up. 
- */ - if (!t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.xlog_cxt.ArchiveRecoveryRequested) { - t_thrd.xlog_cxt.InArchiveRecovery = true; - if (t_thrd.xlog_cxt.StandbyModeRequested) - t_thrd.xlog_cxt.StandbyMode = true; - /* construct a minrecoverypoint, update LSN */ - UpdateMinrecoveryInAchive(); - /* - * Before we retry, reset lastSourceFailed and currentSource - * so that we will check the archive next. - */ - t_thrd.xlog_cxt.failedSources = 0; + if (retry <= 3) { continue; - } - - /* In standby mode, loop back to retry. Otherwise, give up. */ - if (t_thrd.xlog_cxt.StandbyMode && !t_thrd.xlog_cxt.recoveryTriggered && !DoEarlyExit()) - continue; - else + } else { + if (t_thrd.xlog_cxt.readFile >= 0) { + close(t_thrd.xlog_cxt.readFile); + t_thrd.xlog_cxt.readFile = -1; + } return NULL; + } } } } diff --git a/src/gausskernel/storage/access/transam/xlog.cpp b/src/gausskernel/storage/access/transam/xlog.cpp index e7b813bd6..86d558dce 100755 --- a/src/gausskernel/storage/access/transam/xlog.cpp +++ b/src/gausskernel/storage/access/transam/xlog.cpp @@ -56,6 +56,7 @@ #include "access/xlogproc.h" #include "access/parallel_recovery/dispatcher.h" #include "access/extreme_rto/page_redo.h" +#include "access/ondemand_extreme_rto/page_redo.h" #include "commands/tablespace.h" #include "commands/matview.h" @@ -5653,6 +5654,10 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, in t_thrd.xlog_cxt.readFile = -1; } + if (xlogreader->preReadBuf != NULL) { + xlogreader->preReadStartPtr = InvalidXlogPreReadStartPtr; + } + /* * If archive recovery was requested, but we were still doing * crash recovery, switch to archive recovery and retry using the @@ -5708,7 +5713,8 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, in ProcTxnWorkLoad(false); /* In standby mode, loop back to retry. Otherwise, give up. 
*/ - if (t_thrd.xlog_cxt.StandbyMode && !dummyStandbyMode && !t_thrd.xlog_cxt.recoveryTriggered) { + if ((t_thrd.xlog_cxt.StandbyMode && !dummyStandbyMode && !t_thrd.xlog_cxt.recoveryTriggered) || + SS_ONDEMAND_REALTIME_BUILD_NORMAL) { continue; } else { if (u_sess->attr.attr_storage.HaModuleDebug) { @@ -9289,7 +9295,7 @@ void StartupXLOG(void) ereport(LOG, (errmsg("[On-demand]: Ondemand recovery do not finish in last reform, " "reading control file of original primary:%d", src_id))); SSOndemandRecoveryExitNormal = false; - } else if (SS_DORADO_CLUSTER) { + } else if (SS_DORADO_CLUSTER || SS_ONDEMAND_REALTIME_BUILD_READY_TO_BUILD) { src_id = SSGetPrimaryInstId(); } else { if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { @@ -9507,7 +9513,7 @@ void StartupXLOG(void) } xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER); close_readFile_if_open(); - if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { + if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING || SS_ONDEMAND_REALTIME_BUILD_READY_TO_BUILD) { // init shared memory set page empty SSCSNLOGShmemClear(); SSCLOGShmemClear(); @@ -9966,7 +9972,7 @@ void StartupXLOG(void) } } - if (SS_STANDBY_MODE && t_thrd.xlog_cxt.InRecovery == true) { + if (SS_STANDBY_MODE && t_thrd.xlog_cxt.InRecovery == true && SS_ONDEMAND_REALTIME_BUILD_DISABLED) { /* do not need replay anything in SS standby mode */ ereport(LOG, (errmsg("[SS] Skip redo replay in standby mode"))); t_thrd.xlog_cxt.InRecovery = false; @@ -9977,16 +9983,7 @@ void StartupXLOG(void) if (SS_PRIMARY_MODE && ENABLE_ONDEMAND_RECOVERY && (SS_STANDBY_FAILOVER || SS_PRIMARY_NORMAL_REFORM) && t_thrd.xlog_cxt.InRecovery == true) { if (SSOndemandRecoveryExitNormal) { - g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = true; - g_instance.dms_cxt.SSRecoveryInfo.cluster_ondemand_status = CLUSTER_IN_ONDEMAND_BUILD; - /* for other nodes in cluster and ondeamnd recovery failed */ - LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - 
g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_IN_ONDEMAND_BUILD; - g_instance.dms_cxt.SSReformerControl.recoveryInstId = g_instance.dms_cxt.SSRecoveryInfo.recovery_inst_id; - SSUpdateReformerCtrl(); - LWLockRelease(ControlFileLock); - SSRequestAllStandbyReloadReformCtrlPage(); - SetOndemandExtremeRtoMode(); + StartupOndemandRecovery(); ereport(LOG, (errmsg("[On-demand] replayed in extreme rto ondemand recovery mode"))); } else { ereport(LOG, (errmsg("[On-demand] do not allow replay in ondemand recovery if last ondemand recovery " @@ -10000,6 +9997,17 @@ void StartupXLOG(void) LWLockRelease(ControlFileLock); } + if (SS_ONDEMAND_REALTIME_BUILD_READY_TO_BUILD) { + t_thrd.xlog_cxt.InRecovery = true; + SetOndemandExtremeRtoMode(); + g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = false; + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = BUILD_NORMAL; + ereport(LOG, (errmsg("[On-demand] realtime build start finish, set status to BUILD_NORMAL"))); + } + + /* refresh recovery parallelism */ + ConfigRecoveryParallelism(); + ReadRemainSegsFile(); /* Determine whether it is currently in the switchover of streaming disaster recovery */ checkHadrInSwitchover(); @@ -10098,7 +10106,7 @@ void StartupXLOG(void) } t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL); /* No need to hold ControlFileLock yet, we aren't up far enough */ - if (!SS_STANDBY_FAILOVER) { + if (!SS_STANDBY_FAILOVER && SS_ONDEMAND_REALTIME_BUILD_DISABLED) { UpdateControlFile(); } @@ -11065,6 +11073,7 @@ void StartupXLOG(void) if (SS_PRIMARY_MODE) { g_instance.dms_cxt.SSRecoveryInfo.cluster_ondemand_status = CLUSTER_NORMAL; + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status = DISABLED; /* for other nodes in cluster */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL; diff --git a/src/gausskernel/storage/access/transam/xlogutils.cpp 
b/src/gausskernel/storage/access/transam/xlogutils.cpp index e3c68c33d..de74c1dc7 100644 --- a/src/gausskernel/storage/access/transam/xlogutils.cpp +++ b/src/gausskernel/storage/access/transam/xlogutils.cpp @@ -1222,7 +1222,15 @@ Buffer XLogReadBufferExtendedForSegpage(const RelFileNode &rnode, ForkNumber for if (BufferIsValid(buffer)) { Page page = BufferGetPage(buffer); - if (mode == RBM_NORMAL) { + /* + * We check not SS_IN_ONDEMAND_RECOVERY for these reasons: + * 1. DMS mode (shared storage) do not support page repair. + * 2. In standby failover, some pages meet replay request which + * are in standby shared memorys, but there DRC are lost in + * last primary node. So use LockBuffer in XLogReadBufferExtendedForSegpage + * will read from DISK and cover these newest pages. + */ + if (mode == RBM_NORMAL && !SS_IN_ONDEMAND_RECOVERY) { bool buffer_is_locked = false; if (ENABLE_DMS && (GetDmsBufCtrl(buffer - 1)->lock_mode == DMS_LOCK_NULL)) { buffer_is_locked = true; diff --git a/src/gausskernel/storage/buffer/bufmgr.cpp b/src/gausskernel/storage/buffer/bufmgr.cpp index 3ffda0e6a..92d8c2e17 100644 --- a/src/gausskernel/storage/buffer/bufmgr.cpp +++ b/src/gausskernel/storage/buffer/bufmgr.cpp @@ -2534,6 +2534,7 @@ Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber fork bool isExtend = false; bool isLocalBuf = SmgrIsTemp(smgr); bool need_repair = false; + dms_buf_ctrl_t *buf_ctrl = NULL; *hit = false; @@ -2605,6 +2606,13 @@ Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber fork } } + if (ENABLE_DMS) { + buf_ctrl = GetDmsBufCtrl(bufHdr->buf_id); + if (mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) { + buf_ctrl->state |= BUF_READ_MODE_ONDEMAND_REALTIME_BUILD; + } + } + found_branch: /* At this point we do NOT hold any locks. 
* @@ -2633,7 +2641,7 @@ found_branch: if (!isLocalBuf) { if (mode == RBM_ZERO_AND_LOCK) { if (ENABLE_DMS) { - GetDmsBufCtrl(bufHdr->buf_id)->state |= BUF_READ_MODE_ZERO_LOCK; + buf_ctrl->state |= BUF_READ_MODE_ZERO_LOCK; LockBuffer(BufferDescriptorGetBuffer(bufHdr), BUFFER_LOCK_EXCLUSIVE); } else { LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE); @@ -2649,7 +2657,7 @@ found_branch: BufferDescSetPBLK(bufHdr, pblk); } else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) { if (ENABLE_DMS) { - GetDmsBufCtrl(bufHdr->buf_id)->state |= BUF_READ_MODE_ZERO_LOCK; + buf_ctrl->state |= BUF_READ_MODE_ZERO_LOCK; } LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); } @@ -2753,7 +2761,6 @@ found_branch: goto found_branch; } - dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(bufHdr->buf_id); LWLockMode req_lock_mode = isExtend ? LW_EXCLUSIVE : LW_SHARED; if (!LockModeCompatible(buf_ctrl, req_lock_mode)) { if (!StartReadPage(bufHdr, req_lock_mode)) { @@ -2783,7 +2790,13 @@ found_branch: break; } while (true); - return TerminateReadPage(bufHdr, mode, pblk); + Buffer tmp_buffer = TerminateReadPage(bufHdr, mode, pblk); + if (BufferIsInvalid(tmp_buffer) && (mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) && + !(buf_ctrl->state & BUF_READ_MODE_ONDEMAND_REALTIME_BUILD)) { + SSUnPinBuffer(bufHdr); + return InvalidBuffer; + } + return tmp_buffer; } ClearReadHint(bufHdr->buf_id); } @@ -3074,7 +3087,7 @@ retry: /* Pin the buffer and then release the buffer spinlock */ PinBuffer_Locked(buf); - if (!SSHelpFlushBufferIfNeed(buf)) { + if (!SSHelpFlushBufferIfNeed(buf) || !SSOndemandRealtimeBuildAllowFlush(buf)) { // for dms this page cannot eliminate, get another one UnpinBuffer(buf, true); continue; @@ -6203,10 +6216,14 @@ retry: if (ENABLE_DMS && mode != BUFFER_LOCK_UNLOCK) { LWLockMode lock_mode = (mode == BUFFER_LOCK_SHARE) ? 
LW_SHARED : LW_EXCLUSIVE; Buffer tmp_buffer; + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buffer - 1); ReadBufferMode read_mode = RBM_NORMAL; - if (lock_mode == LW_EXCLUSIVE && (GetDmsBufCtrl(buffer - 1)->state & BUF_READ_MODE_ZERO_LOCK)) { + if (lock_mode == LW_EXCLUSIVE && (buf_ctrl->state & BUF_READ_MODE_ZERO_LOCK)) { read_mode = RBM_ZERO_AND_LOCK; - GetDmsBufCtrl(buffer - 1)->state &= ~BUF_READ_MODE_ZERO_LOCK; + buf_ctrl->state &= ~BUF_READ_MODE_ZERO_LOCK; + } + if (buf_ctrl->state & BUF_READ_MODE_ONDEMAND_REALTIME_BUILD) { + read_mode = RBM_FOR_ONDEMAND_REALTIME_BUILD; } bool with_io_in_progress = true; @@ -6225,7 +6242,7 @@ retry: TerminateBufferIO(buf, false, 0); } } - + LWLockRelease(buf->content_lock); if (AmDmsReformProcProcess() && dms_reform_failed()) { @@ -6237,6 +6254,11 @@ retry: g_instance.dms_cxt.SSRecoveryInfo.recovery_trapped_in_page_request = true; } + if ((read_mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) && + !(buf_ctrl->state & BUF_READ_MODE_ONDEMAND_REALTIME_BUILD)) { + return; + } + dms_retry_times++; long sleep_time = SSGetBufSleepTime(dms_retry_times); if (sleep_time == SS_BUF_MAX_WAIT_TIME && !SS_IN_REFORM) { @@ -6249,6 +6271,9 @@ retry: tag->forkNum, tag->blockNum, buf->buf_id)))); t_thrd.postgres_cxt.whereToSendOutput = output_backup; } + if (read_mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) { + sleep_time = SS_BUF_WAIT_TIME_IN_ONDEMAND_REALTIME_BUILD; + } pg_usleep(sleep_time); goto retry; } diff --git a/src/gausskernel/storage/lmgr/lwlocknames.txt b/src/gausskernel/storage/lmgr/lwlocknames.txt index c2f253d39..9934949c9 100755 --- a/src/gausskernel/storage/lmgr/lwlocknames.txt +++ b/src/gausskernel/storage/lmgr/lwlocknames.txt @@ -138,7 +138,6 @@ GsStackLock 128 ConfigFileLock 129 DropArchiveSlotLock 130 AboCacheLock 131 -OndemandXLogMemAllocLock 132 OndemandXLogFileHandleLock 133 ExrtoSnapshotLock 134 RedoTruncateLock 135 diff --git a/src/gausskernel/storage/smgr/segment/data_file.cpp b/src/gausskernel/storage/smgr/segment/data_file.cpp index 
f400d2626..32c05c252 100644 --- a/src/gausskernel/storage/smgr/segment/data_file.cpp +++ b/src/gausskernel/storage/smgr/segment/data_file.cpp @@ -528,6 +528,12 @@ void df_extend_internal(SegLogicFile *sf) } SegmentCheck(new_size <= DF_FILE_SLICE_SIZE); + if (fd < 0) { + char *filename = slice_filename(sf->filename, sf->file_num - 1); + int fd = dv_open_file(filename, O_RDONLY | PG_BINARY, SEGMENT_FILE_MODE); + sf->segfiles[sf->file_num - 1].fd = fd; + pfree(filename); + } if (ftruncate(fd, new_size) != 0) { char *filename = slice_filename(sf->filename, sf->file_num - 1); diff --git a/src/gausskernel/storage/smgr/segment/segbuffer.cpp b/src/gausskernel/storage/smgr/segment/segbuffer.cpp index 2b1ad95f1..ef382cb3e 100644 --- a/src/gausskernel/storage/smgr/segment/segbuffer.cpp +++ b/src/gausskernel/storage/smgr/segment/segbuffer.cpp @@ -590,12 +590,20 @@ Buffer ReadSegBufferForDMS(BufferDesc* bufHdr, ReadBufferMode mode, SegSpace *sp Buffer ReadBufferFast(SegSpace *spc, RelFileNode rnode, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode) { bool found = false; + dms_buf_ctrl_t *buf_ctrl; /* Make sure we will have room to remember the buffer pin */ ResourceOwnerEnlargeBuffers(t_thrd.utils_cxt.CurrentResourceOwner); BufferDesc *bufHdr = SegBufferAlloc(spc, rnode, forkNum, blockNum, &found); + if (ENABLE_DMS) { + buf_ctrl = GetDmsBufCtrl(bufHdr->buf_id); + if (mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) { + buf_ctrl->state |= BUF_READ_MODE_ONDEMAND_REALTIME_BUILD; + } + } + if (!found) { SegmentCheck(!(pg_atomic_read_u64(&bufHdr->state) & BM_VALID)); @@ -618,7 +626,6 @@ Buffer ReadBufferFast(SegSpace *spc, RelFileNode rnode, ForkNumber forkNum, Bloc goto found_branch; } - dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(bufHdr->buf_id); LWLockMode lockmode = LW_SHARED; if (!LockModeCompatible(buf_ctrl, lockmode)) { if (!StartReadPage(bufHdr, lockmode)) { @@ -648,7 +655,13 @@ Buffer ReadBufferFast(SegSpace *spc, RelFileNode rnode, ForkNumber forkNum, Bloc break; } while 
(true); - return TerminateReadSegPage(bufHdr, mode, spc); + Buffer tmp_buffer = TerminateReadSegPage(bufHdr, mode, spc); + if (BufferIsInvalid(tmp_buffer) && (mode == RBM_FOR_ONDEMAND_REALTIME_BUILD) && + !(buf_ctrl->state & BUF_READ_MODE_ONDEMAND_REALTIME_BUILD)) { + SSUnPinBuffer(bufHdr); + return InvalidBuffer; + } + return tmp_buffer; } if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK || mode == RBM_ZERO) { @@ -762,7 +775,7 @@ retry: SegPinBufferLocked(buf, &new_tag); - if (!SSHelpFlushBufferIfNeed(buf)) { + if (!SSHelpFlushBufferIfNeed(buf) || !SSOndemandRealtimeBuildAllowFlush(buf)) { SegUnpinBuffer(buf); continue; } diff --git a/src/include/access/extreme_rto_redo_api.h b/src/include/access/extreme_rto_redo_api.h index 13df44646..9b35e4cf4 100644 --- a/src/include/access/extreme_rto_redo_api.h +++ b/src/include/access/extreme_rto_redo_api.h @@ -55,9 +55,6 @@ inline bool IsOndemandExtremeRtoMode() { return (g_extreme_rto_type == ONDEMAND_EXTREME_RTO); } -inline void SetOndemandExtremeRtoMode(); -inline bool IsDefaultExtremeRtoMode(); -inline bool IsOndemandExtremeRtoMode(); void ExtremeWaitAllReplayWorkerIdle(); void ExtremeDispatchCleanInvalidPageMarkToAllRedoWorker(RepairFileKey key); void ExtremeDispatchClosefdMarkToAllRedoWorker(); diff --git a/src/include/access/multi_redo_api.h b/src/include/access/multi_redo_api.h index b3bda3fa7..8f07d35a7 100644 --- a/src/include/access/multi_redo_api.h +++ b/src/include/access/multi_redo_api.h @@ -38,9 +38,12 @@ #ifdef ENABLE_LITE_MODE #define ENABLE_ONDEMAND_RECOVERY false +#define ENABLE_ONDEMAND_REALTIME_BUILD false #else #define ENABLE_ONDEMAND_RECOVERY (ENABLE_DMS && IsExtremeRedo() \ && g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery) +#define ENABLE_ONDEMAND_REALTIME_BUILD (ENABLE_ONDEMAND_RECOVERY \ + && g_instance.attr.attr_storage.dms_attr.enable_ondemand_realtime_build) #endif typedef enum { @@ -98,14 +101,12 @@ inline bool IsParallelRedo() return 
g_instance.comm_cxt.predo_cxt.redoType == PARALLEL_REDO && (get_real_recovery_parallelism() > 1); } - static inline bool IsMultiThreadRedo() { return (get_real_recovery_parallelism() > 1); } uint32 GetRedoWorkerCount(); - bool IsMultiThreadRedoRunning(); void DispatchRedoRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); void GetThreadNameIfMultiRedo(int argc, char* argv[], char** threadNamePtr); diff --git a/src/include/access/multi_redo_settings.h b/src/include/access/multi_redo_settings.h index b097d8c6c..00b08f39d 100644 --- a/src/include/access/multi_redo_settings.h +++ b/src/include/access/multi_redo_settings.h @@ -56,6 +56,7 @@ static const int MAX_REDO_WORKERS_PER_PARSE = 8; static const int TRXN_REDO_MANAGER_NUM = 1; static const int TRXN_REDO_WORKER_NUM = 1; static const int XLOG_READER_NUM = 3; +static const int ONDEMAND_AUXILIARY_WORKER_NUM = 2; // segredoworker and ctrlworker, only for ondemand recovery static const int MAX_EXTREME_THREAD_NUM = MAX_PARSE_WORKERS * MAX_REDO_WORKERS_PER_PARSE + MAX_PARSE_WORKERS + MAX_PARSE_WORKERS + TRXN_REDO_MANAGER_NUM + TRXN_REDO_WORKER_NUM + XLOG_READER_NUM; diff --git a/src/include/access/ondemand_extreme_rto/batch_redo.h b/src/include/access/ondemand_extreme_rto/batch_redo.h index 5abde5754..15b8966a3 100644 --- a/src/include/access/ondemand_extreme_rto/batch_redo.h +++ b/src/include/access/ondemand_extreme_rto/batch_redo.h @@ -69,9 +69,13 @@ typedef struct redoitemhashentry { } RedoItemHashEntry; extern void PRPrintRedoItemHashTab(HTAB *redoItemHash); -extern HTAB **PRRedoItemHashInitialize(MemoryContext context); -extern void PRTrackClearBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); -extern void PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); +extern ondemand_htab_ctrl_t *PRRedoItemHashInitialize(MemoryContext context); +extern ondemand_htab_ctrl_t **PRInitRedoItemHashForAllPipeline(MemoryContext context); +extern void 
PRTrackClearBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); +extern void PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash, bool isHead = false); +extern void PRTrackAddBatchBlock(XLogRecParseState *headBlockState, XLogRecParseState *tailBlockState, int count, + HTAB *redoItemHash, bool isHead); +extern void PRTrackAllClear(HTAB *redoItemHash); extern uint32 XlogTrackTableHashCode(RedoItemTag *tagPtr); } // namespace ondemand_extreme_rto diff --git a/src/include/access/ondemand_extreme_rto/dispatcher.h b/src/include/access/ondemand_extreme_rto/dispatcher.h index 70a216f6a..224aebe88 100644 --- a/src/include/access/ondemand_extreme_rto/dispatcher.h +++ b/src/include/access/ondemand_extreme_rto/dispatcher.h @@ -43,6 +43,7 @@ namespace ondemand_extreme_rto { typedef struct { PageRedoWorker *batchThd; /* BatchRedoThread */ PageRedoWorker *managerThd; /* PageRedoManager */ + PageRedoWorker *htabThd; /* HashMapManager */ PageRedoWorker **redoThd; /* RedoThreadPool */ uint32 redoThdNum; uint32 *chosedRTIds; /* chosedRedoThdIds */ @@ -60,6 +61,11 @@ typedef struct ReadPipeline { PageRedoWorker *readThd; /* readthrd */ } ReadPipeline; +typedef struct AuxiliaryPipeLine { + PageRedoWorker *segRedoThd; + PageRedoWorker *ctrlThd; +} AuxiliaryPipeLine; + #define MAX_XLOG_READ_BUFFER (0xFFFFF) /* 8k uint */ typedef enum { @@ -130,6 +136,7 @@ typedef struct { uint32 chosedPLCnt; /* chosedPageLineCount */ TrxnRedoPipeline trxnLine; ReadPipeline readLine; + AuxiliaryPipeLine auxiliaryLine; RecordBufferState rtoXlogBufState; PageRedoWorker **allWorkers; /* Array of page redo workers. 
*/ uint32 allWorkersCnt; @@ -166,6 +173,11 @@ typedef struct { volatile XLogRedoNumStatics xlogStatics[RM_NEXT_ID][MAX_XLOG_INFO_NUM]; RedoTimeCost *startupTimeCost; RedoParseManager parseManager; + /* used in realtime ondemand extreme rto */ + volatile XLogRecPtr ckptRedoPtr; + volatile XLogRecPtr syncRecordPtr; + SPSCBlockingQueue *trxnQueue; + SPSCBlockingQueue *segQueue; } LogDispatcher; typedef struct { @@ -180,6 +192,7 @@ extern LogDispatcher *g_dispatcher; extern RedoItem g_GlobalLsnForwarder; extern RedoItem g_cleanupMark; extern RedoItem g_forceDistributeMark; +extern RedoItem g_hashmapPruneMark; extern THR_LOCAL RecordBufferState *g_recordbuffer; const static uint64 OUTPUT_WAIT_COUNT = 0x7FFFFFF; @@ -213,13 +226,14 @@ void StartRecoveryWorkers(XLogReaderState *xlogreader, uint32 privateLen); /* RedoItem lifecycle. */ void DispatchRedoRecordToFile(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +void UpdateCheckpointRedoPtrForPrune(XLogRecPtr prunePtr); void ProcessPendingRecords(bool fullSync = false); void FreeRedoItem(RedoItem *item); /* Dispatcher phases. */ -void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code); void SendRecoveryEndMarkToWorkersAndWaitForReach(int code); void WaitRedoFinish(); +void WaitRealtimeBuildShutdown(); /* Dispatcher states. 
*/ int GetDispatcherExitCode(); diff --git a/src/include/access/ondemand_extreme_rto/page_redo.h b/src/include/access/ondemand_extreme_rto/page_redo.h index fe5f295e5..8bf5fc059 100644 --- a/src/include/access/ondemand_extreme_rto/page_redo.h +++ b/src/include/access/ondemand_extreme_rto/page_redo.h @@ -41,14 +41,25 @@ namespace ondemand_extreme_rto { -#define ONDEMAND_DISTRIBUTE_RATIO 0.9 +#define ONDEMAND_DISTRIBUTE_RATIO 0.95 +#define ONDEMAND_FORCE_PRUNE_RATIO 0.99 +#define ONDEMAND_HASHTAB_SWITCH_LIMIT 100000 +#define SEG_PROC_PIPELINE_SLOT 0 static const uint32 PAGE_WORK_QUEUE_SIZE = 65536; +static const uint32 REALTIME_BUILD_RECORD_QUEUE_SIZE = 4194304; static const uint32 ONDEMAND_EXTREME_RTO_ALIGN_LEN = 16; /* need 128-bit aligned */ static const uint32 MAX_REMOTE_READ_INFO_NUM = 100; static const uint32 ADVANCE_GLOBALLSN_INTERVAL = 1; /* unit second */ +extern uint32 g_ondemandXLogParseMemFullValue; +extern uint32 g_ondemandXLogParseMemApproachFullVaule; +extern uint32 g_ondemandRealtimeBuildQueueFullValue; + +typedef bool (*OndemandCheckPauseCB)(void); +typedef void (*OndemandRefreshPauseStatusCB)(void); + typedef enum { REDO_BATCH, REDO_PAGE_MNG, @@ -58,10 +69,13 @@ typedef enum { REDO_READ_WORKER, REDO_READ_PAGE_WORKER, REDO_READ_MNG, + REDO_SEG_WORKER, + REDO_HTAB_MNG, + REDO_CTRL_WORKER, REDO_ROLE_NUM, } RedoRole; -typedef struct BadBlockRecEnt{ +typedef struct BadBlockRecEnt { RepairBlockKey key; XLogPhyBlock pblk; XLogRecPtr rec_min_lsn; @@ -173,7 +187,7 @@ struct PageRedoWorker { PosixSemaphore phaseMarker; MemoryContext oldCtx; - HTAB *redoItemHash; + ondemand_htab_ctrl_t *redoItemHashCtrl; TimeLineID recoveryTargetTLI; bool ArchiveRecoveryRequested; bool StandbyModeRequested; @@ -186,6 +200,11 @@ struct PageRedoWorker { RedoBufferManager bufferManager; RedoTimeCost timeCostList[TIME_COST_NUM]; char page[BLCKSZ]; + + /* for ondemand realtime build */ + XLogRecPtr nextPrunePtr; + bool inRealtimeBuild; + uint32 currentHtabBlockNum; }; @@ -223,7 
+242,7 @@ void ClearBTreeIncompleteActions(PageRedoWorker *worker); void *GetXLogInvalidPages(PageRedoWorker *worker); bool RedoWorkerIsIdle(PageRedoWorker *worker); void DumpPageRedoWorker(PageRedoWorker *worker); -PageRedoWorker *CreateWorker(uint32 id); +PageRedoWorker *CreateWorker(uint32 id, bool inRealtimeBuild); extern void UpdateRecordGlobals(RedoItem *item, HotStandbyState standbyState); void ReferenceRedoItem(void *item); void DereferenceRedoItem(void *item); @@ -250,6 +269,11 @@ void RecordBadBlockAndPushToRemote(XLogBlockDataParse *datadecode, PageErrorType const char *RedoWokerRole2Str(RedoRole role); bool checkBlockRedoDoneFromHashMapAndLock(LWLock **lock, RedoItemTag redoItemTag, RedoItemHashEntry **redoItemEntry, bool holdLock); +void RedoWorkerQueueCallBack(); +void OndemandRequestPrimaryDoCkptIfNeed(); +void GetOndemandRecoveryStatus(ondemand_recovery_stat *stat); +void ReleaseBlockParseStateIfNotReplay(XLogRecParseState *preState); +bool SSXLogParseRecordNeedReplayInOndemandRealtimeBuild(XLogRecParseState *redoblockstate); } // namespace ondemand_extreme_rto #endif diff --git a/src/include/access/ondemand_extreme_rto/redo_utils.h b/src/include/access/ondemand_extreme_rto/redo_utils.h index 6a56cb3d9..04d6bc00a 100644 --- a/src/include/access/ondemand_extreme_rto/redo_utils.h +++ b/src/include/access/ondemand_extreme_rto/redo_utils.h @@ -26,6 +26,12 @@ #include "access/xlogproc.h" +typedef enum { + PARSE_TYPE_DATA = 0, + PARSE_TYPE_DDL, + PARSE_TYPE_SEG, +} XLogRecParseType; + Size OndemandRecoveryShmemSize(void); void OndemandRecoveryShmemInit(void); void OndemandXlogFileIdCacheInit(void); @@ -39,5 +45,10 @@ XLogRecParseState *OndemandRedoReloadXLogRecord(XLogRecParseState *redoblockstat void OndemandRedoReleaseXLogRecord(XLogRecParseState *reloadBlockState); void OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(int code); void OnDemandWaitRedoFinish(); +void OnDemandWaitRealtimeBuildShutDown(); +XLogRecPtr 
GetRedoLocInCheckpointRecord(XLogReaderState *record); +void OnDemandUpdateRealtimeBuildPrunePtr(); +XLogRecParseType GetCurrentXLogRecParseType(XLogRecParseState *preState); +void WaitUntilRealtimeBuildStatusToFailoverAndUpdatePrunePtr(); #endif /* ONDEMAND_EXTREME_RTO_REDO_UTILS_H */ \ No newline at end of file diff --git a/src/include/access/xlog_basic.h b/src/include/access/xlog_basic.h index cb24df302..817cfa8d9 100644 --- a/src/include/access/xlog_basic.h +++ b/src/include/access/xlog_basic.h @@ -61,7 +61,7 @@ #define XLogSegmentsNum(val) (((val) * XLogBaseSize + XLogSegSize - 1) / XLogSegSize) -#define XLogPreReadSize 67108864 // 64MB +#define XLogPreReadSize 4194304 // 4MB /* Compute XLogRecPtr with segment number and offset. */ #define XLogSegNoOffsetToRecPtr(segno, offset, dest) \ diff --git a/src/include/access/xlogproc.h b/src/include/access/xlogproc.h index 3b33b258e..81343c5ef 100755 --- a/src/include/access/xlogproc.h +++ b/src/include/access/xlogproc.h @@ -59,6 +59,16 @@ typedef void (*relasexlogreadstate)(void* record); #define XLogBlockHeadGetCompressOpt(blockhead) ((blockhead)->opt) #define XLogBlockHeadGetValidInfo(blockhead) ((blockhead)->block_valid) #define XLogBlockHeadGetPhysicalBlock(blockhead) ((blockhead)->pblk) +#define XLogBlockHeadGetBufferTag(blockhead, buffertag) \ + do { \ + (buffertag)->rnode.spcNode = (blockhead)->spcNode; \ + (buffertag)->rnode.dbNode = (blockhead)->dbNode; \ + (buffertag)->rnode.relNode = (blockhead)->relNode; \ + (buffertag)->rnode.bucketNode = (blockhead)->bucketNode; \ + (buffertag)->rnode.opt = (blockhead)->opt; \ + (buffertag)->forkNum = (blockhead)->forknum; \ + (buffertag)->blockNum = (blockhead)->blkno; \ + } while (0) /* for common blockhead end */ /* for block data beging */ @@ -101,7 +111,7 @@ extern void GetFlushBufferInfo(void *buf, RedoBufferInfo *bufferinfo, uint64 *bu #define RedoBufferDirtyClear(bufferinfo) ((bufferinfo)->dirtyflag = false) #define IsRedoBufferDirty(bufferinfo) 
((bufferinfo)->dirtyflag == true) -#define RedoMemIsValid(memctl, bufferid) (((bufferid) > InvalidBuffer) && ((bufferid) <= (memctl->totalblknum))) +#define RedoMemIsValid(memctl, bufferid) (((bufferid) > InvalidBuffer) && ((uint32)(bufferid) <= (memctl->totalblknum))) typedef struct { RedoBufferTag blockinfo; @@ -655,15 +665,15 @@ typedef void (*InterruptFunc)(); typedef struct { - int totalblknum; /* total slot */ - int usedblknum; /* used slot */ + uint32 totalblknum; /* total slot */ + uint32 usedblknum; /* used slot */ Size itemsize; Buffer firstfreeslot; /* first free slot */ Buffer firstreleaseslot; /* first release slot */ RedoMemSlot *memslot; /* slot itme */ bool isInit; InterruptFunc doInterrupt; -}RedoMemManager; +} RedoMemManager; typedef void (*RefOperateFunc)(void *record); #ifdef USE_ASSERT_CHECKING @@ -1158,6 +1168,7 @@ XLogRecParseState* xlog_redo_parse_to_block(XLogReaderState* record, uint32* blo XLogRecParseState* smgr_redo_parse_to_block(XLogReaderState* record, uint32* blocknum); XLogRecParseState* segpage_redo_parse_to_block(XLogReaderState* record, uint32* blocknum); void ProcSegPageCommonRedo(XLogRecParseState *parseState); +void SegPageRedoChildState(XLogRecParseState *childStateList); void ProcSegPageJustFreeChildState(XLogRecParseState *parseState); XLogRecParseState* XactXlogClogParseToBlock(XLogReaderState* record, XLogRecParseState* recordstatehead, uint32* blocknum, TransactionId xid, int nsubxids, TransactionId* subxids, CLogXidStatus status); @@ -1289,5 +1300,6 @@ bool is_backup_end(const XLogRecParseState *parse_state); void redo_atomic_xlog_dispatch(uint8 opCode, RedoBufferInfo *redo_buf, const char *data); void seg_redo_new_page_copy_and_flush(BufferTag *tag, char *data, XLogRecPtr lsn); void redo_target_page(const BufferTag& buf_tag, StandbyReadLsnInfoArray* lsn_info, Buffer base_page_buf); +void MarkSegPageRedoChildPageDirty(RedoBufferInfo *bufferinfo); #endif diff --git 
a/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback-post_catalog_maindb_92_925.sql b/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback-post_catalog_maindb_92_925.sql new file mode 100644 index 000000000..2fb6dd0fb --- /dev/null +++ b/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback-post_catalog_maindb_92_925.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS pg_catalog.ondemand_recovery_status() CASCADE; \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback-post_catalog_otherdb_92_925.sql b/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback-post_catalog_otherdb_92_925.sql new file mode 100644 index 000000000..2fb6dd0fb --- /dev/null +++ b/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback-post_catalog_otherdb_92_925.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS pg_catalog.ondemand_recovery_status() CASCADE; \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade-post_catalog_maindb_92_925.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade-post_catalog_maindb_92_925.sql new file mode 100644 index 000000000..80f59ac15 --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade-post_catalog_maindb_92_925.sql @@ -0,0 +1,15 @@ +DROP FUNCTION IF EXISTS pg_catalog.ondemand_recovery_status() CASCADE; +SET LOCAL inplace_upgrade_next_system_object_oids=IUO_PROC, 6991; +CREATE FUNCTION pg_catalog.ondemand_recovery_status( + out primary_checkpoint_redo_lsn text, + out realtime_build_replayed_lsn text, + out hashmap_used_blocks oid, + out hashmap_total_blocks oid, + out trxn_queue_blocks oid, + out seg_queue_blocks oid, + out in_ondemand_recovery boolean, + out ondemand_recovery_status text, + out realtime_build_status text, + out recovery_pause_status text +) +RETURNS SETOF record LANGUAGE INTERNAL as 'ondemand_recovery_status' stable; \ No newline at end of file diff --git 
a/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade-post_catalog_otherdb_92_925.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade-post_catalog_otherdb_92_925.sql new file mode 100644 index 000000000..80f59ac15 --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade-post_catalog_otherdb_92_925.sql @@ -0,0 +1,15 @@ +DROP FUNCTION IF EXISTS pg_catalog.ondemand_recovery_status() CASCADE; +SET LOCAL inplace_upgrade_next_system_object_oids=IUO_PROC, 6991; +CREATE FUNCTION pg_catalog.ondemand_recovery_status( + out primary_checkpoint_redo_lsn text, + out realtime_build_replayed_lsn text, + out hashmap_used_blocks oid, + out hashmap_total_blocks oid, + out trxn_queue_blocks oid, + out seg_queue_blocks oid, + out in_ondemand_recovery boolean, + out ondemand_recovery_status text, + out realtime_build_status text, + out recovery_pause_status text +) +RETURNS SETOF record LANGUAGE INTERNAL as 'ondemand_recovery_status' stable; \ No newline at end of file diff --git a/src/include/ddes/dms/dms_api.h b/src/include/ddes/dms/dms_api.h index 50f94b746..60a956aa5 100644 --- a/src/include/ddes/dms/dms_api.h +++ b/src/include/ddes/dms/dms_api.h @@ -32,7 +32,7 @@ extern "C" { #define DMS_LOCAL_MINOR_VER_WEIGHT 1000 #define DMS_LOCAL_MAJOR_VERSION 0 #define DMS_LOCAL_MINOR_VERSION 0 -#define DMS_LOCAL_VERSION 126 +#define DMS_LOCAL_VERSION 127 #define DMS_SUCCESS 0 #define DMS_ERROR (-1) @@ -582,6 +582,7 @@ typedef enum en_dms_wait_event { DMS_EVT_DCS_REQ_XA_OWNER_ID, DMS_EVT_DCS_REQ_XA_IN_USE, DMS_EVT_DCS_REQ_END_XA, + DMS_EVT_REQ_CKPT, // add new enum at tail, or make adaptations to openGauss DMS_EVT_COUNT, @@ -830,6 +831,7 @@ typedef void (*dms_thread_init_t)(unsigned char need_startup, char **reg_data); typedef void (*dms_thread_deinit_t)(void); typedef int (*dms_get_db_primary_id)(void *db_handle, unsigned int *primary_id); typedef int (*dms_opengauss_ondemand_redo_buffer)(void *block_key, int *redo_status); +typedef int 
(*dms_opengauss_do_ckpt_immediate)(unsigned long long *ckpt_loc); // for ssl typedef int(*dms_decrypt_pwd_t)(const char *cipher, unsigned int len, char *plain, unsigned int size); @@ -965,6 +967,7 @@ typedef struct st_dms_callback { dms_get_opengauss_update_xid get_opengauss_update_xid; dms_get_opengauss_txn_status get_opengauss_txn_status; dms_opengauss_lock_buffer opengauss_lock_buffer; + dms_opengauss_do_ckpt_immediate opengauss_do_ckpt_immediate; dms_get_txn_snapshot get_txn_snapshot; dms_get_opengauss_txn_snapshot get_opengauss_txn_snapshot; dms_get_opengauss_txn_of_master get_opengauss_txn_of_master; diff --git a/src/include/ddes/dms/ss_common_attr.h b/src/include/ddes/dms/ss_common_attr.h index db51b8494..d5ac3e095 100644 --- a/src/include/ddes/dms/ss_common_attr.h +++ b/src/include/ddes/dms/ss_common_attr.h @@ -131,6 +131,10 @@ #define BUF_DIRTY_NEED_FLUSH 0x100 #define BUF_ERTO_NEED_MARK_DIRTY 0x200 +#define BUF_READ_MODE_ONDEMAND_REALTIME_BUILD 0x400 +/* marks a buffer as pinned by ondemand realtime build; such a buffer must not be eliminated */ +#define BUF_IS_ONDEMAND_REALTIME_BUILD_PINNED 0x800 + #define SS_BROADCAST_FAILED_RETRYCOUNTS 4 #define SS_BROADCAST_WAIT_INFINITE (0xFFFFFFFF) #define SS_BROADCAST_WAIT_FIVE_SECONDS (5000) diff --git a/src/include/ddes/dms/ss_dms.h b/src/include/ddes/dms/ss_dms.h index 5c41ce4bb..5f4302b3e 100644 --- a/src/include/ddes/dms/ss_dms.h +++ b/src/include/ddes/dms/ss_dms.h @@ -91,6 +91,7 @@ typedef struct st_ss_dms_func { int (*dms_info)(char *buf, unsigned int len, dms_info_id_e id); void (*dms_get_buf_res)(unsigned long long *row_id, dv_drc_buf_info *drc_info, int type); void (*dms_get_cmd_stat)(int index, wait_cmd_stat_result_t *cmd_stat_result); + int (*dms_req_opengauss_immediate_ckpt)(dms_context_t *dms_ctx, unsigned long long *ckpt_loc); } ss_dms_func_t; int ss_dms_func_init(); @@ -140,7 +141,7 @@ int dms_reform_req_opengauss_ondemand_redo_buffer(dms_context_t *dms_ctx, void * int *redo_status); unsigned int
dms_get_mes_max_watting_rooms(void); int dms_send_opengauss_oldest_xmin(dms_context_t *dms_ctx, unsigned long long oldest_xmin, unsigned char dest_id); - +int dms_req_opengauss_immediate_checkpoint(dms_context_t *dms_ctx, unsigned long long *redo_lsn); int get_drc_info(int *is_found, dv_drc_buf_info *drc_info); int dms_info(char *buf, unsigned int len, dms_info_id_e id); void dms_get_buf_res(unsigned long long *row_id, dv_drc_buf_info *drc_info, int type); diff --git a/src/include/ddes/dms/ss_dms_bufmgr.h b/src/include/ddes/dms/ss_dms_bufmgr.h index 5602d93cc..8157b62d8 100644 --- a/src/include/ddes/dms/ss_dms_bufmgr.h +++ b/src/include/ddes/dms/ss_dms_bufmgr.h @@ -30,6 +30,7 @@ #define GetDmsBufCtrl(id) (&t_thrd.storage_cxt.dmsBufCtl[(id)]) #define SS_BUF_MAX_WAIT_TIME (1000L * 1000 * 20) // 20s +#define SS_BUF_WAIT_TIME_IN_ONDEMAND_REALTIME_BUILD (100000L) // 100ms #define DmsInitLatch(drid, _type, _oid, _idx, _parent_part, _part, _uid) \ do { \ @@ -88,4 +89,8 @@ bool SSOndemandRequestPrimaryRedo(BufferTag tag); bool SSLWLockAcquireTimeout(LWLock* lock, LWLockMode mode); bool SSWaitIOTimeout(BufferDesc *buf); void buftag_get_buf_info(BufferTag tag, stat_buf_info_t *buf_info); +Buffer SSReadBuffer(BufferTag *tag, ReadBufferMode mode); +void DmsReleaseBuffer(int buffer, bool is_seg); +bool SSRequestPageInOndemandRealtimeBuild(BufferTag *bufferTag, XLogRecPtr recordLsn, XLogRecPtr *pageLsn); +bool SSOndemandRealtimeBuildAllowFlush(BufferDesc *buf); #endif diff --git a/src/include/ddes/dms/ss_dms_recovery.h b/src/include/ddes/dms/ss_dms_recovery.h index b8f0dc065..bb5034444 100644 --- a/src/include/ddes/dms/ss_dms_recovery.h +++ b/src/include/ddes/dms/ss_dms_recovery.h @@ -40,15 +40,25 @@ #define SS_REPLAYED_BY_ONDEMAND (ENABLE_DMS && !SS_IN_ONDEMAND_RECOVERY && \ t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone == true && \ t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRedoDone == true) +#define SS_ONDEMAND_REALTIME_BUILD_DISABLED (ENABLE_DMS && \ + 
g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status == DISABLED) +#define SS_ONDEMAND_REALTIME_BUILD_READY_TO_BUILD (ENABLE_DMS && \ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status == READY_TO_BUILD) +#define SS_ONDEMAND_REALTIME_BUILD_NORMAL (ENABLE_DMS && \ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status == BUILD_NORMAL) +#define SS_ONDEMAND_REALTIME_BUILD_SHUTDOWN (ENABLE_DMS && \ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status == BUILD_TO_DISABLED) +#define SS_ONDEMAND_REALTIME_BUILD_FAILOVER (ENABLE_DMS && \ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_realtime_build_status == BUILD_TO_REDO) + +#define SS_ONDEMAND_RECOVERY_PAUSE (ENABLE_DMS && \ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_recovery_pause_status != NOT_PAUSE) +#define SS_ONDEMAND_RECOVERY_HASHMAP_FULL (ENABLE_DMS && \ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_recovery_pause_status == PAUSE_FOR_PRUNE_HASHMAP) +#define SS_ONDEMAND_RECOVERY_TRXN_QUEUE_FULL (ENABLE_DMS && \ + g_instance.dms_cxt.SSRecoveryInfo.ondemand_recovery_pause_status == PAUSE_FOR_PRUNE_TRXN_QUEUE) #define REFORM_CTRL_VERSION 1 - -typedef struct st_old_reformer_ctrl { - uint64 list_stable; // stable instances list - int primaryInstId; - pg_crc32c crc; -} ss_old_reformer_ctrl_t; - typedef struct st_reformer_ctrl { uint32 version; uint64 list_stable; // stable instances list @@ -84,6 +94,35 @@ typedef enum st_failover_ckpt_status { ALLOW_CKPT } failover_ckpt_status_t; +typedef enum st_ondemand_realtime_build_status { + DISABLED = 0, + BUILD_NORMAL, + READY_TO_BUILD, + BUILD_TO_DISABLED, + BUILD_TO_REDO +} ondemand_realtime_build_status_t; + +typedef enum st_ondemand_recovery_pause_status { + NOT_PAUSE = 0, + PAUSE_FOR_SYNC_REDO, + PAUSE_FOR_PRUNE_HASHMAP, + PAUSE_FOR_PRUNE_SEG_QUEUE, + PAUSE_FOR_PRUNE_TRXN_QUEUE +} ondemand_recovery_pause_status_t; + +typedef struct ondemand_recovery_stat { + XLogRecPtr checkpointPtr; + XLogRecPtr replayedPtr; + uint32 
hmpUsedBlkNum; + uint32 hmpTotalBlkNum; + uint32 trxnQueueNum; + uint32 segQueueNum; + bool inOndemandRecovery; + SSGlobalClusterState ondemandRecoveryStatus; + ondemand_realtime_build_status_t realtimeBuildStatus; + ondemand_recovery_pause_status_t recoveryPauseStatus; +} ondemand_recovery_stat; + typedef struct ss_recovery_info { bool recovery_pause_flag; volatile failover_ckpt_status_t failover_ckpt_status; @@ -103,9 +142,17 @@ typedef struct ss_recovery_info { bool startup_need_exit_normally; //used in alive failover bool recovery_trapped_in_page_request; //used in alive failover bool in_ondemand_recovery; + volatile ondemand_realtime_build_status_t ondemand_realtime_build_status; bool dorado_sharestorage_inited; // used in dorado mode + volatile ondemand_recovery_pause_status_t ondemand_recovery_pause_status; } ss_recovery_info_t; +typedef struct ondemand_htab_ctrl { + HTAB *hTab; + void *nextHTabCtrl; + XLogRecPtr maxRedoItemPtr; +} ondemand_htab_ctrl_t; + extern bool SSRecoveryNodes(); extern void SSWaitStartupExit(); extern int SSGetPrimaryInstId(); @@ -115,6 +162,9 @@ extern bool SSRecoveryApplyDelay(); extern void SShandle_promote_signal(); extern void ss_failover_dw_init(); extern void ss_switchover_promoting_dw_init(); +extern XLogRecPtr SSOndemandRequestPrimaryCkptAndGetRedoLsn(); +void StartupOndemandRecovery(); +void OndemandRealtimeBuildHandleFailover(); #endif \ No newline at end of file diff --git a/src/include/knl/knl_guc/knl_instance_attr_storage.h b/src/include/knl/knl_guc/knl_instance_attr_storage.h index 5c93de563..0ea91341a 100755 --- a/src/include/knl/knl_guc/knl_instance_attr_storage.h +++ b/src/include/knl/knl_guc/knl_instance_attr_storage.h @@ -100,6 +100,7 @@ typedef struct knl_instance_attr_dms { bool enable_catalog_centralized; bool enable_dss_aio; bool enable_verify_page; + bool enable_ondemand_realtime_build; bool enable_ondemand_recovery; int ondemand_recovery_mem_size; int instance_id; diff --git a/src/include/knl/knl_instance.h 
b/src/include/knl/knl_instance.h index 75b4ee9ed..aac9c7d7f 100755 --- a/src/include/knl/knl_instance.h +++ b/src/include/knl/knl_instance.h @@ -757,7 +757,7 @@ typedef struct knl_g_parallel_redo_context { char* ali_buf; XLogRedoNumStatics xlogStatics[RM_NEXT_ID][MAX_XLOG_INFO_NUM]; RedoCpuBindControl redoCpuBindcontrl; - HTAB **redoItemHash; /* used in ondemand extreme RTO */ + ondemand_htab_ctrl_t **redoItemHashCtrl; /* used in ondemand extreme RTO */ /* extreme-rto standby read */ TransactionId exrto_recyle_xmin; XLogRecPtr global_recycle_lsn; diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h index ef9cdef48..4ac8befe7 100755 --- a/src/include/knl/knl_thread.h +++ b/src/include/knl/knl_thread.h @@ -212,6 +212,16 @@ read page worker get a record make lsn forwarder get new item startup get a record check stop delay redo dispatch(total) decode null null null +for ondemand extreme rto +thread step1 step2 step3 step4 + step5 step6 step7 step8 +seg redo worker get a record redo record(total) redo seg xlog get a record + null null null null +hashmap manager prune seg record prune hashmap(history) prune hashmap(latest) get a record(instruct) + null null null null +ctrl worker update usedblknum request primary ckpt null null + null null null null + for parallel redo thread step1 step2 step3 step4 step5 step6 step7 step8 step9 diff --git a/src/include/storage/buf/bufmgr.h b/src/include/storage/buf/bufmgr.h index 4c1ae95fd..e9828fe75 100644 --- a/src/include/storage/buf/bufmgr.h +++ b/src/include/storage/buf/bufmgr.h @@ -75,7 +75,10 @@ typedef enum { RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */ RBM_NORMAL_NO_LOG, /* Don't log page as invalid during WAL * replay; otherwise same as RBM_NORMAL */ - RBM_FOR_REMOTE /* Like RBM_NORMAL, but not remote read again when PageIsVerified failed. */ + RBM_FOR_REMOTE, /* Like RBM_NORMAL, but not remote read again when PageIsVerified failed.
*/ + RBM_FOR_ONDEMAND_REALTIME_BUILD /* Like RBM_NORMAL, only used in ondemand realtime + * build (shared storage mode), need newest page by DMS, + * but do not load from disk */ } ReadBufferMode; typedef enum diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 269caa449..c89b117c2 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -1876,6 +1876,7 @@ extern Datum compress_ratio_info(PG_FUNCTION_ARGS); extern Datum compress_statistic_info(PG_FUNCTION_ARGS); extern Datum pg_read_binary_file_blocks(PG_FUNCTION_ARGS); extern Datum dss_io_stat(PG_FUNCTION_ARGS); +extern Datum get_ondemand_recovery_status(PG_FUNCTION_ARGS); /* plhandler.cpp */ extern Datum generate_procoverage_report(PG_FUNCTION_ARGS); diff --git a/src/test/regress/output/recovery_2pc_tools.source b/src/test/regress/output/recovery_2pc_tools.source index 29d73cd6b..fe71a564e 100644 --- a/src/test/regress/output/recovery_2pc_tools.source +++ b/src/test/regress/output/recovery_2pc_tools.source @@ -648,6 +648,7 @@ select name,vartype,unit,min_val,max_val from pg_settings where name <> 'qunit_c ss_enable_catalog_centralized | bool | | | ss_enable_dms | bool | | | ss_enable_dss | bool | | | + ss_enable_ondemand_realtime_build | bool | | | ss_enable_ondemand_recovery | bool | | | ss_enable_reform | bool | | | ss_enable_scrlock | bool | | |