/* -------------------------------------------------------------------------
*
* xlog.cpp
* openGauss transaction log manager
*
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
* src/gausskernel/storage/access/transam/xlog.cpp
*
* -------------------------------------------------------------------------
*/
#include "c.h"
#include "postgres.h"
#include "knl/knl_variable.h"
#include <ctype.h>
#include <signal.h>
#include <time.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/if.h>
#ifdef __USE_NUMA
#include <numa.h>
#endif
#include "access/cbmparsexlog.h"
#include "access/clog.h"
#include "access/csnlog.h"
#include "access/cstore_insert.h"
#include "access/double_write.h"
#include "access/heapam.h"
#include "access/multixact.h"
#include "access/rewriteheap.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/tuptoaster.h"
#include "access/twophase.h"
#include "access/ustore/undo/knl_uundoapi.h"
#include "access/xact.h"
#include "access/xlog_internal.h"
#include "access/xloginsert.h"
#include "access/xlogreader.h"
#include "access/xlogutils.h"
#include "access/xlogdefs.h"
#include "access/hash.h"
#include "access/xlogproc.h"
#include "access/parallel_recovery/dispatcher.h"
#include "commands/tablespace.h"
#include "commands/matview.h"
#include "catalog/catalog.h"
#include "catalog/catversion.h"
#include "catalog/pg_control.h"
#include "catalog/pg_database.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "gs_bbox.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#ifdef PGXC
#include "pgxc/barrier.h"
#endif
#include "pgstat.h"
#include "port.h"
#include "postmaster/bgwriter.h"
#include "postmaster/startup.h"
#include "postmaster/postmaster.h"
#include "postmaster/pagewriter.h"
#include "postmaster/pagerepair.h"
#include "replication/logical.h"
#include "replication/bcm.h"
#include "replication/basebackup.h"
#include "replication/datareceiver.h"
#include "replication/datasender.h"
#include "replication/dataqueue.h"
#include "replication/dcf_data.h"
#include "replication/origin.h"
#include "replication/reorderbuffer.h"
#include "replication/replicainternal.h"
#include "replication/shared_storage_walreceiver.h"
#include "replication/slot.h"
#include "replication/snapbuild.h"
#include "replication/syncrep.h"
#include "replication/walreceiver.h"
#include "replication/walsender_private.h"
#include "replication/walsender.h"
#include "replication/walprotocol.h"
#include "replication/archive_walreceiver.h"
#include "replication/dcf_replication.h"
#include "storage/buf/bufmgr.h"
#include "storage/smgr/fd.h"
#include "storage/copydir.h"
#include "storage/ipc.h"
#include "storage/latch.h"
#include "storage/pmsignal.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/reinit.h"
#include "storage/smgr/segment.h"
#include "storage/smgr/smgr.h"
#include "storage/spin.h"
#include "storage/smgr/relfilenode.h"
#include "storage/lock/lwlock.h"
#include "storage/dorado_operation/dorado_fd.h"
#include "storage/xlog_share_storage/xlog_share_storage.h"
#include "utils/builtins.h"
#include "utils/elog.h"
#include "utils/guc.h"
#include "utils/pg_lsn.h"
#include "utils/ps_status.h"
#include "utils/relmapper.h"
#include "utils/snapmgr.h"
#include "utils/timestamp.h"
#include "utils/inet.h"
#include "utils/atomic.h"
#include "pg_trace.h"
#include "gssignal/gs_signal.h"
#include "gstrace/gstrace_infra.h"
#include "gstrace/access_gstrace.h"
#include "postmaster/pagerepair.h"
#include <sched.h>
#include <utmpx.h>
#include <time.h>
#ifdef ENABLE_MOT
#include "storage/mot/mot_fdw.h"
#endif
#ifndef ENABLE_MULTIPLE_NODES
#include "dcf_interface.h"
#endif
/* just for libpqrcv_connect_for_TLI and ha_set_rebuild_connerror */
#include "replication/libpqwalreceiver.h"
/* Used for barrier preparse */
#include "postmaster/barrier_preparse.h"
/* Used for parallel recovery */
#include "access/redo_statistic.h"
#include "access/multi_redo_api.h"
#include "vectorsonic/vsonichash.h"
#include "ddes/dms/ss_reform_common.h"
#include "ddes/dms/ss_dms_recovery.h"
#include "ddes/dms/ss_dms_bufmgr.h"
#include "storage/file/fio_device.h"
#ifdef ENABLE_UT
#define STATIC
#else
#define STATIC static
#endif
/* File path names (all relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"
#define FAILOVER_SIGNAL_FILE "failover"
#define SWITCHOVER_SIGNAL_FILE "switchover"
#define SWITCHOVER_TIMEOUT_SIGNAL_FILE "switchover_timeout"
#define PRIMARY_SIGNAL_FILE "primary"
#define STANDBY_SIGNAL_FILE "standby"
#define CASCADE_STANDBY_SIGNAL_FILE "cascade_standby"
#define XLOG_SWITCH_HISTORY_FILE "switch.history"
#define MAX_PATH_LEN 1024
#define MAX(A, B) ((B) > (A) ? (B) : (A))
#define ENABLE_INCRE_CKPT g_instance.attr.attr_storage.enableIncrementalCheckpoint
#define SS_STANDBY_INST_SKIP_SHARED_FILE (ENABLE_DSS && IsInitdb && \
g_instance.attr.attr_storage.dms_attr.instance_id != \
g_instance.dms_cxt.SSReformerControl.primaryInstId)
#define RecoveryFromDummyStandby() (t_thrd.postmaster_cxt.ReplConnArray[2] != NULL && IS_DN_DUMMY_STANDYS_MODE())
/* MaxMacAddrList limits the number of MAC addresses the GetMACAddrHash function collects */
#define MaxMacAddrList 10
#define BILLION 1000000000L
const int ONE_SECOND_TO_MICROSECOND = 1000000L;
const uint64 PAGE_SIZE_BYTES = 4096;
const uint64 XLOG_FLUSH_SIZE_INIT = 1024 * 1024;
const int SIZE_OF_UINT64 = 8;
const int SIZE_OF_UINT32 = 4;
const int SIZE_OF_TWO_UINT64 = 16;
const int XLOG_LSN_SWAP = 32;
const int XLOGFILENAMELEN = 24;
const char *DemoteModeDescs[] = { "unknown", "smart", "fast", "immediate" };
const int DemoteModeNum = sizeof(DemoteModeDescs) / sizeof(char *);
static const int g_retryTimes = 3;
static const uint64 REDO_SPEED_LOG_LEN = (XLogSegSize * 64); /* 64 segs */
static const int PG_TBLSPCS = 10; /* strlen("pg_tblspc/") */
/* MAX_PAGE_FLUSH_LSN_FILE SIZE : 1024 pages, 8k each, file size 8M in total */
static const int MPFL_PAGE_NUM = NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS;
static const int MPFL_FILE_SIZE = (BLCKSZ * MPFL_PAGE_NUM);
static const int MPFL_FILE_FLAG = (O_RDWR | O_SYNC | O_DIRECT | PG_BINARY);
static const mode_t MPFL_FILE_PERM = (S_IRUSR | S_IWUSR);
/* the maximum size that we can send to DCF, 1MB */
static const Size MaxSendSizeBytes = 1048576;
THR_LOCAL bool redo_oldversion_xlog = false;
static const int ONE_SECOND = 1000000;
/*
* XLOGfileslop is the maximum number of preallocated future XLOG segments.
* When we are done with an old XLOG segment file, we will recycle it as a
* future XLOG segment as long as there aren't already XLOGfileslop future
* segments; else we'll delete it. This could be made a separate GUC
* variable, but at present I think it's sufficient to hardwire it as
* 2*CheckPointSegments+1. Under normal conditions, a checkpoint will free
* no more than 2*CheckPointSegments log segments, and we want to recycle all
* of them; the +1 allows boundary cases to happen without wasting a
* delete/create-segment cycle.
*/
#define XLOGfileslop (2 * XLogSegmentsNum(u_sess->attr.attr_storage.CheckPointSegments) + 1)
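/*
 * Worked example of the formula above (illustrative; assumes XLogSegmentsNum()
 * maps checkpoint_segments 1:1 and 16MB segments): with checkpoint_segments = 64,
 * XLOGfileslop = 2 * 64 + 1 = 129 future segments may be recycled, i.e. roughly
 * 2GB of preallocated WAL.
 */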
/*
* GUC support
*/
struct config_enum_entry sync_method_options[] = {
{ "fsync", SYNC_METHOD_FSYNC, false },
#ifdef HAVE_FSYNC_WRITETHROUGH
{ "fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false },
#endif
#ifdef HAVE_FDATASYNC
{ "fdatasync", SYNC_METHOD_FDATASYNC, false },
#endif
#ifdef OPEN_SYNC_FLAG
{ "open_sync", SYNC_METHOD_OPEN, false },
#endif
#ifdef OPEN_DATASYNC_FLAG
{ "open_datasync", SYNC_METHOD_OPEN_DSYNC, false },
#endif
{ NULL, 0, false }
};
XLogRecPtr latestValidRecord = InvalidXLogRecPtr;
pg_crc32 latestRecordCrc = InvalidXLogRecPtr;
uint32 latestRecordLen = 0;
XLogSegNo XlogRemoveSegPrimary = InvalidXLogSegPtr;
/* The nextXid and oldestXid in ShmemVariableCache when recovery is done */
TransactionId NextXidAfterReovery;
TransactionId OldestXidAfterRecovery;
/*
* We maintain an image of pg_control in shared memory.
*/
THR_LOCAL ControlFileData *ControlFile = NULL;
static void remove_xlogtemp_files(void);
static bool validate_parse_delay_ddl_file(DelayDDLRange *delayRange);
static bool write_delay_ddl_file(const DelayDDLRange &delayRange, bool onErrDelete);
extern void CalculateLocalLatestSnapshot(bool forceCalc);
/*
* Calculate the amount of space left on the page after 'endptr'. Beware
* multiple evaluation!
*/
#define INSERT_FREESPACE(endptr) (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
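/*
 * Example (assuming XLOG_BLCKSZ = 8192): INSERT_FREESPACE(0x12000) = 0 on a page
 * boundary, while INSERT_FREESPACE(0x12028) = 8192 - 0x28 = 8152 bytes left on
 * the page. Because of the double evaluation, never pass an expression with
 * side effects as 'endptr'.
 */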
/* Macro to advance to next buffer index. */
#define NextBufIdx(idx) (((idx) == t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
/*
 * Added for XLOG scaling
 *
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it were in cache, the page containing 'recptr'.
 */
#define XLogRecPtrToBufIdx(recptr) (((recptr) / XLOG_BLCKSZ) % (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1))
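/*
 * Example: with XLogCacheBlck + 1 = 16 WAL buffers, recptr = 100 * XLOG_BLCKSZ
 * maps to buffer index 100 % 16 = 4; the page number simply wraps around the
 * buffer count.
 */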
/*
* Return the previous xlblock[i] mapped to the same page
*/
#define X_LOG_PREV_BLOCK_IN_BUF_PAGE(recptr) \
(((recptr) < (XLogRecPtr)(XLOG_BLCKSZ * (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1))) \
? 0 \
: (XLogRecPtr)((recptr)-XLOG_BLCKSZ * (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1)))
/*
* These are the number of bytes in a WAL page and segment usable for WAL data.
*/
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
#define UsableBytesInSegment \
((XLogSegSize / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
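/*
 * Worked example (assuming XLOG_BLCKSZ = 8192, a 24-byte short page header,
 * a 40-byte long page header and a 16MB segment): UsableBytesInPage =
 * 8192 - 24 = 8168 and UsableBytesInSegment = 2048 * 8168 - (40 - 24) =
 * 16728048 usable bytes per segment.
 */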
/*
* Add xlog reader private structure for page read.
*/
typedef struct XLogPageReadPrivate {
int emode;
bool fetching_ckpt; /* are we fetching a checkpoint record? */
bool randAccess;
} XLogPageReadPrivate;
/*
* xlog switch information for forcing failover.
*/
typedef struct XLogSwitchInfo {
uint32 termId;
uint64 switchLsn;
uint64 catchLsn;
uint64 oldestRunningXid;
uint64 latestCompletedXid;
uint32 invaildPageCnt;
uint32 imcompleteActionCnt;
} XLogSwitchInfo;
volatile bool IsPendingXactsRecoveryDone = false;
static void XLogFlushCore(XLogRecPtr writeRqstPtr);
static void XLogSelfFlush(void);
static void XLogSelfFlushWithoutStatus(int numHitsOnStartPage, XLogRecPtr currPos, int currLRC);
static void XLogArchiveNotify(const char *xlog);
static void XLogArchiveNotifySeg(XLogSegNo segno);
static bool XLogArchiveCheckDone(const char *xlog);
static bool XLogArchiveIsBusy(const char *xlog);
static bool XLogArchiveIsReady(const char *xlog);
static void XLogArchiveCleanup(const char *xlog);
static void readRecoveryCommandFile(void);
static XLogSegNo GetOldestXLOGSegNo(const char *workingPath);
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endSegNo);
static bool recoveryStopsHere(XLogReaderState *record, bool *includeThis);
static void recoveryPausesHere(void);
#ifndef ENABLE_MULTIPLE_NODES
static bool RecoveryApplyDelay(const XLogReaderState *record);
#endif
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(bool DBStateShutdown);
static void XLogReportParameters(void);
static void LocalSetXLogInsertAllowed(void);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags, bool doFullCheckpoint);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo, XLogRecPtr curInsert);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
template <bool isGroupInsert>
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic, PGPROC *proc = NULL);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible);
#ifndef ENABLE_MULTIPLE_NODES
static bool XLogWritePaxos(XLogRecPtr WritePaxosRqst);
#endif
static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance,
bool use_lock, const char *xlog_dir);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notexistOk);
static void XLogFileClose(void);
static void KeepFileRestoredFromArchive(const char *path, const char *xlogfname);
static bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize);
static void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnerror);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr);
static void RemoveXlogFile(const char *segname, XLogRecPtr endptr);
static void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI);
static void UpdateLastRemovedPtr(const char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, bool fetching_ckpt);
void CheckRecoveryConsistency(void);
static bool existsTimeLineHistory(TimeLineID probeTLI);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
STATIC void WriteControlFile(void);
STATIC void ReadControlFile(void);
static void RecoverControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForPrimaryTrigger(void);
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
static void pg_start_backup_callback(int code, Datum arg);
static void DoAbortBackupCallback(int code, Datum arg);
static void AbortExcluseBackupCallback(int code, Datum arg);
static bool read_tablespace_map(List **tablespaces);
static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby,
bool *backupFromRoach);
static void check_roach_backup_condition(bool backup_started_in_recovery, const char *backupidstr);
static int get_sync_bit(int method);
static void ResetSlotLSNEndRecovery(StringInfo slotname);
static void ShutdownReadFileFacility(void);
static void SetDummyStandbyEndRecPtr(XLogReaderState *xlogreader);
/* XLOG scaling: start */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, XLogRecPtr StartPos,
XLogRecPtr EndPos, int32 *const currlrc_ptr);
static void ReserveXLogInsertLocation(uint32 size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr,
int32 *const currlrc_ptr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static void StartSuspendWalInsert(int32 *const lastlrc_ptr);
static void StopSuspendWalInsert(int32 lastlrc);
template <bool isGroupInsert>
static char *GetXLogBuffer(XLogRecPtr ptr, PGPROC *proc = NULL);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static XLogRecPtr XLogInsertRecordSingle(XLogRecData *rdata, XLogRecPtr fpw_lsn);
static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path);
void ArchiveXlogForForceFinishRedo(XLogReaderState *xlogreader, TermFileData *term_file);
TermFileData GetTermFileDataAndClear(void);
XLogRecPtr mpfl_read_max_flush_lsn();
void mpfl_new_file();
void mpfl_ulink_file();
bool mpfl_pread_file(int fd, void *buf, int32 size, int64 offset);
static void SSOndemandXlogCopy(XLogSegNo copySegNo, uint32 startOffset, char *copyBuffer, Size copyBytes);
#ifdef __aarch64__
static XLogRecPtr XLogInsertRecordGroup(XLogRecData *rdata, XLogRecPtr fpw_lsn);
static void XLogInsertRecordNolock(XLogRecData *rdata, PGPROC *proc, XLogRecPtr StartPos, XLogRecPtr EndPos,
XLogRecPtr PrevPos, int32 *const currlrc_ptr);
static void ReserveXLogInsertByteLocation(uint32 size, uint32 lastRecordSize, uint64 *StartBytePos, uint64 *EndBytePos,
uint64 *PrevBytePos, int32 *const currlrc_ptr);
static void CopyXLogRecordToWALForGroup(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos,
PGPROC *proc, int32* const currlrc_ptr);
static void WakeUpProc(uint32 wakeidx)
{
while (wakeidx != INVALID_PGPROCNO) {
PGPROC *proc = g_instance.proc_base_all_procs[wakeidx];
wakeidx = pg_atomic_read_u32(&proc->xlogGroupNext);
pg_atomic_write_u32(&proc->xlogGroupNext, INVALID_PGPROCNO);
/* ensure all previous writes are visible before xlogGroupMember is set. */
pg_memory_barrier();
proc->xlogGroupMember = false;
#ifdef ENABLE_MULTIPLE_NODES
if (proc != t_thrd.proc) {
/* acts as memory barrier */
PGSemaphoreUnlock(&proc->sem);
}
#else
pg_write_barrier();
#endif
}
}
static void XLogInsertRecordGroupLeader(PGPROC *leader, uint64 *end_byte_pos_ptr, int32 *const currlrc_ptr)
{
uint32 record_size;
uint64 prev_byte_pos = 0;
uint64 start_byte_pos = 0;
uint64 end_byte_pos = 0;
int32 current_lrc = 0;
uint64 dirty_page_queue_lsn = 0;
*leader->xlogGroupRedoRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.RedoRecPtr;
/*
 * Some xlog records about full page writes do not need to be written to the
 * buffer, so no space is reserved for them.
 */
*leader->xlogGroupDoPageWrites = (t_thrd.shemem_ptr_cxt.XLogCtl->Insert.fullPageWrites ||
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites);
bool flag = (leader->xlogGroupfpw_lsn != InvalidXLogRecPtr) &&
(leader->xlogGroupfpw_lsn <= *leader->xlogGroupRedoRecPtr) && *leader->xlogGroupDoPageWrites;
if (unlikely(flag)) {
leader->xlogGroupReturntRecPtr = InvalidXLogRecPtr;
leader->xlogGroupIsFPW = true;
}
record_size = MAXALIGN(((XLogRecord *)(leader->xlogGrouprdata->data))->xl_tot_len);
Assert(record_size != 0);
/* insert the leader WAL first */
if (unlikely(leader->xlogGroupIsFPW)) {
leader->xlogGroupIsFPW = false;
return;
} else {
if (likely(record_size != 0)) {
ReserveXLogInsertByteLocation(record_size, record_size, &start_byte_pos, &end_byte_pos, &prev_byte_pos,
&current_lrc);
dirty_page_queue_lsn = start_byte_pos;
}
XLogInsertRecordNolock(leader->xlogGrouprdata, leader, XLogBytePosToRecPtr(start_byte_pos),
XLogBytePosToEndRecPtr(
start_byte_pos +
MAXALIGN(((XLogRecord *)(leader->xlogGrouprdata->data))->xl_tot_len)),
XLogBytePosToRecPtr(prev_byte_pos), &current_lrc);
}
if (dirty_page_queue_lsn != 0) {
update_dirty_page_queue_rec_lsn(XLogBytePosToRecPtr(dirty_page_queue_lsn));
}
if (end_byte_pos_ptr != NULL) {
*end_byte_pos_ptr = end_byte_pos;
}
if (currlrc_ptr != NULL) {
*currlrc_ptr = current_lrc;
}
}
static void XLogInsertRecordGroupFollowers(PGPROC *leader, const uint32 head, uint64 *end_byte_pos_ptr,
int32* const currlrc_ptr)
{
uint32 nextidx;
uint32 total_size = 0;
uint32 record_size = 0;
PGPROC *follower = NULL;
uint64 start_byte_pos = 0;
uint64 end_byte_pos = 0;
uint64 prev_byte_pos = 0;
int32 current_lrc = 0;
uint64 dirty_page_queue_lsn = 0;
/* Walk the list and update the status of all xloginserts. */
nextidx = head;
while (nextidx != (uint32)(leader->pgprocno)) {
follower = g_instance.proc_base_all_procs[nextidx];
*follower->xlogGroupRedoRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.RedoRecPtr;
/*
 * Some xlog records about full page writes do not need to be written to the
 * buffer, so no space is reserved for them.
 */
*follower->xlogGroupDoPageWrites = (t_thrd.shemem_ptr_cxt.XLogCtl->Insert.fullPageWrites ||
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites);
bool flag = (follower->xlogGroupfpw_lsn != InvalidXLogRecPtr) &&
(follower->xlogGroupfpw_lsn <= *follower->xlogGroupRedoRecPtr) && *follower->xlogGroupDoPageWrites;
if (unlikely(flag)) {
follower->xlogGroupReturntRecPtr = InvalidXLogRecPtr;
follower->xlogGroupIsFPW = true;
nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
continue;
}
record_size = MAXALIGN(((XLogRecord *)(follower->xlogGrouprdata->data))->xl_tot_len);
Assert(record_size != 0);
/* Calculate total size in the group. */
total_size += record_size;
/* Move to next proc in list. */
nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
}
if (likely(total_size != 0)) {
ReserveXLogInsertByteLocation(total_size, record_size, &start_byte_pos, &end_byte_pos, &prev_byte_pos,
&current_lrc);
dirty_page_queue_lsn = start_byte_pos;
}
nextidx = head;
/* The leader thread inserts the group's xlog records one by one. */
while (nextidx != (uint32)(leader->pgprocno)) {
follower = g_instance.proc_base_all_procs[nextidx];
if (unlikely(follower->xlogGroupIsFPW)) {
nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
follower->xlogGroupIsFPW = false;
continue;
}
XLogInsertRecordNolock(follower->xlogGrouprdata, follower, XLogBytePosToRecPtr(start_byte_pos),
XLogBytePosToEndRecPtr(
start_byte_pos +
MAXALIGN(((XLogRecord *)(follower->xlogGrouprdata->data))->xl_tot_len)),
XLogBytePosToRecPtr(prev_byte_pos), &current_lrc);
prev_byte_pos = start_byte_pos;
start_byte_pos += MAXALIGN(((XLogRecord *)(follower->xlogGrouprdata->data))->xl_tot_len);
/* Move to next proc in list. */
nextidx = pg_atomic_read_u32(&follower->xlogGroupNext);
}
if (dirty_page_queue_lsn != 0) {
update_dirty_page_queue_rec_lsn(XLogBytePosToRecPtr(dirty_page_queue_lsn));
}
if (end_byte_pos_ptr != NULL) {
*end_byte_pos_ptr = end_byte_pos;
}
if (currlrc_ptr != NULL) {
*currlrc_ptr = current_lrc;
}
}
static void XLogReportCopyLocation(const int32 current_lrc, const uint64 end_byte_pos)
{
bool stop = true;
uint32 current_entry =
current_lrc &
(GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power) - 1);
volatile WALInsertStatusEntry *status_entry_ptr =
&g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(current_entry)];
if (g_instance.wal_cxt.isWalWriterUp != 0) {
do {
stop = (status_entry_ptr->LRC == current_lrc && (status_entry_ptr->status == WAL_NOT_COPIED));
} while (!stop);
} else {
while (status_entry_ptr->LRC != current_lrc) {
XLogSelfFlush();
}
}
pg_memory_barrier();
(void)pg_atomic_exchange_u64(&status_entry_ptr->endLSN, XLogBytePosToEndRecPtr(end_byte_pos));
pg_memory_barrier();
status_entry_ptr->status = WAL_COPIED;
}
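/*
 * Illustration of the LRC-to-entry mapping used above (entry count assumed to
 * be 4096): LRC 8193 owns entry 8193 & 4095 = 1. LRCs that differ by a
 * multiple of the entry count share an entry, which is why the LRC itself is
 * compared before the entry is updated.
 */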
/*
 * @Description: Insert an XLOG record represented by an already-constructed chain of data
 * chunks. Threads mapped to the same xlog insert lock can have their records inserted as a group.
 * @in rdata: the xlog data.
 * @in fpw_lsn: the LSN of full page write.
 * @return: the LSN of the insert end position.
 */
static XLogRecPtr XLogInsertRecordGroup(XLogRecData *rdata, XLogRecPtr fpw_lsn)
{
PGPROC *proc = t_thrd.proc;
uint32 head = 0;
uint32 nextidx = 0;
int32 current_lrc_leader = 0;
int32 current_lrc = 0;
uint64 end_byte_pos_leader = 0;
uint64 end_byte_pos = 0;
/* cross-check on whether we should be here or not */
if (unlikely(!XLogInsertAllowed())) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("cannot make new WAL entries during recovery")));
}
if (!SSXLogInsertAllowed()) {
ereport(FATAL, (errmsg("SS standby cannot insert XLOG entries")));
}
START_CRIT_SECTION();
/* Add ourselves to the list of processes needing a group xlog status update. */
proc->xlogGroupMember = true;
proc->xlogGrouprdata = rdata;
proc->xlogGroupfpw_lsn = fpw_lsn;
proc->xlogGroupProcLastRecPtr = &t_thrd.xlog_cxt.ProcLastRecPtr;
proc->xlogGroupXactLastRecEnd = &t_thrd.xlog_cxt.XactLastRecEnd;
proc->xlogGroupCurrentTransactionState = GetCurrentTransactionState();
proc->xlogGroupRedoRecPtr = &t_thrd.xlog_cxt.RedoRecPtr;
proc->xlogGroupLogwrtResult = t_thrd.xlog_cxt.LogwrtResult;
proc->xlogGroupTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
proc->xlogGroupDoPageWrites = &t_thrd.xlog_cxt.doPageWrites;
int groupnum = (proc->pgprocno / g_instance.shmem_cxt.numaNodeNum) % (g_instance.wal_cxt.num_locks_in_group);
nextidx = pg_atomic_read_u32(&t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks[groupnum].l.xlogGroupFirst);
while (true) {
pg_atomic_write_u32(&proc->xlogGroupNext, nextidx);
/* Ensure all previous writes are visible before follower continues. */
pg_write_barrier();
if (pg_atomic_compare_exchange_u32(&t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks[groupnum].l.xlogGroupFirst,
&nextidx, (uint32)proc->pgprocno)) {
break;
}
}
/*
* If the list was not empty, the leader will insert all xlog in the same xlog insert slot.
* It is impossible to have followers without a leader because the first process that
* has added itself to the list will always have nextidx as INVALID_PGPROCNO.
*/
if (nextidx != INVALID_PGPROCNO) {
int extra_waits = 0;
/* Sleep until the leader updates our XLOG insert status. */
for (;;) {
#ifdef ENABLE_MULTIPLE_NODES
/* acts as a read barrier */
PGSemaphoreLock(&proc->sem, false);
if (!proc->xlogGroupMember) {
break;
}
#else
if (extra_waits % FOLLOWER_TRIGER_SLEEP_LOOP_COUNT == 0 && extra_waits > 0) {
pg_usleep(FOLLOWER_SLEEP_USECS);
}
pg_read_barrier();
if (!proc->xlogGroupMember) {
break;
} else {
(void)sched_yield();
}
#endif
extra_waits++;
}
#ifdef ENABLE_MULTIPLE_NODES
/* Fix semaphore count for any absorbed wakeups */
while (extra_waits-- > 0) {
PGSemaphoreUnlock(&proc->sem);
}
#endif
END_CRIT_SECTION();
/*
 * We need a memory barrier between reading "xlogGroupMember" and "xlogGroupReturntRecPtr"
 * to guard against memory reordering on relaxed memory models such as ARM.
 */
pg_memory_barrier();
Assert(pg_atomic_read_u32(&proc->xlogGroupNext) == INVALID_PGPROCNO);
return proc->xlogGroupReturntRecPtr;
}
/* I am the leader; Let me insert my own WAL first. */
PGPROC *leader = t_thrd.proc;
XLogInsertRecordGroupLeader(leader, &end_byte_pos_leader, &current_lrc_leader);
if (end_byte_pos_leader != 0) {
XLogReportCopyLocation(current_lrc_leader, end_byte_pos_leader);
}
/*
* Clear the list of processes waiting for group xlog insert, saving a pointer to the head of the list.
* Trying to pop elements one at a time could lead to an ABA problem.
*/
head = pg_atomic_exchange_u32(&t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks[groupnum].l.xlogGroupFirst,
INVALID_PGPROCNO);
bool has_follower = (head != (uint32)(leader->pgprocno));
if (has_follower) {
/* I have at least one follower and I will next insert their WAL for them. */
XLogInsertRecordGroupFollowers(leader, head, &end_byte_pos, &current_lrc);
}
/*
* Wake all waiting threads up.
*/
WakeUpProc(head);
/*
* Examine the entry's LRC and status all together to make sure the writer already
* finishes updating this entry and this entry is now owned by this thread
*
* If the status entry is owned by this thread, update it with the copied xlog record's
* end LSN and set the WAL copy status to WAL_COPIED to signal copy completion as well
*/
if (has_follower && end_byte_pos != 0) {
XLogReportCopyLocation(current_lrc, end_byte_pos);
}
if (g_instance.wal_cxt.isWalWriterSleeping) {
pthread_mutex_lock(&g_instance.wal_cxt.criticalEntryMutex);
pthread_cond_signal(&g_instance.wal_cxt.criticalEntryCV);
pthread_mutex_unlock(&g_instance.wal_cxt.criticalEntryMutex);
}
END_CRIT_SECTION();
return proc->xlogGroupReturntRecPtr;
}
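/*
 * Sketch of the lock-free enlisting used above (illustrative only): each
 * thread publishes itself with a CAS push onto xlogGroupFirst, like a Treiber
 * stack:
 *
 *     do {
 *         pg_atomic_write_u32(&proc->xlogGroupNext, first);
 *     } while (!CAS(&xlogGroupFirst, &first, proc->pgprocno));
 *
 * The thread that observed first == INVALID_PGPROCNO becomes the leader; it
 * later detaches the whole list with a single atomic exchange (avoiding ABA)
 * and inserts WAL on behalf of every follower it popped.
 */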
/*
 * @Description: Insert an XLOG record represented by an already-constructed chain of data
 * chunks. Because space was already reserved in group insert mode, no xlog insert lock is needed.
 * @in rdata: the xlog data.
 * @in proc: the waiting proc.
 * @in StartPos: the start LSN position.
 * @in EndPos: the end LSN position.
 * @in PrevPos: the prev LSN position.
 */
static void XLogInsertRecordNolock(XLogRecData *rdata, PGPROC *proc, XLogRecPtr StartPos, XLogRecPtr EndPos,
XLogRecPtr PrevPos, int32 *const currlrc_ptr)
{
XLogRecord *rechdr = (XLogRecord *)rdata->data;
rechdr->xl_prev = PrevPos;
/* pg_crc32c is the same as pg_crc32 */
pg_crc32c rdata_crc;
Assert(XLByteLT(PrevPos, StartPos));
/* we assume that all of the record header is in the first chunk */
Assert(rdata->len >= SizeOfXLogRecord);
START_CRIT_SECTION();
/* Now that xl_prev has been filled in, calculate CRC of the record header. */
rdata_crc = rechdr->xl_crc;
COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(rdata_crc); /* FIN_CRC32C is the same as FIN_CRC32 */
rechdr->xl_crc = rdata_crc;
/*
* All the record data, including the header, is now ready to be
* inserted. Copy the record in the space reserved.
*/
CopyXLogRecordToWALForGroup(rechdr->xl_tot_len, rdata, StartPos, EndPos, proc, currlrc_ptr);
CopyTransactionIdLoggedIfAny((TransactionState)proc->xlogGroupCurrentTransactionState);
END_CRIT_SECTION();
/* Update shared LogwrtRqst.Write, if we crossed page boundary. */
if (unlikely(StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)) {
SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
/* advance global request to include new block(s) */
if (t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write < EndPos) {
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write = EndPos;
}
/* update local result copy while I have the chance */
*((XLogwrtResult *)proc->xlogGroupLogwrtResult) = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
}
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG) {
StringInfoData buf;
initStringInfo(&buf);
appendStringInfo(&buf, "INSERT @ %X/%X: ", (uint32)(EndPos >> 32), (uint32)EndPos);
xlog_outrec(&buf, rechdr);
if (rdata->data != NULL) {
appendStringInfo(&buf, " - ");
RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
}
ereport(LOG, (errmsg("%s", buf.data)));
pfree_ext(buf.data);
}
#endif
/* Update our global variables */
*proc->xlogGroupProcLastRecPtr = StartPos;
*proc->xlogGroupXactLastRecEnd = EndPos;
proc->xlogGroupReturntRecPtr = EndPos;
return;
}
/*
 * @Description: Reserve the right amount of space from the WAL for a group of
 * records. The StartBytePos, EndBytePos and PrevBytePos are stored as "usable
 * byte positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
 * @in size: the total size to reserve for the group.
 * @in lastRecordSize: the size of the last record in the group.
 * @out StartBytePos: the start position of the WAL.
 * @out EndBytePos: the end position of the WAL.
 * @out PrevBytePos: the previous position of the WAL.
 */
static void ReserveXLogInsertByteLocation(uint32 size, uint32 lastRecordSize, uint64 *StartBytePos, uint64 *EndBytePos,
uint64 *PrevBytePos, int32* const currlrc_ptr)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
size = MAXALIGN(size);
/* All (non xlog-switch) records should contain data. */
Assert(size > SizeOfXLogRecord);
/*
* The duration the spinlock needs to be held is minimized by minimizing
* the calculations that have to be done while holding the lock. The
* current tip of reserved WAL is kept in CurrBytePos, as a byte position
* that only counts "usable" bytes in WAL, that is, it excludes all WAL
* page headers. The mapping between "usable" byte positions and physical
* positions (XLogRecPtrs) can be done outside the locked region, and
* because the usable byte position doesn't include any headers, reserving
* X bytes from WAL is almost as simple as "CurrBytePos += X".
*/
#if defined(__x86_64__) || defined(__aarch64__)
union Union128 compare;
union Union128 exchange;
union Union128 current;
compare.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos);
Assert(sizeof(Insert->CurrBytePos) == SIZE_OF_UINT64);
Assert(sizeof(Insert->PrevByteSize) == SIZE_OF_UINT32);
Assert(sizeof(Insert->CurrLRC) == SIZE_OF_UINT32);
loop:
/*
 * |CurrBytePos             |PrevByteSize | CurrLRC
 * ------------------------+------------ +-----------
 * |64 bits                 |32 bits      |32 bits
 */
/*
* Suspend WAL insert threads when currlrc equals WAL_COPY_SUSPEND
*/
if (unlikely(compare.struct128.LRC == WAL_COPY_SUSPEND)) {
compare.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos);
goto loop;
}
/* increment currlrc by 1 and store it back to the global LRC for the next record */
exchange.struct128.currentBytePos = compare.struct128.currentBytePos + size;
exchange.struct128.byteSize = lastRecordSize;
exchange.struct128.LRC = (compare.struct128.LRC + 1) & 0x7FFFFFFF;
current.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos, compare.value,
exchange.value);
if (!UINT128_IS_EQUAL(compare.value, current.value)) {
UINT128_COPY(compare.value, current.value);
goto loop;
}
*currlrc_ptr = compare.struct128.LRC;
*StartBytePos = compare.struct128.currentBytePos;
*EndBytePos = exchange.struct128.currentBytePos;
*PrevBytePos = compare.struct128.currentBytePos - compare.struct128.byteSize;
#else
uint64 startbytepos;
uint64 endbytepos;
uint64 prevbytepos;
SpinLockAcquire(&Insert->insertpos_lck);
startbytepos = Insert->CurrBytePos;
endbytepos = startbytepos + size;
prevbytepos = Insert->PrevBytePos;
Insert->CurrBytePos = endbytepos;
Insert->PrevBytePos = endbytepos - lastRecordSize;
/* Mirror the LRC bookkeeping of the lock-free path above. */
*currlrc_ptr = Insert->CurrLRC;
Insert->CurrLRC = (Insert->CurrLRC + 1) & 0x7FFFFFFF;
SpinLockRelease(&Insert->insertpos_lck);
*StartBytePos = startbytepos;
*EndBytePos = endbytepos;
*PrevBytePos = prevbytepos;
#endif /* __x86_64__ || __aarch64__ */
}
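/*
 * Worked example of the 128-bit reservation above (values assumed): starting
 * from CurrBytePos = 1000, PrevByteSize = 200, CurrLRC = 7, a group reserving
 * size = 300 with lastRecordSize = 300 swaps in (1300, 300, 8) and hands the
 * caller StartBytePos = 1000, EndBytePos = 1300, PrevBytePos = 1000 - 200 = 800
 * and LRC 7, all from a single atomic compare-and-swap.
 */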
/*
* @Description: In xlog group insert mode, copy a WAL record to an
* already-reserved area in the WAL.
* @in write_len: the length of xlog data.
* @in rdata: the xlog data.
* @in StartPos: the start LSN.
* @in EndPos: the end LSN.
* @in proc: the waiting proc.
*/
static void CopyXLogRecordToWALForGroup(int write_len, XLogRecData *rdata, XLogRecPtr StartPos, XLogRecPtr EndPos,
PGPROC *proc, int32 *const currlrc_ptr)
{
char *currpos = NULL;
uint32 freespace;
int written;
XLogRecPtr CurrPos;
XLogPageHeader pagehdr;
errno_t errorno = EOK;
/*
* The following 2 variables are used to handle large records
*/
uint32 startBufPageIdx = XLogRecPtrToBufIdx(StartPos);
int numHitsOnStartPage = 1;
/* Get a pointer to the right place in the right WAL buffer to start inserting to. */
CurrPos = StartPos;
currpos = GetXLogBuffer<true>(CurrPos, proc);
freespace = INSERT_FREESPACE(CurrPos);
/*
* there should be enough space for at least the first field (xl_tot_len)
* on this page.
*/
Assert(freespace >= sizeof(uint32));
/* Copy record data */
written = 0;
while (rdata != NULL) {
char *rdata_data = rdata->data;
uint32 rdata_len = rdata->len;
while (rdata_len > freespace) {
/* Write what fits on this page, and continue on the next page. */
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, freespace);
securec_check(errorno, "", "");
rdata_data += freespace;
rdata_len -= freespace;
written += freespace;
CurrPos += freespace;
/*
* We hit the page we start from within the same copy. We must initiate
* a flush without updating WAL copy status entry or we will be stuck
*/
if (unlikely(XLogRecPtrToBufIdx(CurrPos) == startBufPageIdx)) {
numHitsOnStartPage++;
XLogSelfFlushWithoutStatus(numHitsOnStartPage, CurrPos, *currlrc_ptr);
}
/*
* Get pointer to beginning of next page, and set the xlp_rem_len
* in the page header. Set XLP_FIRST_IS_CONTRECORD.
*
* It's safe to set the contrecord flag and xlp_rem_len without a
* lock on the page. All the other flags were already set when the
* page was initialized, in AdvanceXLInsertBuffer, and we're the
* only backend that needs to set the contrecord flag.
*/
currpos = GetXLogBuffer<true>(CurrPos, proc);
pagehdr = (XLogPageHeader)currpos;
pagehdr->xlp_rem_len = write_len - written;
pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
/* skip over the page header */
if (CurrPos % XLogSegSize == 0) {
CurrPos += SizeOfXLogLongPHD;
currpos += SizeOfXLogLongPHD;
} else {
CurrPos += SizeOfXLogShortPHD;
currpos += SizeOfXLogShortPHD;
}
freespace = INSERT_FREESPACE(CurrPos);
}
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, rdata_len);
securec_check(errorno, "", "");
currpos += rdata_len;
CurrPos += rdata_len;
freespace -= rdata_len;
written += rdata_len;
rdata = rdata->next;
}
Assert(written == write_len);
/* Align the end position, so that the next record starts aligned */
CurrPos = MAXALIGN(CurrPos);
if (SECUREC_UNLIKELY(CurrPos != EndPos)) {
ereport(PANIC, (errmsg("space reserved for WAL record does not match what was written")));
}
pgstat_report_waitevent_count(WAIT_EVENT_WAL_BUFFER_ACCESS);
}
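/*
 * Example of the contrecord split performed above (sizes assumed): a 12000-byte
 * record starting with 3000 bytes free on the current page copies 3000 bytes,
 * stamps the next page header with XLP_FIRST_IS_CONTRECORD and
 * xlp_rem_len = 9000, skips the short page header, and keeps copying until
 * written == write_len.
 */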
#endif
inline void SetMinRecoverPointForStats(XLogRecPtr lsn)
{
if (XLByteLT(g_instance.comm_cxt.predo_cxt.redoPf.min_recovery_point, lsn)) {
g_instance.comm_cxt.predo_cxt.redoPf.min_recovery_point = lsn;
}
}
/*
 * @Description: Insert an XLOG record represented by an already-constructed chain of data
 * chunks, using the single-threaded (non-group) path of XLogInsertRecord.
 * @in rdata: the xlog data.
 * @in fpw_lsn: the LSN of full page write.
 * @return: the LSN of the insert end position.
 */
static XLogRecPtr XLogInsertRecordSingle(XLogRecData *rdata, XLogRecPtr fpw_lsn)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
XLogRecord *rechdr = (XLogRecord *)rdata->data;
pg_crc32c rdata_crc; /* pg_crc32c is the same as pg_crc32 */
bool inserted = false;
bool stop = true;
XLogRecPtr StartPos = InvalidXLogRecPtr;
XLogRecPtr EndPos = InvalidXLogRecPtr;
int32 currlrc = 0;
bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && rechdr->xl_info == XLOG_SWITCH);
/* we assume that all of the record header is in the first chunk */
Assert(rdata->len >= SizeOfXLogRecord);
/* cross-check on whether we should be here or not */
if (!XLogInsertAllowed()) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("cannot make new WAL entries during recovery")));
}
if (!SSXLogInsertAllowed()) {
ereport(FATAL, (errmsg("SS standby cannot insert XLOG entries")));
}
/* ----------
*
* We have now done all the preparatory work we can without holding a
* lock or modifying shared state. From here on, inserting the new WAL
* record to the shared WAL buffer cache is a two-step process:
*
* 1. Reserve the right amount of space from the WAL. The current head of
* reserved space is kept in Insert->CurrBytePos, and is protected by
* insertpos_lck.
*
* 2. Copy the record to the reserved WAL space. This involves finding the
* correct WAL buffer containing the reserved space, and copying the
* record in place. This can be done concurrently in multiple processes.
*
* To keep track of which insertions are still in-progress, each concurrent
* inserter acquires an insertion lock. In addition to just indicating that
* an insertion is in progress, the lock tells others how far the inserter
* has progressed. There is a small fixed number of insertion locks,
* determined by num_xloginsert_locks. When an inserter crosses a page
* boundary, it updates the value stored in the lock to how far it has
* inserted, to allow the previous buffer to be flushed.
*
* Holding onto an insertion lock also protects RedoRecPtr and
* fullPageWrites from changing until the insertion is finished.
*
* Step 2 can usually be done completely in parallel. If the required WAL
* page is not initialized yet, you have to grab WALBufMappingLock to
* initialize it, but the WAL writer tries to do that ahead of insertions
* to avoid that from happening in the critical path.
*
* ----------
*/
START_CRIT_SECTION();
if (isLogSwitch) {
StartSuspendWalInsert(&currlrc);
}
/*
* Check to see if my copy of RedoRecPtr or doPageWrites is out of date.
* If so, may have to go back and have the caller recompute everything.
* This can only happen just after a checkpoint, so it's better to be slow
* in this case and fast otherwise.
*
* If we aren't doing full-page writes then RedoRecPtr doesn't actually
* affect the contents of the XLOG record, so we'll update our local copy
* but not force a recomputation. (If doPageWrites was just turned off,
* we could recompute the record without full pages, but we choose not to
* bother.)
*/
if (t_thrd.xlog_cxt.RedoRecPtr != Insert->RedoRecPtr) {
Assert(t_thrd.xlog_cxt.RedoRecPtr < Insert->RedoRecPtr);
t_thrd.xlog_cxt.RedoRecPtr = Insert->RedoRecPtr;
}
t_thrd.xlog_cxt.doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
if (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= t_thrd.xlog_cxt.RedoRecPtr && t_thrd.xlog_cxt.doPageWrites) {
/* Oops, some buffer now needs to be backed up that the caller didn't back up. Start over. */
END_CRIT_SECTION();
return InvalidXLogRecPtr;
}
/*
* Reserve space for the record in the WAL. This also sets the xl_prev
* pointer.
*/
if (isLogSwitch) {
XLogRecPtr tmp_xl_prev = InvalidXLogRecPtr;
inserted = ReserveXLogSwitch(&StartPos, &EndPos, &tmp_xl_prev);
rechdr->xl_prev = tmp_xl_prev;
} else {
ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos, &rechdr->xl_prev, &currlrc);
inserted = true;
}
if (inserted) {
uint32 current_entry =
currlrc &
(GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power) - 1);
volatile WALInsertStatusEntry *status_entry_ptr =
&g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(current_entry)];
/* Now that xl_prev has been filled in, calculate CRC of the record header. */
rdata_crc = rechdr->xl_crc;
/* using CRC32C */
COMP_CRC32C(rdata_crc, (XLogRecord *)rechdr, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(rdata_crc); /* FIN_CRC32C is the same as FIN_CRC32 */
rechdr->xl_crc = rdata_crc;
/*
* All the record data, including the header, is now ready to be
* inserted. Copy the record in the space reserved.
*/
CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata, StartPos, EndPos, &currlrc);
/*
* Examine the entry's LRC and status all together to make sure the writer already
* finishes updating this entry and this entry is now owned by this thread
*
* If the status entry is owned by this thread, update it with the copied xlog record's
* end LSN and set the WAL copy status to WAL_COPIED to signal copy completion as well
*
* XLog switch already handles its flush in CopyXLogRecordToWAL so here we only
* update the WAL copy status for the non switch records
*/
if (!isLogSwitch) {
if (g_instance.wal_cxt.isWalWriterUp) {
do {
stop = (status_entry_ptr->LRC == currlrc && (status_entry_ptr->status == WAL_NOT_COPIED));
} while (!stop);
} else {
while (status_entry_ptr->LRC != currlrc) {
XLogSelfFlush();
}
}
/*
 * We now own this status entry.
 * Update status_entry_ptr->endLSN with an atomic exchange so the writer,
 * which must use an atomic read, never sees a half-updated value. The
 * exchange succeeds because this thread is the only one updating this
 * field at this moment.
 */
pg_memory_barrier();
(void)pg_atomic_exchange_u64(&status_entry_ptr->endLSN, EndPos);
pg_memory_barrier();
status_entry_ptr->status = WAL_COPIED;
}
if (g_instance.wal_cxt.isWalWriterSleeping) {
pthread_mutex_lock(&g_instance.wal_cxt.criticalEntryMutex);
pthread_cond_signal(&g_instance.wal_cxt.criticalEntryCV);
pthread_mutex_unlock(&g_instance.wal_cxt.criticalEntryMutex);
}
} else {
/*
* This was an xlog-switch record, but the current insert location was
* already exactly at the beginning of a segment, so there was no need
* to do anything.
*/
}
/* Done! Let others know that we're finished. */
if (isLogSwitch) {
/*
* Swap the recorded LRC back into the shared memory location after adding 1
* for the next WAL insert thread to pick up
*/
StopSuspendWalInsert(currlrc);
}
MarkCurrentTransactionIdLoggedIfAny();
END_CRIT_SECTION();
/*
 * Update the dirty page queue recLSN with StartPos. Dirty pages pushed to the
 * queue take this recLSN before the XLog record is written, which keeps the
 * dirty page recLSN no larger than the redo LSN.
 */
update_dirty_page_queue_rec_lsn(StartPos);
/*
* Update shared LogwrtRqst.Write, if we crossed page boundary.
*/
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ) {
SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
/* advance global request to include new block(s) */
if (t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write < EndPos) {
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write = EndPos;
}
/* update local result copy while I have the chance */
*t_thrd.xlog_cxt.LogwrtResult = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
}
/*
* If this was an XLOG_SWITCH record, flush the record and the empty
* padding space that fills the rest of the segment, and perform
* end-of-segment actions (eg, notifying archiver).
*/
if (isLogSwitch) {
TRACE_POSTGRESQL_XLOG_SWITCH();
XLogWaitFlush(EndPos);
/*
* Even though we reserved the rest of the segment for us, which is
* reflected in EndPos, we return a pointer to just the end of the
* xlog-switch record.
*/
if (inserted) {
EndPos = StartPos + MAXALIGN(SizeOfXLogRecord);
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ && EndPos % XLOG_BLCKSZ != 0) {
if (EndPos % XLogSegSize == EndPos % XLOG_BLCKSZ) {
EndPos += SizeOfXLogLongPHD;
} else {
EndPos += SizeOfXLogShortPHD;
}
}
}
}
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG) {
StringInfoData buf;
initStringInfo(&buf);
appendStringInfo(&buf, "INSERT @ %X/%X: ", (uint32)(EndPos >> 32), (uint32)EndPos);
xlog_outrec(&buf, rechdr);
if (rdata->data != NULL) {
appendStringInfo(&buf, " - ");
RmgrTable[rechdr->xl_rmid].rm_desc(&buf, rechdr->xl_info, rdata->data);
}
ereport(LOG, (errmsg("%s", buf.data)));
pfree_ext(buf.data);
}
#endif
// Update our global variables
t_thrd.xlog_cxt.ProcLastRecPtr = StartPos;
t_thrd.xlog_cxt.XactLastRecEnd = EndPos;
return EndPos;
}
/*
* Insert an XLOG record represented by an already-constructed chain of data
* chunks. This is a low-level routine; to construct the WAL record header
* and data, use the higher-level routines in xloginsert.cpp.
*
* If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
* WAL record applies to, that were not included in the record as full page
* images. If fpw_lsn >= RedoRecPtr, the function does not perform the
* insertion and returns InvalidXLogRecPtr. The caller can then recalculate
* which pages need a full-page image, and retry. If fpw_lsn is invalid, the
* record is always inserted.
*
* The first XLogRecData in the chain must be for the record header, and its
* data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
* xl_crc fields in the header, the rest of the header must already be filled
* by the caller.
*
* Returns XLOG pointer to end of record (beginning of next record).
* This can be used as LSN for data pages affected by the logged action.
* (LSN is the XLOG point up to which the XLOG must be flushed to disk
* before the data page can be written out. This implements the basic
* WAL rule "write the log before the data".)
*
* @in rdata: the xlog data.
* @in fpw_lsn: the LSN of full page write.
* @return: the LSN of the insert end position.
*/
XLogRecPtr XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
{
#ifdef __aarch64__
/*
 * On the ARM architecture, insert an XLOG record represented by an already-constructed
 * chain of data chunks. An xlog-switch record is inserted in single mode; all other
 * records are inserted in group mode.
 */
XLogRecord *rechdr = (XLogRecord *)rdata->data;
bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID && rechdr->xl_info == XLOG_SWITCH);
if (isLogSwitch) {
return XLogInsertRecordSingle(rdata, fpw_lsn);
} else {
return XLogInsertRecordGroup(rdata, fpw_lsn);
}
#else
return XLogInsertRecordSingle(rdata, fpw_lsn);
#endif /* __aarch64__ */
}
/*
* Reserves the right amount of space for a record of given size from the WAL.
* *StartPos is set to the beginning of the reserved section, *EndPos to
* its end+1. *PrevPtr is set to the beginning of the previous record; it is
* used to set the xl_prev of this record.
*
* This is the performance critical part of XLogInsert that must be serialized
* across backends. The rest can happen mostly in parallel. Try to keep this
* section as short as possible, insertpos_lck can be heavily contended on a
* busy system.
*
* NB: The space calculation here must match the code in CopyXLogRecordToWAL,
* where we actually copy the record to the reserved space.
*/
static void ReserveXLogInsertLocation(uint32 size, XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr,
int32* const currlrc_ptr)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
size = MAXALIGN(size);
/* All (non xlog-switch) records should contain data. */
Assert(size > SizeOfXLogRecord);
/*
* The duration the spinlock needs to be held is minimized by minimizing
* the calculations that have to be done while holding the lock. The
* current tip of reserved WAL is kept in CurrBytePos, as a byte position
* that only counts "usable" bytes in WAL, that is, it excludes all WAL
* page headers. The mapping between "usable" byte positions and physical
* positions (XLogRecPtrs) can be done outside the locked region, and
* because the usable byte position doesn't include any headers, reserving
* X bytes from WAL is almost as simple as "CurrBytePos += X".
*/
#if defined(__x86_64__) || defined(__aarch64__)
union Union128 compare;
union Union128 exchange;
union Union128 current;
compare.value = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
Assert(sizeof(Insert->CurrBytePos) == SIZE_OF_UINT64);
Assert(sizeof(Insert->PrevByteSize) == SIZE_OF_UINT32);
Assert(sizeof(Insert->CurrLRC) == SIZE_OF_UINT32);
loop1:
/*
 * |CurrBytePos             |PrevByteSize | CurrLRC
 * ------------------------+------------ +-----------
 * |64 bits                 |32 bits      |32 bits
 */
/*
* Suspend WAL insert threads when currlrc equals WAL_COPY_SUSPEND
*/
if (unlikely(compare.struct128.LRC == WAL_COPY_SUSPEND)) {
compare.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos);
pg_usleep(1);
goto loop1;
}
/* increment currlrc by 1 and store it back to the global LRC for the next record */
exchange.struct128.currentBytePos = compare.struct128.currentBytePos + size;
exchange.struct128.byteSize = size;
exchange.struct128.LRC = (compare.struct128.LRC + 1) & 0x7FFFFFFF;
current.value = atomic_compare_and_swap_u128((volatile uint128_u *)&Insert->CurrBytePos, compare.value,
exchange.value);
if (!UINT128_IS_EQUAL(compare.value, current.value)) {
UINT128_COPY(compare.value, current.value);
goto loop1;
}
*currlrc_ptr = compare.struct128.LRC;
*StartPos = XLogBytePosToRecPtr(compare.struct128.currentBytePos);
*EndPos = XLogBytePosToEndRecPtr(exchange.struct128.currentBytePos);
*PrevPtr = XLogBytePosToRecPtr(compare.struct128.currentBytePos - compare.struct128.byteSize);
#else
uint64 startbytepos;
uint64 endbytepos;
uint64 prevbytepos;
SpinLockAcquire(&Insert->insertpos_lck);
startbytepos = Insert->CurrBytePos;
prevbytepos = Insert->PrevBytePos;
endbytepos = startbytepos + size;
Insert->CurrBytePos = endbytepos;
Insert->PrevBytePos = startbytepos;
/* Mirror the LRC bookkeeping of the lock-free path above. */
*currlrc_ptr = Insert->CurrLRC;
Insert->CurrLRC = (Insert->CurrLRC + 1) & 0x7FFFFFFF;
SpinLockRelease(&Insert->insertpos_lck);
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
*PrevPtr = XLogBytePosToRecPtr(prevbytepos);
#endif /* __x86_64__ || __aarch64__ */
}
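/*
 * Example of the usable-byte-position mapping (header sizes assumed as in the
 * PostgreSQL layout: 40-byte long header on the first page of a segment,
 * 24-byte short header elsewhere): byte position 8152 is the end of the first
 * page's payload, so XLogBytePosToRecPtr(8152) = 8216 (start of the second
 * page's payload, past its short header) while XLogBytePosToEndRecPtr(8152) =
 * 8192, since a record ending exactly there ends at the page boundary.
 */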
/*
* Like ReserveXLogInsertLocation(), but for an xlog-switch record.
*
* A log-switch record is handled slightly differently. The rest of the
* segment will be reserved for this insertion, as indicated by the returned
* *EndPos value. However, if we are already at the beginning of the current
* segment, *StartPos and *EndPos are set to the current location without
* reserving any space, and the function returns false.
*/
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 startbytepos;
uint64 endbytepos;
uint32 prevbytesize;
int32 currlrc;
uint32 size = MAXALIGN(SizeOfXLogRecord);
XLogRecPtr ptr;
uint32 segleft;
/*
* These calculations are a bit heavy-weight to be done while holding a
* spinlock, but since we're holding all the WAL insertion locks, there
* are no other inserters competing for it. GetXLogInsertRecPtr() does
* compete for it, but that's not called very frequently.
*/
#if defined(__x86_64__) || defined(__aarch64__)
uint128_u exchange;
uint128_u current;
uint128_u compare = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
loop:
startbytepos = compare.u64[0];
ptr = XLogBytePosToEndRecPtr(startbytepos);
if (ptr % XLogSegSize == 0) {
*EndPos = *StartPos = ptr;
return false;
}
endbytepos = startbytepos + size;
prevbytesize = compare.u32[2];
currlrc = (int32)compare.u32[3];
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
segleft = XLogSegSize - ((*EndPos) % XLogSegSize);
if (segleft != XLogSegSize) {
/* consume the rest of the segment */
*EndPos += segleft;
endbytepos = XLogRecPtrToBytePos(*EndPos);
}
exchange.u64[0] = endbytepos;
size = (uint32)(endbytepos - startbytepos);
exchange.u64[1] = (uint64)size;
exchange.u32[3] = currlrc;
current = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos, compare, exchange);
if (!UINT128_IS_EQUAL(compare, current)) {
UINT128_COPY(compare, current);
goto loop;
}
#else
SpinLockAcquire(&Insert->insertpos_lck);
startbytepos = Insert->CurrBytePos;
ptr = XLogBytePosToEndRecPtr(startbytepos);
if (ptr % XLogSegSize == 0) {
SpinLockRelease(&Insert->insertpos_lck);
*EndPos = *StartPos = ptr;
return false;
}
endbytepos = startbytepos + size;
prevbytesize = Insert->PrevByteSize;
*StartPos = XLogBytePosToRecPtr(startbytepos);
*EndPos = XLogBytePosToEndRecPtr(endbytepos);
segleft = XLogSegSize - ((*EndPos) % XLogSegSize);
if (segleft != XLogSegSize) {
/* consume the rest of the segment */
*EndPos += segleft;
endbytepos = XLogRecPtrToBytePos(*EndPos);
}
Insert->CurrBytePos = endbytepos;
Insert->PrevByteSize = size;
SpinLockRelease(&Insert->insertpos_lck);
#endif /* __x86_64__ || __aarch64__ */
*PrevPtr = XLogBytePosToRecPtr(startbytepos - prevbytesize);
Assert((*EndPos) % XLogSegSize == 0);
Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
Assert(XLogRecPtrToBytePos(*PrevPtr) == (startbytepos - prevbytesize));
return true;
}
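/*
 * Example (assuming a 16MB segment): a switch record reserved at
 * StartPos = 0x1002000 returns EndPos = 0x2000000, consuming the remainder of
 * the segment so the next record starts on a fresh segment. Had the insert
 * position already been 0x2000000, the function would return false and reserve
 * nothing.
 */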
static void StartSuspendWalInsert(int32 *const lastlrc_ptr)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 startbytepos;
uint32 prevbytesize;
int32 currlrc;
int32 ientry;
volatile WALInsertStatusEntry *entry;
uint128_u compare;
uint128_u exchange;
uint128_u current;
compare = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
Assert(sizeof(Insert->CurrBytePos) == 8);
Assert(sizeof(Insert->PrevByteSize) == 4);
Assert(sizeof(Insert->CurrLRC) == 4);
loop:
/*
* |CurrBytePos |PrevByteSize | CurrLRC
* ------------------------+------------ +-----------
* |64 bits |32 bits |32 bits
*/
startbytepos = compare.u64[0];
prevbytesize = compare.u32[2];
currlrc = (int32)compare.u32[3];
/*
* if WAL Insert threads are already suspended, wait
*/
if (unlikely(currlrc == WAL_COPY_SUSPEND)) {
compare = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos);
goto loop;
}
/*
* Return the last LRC
*/
*lastlrc_ptr = currlrc;
currlrc = WAL_COPY_SUSPEND;
/*
* Swap in currlrc to suspend WAL copy threads but keep startbytepos and prevbytesize
* as it is
*/
exchange.u64[0] = startbytepos;
exchange.u32[2] = prevbytesize;
exchange.u32[3] = currlrc;
current = atomic_compare_and_swap_u128((uint128_u *)&Insert->CurrBytePos, compare, exchange);
if (!UINT128_IS_EQUAL(compare, current)) {
UINT128_COPY(compare, current);
goto loop;
}
/*
 * Wait for the WAL copy thread holding "*lastlrc_ptr - 1" to finish.
 * *lastlrc_ptr is the LRC the next WAL insert thread would acquire, i.e.
 * the last LRC stored in shared memory before we swapped in
 * WAL_COPY_SUSPEND; the last working WAL insert thread therefore holds
 * "*lastlrc_ptr - 1".
 */
int ilrc = (*lastlrc_ptr - 1) & 0x7FFFFFFF;
ientry = ilrc & (GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power) - 1);
entry = &g_instance.wal_cxt.walInsertStatusTable[ientry];
/*
* Wait if the WAL copy thread obtaining the "*lastlrc_ptr - 1" didn't finish.
*/
bool stop = true;
while (stop) {
if (!g_instance.wal_cxt.isWalWriterUp) {
XLogSelfFlush();
}
/*
 * Wait till the WAL writer scans up to "*lastlrc_ptr - 1", unless nobody
 * has inserted WAL since StartupXLOG. That edge case is identified by:
 *
 * (*lastlrc_ptr == 0 && entry->endLSN == 0)
 *
 * i.e. if the current LRC is 0 and the endLSN corresponding to the LRC
 * before it is also 0, no WAL insert has happened yet. We must check
 * entry->endLSN as well to account for the 31-bit LRC wrap-around.
 */
stop = ((*lastlrc_ptr != 0 || entry->endLSN != 0) && (g_instance.wal_cxt.lastLRCScanned != ilrc));
}
return;
}
/*
 * This function must only be called by the thread that successfully suspended
 * all WAL insert threads; calling it from anywhere else corrupts the shared LRC.
 */
static void StopSuspendWalInsert(int32 lastlrc)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
/*
* All other WAL copy threads are already suspended, so a direct
* 32-bit assignment is safe here.
*/
Insert->CurrLRC = lastlrc;
return;
}
/*
* Subroutine of XLogInsert. Copies a WAL record to an already-reserved
* area in the WAL.
*/
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata, XLogRecPtr StartPos,
XLogRecPtr EndPos, int32 *const currlrc_ptr)
{
char *currpos = NULL;
uint32 freespace;
int written;
XLogRecPtr CurrPos;
XLogPageHeader pagehdr;
errno_t errorno = EOK;
bool stop = true;
/*
* The following two variables are used to handle large records that can
* wrap all the way around the WAL buffer ring back to the starting page.
*/
uint32 startBufPageIdx = XLogRecPtrToBufIdx(StartPos);
int numHitsOnStartPage = 1;
/*
* Get a pointer to the right place in the right WAL buffer to start
* inserting to.
*/
CurrPos = StartPos;
currpos = GetXLogBuffer<false>(CurrPos);
freespace = INSERT_FREESPACE(CurrPos);
/*
* there should be enough space for at least the first field (xl_tot_len)
* on this page.
*/
Assert(freespace >= sizeof(uint32));
/* Copy record data */
written = 0;
while (rdata != NULL) {
char *rdata_data = rdata->data;
uint32 rdata_len = rdata->len;
while (rdata_len > freespace) {
/*
* Write what fits on this page, and continue on the next page.
*/
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, freespace);
securec_check(errorno, "", "");
rdata_data += freespace;
rdata_len -= freespace;
written += freespace;
CurrPos += freespace;
/*
* We wrapped around to the page we started from within the same copy. We must
* initiate a flush without updating our WAL copy status entry, or we would
* wait on ourselves and get stuck.
*/
if (unlikely(XLogRecPtrToBufIdx(CurrPos) == startBufPageIdx)) {
numHitsOnStartPage++;
XLogSelfFlushWithoutStatus(numHitsOnStartPage, CurrPos, *currlrc_ptr);
}
/*
* Get pointer to beginning of next page, and set the xlp_rem_len
* in the page header. Set XLP_FIRST_IS_CONTRECORD.
*
* It's safe to set the contrecord flag and xlp_rem_len without a
* lock on the page. All the other flags were already set when the
* page was initialized, in AdvanceXLInsertBuffer, and we're the
* only backend that needs to set the contrecord flag.
*/
currpos = GetXLogBuffer<false>(CurrPos);
pagehdr = (XLogPageHeader)currpos;
pagehdr->xlp_rem_len = write_len - written;
pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
/* skip over the page header */
if (CurrPos % XLogSegSize == 0) {
CurrPos += SizeOfXLogLongPHD;
currpos += SizeOfXLogLongPHD;
} else {
CurrPos += SizeOfXLogShortPHD;
currpos += SizeOfXLogShortPHD;
}
freespace = INSERT_FREESPACE(CurrPos);
}
Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
errorno = memcpy_s(currpos, SECUREC_STRING_MAX_LEN, rdata_data, rdata_len);
securec_check(errorno, "", "");
currpos += rdata_len;
CurrPos += rdata_len;
freespace -= rdata_len;
written += rdata_len;
rdata = rdata->next;
}
Assert(written == write_len);
/* Align the end position, so that the next record starts aligned */
CurrPos = MAXALIGN(CurrPos);
/*
* If this was an xlog-switch, it's not enough to write the switch record,
* we also have to consume all the remaining space in the WAL segment.
* We have already reserved it for us, but we still need to make sure it's
* allocated and zeroed in the WAL buffers so that when the caller (or
* someone else) does XLogWrite(), it can really write out all the zeros.
*/
if (isLogSwitch && CurrPos % XLogSegSize != 0) {
int currlrc = *currlrc_ptr;
/* An xlog-switch record doesn't contain any data besides the header */
Assert(write_len == SizeOfXLogRecord);
/*
* We do this one page at a time, to make sure we don't deadlock
* against ourselves if wal_buffers < XLOG_SEG_SIZE.
*/
Assert(EndPos % XLogSegSize == 0);
/* Use up all the remaining space on the first page */
CurrPos += freespace;
/*
* When inserting XLOG switch, we communicate with the WAL writer
* page by page
*/
while (CurrPos <= EndPos) {
uint32 current_entry =
currlrc &
(GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power) - 1);
volatile WALInsertStatusEntry *status_entry_ptr =
&g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(current_entry)];
/*
* Initialize the WAL buffer page if needed
* Please note that endPtr passed into XLogWaitBufferInit is the LSN currently tagged
* on the buffer page we want
*/
int idx = (int)XLogRecPtrToBufIdx(CurrPos - 1);
XLogRecPtr endPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[idx];
XLogRecPtr expectedEndPtr = CurrPos;
if (expectedEndPtr != endPtr) {
pgstat_report_waitevent(WAIT_EVENT_WAL_BUFFER_FULL);
XLogRecPtr waitEndPtr = X_LOG_PREV_BLOCK_IN_BUF_PAGE(expectedEndPtr);
XLogWaitBufferInit(waitEndPtr);
pgstat_report_waitevent(WAIT_EVENT_END);
}
/*
* Update the WAL copy status entry to trigger a flush.
*
* If the WAL writer is alive, first wait until we own the status entry
* assigned to us.
*
* If the WAL writer is not up to do the xlog flush job, we invoke
* XLogSelfFlush ourselves to push the status array recycling.
*/
if (g_instance.wal_cxt.isWalWriterUp) {
do {
stop = (status_entry_ptr->LRC == currlrc && (status_entry_ptr->status == WAL_NOT_COPIED));
} while (!stop);
} else if (status_entry_ptr->LRC != currlrc) {
/*
* Here we don't need to loop on XLogSelfFlush to ensure our data is
* flushed because:
* 1) this is an xlog switch, so StartSuspendWalInsert has already suspended
* all other xlog insert accesses and the flush function has already scanned
* up to the LRC right before the suspended LRC;
* 2) an xlog switch is inserted and flushed page by page, sequentially.
*/
XLogSelfFlush();
}
/*
* We own the entry, so we don't need to loop on the atomic exchange.
* The atomic operation here is to protect reads in the WAL writer thread.
*/
pg_memory_barrier();
(void)pg_atomic_exchange_u64(&status_entry_ptr->endLSN, CurrPos);
pg_memory_barrier();
status_entry_ptr->status = WAL_COPIED;
/*
* Trigger a flush if WAL writer thread is not up yet
*/
if (!g_instance.wal_cxt.isWalWriterUp) {
/*
* Here we don't need to loop on XLogSelfFlush to ensure our data is
* flushed because:
* 1) this is an xlog switch, so StartSuspendWalInsert has already suspended
* all other xlog insert accesses and the flush function has already scanned
* up to the LRC right before the suspended LRC;
* 2) an xlog switch is inserted and flushed page by page, sequentially.
*/
XLogSelfFlush();
}
currlrc = (currlrc + 1) & 0x7FFFFFFF;
CurrPos += XLOG_BLCKSZ;
}
*currlrc_ptr = currlrc;
}
pgstat_report_waitevent_count(WAIT_EVENT_WAL_BUFFER_ACCESS);
}
/*
* Get a pointer to the right location in the WAL buffer containing the
* given XLogRecPtr.
*
* If the page is not initialized yet, it is initialized. That might require
* evicting an old dirty buffer from the buffer cache, which means I/O.
*
* The caller must ensure that the page containing the requested location
* isn't evicted yet, and won't be evicted. The way to ensure that is to
* hold onto an XLogInsertLock with the xlogInsertingAt position set to
* something <= ptr. GetXLogBuffer() will update xlogInsertingAt if it needs
* to evict an old page from the buffer. (This means that once you call
* GetXLogBuffer() with a given 'ptr', you must not access anything before
* that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
* later, because older buffers might be recycled already)
*/
template <bool isGroupInsert>
static char *GetXLogBuffer(XLogRecPtr ptr, PGPROC *proc)
{
int idx;
XLogRecPtr endptr;
XLogRecPtr expectedEndPtr;
/*
* Fast path for the common case that we need to access again the same
* page as last time.
*/
if (ptr / XLOG_BLCKSZ == t_thrd.xlog_cxt.cachedPage) {
Assert(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
Assert(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
return t_thrd.xlog_cxt.cachedPos + ptr % XLOG_BLCKSZ;
}
/*
* The XLog buffer cache is organized so that a page is always loaded
* to a particular buffer. That way we can easily calculate the buffer
* a given page must be loaded into, from the XLogRecPtr alone.
*/
idx = XLogRecPtrToBufIdx(ptr);
/*
* See what page is loaded in the buffer at the moment. It could be the
* page we're looking for, or something older. It can't be anything newer
* - that would imply the page we're looking for has already been written
* out to disk and evicted, and the caller is responsible for making sure
* that doesn't happen.
*
* However, we don't hold a lock while we read the value. If someone has
* just initialized the page, it's possible that we get a "torn read" of
* the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
* that case we will see a bogus value. That's ok, we'll grab the mapping
* lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
* the page we're looking for. But it means that when we do this unlocked
* read, we might see a value that appears to be ahead of the page we're
* looking for. Don't PANIC on that, until we've verified the value while
* holding the lock.
*/
expectedEndPtr = ptr;
expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
endptr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[idx];
if (expectedEndPtr != endptr) {
pgstat_report_waitevent(WAIT_EVENT_WAL_BUFFER_FULL);
/*
* We need to wait for the background flush to flush till the endptr
* The background flush will also initialize the flushed pages
* with new pages
*/
XLogRecPtr waitEndPtr = X_LOG_PREV_BLOCK_IN_BUF_PAGE(expectedEndPtr);
XLogWaitBufferInit(waitEndPtr);
pgstat_report_waitevent(WAIT_EVENT_END);
endptr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[idx];
if (expectedEndPtr != endptr) {
ereport(PANIC, (errmsg("could not find WAL buffer for %X/%X", (uint32)(ptr >> 32), (uint32)ptr)));
}
} else {
/*
* Make sure the initialization of the page is visible to us, and
* won't arrive later to overwrite the WAL data we write on the page.
*/
pg_memory_barrier();
}
/*
* Found the buffer holding this page. Return a pointer to the right
* offset within the page.
*/
t_thrd.xlog_cxt.cachedPage = ptr / XLOG_BLCKSZ;
t_thrd.xlog_cxt.cachedPos = t_thrd.shemem_ptr_cxt.XLogCtl->pages + idx * (Size)XLOG_BLCKSZ;
Assert((((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_magic == XLOG_PAGE_MAGIC));
Assert(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ) ||
(((((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr >> 32) |
(((XLogPageHeader)t_thrd.xlog_cxt.cachedPos)->xlp_pageaddr << 32))) == ptr - (ptr % XLOG_BLCKSZ));
return t_thrd.xlog_cxt.cachedPos + ptr % XLOG_BLCKSZ;
}
/*
* Converts a "usable byte position" to XLogRecPtr. A usable byte position
* is the position starting from the beginning of WAL, excluding all WAL
* page headers.
*/
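/*
 * A small worked example, kept symbolic so it holds for any configured page
 * and segment size: bytepos = N * UsableBytesInSegment + 10 gives fullsegs = N
 * and bytesleft = 10, which fits on the segment's first page, so the result is
 * segment N at offset SizeOfXLogLongPHD + 10. For
 * bytepos = N * UsableBytesInSegment + (XLOG_BLCKSZ - SizeOfXLogLongPHD) + 5,
 * the first page is exhausted, so the result is segment N at offset
 * XLOG_BLCKSZ + SizeOfXLogShortPHD + 5, i.e. 5 usable bytes into the second page.
 */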
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos)
{
uint64 fullsegs;
uint64 fullpages;
uint64 bytesleft;
uint32 seg_offset;
XLogRecPtr result;
fullsegs = bytepos / UsableBytesInSegment;
bytesleft = bytepos % UsableBytesInSegment;
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) {
/* fits on first page of segment */
seg_offset = bytesleft + SizeOfXLogLongPHD;
} else {
/* account for the first page on segment with long header */
seg_offset = XLOG_BLCKSZ;
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
fullpages = bytesleft / UsableBytesInPage;
bytesleft = bytesleft % UsableBytesInPage;
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
return result;
}
/*
* Like XLogBytePosToRecPtr, but if the position is at a page boundary,
* returns a pointer to the beginning of the page (ie. before page header),
* not to where the first xlog record on that page would go to. This is used
* when converting a pointer to the end of a record.
*/
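/*
 * For example, a bytepos that lands exactly on a usable-page boundary
 * (bytesleft == 0 below) maps to the page start itself, before the page
 * header, whereas XLogBytePosToRecPtr would map it past the header; that is
 * exactly what an end-of-record pointer should look like.
 */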
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos)
{
uint64 fullsegs;
uint64 fullpages;
uint64 bytesleft;
uint32 seg_offset;
XLogRecPtr result;
fullsegs = bytepos / UsableBytesInSegment;
bytesleft = bytepos % UsableBytesInSegment;
if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD) {
/* fits on first page of segment */
if (bytesleft == 0) {
seg_offset = 0;
} else {
seg_offset = bytesleft + SizeOfXLogLongPHD;
}
} else {
/* account for the first page on segment with long header */
seg_offset = XLOG_BLCKSZ;
bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
fullpages = bytesleft / UsableBytesInPage;
bytesleft = bytesleft % UsableBytesInPage;
if (bytesleft == 0) {
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
} else {
seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
}
}
XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
return result;
}
/*
* Convert an XLogRecPtr to a "usable byte position".
*/
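/*
 * This is the inverse of XLogBytePosToRecPtr: the page-header bytes are
 * subtracted back out. E.g. a ptr on the first page of a segment at page
 * offset SizeOfXLogLongPHD + 10 maps back to
 * fullsegs * UsableBytesInSegment + 10, matching the worked example above.
 */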
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr)
{
uint64 fullsegs;
uint32 fullpages;
uint32 offset;
uint64 result;
XLByteToSeg(ptr, fullsegs);
fullpages = (ptr % XLogSegSize) / XLOG_BLCKSZ;
offset = ptr % XLOG_BLCKSZ;
if (fullpages == 0) {
result = fullsegs * UsableBytesInSegment;
if (offset > 0) {
Assert(offset >= SizeOfXLogLongPHD);
result += offset - SizeOfXLogLongPHD;
}
} else {
result = fullsegs * UsableBytesInSegment + (XLOG_BLCKSZ - SizeOfXLogLongPHD) /* account for first page */
+ (fullpages - 1) * UsableBytesInPage; /* full pages */
if (offset > 0) {
Assert(offset >= SizeOfXLogShortPHD);
result += offset - SizeOfXLogShortPHD;
}
}
return result;
}
/*
* XLogArchiveNotify
*
* Create an archive notification file
*
* The name of the notification file is the message that will be picked up
* by the archiver, e.g. we write 0000000100000001000000C6.ready
* and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
* then when complete, rename it to 0000000100000001000000C6.done
*/
static void XLogArchiveNotify(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
FILE *fd = NULL;
errno_t errorno = EOK;
/* insert an otherwise empty file called <XLOG>.ready */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
fd = AllocateFile(archiveStatusPath, "w");
if (fd == NULL) {
ereport(LOG, (errcode_for_file_access(),
errmsg("could not create archive status file \"%s\": %m", archiveStatusPath)));
return;
}
if (FreeFile(fd)) {
ereport(LOG, (errcode_for_file_access(),
errmsg("could not write archive status file \"%s\": %m", archiveStatusPath)));
fd = NULL;
return;
}
/* Notify archiver that it's got something to do */
if (IsUnderPostmaster) {
SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
}
}
/*
* Convenience routine to notify using segment number representation of filename
*/
static void XLogArchiveNotifySeg(XLogSegNo segno)
{
char xlog[MAXFNAMELEN];
errno_t errorno = EOK;
errorno = snprintf_s(xlog, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLogArchiveNotify(xlog);
}
/*
* XLogArchiveForceDone
*
* Emit notification forcibly that an XLOG segment file has been successfully
* archived, by creating <XLOG>.done regardless of whether <XLOG>.ready
* exists or not.
*/
void XLogArchiveForceDone(const char *xlog)
{
char archiveReady[MAXPGPATH];
char archiveDone[MAXPGPATH];
struct stat stat_buf;
FILE *fd = NULL;
errno_t errorno = EOK;
/* Exit if already known done */
errorno = snprintf_s(archiveDone, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveDone, &stat_buf) == 0) {
return;
}
/* If .ready exists, rename it to .done */
errorno = snprintf_s(archiveReady, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
if (stat(archiveReady, &stat_buf) == 0) {
(void)durable_rename(archiveReady, archiveDone, WARNING);
return;
}
/* insert an otherwise empty file called <XLOG>.done */
fd = AllocateFile(archiveDone, "w");
if (fd == NULL) {
ereport(LOG,
(errcode_for_file_access(), errmsg("could not create archive status file \"%s\": %m", archiveDone)));
return;
}
if (FreeFile(fd)) {
ereport(LOG,
(errcode_for_file_access(), errmsg("could not write archive status file \"%s\": %m", archiveDone)));
fd = NULL;
return;
}
}
/*
* XLogArchiveCheckDone
*
* This is called when we are ready to delete or recycle an old XLOG segment
* file or backup history file. If it is okay to delete it then return true.
* If it is not time to delete it, make sure a .ready file exists, and return
* false.
*
* If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
* then return false; else create <XLOG>.ready and return false.
*
* The reason we do things this way is so that if the original attempt to
* create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
*/
static bool XLogArchiveCheckDone(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
struct stat stat_buf;
errno_t errorno = EOK;
/* Always deletable if archiving is off or in recovery process. Archiving is always disabled on standbys. */
if (!XLogArchivingActive() || RecoveryInProgress()) {
return true;
}
/* First check for .done --- this means archiver is done with it */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
/* check for .ready --- this means archiver is still busy with it */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return false;
}
/* Race condition --- maybe archiver just finished, so recheck */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
/* Retry creation of the .ready file */
XLogArchiveNotify(xlog);
return false;
}
/*
* XLogArchiveIsBusy
*
* Check to see if an XLOG segment file is still unarchived.
* This is almost but not quite the inverse of XLogArchiveCheckDone: in
* the first place we aren't chartered to recreate the .ready file, and
* in the second place we should consider that if the file is already gone
* then it's not busy. (This check is needed to handle the race condition
* that a checkpoint already deleted the no-longer-needed file.)
*/
static bool XLogArchiveIsBusy(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
struct stat stat_buf;
errno_t errorno = EOK;
/* First check for .done --- this means archiver is done with it */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return false;
}
/* check for .ready --- this means archiver is still busy with it */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
/* Race condition --- maybe archiver just finished, so recheck */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return false;
}
/*
* Check to see if the WAL file has been removed by checkpoint, which
* implies it has already been archived, and explains why we can't see a
* status file for it.
*/
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%s", xlog);
securec_check_ss(errorno, "", "");
if (stat(archiveStatusPath, &stat_buf) != 0 && errno == ENOENT) {
return false;
}
return true;
}
/*
* XLogArchiveIsReady
*
* Check to see if an XLOG segment file has an archive notification (.ready)
* file.
*/
bool XLogArchiveIsReady(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
struct stat stat_buf;
errno_t rc = EOK;
rc = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(rc, "", "");
if (stat(archiveStatusPath, &stat_buf) == 0) {
return true;
}
return false;
}
/*
* XLogArchiveCleanup
*
* Cleanup archive notification file(s) for a particular xlog segment
*/
static void XLogArchiveCleanup(const char *xlog)
{
char archiveStatusPath[MAXPGPATH];
errno_t errorno = EOK;
/* Remove the .done file */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".done");
securec_check_ss(errorno, "", "");
unlink(archiveStatusPath);
/* Remove the .ready file if present --- normally it shouldn't be */
errorno = snprintf_s(archiveStatusPath, MAXPGPATH, MAXPGPATH - 1, "%s/%s%s", ARCHIVEDIR, xlog, ".ready");
securec_check_ss(errorno, "", "");
unlink(archiveStatusPath);
}
/*
* Initialize XLOG buffers, writing out old buffers if they still contain
* unwritten data, up to the page containing 'upto'. Or if 'opportunistic' is
* true, initialize as many pages as we can without having to write out
* unwritten data. Any new pages are initialized to zeros, with page headers
* initialized properly.
*/
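/*
 * Note: pages are initialized strictly in order starting from
 * XLogCtl->InitializedUpTo. Each page is re-zeroed and its header stamped
 * before xlblocks[] is updated behind a write barrier, so lock-free readers
 * in GetXLogBuffer() never observe a partially initialized page.
 */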
template <bool isGroupInsert>
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic, PGPROC *proc)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
int nextidx;
XLogRecPtr OldPageRqstPtr;
XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
XLogRecPtr NewPageBeginPtr;
XLogPageHeader NewPage;
int npages = 0;
errno_t errorno = EOK;
XLogwrtResult *LogwrtResultPtr = NULL;
TimeLineID xlogTimeLineID = 0;
#ifdef __aarch64__
if (isGroupInsert) {
LogwrtResultPtr = (XLogwrtResult *)proc->xlogGroupLogwrtResult;
xlogTimeLineID = proc->xlogGroupTimeLineID;
} else {
LogwrtResultPtr = t_thrd.xlog_cxt.LogwrtResult;
xlogTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
}
#else
LogwrtResultPtr = t_thrd.xlog_cxt.LogwrtResult;
xlogTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
#endif
LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
nextidx = (int)XLogRecPtrToBufIdx(t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo);
/*
* Get ending-offset of the buffer page we need to replace (this may
* be zero if the buffer hasn't been used yet). Fall through if it's
* already written out.
*/
OldPageRqstPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[nextidx];
/*
* Now that we have the lock, check if someone initialized the page
* already.
*/
while (upto >= OldPageRqstPtr) {
/* Now the next buffer slot is free and we can set it up to be the next output page. */
NewPageBeginPtr = t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo;
NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == (uint32)nextidx);
NewPage = (XLogPageHeader)(t_thrd.shemem_ptr_cxt.XLogCtl->pages + nextidx * (Size)XLOG_BLCKSZ);
/*
* Be sure to re-zero the buffer so that bytes beyond what we've
* written will look like zeroes and not valid XLOG records...
*/
errorno = memset_s((char *)NewPage, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(errorno, "", "");
/* Fill the new page's header */
NewPage->xlp_magic = XLOG_PAGE_MAGIC;
NewPage->xlp_tli = xlogTimeLineID;
NewPage->xlp_pageaddr = NewPageBeginPtr;
/*
* If online backup is not in progress, mark the header to indicate
* that WAL records beginning in this page have removable backup
* blocks. This allows the WAL archiver to know whether it is safe to
* compress archived WAL data by transforming full-block records into
* the non-full-block format. It is sufficient to record this at the
* page level because we force a page switch (in fact a segment switch)
* when starting a backup, so the flag will be off before any records
* can be written during the backup. At the end of a backup, the last
* page will be marked as all unsafe when perhaps only part is unsafe,
* but at worst the archiver would miss the opportunity to compress a
* few records.
*/
if (!Insert->forcePageWrites) {
NewPage->xlp_info |= XLP_BKP_REMOVABLE;
}
/* If first page of an XLOG segment file, make it a long header. */
if ((NewPage->xlp_pageaddr % XLogSegSize) == 0) {
XLogLongPageHeader NewLongPage = (XLogLongPageHeader)NewPage;
NewLongPage->xlp_sysid = t_thrd.shemem_ptr_cxt.ControlFile->system_identifier;
NewLongPage->xlp_seg_size = XLogSegSize;
NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
NewPage->xlp_info |= XLP_LONG_HEADER;
}
/*
* Make sure the initialization of the page becomes visible to others
* before the xlblocks update. GetXLogBuffer() reads xlblocks without
* holding a lock.
*/
pg_write_barrier();
*((volatile XLogRecPtr *)&t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo = NewPageEndPtr;
nextidx = (int)XLogRecPtrToBufIdx(t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo);
OldPageRqstPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[nextidx];
npages++;
}
LWLockRelease(WALBufMappingLock);
#ifdef WAL_DEBUG
if (npages > 0) {
ereport(DEBUG1, (errmsg("initialized %d pages, upto %X/%X", npages, (uint32)(NewPageEndPtr >> 32),
(uint32)NewPageEndPtr)));
}
#endif
}
template static void AdvanceXLInsertBuffer<true>(XLogRecPtr, bool, PGPROC *);
template static void AdvanceXLInsertBuffer<false>(XLogRecPtr, bool, PGPROC *);
/*
* Check whether we've consumed enough xlog space that a checkpoint is needed.
*
* new_segno indicates a log file that has just been filled up (or read
* during recovery). We measure the distance from RedoRecPtr to new_segno
* and see if that exceeds CheckPointSegments.
*
* Note: it is caller's responsibility that RedoRecPtr is up-to-date.
*/
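/*
 * In other words, a checkpoint becomes due once
 * new_segno >= old_segno + XLogSegmentsNum(CheckPointSegments) - 1, where
 * old_segno is the segment containing RedoRecPtr. Callers such as XLogWrite()
 * recheck after refreshing RedoRecPtr so they don't act on a stale redo pointer.
 */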
static bool XLogCheckpointNeeded(XLogSegNo new_segno)
{
XLogSegNo old_segno;
XLByteToSeg(t_thrd.xlog_cxt.RedoRecPtr, old_segno);
if (new_segno >= old_segno + ((uint32)XLogSegmentsNum(u_sess->attr.attr_storage.CheckPointSegments) - 1)) {
return true;
}
return false;
}
#ifndef ENABLE_MULTIPLE_NODES
/*
* Write the log to paxos storage at least as far as PaxosWriteRqst indicates.
* Must be called with WALPaxosWriteLock held.
*/
static bool XLogWritePaxos(XLogRecPtr PaxosWriteRqst)
{
int curridx;
uint32 offset;
Size WriteTotal;
Size CriticalNBytes;
Size nBytes;
Size XLogCacheSize;
XLogRecPtr WriteLSN;
/*
* Update local LogwrtResult (caller probably did this already, but...)
*/
*t_thrd.xlog_cxt.LogwrtPaxos = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtPaxos;
/* total size of xlog cache */
XLogCacheSize = (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck + 1) * XLOG_BLCKSZ;
while (XLByteLT(t_thrd.xlog_cxt.LogwrtPaxos->Write, PaxosWriteRqst)) {
curridx = XLogRecPtrToBufIdx(t_thrd.xlog_cxt.LogwrtPaxos->Write);
offset = t_thrd.xlog_cxt.LogwrtPaxos->Write % XLOG_BLCKSZ;
/* size from LogwrtPaxos->Write to PaxosWriteRqst */
WriteTotal = PaxosWriteRqst - t_thrd.xlog_cxt.LogwrtPaxos->Write;
/* size from Paxoswrite to end of cache */
CriticalNBytes = (t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck - curridx + 1) * XLOG_BLCKSZ - offset;
/*
* nBytes is the maximum size that we can send to paxos this time; we set it
* to the minimum of WriteTotal, MaxSendSizeBytes and CriticalNBytes. But when
* MaxSendSizeBytes is the minimum, it may cut a record in half, so we round
* back to a page boundary, mirroring stream replication's flow limit.
*/
nBytes = Min(WriteTotal, Min(MaxSendSizeBytes, CriticalNBytes));
if (nBytes != WriteTotal && nBytes != CriticalNBytes) {
XLogRecPtr tmpLsn = t_thrd.xlog_cxt.LogwrtPaxos->Write + MaxSendSizeBytes;
uint32 tmpoffset = tmpLsn % XLOG_BLCKSZ;
Assert(MaxSendSizeBytes > tmpoffset);
nBytes = MaxSendSizeBytes - tmpoffset;
ereport(
DEBUG1,
(errmsg("Flow limit! nBytes is %ld, WriteTotal is %ld, SendLimitBytes is %ld, CriticalNBytes is %ld",
nBytes, WriteTotal, MaxSendSizeBytes, CriticalNBytes)));
}
Assert(XLByteLE(nBytes, XLogCacheSize));
char *from = t_thrd.shemem_ptr_cxt.XLogCtl->pages + curridx * (Size)XLOG_BLCKSZ + offset;
WriteLSN = t_thrd.xlog_cxt.LogwrtPaxos->Write + nBytes;
unsigned long long paxosIdx = 0;
/* always try to write XLog data into DCF. */
while (dcf_write(1, from, nBytes, WriteLSN, &paxosIdx) != 0) {
ereport(WARNING, (errmsg("Failed writing to paxos: lsn = %ld, len = %ld, paxosIdx = %llu.", WriteLSN,
nBytes, paxosIdx)));
pg_usleep(5000);
}
ereport(DEBUG1,
(errmsg("Writed to paxos: lsn = %ld, len = %ld, paxosIdx = %llu.", WriteLSN, nBytes, paxosIdx)));
t_thrd.xlog_cxt.LogwrtPaxos->Write = WriteLSN;
}
/* Update shared-memory status */
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->LogwrtPaxos.Write = t_thrd.xlog_cxt.LogwrtPaxos->Write;
SpinLockRelease(&xlogctl->info_lck);
volatile DcfContextInfo *dcfCtx = t_thrd.dcf_cxt.dcfCtxInfo;
SpinLockAcquire(&dcfCtx->recordDcfIdxMutex);
/*
* XLog upto recordLsn has been flushed to Disk, now it's safe
* to set applied index corresponding to recordLsn in DCF, which is
* called in Consensus Log call back.
*/
if (XLByteLE(dcfCtx->recordLsn, xlogctl->LogwrtResult.Flush))
dcfCtx->isRecordIdxBlocked = false;
SpinLockRelease(&dcfCtx->recordDcfIdxMutex);
}
return true;
}
#endif
/*
* Write and/or fsync the log at least as far as WriteRqst indicates.
*
* If flexible == TRUE, we don't have to write as far as WriteRqst, but
* may stop at any convenient boundary (such as a cache or logfile boundary).
* This option allows us to avoid uselessly issuing multiple writes when a
* single one would do.
*
* If xlog_switch == TRUE, we are intending an xlog segment switch, so
* perform end-of-segment actions after writing the last page, even if
* it's not physically the end of its segment. (NB: this will work properly
* only if caller specifies WriteRqst == page-end and flexible == false,
* and there is some data to write.)
*
* Must be called with WALWriteLock held.
*/
static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible)
{
bool ispartialpage = false;
bool last_iteration = false;
bool finishing_seg = false;
bool use_existent = false;
int curridx = 0;
int npages = 0;
int startidx = 0;
uint32 startoffset = 0;
Size actualBytes;
XLogRecPtr lastWrited = InvalidXLogRecPtr;
/* We should always be inside a critical section here */
Assert(t_thrd.int_cxt.CritSectionCount > 0);
// Update local LogwrtResult (caller probably did this already, but...)
*t_thrd.xlog_cxt.LogwrtResult = t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult;
lastWrited = t_thrd.xlog_cxt.LogwrtResult->Write;
/*
* Since successive pages in the xlog cache are consecutively allocated,
* we can usually gather multiple pages together and issue just one
* write() call. npages is the number of pages we have determined can be
* written together; startidx is the cache block index of the first one,
* and startoffset is the file offset at which it should go. The latter
* two variables are only valid when npages > 0, but we must initialize
* all of them to keep the compiler quiet.
*/
npages = 0;
startidx = 0;
startoffset = 0;
/*
* Within the loop, curridx is the cache block index of the page to
* consider writing. We advance Write->curridx only after successfully
* writing pages. (Right now, this refinement is useless since we are
* going to PANIC if any error occurs anyway; but someday it may come in
* useful.)
*/
curridx = XLogRecPtrToBufIdx(t_thrd.xlog_cxt.LogwrtResult->Write);
while (XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, WriteRqst.Write)) {
/*
* Make sure we're not ahead of the insert process. This could happen
* if we're passed a bogus WriteRqst.Write that is past the end of the
* last page that's been initialized by AdvanceXLInsertBuffer.
*/
XLogRecPtr EndPtr = t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[curridx];
if (!XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, EndPtr))
ereport(PANIC,
(errmsg("xlog write request %X/%X is past end of log %X/%X",
(uint32)(t_thrd.xlog_cxt.LogwrtResult->Write >> 32),
(uint32)t_thrd.xlog_cxt.LogwrtResult->Write, (uint32)(EndPtr >> 32), (uint32)EndPtr)));
/* Advance LogwrtResult.Write to end of current buffer page */
t_thrd.xlog_cxt.LogwrtResult->Write = EndPtr;
ispartialpage = XLByteLT(WriteRqst.Write, t_thrd.xlog_cxt.LogwrtResult->Write);
if (!XLByteInPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo)) {
/*
* Switch to new logfile segment. We cannot have any pending
* pages here (since we dump what we have at segment end).
*/
Assert(npages == 0);
if (t_thrd.xlog_cxt.openLogFile >= 0) {
XLogFileClose();
}
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
/* create/use new log file */
use_existent = true;
t_thrd.xlog_cxt.openLogFile = XLogFileInit(t_thrd.xlog_cxt.openLogSegNo, &use_existent, true);
t_thrd.xlog_cxt.openLogOff = 0;
/*
* Unlock WalAuxiliary thread to init new xlog segment if we are running out
* of xlog segments.
*/
if (!use_existent) {
g_instance.wal_cxt.globalEndPosSegNo = t_thrd.xlog_cxt.openLogSegNo;
WakeupWalSemaphore(&g_instance.wal_cxt.walInitSegLock->l.sem);
}
}
/* Make sure we have the current logfile open */
if (t_thrd.xlog_cxt.openLogFile <= 0) {
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogFile = XLogFileOpen(t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogOff = 0;
}
/* Add current page to the set of pending pages-to-dump */
if (npages == 0) {
/* first of group */
startidx = curridx;
startoffset = (t_thrd.xlog_cxt.LogwrtResult->Write - XLOG_BLCKSZ) % XLogSegSize;
}
npages++;
/*
* Dump the set if this will be the last loop iteration, or if we are
* at the last page of the cache area (since the next page won't be
* contiguous in memory), or if we are at the end of the logfile
* segment.
*/
last_iteration = !XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Write, WriteRqst.Write);
finishing_seg = !ispartialpage && (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
if (last_iteration || curridx == t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck || finishing_seg) {
/* record elapsed time */
instr_time startTime;
instr_time endTime;
PgStat_Counter elapsedTime;
/* Need to seek in the file? */
if (t_thrd.xlog_cxt.openLogOff != startoffset) {
if (lseek(t_thrd.xlog_cxt.openLogFile, (off_t)startoffset, SEEK_SET) < 0) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.openLogSegNo),
startoffset)));
}
t_thrd.xlog_cxt.openLogOff = startoffset;
}
/* OK to write the page(s) */
char *from = t_thrd.shemem_ptr_cxt.XLogCtl->pages + startidx * (Size)XLOG_BLCKSZ;
Size nbytes = npages * (Size)XLOG_BLCKSZ;
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted &&
ispartialpage) {
/* avoid XLog divergence on DCF mode */
nbytes -= (EndPtr - WriteRqst.Write);
}
#endif
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_WRITE);
INSTR_TIME_SET_CURRENT(startTime);
actualBytes = write(t_thrd.xlog_cxt.openLogFile, from, nbytes);
INSTR_TIME_SET_CURRENT(endTime);
INSTR_TIME_SUBTRACT(endTime, startTime);
/* when track_activities and enable_instr_track_wait are on,
* elapsedTime can be replaced by beentry->waitInfo.event_info.duration.
*/
elapsedTime = (PgStat_Counter)INSTR_TIME_GET_MICROSEC(endTime);
pgstat_report_waitevent(WAIT_EVENT_END);
if (actualBytes != (Size)nbytes) {
/* if write didn't set errno, assume no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not write to log file %s at offset %u, length %lu: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.openLogSegNo),
t_thrd.xlog_cxt.openLogOff, (unsigned long)nbytes)));
}
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
++g_instance.wal_cxt.xlogFlushStats->writeTimes;
g_instance.wal_cxt.xlogFlushStats->totalActualXlogSyncBytes += actualBytes;
g_instance.wal_cxt.xlogFlushStats->totalWriteTime += elapsedTime;
}
reportRedoWrite((PgStat_Counter)npages, elapsedTime);
/* Update state for write */
t_thrd.xlog_cxt.openLogOff += nbytes;
npages = 0;
/* write a copy to the recovery dir */
SSOndemandXlogCopy(t_thrd.xlog_cxt.openLogSegNo, startoffset, from, nbytes);
/*
* If we just wrote the whole last page of a logfile segment,
* fsync the segment immediately. This avoids having to go back
* and re-open prior segments when an fsync request comes along
* later. Doing it here ensures that one and only one backend will
* perform this fsync.
*
* We also do this if this is the last page written for an xlog
* switch.
*
* This is also the right place to notify the Archiver that the
* segment is ready to copy to archival storage, and to update the
* timer for archive_timeout, and to signal for a checkpoint if
* too many logfile segments have been used since the last
* checkpoint.
*/
if (finishing_seg) {
instr_time startTime;
instr_time endTime;
uint64 elapsedTime;
INSTR_TIME_SET_CURRENT(startTime);
issue_xlog_fsync(t_thrd.xlog_cxt.openLogFile, t_thrd.xlog_cxt.openLogSegNo);
INSTR_TIME_SET_CURRENT(endTime);
INSTR_TIME_SUBTRACT(endTime, startTime);
elapsedTime = INSTR_TIME_GET_MICROSEC(endTime);
/* Add statistics */
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
++g_instance.wal_cxt.xlogFlushStats->syncTimes;
g_instance.wal_cxt.xlogFlushStats->totalSyncTime += elapsedTime;
}
/* signal that we need to wakeup walsenders later */
WalSndWakeupRequest();
t_thrd.xlog_cxt.LogwrtResult->Flush = t_thrd.xlog_cxt.LogwrtResult->Write; /* end of page */
AddShareStorageXLopCopyBackendWakeupRequest();
if (XLogArchivingActive()) {
XLogArchiveNotifySeg(t_thrd.xlog_cxt.openLogSegNo);
}
t_thrd.shemem_ptr_cxt.XLogCtl->lastSegSwitchTime = (pg_time_t)time(NULL);
/*
* Request a checkpoint if we've consumed too much xlog since
* the last one. For speed, we first check using the local
* copy of RedoRecPtr, which might be out of date; if it looks
* like a checkpoint is needed, forcibly update RedoRecPtr and
* recheck.
*/
if (IsUnderPostmaster && XLogCheckpointNeeded(t_thrd.xlog_cxt.openLogSegNo)) {
(void)GetRedoRecPtr();
if (XLogCheckpointNeeded(t_thrd.xlog_cxt.openLogSegNo)) {
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
}
}
if (ispartialpage) {
/* Only asked to write a partial page */
t_thrd.xlog_cxt.LogwrtResult->Write = WriteRqst.Write;
break;
}
curridx = NextBufIdx(curridx);
/* If flexible, break out of loop as soon as we wrote something */
if (flexible && npages == 0) {
break;
}
}
Assert(npages == 0);
// If asked to flush, do so
if (XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Flush, WriteRqst.Flush) &&
XLByteLT(t_thrd.xlog_cxt.LogwrtResult->Flush, t_thrd.xlog_cxt.LogwrtResult->Write)) {
/*
* Could get here without iterating above loop, in which case we might
* have no open file or the wrong one. However, we do not need to
* fsync more than one file.
*/
if (u_sess->attr.attr_storage.sync_method != SYNC_METHOD_OPEN &&
u_sess->attr.attr_storage.sync_method != SYNC_METHOD_OPEN_DSYNC) {
if (t_thrd.xlog_cxt.openLogFile >= 0 &&
!XLByteInPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo)) {
XLogFileClose();
}
if (t_thrd.xlog_cxt.openLogFile < 0) {
XLByteToPrevSeg(t_thrd.xlog_cxt.LogwrtResult->Write, t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogFile = XLogFileOpen(t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogOff = 0;
}
instr_time startTime;
instr_time endTime;
uint64 elapsedTime;
INSTR_TIME_SET_CURRENT(startTime);
issue_xlog_fsync(t_thrd.xlog_cxt.openLogFile, t_thrd.xlog_cxt.openLogSegNo);
INSTR_TIME_SET_CURRENT(endTime);
INSTR_TIME_SUBTRACT(endTime, startTime);
elapsedTime = INSTR_TIME_GET_MICROSEC(endTime);
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
++g_instance.wal_cxt.xlogFlushStats->syncTimes;
g_instance.wal_cxt.xlogFlushStats->totalSyncTime += elapsedTime;
}
}
/* signal that we need to wakeup walsenders later */
WalSndWakeupRequest();
t_thrd.xlog_cxt.LogwrtResult->Flush = t_thrd.xlog_cxt.LogwrtResult->Write;
AddShareStorageXLopCopyBackendWakeupRequest();
}
/*
* Update shared-memory status
*
* We make sure that the shared 'request' values do not fall behind the
* 'result' values. This is not absolutely essential, but it saves some
* code in a couple of places.
*/
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->LogwrtResult = *t_thrd.xlog_cxt.LogwrtResult;
if (XLByteLT(xlogctl->LogwrtRqst.Write, t_thrd.xlog_cxt.LogwrtResult->Write)) {
xlogctl->LogwrtRqst.Write = t_thrd.xlog_cxt.LogwrtResult->Write;
}
if (XLByteLT(xlogctl->LogwrtRqst.Flush, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
xlogctl->LogwrtRqst.Flush = t_thrd.xlog_cxt.LogwrtResult->Flush;
g_instance.comm_cxt.predo_cxt.redoPf.primary_flush_ptr = t_thrd.xlog_cxt.LogwrtResult->Flush;
}
SpinLockRelease(&xlogctl->info_lck);
}
if (g_instance.wal_cxt.xlogFlushStats->statSwitch) {
g_instance.wal_cxt.xlogFlushStats->totalXlogSyncBytes += (t_thrd.xlog_cxt.LogwrtResult->Write - lastWrited);
g_instance.wal_cxt.xlogFlushStats->currOpenXlogSegNo = t_thrd.xlog_cxt.openLogSegNo;
}
}
/*
* Record the LSN for an asynchronous transaction commit/abort
* and nudge the WALWriter if there is work for it to do.
* (This should not be called for synchronous commits.)
*/
void XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
{
XLogRecPtr WriteRqstPtr = asyncXactLSN;
bool sleeping = false;
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtResult = xlogctl->LogwrtResult;
sleeping = xlogctl->WalWriterSleeping;
if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN)) {
xlogctl->asyncXactLSN = asyncXactLSN;
}
SpinLockRelease(&xlogctl->info_lck);
/*
* If the WALWriter is sleeping, we should kick it to make it come out of
* low-power mode. Otherwise, determine whether there's a full page of
* WAL available to write.
*/
if (!sleeping) {
/* back off to last completed page boundary */
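/* e.g. assuming the default 8 KB XLOG_BLCKSZ, 0x12100 backs off to 0x12000 */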
WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
/* if we have already flushed that far, we're done */
if (XLByteLE(WriteRqstPtr, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
return;
}
}
/*
* Nudge the WALWriter: it has a full page of WAL to write, or we want it
* to come out of low-power mode so that this async commit will reach disk
* within the expected amount of time.
*/
if (g_instance.proc_base->walwriterLatch) {
SetLatch(g_instance.proc_base->walwriterLatch);
}
}
/*
* Record the LSN up to which we can remove WAL because it's not required by
* any replication slot.
*/
void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replicationSlotMinLSN = lsn;
SpinLockRelease(&xlogctl->info_lck);
if (!ENABLE_DMS && XLByteEQ(lsn, InvalidXLogRecPtr) && !IsInitdb && !RecoveryInProgress()
&& !AM_WAL_SENDER && !AM_WAL_DB_SENDER &&
strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
ereport(WARNING, (errmsg("replicationSlotMinLSN is InvalidXLogRecPtr!!!")));
}
}
/*
* Return the oldest LSN we must retain to satisfy the needs of some
* replication slot.
*/
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr retval;
SpinLockAcquire(&xlogctl->info_lck);
retval = xlogctl->replicationSlotMinLSN;
SpinLockRelease(&xlogctl->info_lck);
return retval;
}
/*
* To facilitate calling static function 'XLogGetReplicationSlotMinimumLSN()' in other files
*/
XLogRecPtr XLogGetReplicationSlotMinimumLSNByOther(void)
{
XLogRecPtr minlsn;
minlsn = XLogGetReplicationSlotMinimumLSN();
return minlsn;
}
/*
* Record the LSN up to which we can choose a startpoint for the dummy standby.
*/
void XLogSetReplicationSlotMaximumLSN(XLogRecPtr lsn)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replicationSlotMaxLSN = lsn;
SpinLockRelease(&xlogctl->info_lck);
if (!ENABLE_DMS && XLByteEQ(lsn, InvalidXLogRecPtr) && !IsInitdb && !RecoveryInProgress()
&& !AM_WAL_SENDER && !AM_WAL_DB_SENDER &&
strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
ereport(WARNING, (errmsg("replicationSlotMaxLSN is InvalidXLogRecPtr!!!")));
}
}
/*
* Return the latest LSN we must retain to satisfy the needs of some
* replication slot.
*/
XLogRecPtr XLogGetReplicationSlotMaximumLSN(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr retval;
SpinLockAcquire(&xlogctl->info_lck);
retval = xlogctl->replicationSlotMaxLSN;
SpinLockRelease(&xlogctl->info_lck);
return retval;
}
/*
* Reset the replication slot before promoting to primary.
* The caller is responsible for freeing slotname.
*/
static void ResetSlotLSNEndRecovery(StringInfo slotname)
{
int i = 0;
bool resetslot = false;
errno_t rc = 0;
ReplicationSlot *slot = NULL;
Assert(t_thrd.slot_cxt.ReplicationSlotCtl != NULL);
Assert(slotname != NULL);
if (slotname->len <= 0) {
return;
}
LWLockAcquire(ReplicationSlotControlLock, LW_EXCLUSIVE);
for (i = 0; i < g_instance.attr.attr_storage.max_replication_slots; i++) {
ReplicationSlot *s = &t_thrd.slot_cxt.ReplicationSlotCtl->replication_slots[i];
/* We got the valid peer slot, reset it */
if (s->in_use) {
SpinLockAcquire(&s->mutex);
if (s->data.isDummyStandby) {
s->data.restart_lsn = 0;
} else if (strncmp(NameStr(s->data.name), slotname->data, strlen(slotname->data)) == 0) {
s->data.restart_lsn = latestValidRecord;
resetslot = true;
}
s->just_dirtied = true;
s->dirty = true;
SpinLockRelease(&s->mutex);
}
}
/*
* If we cannot find a slot for PGXCNodeName, create one. This covers the
* scenario where a standby promotes (failover or switchover) to primary
* for the first time and pg_replslot is still empty.
*/
if (!resetslot) {
/* choose an invalid slot record, and set as peer slot */
for (i = 0; i < g_instance.attr.attr_storage.max_replication_slots; i++) {
ReplicationSlot *s = &t_thrd.slot_cxt.ReplicationSlotCtl->replication_slots[i];
SpinLockAcquire(&s->mutex);
if (!s->in_use) {
rc = strncpy_s(NameStr(s->data.name), NAMEDATALEN, slotname->data, NAMEDATALEN - 1);
securec_check(rc, "\0", "\0");
NameStr(s->data.name)[NAMEDATALEN - 1] = '\0';
s->data.restart_lsn = latestValidRecord;
SET_SLOT_PERSISTENCY(s->data, RS_PERSISTENT);
s->data.database = InvalidOid;
s->in_use = true;
s->just_dirtied = true;
s->dirty = true;
SpinLockRelease(&s->mutex);
slot = s;
resetslot = true;
break;
}
SpinLockRelease(&s->mutex);
}
}
LWLockRelease(ReplicationSlotControlLock);
if (!resetslot) {
ereport(WARNING,
(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
errmsg("could not reset replication slot at the end of recovery"),
errhint("Delete replication slot or increase g_instance.attr.attr_storage.max_replication_slots.")));
}
if (slot != NULL) {
CreateSlotOnDisk(slot);
}
CheckPointReplicationSlots();
ReplicationSlotsComputeRequiredLSN(NULL);
}
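/*
 * Emit a WARNING (with the relevant recovery-point LSNs) whenever
 * enable_update_max_page_flush_lsn has changed since the last call, then
 * remember the new value.
 */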
void inline XlogUpMinRecovPointSwitchChk(XLogRecPtr lsn)
{
if (g_instance.comm_cxt.predo_cxt.pre_enable_switch !=
g_instance.attr.attr_storage.enable_update_max_page_flush_lsn) {
ereport(WARNING, (errmsg("XlogUpMinRecovPointSwitchChk lsn %X/%X, local minRecoveryPoint %X/%X, "
"ControlFile minRecoveryPoint %X/%X, updateMinRecoveryPointSwitch:%u change to :%u",
(uint32)(lsn >> 32), (uint32)lsn, (uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32),
(uint32)t_thrd.xlog_cxt.minRecoveryPoint,
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint >> 32),
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint),
g_instance.comm_cxt.predo_cxt.pre_enable_switch,
g_instance.attr.attr_storage.enable_update_max_page_flush_lsn)));
g_instance.comm_cxt.predo_cxt.pre_enable_switch = g_instance.attr.attr_storage.enable_update_max_page_flush_lsn;
}
}
/*
* Advance minRecoveryPoint in control file.
*
* If we crash during recovery, we must reach this point again before the
* database is consistent.
*
* If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
* is only updated if it's not already greater than or equal to 'lsn'.
*/
void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
/* Quick check using our local copy of the variable */
if (!t_thrd.xlog_cxt.updateMinRecoveryPoint || (!force && XLByteLE(lsn, t_thrd.xlog_cxt.minRecoveryPoint))) {
return;
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
/* update local copy */
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
/*
* An invalid minRecoveryPoint means that we need to recover all the WAL,
* i.e., we're doing crash recovery. We never modify the control file's
* value in that case, so we can short-circuit future checks here too.
*/
if (t_thrd.xlog_cxt.minRecoveryPoint == 0) {
t_thrd.xlog_cxt.updateMinRecoveryPoint = false;
} else if (force || XLByteLT(t_thrd.xlog_cxt.minRecoveryPoint, lsn)) {
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr newMinRecoveryPoint;
/*
* To avoid having to update the control file too often, we update it
* all the way to the last record being replayed, even though 'lsn'
* would suffice for correctness. This also allows the 'force' case
* to not need a valid 'lsn' value.
*
* Another important reason for doing it this way is that the passed
* 'lsn' value could be bogus, i.e., past the end of available WAL, if
* the caller got it from a corrupted heap page. Accepting such a
* value as the min recovery point would prevent us from coming up at
* all. Instead, we just log a warning and continue with recovery.
* (See also the comments about corrupt LSNs in XLogFlush.)
*/
SpinLockAcquire(&xlogctl->info_lck);
newMinRecoveryPoint = xlogctl->lastReplayedEndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
if (!force && XLByteLT(newMinRecoveryPoint, lsn) && !enable_heap_bcm_data_replication()) {
ereport(DEBUG1, (errmsg("xlog min recovery request %X/%X is past current point %X/%X", (uint32)(lsn >> 32),
(uint32)lsn, (uint32)(newMinRecoveryPoint >> 32), (uint32)newMinRecoveryPoint)));
}
/* update control file */
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, newMinRecoveryPoint)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = newMinRecoveryPoint;
UpdateControlFile();
t_thrd.xlog_cxt.minRecoveryPoint = newMinRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
ereport(DEBUG1,
(errmsg("updated min recovery point to %X/%X", (uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32),
(uint32)t_thrd.xlog_cxt.minRecoveryPoint)));
}
}
LWLockRelease(ControlFileLock);
}
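/*
 * Wait until WAL is flushed to disk at least up to recptr: flush it ourselves
 * via XLogSelfFlush() when the WAL writer is down, otherwise sleep on the
 * walFlushWaitLock semaphore until the WAL writer publishes a flushResult
 * that covers recptr. During recovery this only advances minRecoveryPoint;
 * see the comment inside.
 */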
void XLogWaitFlush(XLogRecPtr recptr)
{
/*
* During REDO, we are reading WAL, not writing it.
*
* Test XLogInsertAllowed() instead of RecoveryInProgress(). XLogInsertAllowed()
* allows WAL writing at the end of recovery via a thread-local flag; when
* that flag is not set, it falls back to RecoveryInProgress().
*
* The checkpointer relies on this as well: when it writes the
* end-of-recovery checkpoint, this function should indeed flush.
*
* During recovery, if the calling thread has not enabled XLogInsert, only
* update minRecoveryPoint and return.
*/
if (!XLogInsertAllowed()) {
if (g_supportHotStandby) {
UpdateMinRecoveryPoint(recptr, false);
}
return;
}
if (!SSXLogInsertAllowed()) {
return;
}
volatile XLogRecPtr flushTo = gs_compare_and_swap_u64(&g_instance.wal_cxt.flushResult, 0, 0);
while (XLByteLT(flushTo, recptr)) {
if (!g_instance.wal_cxt.isWalWriterUp) {
XLogSelfFlush();
} else if (LWLockAcquireOrWait(g_instance.wal_cxt.walFlushWaitLock->l.lock, LW_EXCLUSIVE)) {
PGSemaphoreLock(&g_instance.wal_cxt.walFlushWaitLock->l.sem, true);
LWLockRelease(g_instance.wal_cxt.walFlushWaitLock->l.lock);
}
flushTo = pg_atomic_barrier_read_u64(&g_instance.wal_cxt.flushResult);
}
return;
}
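/*
 * Wait until the WAL buffer pages covering recptr have been initialized,
 * i.e. until sentResult advances to at least recptr, self-flushing if the
 * WAL writer is not running.
 */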
void XLogWaitBufferInit(XLogRecPtr recptr)
{
volatile XLogRecPtr sentTo = gs_compare_and_swap_u64(&g_instance.wal_cxt.sentResult, 0, 0);
while (XLByteLT(sentTo, recptr)) {
if (!g_instance.wal_cxt.isWalWriterUp) {
XLogSelfFlush();
} else if (LWLockAcquireOrWait(g_instance.wal_cxt.walBufferInitWaitLock->l.lock, LW_EXCLUSIVE)) {
PGSemaphoreLock(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem, true);
LWLockRelease(g_instance.wal_cxt.walBufferInitWaitLock->l.lock);
}
sentTo = gs_compare_and_swap_u64(&g_instance.wal_cxt.sentResult, 0, 0);
}
return;
}
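/*
 * Write and fsync WAL up to writeRqstPtr, then publish the new flush position
 * in g_instance.wal_cxt.flushResult and wake up walsenders and any committers
 * waiting on the flush semaphore.
 */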
static void XLogFlushCore(XLogRecPtr writeRqstPtr)
{
START_CRIT_SECTION();
XLogwrtRqst WriteRqst;
WriteRqst.Write = writeRqstPtr;
WriteRqst.Flush = writeRqstPtr;
XLogWrite(WriteRqst, false);
END_CRIT_SECTION();
pg_memory_barrier();
/* wake up walsenders now that we've released heavily contended locks */
WalSndWakeupProcessRequests();
SendShareStorageXLogCopyBackendWakeupRequest();
/*
* Wake up waiting commit threads to check on the latest LSN flushed.
*/
(void)pg_atomic_exchange_u64((uint64 *)&g_instance.wal_cxt.flushResult,
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult.Flush);
/*
* Notify commit agents to wake up and check the latest flushResult
*/
WakeupWalSemaphore(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
}
/*
* Flush xlog, but without specifying exactly where to flush to.
*
* We normally flush only completed blocks; but if there is nothing to do on
* that basis, we check for unflushed async commits in the current incomplete
* block, and flush through the latest one of those. Thus, if async commits
* are not being used, we will flush complete blocks only. We can guarantee
* that async commits reach disk after at most three cycles; normally only
* one or two. (When flushing complete blocks, we allow XLogWrite to write
* "flexibly", meaning it can stop at the end of the buffer ring; this makes a
* difference only with very high load or long wal_writer_delay, but imposes
* one extra cycle for the worst case for async commits.)
*
* This routine is invoked periodically by the background walwriter process.
*
* Returns TRUE if we flushed anything.
*/
bool XLogBackgroundFlush(void)
{
XLogRecPtr WriteRqstPtr = InvalidXLogRecPtr;
XLogRecPtr InitializeRqstPtr = InvalidXLogRecPtr;
int start_entry_idx, curr_entry_idx, next_entry_idx, entry_idx;
volatile WALInsertStatusEntry *start_entry_ptr = NULL;
volatile WALInsertStatusEntry *curr_entry_ptr = NULL;
volatile WALInsertStatusEntry *next_entry_ptr = NULL;
#ifndef ENABLE_MULTIPLE_NODES
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr consensus_lsn = InvalidXLogRecPtr;
int paxos_entry_idx = -1;
#endif
volatile WALInsertStatusEntry *status_entry_ptr = NULL;
uint32 entry_count = 0;
/*
* Entry #: g_instance.wal_cxt.walInsertStatusTable
* +---------------+--------+-------+
* 0: | | | |
* +---------------+--------+-------+
* ......
* +---------------+--------+-------+
* *lastWalStatusEntry+1: | | |COPIED | <--- while loop starts here until it
* +---------------+--------+-------+ hits a hole or an older LRC
* ...... COPIED
* +---------------+--------+-------+
* prev_entry: | endLSN(N-1) |LRC(N-1)|COPIED | <== curr_entry_ptr <--- flush till here
* +---------------+--------+-------+
* curr_entry_idx: | endLSN(N) |LRC(N) |NOT_CP | <== next_entry_ptr
* +---------------+--------+-------+
* next_entry_idx: | endLSN(N+1) |LRC(N+1)|COPIED |
* +---------------+--------+-------+
* ......
* +---------------+--------+-------+
* WAL_INSERT_STATUS_ENTRIES:| | | |
* - 1 +---------------+--------+-------+
*
*/
start_entry_idx = GET_NEXT_STATUS_ENTRY(g_instance.attr.attr_storage.wal_insert_status_entries_power,
g_instance.wal_cxt.lastWalStatusEntryFlushed);
start_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(start_entry_idx)];
/*
* Bail out if the very first entry is not copied.
*/
if (start_entry_ptr->status != WAL_COPIED) {
return false;
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLE((XLogRecPtr)start_entry_ptr->endLSN, consensus_lsn))
paxos_entry_idx = start_entry_idx;
}
#endif
next_entry_idx = start_entry_idx;
next_entry_ptr = start_entry_ptr;
uint64 stTime = GetCurrentTimestamp();
uint64 startLSN = start_entry_ptr->endLSN;
uint64 totalXlogIterBytes = g_instance.wal_cxt.totalXlogIterBytes;
uint64 totalXlogIterTimes = g_instance.wal_cxt.totalXlogIterTimes;
// Calculate the current actual xlog flush length, used to judge whether we
// should sleep to accumulate more xlog when the first uncopied record is
// found in the current loop.
uint64 averageXlogFlushBytes = (totalXlogIterTimes == 0) ? 0 : totalXlogIterBytes / totalXlogIterTimes;
uint64 curAverageXlogFlushBytes = (averageXlogFlushBytes == 0) ? XLOG_FLUSH_SIZE_INIT :
(averageXlogFlushBytes / PAGE_SIZE_BYTES + 1) * PAGE_SIZE_BYTES;
do {
curr_entry_ptr = next_entry_ptr;
curr_entry_idx = next_entry_idx;
next_entry_idx = GET_NEXT_STATUS_ENTRY(g_instance.attr.attr_storage.wal_insert_status_entries_power,
curr_entry_idx);
next_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(next_entry_idx)];
entry_count++;
Assert(curr_entry_ptr->status == WAL_COPIED);
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLE((XLogRecPtr)curr_entry_ptr->endLSN, consensus_lsn))
paxos_entry_idx = curr_entry_idx;
}
#endif
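/*
 * Stop scanning when the next slot's LRC is not the immediate successor
 * (mod 2^31) of the current one; in that case we have hit a hole or an
 * entry whose LRC belongs to a different flush generation.
 */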
if (((curr_entry_ptr->LRC + 1) & 0x7FFFFFFF) != next_entry_ptr->LRC) {
break;
}
/*
* Flush once we have accumulated enough bytes, or flush up to the LSN in the
* entry just before the first uncopied record found in the current loop.
*/
if (next_entry_ptr->status == WAL_NOT_COPIED) {
if (((curr_entry_ptr->endLSN - startLSN) > curAverageXlogFlushBytes) ||
(GetCurrentTimestamp() - stTime >= (uint64)g_instance.attr.attr_storage.wal_flush_timeout)) {
break;
}
pg_usleep(g_instance.attr.attr_storage.wal_flush_delay);
next_entry_idx = curr_entry_idx;
next_entry_ptr = curr_entry_ptr;
entry_count--;
}
} while (true);
/* update continuous LRC entries that have been copied without a hole */
g_instance.wal_cxt.lastLRCScanned = curr_entry_ptr->LRC;
/*
 * During REDO, we are reading not writing WAL.
 *
 * Test XLogInsertAllowed(), not RecoveryInProgress(). XLogInsertAllowed()
 * permits WAL writing at the end of recovery via a thread-local flag;
 * when that flag is unset, the answer is determined by RecoveryInProgress().
 *
 * The checkpointer relies on this as well: when it writes the
 * end-of-recovery checkpoint, this function should indeed flush.
 */
if (!XLogInsertAllowed()) {
return false;
}
if (!SSXLogInsertAllowed()) {
return false;
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
WriteRqstPtr = (XLogRecPtr)pg_atomic_barrier_read_u64(&curr_entry_ptr->endLSN);
LWLockAcquire(WALWritePaxosLock, LW_EXCLUSIVE);
XLogWritePaxos(WriteRqstPtr);
LWLockRelease(WALWritePaxosLock);
/* return if no wal entry reaches dcf consensus */
if (paxos_entry_idx == -1) {
return false;
} else {
/* The XLog LSN on disk can't be greater than the consensus LSN of DCF, or XLog divergence will occur */
curr_entry_idx = paxos_entry_idx;
curr_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(paxos_entry_idx)];
}
}
#endif
pg_read_barrier();
WriteRqstPtr = (XLogRecPtr)pg_atomic_barrier_read_u64(&curr_entry_ptr->endLSN);
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG) {
ereport(LOG, (errmsg("xlog bg flush request %X/%X; write %X/%X; flush %X/%X", (uint32)(WriteRqstPtr >> 32),
(uint32)WriteRqstPtr, (uint32)(t_thrd.xlog_cxt.LogwrtResult->Write >> 32),
(uint32)t_thrd.xlog_cxt.LogwrtResult->Write,
(uint32)(t_thrd.xlog_cxt.LogwrtResult->Flush >> 32),
(uint32)t_thrd.xlog_cxt.LogwrtResult->Flush)));
}
#endif
XLogFlushCore(WriteRqstPtr);
#ifndef ENABLE_MULTIPLE_NODES
#ifdef USE_ASSERT_CHECKING
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
Assert(XLByteLE(WriteRqstPtr, consensus_lsn));
}
#endif
#endif
/* update flush position */
g_instance.wal_cxt.lastWalStatusEntryFlushed = curr_entry_idx;
g_instance.wal_cxt.lastLRCFlushed = curr_entry_ptr->LRC;
g_instance.wal_cxt.totalXlogIterTimes++;
g_instance.wal_cxt.totalXlogIterBytes += (curr_entry_ptr->endLSN - startLSN);
/*
 * Update the LRC and status in the status entries that have just been flushed
 * (from the start entry through the current entry). Every WAL insert thread
 * must examine both the LRC and the status fields before copying its endLSN
 * into an entry and setting the status to WAL_COPIED. This ensures the thread
 * truly owns the entry, i.e., the writer thread has finished updating it.
 */
entry_idx = start_entry_idx - 1;
do {
/*
* Increment entry_idx by 1 and wrap the result around WAL_INSERT_STATUS_ENTRIES.
*/
entry_idx = GET_NEXT_STATUS_ENTRY(g_instance.attr.attr_storage.wal_insert_status_entries_power, entry_idx);
status_entry_ptr = &g_instance.wal_cxt.walInsertStatusTable[GET_STATUS_ENTRY_INDEX(entry_idx)];
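/*
 * Re-arm the slot for its next generation by advancing its LRC by the table
 * size. Illustrative example (assuming the macro yields the entry count,
 * 1 << power, e.g. 4096): a slot flushed with LRC 5 is re-armed with
 * LRC 5 + 4096 = 4101, the value the next copier wrapping onto this slot
 * must present; the & 0x7FFFFFFF keeps the counter in the positive int32 range.
 */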
status_entry_ptr->LRC =
(status_entry_ptr->LRC +
GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power)) &
0x7FFFFFFF;
/* Reset the status to WAL_NOT_COPIED for the flushed entries. */
pg_write_barrier();
status_entry_ptr->status = WAL_NOT_COPIED;
} while (entry_idx != curr_entry_idx);
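/*
 * Round the flushed position down to an XLOG page boundary and, if the
 * buffer-initialization horizon lags behind it, advance the horizon and
 * wake any copiers waiting for freshly initialized WAL buffer pages.
 */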
InitializeRqstPtr = g_instance.wal_cxt.flushResult - g_instance.wal_cxt.flushResult % XLOG_BLCKSZ;
if (InitializeRqstPtr != InvalidXLogRecPtr && XLByteLT(g_instance.wal_cxt.sentResult, InitializeRqstPtr)) {
AdvanceXLInsertBuffer<false>(InitializeRqstPtr, false);
(void)pg_atomic_exchange_u64((uint64 *)&g_instance.wal_cxt.sentResult, InitializeRqstPtr);
WakeupWalSemaphore(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
}
return true;
}
/*
* This function calls XLogBackgroundFlush ONCE when no one else is
* doing it.
* The caller should invoke this function in a loop to make sure the
* LSN it is waiting on has indeed been flushed.
*/
static void XLogSelfFlush(void)
{
/*
* The current callers of this function all check whether
* the WAL writer is present, but as a generic function,
* it should also do the check itself.
*/
if (g_instance.wal_cxt.isWalWriterUp) {
return;
}
/*
* We only try self-flush when nobody else is already doing it
*/
if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE)) {
XLogBackgroundFlush();
LWLockRelease(WALWriteLock);
}
return;
}
/*
* This function is called by CopyXLogRecordToWAL or CopyXLogRecordToWALForGroup
* when large records ( > wal buffer size ) are copied to WAL.
*
* While copying such a large record, we will wrap around the wal buffer and
* access the page the large record starts from at least twice. When an agent
* hits the start page again, the agent needs to initiate a full wal buffer
* flush or it will get stuck: it has no more buffer pages to use, but the
* wal writer cannot flush on this agent's behalf because the wal writer only
* flushes when a status entry is updated, and this agent cannot update its
* status entry before it finishes copying its record.
*
* This function should not change the wal copy status entries.
*
* numHitsOnStartPage - the 2nd hit on the start page needs to wait for all
* potential concurrent flushes to finish, while the
* subsequent hits on the start page don't
* currPos - the requested LSN
* currLRC - needed to calculate the previous LRC whose flush we must wait for
*/
static void XLogSelfFlushWithoutStatus(int numHitsOnStartPage, XLogRecPtr currPos, int currLRC)
{
/*
* We only need to wait for the wal writer's or other agents'
* flushes (when the wal writer is absent) to finish when we hit
* the start page the 2nd time within one WAL copy, because
* after that the wal flush is blocked by us.
*/
if (numHitsOnStartPage == 2) {
int prevLRC = (currLRC - 1) & 0x7FFFFFFF;
while (g_instance.wal_cxt.lastLRCFlushed != prevLRC) {
if (!g_instance.wal_cxt.isWalWriterUp) {
XLogSelfFlush();
}
pg_usleep(1);
}
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr consensus_lsn = InvalidXLogRecPtr;
LWLockAcquire(WALWritePaxosLock, LW_EXCLUSIVE);
XLogWritePaxos(currPos);
LWLockRelease(WALWritePaxosLock);
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
/* The XLog LSN on disk can't be greater than the consensus LSN of DCF */
while (XLByteLT(consensus_lsn, currPos)) {
pg_usleep(1);
SpinLockAcquire(&xlogctl->info_lck);
consensus_lsn = xlogctl->LogwrtPaxos.Consensus;
SpinLockRelease(&xlogctl->info_lck);
}
}
#endif
/*
* At this point, we are the one holding up the wal writer or
* anyone else that does the flush, but we still take the write
* lock to be safe, since acquiring an uncontended lock barely
* hurts performance here.
*/
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
XLogFlushCore(currPos);
LWLockRelease(WALWriteLock);
XLogRecPtr initializeRqstPtr = currPos - currPos % XLOG_BLCKSZ;
AdvanceXLInsertBuffer<false>(initializeRqstPtr, false);
(void)pg_atomic_exchange_u64((uint64 *)&g_instance.wal_cxt.sentResult, initializeRqstPtr);
WakeupWalSemaphore(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
}
/*
* Test whether XLOG data has been flushed up to (at least) the given position.
*
* Returns true if a flush is still needed. (It may be that someone else
* is already in process of flushing that far, however.)
*/
bool XLogNeedsFlush(XLogRecPtr record)
{
/*
* During recovery, we don't flush WAL but update minRecoveryPoint
* instead. So "needs flush" is taken to mean whether minRecoveryPoint
* would need to be updated.
*/
if (RecoveryInProgress()) {
/* Quick exit if already known updated */
if (XLByteLE(record, t_thrd.xlog_cxt.minRecoveryPoint) || !t_thrd.xlog_cxt.updateMinRecoveryPoint) {
return false;
}
/*
* Update local copy of minRecoveryPoint. But if the lock is busy,
* just return a conservative guess.
*/
if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED)) {
return true;
}
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
LWLockRelease(ControlFileLock);
/*
* An invalid minRecoveryPoint means that we need to recover all the
* WAL, i.e., we're doing crash recovery. We never modify the control
* file's value in that case, so we can short-circuit future checks
* here too.
*/
if (t_thrd.xlog_cxt.minRecoveryPoint == 0) {
t_thrd.xlog_cxt.updateMinRecoveryPoint = false;
}
/* check again */
if (XLByteLE(record, t_thrd.xlog_cxt.minRecoveryPoint) || !t_thrd.xlog_cxt.updateMinRecoveryPoint) {
return false;
} else {
return true;
}
}
/* Quick exit if already known flushed */
if (XLByteLE(record, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
return false;
}
/* read LogwrtResult and update local state */
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtResult = xlogctl->LogwrtResult;
SpinLockRelease(&xlogctl->info_lck);
}
/* check again */
if (XLByteLE(record, t_thrd.xlog_cxt.LogwrtResult->Flush)) {
return false;
}
return true;
}
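/*
 * Illustrative caller pattern: skip the more expensive flush path when the
 * record is already known durable, e.g.
 *
 *     if (XLogNeedsFlush(lsn))
 *         XLogFlush(lsn);
 */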
/*
* Create a new XLOG file segment, or open a pre-existing one.
*
* log, seg: identify segment to be created/opened.
*
* *use_existent: if TRUE, OK to use a pre-existing file (else, any
* pre-existing file will be deleted). On return, TRUE if a pre-existing
* file was used.
*
* use_lock: if TRUE, acquire ControlFileLock while moving file into
* place. This should be TRUE except during bootstrap log creation. The
* caller must *not* hold the lock at call.
*
* Returns FD of opened file.
*
* Note: errors here are ERROR not PANIC because we might or might not be
* inside a critical section (eg, during checkpoint there is no reason to
* take down the system on failure). They will promote to PANIC if we are
* in a critical section.
*/
static int XLogFileInitInternal(XLogSegNo logsegno, bool *use_existent, bool use_lock, const char *xlog_dir)
{
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
char *zbuffer = NULL;
char zbuffer_raw[XLOG_BLCKSZ + ALIGNOF_BUFFER];
XLogSegNo installed_segno;
int max_advance;
int fd;
int nbytes;
errno_t rc = EOK;
gstrace_entry(GS_TRC_ID_XLogFileInit);
rc = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((logsegno) / XLogSegmentsPerXLogId), (uint32)((logsegno) % XLogSegmentsPerXLogId));
securec_check_ss(rc, "", "");
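/*
 * Example (illustrative): timeline 1 and logsegno 0x2A (assuming it is
 * below XLogSegmentsPerXLogId) produce "00000001000000000000002A" under
 * xlog_dir.
 */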
// Try to use existent file (checkpoint maker may have created it already)
if (*use_existent) {
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
S_IRUSR | S_IWUSR);
if (fd < 0) {
if (!FILE_POSSIBLY_DELETED(errno)) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open file \"%s\" (log segment %s): %m", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, logsegno))));
}
} else {
gstrace_exit(GS_TRC_ID_XLogFileInit);
return fd;
}
}
if (!SSXLogInsertAllowed()) {
ereport(FATAL, (errmsg("SS standby cannot init xlog files due to DSS")));
}
/*
* Initialize an empty (all zeroes) segment. NOTE: it is possible that
* another process is doing the same thing. If so, we will end up
* pre-creating an extra log segment. That seems OK, and better than
* holding the lock throughout this lengthy process.
*/
ereport(DEBUG2, (errmsg("creating and filling new WAL file")));
rc = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, "%s/xlogtemp.%lu", xlog_dir, gs_thread_self());
securec_check_ss(rc, "\0", "\0");
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath)));
}
/*
* Zero-fill the file. We have to do this the hard way to ensure that all
* the file space has really been allocated --- on platforms that allow
* "holes" in files, just seeking to the end doesn't allocate intermediate
* space. This way, we know that we have all the space and (after the
* fsync below) that all the indirect blocks are down on disk. Therefore,
* fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
* log file.
*
* Note: ensure the buffer is reasonably well-aligned; this may save a few
* cycles transferring data to the kernel.
*/
zbuffer = (char *)BUFFERALIGN(zbuffer_raw);
rc = memset_s(zbuffer, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(rc, "\0", "\0");
if (is_dss_fd(fd)) {
/* extend file and fill space at once to avoid performance issue */
pgstat_report_waitevent(WAIT_EVENT_WAL_INIT_WRITE);
errno = 0;
if (ftruncate(fd, XLogSegSize) != 0) {
int save_errno = errno;
close(fd);
unlink(tmppath);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
} else {
for (nbytes = 0; (uint32)nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ) {
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_INIT_WRITE);
if ((int)write(fd, zbuffer, XLOG_BLCKSZ) != (int)XLOG_BLCKSZ) {
int save_errno = errno;
// If we fail to make the file, delete it to release disk space
close(fd);
unlink(tmppath);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
}
pgstat_report_waitevent(WAIT_EVENT_WAL_INIT_SYNC);
if (pg_fsync(fd) != 0) {
int save_errno = errno;
close(fd);
errno = save_errno;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(fd)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tmppath)));
}
/*
* Now move the segment into place with its final name.
*
* If caller didn't want to use a pre-existing file, get rid of any
* pre-existing file. Otherwise, cope with possibility that someone else
* has created the file while we were filling ours: if so, use ours to
* pre-create a future log segment.
*/
installed_segno = logsegno;
max_advance = XLOGfileslop;
if (!InstallXLogFileSegment(&installed_segno, (const char *)tmppath, *use_existent, &max_advance,
use_lock, xlog_dir)) {
/*
* No need for any more future segments, or InstallXLogFileSegment()
* failed to rename the file into place. If the rename failed, opening
* the file below will fail.
*/
unlink(tmppath);
}
/* Set flag to tell caller there was no existent file */
*use_existent = false;
/* Now open original target segment (might not be file I just made) */
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\" (log segment %s): %m", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, logsegno))));
}
ereport(DEBUG2, (errmsg("done creating and filling new WAL file:%s", path)));
gstrace_exit(GS_TRC_ID_XLogFileInit);
return fd;
}
int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
{
return XLogFileInitInternal(logsegno, use_existent, use_lock, SS_XLOGDIR);
}
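/*
 * Zero the tail of a page buffer: keep the first (cpyLen % bufLen) bytes,
 * i.e. everything before the truncation point within the page, and clear
 * the rest so no stale record bytes survive past the cut.
 */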
void XLogFileCutPage(char *buffer, uint32 bufLen, uint32 cpyLen)
{
if (bufLen != 0) {
uint32 offsetLen = cpyLen % bufLen;
uint32 truncateLen = bufLen - offsetLen;
errno_t rc = EOK;
ereport(LOG, (errcode_for_file_access(),
errmsg("XLogFileCutPage begin, bufLen:%u, cpyLen:%u, offsetLen:%u, truncateLen:%u", bufLen,
cpyLen, offsetLen, truncateLen)));
rc = memset_s((buffer + offsetLen), truncateLen, 0, truncateLen);
securec_check(rc, "\0", "\0");
}
}
bool PreInitXlogFileForStandby(XLogRecPtr requestLsn)
{
/* The last processed xlog lsn. */
static XLogRecPtr lastWriteLsn = 0;
/* The next segment to be created. */
static XLogSegNo nextSegNo = 0;
if (requestLsn < lastWriteLsn) { /* switch over. need reset */
ereport(WARNING, (errmsg("PreInitXlogFileForStandby need reset. requestLsn: %X/%X; lastWriteLsn: %X/%X",
(uint32)(requestLsn >> 32), (uint32)requestLsn, (uint32)(lastWriteLsn >> 32),
(uint32)lastWriteLsn)));
lastWriteLsn = requestLsn;
nextSegNo = 0;
} else if (requestLsn < lastWriteLsn + XLogSegSize) {
/* If requestLsn has advanced less than one segment, skip! */
return false;
} else {
lastWriteLsn = requestLsn;
}
/* Start to pre-init xlog segment. */
XLogSegNo reqSegNo;
XLByteToPrevSeg(requestLsn, reqSegNo);
/* If the requested position is further ahead, start from the segment following requestLsn. */
if (nextSegNo <= reqSegNo) {
nextSegNo = reqSegNo + 1;
}
XLogSegNo targetSegNo = reqSegNo + g_instance.attr.attr_storage.advance_xlog_file_num;
for (; nextSegNo <= targetSegNo; nextSegNo++) {
bool use_existent = true;
int lf = XLogFileInit(nextSegNo, &use_existent, true);
if (!use_existent)
ereport(DEBUG1,
(errmsg("PreInitXlogFileForStandby new nextSegNo: %X/%X ThisTimeLineID: %u, RecoveryTargetTLI: %u",
(uint32)(nextSegNo >> 32), (uint32)nextSegNo, t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID,
t_thrd.shemem_ptr_cxt.XLogCtl->RecoveryTargetTLI)));
if (lf >= 0)
close(lf);
}
return true;
}
void PreInitXlogFileForPrimary(int advance_xlog_file_num)
{
XLogSegNo startSegNo, nextSegNo, target;
int lf;
bool use_existent;
g_xlog_stat_shared->walAuxWakeNum++;
if (g_instance.wal_cxt.globalEndPosSegNo == InvalidXLogSegPtr) {
return;
}
startSegNo = g_instance.wal_cxt.globalEndPosSegNo + 1;
target = startSegNo + advance_xlog_file_num - 1;
for (nextSegNo = startSegNo; nextSegNo <= target; nextSegNo++) {
use_existent = true;
lf = XLogFileInit(nextSegNo, &use_existent, true);
if (lf >= 0) {
close(lf);
}
}
}
/*
* Create a new XLOG file segment by copying a pre-existing one.
*
* log, seg: identify segment to be created.
*
* srcTLI, srclog, srcseg: identify segment to be copied (could be from
* a different timeline)
*
* Currently this is only used during recovery, and so there are no locking
* considerations. But we should be just as tense as XLogFileInit to avoid
* emplacing a bogus file.
*/
static void XLogFileCopy(char *path, char *tmppath)
{
char buffer[XLOG_BLCKSZ];
int srcfd;
int fd;
uint32 nbytes;
errno_t rc = EOK;
Assert(path);
Assert(tmppath);
/*
* Open the source file
*/
srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
if (srcfd < 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path)));
}
/*
* Copy into a temp file name.
*/
unlink(tmppath);
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
close(srcfd);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath)));
}
/*
* Do the data copying.
*/
for (nbytes = 0; nbytes < (uint32)XLogSegSize; nbytes += sizeof(buffer)) {
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_READ);
if ((int)read(srcfd, buffer, sizeof(buffer)) != (int)sizeof(buffer)) {
close(srcfd);
close(fd);
if (errno != 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", path)));
} else {
ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("not enough data in file \"%s\"", path)));
}
}
pgstat_report_waitevent(WAIT_EVENT_END);
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_WRITE);
if ((int)write(fd, buffer, sizeof(buffer)) != (int)sizeof(buffer)) {
int save_errno = errno;
close(srcfd);
close(fd);
/*
* If we fail to make the file, delete it to release disk space
*/
unlink(tmppath);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", tmppath)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
rc = memset_s(buffer, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(rc, "\0", "\0");
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_SYNC);
if (pg_fsync(fd) != 0) {
close(srcfd);
close(fd);
ereport(data_sync_elevel(ERROR),
(errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", tmppath)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(fd)) {
close(srcfd);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tmppath)));
}
close(srcfd);
}
static void XLogFileTruncateInner(char *path, int fd, uint32 targetPageOff, char *buffer, int bufLen)
{
uint32 nbytes;
errno_t rc = EOK;
uint32 startBytes = targetPageOff;
if (lseek(fd, (off_t)targetPageOff, SEEK_SET) < 0) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("lseek2:could not seek in log file %s to offset %u: %m", path, targetPageOff)));
}
for (nbytes = startBytes; nbytes < (uint32)XLogSegSize; nbytes += bufLen) {
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_WRITE);
if ((int)write(fd, buffer, bufLen) != bufLen) {
close(fd);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write to file \"%s\": %m", path)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
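/*
 * The first write puts out the partially preserved page; zero the buffer
 * afterwards so every subsequent write fills the rest of the segment with
 * zeroes.
 */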
if (nbytes == startBytes) {
rc = memset_s(buffer, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(rc, "\0", "\0");
}
}
pgstat_report_waitevent(WAIT_EVENT_WAL_COPY_SYNC);
if (pg_fsync(fd) != 0) {
close(fd);
ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", path)));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
static void XLogFileTruncate(char *path, XLogRecPtr RecPtr)
{
char buffer[XLOG_BLCKSZ];
int fd;
XLogRecPtr targetPagePtr;
uint32 targetPageOff;
targetPagePtr = RecPtr - RecPtr % XLOG_BLCKSZ;
targetPageOff = (targetPagePtr % XLogSegSize);
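/*
 * Illustrative arithmetic (assuming XLOG_BLCKSZ = 8192 and 16 MB segments):
 * RecPtr = 0x1003456 gives targetPagePtr = 0x1002000 (page start) and
 * targetPageOff = 0x2000, so we rewrite from the start of the page that
 * contains RecPtr through the end of the segment.
 */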
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0)
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path)));
if (lseek(fd, (off_t)targetPageOff, SEEK_SET) < 0) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %m", path, targetPageOff)));
}
if ((int)read(fd, buffer, sizeof(buffer)) != (int)sizeof(buffer)) {
close(fd);
if (errno != 0)
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", path)));
else
ereport(ERROR, (errcode(ERRCODE_IO_ERROR), errmsg("not enough data in file \"%s\"", path)));
}
XLogFileCutPage(buffer, XLOG_BLCKSZ, RecPtr % XLogSegSize);
XLogFileTruncateInner(path, fd, targetPageOff, buffer, (int)(sizeof(buffer)));
close(fd);
}
/*
* Install a new XLOG segment file as a current or future log segment.
*
* This is used both to install a newly-created segment (which has a temp
* filename while it's being created) and to recycle an old segment.
*
* *segno: identify segment to install as (or first possible target).
* When find_free is TRUE, this is modified on return to indicate the
* actual installation location or last segment searched.
*
* tmppath: initial name of file to install. It will be renamed into place.
*
* find_free: if TRUE, install the new segment at the first empty segno
* number at or after the passed numbers. If FALSE, install the new segment
* exactly where specified, deleting any existing segment file there.
*
* *max_advance: maximum number of segno slots to advance past the starting
* point. Fail if no free slot is found in this range. On return, reduced
* by the number of slots skipped over. (Irrelevant, and may be NULL,
* when find_free is FALSE.)
*
* use_lock: if TRUE, acquire ControlFileLock while moving file into
* place. This should be TRUE except during bootstrap log creation. The
* caller must *not* hold the lock at call.
*
* Returns TRUE if the file was installed successfully. FALSE indicates that
* max_advance limit was exceeded, or an error occurred while renaming the
* file into place.
*/
static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance,
bool use_lock, const char *xlog_dir)
{
char path[MAXPGPATH];
struct stat stat_buf;
errno_t errorno = EOK;
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((*segno) / XLogSegmentsPerXLogId), (uint32)((*segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
/*
* We want to be sure that only one process does this at a time.
*/
if (use_lock) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
}
if (!find_free) {
/* Force installation: get rid of any pre-existing segment file */
unlink(path);
} else {
/* Find a free slot to put it in */
while (stat(path, &stat_buf) == 0) {
if (*max_advance <= 0) {
/* Failed to find a free slot within specified range */
if (use_lock) {
LWLockRelease(ControlFileLock);
}
return false;
}
(*segno)++;
(*max_advance)--;
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir,
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((*segno) / XLogSegmentsPerXLogId),
(uint32)((*segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
}
}
if (durable_rename(tmppath, path, LOG) != 0) {
if (use_lock) {
LWLockRelease(ControlFileLock);
}
/* durable_rename already emitted log message */
return false;
}
if (use_lock) {
LWLockRelease(ControlFileLock);
}
return true;
}
/*
* Open a pre-existing logfile segment for writing.
*/
static int XLogFileOpenInternal(XLogSegNo segno, const char *xlog_dir)
{
char path[MAXPGPATH];
int fd;
errno_t errorno = EOK;
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
fd = BasicOpenFile(path, O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not open xlog file \"%s\" (log segment %s): %m", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno))));
}
return fd;
}
/*
* Copy one page, starting at the page (pageptr) containing EndOfLog, from the old primary to the new primary.
*/
void SSXLOGCopyFromOldPrimary(XLogReaderState *state, XLogRecPtr pageptr)
{
ssize_t actualBytes = write(t_thrd.xlog_cxt.openLogFile, state->readBuf, XLOG_BLCKSZ);
if (actualBytes != XLOG_BLCKSZ) {
if (errno == 0) {
errno = ENOSPC;
}
uint32 shiftSize = 32;
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write xlog at start:%X/%X, length %d: %m",
static_cast<uint32>(pageptr >> shiftSize),
static_cast<uint32>(pageptr), XLOG_BLCKSZ)));
}
}
int XLogFileOpen(XLogSegNo segno)
{
return XLogFileOpenInternal(segno, SS_XLOGDIR);
}
/*
* Open a logfile segment for reading (during recovery).
*
* If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
* Otherwise, it's assumed to be already available in pg_xlog.
*/
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notfoundOk)
{
char xlogfname[MAXFNAMELEN];
char activitymsg[MAXFNAMELEN + 16];
char path[MAXPGPATH];
int fd;
errno_t errorno = EOK;
errorno = snprintf_s(xlogfname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", tli,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
switch (source) {
case XLOG_FROM_ARCHIVE:
/* Report recovery progress in PS display */
errorno = snprintf_s(activitymsg, sizeof(activitymsg), sizeof(activitymsg) - 1, "waiting for %s",
xlogfname);
securec_check_ss(errorno, "", "");
set_ps_display(activitymsg, false);
t_thrd.xlog_cxt.restoredFromArchive = RestoreArchivedFile(path, xlogfname, "RECOVERYXLOG", XLogSegSize);
if (!t_thrd.xlog_cxt.restoredFromArchive) {
return -1;
}
break;
case XLOG_FROM_PG_XLOG:
case XLOG_FROM_STREAM:
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, tli,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
t_thrd.xlog_cxt.restoredFromArchive = false;
break;
default:
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("invalid XLogFileRead source %d", source)));
break;
}
/*
* If the segment was fetched from archival storage, replace the existing
* xlog segment (if any) with the archival version.
*/
if (source == XLOG_FROM_ARCHIVE) {
KeepFileRestoredFromArchive((const char *)path, (const char *)xlogfname);
// Set path to point at the new file in pg_xlog.
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, xlogfname);
securec_check_ss(errorno, "", "");
}
fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
if (fd >= 0) {
/* Success! */
t_thrd.xlog_cxt.curFileTLI = tli;
/* Report recovery progress in PS display */
errorno = snprintf_s(activitymsg, sizeof(activitymsg), sizeof(activitymsg) - 1, "recovering %s", xlogfname);
securec_check_ss(errorno, "", "");
set_ps_display(activitymsg, false);
/* Track source of data in assorted state variables */
t_thrd.xlog_cxt.readSource = source;
t_thrd.xlog_cxt.XLogReceiptSource = source;
/* In FROM_STREAM case, caller tracks receipt time, not me */
if (source != XLOG_FROM_STREAM) {
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
}
return fd;
}
if (!FILE_POSSIBLY_DELETED(errno) || !notfoundOk) { /* unexpected failure? */
ereport(PANIC, (errcode_for_file_access(), errmsg("could not open file \"%s\" (log segment %s): %m", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno))));
}
return -1;
}
/*
* Open a logfile segment for reading (during recovery).
* This version searches for the segment with any TLI listed in expectedTLIs.
*/
int XLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources)
{
char path[MAXPGPATH];
ListCell *cell = NULL;
int fd = -1;
errno_t errorno = EOK;
/*
* Loop looking for a suitable timeline ID: we might need to read any of
* the timelines listed in expectedTLIs.
*
* We expect curFileTLI on entry to be the TLI of the preceding file in
* sequence, or 0 if there was no predecessor. We do not allow curFileTLI
* to go backwards; this prevents us from picking up the wrong file when a
* parent timeline extends to higher segment numbers than the child we
* want to read.
*/
foreach (cell, t_thrd.xlog_cxt.expectedTLIs) {
TimeLineID tli = (TimeLineID)lfirst_int(cell);
if (tli < t_thrd.xlog_cxt.curFileTLI) {
break; /* don't bother looking at too-old TLIs */
}
if (sources & XLOG_FROM_ARCHIVE) {
fd = XLogFileRead(segno, emode, tli, XLOG_FROM_ARCHIVE, true);
if (fd != -1) {
ereport(DEBUG1, (errmsg("got WAL segment from archive")));
return fd;
}
}
if (sources & XLOG_FROM_PG_XLOG) {
fd = XLogFileRead(segno, emode, tli, XLOG_FROM_PG_XLOG, true);
if (fd != -1) {
return fd;
}
}
}
/* Couldn't find it. For simplicity, complain about front timeline */
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR,
t_thrd.xlog_cxt.recoveryTargetTLI, (uint32)((segno) / XLogSegmentsPerXLogId),
(uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
errno = ENOENT;
ereport(emode, (errcode_for_file_access(), errmsg("could not open file \"%s\" (log segment %s): %m", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno))));
return -1;
}
/*
* Close the current logfile segment for writing.
*/
static void XLogFileClose(void)
{
Assert(t_thrd.xlog_cxt.openLogFile >= 0);
/*
* WAL segment files will not be re-read in normal operation, so we advise
* the OS to release any cached pages. But do not do so if WAL archiving
* or streaming is active, because archiver and walsender process could
* use the cache to read the WAL segment.
*/
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
if (!XLogIsNeeded()) {
(void)posix_fadvise(t_thrd.xlog_cxt.openLogFile, 0, 0, POSIX_FADV_DONTNEED);
}
#endif
if (close(t_thrd.xlog_cxt.openLogFile)) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not close log file %s: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.openLogSegNo))));
}
t_thrd.xlog_cxt.openLogFile = -1;
}
/*
* A file was restored from the archive under a temporary filename (path),
* and now we want to keep it. Rename it under the permanent filename in
* pg_xlog (xlogfname), replacing any existing file with the same name.
*/
static void KeepFileRestoredFromArchive(const char *path, const char *xlogfname)
{
char xlogfpath[MAXPGPATH];
bool reload = false;
struct stat statbuf;
errno_t errorno = EOK;
errorno = snprintf_s(xlogfpath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%s", xlogfname);
securec_check_ss(errorno, "", "");
if (stat(xlogfpath, &statbuf) == 0) {
char oldpath[MAXPGPATH] = {0};
errorno = memset_s(oldpath, MAXPGPATH, 0, MAXPGPATH);
securec_check(errorno, "", "");
#ifdef WIN32
/*
* On Windows, if another process (e.g a walsender process) holds
* the file open in FILE_SHARE_DELETE mode, unlink will succeed,
* but the file will still show up in directory listing until the
* last handle is closed, and we cannot rename the new file in its
* place until that. To avoid that problem, rename the old file to
* a temporary name first. Use a counter to create a unique
* filename, because the same file might be restored from the
* archive multiple times, and a walsender could still be holding
* onto an old deleted version of it.
*/
errorno = snprintf_s(oldpath, MAXPGPATH, MAXPGPATH - 1, "%s.deleted%u", xlogfpath,
t_thrd.xlog_cxt.deletedcounter++);
securec_check_ss(errorno, "", "");
if (rename(xlogfpath, oldpath) != 0) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m", xlogfpath, oldpath)));
}
#else
errorno = strncpy_s(oldpath, MAXPGPATH, xlogfpath, MAXPGPATH - 1);
securec_check(errorno, "", "");
#endif
if (unlink(oldpath) != 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", xlogfpath)));
}
reload = true;
}
durable_rename(path, xlogfpath, ERROR);
// Create .done file forcibly to prevent the restored segment from being archived again later.
XLogArchiveForceDone(xlogfname);
/*
* If the existing file was replaced, since walsenders might have it
* open, request them to reload a currently-open segment. This is only
* required for WAL segments, walsenders don't hold other files open,
* but there's no harm in doing this too often, and we don't know what
* kind of a file we're dealing with here.
*/
if (reload) {
WalSndRqstFileReload();
}
/* Signal walsender that new WAL has arrived */
if (AllowCascadeReplication()) {
WalSndWakeup();
}
}
/*
* Attempt to retrieve the specified file from off-line archival storage.
* If successful, fill "path" with its complete path (note that this will be
* a temp file name that doesn't follow the normal naming convention), and
* return TRUE.
* If not successful, fill "path" with the name of the normal on-line file
* (which may or may not actually exist, but we'll try to use it), and return FALSE.
* For fixed-size files, the caller may pass the expected size as an
* additional crosscheck on successful recovery. If the file size is not
* known, set expectedSize = 0.
*/
static bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize)
{
char xlogpath[MAXPGPATH];
char xlogRestoreCmd[MAXPGPATH];
char lastRestartPointFname[MAXPGPATH];
char *dp = NULL;
char *endp = NULL;
const char *sp = NULL;
int rc = 0;
bool signaled = false;
struct stat stat_buf;
XLogSegNo restartSegNo;
errno_t errorno = EOK;
/* In standby mode, restore_command might not be supplied */
if (t_thrd.xlog_cxt.recoveryRestoreCommand == NULL) {
goto not_available;
}
/*
* When doing archive recovery, we always prefer an archived log file even
* if a file of the same name exists in XLOGDIR. The reason is that the
* file in XLOGDIR could be an old, un-filled or partly-filled version
* that was copied and restored as part of backing up $PGDATA.
*
* We could try to optimize this slightly by checking the local copy
* lastchange timestamp against the archived copy, but we have no API to
* do this, nor can we guarantee that the lastchange timestamp was
* preserved correctly when we copied to archive. Our aim is robustness,
* so we elect not to do this.
*
* If we cannot obtain the log file from the archive, however, we will try
* to use the XLOGDIR file if it exists. This is so that we can make use
* of log segments that weren't yet transferred to the archive.
*
* Notice that we don't actually overwrite any files when we copy back
* from archive because the recoveryRestoreCommand may inadvertently
* restore inappropriate xlogs, or they may be corrupt, so we may wish to
* fallback to the segments remaining in current XLOGDIR later. The
* copy-from-archive filename is always the same, ensuring that we don't
* run out of disk space on long recoveries.
*/
errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%s", recovername);
securec_check_ss(errorno, "", "");
/*
* Make sure there is no existing file named recovername.
*/
if (stat(xlogpath, &stat_buf) != 0) {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", xlogpath)));
}
} else {
if (unlink(xlogpath) != 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", xlogpath)));
}
}
/*
* Calculate the archive file cutoff point for use during log shipping
* replication. All files earlier than this point can be deleted from the
* archive, though there is no requirement to do so.
*
* We initialise this with the filename of an InvalidXLogRecPtr, which
* will prevent the deletion of any WAL files from the archive because of
* the alphabetic sorting property of WAL filenames.
*
* Once we have successfully located the redo pointer of the checkpoint
* from which we start recovery we never request a file prior to the redo
* pointer of the last restartpoint. When redo begins we know that we have
* successfully located it, so there is no need for additional status
* flags to signify the point when we can begin deleting WAL files from
* the archive.
*/
if (t_thrd.xlog_cxt.InRedo) {
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, restartSegNo);
errorno = snprintf_s(lastRestartPointFname, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X",
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID,
(uint32)((restartSegNo) / XLogSegmentsPerXLogId),
(uint32)((restartSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
/* we shouldn't need anything earlier than last restart point */
Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
} else {
errorno = snprintf_s(lastRestartPointFname, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", 0,
(uint32)((0L) / XLogSegmentsPerXLogId), (uint32)((0L) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
}
/*
* construct the command to be executed
*/
dp = xlogRestoreCmd;
endp = xlogRestoreCmd + MAXPGPATH - 1;
*endp = '\0';
for (sp = t_thrd.xlog_cxt.recoveryRestoreCommand; *sp; sp++) {
if (*sp == '%') {
switch (sp[1]) {
case 'p':
/* %p: relative path of target file */
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRestoreCmd), xlogpath, strlen(xlogpath));
securec_check(errorno, "", "");
make_native_path(dp);
dp += strlen(dp);
break;
case 'f':
/* %f: filename of desired file */
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRestoreCmd), xlogfname, strlen(xlogfname));
securec_check(errorno, "", "");
dp += strlen(dp);
break;
case 'r':
/* %r: filename of last restartpoint */
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRestoreCmd), lastRestartPointFname,
strlen(lastRestartPointFname));
securec_check(errorno, "", "");
dp += strlen(dp);
break;
case '%':
/* convert %% to a single % */
sp++;
if (dp < endp) {
*dp++ = *sp;
}
break;
default:
/* otherwise treat the % as not special */
if (dp < endp) {
*dp++ = *sp;
}
break;
}
} else {
if (dp < endp) {
*dp++ = *sp;
}
}
}
*dp = '\0';
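/*
 * Illustrative result (assuming XLOGDIR is pg_xlog): with
 * restore_command = 'cp /arch/%f "%p"' and
 * xlogfname = "000000010000000000000003", the constructed command is
 *     cp /arch/000000010000000000000003 "pg_xlog/RECOVERYXLOG"
 */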
ereport(DEBUG3, (errmsg_internal("executing restore command \"%s\"", xlogRestoreCmd)));
// Check signals before restore command and reset afterwards.
PreRestoreCommand();
// Copy xlog from archival storage to XLOGDIR
rc = gs_popen_security(xlogRestoreCmd);
PostRestoreCommand();
if (rc == 0) {
// command apparently succeeded, but let's make sure the file is
// really there now and has the correct size.
if (stat(xlogpath, &stat_buf) == 0) {
if (expectedSize > 0 && stat_buf.st_size != expectedSize) {
/*
* If we find a partial file in standby mode, we assume it's
* because it's just being copied to the archive, and keep
* trying.
*
* Otherwise treat a wrong-sized file as FATAL to ensure the
* DBA would notice it, but is that too strong? We could try
* to plow ahead with a local copy of the file ... but the
* problem is that there probably isn't one, and we'd
* incorrectly conclude we've reached the end of WAL and we're
* done recovering ...
*/
int elevel = (t_thrd.xlog_cxt.StandbyMode && stat_buf.st_size < expectedSize) ? DEBUG1 : FATAL;
ereport(elevel, (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu", xlogfname,
(unsigned long)stat_buf.st_size, (unsigned long)expectedSize)));
return false;
} else {
ereport(LOG, (errmsg("restored log file \"%s\" from archive", xlogfname)));
errorno = strcpy_s(path, MAXPGPATH, xlogpath);
securec_check(errorno, "", "");
return true;
}
} else if (errno != ENOENT) { /* stat failed */
ereport(FATAL, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", xlogpath)));
}
}
/*
* Remember, we rollforward UNTIL the restore fails so failure here is
* just part of the process... that makes it difficult to determine
* whether the restore failed because there isn't an archive to restore,
* or because the administrator has specified the restore program
* incorrectly. We have to assume the former.
*
* However, if the failure was due to any sort of signal, it's best to
* punt and abort recovery. (If we "return false" here, upper levels will
* assume that recovery is complete and start up the database!) It's
* essential to abort on child SIGINT and SIGQUIT, because per spec
* system() ignores SIGINT and SIGQUIT while waiting; if we see one of
* those it's a good bet we should have gotten it too.
*
* On SIGTERM, assume we have received a fast shutdown request, and exit
* cleanly. It's pure chance whether we receive the SIGTERM first, or the
* child process. If we receive it first, the signal handler will call
* proc_exit, otherwise we do it here. If we or the child process received
* SIGTERM for any other reason than a fast shutdown request, postmaster
* will perform an immediate shutdown when it sees us exiting
* unexpectedly.
*
* Per the Single Unix Spec, shells report exit status > 128 when a called
* command died on a signal. Also, 126 and 127 are used to report
* problems such as an unfindable command; treat those as fatal errors
* too.
*/
if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM) {
proc_exit(1);
}
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
ereport(signaled ? FATAL : DEBUG2,
(errmsg("could not restore file \"%s\" from archive: return code %d", xlogfname, rc)));
not_available:
/*
* if an archived file is not available, there might still be a version of
* this file in XLOGDIR, so return that as the filename to open.
*
* In many recovery scenarios we expect this to fail also, but if so that
* just means we've reached the end of WAL.
*/
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%s", xlogfname);
securec_check_ss(errorno, "", "");
return false;
}
/*
* Attempt to execute an external shell command during recovery.
*
* 'command' is the shell command to be executed, 'commandName' is a
* human-readable name describing the command emitted in the logs. If
* 'failOnSignal' is true and the command is killed by a signal, a FATAL
* error is thrown. Otherwise a WARNING is emitted.
*
* This is currently used for recovery_end_command and archive_cleanup_command.
*/
static void ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
{
char xlogRecoveryCmd[MAXPGPATH];
char lastRestartPointFname[MAXPGPATH];
char *dp = NULL;
char *endp = NULL;
const char *sp = NULL;
int rc = 0;
bool signaled = false;
XLogSegNo restartSegNo;
errno_t errorno = EOK;
Assert(command && commandName);
/*
* Calculate the archive file cutoff point for use during log shipping
* replication. All files earlier than this point can be deleted from the
* archive, though there is no requirement to do so.
*/
LWLockAcquire(ControlFileLock, LW_SHARED);
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, restartSegNo);
errorno = snprintf_s(lastRestartPointFname, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X",
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID,
(uint32)((restartSegNo) / XLogSegmentsPerXLogId),
(uint32)((restartSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
LWLockRelease(ControlFileLock);
// construct the command to be executed
dp = xlogRecoveryCmd;
endp = xlogRecoveryCmd + MAXPGPATH - 1;
*endp = '\0';
for (sp = command; *sp; sp++) {
if (*sp == '%') {
switch (sp[1]) {
case 'r':
/* %r: filename of last restartpoint */
sp++;
errorno = strncpy_s(dp, MAXPGPATH - (dp - xlogRecoveryCmd), lastRestartPointFname,
strlen(lastRestartPointFname));
securec_check(errorno, "", "");
dp += strlen(dp);
break;
case '%':
/* convert %% to a single % */
sp++;
if (dp < endp) {
*dp++ = *sp;
}
break;
default:
/* otherwise treat the % as not special */
if (dp < endp) {
*dp++ = *sp;
}
break;
}
} else {
if (dp < endp) {
*dp++ = *sp;
}
}
}
*dp = '\0';
ereport(DEBUG3, (errmsg_internal("executing %s \"%s\"", commandName, command)));
// execute the constructed command
rc = gs_popen_security(xlogRecoveryCmd);
if (rc != 0) {
/*
* If the failure was due to any sort of signal, it's best to punt and
* abort recovery. See also detailed comments on signals in RestoreArchivedFile().
*/
signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
ereport((signaled && failOnSignal) ? FATAL : WARNING,
(errmsg("%s \"%s\": return code %d", commandName, command, rc)));
}
}
/*
* Preallocate log files beyond the specified log endpoint.
*
* XXX this is currently extremely conservative, since it forces only one
* future log segment to exist, and even that only if we are 75% done with
* the current one. This is only appropriate for very low-WAL-volume systems.
* High-volume systems will be OK once they've built up a sufficient set of
* recycled log segments, but the startup transient is likely to include
* a lot of segment creations by foreground processes, which is not so good.
*/
static void PreallocXlogFiles(XLogRecPtr endptr)
{
XLogSegNo _logSegNo;
int lf;
bool use_existent = false;
gstrace_entry(GS_TRC_ID_PreallocXlogFiles);
XLByteToPrevSeg(endptr, _logSegNo);
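/*
 * Illustrative check (assuming 16 MB segments): with endptr 13 MB into the
 * current segment, 13 MB >= 12 MB (75% of 16 MB), so the next segment is
 * pre-created now instead of by a foreground process later.
 */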
if ((endptr - 1) % XLogSegSize >= (uint32)(0.75 * XLogSegSize)) {
_logSegNo++;
use_existent = true;
lf = XLogFileInit(_logSegNo, &use_existent, true);
close(lf);
if (!use_existent) {
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added++;
}
}
gstrace_exit(GS_TRC_ID_PreallocXlogFiles);
}
/*
* Throws an error if the given log segment has already been removed or
* recycled. The caller should only pass a segment that it knows to have
* existed while the server has been running, as this function always
* succeeds if no WAL segments have been removed since startup.
* 'tli' is only used in the error message.
*/
void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogSegNo lastRemovedSegNo;
errno_t errorno = EOK;
SpinLockAcquire(&xlogctl->info_lck);
lastRemovedSegNo = xlogctl->lastRemovedSegNo;
SpinLockRelease(&xlogctl->info_lck);
if (segno <= lastRemovedSegNo) {
char filename[MAXFNAMELEN];
errorno = snprintf_s(filename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", tli,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
WalSegmemtRemovedhappened = true;
ereport(ERROR,
(errcode_for_file_access(), errmsg("requested WAL segment %s has already been removed", filename)));
}
}
/*
* Return the last WAL segment removed, or 0 if no segment has been removed
* since startup.
*
* NB: the result can be out of date arbitrarily fast, the caller has to deal
* with that.
*/
XLogSegNo XLogGetLastRemovedSegno(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogSegNo lastRemovedSegNo;
SpinLockAcquire(&xlogctl->info_lck);
lastRemovedSegNo = xlogctl->lastRemovedSegNo;
SpinLockRelease(&xlogctl->info_lck);
return lastRemovedSegNo;
}
static void remove_xlogtemp_files(void)
{
DIR *dir = NULL;
char fullpath[MAXPGPATH] = {0};
struct dirent *de = NULL;
struct stat st;
errno_t errorno = EOK;
if ((dir = opendir(SS_XLOGDIR)) != NULL) {
while ((de = readdir(dir)) != NULL) {
/* Skip special stuff */
if (strncmp(de->d_name, ".", 1) == 0 || strncmp(de->d_name, "..", 2) == 0) {
continue;
}
if (!strstr(de->d_name, "xlogtemp")) {
continue;
}
errorno = snprintf_s(fullpath, sizeof(fullpath), sizeof(fullpath) - 1, "%s/%s", SS_XLOGDIR, de->d_name);
securec_check_ss(errorno, "\0", "\0");
if (lstat(fullpath, &st) != 0) {
if (!FILE_POSSIBLY_DELETED(errno)) {
ereport(WARNING, (errmsg("could not stat file or directory : %s", fullpath)));
}
/* If the file went away while scanning, it's not an error. */
continue;
}
if (S_ISDIR(st.st_mode)) {
continue;
}
/* only handle the normal file */
if (S_ISREG(st.st_mode)) {
(void)unlink(fullpath);
}
}
(void)closedir(dir);
}
}
/*
* Update the last removed segno pointer in shared memory, to reflect
* that the given XLOG file has been removed.
*/
static void UpdateLastRemovedPtr(const char *filename)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
uint32 tli;
uint32 log_temp = 0;
uint32 seg_temp = 0;
XLogSegNo segno;
if (sscanf_s(filename, "%08X%08X%08X", &tli, &log_temp, &seg_temp) != 3) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("could not parse xlog segment file \"%s\"", filename)));
}
segno = (uint64)log_temp * XLogSegmentsPerXLogId + seg_temp;
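/*
 * Example (illustrative): "000000010000000200000004" parses to tli = 1,
 * log = 2, seg = 4, giving segno = 2 * XLogSegmentsPerXLogId + 4.
 */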
SpinLockAcquire(&xlogctl->info_lck);
if (segno > xlogctl->lastRemovedSegNo) {
xlogctl->lastRemovedSegNo = segno;
}
SpinLockRelease(&xlogctl->info_lck);
}
/*
* Recycle or remove all log files older or equal to passed segno#
*
* endptr is current (or recent) end of xlog; this is used to determine
* whether we want to recycle rather than delete no-longer-wanted log files.
*/
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr)
{
if (!SSXLogInsertAllowed())
return;
DIR *xldir = NULL;
struct dirent *xlde = NULL;
char lastoff[MAXFNAMELEN];
errno_t errorno = EOK;
xldir = AllocateDir(SS_XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open transaction log directory \"%s\": %m", SS_XLOGDIR)));
}
errorno = snprintf_s(lastoff, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
/* segno is about to be removed, so (segno + 1) will be the oldest remaining segment */
g_instance.comm_cxt.predo_cxt.redoPf.oldest_segment = segno + 1;
ereport(LOG, (errmsg("attempting to remove WAL segments older than log file %s", lastoff)));
gstrace_entry(GS_TRC_ID_RemoveOldXlogFiles);
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
/* Ignore files that are not XLOG segments */
if (strlen(xlde->d_name) != 24 || strspn(xlde->d_name, "0123456789ABCDEF") != 24) {
continue;
}
/*
* We ignore the timeline part of the XLOG segment identifiers in
* deciding whether a segment is still needed. This ensures that we
* won't prematurely remove a segment from a parent timeline. We could
* probably be a little more proactive about removing segments of
* non-parent timelines, but that would be a whole lot more
* complicated.
*
* We use the alphanumeric sorting property of the filenames to decide
* which ones are earlier than the lastoff segment.
*/
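/*
 * Example (illustrative): with lastoff = "000000020000000000000010", the
 * file "000000010000000000000009" compares as "...0009" <= "...0010" once
 * the 8-character timeline prefix is skipped, so it is eligible for
 * removal despite the older timeline.
 */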
if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0) {
if (XLogArchiveCheckDone(xlde->d_name)) {
/* Update the last removed location in shared memory first */
UpdateLastRemovedPtr((const char *)xlde->d_name);
RemoveXlogFile(xlde->d_name, endptr);
}
}
}
FreeDir(xldir);
gstrace_data(1, GS_TRC_ID_RemoveOldXlogFiles, TRC_DATA_FMT_DFLT, lastoff, MAXFNAMELEN);
gstrace_exit(GS_TRC_ID_RemoveOldXlogFiles);
}
/*
* Recycle or remove a log file that's no longer needed.
*
* endptr is current (or recent) end of xlog; this is used to determine
* whether we want to recycle rather than delete no-longer-wanted log files.
*/
static void RemoveXlogFile(const char *segname, XLogRecPtr endptr)
{
char path[MAXPGPATH];
#ifdef WIN32
char newpath[MAXPGPATH];
#endif
struct stat statbuf;
XLogSegNo endLogSegNo;
int max_advance;
/*
* Initialize info about where to try to recycle to. We allow recycling
* segments up to XLOGfileslop segments beyond the current XLOG location.
*/
XLByteToPrevSeg(endptr, endLogSegNo);
max_advance = XLOGfileslop;
errno_t errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", SS_XLOGDIR, segname);
securec_check_ss(errorno, "", "");
/*
* Before deleting the file, see if it can be recycled as a future log
* segment. Only recycle normal files, pg_standby for example can create
* symbolic links pointing to a separate archive directory.
*/
if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
InstallXLogFileSegment(&endLogSegNo, (const char *)path, true, &max_advance, true, SS_XLOGDIR)) {
ereport(DEBUG2, (errmsg("recycled transaction log file \"%s\"", segname)));
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled++;
/* Needn't recheck that slot on future iterations */
if (max_advance > 0) {
endLogSegNo++;
max_advance--;
}
} else {
/* No need for any more future segments... */
int rc;
ereport(DEBUG2, (errmsg("removing transaction log file \"%s\"", segname)));
#ifdef WIN32
/*
* On Windows, if another process (e.g another backend) holds the file
* open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
* will still show up in directory listing until the last handle is
* closed. To avoid confusing the lingering deleted file for a live
* WAL file that needs to be archived, rename it before deleting it.
*
* If another process holds the file open without FILE_SHARE_DELETE
* flag, rename will fail. We'll try again at the next checkpoint.
*/
errorno = snprintf_s(newpath, MAXPGPATH, MAXPGPATH - 1, "%s.deleted", path);
securec_check_ss(errorno, "", "");
if (rename(path, newpath) != 0) {
ereport(LOG,
(errcode_for_file_access(), errmsg("could not rename old transaction log file \"%s\": %m", path)));
return;
}
rc = unlink(newpath);
#else
rc = unlink(path);
#endif
if (rc != 0) {
ereport(LOG,
(errcode_for_file_access(), errmsg("could not remove old transaction log file \"%s\": %m", path)));
return;
}
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed++;
}
XLogArchiveCleanup(segname);
}
/*
* Verify whether pg_xlog and pg_xlog/archive_status exist.
* If the latter does not exist, recreate it.
*
* It is not the goal of this function to verify the contents of these
* directories, but to help in cases where someone has performed a cluster
* copy for PITR purposes but omitted pg_xlog from the copy.
*
* We could also recreate pg_xlog if it doesn't exist, but a deliberate
* policy decision was made not to. It is fairly common for pg_xlog to be
* a symlink, and if that was the DBA's intent then automatically making a
* plain directory would result in degraded performance with no notice.
*/
static void ValidateXLOGDirectoryStructure(void)
{
char path[MAXPGPATH];
struct stat stat_buf;
errno_t errorno = EOK;
/* Check for pg_xlog; if it doesn't exist, error out */
if (stat(SS_XLOGDIR, &stat_buf) != 0 || !S_ISDIR(stat_buf.st_mode))
ereport(FATAL, (errmsg("required WAL directory \"%s\" does not exist", SS_XLOGDIR)));
/* Check for archive_status */
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/archive_status");
securec_check_ss(errorno, "", "");
if (stat(path, &stat_buf) == 0) {
/* Check for weird cases where it exists but isn't a directory */
if (!S_ISDIR(stat_buf.st_mode)) {
ereport(FATAL, (errmsg("required WAL directory \"%s\" does not exist", path)));
}
} else {
ereport(LOG, (errmsg("creating missing WAL directory \"%s\"", path)));
if (mkdir(path, S_IRWXU) < 0) {
ereport(FATAL, (errmsg("could not create missing directory \"%s\": %m", path)));
}
}
}
/*
* Remove previous backup history files. This also retries creation of
* .ready files for any backup history files for which XLogArchiveNotify
* failed earlier.
*/
static void CleanupBackupHistory(void)
{
DIR *xldir = NULL;
struct dirent *xlde = NULL;
char path[MAXPGPATH];
errno_t errorno = EOK;
xldir = AllocateDir(XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open transaction log directory \"%s\": %m", XLOGDIR)));
}
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
if (strlen(xlde->d_name) > 24 && strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"), ".backup") == 0) {
if (XLogArchiveCheckDone(xlde->d_name)) {
ereport(DEBUG2, (errmsg("removing transaction log backup history file \"%s\"", xlde->d_name)));
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%s", xlde->d_name);
securec_check_ss(errorno, "", "");
unlink(path);
XLogArchiveCleanup(xlde->d_name);
}
}
}
FreeDir(xldir);
}
/*
* Attempt to read an XLOG record.
*
* If RecPtr is not NULL, try to read a record at that position. Otherwise
* try to read a record just after the last one previously read.
*
* If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC or LOG). In standby mode, retries until a valid
* record is available.
*
* The record is copied into readRecordBuf, so that on successful return,
* the returned record pointer always points there.
*/
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, bool fetching_ckpt)
{
XLogRecord *record = NULL;
uint32 streamFailCount = 0;
XLogPageReadPrivate *readprivate = (XLogPageReadPrivate *)xlogreader->private_data;
/* Pass through parameters to XLogPageRead */
readprivate->fetching_ckpt = fetching_ckpt;
readprivate->emode = emode;
readprivate->randAccess = !XLByteEQ(RecPtr, InvalidXLogRecPtr);
/* This is the first try to read this page. */
t_thrd.xlog_cxt.failedSources = 0;
for (;;) {
char *errormsg = NULL;
record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr;
t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr;
g_instance.comm_cxt.predo_cxt.redoPf.read_ptr = t_thrd.xlog_cxt.ReadRecPtr;
if (record == NULL) {
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
/*
* We only end up here without a message when XLogPageRead() failed
* - in that case we already logged something.
* In StandbyMode that only happens if we have been triggered, so
* we shouldn't loop anymore in that case.
*/
if (errormsg != NULL) {
ereport(emode_for_corrupt_record(emode, XLByteEQ(RecPtr, InvalidXLogRecPtr) ? t_thrd.xlog_cxt.EndRecPtr
: RecPtr),
(errmsg_internal("%s", errormsg) /* already translated */));
}
} else if ((!timeLineInHistory(xlogreader->latestPageTLI, t_thrd.xlog_cxt.expectedTLIs)) &&
(!(g_instance.attr.attr_storage.IsRoachStandbyCluster && dummyStandbyMode))) {
        /* Check page TLI is one of the expected values. */
char fname[MAXFNAMELEN];
XLogSegNo targetSegNo;
uint32 offset;
errno_t errorno = EOK;
XLByteToSeg(xlogreader->latestPagePtr, targetSegNo);
offset = xlogreader->latestPagePtr % XLogSegSize;
errorno = snprintf_s(fname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", xlogreader->readPageTLI,
(uint32)((targetSegNo) / XLogSegmentsPerXLogId),
(uint32)((targetSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
ereport(emode_for_corrupt_record(emode,
XLByteEQ(RecPtr, InvalidXLogRecPtr) ? t_thrd.xlog_cxt.EndRecPtr : RecPtr),
(errmsg("unexpected timeline ID %u in log segment %s, offset %u", xlogreader->latestPageTLI, fname,
offset)));
record = NULL;
}
if (record != NULL) {
            /* Set up the latest valid record */
latestValidRecord = t_thrd.xlog_cxt.ReadRecPtr;
latestRecordCrc = record->xl_crc;
latestRecordLen = record->xl_tot_len;
/* Great, got a record */
return record;
} else {
/* No valid record available from this source */
if (streamFailCount < XLOG_STREAM_READREC_MAXTRY) {
streamFailCount++;
                /* if reading the xlog record failed, wait 100 ms and retry. */
if (u_sess->attr.attr_storage.HaModuleDebug) {
ereport(LOG,
(errmsg("reread xlog record for %uth times at %X/%X", streamFailCount,
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr)));
}
pg_usleep(XLOG_STREAM_READREC_INTERVAL);
ProcTxnWorkLoad(true);
} else {
t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;
if (u_sess->attr.attr_storage.HaModuleDebug) {
ereport(LOG,
(errmsg("xlog record failed failedsource %u at %X/%X", t_thrd.xlog_cxt.failedSources,
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr)));
}
}
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
/*
* If archive recovery was requested, but we were still doing
* crash recovery, switch to archive recovery and retry using the
* offline archive. We have now replayed all the valid WAL in
* pg_xlog, so we are presumably now consistent.
*
* We require that there's at least some valid WAL present in
         * pg_xlog, however (!fetching_ckpt). We could recover using the WAL
* from the archive, even if pg_xlog is completely empty, but we'd
* have no idea how far we'd have to replay to reach consistency.
* So err on the safe side and give up.
*/
if (!t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.xlog_cxt.ArchiveRecoveryRequested && !fetching_ckpt) {
ProcTxnWorkLoad(false);
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr newMinRecoveryPoint;
ereport(DEBUG1, (errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
t_thrd.xlog_cxt.InArchiveRecovery = true;
if (t_thrd.xlog_cxt.StandbyModeRequested) {
t_thrd.xlog_cxt.StandbyMode = true;
}
/* initialize minRecoveryPoint to this record */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
SpinLockAcquire(&xlogctl->info_lck);
newMinRecoveryPoint = xlogctl->lastReplayedEndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, newMinRecoveryPoint)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = newMinRecoveryPoint;
}
/* update local copy */
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
UpdateControlFile();
LWLockRelease(ControlFileLock);
CheckRecoveryConsistency();
ereport(LOG, (errmsg("update minrecovery point to %X/%X in archive recovery",
(uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32),
(uint32)(t_thrd.xlog_cxt.minRecoveryPoint))));
/*
             * Before we retry, reset failedSources so that we will check
             * all sources, including the archive, next.
*/
t_thrd.xlog_cxt.failedSources = 0;
continue;
}
ProcTxnWorkLoad(false);
/* In standby mode, loop back to retry. Otherwise, give up. */
if (t_thrd.xlog_cxt.StandbyMode && !dummyStandbyMode && !t_thrd.xlog_cxt.recoveryTriggered) {
continue;
} else {
return NULL;
}
}
}
}
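/*
 * Minimal usage sketch (arguments are illustrative): reading the checkpoint
 * record at a known location during crash recovery looks like
 *
 *     record = ReadRecord(xlogreader, checkPointLoc, PANIC, true);
 *
 * while sequential replay passes InvalidXLogRecPtr so that the record right
 * after the last one read is fetched:
 *
 *     record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
 */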
void UpdateMinrecoveryInAchive()
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr newMinRecoveryPoint;
/* initialize minRecoveryPoint to this record */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
SpinLockAcquire(&xlogctl->info_lck);
newMinRecoveryPoint = xlogctl->lastReplayedEndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, newMinRecoveryPoint)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = newMinRecoveryPoint;
}
/* update local copy */
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
g_instance.comm_cxt.predo_cxt.redoPf.min_recovery_point = t_thrd.xlog_cxt.minRecoveryPoint;
UpdateControlFile();
LWLockRelease(ControlFileLock);
MultiRedoUpdateMinRecovery(t_thrd.xlog_cxt.minRecoveryPoint);
ereport(LOG,
(errmsg("update minrecovery point to %X/%X in archive recovery",
(uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32), (uint32)(t_thrd.xlog_cxt.minRecoveryPoint))));
}
/*
* Try to read a timeline's history file.
*
* If successful, return the list of component TLIs (the given TLI followed by
* its ancestor TLIs). If we can't find the history file, assume that the
* timeline has no parents, and return a list of just the specified timeline
* ID.
*/
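/*
 * Illustrative contents of a history file for timeline 3 (only the leading
 * timeline ID on each line matters to this parser; the rest is ignored):
 *
 *     1   0/9A000098   no recovery target specified
 *     2   0/A7000028   no recovery target specified
 *
 * which this function would return as the list 3 -> 2 -> 1, newest first.
 */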
List *readTimeLineHistory(TimeLineID targetTLI)
{
List *result = NIL;
char path[MAXPGPATH];
char histfname[MAXFNAMELEN];
char fline[MAXPGPATH];
FILE *fd = NULL;
bool fromArchive = false;
errno_t errorno = EOK;
/* Timeline 1 does not have a history file, so no need to check */
if (targetTLI == 1) {
return list_make1_int((int)targetTLI);
}
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
errorno = snprintf_s(histfname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X.history", targetTLI);
securec_check_ss(errorno, "", "");
fromArchive = RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
} else {
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%08X.history", targetTLI);
securec_check_ss(errorno, "", "");
}
fd = AllocateFile(path, "r");
if (fd == NULL) {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path)));
}
/* Not there, so assume no parents */
return list_make1_int((int)targetTLI);
}
result = NIL;
/*
* Parse the file...
*/
while (fgets(fline, sizeof(fline), fd) != NULL) {
/* skip leading whitespace and check for # comment */
char *ptr = NULL;
char *endptr = NULL;
TimeLineID tli;
for (ptr = fline; *ptr; ptr++) {
if (!isspace((unsigned char)*ptr)) {
break;
}
}
if (*ptr == '\0' || *ptr == '#') {
continue;
}
/* expect a numeric timeline ID as first field of line */
tli = (TimeLineID)strtoul(ptr, &endptr, 0);
if (errno == EINVAL || errno == ERANGE) {
ereport(FATAL, (errmsg("recovery_target_timeline is not a valid number: \"%s\"", ptr)));
}
if (endptr == ptr) {
(void)FreeFile(fd);
fd = NULL;
ereport(FATAL,
(errmsg("syntax error in history file: %s", fline), errhint("Expected a numeric timeline ID.")));
}
if (result != NIL && tli <= (TimeLineID)linitial_int(result)) {
(void)FreeFile(fd);
fd = NULL;
ereport(FATAL, (errmsg("invalid data in history file: %s", fline),
errhint("Timeline IDs must be in increasing sequence.")));
}
/* Build list with newest item first */
result = lcons_int((int)tli, result);
/* we ignore the remainder of each line */
}
(void)FreeFile(fd);
fd = NULL;
if (result != NULL && targetTLI <= (TimeLineID)linitial_int(result)) {
ereport(FATAL, (errmsg("invalid data in history file \"%s\"", path),
errhint("Timeline IDs must be less than child timeline's ID.")));
}
result = lcons_int((int)targetTLI, result);
ereport(DEBUG3, (errmsg_internal("history of timeline %u is %s", targetTLI, nodeToString(result))));
    /* If the history file was fetched from archive, save it in pg_xlog for future reference. */
if (fromArchive) {
KeepFileRestoredFromArchive((const char *)path, (const char *)histfname);
}
return result;
}
/*
* Probe whether a timeline history file exists for the given timeline ID
*/
static bool existsTimeLineHistory(TimeLineID probeTLI)
{
char path[MAXPGPATH];
char histfname[MAXFNAMELEN];
FILE *fd = NULL;
errno_t errorno = EOK;
/* Timeline 1 does not have a history file, so no need to check */
if (probeTLI == 1) {
return false;
}
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
errorno = snprintf_s(histfname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X.history", probeTLI);
securec_check_ss(errorno, "", "");
RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
} else {
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%08X.history", probeTLI);
securec_check_ss(errorno, "", "");
}
fd = AllocateFile(path, "r");
if (fd != NULL) {
(void)FreeFile(fd);
fd = NULL;
return true;
} else {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path)));
}
return false;
}
}
/*
* Scan for new timelines that might have appeared in the archive since we
* started recovery.
*
* If there are any, the function changes recovery target TLI to the latest
* one and returns 'true'.
*/
bool rescanLatestTimeLine(void)
{
TimeLineID newtarget;
newtarget = findNewestTimeLine(t_thrd.xlog_cxt.recoveryTargetTLI);
if (newtarget != t_thrd.xlog_cxt.recoveryTargetTLI) {
/*
* Determine the list of expected TLIs for the new TLI
*/
List *newExpectedTLIs = readTimeLineHistory(newtarget);
/*
* If the current timeline is not part of the history of the new
* timeline, we cannot proceed to it.
*
* XXX This isn't foolproof: The new timeline might have forked from
* the current one, but before the current recovery location. In that
* case we will still switch to the new timeline and proceed replaying
* from it even though the history doesn't match what we already
* replayed. That's not good. We will likely notice at the next online
* checkpoint, as the TLI won't match what we expected, but it's not
* guaranteed. The admin needs to make sure that doesn't happen.
*/
if (!list_member_int(newExpectedTLIs, (int)t_thrd.xlog_cxt.recoveryTargetTLI)) {
ereport(LOG, (errmsg("new timeline %u is not a child of database system timeline %u", newtarget,
t_thrd.xlog_cxt.ThisTimeLineID)));
} else {
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
/* Switch target */
t_thrd.xlog_cxt.recoveryTargetTLI = newtarget;
list_free(t_thrd.xlog_cxt.expectedTLIs);
t_thrd.xlog_cxt.expectedTLIs = newExpectedTLIs;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RecoveryTargetTLI = t_thrd.xlog_cxt.recoveryTargetTLI;
SpinLockRelease(&xlogctl->info_lck);
ereport(LOG, (errmsg("new target timeline is %u", t_thrd.xlog_cxt.recoveryTargetTLI)));
return true;
}
}
return false;
}
/*
* Find the newest existing timeline, assuming that startTLI exists.
*
* Note: while this is somewhat heuristic, it does positively guarantee
* that (result + 1) is not a known timeline, and therefore it should
* be safe to assign that ID to a new timeline.
*/
static TimeLineID findNewestTimeLine(TimeLineID startTLI)
{
TimeLineID newestTLI;
TimeLineID probeTLI;
/*
* The algorithm is just to probe for the existence of timeline history
* files. XXX is it useful to allow gaps in the sequence?
*/
newestTLI = startTLI;
for (probeTLI = startTLI + 1;; probeTLI++) {
if (existsTimeLineHistory(probeTLI)) {
newestTLI = probeTLI; /* probeTLI exists */
} else {
/* doesn't exist, assume we're done */
break;
}
}
return newestTLI;
}
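/*
 * For example, if 00000002.history and 00000003.history exist but
 * 00000004.history does not, probing from startTLI = 2 stops at the first
 * gap and returns 3, leaving timeline 4 safe to assign to a new timeline.
 */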
static off_t GetControlPageByInstanceId()
{
off_t seekpos = 0;
if (ENABLE_DSS) {
seekpos = (off_t)BLCKSZ * g_instance.attr.attr_storage.dms_attr.instance_id;
}
return seekpos;
}
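/*
 * Worked example, assuming BLCKSZ is 8192: in DSS mode instance 0 reads its
 * control page at offset 0, instance 1 at offset 8192, instance 2 at offset
 * 16384, and so on; without DSS the offset is always 0.
 */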
void SSWriteInstanceControlFile(int fd, const char* buffer, int id, off_t wsize)
{
errno = 0;
off_t seekpos = 0;
off_t actual_size = 0;
    /* find the write position of this instance according to its node id */
if (id != -1) {
seekpos = (off_t)BLCKSZ * id;
off_t fileSize = dss_get_file_size(XLOG_CONTROL_FILE);
ereport(LOG, (errmsg_internal("start fileSize = %ld", fileSize)));
if ((off_t)seekpos > fileSize) {
(void)dss_fallocate_file(fd, 0, fileSize, seekpos - fileSize);
}
actual_size = lseek(fd, seekpos, SEEK_SET);
if (actual_size == -1) {
            ereport(PANIC, (errcode_for_file_access(),
                            errmsg("could not seek to write position %ld in dss: %m", seekpos)));
}
}
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_WRITE);
if (write(fd, buffer, wsize) != wsize) {
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
pgstat_report_waitevent((uint32)WAIT_EVENT_CONTROL_FILE_SYNC);
if (pg_fsync(fd) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
/*
* I/O routines for pg_control
*
* *ControlFile is a buffer in shared memory that holds an image of the
* contents of pg_control. WriteControlFile() initializes pg_control
* given a preloaded buffer, ReadControlFile() loads the buffer from
* the pg_control file (during postmaster or standalone-backend startup),
* and UpdateControlFile() rewrites pg_control after we modify xlog state.
*
* For simplicity, WriteControlFile() initializes the fields of pg_control
* that are related to checking backend/database compatibility, and
* ReadControlFile() verifies they are correct. We could split out the
* I/O and compatibility-check functions, but there seems no need currently.
*/
STATIC void WriteControlFile(void)
{
int fd = -1;
char buffer[PG_CONTROL_SIZE] __attribute__((__aligned__(ALIGNOF_BUFFER))); /* need to be aligned */
errno_t errorno = EOK;
/*
* Initialize version and compatibility-check fields
*/
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version = PG_CONTROL_VERSION;
t_thrd.shemem_ptr_cxt.ControlFile->catalog_version_no = CATALOG_VERSION_NO;
t_thrd.shemem_ptr_cxt.ControlFile->maxAlign = MAXIMUM_ALIGNOF;
t_thrd.shemem_ptr_cxt.ControlFile->floatFormat = FLOATFORMAT_VALUE;
t_thrd.shemem_ptr_cxt.ControlFile->blcksz = BLCKSZ;
t_thrd.shemem_ptr_cxt.ControlFile->relseg_size = RELSEG_SIZE;
t_thrd.shemem_ptr_cxt.ControlFile->xlog_blcksz = XLOG_BLCKSZ;
t_thrd.shemem_ptr_cxt.ControlFile->xlog_seg_size = XLogSegSize;
t_thrd.shemem_ptr_cxt.ControlFile->nameDataLen = NAMEDATALEN;
t_thrd.shemem_ptr_cxt.ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
t_thrd.shemem_ptr_cxt.ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
#ifdef HAVE_INT64_TIMESTAMP
t_thrd.shemem_ptr_cxt.ControlFile->enableIntTimes = true;
#else
t_thrd.shemem_ptr_cxt.ControlFile->enableIntTimes = false;
#endif
t_thrd.shemem_ptr_cxt.ControlFile->float4ByVal = FLOAT4PASSBYVAL;
t_thrd.shemem_ptr_cxt.ControlFile->float8ByVal = FLOAT8PASSBYVAL;
/* Timeline */
t_thrd.shemem_ptr_cxt.ControlFile->timeline = 0;
/* Contents are protected with a CRC */
INIT_CRC32C(t_thrd.shemem_ptr_cxt.ControlFile->crc);
COMP_CRC32C(t_thrd.shemem_ptr_cxt.ControlFile->crc, (char *)t_thrd.shemem_ptr_cxt.ControlFile,
offsetof(ControlFileData, crc));
FIN_CRC32C(t_thrd.shemem_ptr_cxt.ControlFile->crc);
/*
* We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
* excess over sizeof(ControlFileData). This reduces the odds of
* premature-EOF errors when reading pg_control. We'll still fail when we
* check the contents of the file, but hopefully with a more specific
* error than "couldn't read pg_control".
*/
if (sizeof(ControlFileData) > PG_CONTROL_SIZE) {
ereport(PANIC, (errmsg("sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one")));
}
errorno = memset_s(buffer, PG_CONTROL_SIZE, 0, PG_CONTROL_SIZE);
securec_check(errorno, "", "");
errorno = memcpy_s(buffer, PG_CONTROL_SIZE, t_thrd.shemem_ptr_cxt.ControlFile, sizeof(ControlFileData));
securec_check(errorno, "", "");
#ifdef USE_ASSERT_CHECKING
#define MAX_SIZE 1024
char current_absolute_path[MAX_SIZE] = {0};
getcwd(current_absolute_path, MAX_SIZE);
#endif
fd = BasicOpenFile(XLOG_CONTROL_FILE, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not create control file \"%s\": %m", XLOG_CONTROL_FILE)));
}
/* create pg_control file */
if (ENABLE_DSS) {
SSWriteInstanceControlFile(fd, buffer, g_instance.attr.attr_storage.dms_attr.instance_id, PG_CONTROL_SIZE);
} else {
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_WRITE);
if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE) {
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_SYNC);
if (pg_fsync(fd) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
if (close(fd)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
}
}
STATIC void ReadControlFile(void)
{
pg_crc32c crc;
int fd = -1;
char *fname = NULL;
bool retry = false;
errno_t errorno = EOK;
    /* Read data... */
#ifdef USE_ASSERT_CHECKING
#define MAX_SIZE 1024
char current_absolute_path[MAX_SIZE] = {0};
getcwd(current_absolute_path, MAX_SIZE);
#endif
fname = XLOG_CONTROL_FILE;
loop:
fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname)));
}
off_t seekpos = GetControlPageByInstanceId();
(void)lseek(fd, seekpos, SEEK_SET);
if (ENABLE_DSS) {
int read_size = BUFFERALIGN(sizeof(ControlFileData));
char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER)));
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_READ);
if (read(fd, buffer, read_size) != read_size) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
errorno = memcpy_s(t_thrd.shemem_ptr_cxt.ControlFile,
sizeof(ControlFileData), buffer, sizeof(ControlFileData));
securec_check_c(errorno, "\0", "\0");
} else {
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_READ);
if (read(fd, t_thrd.shemem_ptr_cxt.ControlFile, sizeof(ControlFileData)) !=
sizeof(ControlFileData)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
}
if (close(fd)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
}
/* Now check the CRC. */
INIT_CRC32C(crc);
COMP_CRC32C(crc, (char *)t_thrd.shemem_ptr_cxt.ControlFile, offsetof(ControlFileData, crc));
FIN_CRC32C(crc);
if (!EQ_CRC32C(crc, t_thrd.shemem_ptr_cxt.ControlFile->crc)) {
if (retry == false) {
ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname)));
fname = XLOG_CONTROL_FILE_BAK;
retry = true;
goto loop;
} else {
ereport(FATAL, (errmsg("incorrect checksum in control file")));
}
}
/*
* Check for expected pg_control format version. If this is wrong, the
* CRC check will likely fail because we'll be checking the wrong number
* of bytes. Complaining about wrong version will probably be more
* enlightening than complaining about wrong CRC.
*/
if (t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version != PG_CONTROL_VERSION &&
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version % 65536 == 0 &&
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version / 65536 != 0) {
ereport(
FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %u (0x%08x),"
" but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version,
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version, PG_CONTROL_VERSION, PG_CONTROL_VERSION),
errhint("This could be a problem of mismatched byte ordering. It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version != PG_CONTROL_VERSION) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with PG_CONTROL_VERSION %u,"
" but the server was compiled with PG_CONTROL_VERSION %d.",
t_thrd.shemem_ptr_cxt.ControlFile->pg_control_version, PG_CONTROL_VERSION),
errhint("It looks like you need to gs_initdb.")));
}
/*
* Do compatibility checking immediately. If the database isn't
* compatible with the backend executable, we want to abort before we can
* possibly do any damage.
*/
if (t_thrd.shemem_ptr_cxt.ControlFile->catalog_version_no != CATALOG_VERSION_NO) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with CATALOG_VERSION_NO %u,"
" but the server was compiled with CATALOG_VERSION_NO %d.",
t_thrd.shemem_ptr_cxt.ControlFile->catalog_version_no, CATALOG_VERSION_NO),
errhint("It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->maxAlign != MAXIMUM_ALIGNOF) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with MAXALIGN %u,"
" but the server was compiled with MAXALIGN %d.",
t_thrd.shemem_ptr_cxt.ControlFile->maxAlign, MAXIMUM_ALIGNOF),
errhint("It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->floatFormat != FLOATFORMAT_VALUE) {
ereport(FATAL,
(errmsg("database files are incompatible with server"),
errdetail("The database cluster appears to use a different floating-point number format than the "
"server executable."),
errhint("It looks like you need to gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->blcksz != BLCKSZ) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with BLCKSZ %u"
" but the server was compiled with BLCKSZ %d.",
t_thrd.shemem_ptr_cxt.ControlFile->blcksz, BLCKSZ),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->relseg_size != RELSEG_SIZE) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with RELSEG_SIZE %u,"
" but the server was compiled with RELSEG_SIZE %d.",
t_thrd.shemem_ptr_cxt.ControlFile->relseg_size, RELSEG_SIZE),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->xlog_blcksz != XLOG_BLCKSZ) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with XLOG_BLCKSZ %u,"
" but the server was compiled with XLOG_BLCKSZ %d.",
t_thrd.shemem_ptr_cxt.ControlFile->xlog_blcksz, XLOG_BLCKSZ),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->xlog_seg_size != XLogSegSize) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with XLOG_SEG_SIZE %u,"
" but the server was compiled with XLOG_SEG_SIZE %lu.",
t_thrd.shemem_ptr_cxt.ControlFile->xlog_seg_size, XLogSegSize),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->nameDataLen != NAMEDATALEN) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with NAMEDATALEN %u,"
" but the server was compiled with NAMEDATALEN %d.",
t_thrd.shemem_ptr_cxt.ControlFile->nameDataLen, NAMEDATALEN),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->indexMaxKeys != INDEX_MAX_KEYS) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with INDEX_MAX_KEYS %u"
" but the server was compiled with INDEX_MAX_KEYS %d.",
t_thrd.shemem_ptr_cxt.ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
errhint("It looks like you need to recompile or gs_initdb.")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %u,"
" but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
t_thrd.shemem_ptr_cxt.ControlFile->toast_max_chunk_size, (int)TOAST_MAX_CHUNK_SIZE),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#ifdef HAVE_INT64_TIMESTAMP
if (t_thrd.shemem_ptr_cxt.ControlFile->enableIntTimes != true) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
" but the server was compiled with HAVE_INT64_TIMESTAMP."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#else
if (t_thrd.shemem_ptr_cxt.ControlFile->enableIntTimes != false) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
" but the server was compiled without HAVE_INT64_TIMESTAMP."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#endif
#ifdef USE_FLOAT4_BYVAL
if (t_thrd.shemem_ptr_cxt.ControlFile->float4ByVal != true) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
" but the server was compiled with USE_FLOAT4_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#else
if (t_thrd.shemem_ptr_cxt.ControlFile->float4ByVal != false) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
" but the server was compiled without USE_FLOAT4_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#endif
#ifdef USE_FLOAT8_BYVAL
if (t_thrd.shemem_ptr_cxt.ControlFile->float8ByVal != true) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
" but the server was compiled with USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#else
if (t_thrd.shemem_ptr_cxt.ControlFile->float8ByVal != false) {
ereport(FATAL, (errmsg("database files are incompatible with server"),
errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
" but the server was compiled without USE_FLOAT8_BYVAL."),
errhint("It looks like you need to recompile or gs_initdb.")));
}
#endif
/* recover the bad primary control file */
if (retry == true) {
RecoverControlFile();
}
}
/*
 * Copy ControlFile into a local variable, then calculate the CRC and write
 * pg_control from that local copy.
 * If all the work were done directly on ControlFile, there would be a window
 * between calculating the CRC and writing the file in which another thread
 * could modify ControlFile, leaving the CRC stored in it invalid.
*/
void UpdateControlFile(void)
{
int fd = -1;
int len;
errno_t err = EOK;
char *fname[2];
len = sizeof(ControlFileData);
int write_size;
write_size = (int)BUFFERALIGN(len);
char buffer[write_size] __attribute__((__aligned__(ALIGNOF_BUFFER))) = {0};
err = memcpy_s(&buffer, write_size, t_thrd.shemem_ptr_cxt.ControlFile, len);
securec_check(err, "\0", "\0");
INIT_CRC32C(((ControlFileData *)buffer)->crc);
COMP_CRC32C(((ControlFileData *)buffer)->crc, (char *)buffer, offsetof(ControlFileData, crc));
FIN_CRC32C(((ControlFileData *)buffer)->crc);
#ifdef USE_ASSERT_CHECKING
#define MAX_SIZE 1024
char current_absolute_path[MAX_SIZE] = {0};
getcwd(current_absolute_path, MAX_SIZE);
#endif
fname[0] = XLOG_CONTROL_FILE_BAK;
fname[1] = XLOG_CONTROL_FILE;
for (int i = 0; i < 2; i++) {
if (i == 0) {
fd = BasicOpenFile(fname[i], O_CREAT | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
} else {
fd = BasicOpenFile(fname[i], O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
}
if (fd < 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname[i])));
}
off_t seekpos = GetControlPageByInstanceId();
(void)lseek(fd, seekpos, SEEK_SET);
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
if (write(fd, buffer, write_size) != write_size) {
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
pgstat_report_waitevent(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
if (fsync(fd) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync control file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(fd)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
}
}
#ifndef ENABLE_MULTIPLE_NODES
/*
* Truncate DCF Log before Min(flushedIdx, minAppliedIdx) to avoid space waste,
* flushedIdx is the DCF index corresponding to flushed XLog LSN,
* minAppliedIdx is used to avoid full build caused by slow DCF follower.
*/
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf && t_thrd.dcf_cxt.dcfCtxInfo->isDcfStarted) {
DcfLogTruncate();
}
#endif
}
/*
* when incorrect checksum is detected in control file,
* we should recover the control file using the content of backup file
*/
static void RecoverControlFile(void)
{
int fd;
char buffer[PG_CONTROL_SIZE];
errno_t errorno = EOK;
ereport(WARNING, (errmsg("recover the control file due to incorrect checksum")));
errorno = memset_s(buffer, PG_CONTROL_SIZE, 0, PG_CONTROL_SIZE);
securec_check(errorno, "", "");
errorno = memcpy_s(buffer, PG_CONTROL_SIZE, t_thrd.shemem_ptr_cxt.ControlFile, sizeof(ControlFileData));
securec_check(errorno, "", "");
fd = BasicOpenFile(XLOG_CONTROL_FILE, O_TRUNC | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(FATAL, (errcode_for_file_access(),
errmsg("recover failed could not open control file \"%s\": %m", XLOG_CONTROL_FILE)));
}
off_t seekpos = GetControlPageByInstanceId();
(void)lseek(fd, seekpos, SEEK_SET);
errno = 0;
/* write the whole block */
if (write(fd, &buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE) {
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("recover failed could not write to control file: %m")));
}
if (pg_fsync(fd) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("recover failed could not fsync control file: %m")));
}
if (close(fd)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("recover failed could not close control file: %m")));
}
}
/*
* Returns the unique system identifier from control file.
*/
uint64 GetSystemIdentifier(void)
{
Assert(t_thrd.shemem_ptr_cxt.ControlFile != NULL);
return t_thrd.shemem_ptr_cxt.ControlFile->system_identifier;
}
void SetSystemIdentifier(uint64 system_identifier)
{
Assert(t_thrd.shemem_ptr_cxt.ControlFile != NULL);
t_thrd.shemem_ptr_cxt.ControlFile->system_identifier = system_identifier;
return;
}
TimeLineID GetThisTimeID(void)
{
Assert(t_thrd.shemem_ptr_cxt.ControlFile != NULL);
return t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID;
}
void SetThisTimeID(TimeLineID timelineID)
{
Assert(t_thrd.shemem_ptr_cxt.ControlFile != NULL);
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID = timelineID;
return;
}
/*
* Auto-tune the number of XLOG buffers.
*
* The preferred setting for wal_buffers is about 3% of shared_buffers, with
* a maximum of one XLOG segment (there is little reason to think that more
* is helpful, at least so long as we force an fsync when switching log files)
 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
* 9.1, when auto-tuning was added).
*
* This should not be called until NBuffers has received its final value.
*/
static int XLOGChooseNumBuffers(void)
{
int xbuffers = g_instance.attr.attr_storage.NBuffers / 32;
if (xbuffers > (int)XLogSegSize / XLOG_BLCKSZ) {
xbuffers = (int)XLogSegSize / XLOG_BLCKSZ;
}
if (xbuffers < 8) {
xbuffers = 8;
}
return xbuffers;
}
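/*
 * Worked example, assuming 8 KB XLOG blocks and 16 MB segments: with
 * shared_buffers = 16384 (128 MB) the starting point is 16384 / 32 = 512
 * buffers; a very large shared_buffers is capped at XLogSegSize /
 * XLOG_BLCKSZ = 2048 buffers (one segment), and anything below 8 is raised
 * to the 8-buffer floor.
 */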
/*
* GUC check_hook for wal_buffers
*/
bool check_wal_buffers(int *newval, void **extra, GucSource source)
{
/* -1 indicates a request for auto-tune. */
if (*newval == -1) {
/*
* If we haven't yet changed the boot_val default of -1, just let it
* be. We'll fix it when XLOGShmemSize is called.
*/
if (g_instance.attr.attr_storage.XLOGbuffers == -1) {
return true;
}
/* Otherwise, substitute the auto-tune value */
*newval = XLOGChooseNumBuffers();
}
/*
* We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
* 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
* the case, we just silently treat such values as a request for the
* minimum. (We could throw an error instead, but that doesn't seem very
* helpful.)
*/
if (*newval < 4) {
*newval = 4;
}
return true;
}
/*
* Initialization of shared memory for XLOG
*/
Size XLOGShmemSize(void)
{
Size size;
errno_t errorno = EOK;
/*
* If the value of wal_buffers is -1, use the preferred auto-tune value.
* This isn't an amazingly clean place to do this, but we must wait till
* NBuffers has received its final value, and must do it before using the
* value of XLOGbuffers to do anything important.
*/
if (g_instance.attr.attr_storage.XLOGbuffers == -1) {
char buf[32];
errorno = snprintf_s(buf, sizeof(buf), sizeof(buf) - 1, "%d", XLOGChooseNumBuffers());
securec_check_ss(errorno, "", "");
SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
}
Assert(g_instance.attr.attr_storage.XLOGbuffers > 0);
/* XLogCtl */
size = sizeof(XLogCtlData) + 16;
/* xlog insertion slots, plus alignment will be alloced individually */
/* xlblocks array */
size = add_size(size, mul_size(sizeof(XLogRecPtr), g_instance.attr.attr_storage.XLOGbuffers));
/* extra alignment padding for XLOG I/O buffers */
size = add_size(size, XLOG_BLCKSZ);
/* and the buffers themselves */
size = add_size(size, mul_size(XLOG_BLCKSZ, g_instance.attr.attr_storage.XLOGbuffers));
/*
* Note: we don't count ControlFileData, it comes out of the "slop factor"
* added by CreateSharedMemoryAndSemaphores. This lets us use this
* routine again below to compute the actual allocation size.
*/
return size;
}
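/*
 * Rough size sketch, assuming XLOGbuffers = 2048 and 8 KB XLOG blocks: the
 * request adds up sizeof(XLogCtlData) + 16, plus 2048 * sizeof(XLogRecPtr)
 * for the xlblocks array, plus XLOG_BLCKSZ of alignment padding, plus
 * 2048 * 8192 bytes (16 MB) for the page buffers themselves.
 */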
void XLOGShmemInit(void)
{
bool foundCFile = false;
bool foundDCFData = false;
bool foundXLog = false;
char *allocptr = NULL;
int i;
errno_t errorno = EOK;
int nNumaNodes = g_instance.shmem_cxt.numaNodeNum;
if (!IsUnderPostmaster && g_instance.attr.attr_storage.num_xloginsert_locks % nNumaNodes != 0) {
ereport(PANIC,
(errmsg("XLOGShmemInit num_xloginsert_locks should be multiple of NUMA node number in the system.")));
}
g_instance.wal_cxt.num_locks_in_group = g_instance.attr.attr_storage.num_xloginsert_locks / nNumaNodes;
t_thrd.shemem_ptr_cxt.ControlFile = (ControlFileData *)ShmemInitStruct("Control File", sizeof(ControlFileData),
&foundCFile);
t_thrd.shemem_ptr_cxt.dcfData = NULL;
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
t_thrd.shemem_ptr_cxt.dcfData = (DCFData *)ShmemInitStruct("DCF data", sizeof(DCFData), &foundDCFData);
}
#endif
t_thrd.shemem_ptr_cxt.XLogCtl = (XLogCtlData *)ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
t_thrd.shemem_ptr_cxt.XLogCtl = (XLogCtlData *)TYPEALIGN(16, t_thrd.shemem_ptr_cxt.XLogCtl);
if (foundCFile || foundXLog || foundDCFData) {
        /* all of these should be present, or none of them */
Assert(foundCFile && foundXLog);
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
Assert(foundDCFData);
}
#endif
/* Initialize local copy of WALInsertLocks and register the tranche */
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.WALInsertLocks;
t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks =
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[t_thrd.proc->nodeno];
        /* Reset the WAL buffers only before startup-thread initialization, in which StartupXLOG pushes the LSN */
if (ENABLE_DMS && t_thrd.role == STARTUP && (((SS_STANDBY_PROMOTING || SS_PRIMARY_DEMOTED) &&
g_instance.dms_cxt.SSRecoveryInfo.new_primary_reset_walbuf_flag == true) ||
SS_STANDBY_FAILOVER)) {
g_instance.dms_cxt.SSRecoveryInfo.new_primary_reset_walbuf_flag = false;
errorno = memset_s(t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers, 0,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers);
securec_check(errorno, "", "");
ereport(LOG, (errmsg("[SS %s] Successfully reset xlblocks when thrd:%lu with role:%d started",
SS_PERFORMING_SWITCHOVER ? "switchover" : "failover", t_thrd.proc->pid, (int)t_thrd.role)));
}
return;
}
errorno = memset_s(t_thrd.shemem_ptr_cxt.XLogCtl, sizeof(XLogCtlData), 0, sizeof(XLogCtlData));
securec_check(errorno, "", "");
/*
* Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
* multiple of the alignment for same, so no extra alignment padding is
* needed here.
*/
allocptr = ((char *)t_thrd.shemem_ptr_cxt.XLogCtl) + sizeof(XLogCtlData);
t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks = (XLogRecPtr *)allocptr;
errorno = memset_s(t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers, 0,
sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers);
securec_check(errorno, "", "");
allocptr += sizeof(XLogRecPtr) * g_instance.attr.attr_storage.XLOGbuffers;
/* WAL insertion locks. Ensure they're aligned to the full padded size */
WALInsertLockPadded **insertLockGroupPtr = (WALInsertLockPadded **)CACHELINEALIGN(
palloc0(nNumaNodes * sizeof(WALInsertLockPadded *) + PG_CACHE_LINE_SIZE));
#ifdef __USE_NUMA
if (nNumaNodes > 1) {
size_t allocSize = sizeof(WALInsertLockPadded) * g_instance.wal_cxt.num_locks_in_group + PG_CACHE_LINE_SIZE;
for (int i = 0; i < nNumaNodes; i++) {
char *pInsertLock = (char *)numa_alloc_onnode(allocSize, i);
if (pInsertLock == NULL) {
ereport(PANIC, (errmsg("XLOGShmemInit could not alloc memory on node %d", i)));
}
add_numa_alloc_info(pInsertLock, allocSize);
insertLockGroupPtr[i] = (WALInsertLockPadded *)(CACHELINEALIGN(pInsertLock));
}
} else {
#endif
char *pInsertLock = (char *)CACHELINEALIGN(palloc(
sizeof(WALInsertLockPadded) * g_instance.attr.attr_storage.num_xloginsert_locks + PG_CACHE_LINE_SIZE));
insertLockGroupPtr[0] = (WALInsertLockPadded *)(CACHELINEALIGN(pInsertLock));
#ifdef __USE_NUMA
}
#endif
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks = t_thrd.shemem_ptr_cxt.XLogCtl->Insert.WALInsertLocks =
insertLockGroupPtr;
t_thrd.shemem_ptr_cxt.LocalGroupWALInsertLocks = t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[0];
for (int processorIndex = 0; processorIndex < nNumaNodes; processorIndex++) {
for (i = 0; i < g_instance.wal_cxt.num_locks_in_group; i++) {
LWLockInitialize(&t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[processorIndex][i].l.lock,
LWTRANCHE_WAL_INSERT);
t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[processorIndex][i].l.insertingAt = InvalidXLogRecPtr;
#ifdef __aarch64__
pg_atomic_init_u32(&t_thrd.shemem_ptr_cxt.GlobalWALInsertLocks[processorIndex][i].l.xlogGroupFirst,
INVALID_PGPROCNO);
#endif
}
}
/*
* Align the start of the page buffers to a full xlog block size boundary.
* This simplifies some calculations in XLOG insertion. It is also required
* for O_DIRECT.
*/
allocptr = (char *)TYPEALIGN(XLOG_BLCKSZ, allocptr);
t_thrd.shemem_ptr_cxt.XLogCtl->pages = allocptr;
    /* The memset region sometimes exceeds 2 GB, so memset_s cannot be used. */
MemSet(t_thrd.shemem_ptr_cxt.XLogCtl->pages, 0, (Size)XLOG_BLCKSZ * g_instance.attr.attr_storage.XLOGbuffers);
if (BBOX_BLACKLIST_XLOG_BUFFER) {
bbox_blacklist_add(XLOG_BUFFER, t_thrd.shemem_ptr_cxt.XLogCtl->pages,
(uint64)XLOG_BLCKSZ * g_instance.attr.attr_storage.XLOGbuffers);
}
/*
* Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
* in additional info.)
*/
t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck = g_instance.attr.attr_storage.XLOGbuffers - 1;
t_thrd.shemem_ptr_cxt.XLogCtl->SharedRecoveryInProgress = true;
t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRedoDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->SharedHotStandbyActive = false;
t_thrd.shemem_ptr_cxt.XLogCtl->WalWriterSleeping = false;
t_thrd.shemem_ptr_cxt.XLogCtl->xlogFlushPtrForPerRead = InvalidXLogRecPtr;
if (!IsInitdb && !dummyStandbyMode) {
t_thrd.shemem_ptr_cxt.XLogCtl->lastRemovedSegNo = GetOldestXLOGSegNo(t_thrd.proc_cxt.DataDir);
}
#if (!defined __x86_64__) && (!defined __aarch64__)
SpinLockInit(&t_thrd.shemem_ptr_cxt.XLogCtl->Insert.insertpos_lck);
#endif
SpinLockInit(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
InitSharedLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
InitSharedLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
/*
* If we are not in bootstrap mode, pg_control should already exist. Read
* and validate it immediately (see comments in ReadControlFile() for the
* reasons why).
*/
if (!IsBootstrapProcessingMode()) {
ReadControlFile();
}
}
static XLogSegNo GetOldestXLOGSegNo(const char *workingPath)
{
DIR *xlogDir = NULL;
struct dirent *dirEnt = NULL;
char xlogDirStr[MAXPGPATH] = {0};
char oldestXLogFileName[MAXPGPATH] = {0};
TimeLineID tli = 0;
uint32 xlogReadLogid = -1;
uint32 xlogReadLogSeg = -1;
XLogSegNo segno;
errno_t rc = EOK;
if (ENABLE_DSS) {
rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s", SS_XLOGDIR);
} else {
rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s/%s", workingPath, XLOGDIR);
}
securec_check_ss(rc, "", "");
xlogDir = opendir(xlogDirStr);
if (!xlogDir) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open xlog dir in GetOldestXLOGSegNo.")));
}
while ((dirEnt = readdir(xlogDir)) != NULL) {
if (strlen(dirEnt->d_name) == XLOGFILENAMELEN &&
strspn(dirEnt->d_name, "0123456789ABCDEF") == XLOGFILENAMELEN) {
if (strlen(oldestXLogFileName) == 0 || strcmp(dirEnt->d_name, oldestXLogFileName) < 0) {
rc = strncpy_s(oldestXLogFileName, MAXPGPATH - 1, dirEnt->d_name, strlen(dirEnt->d_name) + 1);
securec_check_ss(rc, "", "");
oldestXLogFileName[strlen(dirEnt->d_name)] = '\0';
}
}
}
(void)closedir(xlogDir);
if (sscanf_s(oldestXLogFileName, "%08X%08X%08X", &tli, &xlogReadLogid, &xlogReadLogSeg) != 3) {
ereport(ERROR, (errcode_for_file_access(), errmsg("failed to translate name to xlog in GetOldestXLOGSegNo.")));
}
segno = (uint64)xlogReadLogid * XLogSegmentsPerXLogId + xlogReadLogSeg - 1;
return segno;
}
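/*
 * Filename decoding example, assuming 16 MB segments (XLogSegmentsPerXLogId
 * is then 256): "000000010000000200000003" parses as tli = 1, logid = 2,
 * logseg = 3, giving segno = 2 * 256 + 3 - 1 = 514; the "- 1" records the
 * predecessor of the oldest on-disk segment as the last removed one.
 */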
XLogSegNo GetNewestXLOGSegNo(const char *workingPath)
{
DIR *xlogDir = NULL;
struct dirent *dirEnt = NULL;
char xlogDirStr[MAXPGPATH] = {0};
char newestXLogFileName[MAXPGPATH] = {0};
TimeLineID tli = 0;
uint32 xlogReadLogid = -1;
uint32 xlogReadLogSeg = -1;
XLogSegNo segno;
errno_t rc = EOK;
if (ENABLE_DSS) {
rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s", SS_XLOGDIR);
} else {
rc = snprintf_s(xlogDirStr, MAXPGPATH, MAXPGPATH - 1, "%s/%s", workingPath, XLOGDIR);
}
securec_check_ss(rc, "", "");
xlogDir = opendir(xlogDirStr);
if (!xlogDir) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open xlog dir in GetNewestXLOGSegNo.")));
}
while ((dirEnt = readdir(xlogDir)) != NULL) {
if (strlen(dirEnt->d_name) == XLOGFILENAMELEN &&
strspn(dirEnt->d_name, "0123456789ABCDEF") == XLOGFILENAMELEN) {
if (strlen(newestXLogFileName) == 0 || strcmp(dirEnt->d_name, newestXLogFileName) > 0) {
rc = strncpy_s(newestXLogFileName, MAXPGPATH - 1, dirEnt->d_name, strlen(dirEnt->d_name) + 1);
securec_check_ss(rc, "", "");
newestXLogFileName[strlen(dirEnt->d_name)] = '\0';
}
}
}
(void)closedir(xlogDir);
if (sscanf_s(newestXLogFileName, "%08X%08X%08X", &tli, &xlogReadLogid, &xlogReadLogSeg) != 3) {
ereport(ERROR, (errcode_for_file_access(), errmsg("failed to translate name to xlog in GetNewestXLOGSegNo.")));
}
segno = (uint64)xlogReadLogid * XLogSegmentsPerXLogId + xlogReadLogSeg;
return segno;
}
static uint64 GetMACAddr(void)
{
macaddr mac;
uint64 macAddr;
int sockFd = NO_SOCKET;
struct ifconf ifconfInfo;
struct ifreq ifreqInfo;
char *buf = NULL;
errno_t ss_rc = EOK;
uint32 i;
ss_rc = memset_s((void *)&mac, sizeof(macaddr), 0, sizeof(macaddr));
securec_check(ss_rc, "\0", "\0");
sockFd = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
if (sockFd != NO_SOCKET) {
buf = (char *)palloc(MaxMacAddrList * sizeof(ifreq));
ifconfInfo.ifc_len = MaxMacAddrList * sizeof(ifreq);
ifconfInfo.ifc_buf = buf;
if (ioctl(sockFd, SIOCGIFCONF, &ifconfInfo) != -1) {
struct ifreq *ifrepTmp = ifconfInfo.ifc_req;
for (i = 0; i < (ifconfInfo.ifc_len / sizeof(struct ifreq)); i++) {
ss_rc = strcpy_s(ifreqInfo.ifr_name, strlen(ifrepTmp->ifr_name) + 1, ifrepTmp->ifr_name);
securec_check(ss_rc, "\0", "\0");
if (ioctl(sockFd, SIOCGIFFLAGS, &ifreqInfo) == 0) {
if (!(ifreqInfo.ifr_flags & IFF_LOOPBACK)) {
if (ioctl(sockFd, SIOCGIFHWADDR, &ifreqInfo) == 0) {
mac.a = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[0];
mac.b = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[1];
mac.c = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[2];
mac.d = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[3];
mac.e = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[4];
mac.f = (unsigned char)ifreqInfo.ifr_hwaddr.sa_data[5];
break;
}
}
}
ifrepTmp++;
}
}
pfree_ext(buf);
close(sockFd);
}
macAddr = ((uint64)mac.a << 40) | ((uint64)mac.b << 32) | ((uint64)mac.c << 24) | ((uint64)mac.d << 16) |
((uint64)mac.e << 8) | (uint64)mac.f;
return macAddr;
}
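/*
 * Packing example with an illustrative address: for a NIC whose MAC is
 * 00:1B:44:11:3A:B7, the six bytes land in the low 48 bits of the uint64,
 * giving 0x001B44113AB7; a host with only a loopback interface leaves the
 * address zeroed and macAddr comes out as 0.
 */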
/*
 * This function must be called ONCE at system install time. It creates pg_control
* and the initial XLOG segment.
*/
void BootStrapXLOG(void)
{
CheckPoint checkPoint;
char *buffer = NULL;
XLogPageHeader page;
XLogLongPageHeader longpage;
XLogRecord *record = NULL;
char *recptr = NULL;
bool use_existent = false;
uint64 sysidentifier;
uint64 macAddr;
pg_crc32c crc;
errno_t ret = EOK;
TransactionId latestCompletedXid;
int fd = 0;
int len = 0;
int randomNum;
/*
* Select a hopefully-unique system identifier code for this installation.
 * We use the result of GetMACAddr() for the upper 48 bits and fill the low
 * 16 bits with a random number from the Linux system to keep identifiers distinct.
*/
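    /*
     * Composition sketch with illustrative values: a MAC of 0x001B44113AB7
     * and a random low word of 0x5EDC give
     * sysidentifier = (0x001B44113AB7 << 16) | 0x5EDC = 0x1B44113AB75EDC.
     */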
macAddr = GetMACAddr();
sysidentifier = macAddr << 16;
fd = open("/dev/urandom", O_RDONLY, 0);
if (fd == -1) {
        ereport(ERROR, (errcode_for_file_access(), errmsg("cannot read a random number from file \"/dev/urandom\".")));
}
    len = read(fd, &randomNum, sizeof(randomNum));
    if (len != sizeof(randomNum)) {
        close(fd);
        ereport(ERROR,
                (errcode_for_file_access(), errmsg("cannot read a random number from file \"/dev/urandom\".")));
    }
close(fd);
sysidentifier |= (uint32)randomNum & 0xFFFF;
ereport(WARNING, (errmsg_internal("macAddr is %u/%u, sysidentifier is %u/%u, randomNum is %u",
(uint32)(macAddr >> 32), (uint32)macAddr, (uint32)(sysidentifier >> 32),
(uint32)sysidentifier, (uint32)randomNum)));
/* First timeline ID is always 1 */
t_thrd.xlog_cxt.ThisTimeLineID = 1;
/* page buffer must be aligned suitably for O_DIRECT */
buffer = (char *)palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
page = (XLogPageHeader)TYPEALIGN(XLOG_BLCKSZ, buffer);
ret = memset_s(page, XLOG_BLCKSZ, 0, XLOG_BLCKSZ);
securec_check(ret, "", "");
/*
* Set up information for the initial checkpoint record
*
* The initial checkpoint record is written to the beginning of the WAL
* segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
* used, so that we can use 0/0 to mean "before any valid WAL segment".
*/
if (ENABLE_INCRE_CKPT) {
u_sess->attr.attr_storage.fullPageWrites = false;
}
checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
checkPoint.ThisTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
checkPoint.fullPageWrites = u_sess->attr.attr_storage.fullPageWrites;
checkPoint.nextXid = FirstNormalTransactionId + 1;
checkPoint.nextOid = FirstBootstrapObjectId;
checkPoint.nextMulti = FirstMultiXactId + 1;
checkPoint.nextMultiOffset = 0;
checkPoint.oldestXid = FirstNormalTransactionId;
checkPoint.oldestXidDB = TemplateDbOid;
checkPoint.time = (pg_time_t)time(NULL);
checkPoint.oldestActiveXid = InvalidTransactionId;
checkPoint.remove_seg = InvalidXLogSegPtr;
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1;
t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo - 1;
latestCompletedXid = checkPoint.nextXid;
TransactionIdRetreat(latestCompletedXid);
t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestCompletedXid;
/* init recentGlobalXmin and xmin, to nextXid */
t_thrd.xact_cxt.ShmemVariableCache->xmin = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin = latestCompletedXid;
t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid);
/* Set up the XLOG page header */
page->xlp_magic = XLOG_PAGE_MAGIC;
page->xlp_info = XLP_LONG_HEADER;
page->xlp_tli = t_thrd.xlog_cxt.ThisTimeLineID;
page->xlp_pageaddr = XLogSegSize;
longpage = (XLogLongPageHeader)page;
longpage->xlp_sysid = sysidentifier;
longpage->xlp_seg_size = XLogSegSize;
longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
/* Insert the initial checkpoint record */
recptr = ((char *)page + SizeOfXLogLongPHD);
record = (XLogRecord *)recptr;
record->xl_prev = 0;
record->xl_xid = InvalidTransactionId;
record->xl_tot_len = CHECKPOINT_LEN;
record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
record->xl_rmid = RM_XLOG_ID;
recptr += SizeOfXLogRecord;
/* fill the XLogRecordDataHeaderShort struct */
*(recptr++) = XLR_BLOCK_ID_DATA_SHORT;
*(recptr++) = sizeof(checkPoint);
ret = memcpy_s(recptr, sizeof(checkPoint), &checkPoint, sizeof(checkPoint));
securec_check(ret, "", "");
recptr += sizeof(checkPoint);
Assert(recptr - (char *)record == record->xl_tot_len);
INIT_CRC32C(crc);
COMP_CRC32C(crc, ((char *)record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
COMP_CRC32C(crc, (char *)record, offsetof(XLogRecord, xl_crc));
FIN_CRC32C(crc);
record->xl_crc = crc;
/* Create first XLOG segment file */
use_existent = false;
t_thrd.xlog_cxt.openLogFile = XLogFileInit(1, &use_existent, false);
/* Write the first page with the initial record */
errno = 0;
pgstat_report_waitevent(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
if (write(t_thrd.xlog_cxt.openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ) {
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write bootstrap transaction log file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
pgstat_report_waitevent(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
if (pg_fsync(t_thrd.xlog_cxt.openLogFile) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync bootstrap transaction log file: %m")));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (close(t_thrd.xlog_cxt.openLogFile)) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not close bootstrap transaction log file: %m")));
}
t_thrd.xlog_cxt.openLogFile = -1;
    /* In SS mode, the first node to create the control file will be the primary */
if (ENABLE_DSS) {
SSInitReformerControlPages();
}
/* Now create pg_control */
ret = memset_s(t_thrd.shemem_ptr_cxt.ControlFile, sizeof(ControlFileData), 0, sizeof(ControlFileData));
securec_check(ret, "", "");
/* Initialize pg_control status fields */
t_thrd.shemem_ptr_cxt.ControlFile->system_identifier = sysidentifier;
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED;
t_thrd.shemem_ptr_cxt.ControlFile->time = checkPoint.time;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = checkPoint.redo;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = checkPoint;
/* Set important parameter values for use when replaying WAL */
t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections = g_instance.shmem_cxt.MaxConnections;
t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts = g_instance.attr.attr_storage.max_prepared_xacts;
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact = g_instance.attr.attr_storage.max_locks_per_xact;
t_thrd.shemem_ptr_cxt.ControlFile->wal_level = g_instance.attr.attr_storage.wal_level;
t_thrd.shemem_ptr_cxt.ControlFile->bootstrap_segment = EnableInitDBSegment;
/* some additional ControlFile fields are set in WriteControlFile() */
WriteControlFile();
if (IS_SHARED_STORAGE_MODE) {
ShareStorageXLogCtl *ctlInfo = g_instance.xlog_cxt.shareStorageXLogCtl;
InitShareStorageCtlInfo(ctlInfo, sysidentifier);
int writeLen = WriteXlogToShareStorage(XLogSegSize, (char *)page, XLOG_BLCKSZ);
if (writeLen != XLOG_BLCKSZ) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("write checkpoint info faild: len:%u, version:%u, start:%lu, end:%lu",
ctlInfo->length, ctlInfo->version, ctlInfo->insertHead, ctlInfo->insertTail)));
}
ctlInfo->insertHead = MAXALIGN((recptr - (char *)page) + XLogSegSize);
ctlInfo->crc = CalShareStorageCtlInfoCrc(ctlInfo);
UpdateShareStorageCtlInfo(ctlInfo);
FsyncXlogToShareStorage();
ereport(LOG, (errcode_for_file_access(),
errmsg("update dorado ctl info: len:%u, version:%u, start:%lu, end:%lu", ctlInfo->length,
ctlInfo->version, ctlInfo->insertHead, ctlInfo->insertTail)));
}
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
ret = memset_s(t_thrd.shemem_ptr_cxt.dcfData, sizeof(DCFData), 0, sizeof(DCFData));
securec_check(ret, "", "");
/* Write DCF paxos index file during initdb */
t_thrd.shemem_ptr_cxt.dcfData->dcfDataVersion = DCF_DATA_VERSION;
t_thrd.shemem_ptr_cxt.dcfData->appliedIndex = 0;
t_thrd.shemem_ptr_cxt.dcfData->realMinAppliedIdx = 0;
INIT_CRC32C(t_thrd.shemem_ptr_cxt.dcfData->crc);
COMP_CRC32C(t_thrd.shemem_ptr_cxt.dcfData->crc, (char *)t_thrd.shemem_ptr_cxt.dcfData, offsetof(DCFData, crc));
FIN_CRC32C(t_thrd.shemem_ptr_cxt.dcfData->crc);
if (!SaveAppliedIndex()) {
ereport(PANIC, (errmsg("Write paxos index failed!")));
}
}
#endif
/* bootstrap double write */
MemoryContext mem_cxt = AllocSetContextCreate(INSTANCE_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE),
"DoubleWriteContext", ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, SHARED_CONTEXT);
MemoryContext old_mem_cxt = MemoryContextSwitchTo(mem_cxt);
dw_bootstrap();
(void)MemoryContextSwitchTo(old_mem_cxt);
/* Bootstrap the commit log, too */
if (!SS_STANDBY_INST_SKIP_SHARED_FILE) {
BootStrapCLOG();
BootStrapCSNLOG();
BootStrapMultiXact();
}
pfree_ext(buffer);
}
static char *str_time(pg_time_t tnow)
{
pg_strftime(t_thrd.xlog_cxt.buf, sizeof(t_thrd.xlog_cxt.buf), "%Y-%m-%d %H:%M:%S %Z",
pg_localtime(&tnow, log_timezone));
return t_thrd.xlog_cxt.buf;
}
/*
* See if there is a recovery command file (recovery.conf), and if so
* read in parameters for archive recovery and XLOG streaming.
*
* The file is parsed using the main configuration parser.
*/
static void readRecoveryCommandFile(void)
{
FILE *fd = NULL;
TimeLineID rtli = 0;
bool rtliGiven = false;
ConfigVariable *item = NULL;
ConfigVariable *head = NULL;
ConfigVariable *tail = NULL;
fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
if (fd == NULL) {
if (errno == ENOENT) {
return; /* not there, so no archive recovery */
}
ereport(FATAL, (errcode_for_file_access(),
errmsg("could not open recovery command file \"%s\": %m", RECOVERY_COMMAND_FILE)));
}
/*
* Since we're asking ParseConfigFp() to report errors as FATAL, there's
* no need to check the return value.
*/
(void)ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
(void)FreeFile(fd);
fd = NULL;
for (item = head; item; item = item->next) {
if (strcmp(item->name, "restore_command") == 0 && t_thrd.xlog_cxt.server_mode != STANDBY_MODE) {
t_thrd.xlog_cxt.recoveryRestoreCommand = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("restore_command = '%s'", t_thrd.xlog_cxt.recoveryRestoreCommand)));
} else if (strcmp(item->name, "recovery_end_command") == 0) {
t_thrd.xlog_cxt.recoveryEndCommand = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("recovery_end_command = '%s'", t_thrd.xlog_cxt.recoveryEndCommand)));
} else if (strcmp(item->name, "archive_cleanup_command") == 0) {
t_thrd.xlog_cxt.archiveCleanupCommand = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("archive_cleanup_command = '%s'", t_thrd.xlog_cxt.archiveCleanupCommand)));
} else if (strcmp(item->name, "pause_at_recovery_target") == 0) {
if (!parse_bool(item->value, &t_thrd.xlog_cxt.recoveryPauseAtTarget)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a Boolean value", "pause_at_recovery_target")));
}
ereport(DEBUG2, (errmsg_internal("pause_at_recovery_target = '%s'", item->value)));
} else if (strcmp(item->name, "recovery_target_timeline") == 0) {
rtliGiven = true;
if (strcmp(item->value, "latest") == 0) {
rtli = 0;
} else {
errno = 0;
rtli = (TimeLineID)strtoul(item->value, NULL, 0);
if (errno == EINVAL || errno == ERANGE) {
ereport(FATAL, (errmsg("recovery_target_timeline is not a valid number: \"%s\"", item->value)));
}
}
if (rtli) {
ereport(DEBUG2, (errmsg_internal("recovery_target_timeline = %u", rtli)));
} else {
ereport(DEBUG2, (errmsg_internal("recovery_target_timeline = latest")));
}
} else if (strcmp(item->name, "recovery_target_xid") == 0) {
errno = 0;
t_thrd.xlog_cxt.recoveryTargetXid = (TransactionId)pg_strtouint64(item->value, NULL, 0);
if (errno == EINVAL || errno == ERANGE) {
ereport(FATAL, (errmsg("recovery_target_xid is not a valid number: \"%s\"", item->value)));
}
ereport(DEBUG2, (errmsg_internal("recovery_target_xid = " XID_FMT, t_thrd.xlog_cxt.recoveryTargetXid)));
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_XID;
} else if (strcmp(item->name, "recovery_target_time") == 0) {
/*
* if recovery_target_xid or recovery_target_name specified, then
* this overrides recovery_target_time
*/
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID ||
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME) {
continue;
}
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_TIME;
/*
* Convert the time string given by the user to TimestampTz form.
*/
t_thrd.xlog_cxt.recoveryTargetTime =
DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in, CStringGetDatum(item->value),
ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1)));
ereport(DEBUG2, (errmsg_internal("recovery_target_time = '%s'",
timestamptz_to_str(t_thrd.xlog_cxt.recoveryTargetTime))));
#ifdef PGXC
} else if (strcmp(item->name, "recovery_target_barrier") == 0) {
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_BARRIER;
t_thrd.xlog_cxt.recoveryTargetBarrierId = pstrdup(item->value);
#endif
} else if (strcmp(item->name, "recovery_target_time_obs") == 0) {
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_TIME_OBS;
g_instance.roach_cxt.targetTimeInPITR = pstrdup(item->value);
/*
* Convert the time string given by the user to TimestampTz form.
*/
t_thrd.xlog_cxt.obsRecoveryTargetTime = pstrdup(item->value);
ereport(LOG, (errmsg_internal("recovery_target_time_obs = '%s'", t_thrd.xlog_cxt.obsRecoveryTargetTime)));
} else if (strcmp(item->name, "recovery_target_name") == 0) {
/*
* if recovery_target_xid specified, then this overrides
* recovery_target_name
*/
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID) {
continue;
}
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_NAME;
t_thrd.xlog_cxt.recoveryTargetName = pstrdup(item->value);
if (strlen(t_thrd.xlog_cxt.recoveryTargetName) >= MAXFNAMELEN) {
ereport(FATAL, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("recovery_target_name is too long (maximum %d characters)", MAXFNAMELEN - 1)));
}
ereport(DEBUG2, (errmsg_internal("recovery_target_name = '%s'", t_thrd.xlog_cxt.recoveryTargetName)));
} else if (strcmp(item->name, "recovery_target_lsn") == 0) {
/*
* if recovery_target_xid or recovery_target_name or recovery_target_time
* specified, then this overrides recovery_target_lsn
*/
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID ||
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME ||
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME) {
continue;
}
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_LSN;
/*
* Convert the LSN string given by the user to XLogRecPtr form.
*/
t_thrd.xlog_cxt.recoveryTargetLSN = DatumGetLSN(DirectFunctionCall3(pg_lsn_in, CStringGetDatum(item->value),
ObjectIdGetDatum(InvalidOid),
Int32GetDatum(-1)));
ereport(DEBUG2,
(errmsg_internal("recovery_target_lsn = '%X/%X'", (uint32)(t_thrd.xlog_cxt.recoveryTargetLSN >> 32),
(uint32)t_thrd.xlog_cxt.recoveryTargetLSN)));
} else if (strcmp(item->name, "recovery_target_inclusive") == 0) {
// does nothing if a recovery_target is not also set
if (!parse_bool(item->value, &t_thrd.xlog_cxt.recoveryTargetInclusive)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a Boolean value", "recovery_target_inclusive")));
}
ereport(DEBUG2, (errmsg_internal("recovery_target_inclusive = %s", item->value)));
} else if (strcmp(item->name, "standby_mode") == 0) {
if (!parse_bool(item->value, &t_thrd.xlog_cxt.StandbyModeRequested)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("parameter \"%s\" requires a Boolean value", "standby_mode")));
}
ereport(DEBUG2, (errmsg_internal("standby_mode = '%s'", item->value)));
} else if (strcmp(item->name, "primary_conninfo") == 0) {
t_thrd.xlog_cxt.PrimaryConnInfo = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("primary_conninfo = '%s'", t_thrd.xlog_cxt.PrimaryConnInfo)));
} else if (strcmp(item->name, "primary_slotname") == 0) {
ReplicationSlotValidateName(item->value, ERROR);
u_sess->attr.attr_storage.PrimarySlotName = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("primary_slotname = '%s'", u_sess->attr.attr_storage.PrimarySlotName)));
} else if (strcmp(item->name, "trigger_file") == 0) {
t_thrd.xlog_cxt.TriggerFile = pstrdup(item->value);
ereport(DEBUG2, (errmsg_internal("trigger_file = '%s'", t_thrd.xlog_cxt.TriggerFile)));
} else {
ereport(FATAL, (errmsg("unrecognized recovery parameter \"%s\"", item->name)));
}
}
/*
* Check for compulsory parameters
*/
if (t_thrd.xlog_cxt.StandbyModeRequested) {
if (t_thrd.xlog_cxt.PrimaryConnInfo == NULL && t_thrd.xlog_cxt.recoveryRestoreCommand == NULL) {
ereport(WARNING,
(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
RECOVERY_COMMAND_FILE),
errhint("The database server will regularly poll the pg_xlog subdirectory to check for files "
"placed there.")));
}
} else {
if (t_thrd.xlog_cxt.recoveryRestoreCommand == NULL && XLogInsertAllowed()) {
ereport(
FATAL,
(errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
RECOVERY_COMMAND_FILE)));
}
}
/* Enable fetching from archive recovery area */
t_thrd.xlog_cxt.ArchiveRestoreRequested = true;
/*
* If user specified recovery_target_timeline, validate it or compute the
* "latest" value. We can't do this until after we've gotten the restore
* command and set InArchiveRecovery, because we need to fetch timeline
* history files from the archive.
*/
if (rtliGiven) {
if (rtli) {
/* Timeline 1 does not have a history file, all else should */
if (rtli != 1 && !existsTimeLineHistory(rtli)) {
ereport(FATAL, (errmsg("recovery target timeline %u does not exist", rtli)));
}
t_thrd.xlog_cxt.recoveryTargetTLI = rtli;
t_thrd.xlog_cxt.recoveryTargetIsLatest = false;
} else {
/* We start the "latest" search from pg_control's timeline */
t_thrd.xlog_cxt.recoveryTargetTLI = findNewestTimeLine(t_thrd.xlog_cxt.recoveryTargetTLI);
t_thrd.xlog_cxt.recoveryTargetIsLatest = true;
}
}
FreeConfigVariables(head);
}
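/*
 * Close the WAL segment file currently open for reading, if any.
 */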
void close_readFile_if_open()
{
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
}
/*
* Exit archive-recovery state
*/
static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo)
{
struct stat statbuf;
char recoveryPath[MAXPGPATH];
char xlogpath[MAXPGPATH];
errno_t errorno = EOK;
// We are no longer in archive recovery state.
t_thrd.xlog_cxt.InArchiveRecovery = false;
// Update min recovery point one last time.
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
/*
* If the ending log segment is still open, close it (to avoid problems on
* Windows with trying to rename or delete an open file).
*/
close_readFile_if_open();
/*
* If we are establishing a new timeline, we have to copy data from the
* last WAL segment of the old timeline to create a starting WAL segment
* for the new timeline.
*
* Notify the archiver that the last WAL segment of the old timeline is
* ready to copy to archival storage. Otherwise, it is not archived for a
* while.
*/
if (endTLI != t_thrd.xlog_cxt.ThisTimeLineID) {
char path[MAXPGPATH];
char tmppath[MAXPGPATH];
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%08X%08X%08X", endTLI,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
errorno = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/xlogtemp.%lu", gs_thread_self());
securec_check_ss(errorno, "\0", "\0");
XLogFileCopy(path, tmppath);
/*
* Now move the segment into place with its final name.
*/
if (!InstallXLogFileSegment(&endLogSegNo, (const char *)tmppath, false, NULL, false, SS_XLOGDIR))
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("InstallXLogFileSegment should not have failed")));
if (XLogArchivingActive()) {
errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", endTLI,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLogArchiveNotify(xlogpath);
}
}
/*
* Let's just make real sure there are not .ready or .done flags posted
* for the new segment.
*/
errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLogArchiveCleanup(xlogpath);
// Since there might be a partial WAL segment named RECOVERYXLOG, get rid of it.
errorno = snprintf_s(recoveryPath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/RECOVERYXLOG");
securec_check_ss(errorno, "", "");
unlink(recoveryPath); /* ignore any error */
/* Get rid of any remaining recovered timeline-history file, too */
errorno = snprintf_s(recoveryPath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/RECOVERYHISTORY");
securec_check_ss(errorno, "", "");
unlink(recoveryPath); /* ignore any error */
/* If recovery.conf doesn't exist, the server is in standby mode */
if (stat(RECOVERY_COMMAND_FILE, &statbuf) != 0) {
ereport(LOG, (errmsg("archive recovery complete")));
return;
}
/*
* Rename the config file out of the way, so that we don't accidentally
* re-enter archive recovery mode in a subsequent crash.
*/
unlink(RECOVERY_COMMAND_DONE);
durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
ereport(LOG, (errmsg("archive recovery complete")));
}
/*
* Remove WAL files that are not part of the given timeline's history.
*
* This is called during recovery, whenever we switch to follow a new
* timeline, and at the end of recovery when we create a new timeline. We
* wouldn't otherwise care about extra WAL files lying in pg_xlog, but they
* can be pre-allocated or recycled WAL segments on the old timeline that we
* haven't used yet, and contain garbage. If we just leave them in pg_xlog,
* they will eventually be archived, and we can't let that happen. Files that
* belong to our timeline history are valid, because we have successfully
* replayed them, but from others we can't be sure.
*
* 'switchpoint' is the current point in WAL where we switch to new timeline,
* and 'newTLI' is the new timeline we switch to.
*/
static void RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
{
DIR *xldir = NULL;
struct dirent *xlde = NULL;
char switchseg[MAXFNAMELEN];
XLogSegNo endLogSegNo;
errno_t errorno = EOK;
XLByteToPrevSeg(switchpoint, endLogSegNo);
xldir = AllocateDir(XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open transaction log directory \"%s\": %m", XLOGDIR)));
}
// Construct a filename of the last segment to be kept.
errorno = snprintf_s(switchseg, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", newTLI,
(uint32)((endLogSegNo) / XLogSegmentsPerXLogId),
(uint32)((endLogSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
ereport(DEBUG2, (errmsg("attempting to remove WAL segments newer than log file %s", switchseg)));
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
/* Ignore files that are not XLOG segments */
if (strlen(xlde->d_name) != 24 || strspn(xlde->d_name, "0123456789ABCDEF") != 24) {
continue;
}
/*
* Remove files that are on a timeline older than the new one we're
* switching to, but with a segment number >= the first segment on
* the new timeline.
*/
if (strncmp(xlde->d_name, switchseg, 8) < 0 && strcmp(xlde->d_name + 8, switchseg + 8) > 0) {
/*
* If the file has already been marked as .ready, however, don't
* remove it yet. It should be OK to remove it - files that are
* not part of our timeline history are not required for recovery
* - but seems safer to let them be archived and removed later.
*/
if (!XLogArchiveIsReady(xlde->d_name)) {
RemoveXlogFile(xlde->d_name, switchpoint);
}
}
}
FreeDir(xldir);
xldir = NULL;
}
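/*
 * Sanity check before truncating XLOG: refuse to truncate at a point that
 * lies before the consistent recovery point, or while pg_control still
 * records a base backup start point.
 */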
void CheckBeforeTruncateXLog(XLogRecPtr endRecPtr)
{
const uint32 shiftSize = 32;
if (XLogRecPtrIsInvalid(t_thrd.xlog_cxt.minRecoveryPoint) ||
XLByteLT(endRecPtr, t_thrd.xlog_cxt.minRecoveryPoint) ||
XLogRecPtrIsValid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint)) {
ereport(FATAL, (errmsg("truncate xlog LSN is before consistent recovery point WAL location (LSN) %X/%X",
(uint32)(endRecPtr >> shiftSize), (uint32)endRecPtr)));
}
}
/**
 * @Description: After data restore, truncate XLOG after the barrier LSN of the standby DN.
 * record - the XLOG record whose end LSN marks the point from which the log is cleared.
 */
void TruncateAndRemoveXLogForRoachRestore(XLogReaderState *record)
{
CheckBeforeTruncateXLog(record->EndRecPtr);
/* wal receiver and wal receiver writer must be stopped before we truncate xlog */
ShutdownWalRcv();
uint32 xlogOff;
XLogSegNo xlogsegno;
char xlogFileName[1024] = {0};
XLByteToSeg(record->EndRecPtr, xlogsegno);
XLogFileName(xlogFileName, MAXFNAMELEN, record->readPageTLI, xlogsegno);
xlogOff = (uint32)(record->EndRecPtr) % XLogSegSize;
int fd = 0;
char *writeContent = NULL;
uint32 truncateLength = 0;
char XLogFilePath[MAX_PATH_LEN] = {0};
errno_t rc = EOK;
rc = snprintf_s(XLogFilePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, "%s/pg_xlog/%s", t_thrd.proc_cxt.DataDir,
xlogFileName);
securec_check_ss(rc, "", "");
if (xlogOff > 0) {
fd = BasicOpenFile(XLogFilePath,
O_RDWR | PG_BINARY | (unsigned int)get_sync_bit(u_sess->attr.attr_storage.sync_method),
S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not open file \"%s\"", XLogFilePath)));
}
if (lseek(fd, (off_t)xlogOff, SEEK_SET) < 0) {
fprintf(stderr, "lseek error !\n");
close(fd);
ereport(ERROR, (errcode_for_file_access(), errmsg("lseek file error \"%s\" ", XLogFilePath)));
return;
}
truncateLength = XLogSegSize - xlogOff;
writeContent = (char *)palloc0(truncateLength);
if (write(fd, writeContent, truncateLength) != truncateLength) {
close(fd);
pfree(writeContent);
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\" ", XLogFilePath)));
return;
}
if (fsync(fd) != 0) {
close(fd);
pfree(writeContent);
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync file \"%s\" ", XLogFilePath)));
return;
}
if (close(fd)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\" ", XLogFilePath)));
}
pfree(writeContent);
}
if (((IS_OBS_DISASTER_RECOVER_MODE == false) && (IS_MULTI_DISASTER_RECOVER_MODE == false)) ||
(t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME_OBS))
durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
DIR *xldir = NULL;
struct dirent *xlde = NULL;
xldir = AllocateDir(XLOGDIR);
if (xldir == NULL) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not open transaction log directory \"%s\": %m", XLOGDIR)));
}
while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL) {
/* Ignore files that are not XLOG segments */
if (strlen(xlde->d_name) != 24 || strspn(xlde->d_name, "0123456789ABCDEF") != 24) {
continue;
}
if (strcmp(xlde->d_name, xlogFileName) > 0 || (xlogOff == 0 && strcmp(xlde->d_name, xlogFileName) == 0)) {
rc = snprintf_s(XLogFilePath, MAX_PATH_LEN, MAX_PATH_LEN - 1, XLOGDIR "/%s", xlde->d_name);
securec_check_ss(rc, "", "");
rc = unlink(XLogFilePath);
if (rc != 0) {
ereport(LOG, (errcode_for_file_access(),
errmsg("could not remove old transaction log file \"%s\": %m", XLogFilePath)));
}
}
}
FreeDir(xldir);
return;
}
static bool xlogCanStop(XLogReaderState *record)
{
/* We only consider stopping at XACT (COMMIT/ABORT), BARRIER or XLOG records; return true if this record is none of those */
return (XLogRecGetRmid(record) != RM_XACT_ID && XLogRecGetRmid(record) != RM_BARRIER_ID &&
XLogRecGetRmid(record) != RM_XLOG_ID);
}
#ifndef ENABLE_LITE_MODE
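/*
 * As the last step of PITR recovery, drop every recovery (archive) slot
 * and verify that its directory under pg_replslot is actually gone.
 */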
static void DropAllRecoverySlotForPitr()
{
List *all_archive_slots = NIL;
all_archive_slots = GetAllRecoverySlotsName();
if (all_archive_slots == NIL || all_archive_slots->length == 0) {
list_free_deep(all_archive_slots);
return;
}
foreach_cell(cell, all_archive_slots) {
char* slotname = (char*)lfirst(cell);
char path[MAXPGPATH];
struct stat stat_buf;
int nRet = snprintf_s(path, sizeof(path), MAXPGPATH - 1, "pg_replslot/%s", slotname);
securec_check_ss(nRet, "\0", "\0");
ReplicationSlotDrop(slotname, false);
if (stat(path, &stat_buf) != 0 && errno == ENOENT) {
ereport(LOG, (errmsg("last step in PITR recovery, delete recovery slot %s successfully.", slotname)));
} else {
ereport(WARNING, (errmsg("last step in PITR recovery, delete recovery slot %s failed.", slotname)));
}
}
list_free_deep(all_archive_slots);
return;
}
#endif
/*
* For point-in-time recovery, this function decides whether we want to
* stop applying the XLOG at or after the current record.
*
* Returns TRUE if we are stopping, FALSE otherwise. On TRUE return,
* *includeThis is set TRUE if we should apply this record before stopping.
*
* We also track the timestamp of the latest applied COMMIT/ABORT
* record in XLogCtl->recoveryLastXTime, for logging purposes.
* Also, some information is saved in recoveryStopXid et al for use in
* annotating the new timeline's history file.
*/
static bool recoveryStopsHere(XLogReaderState *record, bool *includeThis)
{
bool stopsHere = false;
char recordRPName[MAXFNAMELEN] = {0};
uint8 record_info;
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
errno_t rc = EOK;
#ifdef PGXC
bool stopsAtThisBarrier = false;
#endif
#ifdef PGXC
TimestampTz recordXtime = 0;
#else
TimestampTz recordXtime;
#endif
char *recordBarrierId = NULL;
char *stopBarrierId = NULL;
if (xlogCanStop(record)) {
return false;
}
record_info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
#ifdef ENABLE_MULTIPLE_NODES
if (XLogRecGetRmid(record) == RM_XACT_ID) {
#endif
if (XLogRecGetRmid(record) == RM_XACT_ID && record_info == XLOG_XACT_COMMIT_COMPACT) {
xl_xact_commit_compact *recordXactCommitData = (xl_xact_commit_compact *)XLogRecGetData(record);
recordXtime = recordXactCommitData->xact_time;
} else if (XLogRecGetRmid(record) == RM_XACT_ID && record_info == XLOG_XACT_COMMIT) {
xl_xact_commit *recordXactCommitData = (xl_xact_commit *)XLogRecGetData(record);
recordXtime = recordXactCommitData->xact_time;
} else if (XLogRecGetRmid(record) == RM_XACT_ID &&
(record_info == XLOG_XACT_ABORT || record_info == XLOG_XACT_ABORT_WITH_XID)) {
xl_xact_abort *recordXactAbortData = (xl_xact_abort *)XLogRecGetData(record);
recordXtime = recordXactAbortData->xact_time;
}
#ifdef ENABLE_MULTIPLE_NODES
/* end if (record->xl_rmid == RM_XACT_ID) */
}
#endif
else if (XLogRecGetRmid(record) == RM_BARRIER_ID) {
#ifndef ENABLE_LITE_MODE
if (record_info == XLOG_BARRIER_CREATE) {
recordBarrierId = (char *)XLogRecGetData(record);
ereport(u_sess->attr.attr_storage.HaModuleDebug ? LOG : DEBUG4,
(errmsg("processing barrier xlog record for %s", recordBarrierId)));
if (IS_OBS_DISASTER_RECOVER_MODE && t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)
update_recovery_barrier();
}
#else
FEATURE_ON_LITE_MODE_NOT_SUPPORTED();
#endif
} else if (XLogRecGetRmid(record) == RM_XLOG_ID) {
if (record_info == XLOG_RESTORE_POINT) {
xl_restore_point *recordRestorePointData = (xl_restore_point *)XLogRecGetData(record);
recordXtime = recordRestorePointData->rp_time;
rc = strncpy_s(recordRPName, MAXFNAMELEN, recordRestorePointData->rp_name, MAXFNAMELEN - 1);
securec_check(rc, "", "");
}
} else {
return false;
}
/* Check if target LSN has been reached */
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_LSN &&
record->ReadRecPtr >= t_thrd.xlog_cxt.recoveryTargetLSN) {
*includeThis = t_thrd.xlog_cxt.recoveryTargetInclusive;
t_thrd.xlog_cxt.recoveryStopAfter = *includeThis;
t_thrd.xlog_cxt.recoveryStopXid = InvalidTransactionId;
t_thrd.xlog_cxt.recoveryStopLSN = record->ReadRecPtr;
t_thrd.xlog_cxt.recoveryStopTime = 0;
t_thrd.xlog_cxt.recoveryStopName[0] = '\0';
if (t_thrd.xlog_cxt.recoveryStopAfter) {
ereport(LOG,
(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
(uint32)(t_thrd.xlog_cxt.recoveryStopLSN >> 32), (uint32)t_thrd.xlog_cxt.recoveryStopLSN)));
} else {
ereport(LOG,
(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
(uint32)(t_thrd.xlog_cxt.recoveryStopLSN >> 32), (uint32)t_thrd.xlog_cxt.recoveryStopLSN)));
}
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
}
return true;
}
/* Do we have a PITR target at all? */
if ((IS_OBS_DISASTER_RECOVER_MODE || IS_MULTI_DISASTER_RECOVER_MODE) &&
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_UNSET) {
// Save timestamp of latest transaction commit/abort if this is a
// transaction record
if (strlen((char *)walrcv->recoveryStopBarrierId) == 0 &&
strlen((char *)walrcv->recoverySwitchoverBarrierId) == 0) {
if (XLogRecGetRmid(record) == RM_XACT_ID) {
SetLatestXTime(recordXtime);
}
return false;
} else {
t_thrd.xlog_cxt.recoveryTarget = RECOVERY_TARGET_BARRIER;
}
}
#ifdef ENABLE_MULTIPLE_NODES
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID) {
#else
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID && XLogRecGetRmid(record) == RM_XACT_ID) {
#endif
/*
* There can be only one transaction end record with this exact
* transactionid
*
* when testing for an xid, we MUST test for equality only, since
* transactions are numbered in the order they start, not the order
* they complete. A higher numbered xid will complete before you about
* 50% of the time...
*/
stopsHere = (XLogRecGetXid(record) == t_thrd.xlog_cxt.recoveryTargetXid);
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
*includeThis = t_thrd.xlog_cxt.recoveryTargetInclusive;
}
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME_OBS) {
#ifndef ENABLE_LITE_MODE
stopsHere = false;
if (XLogRecGetRmid(record) == RM_XACT_ID && !g_instance.roach_cxt.isGtmFreeCsn) {
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
if (g_instance.roach_cxt.globalBarrierRecordForPITR != NULL && (info == XLOG_XACT_COMMIT ||
info == XLOG_XACT_COMMIT_COMPACT || info == XLOG_XACT_COMMIT_PREPARED)) {
char* token = NULL;
char* lastPtr = NULL;
uint64 targetCsn = InvalidCommitSeqNo;
uint64 currCsn = InvalidCommitSeqNo;
long ts = 0;
if (strncmp(g_instance.roach_cxt.globalBarrierRecordForPITR, "csn_", 4) == 0) {
token = strtok_s(g_instance.roach_cxt.globalBarrierRecordForPITR, "-", &lastPtr);
if (sscanf_s(token, "csn_%021lu_%013ld", &targetCsn, &ts) != 2) {
ereport(PANIC, (errmsg("recovery target csn could not find!")));
}
if (info == XLOG_XACT_COMMIT) {
xl_xact_commit* xlrec = (xl_xact_commit *)XLogRecGetData(record);
currCsn = xlrec->csn;
} else if (info == XLOG_XACT_COMMIT_COMPACT) {
xl_xact_commit_compact* xlrec = (xl_xact_commit_compact *)XLogRecGetData(record);
currCsn = xlrec->csn;
} else if (info == XLOG_XACT_COMMIT_PREPARED) {
xl_xact_commit_prepared* xlrec = (xl_xact_commit_prepared *)XLogRecGetData(record);
currCsn = xlrec->crec.csn;
} else {
ereport(PANIC, (errmsg("recovery target csn xact content error!")));
}
if (currCsn > targetCsn) {
ereport(LOG, (errmsg("For PITR recovery, reach target csn %lu, current csn is %lu,"
" and current xlog location is %08X/%08X.", targetCsn, currCsn,
(uint32)(record->EndRecPtr >> 32), (uint32)record->EndRecPtr)));
stopsHere = true;
*includeThis = true;
uint32 xlogOff;
pfree_ext(g_instance.roach_cxt.globalBarrierRecordForPITR);
pfree_ext(g_instance.roach_cxt.targetRestoreTimeFromMedia);
TruncateAndRemoveXLogForRoachRestore(record);
xlogOff = (uint32)(record->EndRecPtr) % XLOG_BLCKSZ;
if (xlogOff > 0 && xlogOff < XLOG_BLCKSZ) {
rc = memset_s(&record->readBuf[xlogOff], XLOG_BLCKSZ - xlogOff, 0, XLOG_BLCKSZ - xlogOff);
securec_check(rc, "", "");
}
DropAllRecoverySlotForPitr();
}
}
}
} else if ((XLogRecGetRmid(record) == RM_BARRIER_ID) && (record_info == XLOG_BARRIER_CREATE)) {
ereport(DEBUG2, (errmsg("checking if barrier record matches the target obs time")));
Assert(t_thrd.xlog_cxt.obsRecoveryTargetTime != NULL);
Assert(recordBarrierId != NULL);
if (t_thrd.xlog_cxt.obsRecoveryTargetTime == NULL || recordBarrierId == NULL) {
ereport(PANIC,
(errmsg("recovery target is RECOVERY_TARGET_TIME_OBS, but obsRecoveryTargetTime not set")));
}
if (strncmp(recordBarrierId, "hadr_", 5) == 0 || strncmp(recordBarrierId, "csn_", 4) == 0) {
char* barrierTime = strrchr(recordBarrierId, '_');
barrierTime += 1;
if (g_instance.roach_cxt.targetRestoreTimeFromMedia != NULL &&
strcmp(g_instance.roach_cxt.targetRestoreTimeFromMedia, barrierTime) == 0) {
stopsAtThisBarrier = true;
*includeThis = true;
uint32 xlogOff;
ereport(u_sess->attr.attr_storage.HaModuleDebug ? LOG : DEBUG4,
(errmsg("stop barrier success xlog record for %s", recordBarrierId)));
pfree_ext(g_instance.roach_cxt.globalBarrierRecordForPITR);
pfree_ext(g_instance.roach_cxt.targetRestoreTimeFromMedia);
/* truncate XLOG after barrier time. */
if (IS_OBS_DISASTER_RECOVER_MODE || IS_MULTI_DISASTER_RECOVER_MODE) {
TruncateAndRemoveXLogForRoachRestore(record);
xlogOff = (uint32)(record->EndRecPtr) % XLOG_BLCKSZ;
if (xlogOff > 0 && xlogOff < XLOG_BLCKSZ) {
rc = memset_s(&record->readBuf[xlogOff], XLOG_BLCKSZ - xlogOff, 0, XLOG_BLCKSZ - xlogOff);
securec_check(rc, "", "");
}
/* after recovery at target time, drop recovery slots */
DropAllRecoverySlotForPitr();
}
}
}
}
#else
FEATURE_ON_LITE_MODE_NOT_SUPPORTED();
#endif
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_BARRIER) {
stopsHere = false;
if ((XLogRecGetRmid(record) == RM_BARRIER_ID) && (record_info == XLOG_BARRIER_CREATE)) {
ereport(DEBUG2, (errmsg("checking if barrier record matches the target barrier")));
if ((t_thrd.xlog_cxt.recoveryTargetBarrierId != NULL && recordBarrierId != NULL &&
strcmp(t_thrd.xlog_cxt.recoveryTargetBarrierId, recordBarrierId) == 0) ||
(recordBarrierId != NULL && strcmp((char *)walrcv->recoveryStopBarrierId, recordBarrierId) == 0)) {
stopsAtThisBarrier = true;
*includeThis = true;
uint32 xlogOff;
/* truncate XLOG after barrier lsn of standby DN. */
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE || IS_OBS_DISASTER_RECOVER_MODE) {
TruncateAndRemoveXLogForRoachRestore(record);
xlogOff = (uint32)(record->EndRecPtr) % XLOG_BLCKSZ;
if (xlogOff > 0 && xlogOff < XLOG_BLCKSZ) {
rc = memset_s(&record->readBuf[xlogOff], XLOG_BLCKSZ - xlogOff, 0, XLOG_BLCKSZ - xlogOff);
securec_check(rc, "", "");
}
}
} else if (recordBarrierId != NULL &&
strcmp((char *)walrcv->recoverySwitchoverBarrierId, recordBarrierId) == 0) {
stopsAtThisBarrier = true;
ereport(LOG, (errmsg("reach switchover barrierID %s", (char *)walrcv->recoverySwitchoverBarrierId)));
}
}
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME) {
/*
* There can be many restore points that share the same name, so we
* stop at the first one
*/
stopsHere = (strcmp(recordRPName, t_thrd.xlog_cxt.recoveryTargetName) == 0);
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
}
// Ignore recoveryTargetInclusive because this is not a transaction record
*includeThis = false;
#ifdef ENABLE_MULTIPLE_NODES
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME) {
#else
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME && XLogRecGetRmid(record) == RM_XACT_ID) {
#endif
/*
* There can be many transactions that share the same commit time, so
* we stop after the last one, if we are inclusive, or stop at the
* first one if we are exclusive
*/
if (t_thrd.xlog_cxt.recoveryTargetInclusive) {
stopsHere = (recordXtime > t_thrd.xlog_cxt.recoveryTargetTime);
} else {
stopsHere = (recordXtime >= t_thrd.xlog_cxt.recoveryTargetTime);
}
if (stopsHere) {
TruncateAndRemoveXLogForRoachRestore(record);
*includeThis = false;
}
}
if (stopsHere) {
t_thrd.xlog_cxt.recoveryStopXid = XLogRecGetXid(record);
t_thrd.xlog_cxt.recoveryStopTime = recordXtime;
t_thrd.xlog_cxt.recoveryStopAfter = *includeThis;
t_thrd.xlog_cxt.recoveryStopLSN = InvalidXLogRecPtr;
if (record_info == XLOG_XACT_COMMIT_COMPACT || record_info == XLOG_XACT_COMMIT) {
if (t_thrd.xlog_cxt.recoveryStopAfter)
ereport(LOG, (errmsg("recovery stopping after commit of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
else
ereport(LOG, (errmsg("recovery stopping before commit of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
} else if (record_info == XLOG_XACT_ABORT || record_info == XLOG_XACT_ABORT_WITH_XID) {
if (t_thrd.xlog_cxt.recoveryStopAfter)
ereport(LOG, (errmsg("recovery stopping after abort of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
else
ereport(LOG, (errmsg("recovery stopping before abort of transaction " XID_FMT ", time %s",
t_thrd.xlog_cxt.recoveryStopXid,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
} else {
rc = strncpy_s(t_thrd.xlog_cxt.recoveryStopName, MAXFNAMELEN, recordRPName, MAXFNAMELEN - 1);
securec_check(rc, "", "");
ereport(LOG, (errmsg("recovery stopping at restore point \"%s\", time %s", t_thrd.xlog_cxt.recoveryStopName,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
}
/*
* Note that if we use a RECOVERY_TARGET_TIME then we can stop at a
* restore point since they are timestamped, though the latest
* transaction time is not updated.
*/
if (XLogRecGetRmid(record) == RM_XACT_ID && t_thrd.xlog_cxt.recoveryStopAfter) {
SetLatestXTime(recordXtime);
}
#ifdef PGXC
} else if (stopsAtThisBarrier) {
t_thrd.xlog_cxt.recoveryStopTime = recordXtime;
if (strlen((char *)walrcv->recoveryStopBarrierId) != 0) {
stopBarrierId = (char *)walrcv->recoveryStopBarrierId;
} else {
stopBarrierId = (char *)walrcv->recoverySwitchoverBarrierId;
}
ereport(LOG, (errmsg("recovery stopping at barrier %s, time %s",
IS_OBS_DISASTER_RECOVER_MODE ? stopBarrierId : t_thrd.xlog_cxt.recoveryTargetBarrierId,
timestamptz_to_str(t_thrd.xlog_cxt.recoveryStopTime))));
return true;
#endif
} else if (XLogRecGetRmid(record) == RM_XACT_ID) {
SetLatestXTime(recordXtime);
}
return stopsHere;
}
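/*
 * Test the shared recovery-suspend flag. It lives in XLogCtl (protected by
 * the info spinlock) so that it can be read and set (see SetRecoverySuspend
 * below) from different processes.
 */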
bool RecoveryIsSuspend(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool recoverySuspend = false;
SpinLockAcquire(&xlogctl->info_lck);
recoverySuspend = xlogctl->recoverySusPend;
SpinLockRelease(&xlogctl->info_lck);
return recoverySuspend;
}
void SetRecoverySuspend(bool recoverySuspend)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->recoverySusPend = recoverySuspend;
SpinLockRelease(&xlogctl->info_lck);
}
/*
* Wait until shared recoveryPause flag is cleared.
*
* XXX Could also be done with shared latch, avoiding the pg_usleep loop.
* Probably not worth the trouble though. This state shouldn't be one that
* anyone cares about server power consumption in.
*/
static void recoveryPausesHere(void)
{
/* Don't pause unless users can connect! */
if (!t_thrd.xlog_cxt.LocalHotStandbyActive) {
return;
}
ereport(LOG, (errmsg("recovery has paused"), errhint("Execute pg_xlog_replay_resume() to continue.")));
while (RecoveryIsPaused()) {
pg_usleep(1000000L); /* 1000 ms */
RedoInterruptCallBack();
}
}
bool RecoveryIsPaused(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool recoveryPause = false;
SpinLockAcquire(&xlogctl->info_lck);
recoveryPause = xlogctl->recoveryPause;
SpinLockRelease(&xlogctl->info_lck);
return recoveryPause;
}
void SetRecoveryPause(bool recoveryPause)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->recoveryPause = recoveryPause;
SpinLockRelease(&xlogctl->info_lck);
}
#ifndef ENABLE_MULTIPLE_NODES
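/*
 * Decide whether recovery_min_apply_delay should be applied at all: the
 * delay is honoured only once the standby has reached consistency, archive
 * recovery was requested, and the walreceiver process is running.
 */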
static bool CheckApplyDelayReady(void)
{
/* nothing to do if no delay configured */
if (u_sess->attr.attr_storage.recovery_min_apply_delay <= 0) {
return false;
}
/* no delay is applied on a database not yet consistent */
if (!t_thrd.xlog_cxt.reachedConsistency) {
return false;
}
/* nothing to do if crash recovery is requested */
if (!t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
return false;
}
/* The walreceiver process is started after this point. Make sure it is
 * already running before applying any delay; otherwise streaming
 * replication would be disconnected. */
if (g_instance.pid_cxt.WalReceiverPID == 0) {
return false;
}
return true;
}
/*
* When recovery_min_apply_delay is set, we wait long enough to make sure
* certain record types are applied at least that interval behind the master.
*
* Returns true if we waited.
*
* Note that the delay is calculated between the WAL record log time and
* the current time on standby. We would prefer to keep track of when this
* standby received each WAL record, which would allow a more consistent
* approach and one not affected by time synchronisation issues, but that
* is significantly more effort and complexity for little actual gain in
* usability.
*/
static bool RecoveryApplyDelay(const XLogReaderState *record)
{
uint8 xactInfo;
TimestampTz xtime;
long secs;
int microsecs;
long waitTime = 0;
if (!CheckApplyDelayReady()) {
return false;
}
/*
* Is it a COMMIT record?
* We deliberately choose not to delay aborts since they have no effect on
* MVCC. We already allow replay of records that don't have a timestamp,
* so there is already opportunity for issues caused by early conflicts on
* standbys
*/
if (XLogRecGetRmid(record) != RM_XACT_ID) {
return false;
}
xactInfo = XLogRecGetInfo(record) & (~XLR_INFO_MASK);
if ((xactInfo == XLOG_XACT_COMMIT) || (xactInfo == XLOG_STANDBY_CSN_COMMITTING)) {
xtime = ((xl_xact_commit *)XLogRecGetData(record))->xact_time;
} else if (xactInfo == XLOG_XACT_COMMIT_COMPACT) {
xtime = ((xl_xact_commit_compact *)XLogRecGetData(record))->xact_time;
} else {
return false;
}
u_sess->attr.attr_storage.recoveryDelayUntilTime =
TimestampTzPlusMilliseconds(xtime, u_sess->attr.attr_storage.recovery_min_apply_delay);
/* Exit without arming the latch if it's already past time to apply this record */
TimestampDifference(GetCurrentTimestamp(), u_sess->attr.attr_storage.recoveryDelayUntilTime, &secs, &microsecs);
if (secs <= 0 && microsecs <= 0) {
return false;
}
while (true) {
ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
/* might change the trigger file's location */
RedoInterruptCallBack();
if (CheckForFailoverTrigger() || CheckForSwitchoverTrigger() || CheckForStandbyTrigger()) {
break;
}
/* Wait for difference between GetCurrentTimestamp() and recoveryDelayUntilTime */
TimestampDifference(GetCurrentTimestamp(), u_sess->attr.attr_storage.recoveryDelayUntilTime, &secs, &microsecs);
/* To clear LSNMarker item that has not been processed in pageworker */
ProcTxnWorkLoad(false);
/* NB: We're ignoring waits below min_apply_delay's resolution. */
if (secs <= 0 && microsecs / 1000 <= 0) {
break;
}
/* delay 1s every time */
waitTime = (secs >= 1) ? 1000 : (microsecs / 1000);
(void)WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, waitTime);
}
return true;
}
#endif
/*
* Save timestamp of latest processed commit/abort record.
*
* We keep this in XLogCtl, not a simple static variable, so that it can be
* seen by processes other than the startup process. Note in particular
* that CreateRestartPoint is executed in the checkpointer.
*/
void SetLatestXTime(TimestampTz xtime)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->recoveryLastXTime = xtime;
SpinLockRelease(&xlogctl->info_lck);
}
/*
* Fetch timestamp of latest processed commit/abort record.
*/
TimestampTz GetLatestXTime(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
TimestampTz xtime;
SpinLockAcquire(&xlogctl->info_lck);
xtime = xlogctl->recoveryLastXTime;
SpinLockRelease(&xlogctl->info_lck);
return xtime;
}
/*
* Save timestamp of the next chunk of WAL records to apply.
*
* We keep this in XLogCtl, not a simple static variable, so that it can be
* seen by all backends.
*/
static void SetCurrentChunkStartTime(TimestampTz xtime)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->currentChunkStartTime = xtime;
SpinLockRelease(&xlogctl->info_lck);
}
/*
* Fetch timestamp of latest processed commit/abort record.
* Startup process maintains an accurate local copy in XLogReceiptTime
*/
TimestampTz GetCurrentChunkReplayStartTime(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
TimestampTz xtime;
SpinLockAcquire(&xlogctl->info_lck);
xtime = xlogctl->currentChunkStartTime;
SpinLockRelease(&xlogctl->info_lck);
return xtime;
}
void SetXLogReceiptTime(TimestampTz time)
{
t_thrd.xlog_cxt.XLogReceiptTime = time;
}
/*
* Returns time of receipt of current chunk of XLOG data, as well as
* whether it was received from streaming replication or from archives.
*/
void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
/*
* This must be executed in the startup process, since we don't export the
* relevant state to shared memory.
*/
Assert(t_thrd.xlog_cxt.InRecovery);
*rtime = t_thrd.xlog_cxt.XLogReceiptTime;
*fromStream = (t_thrd.xlog_cxt.XLogReceiptSource == XLOG_FROM_STREAM);
}
TimestampTz GetXLogReceiptTimeOnly()
{
return t_thrd.xlog_cxt.XLogReceiptTime;
}
void SetXLogReceiptSource(int source)
{
t_thrd.xlog_cxt.XLogReceiptSource = source;
}
int GetXLogReceiptSource()
{
return t_thrd.xlog_cxt.XLogReceiptSource;
}
/*
* Note that text field supplied is a parameter name and does not require
* translation
*/
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) do { \
if ((currValue) < (minValue)) { \
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
errmsg("hot standby is not possible because " \
"%s = %d is a lower setting than on the master server " \
"(its value was %d)", \
param_name, currValue, minValue), \
errhint("You might need to increase the value of %s ", param_name))); \
} \
} while (0)
/*
* Check to see if required parameters are set high enough on this server
* for various aspects of recovery operation.
*/
static void CheckRequiredParameterValues(bool DBStateShutdown)
{
/*
* For archive recovery, the WAL must be generated with at least 'archive'
* wal_level.
*/
if (t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.shemem_ptr_cxt.ControlFile->wal_level == WAL_LEVEL_MINIMAL) {
ereport(WARNING,
(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
}
/*
* For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
* we must have at least as many backend slots as the primary.
*/
if (t_thrd.xlog_cxt.InArchiveRecovery && g_instance.attr.attr_storage.EnableHotStandby && !DBStateShutdown) {
if (t_thrd.shemem_ptr_cxt.ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" or higher on "
"the master server"),
errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
}
/* We ignore autovacuum_max_workers when we make this test. */
RecoveryRequiresIntParameter("max_locks_per_transaction", g_instance.attr.attr_storage.max_locks_per_xact,
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact);
}
}
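/*
 * Main loop of the Secondary (dummy) Standby: read the timeline and system
 * identifier from pg_control, then keep restarting the walreceiver (and,
 * unless mixed replication or DN multi-standby mode is used, the
 * datareceiver), fetching the timeline from the primary before each
 * streaming request.
 */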
void StartupDummyStandby(void)
{
char conninfo[MAXCONNINFO];
int replIdx = 1;
ereport(LOG, (errmsg("database system startup Secondary Standby")));
ReadControlFile();
TimeLineID timeline = t_thrd.xlog_cxt.ThisTimeLineID = GetThisTimeID();
DummyStandbySetRecoveryTargetTLI(timeline);
sync_system_identifier = GetSystemIdentifier();
ereport(LOG, (errmsg("timline from control file = %u, systemid = %lu", (uint32)timeline, sync_system_identifier)));
/* Now we can let the postmaster know we are ready to enter into hot standby. */
SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
do {
/* Handle interrupt signals of startup process */
RedoInterruptCallBack();
/* if we have lost the walreceiver process, try to start a new one */
if (!WalRcvInProgress()) {
while (1) {
int replIdxFromFile = -1;
if (WalSndInProgress(SNDROLE_DUMMYSTANDBY_STANDBY)) {
goto retry;
}
/* loop until the connection succeeds */
get_failover_host_conninfo_for_dummy(&replIdxFromFile);
if (replIdxFromFile < 0) {
replIdx = ((replIdx - 1) % REP_CONN_ARRAY) + 1;
} else {
replIdx = (replIdxFromFile % REP_CONN_ARRAY) + 1;
}
connect_dn_str(conninfo, replIdx);
replIdx++;
RedoInterruptCallBack();
if (libpqrcv_connect_for_TLI(&timeline, conninfo)) {
ereport(LOG, (errmsg("timline from primary = %u", (uint32)timeline)));
DummyStandbySetRecoveryTargetTLI(timeline);
SetThisTimeID(timeline);
UpdateControlFile();
break;
}
sleep(DUMMYSTANDBY_CONNECT_INTERVAL);
}
/* For dummy standby, startpos is not required; the primary will choose the startpos */
ShutdownWalRcv();
XLogRecPtr startpos = 0;
RequestXLogStreaming(&startpos, conninfo, REPCONNTARGET_DEFAULT, u_sess->attr.attr_storage.PrimarySlotName);
}
if (!g_instance.attr.attr_storage.enable_mix_replication && !IS_DN_MULTI_STANDYS_MODE()) {
/* if we have lost the datareceiver process, try to start a new one */
if (!DataRcvInProgress()) {
if (DataSndInProgress(SNDROLE_DUMMYSTANDBY_STANDBY)) {
goto retry;
}
RequestDataStreaming(conninfo, REPCONNTARGET_DEFAULT);
}
}
retry:
sleep(DUMMYSTANDBY_CHECK_INTERVAL);
} while (1);
}
void ResourceManagerStartup(void)
{
/* Initialize resource managers */
for (uint32 rmid = 0; rmid <= RM_MAX_ID; rmid++) {
if (RmgrTable[rmid].rm_startup != NULL) {
RmgrTable[rmid].rm_startup();
}
}
}
void ResourceManagerStop(void)
{
// Allow resource managers to do any required cleanup.
for (uint32 rmid = 0; rmid <= RM_MAX_ID; rmid++) {
if (RmgrTable[rmid].rm_cleanup != NULL) {
RmgrTable[rmid].rm_cleanup();
}
}
}
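/*
 * When parallel recovery is active, copy the read position and buffers of
 * the per-worker xlogreader back into the original reader, reset its
 * decoder and switch back to it, then leave the dispatcher context.
 */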
#define RecoveryXlogReader(_oldXlogReader, _xlogreader) do { \
if (get_real_recovery_parallelism() > 1) { \
if (GetRedoWorkerCount() > 0) { \
errno_t errorno; \
(_oldXlogReader)->ReadRecPtr = (_xlogreader)->ReadRecPtr; \
(_oldXlogReader)->EndRecPtr = (_xlogreader)->EndRecPtr; \
(_oldXlogReader)->readSegNo = (_xlogreader)->readSegNo; \
(_oldXlogReader)->readOff = (_xlogreader)->readOff; \
(_oldXlogReader)->readLen = (_xlogreader)->readLen; \
(_oldXlogReader)->readPageTLI = (_xlogreader)->readPageTLI; \
(_oldXlogReader)->curReadOff = (_xlogreader)->curReadOff; \
(_oldXlogReader)->curReadSegNo = (_xlogreader)->curReadSegNo; \
(_oldXlogReader)->currRecPtr = (_xlogreader)->currRecPtr; \
(_oldXlogReader)->latestPagePtr = (_xlogreader)->latestPagePtr; \
(_oldXlogReader)->latestPageTLI = (_xlogreader)->latestPageTLI; \
(_oldXlogReader)->isPRProcess = false; \
errorno = memcpy_s((_oldXlogReader)->readBuf, XLOG_BLCKSZ, (_xlogreader)->readBuf, \
(_oldXlogReader)->readLen); \
securec_check(errorno, "", ""); \
if (ENABLE_DSS && ENABLE_DMS && (_xlogreader)->preReadBuf != NULL) { \
(_oldXlogReader)->preReadStartPtr = (_xlogreader)->preReadStartPtr; \
errorno = memcpy_s((_oldXlogReader)->preReadBuf, XLogPreReadSize, \
(_xlogreader)->preReadBuf, XLogPreReadSize); \
securec_check(errorno, "", ""); \
} \
ResetDecoder(_oldXlogReader); \
(_xlogreader) = (_oldXlogReader); \
} \
EndDispatcherContext(); \
} \
} while (0)
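/*
 * Finish WAL redo: check the committing CSN list under extreme RTO,
 * complete any incomplete actions and free redo items under parallel
 * recovery, shut down the resource managers, and wait for the last
 * transaction record to be flushed.
 */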
static void EndRedoXlog()
{
if (IsExtremeRtoRunning()) {
ExtremeCheckCommittingCsnList();
}
if ((get_real_recovery_parallelism() > 1) && (!parallel_recovery::DispatchPtrIsNull())) {
SwitchToDispatcherContext();
t_thrd.xlog_cxt.incomplete_actions =
parallel_recovery::CheckImcompleteAction(t_thrd.xlog_cxt.incomplete_actions);
FreeAllocatedRedoItem();
EndDispatcherContext();
}
ResourceManagerStop();
XLogWaitFlush(t_thrd.xlog_cxt.XactLastRecEnd);
}
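/*
 * Number of semaphores the XLOG module needs: four fixed ones plus two
 * per WAL sender.
 */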
int XLogSemas(void)
{
return 4 + 2 * g_instance.attr.attr_storage.max_wal_senders;
}
/*
* Log checkPoint
*/
inline void PrintCkpXctlControlFile(XLogRecPtr oldCkpLoc, CheckPoint *oldCkp, XLogRecPtr newCkpLoc, CheckPoint *newCkp,
XLogRecPtr preCkpLoc, char *name)
{
ereport(LOG,
(errmsg("%s PrintCkpXctlControlFile: [checkPoint] oldCkpLoc:%X/%X, oldRedo:%X/%X, newCkpLoc:%X/%X, "
"newRedo:%X/%X, preCkpLoc:%X/%X",
name, (uint32)(oldCkpLoc >> 32), (uint32)oldCkpLoc, (uint32)(oldCkp->redo >> 32),
(uint32)oldCkp->redo, (uint32)(newCkpLoc >> 32), (uint32)newCkpLoc, (uint32)(newCkp->redo >> 32),
(uint32)newCkp->redo, (uint32)(preCkpLoc >> 32), (uint32)preCkpLoc)));
}
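/*
 * If redo recorded a newer checkpoint (predo_cxt.newestCheckpointLoc) than
 * XLogCtl's lastCheckPoint, publish it, then arrange for the
 * end-of-recovery (or first on-line) checkpoint. Skipped entirely in
 * on-demand recovery.
 */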
void CheckForRestartPoint()
{
if (SS_IN_ONDEMAND_RECOVERY) {
return;
}
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
if (XLByteLT(xlogctl->lastCheckPointRecPtr, g_instance.comm_cxt.predo_cxt.newestCheckpointLoc)) {
Assert(XLByteLT(xlogctl->lastCheckPoint.redo, g_instance.comm_cxt.predo_cxt.newestCheckpoint.redo));
Assert(g_instance.comm_cxt.predo_cxt.newestCheckpoint.nextXid >= xlogctl->lastCheckPoint.nextXid);
Assert(g_instance.comm_cxt.predo_cxt.newestCheckpoint.nextOid >= xlogctl->lastCheckPoint.nextOid);
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->lastCheckPointRecPtr = g_instance.comm_cxt.predo_cxt.newestCheckpointLoc;
xlogctl->lastCheckPoint = const_cast<CheckPoint &>(g_instance.comm_cxt.predo_cxt.newestCheckpoint);
SpinLockRelease(&xlogctl->info_lck);
}
/*
* Perform a checkpoint to update all our recovery activity to disk.
*
* Note that we write a shutdown checkpoint rather than an on-line
* one. This is not particularly critical, but since we may be
* assigning a new TLI, using a shutdown checkpoint allows us to have
* the rule that TLI only changes in shutdown checkpoints, which
* allows some extra error checking in xlog_redo.
*
* In most scenarios (except a new timeline switch) the checkpoint here
* is unnecessary and slows down switchover, so we hand it over to the
* checkpointer process instead. That, however, leaves dirty pages
* unprotected by any checkpoint until the first one completes, so we
* temporarily switch to full-page-write mode and turn it off again
* after the first checkpoint.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
if (t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) {
RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
} else {
CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
}
} else {
t_thrd.shemem_ptr_cxt.XLogCtl->FpwBeforeFirstCkpt = true;
if (t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) {
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE);
}
}
}
#define REL_NODE_FORMAT(rnode) (rnode).spcNode, (rnode).dbNode, (rnode).relNode, (rnode).bucketNode
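/*
 * The remain-segs file persists the contents of t_thrd.xlog_cxt.remain_segs.
 * Its on-disk layout (see ReadRemainSegsFileContentToBuffer and
 * XLogMakeUpRemainSegsContent) is:
 *
 *   uint32      magic number (REMAIN_SEG_MAGIC_NUM)
 *   XLogRecPtr  remain-segs start point
 *   uint32      number of ExtentTag entries
 *   ExtentTag[] the entries themselves
 *   uint32      magic number again (REMAIN_SEG_MAGIC_NUM)
 *   pg_crc32c   CRC over all of the above
 *
 * WriteRemainSegsFile appends a buffer to the file and fsyncs it,
 * PANICing on failure.
 */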
void WriteRemainSegsFile(int fd, const char *buffer, uint32 usedLen)
{
if (write(fd, buffer, usedLen) != usedLen) {
/* if write didn't set errno, assume problem is no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not write remain segs check start point to file: %m")));
}
if (pg_fsync(fd) != 0) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not fsync remain segs file: %m")));
}
}
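/*
 * Write the entry count followed by every remaining-extent entry to the
 * file, emitting a WARNING for each remaining segment.
 */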
void XLogCopyRemainSegsToBuf(int fd)
{
uint32 remainSegsNum = 0;
if (t_thrd.xlog_cxt.remain_segs == NULL) {
WriteRemainSegsFile(fd, (char *)&remainSegsNum, sizeof(uint32));
return;
}
remainSegsNum = hash_get_num_entries(t_thrd.xlog_cxt.remain_segs);
WriteRemainSegsFile(fd, (char *)&remainSegsNum, sizeof(uint32));
HASH_SEQ_STATUS status;
hash_seq_init(&status, t_thrd.xlog_cxt.remain_segs);
ExtentTag *extentTag = NULL;
while ((extentTag = (ExtentTag *)hash_seq_search(&status)) != NULL) {
ereport(WARNING, (errmsg("Segment [%u, %u, %u, %d] is remained segment, remain extent type %s.",
REL_NODE_FORMAT(extentTag->remainExtentHashTag.rnode),
XlogGetRemainExtentTypeName(extentTag->remainExtentType))));
WriteRemainSegsFile(fd, (char *)extentTag, sizeof(ExtentTag));
}
}
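/*
 * Rebuild the remain-segs hash table from a buffer of ExtentTag entries
 * read back from the remain-segs file.
 */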
void RecoveryRemainSegsFromRemainSegsFile(struct HTAB *remainSegsHtbl, uint32 remainSegNum, char *buf)
{
if (remainSegNum == 0) {
return;
}
while (remainSegNum-- > 0) {
bool found = false;
ExtentTag* extentTag = (ExtentTag *)buf;
ExtentTag* new_entry = (ExtentTag *)hash_search(remainSegsHtbl, (void *)&(extentTag->remainExtentHashTag),
HASH_ENTER, &found);
if (found) {
ereport(WARNING, (errmsg("Extent [%u, %u, %u, %d] should not repeatedly found in alloc segs htbl.",
REL_NODE_FORMAT(extentTag->remainExtentHashTag.rnode))));
} else {
new_entry->forkNum = extentTag->forkNum;
new_entry->remainExtentType = extentTag->remainExtentType;
new_entry->xid = extentTag->xid;
new_entry->lsn = extentTag->lsn;
}
buf += sizeof(ExtentTag);
}
}
bool IsRemainSegsFileExist()
{
struct stat segFileSt;
if (stat(XLOG_REMAIN_SEGS_FILE_PATH, &segFileSt) < 0) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not stat directory \"%s\": %m", XLOG_REMAIN_SEGS_FILE_PATH)));
}
return false;
}
if (!S_ISREG(segFileSt.st_mode)) {
return false;
}
return true;
}
static void RenameRemainSegsFile()
{
int errorNo = rename(XLOG_REMAIN_SEGS_FILE_PATH, XLOG_REMAIN_SEGS_BACKUP_FILE_PATH);
if (errorNo != 0) {
ereport(WARNING, (errcode_for_file_access(), errmsg("Failed to rename file \"%s\":%m.",
XLOG_REMAIN_SEGS_FILE_PATH)));
} else {
ereport(LOG, (errmodule(MOD_SEGMENT_PAGE), errmsg("Rename file \"%s\" to \"%s\":%m.",
XLOG_REMAIN_SEGS_FILE_PATH, XLOG_REMAIN_SEGS_BACKUP_FILE_PATH)));
}
}
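/*
 * Read the whole remain-segs file into a freshly allocated buffer and
 * validate it: both magic numbers and the trailing CRC must match.
 * Returns false (after logging a WARNING) on any short read or mismatch.
 */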
static bool ReadRemainSegsFileContentToBuffer(int fd, char**contentBuffer)
{
uint32 magicNumber = 0;
XLogRecPtr oldStartPoint;
uint32_t readLen = read(fd, &magicNumber, sizeof(const uint32));
if ((readLen < sizeof(const uint32)) || (magicNumber != REMAIN_SEG_MAGIC_NUM)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read magic number res %u, len %u from remain segs file"
"\"%s\" is less than %lu: %m.", magicNumber, readLen, XLOG_REMAIN_SEGS_FILE_PATH,
sizeof(const uint32))));
return false;
}
readLen = read(fd, &oldStartPoint, sizeof(XLogRecPtr));
if (readLen < sizeof(XLogRecPtr)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read startpoint len %u from remain segs file \"%s\""
" is less than %lu: %m.", readLen, XLOG_REMAIN_SEGS_FILE_PATH, sizeof(XLogRecPtr))));
return false;
}
uint32 remainEntryNum = 0;
readLen = read(fd, &remainEntryNum, sizeof(uint32));
if (readLen < sizeof(uint32)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read remain entry num len %u from remain segs file"
"\"%s\" is less than %lu: %m.",
readLen, XLOG_REMAIN_SEGS_FILE_PATH, sizeof(uint32))));
return false;
}
uint32 contentLen = sizeof(const uint32) + sizeof(XLogRecPtr) + sizeof(uint32);
contentLen += remainEntryNum * sizeof(ExtentTag);
contentLen += sizeof(const uint32);
char *buffer = (char *)palloc_huge(CurrentMemoryContext, (contentLen + sizeof(pg_crc32c)));
uint32 usedLen = 0;
*(uint32 *)buffer = REMAIN_SEG_MAGIC_NUM;
usedLen += sizeof(uint32);
*(XLogRecPtr *)(buffer + usedLen) = oldStartPoint;
usedLen += sizeof(XLogRecPtr);
*(uint32 *)(buffer + usedLen) = remainEntryNum;
usedLen += sizeof(uint32);
readLen = read(fd, buffer + usedLen, remainEntryNum * sizeof(ExtentTag));
if (readLen < (remainEntryNum * sizeof(ExtentTag))) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read extenttag len %u from remain segs file"
"\"%s\" is less than %lu:%m.", readLen, XLOG_REMAIN_SEGS_FILE_PATH,
remainEntryNum * sizeof(ExtentTag))));
pfree_ext(buffer);
return false;
}
usedLen += (remainEntryNum * sizeof(ExtentTag));
readLen = read(fd, &magicNumber, sizeof(const uint32));
if (readLen < sizeof(const uint32) || (magicNumber != REMAIN_SEG_MAGIC_NUM)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("read magic number %u, len %u from remain segs file"
"\"%s\" is less than %lu: %m.", magicNumber, readLen, XLOG_REMAIN_SEGS_FILE_PATH,
sizeof(const uint32))));
pfree_ext(buffer);
return false;
}
*(uint32 *)(buffer + usedLen) = magicNumber;
usedLen += sizeof(const uint32);
pg_crc32c oldCrc32c;
readLen = read(fd, &oldCrc32c, sizeof(pg_crc32c));
if (readLen < sizeof(pg_crc32c)) {
pfree_ext(buffer);
return false;
}
pg_crc32c curCrc32c;
INIT_CRC32C(curCrc32c);
COMP_CRC32C(curCrc32c, buffer, usedLen);
FIN_CRC32C(curCrc32c);
if (oldCrc32c != curCrc32c) {
ereport(WARNING, (errcode_for_file_access(), errmsg("Read crc isn't same with content crc.")));
pfree_ext(buffer);
return false;
}
*contentBuffer = buffer;
return true;
}
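/*
 * Parse the validated buffer produced by ReadRemainSegsFileContentToBuffer:
 * restore the recorded start point into *oldStartPoint and rebuild the
 * remain-segs hash table entries. Returns false if the file content could
 * not be read and verified.
 */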
static bool ReadRemainSegsFileContent(int fd, HTAB* remainSegsHtbl, XLogRecPtr *oldStartPoint)
{
    if (remainSegsHtbl == NULL || oldStartPoint == NULL) {
        ereport(DEBUG5, (errmodule(MOD_SEGMENT_PAGE), errmsg("oldStartPoint or remainSegsHtbl is null")));
        return true;
    }
    /* oldStartPoint is guaranteed non-null after the check above */
    *oldStartPoint = InvalidXLogRecPtr;
char *contentBuffer = NULL;
bool readRes = ReadRemainSegsFileContentToBuffer(fd, &contentBuffer);
if (readRes == false || contentBuffer == NULL) {
return false;
}
uint32 usedLen = sizeof(const uint32);
*oldStartPoint = *(XLogRecPtr *)(contentBuffer + usedLen);
usedLen += sizeof(XLogRecPtr);
uint32 remainEntryNum = *(uint32 *)(contentBuffer + usedLen);
usedLen += sizeof(uint32);
RecoveryRemainSegsFromRemainSegsFile(remainSegsHtbl, remainEntryNum, contentBuffer + usedLen);
pfree_ext(contentBuffer);
return true;
}
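/*
 * Load the remain-segs file (if present) into t_thrd.xlog_cxt.remain_segs
 * during startup. On valid content, the recorded start point is restored;
 * on damaged content, the hash table is cleared again and the file is
 * renamed out of the way.
 */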
static void ReadRemainSegsFile()
{
bool isFileExist = IsRemainSegsFileExist();
if (!isFileExist) {
return;
}
if (t_thrd.xlog_cxt.remain_segs == NULL) {
t_thrd.xlog_cxt.remain_segs = redo_create_remain_segs_htbl();
}
HeapMemResetHash(t_thrd.xlog_cxt.remain_segs, "remain_segs");
int fd = BasicOpenFile(XLOG_REMAIN_SEGS_FILE_PATH, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not open remain segs file \"%s\": %m", XLOG_REMAIN_SEGS_FILE_PATH)));
return;
}
XLogRecPtr oldStartPoint = InvalidXLogRecPtr;
bool readRightContent = ReadRemainSegsFileContent(fd, t_thrd.xlog_cxt.remain_segs, &oldStartPoint);
if (readRightContent) {
SetRemainSegsStartPoint(oldStartPoint);
} else {
HeapMemResetHash(t_thrd.xlog_cxt.remain_segs, "remain_segs");
}
if (close(fd)) {
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not close remain segs file \"%s\": %m", XLOG_REMAIN_SEGS_FILE_PATH)));
}
if (!readRightContent) {
RenameRemainSegsFile();
}
}
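/*
 * Compute the serialized length of the remain-segs content: leading magic
 * number, start point, entry count, the entries themselves, and the trailing
 * magic number. The CRC appended by the caller is not included here.
 */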
static uint32 XLogGetRemainContentLen()
{
uint32 contentLen = 0;
contentLen += sizeof(const uint32);
contentLen += sizeof(XLogRecPtr);
contentLen += sizeof(uint32);
if (t_thrd.xlog_cxt.remain_segs != NULL) {
contentLen += hash_get_num_entries(t_thrd.xlog_cxt.remain_segs) * sizeof(ExtentTag);
}
contentLen += sizeof(const uint32);
return contentLen;
}
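/*
 * Serialize the in-memory remain-segs state into contentBuffer, following
 * the on-disk layout described above ReadRemainSegsFileContentToBuffer
 * (without the trailing CRC, which the caller appends).
 */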
static void XLogMakeUpRemainSegsContent(char *contentBuffer)
{
uint32 usedLen = 0;
const uint32 magicNum = REMAIN_SEG_MAGIC_NUM;
errno_t errc = memcpy_s(contentBuffer, sizeof(const uint32), &magicNum, sizeof(const uint32));
securec_check(errc, "\0", "\0");
usedLen += sizeof(uint32);
XLogRecPtr recPtr = GetRemainSegsStartPoint();
errc = memcpy_s(contentBuffer + usedLen, sizeof(XLogRecPtr), &recPtr, sizeof(XLogRecPtr));
securec_check(errc, "\0", "\0");
usedLen += sizeof(XLogRecPtr);
uint32 remainSegsNum = 0;
if (t_thrd.xlog_cxt.remain_segs != NULL) {
remainSegsNum = hash_get_num_entries(t_thrd.xlog_cxt.remain_segs);
}
errc = memcpy_s(contentBuffer + usedLen, sizeof(uint32), &remainSegsNum, sizeof(uint32));
securec_check(errc, "\0", "\0");
usedLen += sizeof(uint32);
if (remainSegsNum != 0) {
HASH_SEQ_STATUS status;
hash_seq_init(&status, t_thrd.xlog_cxt.remain_segs);
ExtentTag *extentTag = NULL;
while ((extentTag = (ExtentTag *)hash_seq_search(&status)) != NULL) {
errc = memcpy_s(contentBuffer + usedLen, sizeof(ExtentTag), extentTag, sizeof(ExtentTag));
securec_check(errc, "\0", "\0");
usedLen += sizeof(ExtentTag);
}
}
errc = memcpy_s(contentBuffer + usedLen, sizeof(const uint32), &magicNum, sizeof(const uint32));
securec_check(errc, "\0", "\0");
return;
}
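/*
 * Persist the current remain-segs hash table to the remain-segs file,
 * protecting the content with a CRC so that a torn or partial write is
 * detected on the next startup. The in-memory table is reset afterwards.
 */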
void XLogCheckRemainSegs()
{
if (SS_ONDEMAND_BUILD_DONE && !SS_ONDEMAND_REDO_DONE) {
return;
}
uint32 contentLen = XLogGetRemainContentLen();
pg_crc32c crc;
char* contentBuffer = (char *)palloc_huge(CurrentMemoryContext, (contentLen + sizeof(pg_crc32c)));
XLogMakeUpRemainSegsContent(contentBuffer);
/* Contents are protected with a CRC */
INIT_CRC32C(crc);
COMP_CRC32C(crc, contentBuffer, contentLen);
FIN_CRC32C(crc);
*(pg_crc32c *)(contentBuffer + contentLen) = crc;
int fd = BasicOpenFile(XLOG_REMAIN_SEGS_FILE_PATH, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY, S_IRUSR | S_IWUSR);
if (fd < 0) {
pfree_ext(contentBuffer);
ereport(WARNING, (errcode_for_file_access(),
errmsg("could not create remain segs file \"%s\": %m", XLOG_REMAIN_SEGS_FILE_PATH)));
return;
}
WriteRemainSegsFile(fd, contentBuffer, contentLen + sizeof(pg_crc32c));
if (close(fd)) {
ereport(WARNING, (errcode_for_file_access(), errmsg("could not close remain segs file \"%s\": %m",
XLOG_REMAIN_SEGS_FILE_PATH)));
}
pfree_ext(contentBuffer);
if (t_thrd.xlog_cxt.remain_segs != NULL) {
HeapMemResetHash(t_thrd.xlog_cxt.remain_segs, "remain_segs");
}
}
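/*
 * Scan the remain-segs table for DROP_SEGMENT entries and fold their LSNs
 * into the remain-commit LSN. If any dropped extent has leaked, warn that
 * incremental checkpoint is not safe before that LSN.
 */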
void XLogUpdateRemainCommitLsn()
{
if (t_thrd.xlog_cxt.remain_segs == NULL) {
return;
}
uint32 remainSegsNum = hash_get_num_entries(t_thrd.xlog_cxt.remain_segs);
if (remainSegsNum == 0) {
return;
}
HASH_SEQ_STATUS status;
hash_seq_init(&status, t_thrd.xlog_cxt.remain_segs);
ExtentTag *extentTag = NULL;
while ((extentTag = (ExtentTag *)hash_seq_search(&status)) != NULL) {
if (extentTag->remainExtentType == DROP_SEGMENT) {
UpdateRemainCommitLsn(extentTag->lsn);
}
}
XLogRecPtr remaincommit = GetRemainCommitLsn();
if (!XLogRecPtrIsInvalid(remaincommit)) {
ereport(WARNING, (errmsg("There is dropped extent leaking, incremental checkpoint is not safe before %X/%X.",
(uint32)(remaincommit >> 32), (uint32)remaincommit)));
}
}
/*
 * When adding or removing semaphores used by xlog in the following function,
 * please update XLogSemas accordingly.
 */
void InitWalSemaphores()
{
/* Create a memory context and allocate semaphores used by wal writer. */
g_instance.wal_context = AllocSetContextCreate(g_instance.instance_context, "WalContext", ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE, SHARED_CONTEXT);
MemoryContext old_context = MemoryContextSwitchTo(g_instance.wal_context);
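    /*
     * Each padded lock structure below is allocated at twice its size and
     * then aligned up with TYPEALIGN, so the structure starts on a boundary
     * that is a multiple of its padded size and does not share that padded
     * region with neighbouring allocations.
     */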
/*
* Lock structure used for XLog local flush notification
*/
g_instance.wal_cxt.walFlushWaitLock = (WALFlushWaitLockPadded *)palloc(sizeof(WALFlushWaitLockPadded) * 2);
g_instance.wal_cxt.walFlushWaitLock = (WALFlushWaitLockPadded *)TYPEALIGN(sizeof(WALFlushWaitLockPadded),
g_instance.wal_cxt.walFlushWaitLock);
g_instance.wal_cxt.walFlushWaitLock->l.lock = LWLockAssign(LWTRANCHE_WAL_FLUSH_WAIT);
PGSemaphoreCreate(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
PGSemaphoreReset(&g_instance.wal_cxt.walFlushWaitLock->l.sem);
/*
* Lock structure used for WAL buffer initialization notification
*/
g_instance.wal_cxt.walBufferInitWaitLock =
(WALBufferInitWaitLockPadded *)palloc(sizeof(WALBufferInitWaitLockPadded) * 2);
g_instance.wal_cxt.walBufferInitWaitLock =
(WALBufferInitWaitLockPadded *)TYPEALIGN(sizeof(WALBufferInitWaitLockPadded),
g_instance.wal_cxt.walBufferInitWaitLock);
g_instance.wal_cxt.walBufferInitWaitLock->l.lock = LWLockAssign(LWTRANCHE_WAL_BUFFER_INIT_WAIT);
PGSemaphoreCreate(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
PGSemaphoreReset(&g_instance.wal_cxt.walBufferInitWaitLock->l.sem);
/*
* Lock structure used for XLog init-ahead notification
*/
g_instance.wal_cxt.walInitSegLock = (WALInitSegLockPadded *)palloc(sizeof(WALInitSegLockPadded) * 2);
g_instance.wal_cxt.walInitSegLock = (WALInitSegLockPadded *)TYPEALIGN(sizeof(WALInitSegLockPadded),
g_instance.wal_cxt.walInitSegLock);
g_instance.wal_cxt.walInitSegLock->l.lock = LWLockAssign(LWTRANCHE_WAL_INIT_SEGMENT);
PGSemaphoreCreate(&g_instance.wal_cxt.walInitSegLock->l.sem);
PGSemaphoreReset(&g_instance.wal_cxt.walInitSegLock->l.sem);
(void)MemoryContextSwitchTo(old_context);
}
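/*
 * Detect whether this node is in a streaming disaster-recovery switchover by
 * probing for the "cluster_maintance" marker file; if the file exists, record
 * the fact in g_instance.streaming_dr_cxt.isInSwitchover.
 */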
static void checkHadrInSwitchover()
{
FILE *fd = NULL;
char *switchoverFile = "cluster_maintance";
fd = fopen(switchoverFile, "r");
if (fd == NULL) {
ereport(LOG, (errmsg("===Not in the streaming DR switchover===")));
return;
}
ereport(LOG, (errmsg("===In the streaming DR switchover===")));
g_instance.streaming_dr_cxt.isInSwitchover = true;
fclose(fd);
return;
}
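/*
 * In extreme RTO redo, publish the suspend LSN and block until recovery is
 * resumed, polling once per second and servicing redo interrupts while
 * waiting.
 */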
void ExtremeRedoWaitRecoverySusPendFinish(XLogRecPtr lsn)
{
if (RecoveryIsSuspend()) {
pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, lsn);
pg_memory_barrier();
while (RecoveryIsSuspend()) {
            pg_usleep(ONE_SECOND_TO_MICROSECOND); /* sleep 1s while waiting for file repair */
RedoInterruptCallBack();
}
pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, InvalidXLogRecPtr);
}
}
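/*
 * Suspend WAL replay so files can be repaired: drain the redo workers, close
 * all smgr files, publish the suspend LSN, flush all dirty pages, and wait
 * until the suspend flag is cleared, restarting WAL streaming from the
 * suspend LSN if the walreceiver has exited in the meantime.
 */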
void handleRecoverySusPend(XLogRecPtr lsn)
{
if (RecoveryIsSuspend()) {
if (IsExtremeRedo()) {
ExtremeDispatchClosefdMarkToAllRedoWorker();
ExtremeWaitAllReplayWorkerIdle();
} else if (IsParallelRedo()) {
if (AmStartupProcess()) {
ProcTxnWorkLoad(true);
}
parallel_recovery::SendClosefdMarkToAllWorkers();
parallel_recovery::WaitAllPageWorkersQueueEmpty();
} else {
/* nothing */
}
smgrcloseall();
pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, lsn);
pg_memory_barrier();
ereport(LOG, (errmsg("recovery suspend, set suspend lsn is %X/%X",
(uint32)(lsn >> XLOG_LSN_SWAP), (uint32)lsn)));
ThreadId PageRepairPID = g_instance.pid_cxt.PageRepairPID;
if (PageRepairPID != 0) {
gs_signal_send(PageRepairPID, SIGUSR2);
}
/* wait all dirty page flush */
RequestCheckpoint(CHECKPOINT_FLUSH_DIRTY|CHECKPOINT_WAIT);
CheckNeedRenameFile();
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
while (RecoveryIsSuspend()) {
if (!WalRcvInProgress() && g_instance.pid_cxt.WalReceiverPID == 0) {
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(&lsn, t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_PRIMARY,
u_sess->attr.attr_storage.PrimarySlotName);
}
            pg_usleep(ONE_SECOND_TO_MICROSECOND); /* sleep 1s while waiting for file repair */
RedoInterruptCallBack();
}
pg_atomic_write_u64(&g_instance.startup_cxt.suspend_lsn, InvalidXLogRecPtr);
ereport(LOG, (errmsg("cancle recovery suspend")));
}
}
/*
* Read term from xlog and update term_from_xlog.
*/
static inline void UpdateTermFromXLog(uint32 xlTerm)
{
uint32 readTerm = xlTerm & XLOG_MASK_TERM;
if (readTerm > g_instance.comm_cxt.localinfo_cxt.term_from_xlog) {
g_instance.comm_cxt.localinfo_cxt.term_from_xlog = readTerm;
}
}
/*
* This must be called ONCE during postmaster or standalone-backend startup
*/
void StartupXLOG(void)
{
XLogCtlInsert *Insert = NULL;
CheckPoint checkPoint;
    CheckPointNew checkPointNew;   /* retained for upgrade compatibility without changing the storage format */
    CheckPointPlus checkPointPlus; /* retained for upgrade compatibility without changing the storage format, for global clean */
CheckPointUndo checkPointUndo;
uint32 recordLen = 0;
bool wasShutdown = false;
bool DBStateShutdown = false;
bool reachedStopPoint = false;
bool haveBackupLabel = false;
bool haveTblspcMap = false;
XLogRecPtr RecPtr, checkPointLoc, EndOfLog;
XLogSegNo endLogSegNo;
XLogRecord *record = NULL;
TransactionId oldestActiveXID;
bool backupEndRequired = false;
bool backupFromStandby = false;
bool backupFromRoach = false;
DBState dbstate_at_startup;
XLogReaderState *xlogreader = NULL;
XLogPageReadPrivate readprivate;
bool RecoveryByPending = false; /* recovery caused by pending mode */
bool ArchiveRecoveryByPending = false; /* archive recovery caused by pending mode */
bool AbnormalShutdown = true;
bool SSOndemandRecoveryExitNormal = true; /* status of last ondemand recovery */
struct stat st;
errno_t rcm = 0;
TransactionId latestCompletedXid;
bool wasCheckpoint = false;
errno_t errorno = EOK;
XLogRecPtr max_page_flush_lsn = MAX_XLOG_REC_PTR;
checkPointNew.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointPlus.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointPlus.length = 0;
checkPointPlus.recent_global_xmin = InvalidTransactionId;
checkPointUndo.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointUndo.length = 0;
checkPointUndo.recent_global_xmin = InvalidTransactionId;
checkPointUndo.globalRecycleXid = InvalidTransactionId;
t_thrd.xlog_cxt.startup_processing = true;
t_thrd.xlog_cxt.RedoDone = false;
t_thrd.xlog_cxt.currentRetryTimes = 0;
t_thrd.xlog_cxt.forceFinishHappened = false;
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = 0;
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = 0;
pg_atomic_write_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo), 0);
NotifyGscRecoveryStarted();
/*
* Initialize WAL insert status array and the flush index - lastWalStatusEntryFlushed.
*/
if (g_instance.wal_cxt.walInsertStatusTable == NULL) {
MemoryContext old_context = MemoryContextSwitchTo(g_instance.wal_context);
/* Initialize size of walInsertStatusTable first. */
InitXlogStatuEntryTblSize();
g_instance.wal_cxt.walInsertStatusTable = (WALInsertStatusEntry *)palloc(
sizeof(WALInsertStatusEntry) *
(GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power) + 1));
g_instance.wal_cxt.walInsertStatusTable =
(WALInsertStatusEntry *)TYPEALIGN(sizeof(WALInsertStatusEntry), g_instance.wal_cxt.walInsertStatusTable);
/* Initialize size of XlogFlushStats. */
g_instance.wal_cxt.xlogFlushStats = (XlogFlushStatistics *)palloc0(sizeof(XlogFlushStatistics));
(void)MemoryContextSwitchTo(old_context);
}
errorno =
memset_s(g_instance.wal_cxt.walInsertStatusTable,
sizeof(WALInsertStatusEntry) *
GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power),
0,
sizeof(WALInsertStatusEntry) *
GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power));
securec_check(errorno, "", "");
for (int i = 0; i < GET_WAL_INSERT_STATUS_ENTRY_CNT(g_instance.attr.attr_storage.wal_insert_status_entries_power);
i++) {
g_instance.wal_cxt.walInsertStatusTable[i].LRC = i;
}
g_instance.wal_cxt.lastWalStatusEntryFlushed = -1;
g_instance.wal_cxt.lastLRCScanned = WAL_SCANNED_LRC_INIT;
g_instance.wal_cxt.lastLRCFlushed = WAL_SCANNED_LRC_INIT;
/*
* Read control file and check XLOG status looks valid.
*
* Note: in most control paths, *ControlFile is already valid and we need
* not do ReadControlFile() here, but might as well do it to be sure.
*/
if (ENABLE_DMS && ENABLE_DSS) {
int src_id = INVALID_INSTANCEID;
SSReadControlFile(REFORM_CTRL_PAGE);
if (SS_CLUSTER_ONDEMAND_NOT_NORAML && SS_PRIMARY_MODE) {
if (SS_STANDBY_PROMOTING) {
ereport(FATAL, (errmsg("Do not allow switchover if on-demand recovery is not finish")));
}
Assert(g_instance.dms_cxt.SSReformerControl.recoveryInstId != INVALID_INSTANCEID);
src_id = g_instance.dms_cxt.SSReformerControl.recoveryInstId;
ereport(LOG, (errmsg("[on-demand]: On-demand recovery do not finish in last reform, "
"reading control file of original primary:%d", src_id)));
SSOndemandRecoveryExitNormal = false;
} else {
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
src_id = SSGetPrimaryInstId();
ereport(LOG, (errmsg("[SS Reform]: Standby:%d promoting, reading control file of original primary:%d",
g_instance.attr.attr_storage.dms_attr.instance_id, src_id)));
} else {
src_id = g_instance.attr.attr_storage.dms_attr.instance_id;
}
}
g_instance.dms_cxt.SSRecoveryInfo.recovery_inst_id = src_id;
SSReadControlFile(src_id);
} else {
ReadControlFile();
}
if (FORCE_FINISH_ENABLED) {
max_page_flush_lsn = mpfl_read_max_flush_lsn();
        /* We can't exit the process here, because initdb also runs through this path, where no LSN info file exists yet. */
ereport(LOG,
(errcode_for_file_access(), errmsg("StartupXLOG: read page lsn info from file, LSN: %X/%X.",
(uint32)(max_page_flush_lsn >> 32), (uint32)max_page_flush_lsn)));
if (max_page_flush_lsn == MAX_XLOG_REC_PTR) {
mpfl_new_file();
} else {
SetMinRecoverPointForStats(max_page_flush_lsn);
}
} else {
mpfl_ulink_file();
}
ereport(LOG, (errcode(ERRCODE_LOG),
errmsg("StartupXLOG: biggest_lsn_in_page is set to %X/%X, enable_update_max_page_flush_lsn:%u",
(uint32)(max_page_flush_lsn >> 32), (uint32)max_page_flush_lsn,
g_instance.attr.attr_storage.enable_update_max_page_flush_lsn)));
t_thrd.xlog_cxt.max_page_flush_lsn = max_page_flush_lsn;
pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo), ATOMIC_FALSE);
set_global_max_page_flush_lsn(max_page_flush_lsn);
/* description: timeline > MAX_INT32 */
if (IsUnderPostmaster) {
t_thrd.shemem_ptr_cxt.ControlFile->timeline = t_thrd.shemem_ptr_cxt.ControlFile->timeline + 1;
ereport(LOG, (errmsg("database system timeline: %u", t_thrd.shemem_ptr_cxt.ControlFile->timeline)));
if (t_thrd.shemem_ptr_cxt.ControlFile->timeline == 0) {
t_thrd.shemem_ptr_cxt.ControlFile->timeline = 1;
}
}
if (t_thrd.shemem_ptr_cxt.ControlFile->state < DB_SHUTDOWNED ||
t_thrd.shemem_ptr_cxt.ControlFile->state > DB_IN_PRODUCTION ||
!XRecOffIsValid(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint)) {
ereport(FATAL, (errmsg("control file contains invalid data")));
}
if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNED) {
DBStateShutdown = true;
AbnormalShutdown = false;
ereport(LOG,
(errmsg("database system was shut down at %s", str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY) {
DBStateShutdown = true;
ereport(LOG, (errmsg("database system was shut down in recovery at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNING) {
ereport(LOG, (errmsg("database system shutdown was interrupted; last known up at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_CRASH_RECOVERY) {
ereport(LOG, (errmsg("database system was interrupted while in recovery at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time)),
errhint("This probably means that some data is corrupted and"
" you will have to use the last backup for recovery.")));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_ARCHIVE_RECOVERY) {
ereport(LOG, (errmsg("database system was interrupted while in recovery at log time %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.time)),
errhint("If this has occurred more than once some data might be corrupted"
" and you might need to choose an earlier recovery target.")));
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_PRODUCTION) {
ereport(LOG, (errmsg("database system was interrupted; last known up at %s",
str_time(t_thrd.shemem_ptr_cxt.ControlFile->time))));
}
/* This is just to allow attaching to startup process with a debugger */
#ifdef XLOG_REPLAY_DELAY
if (t_thrd.shemem_ptr_cxt.ControlFile->state != DB_SHUTDOWNED) {
pg_usleep(60000000L);
}
#endif
/*
* Verify that pg_xlog and pg_xlog/archive_status exist. In cases where
* someone has performed a copy for PITR, these directories may have been
* excluded and need to be re-created.
*/
ValidateXLOGDirectoryStructure();
/* delete xlogtemp files. */
remove_xlogtemp_files();
/*
* Clear out any old relcache cache files. This is *necessary* if we do
* any WAL replay, since that would probably result in the cache files
* being out of sync with database reality. In theory we could leave them
* in place if the database had been cleanly shut down, but it seems
* safest to just remove them always and let them be rebuilt during the
* first backend startup.
*/
RelationCacheInitFileRemove();
/*
* Initialize on the assumption we want to recover to the same timeline
* that's active according to pg_control.
*/
t_thrd.xlog_cxt.recoveryTargetTLI = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID;
/* clean temp relation files */
if (g_instance.attr.attr_storage.max_active_gtt > 0) {
RemovePgTempFiles();
}
    // Check for a recovery command file, and if present, set up state for offline recovery
readRecoveryCommandFile();
if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_UNSET && g_instance.pid_cxt.PageRepairPID != 0) {
g_instance.repair_cxt.support_repair = true;
} else {
g_instance.repair_cxt.support_repair = false;
}
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
t_thrd.xlog_cxt.ArchiveRecoveryRequested = true;
}
load_server_mode();
if (t_thrd.xlog_cxt.server_mode == PENDING_MODE && !t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
ArchiveRecoveryByPending = true;
}
if (t_thrd.xlog_cxt.server_mode == PENDING_MODE || t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
t_thrd.xlog_cxt.ArchiveRecoveryRequested = true;
t_thrd.xlog_cxt.StandbyModeRequested = true;
}
/* Now we can determine the list of expected TLIs */
t_thrd.xlog_cxt.expectedTLIs = readTimeLineHistory(t_thrd.xlog_cxt.recoveryTargetTLI);
/*
* If pg_control's timeline is not in expectedTLIs, then we cannot
* proceed: the backup is not part of the history of the requested
* timeline.
*/
if (!list_member_int(t_thrd.xlog_cxt.expectedTLIs,
(int)t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID)) {
ereport(FATAL, (errmsg("requested timeline %u is not a child of database system timeline %u",
t_thrd.xlog_cxt.recoveryTargetTLI,
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID)));
}
/*
* Save the selected recovery target timeline ID and
* archive_cleanup_command in shared memory so that other processes can
* see them
*/
t_thrd.shemem_ptr_cxt.XLogCtl->RecoveryTargetTLI = t_thrd.xlog_cxt.recoveryTargetTLI;
errorno = strncpy_s(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand,
sizeof(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand) - 1,
t_thrd.xlog_cxt.archiveCleanupCommand ? t_thrd.xlog_cxt.archiveCleanupCommand : "",
sizeof(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand) - 1);
securec_check(errorno, "", "");
t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand[MAXPGPATH - 1] = '\0';
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
if (t_thrd.xlog_cxt.StandbyModeRequested) {
ereport(LOG, (errmsg("entering standby mode")));
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_XID) {
ereport(LOG,
(errmsg("starting point-in-time recovery to XID " XID_FMT, t_thrd.xlog_cxt.recoveryTargetXid)));
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_TIME) {
ereport(LOG, (errmsg("starting point-in-time recovery to %s",
timestamptz_to_str(t_thrd.xlog_cxt.recoveryTargetTime))));
}
#ifdef PGXC
else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_BARRIER) {
ereport(LOG, (errmsg("starting point-in-time recovery to barrier %s",
(t_thrd.xlog_cxt.recoveryTargetBarrierId))));
}
#endif
else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_NAME) {
ereport(LOG, (errmsg("starting point-in-time recovery to \"%s\"", t_thrd.xlog_cxt.recoveryTargetName)));
} else if (t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_LSN) {
ereport(LOG, (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
(uint32)(t_thrd.xlog_cxt.recoveryTargetLSN >> 32),
(uint32)t_thrd.xlog_cxt.recoveryTargetLSN)));
} else {
ereport(LOG, (errmsg("starting archive recovery")));
}
}
/*
* Take ownership of the wakeup latch if we're going to sleep during
* recovery.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested) {
OwnLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
OwnLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
}
/* Set up XLOG reader facility */
errorno = memset_s(&readprivate, sizeof(XLogPageReadPrivate), 0, sizeof(XLogPageReadPrivate));
securec_check(errorno, "", "");
if (ENABLE_DMS && ENABLE_DSS) {
SSGetRecoveryXlogPath();
xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER);
close_readFile_if_open();
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
// init shared memory set page empty
SSCSNLOGShmemClear();
SSCLOGShmemClear();
SSMultiXactShmemClear();
}
} else {
xlogreader = XLogReaderAllocate(&XLogPageRead, &readprivate);
}
if (xlogreader == NULL) {
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"),
errdetail("Failed while allocating an XLog reading processor")));
}
xlogreader->system_identifier = t_thrd.shemem_ptr_cxt.ControlFile->system_identifier;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_time = GetCurrentTimestamp();
if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby, &backupFromRoach)) {
List *tablespaces = NIL;
/*
* Archive recovery was requested, and thanks to the backup label file,
* we know how far we need to replay to reach consistency. Enter
* archive recovery directly.
*/
t_thrd.xlog_cxt.InArchiveRecovery = true;
if (t_thrd.xlog_cxt.StandbyModeRequested) {
t_thrd.xlog_cxt.StandbyMode = true;
}
ereport(LOG, (errmsg("request archive recovery due to backup label file")));
/*
* When a backup_label file is present, we want to roll forward from
* the checkpoint it identifies, rather than using pg_control.
*/
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0);
if (record != NULL) {
rcm = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(xlogreader), sizeof(CheckPoint));
securec_check(rcm, "", "");
/* decide which kind of checkpoint is used */
recordLen = record->xl_tot_len;
if (record->xl_tot_len == CHECKPOINTNEW_LEN) {
rcm = memcpy_s(&checkPointNew, sizeof(checkPointNew), XLogRecGetData(xlogreader),
sizeof(checkPointNew));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTPLUS_LEN) {
rcm = memcpy_s(&checkPointPlus, sizeof(checkPointPlus), XLogRecGetData(xlogreader),
sizeof(checkPointPlus));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTUNDO_LEN) {
rcm = memcpy_s(&checkPointUndo, sizeof(checkPointUndo), XLogRecGetData(xlogreader),
sizeof(checkPointUndo));
securec_check(rcm, "", "");
}
wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
UpdateTermFromXLog(record->xl_term);
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X", (uint32)(checkPointLoc >> 32), (uint32)checkPointLoc)));
t_thrd.xlog_cxt.InRecovery = true; /* force recovery even if SHUTDOWNED */
/*
* Make sure that REDO location exists. This may not be the case
* if there was a crash during an online backup, which left a
* backup_label around that references a WAL segment that's
* already been archived.
*/
if (XLByteLT(checkPoint.redo, checkPointLoc) && !ReadRecord(xlogreader, checkPoint.redo, LOG, false)) {
ereport(FATAL,
(errmsg("could not find redo location referenced by checkpoint record"),
errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".",
t_thrd.proc_cxt.DataDir)));
}
} else {
ereport(FATAL,
(errmsg("could not locate required checkpoint record"),
errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".",
t_thrd.proc_cxt.DataDir)));
wasShutdown = false; /* keep compiler quiet */
}
/* read the tablespace_map file if present and create symlinks. */
if (read_tablespace_map(&tablespaces)) {
ListCell *lc;
errno_t rc = EOK;
foreach (lc, tablespaces) {
tablespaceinfo *ti = (tablespaceinfo *)lfirst(lc);
int length = PG_TBLSPCS + strlen(ti->oid) + 1;
char *linkloc = (char *)palloc0(length);
rc = snprintf_s(linkloc, length, length - 1, "pg_tblspc/%s", ti->oid);
securec_check_ss_c(rc, "", "");
/*
* Remove the existing symlink if any and Create the symlink
* under PGDATA.
*/
remove_tablespace_symlink(linkloc);
if (symlink(ti->path, linkloc) < 0) {
pfree(linkloc);
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not create symbolic link \"%s\": %m", linkloc)));
}
pfree(linkloc);
pfree(ti->oid);
pfree(ti->path);
pfree(ti);
}
/* set flag to delete it later */
haveTblspcMap = true;
}
/* set flag to delete it later */
haveBackupLabel = true;
} else {
/*
* If tablespace_map file is present without backup_label file, there
* is no use of such file. There is no harm in retaining it, but it
* is better to get rid of the map file so that we don't have any
* redundant file in data directory and it will avoid any sort of
* confusion. It seems prudent though to just rename the file out
* of the way rather than delete it completely, also we ignore any
* error that occurs in rename operation as even if map file is
* present without backup_label file, it is harmless.
*/
if (stat(TABLESPACE_MAP, &st) == 0) {
unlink(TABLESPACE_MAP_OLD);
if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
ereport(LOG, (errmsg("ignoring file \"%s\" because no file \"%s\" exists", TABLESPACE_MAP,
BACKUP_LABEL_FILE),
errdetail("File \"%s\" was renamed to \"%s\".", TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
else
ereport(LOG,
(errmsg("ignoring \"%s\" file because no \"%s\" file exists", TABLESPACE_MAP,
BACKUP_LABEL_FILE),
errdetail("Could not rename file \"%s\" to \"%s\": %m.", TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
}
/*
* It's possible that archive recovery was requested, but we don't
* know how far we need to replay the WAL before we reach consistency.
* This can happen for example if a base backup is taken from a running
* server using an atomic filesystem snapshot, without calling
* pg_start/stop_backup. Or if you just kill a running master server
* and put it into archive recovery by creating a recovery.conf file.
*
* Our strategy in that case is to perform crash recovery first,
* replaying all the WAL present in pg_xlog, and only enter archive
* recovery after that.
*
* But usually we already know how far we need to replay the WAL (up to
* minRecoveryPoint, up to backupEndPoint, or until we see an
* end-of-backup record), and we can enter archive recovery directly.
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested &&
(!XLByteEQ(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, InvalidXLogRecPtr) ||
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired ||
!XLByteEQ(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint, InvalidXLogRecPtr) ||
t_thrd.shemem_ptr_cxt.ControlFile->state == DB_SHUTDOWNED)) {
t_thrd.xlog_cxt.InArchiveRecovery = true;
if (t_thrd.xlog_cxt.StandbyModeRequested) {
t_thrd.xlog_cxt.StandbyMode = true;
}
}
/*
* Get the last valid checkpoint record. If the latest one according
* to pg_control is broken, try the next-to-last one.
*/
checkPointLoc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.xlog_cxt.RedoStartLSN = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_ptr = t_thrd.xlog_cxt.RedoStartLSN;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1);
if (record != NULL) {
ereport(DEBUG1,
(errmsg("checkpoint record is at %X/%X", (uint32)(checkPointLoc >> 32), (uint32)checkPointLoc)));
} else if (t_thrd.xlog_cxt.StandbyMode) {
/*
* The last valid checkpoint record required for a streaming
* recovery exists in neither standby nor the primary.
*/
ereport(PANIC, (errmsg("could not locate a valid checkpoint record")));
} else {
checkPointLoc = t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint;
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2);
if (record != NULL) {
ereport(LOG, (errmsg("using previous checkpoint record at %X/%X", (uint32)(checkPointLoc >> 32),
(uint32)checkPointLoc)));
t_thrd.xlog_cxt.InRecovery = true; /* force recovery even if SHUTDOWNED */
} else {
ereport(PANIC, (errmsg("could not locate a valid checkpoint record")));
}
}
rcm = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(xlogreader), sizeof(CheckPoint));
securec_check(rcm, "", "");
recordLen = record->xl_tot_len;
if (record->xl_tot_len == CHECKPOINTNEW_LEN) {
rcm = memcpy_s(&checkPointNew, sizeof(checkPointNew), XLogRecGetData(xlogreader),
sizeof(checkPointNew));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTPLUS_LEN) {
rcm = memcpy_s(&checkPointPlus, sizeof(checkPointPlus), XLogRecGetData(xlogreader),
sizeof(checkPointPlus));
securec_check(rcm, "", "");
} else if (record->xl_tot_len == CHECKPOINTUNDO_LEN) {
rcm = memcpy_s(&checkPointUndo, sizeof(checkPointUndo), XLogRecGetData(xlogreader),
sizeof(checkPointUndo));
securec_check(rcm, "", "");
}
UpdateTermFromXLog(record->xl_term);
wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
wasCheckpoint = wasShutdown || (record->xl_info == XLOG_CHECKPOINT_ONLINE);
}
/*
     * initialize double write and recover partial writes.
     * In SS switchover, skip dw init since we didn't do ShutdownXLOG.
*/
if ((ENABLE_REFORM && SS_REFORM_REFORMER && !SS_STANDBY_FAILOVER && !SS_PERFORMING_SWITCHOVER) ||
!ENABLE_DMS || !ENABLE_REFORM) {
/* process assist file of chunk recycling */
dw_ext_init();
dw_init();
if (ENABLE_DMS) {
g_instance.dms_cxt.dw_init = true;
}
}
if (SS_REFORM_REFORMER && SS_STANDBY_PROMOTING) {
ss_switchover_promoting_dw_init();
}
if (SS_IN_FAILOVER && SS_REFORM_REFORMER) {
ss_failover_dw_init();
}
/* Recover meta of undo subsystem. */
undo::RecoveryUndoSystemMeta();
/* initialize account hash table lock */
g_instance.policy_cxt.account_table_lock = LWLockAssign(LWTRANCHE_ACCOUNT_TABLE);
t_thrd.xlog_cxt.LastRec = RecPtr = checkPointLoc;
ereport(LOG, (errmsg("redo record is at %X/%X; shutdown %s", (uint32)(checkPoint.redo >> 32),
(uint32)checkPoint.redo, wasShutdown ? "TRUE" : "FALSE")));
ereport(DEBUG1, (errmsg("next MultiXactId: " XID_FMT "; next MultiXactOffset: " XID_FMT, checkPoint.nextMulti,
checkPoint.nextMultiOffset)));
ereport(DEBUG1, (errmsg("oldest unfrozen transaction ID: " XID_FMT ", in database %u", checkPoint.oldestXid,
checkPoint.oldestXidDB)));
if (!TransactionIdIsNormal(checkPoint.nextXid)) {
ereport(PANIC, (errmsg("invalid next transaction ID")));
}
#ifdef ENABLE_MOT
/*
* Recover MOT
*/
if (!IsInitdb) {
MOTRecover();
}
#endif
/* initialize shared memory variables from the checkpoint record */
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid);
t_thrd.shemem_ptr_cxt.XLogCtl->ckptXid = checkPoint.oldestXid;
t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false;
t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRedoDone = false;
latestCompletedXid = checkPoint.nextXid;
TransactionIdRetreat(latestCompletedXid);
t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestCompletedXid;
/* init recentGlobalXmin and xmin, to nextXid */
t_thrd.xact_cxt.ShmemVariableCache->xmin = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin = latestCompletedXid + 1;
t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point = checkPoint.redo;
/* init dirty page queue rec lsn to checkpoint.redo */
update_dirty_page_queue_rec_lsn(checkPoint.redo, true);
/*
     * In a GTM-free environment, start the local CSN from the checkpoint value
     * (or from the next xid) so that it keeps increasing. In a GTM environment,
     * start the local CSN low first; it is then advanced by redo of xact records
     * or by transaction commits, which ensures the local CSN never becomes
     * larger than the GTM CSN.
*/
if (GTM_FREE_MODE) {
if ((recordLen == CHECKPOINTNEW_LEN)) {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointNew.next_csn;
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu, next xid %lu", "GTM-Free",
checkPointNew.next_csn, t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
} else if ((recordLen == CHECKPOINTPLUS_LEN)) {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointPlus.next_csn;
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu, next xid %lu", "GTM-Free",
checkPointPlus.next_csn, t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
} else if (recordLen == CHECKPOINTUNDO_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointUndo.next_csn;
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu, next xid %lu", "GTM-Free",
checkPointUndo.next_csn, t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
} else {
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
ereport(LOG, (errmsg("%s Mode: start local next csn from next_xid %lu", "GTM-Free",
t_thrd.xact_cxt.ShmemVariableCache->nextXid)));
}
} else {
/* Init nextCommitSeqNo in GTM-Lite Mode */
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = COMMITSEQNO_FIRST_NORMAL + 1;
if (recordLen == CHECKPOINTNEW_LEN) {
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu", "GTM-Lite",
(checkPointNew.next_csn))));
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointNew.next_csn;
} else if (recordLen == CHECKPOINTPLUS_LEN) {
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu", "GTM-Lite",
(checkPointPlus.next_csn))));
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointPlus.next_csn;
} else if (recordLen == CHECKPOINTUNDO_LEN) {
ereport(LOG, (errmsg("%s Mode: start local next csn from checkpoint %lu", "GTM-Lite",
(checkPointUndo.next_csn))));
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo = checkPointUndo.next_csn;
}
}
/* initialize shared memory variable standbyXmin from checkpoint record */
if (!TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin)) {
if (recordLen == CHECKPOINTPLUS_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->standbyXmin = checkPointPlus.recent_global_xmin;
} else if (recordLen == CHECKPOINTUNDO_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->standbyXmin = checkPointUndo.recent_global_xmin;
} else if (wasShutdown) {
t_thrd.xact_cxt.ShmemVariableCache->standbyXmin = checkPoint.nextXid - 1;
}
}
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = InvalidTransactionId;
/* Read oldest xid having undo from checkpoint and set in proc global. */
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid);
ereport(LOG, (errmsg("Startup: write global globalRecycleXid %lu from checkpoint %lu",
g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid)));
} else {
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, InvalidTransactionId);
}
/*
* Initialize replication slots, before there's a chance to remove
* required resources.
*/
t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo - 1;
init_instance_slot();
init_instance_slot_thread();
if (!ENABLE_DMS || (SS_REFORM_REFORMER && !SS_PRIMARY_DEMOTED)) {
StartupReplicationSlots();
}
/*
* Startup logical state, needs to be setup now so we have proper data
* during crash recovery.
*/
StartupReorderBuffer();
/*
     * Initialize the start LSN point of the CBM system by validating the last-finished CBM file.
     * At this moment, a checkpoint cannot happen, so it is safe to make the decision
     * without holding the control file lock.
*/
if (u_sess->attr.attr_storage.enable_cbm_tracking) {
CBMTrackInit(true, checkPoint.redo);
t_thrd.cbm_cxt.XlogCbmSys->needReset = false;
} else {
t_thrd.cbm_cxt.XlogCbmSys->needReset = true;
}
/* If xlog delay or ddl delay was enabled last time, continue to enable them */
startupInitDelayXlog();
startupInitDelayDDL();
/*
* We must replay WAL entries using the same TimeLineID they were created
* under, so temporarily adopt the TLI indicated by the checkpoint (see
* also xlog_redo()).
*/
t_thrd.xlog_cxt.ThisTimeLineID = checkPoint.ThisTimeLineID;
/*
* Before running in recovery, scan pg_twophase and fill in its status
* to be able to work on entries generated by redo. Doing a scan before
* taking any recovery action has the merit to discard any 2PC files that
* are newer than the first record to replay, saving from any conflicts at
* replay. This avoids as well any subsequent scans when doing recovery
* of the on-disk two-phase data.
*/
if (!ENABLE_DMS || SS_PRIMARY_MODE) {
restoreTwoPhaseData();
}
StartupCSNLOG();
t_thrd.xlog_cxt.lastFullPageWrites = checkPoint.fullPageWrites;
t_thrd.xlog_cxt.RedoRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->RedoRecPtr =
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
if (ENABLE_INCRE_CKPT) {
t_thrd.xlog_cxt.doPageWrites = false;
} else {
t_thrd.xlog_cxt.doPageWrites = t_thrd.xlog_cxt.lastFullPageWrites;
}
if (XLByteLT(RecPtr, checkPoint.redo)) {
ereport(PANIC, (errmsg("invalid redo in checkpoint record")));
}
/*
* Check whether we need to force recovery from WAL. If it appears to
* have been a clean shutdown and we did not have a recovery.conf file,
* then assume no recovery needed.
*/
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
t_thrd.xlog_cxt.InRecovery = true;
if (SS_STANDBY_PROMOTING) {
ereport(LOG, (errmsg("[SS switchover] Standby promote: redo shutdown checkpoint now")));
}
}
if (XLByteLT(checkPoint.redo, RecPtr)) {
#ifdef ENABLE_MULTIPLE_NODES
if (wasShutdown) {
ereport(PANIC, (errmsg("invalid redo record in shutdown checkpoint")));
}
#endif
t_thrd.xlog_cxt.InRecovery = true;
} else if (t_thrd.shemem_ptr_cxt.ControlFile->state != DB_SHUTDOWNED) {
t_thrd.xlog_cxt.InRecovery = true;
} else if (t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
/* force recovery due to presence of recovery.conf */
t_thrd.xlog_cxt.InRecovery = true;
if (ArchiveRecoveryByPending) {
RecoveryByPending = true;
}
}
if (SS_STANDBY_MODE && t_thrd.xlog_cxt.InRecovery == true) {
        /* no need to replay anything in SS standby mode */
ereport(LOG, (errmsg("[SS] Skip redo replay in standby mode")));
t_thrd.xlog_cxt.InRecovery = false;
}
if (SS_PRIMARY_MODE && ENABLE_ONDEMAND_RECOVERY && (SS_STANDBY_FAILOVER || SS_PRIMARY_NORMAL_REFORM) &&
t_thrd.xlog_cxt.InRecovery == true) {
if (SSOndemandRecoveryExitNormal) {
g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = true;
            /* make the status visible to other nodes in the cluster, and persist it in case this on-demand recovery fails */
g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_IN_ONDEMAND_BUILD;
g_instance.dms_cxt.SSReformerControl.recoveryInstId = g_instance.dms_cxt.SSRecoveryInfo.recovery_inst_id;
SSSaveReformerCtrl();
SetOndemandExtremeRtoMode();
ereport(LOG, (errmsg("[On-demand] replayed in extreme rto ondemand recovery mode")));
} else {
ereport(LOG, (errmsg("[On-demand] do not allow replay in ondemand recovery if last ondemand recovery "
"crash, replayed in extreme rto recovery mode")));
}
}
ReadRemainSegsFile();
/* Determine whether it is currently in the switchover of streaming disaster recovery */
checkHadrInSwitchover();
/* REDO */
if (t_thrd.xlog_cxt.InRecovery) {
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
/*
* Update pg_control to show that we are recovering and to show the
* selected checkpoint as the place we are starting from. We also mark
* pg_control with any minimum recovery stop point obtained from a
* backup history file.
*/
dbstate_at_startup = t_thrd.shemem_ptr_cxt.ControlFile->state;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
} else {
ereport(LOG, (errmsg("database system was not properly shut down; "
"automatic recovery in progress")));
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY;
}
PrintCkpXctlControlFile(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint,
&(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy), checkPointLoc, &checkPoint,
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint, "StartupXLOG");
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = checkPointLoc;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = checkPoint;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
/* initialize minRecoveryPoint if not set yet */
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, checkPoint.redo)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = checkPoint.redo;
}
}
/*
         * Set backupStartPoint if we're starting recovery from a backup label file.
         *
         * NOTICE: there are several scenarios for this, as follows:
         * 1. recover a backup-interrupted master, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(false) and
* backupFromStandby(false).
* 2. restore a master exclusive backup, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(true), backupEndRequired(false) and
* backupFromStandby(false).
* 3. restore a standby exclusive backup, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(true), backupEndRequired(false) and
* backupFromStandby(true).
* 4. restore a gs_ctl full build standby, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(true) and
* backupFromStandby(false).
* 5. restore a gs_rewind inc build standby, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(false) and
* backupFromStandby(true).
* 6. recovery as pending or standby, ArchiveRecoveryRequested(true),
* ArchiveRestoreRequested(false), backupEndRequired(false) and
* backupFromStandby(false) without backup label file.
*
         * When we try to recover a backup-interrupted master, treat it as a normal
         * startup even if we choose to recover from the checkpoint loaded from the
         * backup label file.
*
* Set backupEndPoint and use minRecoveryPoint as the backup end location
* if we're starting recovery from a base backup which was taken from the
* standby. In this case, the database system status in pg_control must
* indicate DB_IN_ARCHIVE_RECOVERY. If not, which means that backup is
* corrupted, so we cancel recovery.
*/
if (haveBackupLabel) {
/* set backup to control file except backup-interrupted recovery */
if (t_thrd.xlog_cxt.ArchiveRestoreRequested || backupEndRequired || backupFromStandby) {
t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint = checkPoint.redo;
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = backupEndRequired;
} else {
t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint = InvalidXLogRecPtr;
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = false;
}
if (backupFromStandby) {
if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY) {
ereport(FATAL, (errmsg("backup_label contains data inconsistent with control file"),
errhint("This means that the backup is corrupted and you will "
"have to use another backup for recovery.")));
}
t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
} else if (backupFromRoach) {
t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
ereport(LOG, (errmsg("perform roach backup restore and set backup end point to %X/%X",
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint)));
g_instance.roach_cxt.isRoachRestore = true;
}
}
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
/* No need to hold ControlFileLock yet, we aren't up far enough */
if (!SS_STANDBY_FAILOVER) {
UpdateControlFile();
}
/* initialize our local copy of minRecoveryPoint */
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
/*
         * Reset pgstat data if the database system was shut down abnormally
         * (i.e. crashed), because it may be invalid after recovery.
*/
if (AbnormalShutdown) {
pgstat_reset_all();
}
/*
* If there was a backup label file, it's done its job and the info
* has now been propagated into pg_control. We must get rid of the
* label file so that if we crash during recovery, we'll pick up at
* the latest recovery restartpoint instead of going all the way back
* to the backup start point. It seems prudent though to just rename
* the file out of the way rather than delete it completely.
*/
if (haveBackupLabel) {
unlink(BACKUP_LABEL_OLD);
durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
}
/*
* If there was a tablespace_map file, it's done its job and the
* symlinks have been created. We must get rid of the map file so
* that if we crash during recovery, we don't create symlinks again.
* It seems prudent though to just rename the file out of the way
* rather than delete it completely.
*/
if (haveTblspcMap) {
unlink(TABLESPACE_MAP_OLD);
durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
}
/* Check that the GUCs used to generate the WAL allow recovery */
CheckRequiredParameterValues(DBStateShutdown);
/*
* We're in recovery, so unlogged relations may be trashed and must be
* reset. This should be done BEFORE allowing Hot Standby
* connections, so that read-only backends don't try to read whatever
* garbage is left over from before.
*/
if (!RecoveryByPending && (!SS_STANDBY_FAILOVER && SSModifySharedLunAllowed())) {
ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
}
/*
* Likewise, delete any saved transaction snapshot files that got left
* behind by crashed backends.
*/
DeleteAllExportedSnapshotFiles();
/*
* Initialize for Hot Standby, if enabled. We won't let backends in
* yet, not until we've reached the min recovery point specified in
* control file and we've established a recovery snapshot from a
* running-xacts WAL record.
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested && g_instance.attr.attr_storage.EnableHotStandby) {
TransactionId *xids = NULL;
int nxids;
ereport(DEBUG1, (errmsg("initializing for hot standby")));
InitRecoveryTransactionEnvironment();
if (wasShutdown) {
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
} else {
oldestActiveXID = checkPoint.oldestActiveXid;
}
Assert(TransactionIdIsValid(oldestActiveXID));
/* Tell procarray about the range of xids it has to deal with */
ProcArrayInitRecovery(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
/* Startup commit log and subtrans only. Other SLRUs are not
* maintained during recovery and need not be started yet. */
StartupCLOG();
/*
* If we're beginning at a shutdown checkpoint, we know that
* nothing was running on the master at this point. So fake-up an
* empty running-xacts record and use that here and now. Recover
* additional standby state for prepared transactions.
*/
if (wasShutdown) {
#ifdef ENABLE_MULTIPLE_NODES
TransactionId latestnextXid;
RunningTransactionsData running;
/*
* Construct a RunningTransactions snapshot representing a
* shut down server, with only prepared transactions still
* alive. We're never overflowed at this point because all
* subxids are listed with their parent prepared transactions.
*/
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
latestnextXid = checkPoint.nextXid;
TransactionIdRetreat(latestnextXid);
Assert(TransactionIdIsNormal(latestnextXid));
running.latestCompletedXid = latestnextXid;
ProcArrayApplyRecoveryInfo(&running);
#endif
StandbyRecoverPreparedTransactions();
}
}
/*
* Initialize shared variables for tracking progress of WAL replay, as
* if we had just replayed the record before the REDO location (or the
* checkpoint record itself, if it's a shutdown checkpoint).
*/
SpinLockAcquire(&xlogctl->info_lck);
if (XLByteLT(checkPoint.redo, RecPtr)) {
xlogctl->replayEndRecPtr = checkPoint.redo;
xlogctl->lastReplayedReadRecPtr = checkPoint.redo;
} else {
xlogctl->replayEndRecPtr = t_thrd.xlog_cxt.EndRecPtr;
xlogctl->lastReplayedReadRecPtr = t_thrd.xlog_cxt.ReadRecPtr;
}
g_instance.comm_cxt.predo_cxt.redoPf.last_replayed_end_ptr = xlogctl->lastReplayedEndRecPtr;
xlogctl->lastReplayedEndRecPtr = xlogctl->replayEndRecPtr;
xlogctl->recoveryLastXTime = 0;
xlogctl->currentChunkStartTime = 0;
xlogctl->recoveryPause = false;
SpinLockRelease(&xlogctl->info_lck);
/* Also ensure XLogReceiptTime has a sane value */
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
/*
* Let postmaster know we've started redo now, so that it can launch
* checkpointer to perform restartpoints. We don't bother during
* crash recovery as restartpoints can only be performed during
* archive recovery. And we'd like to keep crash recovery simple, to
* avoid introducing bugs that could affect you when recovering after
* crash.
*
* After this point, we can no longer assume that we're the only
* process in addition to postmaster! Also, fsync requests are
* subsequently to be handled by the checkpointer, not locally.
*/
if ((t_thrd.xlog_cxt.ArchiveRecoveryRequested || IS_PGXC_COORDINATOR || IS_SINGLE_NODE || isRestoreMode) &&
IsUnderPostmaster) {
PublishStartupProcessInformation();
EnableSyncRequestForwarding();
SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
if (ENABLE_INCRE_CKPT) {
t_thrd.xlog_cxt.pagewriter_launched = true;
} else {
t_thrd.xlog_cxt.bgwriterLaunched = true;
}
}
/*
         * Report the values of relevant variables used for the following consistency check.
         * In some scenarios, it takes a long time to recover up to the consistent point
         * before starting up. We can estimate the remaining time based on the current
         * redo LSN and the future consistent point.
*/
ereport(LOG, (errmsg("redo minRecoveryPoint at %X/%X; backupStartPoint at %X/%X; "
"backupEndRequired %s",
(uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32), (uint32)t_thrd.xlog_cxt.minRecoveryPoint,
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint >> 32),
(uint32)t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint,
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired ? "TRUE" : "FALSE")));
pg_atomic_write_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage, false);
// Allow read-only connections immediately if we're consistent already.
CheckRecoveryConsistency();
ResetXLogStatics();
// Find the first record that logically follows the checkpoint --- it
// might physically precede it, though.
if (XLByteLT(checkPoint.redo, RecPtr) || SS_STANDBY_PROMOTING) {
/* back up to find the record */
record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
} else {
/* just have to read next record after CheckPoint */
record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
}
XLogReaderState *oldXlogReader = xlogreader;
if (record != NULL) {
bool recoveryContinue = true;
bool recoveryApply = true;
TimestampTz xtime;
instr_time rec_startTime;
instr_time rec_endTime;
XLogRecPtr redoStartPtr = t_thrd.xlog_cxt.ReadRecPtr;
uint64 redoTotalBytes;
UpdateTermFromXLog(record->xl_term);
t_thrd.xlog_cxt.InRedo = true;
g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr = 0;
ResourceManagerStartup();
StartUpMultiRedo(xlogreader, sizeof(readprivate));
if (IsExtremeRedo()) {
xlogreader->isPRProcess = true;
record = ExtremeReadNextXLogRecord(&xlogreader, LOG);
if (record == NULL) {
ereport(PANIC, (errmsg("redo starts at %X/%X", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr)));
}
t_thrd.xlog_cxt.readSource = 0;
} else if (IsParallelRedo()) {
xlogreader->isPRProcess = true;
xlogreader = parallel_recovery::NewReaderState(xlogreader, true);
}
ereport(LOG, (errmsg("redo starts at %X/%X", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr)));
#ifndef ENABLE_LITE_MODE
update_stop_barrier();
#endif
INSTR_TIME_SET_CURRENT(rec_startTime);
t_thrd.xlog_cxt.RedoStartLSN = t_thrd.xlog_cxt.ReadRecPtr;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_ptr = t_thrd.xlog_cxt.RedoStartLSN;
knl_g_set_redo_finish_status(0);
ereport(LOG, (errmsg("set knl_g_set_redo_finish_status to false when starting redo")));
do {
TermFileData term_file;
#ifdef WAL_DEBUG
if (u_sess->attr.attr_storage.XLOG_DEBUG ||
(rmid == RM_XACT_ID && u_sess->attr.attr_common.trace_recovery_messages <= DEBUG2) ||
(rmid != RM_XACT_ID && u_sess->attr.attr_common.trace_recovery_messages <= DEBUG3)) {
StringInfoData buf;
initStringInfo(&buf);
appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr, (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.EndRecPtr);
xlog_outrec(&buf, xlogreader);
appendStringInfo(&buf, " - ");
RmgrTable[record->xl_rmid].rm_desc(&buf, xlogreader);
ereport(LOG, (errmsg("%s", buf.data)));
pfree_ext(buf.data);
}
#endif
/* Handle interrupt signals of startup process */
RedoInterruptCallBack();
/*
* Pause WAL replay, if requested by a hot-standby session via SetRecoveryPause().
*
* Note that we intentionally don't take the info_lck spinlock
* here. We might therefore read a slightly stale value of
* the recoveryPause flag, but it can't be very stale (no
* worse than the last spinlock we did acquire). Since a
* pause request is a pretty asynchronous thing anyway,
* possibly responding to it one WAL record later than we
* otherwise would is a minor issue, so it doesn't seem worth
* adding another spinlock cycle to prevent that.
*/
if (xlogctl->recoveryPause) {
recoveryPausesHere();
}
GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_2]);
// Have we reached our recovery target?
if (recoveryStopsHere(xlogreader, &recoveryApply)) {
reachedStopPoint = true; /* see below */
recoveryContinue = false;
ExtremeExtremeRtoStopHere();
/* Exit loop if we reached non-inclusive recovery target */
if (!recoveryApply &&
(t_thrd.xlog_cxt.server_mode == PRIMARY_MODE || t_thrd.xlog_cxt.server_mode == NORMAL_MODE ||
(IS_OBS_DISASTER_RECOVER_MODE &&
(t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)))) {
ExtremeWaitAllRedoWorkerQueueEmpty();
break;
}
}
#ifndef ENABLE_MULTIPLE_NODES
CountAndGetRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_2],
t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_3]);
/*
* If we've been asked to lag the master, wait on latch until
* enough time has passed.
*/
if (RecoveryApplyDelay(xlogreader)) {
/*
* We test for paused recovery again here. If user sets
* delayed apply, it may be because they expect to pause
* recovery in case of problems, so we must test again
* here otherwise pausing during the delay-wait wouldn't
* work.
*/
if (xlogctl->recoveryPause) {
recoveryPausesHere();
}
}
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_3]);
#else
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_2]);
#endif
if (ENABLE_DMS && !SS_PERFORMING_SWITCHOVER && SSRecoveryApplyDelay()) {
if (xlogctl->recoveryPause) {
recoveryPausesHere();
}
}
/*
* ShmemVariableCache->nextXid must be beyond record's xid.
*
* We don't expect anyone else to modify nextXid, hence we
* don't need to hold a lock while examining it. We still
* acquire the lock to modify it, though.
*/
if (TransactionIdFollowsOrEquals(record->xl_xid, t_thrd.xact_cxt.ShmemVariableCache->nextXid)) {
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
if (TransactionIdFollowsOrEquals(record->xl_xid, t_thrd.xact_cxt.ShmemVariableCache->nextXid)) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = record->xl_xid;
TransactionIdAdvance(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
}
LWLockRelease(XidGenLock);
}
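/*
 * The unlocked test above is a cheap fast path: as noted, nothing else is
 * expected to modify nextXid during replay, so XidGenLock is taken only when
 * an advance looks needed, and the test is repeated under the lock as
 * inexpensive insurance against a concurrent update.
 */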
/*
* Update shared replayEndRecPtr before replaying this record,
* so that XLogFlush will update minRecoveryPoint correctly.
*/
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->replayEndRecPtr = t_thrd.xlog_cxt.EndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
/* If we are attempting to enter Hot Standby mode, process XIDs we see */
TransactionId xl_xid = record->xl_xid;
if (t_thrd.xlog_cxt.standbyState >= STANDBY_INITIALIZED && TransactionIdIsValid(xl_xid)) {
CSNLogRecordAssignedTransactionId(xl_xid);
} else if (TransactionIdIsValid(xl_xid)) {
ExtendCSNLOG(xl_xid);
}
xtime = GetLatestXTime();
XLogReaderState *newXlogReader = xlogreader;
if (xlogreader->isPRProcess && !IsExtremeRedo()) {
newXlogReader = parallel_recovery::NewReaderState(xlogreader);
}
GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_4]);
DispatchRedoRecord(xlogreader, t_thrd.xlog_cxt.expectedTLIs, xtime);
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_4]);
/* Remember this record as the last-applied one */
t_thrd.xlog_cxt.LastRec = t_thrd.xlog_cxt.ReadRecPtr;
/* Exit loop if we reached inclusive recovery target */
if (!recoveryContinue &&
(t_thrd.xlog_cxt.server_mode == PRIMARY_MODE || t_thrd.xlog_cxt.server_mode == NORMAL_MODE ||
(IS_OBS_DISASTER_RECOVER_MODE && (t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)) ||
IS_MULTI_DISASTER_RECOVER_MODE)) {
ExtremeWaitAllRedoWorkerQueueEmpty();
break;
}
if (!IsExtremeRedo() && CheckForForceFinishRedoTrigger(&term_file)) {
t_thrd.xlog_cxt.forceFinishHappened = true;
ereport(WARNING,
(errmsg("[ForceFinish]force finish redo triggered when StartupXLOG, ReadRecPtr:%08X/%08X,"
"EndRecPtr:%08X/%08X, StandbyMode:%u, startup_processing:%u, dummyStandbyMode:%u",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr,
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), (uint32)t_thrd.xlog_cxt.EndRecPtr,
t_thrd.xlog_cxt.StandbyMode, t_thrd.xlog_cxt.startup_processing,
dummyStandbyMode)));
ArchiveXlogForForceFinishRedo(xlogreader, &term_file);
newXlogReader->readOff = INVALID_READ_OFF;
}
if (xlogctl->recoverySusPend) {
if (IsExtremeRedo()) {
ExtremeRedoWaitRecoverySusPendFinish(xlogreader->EndRecPtr);
} else {
handleRecoverySusPend(xlogreader->EndRecPtr);
}
}
GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_1]);
if (xlogreader->isPRProcess && IsExtremeRedo()) {
record = ExtremeReadNextXLogRecord(&xlogreader, LOG);
} else {
xlogreader = newXlogReader;
record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
}
CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_1]);
} while (record != NULL); // end of main redo apply loop
if (SS_IN_ONDEMAND_RECOVERY) {
OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(0);
} else {
SendRecoveryEndMarkToWorkersAndWaitForFinish(0);
}
RecoveryXlogReader(oldXlogReader, xlogreader);
if (!(IS_OBS_DISASTER_RECOVER_MODE || IS_MULTI_DISASTER_RECOVER_MODE)) {
if (t_thrd.xlog_cxt.recoveryPauseAtTarget && reachedStopPoint) {
SetRecoveryPause(true);
recoveryPausesHere();
}
}
/* redo finished; set is_recovery_done to true so queries can proceed */
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr;
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp();
knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM);
ereport(LOG, (errmsg("set knl_g_set_redo_finish_status to true when redo done")));
ereport(LOG, (errmsg("redo done at %X/%X, end at %X/%X", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.ReadRecPtr, (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)t_thrd.xlog_cxt.EndRecPtr)));
INSTR_TIME_SET_CURRENT(rec_endTime);
INSTR_TIME_SUBTRACT(rec_endTime, rec_startTime);
redoTotalBytes = t_thrd.xlog_cxt.EndRecPtr - redoStartPtr;
uint64 totalTime = INSTR_TIME_GET_MICROSEC(rec_endTime);
uint64 speed = 0; // MB/s
if (totalTime > 0) {
speed = (redoTotalBytes / totalTime) * 1000000 / 1024 / 1024;
}
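/*
 * Units: redoTotalBytes is in bytes and totalTime in microseconds, so
 * bytes/us * 1e6 gives bytes/s, and dividing by 1024 * 1024 converts to MB/s.
 * The integer division truncates to whole bytes per microsecond first, so the
 * result is a floor; e.g. 8 GiB replayed in 40 s reports ~204 MB/s.
 */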
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[PR]: Recoverying elapsed: %lu us, redoTotalBytes:%lu,"
"EndRecPtr:%lu, redoStartPtr:%lu,speed:%lu MB/s, totalTime:%lu",
INSTR_TIME_GET_MICROSEC(rec_endTime), redoTotalBytes, t_thrd.xlog_cxt.EndRecPtr,
redoStartPtr, speed, totalTime)));
redo_unlink_stats_file();
parallel_recovery::redo_dump_all_stats();
/* check that all received xlog has been replayed at switchover */
if ((!IS_SHARED_STORAGE_STANDBY_CLUSTER_STANDBY_MODE) && CheckForSwitchoverTrigger()) {
XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
XLogRecPtr EndRecPtrTemp = t_thrd.xlog_cxt.EndRecPtr;
XLByteAdvance(EndRecPtrTemp, SizeOfXLogRecord);
if (XLByteLT(EndRecPtrTemp, receivedUpto) && !FORCE_FINISH_ENABLED) {
ereport(PANIC, (errmsg("there are some received xlog have not been redo "
"the tail of last redo lsn:%X/%X, received lsn:%X/%X",
(uint32)(EndRecPtrTemp >> 32), (uint32)EndRecPtrTemp,
(uint32)(receivedUpto >> 32), (uint32)receivedUpto)));
}
}
xtime = GetLatestXTime();
if (xtime) {
ereport(LOG, (errmsg("last completed transaction was at log time %s", timestamptz_to_str(xtime))));
}
t_thrd.xlog_cxt.InRedo = false;
} else {
/* there are no WAL records following the checkpoint */
ereport(LOG, (errmsg("redo is not required")));
if (SS_IN_ONDEMAND_RECOVERY) {
g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false;
g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL;
SSSaveReformerCtrl();
}
}
}
/* Set undoCountThreshold to a proper value after recovery finishes. */
undo::InitUndoCountThreshold();
/*
* A backup taken from a standby server contains no XLOG_BACKUP_END record.
* If we restore from such a backup, we check the backupEndPoint to
* determine whether the backup is complete.
* Since the backupEndPoint may be the last xlog record in the backup, we must
* check whether we reach the consistency point after redoing all the xlog.
*/
if (backupFromStandby) {
CheckRecoveryConsistency();
}
/* Update Remain Segment Start Point */
UpdateRemainSegsStartPoint(t_thrd.xlog_cxt.EndRecPtr);
XLogUpdateRemainCommitLsn();
/*
* Kill WAL receiver, if it's still running, before we continue to write
* the startup checkpoint record. It will trump over the checkpoint and
* subsequent records if it's still alive when we start writing WAL.
*/
ShutdownWalRcv();
ShutdownDataRcv();
/* This check is performed once redo is done, not during redo.
*
* Check to see if the XLOG sequence contained any unresolved
* references to uninitialized pages.
*/
XLogCheckInvalidPages();
if (IS_MULTI_DISASTER_RECOVER_MODE) {
ereport(LOG, (errmsg("reach stop barrier wait startupxlog here")));
while (reachedStopPoint) {
pg_usleep(1000000L); /* 1000 ms */
RedoInterruptCallBack();
}
}
/* Check to see if the XLOG sequence contained any segments that could not be used due to errors. */
XLogCheckRemainSegs();
/*
* We don't need the latch anymore. It's not strictly necessary to disown
* it, but let's do it for the sake of tidiness.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested) {
if (!IsExtremeRedo()) {
DisownLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
DisownLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
}
/*
* We are now done reading the xlog from stream. Turn off streaming
* recovery to force fetching the files (which would be required at end of
* recovery, e.g., timeline history file) from archive or pg_xlog.
*/
t_thrd.xlog_cxt.StandbyMode = false;
/*
* Re-fetch the last valid or last applied record, so we can identify the
* exact endpoint of what we consider the valid portion of WAL.
*/
xlogreader->readSegNo = 0;
xlogreader->readOff = 0;
xlogreader->readLen = 0;
record = ReadRecord(xlogreader, t_thrd.xlog_cxt.LastRec, PANIC, false);
UpdateTermFromXLog(record->xl_term);
EndOfLog = t_thrd.xlog_cxt.EndRecPtr;
XLByteToPrevSeg(EndOfLog, endLogSegNo);
if ((ENABLE_DMS && SS_STANDBY_FAILOVER) || SS_STANDBY_PROMOTING) {
bool use_existent = true;
(void)XLogFileInit(endLogSegNo, &use_existent, true);
}
uint32 redoReadOff = t_thrd.xlog_cxt.readOff;
/* Only primary mode may call GetWritePermissionSharedStorage when dorado
* hyper-replication and DMS are enabled.
*/
if (!SS_PRIMARY_STANDBY_CLUSTER_STANDBY) {
GetWritePermissionSharedStorage();
CheckShareStorageCtlInfo(EndOfLog);
}
/*
* Complain if we did not roll forward far enough to render the backup
* dump consistent. Note: it is indeed okay to look at the local variable
* minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
* be further ahead --- ControlFile->minRecoveryPoint cannot have been
* advanced beyond the WAL we processed.
*/
if (t_thrd.xlog_cxt.InRecovery && (XLByteLT(EndOfLog, t_thrd.xlog_cxt.minRecoveryPoint) ||
!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint))) {
if (reachedStopPoint) {
/* stopped because of stop request */
ereport(FATAL, (errmsg("requested recovery stop point is before consistent recovery point")));
}
/*
* Ran off end of WAL before reaching end-of-backup WAL record, or
* minRecoveryPoint. That's usually a bad sign, indicating that you
* tried to recover from an online backup but never called
* pg_stop_backup(), or you didn't archive all the WAL up to that
* point. However, this also happens in crash recovery, if the system
* crashes while an online backup is in progress. We must not treat
* that as an error, or the database will refuse to start up.
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested || t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired) {
if (t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired) {
ereport(FATAL,
(errmsg("WAL ends before end of online backup"),
errhint("All WAL generated while online backup was taken must be available at recovery.")));
} else if (!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint)) {
ereport(FATAL,
(errmsg("WAL ends before end of online backup"),
errhint(
"Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and "
"all WAL up to that point must be available at recovery.")));
} else {
ereport(FATAL, (errmsg("WAL ends before consistent recovery point")));
}
}
}
/* Save the selected TimeLineID in shared memory, too */
t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
/*
* We are now done reading the old WAL. Turn off archive fetching if it
* was active, and make a writable copy of the last WAL segment. (Note
* that we also have a copy of the last block of the old WAL in readBuf;
* we will use that below.)
*/
if (t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
exitArchiveRecovery(xlogreader->readPageTLI, endLogSegNo);
}
/*
* Prepare to write WAL starting at EndOfLog position, and init xlog
* buffer cache using the block containing the last record from the
* previous incarnation.
*/
t_thrd.xlog_cxt.openLogSegNo = endLogSegNo;
t_thrd.xlog_cxt.openLogFile = XLogFileOpen(t_thrd.xlog_cxt.openLogSegNo);
t_thrd.xlog_cxt.openLogOff = 0;
Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
Insert->PrevByteSize = XLogRecPtrToBytePos(EndOfLog) - XLogRecPtrToBytePos(t_thrd.xlog_cxt.LastRec);
Insert->CurrLRC = 0;
/*
* Tricky point here: readBuf contains the *last* block that the LastRec
* record spans, not the one it starts in. The last block is indeed the
* one we want to use.
*/
if (EndOfLog % XLOG_BLCKSZ != 0) {
char *page = NULL;
int len;
int firstIdx;
XLogRecPtr pageBeginPtr;
pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
if (redoReadOff != (pageBeginPtr % XLogSegSize)) {
ereport(FATAL, (errmsg("redo check error: EndOfLog: %lx, pageBeginPtr: %lx, redoReadOff: %x", EndOfLog,
pageBeginPtr, redoReadOff)));
}
firstIdx = XLogRecPtrToBufIdx(EndOfLog);
/* Copy the valid part of the last block, and zero the rest */
page = &t_thrd.shemem_ptr_cxt.XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
len = EndOfLog % XLOG_BLCKSZ;
errorno = memcpy_s(page, XLOG_BLCKSZ, xlogreader->readBuf, len);
securec_check(errorno, "", "");
errorno = memset_s(page + len, XLOG_BLCKSZ - len, 0, XLOG_BLCKSZ - len);
securec_check(errorno, "", "");
t_thrd.shemem_ptr_cxt.XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
} else {
/*
* There is no partial block to copy. Just set InitializedUpTo,
* and let the first attempt to insert a log record to initialize
* the next buffer.
*/
t_thrd.shemem_ptr_cxt.XLogCtl->InitializedUpTo = EndOfLog;
}
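/*
 * Worked example for the partial-block branch above (a sketch, assuming
 * XLOG_BLCKSZ == 8192): with EndOfLog == 0x2A01234, pageBeginPtr ==
 * 0x2A00000 and len == 0x1234, so the first 4660 bytes of the buffer page
 * are copied from readBuf and the remaining 3532 bytes are zeroed.
 */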
t_thrd.xlog_cxt.LogwrtResult->Write = t_thrd.xlog_cxt.LogwrtResult->Flush = EndOfLog;
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtResult = *t_thrd.xlog_cxt.LogwrtResult;
#ifndef ENABLE_MULTIPLE_NODES
t_thrd.xlog_cxt.LogwrtPaxos->Write = t_thrd.xlog_cxt.LogwrtPaxos->Consensus = EndOfLog;
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtPaxos = *t_thrd.xlog_cxt.LogwrtPaxos;
#endif
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Write = EndOfLog;
t_thrd.shemem_ptr_cxt.XLogCtl->LogwrtRqst.Flush = EndOfLog;
g_instance.comm_cxt.predo_cxt.redoPf.primary_flush_ptr = EndOfLog;
/*
* Prepare the WAL buffer and other variables for WAL writing before any
* call to LocalSetXLogInsertAllowed(), which enables WAL writing.
*/
g_instance.wal_cxt.flushResult = EndOfLog;
AdvanceXLInsertBuffer<false>(EndOfLog, false, NULL);
g_instance.wal_cxt.sentResult = EndOfLog;
if (ENABLE_DMS) {
ereport(LOG, (errmsg("[SS reform] EndOfLog:%lu.", EndOfLog)));
}
/* Pre-scan prepared transactions to find out the range of XIDs present */
oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
/*
* Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
* record before resource manager writes cleanup WAL records or checkpoint
* record is written.
*/
if (ENABLE_INCRE_CKPT) {
Insert->fullPageWrites = false;
} else {
Insert->fullPageWrites = t_thrd.xlog_cxt.lastFullPageWrites;
}
LocalSetXLogInsertAllowed();
UpdateFullPageWrites();
t_thrd.xlog_cxt.LocalXLogInsertAllowed = -1;
if (t_thrd.xlog_cxt.InRecovery) {
/*
* Resource managers might need to write WAL records, eg, to record
* index cleanup actions. So temporarily enable XLogInsertAllowed in
* this process only.
*/
LocalSetXLogInsertAllowed();
EndRedoXlog();
/* Disallow XLogInsert again */
t_thrd.xlog_cxt.LocalXLogInsertAllowed = -1;
CheckForRestartPoint();
/*
* And finally, execute the recovery_end_command, if any.
*/
if (t_thrd.xlog_cxt.recoveryEndCommand) {
ExecuteRecoveryCommand(t_thrd.xlog_cxt.recoveryEndCommand, "recovery_end_command", true);
}
}
/*
* Clean up any (possibly bogus) future WAL segments on the old timeline.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
RemoveNonParentXlogFiles(EndOfLog, t_thrd.xlog_cxt.ThisTimeLineID);
}
/*
* Preallocate additional log files, if wanted.
*/
PreallocXlogFiles(EndOfLog);
/*
* Reset initial contents of unlogged relations. This has to be done
* AFTER recovery is complete so that any unlogged relations created
* during recovery also get picked up.
*/
if (t_thrd.xlog_cxt.InRecovery && !RecoveryByPending && SSModifySharedLunAllowed()) {
ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
}
/*
* Okay, we're officially UP.
*/
t_thrd.xlog_cxt.InRecovery = false;
g_instance.roach_cxt.isRoachRestore = false;
if (!SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING && !SS_IN_ONDEMAND_RECOVERY) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
/* start the archive_timeout timer running */
t_thrd.shemem_ptr_cxt.XLogCtl->lastSegSwitchTime = (pg_time_t)time(NULL);
/* also initialize latestCompletedXid, to nextXid - 1 */
latestCompletedXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
TransactionIdRetreat(latestCompletedXid);
t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestCompletedXid;
/*
* Start up the commit log and subtrans, if not already done for hot
* standby.
*/
StartupCLOG();
/*
* Perform end of recovery actions for any SLRUs that need it.
*/
StartupMultiXact();
/*
* Recover knowledge about replay progress of known replication partners.
*/
#ifndef ENABLE_MULTIPLE_NODES
StartupReplicationOrigin();
#endif
TrimCLOG();
/* Reload shared-memory state for prepared transactions */
RecoverPreparedTransactions();
/*
* If any of the critical GUCs have changed, log them before we allow
* backends to write WAL.
*/
LocalSetXLogInsertAllowed();
/*
* Shutdown the recovery environment. This must occur after
* RecoverPreparedTransactions(), see notes for lock_twophase_recover()
*/
if (t_thrd.xlog_cxt.standbyState != STANDBY_DISABLED) {
ShutdownRecoveryTransactionEnvironment();
}
/* Shut down readFile facility, free space. */
ShutdownReadFileFacility();
if (SS_STANDBY_FAILOVER && SS_PRIMARY_CLUSTER_STANDBY) {
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS failover] standby promoting: copy endofxlog pageptr from primary to standby")));
XLogRecPtr EndOfLogPagePtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
SSXLOGCopyFromOldPrimary(xlogreader, EndOfLogPagePtr);
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS failover] standby promoting: copy xlog from local to dorado")));
ShareStorageXLogCtl *sharestorageCtl = g_instance.xlog_cxt.shareStorageXLogCtl;
XLogRecPtr localFlush = InvalidXLogRecPtr;
localFlush = EndOfLog;
XLogRecPtr maxPosCanWrite = GetMaxPosCanOverWrite();
uint32 shiftSize = 32;
ereport(LOG,
(errmsg("start to copy xlog from local to shared storage, localFlush: %X/%X, maxPosCanWrite: %X/%X.",
static_cast<uint32>(localFlush >> shiftSize), static_cast<uint32>(localFlush),
static_cast<uint32>(maxPosCanWrite >> shiftSize), static_cast<uint32>(maxPosCanWrite))));
if (XLByteLT(sharestorageCtl->insertHead, maxPosCanWrite)) {
XLogRecPtr expectPos = XLByteLT(localFlush, maxPosCanWrite) ? localFlush : maxPosCanWrite;
SSDoXLogCopyFromLocal(expectPos);
}
}
/* Shut down the xlog reader facility. */
XLogReaderFree(xlogreader);
xlogreader = NULL;
XLogReportParameters();
/*
* If switching from gtm-lite mode to gtm-free mode, the local next CSN may be
* larger than the next xid; set next xid to the larger one.
*/
if (GTM_FREE_MODE &&
t_thrd.xact_cxt.ShmemVariableCache->nextXid < t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
}
t_thrd.xact_cxt.ShmemVariableCache->nextXid += g_instance.shmem_cxt.MaxBackends;
/*
* nextXid may point into the next clog page, which has not been extended yet.
* If we do a checkpoint, nextXid is recorded as part of runningxacts, and
* we may try to access the corresponding clog page during recovery.
* Just extend it here to avoid that situation.
*/
ExtendCLOG(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
ExtendCSNLOG(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
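/*
 * Rough illustration (assuming the usual two status bits per xid and 8 kB
 * clog pages, i.e. 32768 xids per page): advancing nextXid by MaxBackends
 * above can cross a page boundary, so the page must already exist before a
 * checkpoint's runningxacts record referencing nextXid is replayed.
 */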
/*
* All done. Allow backends to write WAL. (Although the bool flag is
* probably atomic in itself, we use the info_lck here to ensure that
* there are no race conditions concerning visibility of other recent
* updates to shared memory.)
*/
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->SharedRecoveryInProgress = false;
xlogctl->IsRecoveryDone = true;
SpinLockRelease(&xlogctl->info_lck);
if (!SS_IN_ONDEMAND_RECOVERY) {
NotifyGscRecoveryFinished();
if (ENABLE_INCRE_CKPT) {
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
state->start = state->end;
(void)LWLockRelease(state->recovery_queue_lock);
}
}
}
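/*
 * Note that readers such as RecoveryInProgress() below read
 * SharedRecoveryInProgress through a volatile pointer without the spinlock;
 * taking info_lck on this writer side is what is meant to make the flag
 * change visible together with the other recent shared-memory updates.
 */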
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
if (SS_STANDBY_FAILOVER) {
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = ALLOW_CKPT;
pg_memory_barrier();
}
if (!SS_IN_ONDEMAND_RECOVERY) {
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS switchover/SS failover] standby promoting: start full checkpoint.")));
RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
SSRecheckBufferPool();
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS switchover/SS failover] standby promoting: finished full checkpoint"
"and update control file")));
}
}
NextXidAfterReovery = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
OldestXidAfterRecovery = t_thrd.xact_cxt.ShmemVariableCache->oldestXid;
PendingPreparedXactsCount = GetPendingXactCount();
IsPendingXactsRecoveryDone = false;
t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
/* for recentGlobalXmin, if it is still invalid, update it using the value from the checkpoint */
if (GTM_LITE_MODE && !TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin)) {
if (recordLen == CHECKPOINTPLUS_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointPlus.recent_global_xmin;
} else if (recordLen == CHECKPOINTUNDO_LEN) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointUndo.recent_global_xmin;
} else if (wasShutdown) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPoint.nextXid - 1;
}
}
/*
* A gds load may exist with no xlog for a heap page,
* so the xid on the page may be bigger than ShmemVariableCache->nextXid.
*/
/* for primary node, calculate the first multi version snapshot. */
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
CalculateLocalLatestSnapshot(true);
LWLockRelease(ProcArrayLock);
if (SS_PERFORMING_SWITCHOVER && g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTING) {
ereport(LOG, (errmsg("[SS switchover] Standby promote: StartupXLOG finished, promote success")));
Assert(g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTING);
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTED;
}
if (SS_IN_ONDEMAND_RECOVERY) {
/* We wait here */
ereport(LOG, (errmsg("[SS] On-demand redo, nextXid: " XID_FMT ", startupMaxXid: " XID_FMT
", recentLocalXmin: " XID_FMT ", recentGlobalXmin: %lu, PendingPreparedXacts: %d"
", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.",
NextXidAfterReovery, t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid,
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin,
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, PendingPreparedXactsCount,
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo,
t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min)));
OnDemandWaitRedoFinish();
/* now do the work we skipped earlier */
XLogCheckInvalidPages();
XLogCheckRemainSegs();
RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
SSRecheckBufferPool();
ereport(LOG, (errmodule(MOD_DMS),
errmsg("[SS][on demand recovery] finished full checkpoint and update control file")));
NotifyGscRecoveryFinished();
if (ENABLE_INCRE_CKPT) {
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
state->start = state->end;
(void)LWLockRelease(state->recovery_queue_lock);
}
g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false;
}
if (SS_PRIMARY_MODE) {
/* for other nodes in cluster */
g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL;
SSSaveReformerCtrl();
}
ereport(LOG, (errmsg("redo done, nextXid: " XID_FMT ", startupMaxXid: " XID_FMT ", recentLocalXmin: " XID_FMT
", recentGlobalXmin: %lu, PendingPreparedXacts: %d"
", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.",
NextXidAfterReovery, t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid,
t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin,
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, PendingPreparedXactsCount,
t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo,
t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min)));
#ifdef ENABLE_MOT
/*
* Cleanup MOT recovery
*/
if (!IsInitdb) {
MOTRecoveryDone();
}
#endif
if (ENABLE_DMS && ENABLE_REFORM && !SS_PRIMARY_DEMOTED && !DORADO_STANDBY_CLUSTER) {
StartupWaitReform();
}
}
void CopyXlogForForceFinishRedo(XLogSegNo logSegNo, uint32 termId, XLogReaderState *xlogreader,
XLogRecPtr lastRplEndLsn)
{
char srcPath[MAXPGPATH];
char dstPath[MAXPGPATH];
errno_t errorno = EOK;
errorno = snprintf_s(srcPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, xlogreader->readPageTLI,
(uint32)((logSegNo) / XLogSegmentsPerXLogId), (uint32)((logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
errorno = snprintf_s(dstPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X%08X", SS_XLOGDIR, termId,
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((logSegNo) / XLogSegmentsPerXLogId),
(uint32)((logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLogFileCopy(srcPath, dstPath);
XLogFileTruncate(srcPath, lastRplEndLsn);
}
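/*
 * Sketch of the names built above (assuming 16 MB segments, so
 * XLogSegmentsPerXLogId == 256): logSegNo 300 on timeline 1 splits into
 * 300 / 256 == 1 and 300 % 256 == 0x2C, giving the source name
 * "00000001000000010000002C"; the destination name is the same with the
 * 8-hex-digit termId prepended.
 */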
/* Nothing needs to be done for beginSegNo itself; we have already made a copy of it. */
void RenameXlogForForceFinishRedo(XLogSegNo beginSegNo, TimeLineID tli, uint32 termId)
{
ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("RenameXlogForForceFinishRedo INFO beginSegNo:%lu, tli:%u, "
"termId:%u",
beginSegNo, tli, termId)));
for (XLogSegNo sn = beginSegNo + 1;; ++sn) { /* XLogSegNo avoids narrowing beginSegNo to 32 bits */
char srcPath[MAXPGPATH];
char dstPath[MAXPGPATH];
errno_t errorno = EOK;
errorno = snprintf_s(srcPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, tli,
(uint32)((sn) / XLogSegmentsPerXLogId), (uint32)((sn) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
errorno = snprintf_s(dstPath, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X%08X", SS_XLOGDIR, termId, tli,
(uint32)((sn) / XLogSegmentsPerXLogId), (uint32)((sn) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
if (durable_rename(srcPath, dstPath, WARNING) != 0) {
ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("RenameXlogForForceFinishRedo failed, srcPath:%s, "
"dstPath:%s",
srcPath, dstPath)));
break;
}
}
}
void SetSwitchHistoryFile(XLogRecPtr switchLsn, XLogRecPtr catchLsn, uint32 termId)
{
char path[MAXPGPATH];
char tmpPath[MAXPGPATH];
errno_t errorno = EOK;
XLogSwitchInfo info;
FILE *hfile = NULL;
info.termId = termId;
info.switchLsn = switchLsn;
info.catchLsn = catchLsn;
info.oldestRunningXid = PrescanPreparedTransactions(NULL, NULL);
info.latestCompletedXid = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
info.invaildPageCnt = t_thrd.xlog_cxt.invaildPageCnt;
info.imcompleteActionCnt = t_thrd.xlog_cxt.imcompleteActionCnt;
errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%08X%s", termId, XLOG_SWITCH_HISTORY_FILE);
securec_check_ss(errorno, "", "");
errorno = snprintf_s(tmpPath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%08X%s.temp", termId, XLOG_SWITCH_HISTORY_FILE);
securec_check_ss(errorno, "", "");
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[REDO_STATS]SetSwitchHistoryFile: ready to set switch file, termId:%u, "
"switchLsn:%lu, catchLsn:%lu, oldestRunningXid:%lu, latestCompleteXid:%lu, "
"invaildPageCnt:%u, imcompleteActionCnt:%u",
termId, switchLsn, catchLsn, info.oldestRunningXid, info.latestCompletedXid,
info.invaildPageCnt, info.imcompleteActionCnt)));
canonicalize_path(tmpPath);
hfile = fopen(tmpPath, "w");
if (hfile == NULL) {
ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[REDO_STATS]SetSwitchHistoryFile: fopen tmpPath:%s failed, %m", tmpPath)));
return;
}
if (0 == (fwrite(&info, 1, sizeof(XLogSwitchInfo), hfile))) {
ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[REDO_STATS]SetSwitchHistoryFile: fwrite tmpPath:%s failed, %m", tmpPath)));
}
fclose(hfile);
if (durable_rename(tmpPath, path, WARNING) != 0) {
ereport(WARNING,
(errcode(ERRCODE_CASE_NOT_FOUND), errmsg("SetSwitchHistoryFile should not have failed, tmpPath:%s, "
"path:%s",
tmpPath, path)));
}
}
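/*
 * Writing to a ".temp" file first and then durable_rename()-ing it into
 * place makes the switch-history update atomic: a crash mid-write leaves
 * only the temp file behind, never a torn history file.
 */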
bool FallbackXlogreader(XLogReaderState *xlogreader, XLogRecPtr readPtr, XLogRecPtr endPtr)
{
errno_t rc = EOK;
/* on the next read, ReadRecPtr will be set to RecPtr in XLogReadRecord */
xlogreader->ReadRecPtr = readPtr;
/* EndRecPtr will be the next record's start location */
xlogreader->EndRecPtr = endPtr;
xlogreader->readOff = INVALID_READ_OFF;
rc = memset_s(xlogreader->readBuf, xlogreader->readLen, 0, xlogreader->readLen);
securec_check(rc, "\0", "\0");
t_thrd.xlog_cxt.LastRec = readPtr;
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
return true;
}
return false;
}
void ReOpenXlog(XLogReaderState *xlogreader)
{
uint32 sources;
int emode;
sources = XLOG_FROM_PG_XLOG;
emode = ((XLogPageReadPrivate *)(xlogreader->private_data))->emode;
if (t_thrd.xlog_cxt.InArchiveRecovery)
sources |= XLOG_FROM_ARCHIVE;
t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
if (t_thrd.xlog_cxt.readFile < 0) {
ereport(WARNING, (errmsg("FallbackXlogreader: XLogFileReadAnyTLI failed, "
"readseg:%lu, emode:%d, sources:%u",
t_thrd.xlog_cxt.readSegNo, emode, sources)));
}
}
void ArchiveXlogForForceFinishRedo(XLogReaderState *xlogreader, TermFileData *term_file)
{
bool finishRedo = term_file->finish_redo;
uint32 termId = term_file->term;
if (finishRedo == true) {
if ((get_real_recovery_parallelism() > 1) && (GetRedoWorkerCount() > 0)) {
ProcTxnWorkLoad(true);
parallel_recovery::SendClearMarkToAllWorkers();
parallel_recovery::WaitRedoWorkerIdle();
}
/* local replayed lsn */
XLogRecPtr lastRplReadLsn;
XLogRecPtr lastRplEndLsn = GetXLogReplayRecPtr(NULL, &lastRplReadLsn);
/* scenario 1: startup path, receivedUpto is 0 */
/* scenario 2: walRcv path, receivedUpto is non-zero */
XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
XLogSegNo beginSegNo;
bool closed = false;
ShutdownWalRcv();
ShutdownDataRcv();
ereport(WARNING,
(errcode(ERRCODE_LOG),
errmsg("[ForceFinish] ArchiveXlogForForceFinishRedo INFO lastRplReadLsn:%lu, lastRplEndLsn:%lu, "
"receivedUpto:%lu",
lastRplReadLsn, lastRplEndLsn, receivedUpto)));
XLByteToSeg(lastRplEndLsn, beginSegNo);
closed = FallbackXlogreader(xlogreader, lastRplReadLsn, lastRplEndLsn);
CopyXlogForForceFinishRedo(beginSegNo, termId, xlogreader, lastRplEndLsn);
RenameXlogForForceFinishRedo(beginSegNo, xlogreader->readPageTLI, termId);
if (closed) {
ReOpenXlog(xlogreader);
}
t_thrd.xlog_cxt.invaildPageCnt = 0;
t_thrd.xlog_cxt.imcompleteActionCnt = 0;
XLogCheckInvalidPages();
btree_clear_imcompleteAction();
SetSwitchHistoryFile(lastRplEndLsn, receivedUpto, termId);
t_thrd.xlog_cxt.invaildPageCnt = 0;
t_thrd.xlog_cxt.imcompleteActionCnt = 0;
set_wal_rcv_write_rec_ptr(lastRplEndLsn);
t_thrd.xlog_cxt.receivedUpto = lastRplEndLsn;
pg_atomic_write_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo), 0);
ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]ArchiveXlogForForceFinishRedo is over!")));
}
}
void backup_cut_xlog_file(XLogRecPtr lastReplayedEndRecPtr)
{
errno_t errorno = EOK;
ereport(DEBUG1, (errmsg("end of backup reached")));
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, lastReplayedEndRecPtr)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
SetMinRecoverPointForStats(lastReplayedEndRecPtr);
}
errorno = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint, sizeof(XLogRecPtr), 0, sizeof(XLogRecPtr));
securec_check(errorno, "", "");
errorno = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint, sizeof(XLogRecPtr), 0, sizeof(XLogRecPtr));
securec_check(errorno, "", "");
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = false;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
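/*
 * Once consistency has been reached and a valid starting standby snapshot is
 * available, mark hot standby active in shared memory and signal the
 * postmaster (PMSIGNAL_BEGIN_HOT_STANDBY) so read-only connections can begin.
 */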
static void sendPMBeginHotStby()
{
if (!t_thrd.xlog_cxt.LocalHotStandbyActive &&
t_thrd.xlog_cxt.reachedConsistency && IsUnderPostmaster) {
bool xminValid = true;
XLogRecPtr lastReplayedEndRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedEndRecPtr;
#ifndef ENABLE_MULTIPLE_NODES
xminValid = TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin) ||
!g_instance.attr.attr_storage.EnableHotStandby;
#endif
if (xminValid) {
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->SharedHotStandbyActive = true;
SpinLockRelease(&xlogctl->info_lck);
t_thrd.xlog_cxt.LocalHotStandbyActive = true;
pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.hotStdby), ATOMIC_TRUE);
ereport(LOG, (errmsg("send signal to be hot standby at %X/%X", (uint32)(lastReplayedEndRecPtr >> 32),
(uint32)lastReplayedEndRecPtr)));
#ifdef ENABLE_MULTIPLE_NODES
/*
* If we are in cluster-standby mode, we need to launch the barrier preparse
* thread from the minRecoveryPoint.
*/
if (IS_MULTI_DISASTER_RECOVER_MODE && g_instance.pid_cxt.BarrierPreParsePID == 0) {
SetBarrierPreParseLsn(t_thrd.xlog_cxt.minRecoveryPoint);
}
#endif
SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
}
}
}
/*
* Checks if recovery has reached a consistent state. When consistency is
* reached and we have a valid starting standby snapshot, tell postmaster
* that it can start accepting read-only connections.
*/
void CheckRecoveryConsistency(void)
{
/* In dcf mode, the standby can accept read connections only after the DCF thread has started */
#ifndef ENABLE_MULTIPLE_NODES
if (!IsDCFReadyOrDisabled()) {
return;
}
#endif
XLogRecPtr lastReplayedEndRecPtr;
XLogCtlData *XLogCtl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&XLogCtl->info_lck);
lastReplayedEndRecPtr = t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedEndRecPtr;
SpinLockRelease(&XLogCtl->info_lck);
if (pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo)) == ATOMIC_FALSE &&
XLByteLE(t_thrd.xlog_cxt.max_page_flush_lsn, lastReplayedEndRecPtr)) {
pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo), ATOMIC_TRUE);
ereport(WARNING, (errmsg("permit finish redo at %X/%X", (uint32)(lastReplayedEndRecPtr >> 32),
(uint32)lastReplayedEndRecPtr)));
}
/*
* During crash recovery, we don't reach a consistent state until we've
* replayed all the WAL.
*/
if (XLogRecPtrIsInvalid(t_thrd.xlog_cxt.minRecoveryPoint))
return;
/*
* Have we reached the point where our base backup was completed?
*/
if (!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint) &&
!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint) &&
XLByteLE(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint, lastReplayedEndRecPtr)) {
/*
* We have reached the end of base backup, as indicated by pg_control.
* The data on disk is now consistent. Reset backupStartPoint and
* backupEndPoint, and update minRecoveryPoint to make sure we don't
* allow starting up at an earlier point even if recovery is stopped
* and restarted soon after this.
*/
backup_cut_xlog_file(lastReplayedEndRecPtr);
}
/*
* Have we passed our safe starting point? Note that minRecoveryPoint
* is known to be incorrectly set if ControlFile->backupEndRequired,
* until the XLOG_BACKUP_RECORD arrives to advise us of the correct
* minRecoveryPoint. All we know prior to that is that we're not
* consistent yet.
*/
if (!t_thrd.xlog_cxt.reachedConsistency && !t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired &&
XLByteLE(t_thrd.xlog_cxt.minRecoveryPoint, lastReplayedEndRecPtr) &&
XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint)) {
/*
* We found that the maintenance of `minRecoveryPoint' is not always correct:
* in some scenarios `minRecoveryPoint' is set too early, before consistency
* is actually reached.
* For MppDB, the standby is a warm standby rather than a hot standby and no
* longer supports read-only queries, so we run `XLogCheckInvalidPages' when
* xlog redo is done, not during redo, to work around this problem.
*/
t_thrd.xlog_cxt.reachedConsistency = true;
ereport(LOG, (errmsg("consistent recovery state reached at %X/%X", (uint32)(lastReplayedEndRecPtr >> 32),
(uint32)lastReplayedEndRecPtr)));
parallel_recovery::redo_dump_all_stats();
redo_unlink_stats_file();
}
CheckIsStopRecovery();
/*
* Have we got a valid starting snapshot that will allow queries to be
* run? If so, we can tell postmaster that the database is consistent now,
* enabling connections.
*/
sendPMBeginHotStby();
}
/*
* Is the system still in recovery?
*
* Unlike testing InRecovery, this works in any process that's connected to
* shared memory.
*
* As a side-effect, we initialize the local TimeLineID and RedoRecPtr
* variables the first time we see that recovery is finished.
*/
bool RecoveryInProgress(void)
{
/*
* We check shared state each time only until we leave recovery mode. We
* can't re-enter recovery, so there's no need to keep checking after the
* shared variable has once been seen false.
*/
if (!t_thrd.xlog_cxt.LocalRecoveryInProgress) {
if (!ENABLE_DMS || (ENABLE_DMS && !SS_STANDBY_PROMOTING &&
g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ACTIVE)) {
return false;
}
}
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
t_thrd.xlog_cxt.LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
/*
* Initialize TimeLineID and RedoRecPtr when we discover that recovery
* is finished. InitPostgres() relies upon this behaviour to ensure
* that InitXLOGAccess() is called at backend startup. (If you change
* this, see also LocalSetXLogInsertAllowed.)
*/
if (!t_thrd.xlog_cxt.LocalRecoveryInProgress) {
InitXLOGAccess();
}
return t_thrd.xlog_cxt.LocalRecoveryInProgress;
}
/*
* Is HotStandby active yet? This is only important in special backends
* since normal backends won't ever be able to connect until this returns
* true. Postmaster knows this by way of signal, not via shared memory.
*
* Unlike testing standbyState, this works in any process that's connected to
* shared memory. (And note that standbyState alone doesn't tell the truth
* anyway.)
*/
bool HotStandbyActive(void)
{
/*
* We check shared state each time only until Hot Standby is active. We
* can't de-activate Hot Standby, so there's no need to keep checking
* after the shared variable has once been seen true.
*/
if (t_thrd.xlog_cxt.LocalHotStandbyActive) {
return true;
}
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
/* spinlock is essential on machines with weak memory ordering! */
SpinLockAcquire(&xlogctl->info_lck);
t_thrd.xlog_cxt.LocalHotStandbyActive = xlogctl->SharedHotStandbyActive;
SpinLockRelease(&xlogctl->info_lck);
return t_thrd.xlog_cxt.LocalHotStandbyActive;
}
/*
* Like HotStandbyActive(), but to be used only in WAL replay code,
* where we don't need to ask any other process what the state is.
*/
bool HotStandbyActiveInReplay(void)
{
Assert(AmStartupProcess() || !IsPostmasterEnvironment || AmPageRedoProcess());
return t_thrd.xlog_cxt.LocalHotStandbyActive;
}
/*
* Is this process allowed to insert new WAL records?
*
* Ordinarily this is essentially equivalent to !RecoveryInProgress().
* But we also have provisions for forcing the result "true" or "false"
* within specific processes regardless of the global state.
*/
bool XLogInsertAllowed(void)
{
// If value is "unconditionally true" or "unconditionally false", just
// return it. This provides the normal fast path once recovery is known done.
if (t_thrd.xlog_cxt.LocalXLogInsertAllowed >= 0) {
return (bool)t_thrd.xlog_cxt.LocalXLogInsertAllowed;
}
// Else, must check to see if we're still in recovery.
if (RecoveryInProgress()) {
return false;
}
// On exit from recovery, reset to "unconditionally true", since there is
// no need to keep checking.
t_thrd.xlog_cxt.LocalXLogInsertAllowed = 1;
return true;
}
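/*
 * Minimal usage sketch (illustrative only, not compiled): WAL producers gate
 * record insertion on this check, as PostgreSQL's XLogInsert does.
 */
#if 0
if (!XLogInsertAllowed())
    ereport(ERROR, (errmsg("cannot make new WAL entries during recovery")));
#endif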
/*
* Make XLogInsertAllowed() return true in the current process only.
*
* Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
* and even call LocalSetXLogInsertAllowed() again after that.
*/
static void LocalSetXLogInsertAllowed(void)
{
Assert(t_thrd.xlog_cxt.LocalXLogInsertAllowed == -1);
t_thrd.xlog_cxt.LocalXLogInsertAllowed = 1;
/* Initialize as RecoveryInProgress() would do when switching state */
InitXLOGAccess();
}
/*
* Subroutine to try to fetch and validate a prior checkpoint record.
*
* whichChkpt identifies the checkpoint (merely for reporting purposes).
* 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
*/
XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int whichChkpt)
{
XLogRecord *record = NULL;
if (!XRecOffIsValid(RecPtr)) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid primary checkpoint link in control file")));
break;
case 2:
ereport(LOG, (errmsg("invalid secondary checkpoint link in control file")));
break;
default:
ereport(LOG, (errmsg("invalid checkpoint link in backup_label file")));
break;
}
return NULL;
}
record = ReadRecord(xlogreader, RecPtr, LOG, true);
if (record == NULL) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid checkpoint record")));
break;
}
return NULL;
}
if (record->xl_rmid != RM_XLOG_ID) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid resource manager ID in primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid resource manager ID in secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid resource manager ID in checkpoint record")));
break;
}
return NULL;
}
if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN && record->xl_info != XLOG_CHECKPOINT_ONLINE) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid xl_info in primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid xl_info in secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid xl_info in checkpoint record")));
break;
}
return NULL;
}
if (record->xl_tot_len != CHECKPOINT_LEN && record->xl_tot_len != CHECKPOINTNEW_LEN &&
record->xl_tot_len != CHECKPOINTPLUS_LEN && record->xl_tot_len != CHECKPOINTUNDO_LEN) {
switch (whichChkpt) {
case 1:
ereport(LOG, (errmsg("invalid length of primary checkpoint record")));
break;
case 2:
ereport(LOG, (errmsg("invalid length of secondary checkpoint record")));
break;
default:
ereport(LOG, (errmsg("invalid length of checkpoint record")));
break;
}
return NULL;
}
return record;
}
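/*
 * Note: the four accepted lengths above appear to correspond to the
 * progressively extended checkpoint record layouts kept for upgrade
 * compatibility (CheckPoint, CheckPointNew, CheckPointPlus, CheckPointUndo;
 * see CreateCheckPoint below).
 */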
/*
* This must be called during startup of a backend process, except that
* it need not be called in a standalone backend (which does StartupXLOG
* instead). We need to initialize the local copies of ThisTimeLineID and
* RedoRecPtr.
*
* Note: before Postgres 8.0, we went to some effort to keep the postmaster
* process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
* unnecessary however, since the postmaster itself never touches XLOG anyway.
*/
void InitXLOGAccess(void)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
/* ThisTimeLineID doesn't change so we need no lock to copy it */
t_thrd.xlog_cxt.ThisTimeLineID = t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID;
if (!ENABLE_DMS) {
Assert(t_thrd.xlog_cxt.ThisTimeLineID != 0 || IsBootstrapProcessingMode());
}
/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
(void)GetRedoRecPtr();
/* Also update our copy of doPageWrites. */
if (ENABLE_INCRE_CKPT) {
t_thrd.xlog_cxt.doPageWrites = false;
} else {
t_thrd.xlog_cxt.doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
}
/* Also initialize the working areas for constructing WAL records */
InitXLogInsert();
}
/*
* Return the current Redo pointer from shared memory.
*
* As a side-effect, the local RedoRecPtr copy is updated.
*/
XLogRecPtr GetRedoRecPtr(void)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr ptr;
/*
* The possibly not up-to-date copy in XlogCtl is enough. Even if we
* grabbed a WAL insertion slot to read the master copy, someone might
* update it just after we've released the lock.
*/
SpinLockAcquire(&xlogctl->info_lck);
ptr = xlogctl->RedoRecPtr;
SpinLockRelease(&xlogctl->info_lck);
if (t_thrd.xlog_cxt.RedoRecPtr < ptr) {
t_thrd.xlog_cxt.RedoRecPtr = ptr;
}
return t_thrd.xlog_cxt.RedoRecPtr;
}
/*
* Return information needed to decide whether a modified block needs a
* full-page image to be included in the WAL record.
*
* The returned values are cached copies from backend-private memory, and
* possibly out-of-date. XLogInsertRecord will re-check them against
* up-to-date values, while holding the WAL insert lock.
*/
void GetFullPageWriteInfo(XLogFPWInfo *fpwInfo_p)
{
fpwInfo_p->redoRecPtr = t_thrd.xlog_cxt.RedoRecPtr;
fpwInfo_p->doPageWrites = t_thrd.xlog_cxt.doPageWrites && !ENABLE_INCRE_CKPT;
fpwInfo_p->forcePageWrites = t_thrd.shemem_ptr_cxt.XLogCtl->FpwBeforeFirstCkpt && !IsInitdb && !ENABLE_INCRE_CKPT;
}
/*
* GetInsertRecPtr -- Returns the current insert position.
*
* NOTE: The value *actually* returned is the position of the last full
* xlog page. It lags behind the real insert position by at most 1 page.
* For that, we don't need to acquire WALInsertLock which can be quite
* heavily contended, and an approximation is enough for the current
* usage of this function.
*/
XLogRecPtr GetInsertRecPtr(void)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->LogwrtRqst.Write;
SpinLockRelease(&xlogctl->info_lck);
return recptr;
}
/*
* GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
* position known to be fsync'd to disk.
*/
XLogRecPtr GetFlushRecPtr(void)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->LogwrtResult.Flush;
SpinLockRelease(&xlogctl->info_lck);
return recptr;
}
/*
* Get the time of the last xlog segment switch
*/
pg_time_t GetLastSegSwitchTime(void)
{
pg_time_t result;
/* Need WALWriteLock, but shared lock is sufficient */
LWLockAcquire(WALWriteLock, LW_SHARED);
result = t_thrd.shemem_ptr_cxt.XLogCtl->lastSegSwitchTime;
LWLockRelease(WALWriteLock);
return result;
}
/*
* GetRecoveryTargetTLI - get the current recovery target timeline ID
*/
TimeLineID GetRecoveryTargetTLI(void)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
TimeLineID result;
SpinLockAcquire(&xlogctl->info_lck);
result = xlogctl->RecoveryTargetTLI;
SpinLockRelease(&xlogctl->info_lck);
return result;
}
TimeLineID GetThisTargetTLI(void)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
TimeLineID result;
SpinLockAcquire(&xlogctl->info_lck);
result = xlogctl->ThisTimeLineID;
SpinLockRelease(&xlogctl->info_lck);
return result;
}
/*
* DummyStandbySetRecoveryTargetTLI - set the current recovery target timeline ID
*/
void DummyStandbySetRecoveryTargetTLI(TimeLineID timeLineID)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RecoveryTargetTLI = timeLineID;
xlogctl->ThisTimeLineID = timeLineID;
SpinLockRelease(&xlogctl->info_lck);
return;
}
/*
* This must be called ONCE during postmaster or standalone-backend shutdown
*/
void ShutdownXLOG(int code, Datum arg)
{
if (SS_PRIMARY_DEMOTING) {
ereport(LOG, (errmsg("[SS switchover] primary demote: doing shutdown checkpoint")));
if (DORADO_STANDBY_CLUSTER) {
CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
} else {
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
ckpt_shutdown_pagewriter();
if (g_instance.ckpt_cxt_ctl->dirty_page_queue != NULL) {
pfree(g_instance.ckpt_cxt_ctl->dirty_page_queue);
g_instance.ckpt_cxt_ctl->dirty_page_queue = NULL;
}
return;
}
ereport(LOG, (errmsg("shutting down")));
if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
ereport(LOG, (errmsg("[SS failover/SS switchover] Standby promote: skipping shutdown checkpoint")));
} else {
if (RecoveryInProgress()) {
(void)CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
} else {
/*
* If archiving is enabled, rotate the last XLOG file so that all the
* remaining records are archived (postmaster wakes up the archiver
* process one more time at the end of shutdown). The checkpoint
* record will go to the next XLOG file and won't be archived (yet).
*/
if (!IsInitdb && XLogArchivingActive() && XLogArchiveCommandSet()) {
(void)RequestXLogSwitch();
}
if (g_instance.wal_cxt.upgradeSwitchMode != ExtremelyFast) {
if (DORADO_STANDBY_CLUSTER) {
CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
} else {
CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
}
}
}
}
/* Stop DCF after primary CreateCheckPoint or standby CreateRestartPoint */
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
StopPaxosModule();
}
#endif
/* Shutdown all the page writer threads. */
ckpt_shutdown_pagewriter();
if (g_instance.ckpt_cxt_ctl->dirty_page_queue != NULL) {
pfree(g_instance.ckpt_cxt_ctl->dirty_page_queue);
g_instance.ckpt_cxt_ctl->dirty_page_queue = NULL;
}
g_instance.bgwriter_cxt.unlink_rel_hashtbl = NULL;
g_instance.bgwriter_cxt.unlink_rel_fork_hashtbl = NULL;
g_instance.ckpt_cxt_ctl->prune_queue_lock = NULL;
g_instance.ckpt_cxt_ctl->ckpt_redo_state.recovery_queue_lock = NULL;
g_instance.ckpt_cxt_ctl->snapshotBlockLock = NULL;
g_instance.bgwriter_cxt.rel_hashtbl_lock = NULL;
g_instance.bgwriter_cxt.rel_one_fork_hashtbl_lock = NULL;
if (ENABLE_DSS && IsInitdb &&
g_instance.dms_cxt.SSReformerControl.primaryInstId == INVALID_INSTANCEID) {
SSReadControlFile(REFORM_CTRL_PAGE);
}
if ((!ENABLE_DMS || (!SS_STANDBY_MODE && !SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING))
&& !SS_STANDBY_INST_SKIP_SHARED_FILE) {
ShutdownCLOG();
ShutdownCSNLOG();
ShutdownMultiXact();
}
/* Shutdown double write. */
dw_exit(true);
dw_exit(false);
/* try to clear page repair thread memory again */
if (!ENABLE_DMS || g_instance.dms_cxt.SSClusterState == NODESTATE_NORMAL) {
ClearPageRepairTheadMem();
g_instance.repair_cxt.page_repair_hashtbl_lock = NULL;
g_instance.repair_cxt.file_repair_hashtbl_lock = NULL;
}
if (IsInitdb) {
ShutdownShareStorageXLogCopy();
if (t_thrd.xlog_cxt.openLogFile >= 0) {
XLogFileClose();
}
}
ereport(LOG, (errmsg("database system is shut down")));
}
/*
* Log start of a checkpoint.
*/
static void LogCheckpointStart(unsigned int flags, bool restartpoint)
{
const char *msg = NULL;
/*
* XXX: This is hopelessly untranslatable. We could call gettext_noop for
* the main message, but what about all the flags?
*/
if (restartpoint) {
msg = "restartpoint starting:%s%s%s%s%s%s%s";
} else {
msg = "checkpoint starting:%s%s%s%s%s%s%s";
}
ereport(LOG, (errmsg(msg, (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "", (flags & CHECKPOINT_FORCE) ? " force" : "",
(flags & CHECKPOINT_WAIT) ? " wait" : "", (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "")));
}
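/*
 * For example (a sketch, with only these two flag bits set),
 * LogCheckpointStart(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE, false)
 * emits: "checkpoint starting: shutdown immediate".
 */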
/*
* Log end of a checkpoint.
*/
static void LogCheckpointEnd(bool restartpoint)
{
long write_secs, sync_secs, total_secs, longest_secs, average_secs;
int write_usecs, sync_usecs, total_usecs, longest_usecs, average_usecs;
uint64 average_sync_time;
t_thrd.xlog_cxt.CheckpointStats->ckpt_end_t = GetCurrentTimestamp();
TimestampDifference(t_thrd.xlog_cxt.CheckpointStats->ckpt_write_t, t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_t,
&write_secs, &write_usecs);
TimestampDifference(t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_t, t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_end_t,
&sync_secs, &sync_usecs);
/* Accumulate checkpoint timing summary data, in milliseconds. */
u_sess->stat_cxt.BgWriterStats->m_checkpoint_write_time += write_secs * 1000 + write_usecs / 1000;
u_sess->stat_cxt.BgWriterStats->m_checkpoint_sync_time += sync_secs * 1000 + sync_usecs / 1000;
/*
* All of the published timing statistics are accounted for. Only
* continue if a log message is to be written.
*/
if (!u_sess->attr.attr_common.log_checkpoints) {
return;
}
TimestampDifference(t_thrd.xlog_cxt.CheckpointStats->ckpt_start_t, t_thrd.xlog_cxt.CheckpointStats->ckpt_end_t,
&total_secs, &total_usecs);
/*
* Timing values returned from CheckpointStats are in microseconds.
* Convert to the second plus microsecond form that TimestampDifference
* returns for homogeneous printing.
*/
longest_secs = (long)(t_thrd.xlog_cxt.CheckpointStats->ckpt_longest_sync / 1000000);
longest_usecs = t_thrd.xlog_cxt.CheckpointStats->ckpt_longest_sync - (uint64)longest_secs * 1000000;
average_sync_time = 0;
if (t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels > 0) {
average_sync_time = t_thrd.xlog_cxt.CheckpointStats->ckpt_agg_sync_time /
t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels;
}
average_secs = (long)(average_sync_time / 1000000);
average_usecs = average_sync_time - (uint64)average_secs * 1000000;
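/*
 * Example: ckpt_agg_sync_time = 3,500,000 us across ckpt_sync_rels = 2 gives
 * average_sync_time = 1,750,000 us, i.e. average_secs = 1 and
 * average_usecs = 750000, printed below as "average=1.750 s".
 */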
if (restartpoint) {
ereport(LOG, (errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written,
(double)t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written * 100 /
g_instance.attr.attr_storage.NBuffers,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled, write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000, total_secs, total_usecs / 1000,
t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels, longest_secs, longest_usecs / 1000,
average_secs, average_usecs / 1000)));
} else {
ereport(LOG, (errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written,
(double)t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written * 100 /
g_instance.attr.attr_storage.NBuffers,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled, write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000, total_secs, total_usecs / 1000,
t_thrd.xlog_cxt.CheckpointStats->ckpt_sync_rels, longest_secs, longest_usecs / 1000,
average_secs, average_usecs / 1000)));
}
}
/*
* Perform a checkpoint --- either during shutdown, or on-the-fly
*
* flags is a bitwise OR of the following:
* CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
* CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
* CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
* ignoring checkpoint_completion_target parameter.
* CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
* since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
* CHECKPOINT_END_OF_RECOVERY).
*
* Note: flags contains other bits, of interest here only for logging purposes.
* In particular note that this routine is synchronous and does not pay
* attention to CHECKPOINT_WAIT.
*/
void CreateCheckPoint(int flags)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool shutdown = false;
bool removexlog = false;
CheckPoint checkPoint;
CheckPointNew checkPointNew; /* extended for upgrade without modifying the storage format */
CheckPointPlus checkPointPlus; /* extended for upgrade (global clean) without modifying the storage format */
CheckPointUndo checkPointUndo;
XLogRecPtr recptr = InvalidXLogRecPtr;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint32 freespace;
XLogSegNo _logSegNo;
XLogRecPtr curInsert;
VirtualTransactionId *vxids = NULL;
int nvxids = 0;
int32 lastlrc = 0;
errno_t errorno = EOK;
XLogRecPtr curMinRecLSN = InvalidXLogRecPtr;
bool doFullCheckpoint = !ENABLE_INCRE_CKPT;
TransactionId oldest_active_xid = InvalidTransactionId;
TransactionId globalXmin = InvalidTransactionId;
/*
* An end-of-recovery checkpoint is really a shutdown checkpoint, just
* issued at a different time.
*/
gstrace_entry(GS_TRC_ID_CreateCheckPoint);
if ((unsigned int)flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY)) {
shutdown = true;
}
/* sanity check */
if (RecoveryInProgress() && ((unsigned int)flags & CHECKPOINT_END_OF_RECOVERY) == 0) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("can't create a checkpoint during recovery")));
}
if (!SSXLogInsertAllowed()) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("can't create a checkpoint on SS standby node")));
}
    /* Allow the standby to do a checkpoint only after it has promoted AND has finished recovery. */
if (ENABLE_DMS && SS_STANDBY_MODE) {
if (shutdown) {
START_CRIT_SECTION();
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
END_CRIT_SECTION();
}
return;
} else if (g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status == NOT_ALLOW_CKPT) {
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS failover] do not do CreateCheckpoint during failover")));
return;
}
    /* CHECKPOINT_IS_SHUTDOWN, CHECKPOINT_END_OF_RECOVERY and CHECKPOINT_FORCE should do a full checkpoint */
if (shutdown || ((unsigned int)flags & (CHECKPOINT_FORCE))) {
doFullCheckpoint = true;
}
/*
* Acquire CheckpointLock to ensure only one checkpoint happens at a time.
* (This is just pro forma, since in the present system structure there is
* only one process that is allowed to issue checkpoints at any given
* time.)
*/
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
/*
* Prepare to accumulate statistics.
*
* Note: because it is possible for log_checkpoints to change while a
* checkpoint proceeds, we always accumulate stats, even if
* log_checkpoints is currently off.
*/
errorno = memset_s(t_thrd.xlog_cxt.CheckpointStats, sizeof(CheckpointStatsData), 0, sizeof(CheckpointStatsData));
securec_check(errorno, "", "");
t_thrd.xlog_cxt.CheckpointStats->ckpt_start_t = GetCurrentTimestamp();
/*
* Use a critical section to force system panic if we have trouble.
*/
START_CRIT_SECTION();
if (shutdown) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNING;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
/*
* Let smgr prepare for checkpoint; this has to happen before we determine
* the REDO pointer. Note that smgr must not do anything that'd have to
* be undone if we decide no checkpoint is needed.
*/
SyncPreCheckpoint();
/* Begin filling in the checkpoint WAL record */
errorno = memset_s(&checkPoint, sizeof(checkPoint), 0, sizeof(checkPoint));
securec_check(errorno, "", "");
checkPoint.time = (pg_time_t)time(NULL);
/*
* For Hot Standby, derive the oldestActiveXid before we fix the redo
* pointer. This allows us to begin accumulating changes to assemble our
* starting snapshot of locks and transactions.
*/
if (!shutdown) {
oldest_active_xid = GetOldestActiveTransactionId(&globalXmin);
}
if (!shutdown && XLogStandbyInfoActive()) {
checkPoint.oldestActiveXid = oldest_active_xid;
} else {
checkPoint.oldestActiveXid = InvalidTransactionId;
}
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_CREATE_SNAPSHOT, 0);
}
#endif
StartSuspendWalInsert(&lastlrc);
curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
if ((ENABLE_INCRE_CKPT && (flags & CHECKPOINT_CAUSE_TIME)) || doFullCheckpoint) {
update_dirty_page_queue_rec_lsn(curInsert, true);
}
/*
* If this isn't a shutdown or forced checkpoint, and we have not inserted
* any XLOG records since the start of the last checkpoint, skip the
* checkpoint. The idea here is to avoid inserting duplicate checkpoints
* when the system is idle. That wastes log space, and more importantly it
* exposes us to possible loss of both current and previous checkpoint
* records if the machine crashes just as we're writing the update.
* (Perhaps it'd make even more sense to checkpoint only when the previous
* checkpoint record is in a different xlog page?)
*
* We have to make two tests to determine that nothing has happened since
* the start of the last checkpoint: current insertion point must match
* the end of the last checkpoint record, and its redo pointer must point
* to itself.
*/
if (((unsigned int)flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE)) == 0) {
if (curInsert ==
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint + MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint == t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo) {
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_ABORT, 0);
}
#endif
StopSuspendWalInsert(lastlrc);
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
gstrace_exit(GS_TRC_ID_CreateCheckPoint);
return;
}
} else if (ENABLE_INCRE_CKPT && doFullCheckpoint) {
        /*
         * The enable_incremental_checkpoint GUC is on, but under some
         * conditions we still have to do a full checkpoint.
         */
g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_dirty_page_queue_tail();
g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = curInsert;
pg_memory_barrier();
if (get_dirty_page_num() > 0) {
g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
}
ereport(LOG, (errmsg("will do full checkpoint, need flush %ld pages.", get_dirty_page_num())));
}
/* incremental checkpoint */
if (!doFullCheckpoint) {
curMinRecLSN = ckpt_get_min_rec_lsn();
if (XLogRecPtrIsInvalid(curMinRecLSN)) {
            /*
             * If the dirty page list is empty, use the current queue rec LSN.
             * Re-check the queue head buffer rec LSN (some page may have been
             * dirtied after get_dirty_page_queue_rec_lsn was read) and take the
             * smaller of the two as the min rec LSN.
             */
XLogRecPtr cur_queue_rec_lsn = get_dirty_page_queue_rec_lsn();
curMinRecLSN = ckpt_get_min_rec_lsn();
if (XLogRecPtrIsInvalid(curMinRecLSN) || XLByteLT(cur_queue_rec_lsn, curMinRecLSN)) {
curMinRecLSN = cur_queue_rec_lsn;
}
if (u_sess->attr.attr_storage.log_pagewriter) {
ereport(DEBUG1,
(errmodule(MOD_INCRE_CKPT),
errmsg("get min_recLSN is invalid, dirty page num is %ld, get queue reclsn is %08X/%08X",
get_dirty_page_num(), (uint32)(curMinRecLSN >> XLOG_LSN_SWAP), (uint32)curMinRecLSN)));
}
}
XLogRecPtr remaincommit = GetRemainCommitLsn();
if (XLByteEQ(curMinRecLSN, g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point) ||
XLByteLT(curMinRecLSN, remaincommit)) {
StopSuspendWalInsert(lastlrc);
LWLockRelease(CheckpointLock);
END_CRIT_SECTION();
if (u_sess->attr.attr_storage.log_pagewriter) {
ereport(LOG, (errmodule(MOD_INCRE_CKPT),
errmsg("Checkpoint meets prev checkpoint lsn is %08X/%08X, now min rec lsn is %08X/%08X, "
"checkpoint flag is %d, remaincommit lsn is %08X/%08X",
(uint32)(g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point >> XLOG_LSN_SWAP),
(uint32)g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point,
(uint32)(curMinRecLSN >> XLOG_LSN_SWAP),
(uint32)(curMinRecLSN), flags,
(uint32)(remaincommit >> XLOG_LSN_SWAP), (uint32)(remaincommit))));
}
gstrace_exit(GS_TRC_ID_CreateCheckPoint);
return;
} else if (XLByteLT(curMinRecLSN,
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point)) {
ereport(PANIC,
(errmodule(MOD_INCRE_CKPT),
errmsg("curMinRecLSN little prev checkpoint lsn is %08X/%08X,now lsn is %08X/%08X",
(uint32)(g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point >> XLOG_LSN_SWAP),
(uint32)g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point,
(uint32)(curMinRecLSN >> XLOG_LSN_SWAP), (uint32)curMinRecLSN)));
}
if (u_sess->attr.attr_storage.log_pagewriter) {
ereport(LOG,
(errmodule(MOD_INCRE_CKPT),
errmsg("Checkpoint prev checkpoint lsn is %08X/%08X, now will create new checkpoint is %08X/%08X",
(uint32)(g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point >> XLOG_LSN_SWAP),
(uint32)g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point,
(uint32)(curMinRecLSN >> XLOG_LSN_SWAP), (uint32)curMinRecLSN)));
}
}
/*
* An end-of-recovery checkpoint is created before anyone is allowed to
* write WAL. To allow us to write the checkpoint record, temporarily
* enable XLogInsertAllowed. (This also ensures ThisTimeLineID is
* initialized, which we need here and in AdvanceXLInsertBuffer.)
*/
if ((unsigned int)flags & CHECKPOINT_END_OF_RECOVERY) {
LocalSetXLogInsertAllowed();
}
checkPoint.ThisTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID;
checkPoint.fullPageWrites = Insert->fullPageWrites;
/*
* Compute new REDO record ptr = location of next XLOG record.
*
* NB: this is NOT necessarily where the checkpoint record itself will be,
* since other backends may insert more XLOG records while we're off doing
* the buffer flush work. Those XLOG records are logically after the
* checkpoint, even though physically before it. Got that?
*/
if (!doFullCheckpoint) {
        /*
         * An incremental checkpoint uses the rec LSN of the first page in the
         * dirty page queue; when the queue is empty, it falls back to the dirty
         * page queue rec LSN. That LSN already reflects the redo pointer
         * computed by update_dirty_page_queue_rec_lsn.
         */
checkPoint.redo = curMinRecLSN;
} else {
/* Full checkpoint use curInsert */
freespace = INSERT_FREESPACE(curInsert);
if (freespace == 0) {
if (curInsert % XLogSegSize == 0) {
curInsert += SizeOfXLogLongPHD;
} else {
curInsert += SizeOfXLogShortPHD;
}
}
checkPoint.redo = curInsert;
}
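    /*
     * Illustrative sketch (assumed sizes, not part of the build): if curInsert
     * lands exactly on a page boundary, INSERT_FREESPACE() is 0 and the redo
     * pointer must skip the page header that will be written there, since a
     * record cannot start inside it:
     *   curInsert % XLogSegSize == 0  ->  first page of a segment, skip the
     *                                     long header (SizeOfXLogLongPHD)
     *   otherwise                     ->  an interior page, skip the short
     *                                     header (SizeOfXLogShortPHD)
     * For example, assuming 16 MB segments, curInsert == 0x2000000 sits at a
     * segment start, so redo becomes 0x2000000 + SizeOfXLogLongPHD.
     */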
/*
* Here we update the shared RedoRecPtr for future XLogInsert calls; this
* must be done while holding the insert lock AND the info_lck.
*
* Note: if we fail to complete the checkpoint, RedoRecPtr will be left
* pointing past where it really needs to point. This is okay; the only
* consequence is that XLogInsert might back up whole buffers that it
* didn't really need to. We can't postpone advancing RedoRecPtr because
* XLogInserts that happen while we are dumping buffers must assume that
* their buffer changes are not included in the checkpoint.
*/
t_thrd.xlog_cxt.RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_SNAPSHOT_READY, checkPoint.redo);
}
#endif
/*
* Now we can release WAL insert lock, allowing other xacts to proceed
* while we are flushing disk buffers.
*/
StopSuspendWalInsert(lastlrc);
/* Update the info_lck-protected copy of RedoRecPtr as well */
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RedoRecPtr = checkPoint.redo;
SpinLockRelease(&xlogctl->info_lck);
/*
* If enabled, log checkpoint start. We postpone this until now so as not
* to log anything if we decided to skip the checkpoint.
*/
if (u_sess->attr.attr_common.log_checkpoints) {
LogCheckpointStart((unsigned int)flags, false);
}
TRACE_POSTGRESQL_CHECKPOINT_START(flags);
/*
* Get the other info we need for the checkpoint record.
*/
LWLockAcquire(XidGenLock, LW_SHARED);
checkPoint.nextXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
checkPoint.oldestXid = t_thrd.xact_cxt.ShmemVariableCache->oldestXid;
checkPoint.oldestXidDB = t_thrd.xact_cxt.ShmemVariableCache->oldestXidDB;
LWLockRelease(XidGenLock);
MultiXactGetCheckptMulti(shutdown, &checkPoint.nextMulti, &checkPoint.nextMultiOffset);
checkPointUndo.globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
/*
* Having constructed the checkpoint record, ensure all shmem disk buffers
* and commit-log buffers are flushed to disk.
*
* This I/O could fail for various reasons. If so, we will fail to
* complete the checkpoint, but there is no reason to force a system
* panic. Accordingly, exit critical section while doing it.
*/
END_CRIT_SECTION();
/*
* In some cases there are groups of actions that must all occur on
* one side or the other of a checkpoint record. Before flushing the
* checkpoint record we must explicitly wait for any backend currently
* performing those groups of actions.
*
* One example is end of transaction, so we must wait for any transactions
* that are currently in commit critical sections. If an xact inserted
* its commit record into XLOG just before the REDO point, then a crash
* restart from the REDO point would not replay that record, which means
* that our flushing had better include the xact's update of pg_clog. So
* we wait till he's out of his commit critical section before proceeding.
* See notes in RecordTransactionCommit().
*
* Because we've already released WALInsertLock, this test is a bit fuzzy:
* it is possible that we will wait for xacts we didn't really need to
* wait for. But the delay should be short and it seems better to make
* checkpoint take a bit longer than to hold locks longer than necessary.
* (In fact, the whole reason we have this issue is that xact.c does
* commit record XLOG insertion and clog update as two separate steps
* protected by different locks, but again that seems best on grounds of
* minimizing lock contention.)
*
* A transaction that has not yet set delayChkpt when we look cannot be at
* risk, since he's not inserted his commit record yet; and one that's
* already cleared it is not at risk either, since he's done fixing clog
* and we will correctly flush the update below. So we cannot miss any
* xacts we need to wait for.
*/
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
if (nvxids > 0) {
do {
pg_usleep(10000L); /* wait for 10 msec */
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
}
pfree_ext(vxids);
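    /*
     * Illustrative timeline (made-up LSNs, a sketch of the race the wait above
     * closes): a backend inserts its commit record at LSN 100, just before our
     * REDO point at 110, but has not yet updated pg_clog. If we flushed the
     * checkpoint now and crashed, replay would start at 110 and never see the
     * commit record, yet the transaction would still look in-progress in clog.
     * Waiting for delayChkpt backends guarantees their clog updates are
     * included in the flush below.
     */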
/*
* Truncate pg_csnlog if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_csnlog entry older than that (see Asserts
* in csnlog.c). During recovery, though, we mustn't do this because
* StartupCSNLOG hasn't been called yet.
*/
pg_time_t now = (pg_time_t)time(NULL);
int elapsed_secs = now - t_thrd.checkpoint_cxt.last_truncate_log_time;
if (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY &&
(GTM_FREE_MODE || TransactionIdIsNormal(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin))) {
        /*
         * Reduce the frequency of CSN log truncation to lower the probability
         * of lock contention; an incremental checkpoint does not require
         * frequent truncation of the csnlog.
         */
if (doFullCheckpoint || elapsed_secs >= u_sess->attr.attr_storage.fullCheckPointTimeout) {
TransactionId cutoff_xid = GetOldestXmin(NULL);
            /*
             * Since vacuum xid/xmin are not included in oldestxmin, the local
             * oldest active xid may be lower than oldestxmin; to be safe, do
             * not truncate past it.
             */
if (TransactionIdIsNormal(globalXmin) && TransactionIdPrecedes(globalXmin, cutoff_xid)) {
cutoff_xid = globalXmin;
}
TransactionId globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
if (TransactionIdIsNormal(globalRecycleXid) && TransactionIdPrecedes(globalRecycleXid, cutoff_xid)) {
cutoff_xid = globalRecycleXid;
}
TruncateCSNLOG(cutoff_xid);
t_thrd.checkpoint_cxt.last_truncate_log_time = now;
}
}
CheckPointGuts(checkPoint.redo, flags, doFullCheckpoint);
#ifdef ENABLE_MOT
if (doFullCheckpoint) {
CallCheckpointCallback(EVENT_CHECKPOINT_BEGIN_CHECKPOINT, 0);
}
#endif
/*
* Take a snapshot of running transactions and write this to WAL. This
* allows us to reconstruct the state of running transactions during
     * archive recovery, if required. Skip if this info is disabled.
*
* If we are shutting down, or Startup process is completing crash
     * recovery, we don't need to write running xact data.
*/
if (!shutdown && XLogStandbyInfoActive()) {
LogStandbySnapshot();
}
START_CRIT_SECTION();
LWLockAcquire(OidGenLock, LW_SHARED);
checkPoint.nextOid = t_thrd.xact_cxt.ShmemVariableCache->nextOid;
if (!shutdown) {
checkPoint.nextOid += t_thrd.xact_cxt.ShmemVariableCache->oidCount;
}
LWLockRelease(OidGenLock);
/*
* Select point at which we can truncate the log, which we base on the
* prior checkpoint's earliest info.
*/
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, _logSegNo);
checkPoint.remove_seg = 1;
if (_logSegNo && !GetDelayXlogRecycle()) {
KeepLogSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, &_logSegNo, curInsert);
checkPoint.remove_seg = _logSegNo;
_logSegNo--;
        /*
         * Save one more segment to handle a corner case. For example, the
         * standby has replayed to the end of one segment file
         * (000000010000002400000078) and the LSN is exactly 24/79000000. The
         * corresponding slot on the primary is 24/79000000. As a result, if the
         * standby restarts and requests the xlog stream from the latest valid
         * record (23/xxxxxxx), it cannot be found on the primary.
         */
if (_logSegNo > 0) {
_logSegNo--;
}
removexlog = true;
#ifdef ENABLE_DISTRIBUTE_TEST
if (TEST_STUB(DN_PRIMARY_CHECKPOINT_KEEPXLOG, default_error_emit)) {
ereport(get_distribute_test_param()->elevel,
(errmsg("default_error_emit create check point time:%ds, stub_name:%s",
get_distribute_test_param()->sleep_time, get_distribute_test_param()->test_stub_name)));
checkPoint.remove_seg = 1;
}
#endif
}
/*
* Now insert the checkpoint record into XLOG.
*/
XLogBeginInsert();
    /* Use the working version number to choose which checkpoint record format to write. */
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
errno_t rcm = memcpy_s(&checkPointUndo, sizeof(CheckPoint), &checkPoint, sizeof(CheckPoint));
securec_check(rcm, "", "");
if (IsBootstrapProcessingMode() ||
!COMMITSEQNO_IS_COMMITTED(t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo)) {
checkPointUndo.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointUndo.length = (uint64)sizeof(CheckPointUndo);
checkPointUndo.recent_global_xmin = InvalidTransactionId;
checkPointUndo.globalRecycleXid = 0;
} else {
checkPointUndo.next_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
checkPointUndo.length = (uint64)sizeof(CheckPointUndo);
checkPointUndo.recent_global_xmin = t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin;
checkPointUndo.globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
}
XLogRegisterData((char *)(&checkPointUndo), sizeof(checkPointUndo));
} else if (t_thrd.proc->workingVersionNum >= GTMLITE_VERSION_NUM && u_sess->attr.attr_common.upgrade_mode != 1) {
errno_t rcm = memcpy_s(&checkPointPlus, sizeof(CheckPoint), &checkPoint, sizeof(CheckPoint));
securec_check(rcm, "", "");
if (IsBootstrapProcessingMode() ||
!COMMITSEQNO_IS_COMMITTED(t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo)) {
checkPointPlus.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
checkPointPlus.length = (uint64)sizeof(CheckPointPlus);
checkPointPlus.recent_global_xmin = InvalidTransactionId;
} else {
checkPointPlus.next_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
checkPointPlus.length = (uint64)sizeof(CheckPointPlus);
checkPointPlus.recent_global_xmin = t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin;
}
XLogRegisterData((char *)(&checkPointPlus), sizeof(checkPointPlus));
} else if (t_thrd.proc->workingVersionNum > GTM_OLD_VERSION_NUM && u_sess->attr.attr_common.upgrade_mode != 1) {
errno_t rcm = memcpy_s(&checkPointNew, sizeof(CheckPoint), &checkPoint, sizeof(CheckPoint));
securec_check(rcm, "", "");
if (IsBootstrapProcessingMode() ||
!COMMITSEQNO_IS_COMMITTED(t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo)) {
checkPointNew.next_csn = COMMITSEQNO_FIRST_NORMAL + 1;
} else {
checkPointNew.next_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
}
XLogRegisterData((char *)(&checkPointNew), sizeof(checkPointNew));
} else {
XLogRegisterData((char *)(&checkPoint), sizeof(checkPoint));
}
recptr = XLogInsert(RM_XLOG_ID, shutdown ? XLOG_CHECKPOINT_SHUTDOWN : XLOG_CHECKPOINT_ONLINE);
XLogWaitFlush(recptr);
/*
* We mustn't write any new WAL after a shutdown checkpoint, or it will be
* overwritten at next startup. No-one should even try, this just allows
* sanity-checking. In the case of an end-of-recovery checkpoint, we want
* to just temporarily disable writing until the system has exited
* recovery.
*/
if (shutdown) {
if ((unsigned int)flags & CHECKPOINT_END_OF_RECOVERY) {
t_thrd.xlog_cxt.LocalXLogInsertAllowed = -1; /* return to "check" state */
} else {
t_thrd.xlog_cxt.LocalXLogInsertAllowed = 0; /* never again write WAL */
}
}
if (doFullCheckpoint && ENABLE_INCRE_CKPT) {
XLogRecPtr MinRecLSN = ckpt_get_min_rec_lsn();
if (!XLogRecPtrIsInvalid(MinRecLSN) && XLByteLT(MinRecLSN, t_thrd.xlog_cxt.RedoRecPtr)) {
ereport(PANIC, (errmsg("current dirty page list head recLSN %08X/%08X smaller than redo lsn %08X/%08X",
(uint32)(MinRecLSN >> XLOG_LSN_SWAP), (uint32)MinRecLSN,
(uint32)(t_thrd.xlog_cxt.RedoRecPtr >> XLOG_LSN_SWAP),
(uint32)t_thrd.xlog_cxt.RedoRecPtr)));
}
}
/*
* We now have ProcLastRecPtr = start of actual checkpoint record, recptr
* = end of actual checkpoint record.
*/
#ifdef ENABLE_MULTIPLE_NODES
if (shutdown && !XLByteEQ(checkPoint.redo, t_thrd.xlog_cxt.ProcLastRecPtr)) {
ereport(PANIC, (errmsg("concurrent transaction log activity while database system is shutting down")));
}
#else
if (shutdown && !XLByteEQ(checkPoint.redo, t_thrd.xlog_cxt.ProcLastRecPtr) && !XLogStandbyInfoActive()) {
ereport(PANIC, (errmsg("concurrent transaction log activity while database system is shutting down")));
}
#endif
PrintCkpXctlControlFile(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint,
&(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy), t_thrd.xlog_cxt.ProcLastRecPtr,
&checkPoint, t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint, "CreateCheckPoint");
/*
* Update the control file.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (shutdown) {
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED;
}
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = t_thrd.xlog_cxt.ProcLastRecPtr;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point = checkPoint.redo;
/* crash recovery should always recover to the end of WAL */
errorno = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, sizeof(XLogRecPtr), 0, sizeof(XLogRecPtr));
securec_check(errorno, "", "");
ereport(LOG, (errmsg("will update control file (create checkpoint), shutdown:%d", shutdown)));
UpdateControlFile();
LWLockRelease(ControlFileLock);
/* Update shared-memory copy of checkpoint XID/epoch */
{
/* use volatile pointer to prevent code rearrangement */
xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->ckptXid = checkPoint.nextXid;
SpinLockRelease(&xlogctl->info_lck);
}
/*
* We are now done with critical updates; no need for system panic if we
* have trouble while fooling with old log segments.
*/
END_CRIT_SECTION();
SyncPostCheckpoint();
if (!t_thrd.cbm_cxt.XlogCbmSys->firstCPCreated) {
t_thrd.cbm_cxt.XlogCbmSys->firstCPCreated = true;
}
if (doFullCheckpoint || ((unsigned int)flags & CHECKPOINT_IMMEDIATE) ||
elapsed_secs >= u_sess->attr.attr_storage.fullCheckPointTimeout) {
if (g_instance.proc_base->cbmwriterLatch) {
SetLatch(g_instance.proc_base->cbmwriterLatch);
}
}
/*
* Delete old log files (those no longer needed even for previous
* checkpoint or the standbys in XLOG streaming).
*/
if (removexlog) {
RemoveOldXlogFiles(_logSegNo, recptr);
}
/*
* Make more log segments if needed. (Do this after recycling old log
* segments, since that may supply some of the needed files.)
*/
if (!shutdown) {
PreallocXlogFiles(recptr);
}
/* Real work is done, but log and update stats before releasing lock. */
LogCheckpointEnd(false);
TRACE_POSTGRESQL_CHECKPOINT_DONE(t_thrd.xlog_cxt.CheckpointStats->ckpt_bufs_written,
g_instance.attr.attr_storage.NBuffers,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_added,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_removed,
t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled);
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateCheckPoint);
}
/*
* Flush all data in shared memory to disk, and fsync
*
* This is the common code shared between regular checkpoints and
* recovery restartpoints.
*/
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags, bool doFullCheckpoint)
{
if (ENABLE_DSS && IsInitdb &&
g_instance.dms_cxt.SSReformerControl.primaryInstId == INVALID_INSTANCEID) {
SSReadControlFile(REFORM_CTRL_PAGE);
}
if (SS_STANDBY_INST_SKIP_SHARED_FILE) {
return;
}
gstrace_entry(GS_TRC_ID_CheckPointGuts);
CheckPointCLOG();
CheckPointCSNLOG();
CheckPointMultiXact();
CheckPointPredicate();
CheckPointRelationMap();
CheckPointReplicationSlots();
CheckPointSnapBuild();
    /*
     * If enable_incremental_checkpoint is on, there are two scenarios:
     * for an incremental checkpoint, no dirty pages need to be flushed and we
     * do not wait for the pagewriter threads to flush them; for a full
     * checkpoint, the checkpointer thread still flushes no dirty pages itself,
     * but it must wait for the pagewriter threads to flush them.
     */
CheckPointBuffers(flags, doFullCheckpoint); /* performs all required fsyncs */
#ifndef ENABLE_MULTIPLE_NODES
CheckPointReplicationOrigin();
#endif
/* We deliberately delay 2PC checkpointing as long as possible */
CheckPointTwoPhase(checkPointRedo);
undo::CheckPointUndoSystemMeta(checkPointRedo);
gstrace_exit(GS_TRC_ID_CheckPointGuts);
}
void PushRestartPointToQueue(XLogRecPtr recordReadRecPtr, const CheckPoint checkPoint)
{
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
uint loc = 0;
if (!ENABLE_INCRE_CKPT) {
return;
}
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
if (state->end - state->start + 1 >= RESTART_POINT_QUEUE_LEN) {
state->start++;
}
loc = state->end % RESTART_POINT_QUEUE_LEN;
state->ckpt_rec_queue[loc].CkptLSN = recordReadRecPtr;
state->ckpt_rec_queue[loc].checkpoint = checkPoint;
state->end++;
LWLockRelease(state->recovery_queue_lock);
}
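/*
 * Illustrative sketch (not part of the build): the queue above is a ring
 * buffer addressed by two monotonically increasing counters, so no separate
 * count field is needed. A minimal standalone model of the same scheme:
 *
 *   struct Ring {
 *       static const unsigned LEN = 8;   // stands in for RESTART_POINT_QUEUE_LEN
 *       unsigned long start = 0;         // counter of the oldest live entry
 *       unsigned long end = 0;           // counter one past the newest entry
 *       unsigned long slots[LEN];
 *
 *       void push(unsigned long v)
 *       {
 *           if (end - start + 1 >= LEN)
 *               start++;                 // full: logically drop the oldest entry
 *           slots[end % LEN] = v;        // counters wrap via modulo indexing
 *           end++;
 *       }
 *   };
 *
 * Pushing 9 values into an 8-slot ring leaves start == 2, end == 9: the two
 * oldest entries have been dropped, and at most LEN - 1 entries stay live,
 * mirroring the behavior of PushRestartPointToQueue above.
 */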
/*
* Save a checkpoint for recovery restart if appropriate
*
* This function is called each time a checkpoint record is read from XLOG.
* It must determine whether the checkpoint represents a safe restartpoint or
* not. If so, the checkpoint record is stashed in shared memory so that
* CreateRestartPoint can consult it. (Note that the latter function is
* executed by the checkpointer, while this one will be executed by the
* startup process.)
*/
static void RecoveryRestartPoint(const CheckPoint checkPoint, XLogRecPtr recordReadRecPtr)
{
    const uint32 shiftRightLength = 32;
g_instance.comm_cxt.predo_cxt.newestCheckpointLoc = recordReadRecPtr;
const_cast<CheckPoint &>(g_instance.comm_cxt.predo_cxt.newestCheckpoint) = const_cast<CheckPoint &>(checkPoint);
if (!IsRestartPointSafe(checkPoint.redo)) {
return;
}
if (IsExtremeRedo()) {
XLogRecPtr safeCheckPoint = ExtremeGetSafeMinCheckPoint();
if (XLByteEQ(safeCheckPoint, MAX_XLOG_REC_PTR) || XLByteLT(safeCheckPoint, recordReadRecPtr)) {
ereport(WARNING, (errmsg("RecoveryRestartPoint is false at %X/%X,last safe point is %X/%X",
(uint32)(recordReadRecPtr >> 32), (uint32)(recordReadRecPtr),
(uint32)(safeCheckPoint >> 32), (uint32)(safeCheckPoint))));
return;
}
} else if (!parallel_recovery::IsRecoveryRestartPointSafeForWorkers(recordReadRecPtr)) {
ereport(WARNING, (errmsg("RecoveryRestartPointSafe is false at %X/%X",
static_cast<uint32>(recordReadRecPtr >> shitRightLength),
static_cast<uint32>(recordReadRecPtr))));
return;
}
update_dirty_page_queue_rec_lsn(recordReadRecPtr, true);
pg_write_barrier();
PushRestartPointToQueue(recordReadRecPtr, checkPoint);
/*
* Copy the checkpoint record to shared memory, so that checkpointer can
* work out the next time it wants to perform a restartpoint.
*/
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->lastCheckPointRecPtr = recordReadRecPtr;
xlogctl->lastCheckPoint = checkPoint;
SpinLockRelease(&xlogctl->info_lck);
if (t_thrd.xlog_cxt.needImmediateCkp) {
ereport(LOG, (errmsg("RecoveryRestartPoint do ImmediateCkp at %X/%X",
static_cast<uint32>(recordReadRecPtr >> shitRightLength),
static_cast<uint32>(recordReadRecPtr))));
RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);
t_thrd.xlog_cxt.needImmediateCkp = false;
}
}
bool IsRestartPointSafe(const XLogRecPtr checkPoint)
{
/*
* Is it safe to restartpoint? We must ask each of the resource managers
* whether they have any partial state information that might prevent a
* correct restart from this point. If so, we skip this opportunity, but
* return at the next checkpoint record for another try.
*/
for (int rmid = 0; rmid <= RM_MAX_ID; rmid++) {
if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
if (!(RmgrTable[rmid].rm_safe_restartpoint())) {
ereport(WARNING, (errmsg("RM %d not safe to record restart point at %X/%X", rmid,
(uint32)(checkPoint >> 32), (uint32)checkPoint)));
return false;
}
}
/*
* Also refrain from creating a restartpoint if we have seen any
* references to non-existent pages. Restarting recovery from the
* restartpoint would not see the references, so we would lose the
* cross-check that the pages belonged to a relation that was dropped
* later.
*/
if (XLogHaveInvalidPages()) {
ereport(WARNING, (errmsg("could not record restart point at %X/%X because there "
"are unresolved references to invalid pages",
(uint32)(checkPoint >> 32), (uint32)checkPoint)));
return false;
}
return true;
}
void wait_all_dirty_page_flush(int flags, XLogRecPtr redo)
{
    /* need to wait for all dirty pages to finish flushing */
if (ENABLE_INCRE_CKPT) {
g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = redo;
g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_dirty_page_queue_tail();
pg_memory_barrier();
if (get_dirty_page_num() > 0) {
g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
ereport(LOG, (errmsg("CreateRestartPoint, need flush %ld pages.", get_dirty_page_num())));
CheckPointBuffers(flags, true);
}
}
return;
}
bool RecoveryQueueIsEmpty()
{
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
int num;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
num = state->end - state->start;
(void)LWLockRelease(state->recovery_queue_lock);
if (num == 0) {
return true;
} else {
return false;
}
}
/*
 * In the recovery phase, do not push the redo point to the latest position, so
 * as to avoid an I/O peak; instead, calculate the redo point based on the page
 * flushing speed and max_redo_log_size.
 */
const int BYTE_PER_KB = 1024;
XLogRecPtr GetRestartPointInRecovery(CheckPoint *restartCheckPoint)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr restartRecPtr = InvalidXLogRecPtr;
XLogRecPtr curMinRecLSN = ckpt_get_min_rec_lsn();
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
if (XLogRecPtrIsInvalid(curMinRecLSN)) {
/* The dirty page queue is empty, so the redo point can be updated to the latest position. */
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
state->start = state->end > 0 ? state->end - 1 : state->end;
(void)LWLockRelease(state->recovery_queue_lock);
SpinLockAcquire(&xlogctl->info_lck);
restartRecPtr = xlogctl->lastCheckPointRecPtr;
*restartCheckPoint = const_cast<CheckPoint &>(xlogctl->lastCheckPoint);
SpinLockRelease(&xlogctl->info_lck);
Assert(XLByteLE(restartRecPtr, get_dirty_page_queue_rec_lsn()));
} else {
int num = 0;
int loc = 0;
int i = 0;
XLogRecPtr replayLastLSN;
XLogRecPtr targetLSN;
if (RecoveryQueueIsEmpty()) {
return restartRecPtr;
}
SpinLockAcquire(&xlogctl->info_lck);
replayLastLSN = xlogctl->lastCheckPointRecPtr;
SpinLockRelease(&xlogctl->info_lck);
if (replayLastLSN > (uint64)u_sess->attr.attr_storage.max_redo_log_size * BYTE_PER_KB) {
targetLSN = replayLastLSN - (uint64)u_sess->attr.attr_storage.max_redo_log_size * BYTE_PER_KB;
} else {
targetLSN = curMinRecLSN;
}
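        /*
         * Illustrative arithmetic (assumed values, not part of the build):
         * max_redo_log_size is in KB, so with max_redo_log_size = 1048576
         * (1 GB) the byte budget is 1048576 * 1024 = 0x40000000. If the last
         * replayed checkpoint record is at LSN 0x150000000, then
         *   targetLSN = 0x150000000 - 0x40000000 = 0x110000000
         * i.e. the restart point is pushed no closer than about 1 GB behind
         * the replay position, bounding the redo work after a crash.
         */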
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
num = state->end - state->start;
        /*
         * If the pagewriter flushes dirty pages to disk quickly, push the
         * checkpoint location to the position closest to curMinRecLSN.
         */
if (XLByteLE(targetLSN, curMinRecLSN)) {
for (i = 0; i < num; i++) {
loc = (state->start + i) % RESTART_POINT_QUEUE_LEN;
if (state->ckpt_rec_queue[loc].CkptLSN > curMinRecLSN) {
if (i > 0) {
loc = (state->start + i - 1) % RESTART_POINT_QUEUE_LEN;
state->start = state->start + i - 1;
}
restartRecPtr = state->ckpt_rec_queue[loc].CkptLSN;
*restartCheckPoint = state->ckpt_rec_queue[loc].checkpoint;
break;
}
}
} else {
            /* In other cases, push the checkpoint location to the position closest to targetLSN. */
for (i = 0; i < num; i++) {
loc = (state->start + i) % RESTART_POINT_QUEUE_LEN;
if (state->ckpt_rec_queue[loc].CkptLSN > targetLSN) {
if (i > 0) {
uint64 gap = state->ckpt_rec_queue[loc].CkptLSN - targetLSN;
int prevLoc = (state->start + i - 1) % RESTART_POINT_QUEUE_LEN;
if (targetLSN - state->ckpt_rec_queue[prevLoc].CkptLSN < gap) {
restartRecPtr = state->ckpt_rec_queue[prevLoc].CkptLSN;
*restartCheckPoint = state->ckpt_rec_queue[prevLoc].checkpoint;
state->start = state->start + i - 1;
} else {
restartRecPtr = state->ckpt_rec_queue[loc].CkptLSN;
*restartCheckPoint = state->ckpt_rec_queue[loc].checkpoint;
state->start = state->start + i;
}
} else {
restartRecPtr = state->ckpt_rec_queue[loc].CkptLSN;
*restartCheckPoint = state->ckpt_rec_queue[loc].checkpoint;
}
break;
}
}
}
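        /*
         * Illustrative sketch (made-up LSNs): with queued checkpoint LSNs
         * [100, 200, 300] and targetLSN = 240, the scan above stops at 300
         * (the first entry above the target). gap = 300 - 240 = 60, while
         * 240 - 200 = 40 < 60, so the previous entry at 200 is chosen: the
         * queue entry nearest to targetLSN wins, whichever side it lies on.
         */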
if (XLogRecPtrIsInvalid(restartRecPtr) && num > 0) {
loc = (state->end - 1) % RESTART_POINT_QUEUE_LEN;
restartRecPtr = state->ckpt_rec_queue[loc].CkptLSN;
*restartCheckPoint = state->ckpt_rec_queue[loc].checkpoint;
state->start = state->end - 1;
}
(void)LWLockRelease(state->recovery_queue_lock);
}
return restartRecPtr;
}
/*
* Establish a restartpoint if possible.
*
* This is similar to CreateCheckPoint, but is used during WAL recovery
* to establish a point from which recovery can roll forward without
* replaying the entire recovery log.
*
* Returns true if a new restartpoint was established. We can only establish
* a restartpoint if we have replayed a safe checkpoint record since last
* restartpoint.
*/
bool CreateRestartPoint(int flags)
{
XLogRecPtr lastCheckPointRecPtr;
CheckPoint lastCheckPoint;
XLogSegNo _logSegNo;
TimestampTz xtime;
int32 lastlrc = 0;
errno_t errorno = EOK;
bool recoveryInProgress = true;
bool doFullCkpt = !ENABLE_INCRE_CKPT;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
/*
* Acquire CheckpointLock to ensure only one restartpoint or checkpoint
* happens at a time.
*/
gstrace_entry(GS_TRC_ID_CreateRestartPoint);
LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
recoveryInProgress = RecoveryInProgress();
/*
* Check that we're still in recovery mode. It's ok if we exit recovery
* mode after this check, the restart point is valid anyway.
*/
if (!recoveryInProgress) {
ereport(DEBUG2, (errmsg("skipping restartpoint, recovery has already ended")));
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return false;
}
if (doFullCkpt ||
((unsigned int)flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE))) {
doFullCkpt = true;
if (ENABLE_INCRE_CKPT) {
RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
(void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
state->start = state->end > 0 ? state->end - 1 : state->end;
(void)LWLockRelease(state->recovery_queue_lock);
}
SpinLockAcquire(&xlogctl->info_lck);
lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
lastCheckPoint = xlogctl->lastCheckPoint;
SpinLockRelease(&xlogctl->info_lck);
} else {
lastCheckPointRecPtr = GetRestartPointInRecovery(&lastCheckPoint);
}
/*
* If the last checkpoint record we've replayed is already our last
* restartpoint, we can't perform a new restart point. We still update
* minRecoveryPoint in that case, so that if this is a shutdown restart
* point, we won't start up earlier than before. That's not strictly
* necessary, but when hot standby is enabled, it would be rather weird if
* the database opened up for read-only connections at a point-in-time
* before the last shutdown. Such time travel is still possible in case of
* immediate shutdown, though.
*
* We don't explicitly advance minRecoveryPoint when we do create a
* restartpoint. It's assumed that flushing the buffers will do that as a
* side-effect.
*/
if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
XLByteLE(lastCheckPoint.redo, t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo)) {
#ifdef USE_ASSERT_CHECKING
int mode = LOG;
#else
int mode = DEBUG2;
#endif
ereport(mode, (errmsg("skipping restartpoint, already performed at %X/%X", (uint32)(lastCheckPoint.redo >> 32),
(uint32)lastCheckPoint.redo)));
UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
if ((unsigned int)flags & CHECKPOINT_IS_SHUTDOWN) {
wait_all_dirty_page_flush(flags, lastCheckPoint.redo);
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return false;
}
PrintCkpXctlControlFile(t_thrd.shemem_ptr_cxt.ControlFile->checkPoint,
&(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy), lastCheckPointRecPtr, &lastCheckPoint,
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint, "CreateRestartPoint");
#ifdef ENABLE_MOT
CallCheckpointCallback(EVENT_CHECKPOINT_CREATE_SNAPSHOT, 0);
#endif
/*
* Update the shared RedoRecPtr so that the startup process can calculate
* the number of segments replayed since last restartpoint, and request a
* restartpoint if it exceeds checkpoint_segments.
*
* Like in CreateCheckPoint(), hold off insertions to update it, although
* during recovery this is just pro forma, because no WAL insertions are
* happening.
*/
StartSuspendWalInsert(&lastlrc);
xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
#ifdef ENABLE_MOT
CallCheckpointCallback(EVENT_CHECKPOINT_SNAPSHOT_READY, lastCheckPointRecPtr);
#endif
StopSuspendWalInsert(lastlrc);
/* Also update the info_lck-protected copy */
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->RedoRecPtr = lastCheckPoint.redo;
SpinLockRelease(&xlogctl->info_lck);
/*
* Prepare to accumulate statistics.
*
* Note: because it is possible for log_checkpoints to change while a
* checkpoint proceeds, we always accumulate stats, even if
* log_checkpoints is currently off.
*/
errorno = memset_s(t_thrd.xlog_cxt.CheckpointStats, sizeof(CheckpointStatsData), 0, sizeof(CheckpointStatsData));
securec_check(errorno, "", "");
t_thrd.xlog_cxt.CheckpointStats->ckpt_start_t = GetCurrentTimestamp();
if (u_sess->attr.attr_common.log_checkpoints) {
LogCheckpointStart((unsigned int)flags, true);
}
if (ENABLE_INCRE_CKPT && doFullCkpt) {
g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = lastCheckPoint.redo;
g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_dirty_page_queue_tail();
pg_memory_barrier();
if (get_dirty_page_num() > 0) {
g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
}
ereport(LOG, (errmsg("CreateRestartPoint, need flush %ld pages.", get_dirty_page_num())));
} else if (ENABLE_INCRE_CKPT) {
g_instance.ckpt_cxt_ctl->full_ckpt_redo_ptr = lastCheckPoint.redo;
(void)LWLockAcquire(g_instance.ckpt_cxt_ctl->prune_queue_lock, LW_EXCLUSIVE);
g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc = get_loc_for_lsn(lastCheckPointRecPtr);
pg_memory_barrier();
LWLockRelease(g_instance.ckpt_cxt_ctl->prune_queue_lock);
uint64 head = pg_atomic_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_head);
int64 need_flush_num = g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc > head
? g_instance.ckpt_cxt_ctl->full_ckpt_expected_flush_loc - head
: 0;
if (need_flush_num > 0) {
g_instance.ckpt_cxt_ctl->flush_all_dirty_page = true;
}
ereport(LOG, (errmsg("CreateRestartPoint, need flush %ld pages", need_flush_num)));
}
CheckPointGuts(lastCheckPoint.redo, flags, true);
#ifdef ENABLE_MOT
CallCheckpointCallback(EVENT_CHECKPOINT_BEGIN_CHECKPOINT, 0);
#endif
/*
* Select point at which we can truncate the xlog, which we base on the
* prior checkpoint's earliest info.
*/
XLByteToSeg(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, _logSegNo);
if (ENABLE_INCRE_CKPT) {
XLogRecPtr MinRecLSN = ckpt_get_min_rec_lsn();
if (!XLogRecPtrIsInvalid(MinRecLSN) && XLByteLT(MinRecLSN, lastCheckPointRecPtr)) {
ereport(WARNING, (errmsg("current dirty page list head recLSN %08X/%08X smaller than lsn %08X/%08X",
(uint32)(MinRecLSN >> XLOG_LSN_SWAP), (uint32)MinRecLSN,
(uint32)(lastCheckPointRecPtr >> XLOG_LSN_SWAP), (uint32)lastCheckPointRecPtr)));
LWLockRelease(CheckpointLock);
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return false;
}
}
/*
* Update pg_control, using current time. Check that it still shows
* IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
* this is a quick hack to make sure nothing really bad happens if somehow
* we get here after the end-of-recovery checkpoint.
*/
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (t_thrd.shemem_ptr_cxt.ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo, lastCheckPoint.redo)) {
t_thrd.shemem_ptr_cxt.ControlFile->prevCheckPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
t_thrd.shemem_ptr_cxt.ControlFile->checkPoint = lastCheckPointRecPtr;
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy = lastCheckPoint;
t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
g_instance.ckpt_cxt_ctl->ckpt_view.ckpt_current_redo_point = lastCheckPoint.redo;
if ((unsigned int)flags & CHECKPOINT_IS_SHUTDOWN) {
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
}
UpdateControlFile();
}
LWLockRelease(ControlFileLock);
/* wake up the cbmwriter */
if (g_instance.proc_base->cbmwriterLatch) {
SetLatch(g_instance.proc_base->cbmwriterLatch);
}
/*
* Delete old log files (those no longer needed even for previous
* checkpoint/restartpoint) to prevent the disk holding the xlog from
* growing full.
*/
if (_logSegNo) {
XLogRecPtr endptr;
/* Get the current (or recent) end of xlog */
endptr = GetStandbyFlushRecPtr(NULL);
/* The third param is not used */
KeepLogSeg(endptr, &_logSegNo, InvalidXLogRecPtr);
_logSegNo--;
        /*
         * Save one more segment to handle a corner case. For example, the
         * standby has replayed to the end of one segment file
         * (000000010000002400000078) and the LSN is exactly 24/79000000. The
         * corresponding slot on the primary is 24/79000000. As a result, if the
         * standby restarts and requests the xlog stream from the latest valid
         * record (23/xxxxxxx), it cannot be found on the primary.
         */
if (_logSegNo > 0) {
_logSegNo--;
}
/*
* Update ThisTimeLineID to the timeline we're currently replaying,
* so that we install any recycled segments on that timeline.
*
* There is no guarantee that the WAL segments will be useful on the
* current timeline; if recovery proceeds to a new timeline right
* after this, the pre-allocated WAL segments on this timeline will
* not be used, and will go wasted until recycled on the next
* restartpoint. We'll live with that.
*/
SpinLockAcquire(&xlogctl->info_lck);
t_thrd.xlog_cxt.ThisTimeLineID = t_thrd.shemem_ptr_cxt.XLogCtl->lastCheckPoint.ThisTimeLineID;
SpinLockRelease(&xlogctl->info_lck);
RemoveOldXlogFiles(_logSegNo, endptr);
/*
* Make more log segments if needed. (Do this after recycling old log
* segments, since that may supply some of the needed files.)
*/
PreallocXlogFiles(endptr);
}
/*
* Truncate pg_csnlog if possible. We can throw away all data before
* the oldest XMIN of any running transaction. No future transaction will
* attempt to reference any pg_csnlog entry older than that (see Asserts
* in csnlog.c). When hot standby is disabled, though, we mustn't do
* this because StartupCSNLOG hasn't been called yet.
*/
if (g_instance.attr.attr_storage.EnableHotStandby && !IS_MULTI_DISASTER_RECOVER_MODE) {
pg_time_t now;
int elapsed_secs;
now = (pg_time_t)time(NULL);
elapsed_secs = now - t_thrd.checkpoint_cxt.last_truncate_log_time;
        /*
         * Reduce the frequency of CSN log truncation to lower the probability
         * of lock contention; an incremental checkpoint does not require
         * frequent truncation of the csnlog.
         */
if (!ENABLE_INCRE_CKPT || elapsed_secs >= u_sess->attr.attr_storage.fullCheckPointTimeout) {
TransactionId globalXmin = InvalidTransactionId;
(void)GetOldestActiveTransactionId(&globalXmin);
TransactionId cutoffXid = GetOldestXmin(NULL);
if (TransactionIdIsNormal(globalXmin) && TransactionIdPrecedes(globalXmin, cutoffXid)) {
cutoffXid = globalXmin;
}
TransactionId globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
if (TransactionIdIsNormal(globalRecycleXid) && TransactionIdPrecedes(globalRecycleXid, cutoffXid)) {
cutoffXid = globalRecycleXid;
}
TruncateCSNLOG(cutoffXid);
t_thrd.checkpoint_cxt.last_truncate_log_time = now;
}
}
/* Real work is done, but log and update before releasing lock. */
LogCheckpointEnd(true);
xtime = GetLatestXTime();
ereport((u_sess->attr.attr_common.log_checkpoints ? LOG : DEBUG2),
(errmsg("recovery restart point at %X/%X", (uint32)(lastCheckPoint.redo >> 32),
(uint32)lastCheckPoint.redo),
xtime ? errdetail("last completed transaction was at log time %s", timestamptz_to_str(xtime)) : 0));
LWLockRelease(CheckpointLock);
/*
* Finally, execute archive_cleanup_command, if any.
*/
if (t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand[0]) {
ExecuteRecoveryCommand(t_thrd.shemem_ptr_cxt.XLogCtl->archiveCleanupCommand, "archive_cleanup_command", false);
}
gstrace_exit(GS_TRC_ID_CreateRestartPoint);
return true;
}
static XLogSegNo CalcRecycleSegNo(XLogRecPtr curInsert, XLogRecPtr quorumMinRequired, XLogRecPtr minToolsRequired)
{
/* the unit of max_size_for_xlog_prune is KB */
uint64 maxKeepSize = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
XLogSegNo SegNoCanRecycled = 1;
if (XLByteLT(maxKeepSize + XLogSegSize, curInsert)) {
XLByteToSeg(curInsert - maxKeepSize, SegNoCanRecycled);
}
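    /*
     * Illustrative arithmetic (assumed 16 MB segments, not part of the build):
     * max_size_for_xlog_prune is in KB, so a setting of 1048576 gives
     * maxKeepSize = 1048576 << 10 = 1 GiB. With curInsert = 0x80000000 (2 GiB),
     * maxKeepSize + XLogSegSize < curInsert holds, so segments are recyclable
     * up to the one containing curInsert - maxKeepSize = 0x40000000, i.e.
     * roughly the most recent 1 GiB of WAL is kept.
     */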
/*
     * For asynchronous replication, we do not care about connected sync standbys,
     * since a standby build won't block xact commit on the master.
*/
if (u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
if (!XLByteEQ(quorumMinRequired, InvalidXLogRecPtr)) {
XLogSegNo quorumMinSegNo;
XLByteToSeg(quorumMinRequired, quorumMinSegNo);
if (quorumMinSegNo < SegNoCanRecycled) {
SegNoCanRecycled = quorumMinSegNo;
}
}
}
if (!XLByteEQ(minToolsRequired, InvalidXLogRecPtr)) {
XLogSegNo minToolsSegNo;
XLByteToSeg(minToolsRequired, minToolsSegNo);
if (minToolsSegNo < SegNoCanRecycled && minToolsSegNo > 0) {
SegNoCanRecycled = minToolsSegNo;
}
}
ereport(LOG, (errmsg("keep xlog segments after %lu, current wal size %lu(Bytes), curInsert %lu, "
"quorumMinRequired %lu, minToolsRequired %lu", SegNoCanRecycled, maxKeepSize, curInsert, quorumMinRequired,
minToolsRequired)));
return SegNoCanRecycled;
}
static XLogSegNo CalcRecycleSegNoForHadrMainStandby(XLogRecPtr curFlush, XLogSegNo segno, XLogRecPtr minRequired)
{
XLogSegNo SegNoCanRecycled = segno;
if (g_instance.attr.attr_storage.max_replication_slots > 0 && !XLByteEQ(minRequired, InvalidXLogRecPtr)) {
XLogSegNo slotSegNo;
XLByteToSeg(minRequired, slotSegNo);
if (slotSegNo <= 0) {
            /* segno = 1 means all files should be kept */
SegNoCanRecycled = 1;
ereport(LOG, (errmsg("main standby keep all the xlog segments, because the minimal replication slot segno "
"is less than or equal to zero")));
} else if (slotSegNo < SegNoCanRecycled) {
SegNoCanRecycled = slotSegNo;
}
}
uint64 maxKeepSize = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
if (WalSndInProgress(SNDROLE_PRIMARY_BUILDSTANDBY)) {
SegNoCanRecycled = 1;
ereport(LOG, (errmsg("keep all the xlog segments in Main standby, "
"because there is a full standby building processing.")));
} else if (!WalSndAllInProgressForMainStandby(SNDROLE_PRIMARY_STANDBY)) {
if (XLByteLT(maxKeepSize + XLogSegSize, curFlush)) {
XLByteToSeg(curFlush - maxKeepSize, SegNoCanRecycled);
} else {
SegNoCanRecycled = 1;
ereport(LOG, (errmsg("keep all the xlog segments in Main standby, "
"because there is a Cascade standby offline.")));
}
}
return SegNoCanRecycled;
}
static XLogSegNo CalcRecycleArchiveSegNo()
{
XLogRecPtr min_required = InvalidXLogRecPtr;
XLogSegNo min_required_segno;
for (int i = 0; i < g_instance.attr.attr_storage.max_replication_slots; i++) {
ReplicationSlot *s = &t_thrd.slot_cxt.ReplicationSlotCtl->replication_slots[i];
volatile ReplicationSlot *vslot = s;
SpinLockAcquire(&s->mutex);
XLogRecPtr restart_lsn;
if (!s->in_use) {
SpinLockRelease(&s->mutex);
continue;
}
restart_lsn = vslot->data.restart_lsn;
if (s->extra_content == NULL) {
if ((!XLByteEQ(restart_lsn, InvalidXLogRecPtr)) &&
(XLByteEQ(min_required, InvalidXLogRecPtr) || XLByteLT(restart_lsn, min_required))) {
min_required = restart_lsn;
}
}
SpinLockRelease(&s->mutex);
}
XLByteToSeg(min_required, min_required_segno);
return min_required_segno;
}
static XLogSegNo CalculateCNRecycleSegNoForStreamingHadr(XLogRecPtr curInsert, XLogSegNo logSegNo,
XLogRecPtr minToolsRequired)
{
XLogSegNo slotSegNo = logSegNo;
uint64 maxKeepSize = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
if (WalSndInProgress(SNDROLE_PRIMARY_BUILDSTANDBY)) {
slotSegNo = 1;
ereport(LOG, (errmsg("keep all the xlog segments in Main Coordinator, "
"because there is a full building processing.")));
} else if (!WalSndAllInProgressForMainStandby(SNDROLE_PRIMARY_STANDBY)) {
if (XLByteLT(maxKeepSize + XLogSegSize, curInsert)) {
XLByteToSeg(curInsert - maxKeepSize, slotSegNo);
} else {
slotSegNo = 1;
ereport(LOG, (errmsg("keep all the xlog segments in Main Coordinator, "
"because there is a standby Coordinator offline.")));
}
}
if (!XLByteEQ(minToolsRequired, InvalidXLogRecPtr)) {
XLogSegNo minToolsSegNo;
XLByteToSeg(minToolsRequired, minToolsSegNo);
if (minToolsSegNo < slotSegNo) {
slotSegNo = minToolsSegNo;
}
}
if (slotSegNo < logSegNo) {
return slotSegNo;
} else {
return logSegNo;
}
}
/*
* Retreat *logSegNo to the last segment that we need to retain because of
* either wal_keep_segments or replication slots.
*
* This is calculated by subtracting wal_keep_segments from the given xlog
* location, recptr and by making sure that that result is below the
* requirement of replication slots.
*
* DCF feature: Replication slot is no longer needed as consensus of XLog
* is achieved in DCF. Rather than WalSnd, XLog data are replicated via DCF.
*/
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo, XLogRecPtr curInsert)
{
XLogSegNo segno;
XLogSegNo wal_keep_segno;
XLogRecPtr keep;
XLogRecPtr xlogcopystartptr;
ReplicationSlotState repl_slot_state;
gstrace_entry(GS_TRC_ID_KeepLogSeg);
XLByteToSeg(recptr, segno);
/* avoid underflow, don't go below 1 */
if (segno <= (uint64)(uint32)XLogSegmentsNum(u_sess->attr.attr_storage.wal_keep_segments)) {
        /* segno = 1 means all files should be kept */
ereport(LOG, (errmsg("keep all the xlog segments, because current segno = %lu, "
"less than wal_keep_segments = %d", segno,
(int)XLogSegmentsNum(u_sess->attr.attr_storage.wal_keep_segments))));
segno = 1;
} else {
segno = segno - (uint32)XLogSegmentsNum(u_sess->attr.attr_storage.wal_keep_segments);
}
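    /*
     * Illustrative sketch (made-up numbers): if recptr falls in segment 100
     * and wal_keep_segments corresponds to 16 segments, segno retreats to 84,
     * so the most recent 16 segments survive recycling. The checks below may
     * lower segno further (replication slots, a running full build, CBM
     * tracking, ...); the final value only ever shrinks *logSegNo.
     */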
wal_keep_segno = segno;
ReplicationSlotsComputeRequiredLSN(&repl_slot_state);
keep = repl_slot_state.min_required;
ReplicationSlotReportRestartLSN();
/* then check whether slots limit removal further */
if (g_instance.attr.attr_storage.max_replication_slots > 0 && !XLByteEQ(keep, InvalidXLogRecPtr)) {
XLogSegNo slotSegNo;
XLByteToSeg(keep, slotSegNo);
if (slotSegNo <= 0) {
            /* segno = 1 means all files should be kept */
segno = 1;
ereport(LOG, (errmsg("keep all the xlog segments, because the minimal replication slot segno "
"is less than or equal to zero")));
} else if (slotSegNo < segno) {
segno = slotSegNo;
}
}
LWLockAcquire(FullBuildXlogCopyStartPtrLock, LW_SHARED);
xlogcopystartptr = XlogCopyStartPtr;
LWLockRelease(FullBuildXlogCopyStartPtrLock);
if (!XLByteEQ(xlogcopystartptr, InvalidXLogRecPtr)) {
XLogSegNo slotSegNo;
XLByteToSeg(xlogcopystartptr, slotSegNo);
if (slotSegNo <= 0) {
            /* segno = 1 means all files should be kept */
segno = 1;
ereport(LOG, (errmsg("keep all the xlog segments, because there is a full-build task in the backend, "
"and start segno is less than or equal to zero")));
} else if (slotSegNo < segno) {
segno = slotSegNo;
}
}
load_server_mode();
    /* In streaming hadr mode, we need to consider how to recycle xlog for a Coordinator in the main cluster with a standby. */
if (t_thrd.xlog_cxt.server_mode == NORMAL_MODE && IS_PGXC_COORDINATOR) {
XLogSegNo mainCNSegNo = CalculateCNRecycleSegNoForStreamingHadr(curInsert, segno,
repl_slot_state.min_tools_required);
if (mainCNSegNo < segno && mainCNSegNo > 0) {
segno = mainCNSegNo;
}
}
    /*
     * In primary mode, we should do additional checks.
     *
     * 1. When the dn is in build (full or incremental), just keep the log.
     * 2. When not all standbys are alive in multi-standby mode, further check
     *    enable_xlog_prune. If it is false, just keep the log; otherwise, keep
     *    the log only if the WAL size is less than the configured limit.
     * 3. When the standby is not alive in dummy-standby mode, just keep the log.
     * 4. Notify the users if a slot is invalid.
     *
     * While DMS and DSS are enabled, t_thrd.xlog_cxt.server_mode can only be
     * NORMAL_MODE, so we do the additional checks for the DMS and DSS case
     * as well.
     */
if (!ENABLE_DMS && t_thrd.xlog_cxt.server_mode == PRIMARY_MODE) {
if (WalSndInProgress(SNDROLE_PRIMARY_BUILDSTANDBY) ||
pg_atomic_read_u32(&g_instance.comm_cxt.current_gsrewind_count) > 0) {
            /* segno = 1 means all files should be kept */
segno = 1;
ereport(LOG, (errmsg("keep all the xlog segments, because there is a build task in the backend, "
"and gs_rewind count is more than zero")));
} else if (IS_DN_MULTI_STANDYS_MODE() && !WalSndAllInProgress(SNDROLE_PRIMARY_STANDBY) &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
if (!u_sess->attr.attr_storage.enable_xlog_prune) {
                /* segno = 1 means all files should be kept */
segno = 1;
ereport(LOG, (errmsg("keep all the xlog segments, because the value of enable_xlog_prune(GUC) "
"is false")));
} else {
segno = CalcRecycleSegNo(curInsert, repl_slot_state.quorum_min_required,
repl_slot_state.min_tools_required);
}
} else if (IS_DN_DUMMY_STANDYS_MODE() && !WalSndInProgress(SNDROLE_PRIMARY_STANDBY)) {
            /* segno = 1 means all files should be kept */
segno = 1;
ereport(LOG, (errmsg("keep all the xlog segments, because not all the wal senders are "
"in the role PRIMARY_STANDBY")));
} else {
uint64 maxKeepSize = ((uint64)u_sess->attr.attr_storage.max_size_for_xlog_prune << 10);
XLogSegNo archiveSlotSegNo;
XLByteToSeg(repl_slot_state.min_archive_slot_required, archiveSlotSegNo);
if (!XLByteEQ(repl_slot_state.min_archive_slot_required, InvalidXLogRecPtr) &&
(XLByteLT(maxKeepSize + XLogSegSize, (curInsert - repl_slot_state.min_archive_slot_required)) &&
archiveSlotSegNo == segno)) {
segno = CalcRecycleArchiveSegNo();
g_instance.roach_cxt.isXLogForceRecycled = true;
ereport(LOG, (errmsg("force recycle archiving xlog, because archive required xlog keep segement size "
"is bigger than max keep size: %lu", maxKeepSize)));
}
}
if (XLByteEQ(keep, InvalidXLogRecPtr) && repl_slot_state.exist_in_use &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
/* there are slots and lsn is invalid, we keep it */
segno = 1;
ereport(WARNING, (errmsg("invalid replication slot recptr"),
errhint("Check slot configuration or setup standby/secondary")));
}
/* take wal_keep_segments into consideration */
if (wal_keep_segno < segno) {
segno = wal_keep_segno;
}
}
if (u_sess->attr.attr_storage.enable_cbm_tracking) {
XLogSegNo cbm_tracked_segno;
XLByteToSeg(GetCBMTrackedLSN(), cbm_tracked_segno);
if (cbm_tracked_segno < segno && cbm_tracked_segno > 0) {
segno = cbm_tracked_segno;
}
}
if (t_thrd.xlog_cxt.server_mode != PRIMARY_MODE && t_thrd.xlog_cxt.server_mode != NORMAL_MODE) {
LWLockAcquire(XlogRemoveSegLock, LW_SHARED);
if (XlogRemoveSegPrimary < segno && XlogRemoveSegPrimary > 0) {
segno = XlogRemoveSegPrimary;
}
LWLockRelease(XlogRemoveSegLock);
}
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE && t_thrd.xlog_cxt.is_hadr_main_standby &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
XLogSegNo mainStandbySegNo = CalcRecycleSegNoForHadrMainStandby(recptr, segno, repl_slot_state.min_required);
if (mainStandbySegNo < segno && mainStandbySegNo > 0) {
segno = mainStandbySegNo;
}
}
/* don't delete WAL segments newer than the calculated segment */
if (segno < *logSegNo && segno > 0) {
*logSegNo = segno;
}
gstrace_exit(GS_TRC_ID_KeepLogSeg);
}
/*
* Write a NEXTOID log record
*/
void XLogPutNextOid(Oid nextOid)
{
XLogBeginInsert();
XLogRegisterData((char *)(&nextOid), sizeof(Oid));
(void)XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
/*
* We need not flush the NEXTOID record immediately, because any of the
* just-allocated OIDs could only reach disk as part of a tuple insert or
* update that would have its own XLOG record that must follow the NEXTOID
* record. Therefore, the standard buffer LSN interlock applied to those
* records will ensure no such OID reaches disk before the NEXTOID record
* does.
*
* Note, however, that the above statement only covers state "within" the
* database. When we use a generated OID as a file or directory name, we
* are in a sense violating the basic WAL rule, because that filesystem
* change may reach disk before the NEXTOID WAL record does. The impact
* of this is that if a database crash occurs immediately afterward, we
* might after restart re-generate the same OID and find that it conflicts
* with the leftover file or directory. But since for safety's sake we
* always loop until finding a nonconflicting filename, this poses no real
* problem in practice. See pgsql-hackers discussion 27-Sep-2006.
*/
}
/*
* Write an XLOG SWITCH record.
*
* Here we just blindly issue an XLogInsert request for the record.
* All the magic happens inside XLogInsert.
*
* The return value is either the end+1 address of the switch record,
* or the end+1 address of the prior segment if we did not need to
* write a switch record because we are already at segment start.
*/
XLogRecPtr RequestXLogSwitch(void)
{
/* XLOG SWITCH has no data */
XLogBeginInsert();
return XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
}
/*
* Write a RESTORE POINT record
*/
XLogRecPtr XLogRestorePoint(const char *rpName)
{
XLogRecPtr RecPtr;
xl_restore_point xlrec;
errno_t retcode;
xlrec.rp_time = GetCurrentTimestamp();
retcode = strncpy_s(xlrec.rp_name, MAXFNAMELEN, rpName, strlen(rpName));
securec_check(retcode, "\0", "\0");
XLogBeginInsert();
XLogRegisterData((char *)&xlrec, sizeof(xl_restore_point));
RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
ereport(LOG, (errmsg("restore point \"%s\" created at %X/%X", rpName, (uint32)(RecPtr >> 32), (uint32)RecPtr)));
return RecPtr;
}
/*
* Check if any of the GUC parameters that are critical for hot standby
* have changed, and update the value in pg_control file if necessary.
*/
static void XLogReportParameters(void)
{
if (g_instance.attr.attr_storage.wal_level != t_thrd.shemem_ptr_cxt.ControlFile->wal_level ||
g_instance.shmem_cxt.MaxConnections != t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections ||
g_instance.attr.attr_storage.max_prepared_xacts != t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts ||
g_instance.attr.attr_storage.max_locks_per_xact != t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact) {
/*
* The change in number of backend slots doesn't need to be WAL-logged
* if archiving is not enabled, as you can't start archive recovery
* with wal_level=minimal anyway. We don't really care about the
* values in pg_control either if wal_level=minimal, but it seems better
* to keep them up-to-date to avoid confusion.
*/
if (g_instance.attr.attr_storage.wal_level != t_thrd.shemem_ptr_cxt.ControlFile->wal_level || XLogIsNeeded()) {
xl_parameter_change xlrec;
XLogRecPtr recptr;
xlrec.MaxConnections = g_instance.shmem_cxt.MaxConnections;
xlrec.max_prepared_xacts = g_instance.attr.attr_storage.max_prepared_xacts;
xlrec.max_locks_per_xact = g_instance.attr.attr_storage.max_locks_per_xact;
xlrec.wal_level = g_instance.attr.attr_storage.wal_level;
XLogBeginInsert();
XLogRegisterData((char *)&xlrec, sizeof(xlrec));
recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
XLogWaitFlush(recptr);
}
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections = g_instance.shmem_cxt.MaxConnections;
t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts = g_instance.attr.attr_storage.max_prepared_xacts;
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact = g_instance.attr.attr_storage.max_locks_per_xact;
t_thrd.shemem_ptr_cxt.ControlFile->wal_level = g_instance.attr.attr_storage.wal_level;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
}
/*
* Update full_page_writes in shared memory, and write an
* XLOG_FPW_CHANGE record if necessary.
*
* Note: this function assumes there is no other process running
* concurrently that could update it.
*/
void UpdateFullPageWrites(void)
{
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
int32 lastlrc = 0;
/*
* Do nothing if full_page_writes has not been changed.
*
* It's safe to check the shared full_page_writes without the lock,
* because we assume that there is no concurrently running process which
* can update it.
*/
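/*
* With incremental checkpoint enabled, torn pages are guarded by the
* double-write mechanism instead, so full_page_writes is forced off.
*/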
if (ENABLE_INCRE_CKPT) {
u_sess->attr.attr_storage.fullPageWrites = false;
}
if (u_sess->attr.attr_storage.fullPageWrites == Insert->fullPageWrites) {
return;
}
START_CRIT_SECTION();
/*
* It's always safe to take full page images, even when not strictly
* required, but not the other way round. So if we're setting full_page_writes
* to true, first set it true and then write the WAL record. If we're
* setting it to false, first write the WAL record and then set the global
* flag.
*/
if (u_sess->attr.attr_storage.fullPageWrites) {
StartSuspendWalInsert(&lastlrc);
Insert->fullPageWrites = true;
StopSuspendWalInsert(lastlrc);
}
/*
* Write an XLOG_FPW_CHANGE record. This allows us to keep track of
* full_page_writes during archive recovery, if required.
*/
if (XLogStandbyInfoActive() && !RecoveryInProgress()) {
XLogBeginInsert();
XLogRegisterData((char *)(&u_sess->attr.attr_storage.fullPageWrites), sizeof(bool));
(void)XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
}
if (!u_sess->attr.attr_storage.fullPageWrites) {
StartSuspendWalInsert(&lastlrc);
Insert->fullPageWrites = false;
StopSuspendWalInsert(lastlrc);
}
END_CRIT_SECTION();
}
/* Return true if the given record is a checkpoint record (shutdown or online). */
bool IsCheckPoint(XLogReaderState *record)
{
uint8 info;
RmgrId rmid;
info = XLogRecGetInfo(record) & (~XLR_INFO_MASK);
rmid = XLogRecGetRmid(record);
return rmid == RM_XLOG_ID && (info == XLOG_CHECKPOINT_SHUTDOWN || info == XLOG_CHECKPOINT_ONLINE);
}
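/* Same check as above, but for an already-parsed record state. */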
bool IsCheckPoint(const XLogRecParseState *parseState)
{
uint8 info = parseState->blockparse.blockhead.xl_info & (~XLR_INFO_MASK);
RmgrId rmid = parseState->blockparse.blockhead.xl_rmid;
return rmid == RM_XLOG_ID && (info == XLOG_CHECKPOINT_SHUTDOWN || info == XLOG_CHECKPOINT_ONLINE);
}
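/* Only shutdown checkpoints may carry a timeline update. */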
bool HasTimelineUpdate(XLogReaderState *record)
{
uint8 info;
int rmid;
info = XLogRecGetInfo(record) & (~XLR_INFO_MASK);
rmid = XLogRecGetRmid(record);
return (rmid == RM_XLOG_ID && info == XLOG_CHECKPOINT_SHUTDOWN);
}
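/* Apply the timeline change recorded in a shutdown checkpoint; PANIC on an unexpected TLI. */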
void UpdateTimeline(CheckPoint *checkPoint)
{
if (checkPoint->ThisTimeLineID != t_thrd.xlog_cxt.ThisTimeLineID) {
if (checkPoint->ThisTimeLineID < t_thrd.xlog_cxt.ThisTimeLineID ||
!list_member_int(t_thrd.xlog_cxt.expectedTLIs, (int)checkPoint->ThisTimeLineID))
ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
checkPoint->ThisTimeLineID, t_thrd.xlog_cxt.ThisTimeLineID)));
/* Following WAL records should be run with new TLI */
t_thrd.xlog_cxt.ThisTimeLineID = checkPoint->ThisTimeLineID;
}
}
/*
* XLOG resource manager's routines
*
* Definitions of info values are in include/catalog/pg_control.h, though
* not all record types are related to control file updates.
*/
void xlog_redo(XLogReaderState *record)
{
uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
XLogRecPtr lsn = record->EndRecPtr;
/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT || !XLogRecHasAnyBlockRefs(record));
if (info == XLOG_NEXTOID) {
Oid nextOid;
errno_t rc;
/*
* We used to try to take the maximum of ShmemVariableCache->nextOid
* and the recorded nextOid, but that fails if the OID counter wraps
* around. Since no OID allocation should be happening during replay
* anyway, better to just believe the record exactly. We still take
* OidGenLock while setting the variable, just in case.
*/
rc = memcpy_s(&nextOid, sizeof(Oid), XLogRecGetData(record), sizeof(Oid));
securec_check(rc, "", "");
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
t_thrd.xact_cxt.ShmemVariableCache->nextOid = nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
} else if (info == XLOG_CHECKPOINT_SHUTDOWN) {
CheckPoint checkPoint;
CheckPointUndo checkPointUndo;
errno_t rc;
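/*
* The record length tells which format was written: a plain CheckPoint
* comes from versions without undo support, while a longer CheckPointUndo
* additionally carries undo recycle metadata.
*/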
if (XLogRecGetDataLen(record) >= sizeof(checkPoint) && XLogRecGetDataLen(record) < sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(record), sizeof(CheckPoint));
securec_check(rc, "", "");
} else if (XLogRecGetDataLen(record) >= sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPointUndo, sizeof(CheckPointUndo), XLogRecGetData(record), sizeof(CheckPointUndo));
securec_check(rc, "", "");
checkPoint = checkPointUndo.ori_checkpoint;
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid);
} else {
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, InvalidTransactionId);
}
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin,
checkPointUndo.recent_global_xmin)) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointUndo.recent_global_xmin;
}
}
/* In a SHUTDOWN checkpoint, believe the counters exactly */
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, checkPoint.nextXid)) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
}
ExtendCSNLOG(checkPoint.nextXid);
LWLockRelease(XidGenLock);
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
#ifdef ENABLE_DISTRIBUTE_TEST
if (TEST_STUB(DN_SLAVE_CHECKPOINT_KEEPXLOG, default_error_emit)) {
ereport(get_distribute_test_param()->elevel,
(errmsg("default_error_emit xlog_redo time:%ds, stub_name:%s",
get_distribute_test_param()->sleep_time, get_distribute_test_param()->test_stub_name)));
checkPoint.remove_seg = 1;
}
#endif
LWLockAcquire(XlogRemoveSegLock, LW_EXCLUSIVE);
ereport(DEBUG1, (errmsg("redo checkpoint xlog change remove seg from %X/%X to %X/%X",
(uint32)(XlogRemoveSegPrimary >> 32), (uint32)XlogRemoveSegPrimary,
(uint32)(checkPoint.remove_seg >> 32), (uint32)checkPoint.remove_seg)));
XlogRemoveSegPrimary = checkPoint.remove_seg;
LWLockRelease(XlogRemoveSegLock);
MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
/*
* If we see a shutdown checkpoint while waiting for an end-of-backup
* record, the backup was canceled and the end-of-backup record will
* never arrive.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested &&
!XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint) &&
XLogRecPtrIsInvalid(t_thrd.shemem_ptr_cxt.ControlFile->backupEndPoint)) {
if (IsRoachRestore()) {
ereport(WARNING,
(errmsg("found a shutdown checkpoint during xlog redo: a switchover occurred while resuming backup.")));
} else {
ereport(PANIC, (errmsg("online backup was canceled, recovery cannot continue")));
}
}
/*
* If we see a shutdown checkpoint, we know that nothing was running
* on the master at this point. So fake-up an empty running-xacts
* record and use that here and now. Recover additional standby state
* for prepared transactions.
*/
if (t_thrd.xlog_cxt.standbyState >= STANDBY_INITIALIZED) {
TransactionId *xids = NULL;
int nxids = 0;
TransactionId oldestActiveXID = InvalidTransactionId;
TransactionId latestCompletedXid = InvalidTransactionId;
RunningTransactionsData running;
oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
/*
* Construct a RunningTransactions snapshot representing a shut
* down server, with only prepared transactions still alive. We're
* never overflowed at this point because all subxids are listed
* with their parent prepared transactions.
*/
running.nextXid = checkPoint.nextXid;
running.oldestRunningXid = oldestActiveXID;
latestCompletedXid = checkPoint.nextXid;
TransactionIdRetreat(latestCompletedXid);
Assert(TransactionIdIsNormal(latestCompletedXid));
running.latestCompletedXid = latestCompletedXid;
ProcArrayApplyRecoveryInfo(&running);
StandbyRecoverPreparedTransactions();
}
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
LWLockRelease(ControlFileLock);
/* Update shared-memory copy of checkpoint XID/epoch */
{
/* use volatile pointer to prevent code rearrangement */
SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
t_thrd.shemem_ptr_cxt.XLogCtl->ckptXid = checkPoint.nextXid;
SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck);
}
/*
* TLI may change in a shutdown checkpoint, but it shouldn't decrease
*/
if (checkPoint.ThisTimeLineID != t_thrd.xlog_cxt.ThisTimeLineID) {
if (checkPoint.ThisTimeLineID < t_thrd.xlog_cxt.ThisTimeLineID ||
!list_member_int(t_thrd.xlog_cxt.expectedTLIs, (int)checkPoint.ThisTimeLineID)) {
ereport(PANIC, (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
checkPoint.ThisTimeLineID, t_thrd.xlog_cxt.ThisTimeLineID)));
}
/* Following WAL records should be run with new TLI */
t_thrd.xlog_cxt.ThisTimeLineID = checkPoint.ThisTimeLineID;
}
RecoveryRestartPoint(checkPoint, record->ReadRecPtr);
} else if (info == XLOG_CHECKPOINT_ONLINE) {
CheckPoint checkPoint;
CheckPointUndo checkPointUndo;
errno_t rc;
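/* As in the shutdown case, the record length tells whether undo metadata is present. */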
if (XLogRecGetDataLen(record) >= sizeof(checkPoint) && XLogRecGetDataLen(record) < sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPoint, sizeof(CheckPoint), XLogRecGetData(record), sizeof(CheckPoint));
securec_check(rc, "", "");
} else if (XLogRecGetDataLen(record) >= sizeof(checkPointUndo)) {
rc = memcpy_s(&checkPointUndo, sizeof(CheckPointUndo), XLogRecGetData(record), sizeof(CheckPointUndo));
securec_check(rc, "", "");
checkPoint = checkPointUndo.ori_checkpoint;
if (t_thrd.proc->workingVersionNum >= INPLACE_UPDATE_VERSION_NUM) {
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, checkPointUndo.globalRecycleXid);
} else {
pg_atomic_write_u64(&g_instance.undo_cxt.globalRecycleXid, InvalidTransactionId);
}
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin,
checkPointUndo.recent_global_xmin)) {
t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = checkPointUndo.recent_global_xmin;
}
}
/* In an ONLINE checkpoint, treat the XID counter as a minimum */
LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->nextXid, checkPoint.nextXid)) {
t_thrd.xact_cxt.ShmemVariableCache->nextXid = checkPoint.nextXid;
}
ExtendCSNLOG(checkPoint.nextXid);
LWLockRelease(XidGenLock);
/* ... but still treat OID counter as exact */
LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
t_thrd.xact_cxt.ShmemVariableCache->nextOid = checkPoint.nextOid;
t_thrd.xact_cxt.ShmemVariableCache->oidCount = 0;
LWLockRelease(OidGenLock);
MultiXactAdvanceNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->oldestXid, checkPoint.oldestXid)) {
SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
}
/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
LWLockRelease(ControlFileLock);
#ifdef ENABLE_DISTRIBUTE_TEST
if (TEST_STUB(DN_SLAVE_CHECKPOINT_KEEPXLOG, default_error_emit)) {
ereport(get_distribute_test_param()->elevel,
(errmsg("default_error_emit xlog_redo time:%ds, stub_name:%s",
get_distribute_test_param()->sleep_time, get_distribute_test_param()->test_stub_name)));
checkPoint.remove_seg = 1;
}
#endif
LWLockAcquire(XlogRemoveSegLock, LW_EXCLUSIVE);
ereport(DEBUG1, (errmsg("redo checkpoint xlog change remove seg from %X/%X to %X/%X",
(uint32)(XlogRemoveSegPrimary >> 32), (uint32)XlogRemoveSegPrimary,
(uint32)(checkPoint.remove_seg >> 32), (uint32)checkPoint.remove_seg)));
XlogRemoveSegPrimary = checkPoint.remove_seg;
LWLockRelease(XlogRemoveSegLock);
/* Update shared-memory copy of checkpoint XID/epoch */
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->ckptXid = checkPoint.nextXid;
SpinLockRelease(&xlogctl->info_lck);
}
/* TLI should not change in an on-line checkpoint */
if (checkPoint.ThisTimeLineID != t_thrd.xlog_cxt.ThisTimeLineID) {
ereport(PANIC, (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
checkPoint.ThisTimeLineID, t_thrd.xlog_cxt.ThisTimeLineID)));
}
RecoveryRestartPoint(checkPoint, record->ReadRecPtr);
} else if (info == XLOG_NOOP) {
/* nothing to do here */
} else if (info == XLOG_SWITCH) {
/* nothing to do here */
} else if (info == XLOG_RESTORE_POINT) {
/* nothing to do here */
} else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) {
RedoBufferInfo buffer;
/*
* Full-page image (FPI) records contain nothing else but a backup
* block. The block reference must include a full-page image -
* otherwise there would be no point in this record.
*
* No recovery conflicts are generated by these generic records - if a
* resource manager needs to generate conflicts, it has to define a
* separate WAL record type and redo routine.
*
* XLOG_FPI_FOR_HINT records are generated when a page needs to be
* WAL-logged because of a hint bit update. They are only generated
* when checksums are enabled. There is no difference in handling
* XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
* code just to distinguish them for statistics purposes.
*/
if (XLogReadBufferForRedo(record, 0, &buffer) != BLK_RESTORED) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("unexpected XLogReadBufferForRedo result when restoring backup block")));
}
UnlockReleaseBuffer(buffer.buf);
} else if (info == XLOG_BACKUP_END) {
XLogRecPtr startpoint;
errno_t rc = EOK;
rc = memcpy_s(&startpoint, sizeof(startpoint), XLogRecGetData(record), sizeof(startpoint));
securec_check(rc, "", "");
if (XLByteEQ(t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint, startpoint)) {
/*
* We have reached the end of base backup, the point where
* pg_stop_backup() was done. The data on disk is now consistent.
* Reset backupStartPoint, and update minRecoveryPoint to make
* sure we don't allow starting up at an earlier point even if
* recovery is stopped and restarted soon after this.
*/
ereport(DEBUG1, (errmsg("end of backup reached")));
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
if (XLByteLT(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint, lsn)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = lsn;
}
rc = memset_s(&t_thrd.shemem_ptr_cxt.ControlFile->backupStartPoint, sizeof(XLogRecPtr), 0,
sizeof(XLogRecPtr));
securec_check(rc, "\0", "\0");
t_thrd.shemem_ptr_cxt.ControlFile->backupEndRequired = false;
UpdateControlFile();
LWLockRelease(ControlFileLock);
}
} else if (info == XLOG_PARAMETER_CHANGE) {
xl_parameter_change xlrec;
errno_t rc = EOK;
/* Update our copy of the parameters in pg_control */
rc = memcpy_s(&xlrec, sizeof(xl_parameter_change), XLogRecGetData(record), sizeof(xl_parameter_change));
securec_check(rc, "", "");
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
t_thrd.shemem_ptr_cxt.ControlFile->MaxConnections = xlrec.MaxConnections;
t_thrd.shemem_ptr_cxt.ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
t_thrd.shemem_ptr_cxt.ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
t_thrd.shemem_ptr_cxt.ControlFile->wal_level = xlrec.wal_level;
/*
* Update minRecoveryPoint to ensure that if recovery is aborted, we
* recover back up to this point before allowing hot standby again.
* This is particularly important if wal_level was set to 'archive'
* before, and is now 'hot_standby', to ensure you don't run queries
* against the WAL preceding the wal_level change. Same applies to
* decreasing max_* settings.
*/
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
SetMinRecoverPointForStats(t_thrd.xlog_cxt.minRecoveryPoint);
if ((t_thrd.xlog_cxt.minRecoveryPoint != 0) && XLByteLT(t_thrd.xlog_cxt.minRecoveryPoint, lsn)) {
t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint = lsn;
}
UpdateControlFile();
LWLockRelease(ControlFileLock);
ereport(LOG, (errmsg("update ControlFile->minRecoveryPoint to %X/%X for xlog parameter change",
(uint32)((t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint) >> 32),
(uint32)(t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint))));
/* Check to see if any changes to max_connections give problems */
CheckRequiredParameterValues(false);
} else if (info == XLOG_FPW_CHANGE) {
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool fpw = false;
errno_t rc = EOK;
rc = memcpy_s(&fpw, sizeof(bool), XLogRecGetData(record), sizeof(bool));
securec_check(rc, "", "");
/*
* Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
* do_pg_start_backup() and do_pg_stop_backup() can check whether
* full_page_writes has been disabled during online backup.
*/
if (!fpw) {
SpinLockAcquire(&xlogctl->info_lck);
if (XLByteLT(xlogctl->lastFpwDisableRecPtr, record->ReadRecPtr)) {
xlogctl->lastFpwDisableRecPtr = record->ReadRecPtr;
}
SpinLockRelease(&xlogctl->info_lck);
}
/* Keep track of full_page_writes */
t_thrd.xlog_cxt.lastFullPageWrites = fpw;
} else if (info == XLOG_DELAY_XLOG_RECYCLE) {
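/* replay the primary's toggle of delayed xlog recycling */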
bool delay = false;
errno_t rc = EOK;
rc = memcpy_s(&delay, sizeof(bool), XLogRecGetData(record), sizeof(bool));
securec_check(rc, "", "");
if (delay) {
enable_delay_xlog_recycle(true);
} else {
disable_delay_xlog_recycle(true);
}
}
}
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record)
{
int block_id;
appendStringInfo(buf, "prev %X/%X; xid %u", (uint32)(XLogRecGetPrev(record) >> 32), (uint32)XLogRecGetPrev(record),
XLogRecGetXid(record));
appendStringInfo(buf, "; len %u", XLogRecGetDataLen(record));
/* decode block references */
for (block_id = 0; block_id <= record->max_block_id; block_id++) {
RelFileNode rnode;
ForkNumber forknum;
BlockNumber blk;
if (!XLogRecHasBlockRef(record, block_id)) {
continue;
}
XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
if (forknum != MAIN_FORKNUM) {
appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u", block_id, rnode.spcNode, rnode.dbNode,
rnode.relNode, forknum, blk);
} else {
appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u", block_id, rnode.spcNode, rnode.dbNode,
rnode.relNode, blk);
}
if (XLogRecHasBlockImage(record, block_id)) {
appendStringInfo(buf, " FPW");
}
}
}
#endif /* WAL_DEBUG */
/*
* Return the (possible) sync flag used for opening a file, depending on the
* value of the GUC wal_sync_method.
*/
static int get_sync_bit(int method)
{
uint32 o_direct_flag = 0;
/* If fsync is disabled, never open in sync mode */
if (!u_sess->attr.attr_storage.enableFsync) {
return 0;
}
/*
* Optimize writes by bypassing kernel cache with O_DIRECT when using
* O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
* disabled, otherwise the archive command or walsender process will read
* the WAL soon after writing it, which is guaranteed to cause a physical
* read if we bypassed the kernel cache. We also skip the
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
* reason.
*
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
* written by walreceiver is normally read by the startup process soon
* after its written. Also, walreceiver performs unaligned writes, which
* don't work with O_DIRECT, so it is required for correctness too.
*/
if (!XLogIsNeeded() && !AmWalReceiverProcess()) {
o_direct_flag = PG_O_DIRECT;
}
switch (method) {
/*
* enum values for all sync options are defined even if they are
* not supported on the current platform. But if not, they are
* not included in the enum option array, and therefore will never
* be seen here.
*/
case SYNC_METHOD_FSYNC:
case SYNC_METHOD_FSYNC_WRITETHROUGH:
case SYNC_METHOD_FDATASYNC:
return 0;
#ifdef OPEN_SYNC_FLAG
case SYNC_METHOD_OPEN:
return OPEN_SYNC_FLAG | o_direct_flag;
#endif
#ifdef OPEN_DATASYNC_FLAG
case SYNC_METHOD_OPEN_DSYNC:
return OPEN_DATASYNC_FLAG | o_direct_flag;
#endif
default:
/* can't happen (unless we are out of sync with option array) */
ereport(ERROR,
(errcode(ERRCODE_MOST_SPECIFIC_TYPE_MISMATCH), errmsg("unrecognized wal_sync_method: %d", method)));
return 0; /* silence warning */
}
}
/*
* GUC support
*/
void assign_xlog_sync_method(int new_sync_method, void *extra)
{
if (u_sess->attr.attr_storage.sync_method != new_sync_method) {
/*
* To ensure that no blocks escape unsynced, force an fsync on the
* currently open log segment (if any). Also, if the open flag is
* changing, close the log file so it will be reopened (with new flag
* bit) at next use.
*/
if (t_thrd.xlog_cxt.openLogFile >= 0) {
pgstat_report_waitevent(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
if (pg_fsync(t_thrd.xlog_cxt.openLogFile) != 0) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not fsync log segment %s: %m",
XLogFileNameP(t_thrd.xlog_cxt.curFileTLI, t_thrd.xlog_cxt.readSegNo))));
}
pgstat_report_waitevent(WAIT_EVENT_END);
if (get_sync_bit(u_sess->attr.attr_storage.sync_method) != get_sync_bit(new_sync_method)) {
XLogFileClose();
}
}
}
}
/*
* Issue appropriate kind of fsync (if any) for an XLOG output file.
*
* 'fd' is a file descriptor for the XLOG file to be fsync'd.
* 'log' and 'seg' are for error reporting purposes.
*/
void issue_xlog_fsync(int fd, XLogSegNo segno)
{
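/* when running on DSS shared storage, no explicit fsync is issued here */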
if (ENABLE_DSS) {
return;
}
switch (u_sess->attr.attr_storage.sync_method) {
case SYNC_METHOD_FSYNC:
if (pg_fsync_no_writethrough(fd) != 0) {
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not fsync log file %s: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno))));
}
break;
#ifdef HAVE_FSYNC_WRITETHROUGH
case SYNC_METHOD_FSYNC_WRITETHROUGH:
if (pg_fsync_writethrough(fd) != 0) {
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not fsync write-through log file %s: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno))));
}
break;
#endif
#ifdef HAVE_FDATASYNC
case SYNC_METHOD_FDATASYNC:
if (pg_fdatasync(fd) != 0) {
ereport(PANIC,
(errcode_for_file_access(), errmsg("could not fdatasync log file %s: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, segno))));
}
break;
#endif
case SYNC_METHOD_OPEN:
case SYNC_METHOD_OPEN_DSYNC:
/* the write itself already synced the data */
break;
default:
ereport(PANIC, (errmsg("unrecognized wal_sync_method: %d", u_sess->attr.attr_storage.sync_method)));
break;
}
}
/*
* Return the filename of given log segment, as a palloc'd string.
*/
char *XLogFileNameP(TimeLineID tli, XLogSegNo segno)
{
errno_t errorno = EOK;
char *result = (char *)palloc(MAXFNAMELEN);
errorno = snprintf_s(result, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", tli,
(uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
return result;
}
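/*
* Check whether a roach backup with the given slot name is in progress,
* judging by the existence of its backup label file.
*/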
bool check_roach_start_backup(const char *slotName)
{
char backup_label[MAXPGPATH];
errno_t rc = EOK;
struct stat statbuf;
/* check roach backup label file existence. */
rc = snprintf_s(backup_label, sizeof(backup_label), sizeof(backup_label) - 1, "%s.%s", BACKUP_LABEL_FILE_ROACH,
slotName);
securec_check_ss(rc, "", "");
if (stat(backup_label, &statbuf)) {
if (errno != ENOENT) {
ereport(WARNING, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", backup_label)));
}
ereport(DEBUG1, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("roach backup %s is not in progress", slotName)));
return false;
}
return true;
}
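/*
* Scan pg_tblspc and collect information about every tablespace into
* *tablespaces (if given) and into the tablespace_map buffer.
*/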
static void CollectTableSpace(DIR *tblspcdir, List **tablespaces, StringInfo tblspc_mapfbuf, bool infotbssize)
{
struct dirent *de = NULL;
tablespaceinfo *ti = NULL;
const char *dataDir = NULL;
int datadirpathlen = -1;
dataDir = is_dss_file(TBLSPCDIR) ? g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name : t_thrd.proc_cxt.DataDir;
datadirpathlen = strlen(dataDir);
/* Collect information about all tablespaces */
while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL) {
char fullpath[MAXPGPATH + PG_TBLSPCS];
char linkpath[MAXPGPATH];
char *relpath = NULL;
int rllen;
errno_t errorno = EOK;
errorno = memset_s(fullpath, MAXPGPATH + PG_TBLSPCS, '\0', MAXPGPATH + PG_TBLSPCS);
securec_check(errorno, "", "");
errorno = memset_s(linkpath, MAXPGPATH, '\0', MAXPGPATH);
securec_check(errorno, "", "");
/* Skip special stuff */
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
continue;
errorno = snprintf_s(fullpath, MAXPGPATH + PG_TBLSPCS, MAXPGPATH + PG_TBLSPCS - 1,
"%s/%s", TBLSPCDIR, de->d_name);
securec_check_ss(errorno, "\0", "\0");
#if defined(HAVE_READLINK) || defined(WIN32)
rllen = readlink(fullpath, linkpath, sizeof(linkpath));
if (rllen < 0) {
ereport(WARNING, (errmsg("could not read symbolic link \"%s\": %m", fullpath)));
continue;
} else if (rllen >= (int)sizeof(linkpath)) {
ereport(WARNING, (errmsg("symbolic link \"%s\" target is too long", fullpath)));
continue;
}
linkpath[rllen] = '\0';
/*
* Relpath holds the relative path of the tablespace directory
* when it's located within PGDATA, or NULL if it's located
* elsewhere.
*/
if (rllen > datadirpathlen && strncmp(linkpath, dataDir, datadirpathlen) == 0 &&
IS_DIR_SEP(linkpath[datadirpathlen]))
relpath = linkpath + datadirpathlen + 1;
ti = (tablespaceinfo *)palloc(sizeof(tablespaceinfo));
ti->oid = pstrdup(de->d_name);
ti->path = pstrdup(linkpath);
ti->relativePath = relpath ? pstrdup(relpath) : NULL;
ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
if (tablespaces)
*tablespaces = lappend(*tablespaces, ti);
appendStringInfo(tblspc_mapfbuf, "%s %s\n", ti->oid, ti->path);
#else
/*
* If the platform does not have symbolic links, it should not be
* possible to have tablespaces - clearly somebody else created
* them. Warn about it and ignore.
*/
ereport(WARNING,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform")));
#endif
}
}
/*
* do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
* function. It creates the necessary starting checkpoint and constructs the
* backup label file.
*
* There are two kind of backups: exclusive and non-exclusive. An exclusive
* backup is started with pg_start_backup(), and there can be only one active
* at a time. The backup label file of an exclusive backup is written to
* $PGDATA/backup_label, and it is removed by pg_stop_backup().
*
* A non-exclusive backup is used for the streaming base backups (see
* src/backend/replication/basebackup.c). The difference from exclusive backups
* is that the backup label file is not written to disk. Instead, its would-be
* contents are returned in *labelfile, and the caller is responsible for
* including it in the backup archive as 'backup_label'. There can be many
* non-exclusive backups active at the same time, and they don't conflict
* with an exclusive backup either.
*
* Every successfully started non-exclusive backup must
* be stopped by calling do_pg_stop_backup() or do_pg_abort_backup().
*/
XLogRecPtr do_pg_start_backup(const char *backupidstr, bool fast, char **labelfile, DIR *tblspcdir,
char **tblspcmapfile, List **tablespaces, bool infotbssize, bool needtblspcmapfile)
{
bool exclusive = (labelfile == NULL);
bool backup_started_in_recovery = false;
XLogRecPtr checkpointloc;
XLogRecPtr startpoint;
pg_time_t stamp_time;
char strfbuf[128];
char xlogfilename[MAXFNAMELEN];
XLogSegNo _logSegNo;
struct stat stat_buf;
FILE *fp = NULL;
StringInfoData labelfbuf;
StringInfoData tblspc_mapfbuf;
int32 lastlrc = 0;
errno_t errorno = EOK;
gstrace_entry(GS_TRC_ID_do_pg_start_backup);
backup_started_in_recovery = RecoveryInProgress();
if (!superuser() && !has_rolreplication(GetUserId()) &&
!(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be system admin or replication role "
"or operator admin in operation mode to run a backup")));
}
/*
* Currently only non-exclusive backup can be taken during recovery.
*/
if (backup_started_in_recovery && exclusive) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
}
/*
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded()) {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
}
if (strlen(backupidstr) > MAXPGPATH) {
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("backup label too long (max %d bytes)", MAXPGPATH)));
}
/*
* Mark backup active in shared memory. We must do full-page WAL writes
* during an on-line backup even if not doing so at other times, because
* it's quite possible for the backup dump to obtain a "torn" (partially
* written) copy of a database page if it reads the page concurrently with
* our write to the same page. This can be fixed as long as the first
* write to the page in the WAL sequence is a full-page write. Hence, we
* turn on forcePageWrites and then force a CHECKPOINT, to ensure there
* are no dirty pages in shared memory that might get dumped while the
* backup is in progress without having a corresponding WAL record. (Once
* the backup is complete, we need not force full-page writes anymore,
* since we expect that any pages not modified during the backup interval
* must have been correctly captured by the backup.)
*
* Note that forcePageWrites has no effect during an online backup from
* the standby.
*
* We must hold WALInsertLock to change the value of forcePageWrites, to
* ensure adequate interlocking against XLogInsertRecord().
*/
StartSuspendWalInsert(&lastlrc);
if (exclusive) {
if (t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup) {
StopSuspendWalInsert(lastlrc);
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"), errhint("Run pg_stop_backup() and try again.")));
}
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = true;
} else {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups++;
}
if (ENABLE_INCRE_CKPT) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
} else {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = true;
}
StopSuspendWalInsert(lastlrc);
/* Ensure we release forcePageWrites if fail below */
PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum)BoolGetDatum(exclusive));
{
if (strcmp(backupidstr, "gs_ctl full build") == 0) {
if (!backup_started_in_recovery) {
(void)RequestXLogSwitch();
}
LWLockAcquire(ControlFileLock, LW_SHARED);
checkpointloc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
startpoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
LWLockRelease(ControlFileLock);
} else {
bool gotUniqueStartpoint = false;
/*
* Force an XLOG file switch before the checkpoint, to ensure that the
* WAL segment the checkpoint is written to doesn't contain pages with
* old timeline IDs. That would otherwise happen if you called
* pg_start_backup() right after restoring from a PITR archive: the
* first WAL segment containing the startup checkpoint has pages in
* the beginning with the old timeline ID. That can cause trouble at
* recovery: we won't have a history file covering the old timeline if
* pg_xlog directory was not included in the base backup and the WAL
* archive was cleared too before starting the backup.
*
* This also ensures that we have emitted a WAL page header that has
* XLP_BKP_REMOVABLE off before we emit the checkpoint record.
* Therefore, if a WAL archiver (such as pglesslog) is trying to
* compress out removable backup blocks, it won't remove any that
* occur after this point.
*
* During recovery, we skip forcing XLOG file switch, which means that
* the backup taken during recovery is not available for the special
* recovery case described above.
*/
if (!backup_started_in_recovery) {
(void)RequestXLogSwitch();
}
do {
bool checkpointfpw = false;
/*
* Force a CHECKPOINT. Aside from being necessary to prevent torn
* page problems, this guarantees that two successive backup runs
* will have different checkpoint positions and hence different
* history file names, even if nothing happened in between.
*
* During recovery, establish a restartpoint if possible. We use
* the last restartpoint as the backup starting checkpoint. This
* means that two successive backup runs can have same checkpoint
* positions.
*
* Since the fact that we are executing do_pg_start_backup()
* during recovery means that checkpointer is running, we can use
* RequestCheckpoint() to establish a restartpoint.
*
* We use CHECKPOINT_IMMEDIATE only if requested by user (via
* passing fast = true). Otherwise this can take a while.
*/
RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT | (fast ? CHECKPOINT_IMMEDIATE : 0));
/*
* Now we need to fetch the checkpoint record location, and also
* its REDO pointer. The oldest point in WAL that would be needed
* to restore starting from the checkpoint is precisely the REDO
* pointer.
*/
LWLockAcquire(ControlFileLock, LW_SHARED);
checkpointloc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
startpoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
checkpointfpw = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.fullPageWrites;
LWLockRelease(ControlFileLock);
if (backup_started_in_recovery) {
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
/*
* Check to see if all WAL replayed during online backup
* (i.e., since last restartpoint used as backup starting
* checkpoint) contain full-page writes.
*/
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->lastFpwDisableRecPtr;
SpinLockRelease(&xlogctl->info_lck);
#ifdef ENABLE_MULTIPLE_NODES
if (!checkpointfpw || XLByteLE(startpoint, recptr)) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL generated with full_page_writes=off was replayed "
"since last restartpoint"),
errhint("This means that the backup being taken on the standby "
"is corrupt and should not be used. "
"Enable full_page_writes and run CHECKPOINT on the master, "
"and then try an online backup again.")));
}
#endif
/*
* During recovery, since we don't use the end-of-backup WAL
* record and don't write the backup history file, the
* starting WAL location doesn't need to be unique. This means
* that two base backups started at the same time might use
* the same checkpoint as starting locations.
*/
gotUniqueStartpoint = true;
}
/*
* If two base backups are started at the same time (in WAL sender
* processes), we need to make sure that they use different
* checkpoints as starting locations, because we use the starting
* WAL location as a unique identifier for the base backup in the
* end-of-backup WAL record and when we write the backup history
* file. Perhaps it would be better to generate a separate unique ID
* for each backup instead of forcing another checkpoint, but
* taking a checkpoint right after another is not that expensive
* either because only few buffers have been dirtied yet.
*/
StartSuspendWalInsert(&lastlrc);
if (XLByteLT(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.lastBackupStart, startpoint)) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.lastBackupStart = startpoint;
gotUniqueStartpoint = true;
}
StopSuspendWalInsert(lastlrc);
} while (!gotUniqueStartpoint);
}
XLByteToSeg(startpoint, _logSegNo);
errorno = snprintf_s(xlogfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
/*
* Construct tablespace_map file
*/
initStringInfo(&tblspc_mapfbuf);
CollectTableSpace(tblspcdir, tablespaces, &tblspc_mapfbuf, infotbssize);
/*
* Construct backup label file
*/
initStringInfo(&labelfbuf);
/* Use the log timezone here, not the session timezone */
stamp_time = (pg_time_t)time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone));
appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n", (uint32)(startpoint >> 32),
(uint32)startpoint, xlogfilename);
appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n", (uint32)(checkpointloc >> 32),
(uint32)checkpointloc);
appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n", exclusive ? "pg_start_backup" : "streamed");
appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n", backup_started_in_recovery ? "standby" : "master");
appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
/*
* Okay, write the file, or return its contents to caller.
*/
if (exclusive) {
/*
* Check for existing backup label --- implies a backup is already
* running. (XXX given that we checked exclusiveBackup above,
* maybe it would be OK to just unlink any such label file?)
*/
char *fileName = BACKUP_LABEL_FILE;
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") == 0) {
fileName = BACKUP_LABEL_FILE_ROACH;
}
if (stat(fileName, &stat_buf) != 0) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", fileName)));
}
} else {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is already in progress"),
errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
fileName)));
}
fp = AllocateFile(fileName, "w");
if (fp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", fileName)));
}
if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 || fflush(fp) != 0 || pg_fsync(fileno(fp)) != 0 ||
ferror(fp) || FreeFile(fp)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\": %m", fileName)));
}
pfree(labelfbuf.data);
/* Write backup tablespace_map file. */
if (tblspc_mapfbuf.len > 0) {
if (stat(TABLESPACE_MAP, &stat_buf) != 0) {
if (errno != ENOENT)
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", TABLESPACE_MAP)));
} else
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("a backup is already in progress"),
errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
TABLESPACE_MAP)));
fp = AllocateFile(TABLESPACE_MAP, "w");
if (!fp)
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not create file \"%s\": %m", TABLESPACE_MAP)));
if (fwrite(tblspc_mapfbuf.data, tblspc_mapfbuf.len, 1, fp) != 1 || fflush(fp) != 0 ||
pg_fsync(fileno(fp)) != 0 || ferror(fp) || FreeFile(fp))
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not write file \"%s\": %m", TABLESPACE_MAP)));
}
pfree(tblspc_mapfbuf.data);
} else {
*labelfile = labelfbuf.data;
if (tblspc_mapfbuf.len > 0)
*tblspcmapfile = tblspc_mapfbuf.data;
}
}
PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum)BoolGetDatum(exclusive));
if (exclusive) {
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
} else {
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
}
gstrace_exit(GS_TRC_ID_do_pg_start_backup);
/*
* We're done. As a convenience, return the starting WAL location.
*/
return startpoint;
}
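/*
* Standby-mode variant of do_pg_start_backup: build the backup label and
* tablespace map from the control file's last checkpoint, without forcing
* a new checkpoint or an xlog file switch.
*/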
XLogRecPtr StandbyDoStartBackup(const char *backupidstr, char **labelFile, char **tblSpcMapFile, List **tableSpaces,
DIR *tblSpcDir, bool infoTbsSize)
{
StringInfoData labelfbuf;
StringInfoData tblspc_mapfbuf;
pg_time_t stamp_time;
char strfbuf[128];
char xlogFileName[MAXFNAMELEN];
XLogSegNo _logSegNo;
XLogRecPtr checkPointLoc;
XLogRecPtr startPoint;
errno_t errorno = EOK;
LWLockAcquire(ControlFileLock, LW_SHARED);
checkPointLoc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
startPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
LWLockRelease(ControlFileLock);
XLByteToSeg(startPoint, _logSegNo);
errorno = snprintf_s(xlogFileName, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((_logSegNo) / XLogSegmentsPerXLogId), (uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
/*
* Construct tablespace_map file
*/
initStringInfo(&tblspc_mapfbuf);
CollectTableSpace(tblSpcDir, tableSpaces, &tblspc_mapfbuf, infoTbsSize);
/*
* Construct backup label file
*/
initStringInfo(&labelfbuf);
/* Use the log timezone here, not the session timezone */
stamp_time = (pg_time_t)time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone));
appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n", (uint32)(startPoint >> 32),
(uint32)startPoint, xlogFileName);
appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n", (uint32)(checkPointLoc >> 32), (uint32)checkPointLoc);
appendStringInfo(&labelfbuf, "BACKUP METHOD: streamed\n");
appendStringInfo(&labelfbuf, "BACKUP FROM: standby\n");
appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
/*
* Okay, write the file, or return its contents to caller.
*/
*labelFile = labelfbuf.data;
if (tblspc_mapfbuf.len > 0) {
*tblSpcMapFile = tblspc_mapfbuf.data;
}
return startPoint;
}
/* Error cleanup callback for pg_start_backup */
static void pg_start_backup_callback(int code, Datum arg)
{
bool exclusive = DatumGetBool(arg);
int32 lastlrc = 0;
/* Update backup counters and forcePageWrites on failure */
StartSuspendWalInsert(&lastlrc);
if (exclusive) {
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = false;
} else {
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups > 0);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups--;
}
if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
}
StopSuspendWalInsert(lastlrc);
}
/*
* do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
* function.
* If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
* the non-exclusive backup specified by 'labelfile'.
*/
XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive, unsigned long long *consensusPaxosIdx)
{
bool exclusive = (labelfile == NULL);
bool backup_started_in_recovery = false;
XLogRecPtr startpoint;
XLogRecPtr stoppoint;
pg_time_t stamp_time;
char strfbuf[128];
char histfilepath[MAXPGPATH];
char startxlogfilename[MAXFNAMELEN];
char stopxlogfilename[MAXFNAMELEN];
char lastxlogfilename[MAXFNAMELEN];
char histfilename[MAXFNAMELEN];
char backupfrom[20];
XLogSegNo _logSegNo;
FILE *lfp = NULL;
FILE *fp = NULL;
char ch;
int seconds_before_warning;
int waits = 0;
bool reported_waiting = false;
char *remaining = NULL;
char *ptr = NULL;
uint32 hi, lo;
int32 lastlrc = 0;
errno_t errorno = EOK;
gstrace_entry(GS_TRC_ID_do_pg_stop_backup);
backup_started_in_recovery = RecoveryInProgress();
if (!superuser() && !has_rolreplication(GetUserId()) &&
!(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
(errmsg("must be system admin or replication role "
"or operator admin in operation mode to run a backup"))));
}
/*
* Currently only non-exclusive backup can be taken during recovery.
*/
if (backup_started_in_recovery && exclusive) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"),
errhint("WAL control functions cannot be executed during recovery.")));
}
/*
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded()) {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
}
/*
* OK to update backup counters and forcePageWrites
*/
StartSuspendWalInsert(&lastlrc);
if (exclusive) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = false;
} else {
/*
* The user-visible pg_start/stop_backup() functions that operate on
* exclusive backups can be called at any time, but for non-exclusive
* backups, it is expected that each do_pg_start_backup() call is
* matched by exactly one do_pg_stop_backup() call.
*/
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups > 0);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups--;
}
if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
}
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
StopSuspendWalInsert(lastlrc);
/*
* Clean up session-level lock.
*
* You might think that WALInsertLockRelease() can be called before
* cleaning up session-level lock because session-level lock doesn't need
* to be protected with WAL insertion lock. But since
* CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
* cleaned up before it.
*/
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
if (exclusive) {
/*
* Read the existing label file into memory.
*/
struct stat statbuf;
int r;
char *fileName = BACKUP_LABEL_FILE;
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") == 0) {
fileName = BACKUP_LABEL_FILE_ROACH;
}
if (stat(fileName, &statbuf)) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", fileName)));
}
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is not in progress")));
}
lfp = AllocateFile(fileName, "r");
if (lfp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fileName)));
}
if (statbuf.st_size > BLCKSZ) {
ereport(ERROR, (errcode_for_file_access(), errmsg("file size is wrong, \"%s\": %m", fileName)));
}
labelfile = (char *)palloc(statbuf.st_size + 1);
r = fread(labelfile, statbuf.st_size, 1, lfp);
labelfile[statbuf.st_size] = '\0';
/*
* Close and remove the backup label file
*/
if (r != 1 || ferror(lfp) || FreeFile(lfp)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", fileName)));
}
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
if (unlink(fileName) != 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", fileName)));
}
if (unlink(TABLESPACE_MAP) != 0) {
ereport(DEBUG1,
(errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", TABLESPACE_MAP)));
}
}
}
/*
* Read and parse the START WAL LOCATION line (this code is pretty crude,
* but we are not expecting any variability in the file format).
*/
if (sscanf_s(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", &hi, &lo, startxlogfilename,
sizeof(startxlogfilename), &ch, 1) != 4 ||
ch != '\n') {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
startpoint = (((uint64)hi) << 32) | lo;
remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
/*
* Parse the BACKUP FROM line. If we are taking an online backup from the
* standby, we confirm that the standby has not been promoted during the
* backup.
*/
ptr = strstr(remaining, "BACKUP FROM:");
if ((ptr == NULL) || sscanf_s(ptr, "BACKUP FROM: %19s\n", backupfrom, sizeof(backupfrom)) != 1) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("the standby was promoted during online backup"),
errhint("This means that the backup being taken is corrupt "
"and should not be used. "
"Try taking another online backup.")));
}
/*
* During recovery, we don't write an end-of-backup record. We assume that
* pg_control was backed up last and its minimum recovery point can be
* available as the backup end location. Since we don't have an
* end-of-backup record, we use the pg_control value to check whether
* we've reached the end of backup when starting recovery from this
* backup. We have no way of checking if pg_control wasn't backed up last
* however.
*
* We don't force a switch to new WAL file and wait for all the required
* files to be archived. This is okay if we use the backup to start the
* standby. But, if it's for an archive recovery, to ensure all the
* required files are available, a user should wait for them to be
* archived, or include them into the backup.
*
* We return the current minimum recovery point as the backup end
* location. Note that it can be greater than the exact backup end
* location if the minimum recovery point is updated after the backup of
* pg_control. This is harmless for current uses.
*
* XXX currently a backup history file is for informational and debug
* purposes only. It's not essential for an online backup. Furthermore,
* even if it's created, it will not be archived during recovery because
* an archiver is not invoked. So it doesn't seem worthwhile to write a
* backup history file during recovery.
*/
if (backup_started_in_recovery) {
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
/*
* Check to see if all WAL replayed during online backup contain
* full-page writes.
*/
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->lastFpwDisableRecPtr;
SpinLockRelease(&xlogctl->info_lck);
#ifdef ENABLE_MULTIPLE_NODES
if (XLByteLE(startpoint, recptr)) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL generated with full_page_writes=off was replayed "
"during online backup"),
errhint("This means that the backup being taken on the standby "
"is corrupt and should not be used. "
"Enable full_page_writes and run CHECKPOINT on the master, "
"and then try an online backup again.")));
}
#endif
LWLockAcquire(ControlFileLock, LW_SHARED);
stoppoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
LWLockRelease(ControlFileLock);
if (XLByteLE(stoppoint, startpoint)) {
stoppoint = GetXLogReplayRecPtr(NULL);
}
gstrace_exit(GS_TRC_ID_do_pg_stop_backup);
return stoppoint;
}
/*
* Write the backup-end xlog record
*/
XLogBeginInsert();
XLogRegisterData((char *)(&startpoint), sizeof(startpoint));
stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") == 0) {
char *fileName = BACKUP_LABEL_FILE_ROACH;
struct stat statbuf;
/* the stop point needs to be flushed */
XLogWaitFlush(stoppoint);
if (stat(fileName, &statbuf)) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", fileName)));
}
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("file \"%s\" does not exist: %m", fileName)));
}
if (durable_rename(BACKUP_LABEL_FILE_ROACH, BACKUP_LABEL_FILE_ROACH_DONE, ERROR)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m",
BACKUP_LABEL_FILE_ROACH, BACKUP_LABEL_FILE_ROACH_DONE)));
}
}
/*
* Force a switch to a new xlog segment file, so that the backup is valid
* as soon as archiver moves out the current segment file.
*/
(void)RequestXLogSwitch();
/* the stop point needs to be flushed */
XLogWaitFlush(stoppoint);
XLByteToPrevSeg(stoppoint, _logSegNo);
errorno = snprintf_s(stopxlogfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((_logSegNo) / XLogSegmentsPerXLogId), (uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
/* Use the log timezone here, not the session timezone */
stamp_time = (pg_time_t)time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone));
/*
* Write the backup history file
*/
XLByteToSeg(startpoint, _logSegNo);
errorno = snprintf_s(histfilepath, MAXPGPATH, MAXPGPATH - 1, XLOGDIR "/%08X%08X%08X.%08X.backup",
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId), (uint32)(startpoint % XLogSegSize));
securec_check_ss(errorno, "", "");
fp = AllocateFile(histfilepath, "w");
if (fp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", histfilepath)));
}
fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n", (uint32)(startpoint >> 32), (uint32)startpoint,
startxlogfilename);
fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n", (uint32)(stoppoint >> 32), (uint32)stoppoint, stopxlogfilename);
/* transfer remaining lines from label to history file */
fprintf(fp, "%s", remaining);
fprintf(fp, "STOP TIME: %s\n", strfbuf);
if (fflush(fp) || ferror(fp) || FreeFile(fp)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\": %m", histfilepath)));
}
/*
* Clean out any no-longer-needed history files. As a side effect, this
* will post a .ready file for the newly created history file, notifying
* the archiver that the history file may be archived immediately.
*/
CleanupBackupHistory();
/*
* If archiving is enabled, wait for all the required WAL files to be
* archived before returning. If archiving isn't enabled, the required WAL
* needs to be transported via streaming replication (hopefully with
* wal_keep_segments set high enough), or some more exotic mechanism like
* polling and copying files from pg_xlog with a script. We have no
* knowledge of those mechanisms, so it's up to the user to ensure that he
* gets all the required WAL.
*
* We wait until both the last WAL file filled during backup and the
* history file have been archived, and assume that the alphabetic sorting
* property of the WAL files ensures any earlier WAL files are safely
* archived as well.
*
* We wait forever, since archive_command is supposed to work and we
* assume the admin wanted his backup to work completely. If you don't
* wish to wait, you can set statement_timeout. Also, some notices are
* issued to clue in anyone who might be doing this interactively.
*/
if (waitforarchive && XLogArchivingActive()) {
XLByteToPrevSeg(stoppoint, _logSegNo);
errorno = snprintf_s(lastxlogfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X",
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
XLByteToSeg(startpoint, _logSegNo);
errorno = snprintf_s(histfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X.%08X.backup",
t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((_logSegNo) / XLogSegmentsPerXLogId),
(uint32)((_logSegNo) % XLogSegmentsPerXLogId), (uint32)(startpoint % XLogSegSize));
securec_check_ss(errorno, "", "");
seconds_before_warning = 60;
waits = 0;
while (XLogArchiveIsBusy(lastxlogfilename) || XLogArchiveIsBusy(histfilename)) {
CHECK_FOR_INTERRUPTS();
if (!reported_waiting && waits > 5) {
ereport(NOTICE,
(errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
reported_waiting = true;
}
pg_usleep(1000000L);
if (++waits >= seconds_before_warning) {
seconds_before_warning *= 2; /* This wraps in >10 years... */
ereport(WARNING,
(errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds "
"elapsed)",
waits),
errhint("Check that your archive_command is executing properly. "
"pg_stop_backup can be canceled safely, "
"but the database backup will not be usable without all the WAL segments.")));
}
}
ereport(NOTICE, (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
} else if (waitforarchive) {
ereport(NOTICE,
(errmsg(
"WAL archiving is not enabled; you must ensure that all required WAL segments are copied through "
"other means to complete the backup")));
}
gstrace_exit(GS_TRC_ID_do_pg_stop_backup);
/*
* We're done. As a convenience, return the ending WAL location.
*/
return stoppoint;
}
/*
* StandbyDoStopBackup: determine the backup stop location for an online
* backup taken during recovery, based on the parsed backup label contents.
*/
XLogRecPtr StandbyDoStopBackup(char *labelfile)
{
XLogRecPtr stopPoint;
XLogRecPtr startPoint;
uint32 hi, lo;
char startxlogfilename[MAXFNAMELEN];
char ch;
char *remaining = NULL;
/*
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!XLogIsNeeded()) {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online backup"),
errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
}
/*
* Read and parse the START WAL LOCATION line (this code is pretty crude,
* but we are not expecting any variability in the file format).
*/
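/*
* For illustration (hypothetical values), the line being parsed here
* typically looks like:
*   START WAL LOCATION: 0/2000028 (file 000000010000000000000002)
*/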
if (sscanf_s(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c", &hi, &lo, startxlogfilename,
sizeof(startxlogfilename), &ch, 1) != 4 ||
ch != '\n') {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
startPoint = (((uint64)hi) << 32) | lo;
remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
XLogRecPtr minRecoveryPoint;
LWLockAcquire(ControlFileLock, LW_SHARED);
stopPoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
LWLockRelease(ControlFileLock);
if (XLByteLE(stopPoint, startPoint)) {
stopPoint = GetXLogReplayRecPtr(NULL);
}
if (XLByteLT(stopPoint, minRecoveryPoint)) {
stopPoint = minRecoveryPoint;
}
if (XLByteLT(stopPoint, g_instance.comm_cxt.predo_cxt.redoPf.read_ptr)) {
stopPoint = g_instance.comm_cxt.predo_cxt.redoPf.read_ptr;
}
return stopPoint;
}
/*
* do_pg_abort_backup: abort a running backup
*
* This does just the most basic steps of do_pg_stop_backup(), by taking the
* system out of backup mode, thus making it a lot safer to call from
* an error handler.
*
* NB: This is only for aborting a non-exclusive backup that doesn't write
* backup_label. A backup started with pg_start_backup() needs to be finished
* with pg_stop_backup().
*/
void do_pg_abort_backup(void)
{
if (u_sess->proc_cxt.sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
return;
int32 lastlrc = 0;
StartSuspendWalInsert(&lastlrc);
/*
* Quick exit if session is not keeping around a non-exclusive backup
* already started. Release the suspended WAL insert state before
* returning, so we do not leave insertion blocked.
*/
if (u_sess->proc_cxt.sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE) {
StopSuspendWalInsert(lastlrc);
return;
}
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups > 0);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups--;
if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
}
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
StopSuspendWalInsert(lastlrc);
}
/*
* Error cleanup callback for do_pg_abort_backup
*/
static void DoAbortBackupCallback(int code, Datum arg)
{
do_pg_abort_backup();
}
/*
* Register a handler that will warn about unterminated backups at the end of
* session, unless this has already been done.
*/
void RegisterPersistentAbortBackupHandler(void)
{
if (u_sess->proc_cxt.registerAbortBackupHandlerdone)
return;
on_shmem_exit(DoAbortBackupCallback, BoolGetDatum(true));
u_sess->proc_cxt.registerAbortBackupHandlerdone = true;
}
/*
* Start roach backup.
* Get the current checkpoint location, together with its redo location, and write them
* into the roach backup label.
* We avoid requesting an extra checkpoint here and adopt recovery logic similar to
* recovery from a standby backup, i.e., back up pg_control last and use its minrecovery
* point to judge recovery consistency.
*/
XLogRecPtr do_roach_start_backup(const char *backupidstr)
{
bool backup_started_in_recovery = false;
XLogRecPtr checkpointloc;
XLogRecPtr startpoint;
pg_time_t stamp_time;
char strfbuf[128];
char xlogfilename[MAXFNAMELEN];
XLogSegNo _logSegNo;
struct stat stat_buf;
FILE *fp = NULL;
StringInfoData labelfbuf;
errno_t errorno = EOK;
gstrace_entry(GS_TRC_ID_do_pg_start_backup);
backup_started_in_recovery = RecoveryInProgress();
check_roach_backup_condition(backup_started_in_recovery, backupidstr);
/*
* Now we need to fetch the checkpoint record location, and also
* its REDO pointer. Different from pg_do_start_backup, in order to
* save IO, this checkpoint is an existing one, rather than a newly-forced full checkpoint.
*/
LWLockAcquire(ControlFileLock, LW_SHARED);
checkpointloc = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
startpoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
LWLockRelease(ControlFileLock);
/* wait for checkpoint record to be copied */
if (!backup_started_in_recovery &&
u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
SyncPaxosWaitForLSN(checkpointloc);
} else {
SyncRepWaitForLSN(checkpointloc);
}
}
XLByteToSeg(startpoint, _logSegNo);
errorno = snprintf_s(xlogfilename, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", t_thrd.xlog_cxt.ThisTimeLineID,
(uint32)((_logSegNo) / XLogSegmentsPerXLogId), (uint32)((_logSegNo) % XLogSegmentsPerXLogId));
securec_check_ss(errorno, "", "");
/* Construct backup label file with roach slot name */
initStringInfo(&labelfbuf);
/* Use the log timezone here, not the session timezone */
stamp_time = (pg_time_t)time(NULL);
pg_strftime(strfbuf, sizeof(strfbuf), "%Y-%m-%d %H:%M:%S %Z", pg_localtime(&stamp_time, log_timezone));
appendStringInfo(&labelfbuf, "START WAL LOCATION: %X/%X (file %s)\n", (uint32)(startpoint >> 32),
(uint32)startpoint, xlogfilename);
appendStringInfo(&labelfbuf, "CHECKPOINT LOCATION: %X/%X\n", (uint32)(checkpointloc >> 32), (uint32)checkpointloc);
appendStringInfo(&labelfbuf, "BACKUP METHOD: %s\n", "pg_start_backup");
appendStringInfo(&labelfbuf, "BACKUP FROM: %s\n", "roach_master");
appendStringInfo(&labelfbuf, "START TIME: %s\n", strfbuf);
appendStringInfo(&labelfbuf, "LABEL: %s\n", backupidstr);
errorno = snprintf_s(strfbuf, sizeof(strfbuf), sizeof(strfbuf) - 1, "%s.%s", BACKUP_LABEL_FILE_ROACH, backupidstr);
securec_check_ss(errorno, "\0", "\0");
if (stat(strfbuf, &stat_buf) != 0) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", strfbuf)));
}
} else {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("a backup is already in progress"),
errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.", strfbuf)));
}
fp = AllocateFile(strfbuf, "w");
if (fp == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", strfbuf)));
}
if (fwrite(labelfbuf.data, labelfbuf.len, 1, fp) != 1 || fflush(fp) != 0 || pg_fsync(fileno(fp)) != 0 ||
ferror(fp) || FreeFile(fp)) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not write file \"%s\": %m", strfbuf)));
}
pfree(labelfbuf.data);
gstrace_exit(GS_TRC_ID_do_pg_start_backup);
return startpoint;
}
/*
* Stop roach backup.
* For primary, get the current insert position.
* For standby, get the current minRecoveryPoint.
* As with roach start backup, we avoid requesting a full checkpoint to save IO.
*/
XLogRecPtr do_roach_stop_backup(const char *backupidstr)
{
bool backup_started_in_recovery = false;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 current_bytepos;
XLogRecPtr stoppoint;
char backup_label[MAXPGPATH];
char backup_label_done[MAXPGPATH];
errno_t errorno = EOK;
struct stat statbuf;
gstrace_entry(GS_TRC_ID_do_pg_stop_backup);
backup_started_in_recovery = RecoveryInProgress();
check_roach_backup_condition(backup_started_in_recovery, backupidstr);
/* check roach backup label file existence. */
errorno = snprintf_s(backup_label, sizeof(backup_label), sizeof(backup_label) - 1, "%s.%s", BACKUP_LABEL_FILE_ROACH,
backupidstr);
securec_check_ss(errorno, "\0", "\0");
if (stat(backup_label, &statbuf)) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", backup_label)));
}
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("roach backup %s is not in progress", backupidstr)));
}
/* get backup stop lsn. For standby backup logic, see comments in do_pg_stop_backup. */
if (backup_started_in_recovery) {
/* Do we need to worry about reverted minRecoveryPoint? */
LWLockAcquire(ControlFileLock, LW_SHARED);
stoppoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
LWLockRelease(ControlFileLock);
} else {
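/*
* On 64-bit x86/ARM a barriered atomic read of CurrBytePos is enough to
* obtain a consistent insert position; other platforms fall back to the
* insertpos_lck spinlock.
*/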
#if defined(__x86_64__) || defined(__aarch64__)
current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
stoppoint = XLogBytePosToEndRecPtr(current_bytepos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
stoppoint = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
SpinLockRelease(&Insert->insertpos_lck);
#endif
}
/* wait for stop lsn xlog to be copied */
if (!backup_started_in_recovery &&
u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
SyncPaxosWaitForLSN(stoppoint);
} else {
SyncRepWaitForLSN(stoppoint);
}
}
/* rename roach backup label for resume backup */
errorno = snprintf_s(backup_label_done, sizeof(backup_label_done), sizeof(backup_label_done) - 1, "%s.%s",
BACKUP_LABEL_FILE_ROACH_DONE, backupidstr);
securec_check_ss(errorno, "\0", "\0");
if (durable_rename(backup_label, backup_label_done, ERROR)) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not rename file \"%s\" to \"%s\": %m", backup_label, backup_label_done)));
}
gstrace_exit(GS_TRC_ID_do_pg_stop_backup);
return stoppoint;
}
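/*
* check_roach_backup_condition
* Common precondition checks for roach start/stop backup: caller privilege,
* no recovery in progress, sufficient wal_level, backup id length, and the
* internally reserved "gs_roach" name prefix.
*/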
void check_roach_backup_condition(bool backup_started_in_recovery, const char *backupidstr)
{
if (!superuser() && !has_rolreplication(GetUserId()) &&
!(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode)) {
ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
errmsg("must be system admin or replication role or "
"operator admin in operation mode to run a roach backup")));
}
/* Currently roach backup cannot be taken during recovery. */
if (backup_started_in_recovery) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("recovery is in progress"),
errhint("roach backup cannot be executed during recovery.")));
}
/*
* During recovery, we don't need to check WAL level. Because, if WAL
* level is not sufficient, it's impossible to get here during recovery.
*/
if (!backup_started_in_recovery && !XLogIsNeeded()) {
ereport(ERROR,
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("WAL level not sufficient for making an online roach backup"),
errhint("wal_level must be set to \"archive\", \"hot_standby\" or \"logical\" at server start.")));
}
/* currently, roach backup id is the same as roach backup replication slot */
if (strlen(backupidstr) >= NAMEDATALEN) {
ereport(ERROR,
(errcode(ERRCODE_NAME_TOO_LONG), errmsg("roach backup id name \"%s\" is too long", backupidstr)));
}
if (strncmp(backupidstr, "gs_roach", strlen("gs_roach")) == 0 &&
strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("Backup id name starting with gs_roach is internally reserverd")));
}
}
/*
* Error cleanup callback for exclusive backup
*/
static void AbortExcluseBackupCallback(int code, Datum arg)
{
if (u_sess->proc_cxt.sessionBackupState != SESSION_BACKUP_EXCLUSIVE) {
return;
}
int32 lastlrc = 0;
/* Update backup counters and forcePageWrites on failure */
StartSuspendWalInsert(&lastlrc);
Assert(t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup);
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup = false;
if (!t_thrd.shemem_ptr_cxt.XLogCtl->Insert.exclusiveBackup &&
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.nonExclusiveBackups == 0) {
t_thrd.shemem_ptr_cxt.XLogCtl->Insert.forcePageWrites = false;
}
u_sess->proc_cxt.sessionBackupState = SESSION_BACKUP_NONE;
StopSuspendWalInsert(lastlrc);
if (strcmp(u_sess->attr.attr_common.application_name, "gs_roach") != 0) {
(void)unlink(BACKUP_LABEL_FILE);
}
}
/*
* Register a handler that will warn about unterminated exclusive backups
* at end of session, unless this has already been done.
*/
void RegisterAbortExclusiveBackup()
{
if (u_sess->proc_cxt.registerExclusiveHandlerdone) {
return;
}
on_shmem_exit(AbortExcluseBackupCallback, BoolGetDatum(true));
u_sess->proc_cxt.registerExclusiveHandlerdone = true;
}
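/*
* enable_delay_xlog_recycle
* Persist an on-disk flag file so that checkpoints stop recycling stale
* xlog segments; the flag survives a crash and is re-examined by
* startupInitDelayXlog at the next startup.
*/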
void enable_delay_xlog_recycle(bool isRedo)
{
FILE *fp = NULL;
ereport(LOG, (errmsg("start delaying xlog recycle so that checkpoint "
"no longer removes stale xlog segments")));
fp = AllocateFile(DELAY_XLOG_RECYCLE_FILE, PG_BINARY_W);
if (fp == NULL || fflush(fp) != 0 || pg_fsync(fileno(fp)) != 0 || ferror(fp) || FreeFile(fp)) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("failed to touch %s file during enable xlog delay: %m", DELAY_XLOG_RECYCLE_FILE)));
}
SetDelayXlogRecycle(true, isRedo);
/*
* acquire and release RowPageReplicationLock to ensure that
* concurrent heap_multi_insert will no longer use page replication after
* we return
*/
LWLockAcquire(RowPageReplicationLock, LW_EXCLUSIVE);
LWLockRelease(RowPageReplicationLock);
}
void disable_delay_xlog_recycle(bool isRedo)
{
ereport(LOG, (errmsg("stop delaying xlog recycle and consequent checkpoint "
"will remove stale xlog segments")));
if (unlink(DELAY_XLOG_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %m. This will lead to residual of "
"stale xlog segments",
DELAY_XLOG_RECYCLE_FILE)));
}
SetDelayXlogRecycle(false, isRedo);
}
void startupInitDelayXlog(void)
{
struct stat st;
if (!lstat(DELAY_XLOG_RECYCLE_FILE, &st)) {
ereport(LOG, (errmsg("xlog delay function was enabled at previous database shutdown")));
/*
* Do not turn on xlog delay function if we are recovering from archive or
* if we are standby. In these cases, our xlog segments will be modified unexpectedly and
* it is meaningless to keep them for backup.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
disable_delay_xlog_recycle(true);
} else {
SetDelayXlogRecycle(true, true);
}
} else if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("failed to stat %s file:%m", DELAY_XLOG_RECYCLE_FILE)));
}
}
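/*
* getLastRewindTime
* Return the first line of the rewind label file as a palloc'd string,
* or an empty string if the file is absent or empty.
*/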
char *getLastRewindTime()
{
char buf[MAXPGPATH] = {0};
char *retStr = NULL;
errno_t ret;
FILE *fd = NULL;
struct stat st;
/* if the file is not there, return */
if (stat(REWIND_LABLE_FILE, &st)) {
if (errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", REWIND_LABLE_FILE)));
}
return "";
}
fd = fopen(REWIND_LABLE_FILE, "r");
if (fd == NULL) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open rewind file: %s\n", gs_strerror(errno))));
}
if (fgets(buf, lengthof(buf), fd) == NULL) {
if (fclose(fd)) {
ereport(LOG, (errcode_for_file_access(),
errmsg("get null bits, could not close file \"%s\": %m", REWIND_LABLE_FILE)));
}
fd = NULL;
return "";
}
retStr = (char *)palloc0(sizeof(char) * (strlen(buf) + 1));
ret = snprintf_s(retStr, strlen(buf) + 1, strlen(buf), "%s", buf);
securec_check_ss(ret, "\0", "\0");
if (fclose(fd)) {
ereport(LOG, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", REWIND_LABLE_FILE)));
}
fd = NULL;
return retStr;
}
XLogRecPtr enable_delay_ddl_recycle(void)
{
DelayDDLRange prevDelayRange = InvalidDelayRange;
DelayDDLRange curDelayRange = InvalidDelayRange;
XLogRecPtr globalDelayStartPtr = InvalidXLogRecPtr;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
bool usePrevDelayFile = false;
if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("could not enable delay ddl when enable_cbm_tracking is off!")));
}
ereport(LOG, (errmsg("start delaying ddl recycle so that column relation files "
"will remain even after they are logically dropped")));
LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);
if (!validate_parse_delay_ddl_file(&prevDelayRange)) {
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %m", DELAY_DDL_RECYCLE_FILE)));
}
SetDDLDelayStartPtr(InvalidXLogRecPtr);
}
globalDelayStartPtr = GetDDLDelayStartPtr();
if (!XLogRecPtrIsInvalid(prevDelayRange.startLSN) && XLByteLT(prevDelayRange.startLSN, prevDelayRange.endLSN)) {
Assert(XLogRecPtrIsInvalid(globalDelayStartPtr));
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %m", DELAY_DDL_RECYCLE_FILE)));
}
SetDDLDelayStartPtr(InvalidXLogRecPtr);
prevDelayRange = InvalidDelayRange;
}
LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);
#if defined(__x86_64__) || defined(__aarch64__)
uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
curDelayRange.startLSN = XLogBytePosToEndRecPtr(current_bytepos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
curDelayRange.startLSN = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
SpinLockRelease(&Insert->insertpos_lck);
#endif
XLogWaitFlush(curDelayRange.startLSN);
advanceXlogPtrToNextPageIfNeeded(&(curDelayRange.startLSN));
/*
* We only care about the case that prevDelayRange.startLSN is smaller
* than curDelayRange.startLSN. Because we hold exclusive wal insert
* lock to get curDelayRange.startLSN, larger prevDelayRange.startLSN
* is meaningless to us.
*/
if (!XLogRecPtrIsInvalid(prevDelayRange.startLSN) && XLByteLE(prevDelayRange.startLSN, curDelayRange.startLSN)) {
if (!XLogRecPtrIsInvalid(globalDelayStartPtr) && XLByteLE(globalDelayStartPtr, prevDelayRange.startLSN)) {
curDelayRange.startLSN = prevDelayRange.startLSN;
usePrevDelayFile = true;
} else {
ereport(WARNING, (errmsg("unexpected global xlogctl delay start lsn %08X/%08X "
"and previous delay start lsn %08X/%08X ",
(uint32)(globalDelayStartPtr >> 32), (uint32)(globalDelayStartPtr),
(uint32)(prevDelayRange.startLSN >> 32), (uint32)(prevDelayRange.startLSN))));
}
}
if (!usePrevDelayFile) {
if (!write_delay_ddl_file(curDelayRange, true))
ereport(ERROR, (errcode_for_file_access(),
errmsg("failed to write %s file during enable delay ddl recycle", DELAY_DDL_RECYCLE_FILE)));
}
SetDDLDelayStartPtr(curDelayRange.startLSN);
LWLockRelease(DelayDDLLock);
ForceTrackCBMOnce(curDelayRange.startLSN, ENABLE_DDL_DELAY_TIMEOUT, true, true);
return curDelayRange.startLSN;
}
/*
* We will grab at most 4 exclusive LW locks at the same time, so the
* sequence is important and deadlock should be avoided.
*
* If isForce is true, please do not pass in a valid barrierLSN.
* In this case, we will ensure the global delay lsn is reset and the
* delay file is unlinked by ignoring possible errors, though
* there may be some residual dropped column relation files
* if we do encounter errors.
*/
void disable_delay_ddl_recycle(XLogRecPtr barrierLSN, bool isForce, XLogRecPtr *startLSN, XLogRecPtr *endLSN)
{
DelayDDLRange delayRange = InvalidDelayRange;
XLogRecPtr globalDelayStartPtr = InvalidXLogRecPtr;
XLogRecPtr forceTrackLSN = InvalidXLogRecPtr;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
bool cbmTrackFailed = false;
if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg(" could not disable delay ddl when enable_cbm_tracking is off!")));
}
*startLSN = InvalidXLogRecPtr;
*endLSN = InvalidXLogRecPtr;
ereport(LOG, (errmsg("stop delaying ddl recycle and clear column relation files "
"that have been dropped during previous delay period. If any error occurs "
"in this progress, there may be residual of physical relation files "
"that need to be removed manually")));
if (isForce && !XLogRecPtrIsInvalid(barrierLSN)) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("valid input lsn is not supported "
"while doing forceful disable delay ddl")));
}
/* hold this lock to prevent concurrent ddl still delay unlinking */
LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);
if (!validate_parse_delay_ddl_file(&delayRange)) {
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %m", DELAY_DDL_RECYCLE_FILE)));
}
SetDDLDelayStartPtr(InvalidXLogRecPtr);
LWLockRelease(DelayDDLLock);
ereport(isForce ? LOG : ERROR,
(errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("failed to validate %s file during disable delay ddl recycle", DELAY_DDL_RECYCLE_FILE)));
return;
}
globalDelayStartPtr = GetDDLDelayStartPtr();
if (!XLogRecPtrIsInvalid(delayRange.endLSN) || !XLByteEQ(globalDelayStartPtr, delayRange.startLSN) ||
(!XLogRecPtrIsInvalid(barrierLSN) && XLByteLT(barrierLSN, delayRange.startLSN))) {
ereport(isForce ? LOG : ERROR,
(errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("inconsistent delay lsn range: global start lsn %08X/%08X, "
"%s file start lsn %08X/%08X end lsn %08X/%08X, "
"barrier lsn %08X/%08X",
(uint32)(globalDelayStartPtr >> 32), (uint32)(globalDelayStartPtr), DELAY_DDL_RECYCLE_FILE,
(uint32)(delayRange.startLSN >> 32), (uint32)(delayRange.startLSN),
(uint32)(delayRange.endLSN >> 32), (uint32)(delayRange.endLSN), (uint32)(barrierLSN >> 32),
(uint32)(barrierLSN))));
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %m", DELAY_DDL_RECYCLE_FILE)));
}
SetDDLDelayStartPtr(InvalidXLogRecPtr);
LWLockRelease(DelayDDLLock);
return;
}
/* hold this lock to push cbm parse exact to the ddl stop position */
LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);
/*
* hold this lock to avoid mis-unlinking column attribute files that
* have been reused.
* At present, only alter table add column for col rels will reuse existing files
* instead of throwing ERROR. If other operations allow such reuse, must
* also hold this lock in advance!
*/
LWLockAcquire(RelfilenodeReuseLock, LW_EXCLUSIVE);
#if defined(__x86_64__) || defined(__aarch64__)
uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
delayRange.endLSN = XLogBytePosToEndRecPtr(current_bytepos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
delayRange.endLSN = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
SpinLockRelease(&Insert->insertpos_lck);
#endif
XLogWaitFlush(delayRange.endLSN);
advanceXlogPtrToNextPageIfNeeded(&(delayRange.endLSN));
/* if there was no xlog record since enable-delay-ddl, just remove the delay file */
if (XLByteEQ(delayRange.startLSN, delayRange.endLSN)) {
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(PANIC, (errcode_for_file_access(), errmsg("could not remove %s file: %m", DELAY_DDL_RECYCLE_FILE)));
}
SetDDLDelayStartPtr(InvalidXLogRecPtr);
LWLockRelease(RelfilenodeReuseLock);
LWLockRelease(CBMParseXlogLock);
LWLockRelease(DelayDDLLock);
*startLSN = delayRange.startLSN;
*endLSN = delayRange.endLSN;
return;
}
if (!write_delay_ddl_file(delayRange, false)) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("failed to write %s file during disable delay ddl recycle", DELAY_DDL_RECYCLE_FILE)));
}
/*
* We do not allow long jump before we finish executing delayed ddl and unlinking the
* delay flag file, though we could ignore an existing finished delay file the next time
* we enable or disable ddl recycle. Instead, we now take a more aggressive approach so
* that startup has a chance to clean up the residual column files.
*/
START_CRIT_SECTION();
SetDDLDelayStartPtr(InvalidXLogRecPtr);
LWLockRelease(DelayDDLLock);
forceTrackLSN = ForceTrackCBMOnce(delayRange.endLSN, DISABLE_DDL_DELAY_TIMEOUT, true, true);
if (XLogRecPtrIsInvalid(forceTrackLSN)) {
ereport(WARNING, (errmsg("failed to push cbm track point to delay end lsn %08X/%08X, ",
(uint32)(delayRange.endLSN >> 32), (uint32)(delayRange.endLSN))));
cbmTrackFailed = true;
}
execDelayedDDL(delayRange.startLSN, delayRange.endLSN, cbmTrackFailed);
END_CRIT_SECTION();
LWLockRelease(RelfilenodeReuseLock);
*startLSN = delayRange.startLSN;
*endLSN = delayRange.endLSN;
}
XLogRecPtr enable_delay_ddl_recycle_with_slot(const char *slotname)
{
XLogRecPtr delay_start_lsn;
XLogRecPtr ret_lsn;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("could not enable delay ddl when enable_cbm_tracking is off!")));
}
/* we are about to start streaming switch over, stop any xlog insert. */
if (t_thrd.xlog_cxt.LocalXLogInsertAllowed == 0 && g_instance.streaming_dr_cxt.isInSwitchover == true) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot enable delay ddl during streaming disaster recovery")));
}
ereport(LOG, (errmsg("start delaying ddl recycle with backup slot %s", slotname)));
LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);
/* hold this lock to push cbm parse exact to the ddl stop position */
LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);
#if defined(__x86_64__) || defined(__aarch64__)
uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
delay_start_lsn = XLogBytePosToEndRecPtr(current_bytepos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
delay_start_lsn = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
SpinLockRelease(&Insert->insertpos_lck);
#endif
XLogWaitFlush(delay_start_lsn);
advanceXlogPtrToNextPageIfNeeded(&delay_start_lsn);
ret_lsn = slot_advance_confirmed_lsn(slotname, delay_start_lsn);
LWLockRelease(DelayDDLLock);
/*
* Currently, we do not check the pushed cbm track location at delay start.
* If we fail to push to the exact point, stop delay will complain and there will be
* residual col files.
* Later, we could retry the set and push.
*/
(void)ForceTrackCBMOnce(ret_lsn, ENABLE_DDL_DELAY_TIMEOUT, true, true);
return ret_lsn;
}
/*
* We will grab at most 4 exclusive LW locks at the same time, so the
* sequence is important and deadlock should be avoided.
*
* If we fail partway through, there may be some residual dropped
* column relation files.
*/
void disable_delay_ddl_recycle_with_slot(const char *slotname, XLogRecPtr *startLSN, XLogRecPtr *endLSN)
{
XLogRecPtr delay_start_lsn = InvalidXLogRecPtr;
XLogRecPtr delay_end_lsn = InvalidXLogRecPtr;
XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
bool need_recycle = true;
if (!u_sess->attr.attr_storage.enable_cbm_tracking) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg(" could not disable delay ddl when enable_cbm_tracking is off!")));
}
/* we are about to start streaming switch over, stop any xlog insert. */
if (t_thrd.xlog_cxt.LocalXLogInsertAllowed == 0 && g_instance.streaming_dr_cxt.isInSwitchover == true) {
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("cannot disable delay ddl during streaming disaster recovery")));
}
ereport(LOG, (errmsg("stop delaying ddl recycle with backup slot %s", slotname)));
LWLockAcquire(DelayDDLLock, LW_EXCLUSIVE);
/* hold this lock to push cbm parse exact to the ddl stop position */
LWLockAcquire(CBMParseXlogLock, LW_EXCLUSIVE);
/*
* hold this lock to avoid mis-unlinking column attribute files that
* have been reused.
* At present, only alter table add column for col rels will reuse existing files
* instead of throwing ERROR. If other operations allow such reuse, must
* also hold this lock in advance!
*/
LWLockAcquire(RelfilenodeReuseLock, LW_EXCLUSIVE);
#if defined(__x86_64__) || defined(__aarch64__)
uint64 current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
delay_end_lsn = XLogBytePosToEndRecPtr(current_bytepos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
delay_end_lsn = XLogBytePosToEndRecPtr(Insert->CurrBytePos);
SpinLockRelease(&Insert->insertpos_lck);
#endif
XLogWaitFlush(delay_end_lsn);
advanceXlogPtrToNextPageIfNeeded(&delay_end_lsn);
need_recycle = slot_reset_confirmed_lsn(slotname, &delay_start_lsn);
/* no need to do recycle if no new xlog written or there are concurrent delay ddl operations */
if (!need_recycle || XLByteEQ(delay_start_lsn, delay_end_lsn)) {
LWLockRelease(RelfilenodeReuseLock);
LWLockRelease(CBMParseXlogLock);
LWLockRelease(DelayDDLLock);
*startLSN = delay_start_lsn;
*endLSN = (need_recycle ? delay_end_lsn : InvalidXLogRecPtr);
return;
}
/* If we fail before removing all delayed col files, there will be residual. */
LWLockRelease(DelayDDLLock);
/*
* Currently not check push result. See comments in enable_delay_ddl_recycle_with_slot.
* There is a very small race condition with enable_delay_ddl_recycle_with_slot for pushing cbm tracked
* position. Currently, we do not address this issue. The worst result is col file residual.
*/
(void)ForceTrackCBMOnce(delay_end_lsn, DISABLE_DDL_DELAY_TIMEOUT, true, true);
execDelayedDDL(delay_start_lsn, delay_end_lsn, true);
LWLockRelease(RelfilenodeReuseLock);
*startLSN = delay_start_lsn;
*endLSN = delay_end_lsn;
}
/*
* If the system previously crashed while disabling delay ddl, we will try
* to finish that during startup.
* However, if cbm has already tracked far beyond the delay end point, then we
* can do nothing except leave residual files of dropped col tables. The chance
* is low and the result is safe, anyway.
*/
void startupInitDelayDDL(void)
{
DelayDDLRange delayRange = InvalidDelayRange;
if (!validate_parse_delay_ddl_file(&delayRange) || !u_sess->attr.attr_storage.enable_cbm_tracking) {
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove %s file: %m", DELAY_DDL_RECYCLE_FILE)));
}
} else {
Assert(!XLogRecPtrIsInvalid(delayRange.startLSN));
/*
* Do not turn on ddl delay function if we are recovering from archive or
* if we are standby. In these cases, our xlog segments and data pages
* might be modified unexpectedly and it is meaningless to keep them for backup.
*/
if (t_thrd.xlog_cxt.ArchiveRestoreRequested) {
ereport(LOG, (errmsg("Have to turn off ddl delay (start lsn %08X/%08X "
"to end lsn %08X/%08X) before entering archive recovery "
"or standby mode. There may be residual of column data files. "
"Please refer to previous log to clean them manually if necessary.",
(uint32)(delayRange.startLSN >> 32), (uint32)(delayRange.startLSN),
(uint32)(delayRange.endLSN >> 32), (uint32)(delayRange.endLSN))));
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR,
(errcode_for_file_access(), errmsg("could not remove %s file: %m", DELAY_DDL_RECYCLE_FILE)));
}
} else if (XLogRecPtrIsInvalid(delayRange.endLSN)) {
ereport(LOG, (errmsg("ddl delay was enabled last time from start lsn %08X/%08X",
(uint32)(delayRange.startLSN >> 32), (uint32)(delayRange.startLSN))));
SetDDLDelayStartPtr(delayRange.startLSN);
} else {
ForceTrackCBMOnce(delayRange.endLSN, 0, false, false);
execDelayedDDL(delayRange.startLSN, delayRange.endLSN, true);
}
}
}
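/*
* validate_parse_delay_ddl_file
* Read the delay-ddl flag file into *delayRange. Return true only if the
* file exists, is fully readable and holds a plausible range (valid start
* lsn, with the end lsn either invalid or beyond the start); otherwise
* return false so that callers can zap the file.
*/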
bool validate_parse_delay_ddl_file(DelayDDLRange *delayRange)
{
struct stat st;
FILE *fp = NULL;
DelayDDLRange tmpRange = InvalidDelayRange;
bool result = false;
errno_t err;
if (stat(DELAY_DDL_RECYCLE_FILE, &st) == -1 && errno == ENOENT) {
ereport(DEBUG5, (errmsg("%s file does not exist", DELAY_DDL_RECYCLE_FILE)));
return result;
}
fp = AllocateFile(DELAY_DDL_RECYCLE_FILE, PG_BINARY_R);
if (SECUREC_UNLIKELY(fp == NULL)) {
ereport(LOG,
(errcode_for_file_access(), errmsg("could not open existing file \"%s\": %m", DELAY_DDL_RECYCLE_FILE)));
return result;
}
if (fread(&tmpRange, 1, sizeof(DelayDDLRange), fp) != sizeof(DelayDDLRange)) {
ereport(WARNING, (errmsg("%s file is corrupted. We will zap it anyway", DELAY_DDL_RECYCLE_FILE)));
goto END;
}
if (!XLogRecPtrIsInvalid(tmpRange.startLSN) &&
(XLogRecPtrIsInvalid(tmpRange.endLSN) || XLByteLT(tmpRange.startLSN, tmpRange.endLSN))) {
ereport(LOG, (errmsg("find existing %s file (start lsn %08X/%08X, end lsn %08X/%08X)", DELAY_DDL_RECYCLE_FILE,
(uint32)(tmpRange.startLSN >> 32), (uint32)(tmpRange.startLSN),
(uint32)(tmpRange.endLSN >> 32), (uint32)(tmpRange.endLSN))));
} else {
ereport(WARNING, (errmsg("%s file has unexpected start lsn %08X/%08X and end lsn "
"%08X/%08X. While there may be residual of dropped column files, "
"We will zap it anyway",
DELAY_DDL_RECYCLE_FILE, (uint32)(tmpRange.startLSN >> 32), (uint32)(tmpRange.startLSN),
(uint32)(tmpRange.endLSN >> 32), (uint32)(tmpRange.endLSN))));
goto END;
}
err = memcpy_s(delayRange, sizeof(DelayDDLRange), &tmpRange, sizeof(DelayDDLRange));
securec_check(err, "\0", "\0");
result = true;
END:
if (FreeFile(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", DELAY_DDL_RECYCLE_FILE)));
}
fp = NULL;
return result;
}
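/*
* write_delay_ddl_file
* Persist the given delay range to the delay-ddl flag file and fsync it.
* On write failure, remove the file if onErrDelete is true (the enable
* path); otherwise PANIC (the disable path).
*/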
bool write_delay_ddl_file(const DelayDDLRange &delayRange, bool onErrDelete)
{
FILE *fp = AllocateFile(DELAY_DDL_RECYCLE_FILE, PG_BINARY_W);
if (fp == NULL) {
ereport(LOG, (errcode_for_file_access(),
errmsg("could not open file \"%s\" for write: %m", DELAY_DDL_RECYCLE_FILE)));
return false;
}
if (fwrite(&delayRange, 1, sizeof(DelayDDLRange), fp) != sizeof(DelayDDLRange) || fflush(fp) != 0 ||
pg_fsync(fileno(fp)) != 0 || ferror(fp) || FreeFile(fp)) {
ereport(onErrDelete ? LOG : PANIC,
(errcode_for_file_access(), errmsg("failed to write file \"%s\": %m", DELAY_DDL_RECYCLE_FILE)));
/*
* PANIC may be unnecessary here, because if we failed to write the delay file while
* enabling delay and left the xlogctl delay start lsn unset, we would fix it the next
* time we try to enable delay. To be safe, at present, we use PANIC.
*/
if (onErrDelete && unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not remove %s file after write failure: %m", DELAY_DDL_RECYCLE_FILE)));
}
return false;
}
return true;
}
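/*
* execDelayedDDL
* Carry out the unlinks that were postponed during the delay window:
* merge cbm tracking information for [startLSN, endLSN], remove the delay
* flag file first, then unlink column relation files that were dropped
* (and not re-created) inside the window.
*/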
void execDelayedDDL(XLogRecPtr startLSN, XLogRecPtr endLSN, bool ignoreCBMErr)
{
CBMArray *cbmArray = NULL;
MemoryContext currentContext = CurrentMemoryContext;
MemoryContext cbmParseContext;
bool cbmParseFailed = false;
long i;
int saveInterruptHoldoffCount;
cbmParseContext = AllocSetContextCreate(CurrentMemoryContext, "cbm_parse_context", ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE);
LWLockAcquire(CBMParseXlogLock, LW_SHARED);
saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
PG_TRY();
{
(void)MemoryContextSwitchTo(cbmParseContext);
cbmArray = CBMGetMergedArray(startLSN, endLSN);
}
PG_CATCH();
{
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
(void)MemoryContextSwitchTo(currentContext);
ErrorData *edata = CopyErrorData();
FlushErrorState();
MemoryContextDelete(cbmParseContext);
cbmParseContext = NULL;
if (!ignoreCBMErr) {
LWLockRelease(CBMParseXlogLock);
}
if (ignoreCBMErr) {
ereport(LOG, (errmsg("Failed to get cbm information during execute delayed DDL: %s", edata->message)));
} else {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("Failed to get cbm information during execute delayed DDL: %s", edata->message)));
}
cbmParseFailed = true;
}
PG_END_TRY();
LWLockRelease(CBMParseXlogLock);
/*
* step1. remove delay file
* step2. remove dropped column file
*
* We use this sequence to avoid unlinking newly created relations that
* reuse the relfilenodes.
* If we fail in step2, there will be residual
* column files.
*
* If the caller told us to ignore cbm parse failure, just return after
* unlinking the delay file.
*/
if (unlink(DELAY_DDL_RECYCLE_FILE) < 0 && errno != ENOENT) {
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not remove %s file before unlink col relation files: %m", DELAY_DDL_RECYCLE_FILE)));
}
if (cbmParseFailed) {
return;
}
if (!XLByteEQ(cbmArray->startLSN, startLSN) || !XLByteEQ(cbmArray->endLSN, endLSN)) {
ereport(WARNING, (errmsg("delay start-end %08X/%08X-%08X/%08X is inconsistent with cbm tracking "
"result start-end %08X/%08X-%08X/%08X. To be safe, while there may be residual "
"of dropped column files, We will leave them anyway",
(uint32)(startLSN >> 32), (uint32)(startLSN), (uint32)(endLSN >> 32), (uint32)(endLSN),
(uint32)(cbmArray->startLSN >> 32), (uint32)(cbmArray->startLSN),
(uint32)(cbmArray->endLSN >> 32), (uint32)(cbmArray->endLSN))));
} else {
for (i = 0; i < cbmArray->arrayLength; i++) {
/*
* For row relations, a relfilenode cannot be reused if the physical file has not
* been unlinked. However, for column relations, add column can reuse
* the file that a previously crashed add-column-xact left behind. Such operations
* must hold RelfilenodeReuseLock lock to avoid concurrency with execDelayedDDL.
*/
if ((cbmArray->arrayEntry[i].changeType & PAGETYPE_DROP) &&
!(cbmArray->arrayEntry[i].changeType & PAGETYPE_CREATE) && !(cbmArray->arrayEntry[i].totalBlockNum) &&
IsValidColForkNum(cbmArray->arrayEntry[i].cbmTag.forkNum)) {
CStore::UnlinkColDataFile(cbmArray->arrayEntry[i].cbmTag.rNode,
ColForkNum2ColumnId(cbmArray->arrayEntry[i].cbmTag.forkNum), false);
}
}
}
FreeCBMArray(cbmArray);
cbmArray = NULL;
(void)MemoryContextSwitchTo(currentContext);
MemoryContextDelete(cbmParseContext);
}
void RedoSpeedDiag(XLogRecPtr readPtr, XLogRecPtr endPtr)
{
if (g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr == 0) {
g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr = endPtr;
INSTR_TIME_SET_CURRENT(g_instance.comm_cxt.predo_cxt.redoPf.preTime);
}
uint64 redoBytes = (uint64)(endPtr - g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr);
instr_time newTime;
INSTR_TIME_SET_CURRENT(newTime);
instr_time tmpTime = newTime;
INSTR_TIME_SUBTRACT(tmpTime, g_instance.comm_cxt.predo_cxt.redoPf.preTime);
uint64 totalTime = INSTR_TIME_GET_MICROSEC(tmpTime);
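/*
* Illustrative arithmetic (assuming US_TRANSFER_TO_S is 1,000,000 and
* BYTES_TRANSFER_KBYTES is 1024, as the macro names suggest): replaying
* 16MB of WAL in 2s of wall time gives
* speed_raw = 16777216 * 1000000 / 2000000 = 8388608 B/s, i.e. speed = 8192 KB/s.
*/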
if (redoBytes >= REDO_SPEED_LOG_LEN || (redoBytes > 0 && totalTime >= MAX_OUT_INTERVAL)) {
uint64 speed = 0; /* KB/s */
uint64 speed_raw = 0; /* B/s */
if (totalTime > 0) {
speed_raw = redoBytes * US_TRANSFER_TO_S / totalTime;
speed = speed_raw / BYTES_TRANSFER_KBYTES;
}
/* 32 bits is enough: redo speed cannot currently exceed 40GB/s */
g_instance.comm_cxt.predo_cxt.redoPf.speed_according_seg = static_cast<uint32>(speed_raw);
if (g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr == 0 || module_logging_is_on(MOD_REDO)) {
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("RedoSpeedDiag: %lu us, redoBytes:%lu,"
"preEndPtr:%lu, readPtr:%lu, endPtr:%lu, speed:%lu KB/s, totalTime:%lu",
INSTR_TIME_GET_MICROSEC(tmpTime), redoBytes,
g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr, readPtr, endPtr, speed, totalTime)));
}
redo_refresh_stats(speed_raw);
g_instance.comm_cxt.predo_cxt.redoPf.preEndPtr = endPtr;
g_instance.comm_cxt.predo_cxt.redoPf.preTime = newTime;
}
}
void set_walrcv_reply_dueto_commit(bool need_reply)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
xlogctl->walrcv_reply_dueto_commit = need_reply;
}
bool get_walrcv_reply_dueto_commit(void)
{
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
return xlogctl->walrcv_reply_dueto_commit;
}
/*
* Get latest redo apply position.
*
* Optionally, returns the current recovery target timeline. Callers not
* interested in that may pass NULL for targetTLI.
*
* Exported to allow WALReceiver to read the pointer directly.
*/
XLogRecPtr GetXLogReplayRecPtr(TimeLineID *targetTLI, XLogRecPtr *ReplayReadPtr)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr recptr;
SpinLockAcquire(&xlogctl->info_lck);
recptr = xlogctl->lastReplayedEndRecPtr;
if (targetTLI != NULL) {
*targetTLI = xlogctl->RecoveryTargetTLI;
}
if (ReplayReadPtr != NULL) {
*ReplayReadPtr = xlogctl->lastReplayedReadRecPtr;
}
SpinLockRelease(&xlogctl->info_lck);
return recptr;
}
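/*
* SetXLogReplayRecPtr
* Advance the last-replayed read/end positions under info_lck; positions
* only ever move forward. On DCF builds, also refresh the election
* priority with the new replay location.
*/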
void SetXLogReplayRecPtr(XLogRecPtr readRecPtr, XLogRecPtr endRecPtr)
{
bool isUpdated = false;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
if (XLByteLT(xlogctl->lastReplayedReadRecPtr, endRecPtr)) {
xlogctl->lastReplayedReadRecPtr = readRecPtr;
xlogctl->lastReplayedEndRecPtr = endRecPtr;
g_instance.comm_cxt.predo_cxt.redoPf.last_replayed_end_ptr = endRecPtr;
isUpdated = true;
}
SpinLockRelease(&xlogctl->info_lck);
if (isUpdated) {
RedoSpeedDiag(readRecPtr, endRecPtr);
}
update_dirty_page_queue_rec_lsn(readRecPtr);
#ifndef ENABLE_MULTIPLE_NODES
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
int ret = dcf_set_election_priority(1, endRecPtr);
if (ret != 0) {
ereport(WARNING, (errmsg("In dcf mode, could not update current replay location:%08X/%08X",
(uint32)(endRecPtr >> 32), (uint32)(endRecPtr))));
}
}
#endif
}
void DumpXlogCtl()
{
ereport(LOG,
(errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[REDO_LOG_TRACE]txn : lastReplayedReadRecPtr %lu, lastReplayedEndRecPtr %lu,"
" replayEndRecPtr %lu, lastCheckPointRecPtr %lu, lastCheckPoint.redo %lu",
t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedReadRecPtr,
t_thrd.shemem_ptr_cxt.XLogCtl->lastReplayedEndRecPtr,
t_thrd.shemem_ptr_cxt.XLogCtl->replayEndRecPtr, t_thrd.shemem_ptr_cxt.XLogCtl->lastCheckPointRecPtr,
t_thrd.shemem_ptr_cxt.XLogCtl->lastCheckPoint.redo)));
}
/*
* This function is used by the Cluster Manager (CM). When recovery is done,
* it returns the current replay position; while recovery is still in progress,
* it returns InvalidXLogRecPtr, so that CM will not use the replay LSN to arbitrate.
*/
XLogRecPtr GetXLogReplayRecPtrInPending(void)
{
HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
if (PENDING_MODE == hashmdata->current_mode) {
bool localrecoverydone = false;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
localrecoverydone = xlogctl->IsRecoveryDone;
SpinLockRelease(&xlogctl->info_lck);
if (localrecoverydone == false) {
return InvalidXLogRecPtr;
}
}
return GetXLogReplayRecPtr(NULL);
}
/*
* Get current standby flush position, ie, the last WAL position
* known to be fsync'd to disk in standby.
*
* If 'targetTLI' is not NULL, it's set to the current recovery target
* timeline.
*/
XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *targetTLI)
{
XLogRecPtr receivePtr;
XLogRecPtr replayPtr;
receivePtr = GetWalRcvWriteRecPtr(NULL);
replayPtr = GetXLogReplayRecPtr(targetTLI);
if (XLByteLT(receivePtr, replayPtr)) {
return replayPtr;
} else {
return receivePtr;
}
}
/*
* Get latest WAL insert pointer
*/
XLogRecPtr GetXLogInsertRecPtr(void)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 current_bytepos;
#if defined(__x86_64__) || defined(__aarch64__)
current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
current_bytepos = Insert->CurrBytePos;
SpinLockRelease(&Insert->insertpos_lck);
#endif
return XLogBytePosToRecPtr(current_bytepos);
}
/*
* Get latest WAL insert pointer.
* Like GetXLogInsertRecPtr, but if the position is at a page boundary,
* returns a pointer to the beginning of the page (i.e. before the page header),
* not to where the first xlog record on that page would go.
*/
XLogRecPtr GetXLogInsertEndRecPtr(void)
{
volatile XLogCtlInsert *Insert = &t_thrd.shemem_ptr_cxt.XLogCtl->Insert;
uint64 current_bytepos;
#if defined(__x86_64__) || defined(__aarch64__)
current_bytepos = pg_atomic_barrier_read_u64((uint64 *)&Insert->CurrBytePos);
#else
SpinLockAcquire(&Insert->insertpos_lck);
current_bytepos = Insert->CurrBytePos;
SpinLockRelease(&Insert->insertpos_lck);
#endif
return XLogBytePosToEndRecPtr(current_bytepos);
}
/*
* Get latest WAL write pointer
*/
XLogRecPtr GetXLogWriteRecPtr(void)
{
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtResult = xlogctl->LogwrtResult;
SpinLockRelease(&xlogctl->info_lck);
}
return t_thrd.xlog_cxt.LogwrtResult->Write;
}
#ifndef ENABLE_MULTIPLE_NODES
/*
* Get latest paxos write pointer
*/
XLogRecPtr GetPaxosWriteRecPtr(void)
{
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtPaxos = xlogctl->LogwrtPaxos;
SpinLockRelease(&xlogctl->info_lck);
}
return t_thrd.xlog_cxt.LogwrtPaxos->Write;
}
/*
* Get latest paxos commit pointer
*/
XLogRecPtr GetPaxosConsensusRecPtr(void)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
*t_thrd.xlog_cxt.LogwrtPaxos = xlogctl->LogwrtPaxos;
SpinLockRelease(&xlogctl->info_lck);
return t_thrd.xlog_cxt.LogwrtPaxos->Consensus;
}
#endif
/*
* read_backup_label: check to see if a backup_label file is present
*
* If we see a backup_label during recovery, we assume that we are recovering
* from a backup dump file, and we therefore roll forward from the checkpoint
* identified by the label file, NOT what pg_control says. This avoids the
* problem that pg_control might have been archived one or more checkpoints
* later than the start of the dump, and so if we rely on it as the start
* point, we will fail to restore a consistent database state.
*
* Returns TRUE if a backup_label was found (and fills the checkpoint
* location and its REDO location into *checkPointLoc and RedoStartLSN,
* respectively); returns FALSE if not. If this backup_label came from a
* streamed backup, *backupEndRequired is set to TRUE. If this backup_label
* was created during recovery, *backupFromStandby is set to TRUE. If this backup_label
* was created during primary mode by gs_roach, *backupFromRoach is set to TRUE.
*/
static bool read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired, bool *backupFromStandby,
bool *backupFromRoach)
{
char startxlogfilename[MAXFNAMELEN];
TimeLineID tli;
FILE *lfp = NULL;
char ch;
char backuptype[20];
char backupfrom[20];
uint32 hi, lo;
*backupEndRequired = false;
*backupFromStandby = false;
*backupFromRoach = false;
/*
* See if label file is present
*/
lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
if (lfp == NULL) {
if (errno != ENOENT) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE)));
}
return false; /* it's not there, all is fine */
}
/*
* Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
* is pretty crude, but we are not expecting any variability in the file
* format).
*/
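/*
* For illustration (hypothetical values), the two parsed lines typically
* look like:
*   START WAL LOCATION: 0/5000028 (file 000000010000000000000005)
*   CHECKPOINT LOCATION: 0/5000060
*/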
if (fscanf_s(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c", &hi, &lo, &tli, startxlogfilename,
sizeof(startxlogfilename), &ch, 1) != 5 ||
ch != '\n') {
ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
t_thrd.xlog_cxt.RedoStartLSN = (((uint64)hi) << 32) | lo;
g_instance.comm_cxt.predo_cxt.redoPf.redo_start_ptr = t_thrd.xlog_cxt.RedoStartLSN;
if (fscanf_s(lfp, "CHECKPOINT LOCATION: %X/%X%c", &hi, &lo, &ch, 1) != 3 || ch != '\n') {
ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
}
*checkPointLoc = (((uint64)hi) << 32) | lo;
/*
* BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
* from an older backup anyway, but since the information on it is not
* strictly required, don't error out if it's missing for some reason.
*/
if (fscanf_s(lfp, "BACKUP METHOD: %19s\n", backuptype, sizeof(backuptype)) == 1) {
if (strcmp(backuptype, "streamed") == 0) {
*backupEndRequired = true;
}
}
if (fscanf_s(lfp, "BACKUP FROM: %19s\n", backupfrom, sizeof(backupfrom)) == 1) {
if (strcmp(backupfrom, "standby") == 0) {
*backupFromStandby = true;
} else if (strcmp(backupfrom, "roach_master") == 0) {
*backupFromRoach = true;
}
}
if (ferror(lfp) || FreeFile(lfp)) {
ereport(FATAL, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", BACKUP_LABEL_FILE)));
}
return true;
}
/*
* read_tablespace_map: check to see if a tablespace_map file is present
*
* If we see a tablespace_map file during recovery, we assume that we are
* recovering from a backup dump file, and we therefore need to create symlinks
* as per the information present in tablespace_map file.
*
* Returns TRUE if a tablespace_map file was found (and fills the link
* information for all the tablespace links present in file); returns FALSE
* if not.
*/
static bool read_tablespace_map(List **tablespaces)
{
tablespaceinfo *ti;
FILE *lfp;
char tbsoid[MAXPGPATH];
char *tbslinkpath;
char str[MAXPGPATH];
int ch, prevCh = -1, i = 0, n;
/*
* See if tablespace_map file is present
*/
lfp = AllocateFile(TABLESPACE_MAP, "r");
if (!lfp) {
if (errno != ENOENT)
ereport(FATAL, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", TABLESPACE_MAP)));
return false; /* it's not there, all is fine */
}
/*
* Read and parse the link name and path lines from tablespace_map file
* (this code is pretty crude, but we are not expecting any variability in
* the file format). While taking backup we embed escape character '\\'
* before newline in tablespace path, so that during reading of
* tablespace_map file, we could distinguish newline in tablespace path
* and end of line. Now while reading tablespace_map file, remove the
* escape character that has been added in tablespace path during backup.
*/
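/*
* For illustration (hypothetical values), each parsed line looks like:
*   16385 /data/tblspc1
* with any literal newline inside the path preceded by a '\\' escape.
*/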
while ((ch = fgetc(lfp)) != EOF) {
if ((ch == '\n' || ch == '\r') && prevCh != '\\') {
str[i] = '\0';
if (sscanf_s(str, "%s %n", tbsoid, MAXPGPATH, &n) != 1)
ereport(FATAL, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
tbslinkpath = str + n;
i = 0;
ti = (tablespaceinfo *)palloc(sizeof(tablespaceinfo));
ti->oid = pstrdup(tbsoid);
ti->path = pstrdup(tbslinkpath);
*tablespaces = lappend(*tablespaces, ti);
continue;
} else if ((ch == '\n' || ch == '\r') && prevCh == '\\')
str[i - 1] = ch;
else
str[i++] = ch;
prevCh = ch;
}
if (ferror(lfp) || FreeFile(lfp))
ereport(FATAL, (errcode_for_file_access(), errmsg("could not read file \"%s\": %m", TABLESPACE_MAP)));
return true;
}
/*
* Error context callback for errors occurring during rm_redo().
*/
void rm_redo_error_callback(void *arg)
{
XLogReaderState *record = (XLogReaderState *)arg;
StringInfoData buf;
initStringInfo(&buf);
RmgrTable[XLogRecGetRmid(record)].rm_desc(&buf, record);
errcontext("xlog redo %s", buf.data);
pfree_ext(buf.data);
}
/*
* BackupInProgress: check if online backup mode is active
*
* This is done by checking for existence of the "backup_label" file.
*/
bool BackupInProgress(void)
{
struct stat stat_buf;
return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
}
/*
* CancelBackup: rename the "backup_label" file to cancel backup mode
*
* If the "backup_label" file exists, it will be renamed to "backup_label.old".
* Note that this will render an online backup in progress useless.
* To correctly finish an online backup, pg_stop_backup must be called.
*/
void CancelBackup(void)
{
struct stat stat_buf;
/* if the file is not there, return */
if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0) {
return;
}
/* remove leftover file from previously canceled backup if it exists */
unlink(BACKUP_LABEL_OLD);
if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) == 0) {
ereport(LOG, (errmsg("online backup mode canceled"),
errdetail("\"%s\" was renamed to \"%s\".", BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
return;
}
/* if the tablespace_map file is not there, return */
if (stat(TABLESPACE_MAP, &stat_buf) < 0) {
ereport(LOG, (errmsg("online backup mode canceled"),
errdetail("File \"%s\" was renamed to \"%s\".", BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
return;
}
/* remove leftover file from previously canceled backup if it exists */
unlink(TABLESPACE_MAP_OLD);
if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0) {
ereport(LOG, (errmsg("online backup mode canceled"),
errdetail("Files \"%s\" and \"%s\" were renamed to "
"\"%s\" and \"%s\", respectively.",
BACKUP_LABEL_FILE, TABLESPACE_MAP, BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
} else {
ereport(WARNING, (errcode_for_file_access(), errmsg("online backup mode was not canceled"),
errdetail("Could not rename \"%s\" to \"%s\": %m.", BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
}
}
static bool IsRedoDonePromoting(void)
{
bool result = false;
XLogRecPtr redoPos = InvalidXLogRecPtr;
XLogRecPtr receivePos = InvalidXLogRecPtr;
if (CheckForFailoverTrigger()) {
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
SpinLockAcquire(&walrcv->mutex);
redoPos = walrcv->receiver_replay_location;
receivePos = walrcv->receivedUpto;
SpinLockRelease(&walrcv->mutex);
result = XLByteEQ(redoPos, receivePos);
}
return result;
}
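/*
* XLogReadFromWriteBuffer
* Read up to reqLen bytes of WAL starting at targetStartPtr into readBuf,
* opening and seeking the underlying segment file as needed; the length
* actually read (clamped at the segment boundary) is returned through
* *rereadlen.
*/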
bool XLogReadFromWriteBuffer(XLogRecPtr targetStartPtr, int reqLen, char *readBuf, uint32 *rereadlen)
{
uint32 startoff = targetStartPtr % XLogSegSize;
if (t_thrd.xlog_cxt.readFile < 0 || !XLByteInSeg(targetStartPtr, t_thrd.xlog_cxt.readSegNo)) {
char path[MAXPGPATH];
if (t_thrd.xlog_cxt.readFile >= 0) {
(void)close(t_thrd.xlog_cxt.readFile);
}
XLByteToSeg(targetStartPtr, t_thrd.xlog_cxt.readSegNo);
XLogFilePath(path, MAXPGPATH, t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo);
t_thrd.xlog_cxt.readFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
if (t_thrd.xlog_cxt.readFile < 0) {
/*
* If the file is not found, assume it's because the standby
* asked for a too old WAL segment that has already been
* removed or recycled.
*/
if (FILE_POSSIBLY_DELETED(errno)) {
ereport(ERROR, (errcode_for_file_access(),
errmsg("requested WAL segment %s has already been removed",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo))));
} else {
ereport(ERROR, (errcode_for_file_access(),
errmsg("could not open file \"%s\" (log segment %s): %m", path,
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo))));
}
}
t_thrd.xlog_cxt.readOff = 0;
}
if (startoff != t_thrd.xlog_cxt.readOff) {
if (lseek(t_thrd.xlog_cxt.readFile, (off_t)startoff, SEEK_SET) < 0) {
(void)close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
ereport(ERROR, (errcode_for_file_access(), errmsg("could not seek in log segment %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), startoff)));
}
t_thrd.xlog_cxt.readOff = startoff;
}
if (reqLen > (int)(XLogSegSize - startoff)) {
reqLen = (int)(XLogSegSize - startoff);
}
int readbytes = read(t_thrd.xlog_cxt.readFile, readBuf, reqLen);
if (readbytes <= 0) {
(void)close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not read from log segment %s, offset %u, length %lu: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff, INT2ULONG(reqLen))));
}
t_thrd.xlog_cxt.readOff += reqLen;
*rereadlen = reqLen;
return true;
}
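/*
 * NewDataIsInBuf: check whether WAL up to expectedRecPtr has already been
 * received. If the cached receivedUpto is not far enough, refresh it from
 * the walreceiver and, when we have fully caught up with the latest flushed
 * chunk, advance XLogReceiptTime.
 */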
bool NewDataIsInBuf(XLogRecPtr expectedRecPtr)
{
bool havedata = false;
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
} else {
XLogRecPtr latestChunkStart;
t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
if (!XLByteLT(expectedRecPtr, latestChunkStart)) {
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
SetCurrentChunkStartTime(t_thrd.xlog_cxt.XLogReceiptTime);
}
} else {
havedata = false;
}
}
return havedata;
}
static ReplConnTarget GetRepConntarget(void)
{
if (t_thrd.xlog_cxt.is_cascade_standby) {
return REPCONNTARGET_STANDBY;
} else {
return REPCONNTARGET_PRIMARY;
}
}
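/*
 * HandleCascadeStandbyPromote: promote a cascade standby to a normal standby.
 * Shuts down the walreceiver, clears the received position, drops the
 * cascade-standby flag (becoming a hadr main standby in cross-region
 * deployments), resets any switchover/failover triggers, and restarts
 * streaming from the upstream node at *recptr.
 */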
void HandleCascadeStandbyPromote(XLogRecPtr *recptr)
{
if (!t_thrd.xlog_cxt.is_cascade_standby || t_thrd.xlog_cxt.server_mode != STANDBY_MODE ||
!IS_DN_MULTI_STANDYS_MODE()) {
return;
}
ShutdownWalRcv();
/* switchover */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
t_thrd.xlog_cxt.is_cascade_standby = false;
if (t_thrd.postmaster_cxt.HaShmData->is_cross_region) {
t_thrd.xlog_cxt.is_hadr_main_standby = true;
SpinLockAcquire(&t_thrd.postmaster_cxt.HaShmData->mutex);
t_thrd.postmaster_cxt.HaShmData->is_cascade_standby = false;
t_thrd.postmaster_cxt.HaShmData->is_hadr_main_standby = true;
SpinLockRelease(&t_thrd.postmaster_cxt.HaShmData->mutex);
}
t_thrd.xlog_cxt.switchover_triggered = false;
/* failover */
ResetFailoverTriggered();
t_thrd.xlog_cxt.failover_triggered = false;
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL);
/* request postmaster to start walreceiver again. */
RequestXLogStreaming(recptr, t_thrd.xlog_cxt.PrimaryConnInfo, GetRepConntarget(),
u_sess->attr.attr_storage.PrimarySlotName);
}
void CheckMaxPageFlushLSN(XLogRecPtr reqLsn)
{
/*
* If recovery is done and force finish is enabled, update max_page_flush_lsn
* so that force finish really starts to work from now on.
*/
if (FORCE_FINISH_ENABLED && t_thrd.xlog_cxt.max_page_flush_lsn == MAX_XLOG_REC_PTR) {
t_thrd.xlog_cxt.max_page_flush_lsn = reqLsn;
ereport(LOG, (errmsg("update max_page_flush_lsn to %X/%X for standby before request streaming",
(uint32)(t_thrd.xlog_cxt.max_page_flush_lsn >> 32),
(uint32)(t_thrd.xlog_cxt.max_page_flush_lsn))));
update_max_page_flush_lsn(reqLsn, t_thrd.proc_cxt.MyProcPid, true);
set_global_max_page_flush_lsn(reqLsn);
}
}
/*
* For single-instance cluster, non-pitr roach restore does not have restore target end point and
* recovery.conf file is renamed to indicate restore end when local xlog files have all been replayed.
*/
void rename_recovery_conf_for_roach() {
if (g_instance.role == VSINGLENODE &&
g_instance.roach_cxt.isRoachRestore &&
t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_UNSET &&
t_thrd.xlog_cxt.isRoachSingleReplayDone == false) {
ereport(LOG, (errmsg("rename recovery.conf to recovery.done for roach")));
durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);
t_thrd.xlog_cxt.isRoachSingleReplayDone = true;
}
}
/*
* Read the XLOG page containing RecPtr into readBuf (if not read already).
* Returns the number of bytes read if the page is read successfully, or -1
* in case of errors. When errors occur, they are ereport'ed, but only
* if they have not been previously reported.
*
* This is responsible for restoring files from archive as needed, as well
* as for waiting for the requested WAL record to arrive in standby mode.
*
* 'emode' specifies the log level used for reporting "file not found" or
* "end of WAL" situations in archive recovery, or in standby mode when a
* trigger file is found. If set to WARNING or below, XLogPageRead() returns
* -1 in those situations; on higher log levels the ereport() won't
* return.
*
* In standby mode, if after a successful return of XLogPageRead() the
* caller finds the record it's interested in to be broken, it should
* ereport the error with the level determined by
* emode_for_corrupt_record(), and then set "failedSources |= readSource"
* and call XLogPageRead() again with the same arguments. This lets
* XLogPageRead() try fetching the record from another source, or
* sleep and retry.
*
* In standby mode, this only returns -1 if promotion has been triggered.
* Otherwise it keeps sleeping and retrying indefinitely.
*/
int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr,
char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
/* Load reader private data */
XLogPageReadPrivate *readprivate = (XLogPageReadPrivate *)xlogreader->private_data;
int emode = readprivate->emode;
bool randAccess = readprivate->randAccess;
bool fetching_ckpt = readprivate->fetching_ckpt;
uint32 targetPageOff;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr RecPtr = targetPagePtr;
bool processtrxn = false;
XLogSegNo replayedSegNo;
uint32 ret;
#ifdef USE_ASSERT_CHECKING
XLogSegNo targetSegNo;
XLByteToSeg(targetPagePtr, targetSegNo);
#endif
targetPageOff = targetPagePtr % XLogSegSize;
/*
* See if we need to switch to a new segment because the requested record
* is not in the currently open one.
*/
if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) {
/*
* Request a restartpoint if we've replayed too much xlog since the
* last one.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested &&
(t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) && !dummyStandbyMode) {
if (get_real_recovery_parallelism() > 1) {
XLByteToSeg(GetXLogReplayRecPtr(NULL), replayedSegNo);
} else {
replayedSegNo = t_thrd.xlog_cxt.readSegNo;
}
if (XLogCheckpointNeeded(replayedSegNo)) {
(void)GetRedoRecPtr();
if (XLogCheckpointNeeded(replayedSegNo)) {
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
}
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readSource = 0;
}
XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo);
XLByteAdvance(RecPtr, reqLen);
retry:
/* See if we need to retrieve more data */
if (t_thrd.xlog_cxt.readFile < 0 ||
(t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM && XLByteLT(t_thrd.xlog_cxt.receivedUpto, RecPtr))) {
if (t_thrd.xlog_cxt.StandbyMode && t_thrd.xlog_cxt.startup_processing && !dummyStandbyMode) {
/*
* In standby mode, wait for the requested record to become
* available, either via restore_command succeeding to restore the
* segment, or via walreceiver having streamed the record.
*/
for (;;) {
/*
* Also check here for the case where the consistency level is
* reached without replaying any record, i.e. minRecoveryPoint was
* already reached just after reading the checkpoint data. And
* whenever we are about to loop waiting for data from the master
* node, it is better to check whether the consistency level has
* been reached. So instead of checking in every place before
* ReadRecord, we keep the check here in one centralised location.
*/
ProcTxnWorkLoad(false);
CheckRecoveryConsistency();
if (WalRcvInProgress()) {
XLogRecPtr expectedRecPtr = RecPtr;
bool havedata = false;
// Check to see if the trigger file exists. If so, update the gaussdb state file.
if (CheckForStandbyTrigger()
#ifndef ENABLE_MULTIPLE_NODES
&& IsDCFReadyOrDisabled()
#endif
) {
SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL);
}
/*
* If we find an invalid record in the WAL streamed from
* master, something is seriously wrong. There's little
* chance that the problem will just go away, but PANIC is
* not good for availability either, especially in hot
* standby mode. Disconnect, and retry from
* archive/pg_xlog again. The WAL in the archive should be
* identical to what was streamed, so it's unlikely that
* it helps, but one can hope...
*/
if (t_thrd.xlog_cxt.failedSources & XLOG_FROM_STREAM) {
ProcTxnWorkLoad(true);
ereport(LOG, (errmsg("read from stream failed, request xlog receivedupto at %X/%X.",
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
ShutdownWalRcv();
continue;
}
if (!g_instance.attr.attr_storage.enable_mix_replication && !IS_DN_MULTI_STANDYS_MODE() &&
!GetArchiveRecoverySlot() && !g_instance.attr.attr_storage.dcf_attr.enable_dcf &&
!IS_SHARED_STORAGE_MODE) {
/* Startup Data Streaming if none */
StartupDataStreaming();
}
/*
* Walreceiver is active, so see if new data has arrived.
*
* We only advance XLogReceiptTime when we obtain fresh
* WAL from walreceiver and observe that we had already
* processed everything before the most recent "chunk"
* that it flushed to disk. In steady state where we are
* keeping up with the incoming data, XLogReceiptTime will
* be updated on each cycle. When we are behind,
* XLogReceiptTime will not advance, so the grace time
* allotted to conflicting queries will decrease.
*/
if (RecPtr % XLogSegSize == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD);
} else if (RecPtr % XLOG_BLCKSZ == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD);
}
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
} else {
XLogRecPtr latestChunkStart;
t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
if (!XLByteLT(RecPtr, latestChunkStart)) {
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
SetCurrentChunkStartTime(t_thrd.xlog_cxt.XLogReceiptTime);
}
} else {
havedata = false;
}
}
if (havedata) {
/*
* Great, streamed far enough. Open the file if it's
* not open already. Use XLOG_FROM_STREAM so that
* source info is set correctly and XLogReceiptTime
* isn't changed.
*/
if (t_thrd.xlog_cxt.readFile < 0) {
t_thrd.xlog_cxt.readFile = XLogFileRead(t_thrd.xlog_cxt.readSegNo, PANIC,
t_thrd.xlog_cxt.recoveryTargetTLI, XLOG_FROM_STREAM,
false);
Assert(t_thrd.xlog_cxt.readFile >= 0);
} else {
/* just make sure source info is correct... */
t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM;
t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM;
}
break;
}
t_thrd.xlog_cxt.RedoDone = IsRedoDonePromoting();
pg_memory_barrier();
/*
* The judging rules are the following:
* (1) For the One-Primary-Multi-Standbys deployment, dummy_status is set
* to true, which means no dummy standby synchronization info needs to be checked;
* (2) For the Primary-Standby-DummyStandby deployment, the dummy standby
* synchronization info is judged depending on enable_mix_replication.
*/
bool dummy_status = RecoveryFromDummyStandby()
? (GetWalRcvDummyStandbySyncPercent() == SYNC_DUMMY_STANDBY_END &&
(g_instance.attr.attr_storage.enable_mix_replication
? true
: GetDataRcvDummyStandbySyncPercent() == SYNC_DUMMY_STANDBY_END))
: true;
/* make sure all xlog from dummystandby is flushed to disk */
if (WalRcvIsDone() && CheckForFailoverTrigger() && dummy_status) {
t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(NULL);
if (XLByteLT(RecPtr, t_thrd.xlog_cxt.receivedUpto)) {
/* wait xlog redo done */
continue;
}
ProcTxnWorkLoad(true);
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
SpinLockAcquire(&walrcv->mutex);
walrcv->dummyStandbyConnectFailed = false;
SpinLockRelease(&walrcv->mutex);
ereport(LOG, (errmsg("RecPtr(%X/%X),receivedUpto(%X/%X)", (uint32)(RecPtr >> 32),
(uint32)RecPtr, (uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
if (t_thrd.xlog_cxt.is_cascade_standby) {
ereport(LOG, (errmsg("Cascade Standby has synchronized all the xlog and replication data, "
"promote to standby")));
} else {
ereport(LOG,
(errmsg(
"Secondary Standby has synchronized all the xlog and replication data, standby "
"promote to primary")));
}
ShutdownWalRcv();
ShutdownDataRcv();
if (t_thrd.xlog_cxt.is_cascade_standby) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr);
continue;
} else {
goto triggered;
}
}
if (IS_SHARED_STORAGE_MODE) {
uint32 disableConnectionNode =
pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
if (disableConnectionNode && WalRcvIsRunning()) {
ereport(LOG, (errmsg("request xlog receivedupto at %X/%X.",
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
ShutdownWalRcv();
}
}
/*
* Data not here yet, so check for trigger then sleep for
* five seconds like in the WAL file polling case below.
*/
if (WalRcvIsDone() && (CheckForSwitchoverTrigger() || CheckForFailoverTrigger())) {
if (t_thrd.xlog_cxt.is_cascade_standby && t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr);
continue;
}
goto retry;
}
if (!processtrxn) {
ProcTxnWorkLoad(true);
processtrxn = true;
goto retry;
}
/*
* Wait for more WAL to arrive, or timeout to be reached
*/
WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L);
processtrxn = false;
ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
} else {
uint32 sources;
/*
* Until walreceiver manages to reconnect, poll the
* archive.
*/
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
/* Reset curFileTLI if random fetch. */
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
/*
* Try to restore the file from archive, or read an
* existing file from pg_xlog.
*/
sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
ereport(DEBUG5, (errmsg("failedSources: %u", t_thrd.xlog_cxt.failedSources)));
if (!(sources & ~t_thrd.xlog_cxt.failedSources)) {
/*
* We've exhausted all options for retrieving the
* file. Retry.
*/
t_thrd.xlog_cxt.failedSources = 0;
/*
* Before we sleep, re-scan for possible new timelines
* if we were requested to recover to the latest
* timeline.
*/
if (t_thrd.xlog_cxt.recoveryTargetIsLatest) {
if (rescanLatestTimeLine()) {
continue;
}
}
if (t_thrd.startup_cxt.shutdown_requested) {
ereport(LOG, (errmsg("startup shutdown")));
proc_exit(0);
}
if (!xlogctl->IsRecoveryDone) {
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp();
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr;
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead IsRecoveryDone is set true,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
parallel_recovery::redo_dump_all_stats();
}
/*
* signal postmaster to update local redo end
* point to gaussdb state file.
*/
ProcTxnWorkLoad(true);
if (!xlogctl->IsRecoveryDone) {
SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE);
}
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->IsRecoveryDone = true;
SpinLockRelease(&xlogctl->info_lck);
static uint64 printFrequency = 0;
if (!(IS_SHARED_STORAGE_MODE) ||
pg_atomic_read_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage)) {
knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM);
if ((printFrequency & 0xFF) == 0) {
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead set redo finish status,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
}
printFrequency++;
/*
* Sleep briefly before the next attempt to
* avoid busy-waiting.
*/
if (IS_SHARED_STORAGE_MODE) {
uint32 connMode =
pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
if (connMode) {
pg_atomic_write_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node,
false);
}
pg_usleep(2000000L);
} else {
#ifdef ENABLE_LITE_MODE
pg_usleep(1000000L);
#else
pg_usleep(50000L);
#endif
}
}
/*
* If primary_conninfo is set, launch walreceiver to
* try to stream the missing WAL, before retrying to
* restore from archive/pg_xlog.
*
* If fetching_ckpt is TRUE, RecPtr points to the
* initial checkpoint location. In that case, we use
* RedoStartLSN as the streaming start position
* instead of RecPtr, so that when we later jump
* backwards to start redo at RedoStartLSN, we will
* have the logs streamed already.
*/
load_server_mode();
/* Get xlog and data from DummyStandby */
if (CheckForFailoverTrigger()) {
if (t_thrd.xlog_cxt.is_cascade_standby) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN
: &targetRecPtr);
continue;
}
if (!RecoveryFromDummyStandby()) {
goto triggered;
}
ereport(LOG, (errmsg("Standby connect to DummyStandby begin ......")));
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
/* request xlog from secondary standby */
ereport(LOG, (errmsg("request xlog stream at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr,
t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_DUMMYSTANDBY,
u_sess->attr.attr_storage.PrimarySlotName);
if (!g_instance.attr.attr_storage.enable_mix_replication &&
!IS_DN_MULTI_STANDYS_MODE() && !g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
StartupDataStreaming();
}
continue;
}
} else if (IS_OBS_DISASTER_RECOVER_MODE && !IsRoachRestore()) {
ProcTxnWorkLoad(false);
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
ereport(LOG, (errmsg("request xlog stream from obs at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr, 0,
REPCONNTARGET_OBS, 0);
continue;
} else if (IS_SHARED_STORAGE_STANBY_MODE && !IS_SHARED_STORAGE_MAIN_STANDBY_MODE) {
ProcTxnWorkLoad(false);
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
#ifndef ENABLE_MULTIPLE_NODES
rename_recovery_conf_for_roach();
#endif
ereport(LOG, (errmsg("request xlog stream from shared storage at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr, 0,
REPCONNTARGET_SHARED_STORAGE, 0);
continue;
} else if (t_thrd.xlog_cxt.PrimaryConnInfo || t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
ProcTxnWorkLoad(false);
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
CheckMaxPageFlushLSN(targetRecPtr);
#ifndef ENABLE_MULTIPLE_NODES
rename_recovery_conf_for_roach();
#endif
ereport(LOG, (errmsg("request xlog stream at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr,
t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_PRIMARY,
u_sess->attr.attr_storage.PrimarySlotName);
if (!g_instance.attr.attr_storage.enable_mix_replication && !IS_DN_MULTI_STANDYS_MODE() &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
StartupDataStreaming();
}
continue;
}
}
/* Don't try to read from a source that just failed */
sources &= ~t_thrd.xlog_cxt.failedSources;
t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, DEBUG2, sources);
if (t_thrd.xlog_cxt.readFile >= 0) {
break;
}
ereport(DEBUG5, (errmsg("do not find any more files.sources=%u failedSources=%u", sources,
t_thrd.xlog_cxt.failedSources)));
/*
* Nope, not found in archive and/or pg_xlog.
*/
t_thrd.xlog_cxt.failedSources |= sources;
/*
* Check to see if the trigger file exists. Note that we
* do this only after failure, so when you create the
* trigger file, we still finish replaying as much as we
* can from archive and pg_xlog before failover.
*/
if (CheckForPrimaryTrigger()) {
goto triggered;
}
if ((CheckForFailoverTrigger() && !RecoveryFromDummyStandby()) || CheckForSwitchoverTrigger()) {
if (t_thrd.xlog_cxt.is_cascade_standby && t_thrd.xlog_cxt.server_mode == STANDBY_MODE) {
HandleCascadeStandbyPromote(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr);
continue;
}
XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
XLogRecPtr EndRecPtrTemp = t_thrd.xlog_cxt.EndRecPtr;
XLByteAdvance(EndRecPtrTemp, SizeOfXLogRecord);
if (XLByteLT(EndRecPtrTemp, receivedUpto) && !FORCE_FINISH_ENABLED &&
t_thrd.xlog_cxt.currentRetryTimes++ < g_retryTimes) {
ereport(WARNING, (errmsg("there are some received xlog have not been redo "
"the tail of last redo lsn:%X/%X, received lsn:%X/%X, retry %d times",
(uint32)(EndRecPtrTemp >> 32), (uint32)EndRecPtrTemp,
(uint32)(receivedUpto >> 32), (uint32)receivedUpto,
t_thrd.xlog_cxt.currentRetryTimes)));
return -1;
}
ereport(LOG,
(errmsg("read record failed when promoting, current lsn (%X/%X), received lsn(%X/%X),"
"sources[%u], failedSources[%u], readSource[%u], readFile[%d], readId[%u],"
"readSeg[%u], readOff[%u], readLen[%u]",
(uint32)(RecPtr >> 32), (uint32)RecPtr,
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto, sources, t_thrd.xlog_cxt.failedSources,
t_thrd.xlog_cxt.readSource, t_thrd.xlog_cxt.readFile,
(uint32)(t_thrd.xlog_cxt.readSegNo >> 32), (uint32)t_thrd.xlog_cxt.readSegNo,
t_thrd.xlog_cxt.readOff, t_thrd.xlog_cxt.readLen)));
goto triggered;
}
}
/*
* This possibly-long loop needs to handle interrupts of
* startup process.
*/
RedoInterruptCallBack();
}
} else {
/* In archive or crash recovery. */
if (t_thrd.xlog_cxt.readFile < 0) {
uint32 sources;
/* Reset curFileTLI if random fetch. */
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
sources = XLOG_FROM_PG_XLOG;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
sources |= XLOG_FROM_ARCHIVE;
}
t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources);
if (dummyStandbyMode) {
ereport(LOG,
(errmsg("readseg:%lu, readFile:%d", t_thrd.xlog_cxt.readSegNo, t_thrd.xlog_cxt.readFile)));
}
if (t_thrd.xlog_cxt.readFile < 0) {
return -1;
}
}
}
}
/*
* At this point, we have the right segment open and if we're streaming we
* know the requested record is in it.
*/
Assert(t_thrd.xlog_cxt.readFile != -1);
/*
* If the current segment is being streamed from master, calculate how
* much of the current page we have received already. We know the
* requested record has been received, but this is for the benefit of
* future calls, to allow quick exit at the top of this function.
*/
if (t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM) {
if ((targetPagePtr / XLOG_BLCKSZ) != (t_thrd.xlog_cxt.receivedUpto / XLOG_BLCKSZ)) {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
} else {
t_thrd.xlog_cxt.readLen = t_thrd.xlog_cxt.receivedUpto % XLogSegSize - targetPageOff;
}
} else {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
}
/* Read the requested page */
t_thrd.xlog_cxt.readOff = targetPageOff;
try_again:
if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) {
ereport(emode_for_corrupt_record(emode, RecPtr),
(errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff)));
if (errno == EINTR) {
errno = 0;
pg_usleep(1000);
goto try_again;
}
goto next_record_is_invalid;
}
pgstat_report_waitevent(WAIT_EVENT_WAL_READ);
ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ);
pgstat_report_waitevent(WAIT_EVENT_END);
if (ret != XLOG_BLCKSZ) {
ereport(emode_for_corrupt_record(emode, RecPtr),
(errcode_for_file_access(),
errmsg("could not read from log file %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff)));
if (errno == EINTR) {
errno = 0;
pg_usleep(1000);
goto try_again;
}
goto next_record_is_invalid;
}
Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo);
Assert(targetPageOff == t_thrd.xlog_cxt.readOff);
Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen);
*readTLI = t_thrd.xlog_cxt.curFileTLI;
return t_thrd.xlog_cxt.readLen;
next_record_is_invalid:
t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
}
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readLen = 0;
t_thrd.xlog_cxt.readSource = 0;
/* In standby-mode, keep trying */
if (t_thrd.xlog_cxt.StandbyMode) {
goto retry;
} else {
return -1;
}
triggered:
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
}
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readLen = 0;
t_thrd.xlog_cxt.readSource = 0;
t_thrd.xlog_cxt.recoveryTriggered = true;
return -1;
}
/*
* Determine what log level should be used to report a corrupt WAL record
* in the current WAL page, previously read by XLogPageRead().
*
* 'emode' is the error mode that would be used to report a file-not-found
* or legitimate end-of-WAL situation. Generally, we use it as-is, but if
* we're retrying the exact same record that we've tried previously, only
* complain the first time to keep the noise down. However, we only do so when
* reading from pg_xlog, because we don't expect any invalid records in archive
* or in records streamed from master. Files in the archive should be complete,
* and we should never hit the end of WAL because we stop and wait for more WAL
* to arrive before replaying it.
*
* NOTE: This function remembers the RecPtr value it was last called with,
* to suppress repeated messages about the same record. Only call this when
* you are about to ereport(), or you might cause a later message to be
* erroneously suppressed.
*/
int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
{
if (t_thrd.xlog_cxt.readSource == XLOG_FROM_PG_XLOG && emode == LOG) {
if (XLByteEQ(RecPtr, t_thrd.xlog_cxt.lastComplaint)) {
emode = DEBUG1;
} else {
t_thrd.xlog_cxt.lastComplaint = RecPtr;
}
}
return emode;
}
/*
* Check to see whether the user-specified trigger file exists and whether a
* failover request has arrived. If either condition holds, request postmaster
* to shut down walreceiver, wait for it to exit, and return true.
*/
bool CheckForFailoverTrigger(void)
{
struct stat stat_buf;
StringInfo slotname = NULL;
bool ret = false;
if (t_thrd.xlog_cxt.server_mode != STANDBY_MODE) {
ResetFailoverTriggered();
t_thrd.xlog_cxt.failover_triggered = false;
ret = false;
goto exit;
}
if (t_thrd.xlog_cxt.failover_triggered) {
ret = true;
goto exit;
}
slotname = get_rcv_slot_name();
Assert(slotname != NULL);
if (IsFailoverTriggered()) {
ereport(LOG, (errmsg("received failover notification")));
ShutdownWalRcv();
ShutdownDataRcv();
ResetFailoverTriggered();
t_thrd.xlog_cxt.failover_triggered = true;
/*
* If the database has a mass of bcm pages, clearing them may be very slow,
* so we do not clear them here:
* getBcmFileList(true);
*/
ResetSlotLSNEndRecovery(slotname);
SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
ret = true;
goto exit;
}
if (t_thrd.xlog_cxt.TriggerFile == NULL) {
ret = false;
goto exit;
}
if (stat(t_thrd.xlog_cxt.TriggerFile, &stat_buf) == 0) {
ereport(LOG, (errmsg("trigger file found: %s", t_thrd.xlog_cxt.TriggerFile)));
ShutdownWalRcv();
ShutdownDataRcv();
unlink(t_thrd.xlog_cxt.TriggerFile);
t_thrd.xlog_cxt.failover_triggered = true;
/*
* If the database has a mass of bcm pages, clearing them may be very slow,
* so we do not clear them here:
* getBcmFileList(true);
*/
ResetSlotLSNEndRecovery(slotname);
ret = true;
}
exit:
if (slotname != NULL) {
if (slotname->data) {
pfree_ext(slotname->data);
}
pfree_ext(slotname);
}
return ret;
}
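/*
 * CheckForForceFinishRedoTrigger: report whether a force-finish-redo request
 * should take effect. Only returns true when hot standby is active, finishing
 * redo is permitted and FORCE_FINISH_ENABLED holds; in that case the term
 * file data is fetched (and cleared) into *term_file.
 */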
bool CheckForForceFinishRedoTrigger(TermFileData *term_file)
{
uint32 permit_finish_redo = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.permitFinishRedo));
uint32 hot_standby = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.hotStdby));
if (hot_standby == ATOMIC_TRUE && permit_finish_redo == ATOMIC_TRUE && FORCE_FINISH_ENABLED) {
*term_file = GetTermFileDataAndClear();
return term_file->finish_redo;
} else {
return false;
}
}
/*
* Check to see whether the user-specified trigger file exists and whether a
* switchover request has arrived. If either condition holds, request postmaster
* to shut down walreceiver, wait for it to exit, and return true.
*/
bool CheckForSwitchoverTrigger(void)
{
if (t_thrd.xlog_cxt.switchover_triggered) {
return true;
}
if (IsSwitchoverTriggered()) {
StringInfo slotname = NULL;
ereport(LOG, (errmsg("received switchover notification")));
slotname = get_rcv_slot_name();
Assert(slotname != NULL);
ShutdownWalRcv();
ShutdownDataRcv();
ResetSwitchoverTriggered();
t_thrd.xlog_cxt.switchover_triggered = true;
/*
* If the database has a mass of bcm pages, clearing them may be very slow,
* so we do not clear them here:
* getBcmFileList(true);
*/
ResetSlotLSNEndRecovery(slotname);
if (slotname->data != NULL) {
pfree_ext(slotname->data);
}
pfree_ext(slotname);
SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
return true;
}
return false;
}
static bool CheckForPrimaryTrigger(void)
{
if (AmStartupProcess()) {
if (IsPrimaryTriggered()) {
ereport(LOG, (errmsg("received primary request")));
ResetPrimaryTriggered();
return true;
}
return false;
} else {
/* check for primary */
uint32 trigger = pg_atomic_read_u32(&g_startupTriggerState);
if (trigger == TRIGGER_PRIMARY) {
(void)pg_atomic_compare_exchange_u32(&g_startupTriggerState, &trigger, TRIGGER_NORMAL);
return true;
}
}
return false;
}
bool CheckForStandbyTrigger(void)
{
if (AmStartupProcess()) {
if (IsStandbyTriggered()) {
ereport(LOG, (errmsg("received standby request")));
ResetStandbyTriggered();
return true;
}
return false;
} else {
/* check for standby */
uint32 trigger = pg_atomic_read_u32(&g_startupTriggerState);
if (trigger == TRIGGER_STADNBY) {
(void)pg_atomic_compare_exchange_u32(&(g_startupTriggerState), &trigger, TRIGGER_NORMAL);
return true;
}
}
return false;
}
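/*
 * CheckFinishRedoSignal: read the force-finish-redo flag written into the
 * "finish_redo_file" signal file, then remove the file. Returns false if the
 * file is absent or cannot be read completely.
 */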
bool CheckFinishRedoSignal(void)
{
bool is_finish_redo = false;
int cnt = 0;
int fp = -1;
if ((fp = open("finish_redo_file", O_RDONLY, S_IRUSR)) >= 0) {
cnt = read(fp, &is_finish_redo, sizeof(bool));
if (close(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close finish redo file.\n")));
}
if (cnt != sizeof(bool)) {
ereport(LOG, (0, errmsg("cannot read finish redo file.\n")));
unlink("finish_redo_file");
return false;
}
ereport(LOG, (errmsg("CheckFinishRedoSignal read is_finish_redo successfully, forced_finishredo: %d\n",
is_finish_redo)));
} else {
return false;
}
unlink("finish_redo_file");
return is_finish_redo;
}
Enum_TriggeredState CheckForSatartupStatus(void)
{
if (t_thrd.startup_cxt.primary_triggered) {
ereport(LOG, (errmsg("received primary request")));
ResetPrimaryTriggered();
return TRIGGER_PRIMARY;
}
if (t_thrd.startup_cxt.standby_triggered) {
ereport(LOG, (errmsg("received standby request")));
ResetStandbyTriggered();
return TRIGGER_STADNBY;
}
if (t_thrd.startup_cxt.failover_triggered) {
ereport(LOG, (errmsg("received failover request")));
ResetFailoverTriggered();
return TRIGGER_FAILOVER;
}
if (t_thrd.startup_cxt.switchover_triggered) {
ereport(LOG, (errmsg("received switchover request")));
ResetSwitchoverTriggered();
return TRIGGER_FAILOVER;
}
return TRIGGER_NORMAL;
}
/*
* Check to see if a promote request has arrived. Should be
* called by postmaster after receiving SIGUSR1.
*/
bool CheckPromoteSignal(void)
{
uint32 Term = g_InvalidTermDN;
int cnt = 0;
int fp = -1;
if ((fp = open("term_file", O_RDONLY, S_IRUSR)) >= 0) {
cnt = read(fp, &Term, sizeof(uint32));
if (close(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close term file when checking promote signal.\n")));
}
if (cnt != sizeof(uint32)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot read term file.\n")));
return false;
}
}
if (Term > g_instance.comm_cxt.localinfo_cxt.term_from_file &&
t_thrd.postmaster_cxt.HaShmData->current_mode == PRIMARY_MODE) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = Term;
}
struct stat stat_buf;
if (stat(FAILOVER_SIGNAL_FILE, &stat_buf) == 0) {
/*
* Since we are in a signal handler, it's not safe to ereport. We
* silently ignore any error from unlink.
*/
unlink(FAILOVER_SIGNAL_FILE);
if (Term > g_instance.comm_cxt.localinfo_cxt.term_from_file) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = Term;
g_instance.comm_cxt.localinfo_cxt.set_term = false;
ereport(LOG, (errmsg("CheckPromoteSignal read term file, term %d\n", Term)));
}
return true;
}
if (CheckPostmasterSignal(PMSIGNAL_PROMOTE_STANDBY)) {
/*
* Since we are in a signal handler, it's not safe to ereport. We
* silently ignore any error from unlink.
*/
if (Term > g_instance.comm_cxt.localinfo_cxt.term_from_file) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = Term;
g_instance.comm_cxt.localinfo_cxt.set_term = false;
ereport(LOG, (errmsg("CheckPromoteSignal read term file, term %d\n", Term)));
}
/* Reset the seqno for matview */
MatviewShmemSetInvalid();
return true;
}
return false;
}
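/*
 * GetTermFileDataAndClear: assemble the current term (the max of the terms
 * read from the term file and from xlog) together with the atomic
 * is_finish_redo flag. Force finish is skipped, and the flag cleared, when
 * local redo has already finished.
 */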
TermFileData GetTermFileDataAndClear(void)
{
TermFileData term_file;
uint32 readRst;
term_file.term = Max(g_instance.comm_cxt.localinfo_cxt.term_from_file,
g_instance.comm_cxt.localinfo_cxt.term_from_xlog);
readRst = pg_atomic_read_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo));
term_file.finish_redo = (readRst == 1);
if (term_file.finish_redo && knl_g_get_local_redo_finish_status()) {
ereport(WARNING, (0, errmsg("GetTermFileData: skip force finish redo when local redo finish")));
term_file.finish_redo = false;
pg_atomic_write_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo), 0);
}
return term_file;
}
/*
* Check whether the signal is primary signal
*/
bool CheckPrimarySignal(void)
{
struct stat stat_buf;
uint32 term = g_InvalidTermDN;
int cnt = 0;
int fp = -1;
if ((fp = open("term_file", O_RDONLY, S_IRUSR)) >= 0) {
cnt = read(fp, &term, sizeof(uint32));
if (close(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close term file when checking primary signal.\n")));
}
if (cnt != sizeof(uint32)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot read term file for notify primary.\n")));
return false;
}
}
if (stat(PRIMARY_SIGNAL_FILE, &stat_buf) == 0) {
/*
* Since we are in a signal handler, it's not safe to ereport. We
* silently ignore any error from unlink.
*/
(void)unlink(PRIMARY_SIGNAL_FILE);
if (term > g_instance.comm_cxt.localinfo_cxt.term_from_file) {
g_instance.comm_cxt.localinfo_cxt.term_from_file = term;
ereport(LOG, (errmsg("CheckPrimarySignal term %u\n", term)));
}
return true;
}
return false;
}
/*
* Check whether the signal is standby signal
*/
bool CheckStandbySignal(void)
{
struct stat stat_buf;
if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0) {
/*
* Since we are in a signal handler, it's not safe to elog. We
* silently ignore any error from unlink.
*/
(void)unlink(STANDBY_SIGNAL_FILE);
return true;
}
return false;
}
/*
* Check whether the signal is cascade standby signal
*/
bool CheckCascadeStandbySignal(void)
{
struct stat stat_buf;
if (stat(CASCADE_STANDBY_SIGNAL_FILE, &stat_buf) == 0) {
/*
* Since we are in a signal handler, it's not safe to elog. We
* silently ignore any error from unlink.
*/
(void)unlink(CASCADE_STANDBY_SIGNAL_FILE);
return true;
}
return false;
}
/*
* Check to see if a switchover request has arrived and
* read demote mode from switchover signal file.
* Should be called by postmaster after receiving SIGUSR1.
*/
int CheckSwitchoverSignal(void)
{
int mode = 0;
int switchfile = -1;
int returnCode;
/*
* Since we are in a signal handler, it's not safe to elog. We
* silently ignore any error from fopen/fread/fclose/unlink.
*/
switchfile = open(SWITCHOVER_SIGNAL_FILE, O_RDONLY, S_IRUSR);
if (switchfile < 0) {
return mode;
}
returnCode = read(switchfile, &mode, sizeof(int));
if (returnCode != sizeof(int)) {
mode = 0;
}
if (mode < SmartDemote || mode > ExtremelyFast) {
mode = 0;
}
if (close(switchfile)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close switchfile.\n")));
}
(void)unlink(SWITCHOVER_SIGNAL_FILE);
return mode;
}
/*
* Check to see if a switchover timeout signal has arrived.
* Should be called by postmaster after receiving SIGUSR1.
*/
bool CheckSwitchoverTimeoutSignal(void)
{
int fp = -1;
if ((fp = open(SWITCHOVER_TIMEOUT_SIGNAL_FILE, O_RDONLY, S_IRUSR)) >= 0) {
if (close(fp)) {
ereport(LOG, (errcode_for_file_access(), errmsg("cannot close switchover timeout file.\n")));
}
unlink(SWITCHOVER_TIMEOUT_SIGNAL_FILE);
return true;
} else {
return false;
}
}
/*
* Wake up startup process to replay newly arrived WAL, or to notice that
* failover has been requested.
*/
void WakeupRecovery(void)
{
SetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
void WakeupDataRecovery(void)
{
SetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->dataRecoveryLatch);
}
/*
* Update the WalWriterSleeping flag.
*/
void SetWalWriterSleeping(bool sleeping)
{
/* use volatile pointer to prevent code rearrangement */
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->WalWriterSleeping = sleeping;
SpinLockRelease(&xlogctl->info_lck);
}
/*
* CloseXlogFilesAtThreadExit
* Close opened xlog files at thread exit time
*/
void CloseXlogFilesAtThreadExit(void)
{
if (t_thrd.xlog_cxt.openLogFile >= 0) {
XLogFileClose();
}
closeXLogRead();
}
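/*
 * StartupDataStreaming: once local recovery is done and no data receiver is
 * running, start one for deployments that replicate data pages separately
 * from WAL. On a standby it connects to the primary, or to the dummy standby
 * during failover when the data there has not been fully synchronized yet.
 */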
void StartupDataStreaming(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
/* Only single node mode with multi-standby (no dummy standby) deployment doesn't need a data receiver */
if (IS_SINGLE_NODE && !IS_DN_DUMMY_STANDYS_MODE())
return;
if (xlogctl->IsRecoveryDone && !DataRcvInProgress()) {
load_server_mode();
if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE && !CheckForSwitchoverTrigger()) {
if (CheckForFailoverTrigger()) {
/* if we got all the data in secondary, no need reconnect again */
if (GetDataRcvDummyStandbySyncPercent() != SYNC_DUMMY_STANDBY_END) {
RequestDataStreaming(NULL, REPCONNTARGET_DUMMYSTANDBY);
}
} else {
RequestDataStreaming(NULL, REPCONNTARGET_PRIMARY);
}
}
}
}
/*
* load the current ongoing mode from the HaShmData
*/
void load_server_mode(void)
{
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
SpinLockAcquire(&hashmdata->mutex);
t_thrd.xlog_cxt.server_mode = hashmdata->current_mode;
t_thrd.xlog_cxt.is_cascade_standby = hashmdata->is_cascade_standby;
t_thrd.xlog_cxt.is_hadr_main_standby = hashmdata->is_hadr_main_standby;
SpinLockRelease(&hashmdata->mutex);
}
/*
* Get the current ongoing mode from the HaShmData
*/
bool IsServerModeStandby()
{
load_server_mode();
return (t_thrd.xlog_cxt.server_mode == STANDBY_MODE);
}
/*
* Calculate difference between two lsns.
*/
uint64 XLogDiff(XLogRecPtr end, XLogRecPtr start)
{
uint64 len = 0;
if (XLByteLT(end, start)) {
return 0;
}
len = end - start;
return len;
}
/*
* get and set walsender EndRecPtr for dummy standby.
*/
static void SetDummyStandbyEndRecPtr(XLogReaderState *xlogreader)
{
XLogRecord *record = NULL;
do {
record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
} while (record != NULL);
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
SpinLockAcquire(&walrcv->mutex);
if (XLByteLT(walrcv->receivedUpto, t_thrd.xlog_cxt.EndRecPtr))
walrcv->receivedUpto = t_thrd.xlog_cxt.EndRecPtr;
SpinLockRelease(&walrcv->mutex);
}
/*
* get the crc of the xlog record
*/
pg_crc32 GetXlogRecordCrc(XLogRecPtr RecPtr, bool &crcvalid, XLogPageReadCB pagereadfunc, Size bufAlignSize)
{
XLogReaderState *xlogreader = NULL;
XLogPageReadPrivate readprivate;
pg_crc32 reccrc = 0;
TimeLineID tli = 0;
XLogRecord *rec = NULL;
char *errorMsg = NULL;
errno_t rc = EOK;
if (!t_thrd.xlog_cxt.expectedTLIs && t_thrd.shemem_ptr_cxt.XLogCtl && t_thrd.shemem_ptr_cxt.ControlFile) {
tli = t_thrd.shemem_ptr_cxt.XLogCtl->ThisTimeLineID;
t_thrd.xlog_cxt.expectedTLIs = list_make1_int((int)tli);
tli = t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.ThisTimeLineID;
/* Build list with newest item first */
if (!list_member_int(t_thrd.xlog_cxt.expectedTLIs, (int)tli)) {
t_thrd.xlog_cxt.expectedTLIs = lcons_int((int)tli, t_thrd.xlog_cxt.expectedTLIs);
}
}
if (!XRecOffIsValid(RecPtr)) {
ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND),
errmsg("invalid record offset at %X/%X.", (uint32)(RecPtr >> 32), (uint32)RecPtr)));
}
/* Set up XLOG reader facility */
rc = memset_s(&readprivate, sizeof(XLogPageReadPrivate), 0, sizeof(XLogPageReadPrivate));
securec_check(rc, "\0", "\0");
/* we need to read xlog from DSS when DMS and DSS are enabled */
if (ENABLE_DMS && ENABLE_DSS) {
xlogreader = SSXLogReaderAllocate(pagereadfunc, &readprivate, ALIGNOF_BUFFER);
} else {
xlogreader = XLogReaderAllocate(pagereadfunc, &readprivate, bufAlignSize);
}
if (xlogreader == NULL) {
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"),
errdetail("Failed while allocating an XLog reading processor")));
}
xlogreader->system_identifier = GetSystemIdentifier();
if (dummyStandbyMode) {
rec = ReadRecord(xlogreader, RecPtr, LOG, false);
} else {
rec = XLogReadRecord(xlogreader, RecPtr, &errorMsg, true);
}
crcvalid = true;
if (rec != NULL) {
reccrc = rec->xl_crc;
if (dummyStandbyMode) {
SetDummyStandbyEndRecPtr(xlogreader);
}
} else {
crcvalid = false;
}
/* Shut down readFile facility, free space. */
ShutdownReadFileFacility();
/* Shut down the xlog reader facility. */
XLogReaderFree(xlogreader);
xlogreader = NULL;
return reccrc;
}
static void ShutdownReadFileFacility(void)
{
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
if (t_thrd.xlog_cxt.expectedTLIs) {
list_free(t_thrd.xlog_cxt.expectedTLIs);
t_thrd.xlog_cxt.expectedTLIs = NULL;
}
}
bool CheckFpwBeforeFirstCkpt(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
return xlogctl->FpwBeforeFirstCkpt;
}
void DisableFpwBeforeFirstCkpt(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->FpwBeforeFirstCkpt = false;
SpinLockRelease(&xlogctl->info_lck);
}
/*
* calculate the LsnXlogFlushData shared memory size
*/
Size LsnXlogFlushChkShmemSize(void)
{
Size size = sizeof(LsnXlogFlushData);
size = MAXALIGN(size);
return size;
}
uint32 get_controlfile_timeline(void)
{
return t_thrd.shemem_ptr_cxt.ControlFile->timeline;
}
/*
* initialize the LsnXlogFlushData shared memory
*/
void LsnXlogFlushChkShmInit(void)
{
bool found = false;
t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile = (LsnXlogFlushData *)ShmemInitStruct("LsnXlogFlushChk File",
sizeof(LsnXlogFlushData), &found);
if (t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile == NULL) {
ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("not enough shared memory for pg_lsnxlogflushchk share memory")));
}
if (!found) {
t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->localLsnFlushPoint = 0;
t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->peerXlogFlushPoint = 0;
/* calculate the crc value */
INIT_CRC32(t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->crc);
COMP_CRC32(t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->crc,
(char *)t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile, offsetof(LsnXlogFlushData, crc));
FIN_CRC32(t_thrd.shemem_ptr_cxt.g_LsnXlogFlushChkFile->crc);
}
return;
}
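/*
 * heap_xlog_bcm_new_page: replay a logical newpage record for column store by
 * writing the attached CU data back into the column file at the recorded
 * offset through the CUStorage interface.
 */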
void heap_xlog_bcm_new_page(xl_heap_logical_newpage *xlrec, RelFileNode node, char *cuData)
{
CFileNode cFileNode(node);
cFileNode.m_forkNum = xlrec->forknum;
cFileNode.m_attid = xlrec->attid;
CUStorage *cuStorage = New(CurrentMemoryContext) CUStorage(cFileNode);
cuStorage->SaveCU(cuData, xlrec->offset, xlrec->blockSize, false);
DELETE_EX(cuStorage);
}
static bool heap_xlog_logical_new_page_with_data(XLogReaderState *record)
{
bool result = false;
xl_heap_logical_newpage *xlrec = (xl_heap_logical_newpage *)XLogRecGetData(record);
if (xlrec->type == COLUMN_STORE && xlrec->hasdata) {
char *cuData = XLogRecGetData(record) + SizeOfHeapLogicalNewPage;
RelFileNode tmp_node;
RelFileNodeCopy(tmp_node, xlrec->node, (int16)XLogRecGetBucketId(record));
Assert(XLogRecGetBucketId(record) == InvalidBktId);
heap_xlog_bcm_new_page(xlrec, tmp_node, cuData);
result = true;
}
return result;
}
/* reserve function */
void heap_xlog_logical_new_page(XLogReaderState *record)
{
/*
* In one-primary-multi-standbys mode with enable_mix_replication off, we need to record CU xlog.
* NO ereport(ERROR) from here till newpage op is logged.
*/
if (IS_DN_MULTI_STANDYS_MODE()) {
heap_xlog_logical_new_page_with_data(record);
return;
}
/* data streaming is not supported in DCF mode */
if (!g_instance.attr.attr_storage.enable_mix_replication && !IS_DN_MULTI_STANDYS_MODE() &&
!g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
/* timeseries tables use xlog to sync data */
if (g_instance.attr.attr_common.enable_tsdb) {
bool result = heap_xlog_logical_new_page_with_data(record);
if (!result) {
StartupDataStreaming();
}
} else {
StartupDataStreaming();
}
}
return;
}
/*
* Returns true if 'expectedTLEs' contains a timeline with id 'tli'
*/
bool timeLineInHistory(TimeLineID tli, List *expectedTLEs)
{
ListCell *cell = NULL;
foreach (cell, expectedTLEs) {
if ((TimeLineID)lfirst_int(cell) == tli) {
return true;
}
}
return false;
}
void SetCBMTrackedLSN(XLogRecPtr trackedLSN)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->cbmTrackedLSN = trackedLSN;
SpinLockRelease(&xlogctl->info_lck);
ereport(LOG, (errmsg("set CBM tracked LSN point: %08X/%08X ", (uint32)(trackedLSN >> 32), (uint32)trackedLSN)));
}
XLogRecPtr GetCBMTrackedLSN(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr trackedLSN;
SpinLockAcquire(&xlogctl->info_lck);
trackedLSN = xlogctl->cbmTrackedLSN;
SpinLockRelease(&xlogctl->info_lck);
ereport(DEBUG5, (errmsg("get CBM tracked LSN point: %08X/%08X ", (uint32)(trackedLSN >> 32), (uint32)trackedLSN)));
return trackedLSN;
}
void SetCBMRotateLsn(XLogRecPtr rotate_lsn)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr curr_rotate_lsn;
curr_rotate_lsn = GetCBMRotateLsn();
while (XLByteLT(curr_rotate_lsn, rotate_lsn) &&
pg_atomic_compare_exchange_u64(&xlogctl->cbmMaxRequestRotateLsn, &curr_rotate_lsn, rotate_lsn) == false)
;
ereport(LOG, (errmsg("set CBM SetCBMRotateLsn point: %08X/%08X ", (uint32)(rotate_lsn >> 32), (uint32)rotate_lsn)));
}
XLogRecPtr GetCBMRotateLsn(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
volatile XLogRecPtr rotate_lsn;
rotate_lsn = pg_atomic_read_u64(&xlogctl->cbmMaxRequestRotateLsn);
return rotate_lsn;
}
void SetCBMFileStartLsn(XLogRecPtr currCbmNameStartLsn)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->currCbmFileStartLsn = currCbmNameStartLsn;
SpinLockRelease(&xlogctl->info_lck);
ereport(DEBUG1, (errmsg("set CBM SetCBMFileStartLsn %08X/%08X ", (uint32)(currCbmNameStartLsn >> 32),
(uint32)currCbmNameStartLsn)));
}
XLogRecPtr GetCBMFileStartLsn()
{
XLogRecPtr currCbmNameStartLsn;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
currCbmNameStartLsn = xlogctl->currCbmFileStartLsn;
SpinLockRelease(&xlogctl->info_lck);
return currCbmNameStartLsn;
}
void SetDelayXlogRecycle(bool toDelay, bool isRedo)
{
XLogRecPtr recptr;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->delayXlogRecycle = toDelay;
SpinLockRelease(&xlogctl->info_lck);
if (isRedo == false && t_thrd.proc->workingVersionNum >= 92233) {
START_CRIT_SECTION();
XLogBeginInsert();
XLogRegisterData((char *)(&toDelay), sizeof(toDelay));
recptr = XLogInsert(RM_XLOG_ID, XLOG_DELAY_XLOG_RECYCLE);
XLogWaitFlush(recptr);
END_CRIT_SECTION();
}
ereport(DEBUG5, (errmsg("set delay xlog recycle to %s", toDelay ? "true" : "false")));
}
bool GetDelayXlogRecycle(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
bool toDelay = false;
SpinLockAcquire(&xlogctl->info_lck);
toDelay = xlogctl->delayXlogRecycle;
SpinLockRelease(&xlogctl->info_lck);
ereport(DEBUG5, (errmsg("get delay xlog recycle of %s", toDelay ? "true" : "false")));
return toDelay;
}
void SetDDLDelayStartPtr(XLogRecPtr ddlDelayStartPtr)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->ddlDelayStartPtr = ddlDelayStartPtr;
SpinLockRelease(&xlogctl->info_lck);
ereport(LOG, (errmsg("set DDL delay start LSN point: %08X/%08X ", (uint32)(ddlDelayStartPtr >> 32),
(uint32)ddlDelayStartPtr)));
}
XLogRecPtr GetDDLDelayStartPtr(void)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogRecPtr start_lsn;
XLogRecPtr old_start_lsn;
start_lsn = ReplicationSlotsComputeConfirmedLSN();
if (t_thrd.proc != NULL && t_thrd.proc->workingVersionNum < BACKUP_SLOT_VERSION_NUM) {
SpinLockAcquire(&xlogctl->info_lck);
old_start_lsn = xlogctl->ddlDelayStartPtr;
SpinLockRelease(&xlogctl->info_lck);
if (XLByteLT(old_start_lsn, start_lsn)) {
start_lsn = old_start_lsn;
}
}
ereport(DEBUG5, (errmsg("get DDL delay start LSN: %08X/%08X ", (uint32)(start_lsn >> 32), (uint32)start_lsn)));
return start_lsn;
}
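/*
 * IsRoachRestore: true while recovery is replaying a roach backup restore,
 * either a barrier-target restore whose barrier id carries the roach backup
 * prefix, or a single-node roach restore.
 */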
bool IsRoachRestore(void)
{
return (t_thrd.xlog_cxt.InRecovery && ((t_thrd.xlog_cxt.recoveryTarget == RECOVERY_TARGET_BARRIER &&
t_thrd.xlog_cxt.recoveryTargetBarrierId != NULL &&
strncmp(t_thrd.xlog_cxt.recoveryTargetBarrierId, ROACH_BACKUP_PREFIX, strlen(ROACH_BACKUP_PREFIX)) == 0) ||
(g_instance.role == VSINGLENODE && g_instance.roach_cxt.isRoachRestore)));
}
const uint UPDATE_REC_XLOG_NUM = 4;
#if defined(__x86_64__) || defined(__aarch64__)
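/*
 * Lock-free update of the dirty page queue recLSN: the reclsn and queue tail
 * are stored in one 128-bit word, so we CAS the pair and retry on contention.
 * The recLSN is only advanced when an immediate update is requested or the
 * new insert LSN is ahead by more than UPDATE_REC_XLOG_NUM * XLogBaseSize.
 */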
bool atomic_update_dirty_page_queue_rec_lsn(XLogRecPtr current_insert_lsn, bool need_immediately_update)
{
XLogRecPtr cur_rec_lsn = InvalidXLogRecPtr;
uint128_u compare;
uint128_u exchange;
uint128_u current;
compare = atomic_compare_and_swap_u128((uint128_u *)&g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn);
Assert(sizeof(g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn) == SIZE_OF_UINT64);
Assert(sizeof(g_instance.ckpt_cxt_ctl->dirty_page_queue_tail) == SIZE_OF_UINT64);
loop:
cur_rec_lsn = compare.u64[0];
/* if the dirty page queue reclsn is already at or ahead of our lsn, do nothing */
if (!XLByteLE(current_insert_lsn, cur_rec_lsn) &&
(need_immediately_update || current_insert_lsn - cur_rec_lsn > XLogBaseSize * UPDATE_REC_XLOG_NUM)) {
exchange.u64[0] = current_insert_lsn;
exchange.u64[1] = compare.u64[1];
current = atomic_compare_and_swap_u128((uint128_u *)&g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn, compare,
exchange);
if (!UINT128_IS_EQUAL(compare, current)) {
UINT128_COPY(compare, current);
goto loop;
}
return true;
}
return false;
}
#endif
/**
* @Description: Push dirty buffer to the dirty page queue; the recLSN must increase
* monotonically. When not in recovery mode, XLog insert or a full checkpoint
* updates the dirty page queue recLSN. When in recovery mode, redoing a
* checkpoint XLog record updates the dirty page queue recLSN to the checkpoint's redo lsn.
*/
void update_dirty_page_queue_rec_lsn(XLogRecPtr current_insert_lsn, bool need_immediately_update)
{
bool is_update = false;
uint32 freespace;
if (unlikely(!ENABLE_INCRE_CKPT)) {
return;
}
/* Compute new incremental checkpoint redo record ptr = location of next XLOG record. */
freespace = INSERT_FREESPACE(current_insert_lsn);
if (freespace == 0) {
if (current_insert_lsn % XLogSegSize == 0) {
current_insert_lsn += SizeOfXLogLongPHD;
} else {
current_insert_lsn += SizeOfXLogShortPHD;
}
}
#if defined(__x86_64__) || defined(__aarch64__)
is_update = atomic_update_dirty_page_queue_rec_lsn(current_insert_lsn, need_immediately_update);
#else
SpinLockAcquire(&g_instance.ckpt_cxt_ctl->queue_lock);
if (!XLByteLE(current_insert_lsn, g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn) &&
(need_immediately_update ||
current_insert_lsn - g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn > XLogBaseSize * UPDATE_REC_XLOG_NUM)) {
g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn = current_insert_lsn;
is_update = true;
}
SpinLockRelease(&g_instance.ckpt_cxt_ctl->queue_lock);
#endif
return;
}
uint64 get_dirty_page_queue_rec_lsn()
{
uint64 dirty_page_queue_rec_lsn = 0;
#if defined(__x86_64__) || defined(__aarch64__)
dirty_page_queue_rec_lsn = pg_atomic_barrier_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn);
#else
SpinLockAcquire(&g_instance.ckpt_cxt_ctl->queue_lock);
dirty_page_queue_rec_lsn = g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn;
SpinLockRelease(&g_instance.ckpt_cxt_ctl->queue_lock);
#endif
return dirty_page_queue_rec_lsn;
}
/**
* @Description: get the recLSN of the first page in the dirty page queue.
* @return: dirty page queue first valid recLSN.
*/
XLogRecPtr ckpt_get_min_rec_lsn(void)
{
uint64 queue_loc;
XLogRecPtr min_rec_lsn = InvalidXLogRecPtr;
/*
* If the head recLSN is invalid, advance the head and get the next buffer's
* recLSN; if the head reaches the tail, return InvalidXLogRecPtr.
*/
if (get_dirty_page_num() == 0) {
return InvalidXLogRecPtr;
}
queue_loc = pg_atomic_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_head);
while (XLogRecPtrIsInvalid(min_rec_lsn) && (queue_loc < get_dirty_page_queue_tail())) {
Buffer buffer;
BufferDesc *buf_desc = NULL;
XLogRecPtr page_rec_lsn = InvalidXLogRecPtr;
uint64 temp_loc = queue_loc % g_instance.ckpt_cxt_ctl->dirty_page_queue_size;
volatile DirtyPageQueueSlot *slot = &g_instance.ckpt_cxt_ctl->dirty_page_queue[temp_loc];
/* slot location is pre-occupied, but the buffer is not set yet; wait and retry. */
if (!(pg_atomic_read_u32(&slot->slot_state) & SLOT_VALID)) {
pg_usleep(1);
queue_loc = pg_atomic_read_u64(&g_instance.ckpt_cxt_ctl->dirty_page_queue_head);
continue;
}
pg_memory_barrier();
queue_loc++;
buffer = slot->buffer;
/* slot state is valid but buffer is invalid; the slot buffer is set to 0 in BufferAlloc or InvalidateBuffer */
if (BufferIsInvalid(buffer)) {
continue;
}
buf_desc = GetBufferDescriptor(buffer - 1);
page_rec_lsn = pg_atomic_read_u64(&buf_desc->extra->rec_lsn);
if (!BufferIsInvalid(slot->buffer)) {
min_rec_lsn = page_rec_lsn;
}
}
return min_rec_lsn;
}
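/*
 * WaitCheckpointSync: when synchronous_commit is stronger than local flush,
 * wait until the latest checkpoint location has been acknowledged by the
 * synchronous standby (or by the DCF paxos group when DCF is enabled).
 */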
void WaitCheckpointSync(void)
{
if (t_thrd.shemem_ptr_cxt.ControlFile != NULL &&
u_sess->attr.attr_storage.guc_synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH) {
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
XLogRecPtr checkpoint = t_thrd.shemem_ptr_cxt.ControlFile->checkPoint;
LWLockRelease(ControlFileLock);
if (!IsInitdb && g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
SyncPaxosWaitForLSN(checkpoint);
} else {
SyncRepWaitForLSN(checkpoint);
}
}
}
void GetRecoveryLatch()
{
OwnLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
void ReLeaseRecoveryLatch()
{
DisownLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
}
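/*
 * mpfl_get_max_lsn_from_buf: scan the MPFL_PAGE_NUM pages of the
 * max_page_flush_lsn file image, publish each per-page LSN into the shared
 * max_page_flush_lsn array, and return the largest one.
 */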
XLogRecPtr mpfl_get_max_lsn_from_buf(char *buf)
{
uint32 i = 0;
XLogRecPtr *lsn = NULL;
XLogRecPtr tmp_lsn = 0;
Assert(buf != NULL);
for (i = 0; i < MPFL_PAGE_NUM; ++i) {
lsn = (XLogRecPtr *)(buf + i * BLCKSZ);
pg_atomic_write_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[i]), *lsn);
if (XLByteLT(tmp_lsn, *lsn)) {
tmp_lsn = *lsn;
}
}
return tmp_lsn;
}
XLogRecPtr mpfl_read_max_flush_lsn()
{
int fd = -1;
char *file_head = g_instance.comm_cxt.predo_cxt.ali_buf;
XLogRecPtr max_flush_lsn;
if (!file_exists(MAX_PAGE_FLUSH_LSN_FILE)) {
ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO), errmsg("mpfl file does not exist")));
return MAX_XLOG_REC_PTR;
}
/* O_DSYNC for less IO */
fd = open(MAX_PAGE_FLUSH_LSN_FILE, MPFL_FILE_FLAG, MPFL_FILE_PERM);
if (fd == -1) {
ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("mpfl could not open file \"%s\": %m", MAX_PAGE_FLUSH_LSN_FILE)));
return MAX_XLOG_REC_PTR;
}
/* alignment for O_DIRECT */
Assert(file_head != NULL);
pgstat_report_waitevent(WAIT_EVENT_MPFL_READ);
if (!mpfl_pread_file(fd, file_head, MPFL_FILE_SIZE, 0)) {
max_flush_lsn = MAX_XLOG_REC_PTR;
} else {
max_flush_lsn = mpfl_get_max_lsn_from_buf(file_head);
}
pgstat_report_waitevent(WAIT_EVENT_END);
close(fd);
if (max_flush_lsn == 0) {
max_flush_lsn = MAX_XLOG_REC_PTR;
}
return max_flush_lsn;
}
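/* Remove the max-page-flush-lsn file; a missing file (ENOENT) is not treated as an error. */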
void mpfl_ulink_file()
{
int ret = 0;
ret = unlink(MAX_PAGE_FLUSH_LSN_FILE);
if (ret < 0 && errno != ENOENT) {
ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[mpfl_ulink_file]: unlink %s failed! ret:%u", MAX_PAGE_FLUSH_LSN_FILE, ret)));
} else {
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("[mpfl_ulink_file]: unlink %s sucessfully! ret:%u", MAX_PAGE_FLUSH_LSN_FILE, ret)));
}
}
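/**
 * @Description: read exactly size bytes at offset, looping over short reads.
 * @return: false on a read error or if fewer bytes than requested were read.
 */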
bool mpfl_pread_file(int fd, void *buf, int32 size, int64 offset)
{
int32 curr_size, total_size;
total_size = 0;
do {
curr_size = pread64(fd, ((char *)buf + total_size), (size - total_size), offset);
if (curr_size == -1) {
ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("mpfl_pread_file:Read file error, size:%d, offset:%ld, total_size:%d", size,
offset, total_size)));
return false;
}
total_size += curr_size;
offset += curr_size;
} while (curr_size > 0);
if (total_size != size) {
ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("mpfl_pread_file: file size mismatch: expected %d, read %d, offset:%ld", size,
total_size, offset)));
return false;
}
return true;
}
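/**
 * @Description: write size bytes at offset, retrying a bounded number of times on
 * zero-byte writes; PANIC on a write error, a short write, or an out-of-range offset.
 */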
void mpfl_pwrite_file(int fd, const void *buf, int size, int64 offset)
{
int write_size = 0;
uint32 try_times = 0;
const uint64 mpfl_try_times = 8;
const uint64 mpfl_sleep_us = 256L;
if ((offset + size) > MPFL_FILE_SIZE) {
ereport(PANIC, (errmodule(MOD_REDO), errmsg("mpfl_pwrite_file failed, MPFL_FILE_SIZE %d, "
"offset %ld, write_size %d",
MPFL_FILE_SIZE, offset, size)));
}
while (try_times < mpfl_try_times) {
write_size = pwrite64(fd, buf, size, offset);
if (write_size == 0) {
try_times++;
pg_usleep(mpfl_sleep_us);
continue;
} else if (write_size < 0) {
ereport(PANIC,
(errcode_for_file_access(), errmodule(MOD_REDO), errmsg("mpfl_pwrite_file: Write file error")));
} else {
break;
}
}
if (write_size != size) {
ereport(PANIC,
(errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("mpfl_pwrite_file: Write file size mismatch: expected %d, written %d", size, write_size)));
}
}
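/**
 * @Description: recreate the max-page-flush-lsn file from scratch and zero-fill all of its pages.
 */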
void mpfl_new_file()
{
errno_t errorno = EOK;
char *file_head = g_instance.comm_cxt.predo_cxt.ali_buf;
int fd = -1;
if (file_exists(MAX_PAGE_FLUSH_LSN_FILE)) {
ereport(WARNING, (errcode_for_file_access(), errmodule(MOD_REDO), "max_flush_lsn file exists, deleting it"));
if (unlink(MAX_PAGE_FLUSH_LSN_FILE) != 0) {
ereport(PANIC,
(errcode_for_file_access(), errmodule(MOD_REDO), errmsg("Could not remove max_flush_lsn file")));
}
}
/* Open the file with O_SYNC so that both data and file system control info are on disk after each block write. */
fd = open(MAX_PAGE_FLUSH_LSN_FILE, (MPFL_FILE_FLAG | O_CREAT), MPFL_FILE_PERM);
if (fd == -1) {
ereport(PANIC, (errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("Could not create file \"%s\"", MAX_PAGE_FLUSH_LSN_FILE)));
}
Assert(file_head != NULL);
errorno = memset_s(file_head, MPFL_FILE_SIZE, 0, MPFL_FILE_SIZE);
securec_check(errorno, "", "");
pgstat_report_waitevent(WAIT_EVENT_MPFL_INIT);
mpfl_pwrite_file(fd, file_head, MPFL_FILE_SIZE, 0);
pgstat_report_waitevent(WAIT_EVENT_END);
close(fd);
}
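/**
 * @Description: persist the biggest page-flush LSN seen by a thread. The thread id is
 * hashed onto one of NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS partitions; each partition owns
 * one BLCKSZ page of the file and is serialized by its own LWLock. With is_force set,
 * the in-memory value is overwritten unconditionally.
 */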
void update_max_page_flush_lsn(XLogRecPtr biggest_lsn, ThreadId thdId, bool is_force)
{
uint32 hash_value = hashquickany(0xFFFFFFFF, (unsigned char *)(&thdId), sizeof(ThreadId));
uint32 lock_id = hash_value % NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS;
LWLock *p_lock = NULL;
int fd = -1;
char *page = NULL;
int64 offset;
XLogRecPtr *new_lsn;
XlogUpMinRecovPointSwitchChk(biggest_lsn);
if (XLByteLE(biggest_lsn, pg_atomic_read_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[lock_id]))) &&
!is_force) {
return;
}
/*
 * We must keep g_instance.comm_cxt.predo_cxt.max_page_flush_lsn consistent
 * with info.max_page_flush_lsn.
 */
p_lock = &(t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstMPFLLock + (lock_id)].lock);
LWLockAcquire(p_lock, LW_EXCLUSIVE);
if (is_force) {
pg_atomic_write_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[lock_id]), biggest_lsn);
} else {
if (XLByteLT(pg_atomic_read_u64(&g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[lock_id]), biggest_lsn)) {
pg_atomic_write_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[lock_id]), biggest_lsn);
} else {
LWLockRelease(p_lock);
return;
}
}
/* Open the file with O_SYNC so that both data and file system control info are on disk after each block write. */
fd = open(MAX_PAGE_FLUSH_LSN_FILE, (MPFL_FILE_FLAG), MPFL_FILE_PERM);
if (fd == -1) {
LWLockRelease(p_lock);
ereport(PANIC, (errcode_for_file_access(), errmodule(MOD_REDO),
errmsg("Could not open file \"%s\"", MAX_PAGE_FLUSH_LSN_FILE)));
}
offset = (int64)lock_id * (int64)BLCKSZ;
page = g_instance.comm_cxt.predo_cxt.ali_buf + offset;
new_lsn = (XLogRecPtr *)page;
*new_lsn = biggest_lsn;
pgstat_report_waitevent(WAIT_EVENT_MPFL_WRITE);
mpfl_pwrite_file(fd, page, BLCKSZ, offset);
pgstat_report_waitevent(WAIT_EVENT_END);
close(fd);
LWLockRelease(p_lock);
}
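/**
 * @Description: set every max_page_flush_lsn partition to lsn, taking each partition's
 * LWLock so concurrent updaters observe a consistent value.
 */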
void set_global_max_page_flush_lsn(XLogRecPtr lsn)
{
LWLock *p_lock = NULL;
for (uint32 i = 0; i < NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS; ++i) {
p_lock = &(t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstMPFLLock + (i)].lock);
LWLockAcquire(p_lock, LW_EXCLUSIVE);
pg_atomic_write_u64(&(g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[i]), lsn);
LWLockRelease(p_lock);
}
}
XLogRecPtr get_global_max_page_flush_lsn()
{
XLogRecPtr maxLsn = 0;
for (uint32 i = 0; i < NUM_MAX_PAGE_FLUSH_LSN_PARTITIONS; ++i) {
XLogRecPtr tmpLsn = pg_atomic_read_u64(&g_instance.comm_cxt.predo_cxt.max_page_flush_lsn[i]);
if (XLByteLT(maxLsn, tmpLsn)) {
maxLsn = tmpLsn;
}
}
return maxLsn;
}
bool IsRecoveryDone()
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
return xlogctl->IsRecoveryDone;
}
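/**
 * @Description: once recovery is done, adopt the control file's minRecoveryPoint
 * if the thread-local copy is still invalid.
 */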
void ExtremRtoUpdateMinCheckpoint()
{
if (t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone && t_thrd.xlog_cxt.minRecoveryPoint == InvalidXLogRecPtr) {
t_thrd.xlog_cxt.minRecoveryPoint = t_thrd.shemem_ptr_cxt.ControlFile->minRecoveryPoint;
}
}
#ifndef ENABLE_MULTIPLE_NODES
void DcfUpdateConsensusLsnAndIndex(XLogRecPtr ConsensusLsn, unsigned long long dcfIndex)
{
/* use volatile pointer to prevent code rearrangement */
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->LogwrtPaxos.Consensus = ConsensusLsn;
xlogctl->LogwrtPaxos.PaxosIndex = dcfIndex;
SpinLockRelease(&xlogctl->info_lck);
}
#endif
void SetRemainSegsStartPoint(XLogRecPtr remainSegsStartPoint)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->remain_segs_start_point = remainSegsStartPoint;
SpinLockRelease(&xlogctl->info_lck);
}
XLogRecPtr GetRemainSegsStartPoint(void)
{
XLogRecPtr remainSegsStartPoint;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
remainSegsStartPoint = xlogctl->remain_segs_start_point;
SpinLockRelease(&xlogctl->info_lck);
return remainSegsStartPoint;
}
void UpdateRemainSegsStartPoint(XLogRecPtr newStartPoint)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
if (XLByteLT(xlogctl->remain_segs_start_point, newStartPoint)) {
xlogctl->remain_segs_start_point = newStartPoint;
}
SpinLockRelease(&xlogctl->info_lck);
}
bool IsNeedLogRemainSegs(XLogRecPtr curLsn)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
if (xlogctl->is_need_log_remain_segs) {
return true;
}
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->is_need_log_remain_segs = (XLByteLT(xlogctl->remain_segs_start_point, curLsn));
SpinLockRelease(&xlogctl->info_lck);
return xlogctl->is_need_log_remain_segs;
}
void SetRemainCommitLsn(XLogRecPtr remainCommitLsn)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->remainCommitLsn = remainCommitLsn;
SpinLockRelease(&xlogctl->info_lck);
}
XLogRecPtr GetRemainCommitLsn(void)
{
XLogRecPtr remainCommitLsn;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
remainCommitLsn = xlogctl->remainCommitLsn;
SpinLockRelease(&xlogctl->info_lck);
return remainCommitLsn;
}
void UpdateRemainCommitLsn(XLogRecPtr newRemainCommitLsn)
{
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&xlogctl->info_lck);
if (XLByteLT(xlogctl->remainCommitLsn, newRemainCommitLsn)) {
xlogctl->remainCommitLsn = newRemainCommitLsn;
}
SpinLockRelease(&xlogctl->info_lck);
}
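/* Returns floor(log2(num)). Callers must pass num >= 1; num == 0 would recurse forever. */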
uint32 GetDigitalByLog2(uint32 num)
{
if (num == 1) {
return 0;
} else {
return 1 + GetDigitalByLog2(num >> 1);
}
}
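/**
 * @Description: size the WAL insert status entry table as a power of two large enough
 * for max_connections or the configured thread pool size, whichever is larger, plus a
 * fixed expansion factor, bounded below by MIN_WAL_INSERT_STATUS_ENTRY_POW.
 */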
void InitXlogStatuEntryTblSize()
{
uint32 conCfgNum = 0;
uint32 threadCfgNum = 0;
int maxConn = g_instance.attr.attr_network.MaxConnections;
conCfgNum = (maxConn <= 0) ? 0 : GetDigitalByLog2(maxConn) + 1;
if (g_instance.attr.attr_common.enable_thread_pool) {
char *attr = TrimStr(g_instance.attr.attr_common.thread_pool_attr);
char *ptoken = NULL;
char *psave = NULL;
const char *pdelimiter = ",";
if ((attr) == NULL || (attr)[0] == '\0') {
threadCfgNum = 0;
} else {
ptoken = TrimStr(strtok_r(attr, pdelimiter, &psave));
if ((ptoken) == NULL || (ptoken)[0] == '\0') {
threadCfgNum = 0;
} else {
int32 threadNum = pg_strtoint32(ptoken);
threadCfgNum = (threadNum <= 0) ? 0 : GetDigitalByLog2(threadNum) + 1;
}
}
} else {
threadCfgNum = 0;
}
/* Compare conCfgNum and threadCfgNum and use the max one. */
g_instance.attr.attr_storage.wal_insert_status_entries_power =
((conCfgNum >= threadCfgNum) ? conCfgNum : threadCfgNum) + WAL_INSERT_STATUS_TBL_EXPAND_FACTOR;
if (g_instance.attr.attr_storage.wal_insert_status_entries_power < MIN_WAL_INSERT_STATUS_ENTRY_POW) {
g_instance.attr.attr_storage.wal_insert_status_entries_power = MIN_WAL_INSERT_STATUS_ENTRY_POW;
}
}
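/**
 * @Description: initialize the shared storage xlog control block for the given system
 * identifier; both insertHead and insertTail start at XLogSegSize.
 */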
void InitShareStorageCtlInfo(ShareStorageXLogCtl *ctlInfo, uint64 sysidentifier)
{
ctlInfo->magic = SHARE_STORAGE_CTL_MAGIC;
ctlInfo->length = SizeOfShareStorageXLogCtl;
ctlInfo->version = CURRENT_SHARE_STORAGE_CTL_VERSION;
ctlInfo->systemIdentifier = sysidentifier;
ctlInfo->insertHead = XLogSegSize;
ctlInfo->xlogFileSize = g_instance.attr.attr_storage.xlog_file_size;
ctlInfo->checkNumber = SHARE_STORAGE_CTL_CHCK_NUMBER;
ctlInfo->insertTail = XLogSegSize;
ctlInfo->term = 1;
ctlInfo->pad1 = 0;
ctlInfo->pad2 = 0;
ctlInfo->pad3 = 0;
ctlInfo->pad4 = 0;
}
void UpdateShareStorageCtlInfo(const ShareStorageXLogCtl *ctlInfo)
{
Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit && (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));
g_instance.xlog_cxt.shareStorageopCtl.opereateIf->WriteCtlInfo(ctlInfo);
}
void ReadShareStorageCtlInfo(ShareStorageXLogCtl *ctlInfo)
{
Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit && (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));
g_instance.xlog_cxt.shareStorageopCtl.opereateIf->ReadCtlInfo(ctlInfo);
}
int ReadXlogFromShareStorage(XLogRecPtr startLsn, char *buf, int expectReadLen)
{
Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit && (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));
return g_instance.xlog_cxt.shareStorageopCtl.opereateIf->readXlog(startLsn, buf, expectReadLen);
}
int WriteXlogToShareStorage(XLogRecPtr startLsn, char *buf, int writeLen)
{
Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit && (g_instance.xlog_cxt.shareStorageopCtl.opereateIf != NULL));
return g_instance.xlog_cxt.shareStorageopCtl.opereateIf->WriteXlog(startLsn, buf, writeLen);
}
pg_crc32c CalShareStorageCtlInfoCrc(const ShareStorageXLogCtl *ctlInfo)
{
Assert(g_instance.xlog_cxt.shareStorageopCtl.isInit);
pg_crc32c crc;
INIT_CRC32C(crc);
COMP_CRC32C(crc, (char*)ctlInfo, offsetof(ShareStorageXLogCtl, crc));
FIN_CRC32C(crc);
return crc;
}
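/**
 * @Description: rewrite one GUC line in postgresql.conf under the config file lock:
 * copy the file to a backup, modify the backup, atomically rename it over the
 * original, and then refresh the backup copy.
 */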
void UpdatePostgresqlFile(const char *optName, const char *gucLine)
{
char guc[MAXPGPATH];
char gucBak[MAXPGPATH];
char gucLock[MAXPGPATH];
errno_t ret = snprintf_s(guc, MAXPGPATH, MAXPGPATH - 1, "%s/postgresql.conf", t_thrd.proc_cxt.DataDir);
securec_check_ss(ret, "\0", "\0");
ret = snprintf_s(gucBak, MAXPGPATH, MAXPGPATH - 1, "%s/%s", t_thrd.proc_cxt.DataDir, CONFIG_BAK_FILENAME_WAL);
securec_check_ss(ret, "\0", "\0");
struct stat statbuf;
if (lstat(guc, &statbuf) != 0) {
if (errno != ENOENT) {
ereport(FATAL, (errmsg("UpdatePostgresqlFile could not stat file \"%s\": %m", guc)));
}
}
ConfFileLock filelock = { NULL, 0 };
ret = snprintf_s(gucLock, MAXPGPATH, MAXPGPATH - 1, "%s/postgresql.conf.lock", t_thrd.proc_cxt.DataDir);
securec_check_ss(ret, "\0", "\0");
/* 1. lock postgresql.conf */
if (get_file_lock(gucLock, &filelock) != CODE_OK) {
ereport(FATAL, (errmsg("UpdatePostgresqlFile:Modify postgresql.conf failed : can not get the file lock ")));
}
copy_file_internal(guc, gucBak, true);
char** opt_lines = read_guc_file(gucBak);
if (opt_lines == NULL) {
release_file_lock(&filelock);
ereport(FATAL, (errmsg("UpdatePostgresqlFile:the config file has no data,please check it.")));
}
modify_guc_one_line(&opt_lines, optName, gucLine);
ret = write_guc_file(gucBak, opt_lines);
release_opt_lines(opt_lines);
opt_lines = NULL;
if (ret != CODE_OK) {
release_file_lock(&filelock);
ereport(FATAL, (errmsg("UpdatePostgresqlFile:the config file %s updates failed.", gucBak)));
}
if (rename(gucBak, guc) != 0) {
release_file_lock(&filelock);
ereport(FATAL, (errcode_for_file_access(), errmsg("could not rename \"%s\" to \"%s\": %m", gucBak, guc)));
}
if (lstat(guc, &statbuf) != 0) {
if (errno != ENOENT) {
release_file_lock(&filelock);
ereport(PANIC, (errmsg("UpdatePostgresqlFile2:could not stat file \"%s\": %m", guc)));
}
}
if (statbuf.st_size > 0) {
copy_file_internal(guc, gucBak, true);
}
release_file_lock(&filelock);
}
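/**
 * @Description: set up the shared-storage xlog control structure in shared memory,
 * initialize the dorado storage layer, open the NAS lock file when configured, and,
 * on a standby cluster, reconcile xlog_file_size with the value recorded on shared storage.
 */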
void ShareStorageInit()
{
if (g_instance.attr.attr_storage.xlog_file_path != NULL && g_instance.attr.attr_storage.xlog_file_size > 0) {
bool found = false;
void *tmpBuf = ShmemInitStruct("share storage Ctl", CalShareStorageCtlSize(), &found);
g_instance.xlog_cxt.shareStorageXLogCtl = (ShareStorageXLogCtl *)TYPEALIGN(MEMORY_ALIGNED_SIZE, tmpBuf);
g_instance.xlog_cxt.shareStorageXLogCtlOrigin = tmpBuf;
InitDoradoStorage(g_instance.attr.attr_storage.xlog_file_path,
(uint64)g_instance.attr.attr_storage.xlog_file_size);
if (!IsInitdb && g_instance.attr.attr_storage.xlog_lock_file_path != NULL) {
g_instance.xlog_cxt.shareStorageLockFd = BasicOpenFile(g_instance.attr.attr_storage.xlog_lock_file_path,
O_CREAT | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
if (g_instance.xlog_cxt.shareStorageLockFd < 0) {
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open file \"%s\": %m",
g_instance.attr.attr_storage.xlog_lock_file_path)));
}
}
if (((uint64)g_instance.attr.attr_storage.xlog_file_size !=
g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize) && (IS_SHARED_STORAGE_STANDBY_CLUSTER)) {
uint32 const bufSz = 256;
char buf[bufSz];
errno_t errorno = snprintf_s(buf, sizeof(buf), sizeof(buf) - 1, "%lu",
g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize);
securec_check_ss(errorno, "", "");
SetConfigOption("xlog_file_size", buf, PGC_POSTMASTER, PGC_S_ARGV);
char option[bufSz];
errorno = snprintf_s(option, sizeof(option), sizeof(option) - 1, "xlog_file_size=%lu\n",
g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize);
securec_check_ss(errorno, "", "");
UpdatePostgresqlFile("xlog_file_size", option);
}
if (ENABLE_DMS) {
g_instance.dms_cxt.SSRecoveryInfo.dorado_sharestorage_inited = true;
}
}
}
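/**
 * @Description: record the rebuild reason (or, when connections are being disabled,
 * mark receive-done from shared storage) and then leave the walreceiver via ERROR.
 */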
void ShareStorageSetBuildErrorAndExit(HaRebuildReason reason, bool setRcvDone)
{
if (!setRcvDone) {
/* a rebuild is needed; report the connection error */
ha_set_rebuild_connerror(reason, REPL_INFO_ERROR);
} else {
uint32 disableConnectionNode =
pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
if (disableConnectionNode) {
pg_atomic_write_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage, true);
}
}
/* exit from walreceiver */
ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS),
errmsg("ShareStorageSetBuildErrorAndExit, exit from walreceiver! build reason: %s, RcvDone:%d",
wal_get_rebuild_reason_string(reason), (int)setRcvDone)));
}
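/**
 * @Description: locate the last complete record on shared storage. Starting from the
 * page containing the insert head, walk backwards page by page until a valid record
 * position is found, then read forward and remember the CRC, length, and start LSN of
 * the last record that ends at or before the insert head.
 */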
void FindLastRecordCheckInfoOnShareStorage(XLogRecPtr *lastRecordPtr, pg_crc32 *lastRecordCrc, int *lastRecordLen)
{
ShareStorageXLogCtl *ctlInfo = AlignAllocShareStorageCtl();
ReadShareStorageCtlInfo(ctlInfo);
XLogRecPtr startLsn = ctlInfo->insertHead - (ctlInfo->insertHead % XLOG_BLCKSZ);
XLogRecPtr endPtr = XLogSegSize;
if (ctlInfo->insertHead > g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize) {
endPtr = ctlInfo->insertHead - g_instance.xlog_cxt.shareStorageopCtl.xlogFileSize;
}
XLogReaderState *xlogReader = XLogReaderAllocate(&SharedStorageXLogPageRead, NULL,
g_instance.xlog_cxt.shareStorageopCtl.blkSize);
if (xlogReader == NULL) {
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"),
errdetail("FindLastRecordCheckInfoOnShareStorage Failed while allocating an XLog reader")));
}
bool findValidXLogFile = false;
XLogRecPtr curLsn;
while (XLByteLE(endPtr, startLsn)) {
XLogRecPtr curEnd;
curLsn = XLogFindNextRecord(xlogReader, startLsn, &curEnd);
if (XLogRecPtrIsInvalid(curLsn) || XLByteLT(ctlInfo->insertHead, curEnd)) {
startLsn = startLsn - XLOG_BLCKSZ;
continue;
} else {
findValidXLogFile = true;
break;
}
}
if (!findValidXLogFile) {
if (xlogReader != NULL) {
XLogReaderFree(xlogReader);
xlogReader = NULL;
}
AlignFreeShareStorageCtl(ctlInfo);
ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS), errmsg("could not find valid xlog on share storage")));
return;
}
for (;;) {
char *errorMsg = NULL;
XLogRecord *record = XLogReadRecord(xlogReader, curLsn, &errorMsg);
if (record == NULL || XLByteLT(ctlInfo->insertHead, xlogReader->EndRecPtr)) {
break;
}
curLsn = InvalidXLogRecPtr;
*lastRecordCrc = record->xl_crc;
*lastRecordLen = (int)(xlogReader->EndRecPtr - xlogReader->ReadRecPtr);
*lastRecordPtr = xlogReader->ReadRecPtr;
}
if (xlogReader != NULL) {
XLogReaderFree(xlogReader);
xlogReader = NULL;
}
if (XLogRecPtrIsInvalid(*lastRecordPtr)) {
ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS), errmsg("could not find max xlog on share storage")));
} else {
uint32 shiftSize = 32;
ereport(LOG, (errcode(ERRCODE_INVALID_STATUS),
errmsg("find max xlog on share storage %X/%X", static_cast<uint32>(*lastRecordPtr >> shiftSize),
static_cast<uint32>(*lastRecordPtr))));
}
AlignFreeShareStorageCtl(ctlInfo);
}
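/**
 * @Description: validate an xlog page header.
 * @return: xlp_total_len on success, 0 (after logging the reader's error message) on failure.
 */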
Size SimpleValidatePage(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, char *page)
{
XLogPageHeader hdr = (XLogPageHeader)page;
bool ret = ValidXLogPageHeader(xlogreader, targetPagePtr, hdr);
if (!ret) {
elog(LOG, "SimpleValidatePage:%s", xlogreader->errormsg_buf);
return 0;
}
return hdr->xlp_total_len;
}
int SharedStorageXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
int read_len = ReadXlogFromShareStorage(targetPagePtr, readBuf, Max(XLOG_BLCKSZ, reqLen));
return read_len;
}
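/**
 * @Description: acquire write permission on shared storage before acting as primary:
 * either reserve a SCSI-3 persistent key via gs_persist or lock the NAS lock file,
 * retrying up to 20 times before reporting FATAL.
 */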
void GetWritePermissionSharedStorage()
{
if (!IS_SHARED_STORAGE_MODE || IsInitdb) {
return;
}
const int maxRetryTimes = 20;
int retryTimes = 0;
if (g_instance.attr.attr_storage.xlog_lock_file_path == NULL) {
/* SCSI-3 */
char cmd[MAX_PATH_LEN] = {0};
char *appName = u_sess->attr.attr_common.application_name;
char *shareStoragePath = g_instance.attr.attr_storage.xlog_file_path;
errno_t rc = 0;
rc = snprintf_s(cmd, MAX_PATH_LEN, MAX_PATH_LEN - 1, "gs_persist %s %s -R", shareStoragePath, appName);
securec_check_ss_c(rc, "\0", "\0");
while (retryTimes < maxRetryTimes) {
++retryTimes;
rc = system(cmd);
if (rc == 0) {
break;
}
(void)sleep(1);
}
if (rc != 0) {
ereport(FATAL, (errmsg("could not reserve the key of %s on %s", appName, shareStoragePath)));
}
ereport(LOG, (errmsg("reserve the key of %s on shared storage: %s", appName, shareStoragePath)));
} else {
/* NAS lock file */
bool locked = false;
while (retryTimes < maxRetryTimes) {
++retryTimes;
locked = LockNasWriteFile(g_instance.xlog_cxt.shareStorageLockFd);
if (locked) {
break;
}
(void)sleep(1);
}
if (!locked) {
ereport(FATAL, (errmsg("could not lock lock file")));
}
ereport(LOG, (errmsg("lock lock file(%d) %s", g_instance.xlog_cxt.shareStorageLockFd,
g_instance.attr.attr_storage.xlog_lock_file_path)));
}
}
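/**
 * @Description: on a main standby, report the flush position as the larger of the
 * walreceiver's receive position and the last replayed end position.
 */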
XLogRecPtr GetFlushMainStandby()
{
XLogRecPtr recptr = InvalidXLogRecPtr;
XLogRecPtr replayptr = InvalidXLogRecPtr;
XLogRecPtr flushptr = InvalidXLogRecPtr;
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
SpinLockAcquire(&walrcv->mutex);
recptr = walrcv->receivedUpto;
SpinLockRelease(&walrcv->mutex);
SpinLockAcquire(&xlogctl->info_lck);
replayptr = xlogctl->lastReplayedEndRecPtr;
SpinLockRelease(&xlogctl->info_lck);
flushptr = XLByteLT(recptr, replayptr)? replayptr : recptr;
return flushptr;
}
/* SS bans data writes unless the current node is:
* 1. normal: running as primary;
* 2. switchover: primary demoting, doing shutdown checkpoint;
* 3. switchover: standby promoting, doing StartupXLOG;
* 4. switchover: standby promoted, running as primary de facto,
* waiting for DMS reformer thread to update its role.
* 5. failover: doing StartupXLOG
*/
bool SSXLogInsertAllowed()
{
/* allow xlog write as long as to different VGs */
return true;
}
bool SSModifySharedLunAllowed()
{
if (!ENABLE_DMS ||
SS_PRIMARY_MODE ||
g_instance.dms_cxt.SSClusterState == NODESTATE_PRIMARY_DEMOTING ||
g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTING ||
g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTED ||
SS_STANDBY_FAILOVER) {
return true;
}
return false;
}
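/* Thin wrappers over XLogFileInitInternal/XLogFileOpenInternal that target SS_XLOGRECOVERYDIR for the on-demand copy path. */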
static int SSOndemandCopyXLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
{
return XLogFileInitInternal(logsegno, use_existent, use_lock, SS_XLOGRECOVERYDIR);
}
static int SSOndemandCopyXlogFileOpen(XLogSegNo segno)
{
return XLogFileOpenInternal(segno, SS_XLOGRECOVERYDIR);
}
static void SSOndemandCopyXlogFileClose(void)
{
Assert(t_thrd.ondemand_xlog_copy_cxt.openLogFile >= 0);
if (close(t_thrd.ondemand_xlog_copy_cxt.openLogFile)) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not close copy log file %s: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo))));
}
t_thrd.ondemand_xlog_copy_cxt.openLogFile = -1;
}
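/**
 * @Description: during on-demand recovery, when this node is not the official recovery
 * node, mirror the given xlog bytes into SS_XLOGRECOVERYDIR, switching files on segment
 * boundaries and seeking whenever the requested offset differs from the cached position.
 */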
static void SSOndemandXlogCopy(XLogSegNo copySegNo, uint32 startOffset, char *copyBuffer, Size copyBytes)
{
// only copy when the recovery node and the reformer node are not the same
if (!SS_IN_ONDEMAND_RECOVERY || SS_OFFICIAL_RECOVERY_NODE) {
return;
}
if (t_thrd.ondemand_xlog_copy_cxt.openLogSegNo != copySegNo) {
if (t_thrd.ondemand_xlog_copy_cxt.openLogFile >= 0) {
SSOndemandCopyXlogFileClose();
}
t_thrd.ondemand_xlog_copy_cxt.openLogSegNo = copySegNo;
bool use_existent = true;
t_thrd.ondemand_xlog_copy_cxt.openLogFile =
SSOndemandCopyXLogFileInit(t_thrd.ondemand_xlog_copy_cxt.openLogSegNo, &use_existent, true);
t_thrd.ondemand_xlog_copy_cxt.openLogOff = 0;
}
if (t_thrd.ondemand_xlog_copy_cxt.openLogFile <= 0) {
t_thrd.ondemand_xlog_copy_cxt.openLogFile =
SSOndemandCopyXlogFileOpen(t_thrd.ondemand_xlog_copy_cxt.openLogSegNo);
t_thrd.ondemand_xlog_copy_cxt.openLogOff = 0;
}
if (t_thrd.ondemand_xlog_copy_cxt.openLogOff != startOffset) {
if (lseek(t_thrd.ondemand_xlog_copy_cxt.openLogFile, (off_t)startOffset, SEEK_SET) < 0) {
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo),
startOffset)));
}
t_thrd.ondemand_xlog_copy_cxt.openLogOff = startOffset;
}
Size actualBytes = write(t_thrd.ondemand_xlog_copy_cxt.openLogFile, copyBuffer, copyBytes);
if (actualBytes != copyBytes) {
/* if write didn't set errno, assume no disk space */
if (errno == 0) {
errno = ENOSPC;
}
ereport(PANIC, (errcode_for_file_access(),
errmsg("could not write to log file %s at offset %u, length %lu: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo),
t_thrd.ondemand_xlog_copy_cxt.openLogOff, (unsigned long)copyBytes)));
}
t_thrd.ondemand_xlog_copy_cxt.openLogOff += copyBytes;
}
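/**
 * @Description: XLogPageRead callback for shared-storage recovery. In dorado standby
 * cluster standby mode it waits for streamed WAL or polls archive/pg_xlog; otherwise it
 * opens the segment directly, then reads one XLOG_BLCKSZ page into readBuf (via the
 * pre-read buffer when one is available).
 */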
static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
/* Load reader private data */
XLogPageReadPrivate *readprivate = (XLogPageReadPrivate *)xlogreader->private_data;
int emode = IsExtremeRedo() ? LOG : readprivate->emode;
XLogRecPtr RecPtr = targetPagePtr;
uint32 targetPageOff;
bool processtrxn = false;
bool fetching_ckpt = readprivate->fetching_ckpt;
bool randAccess = IsExtremeRedo() ? false : readprivate->randAccess;
XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
XLogSegNo replayedSegNo;
uint32 actualBytes;
#ifdef USE_ASSERT_CHECKING
XLogSegNo targetSegNo;
XLByteToSeg(targetPagePtr, targetSegNo);
#endif
targetPageOff = targetPagePtr % XLogSegSize;
if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) {
/*
* Request a restartpoint if we've replayed too much xlog since the
* last one.
*/
if (t_thrd.xlog_cxt.StandbyModeRequested &&
(t_thrd.xlog_cxt.bgwriterLaunched || t_thrd.xlog_cxt.pagewriter_launched) && !dummyStandbyMode) {
if (get_real_recovery_parallelism() > 1) {
XLByteToSeg(GetXLogReplayRecPtr(NULL), replayedSegNo);
} else {
replayedSegNo = t_thrd.xlog_cxt.readSegNo;
}
if (XLogCheckpointNeeded(replayedSegNo)) {
(void)GetRedoRecPtr();
if (XLogCheckpointNeeded(replayedSegNo)) {
RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
}
}
}
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readSource = 0;
}
XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo);
XLByteAdvance(RecPtr, reqLen);
retry:
/* See if we need to retrieve more data */
if (t_thrd.xlog_cxt.readFile < 0 ||
(t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM && XLByteLT(t_thrd.xlog_cxt.receivedUpto, RecPtr))) {
if (t_thrd.xlog_cxt.StandbyMode && t_thrd.xlog_cxt.startup_processing && DORADO_STANDBY_CLUSTER) {
/*
* In standby mode, wait for the requested record to become
* available, either via restore_command succeeding to restore the
* segment, or via walreceiver having streamed the record.
*/
for (;;) {
/*
 * We also need to handle the case where the consistency level is
 * already reached without replaying any record, i.e. minRecoveryPoint
 * is reached right after reading the checkpoint data. Moreover,
 * whenever we are about to loop waiting for data from the master
 * node, it is better to check whether the consistency level has been
 * reached. So instead of checking in every place before ReadRecord,
 * we check here in one centralised location.
 */
ProcTxnWorkLoad(false);
CheckRecoveryConsistency();
if (WalRcvInProgress()) {
XLogRecPtr expectedRecPtr = RecPtr;
bool havedata = false;
/*
* If we find an invalid record in the WAL streamed from
* master, something is seriously wrong. There's little
* chance that the problem will just go away, but PANIC is
* not good for availability either, especially in hot
* standby mode. Disconnect, and retry from
* archive/pg_xlog again. The WAL in the archive should be
* identical to what was streamed, so it's unlikely that
* it helps, but one can hope...
*/
if (t_thrd.xlog_cxt.failedSources & XLOG_FROM_STREAM) {
ProcTxnWorkLoad(true);
ereport(LOG, (errmsg("read from stream failed, request xlog receivedupto at %X/%X. targetRecPtr:%X/%x",
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto,
(uint32)(targetRecPtr >> 32),
(uint32)targetRecPtr
)));
ShutdownWalRcv();
continue;
}
/*
* Walreceiver is active, so see if new data has arrived.
*
* We only advance XLogReceiptTime when we obtain fresh
* WAL from walreceiver and observe that we had already
* processed everything before the most recent "chunk"
* that it flushed to disk. In steady state where we are
* keeping up with the incoming data, XLogReceiptTime will
* be updated on each cycle. When we are behind,
* XLogReceiptTime will not advance, so the grace time
* allotted to conflicting queries will decrease.
*/
if (RecPtr % XLogSegSize == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD);
} else if (RecPtr % XLOG_BLCKSZ == 0) {
XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD);
}
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
} else {
XLogRecPtr latestChunkStart;
t_thrd.xlog_cxt.receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
if (XLByteLT(expectedRecPtr, t_thrd.xlog_cxt.receivedUpto)) {
havedata = true;
if (!XLByteLT(RecPtr, latestChunkStart)) {
t_thrd.xlog_cxt.XLogReceiptTime = GetCurrentTimestamp();
SetCurrentChunkStartTime(t_thrd.xlog_cxt.XLogReceiptTime);
}
} else {
havedata = false;
}
}
if (havedata) {
/*
* Great, streamed far enough. Open the file if it's
* not open already. Use XLOG_FROM_STREAM so that
* source info is set correctly and XLogReceiptTime
* isn't changed.
*/
if (t_thrd.xlog_cxt.readFile < 0) {
t_thrd.xlog_cxt.readFile = XLogFileRead(t_thrd.xlog_cxt.readSegNo, PANIC,
t_thrd.xlog_cxt.recoveryTargetTLI, XLOG_FROM_STREAM,
false);
Assert(t_thrd.xlog_cxt.readFile >= 0);
} else {
/* just make sure source info is correct... */
t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM;
t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM;
}
break;
}
t_thrd.xlog_cxt.RedoDone = IsRedoDonePromoting();
pg_memory_barrier();
if (IS_SHARED_STORAGE_MODE) {
uint32 disableConnectionNode =
pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
if (disableConnectionNode && WalRcvIsRunning()) {
ereport(LOG, (errmsg("request xlog receivedupto at %X/%X.",
(uint32)(t_thrd.xlog_cxt.receivedUpto >> 32),
(uint32)t_thrd.xlog_cxt.receivedUpto)));
ShutdownWalRcv();
}
}
if (!processtrxn) {
ProcTxnWorkLoad(true);
processtrxn = true;
goto retry;
}
/*
* Wait for more WAL to arrive, or timeout to be reached
*/
WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L);
processtrxn = false;
ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch);
} else {
uint32 sources;
/*
* Until walreceiver manages to reconnect, poll the
* archive.
*/
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
t_thrd.xlog_cxt.readFile = -1;
}
/* Reset curFileTLI if random fetch. */
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
/*
* Try to restore the file from archive, or read an
* existing file from pg_xlog.
*/
sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
ereport(DEBUG5, (errmsg("failedSources: %u", t_thrd.xlog_cxt.failedSources)));
if (!(sources & ~t_thrd.xlog_cxt.failedSources)) {
/*
* We've exhausted all options for retrieving the
* file. Retry.
*/
t_thrd.xlog_cxt.failedSources = 0;
/*
* Before we sleep, re-scan for possible new timelines
* if we were requested to recover to the latest
* timeline.
*/
if (t_thrd.xlog_cxt.recoveryTargetIsLatest) {
if (rescanLatestTimeLine()) {
continue;
}
}
if (t_thrd.startup_cxt.shutdown_requested) {
ereport(LOG, (errmsg("startup shutdown")));
proc_exit(0);
}
if (!xlogctl->IsRecoveryDone) {
g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp();
g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr;
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead IsRecoveryDone is set true,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
parallel_recovery::redo_dump_all_stats();
}
/*
* signal postmaster to update local redo end
* point to gaussdb state file.
*/
ProcTxnWorkLoad(true);
if (!xlogctl->IsRecoveryDone) {
SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE);
if (DORADO_STANDBY_CLUSTER && SS_PERFORMING_SWITCHOVER) {
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTED;
}
}
SpinLockAcquire(&xlogctl->info_lck);
xlogctl->IsRecoveryDone = true;
SpinLockRelease(&xlogctl->info_lck);
static uint64 printFrequency = 0;
if (!(IS_SHARED_STORAGE_MODE) ||
pg_atomic_read_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage)) {
knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM);
if ((printFrequency & 0xFF) == 0) {
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("XLogPageRead set redo finish status,"
"ReadRecPtr:%X/%X, EndRecPtr:%X/%X",
(uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.ReadRecPtr),
(uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32),
(uint32)(t_thrd.xlog_cxt.EndRecPtr))));
}
printFrequency++;
/*
 * Sleep briefly before the next attempt to avoid busy-waiting; the
 * interval depends on the deployment mode.
 */
if (IS_SHARED_STORAGE_MODE) {
uint32 connMode =
pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node);
if (connMode) {
pg_atomic_write_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node,
false);
}
pg_usleep(2000000L);
} else {
#ifdef ENABLE_LITE_MODE
pg_usleep(1000000L);
#else
pg_usleep(50000L);
#endif
}
}
/*
* If primary_conninfo is set, launch walreceiver to
* try to stream the missing WAL, before retrying to
* restore from archive/pg_xlog.
*
* If fetching_ckpt is TRUE, RecPtr points to the
* initial checkpoint location. In that case, we use
* RedoStartLSN as the streaming start position
* instead of RecPtr, so that when we later jump
* backwards to start redo at RedoStartLSN, we will
* have the logs streamed already.
*/
load_server_mode();
if (IS_SHARED_STORAGE_STANBY_MODE && !IS_SHARED_STORAGE_MAIN_STANDBY_MODE) {
ProcTxnWorkLoad(false);
/* use volatile pointer to prevent code rearrangement */
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
#ifndef ENABLE_MULTIPLE_NODES
rename_recovery_conf_for_roach();
#endif
ereport(LOG, (errmsg("request xlog stream from shared storage at %X/%X.",
fetching_ckpt ? (uint32)(t_thrd.xlog_cxt.RedoStartLSN >> 32)
: (uint32)(targetRecPtr >> 32),
fetching_ckpt ? (uint32)t_thrd.xlog_cxt.RedoStartLSN
: (uint32)targetRecPtr)));
ShutdownWalRcv();
t_thrd.xlog_cxt.receivedUpto = 0;
SpinLockAcquire(&walrcv->mutex);
walrcv->receivedUpto = 0;
SpinLockRelease(&walrcv->mutex);
RequestXLogStreaming(fetching_ckpt ? &t_thrd.xlog_cxt.RedoStartLSN : &targetRecPtr, 0,
REPCONNTARGET_SHARED_STORAGE, 0);
continue;
}
}
/* Don't try to read from a source that just failed */
sources &= ~t_thrd.xlog_cxt.failedSources;
t_thrd.xlog_cxt.readFile = SSXLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, emode, sources, xlog_path);
if (t_thrd.xlog_cxt.readFile >= 0) {
break;
}
ereport(DEBUG5, (errmsg("do not find any more files.sources=%u failedSources=%u", sources,
t_thrd.xlog_cxt.failedSources)));
/*
* Nope, not found in archive and/or pg_xlog.
*/
t_thrd.xlog_cxt.failedSources |= sources;
}
/*
* This possibly-long loop needs to handle interrupts of
* startup process.
*/
RedoInterruptCallBack();
}
} else {
/* In archive or crash recovery. */
if (t_thrd.xlog_cxt.readFile < 0) {
uint32 sources;
/* Reset curFileTLI if random fetch. */
if (randAccess) {
t_thrd.xlog_cxt.curFileTLI = 0;
}
sources = XLOG_FROM_PG_XLOG;
if (t_thrd.xlog_cxt.InArchiveRecovery) {
sources |= XLOG_FROM_ARCHIVE;
}
t_thrd.xlog_cxt.readFile = SSXLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo,
emode, sources, xlog_path);
if (t_thrd.xlog_cxt.readFile < 0) {
return -1;
}
}
}
}
/*
* At this point, we have the right segment open and if we're streaming we
* know the requested record is in it.
*/
Assert(t_thrd.xlog_cxt.readFile != -1);
if (t_thrd.xlog_cxt.readSource == XLOG_FROM_STREAM) {
if ((targetPagePtr / XLOG_BLCKSZ) != (t_thrd.xlog_cxt.receivedUpto / XLOG_BLCKSZ)) {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
} else {
t_thrd.xlog_cxt.readLen = t_thrd.xlog_cxt.receivedUpto % XLogSegSize - targetPageOff;
}
} else {
t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ;
}
/* Read the requested page */
t_thrd.xlog_cxt.readOff = targetPageOff;
if (DORADO_STANDBY_CLUSTER_MAINSTANDBY_NODE) {
try_again:
if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) {
ereport(emode_for_corrupt_record(emode, RecPtr),
(errcode_for_file_access(),
errmsg("could not seek in log file %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff)));
if (errno == EINTR) {
errno = 0;
pg_usleep(1000);
goto try_again;
}
goto next_record_is_invalid;
}
pgstat_report_waitevent(WAIT_EVENT_WAL_READ);
actualBytes = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ);
pgstat_report_waitevent(WAIT_EVENT_END);
if (actualBytes != XLOG_BLCKSZ) {
ereport(emode_for_corrupt_record(emode, RecPtr),
(errcode_for_file_access(),
errmsg("could not read from log file %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff)));
if (errno == EINTR) {
errno = 0;
pg_usleep(1000);
goto try_again;
}
goto next_record_is_invalid;
}
} else {
if (xlogreader->preReadBuf != NULL) {
actualBytes = (uint32)SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, readBuf, XLOG_BLCKSZ);
} else {
actualBytes = (uint32)pread(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ, t_thrd.xlog_cxt.readOff);
}
if (actualBytes != XLOG_BLCKSZ) {
ereport(LOG, (errcode_for_file_access(), errmsg("read xlog(start:%X/%X, pos:%u len:%d) failed : %m",
static_cast<uint32>(targetPagePtr >> BIT_NUM_INT32),
static_cast<uint32>(targetPagePtr), targetPageOff,
reqLen)));
ereport(emode_for_corrupt_record(emode, RecPtr),
(errcode_for_file_access(),
errmsg("could not read from log file %s to offset %u: %m",
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo),
t_thrd.xlog_cxt.readOff)));
goto next_record_is_invalid;
}
}
Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo);
Assert(targetPageOff == t_thrd.xlog_cxt.readOff);
Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen);
*readTLI = t_thrd.xlog_cxt.curFileTLI;
return t_thrd.xlog_cxt.readLen;
next_record_is_invalid:
t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;
if (t_thrd.xlog_cxt.readFile >= 0) {
close(t_thrd.xlog_cxt.readFile);
}
t_thrd.xlog_cxt.readFile = -1;
t_thrd.xlog_cxt.readLen = 0;
t_thrd.xlog_cxt.readSource = 0;
return -1;
}
int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path)
{
int read_len = SSReadXLog(xlogreader, targetPagePtr, reqLen, targetRecPtr,
readBuf, readTLI, g_instance.dms_cxt.SSRecoveryInfo.recovery_xlog_dir);
return read_len;
}