openGauss-server/src/gausskernel/process/postmaster/bgwriter.cpp

/*
 *
 * bgwriter.cpp
 *
 * The background writer (bgwriter) is new as of Postgres 8.0.	It attempts
 * to keep regular backends from having to write out dirty shared buffers
 * (which they would only do when needing to free a shared buffer to read in
 * another page).  In the best scenario all writes from shared buffers will
 * be issued by the background writer process.	However, regular backends are
 * still empowered to issue writes if the bgwriter fails to maintain enough
 * clean shared buffers.
 *
 * As of Postgres 9.2 the bgwriter no longer handles checkpoints.
 *
 * The bgwriter is started by the postmaster as soon as the startup subprocess
 * finishes, or as soon as recovery begins if we are doing archive recovery.
 * It remains alive until the postmaster commands it to terminate.
 * Normal termination is by SIGTERM, which instructs the bgwriter to exit(0).
 * Emergency termination is by SIGQUIT; like any backend, the bgwriter will
 * simply abort and exit on SIGQUIT.
 *
 * If the bgwriter exits unexpectedly, the postmaster treats that the same
 * as a backend crash: shared memory may be corrupted, so remaining backends
 * should be killed by SIGQUIT and then a recovery cycle started.
 *
 *
 * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  src/gausskernel/process/postmaster/bgwriter.cpp
 *
 * -------------------------------------------------------------------------
 */
#include "postgres.h"
#include "knl/knl_variable.h"

#include <signal.h>
#include <sys/time.h>

#include "access/xlog_internal.h"
#include "access/double_write.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "postmaster/pagewriter.h"
#include "storage/buf/bufmgr.h"
#include "storage/ipc.h"
#include "storage/lock/lwlock.h"
#include "storage/proc.h"
#include "storage/shmem.h"
#include "storage/smgr/smgr.h"
#include "storage/spin.h"
#include "storage/standby.h"
#include "utils/guc.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "gssignal/gs_signal.h"
#include "replication/slot.h"

/*
 * Multiplier to apply to BgWriterDelay when we decide to hibernate.
 * (Perhaps this needs to be configurable?)
 */
#define HIBERNATE_FACTOR 50

/*
 * LSN and timestamp at which we last issued a LogStandbySnapshot(), to avoid
 * doing so too often or repeatedly if there has been no other write activity
 * in the system.
 */
static TimestampTz last_snapshot_ts;
static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr;

/* Signal handlers */
static void bgwriter_quickdie(SIGNAL_ARGS);
static void bgwriter_sighup_handler(SIGNAL_ARGS);
static void bgwriter_request_shutdown_handler(SIGNAL_ARGS);
static void bgwriter_sigusr1_handler(SIGNAL_ARGS);
extern void write_term_log(uint32 term);

/* incremental checkpoint bgwriter thread function */
const int MAX_THREAD_NAME_LEN = 128;

static void setup_bgwriter_signalhook(void)
{
    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    (void)gspqsignal(SIGHUP, bgwriter_sighup_handler);      /* set flag to read config file */
    (void)gspqsignal(SIGINT, SIG_IGN);
    (void)gspqsignal(SIGTERM, bgwriter_request_shutdown_handler);       /* shutdown */
    (void)gspqsignal(SIGQUIT, bgwriter_quickdie);       /* hard crash time */
    (void)gspqsignal(SIGALRM, SIG_IGN);
    (void)gspqsignal(SIGPIPE, SIG_IGN);
    (void)gspqsignal(SIGUSR1, bgwriter_sigusr1_handler);
    (void)gspqsignal(SIGUSR2, SIG_IGN);
    (void)gspqsignal(SIGURG, print_stack);
    /*
     * Reset some signals that are accepted by postmaster but not here
     */
    (void)gspqsignal(SIGCHLD, SIG_DFL);
    (void)gspqsignal(SIGTTIN, SIG_DFL);
    (void)gspqsignal(SIGTTOU, SIG_DFL);
    (void)gspqsignal(SIGCONT, SIG_DFL);
    (void)gspqsignal(SIGWINCH, SIG_DFL);

    /* We allow SIGQUIT (quickdie) at all times */
    sigdelset(&t_thrd.libpq_cxt.BlockSig, SIGQUIT);
}

static void bgwriter_handle_exceptions(WritebackContext *wb_context, MemoryContext bgwriter_cxt)
{
    /*
     * Close all open files after any error.  This is helpful on Windows,
     * where holding deleted files open causes various strange errors.
     * It's not clear we need it elsewhere, but shouldn't hurt.
     */

    /* Since not using PG_TRY, must reset error stack by hand */
    t_thrd.log_cxt.error_context_stack = NULL;

    t_thrd.log_cxt.call_stack = NULL;

    /* Prevent interrupts while cleaning up */
    HOLD_INTERRUPTS();

    /* Report the error to the server log */
    EmitErrorReport();

    /* abort async io, must before LWlock release */
    AbortAsyncListIO();

    /* release resource held by lsc */
    AtEOXact_SysDBCache(false);

    /*
     * These operations are really just a minimal subset of
     * AbortTransaction().  We don't have very many resources to worry
     * about in bgwriter, but we do have LWLocks, buffers, and temp files.
     */
    LWLockReleaseAll();
    AbortBufferIO();
    UnlockBuffers();
    /* buffer pins are released here: */
    ResourceOwnerRelease(t_thrd.utils_cxt.CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true);
    /* we needn't bother with the other ResourceOwnerRelease phases */
    AtEOXact_Buffers(false);
    AtEOXact_SMgr();
    AtEOXact_Files();
    AtEOXact_HashTables(false);

    /* release compression ctx */
    crps_destory_ctxs();

    /*
     * Now return to normal top-level context and clear ErrorContext for
     * next time.
     */
    (void)MemoryContextSwitchTo(bgwriter_cxt);
    FlushErrorState();

    /* Flush any leaked data in the top-level context */
    MemoryContextResetAndDeleteChildren(bgwriter_cxt);

    /* re-initialize to avoid repeated errors causing problems */
    WritebackContextInit(wb_context, &u_sess->attr.attr_storage.bgwriter_flush_after);

    /* Now we can allow interrupts again */
    RESUME_INTERRUPTS();

    /*
     * Sleep at least 1 second after any error.  A write error is likely
     * to be repeated, and we don't want to be filling the error logs as
     * fast as we can.
     */
    pg_usleep(1000000L);
    return;
}

/*
 * Main entry point for bgwriter process
 *
 * This is invoked from AuxiliaryProcessMain, which has already created the
 * basic execution environment, but not enabled signals yet.
 */
void BackgroundWriterMain(void)
{
    sigjmp_buf local_sigjmp_buf;
    MemoryContext bgwriter_context;
    bool prev_hibernate = false;
    WritebackContext wb_context;

    t_thrd.role = BGWRITER;

    ereport(LOG, (errmsg("bgwriter started")));

    setup_bgwriter_signalhook();

    /*
     * We just started, assume there has been either a shutdown or
     * end-of-recovery snapshot.
     */
    last_snapshot_ts = GetCurrentTimestamp();

    /*
     * Create a resource owner to keep track of our resources (currently only
     * buffer pins).
     */
    t_thrd.utils_cxt.CurrentResourceOwner = ResourceOwnerCreate(NULL, "Background Writer",
        THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));
    /*
     * Create a memory context that we will do all our work in.  We do this so
     * that we can reset the context during error recovery and thereby avoid
     * possible memory leaks.  Formerly this code just ran in
     * t_thrd.top_mem_cxt, but resetting that would be a really bad idea.
     */
    bgwriter_context = AllocSetContextCreate(t_thrd.top_mem_cxt,
        "Background Writer",
        ALLOCSET_DEFAULT_MINSIZE,
        ALLOCSET_DEFAULT_INITSIZE,
        ALLOCSET_DEFAULT_MAXSIZE);
    MemoryContextSwitchTo(bgwriter_context);

    WritebackContextInit(&wb_context, &u_sess->attr.attr_storage.bgwriter_flush_after);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * See notes in postgres.c about the design of this coding.
     */
    int curTryCounter;
    int* oldTryCounter = NULL;
    if (sigsetjmp(local_sigjmp_buf, 1) != 0) {
        gstrace_tryblock_exit(true, oldTryCounter);
        bgwriter_handle_exceptions(&wb_context, bgwriter_context);

        /* Report wait end here, when there is no further possibility of wait */
        pgstat_report_waitevent(WAIT_EVENT_END);
    }
    oldTryCounter = gstrace_tryblock_entry(&curTryCounter);

    /* We can now handle ereport(ERROR) */
    t_thrd.log_cxt.PG_exception_stack = &local_sigjmp_buf;

    /*
     * Unblock signals (they were blocked when the postmaster forked us)
     */
    gs_signal_setmask(&t_thrd.libpq_cxt.UnBlockSig, NULL);
    (void)gs_signal_unblock_sigusr2();

    /*
     * Use the recovery target timeline ID during recovery
     */
    if (RecoveryInProgress())
        t_thrd.xlog_cxt.ThisTimeLineID = GetRecoveryTargetTLI();

    /*
     * Reset hibernation state after any error.
     */
    prev_hibernate = false;

    /* init compression ctx for page compression */
    crps_create_ctxs(t_thrd.role);

    pgstat_report_appname("Background writer");
    pgstat_report_activity(STATE_IDLE, NULL);

    /*
     * Loop forever
     */
    for (;;) {
        bool can_hibernate = false;
        int rc;

        /*
         * when double write is disabled, pg_dw_meta will be created with dw_file_num = 0, so
         * here is for upgrading process. bgwriter will run when enable_incremetal_checkpoint = off.
         */
        if (pg_atomic_read_u32(&g_instance.dw_batch_cxt.dw_version) < DW_SUPPORT_REABLE_DOUBLE_WRITE
            && t_thrd.proc->workingVersionNum >= DW_SUPPORT_REABLE_DOUBLE_WRITE) {
            dw_upgrade_renable_double_write();
        }

        /* Clear any already-pending wakeups */
        ResetLatch(&t_thrd.proc->procLatch);

        pgstat_report_activity(STATE_RUNNING, NULL);

        if (t_thrd.bgwriter_cxt.got_SIGHUP) {
            t_thrd.bgwriter_cxt.got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
        }

        if (t_thrd.bgwriter_cxt.shutdown_requested) {
            /*
             * From here on, elog(ERROR) should end with exit(1), not send
             * control back to the sigsetjmp block above
             */
            u_sess->attr.attr_common.ExitOnAnyError = true;
            /* Normal exit from the bgwriter is here */

            /* release compression ctx */
            crps_destory_ctxs();

            proc_exit(0); /* done */
        }

        /*
         * Do one cycle of dirty-buffer writing.
         */
        can_hibernate = BgBufferSync(&wb_context);

        /*
         * Send off activity statistics to the stats collector
         */
        pgstat_send_bgwriter();

        if (FirstCallSinceLastCheckpoint()) {
            /*
             * After any checkpoint, close all smgr files.	This is so we
             * won't hang onto smgr references to deleted files indefinitely.
             */
            smgrcloseall();
        }
        /*
         * Log a new xl_running_xacts every now and then so replication can get
         * into a consistent state faster (think of suboverflowed snapshots)
         * and clean up resources (locks, KnownXids*) more frequently. The
         * costs of this are relatively low, so doing it 4 times
         * (LOG_SNAPSHOT_INTERVAL_MS) a minute seems fine.
         *
         * We assume the interval for writing xl_running_xacts is
         * significantly bigger than BgWriterDelay, so we don't complicate the
         * overall timeout handling but just assume we're going to get called
         * often enough even if hibernation mode is active. It's not that
         * important that log_snap_interval_ms is met strictly. To make sure
         * we're not waking the disk up unneccesarily on an idle system we
         * check whether there has been any WAL inserted since the last time
         * we've logged a running xacts.
         *
         * We do this logging in the bgwriter as its the only process thats
         * run regularly and returns to its mainloop all the
         * time. E.g. Checkpointer, when active, is barely ever in its
         * mainloop and thus makes it hard to log regularly.
         */
        if (XLogStandbyInfoActive() && !RecoveryInProgress()) {
            TimestampTz timeout = 0;
            TimestampTz now = GetCurrentTimestamp();
            timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, LOG_SNAPSHOT_INTERVAL_MS);

            /*
             * only log if enough time has passed and some xlog record has been
             * inserted.
             */
            if (now >= timeout && !XLByteEQ(last_snapshot_lsn, GetXLogInsertRecPtr())) {
                last_snapshot_lsn = LogStandbySnapshot();
                last_snapshot_ts = now;
            }
            if (now >= timeout) {
                LogCheckSlot();
            }
        }

        /*
         * Sleep until we are signaled or BgWriterDelay has elapsed.
         *
         * Note: the feedback control loop in BgBufferSync() expects that we
         * will call it every BgWriterDelay msec.  While it's not critical for
         * correctness that that be exact, the feedback loop might misbehave
         * if we stray too far from that.  Hence, avoid loading this process
         * down with latch events that are likely to happen frequently during
         * normal operation.
         */
        pgstat_report_activity(STATE_IDLE, NULL);
        rc = WaitLatch(&t_thrd.proc->procLatch,
            WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
            u_sess->attr.attr_storage.BgWriterDelay /* ms */);

        /*
         * If no latch event and BgBufferSync says nothing's happening, extend
         * the sleep in "hibernation" mode, where we sleep for much longer
         * than bgwriter_delay says.  Fewer wakeups save electricity.  When a
         * backend starts using buffers again, it will wake us up by setting
         * our latch.  Because the extra sleep will persist only as long as no
         * buffer allocations happen, this should not distort the behavior of
         * BgBufferSync's control loop too badly; essentially, it will think
         * that the system-wide idle interval didn't exist.
         *
         * There is a race condition here, in that a backend might allocate a
         * buffer between the time BgBufferSync saw the alloc count as zero
         * and the time we call StrategyNotifyBgWriter.  While it's not
         * critical that we not hibernate anyway, we try to reduce the odds of
         * that by only hibernating when BgBufferSync says nothing's happening
         * for two consecutive cycles.	Also, we mitigate any possible
         * consequences of a missed wakeup by not hibernating forever.
         */
        if (rc == WL_TIMEOUT && can_hibernate && prev_hibernate) {
            /* Ask for notification at next buffer allocation */
            StrategyNotifyBgWriter(t_thrd.proc->pgprocno);
            /* Sleep ... */
            rc = WaitLatch(&t_thrd.proc->procLatch,
                WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                u_sess->attr.attr_storage.BgWriterDelay * HIBERNATE_FACTOR);
            /* Reset the notification request in case we timed out */
            StrategyNotifyBgWriter(-1);
        }

        /*
         * Emergency bailout if postmaster has died.  This is to avoid the
         * necessity for manual cleanup of all postmaster children.
         */
        if (rc & WL_POSTMASTER_DEATH) {

            /* release compression ctx */
            crps_destory_ctxs();

            gs_thread_exit(1);
        }

        prev_hibernate = can_hibernate;
    }
}

/* --------------------------------
 *		signal handler routines
 * --------------------------------
 */
/*
 * bg_quickdie() occurs when signalled SIGQUIT by the postmaster.
 *
 * Some backend has bought the farm,
 * so we need to stop what we're doing and exit.
 */
static void bgwriter_quickdie(SIGNAL_ARGS)
{
    gs_signal_setmask(&t_thrd.libpq_cxt.BlockSig, NULL);

    /*
     * We DO NOT want to run proc_exit() callbacks -- we're here because
     * shared memory may be corrupted, so we don't want to try to clean up our
     * transaction.  Just nail the windows shut and get out of town.  Now that
     * there's an atexit callback to prevent third-party code from breaking
     * things by calling exit() directly, we have to reset the callbacks
     * explicitly to make this work as intended.
     */
    on_exit_reset();

    /*
     * Note we do exit(2) not exit(0).	This is to force the postmaster into a
     * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
     * backend.  This is necessary precisely because we don't clean up our
     * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
     * should ensure the postmaster sees this as a crash, too, but no harm in
     * being doubly sure.)
     */

    /* release compression ctx */
    crps_destory_ctxs();

    exit(2);
}

/* SIGHUP: set flag to re-read config file at next convenient time */
static void bgwriter_sighup_handler(SIGNAL_ARGS)
{
    int save_errno = errno;

    t_thrd.bgwriter_cxt.got_SIGHUP = true;

    if (t_thrd.proc)
        SetLatch(&t_thrd.proc->procLatch);

    errno = save_errno;
}

/* SIGTERM: set flag to shutdown and exit */
static void bgwriter_request_shutdown_handler(SIGNAL_ARGS)
{
    int save_errno = errno;

    t_thrd.bgwriter_cxt.shutdown_requested = true;
    t_thrd.int_cxt.ProcDiePending = true;

    if (t_thrd.proc)
        SetLatch(&t_thrd.proc->procLatch);

    errno = save_errno;
}

/* SIGUSR1: used for latch wakeups */
static void bgwriter_sigusr1_handler(SIGNAL_ARGS)
{
    int save_errno = errno;

    latch_sigusr1_handler();

    errno = save_errno;
}

bool IsBgwriterProcess(void)
{
    return (t_thrd.role == BGWRITER);
}

/* bgwriter view function */
Datum bgwriter_view_get_node_name()
{
    if (g_instance.attr.attr_common.PGXCNodeName == NULL || g_instance.attr.attr_common.PGXCNodeName[0] == '\0') {
        return CStringGetTextDatum("not define");
    } else {
        return CStringGetTextDatum(g_instance.attr.attr_common.PGXCNodeName);
    }
}

Datum bgwriter_view_get_actual_flush_num()
{
    return Int64GetDatum(0);
}


Datum bgwriter_view_get_last_flush_num()
{
    return Int32GetDatum(0);
}

Datum bgwriter_view_get_candidate_nums()
{
    int candidate_num = get_curr_candidate_nums(CAND_LIST_NORMAL) + get_curr_candidate_nums(CAND_LIST_NVM) +
        get_curr_candidate_nums(CAND_LIST_SEG);
    return Int32GetDatum(candidate_num);
}

Datum bgwriter_view_get_num_candidate_list()
{
    return Int64GetDatum(g_instance.ckpt_cxt_ctl->get_buf_num_candidate_list);
}

Datum bgwriter_view_get_num_clock_sweep()
{
    return Int64GetDatum(g_instance.ckpt_cxt_ctl->get_buf_num_clock_sweep);
}

const incre_ckpt_view_col g_bgwriter_view_col[INCRE_CKPT_BGWRITER_VIEW_COL_NUM] = {
    {"node_name", TEXTOID, bgwriter_view_get_node_name},
    {"bgwr_actual_flush_total_num", INT8OID, bgwriter_view_get_actual_flush_num},
    {"bgwr_last_flush_num", INT4OID, bgwriter_view_get_last_flush_num},
    {"candidate_slots", INT4OID, bgwriter_view_get_candidate_nums},
    {"get_buffer_from_list", INT8OID, bgwriter_view_get_num_candidate_list},
    {"get_buf_clock_sweep", INT8OID, bgwriter_view_get_num_clock_sweep}};


const uint THREAD_SLEEP_TIME = 10 * 60 * 1000;
void invalid_buffer_bgwriter_main()
{
    sigjmp_buf	localSigjmpBuf;
    MemoryContext bgwriter_context;
    char name[MAX_THREAD_NAME_LEN] = {0};
    WritebackContext wb_context;
    t_thrd.role = SPBGWRITER;

    setup_bgwriter_signalhook();
    ereport(LOG, (errmsg("invalidate buffer bgwriter started")));

    errno_t err_rc = snprintf_s(name, MAX_THREAD_NAME_LEN, MAX_THREAD_NAME_LEN - 1, "%s", "spbgwriter");
    securec_check_ss(err_rc, "", "");

    /* Create a resource owner to keep track of our resources (currently only buffer pins). */
    t_thrd.utils_cxt.CurrentResourceOwner = ResourceOwnerCreate(NULL, name,
            THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));

    /*
     * Create a memory context that we will do all our work in.  We do this so
     * that we can reset the context during error recovery and thereby avoid
     * possible memory leaks.  Formerly this code just ran in
     * t_thrd.top_mem_cxt, but resetting that would be a really bad idea.
     */
    bgwriter_context = AllocSetContextCreate(t_thrd.top_mem_cxt, name, ALLOCSET_DEFAULT_MINSIZE,
        ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE);
    MemoryContextSwitchTo(bgwriter_context);

    WritebackContextInit(&wb_context, &u_sess->attr.attr_storage.bgwriter_flush_after);

    if (sigsetjmp(localSigjmpBuf, 1) != 0) {
        ereport(WARNING, (errmsg("invalidate buffer bgwriter exception occured.")));
        bgwriter_handle_exceptions(&wb_context, bgwriter_context);
    }

    /* We can now handle ereport(ERROR) */
    t_thrd.log_cxt.PG_exception_stack = &localSigjmpBuf;

    /* Unblock signals (they were blocked when the postmaster forked us) */
    gs_signal_setmask(&t_thrd.libpq_cxt.UnBlockSig, NULL);
    (void)gs_signal_unblock_sigusr2();

    /* Use the recovery target timeline ID during recovery */
    if (RecoveryInProgress()) {
        t_thrd.xlog_cxt.ThisTimeLineID = GetRecoveryTargetTLI();
    }

    pgstat_report_appname("InvalidBufferBgWriter");
    pgstat_report_activity(STATE_IDLE, NULL);
    g_instance.bgwriter_cxt.invalid_buf_proc_latch = &t_thrd.proc->procLatch;
	/* Loop forever */
    for (;;) {
        int rc;

        if (t_thrd.bgwriter_cxt.got_SIGHUP) {
            t_thrd.bgwriter_cxt.got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);
        }

        if (t_thrd.bgwriter_cxt.shutdown_requested) {
            ereport(LOG, (errmsg("invalidate buffer bgwriter thread shut down")));
            u_sess->attr.attr_common.ExitOnAnyError = true;
            proc_exit(0);
        }

        rc = WaitLatch(&t_thrd.proc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, THREAD_SLEEP_TIME);
        if (rc & WL_POSTMASTER_DEATH) {
            gs_thread_exit(1);
        }

        /* Clear any already-pending wakeups */
        ResetLatch(&t_thrd.proc->procLatch);
        drop_rel_all_forks_buffers();
        drop_rel_one_fork_buffers();
    }
}

const int HASH_TABLE_ELEMENT_MIN_NUM = 512;
HTAB *relfilenode_hashtbl_create(const char *name, bool use_heap_mem)
{
    HASHCTL hashCtrl;
    HTAB *hashtbl = NULL;
    errno_t rc;

    rc = memset_s(&hashCtrl, sizeof(hashCtrl), 0, sizeof(hashCtrl));
    securec_check(rc, "", "");
    hashCtrl.hcxt = (MemoryContext)CurrentMemoryContext;
    hashCtrl.hash = tag_hash;
    hashCtrl.keysize = sizeof(RelFileNode);
    /* keep entrysize >= keysize, stupid limits */
    hashCtrl.entrysize = sizeof(DelFileTag);

    if (use_heap_mem) {
        hashtbl = HeapMemInitHash(name, HASH_TABLE_ELEMENT_MIN_NUM,
            Max(g_instance.attr.attr_common.max_files_per_process, t_thrd.storage_cxt.max_userdatafiles),  &hashCtrl,
            (HASH_FUNCTION | HASH_ELEM));
        if (hashtbl == NULL) {
            ereport(FATAL, (errmsg("could not initialize unlinik relation hash table")));
        }
    } else {
        hashtbl = hash_create(name, HASH_TABLE_ELEMENT_MIN_NUM, &hashCtrl, (HASH_CONTEXT | HASH_FUNCTION | HASH_ELEM));
    }
    return hashtbl;
}

HTAB *relfilenode_fork_hashtbl_create(const char* name, bool use_heap_mem)
{
    HASHCTL hashCtrl;
    HTAB *hashtbl = NULL;
    errno_t rc;

    rc = memset_s(&hashCtrl, sizeof(hashCtrl), 0, sizeof(hashCtrl));
    securec_check(rc, "", "");
    hashCtrl.hcxt = (MemoryContext)CurrentMemoryContext;
    hashCtrl.hash = tag_hash;
    hashCtrl.keysize = sizeof(ForkRelFileNode);
    /* keep  entrysize >= keysize, stupid limits */
    hashCtrl.entrysize = sizeof(DelForkFileTag);

    if (use_heap_mem) {
        hashtbl = HeapMemInitHash(name, HASH_TABLE_ELEMENT_MIN_NUM,
            Max(g_instance.attr.attr_common.max_files_per_process, t_thrd.storage_cxt.max_userdatafiles),
            &hashCtrl, (HASH_FUNCTION | HASH_ELEM));
        if (hashtbl == NULL) {
            ereport(FATAL, (errmsg("could not initialize unlinik relation hash table")));
        }
    } else {
        hashtbl = hash_create(name, HASH_TABLE_ELEMENT_MIN_NUM, &hashCtrl, (HASH_CONTEXT | HASH_FUNCTION | HASH_ELEM));
    }
    return hashtbl;
}

void drop_rel_all_forks_buffers()
{
    HASH_SEQ_STATUS status;
    DelFileTag *entry = NULL;
    DelFileTag *temp_entry = NULL;
    bool found = false;
    uint rel_num = 0;
    HTAB *unlink_rel_hashtbl = g_instance.bgwriter_cxt.unlink_rel_hashtbl;
    HTAB *rel_bak = relfilenode_hashtbl_create("unlink_rel_bak", false);

    /* Obtains the entry in hashtable. */
    LWLockAcquire(g_instance.bgwriter_cxt.rel_hashtbl_lock, LW_SHARED);
    hash_seq_init(&status, unlink_rel_hashtbl);
    while ((temp_entry = (DelFileTag *)hash_seq_search(&status)) != NULL) {
        entry = (DelFileTag*)hash_search(rel_bak, (void *)&temp_entry->rnode, HASH_ENTER, &found);
        if (!found) {
            entry->rnode = temp_entry->rnode;
            entry->maxSegNo = temp_entry->maxSegNo;
            rel_num++;
        }
    }
    LWLockRelease(g_instance.bgwriter_cxt.rel_hashtbl_lock);

    if (rel_num > 0) {
        DropRelFileNodeAllBuffersUsingHash(rel_bak);

        hash_seq_init(&status, rel_bak);
        while ((temp_entry = (DelFileTag *)hash_seq_search(&status)) != NULL) {
            if (temp_entry->maxSegNo == -1) {
                ereport(DEBUG1, (errmodule(MOD_INCRE_BG),
                    errmsg("the max segno is -1, skip forget this rel %u/%u/%u, bucketNode is %d",
                        temp_entry->rnode.spcNode, temp_entry->rnode.dbNode, temp_entry->rnode.relNode,
                        temp_entry->rnode.bucketNode)));
                continue;
            }

            for (int32 i = 0; i < temp_entry->maxSegNo; i++) {
                for (int fork_num = 0; fork_num <= (int)MAX_FORKNUM; fork_num++) {
                    md_register_forget_request(temp_entry->rnode, fork_num, i);
                }
            }

            LWLockAcquire(g_instance.bgwriter_cxt.rel_hashtbl_lock, LW_EXCLUSIVE);
            if (hash_search(unlink_rel_hashtbl, (void *)&temp_entry->rnode, HASH_REMOVE, NULL) == NULL) {
                ereport(DEBUG1, (errmodule(MOD_INCRE_BG),
                    errmsg("rel %u/%u/%u, bucketNode is %d has already been invalidated",
                        temp_entry->rnode.spcNode, temp_entry->rnode.dbNode, temp_entry->rnode.relNode,
                        temp_entry->rnode.bucketNode)));
            } else {
                ereport(DEBUG1, (errmodule(MOD_INCRE_BG),
                    errmsg("invalidate buffer has been finished for rel %u/%u/%u, bucketNode is %d",
                        temp_entry->rnode.spcNode, temp_entry->rnode.dbNode, temp_entry->rnode.relNode,
                        temp_entry->rnode.bucketNode)));
            }
            LWLockRelease(g_instance.bgwriter_cxt.rel_hashtbl_lock);
        }
    }

    hash_destroy(rel_bak);
}

void drop_rel_one_fork_buffers()
{
    HASH_SEQ_STATUS status;
    DelForkFileTag *entry = NULL;
    DelForkFileTag *temp_entry = NULL;
    bool found = false;
    uint rel_num = 0;
    HTAB *unlink_rel_fork_hashtbl = g_instance.bgwriter_cxt.unlink_rel_fork_hashtbl;
    HTAB *rel_bak = relfilenode_fork_hashtbl_create("unlink_rel_one_fork_bak", false);
    /* Obtains the entry in hashtable. */
    LWLockAcquire(g_instance.bgwriter_cxt.rel_one_fork_hashtbl_lock, LW_SHARED);
    hash_seq_init(&status, unlink_rel_fork_hashtbl);
    while ((temp_entry = (DelForkFileTag *)hash_seq_search(&status)) != NULL) {
        entry = (DelForkFileTag*)hash_search(rel_bak, temp_entry, HASH_ENTER, &found);
        if (!found) {
            entry->forkrnode.rnode.spcNode = temp_entry->forkrnode.rnode.spcNode;
            entry->forkrnode.rnode.dbNode = temp_entry->forkrnode.rnode.dbNode;
            entry->forkrnode.rnode.relNode = temp_entry->forkrnode.rnode.relNode;
            entry->forkrnode.rnode.bucketNode = temp_entry->forkrnode.rnode.bucketNode;
            entry->forkrnode.forkNum = temp_entry->forkrnode.forkNum;
            entry->maxSegNo = temp_entry->maxSegNo;
            rel_num++;
        }
    }
    LWLockRelease(g_instance.bgwriter_cxt.rel_one_fork_hashtbl_lock);

    if (rel_num > 0) {
        DropRelFileNodeOneForkAllBuffersUsingHash(rel_bak);
        hash_seq_init(&status, rel_bak);
        while ((temp_entry = (DelForkFileTag *)hash_seq_search(&status)) != NULL) {
            if (temp_entry->maxSegNo == -1) {
                ereport(DEBUG1, (errmodule(MOD_INCRE_BG),
                errmsg("the max segno is -1, skip forget this rel %u/%u/%u, bucketNode is %d",
                temp_entry->forkrnode.rnode.spcNode, temp_entry->forkrnode.rnode.dbNode,
                temp_entry->forkrnode.rnode.relNode, temp_entry->forkrnode.rnode.bucketNode)));
                continue;
            }
            for (int32 i = 0; i < temp_entry->maxSegNo; i++) {
                md_register_forget_request(temp_entry->forkrnode.rnode, temp_entry->forkrnode.forkNum, i);
            }
            LWLockAcquire(g_instance.bgwriter_cxt.rel_one_fork_hashtbl_lock, LW_EXCLUSIVE);
            if (hash_search(unlink_rel_fork_hashtbl, (void *)temp_entry, HASH_REMOVE, NULL) == NULL) {
                ereport(LOG, (errcode(ERRCODE_DATA_CORRUPTED),
                    errmsg("%u/%u/%u, bucketNode is %d, forkNum is %d has been invalidated",
                    temp_entry->forkrnode.rnode.spcNode, temp_entry->forkrnode.rnode.dbNode,
                    temp_entry->forkrnode.rnode.relNode, temp_entry->forkrnode.rnode.bucketNode,
                    temp_entry->forkrnode.forkNum)));
            } else {
                ereport(LOG, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invaslidate buffer has been finished for rel "
                    "%u/%u/%u, bucketNode is %d, forkNum is %d",
                    temp_entry->forkrnode.rnode.spcNode, temp_entry->forkrnode.rnode.dbNode,
                    temp_entry->forkrnode.rnode.relNode, temp_entry->forkrnode.rnode.bucketNode,
                    temp_entry->forkrnode.forkNum)));
            }
            LWLockRelease(g_instance.bgwriter_cxt.rel_one_fork_hashtbl_lock);
        }
    }
    hash_destroy(rel_bak);
}