diff --git a/src/gausskernel/cbb/bbox/gs_bbox.cpp b/src/gausskernel/cbb/bbox/gs_bbox.cpp
index e3e94074c..148027ce4 100644
--- a/src/gausskernel/cbb/bbox/gs_bbox.cpp
+++ b/src/gausskernel/cbb/bbox/gs_bbox.cpp
@@ -33,12 +33,16 @@
 #include "utils/elog.h"
 #include "utils/guc.h"
 #include "utils/fatal_err.h"
+#include "storage/buf/buf_internals.h"
 
 #define BBOX_PATH_SIZE 512
 #define DEFAULT_BLACKLIST_MASK (0xFFFFFFFFFFFFFFFF)
 
 #define INVALID_TID (-1)
 
+#define SIGBUS_MCEERR_AR 4
+#define SIGBUS_MCEERR_AO 5
+
 static char g_bbox_dump_path[BBOX_PATH_SIZE] = {0};
 
 #ifdef ENABLE_UT
@@ -69,11 +73,6 @@ static void coredump_handler(int sig, siginfo_t *si, void *uc)
         if (g_instance.attr.attr_common.enable_ffic_log) {
             (void)gen_err_msg(sig, si, (ucontext_t *)uc);
         }
-        if (sig == SIGBUS) {
-            g_instance.sigbus_cxt.sigbus_addr = si->si_addr;
-            g_instance.sigbus_cxt.sigbus_code = si->si_code;
-            SIGBUS_handler(sig);
-        }
     } else {
         /*
          * Subsequent fatal error will go to here. If it comes from different thread,
@@ -102,11 +101,6 @@ static void bbox_handler(int sig, siginfo_t *si, void *uc)
         if (g_instance.attr.attr_common.enable_ffic_log) {
             (void)gen_err_msg(sig, si, (ucontext_t *)uc);
         }
-        if (sig == SIGBUS) {
-            g_instance.sigbus_cxt.sigbus_addr = si->si_addr;
-            g_instance.sigbus_cxt.sigbus_code = si->si_code;
-            SIGBUS_handler(sig);
-        }
 #ifndef ENABLE_MEMORY_CHECK
         sigset_t intMask;
         sigset_t oldMask;
@@ -133,6 +127,113 @@ static void bbox_handler(int sig, siginfo_t *si, void *uc)
     }
 }
 
+/*
+ * SIGBUS -- When an uncorrected memory error (uce) occurs in system memory, sigbus_handler will exit
+ * according to the region of its logical address.
+ * 1. If the enableIncrementalCheckpoint is turned off, the uce feature no longer takes effect: the signal is
+ *    handed to bbox_handler/coredump_handler and nothing else is done here.
+ * 2. Calculate the buffer pool address range to determine whether the error address is in the buffer pool.
+ * 3. For addresses outside the buffer pool range, print the PANIC log and exit
+ * 4. For addresses within the buffer pool range, calculate buf_id and judge whether the page is dirty
+ * 5. If the page is not dirty, the thread prints a warning message, sends SIGINT to the postmaster and
+ *    exits itself. If the page is dirty, print the PANIC log and coredump.
+ * Only the first thread entering the handler does steps 2-5; any later thread just calls pause().
+ */
+void sigbus_handler(int sig, siginfo_t *si, void *uc)
+{
+    if (!g_instance.attr.attr_storage.enableIncrementalCheckpoint) {
+        if (u_sess->attr.attr_common.enable_bbox_dump) {
+            bbox_handler(sig, si, uc);
+        } else {
+            coredump_handler(sig, si, uc);
+        }
+        /* The uce handling below is disabled in this configuration, so never fall through into it. */
+        return;
+    }
+
+    static volatile int64 first_tid = INVALID_TID;
+    int64 cur_tid = (int64)pthread_self();
+    uint64 buffer_size;
+    int buf_id = -1; /* stays negative unless the address maps to a buffer page */
+    int si_code = si->si_code;
+    unsigned long long sigbus_addr = (unsigned long long)si->si_addr;
+
+    if (first_tid == INVALID_TID &&
+        __sync_bool_compare_and_swap(&first_tid, INVALID_TID, cur_tid)) {
+        /* Only first fatal error will set db state and generate fatal error log */
+        (void)SetDBStateFileState(COREDUMP_STATE, false);
+        if (g_instance.attr.attr_common.enable_ffic_log) {
+            (void)gen_err_msg(sig, si, (ucontext_t *)uc);
+        }
+#ifndef ENABLE_MEMORY_CHECK
+        sigset_t intMask;
+        sigset_t oldMask;
+
+        sigfillset(&intMask);
+        pthread_sigmask(SIG_SETMASK, &intMask, &oldMask);
+#endif
+        /* Any si_code other than SIGBUS_MCEERR_AR/AO is not an Uncorrected Error, then gaussdb will PANIC */
+        if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) {
+            ereport(PANIC,
+                (errcode(ERRCODE_UE_COMMON_ERROR),
+                    errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately",
+                        ERRCODE_UE_COMMON_ERROR)));
+        }
+#ifdef __aarch64__
+        buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE;
+#else
+        buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ;
+#endif
+        unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks;
+        unsigned long long endaddr = startaddr + buffer_size;
+        /*
+         * Map the address to a buffer. endaddr is one past the range and is excluded, and the block number is
+         * checked against NBuffers so that the extra aarch64 bytes can never yield a buf_id >= NBuffers.
+         * A zero startaddr means there is no pool to match against.
+         */
+        if (startaddr != 0 && sigbus_addr >= startaddr && sigbus_addr < endaddr) {
+            unsigned long long blk_no = (sigbus_addr - startaddr) / (Size)BLCKSZ;
+            if (blk_no < (unsigned long long)g_instance.attr.attr_storage.NBuffers) {
+                buf_id = (int)blk_no;
+            }
+        }
+        /* Determine the range of address carried by sigbus, and print the log according to the page state. */
+        if (buf_id >= 0) {
+            BufferDesc* buf_desc = GetBufferDescriptor(buf_id);
+            /* One read of state, so all three flags are tested against the same value. */
+            if ((buf_desc->state & (BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED)) != 0) {
+                ereport(PANIC,
+                    (errcode(ERRCODE_UE_DIRTY_PAGE),
+                        errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut "
+                               "down immediately.",
+                            ERRCODE_UE_DIRTY_PAGE, sigbus_addr)));
+            } else {
+                ereport(WARNING,
+                    (errcode(ERRCODE_UE_CLEAN_PAGE),
+                        errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will "
+                               "shutdown.",
+                            ERRCODE_UE_CLEAN_PAGE, sigbus_addr)));
+                gs_signal_send(PostmasterPid, SIGINT);
+                /*
+                 * Exit this thread right away, so that it is not paused after entering the handler again and
+                 * can be correctly exited by the postmaster (POMA).
+                 */
+                gs_thread_exit(1);
+            }
+        } else if (sigbus_addr == 0) {
+            ereport(PANIC,
+                (errcode(ERRCODE_UE_COMMON_ERROR),
+                    errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately",
+                        ERRCODE_UE_COMMON_ERROR)));
+        } else {
+            ereport(PANIC,
+                (errcode(ERRCODE_UE_COMMON_ERROR),
+                    errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately",
+                        ERRCODE_UE_COMMON_ERROR, sigbus_addr)));
+        }
+    } else {
+        (void)pause();
+    }
+}
+
 /*
  * get_bbox_coredump_pattern_path - get the core dump path from the file "/proc/sys/kernel/core_pattern"
  */
@@ -345,12 +446,12 @@ void assign_bbox_coredump(const bool newval, void* extra)
 
     if (newval && !FencedUDFMasterMode) {
         (void)install_signal(SIGABRT, bbox_handler);
-        (void)install_signal(SIGBUS, bbox_handler);
+        (void)install_signal(SIGBUS, sigbus_handler);
         (void)install_signal(SIGILL, bbox_handler);
         (void)install_signal(SIGSEGV, bbox_handler);
     } else {
         (void)install_signal(SIGABRT, coredump_handler);
-        (void)install_signal(SIGBUS, coredump_handler);
+        (void)install_signal(SIGBUS, sigbus_handler);
         (void)install_signal(SIGILL, coredump_handler);
         (void)install_signal(SIGSEGV, coredump_handler);
     }
diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp
index a59fb1885..0df6d519d 100755
--- a/src/gausskernel/process/postmaster/postmaster.cpp
+++ b/src/gausskernel/process/postmaster/postmaster.cpp
@@ -263,8 +263,7 @@ static bool isNeedGetLCName = true;
 #define PM_POLL_TIMEOUT_SECOND 20
 #define PM_POLL_TIMEOUT_MINUTE 58*SECS_PER_MINUTE*60*1000000L
 #define CHECK_TIMES 10
-#define SIGBUS_MCEERR_AR 4
-#define SIGBUS_MCEERR_AO 5
+
 static char gaussdb_state_file[MAXPGPATH] = {0};
 uint32 noProcLogicTid = 0;
 
@@ -334,7 +333,6 @@ static Port* ConnCreateToRecvGssock(pollfd* ufds, int idx, int* nSockets);
 static Port* ConnCreate(int serverFd);
 static void reset_shared(int port);
 static void SIGHUP_handler(SIGNAL_ARGS);
-void SIGBUS_handler(SIGNAL_ARGS);
 static void pmdie(SIGNAL_ARGS);
 static void startup_alarm(SIGNAL_ARGS);
 static void SetWalsndsNodeState(ClusterNodeState requester, ClusterNodeState others);
@@ -4258,65 +4256,6 @@ static void SIGHUP_handler(SIGNAL_ARGS)
     errno = save_errno;
 }
 
-/*
- * SIGBUS -- When uce failure occurs in system memory, sigbus_handler 
will exit according to the region - of its logical address. - 1. Calculate the buffer pool address range to determine whether the error address is in the buffer pool. - 2. For addresses outside the buffer pool range, print the NIC log and exit - 3. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty - 4. If the page is not dirty, execute pmdie to exit normally and print warning message. If the page is dirty, - print the PANIC log and exit - */ -void SIGBUS_handler(SIGNAL_ARGS) -{ - uint64 buffer_size; - int buf_id; - int si_code = g_instance.sigbus_cxt.sigbus_code; - unsigned long long sigbus_addr = (unsigned long long)g_instance.sigbus_cxt.sigbus_addr; - if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); - } -#ifdef __aarch64__ - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE; -#else - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ; -#endif - unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks; - unsigned long long endaddr = startaddr + buffer_size; - /* Determine the range of address carried by sigbus, And print the log according to the page state. */ - if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) { - buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ); - BufferDesc* buf_desc = GetBufferDescriptor(buf_id); - if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED || - buf_desc->state & BM_IO_IN_PROGRESS) { - ereport(PANIC, - (errcode(ERRCODE_UE_DIRTY_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. 
Gaussdb will shut " - "down immediately.", - ERRCODE_UE_DIRTY_PAGE, sigbus_addr))); - } else { - ereport(WARNING, - (errcode(ERRCODE_UE_CLEAN_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will " - "shutdown.", - ERRCODE_UE_CLEAN_PAGE, sigbus_addr))); - pmdie(SIGBUS); - } - } else if (sigbus_addr == 0) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); - } else { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR, sigbus_addr))); - } -} void KillGraceThreads(void) { @@ -4357,7 +4296,6 @@ static void pmdie(SIGNAL_ARGS) switch (postgres_signal_arg) { case SIGTERM: case SIGINT: - case SIGBUS: if (STANDBY_MODE == t_thrd.postmaster_cxt.HaShmData->current_mode && !dummyStandbyMode && SIGTERM == postgres_signal_arg) { diff --git a/src/include/knl/knl_instance.h b/src/include/knl/knl_instance.h index 67eacca24..2e11841ff 100644 --- a/src/include/knl/knl_instance.h +++ b/src/include/knl/knl_instance.h @@ -768,11 +768,6 @@ typedef struct knl_g_hypo_context { List* hypo_index_list; } knl_g_hypo_context; -typedef struct knl_sigbus_context { - void* sigbus_addr; - int sigbus_code; -} knl_sigbus_context; - typedef struct knl_instance_context { knl_virtual_role role; volatile int status; @@ -870,7 +865,6 @@ typedef struct knl_instance_context { knl_g_archive_standby_context archive_standby_cxt; struct HTAB* ngroup_hash_table; knl_g_hypo_context hypo_cxt; - knl_sigbus_context sigbus_cxt; #ifndef ENABLE_MULTIPLE_NODES void *raw_parser_hook[DB_CMPT_MAX]; diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 39977f02e..8170931b2 100644 --- a/src/include/postmaster/postmaster.h +++ 
b/src/include/postmaster/postmaster.h @@ -244,5 +244,4 @@ extern uint64_t mc_timers_us(void); extern bool SetDBStateFileState(DbState state, bool optional); extern void GPCResetAll(); extern void initRandomState(TimestampTz start_time, TimestampTz stop_time); -extern void SIGBUS_handler(SIGNAL_ARGS); #endif /* _POSTMASTER_H */