From aeab09c54ce5c837c4ab211ba0561e85d1b2c71a Mon Sep 17 00:00:00 2001 From: "Racardo.Cui" Date: Thu, 17 Jun 2021 18:15:44 +0800 Subject: [PATCH 1/3] change address type in uce log --- src/gausskernel/cbb/bbox/gs_bbox.cpp | 91 ++++++++++++++++--- .../process/postmaster/postmaster.cpp | 64 +------------ src/include/knl/knl_instance.h | 6 -- src/include/postmaster/postmaster.h | 1 - 4 files changed, 80 insertions(+), 82 deletions(-) diff --git a/src/gausskernel/cbb/bbox/gs_bbox.cpp b/src/gausskernel/cbb/bbox/gs_bbox.cpp index e3e94074c..ab6167026 100644 --- a/src/gausskernel/cbb/bbox/gs_bbox.cpp +++ b/src/gausskernel/cbb/bbox/gs_bbox.cpp @@ -33,12 +33,16 @@ #include "utils/elog.h" #include "utils/guc.h" #include "utils/fatal_err.h" +#include "storage/buf/buf_internals.h" #define BBOX_PATH_SIZE 512 #define DEFAULT_BLACKLIST_MASK (0xFFFFFFFFFFFFFFFF) #define INVALID_TID (-1) +#define SIGBUS_MCEERR_AR 4 +#define SIGBUS_MCEERR_AO 5 + static char g_bbox_dump_path[BBOX_PATH_SIZE] = {0}; #ifdef ENABLE_UT @@ -69,11 +73,6 @@ static void coredump_handler(int sig, siginfo_t *si, void *uc) if (g_instance.attr.attr_common.enable_ffic_log) { (void)gen_err_msg(sig, si, (ucontext_t *)uc); } - if (sig == SIGBUS) { - g_instance.sigbus_cxt.sigbus_addr = si->si_addr; - g_instance.sigbus_cxt.sigbus_code = si->si_code; - SIGBUS_handler(sig); - } } else { /* * Subsequent fatal error will go to here. If it comes from different thread, @@ -102,11 +101,6 @@ static void bbox_handler(int sig, siginfo_t *si, void *uc) if (g_instance.attr.attr_common.enable_ffic_log) { (void)gen_err_msg(sig, si, (ucontext_t *)uc); } - if (sig == SIGBUS) { - g_instance.sigbus_cxt.sigbus_addr = si->si_addr; - g_instance.sigbus_cxt.sigbus_code = si->si_code; - SIGBUS_handler(sig); - } #ifndef ENABLE_MEMORY_CHECK sigset_t intMask; sigset_t oldMask; @@ -133,6 +127,79 @@ static void bbox_handler(int sig, siginfo_t *si, void *uc) } } +/* + * SIGBUS -- When uce failure occurs in system memory, sigbus_handler will exit according to the region + * of its logical address. + * 1. Calculate the buffer pool address range to determine whether the error address is in the buffer pool. + * 2. For addresses outside the buffer pool range, print the NIC log and exit + * 3. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty + * 4. If the page is not dirty, the thread will send SIGINT to poma, then the thread that triggers the SIGBUS + * exit first and print warning message. If the page is dirty, print the PANIC log and coredump. + */ +void sigbus_handler(int sig, siginfo_t *si, void *uc) +{ + static volatile int64 first_tid = INVALID_TID; + int64 cur_tid = (int64)pthread_self(); + if (first_tid == INVALID_TID && + __sync_bool_compare_and_swap(&first_tid, INVALID_TID, cur_tid)) { + /* Only first fatal error will set db state and generate fatal error log */ + (void)SetDBStateFileState(COREDUMP_STATE, false); + if (g_instance.attr.attr_common.enable_ffic_log) { + (void)gen_err_msg(sig, si, (ucontext_t *)uc); + } + } else { + (void)pause(); + } + uint64 buffer_size; + int buf_id; + int si_code = si->si_code; + unsigned long long sigbus_addr = (unsigned long long)si->si_addr; + if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) { + ereport(PANIC, + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR))); + } +#ifdef __aarch64__ + buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE; +#else + buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ; +#endif + unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks; + unsigned long long endaddr = startaddr + buffer_size; + /* Determine the range of address carried by sigbus, And print the log according to the page state. */ + if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) { + buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ); + BufferDesc* buf_desc = GetBufferDescriptor(buf_id); + if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED || + buf_desc->state & BM_IO_IN_PROGRESS) { + ereport(PANIC, + (errcode(ERRCODE_UE_DIRTY_PAGE), + errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut " + "down immediately.", + ERRCODE_UE_DIRTY_PAGE, sigbus_addr))); + } else { + ereport(WARNING, + (errcode(ERRCODE_UE_CLEAN_PAGE), + errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will " + "shutdown.", + ERRCODE_UE_CLEAN_PAGE, sigbus_addr))); + (void)gs_signal_send(PostmasterPid, SIGINT); + gs_thread_exit(1); + } + } else if (sigbus_addr == 0) { + ereport(PANIC, + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR))); + } else { + ereport(PANIC, + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR, sigbus_addr))); + } +} + /* * get_bbox_coredump_pattern_path - get the core dump path from the file "/proc/sys/kernel/core_pattern" */ @@ -345,12 +412,12 @@ void assign_bbox_coredump(const bool newval, void* extra) if (newval && !FencedUDFMasterMode) { (void)install_signal(SIGABRT, bbox_handler); - (void)install_signal(SIGBUS, bbox_handler); + (void)install_signal(SIGBUS, sigbus_handler); (void)install_signal(SIGILL, bbox_handler); (void)install_signal(SIGSEGV, bbox_handler); } else { (void)install_signal(SIGABRT, coredump_handler); - (void)install_signal(SIGBUS, coredump_handler); + (void)install_signal(SIGBUS, sigbus_handler); (void)install_signal(SIGILL, coredump_handler); (void)install_signal(SIGSEGV, coredump_handler); } diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index 67a06ae32..f23d36571 100755 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -263,8 +263,7 @@ static bool isNeedGetLCName = true; #define PM_POLL_TIMEOUT_SECOND 20 #define PM_POLL_TIMEOUT_MINUTE 58*SECS_PER_MINUTE*60*1000000L #define CHECK_TIMES 10 -#define SIGBUS_MCEERR_AR 4 -#define SIGBUS_MCEERR_AO 5 + static char gaussdb_state_file[MAXPGPATH] = {0}; uint32 noProcLogicTid = 0; @@ -334,7 +333,6 @@ static Port* ConnCreateToRecvGssock(pollfd* ufds, int idx, int* nSockets); static Port* ConnCreate(int serverFd); static void reset_shared(int port); static void SIGHUP_handler(SIGNAL_ARGS); -void SIGBUS_handler(SIGNAL_ARGS); static void pmdie(SIGNAL_ARGS); static void startup_alarm(SIGNAL_ARGS); static void SetWalsndsNodeState(ClusterNodeState requester, ClusterNodeState others); @@ -4248,65 +4246,6 @@ static void SIGHUP_handler(SIGNAL_ARGS) errno = save_errno; } -/* - * SIGBUS -- When uce failure occurs in system memory, sigbus_handler will exit according to the region - of its logical address. - 1. Calculate the buffer pool address range to determine whether the error address is in the buffer pool. - 2. For addresses outside the buffer pool range, print the NIC log and exit - 3. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty - 4. If the page is not dirty, execute pmdie to exit normally and print warning message. If the page is dirty, - print the PANIC log and exit - */ -void SIGBUS_handler(SIGNAL_ARGS) -{ - uint64 buffer_size; - int buf_id; - int si_code = g_instance.sigbus_cxt.sigbus_code; - unsigned long long sigbus_addr = (unsigned long long)g_instance.sigbus_cxt.sigbus_addr; - if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); - } -#ifdef __aarch64__ - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE; -#else - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ; -#endif - unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks; - unsigned long long endaddr = startaddr + buffer_size; - /* Determine the range of address carried by sigbus, And print the log according to the page state. */ - if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) { - buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ); - BufferDesc* buf_desc = GetBufferDescriptor(buf_id); - if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED || - buf_desc->state & BM_IO_IN_PROGRESS) { - ereport(PANIC, - (errcode(ERRCODE_UE_DIRTY_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut " - "down immediately.", - ERRCODE_UE_DIRTY_PAGE, sigbus_addr))); - } else { - ereport(WARNING, - (errcode(ERRCODE_UE_CLEAN_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will " - "shutdown.", - ERRCODE_UE_CLEAN_PAGE, sigbus_addr))); - pmdie(SIGBUS); - } - } else if (sigbus_addr == 0) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); - } else { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR, sigbus_addr))); - } -} void KillGraceThreads(void) { @@ -4347,7 +4286,6 @@ static void pmdie(SIGNAL_ARGS) switch (postgres_signal_arg) { case SIGTERM: case SIGINT: - case SIGBUS: if (STANDBY_MODE == t_thrd.postmaster_cxt.HaShmData->current_mode && !dummyStandbyMode && SIGTERM == postgres_signal_arg) { diff --git a/src/include/knl/knl_instance.h b/src/include/knl/knl_instance.h index 8d8a32ac2..371c647d6 100644 --- a/src/include/knl/knl_instance.h +++ b/src/include/knl/knl_instance.h @@ -765,11 +765,6 @@ typedef struct knl_g_hypo_context { List* hypo_index_list; } knl_g_hypo_context; -typedef struct knl_sigbus_context { - void* sigbus_addr; - int sigbus_code; -} knl_sigbus_context; - typedef struct knl_instance_context { knl_virtual_role role; volatile int status; @@ -867,7 +862,6 @@ typedef struct knl_instance_context { knl_g_archive_standby_context archive_standby_cxt; struct HTAB* ngroup_hash_table; knl_g_hypo_context hypo_cxt; - knl_sigbus_context sigbus_cxt; } knl_instance_context; extern long random(); diff --git a/src/include/postmaster/postmaster.h b/src/include/postmaster/postmaster.h index 39977f02e..8170931b2 100644 --- a/src/include/postmaster/postmaster.h +++ b/src/include/postmaster/postmaster.h @@ -244,5 +244,4 @@ extern uint64_t mc_timers_us(void); extern bool SetDBStateFileState(DbState state, bool optional); extern void GPCResetAll(); extern void initRandomState(TimestampTz start_time, TimestampTz stop_time); -extern void SIGBUS_handler(SIGNAL_ARGS); #endif /* _POSTMASTER_H */ From 409d2a0cd871ef1f15d4d9972952a13c80c2b4bc Mon Sep 17 00:00:00 2001 From: "Ricaro.Cui" Date: Tue, 29 Jun 2021 16:45:12 +0800 Subject: [PATCH 2/3] avc --- src/gausskernel/cbb/bbox/gs_bbox.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gausskernel/cbb/bbox/gs_bbox.cpp b/src/gausskernel/cbb/bbox/gs_bbox.cpp index ab6167026..ab6d5a336 100644 --- a/src/gausskernel/cbb/bbox/gs_bbox.cpp +++ b/src/gausskernel/cbb/bbox/gs_bbox.cpp @@ -148,7 +148,7 @@ void sigbus_handler(int sig, siginfo_t *si, void *uc) (void)gen_err_msg(sig, si, (ucontext_t *)uc); } } else { - (void)pause(); + gs_thread_exit(2); } uint64 buffer_size; int buf_id; From 8b6c64304f6c75627feceeea764ba58b0e41c8a6 Mon Sep 17 00:00:00 2001 From: "Ricaro.Cui" Date: Thu, 1 Jul 2021 16:45:01 +0800 Subject: [PATCH 3/3] RAS:UCE --- src/gausskernel/cbb/bbox/gs_bbox.cpp | 117 +++++++++++++++------------ 1 file changed, 67 insertions(+), 50 deletions(-) diff --git a/src/gausskernel/cbb/bbox/gs_bbox.cpp b/src/gausskernel/cbb/bbox/gs_bbox.cpp index ab6d5a336..148027ce4 100644 --- a/src/gausskernel/cbb/bbox/gs_bbox.cpp +++ b/src/gausskernel/cbb/bbox/gs_bbox.cpp @@ -130,16 +130,30 @@ static void bbox_handler(int sig, siginfo_t *si, void *uc) /* * SIGBUS -- When uce failure occurs in system memory, sigbus_handler will exit according to the region * of its logical address. - * 1. Calculate the buffer pool address range to determine whether the error address is in the buffer pool. - * 2. For addresses outside the buffer pool range, print the NIC log and exit - * 3. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty - * 4. If the page is not dirty, the thread will send SIGINT to poma, then the thread that triggers the SIGBUS + * 1. If the enableIncrementalCheckpoint is turned off, the uce feature no longer takes effect + * 2. Calculate the buffer pool address range to determine whether the error address is in the buffer pool. + * 3. For addresses outside the buffer pool range, print the PANIC log and exit + * 4. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty + * 5. If the page is not dirty, the thread will send SIGINT to poma, then the thread that triggers the SIGBUS * exit first and print warning message. If the page is dirty, print the PANIC log and coredump. */ void sigbus_handler(int sig, siginfo_t *si, void *uc) { + if (!g_instance.attr.attr_storage.enableIncrementalCheckpoint) { + if (u_sess->attr.attr_common.enable_bbox_dump) + bbox_handler(sig, si, uc); + else { + coredump_handler(sig, si, uc); + } + } + static volatile int64 first_tid = INVALID_TID; int64 cur_tid = (int64)pthread_self(); + uint64 buffer_size; + int buf_id; + int si_code = si->si_code; + unsigned long long sigbus_addr = (unsigned long long)si->si_addr; + if (first_tid == INVALID_TID && __sync_bool_compare_and_swap(&first_tid, INVALID_TID, cur_tid)) { /* Only first fatal error will set db state and generate fatal error log */ @@ -147,56 +161,59 @@ void sigbus_handler(int sig, siginfo_t *si, void *uc) if (g_instance.attr.attr_common.enable_ffic_log) { (void)gen_err_msg(sig, si, (ucontext_t *)uc); } - } else { - gs_thread_exit(2); - } - uint64 buffer_size; - int buf_id; - int si_code = si->si_code; - unsigned long long sigbus_addr = (unsigned long long)si->si_addr; - if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); - } -#ifdef __aarch64__ - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE; -#else - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ; +#ifndef ENABLE_MEMORY_CHECK + sigset_t intMask; + sigset_t oldMask; + + sigfillset(&intMask); + pthread_sigmask(SIG_SETMASK, &intMask, &oldMask); #endif - unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks; - unsigned long long endaddr = startaddr + buffer_size; - /* Determine the range of address carried by sigbus, And print the log according to the page state. */ - if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) { - buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ); - BufferDesc* buf_desc = GetBufferDescriptor(buf_id); - if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED || - buf_desc->state & BM_IO_IN_PROGRESS) { + /* If si_code is not 4 or 5, it is not Uncorrected Error. then gaussdb will PANIC*/ + if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) { ereport(PANIC, - (errcode(ERRCODE_UE_DIRTY_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut " - "down immediately.", - ERRCODE_UE_DIRTY_PAGE, sigbus_addr))); - } else { - ereport(WARNING, - (errcode(ERRCODE_UE_CLEAN_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will " - "shutdown.", - ERRCODE_UE_CLEAN_PAGE, sigbus_addr))); - (void)gs_signal_send(PostmasterPid, SIGINT); - gs_thread_exit(1); + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR))); + } +#ifdef __aarch64__ + buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE; +#else + buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ; +#endif + unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks; + unsigned long long endaddr = startaddr + buffer_size; + /* Determine the range of address carried by sigbus, And print the log according to the page state. */ + if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) { + buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ); + BufferDesc* buf_desc = GetBufferDescriptor(buf_id); + if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED) { + ereport(PANIC, + (errcode(ERRCODE_UE_DIRTY_PAGE), + errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut " + "down immediately.", + ERRCODE_UE_DIRTY_PAGE, sigbus_addr))); + } else { + ereport(WARNING, + (errcode(ERRCODE_UE_CLEAN_PAGE), + errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will " + "shutdown.", + ERRCODE_UE_CLEAN_PAGE, sigbus_addr))); + gs_signal_send(PostmasterPid, SIGINT); + gs_thread_exit(1); // Prevent the same thread from being paused after entering the handler again, and cannot be correctly exited by POMA + } + } else if (sigbus_addr == 0) { + ereport(PANIC, + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR))); + } else { + ereport(PANIC, + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR, sigbus_addr))); } - } else if (sigbus_addr == 0) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); } else { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR, sigbus_addr))); + (void)pause(); } }