diff --git a/src/gausskernel/cbb/bbox/gs_bbox.cpp b/src/gausskernel/cbb/bbox/gs_bbox.cpp index ab6d5a336..148027ce4 100644 --- a/src/gausskernel/cbb/bbox/gs_bbox.cpp +++ b/src/gausskernel/cbb/bbox/gs_bbox.cpp @@ -130,16 +130,30 @@ static void bbox_handler(int sig, siginfo_t *si, void *uc) /* * SIGBUS -- When uce failure occurs in system memory, sigbus_handler will exit according to the region * of its logical address. - * 1. Calculate the buffer pool address range to determine whether the error address is in the buffer pool. - * 2. For addresses outside the buffer pool range, print the NIC log and exit - * 3. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty - * 4. If the page is not dirty, the thread will send SIGINT to poma, then the thread that triggers the SIGBUS + * 1. If the enableIncrementalCheckpoint is turned off, the uce feature no longer takes effect + * 2. Calculate the buffer pool address range to determine whether the error address is in the buffer pool. + * 3. For addresses outside the buffer pool range, print the PANIC log and exit + * 4. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty + * 5. If the page is not dirty, the thread will send SIGINT to poma, then the thread that triggers the SIGBUS * exit first and print warning message. If the page is dirty, print the PANIC log and coredump. */ void sigbus_handler(int sig, siginfo_t *si, void *uc) { + if (!g_instance.attr.attr_storage.enableIncrementalCheckpoint) { + if (u_sess->attr.attr_common.enable_bbox_dump) + bbox_handler(sig, si, uc); + else { + coredump_handler(sig, si, uc); + } + } + static volatile int64 first_tid = INVALID_TID; int64 cur_tid = (int64)pthread_self(); + uint64 buffer_size; + int buf_id; + int si_code = si->si_code; + unsigned long long sigbus_addr = (unsigned long long)si->si_addr; + if (first_tid == INVALID_TID && __sync_bool_compare_and_swap(&first_tid, INVALID_TID, cur_tid)) { /* Only first fatal error will set db state and generate fatal error log */ @@ -147,56 +161,59 @@ void sigbus_handler(int sig, siginfo_t *si, void *uc) if (g_instance.attr.attr_common.enable_ffic_log) { (void)gen_err_msg(sig, si, (ucontext_t *)uc); } - } else { - gs_thread_exit(2); - } - uint64 buffer_size; - int buf_id; - int si_code = si->si_code; - unsigned long long sigbus_addr = (unsigned long long)si->si_addr; - if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); - } -#ifdef __aarch64__ - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE; -#else - buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ; +#ifndef ENABLE_MEMORY_CHECK + sigset_t intMask; + sigset_t oldMask; + + sigfillset(&intMask); + pthread_sigmask(SIG_SETMASK, &intMask, &oldMask); #endif - unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks; - unsigned long long endaddr = startaddr + buffer_size; - /* Determine the range of address carried by sigbus, And print the log according to the page state. */ - if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) { - buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ); - BufferDesc* buf_desc = GetBufferDescriptor(buf_id); - if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED || - buf_desc->state & BM_IO_IN_PROGRESS) { + /* If si_code is not 4 or 5, it is not Uncorrected Error. then gaussdb will PANIC*/ + if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) { ereport(PANIC, - (errcode(ERRCODE_UE_DIRTY_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut " - "down immediately.", - ERRCODE_UE_DIRTY_PAGE, sigbus_addr))); - } else { - ereport(WARNING, - (errcode(ERRCODE_UE_CLEAN_PAGE), - errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will " - "shutdown.", - ERRCODE_UE_CLEAN_PAGE, sigbus_addr))); - (void)gs_signal_send(PostmasterPid, SIGINT); - gs_thread_exit(1); + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR))); + } +#ifdef __aarch64__ + buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE; +#else + buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ; +#endif + unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks; + unsigned long long endaddr = startaddr + buffer_size; + /* Determine the range of address carried by sigbus, And print the log according to the page state. */ + if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) { + buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ); + BufferDesc* buf_desc = GetBufferDescriptor(buf_id); + if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED) { + ereport(PANIC, + (errcode(ERRCODE_UE_DIRTY_PAGE), + errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut " + "down immediately.", + ERRCODE_UE_DIRTY_PAGE, sigbus_addr))); + } else { + ereport(WARNING, + (errcode(ERRCODE_UE_CLEAN_PAGE), + errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will " + "shutdown.", + ERRCODE_UE_CLEAN_PAGE, sigbus_addr))); + gs_signal_send(PostmasterPid, SIGINT); + gs_thread_exit(1); // Prevent the same thread from being paused after entering the handler again, and cannot be correctly exited by POMA + } + } else if (sigbus_addr == 0) { + ereport(PANIC, + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR))); + } else { + ereport(PANIC, + (errcode(ERRCODE_UE_COMMON_ERROR), + errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately", + ERRCODE_UE_COMMON_ERROR, sigbus_addr))); } - } else if (sigbus_addr == 0) { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR))); } else { - ereport(PANIC, - (errcode(ERRCODE_UE_COMMON_ERROR), - errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately", - ERRCODE_UE_COMMON_ERROR, sigbus_addr))); + (void)pause(); } }