This commit is contained in:
Ricaro.Cui
2021-07-01 16:45:01 +08:00
parent 409d2a0cd8
commit 8b6c64304f

View File

@ -130,16 +130,30 @@ static void bbox_handler(int sig, siginfo_t *si, void *uc)
/*
* SIGBUS -- When an Uncorrected Error (UCE) occurs in system memory, sigbus_handler decides how to exit
* according to the region that the faulting logical address falls in.
* 1. Calculate the buffer pool address range to determine whether the error address is in the buffer pool.
* 2. For addresses outside the buffer pool range, print the PANIC log and exit
* 3. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty
* 4. If the page is not dirty, the thread will send SIGINT to poma, then the thread that triggers the SIGBUS
* 1. If enableIncrementalCheckpoint is turned off, the UCE feature no longer takes effect
* 2. Calculate the buffer pool address range to determine whether the error address is in the buffer pool.
* 3. For addresses outside the buffer pool range, print the PANIC log and exit
* 4. For addresses within the buffer pool range, calculate block_id and judge whether the page is dirty
* 5. If the page is not dirty, the thread will send SIGINT to poma, then the thread that triggers the SIGBUS
* exits first and prints a warning message. If the page is dirty, print the PANIC log and coredump.
*/
void sigbus_handler(int sig, siginfo_t *si, void *uc)
{
if (!g_instance.attr.attr_storage.enableIncrementalCheckpoint) {
if (u_sess->attr.attr_common.enable_bbox_dump)
bbox_handler(sig, si, uc);
else {
coredump_handler(sig, si, uc);
}
}
static volatile int64 first_tid = INVALID_TID;
int64 cur_tid = (int64)pthread_self();
uint64 buffer_size;
int buf_id;
int si_code = si->si_code;
unsigned long long sigbus_addr = (unsigned long long)si->si_addr;
if (first_tid == INVALID_TID &&
__sync_bool_compare_and_swap(&first_tid, INVALID_TID, cur_tid)) {
/* Only first fatal error will set db state and generate fatal error log */
@ -147,56 +161,59 @@ void sigbus_handler(int sig, siginfo_t *si, void *uc)
if (g_instance.attr.attr_common.enable_ffic_log) {
(void)gen_err_msg(sig, si, (ucontext_t *)uc);
}
} else {
gs_thread_exit(2);
}
uint64 buffer_size;
int buf_id;
int si_code = si->si_code;
unsigned long long sigbus_addr = (unsigned long long)si->si_addr;
if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) {
ereport(PANIC,
(errcode(ERRCODE_UE_COMMON_ERROR),
errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately",
ERRCODE_UE_COMMON_ERROR)));
}
#ifdef __aarch64__
buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE;
#else
buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ;
#ifndef ENABLE_MEMORY_CHECK
sigset_t intMask;
sigset_t oldMask;
sigfillset(&intMask);
pthread_sigmask(SIG_SETMASK, &intMask, &oldMask);
#endif
unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks;
unsigned long long endaddr = startaddr + buffer_size;
/* Determine which range the address carried by SIGBUS falls in, and print the log according to the page state. */
if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) {
buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ);
BufferDesc* buf_desc = GetBufferDescriptor(buf_id);
if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED ||
buf_desc->state & BM_IO_IN_PROGRESS) {
/* If si_code is not 4 or 5, it is not an Uncorrected Error, so gaussdb will PANIC. */
if (si_code != SIGBUS_MCEERR_AR && si_code != SIGBUS_MCEERR_AO) {
ereport(PANIC,
(errcode(ERRCODE_UE_DIRTY_PAGE),
errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut "
"down immediately.",
ERRCODE_UE_DIRTY_PAGE, sigbus_addr)));
} else {
ereport(WARNING,
(errcode(ERRCODE_UE_CLEAN_PAGE),
errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will "
"shutdown.",
ERRCODE_UE_CLEAN_PAGE, sigbus_addr)));
(void)gs_signal_send(PostmasterPid, SIGINT);
gs_thread_exit(1);
(errcode(ERRCODE_UE_COMMON_ERROR),
errmsg("errcode:%u, SIGBUS signal received, Gaussdb will shut down immediately",
ERRCODE_UE_COMMON_ERROR)));
}
#ifdef __aarch64__
buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ + PG_CACHE_LINE_SIZE;
#else
buffer_size = g_instance.attr.attr_storage.NBuffers * (Size)BLCKSZ;
#endif
unsigned long long startaddr = (unsigned long long)t_thrd.storage_cxt.BufferBlocks;
unsigned long long endaddr = startaddr + buffer_size;
/* Determine which range the address carried by SIGBUS falls in, and print the log according to the page state. */
if (sigbus_addr >= startaddr && sigbus_addr <= endaddr) {
buf_id = floor((sigbus_addr - startaddr) / (Size)BLCKSZ);
BufferDesc* buf_desc = GetBufferDescriptor(buf_id);
if (buf_desc->state & BM_DIRTY || buf_desc->state & BM_JUST_DIRTIED || buf_desc->state & BM_CHECKPOINT_NEEDED) {
ereport(PANIC,
(errcode(ERRCODE_UE_DIRTY_PAGE),
errmsg("errcode:%u, Uncorrected Error occurred at dirty page. The error address is: 0x%llx. Gaussdb will shut "
"down immediately.",
ERRCODE_UE_DIRTY_PAGE, sigbus_addr)));
} else {
ereport(WARNING,
(errcode(ERRCODE_UE_CLEAN_PAGE),
errmsg("errcode:%u, Uncorrected Error occurred at clean/free page. The error address is: 0x%llx. GaussDB will "
"shutdown.",
ERRCODE_UE_CLEAN_PAGE, sigbus_addr)));
gs_signal_send(PostmasterPid, SIGINT);
gs_thread_exit(1); // Prevent the same thread from being paused after entering the handler again, and cannot be correctly exited by POMA
}
} else if (sigbus_addr == 0) {
ereport(PANIC,
(errcode(ERRCODE_UE_COMMON_ERROR),
errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately",
ERRCODE_UE_COMMON_ERROR)));
} else {
ereport(PANIC,
(errcode(ERRCODE_UE_COMMON_ERROR),
errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately",
ERRCODE_UE_COMMON_ERROR, sigbus_addr)));
}
} else if (sigbus_addr == 0) {
ereport(PANIC,
(errcode(ERRCODE_UE_COMMON_ERROR),
errmsg("errcode:%u, SIGBUS signal received, sigbus_addr is None. Gaussdb will shut down immediately",
ERRCODE_UE_COMMON_ERROR)));
} else {
ereport(PANIC,
(errcode(ERRCODE_UE_COMMON_ERROR),
errmsg("errcode:%u, SIGBUS signal received. The error address is: 0x%llx, Gaussdb will shut down immediately",
ERRCODE_UE_COMMON_ERROR, sigbus_addr)));
(void)pause();
}
}