902 lines
31 KiB
C++
Executable File
902 lines
31 KiB
C++
Executable File
/*
|
|
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
|
* Portions Copyright (c) 2010-2013, PostgreSQL Global Development Group
|
|
*
|
|
*
|
|
* walrcvwriter.cpp
|
|
* functions for xlog receive management
|
|
*
|
|
* IDENTIFICATION
|
|
* src/gausskernel/storage/replication/walrcvwriter.cpp
|
|
*
|
|
* -------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
#include "knl/knl_variable.h"
|
|
|
|
#include "access/xlog_internal.h"
|
|
#include "access/xlogutils.h"
|
|
#include "catalog/pg_tablespace.h"
|
|
#include "libpq/pqsignal.h"
|
|
#include "miscadmin.h"
|
|
#include "replication/walreceiver.h"
|
|
#include "replication/dataqueue.h"
|
|
#include "replication/datareceiver.h"
|
|
#include "replication/walsender.h"
|
|
#include "replication/dcf_replication.h"
|
|
#include "storage/buf/bufmgr.h"
|
|
#include "storage/ipc.h"
|
|
#include "storage/lmgr.h"
|
|
#include "storage/proc.h"
|
|
#include "storage/smgr/smgr.h"
|
|
#include "utils/guc.h"
|
|
#include "access/xlog.h"
|
|
#include "access/multi_redo_api.h"
|
|
|
|
#include "utils/memutils.h"
|
|
#include "utils/ps_status.h"
|
|
#include "utils/resowner.h"
|
|
#include "gssignal/gs_signal.h"
|
|
#include "gs_bbox.h"
|
|
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
#include "postmaster/barrier_preparse.h"
|
|
#endif
|
|
|
|
/*
|
|
* These variables are used similarly to openLogFile/SegNo/Off,
|
|
* but for walreceiver to write the XLOG. recvFileTLI is the TimeLineID
|
|
* corresponding the filename of recvFile.
|
|
*/
|
|
static volatile int recvFile = -1;
|
|
static volatile TimeLineID recvFileTLI = 0;
|
|
static volatile XLogSegNo recvSegNo = 0;
|
|
static volatile uint32 recvOff = 0;
|
|
|
|
#define MAX_DUMMY_DATA_FILE (RELSEG_SIZE * BLCKSZ)
|
|
|
|
/* max dummy data write file (default: 1GB) */
|
|
static int ws_dummy_data_writer_file_fd = -1;
|
|
static uint32 ws_dummy_data_writer_file_offset = 0;
|
|
bool ws_dummy_data_writer_use_file = false;
|
|
|
|
/* Signal handlers */
|
|
static void WSDataWriteOnDummyStandby(const char *buf, uint32 nbytes);
|
|
static void WSDataRcvWriteOnDummyStandby(const char *buf, uint32 nbytes);
|
|
static void walrcvWriterSigHupHandler(SIGNAL_ARGS);
|
|
static void walrcvWriterQuickDie(SIGNAL_ARGS);
|
|
static void reqShutdownHandler(SIGNAL_ARGS);
|
|
|
|
void SetWalRcvWriterPID(ThreadId tid)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->writerPid = tid;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
static void setWalRcvWriterLatch(void)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->walrcvWriterLatch = &t_thrd.proc->procLatch;
|
|
walrcv->writerPid = t_thrd.proc_cxt.MyProcPid;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
static void emptyWalRcvWriterLatch(void)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->walrcvWriterLatch = NULL;
|
|
walrcv->writerPid = 0;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
bool WalRcvWriterInProgress(void)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
if (walrcv->writerPid == 0) {
|
|
SpinLockRelease(&walrcv->mutex);
|
|
return false;
|
|
}
|
|
SpinLockRelease(&walrcv->mutex);
|
|
return true;
|
|
}
|
|
|
|
WalRcvCtlBlock *getCurrentWalRcvCtlBlock(void)
|
|
{
|
|
WalRcvCtlBlock *walrcb = NULL;
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcb = walrcv->walRcvCtlBlock;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
return walrcb;
|
|
}
|
|
|
|
/*
|
|
* Write XLOG data to disk.
|
|
*/
|
|
static void XLogWalRcvWrite(WalRcvCtlBlock *walrcb, char *buf, Size nbytes, XLogRecPtr recptr)
|
|
{
|
|
int startoff;
|
|
int byteswritten;
|
|
int nRetCode = 0;
|
|
Size write_bytes = nbytes;
|
|
|
|
while (nbytes > 0) {
|
|
int segbytes;
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
|
|
if (recvFile < 0 || !XLByteInSeg(recptr, recvSegNo)) {
|
|
bool use_existent = false;
|
|
|
|
/*
|
|
* fsync() and close current file before we switch to next one. We
|
|
* would otherwise have to reopen this file to fsync it later
|
|
*/
|
|
if (recvFile >= 0) {
|
|
char xlogfname[MAXFNAMELEN];
|
|
|
|
/*
|
|
* XLOG segment files will be re-read by recovery in startup
|
|
* process soon, so we don't advise the OS to release cache
|
|
* pages associated with the file like XLogFileClose() does.
|
|
*/
|
|
if (close(recvFile) != 0)
|
|
ereport(PANIC, (errcode_for_file_access(),
|
|
errmsg("could not close log file %s: %m",
|
|
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, recvSegNo))));
|
|
|
|
/*
|
|
* Create .done file forcibly to prevent the restored segment from
|
|
* being archived again later.
|
|
*/
|
|
XLogFileName(xlogfname, MAXFNAMELEN, recvFileTLI, recvSegNo);
|
|
XLogArchiveForceDone(xlogfname);
|
|
}
|
|
recvFile = -1;
|
|
|
|
/* Create/use new log file */
|
|
XLByteToSeg(recptr, recvSegNo);
|
|
use_existent = true;
|
|
recvFile = XLogFileInit(recvSegNo, &use_existent, true);
|
|
recvFileTLI = t_thrd.xlog_cxt.ThisTimeLineID;
|
|
recvOff = 0;
|
|
}
|
|
|
|
/* Calculate the start offset of the received logs */
|
|
startoff = (int)(recptr % XLogSegSize);
|
|
|
|
if (startoff + nbytes > XLogSegSize)
|
|
segbytes = (int)(XLogSegSize - startoff);
|
|
else
|
|
segbytes = nbytes;
|
|
|
|
/* Need to seek in the file? */
|
|
if (recvOff != (uint32)startoff) {
|
|
if (lseek(recvFile, (off_t)startoff, SEEK_SET) < 0)
|
|
ereport(PANIC, (errcode_for_file_access(),
|
|
errmsg("could not seek in log file %s to offset %d: %m",
|
|
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, recvSegNo), startoff)));
|
|
recvOff = startoff;
|
|
}
|
|
|
|
/* OK to write the logs */
|
|
errno = 0;
|
|
|
|
byteswritten = write(recvFile, buf, segbytes);
|
|
if (byteswritten <= 0) {
|
|
/* if write didn't set errno, assume no disk space */
|
|
if (errno == 0)
|
|
errno = ENOSPC;
|
|
ereport(PANIC,
|
|
(errcode_for_file_access(),
|
|
errmsg("could not write to log file %s at offset %u, length %lu: %m",
|
|
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, recvSegNo), recvOff, INT2ULONG(segbytes))));
|
|
}
|
|
if (recvOff == (uint32)0 && segbytes >= (int)sizeof(XLogPageHeaderData)) {
|
|
if (((XLogPageHeader)buf)->xlp_magic == XLOG_PAGE_MAGIC &&
|
|
(recvSegNo * XLogSegSize) != ((XLogPageHeader)buf)->xlp_pageaddr) {
|
|
ereport(PANIC, (errcode_for_file_access(),
|
|
errmsg("unexpected page addr %lu of log file %s", ((XLogPageHeader)buf)->xlp_pageaddr,
|
|
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, recvSegNo))));
|
|
}
|
|
}
|
|
/* Update state for write */
|
|
XLByteAdvance(recptr, byteswritten);
|
|
|
|
recvOff += byteswritten;
|
|
nbytes -= byteswritten;
|
|
buf += byteswritten;
|
|
|
|
issue_xlog_fsync(recvFile, recvSegNo);
|
|
|
|
t_thrd.walrcvwriter_cxt.walStreamWrite = recptr;
|
|
|
|
SpinLockAcquire(&walrcb->mutex);
|
|
walrcb->writePtr = walrcb->flushPtr = recptr;
|
|
SpinLockRelease(&walrcb->mutex);
|
|
|
|
/* Update shared-memory status */
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
if (XLByteLT(walrcv->receivedUpto, t_thrd.walrcvwriter_cxt.walStreamWrite)) {
|
|
walrcv->latestChunkStart = walrcv->receivedUpto;
|
|
walrcv->receivedUpto = t_thrd.walrcvwriter_cxt.walStreamWrite;
|
|
}
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
#ifdef ENABLE_MULTIPLE_NODES
|
|
WakeUpBarrierPreParseBackend();
|
|
|
|
#endif
|
|
/* Signal the startup process and walsender that new WAL has arrived */
|
|
WakeupRecovery();
|
|
if (AllowCascadeReplication())
|
|
WalSndWakeup();
|
|
|
|
/* Report XLOG streaming progress in PS display */
|
|
if (u_sess->attr.attr_common.update_process_title) {
|
|
char activitymsg[50];
|
|
nRetCode = snprintf_s(activitymsg, sizeof(activitymsg), sizeof(activitymsg) - 1,
|
|
"walrcvwriter streaming %X/%X",
|
|
(uint32)(t_thrd.walrcvwriter_cxt.walStreamWrite >> 32),
|
|
(uint32)t_thrd.walrcvwriter_cxt.walStreamWrite);
|
|
securec_check_ss(nRetCode, "\0", "\0");
|
|
set_ps_display(activitymsg, false);
|
|
}
|
|
|
|
ereport(DEBUG2, (errmsg("write xlog done: start %X/%X %lu bytes", (uint32)(recptr >> 32), (uint32)recptr,
|
|
write_bytes)));
|
|
}
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) {
|
|
UpdateRecordIdxState();
|
|
}
|
|
#endif
|
|
}
|
|
|
|
void WalRcvXLogClose(void)
|
|
{
|
|
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
|
|
|
|
if (recvFile >= 0) {
|
|
/*
|
|
* XLOG segment files will be re-read by recovery in startup
|
|
* process soon, so we don't advise the OS to release cache
|
|
* pages associated with the file like XLogFileClose() does.
|
|
*/
|
|
if (close(recvFile) != 0)
|
|
ereport(PANIC,
|
|
(errcode_for_file_access(), errmsg("could not close log file %s: %m",
|
|
XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, recvSegNo))));
|
|
}
|
|
recvFile = -1;
|
|
|
|
LWLockRelease(WALWriteLock);
|
|
}
|
|
|
|
static void WSDataWriteOnDummyStandby(const char *buf, uint32 nbytes)
|
|
{
|
|
ssize_t write_len = -1;
|
|
char path[MAXPGPATH] = {0};
|
|
int nRet = 0;
|
|
|
|
/*
|
|
* get the file num to store:
|
|
* 1. If MAX_DUMMY_DATA_FILE <= wal_data_writer_file_offset, open
|
|
* the next file to store
|
|
* 2. If the current file can not store the nbytes, discard the current file space,
|
|
* open the next file to store.
|
|
*/
|
|
if (ws_dummy_data_writer_file_offset >= MAX_DUMMY_DATA_FILE ||
|
|
(MAX_DUMMY_DATA_FILE - ws_dummy_data_writer_file_offset < (uint32)(nbytes + sizeof(nbytes)))) {
|
|
ereport(DEBUG2, (errmsg("data file num %u", t_thrd.walrcvwriter_cxt.ws_dummy_data_writer_file_num)));
|
|
t_thrd.walrcvwriter_cxt.ws_dummy_data_writer_file_num++;
|
|
ws_dummy_data_writer_file_offset = 0;
|
|
|
|
if (ws_dummy_data_writer_file_fd >= 0) {
|
|
close(ws_dummy_data_writer_file_fd);
|
|
ws_dummy_data_writer_file_fd = -1;
|
|
}
|
|
}
|
|
|
|
/* open the file */
|
|
if (ws_dummy_data_writer_file_fd < 0) {
|
|
/* get the file path */
|
|
nRet = snprintf_s(path, sizeof(path), MAXPGPATH - 1, "base/dummy_standby/%u",
|
|
t_thrd.walrcvwriter_cxt.ws_dummy_data_writer_file_num);
|
|
securec_check_ss_c(nRet, "\0", "\0");
|
|
|
|
ws_dummy_data_writer_file_fd = open(path, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR);
|
|
if (ws_dummy_data_writer_file_fd < 0)
|
|
ereport(PANIC, (errcode_for_file_access(),
|
|
errmsg("could not create data file \"%s\", dummy_data_writer_file_fd=%d: %m", path,
|
|
ws_dummy_data_writer_file_fd)));
|
|
}
|
|
|
|
errno = 0;
|
|
/* OK to write the data */
|
|
write_len = write(ws_dummy_data_writer_file_fd, &nbytes, sizeof(uint32));
|
|
if (write_len < (ssize_t)sizeof(uint32)) {
|
|
/* if write didn't set errno, assume no disk space */
|
|
if (errno == 0) {
|
|
errno = ENOSPC;
|
|
}
|
|
ereport(PANIC, (errcode_for_file_access(),
|
|
errmsg("could not write to data file %s buffer len %u, length %u: %m", path, nbytes, nbytes)));
|
|
}
|
|
|
|
errno = 0;
|
|
write_len = write(ws_dummy_data_writer_file_fd, buf, nbytes);
|
|
if (write_len < nbytes) {
|
|
/* if write didn't set errno, assume no disk space */
|
|
if (errno == 0)
|
|
errno = ENOSPC;
|
|
ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to data file %s "
|
|
"at offset %u, length %u: %m",
|
|
path, (uint32)ws_dummy_data_writer_file_offset, nbytes)));
|
|
}
|
|
|
|
ws_dummy_data_writer_file_offset = ws_dummy_data_writer_file_offset + nbytes + sizeof(nbytes);
|
|
|
|
ereport(u_sess->attr.attr_storage.HaModuleDebug ? LOG : DEBUG2,
|
|
(errmsg("the dummy data write info: writer_file_num %u %u bytes",
|
|
t_thrd.walrcvwriter_cxt.ws_dummy_data_writer_file_num, nbytes)));
|
|
|
|
/* use fdatasync to make sure the received data to flush the disk */
|
|
if (pg_fdatasync(ws_dummy_data_writer_file_fd) != 0)
|
|
ereport(PANIC, (errcode_for_file_access(),
|
|
errmsg("could not fdatasync data file num %u, fd %d: %m",
|
|
t_thrd.walrcvwriter_cxt.ws_dummy_data_writer_file_num, ws_dummy_data_writer_file_fd)));
|
|
}
|
|
|
|
/*
|
|
* Write data to disk of DummyStandby.
|
|
*/
|
|
static void WSDataRcvWriteOnDummyStandby(const char *buf, uint32 nbytes)
|
|
{
|
|
ws_dummy_data_writer_use_file = true;
|
|
WSDataWriteOnDummyStandby(buf, (uint32)nbytes);
|
|
ws_dummy_data_writer_use_file = false;
|
|
}
|
|
|
|
/*
|
|
* close data fd for rm data file
|
|
*/
|
|
void CloseWSDataFileOnDummyStandby(void)
|
|
{
|
|
close(ws_dummy_data_writer_file_fd);
|
|
ws_dummy_data_writer_file_fd = -1;
|
|
ws_dummy_data_writer_file_offset = 0;
|
|
}
|
|
|
|
void InitWSDataNumOnDummyStandby(void)
|
|
{
|
|
DIR *dir = NULL;
|
|
struct dirent *de = NULL;
|
|
int max_num_file = 0;
|
|
int min_num_file = 0;
|
|
char *dirpath = DUMMY_STANDBY_DATADIR;
|
|
|
|
/* open the dir of base/dummy_standby */
|
|
errno = 0;
|
|
dir = AllocateDir(dirpath);
|
|
if (dir == NULL && errno == ENOENT) {
|
|
if (mkdir(dirpath, S_IRWXU) < 0) {
|
|
/* Failure other than not exists */
|
|
ereport(ERROR, (errcode_for_file_access(), errmsg("could not create directory \"%s\": %m", dirpath)));
|
|
}
|
|
dir = AllocateDir(dirpath);
|
|
}
|
|
if (dir == NULL) {
|
|
ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", dirpath)));
|
|
return;
|
|
}
|
|
|
|
/* loop read the file name of base/dummy_standby */
|
|
while ((de = ReadDir(dir, dirpath)) != NULL) {
|
|
/* Skip special stuff */
|
|
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
|
|
continue;
|
|
|
|
min_num_file = atoi(de->d_name);
|
|
break;
|
|
}
|
|
|
|
FreeDir(dir);
|
|
dir = AllocateDir(dirpath);
|
|
|
|
/* loop read the file name of base/dummy_standby */
|
|
while ((de = ReadDir(dir, dirpath)) != NULL) {
|
|
/* Skip special stuff */
|
|
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
|
|
continue;
|
|
|
|
/* search the max num file */
|
|
max_num_file = Max(atoi(de->d_name), max_num_file);
|
|
min_num_file = Min(atoi(de->d_name), min_num_file);
|
|
ereport(DEBUG5, (errmsg("InitDummyDataNum de->d_name=%s; max_num_path=%d; min_num_file=%d.", de->d_name,
|
|
max_num_file, min_num_file)));
|
|
}
|
|
|
|
t_thrd.walsender_cxt.ws_dummy_data_read_file_num = min_num_file;
|
|
t_thrd.walrcvwriter_cxt.ws_dummy_data_writer_file_num = max_num_file;
|
|
FreeDir(dir);
|
|
}
|
|
|
|
/*
|
|
* Write all the wal data to disk.
|
|
*/
|
|
int WalDataRcvWrite(void)
|
|
{
|
|
/* Use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
char *writeBuf = NULL;
|
|
char *cur_buf = NULL;
|
|
uint32 left_nbytes = 0;
|
|
uint32 get_queue_nbytes = 0;
|
|
uint32 remainbytes = 0;
|
|
uint32 cur_data_nbytes = 0;
|
|
DataQueuePtr start_pos = { 0, 0 };
|
|
DataQueuePtr end_pos = { 0, 0 };
|
|
WalRcvCtlBlock *walrcb = getCurrentWalRcvCtlBlock();
|
|
errno_t errorno = EOK;
|
|
XLogRecPtr recptr = InvalidXLogRecPtr;
|
|
char data_flag;
|
|
|
|
if (!g_instance.attr.attr_storage.enable_mix_replication)
|
|
return walRcvWrite(walrcb);
|
|
else if (!t_thrd.xlog_cxt.InRecovery)
|
|
t_thrd.xlog_cxt.InRecovery = true;
|
|
|
|
/* avoid concurrent WalDataRcvWrite by WalRcvWriteLock */
|
|
START_CRIT_SECTION();
|
|
LWLockAcquire(RcvWriteLock, LW_EXCLUSIVE);
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
start_pos.queueid = walrcv->local_write_pos.queueid;
|
|
start_pos.queueoff = walrcv->local_write_pos.queueoff;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
get_queue_nbytes = GetFromDataQueue(writeBuf, g_instance.attr.attr_storage.DataQueueBufSize * 1024, start_pos,
|
|
end_pos, true, t_thrd.dataqueue_cxt.DataWriterQueue);
|
|
if (get_queue_nbytes == 0) {
|
|
LWLockRelease(RcvWriteLock);
|
|
END_CRIT_SECTION();
|
|
return 0;
|
|
}
|
|
|
|
cur_buf = writeBuf;
|
|
left_nbytes = get_queue_nbytes;
|
|
|
|
while (left_nbytes > 0) {
|
|
errorno = memcpy_s(&cur_data_nbytes, sizeof(uint32), cur_buf, sizeof(uint32));
|
|
securec_check(errorno, "\0", "\0");
|
|
data_flag = *(cur_buf + sizeof(uint32));
|
|
|
|
switch (data_flag) {
|
|
case 'w':
|
|
Assert(cur_data_nbytes > (sizeof(uint32) + 1 + sizeof(XLogRecPtr)));
|
|
errorno = memcpy_s(&recptr, sizeof(XLogRecPtr), cur_buf + sizeof(uint32) + 1, sizeof(XLogRecPtr));
|
|
securec_check(errorno, "\0", "\0");
|
|
Assert(!XLogRecPtrIsInvalid(recptr));
|
|
|
|
(void)WSWalRcvWrite(walrcb, cur_buf + sizeof(uint32) + 1 + sizeof(XLogRecPtr),
|
|
cur_data_nbytes - (sizeof(uint32) + 1 + sizeof(XLogRecPtr)), recptr);
|
|
|
|
break;
|
|
case 'd':
|
|
if (u_sess->attr.attr_storage.HaModuleDebug)
|
|
WSDataRcvCheck(cur_buf, cur_data_nbytes);
|
|
|
|
/* Write data. */
|
|
if (dummyStandbyMode)
|
|
WSDataRcvWriteOnDummyStandby(cur_buf, cur_data_nbytes);
|
|
else {
|
|
/* skip the data message prefix (total_len + 'd' + start_ptr + end_ptr)
|
|
*
|
|
* 1. total_len has already been parsed as cur_data_nbytes
|
|
*/
|
|
cur_buf += sizeof(uint32);
|
|
/* 2. check and skip tag 'd' */
|
|
Assert(cur_buf[0] == 'd');
|
|
cur_buf += 1;
|
|
/* 3. skip the start_ptr and end_ptr */
|
|
cur_buf += sizeof(XLogRecPtr) * 2;
|
|
/* 4. calculate the post-proccess cur_data_nbytes */
|
|
cur_data_nbytes -= WS_DATA_MSG_PREFIX_LEN;
|
|
left_nbytes -= WS_DATA_MSG_PREFIX_LEN;
|
|
|
|
remainbytes = DoDataWrite(cur_buf, cur_data_nbytes);
|
|
Assert(remainbytes == 0);
|
|
cur_data_nbytes -= remainbytes;
|
|
}
|
|
/* Close all open files */
|
|
smgrcloseall();
|
|
break;
|
|
default:
|
|
Assert(false);
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unexpected wal data type %c", data_flag)));
|
|
break;
|
|
}
|
|
|
|
cur_buf += cur_data_nbytes;
|
|
left_nbytes -= cur_data_nbytes;
|
|
}
|
|
Assert(left_nbytes == 0);
|
|
|
|
/* Update share memory from the the next operation. */
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->local_write_pos.queueid = start_pos.queueid;
|
|
walrcv->local_write_pos.queueoff = start_pos.queueoff;
|
|
DQByteAdvance(walrcv->local_write_pos, get_queue_nbytes);
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
PopFromDataQueue(((WalRcvData *)walrcv)->local_write_pos, t_thrd.dataqueue_cxt.DataWriterQueue);
|
|
|
|
LWLockRelease(RcvWriteLock);
|
|
END_CRIT_SECTION();
|
|
|
|
return (int)get_queue_nbytes;
|
|
}
|
|
|
|
/*
|
|
* Flush the log to disk.
|
|
*
|
|
* If we're in the midst of dying, it's unwise to do anything that might throw
|
|
* an error, so we skip sending a reply in that case.
|
|
*/
|
|
int walRcvWrite(WalRcvCtlBlock *walrcb)
|
|
{
|
|
int64 walfreeoffset;
|
|
int64 walwriteoffset;
|
|
char *walrecvbuf = NULL;
|
|
XLogRecPtr startptr;
|
|
int64 recBufferSize = g_instance.attr.attr_storage.WalReceiverBufSize * 1024;
|
|
int nbytes = 0;
|
|
|
|
if (walrcb == NULL)
|
|
return 0;
|
|
|
|
START_CRIT_SECTION();
|
|
|
|
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
|
|
SpinLockAcquire(&walrcb->mutex);
|
|
|
|
if (walrcb->walFreeOffset == walrcb->walWriteOffset) {
|
|
SpinLockRelease(&walrcb->mutex);
|
|
LWLockRelease(WALWriteLock);
|
|
|
|
END_CRIT_SECTION();
|
|
return 0;
|
|
}
|
|
walfreeoffset = walrcb->walFreeOffset;
|
|
walwriteoffset = walrcb->walWriteOffset;
|
|
walrecvbuf = walrcb->walReceiverBuffer;
|
|
startptr = walrcb->walStart;
|
|
SpinLockRelease(&walrcb->mutex);
|
|
|
|
nbytes = (walfreeoffset < walwriteoffset) ? (recBufferSize - walwriteoffset) : (walfreeoffset - walwriteoffset);
|
|
|
|
XLogWalRcvWrite(walrcb, walrecvbuf + walwriteoffset, nbytes, startptr);
|
|
XLByteAdvance(startptr, nbytes);
|
|
ereport(DEBUG5,
|
|
(errmsg("walRcvWrite: write len:%d, at %u,%X", nbytes, (uint32)(startptr >> 32), (uint32)startptr)));
|
|
|
|
SpinLockAcquire(&walrcb->mutex);
|
|
walrcb->walWriteOffset += nbytes;
|
|
walrcb->walStart = startptr;
|
|
|
|
if (walrcb->walWriteOffset == recBufferSize) {
|
|
walrcb->walWriteOffset = 0;
|
|
if (walrcb->walFreeOffset == recBufferSize) {
|
|
walrcb->walFreeOffset = 0;
|
|
}
|
|
}
|
|
walfreeoffset = walrcb->walFreeOffset;
|
|
walwriteoffset = walrcb->walWriteOffset;
|
|
SpinLockRelease(&walrcb->mutex);
|
|
|
|
ereport(DEBUG5, (errmsg("walRcvWrite: nbytes(%d),walfreeoffset(%ld),walwriteoffset(%ld),startptr(%u:%u)", nbytes,
|
|
walfreeoffset, walwriteoffset, (uint32)(startptr >> 32), (uint32)startptr)));
|
|
|
|
LWLockRelease(WALWriteLock);
|
|
|
|
END_CRIT_SECTION();
|
|
/* the max lsn to be replayed */
|
|
g_instance.comm_cxt.predo_cxt.redoPf.local_max_lsn = startptr;
|
|
return nbytes;
|
|
}
|
|
|
|
/*
|
|
* The main difference between walRcvWrite() and WSWalRcvWrite() is that
|
|
* for walRcvWrite() the wal source is from the walrcb and for WSWalRcvWrite()
|
|
* the wal source is from the wal write queue.
|
|
*/
|
|
int WSWalRcvWrite(WalRcvCtlBlock *walrcb, char *buf, Size nbytes, XLogRecPtr start_ptr)
|
|
{
|
|
if (walrcb == NULL) {
|
|
ereport(NOTICE, (errmsg("have nothing of the wal receiver wal block info.")));
|
|
return 0;
|
|
}
|
|
|
|
START_CRIT_SECTION();
|
|
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
|
|
|
|
XLogWalRcvWrite(walrcb, buf, nbytes, start_ptr);
|
|
|
|
LWLockRelease(WALWriteLock);
|
|
END_CRIT_SECTION();
|
|
|
|
return nbytes;
|
|
}
|
|
|
|
/*
|
|
* Called when the WalRcvWriterMain is ending.
|
|
*/
|
|
static void ShutdownWalRcvWriter(int code, Datum arg)
|
|
{
|
|
/* clear WriterPid */
|
|
emptyWalRcvWriterLatch();
|
|
}
|
|
|
|
/* SIGUSR1: let latch facility handle the signal */
|
|
static void WalRcvWriterProcSigUsr1Handler(SIGNAL_ARGS)
|
|
{
|
|
int saveErrno = errno;
|
|
|
|
latch_sigusr1_handler();
|
|
|
|
errno = saveErrno;
|
|
}
|
|
|
|
void walrcvWriterMain(void)
|
|
{
|
|
sigjmp_buf localSigjmpBuf;
|
|
sigset_t oldSigMask;
|
|
MemoryContext walrcvWriterContext;
|
|
|
|
ereport(LOG, (errmsg("walrcvwriter thread started")));
|
|
/*
|
|
* Reset some signals that are accepted by postmaster but not here
|
|
*/
|
|
(void)gspqsignal(SIGHUP, walrcvWriterSigHupHandler);
|
|
(void)gspqsignal(SIGINT, SIG_IGN);
|
|
(void)gspqsignal(SIGTERM, reqShutdownHandler);
|
|
(void)gspqsignal(SIGQUIT, walrcvWriterQuickDie); /* hard crash time */
|
|
(void)gspqsignal(SIGALRM, SIG_IGN);
|
|
(void)gspqsignal(SIGPIPE, SIG_IGN);
|
|
(void)gspqsignal(SIGUSR1, WalRcvWriterProcSigUsr1Handler);
|
|
(void)gspqsignal(SIGUSR2, SIG_IGN);
|
|
(void)gspqsignal(SIGURG, print_stack);
|
|
/*
|
|
* Reset some signals that are accepted by postmaster but not here
|
|
*/
|
|
(void)gspqsignal(SIGCHLD, SIG_DFL);
|
|
(void)gspqsignal(SIGTTIN, SIG_DFL);
|
|
(void)gspqsignal(SIGTTOU, SIG_DFL);
|
|
(void)gspqsignal(SIGCONT, SIG_DFL);
|
|
(void)gspqsignal(SIGWINCH, SIG_DFL);
|
|
|
|
/* We allow SIGQUIT (quickdie) at all times */
|
|
(void)sigdelset(&t_thrd.libpq_cxt.BlockSig, SIGQUIT);
|
|
|
|
on_shmem_exit(ShutdownWalRcvWriter, 0);
|
|
|
|
/*
|
|
* Create a resource owner to keep track of our resources (currently only
|
|
* buffer pins).
|
|
*/
|
|
t_thrd.utils_cxt.CurrentResourceOwner = ResourceOwnerCreate(NULL, "WalReceive Writer",
|
|
THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));
|
|
|
|
/*
|
|
* Create a memory context that we will do all our work in. We do this so
|
|
* that we can reset the context during error recovery and thereby avoid
|
|
* possible memory leaks. Formerly this code just ran in
|
|
* t_thrd.top_mem_cxt, but resetting that would be a really bad idea.
|
|
*/
|
|
walrcvWriterContext = AllocSetContextCreate(t_thrd.top_mem_cxt, "WalReceive Writer", ALLOCSET_DEFAULT_MINSIZE,
|
|
ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE);
|
|
(void)MemoryContextSwitchTo(walrcvWriterContext);
|
|
|
|
/* init the dummy standby data num to write */
|
|
if (dummyStandbyMode) {
|
|
InitWSDataNumOnDummyStandby();
|
|
t_thrd.walrcvwriter_cxt.ws_dummy_data_writer_file_num++;
|
|
ws_dummy_data_writer_file_offset = 0;
|
|
}
|
|
|
|
/*
|
|
* If an exception is encountered, processing resumes here.
|
|
*
|
|
* See notes in postgres.c about the design of this coding.
|
|
*/
|
|
int curTryCounter = 0;
|
|
int *oldTryCounter = nullptr;
|
|
if (sigsetjmp(localSigjmpBuf, 1) != 0) {
|
|
gstrace_tryblock_exit(true, oldTryCounter);
|
|
|
|
// We need restore the signal mask of current thread
|
|
//
|
|
pthread_sigmask(SIG_SETMASK, &oldSigMask, NULL);
|
|
|
|
/* Since not using PG_TRY, must reset error stack by hand */
|
|
t_thrd.log_cxt.error_context_stack = NULL;
|
|
|
|
/* Prevent interrupts while cleaning up */
|
|
HOLD_INTERRUPTS();
|
|
|
|
/* Report the error to the server log */
|
|
EmitErrorReport();
|
|
|
|
/* abort async io, must before LWlock release */
|
|
AbortAsyncListIO();
|
|
|
|
/* release resource held by lsc */
|
|
AtEOXact_SysDBCache(false);
|
|
|
|
/*
|
|
* These operations are really just a minimal subset of
|
|
* AbortTransaction(). We don't have very many resources to worry
|
|
* about in pagewriter, but we do have LWLocks, buffers, and temp files.
|
|
*/
|
|
LWLockReleaseAll();
|
|
AbortBufferIO();
|
|
UnlockBuffers();
|
|
/* buffer pins are released here: */
|
|
ResourceOwnerRelease(t_thrd.utils_cxt.CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true);
|
|
/* we needn't bother with the other ResourceOwnerRelease phases */
|
|
AtEOXact_Buffers(false);
|
|
AtEOXact_Files();
|
|
AtEOXact_HashTables(false);
|
|
|
|
/*
|
|
* Now return to normal top-level context and clear ErrorContext for
|
|
* next time.
|
|
*/
|
|
(void)MemoryContextSwitchTo(walrcvWriterContext);
|
|
FlushErrorState();
|
|
|
|
/* Flush any leaked data in the top-level context */
|
|
MemoryContextResetAndDeleteChildren(walrcvWriterContext);
|
|
|
|
/* Now we can allow interrupts again */
|
|
RESUME_INTERRUPTS();
|
|
|
|
/*
|
|
* Sleep at least 1 second after any error. A write error is likely
|
|
* to be repeated, and we don't want to be filling the error logs as
|
|
* fast as we can.
|
|
*/
|
|
pg_usleep(1000000L);
|
|
|
|
/*
|
|
* Close all open files after any error. This is helpful on Windows,
|
|
* where holding deleted files open causes various strange errors.
|
|
* It's not clear we need it elsewhere, but shouldn't hurt.
|
|
*/
|
|
smgrcloseall();
|
|
}
|
|
oldTryCounter = gstrace_tryblock_entry(&curTryCounter);
|
|
|
|
/* We can now handle ereport(ERROR) */
|
|
t_thrd.log_cxt.PG_exception_stack = &localSigjmpBuf;
|
|
|
|
/*
|
|
* Unblock signals (they were blocked when the postmaster forked us)
|
|
*/
|
|
gs_signal_setmask(&t_thrd.libpq_cxt.UnBlockSig, NULL);
|
|
(void)gs_signal_unblock_sigusr2();
|
|
|
|
/*
|
|
* Use the recovery target timeline ID during recovery
|
|
*/
|
|
if (!RecoveryInProgress())
|
|
ereport(FATAL, (errmsg("cannot continue WAL streaming, recovery has already ended")));
|
|
|
|
t_thrd.xlog_cxt.ThisTimeLineID = GetRecoveryTargetTLI();
|
|
|
|
/*
|
|
* register procLatch to walrcv shared memory
|
|
*/
|
|
setWalRcvWriterLatch();
|
|
|
|
t_thrd.walrcvwriter_cxt.walStreamWrite = GetXLogReplayRecPtr(NULL);
|
|
|
|
/*
|
|
* Loop forever
|
|
*/
|
|
for (;;) {
|
|
int rc;
|
|
|
|
/* Clear any already-pending wakeups */
|
|
ResetLatch(&t_thrd.proc->procLatch);
|
|
|
|
if (t_thrd.walrcvwriter_cxt.gotSIGHUP) {
|
|
t_thrd.walrcvwriter_cxt.gotSIGHUP = false;
|
|
ProcessConfigFile(PGC_SIGHUP);
|
|
}
|
|
|
|
while (!t_thrd.walrcvwriter_cxt.shutdownRequested && WalDataRcvWrite() > 0)
|
|
;
|
|
|
|
if (t_thrd.walrcvwriter_cxt.shutdownRequested) {
|
|
ereport(LOG, (errmsg("walrcvwriter thread shut down")));
|
|
/*
|
|
* From here on, elog(ERROR) should end with exit(1), not send
|
|
* control back to the sigsetjmp block above
|
|
*/
|
|
u_sess->attr.attr_common.ExitOnAnyError = true;
|
|
/* Normal exit from the pagewriter is here */
|
|
proc_exit(0); /* done */
|
|
}
|
|
|
|
rc = WaitLatch(&t_thrd.proc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, (long)1000 /* ms */);
|
|
if (rc & WL_POSTMASTER_DEATH) {
|
|
ereport(LOG, (errmsg("walrcvwriter thread shut down with code 1")));
|
|
gs_thread_exit(1);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void walrcvWriterSigHupHandler(SIGNAL_ARGS)
|
|
{
|
|
int saveErrno = errno;
|
|
|
|
t_thrd.walrcvwriter_cxt.gotSIGHUP = true;
|
|
if (t_thrd.proc) {
|
|
SetLatch(&t_thrd.proc->procLatch);
|
|
}
|
|
|
|
errno = saveErrno;
|
|
}
|
|
|
|
static void walrcvWriterQuickDie(SIGNAL_ARGS)
|
|
{
|
|
gs_signal_setmask(&t_thrd.libpq_cxt.BlockSig, NULL);
|
|
|
|
/*
|
|
* We DO NOT want to run proc_exit() callbacks -- we're here because
|
|
* shared memory may be corrupted, so we don't want to try to clean up our
|
|
* transaction. Just nail the windows shut and get out of town. Now that
|
|
* there's an atexit callback to prevent third-party code from breaking
|
|
* things by calling exit() directly, we have to reset the callbacks
|
|
* explicitly to make this work as intended.
|
|
*/
|
|
on_exit_reset();
|
|
|
|
/*
|
|
* Note we do exit(2) not exit(0). This is to force the postmaster into a
|
|
* system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
|
|
* backend. This is necessary precisely because we don't clean up our
|
|
* shared memory state. (The "dead man switch" mechanism in pmsignal.c
|
|
* should ensure the postmaster sees this as a crash, too, but no harm in
|
|
* being doubly sure.)
|
|
*/
|
|
gs_thread_exit(2);
|
|
}
|
|
|
|
static void reqShutdownHandler(SIGNAL_ARGS)
|
|
{
|
|
int saveErrno = errno;
|
|
|
|
t_thrd.walrcvwriter_cxt.shutdownRequested = true;
|
|
if (t_thrd.proc) {
|
|
SetLatch(&t_thrd.proc->procLatch);
|
|
}
|
|
|
|
errno = saveErrno;
|
|
}
|