2743 lines
106 KiB
C++
2743 lines
106 KiB
C++
/* -------------------------------------------------------------------------
|
|
*
|
|
* walreceiver.cpp
|
|
*
|
|
* The WAL receiver process (walreceiver) is new as of Postgres 9.0. It
|
|
* is the process in the standby server that takes charge of receiving
|
|
* XLOG records from a primary server during streaming replication.
|
|
*
|
|
* When the startup process determines that it's time to start streaming,
|
|
* it instructs postmaster to start walreceiver. Walreceiver first connects
|
|
* to the primary server (it will be served by a walsender process
|
|
* in the primary server), and then keeps receiving XLOG records and
|
|
* writing them to the disk as long as the connection is alive. As XLOG
|
|
* records are received and flushed to disk, it updates the
|
|
* WalRcv->receivedUpto variable in shared memory, to inform the startup
|
|
* process of how far it can proceed with XLOG replay.
|
|
*
|
|
* Normal termination is by SIGTERM, which instructs the walreceiver to
|
|
* exit(0). Emergency termination is by SIGQUIT; like any postmaster child
|
|
* process, the walreceiver will simply abort and exit on SIGQUIT. A close
|
|
* of the connection and a FATAL error are treated not as a crash but as
|
|
* normal operation.
|
|
*
|
|
* This file contains the server-facing parts of walreceiver. The libpq-
|
|
* specific parts are in the libpqwalreceiver module. It's loaded
|
|
* dynamically to avoid linking the server with libpq.
|
|
*
|
|
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
|
* Portions Copyright (c) 2010-2012, PostgreSQL Global Development Group
|
|
*
|
|
*
|
|
* IDENTIFICATION
|
|
* src/gausskernel/storage/replication/walreceiver.cpp
|
|
*
|
|
* -------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
#include "knl/knl_variable.h"
|
|
|
|
#ifndef WIN32
|
|
#include <syscall.h>
|
|
#endif
|
|
#include <sys/stat.h>
|
|
|
|
#include "access/xlog_internal.h"
|
|
#include "access/xlog.h"
|
|
#include "access/multi_redo_api.h"
|
|
|
|
#include "funcapi.h"
|
|
#include "nodes/execnodes.h"
|
|
#include "libpq/libpq-fe.h"
|
|
#include "libpq/pqsignal.h"
|
|
#include "miscadmin.h"
|
|
#include "replication/replicainternal.h"
|
|
#include "replication/dataqueue.h"
|
|
#include "replication/walprotocol.h"
|
|
#include "replication/walreceiver.h"
|
|
#include "replication/walsender.h"
|
|
#include "replication/walsender_private.h"
|
|
#include "storage/copydir.h"
|
|
#include "storage/ipc.h"
|
|
#include "storage/latch.h"
|
|
#include "storage/pmsignal.h"
|
|
#include "storage/copydir.h"
|
|
#include "storage/procarray.h"
|
|
#include "utils/guc.h"
|
|
#include "utils/builtins.h"
|
|
#include "utils/guc.h"
|
|
#include "utils/ps_status.h"
|
|
#include "utils/resowner.h"
|
|
#include "utils/timestamp.h"
|
|
#include "gssignal/gs_signal.h"
|
|
#include "gs_bbox.h"
|
|
|
|
#include "flock.h"
|
|
#include "postmaster/postmaster.h"
|
|
#include "hotpatch/hotpatch.h"
|
|
#include "utils/distribute_test.h"
|
|
|
|
bool wal_catchup = false;
|
|
|
|
#define NAPTIME_PER_CYCLE 1 /* max sleep time between cycles (1ms) */
|
|
|
|
#define CONFIG_BAK_FILENAME "postgresql.conf.bak"
|
|
|
|
#define WAL_DATA_LEN ((sizeof(uint32) + 1 + sizeof(XLogRecPtr)))
|
|
|
|
#define TEMP_CONF_FILE "postgresql.conf.bak"
|
|
|
|
const char *g_reserve_param[RESERVE_SIZE] = {
|
|
"application_name",
|
|
"archive_command",
|
|
"audit_directory",
|
|
"available_zone",
|
|
"comm_control_port",
|
|
"comm_sctp_port",
|
|
"listen_addresses",
|
|
"log_directory",
|
|
"port",
|
|
"replconninfo1",
|
|
"replconninfo2",
|
|
"replconninfo3",
|
|
"replconninfo4",
|
|
"replconninfo5",
|
|
"replconninfo6",
|
|
"replconninfo7",
|
|
"replconninfo8",
|
|
"ssl",
|
|
"ssl_ca_file",
|
|
"ssl_cert_file",
|
|
"ssl_ciphers",
|
|
"ssl_crl_file",
|
|
"ssl_key_file",
|
|
"ssl_renegotiation_limit",
|
|
"ssl_cert_notify_time",
|
|
"synchronous_standby_names",
|
|
"local_bind_address",
|
|
"perf_directory",
|
|
"query_log_directory",
|
|
"asp_log_directory",
|
|
"streaming_router_port",
|
|
"enable_upsert_to_merge",
|
|
"archive_dest",
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
"recovery_min_apply_delay",
|
|
"sync_config_strategy"
|
|
#else
|
|
NULL,
|
|
NULL
|
|
#endif
|
|
};
|
|
|
|
const WalReceiverFunc WalReceiverFuncTable[] = {
|
|
{ libpqrcv_connect, libpqrcv_receive, libpqrcv_send, libpqrcv_disconnect },
|
|
{ obs_connect, obs_receive, obs_send, obs_disconnect },
|
|
};
|
|
|
|
const int FUNC_LIBPQ_IDX = 0;
|
|
const int FUNC_OBS_IDX = 1;
|
|
#define GET_FUNC_IDX \
|
|
(t_thrd.walreceiverfuncs_cxt.WalRcv->conn_target == REPCONNTARGET_OBS ? FUNC_OBS_IDX : FUNC_LIBPQ_IDX)
|
|
|
|
/* Prototypes for private functions */
|
|
static void EnableWalRcvImmediateExit(void);
|
|
static void DisableWalRcvImmediateExit(void);
|
|
static void WalRcvDie(int code, Datum arg);
|
|
static void XLogWalRcvDataPageReplication(char *buf, Size len);
|
|
static void XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len);
|
|
static void XLogWalRcvReceive(char *buf, Size nbytes, XLogRecPtr recptr);
|
|
static void XLogWalRcvReceiveInBuf(char *buf, Size nbytes, XLogRecPtr recptr);
|
|
static void XLogWalRcvSendHSFeedback(void);
|
|
static void XLogWalRcvSendSwitchRequest(void);
|
|
static void WalDataRcvReceive(char *buf, Size nbytes, XLogRecPtr recptr);
|
|
static void ProcessSwitchResponse(int code);
|
|
static void ProcessWalSndrMessage(XLogRecPtr *walEnd, TimestampTz sendTime);
|
|
|
|
static void ProcessKeepaliveMessage(PrimaryKeepaliveMessage *keepalive);
|
|
static void ProcessRmXLogMessage(RmXLogMessage *rmXLogMessage);
|
|
static void ProcessEndXLogMessage(EndXLogMessage *endXLogMessage);
|
|
static void ProcessWalHeaderMessage(WalDataMessageHeader *msghdr);
|
|
static void ProcessWalDataHeaderMessage(WalDataPageMessageHeader *msghdr);
|
|
const char *wal_get_rebuild_reason_string(HaRebuildReason reason);
|
|
static void wal_get_ha_rebuild_reason(char *buildReason, ServerMode local_role, bool isRunning);
|
|
Datum pg_stat_get_wal_receiver(PG_FUNCTION_ARGS);
|
|
/* Signal handlers */
|
|
static void WalRcvSigHupHandler(SIGNAL_ARGS);
|
|
static void WalRcvShutdownHandler(SIGNAL_ARGS);
|
|
static void WalRcvQuickDieHandler(SIGNAL_ARGS);
|
|
static void sigusr1_handler(SIGNAL_ARGS);
|
|
static void ConfigFileTimer(void);
|
|
static bool ProcessConfigFileMessage(char *buf, Size len);
|
|
static void firstSynchStandbyFile(void);
|
|
|
|
static TimestampTz GetHeartbeatLastReplyTimestamp();
|
|
static bool WalRecCheckTimeOut(TimestampTz nowtime, TimestampTz last_recv_timestamp, bool ping_sent);
|
|
static void WalRcvRefreshPercentCountStartLsn(XLogRecPtr currentMaxLsn, XLogRecPtr currentDoneLsn);
|
|
static void ProcessArchiveXlogMessage(const ArchiveXlogMessage* archive_xlog_message);
|
|
static void WalRecvSendArchiveXlogResponse();
|
|
|
|
void ProcessWalRcvInterrupts(void)
|
|
{
|
|
/*
|
|
* Although walreceiver interrupt handling doesn't use the same scheme as
|
|
* regular backends, call CHECK_FOR_INTERRUPTS() to make sure we receive
|
|
* any incoming signals on Win32.
|
|
*/
|
|
CHECK_FOR_INTERRUPTS();
|
|
|
|
if (t_thrd.walreceiver_cxt.got_SIGTERM) {
|
|
t_thrd.walreceiver_cxt.WalRcvImmediateInterruptOK = false;
|
|
ereport(FATAL, (errcode(ERRCODE_ADMIN_SHUTDOWN),
|
|
errmsg("terminating walreceiver process due to administrator command")));
|
|
}
|
|
}
|
|
|
|
static void EnableWalRcvImmediateExit(void)
|
|
{
|
|
t_thrd.walreceiver_cxt.WalRcvImmediateInterruptOK = true;
|
|
ProcessWalRcvInterrupts();
|
|
}
|
|
|
|
static void DisableWalRcvImmediateExit(void)
|
|
{
|
|
t_thrd.walreceiver_cxt.WalRcvImmediateInterruptOK = false;
|
|
ProcessWalRcvInterrupts();
|
|
}
|
|
|
|
void wakeupWalRcvWriter()
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
if (walrcv->walrcvWriterLatch != NULL)
|
|
SetLatch(walrcv->walrcvWriterLatch);
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
static void walRcvCtlBlockInit()
|
|
{
|
|
char *buf = NULL;
|
|
int64 recBufferSize = g_instance.attr.attr_storage.WalReceiverBufSize * 1024;
|
|
size_t len = offsetof(WalRcvCtlBlock, walReceiverBuffer) + recBufferSize;
|
|
errno_t rc = 0;
|
|
|
|
Assert(t_thrd.walreceiver_cxt.walRcvCtlBlock == NULL);
|
|
buf = (char *)MemoryContextAlloc(THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE), len);
|
|
|
|
if (buf == NULL) {
|
|
ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
|
|
}
|
|
|
|
rc = memset_s(buf, sizeof(WalRcvCtlBlock), 0, sizeof(WalRcvCtlBlock));
|
|
securec_check_c(rc, "\0", "\0");
|
|
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock = (WalRcvCtlBlock *)buf;
|
|
if (BBOX_BLACKLIST_WALREC_CTL_BLOCK) {
|
|
bbox_blacklist_add(WALRECIVER_CTL_BLOCK, t_thrd.walreceiver_cxt.walRcvCtlBlock, len);
|
|
}
|
|
|
|
SpinLockInit(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
}
|
|
|
|
static void walRcvCtlBlockFini()
|
|
{
|
|
if (BBOX_BLACKLIST_WALREC_CTL_BLOCK) {
|
|
bbox_blacklist_remove(WALRECIVER_CTL_BLOCK, t_thrd.walreceiver_cxt.walRcvCtlBlock);
|
|
}
|
|
|
|
pfree(t_thrd.walreceiver_cxt.walRcvCtlBlock);
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock = NULL;
|
|
}
|
|
|
|
/*
|
|
* Clean up data in receive buffer.
|
|
* This function should be called on thread exit.
|
|
*/
|
|
void walRcvDataCleanup()
|
|
{
|
|
while (WalDataRcvWrite() > 0) {
|
|
};
|
|
}
|
|
|
|
bool walRcvCtlBlockIsEmpty(void)
|
|
{
|
|
volatile WalRcvCtlBlock *walrcb = NULL;
|
|
|
|
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
|
|
walrcb = getCurrentWalRcvCtlBlock();
|
|
|
|
if (walrcb == NULL) {
|
|
LWLockRelease(WALWriteLock);
|
|
return true;
|
|
}
|
|
|
|
bool retState = false;
|
|
|
|
SpinLockAcquire(&walrcb->mutex);
|
|
if (IsExtremeRedo()) {
|
|
if (walrcb->walFreeOffset == walrcb->walReadOffset) {
|
|
retState = true;
|
|
}
|
|
} else {
|
|
if (walrcb->walFreeOffset == walrcb->walWriteOffset) {
|
|
retState = true;
|
|
}
|
|
}
|
|
|
|
SpinLockRelease(&walrcb->mutex);
|
|
LWLockRelease(WALWriteLock);
|
|
return retState;
|
|
}
|
|
|
|
void setObsArchLatch(const Latch* latch)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->obsArchLatch = (Latch *)latch;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
static void wakeupObsArchLatch()
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
if (walrcv->obsArchLatch != NULL) {
|
|
SetLatch(walrcv->obsArchLatch);
|
|
}
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
void RefuseConnect()
|
|
{
|
|
WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
knl_g_disconn_node_context_data disconn_node =
|
|
g_instance.comm_cxt.localinfo_cxt.disable_conn_node.disable_conn_node_data;
|
|
|
|
if (disconn_node.conn_mode == POLLING_CONNECTION) {
|
|
return;
|
|
}
|
|
|
|
if (disconn_node.conn_mode == SPECIFY_CONNECTION &&
|
|
strcmp(disconn_node.disable_conn_node_host, (char *)walrcv->conn_channel.remotehost) == 0 &&
|
|
disconn_node.disable_conn_node_port == walrcv->conn_channel.remoteport) {
|
|
return;
|
|
}
|
|
ereport(FATAL,
|
|
(errmsg("Refuse WAL streaming, connection mode is %d, connertion IP is %s:%d\n", disconn_node.conn_mode,
|
|
disconn_node.disable_conn_node_host, disconn_node.disable_conn_node_port)));
|
|
}
|
|
|
|
void WalRcvrProcessData(TimestampTz *last_recv_timestamp, bool *ping_sent)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
unsigned char type;
|
|
char *buf = NULL;
|
|
int len;
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(DN_WALRECEIVE_MAINLOOP, stub_sleep_emit)) {
|
|
ereport(get_distribute_test_param()->elevel,
|
|
(errmsg("sleep_emit happen during WalReceiverMain time:%ds, stub_name:%s",
|
|
get_distribute_test_param()->sleep_time, get_distribute_test_param()->test_stub_name)));
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Emergency bailout if postmaster has died. This is to avoid the
|
|
* necessity for manual cleanup of all postmaster children.
|
|
*/
|
|
if (!PostmasterIsAlive())
|
|
gs_thread_exit(1);
|
|
if(walrcv->conn_target != REPCONNTARGET_OBS)
|
|
RefuseConnect();
|
|
|
|
/*
|
|
* Exit walreceiver if we're not in recovery. This should not happen,
|
|
* but cross-check the status here.
|
|
*/
|
|
if (!RecoveryInProgress())
|
|
ereport(FATAL, (errmsg("cannot continue WAL streaming, recovery has already ended")));
|
|
|
|
/* Process any requests or signals received recently */
|
|
ProcessWalRcvInterrupts();
|
|
|
|
if (t_thrd.walreceiver_cxt.got_SIGHUP) {
|
|
t_thrd.walreceiver_cxt.got_SIGHUP = false;
|
|
ProcessConfigFile(PGC_SIGHUP);
|
|
}
|
|
|
|
volatile unsigned int *pitr_task_status = &g_instance.archive_obs_cxt.pitr_task_status;
|
|
if (unlikely(pg_atomic_read_u32(pitr_task_status) == PITR_TASK_DONE)) {
|
|
WalRecvSendArchiveXlogResponse();
|
|
pg_memory_barrier();
|
|
pg_atomic_write_u32(pitr_task_status, PITR_TASK_NONE);
|
|
}
|
|
|
|
if (!WalRcvWriterInProgress())
|
|
ereport(FATAL, (errmsg("terminating walreceiver process due to the death of walrcvwriter")));
|
|
|
|
if (t_thrd.walreceiver_cxt.start_switchover && walrcv->conn_target != REPCONNTARGET_OBS) {
|
|
t_thrd.walreceiver_cxt.start_switchover = false;
|
|
XLogWalRcvSendSwitchRequest();
|
|
}
|
|
|
|
/* Wait a while for data to arrive */
|
|
if ((WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_receive(NAPTIME_PER_CYCLE, &type, &buf, &len)) {
|
|
*last_recv_timestamp = GetCurrentTimestamp();
|
|
*ping_sent = false;
|
|
/* Accept the received data, and process it */
|
|
XLogWalRcvProcessMsg(type, buf, len);
|
|
|
|
/* Receive any more data we can without sleeping */
|
|
while ((t_thrd.walreceiver_cxt.start_switchover == false) &&
|
|
(WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_receive(0, &type, &buf, &len) ) {
|
|
*last_recv_timestamp = GetCurrentTimestamp();
|
|
*ping_sent = false;
|
|
XLogWalRcvProcessMsg(type, buf, len);
|
|
}
|
|
|
|
/* Let the master know that we received some data. */
|
|
if(walrcv->conn_target != REPCONNTARGET_OBS)
|
|
XLogWalRcvSendReply(false, false);
|
|
} else if(walrcv->conn_target != REPCONNTARGET_OBS) {
|
|
/*
|
|
* We didn't receive anything new. If we haven't heard anything
|
|
* from the server for more than u_sess->attr.attr_storage.wal_receiver_timeout / 2,
|
|
* ping the server. Also, if it's been longer than
|
|
* u_sess->attr.attr_storage.wal_receiver_status_interval since the last update we sent,
|
|
* send a status update to the master anyway, to report any
|
|
* progress in applying WAL.
|
|
*/
|
|
TimestampTz nowtime = GetCurrentTimestamp();
|
|
bool requestReply = WalRecCheckTimeOut(nowtime, *last_recv_timestamp, *ping_sent);
|
|
if (requestReply) {
|
|
*ping_sent = true;
|
|
*last_recv_timestamp = nowtime;
|
|
}
|
|
|
|
XLogWalRcvSendReply(requestReply, requestReply);
|
|
XLogWalRcvSendHSFeedback();
|
|
}
|
|
ConfigFileTimer();
|
|
}
|
|
|
|
/* Main entry point for walreceiver process */
|
|
void WalReceiverMain(void)
|
|
{
|
|
char conninfo[MAXCONNINFO];
|
|
char slotname[NAMEDATALEN];
|
|
XLogRecPtr startpoint;
|
|
TimestampTz last_recv_timestamp;
|
|
bool ping_sent = false;
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
int channel_identifier = 0;
|
|
int nRet = 0;
|
|
errno_t rc = 0;
|
|
uint32 isRedoFinish;
|
|
|
|
t_thrd.walreceiver_cxt.last_sendfilereply_timestamp = GetCurrentTimestamp();
|
|
t_thrd.walreceiver_cxt.standby_config_modify_time = time(NULL);
|
|
isRedoFinish = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.isRedoFinish));
|
|
knl_g_set_redo_finish_status(isRedoFinish | REDO_FINISH_STATUS_CM);
|
|
ereport(LOG, (errmsg("set knl_g_set_redo_finish_status_CM to true when connecting to the primary")));
|
|
|
|
/*
|
|
* WalRcv should be set up already (if we are a backend, we inherit this
|
|
* by fork() or EXEC_BACKEND mechanism from the postmaster).
|
|
*/
|
|
Assert(walrcv != NULL);
|
|
|
|
ereport(LOG, (errmsg("walreceiver thread started")));
|
|
|
|
/* Initialize walrcv buffer for walreceive optimization */
|
|
walRcvCtlBlockInit();
|
|
|
|
load_server_mode();
|
|
|
|
/*
|
|
* Mark walreceiver as running in shared memory.
|
|
*
|
|
* Do this as early as possible, so that if we fail later on, we'll set
|
|
* state to STOPPED. If we die before this, the startup process will keep
|
|
* waiting for us to start up, until it times out.
|
|
*/
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
Assert(walrcv->pid == 0);
|
|
switch (walrcv->walRcvState) {
|
|
case WALRCV_STOPPING:
|
|
/* If we've already been requested to stop, don't start up. */
|
|
walrcv->walRcvState = WALRCV_STOPPED;
|
|
// fall through
|
|
case WALRCV_STOPPED:
|
|
SpinLockRelease(&walrcv->mutex);
|
|
ereport(WARNING, (errmsg("walreceiver requested to stop when starting up.")));
|
|
KillWalRcvWriter();
|
|
proc_exit(1);
|
|
break;
|
|
|
|
case WALRCV_STARTING:
|
|
/* The usual case */
|
|
break;
|
|
|
|
case WALRCV_RUNNING:
|
|
/* Shouldn't happen */
|
|
ereport(PANIC, (errmsg("walreceiver still running according to shared memory state")));
|
|
}
|
|
/* Advertise our PID so that the startup process can kill us */
|
|
if (walrcv->conn_target == REPCONNTARGET_PRIMARY || walrcv->conn_target == REPCONNTARGET_OBS)
|
|
walrcv->node_state = NODESTATE_NORMAL;
|
|
walrcv->pid = t_thrd.proc_cxt.MyProcPid;
|
|
walrcv->obsArchLatch = NULL;
|
|
#ifndef WIN32
|
|
walrcv->lwpId = syscall(SYS_gettid);
|
|
#else
|
|
walrcv->lwpId = (int)t_thrd.proc_cxt.MyProcPid;
|
|
#endif
|
|
walrcv->isRuning = false;
|
|
walrcv->walRcvState = WALRCV_RUNNING;
|
|
|
|
rc = memset_s(slotname, NAMEDATALEN, 0, NAMEDATALEN);
|
|
securec_check(rc, "\0", "\0");
|
|
|
|
rc = memset_s(conninfo, MAXCONNINFO, 0, MAXCONNINFO);
|
|
securec_check(rc, "\0", "\0");
|
|
|
|
/* Fetch information required to start streaming */
|
|
rc = strncpy_s(conninfo, MAXCONNINFO, (char *)walrcv->conninfo, MAXCONNINFO - 1);
|
|
securec_check(rc, "\0", "\0");
|
|
rc = strncpy_s(slotname, NAMEDATALEN, (char *)walrcv->slotname, NAMEDATALEN - 1);
|
|
securec_check(rc, "\0", "\0");
|
|
|
|
startpoint = walrcv->receiveStart;
|
|
|
|
/* Initialise to a sanish value */
|
|
walrcv->lastMsgSendTime = walrcv->lastMsgReceiptTime = walrcv->latestWalEndTime = GetCurrentTimestamp();
|
|
|
|
WalRcvCtlAcquireExitLock();
|
|
walrcv->walRcvCtlBlock = t_thrd.walreceiver_cxt.walRcvCtlBlock;
|
|
WalRcvCtlReleaseExitLock();
|
|
if(walrcv->conn_target != REPCONNTARGET_OBS) {
|
|
t_thrd.walreceiver_cxt.AmWalReceiverForFailover =
|
|
(walrcv->conn_target == REPCONNTARGET_DUMMYSTANDBY || walrcv->conn_target == REPCONNTARGET_STANDBY) ? true
|
|
: false;
|
|
t_thrd.walreceiver_cxt.AmWalReceiverForStandby = (walrcv->conn_target == REPCONNTARGET_STANDBY) ? true : false;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
/* using localport for channel identifier */
|
|
if (!t_thrd.walreceiver_cxt.AmWalReceiverForStandby) {
|
|
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
|
|
SpinLockAcquire(&hashmdata->mutex);
|
|
int walreplindex = hashmdata->current_repl;
|
|
SpinLockRelease(&hashmdata->mutex);
|
|
|
|
if (t_thrd.postmaster_cxt.ReplConnArray[walreplindex])
|
|
channel_identifier = t_thrd.postmaster_cxt.ReplConnArray[walreplindex]->localport;
|
|
}
|
|
}
|
|
else {
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
/* Arrange to clean up at walreceiver exit */
|
|
on_shmem_exit(WalRcvDie, 0);
|
|
|
|
/* Reset some signals that are accepted by postmaster but not here */
|
|
(void)gspqsignal(SIGHUP, WalRcvSigHupHandler); /* set flag to read config file */
|
|
(void)gspqsignal(SIGINT, SIG_IGN);
|
|
(void)gspqsignal(SIGTERM, WalRcvShutdownHandler); /* request shutdown */
|
|
(void)gspqsignal(SIGQUIT, WalRcvQuickDieHandler); /* hard crash time */
|
|
(void)gspqsignal(SIGALRM, SIG_IGN);
|
|
(void)gspqsignal(SIGPIPE, SIG_IGN);
|
|
(void)gspqsignal(SIGUSR1, sigusr1_handler);
|
|
(void)gspqsignal(SIGUSR2, SIG_IGN);
|
|
|
|
/* Reset some signals that are accepted by postmaster but not here */
|
|
(void)gspqsignal(SIGCHLD, SIG_DFL);
|
|
(void)gspqsignal(SIGTTIN, SIG_DFL);
|
|
(void)gspqsignal(SIGTTOU, SIG_DFL);
|
|
(void)gspqsignal(SIGCONT, SIG_DFL);
|
|
(void)gspqsignal(SIGWINCH, SIG_DFL);
|
|
|
|
/* We allow SIGQUIT (quickdie) at all times */
|
|
sigdelset(&t_thrd.libpq_cxt.BlockSig, SIGQUIT);
|
|
|
|
/*
|
|
* Create a resource owner to keep track of our resources (not clear that
|
|
* we need this, but may as well have one).
|
|
*/
|
|
t_thrd.utils_cxt.CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Receiver", MEMORY_CONTEXT_STORAGE);
|
|
|
|
/* Unblock signals (they were blocked when the postmaster forked us) */
|
|
gs_signal_setmask(&t_thrd.libpq_cxt.UnBlockSig, NULL);
|
|
(void)gs_signal_unblock_sigusr2();
|
|
if(walrcv->conn_target != REPCONNTARGET_OBS)
|
|
SetWalRcvDummyStandbySyncPercent(0);
|
|
|
|
t_thrd.xlog_cxt.ThisTimeLineID = GetRecoveryTargetTLI();
|
|
/* Establish the connection to the primary for XLOG streaming */
|
|
EnableWalRcvImmediateExit();
|
|
|
|
WalReceiverFuncTable[GET_FUNC_IDX].walrcv_connect(conninfo, &startpoint, slotname[0] != '\0' ? slotname : NULL,
|
|
channel_identifier);
|
|
DisableWalRcvImmediateExit();
|
|
|
|
if (GetWalRcvDummyStandbySyncPercent() == SYNC_DUMMY_STANDBY_END && walrcv->conn_target != REPCONNTARGET_OBS) {
|
|
Assert(t_thrd.walreceiver_cxt.AmWalReceiverForFailover == true);
|
|
ereport(LOG, (errmsg("Secondary Standby has no xlog")));
|
|
}
|
|
|
|
rc = memset_s(t_thrd.walreceiver_cxt.reply_message, sizeof(StandbyReplyMessage), 0, sizeof(StandbyReplyMessage));
|
|
securec_check(rc, "\0", "\0");
|
|
rc = memset_s(t_thrd.walreceiver_cxt.feedback_message, sizeof(StandbyHSFeedbackMessage), 0,
|
|
sizeof(StandbyHSFeedbackMessage));
|
|
securec_check(rc, "\0", "\0");
|
|
ereport(LOG, (errmsg("start replication at start point %X/%X", (uint32)(startpoint >> 32), (uint32)startpoint)));
|
|
|
|
last_recv_timestamp = GetCurrentTimestamp();
|
|
|
|
if (t_thrd.proc_cxt.DataDir) {
|
|
nRet = snprintf_s(t_thrd.walreceiver_cxt.gucconf_file, MAXPGPATH, MAXPGPATH - 1, "%s/postgresql.conf",
|
|
t_thrd.proc_cxt.DataDir);
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
|
|
nRet = snprintf_s(t_thrd.walreceiver_cxt.temp_guc_conf_file, MAXPGPATH, MAXPGPATH - 1, "%s/%s",
|
|
t_thrd.proc_cxt.DataDir, TEMP_CONF_FILE);
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
|
|
nRet = snprintf_s(t_thrd.walreceiver_cxt.gucconf_lock_file, MAXPGPATH, MAXPGPATH - 1, "%s/postgresql.conf.lock",
|
|
t_thrd.proc_cxt.DataDir);
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
}
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->isRuning = true;
|
|
walrcv->local_write_pos.queueid = 0;
|
|
walrcv->local_write_pos.queueoff = 0;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
if (!dummyStandbyMode) {
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->receivePtr = t_thrd.walreceiver_cxt.walRcvCtlBlock->writePtr =
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->flushPtr = GetXLogReplayRecPtr(NULL);
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
} else {
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->receivePtr = t_thrd.walreceiver_cxt.walRcvCtlBlock->writePtr =
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->flushPtr = startpoint;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
}
|
|
|
|
/*
|
|
* Synchronize standby's configure file once the HA build successfully.
|
|
*
|
|
* Note: If switchover in one hour, and there is no parameter is reloaded,
|
|
* the parameters set by client will be disabled. So we should do this.
|
|
*/
|
|
if(walrcv->conn_target != REPCONNTARGET_OBS) {
|
|
firstSynchStandbyFile();
|
|
set_disable_conn_mode();
|
|
}
|
|
|
|
knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL);
|
|
ereport(LOG, (errmsg("set knl_g_set_redo_finish_status to false when connecting to the primary")));
|
|
/*
|
|
* Prevent the effect of the last wallreceiver connection.
|
|
*/
|
|
InitHeartbeatTimestamp();
|
|
|
|
/* Loop until end-of-streaming or error */
|
|
for (;;) {
|
|
WalRcvrProcessData(&last_recv_timestamp, &ping_sent);
|
|
}
|
|
}
|
|
|
|
static TimestampTz GetHeartbeatLastReplyTimestamp()
|
|
{
|
|
int replindex;
|
|
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
|
|
SpinLockAcquire(&hashmdata->mutex);
|
|
replindex = hashmdata->current_repl;
|
|
SpinLockRelease(&hashmdata->mutex);
|
|
|
|
return get_last_reply_timestamp(replindex);
|
|
}
|
|
|
|
/* return timeout time */
|
|
static inline TimestampTz CalculateTimeout(TimestampTz last_reply_time)
|
|
{
|
|
return TimestampTzPlusMilliseconds(last_reply_time, u_sess->attr.attr_storage.wal_receiver_timeout / 2);
|
|
}
|
|
|
|
/*
|
|
* Check if time since last receive from primary has reached the
|
|
* configured limit. If we didn't receive anything new for half of receiver
|
|
* replication timeout, need ping the server.
|
|
*
|
|
* NB: the timeout stategy is different from the sender due to ping_sent,
|
|
* if pint_sent is set true, abnormal heartbeat for (wal_receiver_timeout / 2) will cause timeout.
|
|
*/
|
|
static bool WalRecCheckTimeOut(TimestampTz nowtime, TimestampTz last_recv_timestamp, bool ping_sent)
|
|
{
|
|
bool requestReply = false;
|
|
TimestampTz heartbeat = GetHeartbeatLastReplyTimestamp();
|
|
TimestampTz calculateTime = CalculateTimeout(heartbeat);
|
|
TimestampTz hbValid = TimestampTzPlusMilliseconds(heartbeat, u_sess->attr.attr_storage.wal_receiver_timeout * 2);
|
|
/*
|
|
* The host locally records the last communication time of the standby,
|
|
* when the time exceeds heartbeat_ Timeout: if the heartbeat message is not received,
|
|
* the heartbeat timeout will be triggered and walsender will exit.
|
|
* In switchover scenario, if the host exits the heartbeat thread,
|
|
* the standby will exit the walkreceiver thread. This causes switchover to fail,
|
|
* so heartbeat timeout is not judged during switchover.
|
|
*/
|
|
if (timestamptz_cmp_internal(nowtime, calculateTime) >= 0 && timestamptz_cmp_internal(hbValid, nowtime) >= 0 &&
|
|
(t_thrd.walreceiverfuncs_cxt.WalRcv != NULL &&
|
|
t_thrd.walreceiverfuncs_cxt.WalRcv->node_state != NODESTATE_STANDBY_WAITING)) {
|
|
ereport(ERROR, (errmsg(
|
|
"terminating walreceiver due to heartbeat timeout,now time(%s) last heartbeat time(%s) calculateTime(%s)",
|
|
timestamptz_to_str(nowtime), timestamptz_to_str(heartbeat), timestamptz_to_str(calculateTime))));
|
|
}
|
|
|
|
/* don't bail out if we're doing something that doesn't require timeouts */
|
|
if (u_sess->attr.attr_storage.wal_receiver_timeout <= 0) {
|
|
return requestReply;
|
|
}
|
|
|
|
/*
|
|
* Use static last_reply_time to avoid call GetHeartbeatLastReplyTimestamp frequently
|
|
* when last_recv_timestamp has meet the timeout condition
|
|
* but last heartbeat time doesn't.
|
|
*/
|
|
static TimestampTz last_reply_time = last_recv_timestamp;
|
|
if (timestamptz_cmp_internal(last_recv_timestamp, last_reply_time) > 0) {
|
|
last_reply_time = last_recv_timestamp;
|
|
}
|
|
|
|
TimestampTz timeout = CalculateTimeout(last_reply_time);
|
|
if (nowtime < timeout) {
|
|
return requestReply;
|
|
}
|
|
|
|
/* If heartbeat newer, use heartbeat to recalculate timeout. */
|
|
if (timestamptz_cmp_internal(heartbeat, last_reply_time) > 0) {
|
|
last_reply_time = heartbeat;
|
|
timeout = CalculateTimeout(last_reply_time);
|
|
}
|
|
|
|
/*
|
|
* We didn't receive anything new, for half of receiver
|
|
* replication timeout. Ping the server.
|
|
*/
|
|
if (nowtime >= timeout) {
|
|
WalReplicationTimestampInfo tpInfo;
|
|
if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2) {
|
|
WalReplicationTimestampToString(&tpInfo, nowtime, timeout, last_recv_timestamp, heartbeat);
|
|
ereport(DEBUG2,
|
|
(errmsg("now time(%s) timeout time(%s) last recv time(%s), heartbeat time(%s), ping_sent(%d)",
|
|
tpInfo.nowTimeStamp, tpInfo.timeoutStamp, tpInfo.lastRecStamp, tpInfo.heartbeatStamp, ping_sent)));
|
|
}
|
|
if (!ping_sent) {
|
|
requestReply = true;
|
|
} else {
|
|
knl_g_set_redo_finish_status(0);
|
|
ereport(LOG, (errmsg("set knl_g_set_redo_finish_status to false in WalRecCheckTimeOut")));
|
|
if (log_min_messages <= ERROR || client_min_messages <= ERROR) {
|
|
WalReplicationTimestampToString(&tpInfo, nowtime, timeout, last_recv_timestamp, heartbeat);
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_TIMED_OUT), errmsg("terminating walreceiver due to timeout "
|
|
"now time(%s) timeout time(%s) last recv time(%s) heartbeat time(%s)",
|
|
tpInfo.nowTimeStamp, tpInfo.timeoutStamp, tpInfo.lastRecStamp, tpInfo.heartbeatStamp)));
|
|
}
|
|
}
|
|
}
|
|
return requestReply;
|
|
}
|
|
|
|
/*
|
|
* Mark us as STOPPED in proc at exit.
|
|
*/
|
|
static void WalRcvDie(int code, Datum arg)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
|
|
/*
|
|
* Shutdown WalRcvWriter thread, clear the data receive buffer.
|
|
* Ensure that all WAL records received are flushed to disk.
|
|
*/
|
|
KillWalRcvWriter();
|
|
|
|
/* we have to set REDO_FINISH_STATUS_LOCAL to false here, or there will be problems in this case:
|
|
extremRTO is on, and DN received force finish signal, if cleanup is blocked, the force finish
|
|
signal will be ignored!
|
|
*/
|
|
knl_g_clear_local_redo_finish_status();
|
|
ereport(LOG, (errmsg("set local_redo_finish_status to false in WalRcvDie")));
|
|
|
|
walRcvDataCleanup();
|
|
|
|
LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
Assert(walrcv->walRcvState == WALRCV_RUNNING || walrcv->walRcvState == WALRCV_STOPPING);
|
|
walrcv->walRcvState = WALRCV_STOPPED;
|
|
walrcv->pid = 0;
|
|
walrcv->lwpId = 0;
|
|
walrcv->isRuning = false;
|
|
if (walrcv->walRcvCtlBlock != NULL)
|
|
walrcv->walRcvCtlBlock = NULL;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
WalRcvCtlAcquireExitLock();
|
|
walRcvCtlBlockFini();
|
|
WalRcvCtlReleaseExitLock();
|
|
LWLockRelease(WALWriteLock);
|
|
|
|
/* Terminate the connection gracefully. */
|
|
(WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_disconnect();
|
|
|
|
/* Wake up the startup process to notice promptly that we're gone */
|
|
WakeupRecovery();
|
|
|
|
if (t_thrd.libwalreceiver_cxt.recvBuf != NULL) {
|
|
PQfreemem(t_thrd.libwalreceiver_cxt.recvBuf);
|
|
t_thrd.libwalreceiver_cxt.recvBuf = NULL;
|
|
}
|
|
|
|
/* reset conn_channel */
|
|
errno_t rc = memset_s((void*)&walrcv->conn_channel,
|
|
sizeof(walrcv->conn_channel), 0, sizeof(walrcv->conn_channel));
|
|
securec_check_c(rc, "\0", "\0");
|
|
|
|
ereport(LOG, (errmsg("walreceiver thread shut down")));
|
|
}
|
|
|
|
/* SIGHUP: set flag to re-read config file at next convenient time */
|
|
static void WalRcvSigHupHandler(SIGNAL_ARGS)
|
|
{
|
|
t_thrd.walreceiver_cxt.got_SIGHUP = true;
|
|
}
|
|
|
|
/* SIGTERM: set flag for main loop, or shutdown immediately if safe */
|
|
static void WalRcvShutdownHandler(SIGNAL_ARGS)
|
|
{
|
|
t_thrd.walreceiver_cxt.got_SIGTERM = true;
|
|
}
|
|
|
|
/*
|
|
* WalRcvQuickDieHandler() occurs when signalled SIGQUIT by the postmaster.
|
|
*
|
|
* Some backend has bought the farm, so we need to stop what we're doing and
|
|
* exit.
|
|
*/
|
|
static void WalRcvQuickDieHandler(SIGNAL_ARGS)
|
|
{
|
|
gs_signal_setmask(&t_thrd.libpq_cxt.BlockSig, NULL);
|
|
|
|
/*
|
|
* We DO NOT want to run proc_exit() callbacks -- we're here because
|
|
* shared memory may be corrupted, so we don't want to try to clean up our
|
|
* transaction. Just nail the windows shut and get out of town. Now that
|
|
* there's an atexit callback to prevent third-party code from breaking
|
|
* things by calling exit() directly, we have to reset the callbacks
|
|
* explicitly to make this work as intended.
|
|
*/
|
|
on_exit_reset();
|
|
|
|
/*
|
|
* Note we do exit(2) not exit(0). This is to force the postmaster into a
|
|
* system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
|
|
* backend. This is necessary precisely because we don't clean up our
|
|
* shared memory state. (The "dead man switch" mechanism in pmsignal.c
|
|
* should ensure the postmaster sees this as a crash, too, but no harm in
|
|
* being doubly sure.)
|
|
*/
|
|
exit(2);
|
|
}
|
|
|
|
/*
|
|
* handle signal conditions from other processes
|
|
*/
|
|
static void sigusr1_handler(SIGNAL_ARGS)
|
|
{
|
|
int save_errno = errno;
|
|
|
|
gs_signal_setmask(&t_thrd.libpq_cxt.BlockSig, NULL);
|
|
|
|
if (t_thrd.walreceiverfuncs_cxt.WalRcv &&
|
|
t_thrd.walreceiverfuncs_cxt.WalRcv->node_state >= NODESTATE_SMART_DEMOTE_REQUEST &&
|
|
t_thrd.walreceiverfuncs_cxt.WalRcv->node_state <= NODESTATE_FAST_DEMOTE_REQUEST) {
|
|
/* Tell walreceiver process to start switchover */
|
|
t_thrd.walreceiver_cxt.start_switchover = true;
|
|
}
|
|
|
|
gs_signal_setmask(&t_thrd.libpq_cxt.UnBlockSig, NULL);
|
|
|
|
errno = save_errno;
|
|
}
|
|
|
|
/* Wal receiver is shut down? */
|
|
bool WalRcvIsShutdown(void)
|
|
{
|
|
return t_thrd.walreceiver_cxt.got_SIGTERM;
|
|
}
|
|
|
|
static void XLogWalRcvDataPageReplication(char *buf, Size len)
|
|
{
|
|
WalDataPageMessageHeader msghdr;
|
|
Assert(true == g_instance.attr.attr_storage.enable_mix_replication);
|
|
|
|
if (!g_instance.attr.attr_storage.enable_mix_replication) {
|
|
ereport(PANIC, (errmsg("WAL streaming isn't employed to sync all the replication data log.")));
|
|
}
|
|
if (len < sizeof(WalDataPageMessageHeader)) {
|
|
ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION),
|
|
errmsg_internal("invalid wal data page message received from primary")));
|
|
}
|
|
|
|
/* memcpy is required here for alignment reasons */
|
|
error_t rc = memcpy_s(&msghdr, sizeof(WalDataPageMessageHeader), buf, sizeof(WalDataPageMessageHeader));
|
|
securec_check(rc, "\0", "\0");
|
|
|
|
ProcessWalDataHeaderMessage(&msghdr);
|
|
|
|
buf += sizeof(WalDataPageMessageHeader);
|
|
len -= sizeof(WalDataPageMessageHeader);
|
|
|
|
if (len > WS_MAX_DATA_QUEUE_SIZE) {
|
|
Assert(false);
|
|
ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION),
|
|
errmsg_internal(
|
|
"unexpected wal data size %lu bytes exceeds the max receiving data queue size %u bytes",
|
|
len, WS_MAX_DATA_QUEUE_SIZE)));
|
|
}
|
|
if (u_sess->attr.attr_storage.HaModuleDebug) {
|
|
WSDataRcvCheck(buf, len);
|
|
}
|
|
WalDataRcvReceive(buf, len, 0);
|
|
}
|
|
|
|
/*
|
|
* Accept the message from XLOG stream, and process it.
|
|
*/
|
|
static void XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len)
|
|
{
|
|
errno_t errorno = EOK;
|
|
|
|
ereport(DEBUG5, (errmsg("received wal message type: %c", type)));
|
|
|
|
switch (type) {
|
|
case 'e': /* dummy standby sendxlog end. */
|
|
{
|
|
EndXLogMessage endXLogMessage;
|
|
CHECK_MSG_SIZE(len, EndXLogMessage, "invalid EndXLogMessage message received from Secondary Standby");
|
|
/* memcpy is required here for alignment reasons */
|
|
errorno = memcpy_s(&endXLogMessage, sizeof(EndXLogMessage), buf, sizeof(EndXLogMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
ProcessEndXLogMessage(&endXLogMessage);
|
|
break;
|
|
}
|
|
case 'w': /* WAL records */
|
|
{
|
|
WalDataMessageHeader msghdr;
|
|
if (len < sizeof(WalDataMessageHeader))
|
|
ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION),
|
|
errmsg_internal("invalid WAL message received from primary")));
|
|
/* memcpy is required here for alignment reasons */
|
|
errorno = memcpy_s(&msghdr, sizeof(WalDataMessageHeader), buf, sizeof(WalDataMessageHeader));
|
|
securec_check(errorno, "", "");
|
|
|
|
ProcessWalHeaderMessage(&msghdr);
|
|
|
|
buf += sizeof(WalDataMessageHeader);
|
|
len -= sizeof(WalDataMessageHeader);
|
|
if (IsExtremeRedo()) {
|
|
XLogWalRcvReceiveInBuf(buf, len, msghdr.dataStart);
|
|
} else {
|
|
XLogWalRcvReceive(buf, len, msghdr.dataStart);
|
|
}
|
|
break;
|
|
}
|
|
case 'd': /* Data page replication for the logical xlog */
|
|
{
|
|
XLogWalRcvDataPageReplication(buf, len);
|
|
break;
|
|
}
|
|
case 'k': /* Keepalive */
|
|
{
|
|
CHECK_MSG_SIZE(len, PrimaryKeepaliveMessage, "invalid keepalive message received from primary");
|
|
PrimaryKeepaliveMessage keepalive;
|
|
/* memcpy is required here for alignment reasons */
|
|
errorno = memcpy_s(&keepalive, sizeof(PrimaryKeepaliveMessage), buf, sizeof(PrimaryKeepaliveMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
|
|
ProcessKeepaliveMessage(&keepalive);
|
|
|
|
/* If the primary requested a reply, send one immediately */
|
|
if (keepalive.replyRequested)
|
|
XLogWalRcvSendReply(true, false);
|
|
break;
|
|
}
|
|
case 'p': /* Promote standby */
|
|
{
|
|
PrimarySwitchResponseMessage response;
|
|
CHECK_MSG_SIZE(len, PrimarySwitchResponseMessage, "invalid switchover response message received from primary")
|
|
/* memcpy is required here for alignment reasons */
|
|
errorno = memcpy_s(&response, sizeof(PrimarySwitchResponseMessage), buf,
|
|
sizeof(PrimarySwitchResponseMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
ProcessWalSndrMessage(&response.walEnd, response.sendTime);
|
|
|
|
ereport(LOG, (errmsg("received switchover response message from primary")));
|
|
ProcessSwitchResponse(response.switchResponse);
|
|
break;
|
|
}
|
|
case 'm': /* config file */
|
|
{
|
|
if (len < sizeof(ConfigModifyTimeMessage)) {
|
|
ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg_internal("invalid config file message")));
|
|
}
|
|
ConfigModifyTimeMessage primary_config_file;
|
|
/* memcpy is required here for alignment reasons */
|
|
errorno = memcpy_s(&primary_config_file, sizeof(ConfigModifyTimeMessage), buf,
|
|
sizeof(ConfigModifyTimeMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
t_thrd.walreceiver_cxt.Primary_config_modify_time = primary_config_file.config_modify_time;
|
|
buf += sizeof(ConfigModifyTimeMessage);
|
|
len -= sizeof(ConfigModifyTimeMessage);
|
|
ereport(LOG, (errmsg("walreceiver received gaussdb config file size: %lu", len)));
|
|
if (true != ProcessConfigFileMessage(buf, len)) {
|
|
ereport(LOG, (errmsg("walreceiver update config file failed")));
|
|
}
|
|
break;
|
|
}
|
|
case 'x': /* rm xlog */
|
|
{
|
|
RmXLogMessage rmXLogMessage;
|
|
CHECK_MSG_SIZE(len, RmXLogMessage, "invalid RmXLog message received from primary");
|
|
/* memcpy is required here for alignment reasons */
|
|
errorno = memcpy_s(&rmXLogMessage, sizeof(RmXLogMessage), buf, sizeof(RmXLogMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
ProcessRmXLogMessage(&rmXLogMessage);
|
|
break;
|
|
}
|
|
case 'a': /* pitr archive xlog */
|
|
{
|
|
ArchiveXlogMessage archiveXLogMessage;
|
|
CHECK_MSG_SIZE(len, ArchiveXlogMessage, "invalid ArchiveXlogMessage message received from primary");
|
|
/* memcpy is required here for alignment reasons */
|
|
errorno = memcpy_s(&archiveXLogMessage, sizeof(ArchiveXlogMessage), buf, sizeof(ArchiveXlogMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
ProcessArchiveXlogMessage(&archiveXLogMessage);
|
|
break;
|
|
}
|
|
default:
|
|
ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION),
|
|
errmsg_internal("invalid replication message type %c", type)));
|
|
}
|
|
}
|
|
|
|
void WSDataRcvCheck(char *data_buf, Size nbytes)
|
|
{
|
|
errno_t errorno = EOK;
|
|
char *cur_buf = NULL;
|
|
uint32 total_len = 0;
|
|
XLogRecPtr ref_xlog_ptr = InvalidXLogRecPtr;
|
|
|
|
cur_buf = data_buf;
|
|
|
|
errorno = memcpy_s(&total_len, sizeof(uint32), cur_buf, sizeof(uint32));
|
|
securec_check(errorno, "\0", "\0");
|
|
cur_buf += sizeof(uint32);
|
|
|
|
if (total_len != nbytes) {
|
|
Assert(false);
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("the corrupt data total len is %u bytes, the expected len is %lu bytes.", total_len, nbytes)));
|
|
}
|
|
|
|
if (cur_buf[0] != 'd') {
|
|
Assert(false);
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("the unexpected data flag is %c, the expected data flag is 'd'.", cur_buf[0])));
|
|
}
|
|
cur_buf += 1;
|
|
|
|
errorno = memcpy_s(&ref_xlog_ptr, sizeof(XLogRecPtr), cur_buf, sizeof(XLogRecPtr));
|
|
securec_check(errorno, "\0", "\0");
|
|
if (XLogRecPtrIsInvalid(ref_xlog_ptr)) {
|
|
Assert(false);
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("the start xlog employed for the wal data is invalid.")));
|
|
}
|
|
cur_buf += sizeof(XLogRecPtr);
|
|
|
|
errorno = memcpy_s(&ref_xlog_ptr, sizeof(XLogRecPtr), cur_buf, sizeof(XLogRecPtr));
|
|
securec_check(errorno, "\0", "\0");
|
|
if (XLogRecPtrIsInvalid(ref_xlog_ptr)) {
|
|
Assert(false);
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("the end xlog employed for the wal data is invalid.")));
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Receive all the required replication data page.
|
|
*/
|
|
static void WalDataRcvReceive(char *buf, Size nbytes, XLogRecPtr recptr)
|
|
{
|
|
/* buf unit */
|
|
uint32 expected_len = 0;
|
|
#ifdef DATA_DEBUG
|
|
pg_crc32 crc;
|
|
#endif
|
|
Size left_len = nbytes;
|
|
char *cur_buf = buf;
|
|
errno_t errorno = EOK;
|
|
|
|
/* 'd' means the replication data, 'w' means the xlog. */
|
|
char data_flag = 0;
|
|
XLogRecPtr received_ptr = InvalidXLogRecPtr;
|
|
bool empty_streaming_body = false;
|
|
|
|
while (left_len > 0) {
|
|
errorno = memcpy_s(&expected_len, sizeof(uint32), cur_buf, sizeof(uint32));
|
|
securec_check(errorno, "\0", "\0");
|
|
cur_buf += sizeof(uint32);
|
|
|
|
/* skip the 'd' flag */
|
|
data_flag = cur_buf[0];
|
|
Assert(data_flag == 'd' || data_flag == 'w');
|
|
cur_buf += 1;
|
|
|
|
if (data_flag == 'd') {
|
|
if (expected_len <= (sizeof(uint32) + 1 + sizeof(XLogRecPtr) * 2)) {
|
|
Assert(false);
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("the received wal data is unexpected %u bytes at least more than %lu bytes",
|
|
expected_len, (sizeof(uint32) + 1 + sizeof(XLogRecPtr) * 2))));
|
|
}
|
|
} else if (data_flag == 'w') {
|
|
if (expected_len < (sizeof(uint32) + 1 + sizeof(XLogRecPtr))) {
|
|
Assert(false);
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("the received xlog is unexpected %u bytes at least more than %lu bytes.",
|
|
expected_len, (sizeof(uint32) + 1 + sizeof(XLogRecPtr)))));
|
|
}
|
|
|
|
errorno = memcpy_s(&received_ptr, sizeof(XLogRecPtr), cur_buf, sizeof(XLogRecPtr));
|
|
securec_check(errorno, "\0", "\0");
|
|
|
|
if (expected_len == (sizeof(uint32) + 1 + sizeof(XLogRecPtr))) {
|
|
ereport(DEBUG2, (errmsg("received empty streaming body at %X/%X.", (uint32)(received_ptr >> 32),
|
|
(uint32)received_ptr)));
|
|
|
|
empty_streaming_body = true;
|
|
}
|
|
|
|
if (!empty_streaming_body) {
|
|
XLByteAdvance(recptr, (uint32)(expected_len - WAL_DATA_LEN));
|
|
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->receivePtr = recptr;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
}
|
|
} else {
|
|
Assert(false);
|
|
ereport(
|
|
ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("fail to push some wal data to the wal streaming writer queue: unexpected wal data flag %c.",
|
|
data_flag)));
|
|
}
|
|
|
|
if (!empty_streaming_body) {
|
|
/* skip the message header */
|
|
(void)PushToWriterQueue(cur_buf - sizeof(uint32) - 1, expected_len);
|
|
ereport(DEBUG5, (errmsg("push some wal data to the wal streaming writer queue: data flag %c, %u bytes.",
|
|
data_flag, expected_len)));
|
|
} else
|
|
empty_streaming_body = false;
|
|
|
|
cur_buf += (expected_len - (sizeof(uint32) + 1));
|
|
left_len -= expected_len;
|
|
|
|
wakeupWalRcvWriter();
|
|
}
|
|
Assert(left_len == 0);
|
|
|
|
wakeupWalRcvWriter();
|
|
}
|
|
|
|
void UpdateWalRcvCtl(struct WalRcvCtlBlock* walRcvCtlBlock, const XLogRecPtr recptr, const int segbytes)
|
|
{
|
|
const int64 recBufferSize = g_instance.attr.attr_storage.WalReceiverBufSize * 1024;
|
|
SpinLockAcquire(&walRcvCtlBlock->mutex);
|
|
walRcvCtlBlock->walFreeOffset += segbytes;
|
|
if (walRcvCtlBlock->walFreeOffset == recBufferSize && walRcvCtlBlock->walWriteOffset > 0 &&
|
|
walRcvCtlBlock->walReadOffset > 0) {
|
|
walRcvCtlBlock->walFreeOffset = 0;
|
|
}
|
|
walRcvCtlBlock->receivePtr = recptr;
|
|
SpinLockRelease(&walRcvCtlBlock->mutex);
|
|
}
|
|
|
|
inline void WalReceiverWaitCopyXLogCount(XLogRecPtr recptr, XLogRecPtr startptr, int64 walfreeoffset,
|
|
int64 walwriteoffset, int64 walreadoffset)
|
|
{
|
|
static uint64 waitCount = 0;
|
|
++waitCount;
|
|
const uint64 printInterval = 0xFFFF;
|
|
if ((waitCount & printInterval) == 0) {
|
|
const uint32 rightShiftSize = 32;
|
|
ereport(WARNING, (errmsg("WalReceiverWaitCopyXLogCount: recptr(%X:%X),walfreeoffset(%ld),"
|
|
"walwriteoffset(%ld),walreadoffset(%ld),startptr(%X:%X)",
|
|
(uint32)(recptr >> rightShiftSize), (uint32)recptr, walfreeoffset, walwriteoffset,
|
|
walreadoffset, (uint32)(startptr >> rightShiftSize), (uint32)startptr)));
|
|
}
|
|
}
|
|
/*
|
|
* Receive XLOG data into receiver buffer.
|
|
*/
|
|
static void XLogWalRcvReceiveInBuf(char *buf, Size nbytes, XLogRecPtr recptr)
|
|
{
|
|
int64 walfreeoffset;
|
|
int64 walwriteoffset;
|
|
int64 walreadoffset;
|
|
char *walrecvbuf = NULL;
|
|
XLogRecPtr startptr;
|
|
int64 recBufferSize = g_instance.attr.attr_storage.WalReceiverBufSize * 1024;
|
|
|
|
while (nbytes > 0) {
|
|
int segbytes;
|
|
int endPoint = recBufferSize;
|
|
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
if (t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset ==
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset) {
|
|
// no data to be flushed
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walStart = recptr;
|
|
} else if (t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset == recBufferSize &&
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset > 0 &&
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walReadOffset > 0) {
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset = 0;
|
|
}
|
|
walfreeoffset = t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset;
|
|
walwriteoffset = t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset;
|
|
walreadoffset = t_thrd.walreceiver_cxt.walRcvCtlBlock->walReadOffset;
|
|
walrecvbuf = t_thrd.walreceiver_cxt.walRcvCtlBlock->walReceiverBuffer;
|
|
startptr = t_thrd.walreceiver_cxt.walRcvCtlBlock->walStart;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
|
|
ereport(DEBUG5, (errmsg("XLogWalRcvReceive: recptr(%X:%X),nbytes(%d),"
|
|
"walfreeoffset(%ld),walwriteoffset(%ld),startptr(%X:%X)",
|
|
(uint32)(recptr >> 32), (uint32)recptr, (int)nbytes, walfreeoffset, walwriteoffset,
|
|
(uint32)(startptr >> 32), (uint32)startptr)));
|
|
|
|
XLogWalRcvSendReply(false, false);
|
|
|
|
Assert(walrecvbuf != NULL);
|
|
Assert(walfreeoffset <= recBufferSize);
|
|
Assert(walwriteoffset <= recBufferSize);
|
|
Assert(walreadoffset <= recBufferSize);
|
|
|
|
if (walfreeoffset < walreadoffset) {
|
|
endPoint = walreadoffset - 1;
|
|
}
|
|
|
|
if (endPoint == walfreeoffset) {
|
|
if (WalRcvWriterInProgress()) {
|
|
wakeupWalRcvWriter();
|
|
WakeupRecovery();
|
|
/* Process any requests or signals received recently */
|
|
ProcessWalRcvInterrupts();
|
|
/* Keepalived with primary when waiting flush wal data */
|
|
XLogWalRcvSendReply(false, false);
|
|
pg_usleep(1000);
|
|
WalReceiverWaitCopyXLogCount(recptr, startptr, walfreeoffset, walwriteoffset, walreadoffset);
|
|
} else {
|
|
walRcvDataCleanup();
|
|
WakeupRecovery();
|
|
ProcessWalRcvInterrupts();
|
|
}
|
|
continue;
|
|
}
|
|
|
|
segbytes = ((walfreeoffset + (int)nbytes > endPoint) ? (endPoint - walfreeoffset) : (int)nbytes);
|
|
|
|
/* Need to seek in the buffer? */
|
|
if (walfreeoffset != walwriteoffset) {
|
|
if (walfreeoffset > walwriteoffset) {
|
|
XLByteAdvance(startptr, (uint32)(walfreeoffset - walwriteoffset));
|
|
} else {
|
|
XLByteAdvance(startptr, (uint32)(recBufferSize - walwriteoffset + walfreeoffset));
|
|
}
|
|
if (!XLByteEQ(startptr, recptr)) {
|
|
/* wait for finishing flushing all wal data */
|
|
while (true) {
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
if (t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset ==
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset) {
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walStart = recptr;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
break;
|
|
}
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
|
|
if (WalRcvWriterInProgress()) {
|
|
wakeupWalRcvWriter();
|
|
WakeupRecovery();
|
|
/* Process any requests or signals received recently */
|
|
ProcessWalRcvInterrupts();
|
|
/* Keepalived with primary when waiting flush wal data */
|
|
XLogWalRcvSendReply(false, false);
|
|
pg_usleep(1000); /* 1ms */
|
|
} else {
|
|
walRcvDataCleanup();
|
|
WakeupRecovery();
|
|
ProcessWalRcvInterrupts();
|
|
}
|
|
}
|
|
|
|
ereport(FATAL,
|
|
(errmsg("Unexpected seek in the walreceiver buffer. "
|
|
"xlogrecptr is (%X:%X) but local xlogptr is (%X:%X)."
|
|
"nbyte is %lu, walfreeoffset is %ld walwriteoffset is %ld walreadoffset is %ld",
|
|
(uint32)(recptr >> 32), (uint32)recptr, (uint32)(startptr >> 32), (uint32)startptr,
|
|
nbytes, walfreeoffset, walwriteoffset, walreadoffset)));
|
|
}
|
|
}
|
|
|
|
/* OK to receive the logs */
|
|
Assert(walfreeoffset + segbytes <= recBufferSize);
|
|
errno_t errorno = memcpy_s(walrecvbuf + walfreeoffset, recBufferSize - walfreeoffset, buf, segbytes);
|
|
securec_check(errorno, "\0", "\0");
|
|
|
|
XLByteAdvance(recptr, (uint32)segbytes);
|
|
|
|
nbytes -= segbytes;
|
|
buf += segbytes;
|
|
|
|
// update shared memory
|
|
UpdateWalRcvCtl(t_thrd.walreceiver_cxt.walRcvCtlBlock, recptr, segbytes);
|
|
}
|
|
|
|
wakeupWalRcvWriter();
|
|
}
|
|
|
|
/*
|
|
* Receive XLOG data into receiver buffer.
|
|
*/
|
|
static void XLogWalRcvReceive(char *buf, Size nbytes, XLogRecPtr recptr)
|
|
{
|
|
int walfreeoffset;
|
|
int walwriteoffset;
|
|
char *walrecvbuf = NULL;
|
|
XLogRecPtr startptr;
|
|
int recBufferSize = g_instance.attr.attr_storage.WalReceiverBufSize * 1024;
|
|
|
|
while (nbytes > 0) {
|
|
int segbytes;
|
|
int endPoint = recBufferSize;
|
|
errno_t errorno = EOK;
|
|
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
if (t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset ==
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset) {
|
|
// no data to be flushed
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walStart = recptr;
|
|
} else if (t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset == recBufferSize &&
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset > 0) {
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset = 0;
|
|
}
|
|
walfreeoffset = t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset;
|
|
walwriteoffset = t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset;
|
|
walrecvbuf = t_thrd.walreceiver_cxt.walRcvCtlBlock->walReceiverBuffer;
|
|
startptr = t_thrd.walreceiver_cxt.walRcvCtlBlock->walStart;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
|
|
ereport(DEBUG5, (errmsg("XLogWalRcvReceive: recptr(%u:%X),nbytes(%d),"
|
|
"walfreeoffset(%d),walwriteoffset(%d),startptr(%u:%X)",
|
|
(uint32)(recptr >> 32), (uint32)recptr, (int)nbytes, walfreeoffset, walwriteoffset,
|
|
(uint32)(startptr >> 32), (uint32)startptr)));
|
|
|
|
XLogWalRcvSendReply(false, false);
|
|
|
|
Assert(walrecvbuf != NULL);
|
|
Assert(walfreeoffset <= recBufferSize);
|
|
Assert(walwriteoffset <= recBufferSize);
|
|
|
|
if (walfreeoffset < walwriteoffset) {
|
|
endPoint = walwriteoffset - 1;
|
|
}
|
|
|
|
if (endPoint == walfreeoffset) {
|
|
if (WalRcvWriterInProgress()) {
|
|
wakeupWalRcvWriter();
|
|
/* Process any requests or signals received recently */
|
|
ProcessWalRcvInterrupts();
|
|
/* Keepalived with primary when waiting flush wal data */
|
|
XLogWalRcvSendReply(false, false);
|
|
pg_usleep(1000);
|
|
} else
|
|
walRcvDataCleanup();
|
|
continue;
|
|
}
|
|
|
|
segbytes = (walfreeoffset + (int)nbytes > endPoint) ? endPoint - walfreeoffset : nbytes;
|
|
|
|
/* Need to seek in the buffer? */
|
|
if (walfreeoffset != walwriteoffset) {
|
|
uint32 waladvancelen = (walfreeoffset > walwriteoffset) ?
|
|
(uint32)(walfreeoffset - walwriteoffset) :
|
|
(uint32)(recBufferSize - walwriteoffset + walfreeoffset);
|
|
XLByteAdvance(startptr, waladvancelen);
|
|
if (!XLByteEQ(startptr, recptr)) {
|
|
/* wait for finishing flushing all wal data */
|
|
while (true) {
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
if (t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset ==
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset) {
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walStart = recptr;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
break;
|
|
}
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
|
|
if (WalRcvWriterInProgress()) {
|
|
wakeupWalRcvWriter();
|
|
/* Process any requests or signals received recently */
|
|
ProcessWalRcvInterrupts();
|
|
/* Keepalived with primary when waiting flush wal data */
|
|
XLogWalRcvSendReply(false, false);
|
|
pg_usleep(1000);
|
|
} else
|
|
walRcvDataCleanup();
|
|
}
|
|
|
|
ereport(FATAL,
|
|
(errmsg("Unexpected seek in the walreceiver buffer. "
|
|
"xlogrecptr is (%X:%X) but local xlogptr is (%X:%X).",
|
|
(uint32)(recptr >> 32), (uint32)recptr, (uint32)(startptr >> 32), (uint32)startptr)));
|
|
}
|
|
}
|
|
|
|
/* OK to receive the logs */
|
|
Assert(walfreeoffset + segbytes <= recBufferSize);
|
|
errorno = memcpy_s(walrecvbuf + walfreeoffset, recBufferSize, buf, segbytes);
|
|
securec_check(errorno, "\0", "\0");
|
|
|
|
XLByteAdvance(recptr, (uint32)segbytes);
|
|
|
|
nbytes -= segbytes;
|
|
buf += segbytes;
|
|
|
|
// update shared memory
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset += segbytes;
|
|
if (t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset == recBufferSize &&
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset > 0) {
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset = 0;
|
|
}
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->receivePtr = recptr;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
}
|
|
|
|
wakeupWalRcvWriter();
|
|
}
|
|
|
|
/*
|
|
* Send reply message to primary, indicating our current XLOG positions, oldest
|
|
* xmin and the current time.
|
|
*
|
|
* If 'force' is not true, the message is not sent unless enough time has
|
|
* passed since last status update to reach wal_receiver_status_internal (or
|
|
* if wal_receiver_status_interval is disabled altogether).
|
|
*
|
|
* If 'requestReply' is true, requests the server to reply immediately upon receiving
|
|
* this message. This is used for heartbearts, when approaching wal_receiver_timeout.
|
|
*/
|
|
void XLogWalRcvSendReply(bool force, bool requestReply)
|
|
{
|
|
char buf[sizeof(StandbyReplyMessage) + 1] = {0};
|
|
TimestampTz now;
|
|
XLogRecPtr receivePtr = InvalidXLogRecPtr;
|
|
XLogRecPtr writePtr = InvalidXLogRecPtr;
|
|
XLogRecPtr flushPtr = InvalidXLogRecPtr;
|
|
XLogRecPtr ReplayReadPtr = InvalidXLogRecPtr;
|
|
int rc = 0;
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
|
|
XLogRecPtr sndFlushPtr;
|
|
|
|
/*
|
|
* If the user doesn't want status to be reported to the master, be sure
|
|
* to exit before doing anything at all.
|
|
*/
|
|
if (!force && u_sess->attr.attr_storage.wal_receiver_status_interval <= 0)
|
|
return;
|
|
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
receivePtr = t_thrd.walreceiver_cxt.walRcvCtlBlock->receivePtr;
|
|
writePtr = t_thrd.walreceiver_cxt.walRcvCtlBlock->writePtr;
|
|
flushPtr = t_thrd.walreceiver_cxt.walRcvCtlBlock->flushPtr;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
|
|
/* Get current timestamp. */
|
|
now = GetCurrentTimestamp();
|
|
/*
|
|
* We can compare the write and flush positions to the last message we
|
|
* sent without taking any lock, but the apply position requires a spin
|
|
* lock, so we don't check that unless something else has changed or 10
|
|
* seconds have passed. This means that the apply log position will
|
|
* appear, from the master's point of view, to lag slightly, but since
|
|
* this is only for reporting purposes and only on idle systems, that's
|
|
* probably OK.
|
|
*/
|
|
if (!force && XLByteEQ(t_thrd.walreceiver_cxt.reply_message->receive, receivePtr) &&
|
|
XLByteEQ(t_thrd.walreceiver_cxt.reply_message->write, writePtr) &&
|
|
XLByteEQ(t_thrd.walreceiver_cxt.reply_message->flush, flushPtr) &&
|
|
!(TimestampDifferenceExceeds(t_thrd.walreceiver_cxt.reply_message->sendTime, now,
|
|
u_sess->attr.attr_storage.wal_receiver_status_interval * 1000) ||
|
|
TimestampDifferenceExceeds(now, t_thrd.walreceiver_cxt.reply_message->sendTime,
|
|
u_sess->attr.attr_storage.wal_receiver_status_interval * 1000))) {
|
|
return;
|
|
}
|
|
|
|
/* Construct a new message */
|
|
t_thrd.walreceiver_cxt.reply_message->receive = receivePtr;
|
|
t_thrd.walreceiver_cxt.reply_message->write = writePtr;
|
|
t_thrd.walreceiver_cxt.reply_message->flush = flushPtr;
|
|
if (!dummyStandbyMode) {
|
|
t_thrd.walreceiver_cxt.reply_message->apply = GetXLogReplayRecPtr(NULL, &ReplayReadPtr);
|
|
t_thrd.walreceiver_cxt.reply_message->applyRead = ReplayReadPtr;
|
|
} else {
|
|
t_thrd.walreceiver_cxt.reply_message->apply = flushPtr;
|
|
t_thrd.walreceiver_cxt.reply_message->applyRead = flushPtr;
|
|
}
|
|
t_thrd.walreceiver_cxt.reply_message->sendTime = now;
|
|
t_thrd.walreceiver_cxt.reply_message->replyRequested = requestReply;
|
|
|
|
SpinLockAcquire(&hashmdata->mutex);
|
|
t_thrd.walreceiver_cxt.reply_message->peer_role = hashmdata->current_mode;
|
|
SpinLockRelease(&hashmdata->mutex);
|
|
t_thrd.walreceiver_cxt.reply_message->peer_state = get_local_dbstate();
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->receiver_received_location = receivePtr;
|
|
walrcv->receiver_write_location = writePtr;
|
|
walrcv->receiver_flush_location = flushPtr;
|
|
walrcv->receiver_replay_location = t_thrd.walreceiver_cxt.reply_message->apply;
|
|
sndFlushPtr = walrcv->sender_flush_location;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
if (u_sess->attr.attr_storage.HaModuleDebug) {
|
|
ereport(LOG, (errmsg("HA-XLogWalRcvSendReply: sending receive %X/%X write %X/%X flush %X/%X apply %X/%X",
|
|
(uint32)(t_thrd.walreceiver_cxt.reply_message->receive >> 32),
|
|
(uint32)t_thrd.walreceiver_cxt.reply_message->receive,
|
|
(uint32)(t_thrd.walreceiver_cxt.reply_message->write >> 32),
|
|
(uint32)t_thrd.walreceiver_cxt.reply_message->write,
|
|
(uint32)(t_thrd.walreceiver_cxt.reply_message->flush >> 32),
|
|
(uint32)t_thrd.walreceiver_cxt.reply_message->flush,
|
|
(uint32)(t_thrd.walreceiver_cxt.reply_message->apply >> 32),
|
|
(uint32)t_thrd.walreceiver_cxt.reply_message->apply)));
|
|
}
|
|
|
|
/* Prepend with the message type and send it. */
|
|
buf[0] = 'r';
|
|
rc = memcpy_s(&buf[1], sizeof(StandbyReplyMessage), t_thrd.walreceiver_cxt.reply_message,
|
|
sizeof(StandbyReplyMessage));
|
|
securec_check(rc, "\0", "\0");
|
|
(WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_send(buf, sizeof(StandbyReplyMessage) + 1);
|
|
WalRcvRefreshPercentCountStartLsn(sndFlushPtr, flushPtr);
|
|
}
|
|
|
|
/*
|
|
* Send hot standby feedback message to primary, plus the current time,
|
|
* in case they don't have a watch.
|
|
*/
|
|
static void XLogWalRcvSendHSFeedback(void)
|
|
{
|
|
char buf[sizeof(StandbyHSFeedbackMessage) + 1];
|
|
TimestampTz now;
|
|
TransactionId xmin;
|
|
errno_t rc = 0;
|
|
/*
|
|
* If the user doesn't want status to be reported to the master, be sure
|
|
* to exit before doing anything at all.
|
|
*/
|
|
if (u_sess->attr.attr_storage.wal_receiver_status_interval <= 0 || !u_sess->attr.attr_storage.hot_standby_feedback)
|
|
return;
|
|
|
|
/* Get current timestamp. */
|
|
now = GetCurrentTimestamp();
|
|
/*
|
|
* Send feedback at most once per wal_receiver_status_interval.
|
|
*/
|
|
if (!TimestampDifferenceExceeds(t_thrd.walreceiver_cxt.feedback_message->sendTime, now,
|
|
u_sess->attr.attr_storage.wal_receiver_status_interval * 1000)) {
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* If Hot Standby is not yet active there is nothing to send. Check this
|
|
* after the interval has expired to reduce number of calls.
|
|
*/
|
|
if (!HotStandbyActive())
|
|
return;
|
|
|
|
/*
|
|
* Make the expensive call to get the oldest xmin once we are certain
|
|
* everything else has been checked.
|
|
*/
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
/* Get updated RecentGlobalXmin */
|
|
GetSnapshotData(u_sess->utils_cxt.CurrentSnapshotData, true, true);
|
|
#endif
|
|
xmin = GetOldestXmin(NULL);
|
|
|
|
/*
|
|
* Always send feedback message.
|
|
*/
|
|
t_thrd.walreceiver_cxt.feedback_message->sendTime = now;
|
|
t_thrd.walreceiver_cxt.feedback_message->xmin = xmin;
|
|
|
|
ereport(DEBUG2,
|
|
(errmsg("sending hot standby feedback xmin " XID_FMT, t_thrd.walreceiver_cxt.feedback_message->xmin)));
|
|
|
|
/* Prepend with the message type and send it. */
|
|
buf[0] = 'h';
|
|
rc = memcpy_s(&buf[1], sizeof(StandbyHSFeedbackMessage), t_thrd.walreceiver_cxt.feedback_message,
|
|
sizeof(StandbyHSFeedbackMessage));
|
|
securec_check(rc, "\0", "\0");
|
|
(WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_send(buf, sizeof(StandbyHSFeedbackMessage) + 1);
|
|
}
|
|
|
|
/*
|
|
* Process WaldataHeaderMessage received from sender message type is 'd'.
|
|
*/
|
|
static void ProcessWalDataHeaderMessage(WalDataPageMessageHeader *msghdr)
|
|
{
|
|
/* Use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
TimestampTz lastMsgReceiptTime = GetCurrentTimestamp();
|
|
|
|
Assert(msghdr);
|
|
|
|
/* Update shared-memory status */
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->lastMsgSendTime = msghdr->sendTime;
|
|
walrcv->lastMsgReceiptTime = lastMsgReceiptTime;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
if (log_min_messages <= DEBUG2) {
|
|
MakeDebugLog(msghdr->sendTime, lastMsgReceiptTime,
|
|
"wal receive waldata header data sendtime %s receipttime %s");
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Process walHeaderMessage received from sender, message type is 'w'.
|
|
*/
|
|
static void ProcessWalHeaderMessage(WalDataMessageHeader *msghdr)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
TimestampTz lastMsgReceiptTime = GetCurrentTimestamp();
|
|
|
|
/* Update shared-memory status */
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->lastMsgSendTime = msghdr->sendTime;
|
|
walrcv->lastMsgReceiptTime = lastMsgReceiptTime;
|
|
|
|
walrcv->sender_sent_location = msghdr->sender_sent_location;
|
|
|
|
walrcv->sender_flush_location = msghdr->sender_flush_location;
|
|
|
|
walrcv->sender_replay_location = msghdr->sender_replay_location;
|
|
|
|
walrcv->sender_write_location = msghdr->sender_write_location;
|
|
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
/* Update the catchup flag */
|
|
wal_catchup = msghdr->catchup;
|
|
|
|
ereport(DEBUG2, (errmsg("wal receiver data message: start %X/%X end %X/%X "
|
|
"sender_write %X/%X sender_flush %X/%X sender_replay %X/%X",
|
|
(uint32)(msghdr->dataStart >> 32), (uint32)msghdr->dataStart,
|
|
(uint32)(msghdr->sender_sent_location >> 32), (uint32)msghdr->sender_sent_location,
|
|
(uint32)(msghdr->sender_write_location >> 32), (uint32)msghdr->sender_write_location,
|
|
(uint32)(msghdr->sender_flush_location >> 32), (uint32)msghdr->sender_flush_location,
|
|
(uint32)(msghdr->sender_replay_location >> 32), (uint32)msghdr->sender_replay_location)));
|
|
|
|
if (log_min_messages <= DEBUG2) {
|
|
MakeDebugLog(msghdr->sendTime, lastMsgReceiptTime, "wal receive wal header data sendtime %s receipttime %s");
|
|
ereport(DEBUG2, (errmsg("replication apply delay %d ms transfer latency %d ms", GetReplicationApplyDelay(),
|
|
GetReplicationTransferLatency())));
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Process ProcessKeepaliveMessage received from sender, message type is 'k'.
|
|
*/
|
|
static void ProcessKeepaliveMessage(PrimaryKeepaliveMessage *keepalive)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
TimestampTz lastMsgReceiptTime = GetCurrentTimestamp();
|
|
|
|
/* Update shared-memory status */
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->peer_role = keepalive->peer_role;
|
|
walrcv->peer_state = keepalive->peer_state;
|
|
walrcv->sender_sent_location = keepalive->walEnd;
|
|
walrcv->lastMsgSendTime = keepalive->sendTime;
|
|
walrcv->lastMsgReceiptTime = lastMsgReceiptTime;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
wal_catchup = keepalive->catchup;
|
|
|
|
if (log_min_messages <= DEBUG2) {
|
|
MakeDebugLog(keepalive->sendTime, lastMsgReceiptTime, "wal receive keep alive data sendtime %s receipttime %s");
|
|
ereport(DEBUG2, (errmsg("replication apply delay %d ms transfer latency %d ms", GetReplicationApplyDelay(),
|
|
GetReplicationTransferLatency())));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* update pg_control file.
|
|
* only wal receiver set system_identifier.
|
|
*/
|
|
void SyncSystemIdentifier(void)
|
|
{
|
|
if (t_thrd.walreceiver_cxt.control_file_writed == 0) {
|
|
ereport(LOG, (errmsg("update secondary system identifier")));
|
|
|
|
SetSystemIdentifier(sync_system_identifier);
|
|
|
|
t_thrd.walreceiver_cxt.control_file_writed++;
|
|
UpdateControlFile();
|
|
}
|
|
}
|
|
|
|
void ProcessWSRmXLog(void)
|
|
{
|
|
char xlog_path[MAXPGPATH] = {0};
|
|
int nRet = 0;
|
|
|
|
nRet = snprintf_s(xlog_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", t_thrd.proc_cxt.DataDir, XLOGDIR);
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
|
|
DIR *dir = NULL;
|
|
struct dirent *de;
|
|
|
|
dir = AllocateDir(xlog_path);
|
|
while ((de = ReadDir(dir, xlog_path)) != NULL) {
|
|
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
|
|
continue;
|
|
|
|
char path[MAXPGPATH] = {0};
|
|
nRet = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", xlog_path, de->d_name);
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
|
|
(void)unlink(path);
|
|
}
|
|
FreeDir(dir);
|
|
}
|
|
|
|
void ProcessWSRmData(void)
|
|
{
|
|
DIR *dir = NULL;
|
|
struct dirent *de = NULL;
|
|
char data_path[MAXPGPATH] = {0};
|
|
int nRet = 0;
|
|
|
|
nRet = snprintf_s(data_path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", t_thrd.proc_cxt.DataDir, DUMMY_STANDBY_DATADIR);
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
|
|
dir = AllocateDir(data_path);
|
|
while ((de = ReadDir(dir, data_path)) != NULL) {
|
|
if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
|
|
continue;
|
|
|
|
char path[MAXPGPATH] = {0};
|
|
nRet = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%s", data_path, de->d_name);
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
|
|
ereport(LOG, (errmsg("delete data path %s on the dummy standby.", path)));
|
|
|
|
(void)unlink(path);
|
|
}
|
|
FreeDir(dir);
|
|
}
|
|
|
|
/*
|
|
* Process RmXLogMessage received from primary sender, message type is 'x'.
|
|
* Refence searchBCMFiles
|
|
*/
|
|
static void ProcessRmXLogMessage(RmXLogMessage *rmXLogMessage)
|
|
{
|
|
XLogRecPtr lastFlushPtr = InvalidXLogRecPtr;
|
|
|
|
// check command source
|
|
if (rmXLogMessage->peer_role != PRIMARY_MODE) {
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_OPERATION),
|
|
errmsg("rm xlog comand is not from primary,peer_role=%d", rmXLogMessage->peer_role)));
|
|
}
|
|
|
|
ereport(DEBUG2, (errmsg("received rm xlog message")));
|
|
|
|
walRcvDataCleanup();
|
|
WalRcvXLogClose();
|
|
|
|
SpinLockAcquire(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
lastFlushPtr = t_thrd.walreceiver_cxt.walRcvCtlBlock->flushPtr;
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->receivePtr = t_thrd.walreceiver_cxt.walRcvCtlBlock->writePtr =
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->flushPtr = InvalidXLogRecPtr;
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walStart = InvalidXLogRecPtr;
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walWriteOffset = t_thrd.walreceiver_cxt.walRcvCtlBlock->walFreeOffset = 0;
|
|
t_thrd.walreceiver_cxt.walRcvCtlBlock->walReadOffset = 0;
|
|
SpinLockRelease(&t_thrd.walreceiver_cxt.walRcvCtlBlock->mutex);
|
|
|
|
/* Now rm the WAL files. */
|
|
ProcessWSRmXLog();
|
|
if (!XLByteEQ(lastFlushPtr, InvalidXLogRecPtr)) {
|
|
ereport(LOG, (errmsg("rm xlog command done, lastFlushPtr=%X/%X", (uint32)(lastFlushPtr >> 32),
|
|
(uint32)(lastFlushPtr))));
|
|
}
|
|
SyncSystemIdentifier();
|
|
|
|
/* Now rm the data file the same operation copyed from ProcessRmDataMessage() */
|
|
if (g_instance.attr.attr_storage.enable_mix_replication) {
|
|
while (true) {
|
|
if (!ws_dummy_data_writer_use_file) {
|
|
CloseWSDataFileOnDummyStandby();
|
|
break;
|
|
} else
|
|
pg_usleep(100000); /* sleep 0.1 s */
|
|
}
|
|
|
|
ProcessWSRmData();
|
|
}
|
|
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* Process RmXLogMessage received from primary sender, message type is 'e'.
|
|
* Refence searchBCMFiles
|
|
*/
|
|
static void ProcessEndXLogMessage(EndXLogMessage *endXLogMessage)
|
|
{
|
|
ereport(dummyStandbyMode ? DEBUG2 : LOG, (errmsg("sync Secondary Standby xlog done")));
|
|
|
|
if (endXLogMessage->percent == SYNC_DUMMY_STANDBY_END) {
|
|
SetWalRcvDummyStandbySyncPercent(SYNC_DUMMY_STANDBY_END);
|
|
|
|
if (dummyStandbyMode)
|
|
SyncSystemIdentifier();
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Process ProcessArchiveXlogMessage received from primary sender, message type is 'a'.
|
|
*/
|
|
const static int GET_ARCHIVE_XLOG_RETRY_MAX = 50;
|
|
const static int ARCHIVE_XLOG_DELAY = 10000;
|
|
|
|
static void ProcessArchiveXlogMessage(const ArchiveXlogMessage* archive_xlog_message)
|
|
{
|
|
ereport(LOG, (errmsg("get archive xlog message :%X/%X", (uint32)(archive_xlog_message->targetLsn >> 32),
|
|
(uint32)(archive_xlog_message->targetLsn))));
|
|
errno_t errorno = EOK;
|
|
volatile unsigned int *pitr_task_status = &g_instance.archive_obs_cxt.pitr_task_status;
|
|
unsigned int expected = PITR_TASK_NONE;
|
|
int failed_times = 0;
|
|
while (pg_atomic_compare_exchange_u32(pitr_task_status, &expected, PITR_TASK_GET) == false) {
|
|
/* some task arrived before last task done if expected not equal to NONE */
|
|
expected = PITR_TASK_NONE;
|
|
pg_usleep(ARCHIVE_XLOG_DELAY); // sleep 0.01s
|
|
if (failed_times++ >= GET_ARCHIVE_XLOG_RETRY_MAX) {
|
|
ereport(WARNING, (errmsg("get archive xlog message :%X/%X, but not finished",
|
|
(uint32)(archive_xlog_message->targetLsn >> 32),
|
|
(uint32)(archive_xlog_message->targetLsn))));
|
|
return;
|
|
}
|
|
}
|
|
errorno = memcpy_s(&g_instance.archive_obs_cxt.archive_task,
|
|
sizeof(ArchiveXlogMessage) + 1,
|
|
archive_xlog_message,
|
|
sizeof(ArchiveXlogMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
wakeupObsArchLatch();
|
|
}
|
|
|
|
/*
|
|
* Send switchover request message to primary, indicating the current time.
|
|
*/
|
|
static void XLogWalRcvSendSwitchRequest(void)
|
|
{
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
char buf[sizeof(StandbySwitchRequestMessage) + 1];
|
|
TimestampTz local_now;
|
|
errno_t errorno = EOK;
|
|
|
|
/* Get current timestamp. */
|
|
local_now = GetCurrentTimestamp();
|
|
t_thrd.walreceiver_cxt.request_message->sendTime = local_now;
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
t_thrd.walreceiver_cxt.request_message->demoteMode = walrcv->node_state;
|
|
walrcv->node_state = NODESTATE_STANDBY_WAITING;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
/* Prepend with the message type and send it. */
|
|
buf[0] = 's';
|
|
errorno = memcpy_s(&buf[1], sizeof(StandbySwitchRequestMessage), t_thrd.walreceiver_cxt.request_message,
|
|
sizeof(StandbySwitchRequestMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
(WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_send(buf, sizeof(StandbySwitchRequestMessage) + 1);
|
|
|
|
SendPostmasterSignal(PMSIGNAL_UPDATE_WAITING);
|
|
ereport(LOG, (errmsg("send %s switchover request to primary",
|
|
DemoteModeDesc(t_thrd.walreceiver_cxt.request_message->demoteMode))));
|
|
}
|
|
|
|
/*
|
|
* Send archive xlog response message to primary.
|
|
*/
|
|
static void WalRecvSendArchiveXlogResponse()
|
|
{
|
|
char buf[sizeof(ArchiveXlogResponseMeeeage) + 1];
|
|
ArchiveXlogResponseMeeeage reply;
|
|
errno_t errorno = EOK;
|
|
reply.pitr_result = g_instance.archive_obs_cxt.pitr_finish_result;
|
|
reply.targetLsn = g_instance.archive_obs_cxt.archive_task.targetLsn;
|
|
buf[0] = 'a';
|
|
errorno = memcpy_s(&buf[1],
|
|
sizeof(ArchiveXlogResponseMeeeage),
|
|
&reply,
|
|
sizeof(ArchiveXlogResponseMeeeage));
|
|
securec_check(errorno, "\0", "\0");
|
|
libpqrcv_send(buf, sizeof(ArchiveXlogResponseMeeeage) + 1);
|
|
|
|
ereport(LOG,
|
|
(errmsg("WalRecvSendArchiveXlogResponse %d %X/%X", reply.pitr_result,
|
|
(uint32)(reply.targetLsn >> 32), (uint32)(reply.targetLsn))));
|
|
|
|
}
|
|
|
|
/*
|
|
* process switchover response message from primary.
|
|
*/
|
|
static void ProcessSwitchResponse(int code)
|
|
{
|
|
switch (code) {
|
|
case SWITCHOVER_PROMOTE_REQUEST: /* promote standby */
|
|
t_thrd.walreceiverfuncs_cxt.WalRcv->node_state = NODESTATE_STANDBY_PROMOTING;
|
|
SendPostmasterSignal(PMSIGNAL_PROMOTE_STANDBY);
|
|
break;
|
|
|
|
case SWITCHOVER_DEMOTE_FAILED: /* demote failed */
|
|
ereport(WARNING, (errmsg("primary demote failed")));
|
|
break;
|
|
|
|
case SWITCHOVER_DEMOTE_CATCHUP_EXIST: /* demote failed */
|
|
t_thrd.walreceiverfuncs_cxt.WalRcv->node_state = NODESTATE_NORMAL;
|
|
SendPostmasterSignal(PMSIGNAL_ROLLBACK_STANDBY_PROMOTE);
|
|
ereport(LOG, (errmsg("catchup is still alive, switchover failed")));
|
|
break;
|
|
|
|
default:
|
|
ereport(WARNING, (errmsg("unknown switchover response message received from primary")));
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Keep track of important messages from primary.
|
|
*/
|
|
static void ProcessWalSndrMessage(XLogRecPtr *walEnd, TimestampTz sendTime)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
|
|
TimestampTz lastMsgReceiptTime = GetCurrentTimestamp();
|
|
|
|
/* Update shared-memory status */
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
if (XLByteLT(walrcv->latestWalEnd, *walEnd))
|
|
walrcv->latestWalEndTime = sendTime;
|
|
walrcv->latestWalEnd = *walEnd;
|
|
walrcv->sender_sent_location = *walEnd;
|
|
walrcv->lastMsgSendTime = sendTime;
|
|
walrcv->lastMsgReceiptTime = lastMsgReceiptTime;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
if (log_min_messages <= DEBUG2) {
|
|
int applyDelay;
|
|
applyDelay = GetReplicationApplyDelay();
|
|
MakeDebugLog(sendTime, lastMsgReceiptTime, "wal receive walSndMsg sendtime %s receipttime %s");
|
|
/* apply delay is not available */
|
|
if (applyDelay == -1) {
|
|
ereport(DEBUG2,
|
|
(errmsg("replication apply delay (N/A) transfer latency %d ms", GetReplicationTransferLatency())));
|
|
} else {
|
|
ereport(DEBUG2, (errmsg("replication apply delay %d ms transfer latency %d ms", applyDelay,
|
|
GetReplicationTransferLatency())));
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* get xlog sync percent between walsender and walreceiver.
|
|
*/
|
|
int GetSyncPercent(XLogRecPtr startLsn, XLogRecPtr totalLsn, XLogRecPtr hasCompleteLsn)
|
|
{
|
|
int64 needSyncLogSum = 0;
|
|
int64 haveSyncLog = 0;
|
|
int haveCompletePer = 0;
|
|
int basePercent = 0;
|
|
XLogSegNo segno;
|
|
|
|
if (XLByteLE(totalLsn, hasCompleteLsn)) {
|
|
return HIGHEST_PERCENT;
|
|
}
|
|
|
|
/*
|
|
* When startLsn is invalid, standby is under streaming, so count percent base on
|
|
* maxlsn - wal_keep_segments*XLOG_SEG_SIZE, and percent is 90% ~ 100%
|
|
*/
|
|
if (XLogRecPtrIsInvalid(startLsn)) {
|
|
XLByteToSeg(totalLsn, segno);
|
|
if (segno < WalGetSyncCountWindow()) {
|
|
startLsn = InvalidXLogRecPtr;
|
|
} else {
|
|
startLsn = totalLsn - (WalGetSyncCountWindow() * XLOG_SEG_SIZE);
|
|
basePercent = STREAMING_START_PERCENT;
|
|
}
|
|
}
|
|
|
|
needSyncLogSum = XLogDiff(totalLsn, startLsn);
|
|
haveSyncLog = XLogDiff(hasCompleteLsn, startLsn) + SizeOfXLogRecord - 1;
|
|
|
|
if (needSyncLogSum == 0) {
|
|
return HIGHEST_PERCENT;
|
|
} else {
|
|
haveCompletePer = (int)((HIGHEST_PERCENT - basePercent) * (haveSyncLog * 1.0 / needSyncLogSum)) + basePercent;
|
|
}
|
|
if (haveCompletePer > HIGHEST_PERCENT) {
|
|
haveCompletePer = HIGHEST_PERCENT;
|
|
} else if (haveCompletePer < 0) {
|
|
haveCompletePer = 0;
|
|
}
|
|
return haveCompletePer;
|
|
}
|
|
|
|
static bool am_cascade_standby(void)
|
|
{
|
|
if (t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE &&
|
|
t_thrd.postmaster_cxt.HaShmData->is_cascade_standby) {
|
|
return true;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* transfer the server mode to string.
|
|
*/
|
|
const char* wal_get_role_string(ServerMode mode, bool getPeerRole)
|
|
{
|
|
switch (mode) {
|
|
case NORMAL_MODE:
|
|
return "Normal";
|
|
case PRIMARY_MODE:
|
|
return "Primary";
|
|
case STANDBY_MODE:
|
|
{
|
|
if (am_cascade_standby() && !getPeerRole) {
|
|
return "Cascade Standby";
|
|
} else {
|
|
return "Standby";
|
|
}
|
|
}
|
|
case CASCADE_STANDBY_MODE:
|
|
return "Cascade Standby";
|
|
case PENDING_MODE:
|
|
return "Pending";
|
|
case UNKNOWN_MODE:
|
|
return "Unknown";
|
|
default:
|
|
ereport(WARNING, (errmsg("invalid server mode:%d", (int)mode)));
|
|
break;
|
|
}
|
|
return "Unknown";
|
|
}
|
|
|
|
const char *wal_get_rebuild_reason_string(HaRebuildReason reason)
|
|
{
|
|
switch (reason) {
|
|
case NONE_REBUILD:
|
|
return "Normal";
|
|
case WALSEGMENT_REBUILD:
|
|
return "WAL segment removed";
|
|
case CONNECT_REBUILD:
|
|
return "Disconnected";
|
|
case VERSION_REBUILD:
|
|
return "Version not matched";
|
|
case MODE_REBUILD:
|
|
return "Mode not matched";
|
|
case SYSTEMID_REBUILD:
|
|
return "System id not matched";
|
|
case TIMELINE_REBUILD:
|
|
return "Timeline not matched";
|
|
default:
|
|
break;
|
|
}
|
|
return "Unknown";
|
|
}
|
|
|
|
static void wal_get_ha_rebuild_reason_with_dummy(char *buildReason, ServerMode local_role, bool isRunning)
|
|
{
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
|
|
int nRet = 0;
|
|
load_server_mode();
|
|
if (local_role == NORMAL_MODE || local_role == PRIMARY_MODE || IS_DISASTER_RECOVER_MODE) {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Normal");
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
return;
|
|
}
|
|
|
|
if (t_thrd.postmaster_cxt.ReplConnArray[1] != NULL && walrcv->conn_target == REPCONNTARGET_PRIMARY) {
|
|
if (hashmdata->repl_reason[1] == NONE_REBUILD && isRunning) {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Normal");
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
} else if (hashmdata->repl_reason[1] == NONE_REBUILD && !isRunning) {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Connecting...");
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
} else {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s",
|
|
wal_get_rebuild_reason_string(hashmdata->repl_reason[1]));
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
}
|
|
} else if (t_thrd.postmaster_cxt.ReplConnArray[2] != NULL && walrcv->conn_target == REPCONNTARGET_DUMMYSTANDBY) {
|
|
if (hashmdata->repl_reason[2] == NONE_REBUILD && isRunning) {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Normal");
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
} else if (hashmdata->repl_reason[2] == NONE_REBUILD && !isRunning) {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Connecting...");
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
} else {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s",
|
|
wal_get_rebuild_reason_string(hashmdata->repl_reason[2]));
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
}
|
|
} else {
|
|
nRet = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Disconnected");
|
|
securec_check_ss(nRet, "\0", "\0");
|
|
}
|
|
}
|
|
|
|
static void wal_get_ha_rebuild_reason_with_multi(char *buildReason, ServerMode local_role, bool isRunning)
|
|
{
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
|
|
int rcs = 0;
|
|
load_server_mode();
|
|
if (local_role == NORMAL_MODE || local_role == PRIMARY_MODE) {
|
|
rcs = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Normal");
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
return;
|
|
}
|
|
|
|
if (t_thrd.postmaster_cxt.ReplConnArray[hashmdata->current_repl] != NULL &&
|
|
(walrcv->conn_target == REPCONNTARGET_PRIMARY || am_cascade_standby()
|
|
|| IS_DISASTER_RECOVER_MODE)) {
|
|
if (hashmdata->repl_reason[hashmdata->current_repl] == NONE_REBUILD && isRunning) {
|
|
rcs = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Normal");
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
} else if (hashmdata->repl_reason[hashmdata->current_repl] == NONE_REBUILD && !isRunning) {
|
|
rcs = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Connecting...");
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
} else {
|
|
rcs = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s",
|
|
wal_get_rebuild_reason_string(hashmdata->repl_reason[hashmdata->current_repl]));
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
}
|
|
} else {
|
|
rcs = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s", "Disconnected");
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
}
|
|
}
|
|
|
|
static void wal_get_ha_rebuild_reason(char *buildReason, ServerMode local_role, bool isRunning)
|
|
{
|
|
if (IS_DN_DUMMY_STANDYS_MODE())
|
|
wal_get_ha_rebuild_reason_with_dummy(buildReason, local_role, isRunning);
|
|
else
|
|
wal_get_ha_rebuild_reason_with_multi(buildReason, local_role, isRunning);
|
|
}
|
|
|
|
/*
|
|
* Descriptions: Returns activity of walreveiver, including pids and xlog
|
|
* locations received from primary o cascading server.
|
|
*/
|
|
Datum pg_stat_get_wal_receiver(PG_FUNCTION_ARGS)
|
|
{
|
|
#define PG_STAT_GET_WAL_RECEIVER_COLS 15
|
|
ReturnSetInfo *rsinfo = (ReturnSetInfo *)fcinfo->resultinfo;
|
|
TupleDesc tupdesc = NULL;
|
|
Tuplestorestate *tupstore = NULL;
|
|
MemoryContext per_query_ctx;
|
|
MemoryContext oldcontext;
|
|
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
|
|
|
|
char location[MAXFNAMELEN] = {0};
|
|
|
|
XLogRecPtr rcvRedo;
|
|
XLogRecPtr rcvWrite;
|
|
XLogRecPtr rcvFlush;
|
|
bool isRuning = false;
|
|
|
|
XLogRecPtr sndSent;
|
|
XLogRecPtr sndWrite;
|
|
XLogRecPtr sndFlush;
|
|
XLogRecPtr sndReplay;
|
|
XLogRecPtr rcvReceived;
|
|
XLogRecPtr syncStart;
|
|
|
|
int sync_percent = 0;
|
|
ServerMode peer_role;
|
|
DbState peer_state;
|
|
DbState local_state;
|
|
ServerMode local_role;
|
|
char localip[IP_LEN] = {0};
|
|
char remoteip[IP_LEN] = {0};
|
|
int localport = 0;
|
|
int remoteport = 0;
|
|
Datum values[PG_STAT_GET_WAL_RECEIVER_COLS];
|
|
bool nulls[PG_STAT_GET_WAL_RECEIVER_COLS];
|
|
errno_t rc = EOK;
|
|
|
|
/* check to see if caller supports us returning a tuplestore */
|
|
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) {
|
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("set-valued function called in context that cannot accept a set")));
|
|
return (Datum)0;
|
|
}
|
|
if (!(rsinfo->allowedModes & SFRM_Materialize))
|
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("materialize mode required, but it is not allowed in this context")));
|
|
|
|
/* Build a tuple descriptor for our result type */
|
|
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
|
|
ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("return type must be a row type")));
|
|
|
|
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
|
|
oldcontext = MemoryContextSwitchTo(per_query_ctx);
|
|
|
|
tupstore = tuplestore_begin_heap(true, false, u_sess->attr.attr_memory.work_mem);
|
|
rsinfo->returnMode = SFRM_Materialize;
|
|
rsinfo->setResult = tupstore;
|
|
rsinfo->setDesc = tupdesc;
|
|
|
|
(void)MemoryContextSwitchTo(oldcontext);
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
isRuning = walrcv->isRuning;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
if (walrcv->pid == 0 || !isRuning)
|
|
return (Datum)0;
|
|
|
|
SpinLockAcquire(&hashmdata->mutex);
|
|
local_role = hashmdata->current_mode;
|
|
SpinLockRelease(&hashmdata->mutex);
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
localport = walrcv->conn_channel.localport;
|
|
remoteport = walrcv->conn_channel.remoteport;
|
|
rc = strncpy_s(localip, IP_LEN, (char *)walrcv->conn_channel.localhost, IP_LEN - 1);
|
|
securec_check(rc, "\0", "\0");
|
|
rc = strncpy_s(remoteip, IP_LEN, (char *)walrcv->conn_channel.remotehost, IP_LEN - 1);
|
|
securec_check(rc, "\0", "\0");
|
|
localip[IP_LEN - 1] = '\0';
|
|
remoteip[IP_LEN - 1] = '\0';
|
|
peer_role = walrcv->peer_role;
|
|
peer_state = walrcv->peer_state;
|
|
load_server_mode();
|
|
local_state = get_local_dbstate();
|
|
|
|
sndSent = walrcv->sender_sent_location;
|
|
sndWrite = walrcv->sender_write_location;
|
|
sndFlush = walrcv->sender_flush_location;
|
|
sndReplay = walrcv->sender_replay_location;
|
|
|
|
rcvReceived = walrcv->receiver_received_location;
|
|
rcvRedo = walrcv->receiver_replay_location;
|
|
rcvWrite = walrcv->receiver_write_location;
|
|
rcvFlush = walrcv->receiver_flush_location;
|
|
syncStart = walrcv->syncPercentCountStart;
|
|
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
rc = memset_s(nulls, sizeof(nulls), 0, sizeof(nulls));
|
|
securec_check_c(rc, "\0", "\0");
|
|
values[0] = Int32GetDatum(walrcv->lwpId);
|
|
|
|
if (!superuser() && !(isOperatoradmin(GetUserId()) && u_sess->attr.attr_security.operation_mode)) {
|
|
/*
|
|
* Only superusers can see details. Other users only get the pid
|
|
* value to know it's a receiver, but no details.
|
|
*/
|
|
rc = memset_s(&nulls[1], PG_STAT_GET_WAL_RECEIVER_COLS - 1, true, PG_STAT_GET_WAL_RECEIVER_COLS - 1);
|
|
securec_check(rc, "\0", "\0");
|
|
} else {
|
|
/* local_role */
|
|
values[1] = CStringGetTextDatum(wal_get_role_string(local_role));
|
|
/* peer_role */
|
|
values[2] = CStringGetTextDatum(wal_get_role_string(peer_role, true));
|
|
/* peer_state */
|
|
values[3] = CStringGetTextDatum(wal_get_db_state_string(peer_state));
|
|
/* state */
|
|
values[4] = CStringGetTextDatum(wal_get_db_state_string(local_state));
|
|
|
|
/* sender_sent_location */
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(sndSent >> 32),
|
|
(uint32)sndSent);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[5] = CStringGetTextDatum(location);
|
|
|
|
/* sender_write_location */
|
|
if (sndWrite == 0)
|
|
SETXLOGLOCATION(sndWrite, sndSent)
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(sndWrite >> 32),
|
|
(uint32)sndWrite);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[6] = CStringGetTextDatum(location);
|
|
|
|
/* sender_flush_location */
|
|
if (sndFlush == 0)
|
|
SETXLOGLOCATION(sndFlush, sndSent)
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(sndFlush >> 32),
|
|
(uint32)sndFlush);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[7] = CStringGetTextDatum(location);
|
|
|
|
/* sender_replay_location */
|
|
if (sndReplay == 0)
|
|
SETXLOGLOCATION(sndReplay, sndSent)
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(sndReplay >> 32),
|
|
(uint32)sndReplay);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[8] = CStringGetTextDatum(location);
|
|
|
|
/* receiver_received_location */
|
|
if (rcvReceived == 0)
|
|
SETXLOGLOCATION(rcvReceived, sndSent)
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(rcvReceived >> 32),
|
|
(uint32)rcvReceived);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[9] = CStringGetTextDatum(location);
|
|
|
|
/* receiver_write_location */
|
|
if (rcvWrite == 0)
|
|
SETXLOGLOCATION(rcvWrite, sndSent)
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(rcvWrite >> 32),
|
|
(uint32)rcvWrite);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[10] = CStringGetTextDatum(location);
|
|
|
|
/* receiver_flush_location */
|
|
if (rcvFlush == 0)
|
|
SETXLOGLOCATION(rcvFlush, sndSent)
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(rcvFlush >> 32),
|
|
(uint32)rcvFlush);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[11] = CStringGetTextDatum(location);
|
|
|
|
/* receiver_replay_location */
|
|
if (rcvRedo == 0)
|
|
SETXLOGLOCATION(rcvRedo, sndSent)
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%X/%X", (uint32)(rcvRedo >> 32),
|
|
(uint32)rcvRedo);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[12] = CStringGetTextDatum(location);
|
|
|
|
/* sync_percent */
|
|
sync_percent = GetSyncPercent(syncStart, sndFlush, rcvFlush);
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%d%%", sync_percent);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[13] = CStringGetTextDatum(location);
|
|
|
|
/* channel */
|
|
rc = snprintf_s(location, sizeof(location), sizeof(location) - 1, "%s:%d<--%s:%d", localip, localport, remoteip,
|
|
remoteport);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
values[14] = CStringGetTextDatum(location);
|
|
}
|
|
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
|
|
|
|
/* clean up and return the tuplestore */
|
|
tuplestore_donestoring(tupstore);
|
|
|
|
return (Datum)0;
|
|
}
|
|
|
|
/*
|
|
* Returns activity of ha state, including static connections,local role,
|
|
* database state and rebuild reason if database state is unnormal.
|
|
*/
|
|
Datum pg_stat_get_stream_replications(PG_FUNCTION_ARGS)
|
|
{
|
|
#define PG_STAT_GET_STREAM_REPLICATIONS_COLS 4
|
|
ReturnSetInfo *rsinfo = (ReturnSetInfo *)fcinfo->resultinfo;
|
|
TupleDesc tupdesc = NULL;
|
|
Tuplestorestate *tupstore = NULL;
|
|
MemoryContext per_query_ctx;
|
|
MemoryContext oldcontext;
|
|
|
|
Datum values[PG_STAT_GET_STREAM_REPLICATIONS_COLS];
|
|
bool nulls[PG_STAT_GET_STREAM_REPLICATIONS_COLS];
|
|
|
|
ServerMode local_role;
|
|
int static_connnections = 0;
|
|
char buildReason[MAXFNAMELEN] = {0};
|
|
|
|
volatile HaShmemData *hashmdata = t_thrd.postmaster_cxt.HaShmData;
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
bool isRunning = false;
|
|
DbState db_state = UNKNOWN_STATE;
|
|
errno_t rc = 0;
|
|
load_server_mode();
|
|
|
|
/* check to see if caller supports us returning a tuplestore */
|
|
if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) {
|
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmsg("set-valued function called in context that cannot accept a set")));
|
|
return (Datum)0;
|
|
}
|
|
if (!(rsinfo->allowedModes & SFRM_Materialize))
|
|
ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("materialize mode required, but it is not "
|
|
"allowed in this context")));
|
|
|
|
/* Build a tuple descriptor for our result type */
|
|
if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
|
|
ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("return type must be a row type")));
|
|
|
|
per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
|
|
oldcontext = MemoryContextSwitchTo(per_query_ctx);
|
|
|
|
tupstore = tuplestore_begin_heap(true, false, u_sess->attr.attr_memory.work_mem);
|
|
rsinfo->returnMode = SFRM_Materialize;
|
|
rsinfo->setResult = tupstore;
|
|
rsinfo->setDesc = tupdesc;
|
|
|
|
(void)MemoryContextSwitchTo(oldcontext);
|
|
rc = memset_s(nulls, sizeof(nulls), 0, sizeof(nulls));
|
|
securec_check_c(rc, "\0", "\0");
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
isRunning = walrcv->isRuning;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
SpinLockAcquire(&hashmdata->mutex);
|
|
local_role = hashmdata->current_mode;
|
|
static_connnections = hashmdata->repl_list_num;
|
|
SpinLockRelease(&hashmdata->mutex);
|
|
|
|
wal_get_ha_rebuild_reason(buildReason, local_role, isRunning);
|
|
/* If walreceiver has detected a system id not match reason before, then use it. */
|
|
for (int i = 1; i <= hashmdata->repl_list_num; i++) {
|
|
if ((hashmdata->repl_reason[i] == SYSTEMID_REBUILD || hashmdata->repl_reason[i] == WALSEGMENT_REBUILD)
|
|
&& strcmp(buildReason, "Connecting...") == 0) {
|
|
rc = snprintf_s(buildReason, MAXFNAMELEN, MAXFNAMELEN - 1, "%s",
|
|
wal_get_rebuild_reason_string(hashmdata->repl_reason[i]));
|
|
securec_check_ss(rc, "\0", "\0");
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (local_role == UNKNOWN_MODE)
|
|
ereport(WARNING, (errmsg("server mode is unknown.")));
|
|
|
|
/* local role */
|
|
values[0] = CStringGetTextDatum(wal_get_role_string(local_role));
|
|
/* static connections */
|
|
values[1] = Int32GetDatum(static_connnections);
|
|
/* db state */
|
|
db_state = get_local_dbstate();
|
|
values[2] = CStringGetTextDatum(wal_get_db_state_string(db_state));
|
|
/* build_reason */
|
|
values[3] = CStringGetTextDatum(buildReason);
|
|
|
|
tuplestore_putvalues(tupstore, tupdesc, values, nulls);
|
|
|
|
/* clean up and return the tuplestore */
|
|
tuplestore_donestoring(tupstore);
|
|
|
|
return (Datum)0;
|
|
}
|
|
|
|
/*
|
|
* we check the configure file every check_file_timeout, if
|
|
* the configure has been modified, send the modify time to standy.
|
|
*/
|
|
static void ConfigFileTimer(void)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
if (g_instance.attr.attr_common.sync_config_strategy == NONE_NODE) {
|
|
return;
|
|
}
|
|
#endif
|
|
struct stat statbuf;
|
|
char bufTime[sizeof(ConfigModifyTimeMessage) + 1];
|
|
TimestampTz nowTime;
|
|
|
|
if (t_thrd.walreceiver_cxt.check_file_timeout > 0) {
|
|
nowTime = GetCurrentTimestamp();
|
|
if (TimestampDifferenceExceeds(t_thrd.walreceiver_cxt.last_sendfilereply_timestamp, nowTime,
|
|
t_thrd.walreceiver_cxt.check_file_timeout) ||
|
|
TimestampDifferenceExceeds(nowTime, t_thrd.walreceiver_cxt.last_sendfilereply_timestamp,
|
|
t_thrd.walreceiver_cxt.check_file_timeout)) {
|
|
errno_t errorno = EOK;
|
|
ereport(LOG, (errmsg("time is up to send file")));
|
|
if (lstat(t_thrd.walreceiver_cxt.gucconf_file, &statbuf) != 0) {
|
|
if (errno != ENOENT) {
|
|
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file or directory \"%s\": %m",
|
|
t_thrd.walreceiver_cxt.gucconf_file)));
|
|
}
|
|
}
|
|
/* the configure file in standby has been change yet. */
|
|
if (t_thrd.walreceiver_cxt.standby_config_modify_time != statbuf.st_mtime) {
|
|
ereport(LOG,
|
|
(errmsg("statbuf.st_mtime:%d is not equal to config_modify_time:%d", (int)(statbuf.st_mtime),
|
|
(int)(t_thrd.walreceiver_cxt.standby_config_modify_time))));
|
|
t_thrd.walreceiver_cxt.reply_modify_message->config_modify_time = 0;
|
|
} else {
|
|
ereport(LOG, (errmsg("the config file of standby has no change:%d", (int)(statbuf.st_mtime))));
|
|
t_thrd.walreceiver_cxt.reply_modify_message->config_modify_time =
|
|
t_thrd.walreceiver_cxt.Primary_config_modify_time;
|
|
}
|
|
bufTime[0] = 'A';
|
|
errorno = memcpy_s(&bufTime[1], sizeof(ConfigModifyTimeMessage),
|
|
t_thrd.walreceiver_cxt.reply_modify_message, sizeof(ConfigModifyTimeMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
(WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_send(bufTime, sizeof(ConfigModifyTimeMessage) + 1);
|
|
/* save the current timestamp */
|
|
t_thrd.walreceiver_cxt.last_sendfilereply_timestamp = GetCurrentTimestamp();
|
|
}
|
|
}
|
|
}
|
|
|
|
static bool ProcessConfigFileMessage(char *buf, Size len)
|
|
{
|
|
struct stat statbuf;
|
|
ErrCode retcode = CODE_OK;
|
|
ConfFileLock filelock = { NULL, 0 };
|
|
char conf_bak[MAXPGPATH];
|
|
int ret = 0;
|
|
char **reserve_item = NULL;
|
|
|
|
ret = snprintf_s(conf_bak, MAXPGPATH, MAXPGPATH - 1, "%s/%s", t_thrd.proc_cxt.DataDir, CONFIG_BAK_FILENAME);
|
|
securec_check_ss(ret, "\0", "\0");
|
|
|
|
if (lstat(t_thrd.walreceiver_cxt.gucconf_file, &statbuf) != 0) {
|
|
if (errno != ENOENT)
|
|
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file or directory \"%s\": %m",
|
|
t_thrd.walreceiver_cxt.gucconf_file)));
|
|
return false;
|
|
}
|
|
|
|
reserve_item = alloc_opt_lines(RESERVE_SIZE);
|
|
if (reserve_item == NULL) {
|
|
ereport(LOG, (errmsg("Alloc mem for reserved parameters failed")));
|
|
return false;
|
|
}
|
|
|
|
/* 1. lock postgresql.conf */
|
|
if (get_file_lock(t_thrd.walreceiver_cxt.gucconf_lock_file, &filelock) != CODE_OK) {
|
|
release_opt_lines(reserve_item);
|
|
ereport(LOG, (errmsg("Modify the postgresql.conf failed : can not get the file lock ")));
|
|
return false;
|
|
}
|
|
|
|
/* 2. load reserved parameters to reserve_item(array in memeory) */
|
|
retcode = copy_asyn_lines(t_thrd.walreceiver_cxt.gucconf_file, reserve_item, g_reserve_param);
|
|
if (retcode != CODE_OK) {
|
|
release_opt_lines(reserve_item);
|
|
release_file_lock(&filelock);
|
|
ereport(LOG, (errmsg("copy asynchronization items failed: %s\n", gs_strerror(retcode))));
|
|
return false;
|
|
}
|
|
|
|
/* 3. genreate temp files and fill it with content from primary. */
|
|
retcode = generate_temp_file(buf, conf_bak, len);
|
|
if (retcode != CODE_OK) {
|
|
release_opt_lines(reserve_item);
|
|
release_file_lock(&filelock);
|
|
ereport(LOG, (errmsg("create %s failed: %s\n", conf_bak, gs_strerror(retcode))));
|
|
return false;
|
|
}
|
|
|
|
/* 4. adjust the info with reserved parameters, and sync to temp file. */
|
|
retcode = update_temp_file(conf_bak, reserve_item, g_reserve_param);
|
|
if (retcode != CODE_OK) {
|
|
release_file_lock(&filelock);
|
|
release_opt_lines(reserve_item);
|
|
ereport(LOG, (errmsg("update gaussdb config file failed: %s\n", gs_strerror(retcode))));
|
|
return false;
|
|
} else {
|
|
ereport(LOG, (errmsg("update gaussdb config file success")));
|
|
if (rename(conf_bak, t_thrd.walreceiver_cxt.gucconf_file) != 0) {
|
|
release_file_lock(&filelock);
|
|
release_opt_lines(reserve_item);
|
|
ereport(LOG, (errcode_for_file_access(), errmsg("could not rename \"%s\" to \"%s\": %m", conf_bak,
|
|
t_thrd.walreceiver_cxt.gucconf_file)));
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* save the modify time of standby config file */
|
|
if (lstat(t_thrd.walreceiver_cxt.gucconf_file, &statbuf) != 0) {
|
|
if (errno != ENOENT) {
|
|
release_file_lock(&filelock);
|
|
release_opt_lines(reserve_item);
|
|
ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file or directory \"%s\": %m",
|
|
t_thrd.walreceiver_cxt.gucconf_file)));
|
|
return false;
|
|
}
|
|
}
|
|
t_thrd.walreceiver_cxt.standby_config_modify_time = statbuf.st_mtime;
|
|
|
|
if (statbuf.st_size > 0) {
|
|
copy_file_internal(t_thrd.walreceiver_cxt.gucconf_file, t_thrd.walreceiver_cxt.temp_guc_conf_file, true);
|
|
ereport(DEBUG1, (errmsg("copy %s to %s success", t_thrd.walreceiver_cxt.gucconf_file,
|
|
t_thrd.walreceiver_cxt.temp_guc_conf_file)));
|
|
}
|
|
|
|
release_file_lock(&filelock);
|
|
release_opt_lines(reserve_item);
|
|
|
|
/* notify postmaster the config file has changed */
|
|
if (gs_signal_send(PostmasterPid, SIGHUP) != 0) {
|
|
ereport(WARNING, (errmsg("send SIGHUP to PM failed")));
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* firstSynchStandbyFile - Synchronise standby's configure file once the HA
|
|
* build successfully.
|
|
*/
|
|
static void firstSynchStandbyFile(void)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
if (g_instance.attr.attr_common.sync_config_strategy == NONE_NODE) {
|
|
return;
|
|
}
|
|
#endif
|
|
char bufTime[sizeof(ConfigModifyTimeMessage) + 1];
|
|
errno_t errorno = EOK;
|
|
|
|
bufTime[0] = 'A';
|
|
t_thrd.walreceiver_cxt.reply_modify_message->config_modify_time = 0;
|
|
errorno = memcpy_s(&bufTime[1], sizeof(ConfigModifyTimeMessage), t_thrd.walreceiver_cxt.reply_modify_message,
|
|
sizeof(ConfigModifyTimeMessage));
|
|
securec_check(errorno, "\0", "\0");
|
|
(WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_send(bufTime, sizeof(ConfigModifyTimeMessage) + 1);
|
|
}
|
|
|
|
void GetPrimaryServiceAddress(char *address, size_t address_len)
|
|
{
|
|
if (address == NULL || address_len == 0 || t_thrd.walreceiverfuncs_cxt.WalRcv == NULL)
|
|
return;
|
|
|
|
bool is_running = false;
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
int rc = 0;
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
is_running = walrcv->isRuning;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
|
|
if (walrcv->pid == 0 || !is_running)
|
|
return;
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
rc = snprintf_s(address, address_len, (address_len - 1), "%s:%d", walrcv->conn_channel.remotehost,
|
|
walrcv->conn_channel.remoteservice);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
void MakeDebugLog(TimestampTz sendTime, TimestampTz lastMsgReceiptTime, const char *msgFmt)
|
|
{
|
|
char *sendtimeStr = NULL;
|
|
char *receipttimeStr = NULL;
|
|
/* Copy because timestamptz_to_str returns a static buffer */
|
|
sendtimeStr = pstrdup(timestamptz_to_str(sendTime));
|
|
receipttimeStr = pstrdup(timestamptz_to_str(lastMsgReceiptTime));
|
|
ereport(DEBUG2, (errmsg(msgFmt, sendtimeStr, receipttimeStr)));
|
|
pfree(sendtimeStr);
|
|
sendtimeStr = NULL;
|
|
pfree(receipttimeStr);
|
|
receipttimeStr = NULL;
|
|
return;
|
|
}
|
|
|
|
/* Set start send lsn for current walsender (only called in walsender) */
|
|
void WalRcvSetPercentCountStartLsn(XLogRecPtr startLsn)
|
|
{
|
|
/* use volatile pointer to prevent code rearrangement */
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
walrcv->syncPercentCountStart = startLsn;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
}
|
|
|
|
/* Set start send lsn for current walsender (only called in walsender) */
|
|
static void WalRcvRefreshPercentCountStartLsn(XLogRecPtr currentMaxLsn, XLogRecPtr currentDoneLsn)
|
|
{
|
|
uint64 coundWindow = ((uint64)WalGetSyncCountWindow() * XLOG_SEG_SIZE);
|
|
volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv;
|
|
XLogRecPtr baseStartLsn = InvalidXLogRecPtr;
|
|
|
|
if (!walrcv) {
|
|
return;
|
|
}
|
|
|
|
/* clear syncPercentCountStart when recevier's redo equal to sender's flush */
|
|
if (XLByteEQ(currentMaxLsn, currentDoneLsn)) {
|
|
WalRcvSetPercentCountStartLsn(InvalidXLogRecPtr);
|
|
return;
|
|
}
|
|
|
|
/* if syncPercentCountStart is valid, it means last counting cycle has not been done. */
|
|
SpinLockAcquire(&walrcv->mutex);
|
|
baseStartLsn = walrcv->syncPercentCountStart;
|
|
SpinLockRelease(&walrcv->mutex);
|
|
if (!XLByteEQ(baseStartLsn, InvalidXLogRecPtr)) {
|
|
return;
|
|
}
|
|
|
|
/* starting a new counting cycle. */
|
|
if (XLogDiff(currentMaxLsn, currentDoneLsn) < coundWindow) {
|
|
WalRcvSetPercentCountStartLsn(InvalidXLogRecPtr);
|
|
} else {
|
|
WalRcvSetPercentCountStartLsn(currentDoneLsn);
|
|
}
|
|
}
|