2415 lines
92 KiB
C++
2415 lines
92 KiB
C++
/*
|
|
* Copyright (c) 2020 Huawei Technologies Co.,Ltd.
|
|
*
|
|
* openGauss is licensed under Mulan PSL v2.
|
|
* You can use this software according to the terms and conditions of the Mulan PSL v2.
|
|
* You may obtain a copy of Mulan PSL v2 at:
|
|
*
|
|
* http://license.coscl.org.cn/MulanPSL2
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
|
|
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
|
|
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
|
|
* See the Mulan PSL v2 for more details.
|
|
* ---------------------------------------------------------------------------------------
|
|
*
|
|
* ss_dms_callback.cpp
|
|
* Provide callback interface for called inside DMS API
|
|
*
|
|
* IDENTIFICATION
|
|
* src/gausskernel/ddes/adapter/ss_dms_callback.cpp
|
|
*
|
|
* ---------------------------------------------------------------------------------------
|
|
*/
|
|
|
|
#include "c.h"
|
|
#include "pgstat.h"
|
|
#include "postgres.h"
|
|
#include "miscadmin.h"
|
|
#include "postmaster/postmaster.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/palloc.h"
|
|
#include "utils/resowner.h"
|
|
#include "utils/postinit.h"
|
|
#include "catalog/pg_authid.h"
|
|
#include "storage/procarray.h"
|
|
#include "access/xact.h"
|
|
#include "access/transam.h"
|
|
#include "access/csnlog.h"
|
|
#include "access/xlog.h"
|
|
#include "access/multi_redo_api.h"
|
|
#include "ddes/dms/ss_dms_bufmgr.h"
|
|
#include "storage/buf/buf_internals.h"
|
|
#include "ddes/dms/ss_transaction.h"
|
|
#include "storage/smgr/segment.h"
|
|
#include "storage/sinvaladt.h"
|
|
#include "replication/walsender_private.h"
|
|
#include "replication/walreceiver.h"
|
|
#include "replication/ss_disaster_cluster.h"
|
|
#include "ddes/dms/ss_dms_callback.h"
|
|
#include "ddes/dms/ss_switchover.h"
|
|
#include "ddes/dms/ss_reform_common.h"
|
|
#include "ddes/dms/ss_dms_bufmgr.h"
|
|
#include "storage/file/fio_device.h"
|
|
#include "storage/buf/bufmgr.h"
|
|
#include "storage/buf/buf_internals.h"
|
|
#include "storage/buf/bufmgr.h"
|
|
#include "storage/ipc.h"
|
|
|
|
/* Forward declaration: ReleaseResource() is defined further down but is called from error-handling paths above it. */
static void ReleaseResource();
|
|
|
|
static inline void IniRedoInfo()
|
|
{
|
|
g_instance.dms_cxt.SSReformInfo.redo_start_time = 0;
|
|
g_instance.dms_cxt.SSReformInfo.redo_end_time = 0;
|
|
g_instance.dms_cxt.SSReformInfo.construct_hashmap = 0;
|
|
g_instance.dms_cxt.SSReformInfo.redo_total_bytes = 0;
|
|
}
|
|
|
|
/*
|
|
* Wake up startup process to replay WAL, or to notice that
|
|
* failover has been requested.
|
|
*/
|
|
void SSWakeupRecovery(void)
|
|
{
|
|
uint32 thread_num = (uint32)g_instance.ckpt_cxt_ctl->pgwr_procs.num;
|
|
/* need make sure pagewriter started first */
|
|
bool need_recovery = true;
|
|
|
|
if (SS_DISASTER_MAIN_STANDBY_NODE) {
|
|
g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = false;
|
|
return;
|
|
}
|
|
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] wait pagewriter thread start")));
|
|
while (pg_atomic_read_u32(&g_instance.ckpt_cxt_ctl->current_page_writer_count) != thread_num) {
|
|
/* No need to wait pagewriter thread start because there is no xlog to need to recovery when db start. */
|
|
if (!RecoveryInProgress()) {
|
|
need_recovery = false;
|
|
break;
|
|
}
|
|
pg_usleep(REFORM_WAIT_TIME);
|
|
}
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] wait pagewriter thread start: succsess")));
|
|
|
|
if (need_recovery) {
|
|
g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = false;
|
|
}
|
|
}
|
|
|
|
static int CBGetUpdateXid(void *db_handle, unsigned long long xid, unsigned int t_infomask, unsigned int t_infomask2,
|
|
unsigned long long *uxid)
|
|
{
|
|
if (!SSCanFetchLocalSnapshotTxnRelatedInfo()) {
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
int result = DMS_SUCCESS;
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
|
|
PG_TRY();
|
|
{
|
|
*uxid =
|
|
(unsigned long long)MultiXactIdGetUpdateXid((TransactionId)xid, (uint16)t_infomask, (uint16)t_infomask2);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
result = DMS_ERROR;
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
if (t_thrd.role == DMS_WORKER) {
|
|
FlushErrorState();
|
|
}
|
|
}
|
|
PG_END_TRY();
|
|
return result;
|
|
}
|
|
|
|
/*
 * Look up the CSN of xid_csn_ctx->xid and, while that CSN is still in the
 * COMMITTING state, wait for the transaction (or its parent, for a
 * subtransaction) to finish and look it up again.
 *
 * xid_csn_ctx - request: xid, snapshot csn/xmin and the is_committed / is_mvcc / is_nest flags
 * sync        - out: set to true once at least one SyncLocalXidWait() was performed;
 *               never written otherwise, so the caller must initialize it
 *
 * Returns:
 *  - the CSN read from the CSN log when it is not (or no longer) COMMITTING;
 *  - the stored CSN without waiting when it is >= the snapshot csn;
 *  - COMMITSEQNO_ABORTED in maintenance mode or while aborting a transaction;
 *  - COMMITSEQNO_FROZEN / COMMITSEQNO_ABORTED (also written to the CSN log) when the
 *    CSN is still COMMITTING after one wait, depending on is_committed.
 */
static CommitSeqNo TransactionWaitCommittingCSN(dms_opengauss_xid_csn_t *xid_csn_ctx, bool *sync)
{
    bool looped = false;
    bool isCommit = (bool)xid_csn_ctx->is_committed;
    bool isMvcc = (bool)xid_csn_ctx->is_mvcc;
    bool isNest = (bool)xid_csn_ctx->is_nest;
    TransactionId xid = xid_csn_ctx->xid;
    CommitSeqNo snapshotcsn = xid_csn_ctx->snapshotcsn;
    TransactionId parentXid = InvalidTransactionId;
    SnapshotData snapshot = {SNAPSHOT_MVCC};
    snapshot.xmin = xid_csn_ctx->snapshotxmin;
    snapshot.snapshotcsn = snapshotcsn;
    CommitSeqNo csn = TransactionIdGetCommitSeqNo(xid, isCommit, isMvcc, isNest, &snapshot);

    while (COMMITSEQNO_IS_COMMITTING(csn)) {
        if (looped && isCommit) {
            /* fixed: the two literals used to be glued together as "...csn %luis changed..." */
            ereport(DEBUG1,
                (errmodule(MOD_DMS), errmsg("[SS] committed SS xid %lu's csn %lu"
                    " is changed to FROZEN after lockwait.", xid, csn)));
            CSNLogSetCommitSeqNo(xid, 0, NULL, COMMITSEQNO_FROZEN);
            SetLatestFetchState(xid, COMMITSEQNO_FROZEN);
            /* in this case, SS tuple is visible on standby, as we already compared and waited */
            return COMMITSEQNO_FROZEN;
        } else if (looped && !isCommit) {
            ereport(DEBUG1, (errmodule(MOD_DMS),
                errmsg("[SS] SS XID %lu's csn %lu is changed to ABORT after lockwait.", xid, csn)));
            /* recheck if transaction id is finished */
            RecheckXidFinish(xid, csn);
            CSNLogSetCommitSeqNo(xid, 0, NULL, COMMITSEQNO_ABORTED);
            SetLatestFetchState(xid, COMMITSEQNO_ABORTED);
            /* in this case, SS tuple is not visible on standby */
            return COMMITSEQNO_ABORTED;
        } else {
            if (!COMMITSEQNO_IS_SUBTRANS(csn)) {
                /* If the snapshot csn is not above the csn stored in the csn log, there is no need to wait. */
                CommitSeqNo latestCSN = GET_COMMITSEQNO(csn);
                if (latestCSN >= snapshotcsn) {
                    ereport(DEBUG1,
                        (errmodule(MOD_DMS), errmsg(
                            "[SS] snapshotcsn %lu < csn %lu stored in CSNLog, TXN invisible, no need to sync wait, XID %lu",
                            snapshotcsn,
                            latestCSN,
                            xid)));
                    /* in this case, SS tuple is not visible; to return ABORT is inappropriate, so let standby judge */
                    return latestCSN;
                }
            } else {
                /* subtransaction: the wait below targets the parent instead */
                parentXid = (TransactionId)GET_PARENTXID(csn);
            }

            if (u_sess->attr.attr_common.xc_maintenance_mode || t_thrd.xact_cxt.bInAbortTransaction) {
                return COMMITSEQNO_ABORTED;
            }

            // standby does not need buf lock or validation
            if (TransactionIdIsValid(parentXid)) {
                SyncLocalXidWait(parentXid);
            } else {
                SyncLocalXidWait(xid);
            }

            /* remember that we waited once, then re-read the CSN */
            looped = true;
            *sync = true;
            parentXid = InvalidTransactionId;
            csn = TransactionIdGetCommitSeqNo(xid, isCommit, isMvcc, isNest, &snapshot);
        }
    }
    return csn;
}
|
|
|
|
static int CBGetTxnCSN(void *db_handle, dms_opengauss_xid_csn_t *csn_req, dms_opengauss_csn_result_t *csn_res)
|
|
{
|
|
if (!SSCanFetchLocalSnapshotTxnRelatedInfo()) {
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
int ret;
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
PG_TRY();
|
|
{
|
|
bool sync = false;
|
|
CLogXidStatus clogstatus = CLOG_XID_STATUS_IN_PROGRESS;
|
|
XLogRecPtr lsn = InvalidXLogRecPtr;
|
|
CommitSeqNo csn = TransactionWaitCommittingCSN(csn_req, &sync);
|
|
clogstatus = CLogGetStatus(csn_req->xid, &lsn);
|
|
csn_res->csn = csn;
|
|
csn_res->sync = (unsigned char)sync;
|
|
csn_res->clogstatus = (unsigned int)clogstatus;
|
|
csn_res->lsn = lsn;
|
|
ret = DMS_SUCCESS;
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
if (t_thrd.role == DMS_WORKER) {
|
|
FlushErrorState();
|
|
}
|
|
ret = DMS_ERROR;
|
|
}
|
|
PG_END_TRY();
|
|
return ret;
|
|
}
|
|
|
|
static int CBGetSnapshotData(void *db_handle, dms_opengauss_txn_snapshot_t *txn_snapshot, uint8 inst_id)
|
|
{
|
|
/* SS_MAIN_STANDBY_NODE always is in recovery progress, but it can acquire snapshot*/
|
|
if (RecoveryInProgress() && !(SS_NORMAL_PRIMARY && SS_DISASTER_MAIN_STANDBY_NODE)) {
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
if (!SSCanFetchLocalSnapshotTxnRelatedInfo()) {
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
int retCode = DMS_ERROR;
|
|
SnapshotData snapshot = {SNAPSHOT_MVCC};
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
|
|
PG_TRY();
|
|
{
|
|
(void)GetSnapshotData(&snapshot, false);
|
|
if (snapshot.xmin != InvalidTransactionId) {
|
|
txn_snapshot->xmin = snapshot.xmin;
|
|
txn_snapshot->xmax = snapshot.xmax;
|
|
txn_snapshot->snapshotcsn = snapshot.snapshotcsn;
|
|
txn_snapshot->localxmin = u_sess->utils_cxt.RecentGlobalXmin;
|
|
if (!ENABLE_SS_BCAST_GETOLDESTXMIN) {
|
|
if (RecordSnapshotBeforeSend(inst_id, txn_snapshot->xmin)) {
|
|
retCode = DMS_SUCCESS;
|
|
}
|
|
} else {
|
|
retCode = DMS_SUCCESS;
|
|
}
|
|
}
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
if (t_thrd.role == DMS_WORKER) {
|
|
FlushErrorState();
|
|
}
|
|
}
|
|
PG_END_TRY();
|
|
|
|
return retCode;
|
|
}
|
|
|
|
static int CBGetTxnSwinfo(void *db_handle, dms_opengauss_txn_sw_info_t *txn_swinfo)
|
|
{
|
|
if (RecoveryInProgress()) {
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
if (!SSCanFetchLocalSnapshotTxnRelatedInfo()) {
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
int retCode = DMS_SUCCESS;
|
|
uint32 slot = txn_swinfo->server_proc_slot;
|
|
PGXACT* pgxact = &g_instance.proc_base_all_xacts[slot];
|
|
if (g_instance.proc_base_all_procs[slot] == NULL) {
|
|
retCode = DMS_ERROR;
|
|
} else {
|
|
txn_swinfo->sxid = pgxact->xid;
|
|
txn_swinfo->scid = pgxact->cid;
|
|
}
|
|
|
|
return retCode;
|
|
}
|
|
|
|
static int CBGetTxnStatus(void *db_handle, unsigned long long xid, unsigned char type, unsigned char *result)
|
|
{
|
|
if (!SSCanFetchLocalSnapshotTxnRelatedInfo()) {
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
PG_TRY();
|
|
{
|
|
switch (type) {
|
|
case XID_INPROGRESS:
|
|
*result = (unsigned char)TransactionIdIsInProgress(xid);
|
|
break;
|
|
case XID_COMMITTED:
|
|
*result = (unsigned char)TransactionIdDidCommit(xid);
|
|
break;
|
|
default:
|
|
PG_TRY_RETURN(DMS_ERROR);
|
|
}
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
if (t_thrd.role == DMS_WORKER) {
|
|
FlushErrorState();
|
|
}
|
|
}
|
|
PG_END_TRY();
|
|
return DMS_SUCCESS;
|
|
}
|
|
|
|
/*
 * Helpers for a bitmap stored as a byte array; bit i lives in byte i / BITS_PER_BYTE
 * at position i % BITS_PER_BYTE.
 *   NDPGETBYTE - lvalue of the byte holding bit i
 *   NDPCLRBIT  - clear bit i (fully parenthesized so it is safe inside larger expressions)
 *   NDPGETBIT  - value (0 or 1) of bit i
 * Arguments are evaluated more than once; do not pass expressions with side effects.
 */
#define NDPGETBYTE(x, i) (*((char*)(x) + (int)((i) / BITS_PER_BYTE)))
#define NDPCLRBIT(x, i) (NDPGETBYTE(x, i) &= ~(0x01 << ((i) % BITS_PER_BYTE)))
#define NDPGETBIT(x, i) ((NDPGETBYTE(x, i) >> ((i) % BITS_PER_BYTE)) & 0x01)
|
|
|
|
static int CBGetPageStatus(void *db_handle, dms_opengauss_relfilenode_t *rnode, unsigned int page,
|
|
int pagesNum, dms_opengauss_page_status_result_t *page_result)
|
|
{
|
|
for (uint32 i = page, offset = 0; i != page + pagesNum; ++i, ++offset) {
|
|
if (NDPGETBIT(page_result->page_map, offset)) {
|
|
bool cached = IsPageHitBufferPool(*(RelFileNode * )(rnode), MAIN_FORKNUM, i);
|
|
if (cached) {
|
|
NDPCLRBIT(page_result->page_map, offset);
|
|
--page_result->bit_count;
|
|
}
|
|
}
|
|
}
|
|
return DMS_SUCCESS;
|
|
}
|
|
|
|
static int CBGetCurrModeAndLockBuffer(void *db_handle, int buffer, unsigned char lock_mode,
|
|
unsigned char *curr_mode)
|
|
{
|
|
Assert((buffer - 1) >= 0);
|
|
BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
|
|
*curr_mode = (unsigned char)GetHeldLWLockMode(bufHdr->content_lock); // LWLockMode
|
|
Assert(*curr_mode == LW_EXCLUSIVE || *curr_mode == LW_SHARED);
|
|
LockBuffer((Buffer)buffer, lock_mode); // BUFFER_LOCK_UNLOCK, BUFFER_LOCK_SHARE or BUFFER_LOCK_EXCLUSIVE
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS] SS lock buf success, buffer=%d, mode=%hhu, curr_mode=%hhu", buffer, lock_mode, *curr_mode)));
|
|
return DMS_SUCCESS;
|
|
}
|
|
|
|
static inline void SSResetDemoteReqType(void)
|
|
{
|
|
SpinLockAcquire(&t_thrd.walsender_cxt.WalSndCtl->mutex);
|
|
t_thrd.walsender_cxt.WalSndCtl->demotion = NoDemote;
|
|
SpinLockRelease(&t_thrd.walsender_cxt.WalSndCtl->mutex);
|
|
}
|
|
|
|
static void SSHandleReformFailDuringDemote(bool timeout, DemoteMode demote_mode)
|
|
{
|
|
ereport(WARNING,
|
|
(errmodule(MOD_DMS),
|
|
errmsg("[SS reform][SS switchover] Failure in %s primary demote, pmState=%d, need reform rcy.",
|
|
DemoteModeDesc(demote_mode), pmState)));
|
|
|
|
if (timeout) {
|
|
g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled = false;
|
|
pg_memory_barrier();
|
|
SendPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_DEMOTE_FAILURE_CHECK);
|
|
const int WAIT_SIGNAL_HANDLED = 100; /* only wait 10s*/
|
|
for (int ntries = 0; ntries < WAIT_SIGNAL_HANDLED; ntries++) {
|
|
if (g_instance.dms_cxt.SSReformInfo.switchover_demote_failure_signal_handled) {
|
|
break;
|
|
}
|
|
CHECK_FOR_INTERRUPTS();
|
|
pg_usleep(100000L); /* wait 0.1 sec, then retry */
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Shutdown checkpoint would cause concurrency as DMS is starting next round of reform.
|
|
* If we allow ckpt to finish and recover, DMS would not be aware of the recovery process.
|
|
* Therefore we flush as many dirty pages as we can, then trigger a DMS normal reform.
|
|
*/
|
|
if (CheckpointInProgress() || pmState >= PM_SHUTDOWN) {
|
|
ereport(WARNING,
|
|
(errmodule(MOD_DMS),
|
|
errmsg("[SS reform][SS switchover] reform failed after shutdown ckpt has started, exit now")));
|
|
_exit(0);
|
|
}
|
|
|
|
/* backends exiting, simply rollback */
|
|
pmState = PM_RUN;
|
|
g_instance.demotion = NoDemote;
|
|
SSResetDemoteReqType();
|
|
}
|
|
|
|
static int CBSwitchoverDemote(void *db_handle)
|
|
{
|
|
DemoteMode demote_mode = FastDemote;
|
|
|
|
/* borrows walsender lock */
|
|
SpinLockAcquire(&t_thrd.walsender_cxt.WalSndCtl->mutex);
|
|
if (t_thrd.walsender_cxt.WalSndCtl->demotion > NoDemote) {
|
|
SpinLockRelease(&t_thrd.walsender_cxt.WalSndCtl->mutex);
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] master is doing switchover,"
|
|
" probably standby already requested switchover.")));
|
|
return DMS_SUCCESS;
|
|
}
|
|
Assert(g_instance.dms_cxt.SSClusterState == NODESTATE_NORMAL);
|
|
Assert(SS_OFFICIAL_PRIMARY);
|
|
|
|
t_thrd.walsender_cxt.WalSndCtl->demotion = demote_mode;
|
|
g_instance.dms_cxt.SSClusterState = NODESTATE_PRIMARY_DEMOTING;
|
|
g_instance.dms_cxt.SSRecoveryInfo.new_primary_reset_walbuf_flag = true;
|
|
SpinLockRelease(&t_thrd.walsender_cxt.WalSndCtl->mutex);
|
|
|
|
ereport(LOG,
|
|
(errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Recv %s demote request from DMS reformer.",
|
|
DemoteModeDesc(demote_mode))));
|
|
|
|
SendPostmasterSignal(PMSIGNAL_DEMOTE_PRIMARY);
|
|
|
|
const int WAIT_DEMOTE = 6000; /* wait up to 10 min in case of too many dirty pages to be flushed */
|
|
for (int ntries = 0;; ntries++) {
|
|
if (pmState == PM_RUN && g_instance.dms_cxt.SSClusterState == NODESTATE_PROMOTE_APPROVE) {
|
|
SSResetDemoteReqType();
|
|
ereport(LOG,
|
|
(errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Success in %s primary demote, running as "
|
|
"standby, waiting for reformer setting new role.", DemoteModeDesc(demote_mode))));
|
|
return DMS_SUCCESS;
|
|
} else {
|
|
if (ntries >= WAIT_DEMOTE || dms_reform_failed()) {
|
|
bool timeout = ntries >= WAIT_DEMOTE ? true : false;
|
|
SSHandleReformFailDuringDemote(timeout, demote_mode);
|
|
return DMS_ERROR;
|
|
}
|
|
}
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
pg_usleep(100000L); /* wait 0.1 sec, then retry */
|
|
}
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
static int CBSwitchoverPromote(void *db_handle, unsigned char origPrimaryId)
|
|
{
|
|
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTING;
|
|
g_instance.dms_cxt.SSRecoveryInfo.new_primary_reset_walbuf_flag = true;
|
|
/* allow recovery in switchover to keep LSN in order */
|
|
t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false;
|
|
t_thrd.shemem_ptr_cxt.XLogCtl->SharedRecoveryInProgress = true;
|
|
t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY;
|
|
pg_memory_barrier();
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Starting to promote standby.")));
|
|
|
|
SSNotifySwitchoverPromote();
|
|
|
|
const int WAIT_PROMOTE = 1200; /* wait 120 sec */
|
|
for (int ntries = 0;; ntries++) {
|
|
if (g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTED) {
|
|
/* flush control file primary id in advance to save new standby's waiting time */
|
|
SSSavePrimaryInstId(SS_MY_INST_ID);
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS reform][SS switchover] Standby promote: success, set new primary:%d.", SS_MY_INST_ID)));
|
|
return DMS_SUCCESS;
|
|
} else {
|
|
if (ntries >= WAIT_PROMOTE || dms_reform_failed()) {
|
|
ereport(WARNING, (errmodule(MOD_DMS),
|
|
errmsg("[SS reform][SS switchover] Standby promote timeout, please try again later.")));
|
|
SSWaitStartupExit();
|
|
return DMS_ERROR;
|
|
}
|
|
}
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
pg_usleep(100000L); /* wait 0.1 sec, then retry */
|
|
}
|
|
return DMS_ERROR;
|
|
}
|
|
|
|
/* only sets switchover errno, everything else set in setPrimaryId */
|
|
static void CBSwitchoverResult(void *db_handle, int result)
|
|
{
|
|
if (result == DMS_SUCCESS) {
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS reform][SS switchover] Switchover success, letting reformer update roles.")));
|
|
return;
|
|
} else {
|
|
/* abort and restore state */
|
|
g_instance.dms_cxt.SSClusterState = NODESTATE_NORMAL;
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] Switchover failed,"
|
|
"errno: %d.", result)));
|
|
}
|
|
}
|
|
|
|
static int SetPrimaryIdOnStandby(int primary_id, unsigned long long list_stable)
|
|
{
|
|
char* type_string = NULL;
|
|
type_string = SSGetLogHeaderTypeStr();
|
|
int ret = DMS_SUCCESS;
|
|
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
PG_TRY();
|
|
{
|
|
for (int ntries = 0;; ntries++) {
|
|
SSReadControlFile(REFORM_CTRL_PAGE); /* need to double check */
|
|
if (g_instance.dms_cxt.SSReformerControl.primaryInstId == primary_id &&
|
|
g_instance.dms_cxt.SSReformerControl.list_stable == list_stable) {
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("%s Reform success, this is a standby:%d confirming new primary:%d, list_stable:%llu, "
|
|
"confirm ntries=%d.", type_string, SS_MY_INST_ID, primary_id, list_stable, ntries)));
|
|
ret = DMS_SUCCESS;
|
|
break;
|
|
} else {
|
|
if (dms_reform_failed()) {
|
|
ereport(ERROR,
|
|
(errmodule(MOD_DMS), errmsg("%s Failed to confirm new primary: %d, list_stable:%llu, "
|
|
"control file indicates primary is %d, list_stable%llu; dms reform failed.",
|
|
type_string, (int)primary_id, list_stable,
|
|
g_instance.dms_cxt.SSReformerControl.primaryInstId,
|
|
g_instance.dms_cxt.SSReformerControl.list_stable)));
|
|
ret = DMS_ERROR;
|
|
break;
|
|
}
|
|
if (ntries >= WAIT_REFORM_CTRL_REFRESH_TRIES) {
|
|
ereport(ERROR,
|
|
(errmodule(MOD_DMS), errmsg("%s Failed to confirm new primary: %d, list_stable:%llu, "
|
|
" control file indicates primary is %d, list_stable%llu; wait timeout.",
|
|
type_string, (int)primary_id, list_stable,
|
|
g_instance.dms_cxt.SSReformerControl.primaryInstId,
|
|
g_instance.dms_cxt.SSReformerControl.list_stable)));
|
|
ret = DMS_ERROR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
CHECK_FOR_INTERRUPTS();
|
|
pg_usleep(REFORM_WAIT_TIME); /* wait 0.01 sec, then retry */
|
|
}
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
ReleaseResource();
|
|
ret = DMS_ERROR;
|
|
}
|
|
PG_END_TRY();
|
|
return ret;
|
|
}
|
|
|
|
/* called on both new primary and all standby nodes to refresh status */
|
|
static int CBSaveStableList(void *db_handle, unsigned long long list_stable, unsigned char reformer_id,
|
|
unsigned long long list_in, unsigned int save_ctrl)
|
|
{
|
|
int primary_id = (int)reformer_id;
|
|
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
|
|
g_instance.dms_cxt.SSReformerControl.primaryInstId = primary_id;
|
|
g_instance.dms_cxt.SSReformerControl.list_stable = list_stable;
|
|
int ret = DMS_ERROR;
|
|
SSLockReleaseAll();
|
|
SSSyncOldestXminWhenReform(reformer_id);
|
|
|
|
if ((int)primary_id == SS_MY_INST_ID) {
|
|
if (g_instance.dms_cxt.SSClusterState > NODESTATE_NORMAL) {
|
|
Assert(g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_PROMOTED ||
|
|
g_instance.dms_cxt.SSClusterState == NODESTATE_STANDBY_FAILOVER_PROMOTING);
|
|
}
|
|
SSUpdateReformerCtrl();
|
|
LWLockRelease(ControlFileLock);
|
|
Assert(g_instance.dms_cxt.SSReformerControl.primaryInstId == (int)primary_id);
|
|
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] set current instance:%d as primary, list_stable:%llu.",
|
|
primary_id, list_stable)));
|
|
ret = DMS_SUCCESS;
|
|
} else { /* we are on standby */
|
|
LWLockRelease(ControlFileLock);
|
|
ret = SetPrimaryIdOnStandby(primary_id, list_stable);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static void ReleaseResource()
|
|
{
|
|
LWLockReleaseAll();
|
|
AbortBufferIO();
|
|
UnlockBuffers();
|
|
/* buffer pins are released here: */
|
|
ResourceOwnerRelease(t_thrd.utils_cxt.CurrentResourceOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, true);
|
|
FlushErrorState();
|
|
}
|
|
|
|
/*
 * DMS callback: increment this thread's srsn counter and return the new value.
 * sessid is unused here.
 */
static unsigned int CBIncAndGetSrsn(uint32 sessid)
{
    t_thrd.dms_cxt.srsn++;
    return t_thrd.dms_cxt.srsn;
}
|
|
|
|
static unsigned int CBPageHashCode(const char pageid[DMS_PAGEID_SIZE])
|
|
{
|
|
BufferTag *tag = (BufferTag *)pageid;
|
|
return BufTableHashCode(tag);
|
|
}
|
|
|
|
static unsigned long long CBGetPageLSN(const dms_buf_ctrl_t *buf_ctrl)
|
|
{
|
|
Assert(buf_ctrl->buf_id < TOTAL_BUFFER_NUM);
|
|
if (buf_ctrl->buf_id >= TOTAL_BUFFER_NUM) {
|
|
return 0;
|
|
}
|
|
BufferDesc* buf_desc = GetBufferDescriptor(buf_ctrl->buf_id);
|
|
XLogRecPtr lsn = BufferGetLSN(buf_desc);
|
|
return lsn;
|
|
}
|
|
|
|
static unsigned long long CBGetGlobalLSN(void *db_handle)
|
|
{
|
|
return GetInsertRecPtr();
|
|
}
|
|
|
|
/*
 * Try to pin and content-lock the local buffer that holds page `tag`.
 *
 * tag      - buffer tag of the wanted page
 * mode     - DMS_LOCK_SHARE takes the content lock shared, anything else exclusive
 * buf_ctrl - out: the buffer's DMS control block, or NULL when the page is not
 *            handed out
 *
 * Returns:
 *  - DMS_SUCCESS with *buf_ctrl != NULL: the buffer is pinned and its content lock is held.
 *  - DMS_SUCCESS with *buf_ctrl == NULL: page not in the buffer table, buffer tag changed
 *    after pinning, buffer not valid / in I/O error, or been_loaded is false.
 *  - GS_TIMEOUT: waiting for buffer I/O or for the content lock timed out (pin released).
 *  - DMS_ERROR: an error was raised; resources are released via ReleaseResource().
 *
 * A DCS_TRANSFER_PAGE wait event is reported once a buffer was found and is ended
 * before returning.
 */
static int tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_t **buf_ctrl)
{
    bool is_seg;
    int ret = DMS_SUCCESS;
    int buf_id = -1;
    uint32 hash;
    BufferDesc *buf_desc = NULL;
    RelFileNode relfilenode = tag->rnode;
    bool lock_acquired = false;
    DMSWaiteventTarget target;
    bool waitevent_started = false;

#ifdef USE_ASSERT_CHECKING
    /* assert builds only: a segment-page request must lie inside the segment space */
    if (IsSegmentPhysicalRelNode(relfilenode)) {
        SegSpace *spc = spc_open(relfilenode.spcNode, relfilenode.dbNode, false, false);
        BlockNumber spc_nblocks = spc_size(spc, relfilenode.relNode, tag->forkNum);
        if (tag->blockNum >= spc_nblocks) {
            ereport(PANIC, (errmodule(MOD_DMS),
                errmsg("[SS] unexpected blocknum %u >= spc nblocks %u", tag->blockNum, spc_nblocks)));
        }
    }
#endif

    *buf_ctrl = NULL;
    hash = BufTableHashCode(tag);

    uint32 savedHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
    PG_TRY();
    {
        do {
            /* step 1: find the page in the buffer table */
            buf_id = BufTableLookup(tag, hash);
            if (buf_id < 0) {
                break;
            }

            target.page.buffer = buf_id + 1;
            target.page.mode = mode;
            pgstat_report_dms_waitevent(WAIT_EVENT_DCS_TRANSFER_PAGE, &target);
            waitevent_started = true;

            /* step 2: pin it, using the pin routine matching the buffer kind */
            buf_desc = GetBufferDescriptor(buf_id);
            if (IsSegmentBufferID(buf_id)) {
                (void)SegPinBuffer(buf_desc);
                is_seg = true;
            } else {
                ResourceOwnerEnlargeBuffers(t_thrd.utils_cxt.CurrentResourceOwner);
                (void)PinBuffer(buf_desc, NULL);
                is_seg = false;
            }

            /* the buffer may have been reused for another page before we pinned it */
            if (!BUFFERTAGS_PTR_EQUAL(&buf_desc->tag, tag)) {
                DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg);
                break;
            }

            /* step 3: wait for in-progress I/O on the buffer */
            bool io_done = SSWaitIOTimeout(buf_desc);
            if (!io_done) {
                DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg);
                ret = GS_TIMEOUT;
                break;
            }

            if (!(pg_atomic_read_u64(&buf_desc->state) & BM_VALID)) {
                ereport(WARNING, (errmodule(MOD_DMS),
                    errmsg("[SS page][%d/%d/%d/%d %d-%d] try enter page failed, buffer is not valid, state = 0x%lx",
                        tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
                        tag->forkNum, tag->blockNum, buf_desc->state)));
                DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg);
                *buf_ctrl = NULL;
                ret = DMS_SUCCESS;
                break;
            }

            if (pg_atomic_read_u64(&buf_desc->state) & BM_IO_ERROR) {
                ereport(WARNING, (errmodule(MOD_DMS),
                    errmsg("[SS page][%d/%d/%d/%d %d-%d] try enter page failed, buffer is io error, state = 0x%lx",
                        tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
                        tag->forkNum, tag->blockNum, buf_desc->state)));
                DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg);
                *buf_ctrl = NULL;
                ret = DMS_SUCCESS;
                break;
            }

            /* step 4: take the content lock in the mode matching the DMS request */
            LWLockMode content_mode = (mode == DMS_LOCK_SHARE) ? LW_SHARED : LW_EXCLUSIVE;
            lock_acquired = SSLWLockAcquireTimeout(buf_desc->content_lock, content_mode);
            if (!lock_acquired) {
                DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg);
                ret = GS_TIMEOUT;
                ereport(WARNING, (errmodule(MOD_DMS), (errmsg("[SS lwlock][%u/%u/%u/%d %d-%u] request LWLock timeout, "
                    "buf_id:%d, lwlock:%p",
                    tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
                    tag->forkNum, tag->blockNum, buf_id, buf_desc->content_lock))));
                break;
            }

            /* step 5: hand out the control block unless the page was never loaded */
            *buf_ctrl = GetDmsBufCtrl(buf_id);
            Assert(buf_id >= 0);
            if ((*buf_ctrl)->been_loaded == false) {
                *buf_ctrl = NULL;
                LWLockRelease(buf_desc->content_lock);
                DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg);
                ereport(WARNING, (errmodule(MOD_DMS),
                    errmsg("[SS page][%u/%u/%u/%d %d-%u] been_loaded marked false, page swapped out and failed to load",
                        tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
                        tag->forkNum, tag->blockNum)));
                break;
            }

            if ((*buf_ctrl)->lock_mode == DMS_LOCK_NULL) {
                /* only warned about: the control block is still returned with pin and lock held */
                ereport(WARNING, (errmodule(MOD_DMS),
                    errmsg("[SS page][%u/%u/%u/%d %d-%u] lock mode is null, still need to transfer page",
                        tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
                        tag->forkNum, tag->blockNum)));
            } else if (buf_desc->extra->seg_fileno != EXTENT_INVALID) {
                /* copy the segment location from the buffer descriptor into the control block */
                (*buf_ctrl)->seg_fileno = buf_desc->extra->seg_fileno;
                (*buf_ctrl)->seg_blockno = buf_desc->extra->seg_blockno;
            }
        } while (0);
    }
    PG_CATCH();
    {
        t_thrd.int_cxt.InterruptHoldoffCount = savedHoldoffCount;
        ReleaseResource();
        ret = DMS_ERROR;
    }
    PG_END_TRY();

    if (waitevent_started) {
        pgstat_report_dms_waitevent(WAIT_EVENT_END);
    }
    return ret;
}
|
|
|
|
/*
 * DMS callback: reinterpret the page id as a BufferTag and forward the request
 * to tryEnterLocalPage(), returning its result. db_handle is unused here.
 */
static int CBEnterLocalPage(void *db_handle, char pageid[DMS_PAGEID_SIZE], dms_lock_mode_t mode,
    dms_buf_ctrl_t **buf_ctrl)
{
    return tryEnterLocalPage((BufferTag *)pageid, mode, buf_ctrl);
}
|
|
|
|
/*
 * DMS callback: return 1 when SSBufferIsDirty() reports the buffer referenced
 * by buf_ctrl->buf_id as dirty, otherwise 0. Also returns 0 when buf_id is not
 * below TOTAL_BUFFER_NUM (which trips an assertion in assert-enabled builds).
 */
static unsigned char CBPageDirty(dms_buf_ctrl_t *buf_ctrl)
{
    Assert(buf_ctrl->buf_id < TOTAL_BUFFER_NUM);
    if (buf_ctrl->buf_id >= TOTAL_BUFFER_NUM) {
        return 0;
    }

    BufferDesc *desc = GetBufferDescriptor(buf_ctrl->buf_id);
    bool dirty = SSBufferIsDirty(desc);
    return (unsigned char)dirty;
}
|
|
|
|
static void CBLeaveLocalPage(void *db_handle, dms_buf_ctrl_t *buf_ctrl)
|
|
{
|
|
Assert(buf_ctrl->buf_id < TOTAL_BUFFER_NUM);
|
|
if (buf_ctrl->buf_id >= TOTAL_BUFFER_NUM) {
|
|
return;
|
|
}
|
|
|
|
if (IsSegmentBufferID(buf_ctrl->buf_id)) {
|
|
SegUnlockReleaseBuffer(buf_ctrl->buf_id + 1);
|
|
} else {
|
|
UnlockReleaseBuffer(buf_ctrl->buf_id + 1);
|
|
}
|
|
if (buf_ctrl->need_check_pincount) {
|
|
(GetDmsBufCtrl(buf_ctrl->buf_id))->need_check_pincount = false;
|
|
}
|
|
}
|
|
|
|
/*
 * DMS callback: return the address of the page data of the buffer referenced by
 * buf_ctrl->buf_id, or NULL when buf_id is not below TOTAL_BUFFER_NUM (which
 * trips an assertion in assert-enabled builds).
 */
static char* CBGetPage(dms_buf_ctrl_t *buf_ctrl)
{
    Assert(buf_ctrl->buf_id < TOTAL_BUFFER_NUM);
    if (buf_ctrl->buf_id >= TOTAL_BUFFER_NUM) {
        return NULL;
    }

    BufferDesc *desc = GetBufferDescriptor(buf_ctrl->buf_id);
    return (char *)BufHdrGetBlock(desc);
}
|
|
|
|
static int CBInvalidatePage(void *db_handle, char pageid[DMS_PAGEID_SIZE], unsigned char invld_owner)
|
|
{
|
|
int buf_id = -1;
|
|
BufferTag* tag = (BufferTag *)pageid;
|
|
uint32 hash;
|
|
uint64 buf_state;
|
|
int ret = DMS_SUCCESS;
|
|
bool get_lock;
|
|
bool buftag_equal = true;
|
|
hash = BufTableHashCode(tag);
|
|
buf_id = BufTableLookup(tag, hash);
|
|
if (buf_id < 0) {
|
|
/* not found in shared buffer */
|
|
return ret;
|
|
}
|
|
|
|
DMSWaiteventTarget target;
|
|
target.page.buffer = buf_id + 1;
|
|
target.page.mode = DMS_LOCK_EXCLUSIVE;
|
|
pgstat_report_dms_waitevent(WAIT_EVENT_DCS_INVLDT_SHARE_COPY_PROCESS, &target);
|
|
|
|
BufferDesc *buf_desc = GetBufferDescriptor(buf_id);
|
|
dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_id);
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
PG_TRY();
|
|
{
|
|
do {
|
|
buf_desc = GetBufferDescriptor(buf_id);
|
|
if (SS_PRIMARY_MODE) {
|
|
buf_state = LockBufHdr(buf_desc);
|
|
if (BUF_STATE_GET_REFCOUNT(buf_state) != 0 || BUF_STATE_GET_USAGECOUNT(buf_state) != 0 ||
|
|
!BUFFERTAGS_PTR_EQUAL(&buf_desc->tag, tag)) {
|
|
UnlockBufHdr(buf_desc, buf_state);
|
|
ret = DMS_ERROR;
|
|
break;
|
|
}
|
|
|
|
if (!(buf_state & BM_VALID) || (buf_state & BM_IO_ERROR)) {
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS page][%d/%d/%d/%d %d-%d] invalidate page, buffer is not valid or io error, "
|
|
"state = 0x%lx", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode,
|
|
tag->rnode.bucketNode, tag->forkNum, tag->blockNum, buf_desc->state)));
|
|
UnlockBufHdr(buf_desc, buf_state);
|
|
buf_ctrl->lock_mode = (unsigned char)DMS_LOCK_NULL;
|
|
buf_ctrl->seg_fileno = EXTENT_INVALID;
|
|
buf_ctrl->seg_blockno = InvalidBlockNumber;
|
|
ret = DMS_SUCCESS;
|
|
break;
|
|
}
|
|
|
|
/* For aio (flush disk not finished), dirty, in dirty queue, dirty need flush, can't recycle */
|
|
if (buf_desc->extra->aio_in_progress || (buf_state & BM_DIRTY) || (buf_state & BM_JUST_DIRTIED) ||
|
|
XLogRecPtrIsValid(pg_atomic_read_u64(&buf_desc->extra->rec_lsn)) ||
|
|
(buf_ctrl->state & BUF_DIRTY_NEED_FLUSH)) {
|
|
ereport(DEBUG1, (errmodule(MOD_DMS),
|
|
errmsg("[SS page][%d/%d/%d/%d %d-%d] invalidate owner rejected, buffer is dirty/permanent, "
|
|
"state = 0x%lx", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode,
|
|
tag->rnode.bucketNode, tag->forkNum, tag->blockNum, buf_desc->state)));
|
|
ret = DMS_ERROR;
|
|
} else {
|
|
buf_ctrl->lock_mode = (unsigned char)DMS_LOCK_NULL;
|
|
buf_ctrl->seg_fileno = EXTENT_INVALID;
|
|
buf_ctrl->seg_blockno = InvalidBlockNumber;
|
|
}
|
|
|
|
UnlockBufHdr(buf_desc, buf_state);
|
|
break;
|
|
}
|
|
|
|
if (IsSegmentBufferID(buf_id)) {
|
|
(void)SegPinBuffer(buf_desc);
|
|
} else {
|
|
ResourceOwnerEnlargeBuffers(t_thrd.utils_cxt.CurrentResourceOwner);
|
|
(void)PinBuffer(buf_desc, NULL);
|
|
}
|
|
|
|
SS_FAULT_INJECTION_CALL(DB_FI_CHANGE_BUFFERTAG_BLOCKNUM, dms_fi_change_buffertag_blocknum);
|
|
FAULT_INJECTION_ACTION_TRIGGER_CUSTOM(DB_FI_CHANGE_BUFFERTAG_BLOCKNUM, tag->blockNum += 1);
|
|
if (!BUFFERTAGS_PTR_EQUAL(&buf_desc->tag, tag)) {
|
|
DmsReleaseBuffer(buf_id + 1, IsSegmentBufferID(buf_id));
|
|
buftag_equal = false;
|
|
break;
|
|
}
|
|
|
|
bool wait_success = SSWaitIOTimeout(buf_desc);
|
|
if (!wait_success) {
|
|
DmsReleaseBuffer(buf_id + 1, IsSegmentBufferID(buf_id));
|
|
ret = GS_TIMEOUT;
|
|
break;
|
|
}
|
|
|
|
if ((!(pg_atomic_read_u64(&buf_desc->state) & BM_VALID)) ||
|
|
(pg_atomic_read_u64(&buf_desc->state) & BM_IO_ERROR)) {
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS page][%d/%d/%d/%d %d-%d] invalidate page, buffer is not valid or io error, "
|
|
"state = 0x%lx", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode,
|
|
tag->rnode.bucketNode, tag->forkNum, tag->blockNum, buf_desc->state)));
|
|
DmsReleaseBuffer(buf_id + 1, IsSegmentBufferID(buf_id));
|
|
buf_ctrl->lock_mode = (unsigned char)DMS_LOCK_NULL;
|
|
buf_ctrl->seg_fileno = EXTENT_INVALID;
|
|
buf_ctrl->seg_blockno = InvalidBlockNumber;
|
|
ret = DMS_SUCCESS;
|
|
break;
|
|
}
|
|
|
|
get_lock = SSLWLockAcquireTimeout(buf_desc->content_lock, LW_EXCLUSIVE);
|
|
if (!get_lock) {
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmodule(MOD_DMS), (errmsg("[SS lwlock][%u/%u/%u/%d %d-%u] "
|
|
"request LWLock timeout, buf_id:%d, lwlock:%p",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
|
|
tag->forkNum, tag->blockNum, buf_id, buf_desc->content_lock))));
|
|
ret = GS_TIMEOUT;
|
|
} else {
|
|
buf_ctrl->lock_mode = (unsigned char)DMS_LOCK_NULL;
|
|
buf_ctrl->seg_fileno = EXTENT_INVALID;
|
|
buf_ctrl->seg_blockno = InvalidBlockNumber;
|
|
LWLockRelease(buf_desc->content_lock);
|
|
}
|
|
|
|
if (IsSegmentBufferID(buf_id)) {
|
|
SegReleaseBuffer(buf_id + 1);
|
|
} else {
|
|
ReleaseBuffer(buf_id + 1);
|
|
}
|
|
} while(0);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
/* Save error info */
|
|
ErrorData* edata = CopyErrorData();
|
|
FlushErrorState();
|
|
FreeErrorData(edata);
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS][%d/%d/%d/%d %d-%d] CBInvalidatePage: Error happend.",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum,
|
|
tag->blockNum)));
|
|
ReleaseResource();
|
|
ret = DMS_ERROR;
|
|
}
|
|
PG_END_TRY();
|
|
|
|
if (ret == DMS_SUCCESS && buftag_equal) {
|
|
Assert(buf_ctrl->lock_mode == DMS_LOCK_NULL);
|
|
}
|
|
|
|
pgstat_report_dms_waitevent(WAIT_EVENT_END);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Sanity-check a page image handed over by DMS against the local buffer.
 *
 * buf_ctrl: DMS control block of the target buffer (buf_id selects the descriptor).
 * new_page: the page image whose LSN is compared with lsn_on_disk and with the
 *           LSN of the page currently held in the buffer.
 *
 * A mismatch is reported at WARNING in a disaster standby cluster and at PANIC
 * everywhere else.
 */
static void CBVerifyPage(dms_buf_ctrl_t *buf_ctrl, char *new_page)
{
    Assert(buf_ctrl->buf_id < TOTAL_BUFFER_NUM);
    if (buf_ctrl->buf_id >= TOTAL_BUFFER_NUM) {
        return;
    }

    BufferDesc *buf_desc = GetBufferDescriptor(buf_ctrl->buf_id);
    RelFileNode rnode = buf_desc->tag.rnode;

    /* nobody holds a pin any more, so the pin-count check can be dropped */
    if (buf_ctrl->need_check_pincount && (pg_atomic_read_u32(&(buf_ctrl->pinned_count)) == 0)) {
        buf_ctrl->need_check_pincount = false;
    }

    /* reconcile the physical location carried by DMS with the one cached in the buffer */
    if (buf_ctrl->seg_fileno != EXTENT_INVALID) {
        if (buf_desc->extra->seg_fileno == EXTENT_INVALID) {
            buf_desc->extra->seg_fileno = buf_ctrl->seg_fileno;
            buf_desc->extra->seg_blockno = buf_ctrl->seg_blockno;
        } else if (buf_desc->extra->seg_fileno != buf_ctrl->seg_fileno ||
                   buf_desc->extra->seg_blockno != buf_ctrl->seg_blockno) {
            ereport(PANIC, (errmodule(MOD_DMS),
                errmsg("[SS page][%u/%u/%u/%d/%d %d-%u] location mismatch, seg_fileno:%d, seg_blockno:%u",
                    buf_desc->tag.rnode.spcNode, buf_desc->tag.rnode.dbNode, buf_desc->tag.rnode.relNode,
                    buf_desc->tag.rnode.bucketNode, buf_desc->tag.rnode.opt, buf_desc->tag.forkNum,
                    buf_desc->tag.blockNum, buf_desc->extra->seg_fileno, buf_desc->extra->seg_blockno)));
        }
    }

    /* without valid page content there is nothing to compare against */
    if ((pg_atomic_read_u64(&buf_desc->state) & BM_VALID) == 0) {
        return;
    }

    char *cur_page = (char *)BufHdrGetBlock(buf_desc);
    XLogRecPtr lsn_past = PageGetLSN(cur_page);
    XLogRecPtr lsn_now = PageGetLSN(new_page);

    /* the transferred page must not be older than what is already on disk */
    if ((lsn_now != InvalidXLogRecPtr) && XLByteLT(lsn_now, buf_ctrl->lsn_on_disk)) {
        int elevel = SS_DISASTER_STANDBY_CLUSTER ? WARNING : PANIC;
        ereport(elevel, (errmsg("[%d/%d/%d/%d/%d %d-%d] now lsn(0x%llx) is less than lsn_on_disk(0x%llx)",
            rnode.spcNode, rnode.dbNode, rnode.relNode, rnode.bucketNode, rnode.opt,
            buf_desc->tag.forkNum, buf_desc->tag.blockNum,
            (unsigned long long)lsn_now, (unsigned long long)buf_ctrl->lsn_on_disk)));
        return;
    }

    /* the page-version check below only applies to segment-page buffers */
    if (buf_desc->extra->seg_fileno == EXTENT_INVALID && !IsSegmentBufferID(buf_desc->buf_id)) {
        return;
    }

    if ((lsn_now != InvalidXLogRecPtr) && XLByteLT(lsn_now, lsn_past)) {
        RelFileNode cur_rnode = buf_desc->tag.rnode;
        int elevel = SS_DISASTER_STANDBY_CLUSTER ? WARNING : PANIC;
        ereport(elevel, (errmodule(MOD_DMS),
            errmsg("[SS page][%d/%d/%d/%d/%d %d-%d] now lsn(0x%llx) is less than past lsn(0x%llx)",
                cur_rnode.spcNode, cur_rnode.dbNode, cur_rnode.relNode, cur_rnode.bucketNode, cur_rnode.opt,
                buf_desc->tag.forkNum, buf_desc->tag.blockNum,
                (unsigned long long)lsn_now, (unsigned long long)lsn_past)));
    }
}
|
|
|
|
static int CBXLogFlush(void *db_handle, unsigned long long *lsn)
|
|
{
|
|
(void)LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
|
|
(void)XLogBackgroundFlush();
|
|
*lsn = GetFlushRecPtr();
|
|
LWLockRelease(WALWriteLock);
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static char *CBDisplayBufferTag(char *displayBuf, unsigned int count, char *pageid)
|
|
{
|
|
BufferTag pagetag = *(BufferTag *)pageid;
|
|
int ret = sprintf_s(displayBuf, count, "%u/%u/%u/%d/%d %d-%u",
|
|
pagetag.rnode.spcNode, pagetag.rnode.dbNode, pagetag.rnode.relNode, (int)pagetag.rnode.bucketNode,
|
|
(int)pagetag.rnode.opt, pagetag.forkNum, pagetag.blockNum);
|
|
securec_check_ss(ret, "", "");
|
|
return displayBuf;
|
|
}
|
|
|
|
/*
 * Clear the buf_ctrl->state flag that corresponds to dms_buf_load_status.
 * An unknown status trips an assertion and leaves the state untouched.
 * Always returns DMS_SUCCESS.
 */
static int CBRemoveBufLoadStatus(dms_buf_ctrl_t *buf_ctrl, dms_buf_load_status_t dms_buf_load_status)
{
    if (dms_buf_load_status == DMS_BUF_NEED_LOAD) {
        buf_ctrl->state &= ~BUF_NEED_LOAD;
    } else if (dms_buf_load_status == DMS_BUF_IS_LOADED) {
        buf_ctrl->state &= ~BUF_IS_LOADED;
    } else if (dms_buf_load_status == DMS_BUF_LOAD_FAILED) {
        buf_ctrl->state &= ~BUF_LOAD_FAILED;
    } else if (dms_buf_load_status == DMS_BUF_NEED_TRANSFER) {
        buf_ctrl->state &= ~BUF_NEED_TRANSFER;
    } else {
        Assert(0);
    }
    return DMS_SUCCESS;
}
|
|
|
|
/*
 * Set the buf_ctrl->state flag that corresponds to dms_buf_load_status.
 * An unknown status trips an assertion and leaves the state untouched.
 * Always returns DMS_SUCCESS.
 */
static int CBSetBufLoadStatus(dms_buf_ctrl_t *buf_ctrl, dms_buf_load_status_t dms_buf_load_status)
{
    if (dms_buf_load_status == DMS_BUF_NEED_LOAD) {
        buf_ctrl->state |= BUF_NEED_LOAD;
    } else if (dms_buf_load_status == DMS_BUF_IS_LOADED) {
        buf_ctrl->state |= BUF_IS_LOADED;
    } else if (dms_buf_load_status == DMS_BUF_LOAD_FAILED) {
        buf_ctrl->state |= BUF_LOAD_FAILED;
    } else if (dms_buf_load_status == DMS_BUF_NEED_TRANSFER) {
        buf_ctrl->state |= BUF_NEED_TRANSFER;
    } else {
        Assert(0);
    }
    return DMS_SUCCESS;
}
|
|
|
|
static void *CBGetHandle(unsigned int *db_handle_index, dms_session_type_e session_type)
|
|
{
|
|
ss_fake_seesion_context_t *fs_cxt = &g_instance.dms_cxt.SSFakeSessionCxt;
|
|
SpinLockAcquire(&fs_cxt->lock);
|
|
if (!fs_cxt->fake_sessions[fs_cxt->quickFetchIndex]) {
|
|
int index = fs_cxt->quickFetchIndex;
|
|
fs_cxt->fake_sessions[index] = true;
|
|
fs_cxt->quickFetchIndex++;
|
|
if (fs_cxt->quickFetchIndex >= fs_cxt->fake_session_cnt) {
|
|
fs_cxt->quickFetchIndex = 0;
|
|
}
|
|
SpinLockRelease(&fs_cxt->lock);
|
|
*db_handle_index = index + fs_cxt->session_start;
|
|
return &g_instance.proc_base->allProcs[index + fs_cxt->session_start];
|
|
}
|
|
|
|
int start_index = fs_cxt->quickFetchIndex;
|
|
int cur_index = 0;
|
|
bool found = false;
|
|
for (int i = 0; i < (int)fs_cxt->fake_session_cnt; i++) {
|
|
cur_index = (start_index + i) % fs_cxt->fake_session_cnt;
|
|
if (!fs_cxt->fake_sessions[cur_index]) {
|
|
found = true;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!found) {
|
|
SpinLockRelease(&fs_cxt->lock);
|
|
ereport(PANIC, (errmsg("[SS] can not find a session. please check")));
|
|
}
|
|
|
|
fs_cxt->quickFetchIndex = cur_index + 1;
|
|
if (fs_cxt->quickFetchIndex >= fs_cxt->fake_session_cnt) {
|
|
fs_cxt->quickFetchIndex = 0;
|
|
}
|
|
SpinLockRelease(&fs_cxt->lock);
|
|
*db_handle_index = cur_index;
|
|
return &g_instance.proc_base->allProcs[cur_index + fs_cxt->session_start];
|
|
}
|
|
|
|
static void CBReleaseHandle(void *db_handle)
|
|
{
|
|
ss_fake_seesion_context_t *fs_cxt = &g_instance.dms_cxt.SSFakeSessionCxt;
|
|
int index = ((char*)db_handle - (char*)&g_instance.proc_base->allProcs[fs_cxt->session_start]) / sizeof(PGPROC*);
|
|
fs_cxt->fake_sessions[index] = false;
|
|
}
|
|
|
|
static char *CBMemAlloc(void *context, unsigned int size)
|
|
{
|
|
char *ptr = NULL;
|
|
MemoryContext old_cxt = MemoryContextSwitchTo(t_thrd.dms_cxt.msgContext);
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
|
|
PG_TRY();
|
|
{
|
|
ptr = (char *)palloc(size);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
if (t_thrd.role == DMS_WORKER) {
|
|
FlushErrorState();
|
|
}
|
|
}
|
|
PG_END_TRY();
|
|
|
|
(void)MemoryContextSwitchTo(old_cxt);
|
|
return ptr;
|
|
}
|
|
|
|
static void CBMemFree(void *context, void *pointer)
|
|
{
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
|
|
PG_TRY();
|
|
{
|
|
pfree(pointer);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
if (t_thrd.role == DMS_WORKER) {
|
|
FlushErrorState();
|
|
}
|
|
}
|
|
PG_END_TRY();
|
|
}
|
|
|
|
static void CBMemReset(void *context)
|
|
{
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
PG_TRY();
|
|
{
|
|
MemoryContextReset(t_thrd.dms_cxt.msgContext);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
if (t_thrd.role == DMS_WORKER) {
|
|
FlushErrorState();
|
|
}
|
|
}
|
|
PG_END_TRY();
|
|
}
|
|
|
|
/*
 * Handle a broadcast DDL lock request.
 *
 * data/len: the message, which must be exactly one SSBroadcastDDLLock.
 *
 * Returns DMS_ERROR for a malformed message, or when a waiting request
 * (dontWait == false) ends with LOCKACQUIRE_NOT_AVAIL; DMS_SUCCESS otherwise.
 * An error raised by LockAcquire is treated as LOCKACQUIRE_NOT_AVAIL.
 */
static int32 CBProcessLockAcquire(char *data, uint32 len)
{
    if (unlikely(len != sizeof(SSBroadcastDDLLock))) {
        ereport(DEBUG1, (errmodule(MOD_DMS), errmsg("[SS] invalid broadcast ddl lock message")));
        return DMS_ERROR;
    }

    SSBroadcastDDLLock *lock_msg = (SSBroadcastDDLLock *)data;
    LockAcquireResult acquire_res;
    uint32 saved_holdoff = t_thrd.int_cxt.InterruptHoldoffCount;

    PG_TRY();
    {
        acquire_res = LockAcquire(&(lock_msg->locktag), lock_msg->lockmode, false, lock_msg->dontWait);
    }
    PG_CATCH();
    {
        t_thrd.int_cxt.InterruptHoldoffCount = saved_holdoff;
        acquire_res = LOCKACQUIRE_NOT_AVAIL;
        ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS] Standby process DDLLockAccquire got in PG_CATCH")));
        if (t_thrd.role == DMS_WORKER) {
            FlushErrorState();
        }
    }
    PG_END_TRY();

    /* only a blocking request that still came back empty-handed is a failure */
    if (lock_msg->dontWait || acquire_res != LOCKACQUIRE_NOT_AVAIL) {
        return DMS_SUCCESS;
    }

    ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS] process DDLLockAccquire request failed!")));
    return DMS_ERROR;
}
|
|
|
|
/*
 * Handle a broadcast DDL lock release.
 *
 * data/len: the message, which must be exactly one SSBroadcastDDLLock.
 *
 * Returns DMS_ERROR for a malformed message or when LockRelease raised an
 * error; DMS_SUCCESS otherwise. The boolean result of LockRelease is ignored.
 */
static int32 CBProcessLockRelease(char *data, uint32 len)
{
    if (unlikely(len != sizeof(SSBroadcastDDLLock))) {
        ereport(DEBUG1, (errmodule(MOD_DMS), errmsg("[SS lock] invalid lock release message")));
        return DMS_ERROR;
    }

    int status = DMS_SUCCESS;
    SSBroadcastDDLLock *lock_msg = (SSBroadcastDDLLock *)data;
    uint32 saved_holdoff = t_thrd.int_cxt.InterruptHoldoffCount;

    PG_TRY();
    {
        (void)LockRelease(&(lock_msg->locktag), lock_msg->lockmode, lock_msg->sessionlock);
    }
    PG_CATCH();
    {
        t_thrd.int_cxt.InterruptHoldoffCount = saved_holdoff;
        status = DMS_ERROR;
        ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS lock] process DDLLockRelease request failed!")));
        if (t_thrd.role == DMS_WORKER) {
            FlushErrorState();
        }
    }
    PG_END_TRY();

    return status;
}
|
|
|
|
/*
 * Handle a broadcast "release all DDL locks" command.
 *
 * len: message length, which must equal sizeof(SSBroadcastCmdOnly).
 *
 * Runs LockErrorCleanup and then releases all locks of the default and user
 * lock methods. Returns DMS_ERROR for a malformed message or when any of
 * those calls raised an error; DMS_SUCCESS otherwise.
 */
static int32 CBProcessReleaseAllLock(uint32 len)
{
    if (unlikely(len != sizeof(SSBroadcastCmdOnly))) {
        return DMS_ERROR;
    }

    int status = DMS_SUCCESS;
    uint32 saved_holdoff = t_thrd.int_cxt.InterruptHoldoffCount;

    PG_TRY();
    {
        LockErrorCleanup();
        LockReleaseAll(DEFAULT_LOCKMETHOD, true);
        LockReleaseAll(USER_LOCKMETHOD, true);
    }
    PG_CATCH();
    {
        t_thrd.int_cxt.InterruptHoldoffCount = saved_holdoff;
        status = DMS_ERROR;
        ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS lock] process DDLLockReleaseAll request failed!")));
        if (t_thrd.role == DMS_WORKER) {
            FlushErrorState();
        }
    }
    PG_END_TRY();

    return status;
}
|
|
|
|
/*
 * Dispatch a broadcast message received through DMS to its handler.
 *
 * db_handle: not used here.
 * broad_ctx: carries the message (data/len) and the reply buffer
 *            (output_msg/output_msg_len). The message starts with an
 *            SSBroadcastOp that selects the handler.
 *
 * Returns the handler's result. Returns DMS_ERROR when the message is too
 * short to hold an op code, when the op code is unknown, or when the handler
 * raised an error. *output_msg_len is reset to 0 before dispatching.
 */
static int32 CBProcessBroadcast(void *db_handle, dms_broadcast_context_t *broad_ctx)
{
    char *data = broad_ctx->data;
    unsigned int len = broad_ctx->len;
    char *output_msg = broad_ctx->output_msg;
    unsigned int *output_msg_len = broad_ctx->output_msg_len;
    int32 ret = DMS_SUCCESS;

    *output_msg_len = 0;

    /* the op code is read from the head of the message, so it must be present */
    if (data == NULL || len < sizeof(SSBroadcastOp)) {
        ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS] invalid broadcast operate type")));
        return DMS_ERROR;
    }
    SSBroadcastOp bcast_op = *(SSBroadcastOp *)data;

    uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;

    PG_TRY();
    {
        switch (bcast_op) {
            case BCAST_GET_XMIN:
                ret = SSGetOldestXmin(data, len, output_msg, output_msg_len);
                break;
            case BCAST_SI:
                ret = SSProcessSharedInvalMsg(data, len);
                break;
            case BCAST_SEGDROPTL:
                ret = SSProcessSegDropTimeline(data, len);
                break;
            case BCAST_DROP_REL_ALL_BUFFER:
                ret = SSProcessDropRelAllBuffer(data, len);
                break;
            case BCAST_DROP_REL_RANGE_BUFFER:
                ret = SSProcessDropRelRangeBuffer(data, len);
                break;
            case BCAST_DROP_DB_ALL_BUFFER:
                ret = SSProcessDropDBAllBuffer(data, len);
                break;
            case BCAST_DROP_SEG_SPACE:
                ret = SSProcessDropSegSpace(data, len);
                break;
            case BCAST_DDLLOCK:
                ret = CBProcessLockAcquire(data, len);
                break;
            case BCAST_DDLLOCKRELEASE:
                ret = CBProcessLockRelease(data, len);
                break;
            case BCAST_DDLLOCKRELEASE_ALL:
                ret = CBProcessReleaseAllLock(len);
                break;
            case BCAST_CHECK_DB_BACKENDS:
                ret = SSCheckDbBackends(data, len, output_msg, output_msg_len);
                break;
            case BCAST_SEND_SNAPSHOT:
                ret = SSUpdateLatestSnapshotOfStandby(data, len, output_msg, output_msg_len);
                break;
            case BCAST_RELOAD_REFORM_CTRL_PAGE:
                ret = SSReloadReformCtrlPage(len);
                break;
            case BCAST_REALTIME_BUILD_LOG_CTRL_ENABLE:
                ret = SSUpdateRealtimeBuildLogCtrl(data, len);
                break;
            case BCAST_REPORT_REALTIME_BUILD_PTR:
                ret = SSGetStandbyRealtimeBuildPtr(data, len);
                break;
            default:
                ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS] invalid broadcast operate type")));
                ret = DMS_ERROR;
                break;
        }
    }
    PG_CATCH();
    {
        t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
        /*
         * The handler did not finish, so the assignment to ret never happened.
         * Report the failure explicitly instead of returning the initial DMS_SUCCESS.
         */
        ret = DMS_ERROR;
        ereport(WARNING, (errmodule(MOD_DMS),
            errmsg("[SS] process broadcast failed, operate type: %d", (int)bcast_op)));
        if (t_thrd.role == DMS_WORKER) {
            FlushErrorState();
        }
    }
    PG_END_TRY();

    return ret;
}
|
|
|
|
/*
 * Dispatch a broadcast acknowledgement to its handler. The message in
 * broad_ctx starts with an SSBroadcastOpAck that selects the handler.
 * Returns the handler's result, or DMS_ERROR for an unknown ack type.
 * db_handle is not used.
 */
static int32 CBProcessBroadcastAck(void *db_handle, dms_broadcast_context_t *broad_ctx)
{
    char *msg = broad_ctx->data;
    unsigned int msg_len = broad_ctx->len;
    SSBroadcastOpAck ack_type = *(SSBroadcastOpAck *)msg;

    switch (ack_type) {
        case BCAST_GET_XMIN_ACK:
            return SSGetOldestXminAck((SSBroadcastXminAck *)msg);
        case BCAST_CHECK_DB_BACKENDS_ACK:
            return SSCheckDbBackendsAck(msg, msg_len);
        default:
            break;
    }

    ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS] invalid broadcast ack type")));
    return DMS_ERROR;
}
|
|
|
|
static int CBGetDmsStatus(void *db_handle)
|
|
{
|
|
return (int)g_instance.dms_cxt.dms_status;
|
|
}
|
|
|
|
static void CBSetDmsStatus(void *db_handle, int dms_status)
|
|
{
|
|
g_instance.dms_cxt.dms_status = (dms_status_t)dms_status;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] set dms status: dmsStatus=%d.", g_instance.dms_cxt.dms_status)));
|
|
}
|
|
|
|
/*
 * Send one buffer's control information to DMS so that its DRC entry can be
 * rebuilt. The request carries a copy of the buffer's DMS control block, the
 * buffer's LSN and its dirty flag.
 *
 * Returns the result of dms_buf_res_rebuild_drc_parallel; a failure is logged
 * at WARNING.
 */
static int32 SSBufRebuildOneDrcInternal(BufferDesc *buf_desc, unsigned char thread_index)
{
    dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id);
#ifdef USE_ASSERT_CHECKING
    if (IsSegmentPhysicalRelNode(buf_desc->tag.rnode)) {
        SegNetPageCheckDiskLSN(buf_desc, RBM_NORMAL, NULL);
    } else {
        SmgrNetPageCheckDiskLSN(buf_desc, RBM_NORMAL, NULL);
    }

    Assert(buf_ctrl != NULL);
    Assert(buf_ctrl->is_edp != 1);
    Assert(XLogRecPtrIsValid(g_instance.dms_cxt.ckptRedo));
#endif
    dms_context_t dms_ctx;
    InitDmsBufContext(&dms_ctx, buf_desc->tag);

    dms_ctrl_info_t rebuild_info = { 0 };
    rebuild_info.ctrl = *buf_ctrl;
    rebuild_info.lsn = (unsigned long long)BufferGetLSN(buf_desc);
    rebuild_info.is_dirty = SSBufferIsDirty(buf_desc);

    int rc = dms_buf_res_rebuild_drc_parallel(&dms_ctx, &rebuild_info, thread_index);
    if (rc != DMS_SUCCESS) {
        ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform][%u/%u/%u/%d %d-%u] rebuild page: failed.",
            buf_desc->tag.rnode.spcNode, buf_desc->tag.rnode.dbNode, buf_desc->tag.rnode.relNode,
            buf_desc->tag.rnode.bucketNode, buf_desc->tag.forkNum, buf_desc->tag.blockNum)));
    }
    return rc;
}
|
|
|
|
static int SSBufRebuildOneDrc(int index, unsigned char thread_index)
|
|
{
|
|
BufferDesc *buf_desc = GetBufferDescriptor(index);
|
|
dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(index);
|
|
(void)SSPinBuffer(buf_desc);
|
|
bool need_rebuild = true;
|
|
LWLockAcquire((LWLock*)buf_ctrl->ctrl_lock, LW_EXCLUSIVE);
|
|
bool is_owner = DMS_BUF_CTRL_IS_OWNER(buf_ctrl);
|
|
LWLockRelease((LWLock*)buf_ctrl->ctrl_lock);
|
|
if (is_owner) {
|
|
uint64 buf_state = pg_atomic_read_u64(&buf_desc->state);
|
|
if (BUF_STATE_GET_REFCOUNT(buf_state) > 1) {
|
|
need_rebuild = true;
|
|
} else if (LWLockConditionalAcquire(buf_desc->content_lock, LW_SHARED)) {
|
|
if (!SSBufferIsDirty(buf_desc)) {
|
|
LWLockAcquire((LWLock*)buf_ctrl->ctrl_lock, LW_EXCLUSIVE);
|
|
buf_ctrl->lock_mode = DMS_LOCK_NULL;
|
|
LWLockRelease((LWLock*)buf_ctrl->ctrl_lock);
|
|
need_rebuild = false;
|
|
ereport(DEBUG5, (errmodule(MOD_DMS), errmsg("[SS reform][%u/%u/%u/%d %d-%u] no need rebuild, set lock_mode NULL.",
|
|
buf_desc->tag.rnode.spcNode, buf_desc->tag.rnode.dbNode, buf_desc->tag.rnode.relNode,
|
|
buf_desc->tag.rnode.bucketNode, buf_desc->tag.forkNum, buf_desc->tag.blockNum)));
|
|
}
|
|
LWLockRelease(buf_desc->content_lock);
|
|
}
|
|
} else {
|
|
need_rebuild = false;
|
|
}
|
|
|
|
if (need_rebuild) {
|
|
int ret = SSBufRebuildOneDrcInternal(buf_desc, thread_index);
|
|
SSUnPinBuffer(buf_desc);
|
|
return ret;
|
|
}
|
|
SSUnPinBuffer(buf_desc);
|
|
return DMS_SUCCESS;
|
|
}
|
|
|
|
/*
 * Rebuild DRC entries for the len buffers starting at buffer id begin.
 * Stops at, and returns, the first non-success result; otherwise logs the
 * processed range and returns GS_SUCCESS.
 */
static int32 CBBufRebuildDrcInternal(int begin, int len, unsigned char thread_index)
{
    Assert(begin >= 0 && len > 0 && (begin + len) <= TOTAL_BUFFER_NUM);

    const int last = begin + len - 1;
    int buf_id = begin;
    while (buf_id <= last) {
        int rc = SSBufRebuildOneDrc(buf_id, thread_index);
        if (rc != DMS_SUCCESS) {
            return rc;
        }
        buf_id++;
    }

    ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] rebuild page: success."
        "rebuild buf thread_index:%d, buf_if start from:%d to:%d, max_buf_id:%d.",
        (int)thread_index, begin, last, (TOTAL_BUFFER_NUM - 1))));
    return GS_SUCCESS;
}
|
|
|
|
/*
|
|
* As you can see, thread_num represents the number of thread. thread_index reprsents the n-th thread, begin from 0.
|
|
* special case:
|
|
* when parallel disable, rebuild phase still call this function,
|
|
* do you think thread_num is 1, and thread_index is 0 ?
|
|
* actually thread_num and thread_index are 255. It just a agreement in DMS
|
|
*/
|
|
const int dms_invalid_thread_index = 255;
|
|
const int dms_invalid_thread_num = 255;
|
|
static void CBAllocBufRangeForThread(unsigned char thread_index, unsigned char thread_num,
|
|
int* buf_begin, int* buf_num)
|
|
{
|
|
Assert((thread_index == dms_invalid_thread_index && thread_num == dms_invalid_thread_num) ||
|
|
(thread_index != dms_invalid_thread_index && thread_num != dms_invalid_thread_num &&
|
|
thread_index < thread_num));
|
|
int num = TOTAL_BUFFER_NUM / thread_num;
|
|
int begin = thread_index * num;
|
|
if (thread_index == thread_num - 1) {
|
|
num = TOTAL_BUFFER_NUM - begin;
|
|
}
|
|
|
|
if (thread_index == dms_invalid_thread_index && thread_num == dms_invalid_thread_num) {
|
|
begin = 0;
|
|
num = TOTAL_BUFFER_NUM;
|
|
}
|
|
*buf_begin = begin;
|
|
*buf_num = num;
|
|
}
|
|
|
|
/*
 * DMS callback: rebuild the DRC entries for the slice of the buffer pool
 * assigned to this thread. db_handle is not used.
 */
static int32 CBBufRebuildDrcParallel(void* db_handle, unsigned char thread_index, unsigned char thread_num)
{
    int first_buf = 0;
    int buf_count = 0;

    CBAllocBufRangeForThread(thread_index, thread_num, &first_buf, &buf_count);
    return CBBufRebuildDrcInternal(first_buf, buf_count, thread_index);
}
|
|
|
|
/*
 * DMS callback: reload the control file from the primary instance, then run
 * BufValidateDrc on every buffer whose header has BM_VALID or BM_TAG_VALID
 * set. Each buffer header is locked while it is examined. Always returns
 * GS_SUCCESS. db_handle is not used.
 */
static int32 CBDrcBufValidate(void *db_handle)
{
    /* Load Control File */
    int primary_id = SSGetPrimaryInstId();
    SSReadControlFile(primary_id, true);

    ereport(LOG, (errmodule(MOD_DMS),
        errmsg("[SS reform] CBDrcBufValidate starts before reform done.")));

    int validated = 0;
    for (int buf_id = 0; buf_id < TOTAL_BUFFER_NUM; buf_id++) {
        BufferDesc *buf_desc = GetBufferDescriptor(buf_id);
        uint64 buf_state = LockBufHdr(buf_desc);

        if ((buf_state & (BM_VALID | BM_TAG_VALID)) != 0) {
            BufValidateDrc(buf_desc);
            validated++;
        }
        UnlockBufHdr(buf_desc, buf_state);
    }

    ereport(LOG, (errmodule(MOD_DMS),
        errmsg("[SS reform] CBDrcBufValidate %d buffers success.", validated)));
    return GS_SUCCESS;
}
|
|
|
|
// used for find bufferdesc in dms
|
|
// no need WaitIO to check valid bit is set or not, we use spinlock to guarantee to lock_mode
|
|
static BufferDesc* SSGetBufferDesc(char *pageid)
|
|
{
|
|
int buf_id;
|
|
BufferTag *tag = (BufferTag *)pageid;
|
|
BufferDesc *buf_desc = NULL;
|
|
RelFileNode relfilenode = tag->rnode;
|
|
uint32 hash = BufTableHashCode(tag);
|
|
bool retry = false;
|
|
|
|
#ifdef USE_ASSERT_CHECKING
|
|
if (IsSegmentPhysicalRelNode(relfilenode)) {
|
|
SegSpace *spc = spc_open(relfilenode.spcNode, relfilenode.dbNode, false, false);
|
|
BlockNumber spc_nblocks = spc_size(spc, relfilenode.relNode, tag->forkNum);
|
|
if (tag->blockNum >= spc_nblocks) {
|
|
ereport(PANIC, (errmodule(MOD_DMS),
|
|
errmsg("[SS] unexpected blocknum %u >= spc nblocks %u", tag->blockNum, spc_nblocks)));
|
|
}
|
|
}
|
|
#endif
|
|
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
PG_TRY();
|
|
{
|
|
do {
|
|
buf_id = BufTableLookup(tag, hash);
|
|
if (buf_id < 0) {
|
|
buf_desc = NULL;
|
|
break;
|
|
}
|
|
|
|
buf_desc = GetBufferDescriptor(buf_id);
|
|
(void)SSPinBuffer(buf_desc);
|
|
if (!BUFFERTAGS_PTR_EQUAL(&buf_desc->tag, tag)) {
|
|
SSUnPinBuffer(buf_desc);
|
|
buf_desc = NULL;
|
|
retry = true;
|
|
} else {
|
|
retry = false;
|
|
}
|
|
} while (retry);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
ReleaseResource();
|
|
}
|
|
PG_END_TRY();
|
|
return buf_desc;
|
|
}
|
|
|
|
static int CBConfirmConverting(void *db_handle, char *pageid, unsigned char smon_chk,
|
|
unsigned char *lock_mode, unsigned long long *edp_map, unsigned long long *lsn)
|
|
{
|
|
*lsn = 0; // lsn not used in dms, so need to waste time to PageGetLSN
|
|
*edp_map = 0;
|
|
|
|
BufferDesc *buf_desc = SSGetBufferDesc(pageid);
|
|
if (buf_desc == NULL) {
|
|
*lock_mode = (uint8)DMS_LOCK_NULL;
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id);
|
|
LWLockAcquire((LWLock*)buf_ctrl->ctrl_lock, LW_EXCLUSIVE);
|
|
*lock_mode = buf_ctrl->lock_mode;
|
|
#ifdef USE_ASSERT_CHECKING
|
|
if (buf_ctrl->is_edp) {
|
|
BufferTag *tag = &buf_desc->tag;
|
|
ereport(PANIC, (errmsg("[SS][%u/%u/%u/%d %d-%u] CBConfirmConverting, do not allow edp exist, please check.",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
|
|
tag->forkNum, tag->blockNum)));
|
|
}
|
|
#endif
|
|
LWLockRelease((LWLock*)buf_ctrl->ctrl_lock);
|
|
SSUnPinBuffer(buf_desc);
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBGetStableList(void *db_handle, unsigned long long *list_stable, unsigned char *reformer_id)
|
|
{
|
|
*list_stable = g_instance.dms_cxt.SSReformerControl.list_stable;
|
|
*reformer_id = (uint8)g_instance.dms_cxt.SSReformerControl.primaryInstId;
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBStartup(void *db_handle)
|
|
{
|
|
g_instance.dms_cxt.SSRecoveryInfo.ready_to_startup = true;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] initialize startup: Node %d set ready_to_startup to true.",
|
|
SS_MY_INST_ID)));
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBRecoveryStandby(void *db_handle, int inst_id)
|
|
{
|
|
Assert(inst_id == g_instance.attr.attr_storage.dms_attr.instance_id);
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] recovery: Recovery as standby start")));
|
|
|
|
if (!SSRecoveryNodes()) {
|
|
/*
|
|
* Because which can process failed condition is dms reform proc, so no errors can occurs here.
|
|
* If setting error is accept, That database exits before dms-reform maybe happen.
|
|
*/
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform] recovery: Fail")));
|
|
return GS_ERROR;
|
|
}
|
|
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBRecoveryPrimary(void *db_handle, int inst_id)
|
|
{
|
|
char* type_string = NULL;
|
|
type_string = SSGetLogHeaderTypeStr();
|
|
|
|
Assert(g_instance.dms_cxt.SSReformerControl.primaryInstId == inst_id ||
|
|
g_instance.dms_cxt.SSReformerControl.primaryInstId == -1);
|
|
g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy = false;
|
|
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("%s recovery: Recovery as primary start, will replay xlog "
|
|
"from inst:%d", type_string, g_instance.dms_cxt.SSReformerControl.primaryInstId)));
|
|
|
|
/* Release my own lock before recovery */
|
|
SSLockReleaseAll();
|
|
SSWakeupRecovery();
|
|
if (!SSRecoveryNodes()) {
|
|
g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = true;
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("%s recovery: Failed. pmstate=%d, SSClusterState=%d, "
|
|
"demotion=%d-%d, rec=%d", type_string, pmState, g_instance.dms_cxt.SSClusterState,
|
|
g_instance.demotion, t_thrd.walsender_cxt.WalSndCtl->demotion, t_thrd.xlog_cxt.InRecovery)));
|
|
return GS_ERROR;
|
|
}
|
|
|
|
g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = true;
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBFlushCopy(void *db_handle, char *pageid)
|
|
{
|
|
/*
|
|
* only two occasions
|
|
* 1) primary restart: SS_REFORM_REFORMER, dms_status not is DMS_STATUS_IN.
|
|
* 2) failover need flush_copy
|
|
*/
|
|
if (SS_REFORM_REFORMER && g_instance.dms_cxt.dms_status == DMS_STATUS_IN && !SS_STANDBY_FAILOVER) {
|
|
return GS_SUCCESS;
|
|
}
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] flush copy start: This step is occurs"
|
|
"only in primary restart and flush copy.")));
|
|
|
|
if (SS_REFORM_REFORMER && !g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy) {
|
|
g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy = true;
|
|
smgrcloseall();
|
|
}
|
|
|
|
BufferTag* tag = (BufferTag*)pageid;
|
|
Buffer buffer;
|
|
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
PG_TRY();
|
|
{
|
|
buffer = SSReadBuffer(tag, RBM_NORMAL);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
/* Save error info */
|
|
ErrorData* edata = CopyErrorData();
|
|
FlushErrorState();
|
|
FreeErrorData(edata);
|
|
ereport(PANIC, (errmodule(MOD_DMS), errmsg("[SS reform][%u/%u/%u/%d %d-%u] flush copy: Error happend",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
|
|
tag->forkNum, tag->blockNum)));
|
|
}
|
|
PG_END_TRY();
|
|
|
|
if (BufferIsInvalid(buffer)) {
|
|
if (dms_reform_failed()) {
|
|
SSWaitStartupExit();
|
|
return GS_ERROR;
|
|
} else {
|
|
Assert(0);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* when remote DB instance reboot, this round reform fail
|
|
* primary node may fail to get page from remote node which reboot, this phase should return fail
|
|
*/
|
|
Assert(XLogRecPtrIsValid(g_instance.dms_cxt.ckptRedo));
|
|
LockBuffer(buffer, BUFFER_LOCK_SHARE);
|
|
if (t_thrd.dms_cxt.flush_copy_get_page_failed) {
|
|
t_thrd.dms_cxt.flush_copy_get_page_failed = false;
|
|
SSWaitStartupExit();
|
|
return GS_ERROR;
|
|
}
|
|
BufferDesc* buf_desc = GetBufferDescriptor(buffer - 1);
|
|
XLogRecPtr pagelsn = BufferGetLSN(buf_desc);
|
|
if (XLByteLT(g_instance.dms_cxt.ckptRedo, pagelsn)) {
|
|
dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buffer - 1);
|
|
buf_ctrl->state |= BUF_DIRTY_NEED_FLUSH;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][%u/%u/%u/%d %d-%u] mark need flush in flush copy:"
|
|
"page lsn (0x%llx), buf_ctrl.state: %lu",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
|
|
tag->forkNum, tag->blockNum, (unsigned long long)pagelsn, buf_ctrl->state)));
|
|
} else {
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][%u/%u/%u/%d %d-%u] ready to flush copy, page lsn (0x%llx)",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode,
|
|
tag->forkNum, tag->blockNum, (unsigned long long)pagelsn)));
|
|
}
|
|
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
|
ReleaseBuffer(buffer);
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static void SSFailoverPromoteNotify()
|
|
{
|
|
if (g_instance.dms_cxt.SSRecoveryInfo.startup_reform) {
|
|
g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = true;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] SSFailoverPromoteNotify:"
|
|
"set restart_failover_flag to %s when DB restart.",
|
|
g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag ? "true" : "false")));
|
|
} else {
|
|
SendPostmasterSignal(PMSIGNAL_DMS_FAILOVER_STARTUP);
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] SSFailoverPromoteNotify:"
|
|
"send signal to PM to initialize startup thread when DB alive")));
|
|
}
|
|
}
|
|
|
|
static int CBFailoverPromote(void *db_handle)
|
|
{
|
|
SSClearSegCache();
|
|
SSFailoverPromoteNotify();
|
|
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] wait startup thread start.")));
|
|
long max_wait_time = 30000000L;
|
|
long wait_time = 0;
|
|
while (true) {
|
|
if (SS_STANDBY_FAILOVER && g_instance.pid_cxt.StartupPID != 0) {
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] startup thread success.")));
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
if ((wait_time % max_wait_time) == 0 && wait_time != 0) {
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] wait startup thread to"
|
|
"start successfully for %ld us.", wait_time)));
|
|
}
|
|
pg_usleep(REFORM_WAIT_TIME);
|
|
wait_time += REFORM_WAIT_TIME;
|
|
}
|
|
}
|
|
|
|
static int CBGetDBPrimaryId(void *db_handle, unsigned int *primary_id)
|
|
{
|
|
*primary_id = (unsigned int)g_instance.dms_cxt.SSReformerControl.primaryInstId;
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
/*
|
|
* Currently only used in SS switchover. To prevent state machine misjudgement,
|
|
* DSS status, dms_role, SSClusterState must be set atommically.
|
|
* DSS recommends we retry dss_set_server_status if it failed.
|
|
*/
|
|
static void CBReformSetDmsRole(void *db_handle, unsigned int reformer_id)
|
|
{
|
|
ss_reform_info_t *reform_info = &g_instance.dms_cxt.SSReformInfo;
|
|
dms_role_t new_dms_role = reformer_id == (unsigned int)SS_MY_INST_ID ? DMS_ROLE_REFORMER : DMS_ROLE_PARTNER;
|
|
if (new_dms_role == DMS_ROLE_REFORMER) {
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS switchover] begin to set currrent DSS as primary")));
|
|
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT;
|
|
SSGrantDSSWritePermission();
|
|
g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTING;
|
|
}
|
|
|
|
reform_info->dms_role = new_dms_role;
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS reform][SS switchover]role and lock switched, updated inst:%d with role:%d success",
|
|
SS_MY_INST_ID, reform_info->dms_role)));
|
|
/* we need change ha cur mode for switchover in ss double cluster here */
|
|
if (SS_DISASTER_CLUSTER) {
|
|
SSDisasterUpdateHAmode();
|
|
}
|
|
}
|
|
|
|
static void ReformCleanBackends()
|
|
{
|
|
/* cluster has no transactions during startup reform */
|
|
if (!g_instance.dms_cxt.SSRecoveryInfo.startup_reform) {
|
|
SendPostmasterSignal(PMSIGNAL_DMS_REFORM);
|
|
}
|
|
|
|
long max_wait_time = 60000000L;
|
|
long wait_time = 0;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] wait backends to exit")));
|
|
while (true) {
|
|
if (dms_reform_failed()) {
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform] reform failed during caneling backends")));
|
|
return;
|
|
}
|
|
if (g_instance.dms_cxt.SSRecoveryInfo.reform_ready || g_instance.dms_cxt.SSRecoveryInfo.startup_reform) {
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] reform ready, backends have been terminated successfully."
|
|
" pmState=%d, SSClusterState=%d, demotion=%d-%d, rec=%d",
|
|
pmState, g_instance.dms_cxt.SSClusterState, g_instance.demotion,
|
|
t_thrd.walsender_cxt.WalSndCtl->demotion, t_thrd.xlog_cxt.InRecovery)));
|
|
return;
|
|
}
|
|
|
|
if (wait_time > max_wait_time) {
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform] reform failed, backends can not exit")));
|
|
/* check and print some thread which no exit. */
|
|
SSCountAndPrintChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC);
|
|
SSProcessForceExit();
|
|
}
|
|
|
|
pg_usleep(REFORM_WAIT_TIME);
|
|
wait_time += REFORM_WAIT_TIME;
|
|
}
|
|
}
|
|
|
|
static void FailoverCleanBackends()
|
|
{
|
|
if (g_instance.dms_cxt.SSRecoveryInfo.startup_reform) {
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] FailoverCleanBackends:"
|
|
"no need to clean backends.")));
|
|
return;
|
|
}
|
|
|
|
if (ENABLE_ONDEMAND_REALTIME_BUILD && SS_STANDBY_MODE) {
|
|
OnDemandWaitRealtimeBuildShutDownInPartnerFailover();
|
|
}
|
|
|
|
/**
|
|
* for failover:
|
|
* Ensure one round of failover and clean up all backend threads
|
|
* step 1, sned signal to tell thread to exit
|
|
* step 2, PM detected backend exit
|
|
* step 3, reform proc wait
|
|
*/
|
|
g_instance.dms_cxt.SSRecoveryInfo.no_backend_left = false;
|
|
SendPostmasterSignal(PMSIGNAL_DMS_FAILOVER_TERM_BACKENDS);
|
|
long wait_time = 0;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] wait backends to exit")));
|
|
while (true) {
|
|
if (g_instance.dms_cxt.SSRecoveryInfo.no_backend_left && !CheckpointInProgress()) {
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] backends exit successfully, "
|
|
"wait_time = %ds", wait_time / FAILOVER_TIME_CONVERT)));
|
|
break;
|
|
}
|
|
|
|
/* check and print some thread which no exit. */
|
|
int backendNum = SSCountAndPrintChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC);
|
|
ereport (WARNING, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] there are %d backends can not exit! "
|
|
"wait_time = %lds", backendNum, wait_time / FAILOVER_TIME_CONVERT)));
|
|
|
|
if (dms_reform_failed()) {
|
|
ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] reform failed during clean backends")));
|
|
return;
|
|
}
|
|
|
|
pg_usleep(REFORM_WAIT_TIME);
|
|
wait_time += REFORM_WAIT_TIME;
|
|
}
|
|
}
|
|
|
|
static void RestartRealtimeBuildCtrl()
|
|
{
|
|
if (SS_IN_REFORM && g_instance.dms_cxt.SSRecoveryInfo.enableRealtimeBuildLogCtrl) {
|
|
ereport(LOG, (errmsg("[SS reform][On-demand] reform happened, disable realtime build log ctrl, "
|
|
"and will make it enable again after reform if needed.")));
|
|
}
|
|
g_instance.dms_cxt.SSRecoveryInfo.enableRealtimeBuildLogCtrl = false;
|
|
SpinLockInit(&g_instance.dms_cxt.SSRecoveryInfo.sleepTimeSyncLock);
|
|
g_instance.dms_cxt.SSRecoveryInfo.globalSleepTime = 0;
|
|
errno_t rc = memset_s(g_instance.dms_cxt.SSRecoveryInfo.rtBuildCtrl,
|
|
sizeof(g_instance.dms_cxt.SSRecoveryInfo.rtBuildCtrl),
|
|
0,
|
|
sizeof(g_instance.dms_cxt.SSRecoveryInfo.rtBuildCtrl));
|
|
securec_check(rc, "", "");
|
|
if (SS_PRIMARY_MODE && ENABLE_REALTIME_BUILD_TARGET_RTO) {
|
|
SSBroadcastRealtimeBuildLogCtrlEnable(false);
|
|
}
|
|
}
|
|
|
|
static int reform_type_str_len = 30;
|
|
static void ReformTypeToString(dms_reform_type_t reform_type, char* ret_str)
|
|
{
|
|
switch (reform_type)
|
|
{
|
|
case DMS_REFORM_TYPE_FOR_NORMAL_OPENGAUSS:
|
|
strcpy_s(ret_str, reform_type_str_len, "normal reform");
|
|
break;
|
|
case DMS_REFORM_TYPE_FOR_FAILOVER_OPENGAUSS:
|
|
strcpy_s(ret_str, reform_type_str_len, "failover reform");
|
|
break;
|
|
case DMS_REFORM_TYPE_FOR_SWITCHOVER_OPENGAUSS:
|
|
strcpy_s(ret_str, reform_type_str_len, "switchover reform");
|
|
break;
|
|
case DMS_REFORM_TYPE_FOR_FULL_CLEAN:
|
|
strcpy_s(ret_str, reform_type_str_len, "full clean reform");
|
|
break;
|
|
default:
|
|
strcpy_s(ret_str, reform_type_str_len, "unknown");
|
|
break;
|
|
}
|
|
return;
|
|
}
|
|
|
|
static void SSXminInfoPrepare()
|
|
{
|
|
ss_xmin_info_t *xmin_info = &g_instance.dms_cxt.SSXminInfo;
|
|
if (g_instance.dms_cxt.SSReformInfo.dms_role == DMS_ROLE_REFORMER) {
|
|
SpinLockAcquire(&xmin_info->global_oldest_xmin_lock);
|
|
xmin_info->prev_global_oldest_xmin = xmin_info->global_oldest_xmin;
|
|
xmin_info->global_oldest_xmin_active = false;
|
|
xmin_info->global_oldest_xmin = MaxTransactionId;
|
|
SpinLockRelease(&xmin_info->global_oldest_xmin_lock);
|
|
for (int i = 0; i < DMS_MAX_INSTANCES; i++) {
|
|
ss_node_xmin_item_t *item = &xmin_info->node_table[i];
|
|
SpinLockAcquire(&item->item_lock);
|
|
item->active = false;
|
|
item->notify_oldest_xmin = MaxTransactionId;
|
|
SpinLockRelease(&item->item_lock);
|
|
}
|
|
|
|
if (!SSPerformingStandbyScenario()) {
|
|
SpinLockAcquire(&xmin_info->snapshot_available_lock);
|
|
xmin_info->snapshot_available = false;
|
|
SpinLockRelease(&xmin_info->snapshot_available_lock);
|
|
}
|
|
}
|
|
xmin_info->bitmap_active_nodes = 0;
|
|
}
|
|
|
|
/*
 * Prepare the failover-specific state at reform start.
 *
 * Does nothing unless the recorded reform type is a failover. For a failover,
 * in_failover and recovery_pause_flag are raised on every node; the reformer
 * additionally re-enters recovery state and moves to
 * NODESTATE_STANDBY_FAILOVER_PROMOTING.
 *
 * rs_cxt: reform start context; only its role is read here.
 */
static void FailoverStartNotify(dms_reform_start_context_t *rs_cxt)
{
    ss_reform_info_t *reform_info = &g_instance.dms_cxt.SSReformInfo;

    if (reform_info->reform_type != DMS_REFORM_TYPE_FOR_FAILOVER_OPENGAUSS) {
        return;
    }

    g_instance.dms_cxt.SSRecoveryInfo.in_failover = true;
    g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = true;

    if (rs_cxt->role == DMS_ROLE_REFORMER) {
        g_instance.dms_cxt.dw_init = false;

        /* variable set order: SharedRecoveryInProgress -> reform_ckpt_status -> dms_role */
        volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl;
        SpinLockAcquire(&xlogctl->info_lck);
        xlogctl->IsRecoveryDone = false;
        xlogctl->SharedRecoveryInProgress = true;
        SpinLockRelease(&xlogctl->info_lck);

        t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY;
        pg_memory_barrier();
        g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ALLOW_CKPT;
        g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_FAILOVER_PROMOTING;

        /*
         * single cluster: SET PM_WAIT_BACKENDS in check PMSIGNAL_DMS_FAILOVER_TERM_BACKENDS.
         * standby cluster of dual cluster: backends should exit here; this step is
         * brought forward instead of being done in CBFailoverPromote.
         */
        if (SS_DORADO_STANDBY_CLUSTER) {
            pmState = PM_WAIT_BACKENDS;
        }
        ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] failover trigger.")));
    }

    /* SET PM_WAIT_BACKENDS in check PMSIGNAL_DMS_FAILOVER_TERM_BACKENDS */
    ereport(LOG, (errmsg("[SS reform][SS failover] starts, pmState=%d, SSClusterState=%d, demotion=%d-%d, rec=%d",
        pmState, g_instance.dms_cxt.SSClusterState, g_instance.demotion,
        t_thrd.walsender_cxt.WalSndCtl->demotion, t_thrd.xlog_cxt.InRecovery)));
}
|
|
|
|
static void CBReformStartNotify(void *db_handle, dms_reform_start_context_t *rs_cxt)
|
|
{
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] reform start enter: pmState=%d, SSClusterState=%d, demotion=%d-%d, rec=%d",
|
|
pmState, g_instance.dms_cxt.SSClusterState, g_instance.demotion,
|
|
t_thrd.walsender_cxt.WalSndCtl->demotion, t_thrd.xlog_cxt.InRecovery)));
|
|
/*
|
|
* During db is stoping by gs_ctl or cm_ctl stop, but next round of reform starts due to error judgement.
|
|
* If db stop process is blocked or too slow, so next round of reform maybe trigger. In theory, this should
|
|
* not happened, so db have to exit by fore.
|
|
*/
|
|
if (g_instance.status >= FastShutdown) {
|
|
ereport(WARNING,
|
|
(errmodule(MOD_DMS),
|
|
errmsg("[SS reform] reform starts concurrencely when db is stoping, db exit by force now")));
|
|
_exit(0);
|
|
}
|
|
|
|
SSHandleStartupWhenReformStart(rs_cxt);
|
|
ss_reform_info_t *reform_info = &g_instance.dms_cxt.SSReformInfo;
|
|
reform_info->is_hashmap_constructed = false;
|
|
reform_info->reform_type = rs_cxt->reform_type;
|
|
g_instance.dms_cxt.SSClusterState = NODESTATE_NORMAL;
|
|
g_instance.dms_cxt.SSRecoveryInfo.reform_ready = false;
|
|
g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy = false;
|
|
g_instance.dms_cxt.SSRecoveryInfo.startup_need_exit_normally = false;
|
|
g_instance.dms_cxt.resetSyscache = true;
|
|
g_instance.dms_cxt.SSRecoveryInfo.in_failover = false;
|
|
FailoverStartNotify(rs_cxt);
|
|
|
|
reform_info->reform_start_time = GetCurrentTimestamp();
|
|
reform_info->bitmap_nodes = rs_cxt->bitmap_participated;
|
|
reform_info->bitmap_reconnect = rs_cxt->bitmap_reconnect;
|
|
reform_info->dms_role = rs_cxt->role;
|
|
IniRedoInfo();
|
|
if (!ENABLE_SS_BCAST_GETOLDESTXMIN) {
|
|
SSXminInfoPrepare();
|
|
}
|
|
reform_info->reform_ver = reform_info->reform_start_time;
|
|
reform_info->in_reform = true;
|
|
char reform_type_str[reform_type_str_len] = {0};
|
|
ReformTypeToString(reform_info->reform_type, reform_type_str);
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS reform] reform start, role:%d, reform type:SS %s, standby scenario:%d, "
|
|
"bitmap_reconnect:%llu, reform_ver:%ld.",
|
|
reform_info->dms_role, reform_type_str, SSPerformingStandbyScenario(),
|
|
reform_info->bitmap_reconnect, reform_info->reform_ver)));
|
|
if (reform_info->dms_role == DMS_ROLE_REFORMER) {
|
|
SSGrantDSSWritePermission();
|
|
}
|
|
int old_primary = SSGetPrimaryInstId();
|
|
SSReadControlFile(old_primary, true);
|
|
g_instance.dms_cxt.SSReformInfo.old_bitmap = g_instance.dms_cxt.SSReformerControl.list_stable;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] old cluster node bitmap: %lu", g_instance.dms_cxt.SSReformInfo.old_bitmap)));
|
|
|
|
g_instance.dms_cxt.SSRecoveryInfo.enableRealtimeBuildLogCtrl = false;
|
|
if (g_instance.dms_cxt.SSRecoveryInfo.in_failover) {
|
|
FailoverCleanBackends();
|
|
} else if (SSBackendNeedExitScenario()) {
|
|
ReformCleanBackends();
|
|
} else {
|
|
ProcessNoCleanBackendsScenario();
|
|
}
|
|
|
|
if (SS_DISASTER_CLUSTER && reform_info->reform_type != DMS_REFORM_TYPE_FOR_SWITCHOVER_OPENGAUSS) {
|
|
SSDisasterUpdateHAmode();
|
|
}
|
|
}
|
|
|
|
static int CBReformDoneNotify(void *db_handle)
|
|
{
|
|
if (g_instance.dms_cxt.SSRecoveryInfo.in_failover) {
|
|
g_instance.dms_cxt.SSRecoveryInfo.in_failover = false;
|
|
if (SS_REFORM_REFORMER) {
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform][SS failover] failover success, instance:%d"
|
|
" become primary.", g_instance.attr.attr_storage.dms_attr.instance_id)));
|
|
}
|
|
}
|
|
|
|
if (SS_DISASTER_CLUSTER) {
|
|
SSDisasterUpdateHAmode();
|
|
}
|
|
|
|
/* SSClusterState and in_reform must be set atomically */
|
|
g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false;
|
|
g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false;
|
|
g_instance.dms_cxt.SSRecoveryInfo.reform_ckpt_status = NOT_ACTIVE;
|
|
Assert(g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy == false);
|
|
g_instance.dms_cxt.SSReformInfo.new_bitmap = g_instance.dms_cxt.SSReformerControl.list_stable;
|
|
ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] new cluster node bitmap: %lu",
|
|
g_instance.dms_cxt.SSReformInfo.new_bitmap)));
|
|
g_instance.dms_cxt.SSReformInfo.reform_end_time = GetCurrentTimestamp();
|
|
g_instance.dms_cxt.SSReformInfo.reform_success = true;
|
|
|
|
/*
|
|
* Only two kind of condition:
|
|
* 1.Primary or standby restart in single node mode, other nodes in cluster is set PM_WAIT_REFORM.
|
|
* 2.In failover, standby no promoting as priamey is set PM_WAIT_BACKENDS.
|
|
*/
|
|
if (pmState == PM_WAIT_REFORM ||
|
|
(SS_PERFORMING_FAILOVER && SS_STANDBY_MODE && pmState == PM_WAIT_BACKENDS)) {
|
|
pmState = PM_RUN;
|
|
}
|
|
ereport(LOG,
|
|
(errmodule(MOD_DMS),
|
|
errmsg("[SS reform] Reform success, instance:%d is running.",
|
|
g_instance.attr.attr_storage.dms_attr.instance_id)));
|
|
|
|
if (ENABLE_REALTIME_BUILD_TARGET_RTO && SS_PRIMARY_MODE) {
|
|
RestartRealtimeBuildCtrl();
|
|
}
|
|
/* reform success indicates that reform of primary and standby all complete, then update gaussdb.state */
|
|
g_instance.dms_cxt.dms_status = (dms_status_t)DMS_STATUS_IN;
|
|
SendPostmasterSignal(PMSIGNAL_DMS_REFORM_DONE);
|
|
g_instance.dms_cxt.SSClusterState = NODESTATE_NORMAL;
|
|
g_instance.dms_cxt.SSRecoveryInfo.realtime_build_in_reform = false;
|
|
g_instance.dms_cxt.SSReformInfo.in_reform = false;
|
|
|
|
ereport(LOG, (errmodule(MOD_DMS),
|
|
errmsg("[SS reform] reform done: pmState=%d, SSClusterState=%d, demotion=%d-%d, "
|
|
"rec=%d, dmsStatus=%d.", pmState, g_instance.dms_cxt.SSClusterState,
|
|
g_instance.demotion, t_thrd.walsender_cxt.WalSndCtl->demotion,
|
|
t_thrd.xlog_cxt.InRecovery, g_instance.dms_cxt.dms_status)));
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBXLogWaitFlush(void *db_handle, unsigned long long lsn)
|
|
{
|
|
XLogWaitFlush(lsn);
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBDBCheckLock(void *db_handle)
|
|
{
|
|
if (t_thrd.storage_cxt.num_held_lwlocks > 0) {
|
|
TimestampTz now = GetCurrentTimestamp();
|
|
ereport(PANIC, (errmodule(MOD_DMS), errmsg("[SS lock] hold lock, lock address:%p, lock mode:%u, time:%ld ms",
|
|
t_thrd.storage_cxt.held_lwlocks[0].lock,
|
|
t_thrd.storage_cxt.held_lwlocks[0].mode,
|
|
now - t_thrd.storage_cxt.lwlock_held_times[0])));
|
|
return GS_ERROR;
|
|
}
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBCacheMsg(void *db_handle, char* msg)
|
|
{
|
|
errno_t rc = memcpy_s(t_thrd.dms_cxt.msg_backup, sizeof(t_thrd.dms_cxt.msg_backup), msg,
|
|
sizeof(t_thrd.dms_cxt.msg_backup));
|
|
securec_check(rc, "\0", "\0");
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
static int CBUpdateNodeOldestXmin(void *db_handle, uint8 inst_id, unsigned long long oldest_xmin)
|
|
{
|
|
SSUpdateNodeOldestXmin(inst_id, oldest_xmin);
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
void DmsCallbackThreadShmemInit(unsigned char need_startup, char **reg_data)
|
|
{
|
|
/* in dorado mode, we need to wait sharestorageinit finished */
|
|
while (!g_instance.dms_cxt.SSRecoveryInfo.dorado_sharestorage_inited && SS_DORADO_CLUSTER) {
|
|
pg_usleep(REFORM_WAIT_TIME);
|
|
}
|
|
IsUnderPostmaster = true;
|
|
// to add cnt, avoid postmain execute proc_exit to free shmem now
|
|
(void)pg_atomic_add_fetch_u32(&g_instance.dms_cxt.inDmsThreShmemInitCnt, 1);
|
|
|
|
// postmain execute proc_exit now, share mem maybe shdmt, exit this thread now.
|
|
if (pg_atomic_read_u32(&g_instance.dms_cxt.inProcExitCnt) > 0) {
|
|
(void)pg_atomic_sub_fetch_u32(&g_instance.dms_cxt.inDmsThreShmemInitCnt, 1);
|
|
ThreadExitCXX(0);
|
|
}
|
|
EarlyBindingTLSVariables();
|
|
MemoryContextInit();
|
|
knl_thread_init(DMS_WORKER);
|
|
*reg_data = (char *)&t_thrd;
|
|
t_thrd.fake_session = create_session_context(t_thrd.top_mem_cxt, 0);
|
|
t_thrd.fake_session->status = KNL_SESS_FAKE;
|
|
u_sess = t_thrd.fake_session;
|
|
t_thrd.proc_cxt.MyProcPid = gs_thread_self();
|
|
if (!need_startup) {
|
|
t_thrd.proc_cxt.MyProgName = "DMS WORKER";
|
|
t_thrd.dms_cxt.is_reform_proc = false;
|
|
} else {
|
|
t_thrd.proc_cxt.MyProgName = "DMS REFORM PROC";
|
|
t_thrd.dms_cxt.is_reform_proc = true;
|
|
}
|
|
t_thrd.proc_cxt.MyStartTime = time(NULL);
|
|
|
|
SelfMemoryContext = THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_DEFAULT);
|
|
/* memory context will be used by DMS message process functions */
|
|
t_thrd.dms_cxt.msgContext = AllocSetContextCreate(TopMemoryContext,
|
|
"DMSWorkerContext",
|
|
ALLOCSET_DEFAULT_MINSIZE,
|
|
ALLOCSET_DEFAULT_INITSIZE,
|
|
ALLOCSET_DEFAULT_MAXSIZE);
|
|
/* create timer with thread safe */
|
|
if (gs_signal_createtimer() < 0) {
|
|
ereport(FATAL, (errmodule(MOD_DMS), errmsg("[SS] create timer fail at thread : %lu",
|
|
t_thrd.proc_cxt.MyProcPid)));
|
|
}
|
|
CreateLocalSysDBCache();
|
|
InitShmemForDmsCallBack();
|
|
Assert(t_thrd.utils_cxt.CurrentResourceOwner == NULL);
|
|
t_thrd.utils_cxt.CurrentResourceOwner =
|
|
ResourceOwnerCreate(NULL, "dms worker", THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE));
|
|
|
|
u_sess->misc_cxt.SessionUserId = BOOTSTRAP_SUPERUSERID;
|
|
char namebuf[NAMEDATALEN] = {0};
|
|
pthread_getname_np(pthread_self(), namebuf, sizeof(namebuf));
|
|
|
|
SharedInvalBackendInit(false, false);
|
|
pgstat_initialize();
|
|
|
|
if (g_instance.attr.attr_storage.dms_attr.enable_dyn_trace) {
|
|
pgstat_bestart();
|
|
|
|
pgstat_report_appname(namebuf);
|
|
}
|
|
|
|
u_sess->attr.attr_common.Log_line_prefix = "\%m \%u \%d \%h \%p \%S ";
|
|
log_timezone = g_instance.dms_cxt.log_timezone;
|
|
(void)pg_atomic_sub_fetch_u32(&g_instance.dms_cxt.inDmsThreShmemInitCnt, 1);
|
|
t_thrd.postgres_cxt.whereToSendOutput = (int)DestNone;
|
|
}
|
|
|
|
int CBOndemandRedoPageForStandby(void *block_key, int32 *redo_status)
|
|
{
|
|
BufferTag* tag = (BufferTag *)block_key;
|
|
|
|
Assert(SS_PRIMARY_MODE);
|
|
Assert(!t_thrd.dms_cxt.in_ondemand_redo);
|
|
// do nothing if not in ondemand recovery
|
|
if (!SS_IN_ONDEMAND_RECOVERY) {
|
|
ereport(DEBUG1, (errmodule(MOD_DMS),
|
|
errmsg("[SS][On-demand] Ignore standby redo page request, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode,
|
|
tag->rnode.bucketNode, tag->forkNum, tag->blockNum)));
|
|
*redo_status = ONDEMAND_REDO_SKIP;
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
if (SS_IN_REFORM) {
|
|
ereport(DEBUG1, (errmodule(MOD_DMS),
|
|
errmsg("[SS][On-demand][%u/%u/%u/%d %d-%u] Reform happend when primary redo page for standby,"
|
|
"return ONDEMAND_REDO_FAIL.", tag->rnode.spcNode, tag->rnode.dbNode,
|
|
tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum, tag->blockNum)));
|
|
*redo_status = ONDEMAND_REDO_FAIL;
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
Buffer buffer = InvalidBuffer;
|
|
uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
|
|
*redo_status = ONDEMAND_REDO_DONE;
|
|
t_thrd.dms_cxt.in_ondemand_redo = true;
|
|
smgrcloseall();
|
|
PG_TRY();
|
|
{
|
|
buffer = SSReadBuffer(tag, RBM_NORMAL);
|
|
if (BufferIsInvalid(buffer)) {
|
|
*redo_status = ONDEMAND_REDO_FAIL;
|
|
} else {
|
|
ReleaseBuffer(buffer);
|
|
}
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
|
|
/* Save error info */
|
|
ErrorData* edata = CopyErrorData();
|
|
ereport(WARNING, (errmodule(MOD_DMS),
|
|
errmsg("[SS][On-demand][%u/%u/%u/%d %d-%u] Error happend when primary redo page for standby.",
|
|
tag->rnode.spcNode, tag->rnode.dbNode,
|
|
tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum, tag->blockNum),
|
|
errdetail("%s", edata->detail)));
|
|
FlushErrorState();
|
|
FreeErrorData(edata);
|
|
*redo_status = ONDEMAND_REDO_ERROR;
|
|
}
|
|
PG_END_TRY();
|
|
|
|
t_thrd.dms_cxt.in_ondemand_redo = false;
|
|
ereport(DEBUG1, (errmodule(MOD_DMS),
|
|
errmsg("[SS][On-demand][%u/%u/%u/%d %d-%u] Redo page for standby done. redo status: %d.",
|
|
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode,
|
|
tag->rnode.bucketNode, tag->forkNum, tag->blockNum, *redo_status)));
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
void CBGetBufInfo(char* resid, stat_buf_info_t *buf_info)
|
|
{
|
|
BufferTag tag;
|
|
errno_t err = memcpy_s(&tag, sizeof(BufferTag), resid, sizeof(BufferTag));
|
|
securec_check(err, "\0", "\0");
|
|
buftag_get_buf_info(tag, buf_info);
|
|
}
|
|
|
|
static void CBBufCtrlRecycle(void *db_handle)
|
|
{
|
|
SSTryEliminateBuf(TRY_ELIMINATE_BUF_TIMES);
|
|
}
|
|
|
|
void DmsThreadDeinit()
|
|
{
|
|
proc_exit(0);
|
|
}
|
|
|
|
int CBDoCheckpointImmediately(unsigned long long *ckpt_lsn)
|
|
{
|
|
Assert(SS_PRIMARY_MODE);
|
|
|
|
RequestCheckpoint(CHECKPOINT_IMMEDIATE);
|
|
pg_usleep(REFORM_WAIT_LONG);
|
|
LWLockAcquire(ControlFileLock, LW_SHARED);
|
|
*ckpt_lsn = (unsigned long long)t_thrd.shemem_ptr_cxt.ControlFile->checkPointCopy.redo;
|
|
LWLockRelease(ControlFileLock);
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
int CBBufCtrlRcyClean(void *db_handle, unsigned char thread_index, unsigned char thread_num)
|
|
{
|
|
int buf_begin = 0;
|
|
int buf_num = 0;
|
|
CBAllocBufRangeForThread(thread_index, thread_num, &buf_begin, &buf_num);
|
|
int buf_end = buf_begin + buf_num - 1;
|
|
for (int i = buf_begin; i <= buf_end; i++) {
|
|
dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(i);
|
|
buf_ctrl->in_rcy = false;
|
|
}
|
|
return GS_SUCCESS;
|
|
}
|
|
|
|
/*
 * Fill the DMS callback table with the database-side implementations.
 *
 * callback: table to populate. Every assignment is independent of the others;
 *           the grouping below is only for readability. log_output is
 *           explicitly left unset (NULL).
 */
void DmsInitCallback(dms_callback_t *callback)
{
    /* reform, recovery and failover */
    callback->get_list_stable = CBGetStableList;
    callback->save_list_stable = CBSaveStableList;
    callback->opengauss_startup = CBStartup;
    callback->opengauss_recovery_standby = CBRecoveryStandby;
    callback->opengauss_recovery_primary = CBRecoveryPrimary;
    callback->get_dms_status = CBGetDmsStatus;
    callback->set_dms_status = CBSetDmsStatus;
    callback->dms_reform_rebuild_parallel = CBBufRebuildDrcParallel;
    callback->confirm_converting = CBConfirmConverting;
    callback->flush_copy = CBFlushCopy;
    callback->get_db_primary_id = CBGetDBPrimaryId;
    callback->failover_promote_opengauss = CBFailoverPromote;
    callback->reform_start_notify = CBReformStartNotify;
    callback->reform_set_dms_role = CBReformSetDmsRole;
    callback->reform_done_notify = CBReformDoneNotify;
    callback->opengauss_ondemand_redo_buffer = CBOndemandRedoPageForStandby;
    callback->opengauss_do_ckpt_immediate = CBDoCheckpointImmediately;
    callback->dms_ctl_rcy_clean_parallel = CBBufCtrlRcyClean;

    /* DMS thread init / deinit */
    callback->dms_thread_init = DmsCallbackThreadShmemInit;
    callback->dms_thread_deinit = DmsThreadDeinit;

    /* switchover */
    callback->switchover_demote = CBSwitchoverDemote;
    callback->switchover_promote_opengauss = CBSwitchoverPromote;
    callback->set_switchover_result = CBSwitchoverResult;

    /* pages and buffers */
    callback->inc_and_get_srsn = CBIncAndGetSrsn;
    callback->get_page_hash_val = CBPageHashCode;
    callback->read_local_page4transfer = CBEnterLocalPage;
    callback->leave_local_page = CBLeaveLocalPage;
    callback->page_is_dirty = CBPageDirty;
    callback->get_page = CBGetPage;
    callback->set_buf_load_status = CBSetBufLoadStatus;
    callback->remove_buf_load_status = CBRemoveBufLoadStatus;
    callback->invalidate_page = CBInvalidatePage;
    callback->display_pageid = CBDisplayBufferTag;
    callback->verify_page = CBVerifyPage;
    callback->drc_validate = CBDrcBufValidate;
    callback->get_buf_info = CBGetBufInfo;
    callback->buf_ctrl_recycle = CBBufCtrlRecycle;

    /* memory */
    callback->mem_alloc = CBMemAlloc;
    callback->mem_free = CBMemFree;
    callback->mem_reset = CBMemReset;

    /* LSN and xlog */
    callback->get_page_lsn = CBGetPageLSN;
    callback->get_global_lsn = CBGetGlobalLSN;
    callback->log_flush = CBXLogFlush;
    callback->log_wait_flush = CBXLogWaitFlush;

    /* broadcast and message handling */
    callback->process_broadcast = CBProcessBroadcast;
    callback->process_broadcast_ack = CBProcessBroadcastAck;
    callback->cache_msg = CBCacheMsg;

    /* transaction information */
    callback->get_opengauss_xid_csn = CBGetTxnCSN;
    callback->get_opengauss_update_xid = CBGetUpdateXid;
    callback->get_opengauss_txn_status = CBGetTxnStatus;
    callback->opengauss_lock_buffer = CBGetCurrModeAndLockBuffer;
    callback->get_opengauss_txn_snapshot = CBGetSnapshotData;
    callback->get_opengauss_txn_of_master = CBGetTxnSwinfo;
    callback->get_opengauss_page_status = CBGetPageStatus;
    callback->update_node_oldest_xmin = CBUpdateNodeOldestXmin;

    /* handles, lock check and logging */
    callback->get_db_handle = CBGetHandle;
    callback->release_db_handle = CBReleaseHandle;
    callback->db_check_lock = CBDBCheckLock;
    callback->log_output = NULL;
}
|