Files
openGauss-server/src/gausskernel/storage/ipc/procarray.cpp
2023-12-27 11:05:52 +08:00

5539 lines
201 KiB
C++
Executable File

/* -------------------------------------------------------------------------
*
* procarray.cpp
* openGauss process array code.
*
*
* This module maintains arrays of the PGPROC and PGXACT structures for all
* active backends. Although there are several uses for this, the principal
* one is as a means of determining the set of currently running transactions.
*
* Because of various subtle race conditions it is critical that a backend
* hold the correct locks while setting or clearing its MyPgXact->xid field.
* See notes in src/backend/access/transam/README.
*
* The process arrays now also include structures representing prepared
* transactions. The xid and subxids fields of these are valid, as are the
* myProcLocks lists. They can be distinguished from regular backend PGPROCs
* at need by checking for pid == 0.
*
#ifdef PGXC
* Vanilla PostgreSQL assumes maximum TransactionIds in any snapshot is
* arrayP->maxProcs. It does not apply to XC because XC's snapshot
* should include XIDs running in other node, which may come at any
* time. This means that needed size of xip varies from time to time.
*
* This must be handled properly in all the functions in this module.
*
* The member max_xcnt was added as SnapshotData member to indicate the
* real size of xip array.
*
* Here, the following assumption is made for SnapshotData struct throughout
* this module.
*
* 1. xip member physical size is indicated by max_xcnt member.
* 2. If max_xcnt == 0, it means that the xip member is NULL, and vice versa.
* 3. xip (and subxip) are allocated using malloc() or realloc() directly.
*
* For Postgres-XC, there is some special handling for ANALYZE.
* An XID for a local ANALYZE command will never involve other nodes.
* Also, ANALYZE may run for a long time, affecting snapshot xmin values
* on other nodes unnecessarily. We want to exclude the XID
* in global snapshots, but include it in local ones. As a result,
* these are tracked in shared memory separately.
#endif
*
* During hot standby, we also keep a list of XIDs representing transactions
* that are known to be running in the master (or more precisely, were running
* as of the current point in the WAL stream). This list is kept in the
* KnownAssignedXids array, and is updated by watching the sequence of
* arriving XIDs. This is necessary because if we leave those XIDs out of
* snapshots taken for standby queries, then they will appear to be already
* complete, leading to MVCC failures. Note that in hot standby, the PGPROC
* array represents standby processes, which by definition are not running
* transactions that have XIDs.
*
* It is perhaps possible for a backend on the master to terminate without
* writing an abort record for its transaction. While that shouldn't really
* happen, it would tie up KnownAssignedXids indefinitely, so we protect
* ourselves by pruning the array when a valid list of running XIDs arrives.
*
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
* Portions Copyright (c) 2010-2012 Postgres-XC Development Group
*
*
* IDENTIFICATION
* src/gausskernel/storage/ipc/procarray.cpp
*
* -------------------------------------------------------------------------
*/
#include "postgres.h"
#include "knl/knl_variable.h"
#include <signal.h>
#include "access/clog.h"
#include "access/csnlog.h"
#include "access/extreme_rto/page_redo.h"
#include "access/subtrans.h"
#include "access/transam.h"
#include "access/twophase.h"
#include "catalog/catalog.h"
#include "access/xact.h"
#include "access/xlog.h"
#include "catalog/pg_type.h"
#include "commands/vacuum.h"
#include "funcapi.h"
#include "gtm/gtm_txn.h"
#include "miscadmin.h"
#include "postmaster/snapcapturer.h"
#include "postmaster/cfs_shrinker.h"
#include "storage/lmgr.h"
#include "storage/spin.h"
#include "threadpool/threadpool.h"
#include "utils/builtins.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"
#include "utils/timestamp.h"
#include "utils/memutils.h"
#include "utils/atomic.h"
#include "utils/distribute_test.h"
#include "access/heapam.h"
#ifdef PGXC
#include "pgxc/pgxc.h"
#include "access/gtm.h"
#include "storage/ipc.h"
#include "pgxc/nodemgr.h"
#include"replication/walreceiver.h"
/* PGXC_DATANODE */
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
#include "postmaster/twophasecleaner.h"
#endif
#include "gssignal/gs_signal.h"
#include "catalog/pg_control.h"
#include "pgstat.h"
#include "storage/lock/lwlock.h"
#include "threadpool/threadpool_sessctl.h"
#include "access/parallel_recovery/dispatcher.h"
#include "access/multi_redo_api.h"
#include "gstrace/gstrace_infra.h"
#include "gstrace/storage_gstrace.h"
#include "ddes/dms/ss_common_attr.h"
#include "ddes/dms/ss_transaction.h"
#include "ddes/dms/ss_reform_common.h"
#include "replication/ss_cluster_replication.h"
#ifdef ENABLE_UT
#define static
#endif /* ENABLE_UT */
#ifdef XIDCACHE_DEBUG
/* counters for XidCache measurement */
static long xc_by_recent_xmin = 0;
static long xc_by_known_xact = 0;
static long xc_by_my_xact = 0;
static long xc_by_latest_xid = 0;
static long xc_by_main_xid = 0;
static long xc_by_child_xid = 0;
static long xc_by_known_assigned = 0;
static long xc_no_overflow = 0;
static long xc_slow_answer = 0;
#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++)
#define xc_by_known_xact_inc() (xc_by_known_xact++)
#define xc_by_my_xact_inc() (xc_by_my_xact++)
#define xc_by_latest_xid_inc() (xc_by_latest_xid++)
#define xc_by_main_xid_inc() (xc_by_main_xid++)
#define xc_by_child_xid_inc() (xc_by_child_xid++)
static void DisplayXidCache(void);
#else /* !XIDCACHE_DEBUG */
#define xc_by_recent_xmin_inc() ((void)0)
#define xc_by_known_xact_inc() ((void)0)
#define xc_by_my_xact_inc() ((void)0)
#define xc_by_latest_xid_inc() ((void)0)
#define xc_by_main_xid_inc() ((void)0)
#define xc_by_child_xid_inc() ((void)0)
#endif /* XIDCACHE_DEBUG */
#ifdef PGXC /* PGXC_DATANODE */
#define ADD_XMIN_TO_ARRAY(xmin) \
if (xminArray != NULL) \
(xminArray)[count] = xmin
void SetGlobalSnapshotData(
TransactionId xmin, TransactionId xmax, uint64 csn, GTM_Timeline timeline, bool ss_need_sync_wait_all);
void UnsetGlobalSnapshotData(void);
static bool GetPGXCSnapshotData(Snapshot snapshot);
#ifdef ENABLE_MULTIPLE_NODES
static bool GetSnapshotDataDataNode(Snapshot snapshot);
static bool GetSnapshotDataCoordinator(Snapshot snapshot);
#endif
static void cleanSnapshot(Snapshot snapshot);
static void ResetProcXidCache(PGPROC* proc, bool needlock);
#endif
/* for local multi version snapshot */
void CalculateLocalLatestSnapshot(bool forceCalc);
static TransactionId GetMultiSnapshotOldestXmin();
static inline void ProcArrayEndTransactionInternal(PGPROC* proc, PGXACT* pgxact, TransactionId latestXid,
TransactionId* xid, uint32* nsubxids);
void XidCacheRemoveRunningXids(PGPROC* proc, PGXACT* pgxact);
void ProcArrayGroupClearXid(bool isSubTransaction, PGPROC* proc, TransactionId latestXid,
TransactionId subTranactionXid, int nSubTransactionXids,
TransactionId* subTransactionXids, TransactionId subTransactionLatestXid);
extern bool StreamTopConsumerAmI();
#define PROCARRAY_MAXPROCS (g_instance.shmem_cxt.MaxBackends + \
g_instance.attr.attr_storage.max_prepared_xacts * NUM_TWOPHASE_PARTITIONS)
/*
* Report shared-memory space needed by CreateProcXactHashTable
*/
Size ProcXactHashTableShmemSize(void)
{
    /* One ProcXactLookupEntry per backend plus per potential prepared transaction. */
    Size numEntries = PROCARRAY_MAXPROCS;
    return hash_estimate_size(numEntries, sizeof(ProcXactLookupEntry));
}
void CreateProcXactHashTable(void)
{
HASHCTL info;
info.keysize = sizeof(TransactionId);
info.entrysize = sizeof(ProcXactLookupEntry);
info.hash = tag_hash;
info.num_partitions = NUM_PROCXACT_PARTITIONS; /* We only have 812 threads in current configuration */
g_instance.ProcXactTable = ShmemInitHash("Proc Xact Lookup Table", PROCARRAY_MAXPROCS, PROCARRAY_MAXPROCS,
&info, HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
}
/* @Return partition lock id. */
/*
 * Acquire (in 'mode') the partition lock covering 'xid' in the ProcXact
 * hash table, and return it so the caller can release it later.
 */
static LWLock *LockProcXactHashTablePartition(TransactionId xid, LWLockMode mode)
{
    /* Map the xid's hash value onto one of the NUM_PROCXACT_PARTITIONS locks. */
    uint32 hashValue = get_hash_value(g_instance.ProcXactTable, &xid);
    uint32 partitionId = hashValue & (NUM_PROCXACT_PARTITIONS - 1);
    LWLock *partitionLock =
        &t_thrd.shemem_ptr_cxt.mainLWLockArray[(uint32)(FirstProcXactMappingLock + partitionId)].lock;
    LWLockAcquire(partitionLock, mode);
    return partitionLock;
}
/*
 * Look up the pgprocno advertising 'xid', or InvalidProcessId if none.
 *
 * Takes the partition lock in shared mode for the duration of the probe.
 */
int
ProcXactHashTableLookup(TransactionId xid)
{
    /* Caller should make sure ProcArrayLock is held by it in share mode */
    ProcXactLookupEntry *result = NULL;
    bool found = false;
    int procId = InvalidProcessId;

    LWLock *lock = LockProcXactHashTablePartition(xid, LW_SHARED);
    result = (ProcXactLookupEntry *)hash_search(g_instance.ProcXactTable, &xid, HASH_FIND, &found);

    /*
     * Copy the payload while still holding the partition lock: once the lock
     * is released, a concurrent ProcXactHashTableRemove() may delete the
     * entry, turning the dereference into a use-after-free.
     */
    if (found) {
        procId = result->proc_id;
    }
    LWLockRelease(lock);

    return procId;
}
/*
 * Insert (or overwrite) the xid -> pgprocno mapping for 'xid'.
 *
 * Takes the partition lock in exclusive mode around both the HASH_ENTER and
 * the payload store.
 */
void
ProcXactHashTableAdd(TransactionId xid, int procId)
{
    /* Caller should make sure ProcArrayLock is held by it in exclusive mode */
    bool found = false;

    LWLock *lock = LockProcXactHashTablePartition(xid, LW_EXCLUSIVE);
    ProcXactLookupEntry *result =
        (ProcXactLookupEntry *)hash_search(g_instance.ProcXactTable, &xid, HASH_ENTER, &found);

    /*
     * Fill in the payload BEFORE releasing the partition lock. The original
     * code stored proc_id after LWLockRelease(), which let a concurrent
     * lookup observe a freshly-entered entry with an uninitialized (or a
     * re-entered entry with a stale) proc_id.
     */
    result->proc_id = procId;
    LWLockRelease(lock);
}
/*
 * Remove the xid -> pgprocno mapping for 'xid'.
 *
 * Logs a WARNING (it is not treated as fatal) if no entry exists.
 */
void
ProcXactHashTableRemove(TransactionId xid)
{
    bool found = false;

    LWLock *lock = LockProcXactHashTablePartition(xid, LW_EXCLUSIVE);
    (void)hash_search(g_instance.ProcXactTable, &xid, HASH_REMOVE, &found);
    LWLockRelease(lock);

    if (!found) {
        /*
         * A missing key is an "undefined object" condition; the original
         * code reported ERRCODE_DUPLICATE_OBJECT, which is the opposite.
         */
        ereport(WARNING, (errcode(ERRCODE_UNDEFINED_OBJECT),
            errmsg("transaction identifier %lu not exists in ProcXact hash table", xid)));
    }
}
/*
* Report shared-memory space needed by CreateSharedProcArray.
*/
Size ProcArrayShmemSize(void)
{
    Size size;

    /* Size of the ProcArray structure itself */
/*
 * Identical redefinition of the macro defined earlier in this file; benign
 * per the C/C++ preprocessor rules, kept for readability of this function.
 */
#define PROCARRAY_MAXPROCS (g_instance.shmem_cxt.MaxBackends + \
    g_instance.attr.attr_storage.max_prepared_xacts * NUM_TWOPHASE_PARTITIONS)

    /* Fixed header plus one pgprocno slot per potential array member. */
    size = offsetof(ProcArrayStruct, pgprocnos);
    size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS));

    /*
     * During Hot Standby processing we have a data structure called
     * KnownAssignedXids, created in shared memory. Local data structures are
     * also created in various backends during GetSnapshotData(),
     * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
     * main structures created in those functions must be identically sized,
     * since we may at times copy the whole of the data structures around. We
     * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
     *
     * Ideally we'd only create this structure if we were actually doing hot
     * standby in the current run, but we don't know that yet at the time
     * shared memory is being set up.
     */
#define TOTAL_MAX_CACHED_SUBXIDS ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
    if (g_instance.attr.attr_storage.EnableHotStandby) {
        /* KnownAssignedXids array plus its per-slot validity flags. */
        size = add_size(size, mul_size(sizeof(TransactionId), TOTAL_MAX_CACHED_SUBXIDS));
        size = add_size(size, mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
    }

    return size;
}
/*
* Initialize the shared PGPROC array during postmaster startup.
*/
void CreateSharedProcArray(void)
{
    /* Create or attach to the ProcArray shared structure */
    MemoryContext oldcontext = MemoryContextSwitchTo(INSTANCE_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_CBB));

    /* Over-allocate by one cache line so the struct can be cache-line aligned. */
    size_t array_size = offsetof(ProcArrayStruct, pgprocnos) + PROCARRAY_MAXPROCS * sizeof(int) + PG_CACHE_LINE_SIZE;
    if (g_instance.proc_array_idx == NULL) {
        g_instance.proc_array_idx = (ProcArrayStruct*)CACHELINEALIGN(palloc(array_size));
    }
    {
        /*
         * Initialize the array header to an empty state.
         * NOTE(review): this runs unconditionally, even when proc_array_idx
         * already existed — presumably this function is only called once per
         * instance lifetime; confirm before relying on re-entry safety.
         */
        g_instance.proc_array_idx->numProcs = 0;
        g_instance.proc_array_idx->maxProcs = PROCARRAY_MAXPROCS;
        g_instance.proc_array_idx->replication_slot_xmin = InvalidTransactionId;
        g_instance.proc_array_idx->replication_slot_catalog_xmin = InvalidTransactionId;
    }

    /* Cache base pointers to the shared PGPROC/PGXACT arrays for fast access. */
    g_instance.proc_base_all_procs = g_instance.proc_base->allProcs;
    g_instance.proc_base_all_xacts = g_instance.proc_base->allPgXact;

    MemoryContextSwitchTo(oldcontext);
}
/*
* Add the specified PGPROC to the shared array.
*/
void ProcArrayAdd(PGPROC* proc)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int index = 0;
    errno_t rc;

    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    PGXACT* pgxact = &g_instance.proc_base_all_xacts[proc->pgprocno];

    if (arrayP->numProcs >= arrayP->maxProcs) {
        /*
         * Ooops, no room. (This really shouldn't happen, since there is a
         * fixed supply of PGPROC structs too, and so we should have failed
         * earlier.)
         */
        LWLockRelease(ProcArrayLock);
        ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already")));
    }

    /*
     * Keep the procs array sorted by (PGPROC *) so that we can utilize
     * locality of references much better. This is useful while traversing the
     * ProcArray because there is a increased likelihood of finding the next
     * PGPROC structure in the cache.
     *
     * Since the occurrence of adding/removing a proc is much lower than the
     * access to the ProcArray itself, the overhead should be marginal
     */
    for (index = 0; index < arrayP->numProcs; index++) {
        /*
         * If we are the first PGPROC or if we have found our right position
         * in the array, break
         */
        if ((arrayP->pgprocnos[index] == -1) || (arrayP->pgprocnos[index] > proc->pgprocno))
            break;
    }

    /*
     * Shift the tail [index, numProcs) up one slot to open a hole at 'index';
     * memmove_s because the source and destination regions overlap.
     */
    rc = memmove_s(&arrayP->pgprocnos[index + 1],
        PROCARRAY_MAXPROCS * sizeof(int),
        &arrayP->pgprocnos[index],
        (arrayP->numProcs - index) * sizeof(int));
    securec_check(rc, "\0", "\0");
    arrayP->pgprocnos[index] = proc->pgprocno;
    arrayP->numProcs++;

    /* Mirror any already-assigned XID into the xid -> proc lookup table. */
    if (TransactionIdIsValid(pgxact->xid)) {
        ProcXactHashTableAdd(pgxact->xid, proc->pgprocno);
    }
    LWLockRelease(ProcArrayLock);
}
/*
* Remove the specified PGPROC from the shared array.
*
* When latestXid is a valid XID, we are removing a live 2PC gxact from the
* array, and thus causing it to appear as "not running" anymore. In this
* case we must advance latestCompletedXid. (This is essentially the same
* as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
* the ProcArrayLock only once, and don't damage the content of the PGPROC;
* twophase.c depends on the latter.)
*/
void ProcArrayRemove(PGPROC* proc, TransactionId latestXid)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    PGXACT* pgxact = &g_instance.proc_base_all_xacts[proc->pgprocno];
    int index = 0;

    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    if (TransactionIdIsValid(latestXid)) {
        /* Removing a live 2PC gxact: its xid must still be advertised. */
        Assert(TransactionIdIsValid(pgxact->xid));

        /* Advance global latestCompletedXid while holding the lock */
        if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid, latestXid))
            t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestXid;
    } else {
        if (IS_PGXC_DATANODE || !IsConnFromCoord()) {
            /* Shouldn't be trying to remove a live transaction here */
            Assert(!TransactionIdIsValid(pgxact->xid));
        }
    }

    /* Clear xid from ProcXactHashTable. We can ignore BootstrapTransactionId */
    if (TransactionIdIsNormal(pgxact->xid)) {
        ProcXactHashTableRemove(pgxact->xid);
    }

    /* Find our slot, close the gap, and keep the array sorted. */
    for (index = 0; index < arrayP->numProcs; index++) {
        if (arrayP->pgprocnos[index] == proc->pgprocno) {
            /* Keep the PGPROC array sorted. See notes above */
            errno_t rc;
            rc = memmove_s(&arrayP->pgprocnos[index],
                arrayP->numProcs * sizeof(int),
                &arrayP->pgprocnos[index + 1],
                (arrayP->numProcs - index - 1) * sizeof(int));
            securec_check(rc, "\0", "\0");
            arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */
            arrayP->numProcs--;

            /* Calc new snapshot, since a running xid just disappeared. */
            if (TransactionIdIsValid(latestXid))
                CalculateLocalLatestSnapshot(false);

            LWLockRelease(ProcArrayLock);

            /* Free xid cache memory if needed, must after procarray remove */
            ResetProcXidCache(proc, true);
            proc->commitCSN = 0;
            pgxact->needToSyncXid = 0;
            return;
        }
    }

    /* Ooops: the proc was not in the array at all; log and carry on. */
    LWLockRelease(ProcArrayLock);
    ereport(LOG, (errmsg("failed to find proc in ProcArray")));
}
/*
 * On a multi-node datanode, drop the autovacuum marker from vacuumFlags
 * unless this process really is an autovacuum worker.
 */
static void inline ProcArrayClearAutovacuum(PGXACT* pgxact)
{
    bool onDatanodeCluster = IS_PGXC_DATANODE && !IS_SINGLE_NODE;
    if (onDatanodeCluster && !IsAutoVacuumWorkerProcess()) {
        pgxact->vacuumFlags &= ~PROC_IS_AUTOVACUUM;
    }
}
/*
* ProcArrayEndTransaction -- mark a transaction as no longer running
*
* This is used interchangeably for commit and abort cases. The transaction
* commit/abort must already be reported to WAL and pg_clog.
*
* proc is currently always t_thrd.proc, but we pass it explicitly for flexibility.
* latestXid is the latest Xid among the transaction's main XID and
* subtransactions, or InvalidTransactionId if it has no XID. (We must ask
* the caller to pass latestXid, instead of computing it from the PGPROC's
* contents, because the subxid information in the PGPROC might be
* incomplete.)
*/
void ProcArrayEndTransaction(PGPROC* proc, TransactionId latestXid, bool isCommit)
{
    PGXACT* pgxact = &g_instance.proc_base_all_xacts[proc->pgprocno];

#ifndef ENABLE_DISTRIBUTE_TEST
    /* Flush or reset workload-management catalog info according to outcome. */
    if (ENABLE_WORKLOAD_CONTROL && WLMIsInfoInit()) {
        if (isCommit) {
            UpdateWlmCatalogInfoHash();
        } else {
            ResetWlmCatalogFlag();
        }
    }
#endif
    if (TransactionIdIsValid(latestXid)) {
        /*
         * We must lock ProcArrayLock while clearing our advertised XID, so
         * that we do not exit the set of "running" transactions while someone
         * else is taking a snapshot. See discussion in
         * src/backend/access/transam/README.
         */
#ifdef PGXC
        /*
         * Remove this assertion. We have seen this failing because a ROLLBACK
         * statement may get canceled by a Coordinator, leading to recursive
         * abort of a transaction. This must be a openGauss issue, highlighted
         * by XC. See thread on hackers with subject "Canceling ROLLBACK
         * statement"
         */
#else
        Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
#endif
        /*
         * If we can immediately acquire ProcArrayLock, we clear our own XID
         * and release the lock. If not, use group XID clearing to improve
         * efficiency.
         */
        if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) {
            TransactionId xid;
            uint32 nsubxids;

            ProcArrayEndTransactionInternal(proc, pgxact, latestXid, &xid, &nsubxids);
            /* Our XID just left the running set; refresh the local snapshot. */
            CalculateLocalLatestSnapshot(false);
            LWLockRelease(ProcArrayLock);
        } else
            ProcArrayGroupClearXid(false, proc, latestXid, InvalidTransactionId, 0, NULL, InvalidTransactionId);
    } else {
        /*
         * If we have no XID, we don't need to lock, since we won't affect
         * anyone else's calculation of a snapshot. We might change their
         * estimate of global xmin, but that's OK.
         */
        Assert(!TransactionIdIsValid(pgxact->xid));

        pgxact->handle = InvalidTransactionHandle;
        proc->lxid = InvalidLocalTransactionId;
        pgxact->next_xid = InvalidTransactionId;
        pgxact->xmin = InvalidTransactionId;
        proc->snapXmax = InvalidTransactionId;
        proc->snapCSN = InvalidCommitSeqNo;
        /* extreme-RTO standby-read snapshot bookkeeping */
        proc->exrto_read_lsn = 0;
        proc->exrto_min = 0;
        proc->exrto_gen_snap_time = 0;
        pgxact->csn_min = InvalidCommitSeqNo;
        pgxact->csn_dr = InvalidCommitSeqNo;
        /* must be cleared with xid/xmin: */
        pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
        ProcArrayClearAutovacuum(pgxact);
        pgxact->delayChkpt = false; /* be sure this is cleared in abort */
        proc->recoveryConflictPending = false;
        proc->commitCSN = 0;
        pgxact->needToSyncXid = 0;

        Assert(pgxact->nxids == 0);
    }

    /*
     * Reset isInResetUserName to false. isInResetUserName is set true in case 'O' so as to mask the log
     * in GetPGXCSnapshotData and GetSnapshotData.
     */
    t_thrd.postgres_cxt.isInResetUserName = false;
}
/*
* Mark a write transaction as no longer running.
*
* We don't do any locking here; caller must handle that.
*/
static inline void ProcArrayEndTransactionInternal(PGPROC* proc, PGXACT* pgxact, TransactionId latestXid,
    TransactionId* xid, uint32* nsubxids)
{
    /* Store xid and nsubxids to update csnlog */
    *xid = pgxact->xid;
    *nsubxids = pgxact->nxids;

    /* Clear xid from ProcXactHashTable. We can ignore BootstrapTransactionId */
    if (TransactionIdIsNormal(*xid)) {
        ProcXactHashTableRemove(*xid);
    }

    /* Reset all advertised transaction state; caller holds ProcArrayLock. */
    pgxact->handle = InvalidTransactionHandle;
    pgxact->xid = InvalidTransactionId;
    pgxact->next_xid = InvalidTransactionId;
    proc->lxid = InvalidLocalTransactionId;
    pgxact->xmin = InvalidTransactionId;
    proc->snapXmax = InvalidTransactionId;
    proc->snapCSN = InvalidCommitSeqNo;
    /* extreme-RTO standby-read snapshot bookkeeping */
    proc->exrto_read_lsn = 0;
    proc->exrto_min = 0;
    proc->exrto_gen_snap_time = 0;
    pgxact->csn_min = InvalidCommitSeqNo;
    pgxact->csn_dr = InvalidCommitSeqNo;
    /* must be cleared with xid/xmin: */
    pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
    ProcArrayClearAutovacuum(pgxact);
    pgxact->delayChkpt = false; /* be sure this is cleared in abort */
    proc->recoveryConflictPending = false;

    /* Clear the subtransaction-XID cache too while holding the lock */
    pgxact->nxids = 0;

    /* Also advance global latestCompletedXid while holding the lock */
    if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid, latestXid))
        t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestXid;

    /* Clear commit csn after csn update */
    proc->commitCSN = 0;
    pgxact->needToSyncXid = 0;

    ResetProcXidCache(proc, true);
}
static inline void ProcInsertIntoGroup(PGPROC* proc, uint32* nextidx) {
    /*
     * Lock-free push of 'proc' onto the head of the group-clear list.
     * On CAS failure we simply retry (the loop re-reads the head).  On
     * success, *nextidx holds the previous head — INVALID_PGPROCNO if the
     * list was empty — which the caller uses to decide whether it became
     * the group leader.
     */
    while (true) {
        *nextidx = pg_atomic_read_u32(&g_instance.proc_base->procArrayGroupFirst);
        pg_atomic_write_u32(&proc->procArrayGroupNext, *nextidx);
        if (pg_atomic_compare_exchange_u32(
            &g_instance.proc_base->procArrayGroupFirst, nextidx, (uint32)proc->pgprocno))
            break;
    }
}
/* Reset every group-commit bookkeeping field of 'proc' back to its idle state. */
static inline void ClearProcArrayGroupCache(PGPROC* proc) {
    proc->procArrayGroupSubXactLatestXid = InvalidTransactionId;
    proc->procArrayGroupSubXactXids = NULL;
    proc->procArrayGroupSubXactNXids = 0;
    proc->procArrayGroupMemberXid = InvalidTransactionId;
    proc->procArrayGroupMember = false;
}
/*
 * Stash this group member's XID bookkeeping into its PGPROC so the group
 * leader can later clear the member's state on its behalf.
 */
static inline void SetProcArrayGroupCache(PGPROC* proc, TransactionId xid, int nxids,
    TransactionId* xids, TransactionId latestXid)
{
    proc->procArrayGroupSubXactLatestXid = latestXid;
    proc->procArrayGroupSubXactXids = xids;
    proc->procArrayGroupSubXactNXids = nxids;
    proc->procArrayGroupMemberXid = xid;
}
/*
* ProcArrayGroupClearXid -- group XID clearing
*
* When we cannot immediately acquire ProcArrayLock in exclusive mode at
* commit time, add ourselves to a list of processes that need their XIDs
* cleared. The first process to add itself to the list will acquire
* ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
* for transaction group members and XidCacheRemoveRunningXids
* for subtransaction group members. This avoids a great deal of contention
* around ProcArrayLock when many processes are trying to commit at once,
* since the lock need not be repeatedly handed off from one committing
* process to the next.
*/
void ProcArrayGroupClearXid(bool isSubTransaction, PGPROC* proc,
    TransactionId latestXid, TransactionId subTranactionXid,
    int nSubTransactionXids, TransactionId* subTransactionXids,
    TransactionId subTransactionLatestXid)
{
    uint32 nextidx;
    uint32 wakeidx;
    /*
     * NOTE(review): PROCARRAY_MAXPROCS expands to a runtime expression, so
     * these are variable-length arrays on the stack — presumably MaxBackends
     * is bounded enough that this cannot overflow the stack; confirm.
     */
    TransactionId xid[PROCARRAY_MAXPROCS];
    uint32 nsubxids[PROCARRAY_MAXPROCS];
    uint32 index = 0;
    bool groupMemberHasTransaction = false;

    /* We should definitely have an XID to clear. */
    /* Add ourselves to the list of processes needing a group XID clear. */
    proc->procArrayGroupMember = true;
    if (isSubTransaction) {
        SetProcArrayGroupCache(proc, subTranactionXid, nSubTransactionXids, subTransactionXids, subTransactionLatestXid);
    } else {
        SetProcArrayGroupCache(proc, latestXid, 0, NULL, InvalidTransactionId);
    }

    /* add current proc into ProcArrayGroup */
    ProcInsertIntoGroup(proc, &nextidx);

    /*
     * If the list was not empty, the leader will clear our XID. It is
     * impossible to have followers without a leader because the first process
     * that has added itself to the list will always have nextidx as
     * INVALID_PGPROCNO.
     */
    if (nextidx != INVALID_PGPROCNO) {
        int extraWaits = 0;

        /* Sleep until the leader clears our XID. */
        for (;;) {
            /* acts as a read barrier */
            PGSemaphoreLock(&proc->sem, false);
            if (!proc->procArrayGroupMember)
                break;
            extraWaits++;
        }

        Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO);

        /* Fix semaphore count for any absorbed wakeups */
        while (extraWaits-- > 0)
            PGSemaphoreUnlock(&proc->sem);
        return;
    }

    /* We are the leader. Acquire the lock on behalf of everyone. */
    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

    /*
     * Now that we've got the lock, clear the list of processes waiting for
     * group XID clearing, saving a pointer to the head of the list. Trying
     * to pop elements one at a time could lead to an ABA problem.
     */
    while (true) {
        nextidx = pg_atomic_read_u32(&g_instance.proc_base->procArrayGroupFirst);
        if (pg_atomic_compare_exchange_u32(&g_instance.proc_base->procArrayGroupFirst, &nextidx, INVALID_PGPROCNO))
            break;
    }

    /* Remember head of list so we can perform wakeups after dropping lock. */
    wakeidx = nextidx;

    /* Walk the list and clear all XIDs. */
    while (nextidx != INVALID_PGPROCNO) {
        PGPROC* proc_member = g_instance.proc_base_all_procs[nextidx];
        PGXACT* pgxact = &g_instance.proc_base_all_xacts[nextidx];

        ereport(DEBUG2, (errmsg("handle group member from procArrayGroup, procno = %u, "
            "procArrayGroupMemberXid = " XID_FMT ", "
            "procArrayGroupSubXactNXids = %d, "
            "procArrayGroupSubXactLatestXid = " XID_FMT ", "
            "procArrayGroupNext = %u",
            proc_member->pgprocno,
            proc_member->procArrayGroupMemberXid,
            proc_member->procArrayGroupSubXactNXids,
            proc_member->procArrayGroupSubXactLatestXid,
            proc_member->procArrayGroupNext)));

        /*
         * If the proc_member is a transaction, perform ProcArrayEndTransactionInternal
         * to clear its XID. If the proc_member is a subtransaction,
         * perform XidCacheRemoveRunningXids to clear its XIDs and
         * its committed subtransaction's XIDS.
         *
         * proc_member->procArrayGroupSubXactLatestXid !=0 when the group member
         * is a subtransaction.
         */
        if (proc_member->procArrayGroupSubXactLatestXid != InvalidTransactionId) {
            XidCacheRemoveRunningXids(proc_member, pgxact);
        } else {
            groupMemberHasTransaction = true;
            ProcArrayEndTransactionInternal(
                proc_member, pgxact, proc_member->procArrayGroupMemberXid, &xid[index], &nsubxids[index]);
        }

        /* Move to next proc in list. */
        nextidx = pg_atomic_read_u32(&proc_member->procArrayGroupNext);
        index++;
    }

    /* Already hold lock, calculate snapshot after last invocation,
     * if there is at least one transaction in group.
     */
    if (groupMemberHasTransaction) {
        CalculateLocalLatestSnapshot(false);
    }

    /* We're done with the lock now. */
    LWLockRelease(ProcArrayLock);

    /*
     * Now that we've released the lock, go back and wake everybody up. We
     * don't do this under the lock so as to keep lock hold times to a
     * minimum. The system calls we need to perform to wake other processes
     * up are probably much slower than the simple memory writes we did while
     * holding the lock.
     */
    index = 0;
    while (wakeidx != INVALID_PGPROCNO) {
        PGPROC* proc_member = g_instance.proc_base_all_procs[wakeidx];

        /* Capture the next link before clearing it on the member. */
        wakeidx = pg_atomic_read_u32(&proc_member->procArrayGroupNext);
        pg_atomic_write_u32(&proc_member->procArrayGroupNext, INVALID_PGPROCNO);

        /* ensure all previous writes are visible before follower continues. */
        pg_write_barrier();

        ClearProcArrayGroupCache(proc_member);
        if (proc_member != t_thrd.proc)
            PGSemaphoreUnlock(&proc_member->sem);
        index++;
    }
}
/*
* ProcArrayClearTransaction -- clear the transaction fields
*
* This is used after successfully preparing a 2-phase transaction. We are
* not actually reporting the transaction's XID as no longer running --- it
* will still appear as running because the 2PC's gxact is in the ProcArray
* too. We just have to clear out our own PGXACT.
*/
void ProcArrayClearTransaction(PGPROC* proc)
{
    PGXACT* pgxact = &g_instance.proc_base_all_xacts[proc->pgprocno];

    /*
     * We can skip locking ProcArrayLock here, because this action does not
     * actually change anyone's view of the set of running XIDs: our entry is
     * duplicate with the gxact that has already been inserted into the
     * ProcArray.
     *
     * NOTE(review): the xid -> proc hash entry is NOT removed here;
     * presumably the 2PC gxact's entry now owns that mapping — confirm.
     */
    pgxact->handle = InvalidTransactionHandle;
    pgxact->xid = InvalidTransactionId;
    pgxact->next_xid = InvalidTransactionId;
    proc->lxid = InvalidLocalTransactionId;
    pgxact->xmin = InvalidTransactionId;
    proc->snapXmax = InvalidTransactionId;
    proc->snapCSN = InvalidCommitSeqNo;
    /* extreme-RTO standby-read snapshot bookkeeping */
    proc->exrto_read_lsn = 0;
    proc->exrto_gen_snap_time = 0;
    pgxact->csn_min = InvalidCommitSeqNo;
    pgxact->csn_dr = InvalidCommitSeqNo;
    proc->recoveryConflictPending = false;

    /* redundant, but just in case */
    pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
    ProcArrayClearAutovacuum(pgxact);
    pgxact->delayChkpt = false;
    pgxact->needToSyncXid = 0;

    /* Clear the subtransaction-XID cache too */
    pgxact->nxids = 0;
    proc->exrto_min = 0;

    /* Free xid cache memory if needed */
    ResetProcXidCache(proc, true);
}
/*
 * Stamp a committing transaction (and its subtransactions) with its CSN in
 * the CSN log. Aborted or xid-less transactions get no CSN log entry.
 */
void UpdateCSNLogAtTransactionEND(
    TransactionId xid, int nsubxids, TransactionId* subXids, CommitSeqNo csn, bool isCommit)
{
    if (TransactionIdIsNormal(xid) && isCommit) {
        Assert(csn >= COMMITSEQNO_FROZEN);

        /* Update CSN log, stamp this XID (and sub-XIDs) with the CSN */
#ifdef ENABLE_MULTIPLE_NODES
        CSNLogSetCommitSeqNo(xid, nsubxids, subXids, csn);
#else
        /* Single-node build: strip the commit-in-progress marker bit first. */
        CSNLogSetCommitSeqNo(xid, nsubxids, subXids, csn & ~COMMITSEQNO_COMMIT_INPROGRESS);
#endif
    }
}
/*
* This is called during the recovery stage to extend the CSN log page while
* doing xact_redo if needed, after the CSN log is initialized to latestObservedXid.
*/
void CSNLogRecordAssignedTransactionId(TransactionId newXid)
{
    if (TransactionIdFollows(newXid, t_thrd.storage_cxt.latestObservedXid)) {
        TransactionId next_expected_xid = t_thrd.storage_cxt.latestObservedXid;

        /* Extend the CSN log for every XID between the old high-water mark and newXid. */
        while (TransactionIdPrecedes(next_expected_xid, newXid)) {
            TransactionIdAdvance(next_expected_xid);
            ExtendCSNLOG(next_expected_xid);
        }
        Assert(next_expected_xid == newXid);

        /*
         * Now we can advance latestObservedXid
         */
        t_thrd.storage_cxt.latestObservedXid = newXid;

        /* Before leaving STANDBY_INITIALIZED, nextXid maintenance is skipped. */
        if (t_thrd.xlog_cxt.standbyState <= STANDBY_INITIALIZED) {
            return;
        }

        /* Keep nextXid ahead of every XID observed in the WAL stream. */
        LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
        if (TransactionIdFollowsOrEquals(next_expected_xid, t_thrd.xact_cxt.ShmemVariableCache->nextXid)) {
            t_thrd.xact_cxt.ShmemVariableCache->nextXid = next_expected_xid;
            TransactionIdAdvance(t_thrd.xact_cxt.ShmemVariableCache->nextXid);
        }
        LWLockRelease(XidGenLock);
    }
}
/*
* ProcArrayInitRecovery -- initialize recovery xid mgmt environment
*
* Remember up to where the startup process initialized the CLOG and subtrans
* so we can ensure it's initialized gaplessly up to the point where necessary
* while in recovery.
*/
void ProcArrayInitRecovery(TransactionId initializedUptoXID)
{
    Assert(t_thrd.xlog_cxt.standbyState == STANDBY_INITIALIZED);
    Assert(TransactionIdIsNormal(initializedUptoXID));

    /*
     * Record one XID before the point up to which SUBTRANS has been
     * initialized, so RecordKnownAssignedTransactionIds (and later
     * ProcArrayApplyRecoveryInfo) can extend the logs gaplessly from there.
     */
    TransactionId observed = initializedUptoXID;
    TransactionIdRetreat(observed);
    t_thrd.storage_cxt.latestObservedXid = observed;
}
/*
 * GetRunningTransactionData -- returns information about running transactions.
 *
 * Similar to GetSnapshotData but returns more information. We include
 * all PGXACTs with an assigned TransactionId, even VACUUM processes.
 *
 * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
 * releasing them. Holding XidGenLock ensures that no new XIDs enter the proc
 * array until the caller has WAL-logged this snapshot and releases the
 * lock. Holding ProcArrayLock ensures that no transactions commit until the
 * lock is released.
 *
 * The returned data structure is statically allocated; caller should not
 * modify it, and must not assume it is valid past the next call.
 *
 * This is never executed during recovery so there is no need to look at
 * KnownAssignedXids.
 *
 * We don't worry about updating other counters, we want to keep this as
 * simple as possible and leave GetSnapshotData() as the primary code for
 * that bookkeeping.
 *
 * Note that if any transaction has overflowed its cached subtransactions
 * then there is no real need to include any subtransactions. That isn't a
 * common enough case to worry about optimising the size of the WAL record,
 * and we may wish to see that data for diagnostic purposes anyway.
 */
RunningTransactions GetRunningTransactionData(void)
{
    /* result workspace: a thread-local buffer reused across calls */
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    RunningTransactions CurrentRunningXacts = t_thrd.storage_cxt.CurrentRunningXacts;
    TransactionId latestCompletedXid;
    TransactionId oldestRunningXid;
    TransactionId* xids = NULL;
    int index;
    int count = 0;    /* total entries collected in xids[] (top-level + subxids) */
    int subcount = 0; /* portion of count that are cached subtransaction xids */
    bool suboverflowed = false;
    int rc = 0;
    Assert(!RecoveryInProgress());
    /*
     * Allocating space for maxProcs xids is usually overkill; numProcs would
     * be sufficient. But it seems better to do the malloc while not holding
     * the lock, so we can't look at numProcs. Likewise, we allocate much
     * more subxip storage than is probably needed.
     *
     * Should only be allocated in bgwriter, since only ever executed during
     * checkpoints.
     */
    if (CurrentRunningXacts->xids == NULL) {
        /*
         * First call: allocate once in the storage memory context; the buffer
         * is retained for the lifetime of the thread.
         */
        CurrentRunningXacts->xids = (TransactionId*)MemoryContextAlloc(
            THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE),
            (unsigned int)TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
        if (CurrentRunningXacts->xids == NULL)
            ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
    }
    xids = CurrentRunningXacts->xids;
    /*
     * Ensure that no xids enter or leave the procarray while we obtain
     * snapshot.  Both locks are intentionally left held on return; the
     * caller releases them after WAL-logging the result.
     */
    LWLockAcquire(XidGenLock, LW_SHARED);
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    latestCompletedXid = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    /* start the running-xid horizon at nextXid; lowered by the scan below */
    oldestRunningXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
    /* xmax is always latestCompletedXid + 1 */
    TransactionId xmax = latestCompletedXid;
    TransactionIdAdvance(xmax);
    TransactionId globalXmin = xmax;
    /*
     * Spin over procArray collecting all xids and subxids.
     */
    for (index = 0; index < arrayP->numProcs; index++) {
        int pgprocno = arrayP->pgprocnos[index];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        TransactionId xid;
        int nxids;
        /* Update globalxmin to be the smallest valid xmin */
        xid = pgxact->xmin; /* fetch just once */
        if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, globalXmin)) {
            globalXmin = xid;
        }
        /* Fetch xid just once - see GetNewTransactionId */
        xid = pgxact->xid;
        /*
         * We don't need to store transactions that don't have a TransactionId
         * yet because they will not show as running on a standby server.
         */
        if (!TransactionIdIsValid(xid))
            continue;
        xids[count++] = xid;
        if (TransactionIdPrecedes(xid, oldestRunningXid))
            oldestRunningXid = xid;
        /*
         * Save subtransaction XIDs. Other backends can't add or remove
         * entries while we're holding XidGenLock.
         */
        nxids = pgxact->nxids;
        if (nxids > 0) {
            /* copy at most the cached portion; overflow is flagged below */
            if (nxids > PGPROC_MAX_CACHED_SUBXIDS)
                nxids = PGPROC_MAX_CACHED_SUBXIDS;
            /*
             * NOTE(review): destMax passed to memcpy_s equals the copy size,
             * not the remaining capacity of xids[]; this is safe only because
             * count + nxids can never exceed TOTAL_MAX_CACHED_SUBXIDS -- confirm
             * that invariant against the definition of TOTAL_MAX_CACHED_SUBXIDS.
             */
            rc = memcpy_s(&xids[count], nxids * sizeof(TransactionId), (void *)proc->subxids.xids,
                nxids * sizeof(TransactionId));
            securec_check(rc, "\0", "\0");
            count += nxids;
            subcount += nxids;
            if (pgxact->nxids > PGPROC_MAX_CACHED_SUBXIDS)
                suboverflowed = true;
            /*
             * Top-level XID of a transaction is always less than any of its
             * subxids, so we don't need to check if any of the subxids are
             * smaller than oldestRunningXid
             */
        }
    }
    /*
     * Update globalxmin to include actual process xids. This is a slightly
     * different way of computing it than GetOldestXmin uses, but should give
     * the same result.
     */
    if (TransactionIdPrecedes(oldestRunningXid, globalXmin)) {
        globalXmin = oldestRunningXid;
    }
    /*
     * It's important *not* to include the limits set by slots here because
     * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those
     * were to be included here the initial value could never increase because
     * of a circular dependency where slots only increase their limits when
     * running xacts increases oldestRunningXid and running xacts only
     * increases if slots do.
     */
    CurrentRunningXacts->xcnt = count;
    CurrentRunningXacts->subxid_overflow = suboverflowed;
    CurrentRunningXacts->nextXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
    CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
    CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
    CurrentRunningXacts->globalXmin = globalXmin;
    Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
    Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
    Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
    /* We don't release the locks here, the caller is responsible for that */
    return CurrentRunningXacts;
}
/*
 * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
 *
 * Takes us through 3 states: Initialized, Pending and Ready.
 * Normal case is to go all the way to Ready straight away, though there
 * are atypical cases where we need to take it in steps.
 *
 * Use the data about running transactions on master to create the initial
 * state of KnownAssignedXids. We also use these records to regularly prune
 * KnownAssignedXids because we know it is possible that some transactions
 * with FATAL errors fail to write abort records, which could cause eventual
 * overflow.
 *
 * See comments for LogStandbySnapshot().
 */
void ProcArrayApplyRecoveryInfo(RunningTransactions running)
{
    TransactionId nextXid;
    Assert(t_thrd.xlog_cxt.standbyState >= STANDBY_INITIALIZED);
    Assert(TransactionIdIsValid(running->nextXid));
    Assert(TransactionIdIsValid(running->oldestRunningXid));
    Assert(TransactionIdIsNormal(running->latestCompletedXid));
    /*
     * Remove stale locks, if any.
     *
     * Locks are always assigned to the toplevel xid so we don't need to care
     * about subxcnt/subxids (and by extension not about ->suboverflowed).
     */
    StandbyReleaseOldLocks(running->oldestRunningXid);
    /*
     * If our snapshot is already valid, nothing else to do...
     */
    if (t_thrd.xlog_cxt.standbyState == STANDBY_SNAPSHOT_READY)
        return;
    Assert(t_thrd.xlog_cxt.standbyState == STANDBY_INITIALIZED);
    /*
     * latestObservedXid is at least set to the point where CSNLOG was
     * started up to (c.f. ProcArrayInitRecovery()) or to the biggest xid
     * RecordKnownAssignedTransactionIds() was called for. Initialize
     * subtrans from thereon, up to nextXid - 1.
     *
     * We need to duplicate parts of RecordKnownAssignedTransactionId() here,
     * because we've just added xids to the known assigned xids machinery that
     * haven't gone through RecordKnownAssignedTransactionId().
     */
    Assert(TransactionIdIsNormal(t_thrd.storage_cxt.latestObservedXid));
    TransactionIdAdvance(t_thrd.storage_cxt.latestObservedXid);
    /* extend CSNLOG pages for every xid in (latestObservedXid, nextXid) */
    while (TransactionIdPrecedes(t_thrd.storage_cxt.latestObservedXid, running->nextXid)) {
        ExtendCSNLOG(t_thrd.storage_cxt.latestObservedXid);
        TransactionIdAdvance(t_thrd.storage_cxt.latestObservedXid);
    }
    TransactionIdRetreat(t_thrd.storage_cxt.latestObservedXid); /* = running->nextXid - 1 */
    /* From this point the standby can serve snapshots. */
    t_thrd.xlog_cxt.standbyState = STANDBY_SNAPSHOT_READY;
    MultiRedoUpdateStandbyState((HotStandbyState)t_thrd.xlog_cxt.standbyState);
    /*
     * If a transaction wrote a commit record in the gap between taking and
     * logging the snapshot then latestCompletedXid may already be higher than
     * the value from the snapshot, so check before we use the incoming value.
     */
    if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid, running->latestCompletedXid))
        t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = running->latestCompletedXid;
    Assert(TransactionIdIsNormal(t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid));
    /*
     * ShmemVariableCache->nextXid must be beyond any observed xid.
     *
     * We don't expect anyone else to modify nextXid, hence we don't need to
     * hold a lock while examining it. We still acquire the lock to modify
     * it, though (and re-check under the lock before writing).
     */
    nextXid = t_thrd.storage_cxt.latestObservedXid;
    TransactionIdAdvance(nextXid);
    if (TransactionIdFollows(nextXid, t_thrd.xact_cxt.ShmemVariableCache->nextXid)) {
        LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
        if (TransactionIdFollows(nextXid, t_thrd.xact_cxt.ShmemVariableCache->nextXid)) {
            t_thrd.xact_cxt.ShmemVariableCache->nextXid = nextXid;
        }
        LWLockRelease(XidGenLock);
    }
    Assert(TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->nextXid));
    ereport(trace_recovery(DEBUG1), (errmsg("recovery snapshots are now enabled")));
}
/*
 * TransactionIdIsActive -- is xid the top-level XID of an active backend?
 *
 * This differs from TransactionIdIsInProgress in that it ignores prepared
 * transactions, as well as transactions running on the master if we're in
 * hot standby. Also, we ignore subtransactions since that's not needed
 * for current uses.
 */
bool TransactionIdIsActive(TransactionId xid)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    bool found = false;

    /*
     * A transaction older than RecentXmin cannot possibly still be running,
     * so skip the proc array scan entirely in that case.
     */
    if (TransactionIdPrecedes(xid, u_sess->utils_cxt.RecentXmin))
        return false;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int idx = 0; idx < arrayP->numProcs; idx++) {
        int procno = arrayP->pgprocnos[idx];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];

        /* Fetch xid just once - see GetNewTransactionId */
        TransactionId pxid = pgxact->xid;
        /* skip idle slots and prepared transactions (pid == 0) */
        if (!TransactionIdIsValid(pxid) || proc->pid == 0)
            continue;
        if (TransactionIdEquals(pxid, xid)) {
            found = true;
            break;
        }
    }
    LWLockRelease(ProcArrayLock);

    return found;
}
/*
 * Free the sub-xid cache memory of a proc if it has grown beyond the
 * initial cached size (PGPROC_INIT_CACHED_SUBXIDS).  With needlock the
 * cache is protected by the proc's subxidsLock; otherwise interrupts are
 * held off while the pointer is swapped out.
 */
static void ResetProcXidCache(PGPROC* proc, bool needlock)
{
    if (proc->subxids.maxNumber <= PGPROC_INIT_CACHED_SUBXIDS)
        return;

    /* Use subxidsLock to protect subxids */
    if (needlock) {
        LWLockAcquire(proc->subxidsLock, LW_EXCLUSIVE);
    } else {
        HOLD_INTERRUPTS();
    }

    proc->subxids.maxNumber = 0;
    pfree(proc->subxids.xids);
    proc->subxids.xids = NULL;

    if (needlock) {
        LWLockRelease(proc->subxidsLock);
    } else {
        RESUME_INTERRUPTS();
    }
}
/* Free xidcache before proc exit */
void ProcSubXidCacheClean()
{
if (t_thrd.proc && t_thrd.proc->subxids.maxNumber > PGPROC_INIT_CACHED_SUBXIDS) {
/* Use subxidsLock to protect subxids */
LWLockAcquire(t_thrd.proc->subxidsLock, LW_EXCLUSIVE);
t_thrd.pgxact->nxids = 0;
t_thrd.proc->subxids.maxNumber = 0;
pfree(t_thrd.proc->subxids.xids);
t_thrd.proc->subxids.xids = NULL;
LWLockRelease(t_thrd.proc->subxidsLock);
}
}
/*
 * Lazily create the shared memory context backing the per-proc sub-xid
 * caches.  Idempotent: subsequent calls are no-ops.
 */
void InitProcSubXidCacheContext()
{
    if (ProcSubXidCacheContext != NULL)
        return;

    ProcSubXidCacheContext = AllocSetContextCreate(g_instance.instance_context,
                                                   "ProcSubXidCacheContext",
                                                   ALLOCSET_DEFAULT_MINSIZE,
                                                   ALLOCSET_DEFAULT_INITSIZE,
                                                   ALLOCSET_DEFAULT_MAXSIZE,
                                                   SHARED_CONTEXT);
}
/*
 * TransactionIdIsInProgress -- is given transaction running in some backend
 *
 * Aside from some shortcuts such as checking RecentXmin and our own Xid,
 * there are four possibilities for finding a running transaction:
 *
 * 1. The given Xid is a main transaction Id. We will find this out cheaply
 * by looking at the PGXACT struct for each backend.
 *
 * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
 * We can find this out cheaply too.
 *
 * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
 * if the Xid is running on the master.
 *
 * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
 * if that is running according to PGXACT or KnownAssignedXids. This is the
 * slowest way, but sadly it has to be done always if the others failed,
 * unless we see that the cached subxact sets are complete (none have
 * overflowed).
 *
 * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids
 * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
 * This buys back some concurrency (and we can't retrieve the main Xids from
 * PGXACT again anyway; see GetNewTransactionId).
 *
 * In MPPDB cluster environment, RecentXmin might not be the minimum xid, e.g.
 * 1. T1 starts at CN
 * 2. T2 starts at DN, gets RecentXmin from GTM, larger than T1 if GTM is
 *    cleared up
 * 3. CN sends T1 to DN
 * 4. T2 may set wrong tuple hints of T1 if it considered T1 older than
 *    RecentXmin and therefore not in progress.
 * In this scenario the RecentXmin shortcut could report a wrong status for T1
 * and hence a wrong infomask for tuples T1 touched, so we do not shortcut by
 * RecentXmin by default. But when using an MVCC snapshot, the local snapshot
 * is synced with GTM and RecentXmin is guaranteed to be the minimum xid, so we
 * do shortcut by checking RecentXmin in HeapTupleSatisfiesMVCC. Assert-enabled
 * builds still recheck every path for data consistency.
 */
bool TransactionIdIsInProgress(TransactionId xid, uint32* needSync, bool shortcutByRecentXmin,
    bool bCareNextxid, bool isTopXact, bool checkLatestCompletedXid)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
#ifdef USE_ASSERT_CHECKING
    bool shortCutCheckRes = true;
#endif
    volatile int i = 0;
    volatile int j = 0;
    /*
     * Don't bother checking a transaction older than RecentXmin; it could not
     * possibly still be running. (Note: in particular, this guarantees that
     * we reject InvalidTransactionId, FrozenTransactionId, etc as not
     * running.)
     *
     * Notes: our principle for distributed transactions is:
     * We should treat gtm xact state as the global xact state; when local
     * xact state does not match the gtm xact, we block until they match
     * (SyncLocalXactsWithGTM).
     *
     * So, the shortcut `RecentXmin' is not worth worrying about, because when
     * it is assigned a value the local node must be in sync with gtm.
     */
    uint64 recycle_xid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);
    /* in hot standby mode, the proc may still be running */
    if (RecoveryInProgress()) {
        recycle_xid = InvalidTransactionId;
    }
    if (shortcutByRecentXmin && TransactionIdPrecedes(xid, recycle_xid)) {
        xc_by_recent_xmin_inc();
        /*
         * As xc_maintenance_mode does not sync local xacts with GTM for
         * consistency, here we just check not in xc_maintenance_mode.
         */
        if (!u_sess->attr.attr_common.xc_maintenance_mode) {
#ifdef USE_ASSERT_CHECKING
            shortCutCheckRes = false;
#endif
        }
#ifdef USE_ASSERT_CHECKING
        /* fall through to do recheck */
#else
        return false;
#endif
    }
    /*
     * We may have just checked the status of this transaction, so if it is
     * already known to be completed, we can fall out without any access to
     * shared memory.
     */
    if (TransactionIdIsKnownCompleted(xid)) {
        xc_by_known_xact_inc();
        return false;
    }
    if (ENABLE_DMS) {
        /* fetch TXN info locally if either reformer, original primary, or normal primary */
        bool local_fetch = SSCanFetchLocalSnapshotTxnRelatedInfo();
        if (!local_fetch) {
            bool in_progress = true;
            SSTransactionIdIsInProgress(xid, &in_progress);
            return in_progress;
        }
    }
    /*
     * Also, we can handle our own transaction (and subtransactions) without
     * any access to shared memory.
     */
    if (TransactionIdIsCurrentTransactionId(xid)) {
        xc_by_my_xact_inc();
        Assert(shortCutCheckRes == true);
        return true;
    }
    if (!RecoveryInProgress()) {
        LWLockAcquire(ProcArrayLock, LW_SHARED);
        /*
         * Now that we have the lock, we can check latestCompletedXid; if the
         * target Xid is after that, it's surely still running.
         */
        if (checkLatestCompletedXid &&
            TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid, xid)) {
            LWLockRelease(ProcArrayLock);
            xc_by_latest_xid_inc();
            /*
             * If xid < RecentXmin, xid should be smaller than latestCompletedXid,
             * so shortCutCheckRes should be false. But for data replication,
             * pages may arrive faster than xlog, and a tuple xid can exceed
             * latestCompletedXid after a standby promotes to primary. So the assert
             * cannot always hold and was removed. This does not affect MVCC:
             * the xid should be aborted. Assert(shortCutCheckRes == true);
             */
            return true;
        }
        if (isTopXact && !bCareNextxid) {
            /* Fast path: top-level xids are indexed in the proc-xact hash table. */
            int procId = ProcXactHashTableLookup(xid);
            if (procId != InvalidProcessId) {
                /*
                 * Only index proc_base_all_xacts after procId is known valid;
                 * computing the address for an invalid slot id is out of bounds.
                 */
                volatile PGXACT *pgxact = &g_instance.proc_base_all_xacts[procId];
                if (needSync != NULL) {
                    *needSync = pgxact->needToSyncXid;
                }
                LWLockRelease(ProcArrayLock);
                return true;
            }
            LWLockRelease(ProcArrayLock);
        } else {
            /* No shortcuts, gotta grovel through the array */
            for (i = 0; i < arrayP->numProcs; i++) {
                int pgprocno = arrayP->pgprocnos[i];
                volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
                volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
                TransactionId pxid;
                /* Ignore my own proc --- dealt with it above */
                if (proc == t_thrd.proc)
                    continue;
                /* Fetch xid just once - see GetNewTransactionId */
                pxid = pgxact->xid;
                if (!TransactionIdIsValid(pxid)) {
                    /* optionally treat a reserved-but-unassigned next_xid as running */
                    if (bCareNextxid && TransactionIdIsValid(pgxact->next_xid))
                        pxid = pgxact->next_xid;
                    else
                        continue;
                }
                /*
                 * Step 1: check the main Xid
                 */
                if (TransactionIdEquals(pxid, xid)) {
                    if (needSync != NULL)
                        *needSync = pgxact->needToSyncXid;
                    LWLockRelease(ProcArrayLock);
                    xc_by_main_xid_inc();
                    Assert(shortCutCheckRes == true);
                    return true;
                }
                /*
                 * We can ignore main Xids that are younger than the target Xid, since
                 * the target could not possibly be their child.
                 */
                if (TransactionIdPrecedes(xid, pxid))
                    continue;
                /*
                 * Step 2: check the cached child-Xids arrays
                 */
                if (pgxact->nxids > 0) {
                    /* Use subxidsLock to protect subxids */
                    LWLockAcquire(proc->subxidsLock, LW_SHARED);
                    for (j = pgxact->nxids - 1; j >= 0; j--) {
                        /* Fetch xid just once - see GetNewTransactionId */
                        TransactionId cxid = proc->subxids.xids[j];
                        if (TransactionIdEquals(cxid, xid)) {
                            if (needSync != NULL)
                                *needSync = pgxact->needToSyncXid;
                            LWLockRelease(proc->subxidsLock);
                            LWLockRelease(ProcArrayLock);
                            xc_by_child_xid_inc();
                            Assert(shortCutCheckRes == true);
                            return true;
                        }
                    }
                    LWLockRelease(proc->subxidsLock);
                }
            }
            LWLockRelease(ProcArrayLock);
        }
    }
    /*
     * Step 3: in hot standby mode, check the CSN log.
     */
    if (RecoveryInProgress()) {
        CommitSeqNo csn;
        csn = TransactionIdGetCommitSeqNo(xid, false, false, true, NULL);
        if (COMMITSEQNO_IS_COMMITTED(csn) || COMMITSEQNO_IS_ABORTED(csn))
            return false;
        else
            return true;
    }
    return false;
}
/*
 * Called by GetOldestXmin(): advance the shared recentGlobalXmin to `result`
 * via a compare-and-swap loop.  Gives up silently if another backend has
 * already pushed the value at or beyond `result`.
 */
static void UpdateRecentGlobalXmin(TransactionId currGlobalXmin, TransactionId result)
{
    if (module_logging_is_on(MOD_TRANS_SNAPSHOT)) {
        ereport(LOG, (errmodule(MOD_TRANS_SNAPSHOT), errmsg("recentGlobalXmin before update: currGlobalXmin = %lu",
            currGlobalXmin)));
    }

    for (;;) {
        /* Stop once the shared value has caught up with (or passed) ours. */
        if (!TransactionIdFollows(result, currGlobalXmin))
            break;
        if (pg_atomic_compare_exchange_u64(
                &t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, &currGlobalXmin, result)) {
            if (module_logging_is_on(MOD_TRANS_SNAPSHOT)) {
                ereport(LOG,
                    (errmodule(MOD_TRANS_SNAPSHOT), errmsg("recentGlobalXmin after update: %lu.", result)));
            }
            break;
        }
        /* CAS failed: currGlobalXmin now holds the freshly observed value; retry. */
    }
}
/*
 * GetOldestXmin -- returns oldest transaction that was running
 * when any current transaction was started.
 *
 * If rel is NULL or a shared relation, all backends are considered, otherwise
 * only backends running in this database are considered.
 *
 * If ignoreVacuum is TRUE then backends with the PROC_IN_VACUUM flag set are
 * ignored.
 *
 * This is used by VACUUM to decide which deleted tuples must be preserved in
 * the passed in table. For shared relations backends in all databases must be
 * considered, but for non-shared relations that's not required, since only
 * backends in my own database could ever see the tuples in them. Also, we can
 * ignore concurrently running lazy VACUUMs because (a) they must be working
 * on other tables, and (b) they don't need to do snapshot-based lookups.
 *
 * This is also used to determine where to truncate pg_subtrans. For that
 * backends in all databases have to be considered, so rel = NULL has to be
 * passed in.
 *
 * Note: we include all currently running xids in the set of considered xids.
 * This ensures that if a just-started xact has not yet set its snapshot,
 * when it does set the snapshot it cannot set xmin less than what we compute.
 * See notes in src/backend/access/transam/README.
 *
 * Note: despite the above, it's possible for the calculated value to move
 * backwards on repeated calls. The calculated value is conservative, so that
 * anything older is definitely not considered as running by anyone anymore,
 * but the exact value calculated depends on a number of things. For example,
 * if rel = NULL and there are no transactions running in the current
 * database, GetOldestXmin() returns latestCompletedXid. If a transaction
 * begins after that, its xmin will include in-progress transactions in other
 * databases that started earlier, so another call will return a lower value.
 * Nonetheless it is safe to vacuum a table in the current database with the
 * first result. There are also replication-related effects: a walsender
 * process can set its xmin based on transactions that are no longer running
 * in the master but are still being replayed on the standby, thus possibly
 * making the GetOldestXmin reading go backwards. In this case there is a
 * possibility that we lose data that the standby would like to have, but
 * there is little we can do about that --- data is only protected if the
 * walsender runs continuously while queries are executed on the standby.
 * (The Hot Standby code deals with such cases by failing standby queries
 * that needed to access already-removed data, so there's no integrity bug.)
 * The return value is also adjusted with vacuum_defer_cleanup_age, so
 * increasing that setting on the fly is another easy way to make
 * GetOldestXmin() move backwards, with no consequences for data integrity.
 */
TransactionId GetOldestXmin(Relation rel, bool bFixRecentGlobalXmin, bool bRecentGlobalXminNoCheck)
{
    TransactionId result = InvalidTransactionId;
    TransactionId currGlobalXmin;
    TransactionId replication_slot_xmin;
    volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
    /* Fast exit: reuse the session's cached RecentGlobalXmin when allowed. */
    if (!bFixRecentGlobalXmin && TransactionIdIsNormal(u_sess->utils_cxt.RecentGlobalXmin) && !bRecentGlobalXminNoCheck)
        return u_sess->utils_cxt.RecentGlobalXmin;
    /* Fetch into local variable, don't need to hold ProcArrayLock */
    replication_slot_xmin = g_instance.proc_array_idx->replication_slot_xmin;
    if (!GTM_LITE_MODE) {
        /* Get recentLocalXmin from the latest snapshot */
        result = GetMultiSnapshotOldestXmin();
        if (bFixRecentGlobalXmin) {
            /* Fix recentGlobalXmin: never let it move past the session's value */
            if (!TransactionIdIsNormal(result) || TransactionIdFollows(result, u_sess->utils_cxt.RecentGlobalXmin))
                result = u_sess->utils_cxt.RecentGlobalXmin;
            /* Update recentGlobalXmin if needed */
            if (!u_sess->attr.attr_common.xc_maintenance_mode && !u_sess->utils_cxt.cn_xc_maintain_mode) {
                currGlobalXmin = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin);
                UpdateRecentGlobalXmin(currGlobalXmin, result);
            }
        } else if (!bRecentGlobalXminNoCheck) {
            /* Get recentGlobalXmin from ShmemVariableCache and take the older of the two */
            currGlobalXmin = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin);
            if (TransactionIdIsNormal(currGlobalXmin) &&
                (!TransactionIdIsValid(result) || TransactionIdPrecedes(currGlobalXmin, result)))
                result = currGlobalXmin;
        }
    } else {
        /* directly fetch recentGlobalXmin from ShmemVariableCache */
        result = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin);
    }
    /* Update by vacuum_defer_cleanup_age, clamping at the first normal xid */
    if (TransactionIdPrecedes(result, (uint64)u_sess->attr.attr_storage.vacuum_defer_cleanup_age)) {
        result = FirstNormalTransactionId;
    } else {
        result -= u_sess->attr.attr_storage.vacuum_defer_cleanup_age;
    }
    /* Check whether there's a replication slot requiring an older xmin. */
    if (TransactionIdIsNormal(replication_slot_xmin) && TransactionIdPrecedes(replication_slot_xmin, result))
        result = replication_slot_xmin;
    if (!TransactionIdIsNormal(result))
        result = FirstNormalTransactionId;
    /*
     * NOTE(review): the comment below is inherited; no ProcArrayLock is
     * actually held at this point -- these are unlocked reads.  Also, the
     * check that follows re-applies replication_slot_xmin, which was already
     * applied above; redundant but harmless.
     */
    /* fetch into volatile var while ProcArrayLock is held */
    replication_slot_xmin = g_instance.proc_array_idx->replication_slot_xmin;
    replication_slot_catalog_xmin = g_instance.proc_array_idx->replication_slot_catalog_xmin;
    /*
     * Check whether there are replication slots requiring an older xmin.
     */
    if (TransactionIdIsValid(replication_slot_xmin) &&
        NormalTransactionIdPrecedes(replication_slot_xmin, result)) {
        result = replication_slot_xmin;
    }
    /*
     * After locks have been released and defer_cleanup_age has been applied,
     * check whether we need to back up further to make logical decoding
     * possible. We need to do so if we're computing the global limit (rel =
     * NULL) or if the passed relation is a catalog relation of some kind.
     *
     * NOTE(review): the condition requires rel != NULL, so the rel = NULL
     * (global limit) case described above never backs up to the catalog
     * xmin here; upstream PostgreSQL uses `rel == NULL ||` instead.  Confirm
     * whether this divergence is intentional in openGauss.
     */
    if ((rel != NULL && RelationIsAccessibleInLogicalDecoding(rel)) &&
        TransactionIdIsValid(replication_slot_catalog_xmin) &&
        NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result))
        result = replication_slot_catalog_xmin;
    return result;
}
/*
 * GetGlobalOldestXmin -- oldest xmin across the recent snapshots, pushed
 * back by vacuum_defer_cleanup_age and clamped at FirstNormalTransactionId.
 */
TransactionId GetGlobalOldestXmin()
{
    /* directly fetch Global OldestXmin */
    TransactionId result = GetMultiSnapshotOldestXmin();
    uint64 deferAge = (uint64)u_sess->attr.attr_storage.vacuum_defer_cleanup_age;

    /* Update by vacuum_defer_cleanup_age */
    result = TransactionIdPrecedes(result, deferAge) ? FirstNormalTransactionId : (result - deferAge);

    return result;
}
/*
 * GetOldestXminForUndo -- return the oldest snapshot xmin and, through
 * *recycleXmin, the horizon below which undo may be recycled.  With
 * flashback enabled the recycle horizon also respects the flashback
 * oldest-xmin and never drops below globalRecycleXid.
 */
TransactionId GetOldestXminForUndo(TransactionId *recycleXmin)
{
    TransactionId oldestXmin = GetMultiSnapshotOldestXmin();
    TransactionId globalRecycleXid = pg_atomic_read_u64(&g_instance.undo_cxt.globalRecycleXid);

    *recycleXmin = oldestXmin;
    if (ENABLE_TCAP_VERSION) {
        /* Flashback may need undo kept longer than the snapshot horizon. */
        TransactionId flashbackXmin = g_instance.flashback_cxt.globalOldestXminInFlashback;
        if (TransactionIdIsValid(flashbackXmin)) {
            *recycleXmin = (oldestXmin < flashbackXmin) ? oldestXmin : flashbackXmin;
        }
        /* Never move the horizon below what has already been recycled. */
        if (unlikely(TransactionIdPrecedes(*recycleXmin, globalRecycleXid))) {
            *recycleXmin = globalRecycleXid;
        }
    }

    ereport(DEBUG1, (errmodule(MOD_UNDO),
        errmsg("recycleXmin is %lu, globalOldestXminInFlashback is %lu, oldestXmin is %lu.",
            *recycleXmin, g_instance.flashback_cxt.globalOldestXminInFlashback, oldestXmin)));
    return oldestXmin;
}
/*
* GetMaxSnapshotXidCount -- get max size for snapshot XID array
*
* We have to export this for use by snapmgr.c.
*/
int GetMaxSnapshotXidCount(void)
{
return g_instance.proc_array_idx->maxProcs;
}
/*
 * GetMaxSnapshotSubxidCount -- max size for a snapshot's sub-XID array.
 *
 * Exported for use by snapmgr.c.
 */
int GetMaxSnapshotSubxidCount(void)
{
    /* upper bound on cached subxids any snapshot can carry */
    return TOTAL_MAX_CACHED_SUBXIDS;
}
/*
 * Oldest transaction for catalog access that was running when any current
 * transaction started, taking replication slots into account.  Callers must
 * ensure it is safe to read replication_slot_catalog_xmin before calling.
 */
TransactionId GetOldestCatalogXmin()
{
    TransactionId result = u_sess->utils_cxt.RecentGlobalXmin;
    TransactionId slotCatalogXmin = g_instance.proc_array_idx->replication_slot_catalog_xmin;

    /* A logical replication slot may pin an older catalog horizon. */
    if (TransactionIdIsNormal(slotCatalogXmin) && TransactionIdPrecedes(slotCatalogXmin, result))
        result = slotCatalogXmin;

    return result;
}
#ifndef ENABLE_MULTIPLE_NODES
/*
 * Fill in one group member's snapshot fields while the leader holds
 * ProcArrayLock: publish the group-computed xmin into the member's PGXACT
 * (clamped by standbyXmin during recovery) and stamp the snapshot CSN.
 * *xmin may be lowered as a side effect and is carried to later members.
 */
static void GroupGetSnapshotInternal(PGXACT* pgxact, Snapshot snapshot, TransactionId *xmin)
{
    /* Publish the group xmin if this member has none yet. */
    if (!TransactionIdIsValid(pgxact->xmin)) {
        pgxact->xmin = *xmin;
    }
    /*
     * During recovery, clamp the group xmin down to the replayed standbyXmin
     * and (re)publish it.  NOTE(review): this second write can overwrite the
     * xmin set just above -- presumably intended; confirm.
     */
    if (snapshot->takenDuringRecovery && TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin)) {
        if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin, *xmin)) {
            *xmin = t_thrd.xact_cxt.ShmemVariableCache->standbyXmin;
        }
        pgxact->xmin = *xmin;
    }
    /* Snapshot CSN is the next commit sequence number at this instant. */
    snapshot->snapshotcsn = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo);
}
/*
 * GroupGetSnapshot -- group snapshot getting
 *
 * When we cannot immediately acquire ProcArrayLock in exclusive mode, add
 * ourselves to a list of processes that need to get snapshot.
 * The first process to add itself to the list will acquire ProcArrayLock
 * in exclusive mode and perform GroupGetSnapshotInternal on behalf of all
 * group members. This avoids a great deal of contention around
 * ProcArrayLock when many processes are trying to get snapshot at once,
 * since the lock need not be repeatedly handed off from one process to the next.
 */
static void GroupGetSnapshot(PGPROC* proc)
{
    uint32 nextidx;
    uint32 wakeidx;
    TransactionId xmin;
    TransactionId xmax;
    TransactionId globalxmin;
    volatile TransactionId replication_slot_xmin = InvalidTransactionId;
    volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
    bool clearGroup = false;
    HOLD_INTERRUPTS();
    /* Add ourselves to the list of processes needing to get snapshot. */
    proc->snapshotGroupMember = true;
    /* CAS-push ourselves onto the lock-free waiter stack. */
    while (true) {
        nextidx = pg_atomic_read_u32(&g_instance.proc_base->snapshotGroupFirst);
        pg_atomic_write_u32(&proc->snapshotGroupNext, nextidx);
        /* Ensure all previous writes are visible before follower continues. */
        pg_memory_barrier();
        if (pg_atomic_compare_exchange_u32(
                &g_instance.proc_base->snapshotGroupFirst, &nextidx, (uint32)proc->pgprocno))
            break;
    }
    /*
     * If the list was not empty, the leader will get our snapshot. It is
     * impossible to have followers without a leader because the first process
     * that has added itself to the list will always have nextidx as
     * INVALID_PGPROCNO.
     */
    if (nextidx != INVALID_PGPROCNO) {
        int extraWaits = 0;
        /* Sleep until the leader gets our snapshot. */
        for (;;) {
            /* acts as a read barrier */
            PGSemaphoreLock(&proc->sem, false);
            if (!proc->snapshotGroupMember)
                break;
            extraWaits++;
        }
        Assert(pg_atomic_read_u32(&proc->snapshotGroupNext) == INVALID_PGPROCNO);
        /* Fix semaphore count for any absorbed wakeups */
        while (extraWaits-- > 0)
            PGSemaphoreUnlock(&proc->sem);
        /* in case of memory reordering in relaxed memory model like ARM */
        pg_memory_barrier();
        RESUME_INTERRUPTS();
        return;
    }
    RESUME_INTERRUPTS();
    /* We are the leader. Acquire the lock on behalf of everyone. */
    bool retryGet = false;
RETRY_GET:
    if (retryGet) {
        /* If interrupted while spinning, abandon the attempt and just wake the group. */
        if (InterruptPending) {
            clearGroup = true;
        }
        pg_usleep(100L);
    }
    if (!clearGroup) {
        XLogRecPtr redoEndLsn = GetXLogReplayRecPtr(NULL, NULL);
        LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
        /*
         * On a standby: wait (retry) until standbyXmin has advanced past the
         * redo-cleanup xmin, or until replay catches up to the cleanup LSN,
         * so the group snapshot cannot see data about to be cleaned up.
         */
        bool condition = (t_thrd.xact_cxt.ShmemVariableCache->standbyXmin <=
                          t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin) &&
                         (t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn > redoEndLsn);
        if (condition) {
            LWLockRelease(ProcArrayLock);
            retryGet = true;
            goto RETRY_GET;
        }
        /*
         * Now that we've got the lock, clear the list of processes waiting for
         * group snapshot getting, saving a pointer to the head of the list. Trying
         * to pop elements one at a time could lead to an ABA problem.
         */
        while (true) {
            nextidx = pg_atomic_read_u32(&g_instance.proc_base->snapshotGroupFirst);
            if (pg_atomic_compare_exchange_u32(&g_instance.proc_base->snapshotGroupFirst, &nextidx, INVALID_PGPROCNO))
                break;
        }
        /* Remember head of list so we can perform wakeups after dropping lock. */
        wakeidx = nextidx;
        /* calculate the following infos after we have got ProcArrayLock. */
        /* xmax is always latestCompletedXid + 1 */
        xmax = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
        Assert(TransactionIdIsNormal(xmax));
        TransactionIdAdvance(xmax);
        /* initialize xmin calculation with xmax */
        globalxmin = xmin = xmax;
        /* fetch into volatile var while ProcArrayLock is held */
        replication_slot_xmin = g_instance.proc_array_idx->replication_slot_xmin;
        replication_slot_catalog_xmin = g_instance.proc_array_idx->replication_slot_catalog_xmin;
        /* Walk the list and get all snapshots. */
        while (nextidx != INVALID_PGPROCNO) {
            PGPROC* procMember = g_instance.proc_base_all_procs[nextidx];
            PGXACT* pgxact = &g_instance.proc_base_all_xacts[nextidx];
            pg_memory_barrier();
            GroupGetSnapshotInternal(pgxact, procMember->snapshotGroup, &xmin);
            procMember->xminGroup = xmin;
            procMember->xmaxGroup = xmax;
            procMember->globalxminGroup = globalxmin;
            procMember->replicationSlotXminGroup = replication_slot_xmin;
            procMember->replicationSlotCatalogXminGroup = replication_slot_catalog_xmin;
            /* Move to next proc in list. */
            nextidx = pg_atomic_read_u32(&procMember->snapshotGroupNext);
        }
        /* We're done with the lock now. */
        LWLockRelease(ProcArrayLock);
    } else {
        /* clear the group, then process interrupt */
        while (true) {
            nextidx = pg_atomic_read_u32(&g_instance.proc_base->snapshotGroupFirst);
            if (pg_atomic_compare_exchange_u32(&g_instance.proc_base->snapshotGroupFirst, &nextidx, INVALID_PGPROCNO))
                break;
        }
        wakeidx = nextidx;
    }
    /*
     * Now that we've released the lock, go back and wake everybody up. We
     * don't do this under the lock so as to keep lock hold times to a
     * minimum. The system calls we need to perform to wake other processes
     * up are probably much slower than the simple memory writes we did while
     * holding the lock.
     */
    while (wakeidx != INVALID_PGPROCNO) {
        PGPROC* procMember = g_instance.proc_base_all_procs[wakeidx];
        wakeidx = pg_atomic_read_u32(&procMember->snapshotGroupNext);
        pg_atomic_write_u32(&procMember->snapshotGroupNext, INVALID_PGPROCNO);
        /* ensure all previous writes are visible before follower continues. */
        pg_memory_barrier();
        procMember->snapshotGroupMember = false;
        if (procMember != t_thrd.proc)
            PGSemaphoreUnlock(&procMember->sem);
    }
    if (clearGroup) {
        CHECK_FOR_INTERRUPTS();
    }
}
/*
 * AgentCopySnapshot
 *    Borrow snapshot bounds from a running backend so that a CM agent (or
 *    gsql fallback) can obtain a usable snapshot without waiting for
 *    recovery to produce a consistent one.
 *
 * Scans the procarray for the backend whose snapshot xmin is the largest
 * (to reduce recovery conflicts) and returns that backend's cached
 * xmin / snapXmax / snapCSN triple.  Outputs are set to invalid values
 * when no candidate backend exists.
 *
 * The caller (GetSnapshotData) holds ProcArrayLock in shared mode, so
 * procarray membership cannot change during the scan; individual PGXACT
 * fields may still be updated concurrently, hence the cached reads below.
 */
void AgentCopySnapshot(TransactionId *xmin, TransactionId *xmax, CommitSeqNo *snapcsn)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int* pgprocnos = arrayP->pgprocnos;
    int numProcs = arrayP->numProcs;
    volatile PGXACT* pgxact = NULL;
    PGPROC* proc = NULL;
    int pgprocno;
    /* initialize so the value can never be read undefined */
    int maxPgprocno = -1;
    TransactionId pgprocXmin;
    TransactionId maxpgprocXmin = InvalidTransactionId;
    for (int index = 0; index < numProcs; index++) {
        pgprocno = pgprocnos[index];
        pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        /*
         * Backend is doing logical decoding which manages snapshot
         * separately; skip it.
         */
        if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING) {
            continue;
        }
        /* Never copy from ourselves. */
        if (pgxact == t_thrd.pgxact) {
            continue;
        }
        pgprocXmin = pgxact->xmin; /* fetch just once */
        /* remember the proc with the maximal xmin to reduce recovery conflict. */
        if (TransactionIdIsNormal(pgprocXmin) && TransactionIdPrecedes(maxpgprocXmin, pgprocXmin)) {
            maxpgprocXmin = pgprocXmin;
            maxPgprocno = pgprocno;
        }
    }
    if (TransactionIdIsValid(maxpgprocXmin)) {
        proc = g_instance.proc_base_all_procs[maxPgprocno];
        /*
         * Use the xmin cached during the scan instead of re-reading
         * PGXACT->xmin: the owning backend may have cleared or advanced it
         * meanwhile, which would break the pairing with snapXmax/snapCSN.
         */
        *xmin = maxpgprocXmin;
        *xmax = proc->snapXmax;
        *snapcsn = proc->snapCSN;
    } else {
        *xmin = InvalidTransactionId;
        *xmax = InvalidTransactionId;
        *snapcsn = InvalidCommitSeqNo;
    }
}
#endif
/*
* GetSnapshotData -- returns information about running transactions.
*
* The returned snapshot includes xmin (lowest still-running xact ID),
* xmax (highest completed xact ID + 1), and a list of running xact IDs
* in the range xmin <= xid < xmax. It is used as follows:
* All xact IDs < xmin are considered finished.
* All xact IDs >= xmax are considered still running.
* For an xact ID xmin <= xid < xmax, consult list to see whether
* it is considered running or not.
* This ensures that the set of transactions seen as "running" by the
* current xact will not change after it takes the snapshot.
*
* All running top-level XIDs are included in the snapshot, except for lazy
* VACUUM processes. We also try to include running subtransaction XIDs,
* but since PGPROC has only a limited cache area for subxact XIDs, full
* information may not be available. If we find any overflowed subxid arrays,
* we have to mark the snapshot's subxid data as overflowed, and extra work
* *may* need to be done to determine what's running (see XidInMVCCSnapshot()
* in heapam_visibility.c).
*
* We also update the following backend-global variables:
* TransactionXmin: the oldest xmin of any snapshot in use in the
* current transaction (this is the same as MyPgXact->xmin).
* RecentXmin: the xmin computed for the most recent snapshot. XIDs
* older than this are known not running any more.
* RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
* running transactions, except those running LAZY VACUUM). This is
* the same computation done by GetOldestXmin(true, true).
* RecentGlobalDataXmin: the global xmin for non-catalog tables
* >= RecentGlobalXmin
*
* Note: this function should probably not be called with an argument that's
* not statically allocated (see xip allocation below).
*/
#ifndef ENABLE_MULTIPLE_NODES
Snapshot GetSnapshotData(Snapshot snapshot, bool force_local_snapshot, bool forHSFeedBack)
#else
Snapshot GetSnapshotData(Snapshot snapshot, bool force_local_snapshot)
#endif
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    TransactionId xmin;
    TransactionId xmax;
    TransactionId globalxmin;
    int index;
    volatile TransactionId replication_slot_xmin = InvalidTransactionId;
    volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
    bool is_exec_cn = IS_PGXC_COORDINATOR && !IsConnFromCoord();
    bool is_exec_dn = IS_PGXC_DATANODE && !IsConnFromCoord() && !IsConnFromDatanode();
    WaitState oldStatus = STATE_WAIT_UNDEFINED;
    Assert(snapshot != NULL);
#ifdef PGXC /* PGXC_DATANODE */
    /* Assume a local snapshot until a global one is successfully obtained. */
    t_thrd.xact_cxt.useLocalSnapshot = false;
    if ((IS_MULTI_DISASTER_RECOVER_MODE && !is_exec_dn) ||
        (GTM_LITE_MODE &&
        ((is_exec_cn && !force_local_snapshot) || /* GTM_LITE exec cn */
        (!is_exec_cn && u_sess->utils_cxt.snapshot_source == SNAPSHOT_COORDINATOR)))) { /* GTM_LITE other node */
        /*
         * Obtain a global snapshot for a openGauss session
         * if possible. When not in postmaster environment, get local snapshot, --single mode e.g.
         */
        if (!useLocalXid) {
            if (!u_sess->attr.attr_common.xc_maintenance_mode && IsPostmasterEnvironment &&
                GetPGXCSnapshotData(snapshot)) {
                return snapshot;
            }
        }
    }
    /* first we try to get multiversion snapshot */
    if (t_thrd.postmaster_cxt.HaShmData->current_mode == PRIMARY_MODE ||
        t_thrd.postmaster_cxt.HaShmData->current_mode == NORMAL_MODE) {
RETRY:
        if (GTM_LITE_MODE) {
            /* local snapshot, setup preplist array, must construct preplist before getting local snapshot */
            SetLocalSnapshotPreparedArray(snapshot);
            snapshot->gtm_snapshot_type = GTM_SNAPSHOT_TYPE_LOCAL;
        }
        Snapshot result;
        if (ENABLE_DMS) {
            /* fetch TXN info locally if either reformer, original primary, or normal primary */
            if (SSCanFetchLocalSnapshotTxnRelatedInfo()) {
                result = GetLocalSnapshotData(snapshot);
                snapshot->snapshotcsn = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo);
            } else {
                result = SSGetSnapshotData(snapshot);
                if (result == NULL) {
                    ereport(ERROR, (errmsg("failed to request snapshot as current node is in reform!")));
                }
            }
        } else {
            result = GetLocalSnapshotData(snapshot);
            snapshot->snapshotcsn = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo);
        }
        if (result) {
            if (GTM_LITE_MODE) {
                /* gtm lite check csn, if not pass, try to get local snapshot form multiversion again */
                CommitSeqNo return_csn = set_proc_csn_and_check("GetLocalSnapshotData", snapshot->snapshotcsn,
                    snapshot->gtm_snapshot_type, SNAPSHOT_DATANODE);
                if (!COMMITSEQNO_IS_COMMITTED(return_csn)) {
                    ereport(LOG,
                        (errcode(ERRCODE_SNAPSHOT_INVALID), errmsg("Retry to get local multiversion snapshot")));
                    goto RETRY;
                }
                u_sess->utils_cxt.RecentGlobalXmin = GetOldestXmin(NULL, true);
                u_sess->utils_cxt.RecentGlobalCatalogXmin = GetOldestCatalogXmin();
            }
            return result;
        }
    }
    /* For gtm-lite and gtm-free, use local snapshot */
    t_thrd.xact_cxt.useLocalSnapshot = true;
    /*
     * The codes below run when GetPGXCSnapshotData() couldn't get snapshot from
     * GTM. So no data in snapshot will be used.
     */
    cleanSnapshot(snapshot);
#endif
    /* By here no available version for local snapshot
     *
     * It is sufficient to get shared lock on ProcArrayLock, even if we are
     * going to set MyPgXact->xmin.
     */
    snapshot->takenDuringRecovery = RecoveryInProgress();
    if (snapshot->takenDuringRecovery) {
        oldStatus = pgstat_report_waitstatus(STATE_STANDBY_GET_SNAPSHOT);
    }
    /* Retry bookkeeping for the standby (recovery) snapshot path below. */
    bool retry_get = false;
    uint64 retry_count = 0;
    const static uint64 WAIT_COUNT = 0x7FFFF;
    /* reset xmin before acquiring lwlock, in case blocking redo */
    t_thrd.pgxact->xmin = InvalidTransactionId;
RETRY_GET:
    /*
     * Standby path: ordinary backends (not stream workers, not extreme-RTO
     * readers, not the CM agent) may have to wait for redo to advance far
     * enough that the snapshot xmin is safe against standby cleanup.
     */
    if (snapshot->takenDuringRecovery && !StreamThreadAmI() && !IS_EXRTO_READ &&
        !u_sess->proc_cxt.clientIsCMAgent) {
        if (InterruptPending) {
            (void)pgstat_report_waitstatus(oldStatus);
        }
        if (retry_get) {
            CHECK_FOR_INTERRUPTS();
            pg_usleep(100L);
        }
        XLogRecPtr redoEndLsn = GetXLogReplayRecPtr(NULL, NULL);
        retry_count++;
        /* Periodically log progress so a stuck standby snapshot is diagnosable. */
        if ((retry_count & WAIT_COUNT) == WAIT_COUNT) {
            ereport(LOG, (errmsg("standbyRedoCleanupXmin = %ld, "
                "standbyRedoCleanupXminLsn = %ld, "
                "standbyXmin = %ld, redoEndLsn = %ld",
                t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin,
                t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn,
                t_thrd.xact_cxt.ShmemVariableCache->standbyXmin,
                redoEndLsn)));
        }
        /* gsql fallback: copy a snapshot from another backend instead of waiting. */
        if ((u_sess->proc_cxt.gsqlRemainCopyNum > 0 && retry_get)) {
            LWLockAcquire(ProcArrayLock, LW_SHARED);
            if ((t_thrd.xact_cxt.ShmemVariableCache->standbyXmin
                <= t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin)
                && (t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn > redoEndLsn)) {
                /*
                 * If CM agent cannot get consistency snapshot immediately, try
                 * getting snapshot from other backends.
                 */
                AgentCopySnapshot(&xmin, &xmax, &snapshot->snapshotcsn);
                bool obtained = TransactionIdIsValid(xmin) && TransactionIdIsValid(xmax) &&
                    snapshot->snapshotcsn != InvalidCommitSeqNo;
                if (obtained) {
                    globalxmin = xmin;
                    /* fetch into volatile var while ProcArrayLock is held */
                    replication_slot_xmin = g_instance.proc_array_idx->replication_slot_xmin;
                    replication_slot_catalog_xmin = g_instance.proc_array_idx->replication_slot_catalog_xmin;
                    if (!TransactionIdIsValid(t_thrd.pgxact->xmin)) {
                        t_thrd.pgxact->handle = GetCurrentTransactionHandleIfAny();
                    }
                    t_thrd.pgxact->xmin = u_sess->utils_cxt.TransactionXmin = xmin;
                    LWLockRelease(ProcArrayLock);
                    u_sess->proc_cxt.gsqlRemainCopyNum--;
                    /* reuse the groupgetsnapshot logic to set snapshot and thread information. */
                    goto GROUP_GET_SNAPSHOT;
                }
                LWLockRelease(ProcArrayLock);
                retry_get = true;
                goto RETRY_GET;
            }
        /*
         * Try the exclusive lock without blocking; if redo cleanup has
         * outrun replay while full-sync dispatch is active, back off
         * and retry instead of handing out an unsafe xmin.
         */
        } else if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) {
            if ((t_thrd.xact_cxt.ShmemVariableCache->standbyXmin <=
                t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin) &&
                (t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn > redoEndLsn) &&
                parallel_recovery::in_full_sync_dispatch()) {
                LWLockRelease(ProcArrayLock);
                retry_get = true;
                goto RETRY_GET;
            }
#ifndef ENABLE_MULTIPLE_NODES
        /* Hot-standby feedback callers block on the lock rather than joining a group. */
        } else if (forHSFeedBack) {
            LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
            if ((t_thrd.xact_cxt.ShmemVariableCache->standbyXmin
                <= t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin)
                && (t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn > redoEndLsn)) {
                LWLockRelease(ProcArrayLock);
                retry_get = true;
                goto RETRY_GET;
            }
        }
#endif
        /*
         * Lock not immediately available: join the group-snapshot
         * mechanism and let the current ProcArrayLock holder compute a
         * snapshot on our behalf (see GroupGetSnapshot).
         */
        else {
            if (!retry_get) {
                retry_get = true;
                goto RETRY_GET;
            }
            if (!TransactionIdIsValid(t_thrd.pgxact->xmin)) {
                t_thrd.pgxact->handle = GetCurrentTransactionHandleIfAny();
            }
            t_thrd.proc->snapshotGroup = snapshot;
            /* ensure all previous writes are visible before setting snapshotGroup. */
            pg_memory_barrier();
            GroupGetSnapshot(t_thrd.proc);
            xmin = t_thrd.proc->xminGroup;
            xmax = t_thrd.proc->xmaxGroup;
            globalxmin = t_thrd.proc->globalxminGroup;
            replication_slot_xmin = t_thrd.proc->replicationSlotXminGroup;
            replication_slot_catalog_xmin = t_thrd.proc->replicationSlotCatalogXminGroup;
            u_sess->utils_cxt.TransactionXmin = t_thrd.pgxact->xmin;
            /* Clear our group slots so the next snapshot starts clean. */
            t_thrd.proc->snapshotGroup = NULL;
            t_thrd.proc->xminGroup = InvalidTransactionId;
            t_thrd.proc->xmaxGroup = InvalidTransactionId;
            t_thrd.proc->globalxminGroup = InvalidTransactionId;
            t_thrd.proc->replicationSlotXminGroup = InvalidTransactionId;
            t_thrd.proc->replicationSlotCatalogXminGroup = InvalidTransactionId;
            /* csn == 0 means the group leader aborted without filling us in; retry. */
            if (snapshot->snapshotcsn == 0) {
                retry_get = true;
                goto RETRY_GET;
            }
            goto GROUP_GET_SNAPSHOT;
        }
    } else {
        LWLockAcquire(ProcArrayLock, LW_SHARED);
    }
    /* xmax is always latestCompletedXid + 1 */
    xmax = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    Assert(TransactionIdIsNormal(xmax));
    TransactionIdAdvance(xmax);
    /* initialize xmin calculation with xmax */
    globalxmin = xmin = xmax;
    /*
     * If we're in recovery then snapshot data comes from a different place,
     * so decide which route we take before grab the lock. It is possible for
     * recovery to end before we finish taking snapshot, and for newly
     * assigned transaction ids to be added to the procarray. Xmax cannot
     * change while we hold ProcArrayLock, so those newly added transaction
     * ids would be filtered away, so we need not be concerned about them.
     */
#ifndef ENABLE_MULTIPLE_NODES
    if (!snapshot->takenDuringRecovery || forHSFeedBack) {
#else
    if (!snapshot->takenDuringRecovery) {
#endif
        int* pgprocnos = arrayP->pgprocnos;
        int numProcs;
        /*
         * Spin over procArray checking xid, xmin, and subxids. The goal is
         * to gather all active xids, find the lowest xmin, and try to record
         * subxids.
         */
        numProcs = arrayP->numProcs;
        for (index = 0; index < numProcs; index++) {
            int pgprocno = pgprocnos[index];
            volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
            TransactionId xid = InvalidTransactionId;
            /*
             * Backend is doing logical decoding which manages xmin
             * separately, check below.
             */
            if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
                continue;
            /* Update globalxmin to be the smallest valid xmin, only Ignore procs running LAZY VACUUM xmin */
            if (!(pgxact->vacuumFlags & PROC_IN_VACUUM)) {
                xid = pgxact->xmin; /* fetch just once */
            }
            if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, globalxmin))
                globalxmin = xid;
            /* Fetch xid just once - see GetNewTransactionId */
            xid = pgxact->xid;
            /* If no XID assigned, use xid passed down from CN */
            if (!TransactionIdIsNormal(xid))
                xid = pgxact->next_xid;
            /*
             * If the transaction has no XID assigned, we can skip it; it
             * won't have sub-XIDs either. If the XID is >= xmax, we can also
             * skip it; such transactions will be treated as running anyway
             * (and any sub-XIDs will also be >= xmax).
             */
            if (!TransactionIdIsNormal(xid) || !TransactionIdPrecedes(xid, xmax))
                continue;
            /*
             * We don't include our own XIDs (if any) in the snapshot, but we
             * must include them in xmin.
             */
            if (TransactionIdPrecedes(xid, xmin))
                xmin = xid;
            /*
             * NOTE(review): this self-skip is a no-op at the end of the loop
             * body -- presumably a remnant of removed xip-collection code;
             * confirm before relying on it.
             */
            if (pgxact == t_thrd.pgxact)
                continue;
        }
    }
    /* fetch into volatile var while ProcArrayLock is held */
    replication_slot_xmin = g_instance.proc_array_idx->replication_slot_xmin;
    replication_slot_catalog_xmin = g_instance.proc_array_idx->replication_slot_catalog_xmin;
    if (!TransactionIdIsValid(t_thrd.pgxact->xmin)) {
        t_thrd.pgxact->xmin = u_sess->utils_cxt.TransactionXmin = xmin;
        t_thrd.pgxact->handle = GetCurrentTransactionHandleIfAny();
    }
#ifndef ENABLE_MULTIPLE_NODES
    if (snapshot->takenDuringRecovery && TransactionIdIsValid(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin)) {
        if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->standbyXmin, xmin)) {
            xmin = t_thrd.xact_cxt.ShmemVariableCache->standbyXmin;
        }
        t_thrd.pgxact->xmin = u_sess->utils_cxt.TransactionXmin = xmin;
    }
#endif
    snapshot->snapshotcsn = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo);
    if (GTM_LITE_MODE) { /* gtm lite check csn, should always pass the check */
        (void)set_proc_csn_and_check("GetLocalSnapshotDataFromProc", snapshot->snapshotcsn,
            snapshot->gtm_snapshot_type, SNAPSHOT_LOCAL);
    }
    LWLockRelease(ProcArrayLock);
#ifndef ENABLE_MULTIPLE_NODES
/*
 * NOTE(review): this label is only compiled #ifndef ENABLE_MULTIPLE_NODES,
 * while the gotos that target it above are not guarded -- verify the
 * ENABLE_MULTIPLE_NODES build of this path.
 */
GROUP_GET_SNAPSHOT:
#endif
    /* Save the xmax and csn, so that the CM agent can obtain them. */
    t_thrd.proc->snapXmax = xmax;
    t_thrd.proc->snapCSN = snapshot->snapshotcsn;
    /*
     * Update globalxmin to include actual process xids. This is a slightly
     * different way of computing it than GetOldestXmin uses, but should give
     * the same result.
     */
    if (TransactionIdPrecedes(xmin, globalxmin)) {
        globalxmin = xmin;
    }
    /* When initdb we set vacuum_defer_cleanup_age to zero, so we can vacuum
       freeze three default database to avoid that localxid larger than GTM next_xid. */
    if (isSingleMode) {
        u_sess->attr.attr_storage.vacuum_defer_cleanup_age = 0;
    }
    /* Update global variables too */
    if (TransactionIdPrecedes(globalxmin, (uint64)u_sess->attr.attr_storage.vacuum_defer_cleanup_age)) {
        u_sess->utils_cxt.RecentGlobalXmin = FirstNormalTransactionId;
    } else {
        u_sess->utils_cxt.RecentGlobalXmin = globalxmin - u_sess->attr.attr_storage.vacuum_defer_cleanup_age;
    }
    if (!TransactionIdIsNormal(u_sess->utils_cxt.RecentGlobalXmin)) {
        u_sess->utils_cxt.RecentGlobalXmin = FirstNormalTransactionId;
    }
    /* Check whether there's a replication slot requiring an older xmin. */
    if (TransactionIdIsValid(replication_slot_xmin) &&
        TransactionIdPrecedes(replication_slot_xmin, u_sess->utils_cxt.RecentGlobalXmin)) {
        u_sess->utils_cxt.RecentGlobalXmin = replication_slot_xmin;
    }
    /* Check whether there's a standby requiring an older xmin when dms is enabled. */
    if (SS_NORMAL_PRIMARY && SS_REPLICATION_MAIN_STANBY_NODE) {
        uint64 global_xmin = SSGetGlobalOldestXmin(u_sess->utils_cxt.RecentGlobalXmin);
        u_sess->utils_cxt.RecentGlobalXmin = global_xmin;
    }
    /* Non-catalog tables can be vacuumed if older than this xid */
    u_sess->utils_cxt.RecentGlobalDataXmin = u_sess->utils_cxt.RecentGlobalXmin;
    /*
     * Check whether there's a replication slot requiring an older catalog
     * xmin.
     */
    if (TransactionIdIsNormal(replication_slot_catalog_xmin) &&
        NormalTransactionIdPrecedes(replication_slot_catalog_xmin, u_sess->utils_cxt.RecentGlobalXmin)) {
        u_sess->utils_cxt.RecentGlobalXmin = replication_slot_catalog_xmin;
    }
    u_sess->utils_cxt.RecentXmin = xmin;
#ifndef ENABLE_MULTIPLE_NODES
    if (forHSFeedBack) {
        u_sess->utils_cxt.RecentGlobalXmin = globalxmin;
    }
#endif
    snapshot->xmin = xmin;
    snapshot->xmax = xmax;
    snapshot->curcid = GetCurrentCommandId(false);
#ifdef PGXC
    if (!RecoveryInProgress()) {
        int errlevel = LOG;
        if (u_sess->attr.attr_common.xc_maintenance_mode || IsAutoVacuumLauncherProcess() || !IsNormalProcessingMode())
            errlevel = DEBUG1;
        /* Just ForeignScan runs in the compute pool, the snapshot and gxid is
         * not necessary. To avoid too much log, we set errlevel to DEBUG1. */
        if (IS_PGXC_COORDINATOR && (StreamTopConsumerAmI() || t_thrd.wlm_cxt.wlmalarm_dump_active))
            errlevel = DEBUG1;
        if (!GTM_FREE_MODE && !t_thrd.postgres_cxt.isInResetUserName)
            ereport(errlevel,
                (errmsg("Local snapshot is built, xmin: %lu, xmax: %lu, "
                    "RecentGlobalXmin: %lu",
                    xmin,
                    xmax,
                    globalxmin)));
    }
#endif
    /*
     * This is a new snapshot, so set both refcounts to zero, and mark it as
     * not copied in persistent memory.
     */
    snapshot->active_count = 0;
    snapshot->regd_count = 0;
    snapshot->copied = false;
    if (snapshot->takenDuringRecovery) {
        /* Extreme-RTO standby read: refresh the read snapshot and caches. */
        if (IsDefaultExtremeRtoMode() && IS_EXRTO_STANDBY_READ) {
            exrto_read_snapshot(snapshot);
            if (t_thrd.proc->exrto_reload_cache) {
                t_thrd.proc->exrto_reload_cache = false;
                reset_invalidation_cache();
            }
            AcceptInvalidationMessages();
        }
        (void)pgstat_report_waitstatus(oldStatus);
    }
    return snapshot;
}
/*
 * exrto_get_snapshot_data
 *    Produce minimal snapshot bounds for extreme-RTO standby reads:
 *    xmax = latestCompletedXid + 1, xmin pulled back to standbyXmin when
 *    that horizon is valid and older, plus the current nextCommitSeqNo.
 */
void exrto_get_snapshot_data(TransactionId &xmin, TransactionId &xmax, CommitSeqNo &snapshot_csn)
{
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    /* xmax is always latest_completed_xid + 1 */
    xmax = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    Assert(TransactionIdIsNormal(xmax));
    TransactionIdAdvance(xmax);
    /* start xmin from xmax, then clamp it down to the standby horizon */
    xmin = xmax;
    TransactionId standby_xmin = t_thrd.xact_cxt.ShmemVariableCache->standbyXmin;
    if (TransactionIdIsValid(standby_xmin) && TransactionIdPrecedes(standby_xmin, xmin)) {
        xmin = standby_xmin;
    }
    LWLockRelease(ProcArrayLock);
    snapshot_csn = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo);
}
/*
 * ProcArrayInstallImportedXmin -- install imported xmin into MyPgXact->xmin
 *
 * Called when adopting a snapshot exported by another transaction.  To keep
 * OldestXmin from moving backwards we must verify -- atomically with the
 * installation -- that the source transaction is still running and that its
 * xmin still covers the imported one.
 *
 * Returns TRUE on success, FALSE if the source xact is no longer running.
 */
bool ProcArrayInstallImportedXmin(TransactionId xmin, VirtualTransactionId *sourcevxid)
{
    bool installed = false;
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    Assert(TransactionIdIsNormal(xmin));
    if (sourcevxid == NULL)
        return false;
    /* Hold the lock so the source xact cannot end underneath us. */
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int i = 0; i < arrayP->numProcs; i++) {
        int procno = arrayP->pgprocnos[i];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];
        /* Only the specific virtual transaction is of interest. */
        if (proc->backendId != sourcevxid->backendId)
            continue;
        if (proc->lxid != sourcevxid->localTransactionId)
            continue;
        /*
         * Paranoia: a source transaction in another database cannot cover
         * us.  The caller should have detected that already, so any funny
         * case is simply treated as "transaction not found".
         */
        if (proc->databaseId != u_sess->proc_cxt.MyDatabaseId)
            continue;
        /* Make really sure its xmin covers the one being imported. */
        TransactionId src_xmin = pgxact->xmin; /* fetch just once */
        if (!TransactionIdIsNormal(src_xmin) || !TransactionIdPrecedesOrEquals(src_xmin, xmin))
            continue;
        /*
         * Good to go.  Install the new xmin and TransactionXmin, just as
         * GetSnapshotData does.  (snapmgr.c already called GetSnapshotData,
         * so we knowingly overwrite a valid xmin here and skip that check.)
         */
        t_thrd.pgxact->xmin = u_sess->utils_cxt.TransactionXmin = xmin;
        installed = true;
        break;
    }
    LWLockRelease(ProcArrayLock);
    return installed;
}
/* Cross-call state for pg_get_running_xacts(): next procarray slot to scan. */
typedef struct GTM_RunningXacts {
    int cur_index; /* index into arrayP->pgprocnos for the next call */
} GTM_RunningXacts;
/*
 * pg_get_running_xacts -- SRF returning one row per running backend xact.
 *
 * Each row carries: transaction handle, xid, coarse GTM state (prepared vs.
 * starting), local node name, xmin, whether the backend runs VACUUM, the
 * control-file timeline, prepare_xid, backend pid and next_xid.  The calling
 * backend's own entry is skipped.
 */
Datum pg_get_running_xacts(PG_FUNCTION_ARGS)
{
    FuncCallContext* funcctx = NULL;
    GTM_RunningXacts* status = NULL;
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    if (SRF_IS_FIRSTCALL()) {
        TupleDesc tupdesc;
        MemoryContext oldcontext;
        /* create a function context for cross-call persistence */
        funcctx = SRF_FIRSTCALL_INIT();
        /*
         * Switch to memory context appropriate for multiple function calls
         */
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
        /* build tupdesc for result tuples */
        /* this had better match pg_prepared_xacts view in system_views.sql */
        tupdesc = CreateTemplateTupleDesc(10, false);
        TupleDescInitEntry(tupdesc, (AttrNumber)1, "handle", INT4OID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)2, "gxid", XIDOID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)3, "state", INT1OID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)4, "node", TEXTOID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)5, "xmin", XIDOID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)6, "vacuum", BOOLOID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)7, "timeline", INT8OID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)8, "prepare_xid", XIDOID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)9, "pid", INT8OID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)10, "next_xid", XIDOID, -1, 0);
        funcctx->tuple_desc = BlessTupleDesc(tupdesc);
        /*
         * Collect all the 2PC status information that we will format and send
         * out as a result set.
         */
        status = (GTM_RunningXacts*)palloc(sizeof(GTM_RunningXacts));
        status->cur_index = 0;
        funcctx->user_fctx = (void*)status;
        MemoryContextSwitchTo(oldcontext);
        /*
         * Ensure that no xids enter or leave the procarray while we obtain
         * snapshot.
         *
         * NOTE(review): this lock stays held across SRF calls until
         * SRF_RETURN_DONE; presumably an aborted query relies on lwlock
         * cleanup to release it -- confirm.
         */
        LWLockAcquire(ProcArrayLock, LW_SHARED);
    }
    funcctx = SRF_PERCALL_SETUP();
    status = (GTM_RunningXacts*)funcctx->user_fctx;
    /* Resume the scan where the previous call left off. */
    while (status->cur_index < arrayP->numProcs) {
        int pgprocno = arrayP->pgprocnos[status->cur_index++];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
        Datum values[10];
        bool nulls[10];
        HeapTuple tuple;
        Datum result;
        /* Skip self */
        if (pgxact == t_thrd.pgxact)
            continue;
        /*
         * Form tuple with appropriate data.
         */
        errno_t ret = memset_s(values, sizeof(values), 0, sizeof(values));
        securec_check(ret, "\0", "\0");
        ret = memset_s(nulls, sizeof(nulls), 0, sizeof(nulls));
        securec_check(ret, "\0", "\0");
        values[0] = Int32GetDatum(pgxact->handle);
        values[1] = TransactionIdGetDatum(pgxact->xid);
        if (TransactionIdIsPrepared(pgxact->xid))
            values[2] = Int8GetDatum(GTM_TXN_PREPARED);
        else
            values[2] = Int8GetDatum(GTM_TXN_STARTING);
        values[3] = CStringGetTextDatum(g_instance.attr.attr_common.PGXCNodeName);
        values[4] = TransactionIdGetDatum(pgxact->xmin);
        if (pgxact->vacuumFlags & PROC_IN_VACUUM)
            values[5] = BoolGetDatum(true);
        else
            values[5] = BoolGetDatum(false);
        values[6] = Int64GetDatum(get_controlfile_timeline());
        values[7] = TransactionIdGetDatum(pgxact->prepare_xid);
        values[8] = Int64GetDatum(proc->pid);
        values[9] = TransactionIdGetDatum(pgxact->next_xid);
        tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
        result = HeapTupleGetDatum(tuple);
        SRF_RETURN_NEXT(funcctx, result);
    }
    /* Scan complete: release the lock taken on the first call. */
    LWLockRelease(ProcArrayLock);
    SRF_RETURN_DONE(funcctx);
}
/*
 * Similar to GetSnapshotData but returns just oldestActiveXid.  All PGXACTs
 * with an assigned TransactionId are considered, VACUUM workers included.
 * Every database is scanned; WALSenders need no special handling since they
 * cannot cause hot-standby conflicts.
 *
 * Never executed during recovery, so KnownAssignedXids is irrelevant here.
 *
 * No other counters are maintained -- this stays as simple as possible and
 * GetSnapshotData() remains the primary bookkeeping code.
 *
 * On return, *globalXmin holds the smaller of the minimum valid PGXACT xmin
 * and the returned oldest running xid.
 */
TransactionId GetOldestActiveTransactionId(TransactionId *globalXmin)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    /* xmax is always latestCompletedXid + 1 */
    TransactionId xmax = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    Assert(TransactionIdIsNormal(xmax));
    TransactionIdAdvance(xmax);
    TransactionId xmin = xmax;
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    /*
     * Reading nextXid without XidGenLock is fine: (1) TransactionIds are
     * assumed to be read atomically and (2) a slightly stale value is
     * harmless -- it cannot be very stale, since the LWLockAcquire above
     * already performed any necessary memory interlocking.
     */
    TransactionId oldestRunningXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
    /* Scan the procarray for the minimum xmin and the minimum assigned xid. */
    for (int i = 0; i < procArray->numProcs; i++) {
        int procno = procArray->pgprocnos[i];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];
        TransactionId candidate = pgxact->xmin; /* fetch just once */
        if (TransactionIdIsNormal(candidate) && TransactionIdPrecedes(candidate, xmin))
            xmin = candidate;
        /* Fetch xid just once - see GetNewTransactionId */
        candidate = pgxact->xid;
        if (!TransactionIdIsNormal(candidate))
            continue;
        if (TransactionIdPrecedes(candidate, oldestRunningXid))
            oldestRunningXid = candidate;
        /*
         * A transaction's top-level XID is always smaller than any of its
         * subxids, so the subxid arrays need not be examined here.
         */
    }
    LWLockRelease(ProcArrayLock);
    /*
     * Fold the actual running xids into the xmin horizon.  Slightly
     * different from how GetOldestXmin computes it, but the result is the
     * same.
     */
    if (TransactionIdPrecedes(oldestRunningXid, xmin)) {
        xmin = oldestRunningXid;
    }
    *globalXmin = xmin;
    if (IS_EXRTO_STANDBY_READ) {
        ereport(LOG, (errmsg("proc_array_get_oldest_active_transaction_id: global_xmin = %lu", *globalXmin)));
    }
    return oldestRunningXid;
}
/*
 * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum
 *
 * Returns the oldest xid guaranteed not to have been touched by vacuum,
 * i.e. no rows >= that xid have been vacuumed away unless their transaction
 * aborted.  The answer can be (and usually is) far more conservative than
 * what vacuum really affected, but better data is not currently available.
 *
 * Useful to initialize the cutoff xid after which a new changeset
 * extraction replication slot can start decoding changes.
 *
 * Must be called with ProcArrayLock held, shared or exclusive; most callers
 * want exclusive mode since they immediately use the xid to peg the xmin
 * horizon.
 */
TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    bool inRecovery = RecoveryInProgress();
    Assert(LWLockHeldByMe(ProcArrayLock));
    /*
     * Take XidGenLock so no transaction can obtain an xid while we run; a
     * concurrently assigned xid could otherwise influence RecentXmin et al.
     *
     * Start the computation from nextXid, which is guaranteed to be a safe,
     * albeit pessimal, value.
     */
    LWLockAcquire(XidGenLock, LW_SHARED);
    TransactionId oldestSafeXid = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
    /*
     * An existing slot already pegging the xmin horizon gives a usable
     * starting point: that value was computed by this routine originally
     * and has been enforced ever since.  The slot's general xmin horizon is
     * always usable; its catalog horizon only when solely catalog data is
     * going to be looked at.
     */
    TransactionId slotXmin = g_instance.proc_array_idx->replication_slot_xmin;
    if (TransactionIdIsValid(slotXmin) && TransactionIdPrecedes(slotXmin, oldestSafeXid))
        oldestSafeXid = slotXmin;
    TransactionId slotCatalogXmin = g_instance.proc_array_idx->replication_slot_catalog_xmin;
    if (catalogOnly && TransactionIdIsValid(slotCatalogXmin) &&
        TransactionIdPrecedes(slotCatalogXmin, oldestSafeXid))
        oldestSafeXid = slotCatalogXmin;
    /*
     * Outside recovery, walk the procarray for the lowest assigned xid.
     * With ProcArrayLock held by the caller and XidGenLock held here, no
     * entry can vanish concurrently: PGXACT->xid is only set under
     * XidGenLock and only cleared under ProcArrayLock.
     *
     * During recovery the value computed above cannot be lowered any
     * further, so decoding simply has to wait a bit longer there.
     * KnownAssignedXidsGetOldestXmin() unfortunately can *not* be used,
     * since the KnownAssignedXids machinery can miss values and return an
     * older value than is safe.
     */
    if (!inRecovery) {
        /* Spin over procArray collecting min(PGXACT->xid) */
        for (int i = 0; i < procArray->numProcs; i++) {
            int procno = procArray->pgprocnos[i];
            volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];
            /* Fetch xid just once - see GetNewTransactionId */
            TransactionId xid = pgxact->xid;
            if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, oldestSafeXid))
                oldestSafeXid = xid;
        }
    }
    LWLockRelease(XidGenLock);
    return oldestSafeXid;
}
/*
 * GetVirtualXIDsDelayingChkpt -- Get the XIDs of transactions that are
 * delaying checkpoint because they have critical actions in progress.
 *
 * Builds an array of the VXIDs of all transactions currently inside a
 * commit-critical section, identified by delayChkpt being set in their
 * PGXACT.
 *
 * Returns a palloc'd array that the caller must free; *nvxids receives the
 * number of valid entries.
 *
 * Backends set and clear delayChkpt without taking any lock, so the result
 * is somewhat indeterminate -- which is fine.  Even on a multiprocessor
 * with delayed writes to shared memory, setting delayChkpt is certain to
 * propagate once the backend takes a lock, so a virtual xact that has
 * already inserted its commit record cannot be missed.  Whether the
 * *clearing* of delayChkpt takes a while to propagate does not matter for
 * correctness.
 */
VirtualTransactionId* GetVirtualXIDsDelayingChkpt(int* nvxids)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    int found = 0;
    /* maxProcs entries are certainly enough */
    VirtualTransactionId* result = (VirtualTransactionId*)palloc(sizeof(VirtualTransactionId) * procArray->maxProcs);
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int i = 0; i < procArray->numProcs; i++) {
        int procno = procArray->pgprocnos[i];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];
        if (!pgxact->delayChkpt)
            continue;
        VirtualTransactionId vxid;
        GET_VXID_FROM_PGPROC(vxid, *proc);
        if (VirtualTransactionIdIsValid(vxid))
            result[found++] = vxid;
    }
    LWLockRelease(ProcArrayLock);
    *nvxids = found;
    return result;
}
/*
 * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying?
 *
 * Used with the result of GetVirtualXIDsDelayingChkpt to test whether any
 * of the listed VXIDs are still inside a commit-critical code section.
 *
 * Note: O(N^2) in the number of vxacts that are/were delaying, but that
 * count is expected to stay small enough not to matter.
 */
bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId* vxids, int nvxids)
{
    bool found = false;
    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    /* stop scanning as soon as one match has been located */
    for (int i = 0; i < procArray->numProcs && !found; i++) {
        int procno = procArray->pgprocnos[i];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];
        VirtualTransactionId vxid;
        GET_VXID_FROM_PGPROC(vxid, *proc);
        if (!pgxact->delayChkpt || !VirtualTransactionIdIsValid(vxid))
            continue;
        for (int j = 0; j < nvxids; j++) {
            if (VirtualTransactionIdEquals(vxid, vxids[j])) {
                found = true;
                break;
            }
        }
    }
    LWLockRelease(ProcArrayLock);
    return found;
}
/*
* BackendPidGetProc -- get a backend's PGPROC given its PID
*
* Returns NULL if not found. Note that it is up to the caller to be
* sure that the question remains meaningful for long enough for the
* answer to be used ...
*/
/*
 * BackendPidGetProc -- look up a backend's PGPROC by its PID.
 *
 * Returns NULL if no live backend has that PID. It is up to the caller
 * to ensure the answer is still meaningful by the time it is used.
 */
PGPROC* BackendPidGetProc(ThreadId pid)
{
    PGPROC* found = NULL;
    ProcArrayStruct* procArray = g_instance.proc_array_idx;

    /* PID 0 marks dummy (prepared-xact) PGPROCs; never match those. */
    if (pid == 0)
        return NULL;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int slot = 0; slot < procArray->numProcs; slot++) {
        PGPROC* candidate = g_instance.proc_base_all_procs[procArray->pgprocnos[slot]];

        if (candidate->pid == pid) {
            found = candidate;
            break;
        }
    }
    LWLockRelease(ProcArrayLock);

    return found;
}
/*
* BackendXidGetPid -- get a backend's pid given its XID
*
* Returns 0 if not found or it's a prepared transaction. Note that
* it is up to the caller to be sure that the question remains
* meaningful for long enough for the answer to be used ...
*
* Only main transaction Ids are considered. This function is mainly
* useful for determining what backend owns a lock.
*
* Beware that not every xact has an XID assigned. However, as long as you
* only call this using an XID found on disk, you're safe.
*/
/*
 * BackendXidGetPid -- look up the pid of the backend running the given
 * top-level XID.
 *
 * Returns 0 if no backend matches, or if the xid belongs to a prepared
 * transaction (whose dummy PGPROC has pid 0). Only main transaction ids
 * are examined; mainly useful for determining which backend owns a lock.
 */
int BackendXidGetPid(TransactionId xid)
{
    int foundPid = 0;
    ProcArrayStruct* procArray = g_instance.proc_array_idx;

    /* An invalid xid can never match a running transaction. */
    if (xid == InvalidTransactionId)
        return 0;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int slot = 0; slot < procArray->numProcs; slot++) {
        int procno = procArray->pgprocnos[slot];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];

        if (pgxact->xid == xid) {
            foundPid = proc->pid;
            break;
        }
    }
    LWLockRelease(ProcArrayLock);

    return foundPid;
}
/*
* IsBackendPid -- is a given pid a running backend
*/
/*
 * IsBackendPid -- is the given pid a currently running backend?
 */
bool IsBackendPid(ThreadId pid)
{
    return BackendPidGetProc(pid) != NULL;
}
/*
* GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
*
* The array is palloc'd. The number of valid entries is returned into *nvxids.
*
* The arguments allow filtering the set of VXIDs returned. Our own process
* is always skipped. In addition:
* If limitXmin is not InvalidTransactionId, skip processes with
* xmin > limitXmin.
* If excludeXmin0 is true, skip processes with xmin = 0.
* If allDbs is false, skip processes attached to other databases.
* If excludeVacuum isn't zero, skip processes for which
* (vacuumFlags & excludeVacuum) is not zero.
*
* Note: the purpose of the limitXmin and excludeXmin0 parameters is to
* allow skipping backends whose oldest live snapshot is no older than
* some snapshot we have. Since we examine the procarray with only shared
* lock, there are race conditions: a backend could set its xmin just after
* we look. Indeed, on multiprocessors with weak memory ordering, the
* other backend could have set its xmin *before* we look. We know however
* that such a backend must have held shared ProcArrayLock overlapping our
* own hold of ProcArrayLock, else we would see its xmin update. Therefore,
* any snapshot the other backend is taking concurrently with our scan cannot
* consider any transactions as still running that we think are committed
* (since backends must hold ProcArrayLock exclusive to commit).
*/
VirtualTransactionId* GetCurrentVirtualXIDs(
    TransactionId limitXmin, bool excludeXmin0, bool allDbs, int excludeVacuum, int* nvxids)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int count = 0;
    /* allocate what's certainly enough result space */
    VirtualTransactionId* vxids = (VirtualTransactionId*)palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int index = 0; index < arrayP->numProcs; index++) {
        int pgprocno = arrayP->pgprocnos[index];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        if (proc == t_thrd.proc)
            continue; /* our own process is always skipped */
        if (excludeVacuum & pgxact->vacuumFlags)
            continue; /* skip procs with any of the caller-excluded vacuum flags */
        /* unless allDbs, only consider procs attached to our database */
        if (allDbs || proc->databaseId == u_sess->proc_cxt.MyDatabaseId) {
            /* Fetch xmin just once - might change on us */
            TransactionId pxmin = pgxact->xmin;
            if (excludeXmin0 && !TransactionIdIsValid(pxmin))
                continue; /* caller asked to ignore procs with no xmin yet */
            /*
             * InvalidTransactionId precedes all other XIDs, so a proc that
             * hasn't set xmin yet will not be rejected by this test.
             */
            if (!TransactionIdIsValid(limitXmin) || TransactionIdPrecedesOrEquals(pxmin, limitXmin)) {
                VirtualTransactionId vxid;
                GET_VXID_FROM_PGPROC(vxid, *proc);
                if (VirtualTransactionIdIsValid(vxid))
                    vxids[count++] = vxid;
            }
        }
    }
    LWLockRelease(ProcArrayLock);
    *nvxids = count;
    return vxids;
}
/*
 * UpdateCleanUpInfo -- advance the standby redo-cleanup bookkeeping.
 *
 * Monotonically raises ShmemVariableCache->standbyRedoCleanupXmin to
 * limitXmin and standbyRedoCleanupXminLsn to lsn (values are never moved
 * backwards). Logs a diagnostic when limitXmin has run far (> 10M xids)
 * ahead of the recorded standbyXmin.
 *
 * Called while ProcArrayLock is held (from GetConflictingVirtualXIDs).
 */
void UpdateCleanUpInfo(TransactionId limitXmin, XLogRecPtr lsn)
{
    if (t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin < limitXmin) {
        t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin = limitXmin;
        const int xid_gap = 10000000;
        if (limitXmin > t_thrd.xact_cxt.ShmemVariableCache->standbyXmin + xid_gap) {
            /*
             * TransactionId and XLogRecPtr are unsigned 64-bit quantities;
             * print them with %lu (as elsewhere in this file), not %ld,
             * to avoid negative output for large values.
             */
            ereport(LOG, (errmsg("limitXmin = %lu, standbyRedoCleanupXmin = %lu, "
                "lsn = %lu, standbyRedoCleanupXminLsn = %lu, "
                "standbyXmin = %lu",
                limitXmin, t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXmin,
                lsn, t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn,
                t_thrd.xact_cxt.ShmemVariableCache->standbyXmin)));
        }
    }
    if (t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn < lsn) {
        t_thrd.xact_cxt.ShmemVariableCache->standbyRedoCleanupXminLsn = lsn;
    }
}
/*
* GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
*
* Usage is limited to conflict resolution during recovery on standby servers.
* limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
* in cases where we cannot accurately determine a value for latestRemovedXid.
*
* If limitXmin is InvalidTransactionId then we want to kill everybody,
* so we're not worried if they have a snapshot or not, nor does it really
* matter what type of lock we hold.
*
* All callers that are checking xmins always now supply a valid and useful
* value for limitXmin. The limitXmin is always lower than the lowest
* numbered KnownAssignedXid that is not already a FATAL error. This is
* because we only care about cleanup records that are cleaning up tuple
* versions from committed transactions. In that case they will only occur
* at the point where the record is less than the lowest running xid. That
* allows us to say that if any backend takes a snapshot concurrently with
* us then the conflict assessment made here would never include the snapshot
* that is being derived. So we take LW_SHARED on the ProcArray and allow
* concurrent snapshots when limitXmin is valid. We might
* think about adding Assert(limitXmin < lowest(KnownAssignedXids))
* but that would not be true in the case of FATAL errors lagging in array,
* but we already know those are bogus anyway, so we skip that test.
*
* If dbOid is valid we skip backends attached to other databases.
*
* Be careful to *not* pfree the result from this function. We reuse
* this array sufficiently often that we use malloc for the result.
*/
VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid, XLogRecPtr lsn,
    CommitSeqNo limitXminCSN, TransactionId* xminArray)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int count = 0;
    int index;
    /*
     * If first time through, get workspace to remember main XIDs in. We
     * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
     * result space, remembering room for a terminator.
     */
    if (t_thrd.storage_cxt.proc_vxids == NULL) {
        t_thrd.storage_cxt.proc_vxids = (VirtualTransactionId*)MemoryContextAlloc(
            THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE),
            sizeof(VirtualTransactionId) * (unsigned int)(arrayP->maxProcs + 1));
        if (t_thrd.storage_cxt.proc_vxids == NULL)
            ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
    }
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (index = 0; index < arrayP->numProcs; index++) {
        int pgprocno = arrayP->pgprocnos[index];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        /* Exclude prepared transactions and Statement flush thread */
        if (proc->pid == 0 || (OidIsValid(dbOid) && proc->databaseId != dbOid) ||
            strcmp((const char*)(proc->myProgName), "Statement flush thread") == 0) {
            continue;
        }
#ifndef ENABLE_MULTIPLE_NODES
        /* Fetch xmin just once - can't change on us, but good coding */
        TransactionId pxmin = pgxact->xmin;
        /*
         * We ignore an invalid pxmin because this means that backend has
         * no snapshot and cannot get another one while we hold exclusive
         * lock.
         */
        if (!TransactionIdIsValid(limitXmin) ||
            (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin))) {
            VirtualTransactionId vxid;
            GET_VXID_FROM_PGPROC(vxid, *proc);
            if (VirtualTransactionIdIsValid(vxid)) {
                /*
                 * ADD_XMIN_TO_ARRAY presumably records pxmin into the
                 * caller-supplied xminArray -- macro defined elsewhere;
                 * TODO(review) confirm.
                 */
                ADD_XMIN_TO_ARRAY(pxmin);
                t_thrd.storage_cxt.proc_vxids[count++] = vxid;
            }
        }
#else
        /* Conflicts are only tracked here in multi-disaster-recover mode. */
        if (!IS_MULTI_DISASTER_RECOVER_MODE) {
            break;
        }
        /* A backend conflicts when its snapshot CSN is at or below the limit. */
        CommitSeqNo xact_csn = pgxact->csn_dr;
        if (!TransactionIdIsValid(limitXmin) || (limitXminCSN >= xact_csn && xact_csn != InvalidCommitSeqNo)) {
            VirtualTransactionId vxid;
            GET_VXID_FROM_PGPROC(vxid, *proc);
            if (VirtualTransactionIdIsValid(vxid)) {
                t_thrd.storage_cxt.proc_vxids[count++] = vxid;
            }
        }
#endif
    }
#ifndef ENABLE_MULTIPLE_NODES
    UpdateCleanUpInfo(limitXmin, lsn);
#endif
    LWLockRelease(ProcArrayLock);
    /* add the terminator */
    t_thrd.storage_cxt.proc_vxids[count].backendId = InvalidBackendId;
    t_thrd.storage_cxt.proc_vxids[count].localTransactionId = InvalidLocalTransactionId;
    ADD_XMIN_TO_ARRAY(InvalidTransactionId);
    return t_thrd.storage_cxt.proc_vxids;
}
/*
* CancelVirtualTransaction - used in recovery conflict processing
*
* Returns pid of the process signaled, or 0 if not found.
*/
/*
 * CancelVirtualTransaction - used in recovery conflict processing
 *
 * Marks the backend running the given virtual transaction as having a
 * recovery conflict pending and signals it with sigmode.
 * Returns the pid of the process signaled, or 0 if not found.
 */
ThreadId CancelVirtualTransaction(const VirtualTransactionId& vxid, ProcSignalReason sigmode)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    ThreadId targetPid = 0;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int slot = 0; slot < procArray->numProcs; slot++) {
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procArray->pgprocnos[slot]];
        VirtualTransactionId procvxid;

        GET_VXID_FROM_PGPROC(procvxid, *proc);
        if (procvxid.backendId != vxid.backendId || procvxid.localTransactionId != vxid.localTransactionId)
            continue;

        proc->recoveryConflictPending = true;
        targetPid = proc->pid;
        if (targetPid != 0) {
            /*
             * Kill the pid if it's still here. If not, that's what we
             * wanted so ignore any errors.
             */
            (void)SendProcSignal(targetPid, sigmode, vxid.backendId);
        }
        break;
    }
    LWLockRelease(ProcArrayLock);

    return targetPid;
}
/*
 * proc_array_cancel_conflicting_proc
 *     Cancel standby backends whose snapshots conflict with redo progress.
 *
 * A backend conflicts when it has a valid exrto_min read LSN and either its
 * xmin is <= latest_removed_xid, or its read LSN lies before
 * truncate_redo_lsn. Conflicting backends get recoveryConflictPending set
 * and are sent PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, followed by a short
 * sleep to avoid flooding an unresponsive backend.
 *
 * Returns true if at least one conflicting backend was found.
 */
bool proc_array_cancel_conflicting_proc(
    TransactionId latest_removed_xid, XLogRecPtr truncate_redo_lsn, bool reach_max_check_times)
{
    ProcArrayStruct* proc_array = g_instance.proc_array_idx;
    bool conflict = false;
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int index = 0; index < proc_array->numProcs; index++) {
        int pg_proc_no = proc_array->pgprocnos[index];
        PGPROC* pg_proc = g_instance.proc_base_all_procs[pg_proc_no];
        PGXACT* pg_xact = &g_instance.proc_base_all_xacts[pg_proc_no];
        XLogRecPtr read_lsn = pg_proc->exrto_min;
        TransactionId pxmin = pg_xact->xmin;
        /* skip prepared xacts (pid == 0) and backends with no read LSN */
        if (pg_proc->pid == 0 || XLogRecPtrIsInvalid(read_lsn)) {
            continue;
        }
        Assert(!(pg_xact->vacuumFlags & PROC_IN_VACUUM));
        /*
         * Backend is doing logical decoding which manages xmin
         * separately, check below.
         */
        if (pg_xact->vacuumFlags & PROC_IN_LOGICAL_DECODING) {
            continue;
        }
        /* cancel query when its xmin < latest_removed_xid */
        if (TransactionIdPrecedesOrEquals(pxmin, latest_removed_xid) ||
            (truncate_redo_lsn != InvalidXLogRecPtr && XLByteLT(read_lsn, truncate_redo_lsn))) {
            conflict = true;
            pg_proc->recoveryConflictPending = true;
            if (pg_proc->pid != 0) {
                /*
                 * Kill the pid if it's still here. If not, that's what we
                 * wanted so ignore any errors.
                 */
                (void)SendProcSignal(pg_proc->pid, PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, pg_proc->backendId);
                /*
                 * Wait a little bit for it to die so that we avoid flooding
                 * an unresponsive backend when system is heavily loaded.
                 */
                ereport(LOG,
                    (errmsg(EXRTOFORMAT("cancel thread while "
                        "redo truncate (lsn: %08X/%08X, latest_removed_xid: %lu), thread id = %lu, "
                        "read_lsn: %08X/%08X, xmin: %lu"),
                        (uint32)(truncate_redo_lsn >> UINT64_HALF),
                        (uint32)truncate_redo_lsn,
                        latest_removed_xid,
                        pg_proc->pid,
                        (uint32)(read_lsn >> UINT64_HALF),
                        (uint32)read_lsn,
                        pxmin)));
                pg_usleep(5000L);
            }
        }
        /*
         * NOTE(review): this warning is emitted for EVERY scanned backend
         * (not only conflicting ones) whenever reach_max_check_times is set;
         * confirm whether it was meant to live inside the conflict branch.
         */
        if (reach_max_check_times) {
            ereport(WARNING, (
                errmsg("can not cancel thread while redo truncate, thread id = %lu", pg_proc->pid)));
        }
    }
    LWLockRelease(ProcArrayLock);
    return conflict;
}
/*
* MinimumActiveBackends --- count backends (other than myself) that are
* in active transactions. Return true if the count exceeds the
* minimum threshold passed. This is used as a heuristic to decide if
* a pre-XLOG-flush delay is worthwhile during commit.
*
* Do not count backends that are blocked waiting for locks, since they are
* not going to get to run until someone else commits.
*/
bool MinimumActiveBackends(int min)
{
ProcArrayStruct* arrayP = g_instance.proc_array_idx;
int count = 0;
int index;
/* Quick short-circuit if no minimum is specified */
if (min == 0) {
return true;
}
/*
* Note: for speed, we don't acquire ProcArrayLock. This is a little bit
* bogus, but since we are only testing fields for zero or nonzero, it
* should be OK. The result is only used for heuristic purposes anyway...
*/
for (index = 0; index < arrayP->numProcs; index++) {
int pgprocno = arrayP->pgprocnos[index];
/*
* Since we're not holding a lock, need to check that the pointer is
* valid. Someone holding the lock could have incremented numProcs
* already, but not yet inserted a valid pointer to the array.
*
* If someone just decremented numProcs, 'proc' could also point to a
* PGPROC entry that's no longer in the array. It still points to a
* PGPROC struct, though, because freed PGPROC entries just go to the
* free list and are recycled. Its contents are nonsense in that case,
* but that's acceptable for this function.
*/
if (pgprocno == -1) {
continue; /* do not count deleted entries */
}
volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
if (proc == t_thrd.proc) {
continue; /* do not count myself */
}
if (pgxact->xid == InvalidTransactionId) {
continue; /* do not count if no XID assigned */
}
if (proc->pid == 0) {
continue; /* do not count prepared xacts */
}
if (proc->waitLock != NULL) {
continue; /* do not count if blocked on a lock */
}
count++;
if (count >= min) {
break;
}
}
return count >= min;
}
/*
* CountDBBackends:
* The purpose is to collect statistics on the number of current connections.
*
* 1.In case of thread pool mode, active and inactive threads are counted through interface CountDBSessions.
* 2.In case of none thread pool mode, the number of threads to be prepared and the number of active backend threads
* need to be collected with the help of global variable g_instance.proc_base_all_procs.
*/
int CountDBBackends(Oid database_oid)
{
    const int MAXAUTOVACPIDS = 10; /* max autovacs to SIGTERM per iteration */
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int index, pgprocno, num_connections;
    int num_autovacs = 0;
    int num_wdrxdbs = 0;
    int num_backends = 0;
    int num_prepared = 0;
    CHECK_FOR_INTERRUPTS();
    /* Under thread pool mode, active and inactive threads are counted. */
    if (ENABLE_THREAD_POOL) {
        num_connections = g_threadPoolControler->GetSessionCtrl()->CountDBSessions(database_oid);
    } else {
        LWLockAcquire(ProcArrayLock, LW_SHARED);
        for (index = 0; index < arrayP->numProcs; index++) {
            pgprocno = arrayP->pgprocnos[index];
            volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
            volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
            volatile PgBackendStatus* beentry = pgstat_get_backend_single_entry(proc->sessionid);
            if (proc->databaseId != database_oid) {
                continue; /* only procs attached to the target database count */
            }
            if (proc->pid == 0) {
                /* dummy PGPROC: a prepared transaction, counted separately */
                num_prepared++;
            } else {
                /* Internal threads are not counted. Function: autovacuum */
                if ((pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) && num_autovacs < MAXAUTOVACPIDS) {
                    num_autovacs++;
                    continue;
                }
                /* Internal threads are not counted. Function: cross-database query */
                if (beentry != NULL && strcmp(beentry->st_appname, "WDRXdb") == 0 &&
                    num_wdrxdbs < MAXAUTOVACPIDS) {
                    num_wdrxdbs++;
                    continue;
                }
                num_backends++;
            }
        }
        LWLockRelease(ProcArrayLock);
        num_connections = num_backends + num_prepared;
    }
    if (ENABLE_THREAD_POOL) {
        ereport(DEBUG5, (errmsg("count backend connections in threadpool mode, num_connections[%d].)",
            num_connections)));
    } else {
        ereport(DEBUG5, (errmsg("count backend connections in none-threadpool mode, num_backends[%d], "
            "num_prepared[%d], num_autovacs[%d], num_wdrxdbs[%d].)",
            num_backends, num_prepared, num_autovacs, num_wdrxdbs)));
    }
    return num_connections;
}
/*
* CountDBActiveBackends
* The purpose is to count active backends that are using specified database which used for clearing links
* in the redo scenario.
*/
/*
 * CountDBActiveBackends
 *     Count live (non-prepared) backends attached to the given database;
 *     an invalid database_oid counts live backends of every database.
 *     Used for clearing links in the redo scenario.
 */
int CountDBActiveBackends(Oid database_oid)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    int nactive = 0;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int slot = 0; slot < procArray->numProcs; slot++) {
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procArray->pgprocnos[slot]];

        /* prepared transactions have pid == 0 and are not counted */
        if (proc->pid == 0)
            continue;
        if (!OidIsValid(database_oid) || proc->databaseId == database_oid)
            nactive++;
    }
    LWLockRelease(ProcArrayLock);

    return nactive;
}
/*
* CancelDBBackends --- cancel backends that are using specified database
*/
/*
 * CancelDBBackends --- signal backends that are using the specified database
 *
 * An InvalidOid database targets every backend. Each matching proc has its
 * recoveryConflictPending flag set to conflictPending before being signaled
 * with sigmode.
 */
void CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;

    /* tell all backends to die */
    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    for (int slot = 0; slot < procArray->numProcs; slot++) {
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procArray->pgprocnos[slot]];

        if (databaseid != InvalidOid && proc->databaseId != databaseid)
            continue;

        VirtualTransactionId procvxid;
        GET_VXID_FROM_PGPROC(procvxid, *proc);
        proc->recoveryConflictPending = conflictPending;

        ThreadId targetPid = proc->pid;
        if (targetPid != 0) {
            /*
             * Kill the pid if it's still here. If not, that's what we
             * wanted so ignore any errors.
             */
            (void)SendProcSignal(targetPid, sigmode, procvxid.backendId);
        }
    }
    LWLockRelease(ProcArrayLock);
}
/*
 * ValidDBoidAndUseroid -- does this proc match the CLEAN CONNECTION target?
 *
 * Three forms are supported:
 * 1. Only database:          CLEAN CONNECTION TO ALL FORCE FOR DATABASE xxx;
 * 2. Only user:              CLEAN CONNECTION TO ALL FORCE TO USER xxx;
 * 3. Both database and user: CLEAN CONNECTION TO ALL FORCE FOR DATABASE xxx TO USER xxx;
 */
static bool ValidDBoidAndUseroid(Oid databaseOid, Oid userOid, volatile PGPROC* proc)
{
    /* database-only form */
    if ((databaseOid != InvalidOid) && (userOid == InvalidOid) && (proc->databaseId == databaseOid)) {
        return true;
    }
    /* user-only form */
    if ((databaseOid == InvalidOid) && (userOid != InvalidOid) && (proc->roleId == userOid)) {
        return true;
    }
    /* combined form: both fields must match */
    if ((proc->databaseId == databaseOid) && (proc->roleId == userOid)) {
        return true;
    }
    return false;
}
/*
 * CountSingleNodeActiveBackends -- count live backends matching the given
 * database oid and/or user oid (CLEAN CONNECTION support).
 *
 * Prepared transactions are ignored. Returns 0 (with a WARNING) when both
 * oids are invalid.
 */
int CountSingleNodeActiveBackends(Oid databaseOid, Oid userOid)
{
    if ((databaseOid == InvalidOid) && (userOid == InvalidOid)) {
        ereport(WARNING,
            (errmsg("DB oid and user oid are all Invalid (may be NULL). Shut down clean activite sessions.")));
        return 0;
    }

    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    int nmatched = 0;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int slot = 0; slot < procArray->numProcs; slot++) {
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procArray->pgprocnos[slot]];

        if (proc->pid == 0)
            continue; /* do not count prepared xacts */
        if (ValidDBoidAndUseroid(databaseOid, userOid, proc))
            nmatched++;
    }
    LWLockRelease(ProcArrayLock);

    return nmatched;
}
/*
* CancelSingleNodeBackends --- cancel backends in single node by database oid or user oid
*/
/*
 * CancelSingleNodeBackends --- signal backends in single node that match the
 * given database oid and/or user oid (CLEAN CONNECTION support).
 *
 * Each matching proc has its recoveryConflictPending flag set to
 * conflictPending before being signaled with sigmode. Emits a WARNING and
 * does nothing when both oids are invalid.
 */
void CancelSingleNodeBackends(Oid databaseOid, Oid userOid, ProcSignalReason sigmode, bool conflictPending)
{
    if ((databaseOid == InvalidOid) && (userOid == InvalidOid)) {
        ereport(WARNING,
            (errmsg("DB oid and user oid are all Invalid (may be NULL). Shut down clean activite sessions.")));
        return;
    }

    ProcArrayStruct* procArray = g_instance.proc_array_idx;

    /* tell all backends to die */
    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    for (int slot = 0; slot < procArray->numProcs; slot++) {
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procArray->pgprocnos[slot]];

        if (!ValidDBoidAndUseroid(databaseOid, userOid, proc))
            continue;

        VirtualTransactionId procvxid;
        GET_VXID_FROM_PGPROC(procvxid, *proc);
        proc->recoveryConflictPending = conflictPending;

        ThreadId targetPid = proc->pid;
        if (targetPid != 0) {
            /*
             * Kill the pid if it's still here. If not, that's what we
             * wanted so ignore any errors.
             */
            (void)SendProcSignal(targetPid, sigmode, procvxid.backendId);
        }
    }
    LWLockRelease(ProcArrayLock);
}
/*
 * RoleidPartitionLock -- map a roleid hash code onto one of the
 * NUM_SESSION_ROLEID_PARTITIONS session-roleid partition LWLocks.
 */
LWLock *RoleidPartitionLock(uint32 hashCode)
{
    int lockId = FirstSessRoleIdLock + (int)(hashCode % NUM_SESSION_ROLEID_PARTITIONS);

    return GetMainLWLockByIndex(lockId);
}
/* Init RoleId HashTable */
void InitRoleIdHashTable()
{
    HASHCTL hctl;
    errno_t rc = 0;
    /* hash-table storage lives in a dedicated child of the instance context */
    MemoryContext context = AllocSetContextCreate(g_instance.instance_context,
        "RoleIdHashtblContext",
        ALLOCSET_SMALL_MINSIZE,
        ALLOCSET_SMALL_INITSIZE,
        ALLOCSET_DEFAULT_MAXSIZE);
    rc = memset_s(&hctl, sizeof(HASHCTL), 0, sizeof(HASHCTL));
    securec_check(rc, "", "");
    hctl.keysize = sizeof(Oid);
    hctl.entrysize = sizeof(RoleIdHashEntry);
    hctl.hash = oid_hash;
    hctl.hcxt = context;
    /* partitioned so lookups contend only on a per-partition LWLock
     * (see RoleidPartitionLock) */
    hctl.num_partitions = NUM_SESSION_ROLEID_PARTITIONS;
    g_instance.roleid_cxt.roleid_table = HeapMemInitHash("Roleid map",
        INIT_ROLEID_HASHTBL,
        MAX_ROLEID_HASHTBL,
        &hctl,
        HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
}
/* get the RoleId Count. */
/*
 * GetRoleIdCount -- return the recorded session count for a role.
 *
 * Looks the role up in the partitioned roleid hash table under the
 * partition's shared lock; a role with no entry counts as 0.
 */
int GetRoleIdCount(Oid roleoid)
{
    bool found = false;
    uint32 hashCode = 0;
    volatile int64 roleNum = 0;
    RoleIdHashEntry *entry = NULL;
    /* pick the partition lock guarding this roleid's hash partition */
    hashCode = oid_hash(&roleoid, sizeof(Oid));
    LWLock *lock = RoleidPartitionLock(hashCode);
    (void)LWLockAcquire(lock, LW_SHARED);
    entry = (RoleIdHashEntry *)hash_search(g_instance.roleid_cxt.roleid_table, (void*)&roleoid, HASH_FIND, &found);
    if (!found) {
        roleNum = 0; /* no sessions recorded for this role */
    } else {
        roleNum = entry->roleNum;
    }
    LWLockRelease(lock);
    /* NOTE(review): roleNum is int64 but is narrowed to int on return */
    return roleNum;
}
/*
 * IncreaseUserCount -- bump the per-role session counter.
 *
 * Creates the hash entry on first use. A roleoid of 0 is ignored and
 * returns 0. Returns the new count for the role.
 */
int IncreaseUserCount(Oid roleoid)
{
    bool found = false;
    uint32 hashCode = 0;
    volatile int64 roleNum = 0;
    RoleIdHashEntry *entry = NULL;
    if(roleoid == 0) {
        return 0;
    }
    hashCode = oid_hash(&roleoid, sizeof(Oid));
    LWLock *lock = RoleidPartitionLock(hashCode);
    /* exclusive lock: we may insert a new entry or mutate the counter */
    (void)LWLockAcquire(lock, LW_EXCLUSIVE);
    entry = (RoleIdHashEntry *)hash_search(g_instance.roleid_cxt.roleid_table, (void*)&roleoid, HASH_ENTER, &found);
    if (!found) {
        entry->roleNum = 1; /* first session for this role */
    } else {
        entry->roleNum++;
    }
    roleNum = entry->roleNum;
    LWLockRelease(lock);
    return roleNum;
}
/*
 * DecreaseUserCount -- decrement the per-role session counter.
 *
 * Removes the hash entry once its count drops to zero. A roleoid of 0 is
 * ignored. Returns the remaining count, or 0 when the role has no entry.
 */
int DecreaseUserCount(Oid roleoid)
{
    bool found = false;
    uint32 hashCode = 0;
    volatile int64 roleNum = 0;
    RoleIdHashEntry *entry = NULL;
    if(roleoid == 0) {
        return 0;
    }
    hashCode = oid_hash(&roleoid, sizeof(Oid));
    LWLock *lock = RoleidPartitionLock(hashCode);
    (void)LWLockAcquire(lock, LW_EXCLUSIVE);
    entry = (RoleIdHashEntry *)hash_search(g_instance.roleid_cxt.roleid_table, (void*)&roleoid, HASH_FIND, &found);
    if (found) {
        entry->roleNum--;
        /* read the value before a possible HASH_REMOVE frees the entry */
        roleNum = entry->roleNum;
        if (entry->roleNum == 0) {
            (void)hash_search(g_instance.roleid_cxt.roleid_table, (void*)&roleoid, HASH_REMOVE, &found);
        }
    }
    LWLockRelease(lock);
    return roleNum;
}
/*
* CountUserBackends --- count backends that are used by specified user
*/
int CountUserBackends(Oid roleid)
{
    int count = 0;
    if (!ENABLE_THREAD_POOL) {
        ProcArrayStruct* arrayP = g_instance.proc_array_idx;
        LWLockAcquire(ProcArrayLock, LW_SHARED);
        for (int index = 0; index < arrayP->numProcs; index++) {
            int pgprocno = arrayP->pgprocnos[index];
            volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
            if (proc->pid == 0)
                continue; /* do not count prepared xacts */
            /*
             * NOTE(review): t_thrd.role is the CURRENT thread's role and is
             * loop-invariant, so a STREAM_WORKER caller always counts zero
             * backends -- confirm this is the intended behavior.
             */
            if (proc->roleId == roleid && (t_thrd.role != STREAM_WORKER))
                count++;
        }
        LWLockRelease(ProcArrayLock);
    } else {
        /* thread pool mode tracks per-role session counts in the roleid hash table */
        count = GetRoleIdCount(roleid);
    }
    return count;
}
/*
* CountOtherDBBackends -- check for other backends running in the given DB
*
* If there are other backends in the DB, we will wait a maximum of 5 seconds
* for them to exit. Autovacuum backends are encouraged to exit early by
* sending them SIGTERM, but normal user backends are just waited for.
*
* The current backend is always ignored; it is caller's responsibility to
* check whether the current backend uses the given DB, if it's important.
*
* Returns TRUE if there are (still) other backends in the DB, FALSE if not.
* Also, *nbackends and *nprepared are set to the number of other backends
* and prepared transactions in the DB, respectively.
*
* This function is used to interlock DROP DATABASE and related commands
* against there being any active backends in the target DB --- dropping the
* DB while active backends remain would be a Bad Thing. Note that we cannot
* detect here the possibility of a newly-started backend that is trying to
* connect to the doomed database, so additional interlocking is needed during
* backend startup. The caller should normally hold an exclusive lock on the
* target DB before calling this, which is one reason we mustn't wait
* indefinitely.
*/
bool CountOtherDBBackends(Oid databaseId, int* nbackends, int* nprepared)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
#define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */
    ThreadId autovac_pids[MAXAUTOVACPIDS];
    ThreadId wdrxdb_pids[MAXAUTOVACPIDS];
    int tries;
    /* In DMS primary mode the standbys must be free of backends too. */
    if (ENABLE_DMS && SS_PRIMARY_MODE) {
        bool ret = SSCheckDbBackendsFromAllStandby(databaseId);
        if (ret) {
            *nbackends = *nprepared = 0;
            return true;
        }
    }
    /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
    for (tries = 0; tries < 50; tries++) {
        int nworkers = 0;
        int nautovacs = 0;
        int nwdrxdbs = 0;
        int index;
        CHECK_FOR_INTERRUPTS();
        *nbackends = *nprepared = 0;
        LWLockAcquire(ProcArrayLock, LW_SHARED);
        for (index = 0; index < arrayP->numProcs; index++) {
            int pgprocno = arrayP->pgprocnos[index];
            volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
            volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
            volatile PgBackendStatus* beentry = pgstat_get_backend_single_entry(proc->sessionid);
            if (proc->databaseId != databaseId)
                continue;
            if (proc == t_thrd.proc)
                continue; /* the current backend is always ignored */
            if (proc->pid == 0)
                (*nprepared)++; /* dummy PGPROC: prepared transaction */
            else {
                (*nbackends)++;
                /* thread-pool workers are re-counted via CountDBSessions below */
                if (ENABLE_THREAD_POOL && proc->sessionid > 0) {
                    nworkers++;
                }
                /* remember autovacuum pids so we can SIGTERM them after unlocking */
                if ((pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) && nautovacs < MAXAUTOVACPIDS) {
                    autovac_pids[nautovacs++] = proc->pid;
                }
                /* likewise remember WDRXdb cross-database query threads */
                if (!ENABLE_THREAD_POOL && beentry != NULL && strcmp(beentry->st_appname, "WDRXdb") == 0 &&
                    nwdrxdbs < MAXAUTOVACPIDS) {
                    wdrxdb_pids[nwdrxdbs++] = proc->pid;
                    ereport(LOG, (errmsg("WDRXdb sessionid (beentry sessionid): %lu", beentry->st_sessionid)));
                    ereport(LOG, (errmsg("WDRXdb thread id (beentry st_tid): %d", beentry->st_tid)));
                }
            }
        }
        /* Under thread pool mode, we also need to count inactive sessions that are detached from worker threads */
        if (ENABLE_THREAD_POOL) {
            *nbackends -= nworkers;
            *nbackends += g_threadPoolControler->GetSessionCtrl()->CountDBSessions(databaseId);
        }
        LWLockRelease(ProcArrayLock);
        if (*nbackends == 0 && *nprepared == 0) {
            return false; /* no conflicting backends, so done */
        }
        /*
         * Send SIGTERM to any conflicting autovacuums before sleeping. We
         * postpone this step until after the loop because we don't want to
         * hold ProcArrayLock while issuing kill(). We have no idea what might
         * block kill() inside the kernel...
         */
        for (index = 0; index < nautovacs; index++) {
            gs_signal_send(autovac_pids[index], SIGTERM); /* ignore any error */
        }
        /* WDRXdb threads additionally receive SIGUSR2 */
        for (index = 0; index < nwdrxdbs; index++) {
            gs_signal_send(wdrxdb_pids[index], SIGTERM);
            gs_signal_send(wdrxdb_pids[index], SIGUSR2);
            ereport(LOG, (errmsg("WDRXdb thread pid: %lu is killed(proc->pid)", wdrxdb_pids[index])));
        }
        /* sleep, then try again */
        pg_usleep(100 * 1000L); /* 100ms */
    }
    return true; /* timed out, still conflicts */
}
#ifdef PGXC
/*
* ReloadConnInfoOnBackends -- reload connection information for all the backends
*/
void ReloadConnInfoOnBackends(void)
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int index;
    ThreadId pid = 0;
    /* tell all backends to reload except this one who already reloaded */
    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    for (index = 0; index < arrayP->numProcs; index++) {
        int pgprocno = arrayP->pgprocnos[index];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        VirtualTransactionId vxid;
        GET_VXID_FROM_PGPROC(vxid, *proc);
        if (proc == t_thrd.proc)
            continue; /* do not do that on myself */
        if (proc->pid == 0)
            continue; /* useless on prepared xacts */
        if (pgxact->vacuumFlags & PROC_IN_VACUUM)
            continue; /* ignore vacuum processes */
        if (EnableGlobalSysCache()) {
            /* syscache is on thread in gsc mode. when enable thread pool,
             * even the thread does not connect to a database, we still need send signal to it */
            if (!OidIsValid(proc->databaseId) && !ENABLE_THREAD_POOL) {
                continue;
            }
        } else {
            /* skip threads that are not connected to any database */
            if (!OidIsValid(proc->databaseId)) {
                continue;
            }
            /* thread-pool sessions are reloaded via HandlePoolerReload below */
            if (ENABLE_THREAD_POOL && proc->sessionid > 0) {
                continue;
            }
        }
        pid = proc->pid;
        /*
         * Send the reload signal if backend still exists
         */
        (void)SendProcSignal(pid, PROCSIG_PGXCPOOL_RELOAD, vxid.backendId);
    }
    LWLockRelease(ProcArrayLock);
    /* in thread pool mode, detached sessions are handled by the session controller */
    if (ENABLE_THREAD_POOL) {
        g_threadPoolControler->GetSessionCtrl()->HandlePoolerReload();
    }
}
#endif
char dump_memory_context_name[MEMORY_CONTEXT_NAME_LEN];
/*
* DumpMemoryCtxOnBackend -- dump memory context on some backend
*/
/*
 * DumpMemoryCtxOnBackend -- ask backend 'tid' to dump one of its memory
 * contexts.
 *
 * The requested context name is passed through the file-level
 * dump_memory_context_name buffer, then PROCSIG_MEMORYCONTEXT_DUMP is sent
 * to the target thread. Raises ERROR if the name does not fit the buffer
 * or if the signal cannot be delivered.
 */
void DumpMemoryCtxOnBackend(ThreadId tid, const char* mem_ctx)
{
    int ret;
    errno_t ss_rc = EOK;

    /* Reject names that cannot fit (with NUL terminator) into the buffer. */
    if (strlen(mem_ctx) >= MEMORY_CONTEXT_NAME_LEN) {
        /*
         * This is a bad argument, not an allocation failure, so report it
         * with an invalid-parameter error code rather than OUT_OF_MEMORY.
         */
        ereport(ERROR,
            (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                errmsg("The name of memory context is too long(>=%dbytes)", MEMORY_CONTEXT_NAME_LEN)));
        return; /* not reached; ereport(ERROR) does not return */
    }

    ss_rc = memset_s(dump_memory_context_name, MEMORY_CONTEXT_NAME_LEN, 0, MEMORY_CONTEXT_NAME_LEN);
    securec_check(ss_rc, "\0", "\0");
    ss_rc = strcpy_s(dump_memory_context_name, MEMORY_CONTEXT_NAME_LEN, mem_ctx);
    securec_check(ss_rc, "\0", "\0");

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    ret = SendProcSignal(tid, PROCSIG_MEMORYCONTEXT_DUMP, InvalidBackendId);
    LWLockRelease(ProcArrayLock);

    if (ret)
        ereport(ERROR,
            (errcode(ERRCODE_CONNECTION_FAILURE),
                errmsg("Fail to send signal to backend(tid:%lu).", (unsigned long)tid)));
}
/*
* ProcArraySetReplicationSlotXmin
*
* Install limits to future computations of the xmin horizon to prevent vacuum
* and HOT pruning from removing affected rows still needed by clients with
* replicaton slots.
*/
/*
 * ProcArraySetReplicationSlotXmin
 *
 * Install limits to future computations of the xmin horizon to prevent vacuum
 * and HOT pruning from removing affected rows still needed by clients with
 * replicaton slots. An invalid xmin/catalog_xmin is installed directly;
 * otherwise the stored value is replaced only when it precedes the new one.
 */
void ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, bool already_locked)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;

    Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));

    if (!already_locked) {
        LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
    }

    if (xmin == InvalidTransactionId || TransactionIdPrecedes(procArray->replication_slot_xmin, xmin)) {
        procArray->replication_slot_xmin = xmin;
    }
    if (catalog_xmin == InvalidTransactionId ||
        TransactionIdPrecedes(procArray->replication_slot_catalog_xmin, catalog_xmin)) {
        procArray->replication_slot_catalog_xmin = catalog_xmin;
    }

    if (!already_locked) {
        LWLockRelease(ProcArrayLock);
    }
}
/*
* GetReplicationSlotCatalogXmin
*
* Return replication_slot_catalog_xmin.
*/
/*
 * GetReplicationSlotCatalogXmin
 *
 * Return the current replication_slot_catalog_xmin (read without taking
 * ProcArrayLock).
 */
TransactionId GetReplicationSlotCatalogXmin()
{
    return g_instance.proc_array_idx->replication_slot_catalog_xmin;
}
/*
* ProcArrayGetReplicationSlotXmin
*
* Return the current slot xmin limits. That's useful to be able to remove
* data that's older than those limits.
*/
void ProcArrayGetReplicationSlotXmin(TransactionId* xmin, TransactionId* catalog_xmin)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;

    /* Read both limits under the shared lock; either out-parameter may be NULL. */
    LWLockAcquire(ProcArrayLock, LW_SHARED);

    if (xmin != NULL) {
        *xmin = procArray->replication_slot_xmin;
    }
    if (catalog_xmin != NULL) {
        *catalog_xmin = procArray->replication_slot_catalog_xmin;
    }

    LWLockRelease(ProcArrayLock);
}
/*
* XidCacheRemoveRunningXids
*
* Remove a bunch of TransactionIds from the list of known-running
* subtransactions for my backend. Both the specified xid and those in
* the xids[] array (of length nxids) are removed from the subxids cache.
* latestXid must be the latest XID among the group. We should store the
* required parameters into proc before performing XidCacheRemoveRunningXids,
* including subtransaction xid, the number of committed subtransaction,
* committed substransaction list, the latestXid between its xid and its
* committed subtransactions'.
*
* We don't do any locking here; caller must get the procArrayLock before
* perform XidCacheRemoveRunningXids.
*/
void XidCacheRemoveRunningXids(PGPROC* proc, PGXACT* pgxact)
{
    int i, j;
    /* All inputs were stashed in the PGPROC by the caller (see header comment). */
    TransactionId xid = proc->procArrayGroupMemberXid;
    int nxids = proc->procArrayGroupSubXactNXids;
    TransactionId* xids = proc->procArrayGroupSubXactXids;
    TransactionId latestXid = proc->procArrayGroupSubXactLatestXid;

    Assert(TransactionIdIsValid(xid));

    /*
     * Under normal circumstances xid and xids[] will be in increasing order,
     * as will be the entries in subxids. Scan backwards to avoid O(N^2)
     * behavior when removing a lot of xids.
     */
    for (i = nxids - 1; i >= 0; i--) {
        TransactionId anxid = xids[i];

        /* Remove anxid from the subxids cache by swapping in the last entry. */
        for (j = pgxact->nxids - 1; j >= 0; j--) {
            if (TransactionIdEquals(proc->subxids.xids[j], anxid)) {
                proc->subxids.xids[j] = proc->subxids.xids[pgxact->nxids - 1];
                pgxact->nxids--;
                break;
            }
        }
        /*
         * Ordinarily we should have found it, unless the cache has
         * overflowed. However it's also possible for this routine to be
         * invoked multiple times for the same subtransaction, in case of an
         * error during AbortSubTransaction. So instead of Assert, emit a
         * debug warning.
         */
        if (j < 0)
            ereport(WARNING, (errmsg("did not find subXID " XID_FMT " in t_thrd.proc", anxid)));
    }

    /* Likewise remove the top-level xid itself from the subxids cache. */
    for (j = pgxact->nxids - 1; j >= 0; j--) {
        if (TransactionIdEquals(proc->subxids.xids[j], xid)) {
            proc->subxids.xids[j] = proc->subxids.xids[pgxact->nxids - 1];
            pgxact->nxids--;
            break;
        }
    }
    /* Ordinarily we should have found it, unless the cache has overflowed */
    if (j < 0)
        ereport(WARNING, (errmsg("did not find subXID " XID_FMT " in t_thrd.proc", xid)));

    /* Also advance global latestCompletedXid while holding the lock */
    if (TransactionIdPrecedes(t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid, latestXid))
        t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid = latestXid;
}
#ifdef XIDCACHE_DEBUG
/*
* Print stats about effectiveness of XID cache
*/
static void DisplayXidCache(void)
{
    /* One-line dump of the XID-cache lookup counters to stderr. */
    fprintf(stderr,
        "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, "
        "nooflo: %ld, slow: %ld\n",
        xc_by_recent_xmin,
        xc_by_known_xact,
        xc_by_my_xact,
        xc_by_latest_xid,
        xc_by_main_xid,
        xc_by_child_xid,
        xc_by_known_assigned,
        xc_no_overflow,
        xc_slow_answer);
}
#endif /* XIDCACHE_DEBUG */
#ifdef PGXC
/*
* Store snapshot data received from the Coordinator
*/
void SetGlobalSnapshotData(
    TransactionId xmin, TransactionId xmax, uint64 csn, GTM_Timeline timeline, bool ss_need_sync_wait_all)
{
    /* Record that the active snapshot now comes from the local Coordinator. */
    u_sess->utils_cxt.snapshot_source = SNAPSHOT_COORDINATOR;
    u_sess->utils_cxt.gxmin = xmin;
    u_sess->utils_cxt.gxmax = xmax;
    u_sess->utils_cxt.g_snapshotcsn = csn;
    u_sess->utils_cxt.GtmTimeline = timeline;
    u_sess->utils_cxt.snapshot_need_sync_wait_all = ss_need_sync_wait_all;

    /* Mirror the same values into the session's GTM snapshot struct. */
    GTM_Snapshot gtmSnap = u_sess->utils_cxt.g_GTM_Snapshot;
    gtmSnap->sn_xmin = xmin;
    gtmSnap->sn_xmax = xmax;
    gtmSnap->sn_recent_global_xmin = u_sess->utils_cxt.RecentGlobalXmin;
    gtmSnap->csn = csn;

    if (module_logging_is_on(MOD_TRANS_SNAPSHOT)) {
        ereport(LOG,
            (errmodule(MOD_TRANS_SNAPSHOT),
                errmsg("global snapshot info from CN: gxmin: " XID_FMT ", gxmax: " XID_FMT ", gscn: %lu,"
                       "RecentGlobalXmin: %lu, cn_xc_maintain_mode: %s.",
                    u_sess->utils_cxt.gxmin,
                    u_sess->utils_cxt.gxmax,
                    u_sess->utils_cxt.g_snapshotcsn,
                    u_sess->utils_cxt.RecentGlobalXmin,
                    u_sess->utils_cxt.cn_xc_maintain_mode ? "on" : "off")));
    }
}
/*
 * Store snapshot data on the Datanode; marks the snapshot source
 * as SNAPSHOT_DATANODE.
 */
void SetGlobalSnapshotDataNode(TransactionId xmin, TransactionId xmax, uint64 csn, GTM_Timeline timeline)
{
    /* Remember where this snapshot came from, then stash its components. */
    u_sess->utils_cxt.snapshot_source = SNAPSHOT_DATANODE;
    u_sess->utils_cxt.GtmTimeline = timeline;
    u_sess->utils_cxt.gxmin = xmin;
    u_sess->utils_cxt.gxmax = xmax;
    u_sess->utils_cxt.g_snapshotcsn = csn;

    ereport(DEBUG1,
        (errmsg("global snapshot info: gxmin: " XID_FMT ", gxmax: " XID_FMT ", gscn: %lu",
            xmin,
            xmax,
            csn)));
}
/*
* Force Datanode to use local snapshot data
*/
void UnsetGlobalSnapshotData(void)
{
    /* Clear every component of the global snapshot and its origin marker. */
    u_sess->utils_cxt.snapshot_source = SNAPSHOT_UNDEFINED;
    u_sess->utils_cxt.is_autovacuum_snapshot = false;
    u_sess->utils_cxt.gxmin = InvalidTransactionId;
    u_sess->utils_cxt.gxmax = InvalidTransactionId;
    u_sess->utils_cxt.g_snapshotcsn = 0;
    u_sess->utils_cxt.GtmTimeline = InvalidTransactionTimeline;

    ereport(DEBUG1, (errmsg("unset snapshot info")));
}
/*
* Entry of snapshot obtention for openGauss node
* returns information about running transactions.
* The returned snapshot includes xmin (lowest still-running xact ID),
* xmax (highest completed xact ID + 1), and a list of running xact IDs
* in the range xmin <= xid < xmax. It is used as follows:
* All xact IDs < xmin are considered finished.
* All xact IDs >= xmax are considered still running.
* For an xact ID xmin <= xid < xmax, consult list to see whether
* it is considered running or not.
* This ensures that the set of transactions seen as "running" by the
* current xact will not change after it takes the snapshot.
*
* We also update the following backend-global variables:
* TransactionXmin: the oldest xmin of any snapshot in use in the
* current transaction (this is the same as MyPgXact->xmin).
* RecentXmin: the xmin computed for the most recent snapshot. XIDs
* older than this are known not running any more.
* RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
* running transactions). This is
* the same computation done by GetOldestXmin(true, true).
*/
static bool GetPGXCSnapshotData(Snapshot snapshot)
{
#ifdef ENABLE_MULTIPLE_NODES
    /*
     * If this node is in recovery phase,
     * snapshot has to be taken directly from WAL information.
     */
    if (!IS_MULTI_DISASTER_RECOVER_MODE && RecoveryInProgress())
        return false;

    /*
     * The typical case is that the local Coordinator passes down the snapshot to the
     * remote nodes to use, while it itself obtains it from GTM. Autovacuum processes
     * need however to connect directly to GTM themselves to obtain XID and snapshot
     * information for autovacuum worker threads.
     * A vacuum analyze uses a special function to get a transaction ID and signal
     * GTM not to include this transaction ID in snapshot.
     * A vacuum worker starts as a normal transaction would.
     */
    if ((IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess() || GetForceXidFromGTM()) &&
        IsNormalProcessingMode()) {
        if (GetSnapshotDataDataNode(snapshot))
            return true;
        /* else fallthrough */
    } else if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && IsNormalProcessingMode()) {
        /* Snapshot has ever been received from remote Coordinator */
        if (GetSnapshotDataCoordinator(snapshot))
            return true;
        /* else fallthrough */
    }

    /*
     * If we have no snapshot, we will use a local one.
     * If we are in normal mode, we output a warning though.
     * We currently fallback and use a local one at initdb time,
     * as well as when a new connection occurs.
     * This is also the case for autovacuum launcher.
     *
     * IsPostmasterEnvironment - checks for initdb
     * IsNormalProcessingMode() - checks for new connections
     * IsAutoVacuumLauncherProcess - checks for autovacuum launcher process
     */
    if (IS_PGXC_DATANODE && !isRestoreMode && u_sess->utils_cxt.snapshot_source == SNAPSHOT_UNDEFINED &&
        IsPostmasterEnvironment && IsNormalProcessingMode() && !IsAutoVacuumLauncherProcess()) {
        if (!t_thrd.postgres_cxt.isInResetUserName)
            ereport(WARNING, (errmsg("Do not have a GTM snapshot available")));
    }

    /* Returning false tells the caller to fall back to a locally computed snapshot. */
    return false;
#else
    DISTRIBUTED_FEATURE_NOT_SUPPORTED();
    return false;
#endif /* ENABLE_MULTIPLE_NODES */
}
#ifdef ENABLE_MULTIPLE_NODES
/*
* Get snapshot data for Datanode
* This is usually passed down from the Coordinator
*
* returns whether or not to return immediately with snapshot
*/
static bool GetSnapshotDataDataNode(Snapshot snapshot)
{
    Assert(IS_PGXC_DATANODE || IsConnFromCoord() || IsAutoVacuumWorkerProcess() || GetForceXidFromGTM());

    /*
     * Fallback to general case if Datanode is accessed directly by an application
     */
    if (IsPGXCNodeXactDatanodeDirect())
        return GetSnapshotDataCoordinator(snapshot);

    /* Autovacuum (or forced-from-GTM) sessions fetch their snapshot from GTM themselves. */
    if (IsAutoVacuumWorkerProcess() || GetForceXidFromGTM()) {
        GTM_Snapshot gtm_snapshot;
        ereport(DEBUG1,
            (errmsg("Getting snapshot for autovacuum. Current XID = " XID_FMT, GetCurrentTransactionIdIfAny())));
        gtm_snapshot = IS_MULTI_DISASTER_RECOVER_MODE ? GetSnapshotGTMDR() : GetSnapshotGTMLite();
        if (!gtm_snapshot) {
            if (g_instance.status > NoShutdown) {
                /* During shutdown, just let the caller fall back to a local snapshot. */
                if (module_logging_is_on(MOD_TRANS_SNAPSHOT)) {
                    ereport(LOG, (errmodule(MOD_TRANS_SNAPSHOT), errmsg("Shut down, could not obtain snapshot")));
                }
                return false;
            } else {
                ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("GTM error, could not obtain snapshot")));
            }
        } else {
            /* Remember the GTM snapshot, then overlay its CSN on a locally built snapshot. */
            *u_sess->utils_cxt.g_GTM_Snapshot = *gtm_snapshot;
            u_sess->utils_cxt.snapshot_source = SNAPSHOT_DIRECT;
            snapshot->gtm_snapshot_type = IsAutoVacuumWorkerProcess() ? GTM_SNAPSHOT_TYPE_AUTOVACUUM : GTM_SNAPSHOT_TYPE_GLOBAL;
            /* only use gtm csn */
            Snapshot ret;
            ret = GetLocalSnapshotData(snapshot);
            Assert(ret != NULL);
            snapshot->snapshotcsn = set_proc_csn_and_check("GetSnapshotDataDataNodeDirectGTM",
                gtm_snapshot->csn, snapshot->gtm_snapshot_type, SNAPSHOT_DIRECT);
            u_sess->utils_cxt.g_GTM_Snapshot->csn = snapshot->snapshotcsn;
            u_sess->utils_cxt.RecentGlobalXmin = GetOldestXmin(NULL, true);
            u_sess->utils_cxt.RecentGlobalCatalogXmin = GetOldestCatalogXmin();
            return true;
        }
    }

    /* Normal case: reuse the snapshot the Coordinator shipped with the query. */
    if (GTM_LITE_MODE && u_sess->utils_cxt.snapshot_source == SNAPSHOT_COORDINATOR) {
        TransactionId save_recentglobalxmin = u_sess->utils_cxt.RecentGlobalXmin;
        snapshot->gtm_snapshot_type =
            u_sess->utils_cxt.is_autovacuum_snapshot ? GTM_SNAPSHOT_TYPE_AUTOVACUUM : GTM_SNAPSHOT_TYPE_GLOBAL;
        if (IS_MULTI_DISASTER_RECOVER_MODE) {
            /* DR mode: take the CN's CSN and sanity-check it against local replay state. */
            snapshot->snapshotcsn = u_sess->utils_cxt.g_snapshotcsn;
            t_thrd.pgxact->csn_dr = snapshot->snapshotcsn;
            pg_memory_barrier();
            CommitSeqNo lastReplayedConflictCSN = (CommitSeqNo)pg_atomic_read_u64(
                &(g_instance.comm_cxt.predo_cxt.last_replayed_conflict_csn));
            if (lastReplayedConflictCSN != 0 && snapshot->snapshotcsn - 1 <= lastReplayedConflictCSN) {
                ereport(ERROR, (errmsg("gtm csn small: gtm csn %lu, lastReplayedConflictCSN %lu",
                    snapshot->snapshotcsn, lastReplayedConflictCSN)));
            }
            LWLockAcquire(XLogMaxCSNLock, LW_SHARED);
            if (t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN + 1 < snapshot->snapshotcsn) {
                ereport(ERROR, (errmsg("dn data invisible: local csn %lu, gtm snapshotcsn %lu",
                    t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN, snapshot->snapshotcsn)));
            }
            LWLockRelease(XLogMaxCSNLock);
        } else {
            /* only use gtm csn */
            Snapshot ret;
            ret = GetLocalSnapshotData(snapshot);
            Assert(ret != NULL);
            snapshot->snapshotcsn = u_sess->utils_cxt.g_snapshotcsn;
            (void)set_proc_csn_and_check("GetSnapshotDataDataNodeFromCN", snapshot->snapshotcsn,
                snapshot->gtm_snapshot_type, SNAPSHOT_COORDINATOR);
            /* reset RecentGlobalXmin */
            u_sess->utils_cxt.RecentGlobalXmin = save_recentglobalxmin;
            /* too late to check and set */
        }
        return true;
    }
    /* No usable global snapshot: caller falls back. */
    return false;
}
/*
* Get snapshot data for Coordinator
* It will later be passed down to Datanodes
*
* returns whether or not to return immediately with snapshot
*/
static bool GetSnapshotDataCoordinator(Snapshot snapshot)
{
    GTM_Snapshot gtm_snapshot;

    Assert(IS_PGXC_COORDINATOR || IsPGXCNodeXactDatanodeDirect());

    /* Log some information about snapshot obtention */
    if (IsAutoVacuumWorkerProcess()) {
        ereport(DEBUG1,
            (errmsg("Getting snapshot for autovacuum. Current XID = " XID_FMT, GetCurrentTransactionIdIfAny())));
    } else {
        ereport(DEBUG1, (errmsg("Getting snapshot. Current XID = " XID_FMT, GetCurrentTransactionIdIfAny())));
    }

    gtm_snapshot = IS_MULTI_DISASTER_RECOVER_MODE ? GetSnapshotGTMDR() : GetSnapshotGTMLite();

    if (!gtm_snapshot) {
        if (g_instance.status > NoShutdown) {
            /* During shutdown, let the caller fall back to a local snapshot. */
            return false;
        } else {
            /* error level degrade when in AbortTransaction procedure */
            ereport(t_thrd.xact_cxt.bInAbortTransaction ? WARNING : ERROR,
                (errcode(ERRCODE_CONNECTION_FAILURE),
                    errmsg("GTM error, could not obtain snapshot XID = " XID_FMT, GetCurrentTransactionIdIfAny())));
        }
    } else {
        snapshot->gtm_snapshot_type = GTM_SNAPSHOT_TYPE_GLOBAL;
        *u_sess->utils_cxt.g_GTM_Snapshot = *gtm_snapshot;
        if (IS_MULTI_DISASTER_RECOVER_MODE) {
            /* DR mode: take the GTM CSN directly and sanity-check it against local replay. */
            snapshot->snapshotcsn = gtm_snapshot->csn;
            t_thrd.pgxact->csn_dr = snapshot->snapshotcsn;
            LWLockAcquire(XLogMaxCSNLock, LW_SHARED);
            if (t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN + 1 < snapshot->snapshotcsn) {
                ereport(ERROR, (errmsg("cn data invisible: local csn %lu, gtm snapshotcsn %lu", t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN, snapshot->snapshotcsn)));
            }
            LWLockRelease(XLogMaxCSNLock);
        } else {
            /* only use gtm csn */
            Snapshot ret;
            ret = GetLocalSnapshotData(snapshot);
            Assert(ret != NULL);
            snapshot->snapshotcsn = set_proc_csn_and_check("GetSnapshotDataCoordinator", gtm_snapshot->csn,
                snapshot->gtm_snapshot_type, SNAPSHOT_DIRECT);
            u_sess->utils_cxt.g_GTM_Snapshot->csn = snapshot->snapshotcsn;
            u_sess->utils_cxt.RecentGlobalXmin = GetOldestXmin(NULL, true);
            u_sess->utils_cxt.RecentGlobalCatalogXmin = GetOldestCatalogXmin();
        }
        if (module_logging_is_on(MOD_TRANS_SNAPSHOT)) {
            ereport(LOG, (errmodule(MOD_TRANS_SNAPSHOT),
                errmsg("CN gets snapshot from gtm_snapshot, csn = %lu.", snapshot->snapshotcsn)));
        }
        return true;
    }
    return false;
}
/*
 * proc_cancel_invalid_gtm_lite_conn
 *
 * After reconnecting to GTM, walk the proc array and signal (SIGUSR2) every
 * postgres worker thread still attached to a different GTM host so it can
 * cancel its stale GTM connection.
 */
void proc_cancel_invalid_gtm_lite_conn()
{
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int i;
    GtmHostIndex hostindex = GTM_HOST_INVAILD;
    /* (Re)connect to GTM to learn which GTM host index is currently valid. */
    GtmHostIndex my_gtmhost = InitGTM(false);

    ereport(LOG, (errmsg("GTMLite: canceling stale GTM connections, new GTM host index: %d.", my_gtmhost)));
    Assert(my_gtmhost == t_thrd.proc->my_gtmhost);

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (i = 0; i < arrayP->numProcs; i++) {
        int pgprocno = arrayP->pgprocnos[i];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
        if (proc == NULL || (proc->pid == 0)) {
            continue;
        }
        Assert(proc->myProgName != NULL);
        /* skip non-postgres threads */
        if (strcmp((const char*)(proc->myProgName), "postgres") != 0) {
            continue;
        }
        /* Atomic read (add-zero) of the thread's current GTM host index. */
        hostindex = (GtmHostIndex)pg_atomic_fetch_add_u32((volatile uint32*)&proc->my_gtmhost, 0);
        ereport(DEBUG1, (errmsg("current GTM hostindex %d, thread id: %lu, thread GTM hostindex: %d", my_gtmhost,
            proc->pid, hostindex)));
        if (hostindex == GTM_HOST_INVAILD) {
            continue;
        }
        /* Thread is attached to a stale GTM host: flag it and wake it with SIGUSR2. */
        if (my_gtmhost != hostindex) {
            (void)pg_atomic_exchange_u32(&proc->signal_cancel_gtm_conn_flag, HOST2FLAG(my_gtmhost));
            if (gs_signal_send(proc->pid, SIGUSR2)) {
                /* Delivery failed: clear the flag again so the thread is not confused later. */
                ereport(WARNING, (errmsg("GTMLite: could not send signal to thread %lu: %m", proc->pid)));
                (void)pg_atomic_exchange_u32(&proc->signal_cancel_gtm_conn_flag, 0);
            } else {
                ereport(LOG, (errmsg("GTMLite: success to send SIGUSR2 to openGauss thread: %lu.", proc->pid)));
            }
        }
    }
    LWLockRelease(ProcArrayLock);
}
#endif /* ENABLE_MULTIPLE_NODES */
/* Cleanup the snapshot */
static void cleanSnapshot(Snapshot snapshot)
{
    /* Reset the snapshot fields to their "no snapshot" values. */
    snapshot->xmin = InvalidTransactionId;
    snapshot->xmax = InvalidTransactionId;
    snapshot->snapshotcsn = 0;
    snapshot->timeline = InvalidTransactionTimeline;
}
#endif /* PGXC */
TransactionId GetGlobal2pcXmin()
{
    /* Smallest xid/prepare_xid among live backends; start from nextXid. */
    TransactionId global2pcXmin = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    int* pgprocnos = arrayP->pgprocnos;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    int numProcs = arrayP->numProcs;

    for (int idx = 0; idx < numProcs; idx++) {
        int pgprocno = pgprocnos[idx];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[pgprocno];
        TransactionId xid = pgxact->xid;
        TransactionId prepare_xid = pgxact->prepare_xid;

        /* ignore prepared transactions */
        if (proc->pid == 0) {
            continue;
        }

        ereport(DEBUG5, (errmsg("Active transaction: xid: " XID_FMT " ,prepare_xid: " XID_FMT, xid, prepare_xid)));

        if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, global2pcXmin)) {
            global2pcXmin = xid;
        }
        if (TransactionIdIsNormal(prepare_xid) && TransactionIdPrecedes(prepare_xid, global2pcXmin)) {
            global2pcXmin = prepare_xid;
        }
    }
    LWLockRelease(ProcArrayLock);

    return global2pcXmin;
}
/*
* Wait for the transaction which modify the tuple to finish.
* First release the buffer lock. After waiting, re-acquire the buffer lock.
*/
void SyncWaitXidEnd(TransactionId xid, Buffer buffer, const Snapshot snapshot)
{
    if (!BufferIsValid(buffer)) {
        /* No buffer lock held: just wait for the local transaction. */
        SyncLocalXidWait(xid, snapshot);
        return;
    }

    /* Remember how the content lock is held so it can be re-taken afterwards. */
    BufferDesc* bufDesc = GetBufferDescriptor(buffer - 1);
    LWLockMode heldMode = GetHeldLWLockMode(bufDesc->content_lock);
    Assert(heldMode == LW_EXCLUSIVE || heldMode == LW_SHARED);

    /* Drop the buffer lock while waiting for the transaction... */
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
    SyncLocalXidWait(xid, snapshot);

    /* ...then re-acquire it in the equivalent buffer-lock mode. */
    LockBuffer(buffer, (heldMode == LW_EXCLUSIVE) ? BUFFER_LOCK_EXCLUSIVE : BUFFER_LOCK_SHARE);
}
/*
* Wait local transaction finish, if transaction wait time exceed transaction_sync_naptime, call gs_clean.
*/
void SyncLocalXidWait(TransactionId xid, const Snapshot snapshot)
{
    ReleaseAllGSCRdConcurrentLock();
    int64 remainingNapTime = (int64)u_sess->attr.attr_common.transaction_sync_naptime * 1000000; /* us */
    int64 remainingTimeout = (int64)u_sess->attr.attr_common.transaction_sync_timeout * 1000000; /* us */
    const int64 sleepTime = 1000;
    WaitState oldStatus = pgstat_report_waitstatus(STATE_WAIT_UNDEFINED, true);

    gstrace_entry(GS_TRC_ID_SyncLocalXidWait);
    /* Poll until the xact lock for 'xid' can be taken, i.e. the transaction ended. */
    while (!ConditionalXactLockTableWait(xid, snapshot)) {
        /* type of transaction id is same as node id, reuse the second param for waited transaction id */
        pgstat_report_waitstatus_xid(STATE_WAIT_XACTSYNC, xid);
        /* Nap budget exhausted: wake the twophase cleaner so it runs gs_clean. */
        if (u_sess->attr.attr_common.transaction_sync_naptime && remainingNapTime <= 0 && twoPhaseCleanerProc) {
            ereport(LOG,
                (errcode(ERRCODE_SUCCESSFUL_COMPLETION),
                    errmsg("wait transaction sync time would exceed %d s, "
                           "call gs_clean to clean reserved prepared transactions.",
                        u_sess->attr.attr_common.transaction_sync_naptime)));
            CHECK_FOR_INTERRUPTS();
            /* call gs_clean */
            bSyncXactsCallGsclean = true;
            SetLatch(&twoPhaseCleanerProc->procLatch);
            /* sleep 0.1s, wait gs_clean process */
            pg_usleep(100 * sleepTime);
            remainingNapTime = (int64)u_sess->attr.attr_common.transaction_sync_naptime * 1000000; /* us */
        }
        /* Hard timeout: restore the wait status and error out. */
        if (u_sess->attr.attr_common.transaction_sync_timeout && remainingTimeout <= 0) {
            (void)pgstat_report_waitstatus(oldStatus);
            ereport(ERROR,
                (errcode(ERRCODE_LOCK_WAIT_TIMEOUT),
                    errmsg("wait transaction %lu sync time exceed %d s.",
                        xid,
                        u_sess->attr.attr_common.transaction_sync_timeout)));
        }
        /* Bail out promptly when the instance is shutting down or demoting. */
        if (g_instance.status > NoShutdown || g_instance.demotion > NoDemote) {
            ereport(FATAL,
                (errcode(ERRCODE_ADMIN_SHUTDOWN),
                    errmsg("terminating SyncLocalXactsWithGTM process due to administrator command")));
        }
        CHECK_FOR_INTERRUPTS();
        pg_usleep(sleepTime); /* 1ms */
        remainingNapTime = remainingNapTime - sleepTime;
        remainingTimeout = remainingTimeout - sleepTime;
    }
    (void)pgstat_report_waitstatus(oldStatus);
    gstrace_exit(GS_TRC_ID_SyncLocalXidWait);
}
/*
 * PrintCurrentSnapshotInfo
 *
 * Emit, at log level 'logelevel', the tuple xid being examined plus (when a
 * snapshot is supplied) that snapshot's xmin/xmax/csn and the shared
 * recentGlobalXmin.  'action' tags the log line; NULL is allowed.
 *
 * Fix: corrected typos in the emitted log text ("no aciton" -> "no action",
 * "MVCCSanpshot" -> "MVCCSnapshot").
 */
void PrintCurrentSnapshotInfo(int logelevel, TransactionId xid, Snapshot snapshot, const char* action)
{
    if (snapshot) {
        StringInfoData snapshot_str;
        initStringInfo(&snapshot_str);

        appendStringInfo(&snapshot_str,
            "snapshot xmin: %lu, xmax: %lu, csn: %lu, "
            "recentGlobalXmin: %lu",
            snapshot->xmin,
            snapshot->xmax,
            snapshot->snapshotcsn,
            pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin));

        ereport(logelevel,
            (errmsg("[%s] xtuplexid= %lu, [MVCCSnapshot] %s", action ? action : "no action", xid, snapshot_str.data)));
        pfree(snapshot_str.data);
        snapshot_str.data = NULL;
    } else
        ereport(logelevel, (errmsg("[%s] tuplexid = %lu", action ? action : "no action", xid)));
}
/*
 * cache line size in bytes
 */
#define CACHE_LINE_SZ 64
/*
 * partition reference count to groups of threads to reduce contention
 */
#define NREFCNT 1
/*
 * atomic increment
 */
#define atomic_inc(ptr) __sync_add_and_fetch(ptr, 1)
/*
 * atomic decrement
 */
#define atomic_dec(ptr) __sync_sub_and_fetch(ptr, 1)
/*
 * cache-line aligned reference counter
 */
typedef struct _ref_cnt {
    unsigned count;
    /* NOTE(review): to pad the struct to exactly CACHE_LINE_SZ bytes the element
     * count would presumably be CACHE_LINE_SZ / sizeof(unsigned) - 1; with
     * "- sizeof(unsigned)" the struct is smaller than one cache line — confirm
     * whether this is intentional. */
    unsigned pad[CACHE_LINE_SZ / sizeof(unsigned) - sizeof(unsigned)];
} ref_cnt_t;
/* snapxid structure to hold the values computed at a commit time */
#ifdef __aarch64__
/* the offset of ref_cnt in the struct _snapxid. */
/* NOTE(review): REF_CNT_OFFSET (36) appears hand-computed from the fields
 * above the padding; verify it still matches if fields change. */
#define REF_CNT_OFFSET 36
typedef struct _snapxid {
    TransactionId xmin;
    TransactionId xmax;
    CommitSeqNo snapshotcsn;
    TransactionId localxmin; /* the latest xmin in local node, update at transaction end. */
    bool takenDuringRecovery;
    char padding[PG_CACHE_LINE_SIZE - REF_CNT_OFFSET]; /* pad the slot out to a cache line */
} snapxid_t;
#else
typedef struct _snapxid {
    TransactionId xmin;
    TransactionId xmax;
    CommitSeqNo snapshotcsn;
    TransactionId localxmin; /* the latest xmin in local node, update at transaction end. */
    bool takenDuringRecovery;
    ref_cnt_t ref_cnt[NREFCNT]; /* per-group reference counters pinning this slot */
} snapxid_t;
#endif
/*
 * the snapshot ring buffer
 */
static snapxid_t* g_snap_buffer = NULL; /* the ring buffer for snapxids */
static snapxid_t* g_snap_buffer_copy = NULL; /* the ring buffer for AtProcExit */
static size_t g_bufsz = 0; /* number of slots in the ring buffer */
static bool g_snap_assigned = false; /* true if current snap valid */
#define SNAP_SZ sizeof(snapxid_t) /* size of snapxid_t */
#define MaxNumSnapVersion 64 /* max version number */
/*
* get pointer to snapxid_t entry in specified index in ring buffer
*/
static inline snapxid_t* SNAPXID_AT(size_t i)
{
    /* Entry i lives i * SNAP_SZ bytes past the start of the ring buffer. */
    char* base = (char*)g_snap_buffer;
    return (snapxid_t*)(base + SNAP_SZ * i);
}
/*
* get offset in bytes of snapxid_t entry in ring buffer
*/
static inline size_t SNAPXID_OFFSET(snapxid_t* x)
{
    /* Byte distance from the start of the ring buffer to entry x. */
    return (size_t)((char*)x - (char*)g_snap_buffer);
}
/*
* get index of snapxid_t entry in ring buffer
*/
static inline size_t SNAPXID_INDEX(snapxid_t* x)
{
    /* Convert the byte offset of x back into a slot number. */
    size_t offset = SNAPXID_OFFSET(x);
    return offset / SNAP_SZ;
}
/*
 * points to most recently computed snapshot (an entry inside g_snap_buffer)
 */
static volatile snapxid_t* g_snap_current = NULL;
/*
 * points to next available slot in snapshot ring buffer
 */
static volatile snapxid_t* g_snap_next = NULL;
/*
* Report shared-memory space needed by CreateSharedRingBuffer.
*/
Size RingBufferShmemSize(void)
{
    /* Space for every snapshot slot in the ring. */
    Size sz = mul_size(MaxNumSnapVersion, SNAP_SZ);
#ifdef __aarch64__
    /* Extra slack so the buffer can be cache-line aligned on ARM. */
    sz += PG_CACHE_LINE_SIZE;
#endif
    return sz;
}
/*
* Initialize the shared Snapshot Ring Buffer during postmaster startup.
*/
void CreateSharedRingBuffer(void)
{
    bool found = false;
#ifdef __aarch64__
    /* Create or attach to the ProcArray shared structure. */
    g_snap_buffer = (snapxid_t*)CACHELINEALIGN(ShmemInitStruct("Snapshot Ring Buffer", RingBufferShmemSize(), &found));
#else
    /* Create or attach to the ProcArray shared structure. */
    g_snap_buffer = (snapxid_t*)ShmemInitStruct("Snapshot Ring Buffer", RingBufferShmemSize(), &found);
#endif
    if (!found) {
        /* Initialize if we're the first. */
        g_bufsz = MaxNumSnapVersion;
        g_snap_current = SNAPXID_AT(0);
        g_snap_next = SNAPXID_AT(1);
        g_snap_buffer_copy = g_snap_buffer;
        /* NOTE(review): on __aarch64__ this memset starts at the cache-line-
         * aligned pointer yet still covers the full RingBufferShmemSize(),
         * which could extend past the ShmemInitStruct allocation by up to
         * PG_CACHE_LINE_SIZE - 1 bytes — confirm against shmem alignment
         * guarantees. */
        errno_t rc = memset_s(g_snap_buffer, RingBufferShmemSize(), 0, RingBufferShmemSize());
        securec_check(rc, "\0", "\0");
    }
}
#ifdef __aarch64__
/*
* increment reference count of snapshot
*/
/*
 * increment reference count of snapshot (aarch64 variant)
 *
 * Each backend records the ring-buffer slots it references as bits in its
 * own PGPROC snap_refcnt_bitmap.
 *
 * Fix: shift a 64-bit one, not the int literal 1 — SNAPXID_INDEX(s) % 64 can
 * reach 63 (MaxNumSnapVersion is 64), and shifting an int by >= 32 bits is
 * undefined behavior / loses the high bits.
 */
static void IncrRefCount(snapxid_t* s)
{
    t_thrd.proc->snap_refcnt_bitmap |= ((uint64)1) << (SNAPXID_INDEX(s) % 64);
    pg_write_barrier();
}
/*
* decrement reference count of snapshot
*/
/*
 * decrement reference count of snapshot (aarch64 variant)
 *
 * Clears this backend's bit for the slot in its PGPROC snap_refcnt_bitmap.
 *
 * Fix: build the mask from a 64-bit one — slot indexes go up to 63, and
 * shifting the int literal 1 by >= 32 bits is undefined behavior, so the
 * wrong bit (or none) could be cleared for high slots.
 */
static void DecrRefCount(snapxid_t* s)
{
    t_thrd.proc->snap_refcnt_bitmap &= ~(((uint64)1) << (SNAPXID_INDEX(s) % 64));
    pg_write_barrier();
}
/*
* test for zero reference count of snapshot
*/
/*
 * test for zero reference count of snapshot (aarch64 variant)
 *
 * Returns 1 when no backend in the proc array has the slot's bit set in its
 * snap_refcnt_bitmap, 0 otherwise.
 *
 * Fix: compute the probe mask with a 64-bit shift — "1 << n" is an int shift
 * and is undefined for n >= 32, while slot indexes range up to 63.
 */
static int IsZeroRefCount(snapxid_t* s)
{
    uint64 bitmap = ((uint64)1) << (SNAPXID_INDEX(s) % 64);
    for (int i = 0; i < g_instance.proc_array_idx->numProcs; i++) {
        if (g_instance.proc_base_all_procs[g_instance.proc_array_idx->pgprocnos[i]]->snap_refcnt_bitmap & bitmap) {
            return 0;
        }
    }
    return 1;
}
#else
/*
* increment reference count of snapshot
*/
static void IncrRefCount(snapxid_t* s)
{
    /* Only one counter group is configured (NREFCNT == 1), so pin group 0. */
    atomic_inc(&s->ref_cnt[0].count);
}
/*
* decrement reference count of snapshot
*/
static void DecrRefCount(snapxid_t* s)
{
    /* Mirror of IncrRefCount: release the pin on counter group 0. */
    atomic_dec(&s->ref_cnt[0].count);
}
/*
* test for zero reference count of snapshot
*/
static int IsZeroRefCount(snapxid_t* s)
{
    /* The slot is reusable only once every counter group has dropped to zero. */
    for (int grp = 0; grp < NREFCNT; grp++) {
        if (s->ref_cnt[grp].count != 0) {
            return 0;
        }
    }
    return 1;
}
#endif
/* snapxid to be held off to the next commit */
static inline snapxid_t* GetNextSnapXid()
{
    /* NULL when the ring buffer has not been created (or is being torn down). */
    if (g_snap_buffer == NULL) {
        return NULL;
    }
    return (snapxid_t*)g_snap_next;
}
const static int SNAP_ERROR_COUNT = 256; /* full ring scans tolerated before PANIC */
/*
 * update the current snapshot pointer find the next available slot for the next pointer
 */
static void SetNextSnapXid()
{
    if (g_snap_buffer != NULL) {
        /* Publish the slot prepared by the committer as the current snapshot. */
        g_snap_current = g_snap_next;
        pg_write_barrier();
        g_snap_assigned = true;
        snapxid_t* ret = (snapxid_t*)g_snap_current;
        size_t idx = SNAPXID_INDEX(ret);
        int nofindCount = 0;
    loop:
        /* Scan forward from the just-published slot for an unreferenced one. */
        do {
            ++idx;
            /* if wrap-around, take start from head to find free slot */
            if (idx == g_bufsz)
                idx = 0;
            ret = SNAPXID_AT(idx);
            if (IsZeroRefCount(ret)) {
                g_snap_next = ret;
                return;
            }
            nofindCount++;
        } while (ret != g_snap_next);
        /* we alloc sufficient space for local snapshot , overflow should not happen here */
        ereport(WARNING, (errmsg("snapshot ring buffer overflow.")));
        if (nofindCount >= SNAP_ERROR_COUNT) {
            ereport(PANIC, (errcode(ERRCODE_LOG), errmsg("Can not get an available snapshot slot")));
        }
        /* try to find available slot */
        goto loop;
    }
}
/*
* just a wrapper to pass __snap_current to GetSnapshotData
*/
static snapxid_t* GetCurrentSnapXid()
{
    /* Pin the latest published snapshot before handing it out. */
    snapxid_t* current = (snapxid_t*)g_snap_current;
    IncrRefCount(current);
    return current;
}
/*
* release snapshot data (decrement reference count)
*/
static void ReleaseSnapXid(snapxid_t* snapshot)
{
    /* Drop the pin taken by GetCurrentSnapXid(). */
    DecrRefCount(snapshot);
}
#ifdef USE_ASSERT_CHECKING
/* add assert information for refcount of snapshot */
/*
 * Assertion-build guard: PANICs at scope exit if the snapshot pin taken in
 * GetLocalSnapshotData was not released via decr().
 */
class AutoSnapId {
public:
    int m_count;

    AutoSnapId() : m_count(1)
    {}

    ~AutoSnapId()
    {
        if (m_count > 0) {
            ereport(PANIC, (errcode(ERRCODE_LOG),
                errmsg("snapshot refcount leak, must be zero")));
        }
    }

    /* Mark the tracked reference as released. */
    void decr()
    {
        m_count = 0;
    }
};
#endif
Snapshot GetLocalSnapshotData(Snapshot snapshot)
{
    /* if first here, fallback to original code */
    if (!g_snap_assigned || (g_snap_buffer == NULL)) {
        ereport(DEBUG1, (errmsg("Falling back to origin GetSnapshotData: not assigned yet or during shutdown\n")));
        return NULL;
    }
    pg_read_barrier();
    /* Keep interrupts off while we hold a pin on a ring-buffer slot. */
    HOLD_INTERRUPTS();
    /* 1. increase ref-count of current snapshot in ring buffer */
    snapxid_t* snapxid = GetCurrentSnapXid();
#ifdef USE_ASSERT_CHECKING
    AutoSnapId snapid;
#endif
    /* save use_data for release */
    snapshot->user_data = snapxid;
    /* 2. copy from pre-computed snapshot arrays into return param snapshot */
    snapshot->takenDuringRecovery = snapxid->takenDuringRecovery;
    TransactionId replication_slot_xmin = g_instance.proc_array_idx->replication_slot_xmin;
    /* First snapshot of this transaction: advertise our xmin in the PGXACT. */
    if (!TransactionIdIsValid(t_thrd.pgxact->xmin)) {
        t_thrd.pgxact->xmin = u_sess->utils_cxt.TransactionXmin = snapxid->xmin;
        t_thrd.pgxact->handle = GetCurrentTransactionHandleIfAny();
    }
    /* Derive RecentGlobalXmin from the slot's localxmin minus the configured slack. */
    if (TransactionIdPrecedes(snapxid->localxmin, (uint64)u_sess->attr.attr_storage.vacuum_defer_cleanup_age)) {
        u_sess->utils_cxt.RecentGlobalXmin = FirstNormalTransactionId;
    } else {
        u_sess->utils_cxt.RecentGlobalXmin = snapxid->localxmin - u_sess->attr.attr_storage.vacuum_defer_cleanup_age;
    }
    if (!TransactionIdIsNormal(u_sess->utils_cxt.RecentGlobalXmin)) {
        u_sess->utils_cxt.RecentGlobalXmin = FirstNormalTransactionId;
    }
    /* Replication slots may hold the horizon back further. */
    if (TransactionIdIsNormal(replication_slot_xmin) &&
        TransactionIdPrecedes(replication_slot_xmin, u_sess->utils_cxt.RecentGlobalXmin)) {
        u_sess->utils_cxt.RecentGlobalXmin = replication_slot_xmin;
    }
    u_sess->utils_cxt.RecentGlobalCatalogXmin = GetOldestCatalogXmin();
    u_sess->utils_cxt.RecentXmin = snapxid->xmin;
    snapshot->xmin = snapxid->xmin;
    snapshot->xmax = snapxid->xmax;
    snapshot->snapshotcsn = snapxid->snapshotcsn;
    snapshot->curcid = GetCurrentCommandId(false);
    snapshot->active_count = 0;
    snapshot->regd_count = 0;
    snapshot->copied = false;
    /* Non-catalog tables can be vacuumed if older than this xid */
    u_sess->utils_cxt.RecentGlobalDataXmin = u_sess->utils_cxt.RecentGlobalXmin;
    /* 3. done copying: release the pin taken in step 1 */
    ReleaseSnapXid(snapxid);
    snapshot->user_data = NULL;
#ifdef USE_ASSERT_CHECKING
    snapid.decr();
#endif
    RESUME_INTERRUPTS();
    return snapshot;
}
#define MAX_PENDING_SNAPSHOT_CNT 1000 /* deferred snapshots before a forced xmin recompute */
#define CALC_SNAPSHOT_TIMEOUT (1 * 1000) /* recompute interval passed to TimestampDifferenceExceeds() */
void forward_recent_global_xmin(void)
{
    (void)LWLockAcquire(CsnMinLock, LW_EXCLUSIVE);
    /*
     * check and update recentGlobalXmin, get a snapshot,
     * the csn of xid preceed recentLocalXmin, must smaller than nextCommitSeqNo.
     */
    if (t_thrd.xact_cxt.ShmemVariableCache->keep_csn <= t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min) {
        if (module_logging_is_on(MOD_TRANS_SNAPSHOT))
            ereport(LOG, (errmodule(MOD_TRANS_SNAPSHOT),
                errmsg("update recentGlobalXmin, from %lu to %lu. keep_xmin from %lu to %lu, "
                       "keep_csn from %lu to %lu.",
                    t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, t_thrd.xact_cxt.ShmemVariableCache->keep_xmin,
                    t_thrd.xact_cxt.ShmemVariableCache->keep_xmin, t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin,
                    t_thrd.xact_cxt.ShmemVariableCache->keep_csn,
                    t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo)));
        /* Advance the kept (xmin, csn) pair one generation forward. */
        t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = t_thrd.xact_cxt.ShmemVariableCache->keep_xmin;
        t_thrd.xact_cxt.ShmemVariableCache->keep_xmin = t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin;
        t_thrd.xact_cxt.ShmemVariableCache->keep_csn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
    }
    LWLockRelease(CsnMinLock);
}
/* Reset the CSN-cleanup bookkeeping in shared memory (taken under CsnMinLock). */
static void init_shmem_csn_cleanup_instr(void)
{
    (void)LWLockAcquire(CsnMinLock, LW_EXCLUSIVE);
    /* make sure cutoff_csn_min is small enough when after redo to avoid false positive invalid snapshot */
    t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min = COMMITSEQNO_FIRST_NORMAL + 1;
    t_thrd.xact_cxt.ShmemVariableCache->keep_csn = COMMITSEQNO_FIRST_NORMAL + 1;
    t_thrd.xact_cxt.ShmemVariableCache->keep_xmin = t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin;
    t_thrd.xact_cxt.ShmemVariableCache->local_csn_min = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
    LWLockRelease(CsnMinLock);
}
static bool ForceCalculateSnapshotXmin(bool forceCalc)
{
    /* Recompute when explicitly forced, or when deferred calculation is disabled. */
    if (forceCalc) {
        return true;
    }
    return !u_sess->attr.attr_storage.enable_defer_calculate_snapshot;
}
/*
 * CalculateLocalLatestSnapshot
 *    Build a new local snapshot version and publish it into the lock-free
 *    snapshot ring buffer.
 *
 * forceCalc forces a full xmin recalculation; in GTM-Lite mode it also marks
 * the first call, triggering one-time CSN-cleanup initialization (see below).
 * Without it the expensive proc-array xmin scan is deferred and only done
 * every MAX_PENDING_SNAPSHOT_CNT calls or every CALC_SNAPSHOT_TIMEOUT.
 */
void CalculateLocalLatestSnapshot(bool forceCalc)
{
    /*
     * 1. copy current snapshot data to next
     * 2. follow same line as original ProcArrayEndTransactionInternal
     * 3. generate new snapshot, based on code in GetSnapshotData_Orig()
     * 4. add new snapshot to ring buffer (lock-free)
     * 5. advance ring-buffer current snapshot pointer.
     */
    ProcArrayStruct* arrayP = g_instance.proc_array_idx;
    TransactionId xmin;
    TransactionId xmax;
    TransactionId globalxmin;
    int index;
    Timestamp currentTimeStamp;
    /* static: deferral state must survive across calls within this process */
    static Timestamp snapshotTimeStamp = 0;
    static uint32 snapshotPendingCnt = 0;

    snapxid_t* snapxid = GetNextSnapXid();
    if (snapxid == NULL) {
        ereport(LOG, (errmsg("Skipping generation of new snapshot: ring buffer not active (during shutdown)\n")));
        return;
    }

    /* xmax is always latestCompletedXid + 1 */
    xmax = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
    Assert(TransactionIdIsNormal(xmax));
    TransactionIdAdvance(xmax);

    /*
     * We calculate xmin under the following conditions:
     * 1. we always calculate snapshot if enable_defer_calculate_snapshot is off.
     * 2. we didn't calculate snapshot for MAX_PENDING_SNAPSHOT_CNT times.
     * 3. we didn't calculate snapshot for CALC_SNAPSHOT_TIMEOUT seconds.
     */
    currentTimeStamp = GetCurrentTimestamp();
    if (ForceCalculateSnapshotXmin(forceCalc) || ((++snapshotPendingCnt == MAX_PENDING_SNAPSHOT_CNT) ||
        (TimestampDifferenceExceeds(snapshotTimeStamp, currentTimeStamp, CALC_SNAPSHOT_TIMEOUT)))) {
        /* NOTE(review): presumably pairs with the pg_write_barrier() of the last publisher — confirm */
        pg_read_barrier();
        snapshotPendingCnt = 0;
        snapshotTimeStamp = currentTimeStamp;

        /* initialize xmin calculation with xmax */
        globalxmin = xmin = xmax;

        /* Also need to include the xmin of every ring-buffer snapshot version still in use */
        if (g_snap_buffer != NULL) {
            TransactionId minXmin = ((snapxid_t*)g_snap_current)->xmin;
            if (!TransactionIdIsValid(minXmin))
                minXmin = globalxmin;
            for (size_t idx = 0; idx < g_bufsz; idx++) {
                snapxid_t* ret = NULL;
                ret = SNAPXID_AT(idx);
                /* only slots still referenced by some backend can hold back xmin */
                if (!IsZeroRefCount(ret) && TransactionIdIsValid(ret->xmin)) {
                    if (TransactionIdPrecedes(ret->xmin, minXmin)) {
                        minXmin = ret->xmin;
                    }
                }
            }
            if (TransactionIdPrecedes(minXmin, globalxmin))
                globalxmin = minXmin;
        }

        int* pgprocnos = arrayP->pgprocnos;
        int numProcs;

        /*
         * Spin over procArray checking xid, xmin, and subxids. The goal is
         * to gather all active xids, find the lowest xmin, and try to record
         * subxids. Also need include myself.
         */
        numProcs = arrayP->numProcs;
        for (index = 0; index < numProcs; index++) {
            int pgprocno = pgprocnos[index];
            volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
            TransactionId xid = InvalidTransactionId;

            /*
             * Backend is doing logical decoding which manages xmin
             * separately, check below.
             */
            if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING)
                continue;

            /* Update globalxmin to be the smallest valid xmin, only ignore procs running LAZY VACUUM */
            if (!(pgxact->vacuumFlags & PROC_IN_VACUUM)) {
                xid = pgxact->xmin; /* fetch just once */
            }
            if (TransactionIdIsNormal(xid) && TransactionIdPrecedes(xid, globalxmin))
                globalxmin = xid;

            /* Fetch xid just once - see GetNewTransactionId */
            xid = pgxact->xid;

            /* If no XID assigned, use xid passed down from CN */
            if (!TransactionIdIsNormal(xid))
                xid = pgxact->next_xid;

            /*
             * If the transaction has no XID assigned, we can skip it; it
             * won't have sub-XIDs either. If the XID is >= xmax, we can also
             * skip it; such transactions will be treated as running anyway
             * (and any sub-XIDs will also be >= xmax).
             */
            if (!TransactionIdIsNormal(xid) || !TransactionIdPrecedes(xid, xmax))
                continue;

            /*
             * We don't include our own XIDs (if any) in the snapshot, but we
             * must include them in xmin.
             * Not true any more in this function.
             */
            if (TransactionIdPrecedes(xid, xmin))
                xmin = xid;
        }

        /*
         * Update globalxmin to include actual process xids. This is a slightly
         * different way of computing it than GetOldestXmin uses, but should give
         * the same result. GTM-FREE-MODE always use recentLocalXmin.
         */
        if (TransactionIdPrecedes(xmin, globalxmin))
            globalxmin = xmin;

        if (ENABLE_DMS && SS_PRIMARY_MODE) {
            /* shared-storage primary: merge our oldest xmin with the cluster-wide minimum */
            SSUpdateNodeOldestXmin(SS_MY_INST_ID, globalxmin);
            globalxmin = SSGetGlobalOldestXmin(globalxmin);
            if (ENABLE_SS_BCAST_SNAPSHOT) {
                SSSendLatestSnapshotToStandby(xmin, xmax, t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo);
            }
        }

        t_thrd.xact_cxt.ShmemVariableCache->xmin = xmin;
        t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin = globalxmin;
        if (GTM_FREE_MODE) {
            t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin = globalxmin;
        }
    } else if (ENABLE_SS_BCAST_SNAPSHOT && SS_PRIMARY_MODE) {
        /* xmin recalculation deferred: broadcast the previously computed xmin */
        SSSendLatestSnapshotToStandby(t_thrd.xact_cxt.ShmemVariableCache->xmin, xmax,
            t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo);
    }

    if (GTM_LITE_MODE) {
        currentTimeStamp = GetCurrentTimestamp();
        if (forceCalc) { /* means first time here; */
            init_shmem_csn_cleanup_instr();
            forward_recent_global_xmin();
        }
    }

    /* fill the new snapshot version ... */
    snapxid->xmin = t_thrd.xact_cxt.ShmemVariableCache->xmin;
    snapxid->xmax = xmax;
    snapxid->localxmin = t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin;
    snapxid->snapshotcsn = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
    snapxid->takenDuringRecovery = RecoveryInProgress();
    /* ... and make its contents visible before advancing the current pointer */
    pg_write_barrier();
    ereport(DEBUG1, (errmsg("Generated snapshot in ring buffer slot %lu\n", SNAPXID_INDEX(snapxid))));
    SetNextSnapXid();
}
/*
 * Return the minimal xmin in all the valid snapshot versions.
 */
static TransactionId GetMultiSnapshotOldestXmin()
{
    const snapxid_t* currentSnap = (snapxid_t*)g_snap_current;
    return currentSnap->localxmin;
}
/*
 * Clear the cached snapshot xmin of the given backend.
 */
void ProcArrayResetXmin(PGPROC* proc)
{
    /*
     * No locking needed: an Xid store is assumed to be atomic, so readers
     * never observe a torn value.
     */
    g_instance.proc_base_all_xacts[proc->pgprocno].xmin = InvalidTransactionId;
}
/* return global csn from GTM (as recorded on the current backend's PGPROC) */
CommitSeqNo GetCommitCsn()
{
    const CommitSeqNo commitCsn = t_thrd.proc->commitCSN;
    return commitCsn;
}
/* Record the commit CSN on the current backend's PGPROC (read back via GetCommitCsn). */
void setCommitCsn(uint64 commit_csn)
{
    t_thrd.proc->commitCSN = commit_csn;
}
/**
 * @Description: Look up the top-level parent xid of a given subtransaction
 * xid by scanning the sub-xid caches of all live backends.
 *
 * @in xid - the sub transaction id
 * @return - InvalidTransactionId if no running backend has the sub xid
 * cached, otherwise the owning backend's top-level xid.
 */
TransactionId SubTransGetTopParentXidFromProcs(TransactionId xid)
{
    ProcArrayStruct* procArray = g_instance.proc_array_idx;
    TransactionId result = InvalidTransactionId;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    for (int idx = 0; idx < procArray->numProcs; idx++) {
        int procno = procArray->pgprocnos[idx];
        volatile PGPROC* proc = g_instance.proc_base_all_procs[procno];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[procno];
        /* Fetch the top-level xid just once - see GetNewTransactionId */
        TransactionId parentXid = pgxact->xid;

        if (pgxact->nxids <= 0) {
            continue;
        }

        /* The subxids array is protected by the per-proc subxidsLock. */
        LWLockAcquire(proc->subxidsLock, LW_SHARED);
        bool found = false;
        for (int sub = 0; sub < pgxact->nxids; sub++) {
            if (TransactionIdEquals(proc->subxids.xids[sub], xid)) {
                found = true;
                break;
            }
        }
        LWLockRelease(proc->subxidsLock);

        if (found) {
            result = parentXid;
            break;
        }
    }
    LWLockRelease(ProcArrayLock);
    return result;
}
/*
 * pgxc_gtm_snapshot_status - SQL-callable stub for the GTM snapshot status
 * view. Both build variants unconditionally raise FEATURE_NOT_SUPPORTED
 * before returning any rows; the SRF_RETURN_DONE after ereport(ERROR) is
 * unreachable and only satisfies the SRF calling convention.
 */
Datum pgxc_gtm_snapshot_status(PG_FUNCTION_ARGS)
{
#ifndef ENABLE_MULTIPLE_NODES
    FuncCallContext* funcctx = NULL;
    ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unsupported view in single node mode.")));
    SRF_RETURN_DONE(funcctx);
#else
    FuncCallContext* funcctx = NULL;
    /* check gtm mode, only gtm support this function */
    ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
        errmsg("unsupported function or view in %s mode.", GTM_LITE_MODE ? "GTM-Lite" : "GTM-Free")));
    SRF_RETURN_DONE(funcctx);
#endif
}
/*
 * @Description: check whether csn is valid, if valid, set it to pgxact.
 *
 * Maintenance-mode sessions, autovacuum workers and autovacuum-type
 * snapshots bypass all checks and keep their csn unchanged.
 *
 * @in func - name of the calling function, used in the error detail
 * @in csn_min - the csn to check
 * @in gtm_snapshot_type - the type of snapshot
 * @in from - where the snapshot came from
 *
 * @return - return csn_min set in pgxact,
 * or InvalidCommitSeqNo if csn_min is invalid and from the local multi snapshot
 */
CommitSeqNo
set_proc_csn_and_check(const char* func, CommitSeqNo csn_min, GTM_SnapshotType gtm_snapshot_type, SnapshotSource from)
{
    if (u_sess->attr.attr_common.xc_maintenance_mode || u_sess->utils_cxt.cn_xc_maintain_mode ||
        IsAutoVacuumWorkerProcess() || gtm_snapshot_type == GTM_SNAPSHOT_TYPE_AUTOVACUUM) {
        return csn_min;
    }
    /* a csn that never reached COMMITTED state is unusable as a snapshot bound */
    if (!COMMITSEQNO_IS_COMMITTED(csn_min))
        ereport(ERROR, (errcode(ERRCODE_SNAPSHOT_INVALID),
            errmsg("Snapshot is invalid, snaphot type %s, snapshot csn: %lu.",
                transfer_snapshot_type(gtm_snapshot_type), csn_min)));
    LWLockAcquire(CsnMinLock, LW_SHARED);
    /* make sure the received csn from gtm is not small than local_csn_min */
    CommitSeqNo local_csn_min = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->local_csn_min);
    if (from == SNAPSHOT_DIRECT && csn_min < local_csn_min) {
        csn_min = local_csn_min;
    }
    CommitSeqNo cutoff_csn_min = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min);
    if (csn_min < cutoff_csn_min) {
        /* a too-old snapshot from a datanode is reported back to the caller, not raised */
        if (from == SNAPSHOT_DATANODE) {
            LWLockRelease(CsnMinLock);
            return InvalidCommitSeqNo;
        }
        /* release the lock explicitly before raising */
        LWLockRelease(CsnMinLock);
        ereport(ERROR,
            (errcode(ERRCODE_SNAPSHOT_INVALID),
                errmsg("Snapshot is invalid, this is a safe error, snapshot too old."),
                errdetail("Snaphot type %s csn %lu is lower than cutoff_csn_min %lu in %s.",
                    transfer_snapshot_type(gtm_snapshot_type), csn_min, cutoff_csn_min, func),
                errhint("This is a safe error report, will not impact "
                        "data consistency, retry your query if needed.")));
    } else {
        ereport(DEBUG1, (errmsg("try to set my proc csn from %lu to %lu.",
            t_thrd.pgxact->csn_min, csn_min)));
    }
    /* publish the (possibly clamped) csn_min while still holding CsnMinLock */
    t_thrd.pgxact->csn_min = csn_min;
    LWLockRelease(CsnMinLock);
    return t_thrd.pgxact->csn_min;
}
/*
 * get_gtm_lite_status - SQL-callable SRF returning the GTM-Lite status
 * (backup_xid, csn) as exactly one row.
 *
 * Unsupported in single-node builds and in GTM-Free mode; both cases raise
 * ERRCODE_FEATURE_NOT_SUPPORTED.
 */
Datum get_gtm_lite_status(PG_FUNCTION_ARGS)
{
#ifndef ENABLE_MULTIPLE_NODES
    FuncCallContext* funcctx = NULL;
    ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unsupported view in single node mode.")));
    SRF_RETURN_DONE(funcctx);
#else
#define GTM_LITE_STATUS_ATTRS 2
    /* check gtm mode, gtm-free unsupport this function */
    if (GTM_FREE_MODE) {
        ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
            errmsg("unsupported function or view in GTM-FREE mode.")));
    }
    FuncCallContext* funcctx = NULL;
    GTMLite_Status gtm_status = NULL;
    if (SRF_IS_FIRSTCALL()) {
        /* first call: build the two-attribute tuple descriptor in the SRF's memory context */
        MemoryContext oldcontext;
        TupleDesc tupdesc;
        funcctx = SRF_FIRSTCALL_INIT();
        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
        tupdesc = CreateTemplateTupleDesc(GTM_LITE_STATUS_ATTRS, false);
        TupleDescInitEntry(tupdesc, (AttrNumber)1, "backup_xid", XIDOID, -1, 0);
        TupleDescInitEntry(tupdesc, (AttrNumber)2, "csn", XIDOID, -1, 0);
        funcctx->tuple_desc = BlessTupleDesc(tupdesc);
        funcctx->max_calls = 1; /* exactly one result row */
        MemoryContextSwitchTo(oldcontext);
    }
    /* stuff done on every call of the function */
    funcctx = SRF_PERCALL_SETUP();
    if (funcctx->call_cntr < funcctx->max_calls) {
        Datum values[GTM_LITE_STATUS_ATTRS];
        bool nulls[GTM_LITE_STATUS_ATTRS];
        HeapTuple tuple;
        errno_t rc = 0;
        rc = memset_s(values, sizeof(values), 0, sizeof(values));
        securec_check_c(rc, "\0", "\0");
        rc = memset_s(nulls, sizeof(nulls), 0, sizeof(nulls));
        securec_check_c(rc, "\0", "\0");
        /* query the GTM; NULL means it is down or failing over */
        gtm_status = GetGTMLiteStatus();
        if (!gtm_status) {
            ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE),
                errmsg("GTM error, could not obtain snapshot_status, please check GTM is running or failovering.")));
        }
        values[0] = TransactionIdGetDatum(gtm_status->backup_xid);
        values[1] = TransactionIdGetDatum(gtm_status->csn);
        tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls);
        SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple));
    } else {
        SRF_RETURN_DONE(funcctx);
    }
#endif
}
/* Map a GTM_SnapshotType value to a human-readable name for log/error text. */
const char* transfer_snapshot_type(GTM_SnapshotType gtm_snap_type)
{
    switch (gtm_snap_type) {
        case GTM_SNAPSHOT_TYPE_UNDEFINED:
            return "UNDEFINED";
        case GTM_SNAPSHOT_TYPE_LOCAL:
            return "LOCAL";
        case GTM_SNAPSHOT_TYPE_GLOBAL:
            return "GLOBAL";
        case GTM_SNAPSHOT_TYPE_AUTOVACUUM:
            return "AUTOVACUUM";
        default:
            return "UnKnown";
    }
}
/*
 * search all active backend to get oldest frozenxid
 * for global temp table.
 *
 * Returns the oldest gtt_session_frozenxid among backends attached to the
 * current database, or InvalidTransactionId if there is none (or global temp
 * tables are disabled, or we are in recovery / read-only shared-storage mode).
 *
 * When maxSize > 0, also fills pids[]/xids[] with every backend that holds a
 * gtt frozenxid and stores the count in *n. *n is initialized to 0 before any
 * early return so callers never read an indeterminate count.
 */
TransactionId ListAllThreadGttFrozenxids(int maxSize, ThreadId *pids, TransactionId *xids, int *n)
{
    ProcArrayStruct *arrayP = g_instance.proc_array_idx;
    TransactionId result = InvalidTransactionId;
    int index;
    int flags = 0;
    int i = 0;

    if (maxSize > 0) {
        Assert(pids);
        Assert(xids);
        Assert(n);
        /* initialize the output count up front, before any early return */
        *n = 0;
    }
    /* global temp tables disabled: nothing to report */
    if (g_instance.attr.attr_storage.max_active_gtt <= 0)
        return InvalidTransactionId;
    if (RecoveryInProgress() || SSIsServerModeReadOnly())
        return InvalidTransactionId;

    /* skip autovacuum workers and logical decoding backends */
    flags |= PROC_IS_AUTOVACUUM;
    flags |= PROC_IN_LOGICAL_DECODING;

    LWLockAcquire(ProcArrayLock, LW_SHARED);
    if (maxSize > 0 && maxSize < arrayP->numProcs) {
        LWLockRelease(ProcArrayLock);
        elog(ERROR, "pids, xids array size is not enough for list all gtt frozenxids.");
    }
    for (index = 0; index < arrayP->numProcs; index++) {
        int pgprocno = arrayP->pgprocnos[index];
        volatile PGPROC *proc = g_instance.proc_base_all_procs[pgprocno];
        volatile PGXACT *pgxact = &g_instance.proc_base_all_xacts[pgprocno];

        if (pgxact->vacuumFlags & flags)
            continue;
        if (proc->databaseId == u_sess->proc_cxt.MyDatabaseId &&
            TransactionIdIsNormal(proc->gtt_session_frozenxid)) {
            /* track the minimum (oldest) frozenxid seen so far */
            if (result == InvalidTransactionId ||
                TransactionIdPrecedes(proc->gtt_session_frozenxid, result))
                result = proc->gtt_session_frozenxid;
            if (maxSize > 0) {
                pids[i] = proc->pid;
                xids[i] = proc->gtt_session_frozenxid;
                i++;
            }
        }
    }
    LWLockRelease(ProcArrayLock);

    if (maxSize > 0) {
        *n = i;
    }
    return result;
}
/*
 * Compute the minimum committed csn_min across all live backends, starting
 * from the current nextCommitSeqNo.
 */
CommitSeqNo
calculate_local_csn_min()
{
    /* acquire procarray and csnmin lock, and there is no deadlock */
    LWLockAcquire(ProcArrayLock, LW_SHARED);
    ProcArrayStruct *procArray = g_instance.proc_array_idx;
    CommitSeqNo result = t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo;
    LWLockAcquire(CsnMinLock, LW_EXCLUSIVE);
    for (int i = 0; i < procArray->numProcs; i++) {
        int procno = procArray->pgprocnos[i];
        volatile PGXACT *pgxact = &g_instance.proc_base_all_xacts[procno];
        /*
         * Ignore procs doing logical decoding (which manage xmin separately)
         * and procs running LAZY VACUUM.
         */
        if (pgxact->vacuumFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) {
            continue;
        }
        CommitSeqNo procCsn = pgxact->csn_min; /* fetch the csn min just once */
        if (COMMITSEQNO_IS_COMMITTED(procCsn) && procCsn < result) {
            result = procCsn;
        }
    }
    LWLockRelease(CsnMinLock);
    LWLockRelease(ProcArrayLock);
    return result;
}
/*
 * Updates the maximum value for CSN read from XLog.
 */
void UpdateXLogMaxCSN(CommitSeqNo xlogCSN)
{
    /* cheap unlocked pre-check; the condition is re-verified under the lock */
    if (xlogCSN <= t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN) {
        return;
    }
    LWLockAcquire(XLogMaxCSNLock, LW_EXCLUSIVE);
    if (t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN < xlogCSN) {
        t_thrd.xact_cxt.ShmemVariableCache->xlogMaxCSN = xlogCSN;
    }
    LWLockRelease(XLogMaxCSNLock);
}
/*
 * Get the current oldest xmin among active backends' snapshots. *globalProcXmin
 * is written only when at least one backend holds a normal xmin (there may be
 * no transaction, or none that has published an xmin yet).
 */
void GetOldestGlobalProcXmin(TransactionId *globalProcXmin)
{
    TransactionId globalxmin = MaxTransactionId;
    ProcArrayStruct *arrayP = g_instance.proc_array_idx;

    (void)LWLockAcquire(ProcArrayLock, LW_SHARED);
    /*
     * Read numProcs/pgprocnos only while holding ProcArrayLock: they change
     * as backends attach and detach (previously read unlocked — a race).
     */
    int *pgprocnos = arrayP->pgprocnos;
    int numProcs = arrayP->numProcs;
    for (int index = 0; index < numProcs; index++) {
        int pgprocno = pgprocnos[index];
        volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno];
        TransactionId xid;

        /* a LAZY VACUUM's xmin must not hold back the global xmin */
        if (pgxact->vacuumFlags & PROC_IN_VACUUM)
            continue;
        xid = pgxact->xmin; /* fetch just once */
        if (TransactionIdIsNormal(xid) && TransactionIdPrecedesOrEquals(xid, globalxmin)) {
            globalxmin = xid;
            *globalProcXmin = globalxmin;
        }
    }
    LWLockRelease(ProcArrayLock);
}