Files
openGauss-server/src/gausskernel/storage/replication/bcm.cpp
2021-09-23 15:19:37 +08:00

1454 lines
52 KiB
C++

/* -------------------------------------------------------------------------
* Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd.
* Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* bcm.cpp
* bcm map for tracking modify of heap blocks
*
*
* IDENTIFICATION
* src/gausskernel/storage/replication/bcm.cpp
*
* INTERFACE ROUTINES
* createBCMFile - create BCM file with a init file header
* BCM_pin - pin a map page for setting a bit
* BCMSetStatusBit - set Status in a previously pinned page
* BCMTestStatusBit - test if a bit is set
* BCMCountStatusBits - fast count number of bits set in BCM map
* BCMTruncateFile - truncate the BCM map
* BCMClearRel - clear all the BCM bis of a rel
* getBcmFileList -
*
* NOTES
*
* The bcm map is a bitmap with two bits(one for sync and another for backup)
* per heap block. A set bit means that block is modified and has not sync to
* the standby,if a bit is not set, it means the block has sync to standby and
* no need to be sync.
*
* Clearing a bcm map bit is not separately WAL-logged.
*
* When a bit is set, the LSN of the bcm map page is updated to make
* sure that the bcm map update doesn't get written to disk before the
* WAL record of the changes that made it possible to set the bit is flushed.
* But when a bit is cleared, we don't have to do that because it's always
* safe to clear a bit in the map from correctness point of view.
*
* -------------------------------------------------------------------------
*/
#include "postgres.h"
#include "knl/knl_variable.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "access/xlogutils.h"
#include "access/visibilitymap.h"
#include "catalog/catalog.h"
#include "catalog/pg_database.h"
#include "catalog/pg_tablespace.h"
#include "miscadmin.h"
#include "storage/buf/bufmgr.h"
#include "storage/lmgr.h"
#include "storage/smgr/smgr.h"
#include "storage/smgr/fd.h"
#include "storage/cu.h"
#include "utils/inval.h"
#include "postmaster/alarmchecker.h"
#include "replication/bcm.h"
#include "replication/basebackup.h"
#include "replication/catchup.h"
#include "replication/dataprotocol.h"
#include "replication/dataqueue.h"
#include "replication/datasender.h"
#include "replication/datasender_private.h"
#include "replication/walsender.h"
#include "utils/aiomem.h"
#include "utils/memutils.h"
#include "storage/custorage.h"
#include "storage/ipc.h"
#include "commands/tablespace.h"
/*
* Table for fast counting of set bits, by now only for sync
* FUTURE CASE:: for backup(the second bit)
*
* Define bcm postfix
*/
#define BCM "_bcm"
/* prototypes for internal routines */
static Buffer BCM_readbuf(Relation rel, BlockNumber blkno, bool extend, int col = 0);
static void BCM_extend(Relation rel, BlockNumber nvmblocks, int col = 0);
static void searchBCMFiles(const char *tableSpacePath, const char *relativepath, bool undertablespace, bool clear,
int iterations);
static void GetIncrementalBcmFilePathForDefault(const RelFileNodeKey &data, char *path, int length);
static void GetIncrementalBcmFilePathForCustome(const RelFileNodeKey &data, char *path, int length);
static void HandleBCMfile(char *bcmpath, bool clear);
static void BCMClearFile(const RelFileNode &relfilenode, int col = 0);
static void BCMSendData(const RelFileNode &relfilenode, const char *bcmpath, int col = 0);
static void bcm_read_multi_cu(CUFile *cFile, Relation rel, int col, BlockNumber heapBlock, int &contibits,
BlockNumber maxHeapBlock);
static void BCMSetMetaBit(Relation rel, BlockNumber block, BCMBitStatus status, int col = 0);
static void BCMClearMetaBit(Relation rel, int col = 0);
static void BCMResetMetaBit(Relation rel, BlockNumber metablk, int col = 0);
static void BCMWalkMetaBuffer(Relation rel, CUFile *cFile, Buffer metabuffer, BlockNumber &heapBlock, int &contibits,
BlockNumber maxHeapBlock, int col = 0);
static void BCMSendOneBuffer(Relation rel, CUFile *cFile, Buffer bcmbuffer, BlockNumber &heapBlock, int &contibits,
BlockNumber maxHeapBlock, int col = 0);
static BlockNumber BCMGetDataFileMaxSize(Relation rel, int col);
static bool CheckFilePostfix(const char *str1, const char *str2);
// Verify tablespace size limitation before a BCM file grows by 'nblocks' pages.
static inline void VerifyTblspcWhenBcmExtend(Relation rel, int col, int nblocks)
{
    Assert(nblocks > 0);
    /* account the space we are about to consume against the tablespace limit */
    STORAGE_SPACE_OPERATION(rel, (uint64)BLCKSZ * nblocks);
    /* re-open the smgr handle in case a cache flush closed it */
    if (col <= 0) {
        RelationOpenSmgr(rel);
    } else {
        CStoreRelationOpenSmgr(rel, col);
    }
}
/* Create a bcm file whose block 0 holds an initialized BCM file header. */
void createBCMFile(Relation rel, int col)
{
    Page pageBuf;

    ADIO_RUN()
    {
        pageBuf = (Page)adio_align_alloc(BLCKSZ);
    }
    ADIO_ELSE()
    {
        pageBuf = (Page)palloc(BLCKSZ);
    }
    ADIO_END();

    PageInit(pageBuf, BLCKSZ, 0);

    /* fill in the header that lives in the page's content area */
    BCMHeader *header = (BCMHeader *)PageGetContents(pageBuf);
    /* FUTURE CASE:: for COLUMN_STORE, only support ROW_STORE by now. */
    header->type = (col > 0) ? COLUMN_STORE : ROW_STORE;
    header->node.spcNode = rel->rd_node.spcNode;
    header->node.dbNode = rel->rd_node.dbNode;
    header->node.relNode = rel->rd_node.relNode;
    header->node.bucketNode = rel->rd_node.bucketNode;
    /* default size for ROW_STORE, CU align size for a column */
    header->blockSize = (col > 0) ? CUAlignUtils::GetCuAlignSizeColumnId(col) : BLCKSZ;

    ForkNumber forknum = (col > 0) ? ColumnId2ColForkNum(col) : BCM_FORKNUM;

    smgrcreate(rel->rd_smgr, forknum, false);
    VerifyTblspcWhenBcmExtend(rel, col, 1);
    PageSetChecksumInplace(pageBuf, 0);
    /* write the header page as block 0 of the fork */
    smgrextend(rel->rd_smgr, forknum, 0, (char *)pageBuf, false);

    ADIO_RUN()
    {
        adio_align_free(pageBuf);
    }
    ADIO_ELSE()
    {
        pfree(pageBuf);
        pageBuf = NULL;
    }
    ADIO_END();
}
/*
 * WAL-log a bcm change for one CU and stamp the covering bcm page with the
 * record's LSN. Skipped entirely for unlogged relations or during recovery.
 */
void BCMLogCU(Relation rel, uint64 offset, int col, BCMBitStatus status, int count)
{
    /* no WAL needed: unlogged relation or replaying existing WAL */
    if (!RelationNeedsWAL(rel) || t_thrd.xlog_cxt.InRecovery)
        return;

    Buffer bcmbuffer = InvalidBuffer;
    BCM_CStore_pin(rel, col, offset, &bcmbuffer);
    LockBuffer(bcmbuffer, BUFFER_LOCK_EXCLUSIVE);
    Page page = BufferGetPage(bcmbuffer);

    START_CRIT_SECTION();
    {
        uint64 align_size = (uint64)(uint32)CUAlignUtils::GetCuAlignSizeColumnId(col);
        uint64 cuBlock = cstore_offset_to_cstoreblock(offset, align_size);
        XLogRecPtr recptr = log_cu_bcm(&(rel->rd_node), col, cuBlock, status, count);
        /* ensure the bcm page cannot hit disk ahead of its WAL record */
        PageSetLSN(page, recptr);
    }
    END_CRIT_SECTION();

    LockBuffer(bcmbuffer, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(bcmbuffer);
}
/*
 * Set the meta page sync bit where the bcm block refers to.
 * Here, we use two sync bits(sync bit 0 and sync bit 1) to represent
 * the sync status of a bcm page. When we set sync bit 1 to unsynced, it
 * means the bcm page may have unsynced heap blocks. sync. When we set
 * sync bit 0 to unsynced during catchup, it means that the bcm page status
 * bit in meta page should not be reset after catchup which can be perceived
 * by the next catchup.
 * Note: call the function should first hold BUFFER_LOCK_EXCLUSIVE lock
 */
static void BCMSetMetaBit(Relation rel, BlockNumber block, BCMBitStatus status, int col)
{
    /* locate the meta page / byte / bit slot that tracks bcm block 'block' */
    BlockNumber metablock = BCMBLK_TO_METABLOCK(block);
    int metaByte = BCMBLK_TO_METABYTE(block);
    int metaBit = BCMBLK_TO_METABIT(block);
    uint32 bshift = (uint32)metaBit * META_BITS_PER_BLOCK;
    Buffer metabuffer = InvalidBuffer;
    BCMBitStatus pageStatus0 = 0;
    BCMBitStatus pageStatus1 = 0;
    Page page;
    unsigned char *map = NULL;
    Assert(status == SYNCED || status == NOTSYNCED);
    /* extend=false: the meta page is expected to exist already */
    metabuffer = BCM_readbuf(rel, metablock, false, col);
    Assert(BufferIsValid(metabuffer));
    LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
    page = BufferGetPage(metabuffer);
    map = (unsigned char *)PageGetContents(page);
    /* get sync bit 0 & 1 status (the >>3 / >>1 shifts normalize the masked
     * bits down to a 0/1 BCMBitStatus value) */
    pageStatus0 = ((map[metaByte] >> bshift) & META_SYNC0_BITMASK) >> 3;
    Assert(pageStatus0 == SYNCED || pageStatus0 == NOTSYNCED);
    pageStatus1 = ((map[metaByte] >> bshift) & META_SYNC1_BITMASK) >> 1;
    Assert(pageStatus1 == SYNCED || pageStatus1 == NOTSYNCED);
    /* set sync bit status, only touching (and dirtying) the page when at
     * least one of the two bits actually differs from the target status */
    if (status != pageStatus0 || status != pageStatus1) {
        START_CRIT_SECTION();
        if (status != pageStatus0)
            SET_SYNC0_BYTE_STATUS(map[metaByte], status, bshift);
        if (status != pageStatus1)
            SET_SYNC1_BYTE_STATUS(map[metaByte], status, bshift);
        MarkBufferDirty(metabuffer);
        END_CRIT_SECTION();
    }
    UnlockReleaseBuffer(metabuffer);
}
/*
 * Clear all bcm page sync status bit 0 in meta pages before catchup.
 *
 * Meta pages are laid out every (META_BLOCKS_PER_PAGE + 1) blocks starting
 * at block 1 (block 0 is the BCM file header); we walk them all and flip
 * every NOTSYNCED sync bit 0 back to SYNCED.
 */
static void BCMClearMetaBit(Relation rel, int col)
{
    BlockNumber metablock = 1;
    Buffer metabuffer = InvalidBuffer;
    Page page;
    unsigned char *map = NULL;
    uint32 bshift = 0;
    BCMBitStatus pageStatus0 = 0;
    int i = 0;
    int j = 0;
    bool dirty = false;
    metabuffer = BCM_readbuf(rel, metablock, false, col);
    if (!BufferIsValid(metabuffer))
        return; /* nothing to do */
    do {
        /*
         * Reset the dirty flag for every meta page. Without this, a page
         * dirtied earlier in the scan forces MarkBufferDirty() on all later
         * pages even when they were not modified, causing needless writes.
         */
        dirty = false;
        LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
        page = BufferGetPage(metabuffer);
        map = (unsigned char *)PageGetContents(page);
        ereport(DEBUG1, (errmsg("relation %u/%u/%u col %d try to clear meta block %u", rel->rd_node.spcNode,
                                rel->rd_node.dbNode, rel->rd_node.relNode, col, metablock)));
        /* clear sync bit 0 status */
        START_CRIT_SECTION();
        for (i = 0; i < (int)BCMMAPSIZE; i++) {
            for (j = 0; j < META_BLOCKS_PER_BYTE; j++) {
                bshift = (uint32)j * META_BITS_PER_BLOCK;
                pageStatus0 = ((map[i] >> bshift) & META_SYNC0_BITMASK) >> 3;
                Assert(pageStatus0 == SYNCED || pageStatus0 == NOTSYNCED);
                if (pageStatus0 == NOTSYNCED) {
                    SET_SYNC0_BYTE_STATUS(map[i], SYNCED, bshift);
                    dirty = true;
                }
            }
        }
        if (dirty)
            MarkBufferDirty(metabuffer);
        END_CRIT_SECTION();
        UnlockReleaseBuffer(metabuffer);
        /* calculate the next meta page, then clear again. */
        metablock += META_BLOCKS_PER_PAGE + 1;
        metabuffer = BCM_readbuf(rel, metablock, false, col);
    } while (BufferIsValid(metabuffer));
    return;
}
/*
 * Reset the bcm page sync status bit 1 in meta pages after catchup.
 * Notes: we skip those bcm pages which are recently marked as unsynced
 * by checking the sync status bit 0.
 */
static void BCMResetMetaBit(Relation rel, BlockNumber metablk, int col)
{
    BlockNumber metablock = 1;
    Buffer metabuffer = InvalidBuffer;
    Page page;
    unsigned char *map = NULL;
    uint32 bshift = 0;
    BCMBitStatus pageStatus0 = 0;
    BCMBitStatus pageStatus1 = 0;
    int i = 0;
    int j = 0;
    bool dirty = false;
    /* meta pages live every (META_BLOCKS_PER_PAGE + 1) blocks, from block 1 */
    for (metablock = 1; metablock < metablk; metablock += (META_BLOCKS_PER_PAGE + 1)) {
        /*
         * Reset the dirty flag per meta page; otherwise one modified page
         * would force MarkBufferDirty() on every later page of the scan even
         * when nothing in it changed.
         */
        dirty = false;
        metabuffer = BCM_readbuf(rel, metablock, false, col);
        if (!BufferIsValid(metabuffer))
            ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
                            errmsg("%u/%u/%u invalid bcm meta buffer %u", rel->rd_node.spcNode, rel->rd_node.dbNode,
                                   rel->rd_node.relNode, metablock)));
        LockBuffer(metabuffer, BUFFER_LOCK_EXCLUSIVE);
        page = BufferGetPage(metabuffer);
        map = (unsigned char *)PageGetContents(page);
        ereport(DEBUG1, (errmsg("relation %u/%u/%u col %d try to reset meta block %u", rel->rd_node.spcNode,
                                rel->rd_node.dbNode, rel->rd_node.relNode, col, metablock)));
        /*
         * Clear the latest set sync bit 1 status; if status bit 0 has been set
         * to NOTSYNCED since the last meta clear, we skip that BCM block so the
         * next catchup can still perceive it.
         */
        START_CRIT_SECTION();
        for (i = 0; i < (int)BCMMAPSIZE; i++) {
            for (j = 0; j < META_BLOCKS_PER_BYTE; j++) {
                bshift = (uint32)j * META_BITS_PER_BLOCK;
                pageStatus0 = ((map[i] >> bshift) & META_SYNC0_BITMASK) >> 3;
                Assert(SYNCED == pageStatus0 || NOTSYNCED == pageStatus0);
                pageStatus1 = ((map[i] >> bshift) & META_SYNC1_BITMASK) >> 1;
                Assert(SYNCED == pageStatus1 || NOTSYNCED == pageStatus1);
                if (SYNCED == pageStatus0 && NOTSYNCED == pageStatus1) {
                    SET_SYNC1_BYTE_STATUS(map[i], SYNCED, bshift);
                    dirty = true;
                }
            }
        }
        if (dirty)
            MarkBufferDirty(metabuffer);
        END_CRIT_SECTION();
        UnlockReleaseBuffer(metabuffer);
    }
    return;
}
/*
 * Set the corresponding bit of the heap block as status, before call
 * this function we should call BCM_pin to get the right bcmbuffer.
 */
void BCMSetStatusBit(Relation rel, uint64 heapBlk, Buffer buf, BCMBitStatus status, int col)
{
    /* locate the bcm page / byte / bit that tracks this heap block */
    BlockNumber mapBlock = HEAPBLK_TO_BCMBLOCK(heapBlk);
    int mapByte = HEAPBLK_TO_BCMBYTE(heapBlk);
    int mapBit = HEAPBLK_TO_BCMBIT(heapBlk);
    uint32 bshift = (uint32)mapBit * BCM_BITS_PER_BLOCK;
    BCMBitStatus bcmStatus = 0;
    bool needwal = false;
    Page page;
    unsigned char *map = NULL;
#ifdef TRACE_BCMMAP
    elog(LOG, "BCMSetStatusBit: rel: %s col: %d blk: %lu status: %d ", RelationGetRelationName(rel), col, heapBlk,
         status);
#endif
    /* the caller must have pinned exactly the bcm page that covers heapBlk */
    if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED), errmsg("wrong buffer passed to BCM_clear, BlockNumber from buf is %u,"
                                                         "mapBlock is %u",
                                                         BufferGetBlockNumber(buf), mapBlock)));
    Assert(status == SYNCED || status == NOTSYNCED);
    /* first record in the meta page that this bcm page may hold unsynced blocks */
    if (status == NOTSYNCED)
        BCMSetMetaBit(rel, mapBlock, NOTSYNCED, col);
    page = BufferGetPage(buf);
    map = (unsigned char *)PageGetContents(page);
    /* read the current sync bit; the >>1 normalizes it to a 0/1 status */
    bcmStatus = (map[mapByte] >> bshift) & BCM_SYNC_BITMASK;
    bcmStatus = bcmStatus >> 1;
    Assert(bcmStatus == SYNCED || bcmStatus == NOTSYNCED);
    /* Bcm status must be 0 before it will be set to 1 */
    if (!RecoveryInProgress() && status == NOTSYNCED && bcmStatus == NOTSYNCED)
        ereport(WARNING, (errmsg("BCM page maybe damage, rnode[%u,%u,%u] col:%d block:%lu ", rel->rd_node.spcNode,
                                 rel->rd_node.dbNode, rel->rd_node.relNode, col, heapBlk)));
    needwal = (RelationNeedsWAL(rel) && !t_thrd.xlog_cxt.InRecovery);
    if (status != bcmStatus) {
        START_CRIT_SECTION();
        /* set status */
        SET_SYNC_BYTE_STATUS(map[mapByte], status, bshift);
        MarkBufferDirty(buf);
        /*
         * we record one cu bcm xlog in BCMLogCU for column store.
         */
        bool isRowStore = (col == 0);
        if (needwal && isRowStore) {
            /* stamp the page LSN so the bcm page cannot reach disk before
             * the WAL record describing this bit change (see file header) */
            XLogRecPtr recptr = InvalidXLogRecPtr;
            recptr = log_heap_bcm(&(rel->rd_node), 0, heapBlk, status);
            PageSetLSN(page, recptr);
        }
        END_CRIT_SECTION();
    }
}
/* Clear all the bcm bits of a relation */
void BCMClearRel(Relation rel, int col)
{
    ForkNumber forknum = BCM_FORKNUM;
#ifdef TRACE_BCMMAP
    elog(LOG, "BCMClearRel %s", RelationGetRelationName(rel));
#endif
    if (col > 0) {
        forknum = ColumnId2ColForkNum(col);
        CStoreRelationOpenSmgr(rel, col);
    } else {
        RelationOpenSmgr(rel);
    }
    /*
     * If no bcm map has been created yet for this relation, there's
     * nothing to clear.
     */
    if (!smgrexists(rel->rd_smgr, forknum))
        return;
    BlockNumber totalblocks = smgrnblocks(rel->rd_smgr, forknum);
    /*
     * Block 0 is the BCM file header, so a fork with at most one block
     * contains no bitmap pages to clear.
     */
    if (totalblocks <= 1)
        return;
    /* We begin clear from page 1 not page 0 */
    for (BlockNumber blkno = 1; blkno < totalblocks; blkno++) {
        Buffer buffer = BCM_readbuf(rel, blkno, false, col);
        if (!BufferIsValid(buffer))
            ereport(ERROR,
                    (errcode(ERRCODE_DATA_CORRUPTED), errmsg("%u/%u/%u invalid bcm buffer %u", rel->rd_node.spcNode,
                                                             rel->rd_node.dbNode, rel->rd_node.relNode, blkno)));
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        unsigned char *map = (unsigned char *)PageGetContents(BufferGetPage(buffer));
        /* NB: We clear the whole page, including the dcm bits, is that ok? */
        errno_t rc = memset_s(map, BCMMAPSIZE, 0, BCMMAPSIZE);
        securec_check(rc, "", "");
        MarkBufferDirty(buffer);
        UnlockReleaseBuffer(buffer);
    }
}
/*
 * BCM_truncate - truncate the bcm map
 *
 * The caller must hold AccessExclusiveLock on the relation, to ensure that
 * other backends receive the smgr invalidation event that this function sends
 * before they access the bcm again.
 *
 * Note: bcm will be truncated to zero. Only data replication can generate bcm
 * file, and heap can not be truncated by lazy vacuum(the function of
 * lazy_truncate_heap has been disabled at data replication mode), so we need
 * not to realize the code about truncate the bcm file to nblock.
 */
void BCM_truncate(Relation rel)
{
#ifdef TRACE_BCMMAP
    ereport(DEBUG1, (errmodule(MOD_REP), errmsg("bcm_truncate %s", RelationGetRelationName(rel))));
#endif
    RelationOpenSmgr(rel);
    /* without a bcm fork there is nothing to truncate */
    if (!smgrexists(rel->rd_smgr, BCM_FORKNUM))
        return;
    /* Truncate the bcm pages, and send smgr inval message */
    smgrtruncate(rel->rd_smgr, BCM_FORKNUM, 0);
    /*
     * We might as well update the local smgr_bcm_nblocks setting. smgrtruncate
     * sent an smgr cache inval message, which will cause other backends to
     * invalidate their copy of smgr_bcm_nblocks, and this one too at the next
     * command boundary. But this ensures it isn't outright wrong until then.
     */
    int arrsize = rel->rd_smgr->smgr_bcmarry_size;
    for (int idx = 0; idx < arrsize; idx++) {
        rel->rd_smgr->smgr_bcm_nblocks[idx] = 0;
    }
}
/*
 * Read a bcm map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the bcm map file is extended.
 */
static Buffer BCM_readbuf(Relation rel, BlockNumber blkno, bool extend, int col)
{
    Buffer buf;
    ForkNumber forknum = BCM_FORKNUM;
    if (col > 0) {
        /* column store keeps one bcm fork per column */
        forknum = ColumnId2ColForkNum(col);
        /*
         * We might not have opened the relation at the smgr level yet, or we
         * might have been forced to close it by a sinval message. The code below
         * won't necessarily notice relation extension immediately when extend =
         * false, so we rely on sinval messages to ensure that our ideas about the
         * size of the map aren't too far out of date.
         */
        CStoreRelationOpenSmgr(rel, col);
    } else {
        RelationOpenSmgr(rel);
    }
    /*
     * If we haven't cached the size of the bcm map fork yet, check it
     * first. (smgr_bcm_nblocks[col] == InvalidBlockNumber means "unknown".)
     */
    if (rel->rd_smgr->smgr_bcm_nblocks[col] == InvalidBlockNumber) {
        if (smgrexists(rel->rd_smgr, forknum))
            rel->rd_smgr->smgr_bcm_nblocks[col] = smgrnblocks(rel->rd_smgr, forknum);
        else
            rel->rd_smgr->smgr_bcm_nblocks[col] = 0;
    }
    /* Handle requests beyond EOF: extend on demand or tell the caller */
    if (blkno >= rel->rd_smgr->smgr_bcm_nblocks[col]) {
        if (extend)
            BCM_extend(rel, blkno + 1, col);
        else
            return InvalidBuffer;
    }
    /*
     * Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
     * always safe to clear bits, so it's better to clear corrupt pages than
     * error out.
     */
    buf = ReadBufferExtended(rel, forknum, blkno, RBM_ZERO_ON_ERROR, NULL);
    if (PageIsNew(BufferGetPage(buf)))
        PageInit(BufferGetPage(buf), BLCKSZ, 0);
    return buf;
}
/*
 * Ensure that the bcm map fork is at least bcm_nblocks long, extending
 * it if necessary with zeroed pages.
 */
static void BCM_extend(Relation rel, BlockNumber bcm_nblocks, int col)
{
    BlockNumber bcm_nblocks_now;
    Page pg;
    ForkNumber forknum = BCM_FORKNUM;
    ADIO_RUN()
    {
        pg = (Page)adio_align_alloc(BLCKSZ);
    }
    ADIO_ELSE()
    {
        pg = (Page)palloc(BLCKSZ);
    }
    ADIO_END();
    PageInit(pg, BLCKSZ, 0);
    /*
     * We use the relation extension lock to lock out other backends trying to
     * extend the bcm map at the same time. It also locks out extension
     * of the main fork, unnecessarily, but extending the bcm map
     * happens seldom enough that it doesn't seem worthwhile to have a
     * separate lock tag type for it.
     *
     * Note that another backend might have extended or created the relation
     * by the time we get the lock.
     */
    LockRelationForExtension(rel, ExclusiveLock);
    /*
     * Create the file first if it doesn't exist. If smgr_bcm_nblocks is
     * positive then it must exist, no need for an smgrexists call.
     */
    if (col > 0) {
        forknum = ColumnId2ColForkNum(col);
        CStoreRelationOpenSmgr(rel, col);
    } else {
        /* Might have to re-open if a cache flush happened */
        RelationOpenSmgr(rel);
    }
    if ((rel->rd_smgr->smgr_bcm_nblocks[col] == 0 || rel->rd_smgr->smgr_bcm_nblocks[col] == InvalidBlockNumber) &&
        !smgrexists(rel->rd_smgr, forknum)) {
        createBCMFile(rel, col);
    }
    bcm_nblocks_now = smgrnblocks(rel->rd_smgr, forknum);
    /* verify the tablespace limit once for the whole extension */
    if (bcm_nblocks_now < bcm_nblocks) {
        VerifyTblspcWhenBcmExtend(rel, col, bcm_nblocks - bcm_nblocks_now);
    }
    /* Now extend the file */
    while (bcm_nblocks_now < bcm_nblocks) {
        PageSetChecksumInplace(pg, bcm_nblocks_now);
        smgrextend(rel->rd_smgr, forknum, bcm_nblocks_now, (char *)pg, false);
        bcm_nblocks_now++;
    }
    /*
     * Send a shared-inval message to force other backends to close any smgr
     * references they may have for this rel, which we are about to change.
     * This is a useful optimization because it means that backends don't have
     * to keep checking for creation or extension of the file, which happens
     * infrequently.
     */
    CacheInvalidateSmgr(rel->rd_smgr->smgr_rnode);
    /* Update local cache with the up-to-date size */
    rel->rd_smgr->smgr_bcm_nblocks[col] = bcm_nblocks_now;
    UnlockRelationForExtension(rel, ExclusiveLock);
    /* note: the stray ';' after ADIO_RUN() is removed so the free path
     * follows the exact same macro pattern as every other ADIO use here */
    ADIO_RUN()
    {
        adio_align_free(pg);
    }
    ADIO_ELSE()
    {
        pfree(pg);
        pg = NULL;
    }
    ADIO_END();
}
/* Pin the bcm page that covers the CU at 'offset' of column 'col',
 * extending the map if the page does not exist yet. */
void BCM_CStore_pin(Relation rel, int col, uint64 offset, Buffer *buf)
{
    Assert(col > 0);
    uint64 alignSize = (uint64)(uint32)CUAlignUtils::GetCuAlignSizeColumnId(col);
    BlockNumber bcmBlock = cstore_offset_to_bcmblock(offset, alignSize);
    *buf = BCM_readbuf(rel, bcmBlock, true, col);
}
/* Pin the bcm page covering the given heap block, extending the map if needed. */
void BCM_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
    *buf = BCM_readbuf(rel, HEAPBLK_TO_BCMBLOCK(heapBlk), true);
}
/*
 * BCMSendData
 *
 * Traverse every BCM page of current relation to see if a corresponding
 * heap page or CU unit needs to send to standby. If needed, load the
 * heap page or CU unit data and push it to the send queue. We should
 * hold the relation lock to avoid been dropped during catchup.
 * In order to speed up the check efficiency, we just need to walk the
 * bcm meta buffer instead. More comments see in bcm meta buffer.
 */
static void BCMSendData(const RelFileNode &relfilenode, const char *bcmpath, int col)
{
    RelFileNode InvalidRelFileNode = { 0, 0, 0, -1 };
    Relation rel;
    Buffer metabuffer = InvalidBuffer;
    ForkNumber forknum = BCM_FORKNUM;
    BlockNumber heapBlock = InvalidBlockNumber;
    BlockNumber metanum = 1; /* first meta page; block 0 is the file header */
    BlockNumber maxHeapBlock = InvalidBlockNumber;
    struct stat stat_buf;
    volatile DataSndCtlData *datasndctl = t_thrd.datasender_cxt.DataSndCtl;
    bool isColStore = col > 0 ? true : false;
    int contibits = 0; /* run length of contiguous NOTSYNCED CUs (column store) */
    /* if disabled stream replication or relfilenode is invalid, skip current relation. */
    if (!u_sess->attr.attr_storage.enable_stream_replication ||
        (0 == memcmp(&relfilenode, &InvalidRelFileNode, sizeof(RelFileNode))))
        return;
    /*
     * Here we lock the database to solve the checkpoint failure " ERROR:checkpoint request failed
     * CONTEXT: Error message received from nodes:xxx" because of the concurrent execution of drop
     * database and catchup.Steps to reproduce:
     * 1.create database test,and create table t1 in test;
     * 2.copy data to t1(without standby)
     * 3.drop database and sleep before rm data
     * 4.start standby, catchup thread will start and send data in primary
     * 5.conitue step 3
     * 6.drop database will success
     * 7.create database or checkpoint will get the error.
     */
    LockSharedObject(DatabaseRelationId, relfilenode.dbNode, 0, RowExclusiveLock);
    rel = CreateFakeRelcacheEntry(relfilenode);
    /*
     * First lock relfilenode(Notes: relfilenode.relNode maybe differnt from oid),
     * at this time, LockRelation is equal to LockRelFileNode,
     * then read the bcm file, if it is not exit,
     * the table maybe delete, so we will return.
     *
     * ExclusiveLock will block insert, because catchup maybe read a zero block
     * after insert, it is tested on xfs file system.
     */
    LockRelFileNode(relfilenode, ExclusiveLock);
    if (isColStore) {
        forknum = ColumnId2ColForkNum(col);
        CStoreRelationOpenSmgr(rel, col);
    } else {
        RelationOpenSmgr(rel);
    }
    /*
     * BCM file is just removed, skip following check.
     * smgrexists maybe not correct(After the table is dropped), so we should use
     * stat to check it.
     */
    if (!smgrexists(rel->rd_smgr, forknum) || stat(bcmpath, &stat_buf) != 0) {
        UnlockRelFileNode(relfilenode, ExclusiveLock);
        FreeFakeRelcacheEntry(rel);
        UnlockSharedObject(DatabaseRelationId, relfilenode.dbNode, 0, RowExclusiveLock);
        return;
    }
    /* clear sync bit 0 in all meta pages so this catchup starts fresh */
    BCMClearMetaBit(rel, col);
    metabuffer = BCM_readbuf(rel, metanum, false, col);
    if (!BufferIsValid(metabuffer)) {
        /* Nothing to do, the file was already smaller */
        UnlockRelFileNode(relfilenode, ExclusiveLock);
        FreeFakeRelcacheEntry(rel);
        UnlockSharedObject(DatabaseRelationId, relfilenode.dbNode, 0, RowExclusiveLock);
        return;
    }
    /* get max size of data file */
    maxHeapBlock = BCMGetDataFileMaxSize(rel, col);
    if (maxHeapBlock == InvalidBlockNumber) {
        /* Nothing to do, the file size was zero */
        UnlockRelFileNode(relfilenode, ExclusiveLock);
        FreeFakeRelcacheEntry(rel);
        UnlockSharedObject(DatabaseRelationId, relfilenode.dbNode, 0, RowExclusiveLock);
        return;
    }
    /* column store reads CU data through a CUFile handle; row store reads buffers */
    CUFile *cFile = isColStore ? New(CurrentMemoryContext) CUFile(relfilenode, col) : NULL;
    do {
        ereport(DEBUG3, (errmsg("valid bcm meta buffer :%u", metanum)));
        BCMWalkMetaBuffer(rel, cFile, metabuffer, heapBlock, contibits, maxHeapBlock, col);
        ReleaseBuffer(metabuffer);
        /* caculate the next meta page, than check again. */
        metanum += META_BLOCKS_PER_PAGE + 1;
        metabuffer = BCM_readbuf(rel, metanum, false, col);
    } while (BufferIsValid(metabuffer));
    /*
     * For column store, after we loaded all the bcm buffers, especially when
     * the last bcm status was NOTSYNCED, we should finish the surplus work --
     * push the last contibits data to queue.
     */
    if (contibits > 0)
        bcm_read_multi_cu(cFile, rel, col, heapBlock, contibits, maxHeapBlock);
    if (cFile)
        DELETE_EX(cFile);
    /*
     * we should wait until all the pushed data has been send to the standby,
     * then clear the BCMArray.
     */
    while (DQByteLT(datasndctl->queue_offset, t_thrd.proc->waitDataSyncPoint)) {
        CatchupShutdownIfNoDataSender();
        pg_usleep(1000L); /* 1ms */
    }
    ClearBCMArray();
    /* reset sync bit 1 for all meta pages scanned (up to metanum) */
    BCMResetMetaBit(rel, metanum, col);
    UnlockRelFileNode(relfilenode, ExclusiveLock);
    FreeFakeRelcacheEntry(rel);
    UnlockSharedObject(DatabaseRelationId, relfilenode.dbNode, 0, RowExclusiveLock);
}
/*
 * BCMWalkMetaBuffer
 *
 * Walk through every bit in current meta page to find out if any corresponding
 * BCM page needs to search.
 */
static void BCMWalkMetaBuffer(Relation rel, CUFile *cFile, Buffer metabuffer, BlockNumber &heapBlock, int &contibits,
                              BlockNumber maxHeapBlock, int col)
{
    Buffer bcmbuffer = InvalidBuffer;
    BlockNumber metaBlock;
    BlockNumber bcmBlock;
    int i;
    int j;
    uint32 bshift;
    BCMBitStatus status;
    Page metapage;
    unsigned char *map = NULL;
    Assert(BufferIsValid(metabuffer));
    metaBlock = BufferGetBlockNumber(metabuffer);
    metapage = BufferGetPage(metabuffer);
    map = (unsigned char *)PageGetContents(metapage);
    /* scan every (byte, slot) pair in the meta page's bitmap area */
    for (i = 0; i < (int)BCMMAPSIZE; i++) {
        for (j = 0; j < META_BLOCKS_PER_BYTE; j++) {
            bshift = (uint32)j * META_BITS_PER_BLOCK;
            /* check sync bit 1: the "may have unsynced heap blocks" flag */
            status = ((map[i] >> bshift) & META_SYNC1_BITMASK) >> 1;
            /* the bcm block needs to sync */
            if (status == NOTSYNCED) {
                /* bail out early if no data sender remains alive */
                CatchupShutdownIfNoDataSender();
                /* get bcm page block */
                bcmBlock = GET_BCM_BLOCK(metaBlock, i, j);
                ereport(DEBUG2, (errmsg("relation %u/%u/%u col %d try to sync bcm block %u", rel->rd_node.spcNode,
                                        rel->rd_node.dbNode, rel->rd_node.relNode, col, bcmBlock)));
                /*
                 * We assume that if the bcm buffer is invalid, it means that some
                 * thread has just extended that block, and we can see it in meta page
                 * but not in the opened smgr of current relation. It's safe to skip this
                 * block 'cause we can sync it by data replication.
                 */
                bcmbuffer = BCM_readbuf(rel, bcmBlock, false, col);
                if (BufferIsValid(bcmbuffer)) {
                    BCMSendOneBuffer(rel, cFile, bcmbuffer, heapBlock, contibits, maxHeapBlock, col);
                    ReleaseBuffer(bcmbuffer);
                }
            }
        }
    }
}
/*
 * BCMSendOneBuffer
 *
 * Walk through every bit in current bcm page to find out if any corresponding
 * heap pages or CU units need to send to standby.
 */
static void BCMSendOneBuffer(Relation rel, CUFile *cFile, Buffer bcmbuffer, BlockNumber &heapBlock, int &contibits,
                             BlockNumber maxHeapBlock, int col)
{
    Buffer heapbuffer = InvalidBuffer;
    Page bcmpage;
    int i;
    int j;
    uint32 bshift;
    unsigned char *map = NULL;
    BCMBitStatus status;
    BlockNumber blocknum = 0;
    bool isColStore = col > 0 ? true : false;
    blocknum = BufferGetBlockNumber(bcmbuffer);
    /* cFile is only meaningful for column store */
    Assert(isColStore || (cFile == NULL));
    /*
     * Do not lock buffer, maybe deadlock, if
     * Catchup held this buffer share lock, and push to dataqueue, but queue has no freespace,
     * Catchup will sleep with share lock; wait for DataSender to free queue's space;
     * But DataSender need to get this buffer exclusive lock to set BCM bit: 1-->0, so
     * Catchup held share lock wait DataSender; DataSender wait exclusive lock held by Catchup;
     * then deadlock occured.
     */
    bcmpage = BufferGetPage(bcmbuffer);
    map = (unsigned char *)PageGetContents(bcmpage);
    for (i = 0; i < (int)BCMMAPSIZE; i++) {
        for (j = 0; j < BCM_BLOCKS_PER_BYTE; j++) {
            bshift = (uint32)j * BCM_BITS_PER_BLOCK;
            /* read the sync bit for this (byte, slot); >>1 normalizes to 0/1 */
            status = ((map[i] >> bshift) & BCM_SYNC_BITMASK) >> 1;
            /* If not sync */
            if (status == NOTSYNCED) {
                CatchupShutdownIfNoDataSender();
                if (isColStore) { /* column store */
                    /* get heap page block: remember where the contiguous
                     * NOTSYNCED run starts, then just extend the run */
                    if (contibits == 0)
                        heapBlock = GET_HEAP_BLOCK(blocknum, i, j);
                    contibits++;
                } else { /* row store */
                    /* get heap page block */
                    heapBlock = GET_HEAP_BLOCK(blocknum, i, j);
                    if (u_sess->attr.attr_storage.HaModuleDebug) {
                        ereport(LOG, (errmsg("HA-BCMSendOneBuffer: relation %u/%u/%u col %d try to sync bcm "
                                             "blockno %u heap blockno %u maxHeapBlock %u",
                                             rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode, col,
                                             blocknum, heapBlock, maxHeapBlock)));
                    }
                    /*
                     * For OS crash, Data file block maybe not fsync disk to 100(For example),
                     * but BCM maybe flush disk to 101. We can not read data file block.
                     */
                    if (heapBlock > maxHeapBlock)
                        return;
                    /* read, pin and push the heap page to the data queue */
                    heapbuffer = ReadBuffer(rel, heapBlock);
                    LockBuffer(heapbuffer, BUFFER_LOCK_SHARE);
                    PushHeapPageToDataQueue(heapbuffer);
                    UnlockReleaseBuffer(heapbuffer);
                }
            }
            if (isColStore) {
                /*
                 * for column store, we record the continuous no-sync status,
                 * load CU data for once as much as possible.
                 * NOTE(review): the run is flushed when it ends (SYNCED bit) or
                 * reaches ~512KB worth of CUs; bcm_read_multi_cu presumably
                 * resets contibits via its reference parameter — confirm.
                 */
                int max_contibits = (512 * 1024) / CUAlignUtils::GetCuAlignSizeColumnId(col);
                if (contibits > 0 && (status == SYNCED || contibits >= max_contibits))
                    bcm_read_multi_cu(cFile, rel, col, heapBlock, contibits, maxHeapBlock);
            }
        }
    }
}
/*
 * Get max block num for bcm file relfilenode
 */
static BlockNumber BCMGetDataFileMaxSize(Relation rel, int col)
{
    BlockNumber nblocks = 0;

    if (col > 0) {
        /* column store: derive the block count from the CU data file size */
        uint64 filesize = GetColDataFileSize(rel, col);
        nblocks = (BlockNumber)(filesize / CUAlignUtils::GetCuAlignSizeColumnId(col));
    } else if (smgrexists(rel->rd_smgr, MAIN_FORKNUM)) {
        nblocks = smgrnblocks(rel->rd_smgr, MAIN_FORKNUM);
    } else {
        /* main fork missing: warn with the relation path and fall through */
        RelFileNodeBackend smgr_rnode;
        smgr_rnode.node = rel->rd_node;
        smgr_rnode.backend = InvalidBackendId;
        char *rpath = relpath(smgr_rnode, MAIN_FORKNUM);
        ereport(WARNING, (errcode_for_file_access(), errmsg("relation file is not exist when get max block num "
                                                            "for bcm file relfilenode: \"%s\": %m",
                                                            rpath)));
        pfree(rpath);
        rpath = NULL;
    }

    /*
     * Valid blocks run from 0 to nblocks-1; an empty file yields
     * InvalidBlockNumber so the caller knows there is nothing to read.
     */
    return (nblocks != 0) ? (nblocks - 1) : InvalidBlockNumber;
}
/*
 * Check if we have specific postfix in the string.
 *
 * Returns true when str1 ends with str2. NULL arguments, empty strings,
 * or str1 shorter than str2 all yield false (matching the original
 * contract, where an empty postfix never matches).
 */
static bool CheckFilePostfix(const char *str1, const char *str2)
{
    if (str1 == NULL || str2 == NULL) {
        return false;
    }
    size_t len1 = strlen(str1);
    size_t len2 = strlen(str2);
    if (len1 == 0 || len2 == 0 || len1 < len2) {
        return false;
    }
    /* compare the trailing len2 bytes of str1 against str2 in one call */
    return strcmp(str1 + (len1 - len2), str2) == 0;
}
/*
* BCMClearFile: set the BCM file's pages to init pages
* except the first page(BCM File Header).
* FUTURE CASE:: Maybe we should Consider concurrency scenarios,
* one is clearing file another is setting.
*/
static void BCMClearFile(const RelFileNode &relfilenode, int col)
{
RelFileNode InvalidRelFileNode = { 0, 0, 0, -1 };
Relation rel;
if (0 == memcmp(&relfilenode, &InvalidRelFileNode, sizeof(RelFileNode)))
return;
rel = CreateFakeRelcacheEntry(relfilenode);
BCMClearRel(rel, col);
FreeFakeRelcacheEntry(rel);
}
/* Recursion search BCM files with the tableSpacePath */
static void searchBCMFiles(const char *tableSpacePath, const char *relativepath, bool undertablespace, bool clear,
                           int iterations)
{
    DIR *dir = NULL;
    struct dirent *de;
    char path[MAXPGPATH] = {'\0'};  /* absolute path of the current entry */
    char rpath[MAXPGPATH] = {'\0'}; /* path relative to the starting directory */
    int nRet = 0;
    /* the layer number of searchBCMFiles iterations */
    iterations++;
    dir = AllocateDir(tableSpacePath);
    while ((de = ReadDir(dir, tableSpacePath)) != NULL) {
        /* skip entries that can never contain BCM files */
        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
            continue;
        if (strncmp(de->d_name, PG_TEMP_FILE_PREFIX, strlen(PG_TEMP_FILE_PREFIX)) == 0)
            continue;
        if (strncmp(de->d_name, "pg_log", strlen("pg_log")) == 0 ||
            strncmp(de->d_name, "pg_location", strlen("pg_location")) == 0)
            continue;
        if (strncmp(de->d_name, "pg_xlog", strlen("pg_xlog")) == 0)
            continue;
        if (strncmp(de->d_name, "full_upgrade_bak", strlen("full_upgrade_bak")) == 0)
            continue;
        nRet = snprintf_s(path, sizeof(path), MAXPGPATH - 1, "%s/%s", tableSpacePath, de->d_name);
        securec_check_ss(nRet, "", "");
        if (undertablespace) {
            /* inside a tablespace, only descend into this node's version directory */
            if (NULL == strstr(path, TABLESPACE_VERSION_DIRECTORY) ||
                NULL == strstr(path, g_instance.attr.attr_common.PGXCNodeName))
                continue;
        } else {
            /* pg_tblspc is handled separately, skip it here */
            if (strcmp(de->d_name, "pg_tblspc") == 0)
                continue;
        }
        /* build the relative path handed down to HandleBCMfile */
        if (relativepath) {
            nRet = snprintf_s(rpath, sizeof(rpath), MAXPGPATH - 1, "%s/%s", relativepath, de->d_name);
            securec_check_ss(nRet, "", "");
        } else {
            nRet = snprintf_s(rpath, sizeof(rpath), MAXPGPATH - 1, "%s", de->d_name);
            securec_check_ss(nRet, "", "");
        }
        /*
         * serchBCMFiles will be recursive call 3 interations to get file path. In the third layer,
         * the file path is table file, not Dir, so we need not to decide whether it is a folder,
         * because the performance of stat interface is too bad. The file path such as
         * ./base/13764 or /home/xxx/tablespace/PG_9.2_201611171_datanode1/13764
         */
        if (iterations < 3 && isDirExist(path)) {
            ereport(DEBUG3, (errmsg("search path %s, relative path: %s, iterations: %d.", path, rpath, iterations)));
            searchBCMFiles(path, rpath, undertablespace, clear, iterations);
        } else {
            /*
             * When we handle the bcm files, we will find if we end with "_bcm".
             */
            if (CheckFilePostfix(rpath, BCM)) {
                HandleBCMfile(rpath, clear);
            }
        }
    }
    FreeDir(dir);
}
static void HandleBCMfile(char *bcmpath, bool clear)
{
RelFileNodeForkNum bcmfilenode;
bcmfilenode = relpath_to_filenode(bcmpath);
if (bcmfilenode.forknumber == InvalidForkNumber) {
ereport(WARNING,
(errmsg("relfilenode [spcNode%u] [dbNode%u] [relNode%u]"
"[backendId%d] [segno%u] [forkNumber-%d] forkNumber is invalid",
bcmfilenode.rnode.node.spcNode, bcmfilenode.rnode.node.dbNode, bcmfilenode.rnode.node.relNode,
bcmfilenode.rnode.backend, bcmfilenode.segno, bcmfilenode.forknumber)));
return;
}
ereport(DEBUG3,
(errmsg("relfilenode [spcNode%u] [dbNode%u] [relNode%u]"
"[backendId%d] [segno%u] [forkNumber-%d]",
bcmfilenode.rnode.node.spcNode, bcmfilenode.rnode.node.dbNode, bcmfilenode.rnode.node.relNode,
bcmfilenode.rnode.backend, bcmfilenode.segno, bcmfilenode.forknumber)));
if (clear) {
/* Clear this bcm file */
ereport(DEBUG2, (errmsg("clear bcm file %s ", bcmpath)));
BCMClearFile(bcmfilenode.rnode.node, GetColumnNum(bcmfilenode.forknumber));
} else {
/*
* According to bcm file bcmPath, we put the data(not synchronized)
* to the queue.
*/
ereport(DEBUG2, (errmsg("according to bcm file %s, send data(not synchronized)", bcmpath)));
CatchupShutdownIfNoDataSender();
BCMSendData(bcmfilenode.rnode.node, bcmpath, GetColumnNum(bcmfilenode.forknumber));
}
}
/*
 * GetBcmFileList
 *    Get all bcm files, clear all or send the according not sync heap blocks.
 *
 * Builds the list of all tablespaces (the default one plus every symlink
 * under pg_tblspc) and runs searchBCMFiles() over each; 'clear' selects
 * between clearing the bcm files and sending the unsynchronized blocks they
 * track. All transient allocations live in a private memory context that is
 * deleted before returning.
 */
void GetBcmFileList(bool clear)
{
    DIR *dir = NULL;
    List *tablespaces = NIL;
    ListCell *lc = NULL;
    struct dirent *de;
    tablespaceinfo *ti = NULL;
    MemoryContext bcm_context;
    MemoryContext old_context;
    int nRet = 0;
    /* dedicated context so all path/list allocations are released in one shot below */
    bcm_context = AllocSetContextCreate(CurrentMemoryContext, "Search BCM files context", ALLOCSET_DEFAULT_MINSIZE,
                                        ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE);
    old_context = MemoryContextSwitchTo(bcm_context);
    ereport(LOG, (errmsg("catchup process start to search all of bcm files.")));
    /* Make sure we can open the directory with tablespaces in it. */
    dir = AllocateDir("pg_tblspc");
    if (!dir) {
        /* ereport(ERROR) does not return; the return below only keeps the compiler quiet */
        ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", "pg_tblspc")));
        return;
    }
    /* Collect information about all tablespaces. */
    while ((de = ReadDir(dir, "pg_tblspc")) != NULL) {
        char fullpath[MAXPGPATH];
        char linkpath[MAXPGPATH];
        int rllen;
        /* Skip special stuff */
        if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
            continue;
        nRet = snprintf_s(fullpath, sizeof(fullpath), sizeof(fullpath) - 1, "pg_tblspc/%s", de->d_name);
        securec_check_ss(nRet, "", "");
#if defined(HAVE_READLINK) || defined(WIN32)
        /* resolve the tablespace symlink to its real location */
        rllen = readlink(fullpath, linkpath, sizeof(linkpath));
        if (rllen < 0) {
            ereport(WARNING, (errmsg("could not read symbolic link \"%s\": %m", fullpath)));
            continue;
        } else if (rllen >= (int)sizeof(linkpath)) {
            ereport(WARNING, (errmsg("symbolic link \"%s\" target is too long", fullpath)));
            continue;
        }
        linkpath[rllen] = '\0'; /* readlink does not NUL-terminate */
        ti = (tablespaceinfo *)palloc(sizeof(tablespaceinfo));
        ti->oid = pstrdup(de->d_name);
        ti->path = pstrdup(linkpath);
        ti->relativePath = pstrdup(fullpath);
        ti->size = -1;
        tablespaces = lappend(tablespaces, ti);
#else
        /*
         * If the platform does not have symbolic links, it should not be
         * possible to have tablespaces - clearly somebody else created
         * them. Warn about it and ignore.
         */
        ereport(WARNING,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform")));
#endif
    }
    /* Add a node for the base directory; lcons puts it at the head of the list */
    ti = (tablespaceinfo *)palloc0(sizeof(tablespaceinfo));
    tablespaces = lcons(ti, tablespaces);
    foreach (lc, tablespaces) {
        tablespaceinfo *tsi = (tablespaceinfo *)lfirst(lc);
        if (tsi->path != NULL) {
            /* Tablespace create by user */
            ereport(DEBUG1, (errmsg("bcm path: %s; relative path: %s.", tsi->path, tsi->relativePath)));
            searchBCMFiles(tsi->path, tsi->relativePath, true, clear, 0);
        } else {
            /* Default tablespace */
            ereport(DEBUG1, (errmsg("bcm path: %s; relative path: %s.", ".", ".")));
            searchBCMFiles(".", NULL, false, clear, 0);
        }
    }
    FreeDir(dir);
    ereport(LOG, (errmsg("catchup process done to search all bcm files.")));
    MemoryContextSwitchTo(old_context);
    MemoryContextDelete(bcm_context);
}
/*
 * GetIncrementalBcmFileList
 *    Get incremental bcm files and send the according not sync heap blocks.
 *
 * Walks the RelFileNodeKey array received from the dummy standby
 * (g_incrementalBcmInfo.receivedFileList), rebuilds each bcm file path and
 * hands it to HandleBCMfile() in send mode. Time spent parsing paths and
 * handling bcm files is accumulated separately and logged at the end.
 */
void GetIncrementalBcmFileList()
{
    int num = 0;
    char path[MAXPGPATH] = {'\0'};
    char *temp = NULL;
    char *fileList = NULL;
    errno_t errorno = EOK;
    ereport(LOG, (errmsg("catchup process start to search incremental bcm files.")));
    int getIncrementalCatchupParseBcmTime = 0;
    int getIncrementalCatchupHandleBcmTime = 0;
    /* number of RelFileNodeKey entries packed back-to-back in the received buffer */
    num = g_incrementalBcmInfo.msgLength / sizeof(RelFileNodeKey);
    fileList = g_incrementalBcmInfo.receivedFileList;
    temp = fileList;
    ereport(LOG, (errmsg("num of file list we got from dummy:%d", num)));
    while (num != 0) {
        TimestampTz parseBcmStartTime = GetCurrentTimestamp();
        RelFileNodeKey data;
        /*
         * Reset the output buffer each round: the path builders below leave
         * it untouched when they fail (e.g. unreadable tablespace symlink),
         * and without this reset a stale path from the previous entry would
         * pass the emptiness check and be handled a second time.
         */
        path[0] = '\0';
        errorno = memcpy_s((void *)&data, sizeof(RelFileNodeKey), temp, sizeof(RelFileNodeKey));
        securec_check(errorno, "", "");
        temp += sizeof(RelFileNodeKey);
        if ((int)data.relfilenode.spcNode == DEFAULTTABLESPACE_OID) {
            GetIncrementalBcmFilePathForDefault(data, path, sizeof(path));
        } else {
            GetIncrementalBcmFilePathForCustome(data, path, sizeof(path));
        }
        getIncrementalCatchupParseBcmTime += ComputeTimeStamp(parseBcmStartTime);
        /* an empty path means this entry could not be resolved; skip it */
        if (*path != '\0') {
            TimestampTz handleBcmStartTime = GetCurrentTimestamp();
            HandleBCMfile(path, false);
            getIncrementalCatchupHandleBcmTime += ComputeTimeStamp(handleBcmStartTime);
        }
        num--;
    }
    /* release the received file list buffer */
    ReplaceOrFreeBcmFileListBuffer(NULL, 0);
    ereport(
        LOG,
        (errmsg("incremental catchup parsing bcm costs %d milliseconds, handling bcm costs %d milliseconds, and total "
                "costs %d milliseconds",
                getIncrementalCatchupParseBcmTime, getIncrementalCatchupHandleBcmTime,
                getIncrementalCatchupParseBcmTime + getIncrementalCatchupHandleBcmTime)));
    ereport(LOG, (errmsg("catchup process done to search incremental bcm files.")));
}
/*
 * GetIncrementalBcmFilePathForDefault
 *    Build the bcm file path for a relation in the default tablespace,
 *    e.g. base/dbnode/relnode_bcm (column stores get a _C<id> infix).
 *    Relations in other tablespaces leave 'path' untouched.
 */
static void GetIncrementalBcmFilePathForDefault(const RelFileNodeKey &data, char *path, int length)
{
    if ((int)data.relfilenode.spcNode != DEFAULTTABLESPACE_OID) {
        return;
    }

    int rc;
    if (data.columnid == 0) {
        /* row store: base/dbnode/relnode_bcm */
        rc = snprintf_s(path, length, length - 1, "base/%u/%u_bcm", data.relfilenode.dbNode,
                        data.relfilenode.relNode);
    } else {
        /* column store: base/dbnode/relnode_C<col>_bcm */
        rc = snprintf_s(path, length, length - 1, "base/%u/%u_C%d_bcm", data.relfilenode.dbNode,
                        data.relfilenode.relNode, data.columnid);
    }
    securec_check_ss(rc, "", "");

    if (u_sess->attr.attr_storage.HaModuleDebug) {
        ereport(LOG, (errmsg("default tablespace path :%s\n", path)));
    }
}
/*
 * GetIncrementalBcmFilePathForCustome
 *    Build the bcm file path for a relation in a user tablespace, e.g.
 *    pg_tblspc/spcnode/version_nodename/dbnode/relnode_bcm.
 *
 * On any failure (unreadable or oversized tablespace symlink) the output
 * buffer is left as an empty string, which callers use to detect that no
 * valid path was produced.
 */
static void GetIncrementalBcmFilePathForCustome(const RelFileNodeKey &data, char *path, int length)
{
    int nRet = 0;
    DIR *dir = NULL;
    char fullPath[MAXPGPATH];
    char linkPath[MAXPGPATH];
    int readLinkPathLength;
    /*
     * Clear the output up front: previously the early returns below left the
     * caller's buffer unmodified, so a stale path from an earlier call could
     * be mistaken for a freshly built one.
     */
    if (length > 0) {
        path[0] = '\0';
    }
    if ((int)data.relfilenode.spcNode != DEFAULTTABLESPACE_OID) {
        /* Check pg_tblspc dir */
        dir = AllocateDir("pg_tblspc");
        if (!dir) {
            ereport(ERROR, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", "pg_tblspc")));
            return;
        }
        FreeDir(dir);
        /* pg_tblspc/spcnode */
        nRet = snprintf_s(fullPath, sizeof(fullPath), sizeof(fullPath) - 1, "pg_tblspc/%u", data.relfilenode.spcNode);
        securec_check_ss(nRet, "", "");
#if defined(HAVE_READLINK) || defined(WIN32)
        /* Check link path */
        readLinkPathLength = readlink(fullPath, linkPath, sizeof(linkPath));
        if (readLinkPathLength < 0) {
            ereport(WARNING, (errmsg("could not read symbolic link \"%s\": %m", fullPath)));
            return;
        } else if (readLinkPathLength >= (int)sizeof(linkPath)) {
            ereport(WARNING, (errmsg("symbolic link \"%s\" target is too long", fullPath)));
            return;
        }
        linkPath[readLinkPathLength] = '\0';
#else
        /*
         * If the platform does not have symbolic links, it should not be
         * possible to have tablespaces - clearly somebody else have created
         * them. Warn about it and ignore.
         */
        ereport(WARNING,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform")));
#endif
        if (data.columnid != 0) {
            /* pg_tblspc/spcnode/version_nodename/dbnode/relnode_C1_BCM */
            nRet = snprintf_s(path, length, length - 1, "%s/%s_%s/%u/%u_C%d_bcm", fullPath,
                              TABLESPACE_VERSION_DIRECTORY, g_instance.attr.attr_common.PGXCNodeName,
                              data.relfilenode.dbNode, data.relfilenode.relNode, data.columnid);
            securec_check_ss(nRet, "", "");
        } else {
            /* pg_tblspc/spcnode/version_nodename/dbnode/relnode_BCM */
            nRet = snprintf_s(path, length, length - 1, "%s/%s_%s/%u/%u_bcm", fullPath, TABLESPACE_VERSION_DIRECTORY,
                              g_instance.attr.attr_common.PGXCNodeName, data.relfilenode.dbNode,
                              data.relfilenode.relNode);
            securec_check_ss(nRet, "", "");
        }
        if (u_sess->attr.attr_storage.HaModuleDebug) {
            ereport(LOG, (errmsg("custome tablespace BCM path:%s\n", path)));
        }
    }
}
/*
 * Load multiple CU units to buffer, push data to sender queue.
 * Because the CU manager may not return the exact size we expected,
 * we try again until we get the data we need.
 *
 * heapBlock    - first CU block (unit) to send
 * contibits    - in: number of consecutive units requested;
 *                out: always 0 once the function returns
 * maxHeapBlock - last block that exists in the data file; requests past it
 *                are dropped rather than read
 */
static void bcm_read_multi_cu(CUFile *cFile, Relation rel, int col, BlockNumber heapBlock, int &contibits,
                              BlockNumber maxHeapBlock)
{
    uint64 align_size = (uint64)(uint32)CUAlignUtils::GetCuAlignSizeColumnId(col);
    uint64 offset = align_size * (uint64)heapBlock; /* byte offset of the first requested unit */
    char *write_buf = NULL;
    int realSize = 0;
    if (u_sess->attr.attr_storage.HaModuleDebug) {
        ereport(LOG, (errmsg("HA-bcm_read_multi_cu: relation %u/%u/%u col %d try to sync "
                             "cu blockno %u, contibits %d, maxHeapBlock %u",
                             rel->rd_node.spcNode, rel->rd_node.dbNode, rel->rd_node.relNode, col, heapBlock, contibits,
                             maxHeapBlock)));
    }
    /* The heapBlock of data file must be not exist */
    if (heapBlock > maxHeapBlock) {
        contibits = 0;
        return;
    }
    /* we should send the NOTSYNCED data from heapBlock to maxHeapBlock */
    contibits = (int)Min((uint32)contibits, maxHeapBlock - heapBlock + 1);
    while (contibits > 0) {
        CatchupShutdownIfNoDataSender();
        write_buf = cFile->Read(offset, align_size * contibits, &realSize, (int)align_size);
        if (write_buf == NULL) {
            /* nothing more to read: give up on the remaining units */
            Assert(realSize == 0);
            contibits = 0;
            return;
        }
        if (u_sess->attr.attr_storage.HaModuleDebug)
            check_cu_block(write_buf, realSize, (int)align_size);
        PushCUToDataQueue(rel, col, write_buf, offset, realSize, false);
        ereport(DEBUG3, (errmsg("cuBlock %u col %d read and send data's realsize is %d.", heapBlock, col, realSize)));
        /* advance by what was actually read and retry for the remainder */
        offset += realSize;
        contibits -= realSize / align_size;
    }
    Assert(contibits == 0);
}
/*
 * check_cu_block
 *    Debug aid: scan 'size' bytes of CU data in units of 'alignSize' bytes
 *    and emit a WARNING for every unit that is entirely zero (a "zeropage").
 *
 * The previous implementation compared each unit against a stack VLA of
 * alignSize bytes (char zeroBlock[alignSize] = {0}): besides relying on a
 * non-standard initialized VLA, that placed a caller-controlled allocation
 * on the stack. Scan in place instead: a unit is all zero iff its first
 * byte is zero and every byte equals its successor.
 */
void check_cu_block(char *mem, int size, int alignSize)
{
    Assert(alignSize > 0);
    int cuUnit = size / alignSize;
    char *mem_temp = mem;
    for (int i = 0; i < cuUnit; i++) {
        if (mem_temp[0] == 0 &&
            (alignSize == 1 || memcmp(mem_temp, mem_temp + 1, (size_t)(alignSize - 1)) == 0)) {
            ereport(WARNING, (errmsg("HA-check_cu_block: check cu blockno %d failed, it is zeropage", i)));
        }
        mem_temp += alignSize;
    }
}
/* Translate a byte offset within a column-store file into its CU block number. */
uint64 cstore_offset_to_cstoreblock(uint64 offset, uint64 align_size)
{
    uint64 blockno = offset / align_size;
    return blockno;
}
/*
 * Map a byte offset in a column-store file onto the BCM block that tracks it.
 * NOTE(review): the +2 offset presumably skips reserved header pages of the
 * BCM file — confirm against the BCM layout before relying on it.
 */
uint64 cstore_offset_to_bcmblock(uint64 offset, uint64 align_size)
{
    uint64 unitBlock = cstore_offset_to_cstoreblock(offset, align_size);
    uint64 bcmBlock = (unitBlock / BCM_BLOCKS_PER_PAGE) + UNITBLK_TO_BCMGROUP(unitBlock) + 2;
    return bcmBlock;
}