From 5624f47847f8e4471656aa8af89b26b572b4114f Mon Sep 17 00:00:00 2001 From: bowenliu Date: Thu, 8 Dec 2022 15:07:39 +0800 Subject: [PATCH] 1130 bugfix --- src/bin/gs_guc/cluster_guc.conf | 1 + src/bin/initdb/ss_initdb.cpp | 1 + src/bin/pg_probackup/util.cpp | 23 +++ src/common/backend/catalog/catalog.cpp | 4 +- .../utils/cache/knl_localsysdbcache.cpp | 5 +- .../backend/utils/misc/guc/guc_storage.cpp | 20 +++ .../utils/misc/postgresql_single.conf.sample | 1 + .../cbb/instruments/wdr/snapshot.cpp | 2 +- src/gausskernel/ddes/adapter/Makefile | 2 +- src/gausskernel/ddes/adapter/ss_aio.cpp | 142 +++++++++++++++ src/gausskernel/ddes/adapter/ss_dms.cpp | 7 +- .../ddes/adapter/ss_dms_bufmgr.cpp | 71 +++++++- .../ddes/adapter/ss_dms_callback.cpp | 163 +++++++++--------- .../ddes/adapter/ss_dms_recovery.cpp | 34 +++- src/gausskernel/ddes/adapter/ss_init.cpp | 1 - .../ddes/adapter/ss_reform_common.cpp | 67 +++++-- .../ddes/adapter/ss_switchover.cpp | 6 - .../ddes/adapter/ss_transaction.cpp | 108 +++++++----- src/gausskernel/ddes/script/dms_contrl.sh | 2 +- .../process/postmaster/checkpointer.cpp | 2 +- .../process/postmaster/pagewriter.cpp | 115 +++++++++--- .../process/postmaster/postmaster.cpp | 84 ++++----- src/gausskernel/process/tcop/postgres.cpp | 8 +- .../process/threadpool/knl_instance.cpp | 4 + .../process/threadpool/knl_session.cpp | 7 +- .../storage/access/transam/double_write.cpp | 8 +- .../transam/parallel_recovery/dispatcher.cpp | 2 + .../storage/access/transam/xlog.cpp | 134 +++++++------- .../storage/access/transam/xlogreader.cpp | 6 + .../storage/access/transam/xlogutils.cpp | 8 +- src/gausskernel/storage/buffer/buf_init.cpp | 1 + src/gausskernel/storage/buffer/bufmgr.cpp | 98 ++++++++++- src/gausskernel/storage/dss/dss_adaptor.cpp | 4 +- src/gausskernel/storage/dss/fio_dss.cpp | 12 +- src/gausskernel/storage/ipc/procarray.cpp | 4 + src/gausskernel/storage/ipc/sinval.cpp | 5 +- .../storage/replication/walsender.cpp | 2 +- 
.../storage/smgr/segment/data_file.cpp | 2 +- .../storage/smgr/segment/segbuffer.cpp | 51 +++++- .../storage/smgr/segment/space.cpp | 24 +++ src/gausskernel/storage/smgr/segstore.cpp | 18 +- src/include/access/double_write.h | 16 ++ src/include/access/xlog_basic.h | 15 ++ src/include/ddes/dms/dms_api.h | 68 +++++--- src/include/ddes/dms/ss_aio.h | 63 +++++++ src/include/ddes/dms/ss_common_attr.h | 13 ++ src/include/ddes/dms/ss_dms.h | 2 + src/include/ddes/dms/ss_dms_bufmgr.h | 2 + src/include/ddes/dms/ss_dms_recovery.h | 4 +- src/include/ddes/dms/ss_reform_common.h | 2 + .../knl/knl_guc/knl_instance_attr_storage.h | 1 + src/include/knl/knl_instance.h | 15 +- src/include/knl/knl_thread.h | 1 + src/include/miscadmin.h | 3 +- src/include/postmaster/pagewriter.h | 5 + src/include/storage/buf/buf_internals.h | 2 + src/include/storage/dss/dss_adaptor.h | 4 + src/include/storage/dss/fio_dss.h | 4 +- src/include/storage/smgr/segment.h | 3 + src/include/storage/smgr/segment_internal.h | 4 + .../regress/output/recovery_2pc_tools.source | 1 + 61 files changed, 1147 insertions(+), 340 deletions(-) create mode 100644 src/gausskernel/ddes/adapter/ss_aio.cpp create mode 100644 src/include/ddes/dms/ss_aio.h diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf index 07b162e91..f379c68f7 100755 --- a/src/bin/gs_guc/cluster_guc.conf +++ b/src/bin/gs_guc/cluster_guc.conf @@ -706,6 +706,7 @@ ss_enable_catalog_centralized|bool|0,0|NULL|NULL| ss_enable_reform|bool|0,0|NULL|NULL| ss_enable_ssl|bool|0,0|NULL|NULL| ss_enable_log_level|bool|0,0|NULL|NULL| +ss_enable_aio|bool|0,0|NULL|NULL| ss_interconnect_channel_count|int|1,32|NULL|NULL| ss_work_thread_count|int|16,128|NULL|NULL| ss_recv_msg_pool_size|int|1024,1048576|kB|NULL| diff --git a/src/bin/initdb/ss_initdb.cpp b/src/bin/initdb/ss_initdb.cpp index 4b0994a01..42bd10145 100644 --- a/src/bin/initdb/ss_initdb.cpp +++ b/src/bin/initdb/ss_initdb.cpp @@ -40,6 +40,7 @@ static const char* ss_clusterdirs[] = 
{"+global", "+base", + "+base/1", "+pg_tblspc", "+pg_clog", "+pg_csnlog", diff --git a/src/bin/pg_probackup/util.cpp b/src/bin/pg_probackup/util.cpp index 223e24236..ca268899a 100644 --- a/src/bin/pg_probackup/util.cpp +++ b/src/bin/pg_probackup/util.cpp @@ -178,6 +178,12 @@ digestControlFile(ControlFileData *ControlFile, char *src, size_t size) tmpDssSrc = oldSrc; tmpDssSrc += (MAX_INSTANCEID + 1) * PG_CONTROL_SIZE; checkSSControlFile(ControlFile, tmpDssSrc, PG_CONTROL_SIZE); + /* Calculate the control file CRC */ + pg_crc32c crc; + INIT_CRC32C(crc); + COMP_CRC32C(crc, src, offsetof(ControlFileData, crc)); + FIN_CRC32C(crc); + ((ControlFileData*)src)->crc = crc; } else { checkControlFile(ControlFile); } @@ -653,6 +659,23 @@ copy_pgcontrol_file(const char *from_fullpath, fio_location from_location, size_t size; buffer = slurpFile(from_fullpath, &size, false, from_location); + + /* + * In dss mode, we need to set the parameter list_stable + * on the pg_control file's last page to 0. + */ + if (is_dss_type(file->type)) { + ss_reformer_ctrl_t *reformerCtrl; + reformerCtrl = (ss_reformer_ctrl_t *)(buffer + (MAX_INSTANCEID + 1) * PG_CONTROL_SIZE); + reformerCtrl->list_stable = 0; + /* Calculate the reformer_ctrl CRC */ + pg_crc32c crc; + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) reformerCtrl, offsetof(ss_reformer_ctrl_t, crc)); + FIN_CRC32C(crc); + reformerCtrl->crc = crc; + } + digestControlFile(&ControlFile, buffer, size); file->crc = ControlFile.crc; file->read_size = size; diff --git a/src/common/backend/catalog/catalog.cpp b/src/common/backend/catalog/catalog.cpp index 147b39e3d..8523c69c6 100644 --- a/src/common/backend/catalog/catalog.cpp +++ b/src/common/backend/catalog/catalog.cpp @@ -205,7 +205,7 @@ char* relpathbackend(RelFileNode rnode, BackendId backend, ForkNumber forknum) char* path = NULL; char* datadir = (char *)palloc(MAXPGPATH); - if (ENABLE_DSS && rnode.dbNode != 1) { + if (ENABLE_DSS) { errno_t rc = snprintf_s(datadir, MAXPGPATH, MAXPGPATH - 1, 
"%s/", g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name); securec_check_ss(rc, "\0", "\0"); @@ -685,7 +685,7 @@ char* GetDatabasePath(Oid dbNode, Oid spcNode) errno_t rc = EOK; char* datadir = (char *)palloc(MAXPGPATH); - if (ENABLE_DSS && dbNode != 1) { + if (ENABLE_DSS) { rc = snprintf_s(datadir, MAXPGPATH, MAXPGPATH - 1, "%s/", g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name); securec_check_ss(rc, "\0", "\0"); diff --git a/src/common/backend/utils/cache/knl_localsysdbcache.cpp b/src/common/backend/utils/cache/knl_localsysdbcache.cpp index 4196ff2ac..872cf1bda 100644 --- a/src/common/backend/utils/cache/knl_localsysdbcache.cpp +++ b/src/common/backend/utils/cache/knl_localsysdbcache.cpp @@ -499,7 +499,8 @@ void LocalSysDBCache::LocalSysDBCacheReleaseCritialReSource(bool include_shared) bool LocalSysDBCache::DBStandbyChanged() { /* for standby, we cannot recognize whether event of alter db rename happened */ - if (unlikely(t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE && my_database_id != InvalidOid)) { + bool isStandby = t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE || (ENABLE_DMS && SS_STANDBY_MODE); + if (unlikely(isStandby && my_database_id != InvalidOid)) { if (unlikely(my_database_id != u_sess->proc_cxt.MyDatabaseId)) { return true; } @@ -922,7 +923,7 @@ void LocalSysDBCache::FixWrongCacheStat(Oid db_id, Oid db_tabspc) /* only for standby, we cannot recognize alter db rename event * otherwise, it should not happen */ - Assert(t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE); + Assert(t_thrd.postmaster_cxt.HaShmData->current_mode == STANDBY_MODE || (ENABLE_DMS && SS_STANDBY_MODE)); LocalSysDBCacheClearMyDB(); InitFileAccess(); diff --git a/src/common/backend/utils/misc/guc/guc_storage.cpp b/src/common/backend/utils/misc/guc/guc_storage.cpp index 521a9523f..228a7265c 100755 --- a/src/common/backend/utils/misc/guc/guc_storage.cpp +++ b/src/common/backend/utils/misc/guc/guc_storage.cpp @@ -210,6 +210,7 @@ 
static bool check_ss_rdma_work_config(char** newval, void** extra, GucSource sou static bool check_ss_dss_vg_name(char** newval, void** extra, GucSource source); static bool check_ss_dss_conn_path(char** newval, void** extra, GucSource source); static bool check_ss_enable_ssl(bool* newval, void** extra, GucSource source); +static void assign_ss_enable_aio(bool newval, void *extra); #ifndef ENABLE_MULTIPLE_NODES static void assign_dcf_election_timeout(int newval, void* extra); @@ -1009,6 +1010,19 @@ static void InitStorageConfigureNamesBool() NULL, NULL}, + {{"ss_enable_aio", + PGC_SIGHUP, + NODE_SINGLENODE, + SHARED_STORAGE_OPTIONS, + gettext_noop("Whether use dss aio"), + NULL, + GUC_SUPERUSER_ONLY}, + &g_instance.attr.attr_storage.dms_attr.enable_dss_aio, + true, + NULL, + assign_ss_enable_aio, + NULL}, + {{"ss_enable_catalog_centralized", PGC_POSTMASTER, NODE_SINGLENODE, @@ -5794,6 +5808,11 @@ static bool check_ss_enable_ssl(bool *newval, void **extra, GucSource source) return true; } +static void assign_ss_enable_aio(bool newval, void *extra) +{ + g_instance.attr.attr_storage.dms_attr.enable_dss_aio = newval; +} + #ifndef ENABLE_MULTIPLE_NODES static void assign_dcf_election_timeout(int newval, void* extra) @@ -5909,3 +5928,4 @@ static void assign_logical_decode_options_default(const char* newval, void* extr { u_sess->attr.attr_storage.logical_decode_options_default = extra; } + diff --git a/src/common/backend/utils/misc/postgresql_single.conf.sample b/src/common/backend/utils/misc/postgresql_single.conf.sample index b34de3970..fba8f3c45 100644 --- a/src/common/backend/utils/misc/postgresql_single.conf.sample +++ b/src/common/backend/utils/misc/postgresql_single.conf.sample @@ -817,6 +817,7 @@ job_queue_processes = 10 # Number of concurrent jobs, optional: [0..1000] #ss_enable_dss = off #ss_enable_reform = on #ss_enable_ssl = on +#ss_enable_aio = on #ss_enable_catalog_centralized = on #ss_instance_id = 0 #ss_dss_vg_name = '' diff --git 
a/src/gausskernel/cbb/instruments/wdr/snapshot.cpp b/src/gausskernel/cbb/instruments/wdr/snapshot.cpp index 29a520671..32a5e8bbf 100755 --- a/src/gausskernel/cbb/instruments/wdr/snapshot.cpp +++ b/src/gausskernel/cbb/instruments/wdr/snapshot.cpp @@ -1502,7 +1502,7 @@ NON_EXEC_STATIC void SnapshotMain() */ pg_usleep(INTERVAL); if ((PgxcIsCentralCoordinator(g_instance.attr.attr_common.PGXCNodeName) || IS_SINGLE_NODE) && - u_sess->attr.attr_common.enable_wdr_snapshot) { + u_sess->attr.attr_common.enable_wdr_snapshot && !SS_IN_REFORM) { /* to avoid dead lock with redis, disable snapshot during redistribution */ check_snapshot_thd_exit(); SnapshotNameSpace::SubSnapshotMain(); diff --git a/src/gausskernel/ddes/adapter/Makefile b/src/gausskernel/ddes/adapter/Makefile index 9ab198f83..141880b47 100644 --- a/src/gausskernel/ddes/adapter/Makefile +++ b/src/gausskernel/ddes/adapter/Makefile @@ -22,7 +22,7 @@ ifneq "$(MAKECMDGOALS)" "clean" endif OBJS = ss_dms_bufmgr.o ss_dms_callback.o ss_dms_log_output.o ss_dms_recovery.o ss_dms.o ss_init.o \ - ss_reform_common.o ss_switchover.o ss_transaction.o + ss_reform_common.o ss_switchover.o ss_transaction.o ss_aio.o include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/ddes/adapter/ss_aio.cpp b/src/gausskernel/ddes/adapter/ss_aio.cpp new file mode 100644 index 000000000..64373e688 --- /dev/null +++ b/src/gausskernel/ddes/adapter/ss_aio.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * --------------------------------------------------------------------------------------- + * + * ss_aio.cpp + * aio implementation + * + * IDENTIFICATION + * src/gausskernel/ddes/adapter/ss_aio.cpp + * + * --------------------------------------------------------------------------------------- + */ + +#include "ddes/dms/ss_aio.h" +#include "utils/elog.h" + +static void WaitDSSAioComplete(DSSAioCxt *aio_cxt, int index) +{ + AioUtil *aio = &aio_cxt->aio[index]; + errno_t ret; + struct timespec timeout = { 0, 200 }; + int event_num = aio->iocount; + while (event_num > 0) { + ret = memset_s(aio->events, sizeof(struct io_event) * event_num, + 0, sizeof(struct io_event) * event_num); + securec_check_c(ret, "\0", "\0"); + + int num = io_getevents(aio->handle, 1, event_num, aio->events, &timeout); + if (num < 0) { + if (errno == EINTR || num == -EINTR) { + continue; + } + ereport(PANIC, (errmsg("failed to getevent by aio, errno = %d, retCode = %d", errno, num))); + } + + for (int i = 0; i < num; i++) { + aio_cxt->aiocb(&aio->events[i]); + } + event_num -= num; + } + + aio->iocount = 0; + ret = memset_s(aio->iocbs_ptr, sizeof(struct iocb *) * DSS_AIO_BATCH_SIZE, + 0, sizeof(struct iocb *) * DSS_AIO_BATCH_SIZE); + securec_check_c(ret, "\0", "\0"); + ret = memset_s(aio->events, sizeof(struct io_event) * DSS_AIO_BATCH_SIZE, + 0, sizeof(struct io_event) * DSS_AIO_BATCH_SIZE); + securec_check_c(ret, "\0", "\0"); +} + +void DSSAioFlush(DSSAioCxt *aio_cxt) +{ + bool need_wait = false; + AioUtil *aio = &aio_cxt->aio[aio_cxt->index]; + if (aio->iocount > 0) { + if (io_submit(aio->handle, aio->iocount, aio->iocbs_ptr) != aio->iocount) { + ereport(PANIC, (errmsg("io_submit failed, errno = %d", errno))); + } + need_wait = true; + /* wait aio result at last to improve performance */ + } + + aio = &aio_cxt->aio[1 - aio_cxt->index]; + if (aio->iocount > 0) { + WaitDSSAioComplete(aio_cxt, 1 - aio_cxt->index); + } + + if (need_wait) { + WaitDSSAioComplete(aio_cxt, aio_cxt->index); + } +} 
+ +void DSSAioAppendIOCB(DSSAioCxt *aio_cxt, struct iocb *iocb_ptr) +{ + AioUtil *aio = &aio_cxt->aio[aio_cxt->index]; + aio->iocbs_ptr[aio->iocount] = iocb_ptr; + aio->iocount++; + + if (aio->iocount >= DSS_AIO_BATCH_SIZE) { + if (io_submit(aio->handle, aio->iocount, aio->iocbs_ptr) != aio->iocount) { + ereport(PANIC, (errmsg("io_submit failed, errno = %d", errno))); + } + + aio_cxt->index = 1 - aio_cxt->index; + aio = &aio_cxt->aio[aio_cxt->index]; + if (aio->iocount > 0) { + WaitDSSAioComplete(aio_cxt, aio_cxt->index); + } + } +} + +struct iocb* DSSAioGetIOCB(DSSAioCxt *aio_cxt) +{ + AioUtil *aio = &aio_cxt->aio[aio_cxt->index]; + return &aio->iocbs[aio->iocount]; +} + +int DSSAioGetIOCBIndex(DSSAioCxt *aio_cxt) +{ + AioUtil *aio = &aio_cxt->aio[aio_cxt->index]; + return (aio_cxt->index * DSS_AIO_BATCH_SIZE + aio->iocount); +} + +void DSSAioInitialize(DSSAioCxt *aio_cxt, aio_callback callback) +{ + errno_t err = memset_s(aio_cxt, sizeof(DSSAioCxt), 0, sizeof(DSSAioCxt)); + securec_check_ss(err, "\0", "\0"); + + if (io_setup(DSS_AIO_BATCH_SIZE, &aio_cxt->aio[0].handle) < 0) { + ereport(PANIC, (errmsg("io_setup failed for DSS AIO, errno=%d", errno))); + } + + if (io_setup(DSS_AIO_BATCH_SIZE, &aio_cxt->aio[1].handle) < 0) { + ereport(PANIC, (errmsg("io_setup failed for DSS AIO, errno=%d", errno))); + } + aio_cxt->initialized = true; + aio_cxt->aiocb = callback; + aio_cxt->index = 0; +} + +void DSSAioDestroy(DSSAioCxt *aio_cxt) +{ + if (aio_cxt->initialized) { + (void)io_destroy(aio_cxt->aio[0].handle); + (void)io_destroy(aio_cxt->aio[1].handle); + aio_cxt->initialized = false; + errno_t err = memset_s(aio_cxt, sizeof(DSSAioCxt), 0, sizeof(DSSAioCxt)); + securec_check_ss(err, "\0", "\0"); + } +} + diff --git a/src/gausskernel/ddes/adapter/ss_dms.cpp b/src/gausskernel/ddes/adapter/ss_dms.cpp index ea7a4b97e..23982cdf7 100644 --- a/src/gausskernel/ddes/adapter/ss_dms.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms.cpp @@ -117,7 +117,7 @@ int ss_dms_func_init() 
SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_latch_timed_x)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_latch_timed_s)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_unlatch)); - + SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_pre_uninit)); return DMS_SUCCESS; } @@ -295,4 +295,9 @@ int dms_drc_accessible(void) int dms_reform_last_failed(void) { return g_ss_dms_func.dms_reform_last_failed(); +} + +void dms_pre_uninit(void) +{ + return g_ss_dms_func.dms_pre_uninit(); } \ No newline at end of file diff --git a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp index 93a96c5a5..efc370a8e 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp @@ -26,7 +26,7 @@ #include "storage/proc.h" #include "storage/buf/bufmgr.h" #include "storage/smgr/segment.h" -#include "replication/shared_storage_walreceiver.h" +#include "utils/resowner.h" #include "ddes/dms/ss_dms_bufmgr.h" #include "securec_check.h" #include "miscadmin.h" @@ -155,7 +155,8 @@ void MarkReadHint(int buf_id, char persistence, bool extend, const XLogPhyBlock void ClearReadHint(int buf_id, bool buf_deleted) { dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_id); - buf_ctrl->state &= ~(BUF_NEED_LOAD | BUF_IS_LOADED | BUF_LOAD_FAILED | BUF_NEED_TRANSFER | BUF_IS_EXTEND); + buf_ctrl->state &= + ~(BUF_NEED_LOAD | BUF_IS_LOADED | BUF_LOAD_FAILED | BUF_NEED_TRANSFER | BUF_IS_EXTEND | BUF_DIRTY_NEED_FLUSH); if (buf_deleted) { buf_ctrl->state = 0; } @@ -184,6 +185,9 @@ Buffer TerminateReadPage(BufferDesc* buf_desc, ReadBufferMode read_mode, const X Buffer buffer; bool isExtend = (buf_ctrl->state & BUF_IS_EXTEND) ? 
true: false; if (buf_ctrl->state & BUF_NEED_LOAD) { + if (g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy && AmDmsReformProcProcess()) { + ereport(PANIC, (errmsg("SS In flush copy, can't read from disk!"))); + } buffer = ReadBuffer_common_for_dms(read_mode, buf_desc, pblk); } else { Block bufBlock = BufHdrGetBlock(buf_desc); @@ -221,7 +225,7 @@ Buffer TerminateReadPage(BufferDesc* buf_desc, ReadBufferMode read_mode, const X TerminateBufferIO(buf_desc, false, BM_VALID); buffer = BufferDescriptorGetBuffer(buf_desc); - if (!SSFAILOVER_TRIGGER && !RecoveryInProgress()) { + if (!SS_IN_FAILOVER && !RecoveryInProgress()) { CalcSegDmsPhysicalLoc(buf_desc, buffer); } } @@ -392,9 +396,25 @@ int32 CheckBuf4Rebuild(BufferDesc *buf_desc) dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); Assert(buf_ctrl != NULL); Assert(buf_ctrl->is_edp != 1); + Assert(XLogRecPtrIsValid(g_instance.dms_cxt.ckptRedo)); - if (buf_ctrl->lock_mode == (unsigned char)DMS_LOCK_NULL) { + XLogRecPtr pagelsn = BufferGetLSN(buf_desc); + if (buf_ctrl->lock_mode == (unsigned char)DMS_LOCK_NULL || XLByteLT(pagelsn, g_instance.dms_cxt.ckptRedo) || + IsSegmentBufferID(buf_desc->buf_id) || + (!SS_MY_INST_IS_MASTER && buf_ctrl->lock_mode == (unsigned char)DMS_LOCK_EXCLUSIVE)) { if (g_instance.dms_cxt.SSRecoveryInfo.in_failover) { + if (buf_ctrl->lock_mode != (unsigned char)DMS_LOCK_NULL) { + buf_ctrl->state = 0; + buf_ctrl->is_remote_dirty = 0; + buf_ctrl->lock_mode = (uint8)DMS_LOCK_NULL; + buf_ctrl->is_edp = 0; + buf_ctrl->force_request = 0; + buf_ctrl->edp_scn = 0; + buf_ctrl->edp_map = 0; + buf_ctrl->pblk_relno = InvalidOid; + buf_ctrl->pblk_blkno = InvalidBlockNumber; + buf_ctrl->pblk_lsn = InvalidXLogRecPtr; + } InvalidateBuffer(buf_desc); } return DMS_SUCCESS; @@ -405,7 +425,7 @@ int32 CheckBuf4Rebuild(BufferDesc *buf_desc) bool is_dirty = (buf_desc->state & (BM_DIRTY | BM_JUST_DIRTIED)) > 0 ? 
true : false; int ret = dms_buf_res_rebuild_drc(&dms_ctx, buf_ctrl, (unsigned long long)BufferGetLSN(buf_desc), is_dirty); if (ret != DMS_SUCCESS) { - ereport(DEBUG1, (errmsg("Failed to rebuild page, rel:%u/%u/%u/%d, forknum:%d, blocknum:%u.", + ereport(LOG, (errmsg("Failed to rebuild page, rel:%u/%u/%u/%d, forknum:%d, blocknum:%u.", buf_desc->tag.rnode.spcNode, buf_desc->tag.rnode.dbNode, buf_desc->tag.rnode.relNode, buf_desc->tag.rnode.bucketNode, buf_desc->tag.forkNum, buf_desc->tag.blockNum))); return ret; @@ -461,7 +481,7 @@ int SSLockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) int output_backup = t_thrd.postgres_cxt.whereToSendOutput; t_thrd.postgres_cxt.whereToSendOutput = DestNone; int ret = dms_broadcast_opengauss_ddllock(&dms_ctx, (char *)&ssmsg, sizeof(SSBroadcastDDLLock), - (unsigned char)false, SS_BROADCAST_WAIT_FIVE_SECONDS, (unsigned char)false); + (unsigned char)false, SS_BROADCAST_WAIT_FIVE_SECONDS, (unsigned char)LOCK_NORMAL_MODE); if (ret != DMS_SUCCESS) { ereport(WARNING, (errmsg("SS broadcast DDLLockRelease request failed!"))); } @@ -514,3 +534,42 @@ void SSLockAcquireAll() LWLockRelease(GetMainLWLockByIndex(FirstLockMgrLock + i)); } } + +void SSCheckBufferIfNeedMarkDirty(Buffer buf) +{ + dms_buf_ctrl_t* buf_ctrl = GetDmsBufCtrl(buf - 1); + if (buf_ctrl->state & BUF_DIRTY_NEED_FLUSH) { + MarkBufferDirty(buf); + } +} + +void SSRecheckBufferPool() +{ + uint32 buf_state; + for (int i = 0; i < TOTAL_BUFFER_NUM; i++) { + /* + * BUF_DIRTY_NEED_FLUSH was removed during mark buffer dirty and lsn_on_disk was set during sync buffer + * As BUF_DIRTY_NEED_FLUSH was set only if page lsn is bigger than ckpt redo, it should be removed at this time + * Unfortunately if it is not, mark it dirty again. For lsn_on_disk, if it is still invalid, this means it is + * not flushed. So if it is not dirty, invalidate it again. 
+ */ + BufferDesc *buf_desc = GetBufferDescriptor(i); + pg_memory_barrier(); + buf_state = pg_atomic_read_u32(&buf_desc->state); + if (!(buf_state & BM_VALID || buf_state & BM_TAG_VALID)) { + continue; + } + + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(i); + if (buf_ctrl->state & BUF_DIRTY_NEED_FLUSH) { + (void)PinBuffer(buf_desc, NULL); + LockBuffer(i + 1, BUFFER_LOCK_SHARE); + ereport(WARNING, + (errmsg("Buffer was not flushed or replayed, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u", + buf_desc->tag.rnode.spcNode, buf_desc->tag.rnode.dbNode, buf_desc->tag.rnode.relNode, + buf_desc->tag.rnode.bucketNode, buf_desc->tag.forkNum, buf_desc->tag.blockNum))); + MarkBufferDirty(i + 1); + UnlockReleaseBuffer(i + 1); + } + } +} diff --git a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp index 44ce00361..89d764a13 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp @@ -220,10 +220,6 @@ static int CBGetTxnCSN(void *db_handle, dms_opengauss_xid_csn_t *csn_req, dms_op static int CBGetSnapshotData(void *db_handle, dms_opengauss_txn_snapshot_t *txn_snapshot) { - if (SS_IN_REFORM) { - return DMS_ERROR; - } - if (RecoveryInProgress()) { return DMS_ERROR; } @@ -347,6 +343,7 @@ static int CBSwitchoverDemote(void *db_handle) DemoteModeDesc(demote_mode)))); return DMS_ERROR; } + ntries = 0; } CHECK_FOR_INTERRUPTS(); @@ -364,6 +361,11 @@ static int CBSwitchoverPromote(void *db_handle, unsigned char origPrimaryId) { g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTING; g_instance.dms_cxt.SSRecoveryInfo.new_primary_reset_walbuf_flag = true; + /* allow recovery in switchover to keep LSN in order */ + t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false; + t_thrd.shemem_ptr_cxt.XLogCtl->SharedRecoveryInProgress = true; + t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY; + pg_memory_barrier(); ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS 
switchover] Starting to promote standby."))); /* since original primary must have demoted, it is safe to allow promting standby write */ @@ -392,6 +394,7 @@ static int CBSwitchoverPromote(void *db_handle, unsigned char origPrimaryId) ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS switchover] Standby promote timeout, please try again later."))); } + ntries = 0; } CHECK_FOR_INTERRUPTS(); @@ -407,8 +410,12 @@ static void CBSwitchoverResult(void *db_handle, int result) ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS switchover] Switchover success, letting reformer update roles."))); return; + } else { + /* abort and restore state */ + g_instance.dms_cxt.SSReformInfo.in_reform = false; + g_instance.dms_cxt.SSClusterState = NODESTATE_NORMAL; + ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS switchover] Switchover failed, errno: %d.", result))); } - ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS switchover] Switchover failed, errno: %d.", result))); } static int SetPrimaryIdOnStandby(int primary_id) @@ -466,15 +473,6 @@ static int CBSaveStableList(void *db_handle, unsigned long long list_stable, uns } else { /* we are on standby */ ret = SetPrimaryIdOnStandby(primary_id); } - - /* SSClusterState and in_reform must be set atomically */ - g_instance.dms_cxt.SSClusterState = NODESTATE_NORMAL; - g_instance.dms_cxt.SSReformInfo.in_reform = false; - g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false; - ereport(LOG, - (errmodule(MOD_DMS), - errmsg("[SS reform/SS switchover/SS failover] Reform success, instance:%d is running.", - g_instance.attr.attr_storage.dms_attr.instance_id))); return ret; } @@ -525,7 +523,7 @@ static void DmsReleaseBuffer(int buffer, bool is_seg) } } -static int tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_t **buf_ctrl) +static void tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_t **buf_ctrl) { bool is_seg; int buf_id = -1; @@ -549,7 +547,6 @@ static int tryEnterLocalPage(BufferTag *tag, 
dms_lock_mode_t mode, dms_buf_ctrl_ hash = BufTableHashCode(tag); partition_lock = BufMappingPartitionLock(hash); - int buffer; uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount; PG_TRY(); { @@ -558,7 +555,6 @@ static int tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_ buf_id = BufTableLookup(tag, hash); if (buf_id < 0) { LWLockRelease(partition_lock); - buffer = 0; break; } @@ -581,7 +577,6 @@ static int tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_ tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum, tag->blockNum, buf_desc->state))); DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg); - buffer = 0; break; } @@ -591,18 +586,13 @@ static int tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_ tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum, tag->blockNum, buf_desc->state))); DmsReleaseBuffer(buf_desc->buf_id + 1, is_seg); - buffer = 0; break; } LWLockMode content_mode = (mode == DMS_LOCK_SHARE) ? 
LW_SHARED : LW_EXCLUSIVE; (void)LWLockAcquire(buf_desc->content_lock, content_mode); *buf_ctrl = GetDmsBufCtrl(buf_id); - (*buf_ctrl)->lsn_on_disk = BufferGetLSN(buf_desc); - (*buf_ctrl)->seg_fileno = buf_desc->seg_fileno; - (*buf_ctrl)->seg_blockno = buf_desc->seg_blockno; Assert(buf_id >= 0); - buffer = buf_id + 1; } while (0); } PG_CATCH(); @@ -611,34 +601,14 @@ static int tryEnterLocalPage(BufferTag *tag, dms_lock_mode_t mode, dms_buf_ctrl_ ReleaseResource(); } PG_END_TRY(); - - return buffer; -} - -static int CBTryEnterLocalPage(void *db_handle, char pageid[DMS_PAGEID_SIZE], dms_lock_mode_t mode, - dms_buf_ctrl_t **buf_ctrl) -{ - BufferTag *tag = (BufferTag *)pageid; - Buffer buffer = tryEnterLocalPage(tag, mode, buf_ctrl); - if (buffer <= 0) { - if (*buf_ctrl != NULL) { - ereport(PANIC, (errmsg("CBTryEnterLocalPage failed"))); - } - } else { - if (*buf_ctrl == NULL) { - ereport(PANIC, (errmsg("CBTryEnterLocalPage failed"))); - } - } - - return DMS_SUCCESS; } static int CBEnterLocalPage(void *db_handle, char pageid[DMS_PAGEID_SIZE], dms_lock_mode_t mode, dms_buf_ctrl_t **buf_ctrl) { BufferTag *tag = (BufferTag *)pageid; - Buffer buffer = tryEnterLocalPage(tag, mode, buf_ctrl); - return (buffer > 0) ? 
DMS_SUCCESS : DMS_ERROR; + tryEnterLocalPage(tag, mode, buf_ctrl); + return DMS_SUCCESS; } static unsigned char CBPageDirty(dms_buf_ctrl_t *buf_ctrl) @@ -648,7 +618,8 @@ static unsigned char CBPageDirty(dms_buf_ctrl_t *buf_ctrl) return 0; } BufferDesc *buf_desc = GetBufferDescriptor(buf_ctrl->buf_id); - return pg_atomic_read_u32(&buf_desc->state) & (BM_DIRTY | BM_JUST_DIRTIED); + bool is_dirty = (pg_atomic_read_u32(&buf_desc->state) & (BM_DIRTY | BM_JUST_DIRTIED)) > 0; + return (unsigned char)is_dirty; } static void CBLeaveLocalPage(void *db_handle, dms_buf_ctrl_t *buf_ctrl) @@ -675,7 +646,7 @@ static char* CBGetPage(dms_buf_ctrl_t *buf_ctrl) return (char *)BufHdrGetBlock(buf_desc); } -static void CBInvalidatePage(void *db_handle, char pageid[DMS_PAGEID_SIZE]) +static int CBInvalidatePage(void *db_handle, char pageid[DMS_PAGEID_SIZE], unsigned long long ver) { bool valid = false; int buf_id; @@ -684,6 +655,7 @@ static void CBInvalidatePage(void *db_handle, char pageid[DMS_PAGEID_SIZE]) LWLock *partition_lock = NULL; BufferDesc *buf_desc = NULL; dms_buf_ctrl_t *buf_ctrl = NULL; + int ret = DMS_SUCCESS; hash = BufTableHashCode(tag); partition_lock = BufMappingPartitionLock(hash); @@ -692,7 +664,7 @@ static void CBInvalidatePage(void *db_handle, char pageid[DMS_PAGEID_SIZE]) if (buf_id < 0) { /* not found in shared buffer */ LWLockRelease(partition_lock); - return; + return ret; } uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount; @@ -710,7 +682,13 @@ static void CBInvalidatePage(void *db_handle, char pageid[DMS_PAGEID_SIZE]) if (valid) { (void)LWLockAcquire(buf_desc->content_lock, LW_EXCLUSIVE); buf_ctrl = GetDmsBufCtrl(buf_id); - buf_ctrl->lock_mode = (unsigned char)DMS_LOCK_NULL; + if (ver >= buf_ctrl->ver) { + buf_ctrl->lock_mode = (unsigned char)DMS_LOCK_NULL; + } else { + ereport(WARNING, (errmodule(MOD_DMS), + errmsg("[CBInvalidatePage] invalid ver:%llu, buf_ctrl ver:%llu", ver, buf_ctrl->ver))); + ret = DMS_ERROR; + } 
LWLockRelease(buf_desc->content_lock); } if (IsSegmentBufferID(buf_id)) { @@ -725,14 +703,16 @@ static void CBInvalidatePage(void *db_handle, char pageid[DMS_PAGEID_SIZE]) ReleaseResource(); } PG_END_TRY(); + return ret; } -static void CBXLogFlush(void *db_handle, unsigned long long *lsn) +static int CBXLogFlush(void *db_handle, unsigned long long *lsn) { (void)LWLockAcquire(WALWriteLock, LW_EXCLUSIVE); (void)XLogBackgroundFlush(); *lsn = GetFlushRecPtr(); LWLockRelease(WALWriteLock); + return GS_SUCCESS; } static char *CBDisplayBufferTag(char *displayBuf, unsigned int count, char *pageid) @@ -1034,6 +1014,10 @@ static void CBSetDmsStatus(void *db_handle, int dms_status) static int32 CBDrcBufRebuild(void *db_handle) { + /* Load Control File */ + int src_id = SSGetPrimaryInstId(); + SSReadControlFile(src_id, true); + uint32 buf_state; for (int i = 0; i < TOTAL_BUFFER_NUM; i++) { BufferDesc *buf_desc = GetBufferDescriptor(i); @@ -1148,7 +1132,7 @@ static int CBConfirmOwner(void *db_handle, char *pageid, unsigned char *lock_mod } static int CBConfirmConverting(void *db_handle, char *pageid, unsigned char smon_chk, - unsigned char *lock_mode, unsigned long long *edp_map, unsigned long long *lsn) + unsigned char *lock_mode, unsigned long long *edp_map, unsigned long long *lsn, unsigned long long *ver) { BufferDesc *buf_desc = NULL; bool valid; @@ -1186,6 +1170,7 @@ static int CBConfirmConverting(void *db_handle, char *pageid, unsigned char smon bool is_locked = LWLockConditionalAcquire(buf_desc->content_lock, LW_EXCLUSIVE); if (is_locked) { buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); + *ver = buf_ctrl->ver; *lock_mode = buf_ctrl->lock_mode; LWLockRelease(buf_desc->content_lock); break; @@ -1205,6 +1190,7 @@ static int CBConfirmConverting(void *db_handle, char *pageid, unsigned char smon // without lock buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); + *ver = buf_ctrl->ver; *lock_mode = buf_ctrl->lock_mode; SSUnPinBuffer(buf_desc); @@ -1244,6 +1230,7 @@ static int 
CBRecoveryPrimary(void *db_handle, int inst_id) Assert(g_instance.dms_cxt.SSReformerControl.primaryInstId == inst_id || g_instance.dms_cxt.SSReformerControl.primaryInstId == -1); g_instance.dms_cxt.SSRecoveryInfo.skip_redo_replay = false; + g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy = false; ereport(LOG, (errmsg("[SS reform] Recovery as primary, will replay xlog from inst:%d", g_instance.dms_cxt.SSReformerControl.primaryInstId))); @@ -1257,12 +1244,8 @@ static int CBRecoveryPrimary(void *db_handle, int inst_id) static int CBFlushCopy(void *db_handle, char *pageid) { - /* - * 1. request page from remote - * 2. mark page need flush - */ - while (!g_instance.dms_cxt.SSRecoveryInfo.reclsn_updated) { - pg_usleep(100L); /* sleep 0.1ms */ + if (SS_REFORM_REFORMER && !g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy) { + g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy = true; } BufferTag* tag = (BufferTag*)pageid; @@ -1281,9 +1264,13 @@ static int CBFlushCopy(void *db_handle, char *pageid) PG_CATCH(); { t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount; - if (t_thrd.role == DMS_WORKER) { - FlushErrorState(); - } + /* Save error info */ + ErrorData* edata = CopyErrorData(); + FlushErrorState(); + FreeErrorData(edata); + ereport(PANIC, (errmsg("[SS Flush Copy] Error happend, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u", + tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode, + tag->forkNum, tag->blockNum))); } PG_END_TRY(); @@ -1299,12 +1286,18 @@ static int CBFlushCopy(void *db_handle, char *pageid) } } - LockBuffer(buffer, BUFFER_LOCK_SHARE); ereport(LOG, (errmsg("[SS] ready to flush copy, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum, tag->blockNum))); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + Assert(XLogRecPtrIsValid(g_instance.dms_cxt.ckptRedo)); + BufferDesc* buf_desc = GetBufferDescriptor(buffer - 1); + 
XLogRecPtr pagelsn = BufferGetLSN(buf_desc); + if (XLByteLT(g_instance.dms_cxt.ckptRedo, pagelsn) && !IsSegmentPhysicalRelNode(tag->rnode)) { + dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buffer - 1); + buf_ctrl->state |= BUF_DIRTY_NEED_FLUSH; + } + + ReleaseBuffer(buffer); return GS_SUCCESS; } @@ -1331,14 +1324,20 @@ static int CBGetDBPrimaryId(void *db_handle, unsigned int *primary_id) return GS_SUCCESS; } -static void CBReformStartNotify(void *db_handle, dms_role_t role) +static void CBReformStartNotify(void *db_handle, dms_role_t role, unsigned char reform_type) { + SSReformType ss_reform_type = (SSReformType)reform_type; ss_reform_info_t *reform_info = &g_instance.dms_cxt.SSReformInfo; reform_info->dms_role = role; reform_info->in_reform = true; g_instance.dms_cxt.SSClusterState = NODESTATE_NORMAL; g_instance.dms_cxt.SSRecoveryInfo.reform_ready = false; - ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] dms reform start, role:%d", role))); + g_instance.dms_cxt.resetSyscache = true; + if (ss_reform_type == DMS_REFORM_TYPE_FOR_FAILOVER_OPENGAUSS) { + g_instance.dms_cxt.SSRecoveryInfo.in_failover = true; + } + ereport(LOG, (errmodule(MOD_DMS), + errmsg("[SS reform] dms reform start, role:%d, reform type:%d", role, (int)ss_reform_type))); if (reform_info->dms_role == DMS_ROLE_REFORMER) { if (dss_set_server_status_wrapper(true) != GS_SUCCESS) { ereport(PANIC, (errmodule(MOD_DMS), errmsg("[SS reform] Could not set dssserver flag=read_write"))); @@ -1346,7 +1345,6 @@ static void CBReformStartNotify(void *db_handle, dms_role_t role) if (!SS_MY_INST_IS_MASTER) { // means failover g_instance.dms_cxt.SSRecoveryInfo.reclsn_updated = false; - g_instance.dms_cxt.SSRecoveryInfo.in_failover = true; } } else { if (dss_set_server_status_wrapper(false) != GS_SUCCESS) { @@ -1361,28 +1359,37 @@ static void CBReformStartNotify(void *db_handle, dms_role_t role) while (true) { if (dms_reform_failed()) { - ereport(WARNING, (errmodule(MOD_DMS), errmsg("reform failed during caneling 
backends"))); + ereport(WARNING, (errmodule(MOD_DMS), errmsg("[SS reform]reform failed during caneling backends"))); return; } if (g_instance.dms_cxt.SSRecoveryInfo.reform_ready || g_instance.dms_cxt.SSRecoveryInfo.startup_reform) { - ereport(LOG, (errmodule(MOD_DMS), errmsg("reform ready, backends have been terminated"))); + ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform]reform ready, backends have been terminated"))); return; } pg_usleep(REFORM_WAIT_TIME); } } -static int CBSetBufInfo(dms_buf_ctrl_t* buf_ctrl) +static int CBReformDoneNotify(void *db_handle) { - Assert(buf_ctrl->buf_id < TOTAL_BUFFER_NUM); - if (buf_ctrl->buf_id >= TOTAL_BUFFER_NUM) { - return DMS_ERROR; + /* SSClusterState and in_reform must be set atomically */ + g_instance.dms_cxt.SSClusterState = NODESTATE_NORMAL; + g_instance.dms_cxt.SSReformInfo.in_reform = false; + if (g_instance.dms_cxt.SSRecoveryInfo.in_failover) { + g_instance.dms_cxt.SSRecoveryInfo.in_failover = false; } + g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false; + g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false; + ereport(LOG, + (errmodule(MOD_DMS), + errmsg("[SS reform/SS switchover/SS failover] Reform success, instance:%d is running.", + g_instance.attr.attr_storage.dms_attr.instance_id))); + return GS_SUCCESS; +} - BufferDesc *buf_desc = GetBufferDescriptor(buf_ctrl->buf_id); - buf_desc->lsn_on_disk = buf_ctrl->lsn_on_disk; - buf_desc->seg_fileno = buf_ctrl->seg_fileno; - buf_desc->seg_blockno = buf_ctrl->seg_blockno; +static int CBXLogWaitFlush(void *db_handle, unsigned long long lsn) +{ + XLogWaitFlush(lsn); return GS_SUCCESS; } @@ -1454,11 +1461,9 @@ void DmsInitCallback(dms_callback_t *callback) callback->get_db_primary_id = CBGetDBPrimaryId; callback->failover_promote_opengauss = CBFailoverPromote; callback->reform_start_notify = CBReformStartNotify; - callback->set_buf_info = CBSetBufInfo; callback->get_page_hash_val = CBPageHashCode; callback->read_local_page4transfer = CBEnterLocalPage; 
- callback->try_read_local_page = CBTryEnterLocalPage; callback->leave_local_page = CBLeaveLocalPage; callback->page_is_dirty = CBPageDirty; callback->get_page = CBGetPage; @@ -1491,4 +1496,6 @@ void DmsInitCallback(dms_callback_t *callback) callback->set_switchover_result = CBSwitchoverResult; callback->set_db_standby = CBSetDbStandby; callback->db_is_primary = CBDbIsPrimary; + callback->reform_done_notify = CBReformDoneNotify; + callback->log_wait_flush = CBXLogWaitFlush; } diff --git a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp index 00f43ee8a..2a4793a60 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp @@ -60,6 +60,12 @@ void SSSavePrimaryInstId(int id) */ void SSWakeupRecovery(void) { + uint32 thread_num = (uint32)g_instance.ckpt_cxt_ctl->pgwr_procs.num; + /* need make sure pagewriter started first */ + while (pg_atomic_read_u32(&g_instance.ckpt_cxt_ctl->current_page_writer_count) != thread_num) { + pg_usleep(REFORM_WAIT_TIME); + } + g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag = false; } @@ -107,7 +113,7 @@ bool SSRecoveryApplyDelay(const XLogReaderState *record) return true; } -void SSReadControlFile(int id) +void SSReadControlFile(int id, bool updateDmsCtx) { pg_crc32c crc; errno_t rc = EOK; @@ -161,7 +167,15 @@ loop: } } } else { - rc = memcpy_s(t_thrd.shemem_ptr_cxt.ControlFile, (size_t)len, buffer, (size_t)len); + ControlFileData* controlFile = NULL; + ControlFileData tempControlFile; + if (updateDmsCtx) { + controlFile = &tempControlFile; + } else { + controlFile = t_thrd.shemem_ptr_cxt.ControlFile; + } + + rc = memcpy_s(controlFile, (size_t)len, buffer, (size_t)len); securec_check(rc, "", ""); if (close(fd) < 0) { ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m"))); @@ -169,10 +183,10 @@ loop: /* Now check the CRC. 
*/ INIT_CRC32C(crc); - COMP_CRC32C(crc, (char *)t_thrd.shemem_ptr_cxt.ControlFile, offsetof(ControlFileData, crc)); + COMP_CRC32C(crc, (char *)controlFile, offsetof(ControlFileData, crc)); FIN_CRC32C(crc); - if (!EQ_CRC32C(crc, t_thrd.shemem_ptr_cxt.ControlFile->crc)) { + if (!EQ_CRC32C(crc, controlFile->crc)) { if (retry == false) { ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname))); fname = XLOG_CONTROL_FILE_BAK; @@ -182,6 +196,10 @@ loop: ereport(FATAL, (errmsg("incorrect checksum in control file"))); } } + + if (XLByteLE(g_instance.dms_cxt.ckptRedo, controlFile->checkPointCopy.redo)) { + g_instance.dms_cxt.ckptRedo = controlFile->checkPointCopy.redo; + } } } @@ -293,15 +311,18 @@ void ss_failover_dw_init_internal() dw_exit(false); } + dw_exit(true); + dw_exit(false); ss_initdwsubdir(dssdir, old_primary_id); dw_ext_init(); dw_init(); + g_instance.dms_cxt.finishedRecoverOldPrimaryDWFile = true; dw_exit(true); dw_exit(false); - ss_initdwsubdir(dssdir, self_id); dw_ext_init(); dw_init(); + g_instance.dms_cxt.finishedRecoverOldPrimaryDWFile = false; ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS failover] dw init finish"))); } @@ -313,7 +334,6 @@ void ss_failover_dw_init() } } ckpt_shutdown_pagewriter(); - + g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy = false; ss_failover_dw_init_internal(); } - diff --git a/src/gausskernel/ddes/adapter/ss_init.cpp b/src/gausskernel/ddes/adapter/ss_init.cpp index 82f9b9bb9..4a474cbdc 100644 --- a/src/gausskernel/ddes/adapter/ss_init.cpp +++ b/src/gausskernel/ddes/adapter/ss_init.cpp @@ -437,7 +437,6 @@ static bool DMSReformCheckStartup() } if (g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag) { - g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false; SSRestartFailoverPromote(); return true; } diff --git a/src/gausskernel/ddes/adapter/ss_reform_common.cpp b/src/gausskernel/ddes/adapter/ss_reform_common.cpp index 3b5e5293f..33f3cbd4e 100644 --- 
a/src/gausskernel/ddes/adapter/ss_reform_common.cpp +++ b/src/gausskernel/ddes/adapter/ss_reform_common.cpp @@ -160,25 +160,17 @@ static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int /* Read the requested page */ t_thrd.xlog_cxt.readOff = targetPageOff; -try_again: - ssize_t actualBytes = pread(t_thrd.xlog_cxt.readFile, buf, XLOG_BLCKSZ, t_thrd.xlog_cxt.readOff); - if (actualBytes != XLOG_BLCKSZ) { + bool ret = SSReadXlogInternal(xlogreader, targetPagePtr, buf); + if (!ret) { ereport(LOG, (errcode_for_file_access(), errmsg("read xlog(start:%X/%X, pos:%u len:%d) failed : %m", static_cast(targetPagePtr >> BIT_NUM_INT32), static_cast(targetPagePtr), targetPageOff, expectReadLen))); - ereport(emode_for_corrupt_record(emode, targetPagePtr), (errcode_for_file_access(), errmsg("could not read from log file %s to offset %u: %m", XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(REFORM_WAIT_TIME); - goto try_again; - } - goto next_record_is_invalid; } @@ -211,6 +203,61 @@ int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int re return read_len; } +bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, char *buf) +{ + uint32 preReadOff; + + do { + if (XLByteInPreReadBuf(targetPagePtr, xlogreader->preReadStartPtr)) { + preReadOff = targetPagePtr % XLogPreReadSize; + int err = memcpy_s(buf, XLOG_BLCKSZ, xlogreader->preReadBuf + preReadOff, XLOG_BLCKSZ); + securec_check(err, "\0", "\0"); + break; + } else { + // pre-reading for dss + uint32 targetPageOff = targetPagePtr % XLogSegSize; + preReadOff = targetPageOff - targetPageOff % XLogPreReadSize; + ssize_t actualBytes = pread(t_thrd.xlog_cxt.readFile, xlogreader->preReadBuf, XLogPreReadSize, preReadOff); + if (actualBytes != XLogPreReadSize) { + return false; + } + xlogreader->preReadStartPtr = targetPagePtr + preReadOff - targetPageOff; + } + } while 
(true); + + return true; +} + +XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data, Size alignedSize) +{ + XLogReaderState *state = XLogReaderAllocate(pagereadfunc, private_data, alignedSize); + if (state != NULL) { + state->preReadStartPtr = InvalidXlogPreReadStartPtr; + state->preReadBufOrigin = (char *)palloc_extended(XLogPreReadSize + alignedSize, + MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (state->preReadBufOrigin == NULL) { + pfree(state->errormsg_buf); + state->errormsg_buf = NULL; + pfree(state->readBufOrigin); + state->readBufOrigin = NULL; + state->readBuf = NULL; + pfree(state->readRecordBuf); + state->readRecordBuf = NULL; + pfree(state); + state = NULL; + return NULL; + } + + if (alignedSize == 0) { + state->preReadBuf = state->preReadBufOrigin; + } else { + state->preReadBuf = (char *)TYPEALIGN(alignedSize, state->preReadBufOrigin); + } + } + + return state; +} + void SSGetXlogPath() { int primaryId = -1; diff --git a/src/gausskernel/ddes/adapter/ss_switchover.cpp b/src/gausskernel/ddes/adapter/ss_switchover.cpp index 6e712f7f1..9c6d2867e 100644 --- a/src/gausskernel/ddes/adapter/ss_switchover.cpp +++ b/src/gausskernel/ddes/adapter/ss_switchover.cpp @@ -72,12 +72,6 @@ void SSHandleSwitchoverPromote() ereport(LOG, (errmsg("[SS switchover] Standby promote: begin StartupThread."))); Assert(g_instance.dms_cxt.SSReformerControl.primaryInstId != SS_MY_INST_ID); - /* allow recovery in switchover to keep LSN in order */ - t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false; - t_thrd.shemem_ptr_cxt.XLogCtl->SharedRecoveryInProgress = true; - t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_CRASH_RECOVERY; - pg_memory_barrier(); - /* let StartupXLOG do the rest of switchover standby promotion */ if (pmState == PM_WAIT_BACKENDS) { g_instance.pid_cxt.StartupPID = initialize_util_thread(STARTUP); diff --git a/src/gausskernel/ddes/adapter/ss_transaction.cpp b/src/gausskernel/ddes/adapter/ss_transaction.cpp index 
eda70944c..a7551e94b 100644 --- a/src/gausskernel/ddes/adapter/ss_transaction.cpp +++ b/src/gausskernel/ddes/adapter/ss_transaction.cpp @@ -35,13 +35,25 @@ Snapshot SSGetSnapshotData(Snapshot snapshot) dms_opengauss_txn_snapshot_t dms_snapshot; dms_context_t dms_ctx; InitDmsContext(&dms_ctx); - dms_ctx.xmap_ctx.dest_id = (unsigned int)SS_MASTER_ID; - - if (dms_request_opengauss_txn_snapshot(&dms_ctx, &dms_snapshot) != DMS_SUCCESS) { - ereport(ERROR, (errmsg("failed to request snapshot from master through dms"))); + if (SS_IN_REFORM) { + ereport(WARNING, (errmsg("[SS reform] SSGetSnapshotData returns NULL in reform."))); return NULL; } + do { + dms_ctx.xmap_ctx.dest_id = (unsigned int)SS_MASTER_ID; + if (dms_request_opengauss_txn_snapshot(&dms_ctx, &dms_snapshot) == DMS_SUCCESS) { + break; + } + + if (SS_IN_REFORM) { + ereport(WARNING, (errmsg("[SS reform] SSGetSnapshotData returns NULL in reform."))); + return NULL; + } + pg_usleep(USECS_PER_SEC); + + } while (true); + snapshot->xmin = dms_snapshot.xmin; snapshot->xmax = dms_snapshot.xmax; snapshot->snapshotcsn = dms_snapshot.snapshotcsn; @@ -105,28 +117,26 @@ CommitSeqNo SSTransactionIdGetCommitSeqNo(TransactionId transactionId, bool isCo dms_txn_info.snapshotxmin = InvalidTransactionId; } - if (SSTransactionIdGetCSN(&dms_txn_info, &xid_csn_result) == DMS_SUCCESS) { - csn = xid_csn_result.csn; - clogstatus = (int)xid_csn_result.clogstatus; - lsn = xid_csn_result.lsn; - if (sync != NULL && (bool)xid_csn_result.sync) { - *sync = (bool)xid_csn_result.sync; - ereport(DEBUG1, (errmsg("SS primary xid sync success, xid=%lu.", transactionId))); - } - if (snapshot != NULL) { - ereport(DEBUG1, (errmsg("SS get txn info success, xid=%lu, snapshot=%lu-%lu-%lu, csn=%lu.", transactionId, - snapshot->xmin, snapshot->xmax, snapshot->snapshotcsn, csn))); + do { + if (SSTransactionIdGetCSN(&dms_txn_info, &xid_csn_result) == DMS_SUCCESS) { + csn = xid_csn_result.csn; + clogstatus = (int)xid_csn_result.clogstatus; + lsn = 
xid_csn_result.lsn; + if (sync != NULL && (bool)xid_csn_result.sync) { + *sync = (bool)xid_csn_result.sync; + ereport(DEBUG1, (errmsg("SS primary xid sync success, xid=%lu.", transactionId))); + } + if (snapshot != NULL) { + ereport(DEBUG1, (errmsg("SS get txn info success, xid=%lu, snapshot=%lu-%lu-%lu, csn=%lu.", transactionId, + snapshot->xmin, snapshot->xmax, snapshot->snapshotcsn, csn))); + } else { + ereport(DEBUG1, (errmsg("SS get txn info success, snapshot is NULL"))); + } } else { - ereport(DEBUG1, (errmsg("SS get txn info success, snapshot is NULL"))); + pg_usleep(USECS_PER_SEC); + continue; } - } else { - if (snapshot != NULL) { - ereport(ERROR, (errmsg("SS get txn info failed, xid=%lu, snapshot=%lu-%lu-%lu.", transactionId, - snapshot->xmin, snapshot->xmax, snapshot->snapshotcsn))); - } else { - ereport(ERROR, (errmsg("SS get txn info failed, snapshot is NULL"))); - } - } + } while (true); if (COMMITSEQNO_IS_COMMITTED(csn) || COMMITSEQNO_IS_ABORTED(csn)) { t_thrd.xact_cxt.cachedFetchCSNXid = transactionId; @@ -174,12 +184,19 @@ bool SSTransactionIdDidCommit(TransactionId transactionId) dms_ctx.xid_ctx.xid = *(uint64 *)(&transactionId); dms_ctx.xid_ctx.inst_id = (unsigned char)SS_MASTER_ID; - if (dms_request_opengauss_txn_status(&dms_ctx, (uint8)XID_COMMITTED, (uint8 *)&did_commit) != DMS_SUCCESS) { - ereport(FATAL, (errmsg("SS get txn did_commit failed, xid=%lu.", transactionId))); - } - remote_get = true; - ereport(DEBUG1, - (errmsg("SS get txn did_commit success, xid=%lu, did_commit=%d.", transactionId, did_commit))); + do { + if (dms_request_opengauss_txn_status(&dms_ctx, (uint8)XID_COMMITTED, (uint8 *)&did_commit) + == DMS_SUCCESS) { + remote_get = true; + ereport(DEBUG1, + (errmsg("SS get txn did_commit success, xid=%lu, did_commit=%d.", + transactionId, did_commit))); + break; + } else { + pg_usleep(USECS_PER_SEC); + continue; + } + } while (true); } if (did_commit && remote_get) { @@ -203,10 +220,16 @@ bool 
SSTransactionIdIsInProgress(TransactionId transactionId) dms_ctx.xid_ctx.xid = *(uint64 *)(&transactionId); dms_ctx.xid_ctx.inst_id = (unsigned char)SS_MASTER_ID; - if (dms_request_opengauss_txn_status(&dms_ctx, (uint8)XID_INPROGRESS, (uint8 *)&in_progress) != DMS_SUCCESS) { - ereport(ERROR, (errmsg("SS get txn in_progress failed, xid=%lu.", transactionId))); - } - ereport(DEBUG1, (errmsg("SS get txn in_progress success, xid=%lu, in_progress=%d.", transactionId, in_progress))); + do { + if (dms_request_opengauss_txn_status(&dms_ctx, (uint8)XID_INPROGRESS, (uint8 *)&in_progress) == DMS_SUCCESS) { + ereport(DEBUG1, (errmsg("SS get txn in_progress success, xid=%lu, in_progress=%d.", + transactionId, in_progress))); + break; + } else { + pg_usleep(USECS_PER_SEC); + continue; + } + } while (true); return in_progress; } @@ -219,14 +242,17 @@ TransactionId SSMultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask, u dms_ctx.xid_ctx.xid = *(uint64 *)(&xmax); dms_ctx.xid_ctx.inst_id = (unsigned char)SS_MASTER_ID; - int ret = dms_request_opengauss_update_xid(&dms_ctx, t_infomask, t_infomask2, (unsigned long long *)&update_xid); - if (ret != DMS_SUCCESS) { - update_xid = InvalidTransactionId; - ereport(WARNING, - (errmsg("SS get update xid failed, multixact xid=%lu.", xmax))); - } + do { + if (dms_request_opengauss_update_xid(&dms_ctx, t_infomask, t_infomask2, (unsigned long long *)&update_xid) + == DMS_SUCCESS) { + ereport(DEBUG1, (errmsg("SS get update xid success, multixact xid=%lu, uxid=%lu.", xmax, update_xid))); + break; + } else { + pg_usleep(USECS_PER_SEC); + continue; + } + } while (true); - ereport(DEBUG1, (errmsg("SS get update xid success, multixact xid=%lu, uxid=%lu.", xmax, update_xid))); return update_xid; } diff --git a/src/gausskernel/ddes/script/dms_contrl.sh b/src/gausskernel/ddes/script/dms_contrl.sh index 8028d4a0e..be2950697 100644 --- a/src/gausskernel/ddes/script/dms_contrl.sh +++ b/src/gausskernel/ddes/script/dms_contrl.sh @@ -173,7 +173,7 @@ 
function Start() if [[ -z ${pid} ]] then log "dssserver not exist in dir ${DSS_HOME}..." - exit 1 + exit 6 else log "Starting dn..." nohup ${GSDB_BIN} -D ${GSDB_HOME} >> $db_start_log 2>&1 & diff --git a/src/gausskernel/process/postmaster/checkpointer.cpp b/src/gausskernel/process/postmaster/checkpointer.cpp index e2acbda7e..b6afd0495 100755 --- a/src/gausskernel/process/postmaster/checkpointer.cpp +++ b/src/gausskernel/process/postmaster/checkpointer.cpp @@ -813,7 +813,7 @@ static bool IsCheckpointOnSchedule(double progress) if (!RecoveryInProgress()) { recptr = GetInsertRecPtr(); elapsed_xlogs = (((double)(recptr - t_thrd.checkpoint_cxt.ckpt_start_recptr)) / XLogSegSize) / - u_sess->attr.attr_storage.CheckPointSegments; + XLogSegmentsNum(u_sess->attr.attr_storage.CheckPointSegments); if (progress < elapsed_xlogs) { t_thrd.checkpoint_cxt.ckpt_cached_elapsed = elapsed_xlogs; diff --git a/src/gausskernel/process/postmaster/pagewriter.cpp b/src/gausskernel/process/postmaster/pagewriter.cpp index a5c0c7053..75d8428ad 100755 --- a/src/gausskernel/process/postmaster/pagewriter.cpp +++ b/src/gausskernel/process/postmaster/pagewriter.cpp @@ -33,10 +33,12 @@ #include "storage/buf/bufmgr.h" #include "storage/ipc.h" #include "storage/smgr/smgr.h" +#include "storage/smgr/segment.h" #include "storage/pmsignal.h" #include "storage/standby.h" #include "access/double_write.h" #include "access/xlog.h" +#include "utils/aiomem.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner.h" @@ -44,6 +46,7 @@ #include "gssignal/gs_signal.h" #include "gstrace/gstrace_infra.h" #include "gstrace/postmaster_gstrace.h" +#include "ddes/dms/ss_dms_bufmgr.h" #include @@ -313,6 +316,16 @@ void incre_ckpt_pagewriter_cxt_init() pgwr->dirty_buf_list = (CkptSortItem *)palloc0(dirty_list_size * sizeof(CkptSortItem)); } + if (ENABLE_DMS) { + /* initialize aio block buffer */ + for (int i = 1; i < thread_num; i++) { + PageWriterProc *pgwr = 
&g_instance.ckpt_cxt_ctl->pgwr_procs.writer_proc[i]; + /* 2M AIO buffer */ + char *unaligned_buf = (char *)palloc0(DSS_AIO_BATCH_SIZE * DSS_AIO_UTIL_NUM * BLCKSZ + BLCKSZ); + pgwr->aio_buf = (char *)TYPEALIGN(BLCKSZ, unaligned_buf); + } + } + init_candidate_list(); (void)MemoryContextSwitchTo(oldcontext); } @@ -1357,6 +1370,11 @@ static void ckpt_pagewriter_sub_thread_loop() (errmodule(MOD_INCRE_CKPT), errmsg("pagewriter thread shut down, id is %d", t_thrd.pagewriter_cxt.pagewriter_id))); + if (ENABLE_DMS) { + PageWriterProc *pgwr = &g_instance.ckpt_cxt_ctl->pgwr_procs.writer_proc[thread_id]; + DSSAioDestroy(&pgwr->aio_cxt); + } + /* * From here on, elog(ERROR) should end with exit(1), not send control back to * the sigsetjmp block above @@ -1617,6 +1635,21 @@ void crps_destory_ctxs() } } +static void incre_ckpt_aio_callback(struct io_event *event) +{ + BufferDesc *buf_desc = (BufferDesc *)(event->data); + uint32 written_size = event->obj->u.c.nbytes; + if (written_size != event->res) { + ereport(PANIC, (errmsg("aio write failed, buffer: %d/%d/%d/%d/%d %d-%d", + buf_desc->tag.rnode.spcNode, buf_desc->tag.rnode.dbNode, + buf_desc->tag.rnode.relNode, (int32)buf_desc->tag.rnode.bucketNode, + (int32)buf_desc->tag.rnode.opt, buf_desc->tag.forkNum, buf_desc->tag.blockNum))); + } + + buf_desc->aio_in_progress = false; + UnpinBuffer(buf_desc, true); +} + void ckpt_pagewriter_main(void) { sigjmp_buf localSigjmpBuf; @@ -1658,6 +1691,12 @@ void ckpt_pagewriter_main(void) (void)MemoryContextSwitchTo(pagewriter_context); on_shmem_exit(pagewriter_kill, (Datum)0); + /* initialize AIO context */ + if (ENABLE_DMS && t_thrd.pagewriter_cxt.pagewriter_id != 0) { + PageWriterProc *pgwr = &g_instance.ckpt_cxt_ctl->pgwr_procs.writer_proc[t_thrd.pagewriter_cxt.pagewriter_id]; + DSSAioInitialize(&pgwr->aio_cxt, incre_ckpt_aio_callback); + } + /* * If an exception is encountered, processing resumes here. 
* @@ -1665,6 +1704,11 @@ void ckpt_pagewriter_main(void) */ if (sigsetjmp(localSigjmpBuf, 1) != 0) { ereport(WARNING, (errmodule(MOD_INCRE_CKPT), errmsg("pagewriter exception occured."))); + if (ENABLE_DMS && t_thrd.pagewriter_cxt.pagewriter_id != 0) { + int thread_id = t_thrd.pagewriter_cxt.pagewriter_id; + PageWriterProc *pgwr = &g_instance.ckpt_cxt_ctl->pgwr_procs.writer_proc[thread_id]; + DSSAioFlush(&pgwr->aio_cxt); + } ckpt_pagewriter_handle_exception(pagewriter_context); } @@ -1953,6 +1997,9 @@ static uint32 incre_ckpt_pgwr_flush_dirty_page(WritebackContext *wb_context, uint32 sync_state; BufferDesc *buf_desc = NULL; int buf_id; + int thread_id = t_thrd.pagewriter_cxt.pagewriter_id; + PageWriterProc *pgwr = &g_instance.ckpt_cxt_ctl->pgwr_procs.writer_proc[thread_id]; + DSSAioCxt *aio_cxt = &pgwr->aio_cxt; for (int i = start; i < start + batch_num; i++) { buf_id = dirty_buf_list[i].buf_id; @@ -1960,6 +2007,9 @@ static uint32 incre_ckpt_pgwr_flush_dirty_page(WritebackContext *wb_context, continue; } + /* Make sure we will have room to remember the buffer pin */ + ResourceOwnerEnlargeBuffers(t_thrd.utils_cxt.CurrentResourceOwner); + buf_desc = GetBufferDescriptor(buf_id); buf_state = LockBufHdr(buf_desc); if ((buf_state & BM_CHECKPOINT_NEEDED) && (buf_state & BM_DIRTY)) { @@ -1973,6 +2023,11 @@ static uint32 incre_ckpt_pgwr_flush_dirty_page(WritebackContext *wb_context, UnlockBufHdr(buf_desc, buf_state); } } + + if (ENABLE_DMS) { + DSSAioFlush(aio_cxt); + } + return num_actual_flush; } @@ -1994,20 +2049,28 @@ static void incre_ckpt_pgwr_flush_dirty_queue(WritebackContext *wb_context) ResourceOwnerEnlargeBuffers(t_thrd.utils_cxt.CurrentResourceOwner); - /* Double write can only handle at most DW_DIRTY_PAGE_MAX at one time. */ - for (int i = 0; i < runs; i++) { - /* Last batch, take the rest of the buffers */ - int offset = i * dw_batch_page_max; - int batch_num = (i == runs - 1) ? 
(need_flush_num - offset) : dw_batch_page_max; - uint32 flush_num; - + if (ENABLE_DMS) { pgwr->thrd_dw_cxt.is_new_relfilenode = is_new_relfilenode; pgwr->thrd_dw_cxt.dw_page_idx = -1; - dw_perform_batch_flush(batch_num, dirty_buf_list + offset, thread_id, &pgwr->thrd_dw_cxt); - flush_num = incre_ckpt_pgwr_flush_dirty_page(wb_context, dirty_buf_list, offset, batch_num); + num_actual_flush = incre_ckpt_pgwr_flush_dirty_page(wb_context, dirty_buf_list, 0, need_flush_num); pgwr->thrd_dw_cxt.dw_page_idx = -1; - num_actual_flush += flush_num; + } else { + /* Double write can only handle at most DW_DIRTY_PAGE_MAX at one time. */ + for (int i = 0; i < runs; i++) { + /* Last batch, take the rest of the buffers */ + int offset = i * dw_batch_page_max; + int batch_num = (i == runs - 1) ? (need_flush_num - offset) : dw_batch_page_max; + uint32 flush_num; + + pgwr->thrd_dw_cxt.is_new_relfilenode = is_new_relfilenode; + pgwr->thrd_dw_cxt.dw_page_idx = -1; + dw_perform_batch_flush(batch_num, dirty_buf_list + offset, thread_id, &pgwr->thrd_dw_cxt); + flush_num = incre_ckpt_pgwr_flush_dirty_page(wb_context, dirty_buf_list, offset, batch_num); + pgwr->thrd_dw_cxt.dw_page_idx = -1; + num_actual_flush += flush_num; + } } + (void)pg_atomic_fetch_add_u64(&g_instance.ckpt_cxt_ctl->page_writer_actual_flush, num_actual_flush); (void)pg_atomic_fetch_add_u32(&g_instance.ckpt_cxt_ctl->page_writer_last_flush, num_actual_flush); (void)pg_atomic_fetch_add_u32(&g_instance.ckpt_cxt_ctl->page_writer_last_queue_flush, num_actual_flush); @@ -2030,19 +2093,26 @@ static void incre_ckpt_pgwr_flush_dirty_list(WritebackContext *wb_context, uint3 qsort(dirty_buf_list, need_flush_num, sizeof(CkptSortItem), ckpt_buforder_comparator); ResourceOwnerEnlargeBuffers(t_thrd.utils_cxt.CurrentResourceOwner); - /* Double write can only handle at most DW_DIRTY_PAGE_MAX at one time. 
*/ - for (int i = 0; i < runs; i++) { - /* Last batch, take the rest of the buffers */ - int offset = i * dw_batch_page_max; - int batch_num = (i == runs - 1) ? (need_flush_num - offset) : dw_batch_page_max; - uint32 flush_num; - + if (ENABLE_DMS) { pgwr->thrd_dw_cxt.is_new_relfilenode = is_new_relfilenode; pgwr->thrd_dw_cxt.dw_page_idx = -1; - dw_perform_batch_flush(batch_num, dirty_buf_list + offset, thread_id, &pgwr->thrd_dw_cxt); - flush_num = incre_ckpt_pgwr_flush_dirty_page(wb_context, dirty_buf_list, offset, batch_num); + num_actual_flush = incre_ckpt_pgwr_flush_dirty_page(wb_context, dirty_buf_list, 0, need_flush_num); pgwr->thrd_dw_cxt.dw_page_idx = -1; - num_actual_flush += flush_num; + } else { + /* Double write can only handle at most DW_DIRTY_PAGE_MAX at one time. */ + for (int i = 0; i < runs; i++) { + /* Last batch, take the rest of the buffers */ + int offset = i * dw_batch_page_max; + int batch_num = (i == runs - 1) ? (need_flush_num - offset) : dw_batch_page_max; + uint32 flush_num; + + pgwr->thrd_dw_cxt.is_new_relfilenode = is_new_relfilenode; + pgwr->thrd_dw_cxt.dw_page_idx = -1; + dw_perform_batch_flush(batch_num, dirty_buf_list + offset, thread_id, &pgwr->thrd_dw_cxt); + flush_num = incre_ckpt_pgwr_flush_dirty_page(wb_context, dirty_buf_list, offset, batch_num); + pgwr->thrd_dw_cxt.dw_page_idx = -1; + num_actual_flush += flush_num; + } } (void)pg_atomic_fetch_add_u64(&g_instance.ckpt_cxt_ctl->page_writer_actual_flush, num_actual_flush); (void)pg_atomic_fetch_add_u32(&g_instance.ckpt_cxt_ctl->page_writer_last_flush, num_actual_flush); @@ -2071,6 +2141,11 @@ static bool check_buffer_dirty_flag(BufferDesc* buf_desc) bool check_lsn_not_match = (local_buf_state & BM_VALID) && !(local_buf_state & BM_DIRTY) && XLByteLT(buf_desc->lsn_on_disk, PageGetLSN(tmpBlock)) && RecoveryInProgress() && !segment_buf; + if (ENABLE_DMS && check_lsn_not_match && + (XLogRecPtrIsInvalid(buf_desc->lsn_on_disk) || GetDmsBufCtrl(buf_desc->buf_id)->state & 
BUF_DIRTY_NEED_FLUSH)) { + return false; + } + if (check_lsn_not_match) { PinBuffer(buf_desc, NULL); if (LWLockConditionalAcquire(buf_desc->content_lock, LW_SHARED)) { diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp index cffd87e57..125a54bd0 100644 --- a/src/gausskernel/process/postmaster/postmaster.cpp +++ b/src/gausskernel/process/postmaster/postmaster.cpp @@ -3258,7 +3258,7 @@ static int ServerLoop(void) u_sess->attr.attr_storage.enable_cbm_tracking) g_instance.pid_cxt.CBMWriterPID = initialize_util_thread(CBMWRITER); - if (!dummyStandbyMode && ENABLE_INCRE_CKPT) { + if (!dummyStandbyMode && ENABLE_INCRE_CKPT && !SS_IN_FAILOVER) { for (int i = 0; i < g_instance.ckpt_cxt_ctl->pgwr_procs.num; i++) { if (g_instance.pid_cxt.PageWriterPID[i] == 0) { g_instance.pid_cxt.PageWriterPID[i] = initialize_util_thread(PAGEWRITER_THREAD); @@ -3307,7 +3307,7 @@ static int ServerLoop(void) (AutoVacuumingActive() || t_thrd.postmaster_cxt.start_autovac_launcher) && pmState == PM_RUN && !dummyStandbyMode && u_sess->attr.attr_common.upgrade_mode != 1 && !g_instance.streaming_dr_cxt.isInSwitchover && - !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER) { + !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) { g_instance.pid_cxt.AutoVacPID = initialize_util_thread(AUTOVACUUM_LAUNCHER); if (g_instance.pid_cxt.AutoVacPID != 0) @@ -3338,7 +3338,7 @@ static int ServerLoop(void) if ((u_sess->attr.attr_common.upgrade_mode == 0 || pg_atomic_read_u32(&WorkingGrandVersionNum) >= PUBLICATION_VERSION_NUM) && g_instance.pid_cxt.ApplyLauncerPID == 0 && - pmState == PM_RUN && !dummyStandbyMode) { + pmState == PM_RUN && !dummyStandbyMode && !SS_IN_REFORM) { g_instance.pid_cxt.ApplyLauncerPID = initialize_util_thread(APPLY_LAUNCHER); } #endif @@ -3350,7 +3350,7 @@ static int ServerLoop(void) */ if (g_instance.pid_cxt.PgJobSchdPID == 0 && pmState == PM_RUN && 
(g_instance.attr.attr_sql.job_queue_processes || t_thrd.postmaster_cxt.start_job_scheduler) && - u_sess->attr.attr_common.upgrade_mode != 1) { + u_sess->attr.attr_common.upgrade_mode != 1 && !SS_IN_REFORM) { g_instance.pid_cxt.PgJobSchdPID = initialize_util_thread(JOB_SCHEDULER); if (g_instance.pid_cxt.PgJobSchdPID != 0) { @@ -3414,17 +3414,17 @@ static int ServerLoop(void) g_instance.pid_cxt.SnapshotPID = snapshot_start(); if (ENABLE_ASP && g_instance.pid_cxt.AshPID == 0 && pmState == PM_RUN && !dummyStandbyMode - && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER) + && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) g_instance.pid_cxt.AshPID = initialize_util_thread(ASH_WORKER); /* If we have lost the full sql flush thread, try to start a new one */ if (ENABLE_STATEMENT_TRACK && g_instance.pid_cxt.StatementPID == 0 && (pmState == PM_RUN || pmState == PM_HOT_STANDBY) - && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER) + && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) g_instance.pid_cxt.StatementPID = initialize_util_thread(TRACK_STMT_WORKER); if ((IS_PGXC_COORDINATOR || IS_SINGLE_NODE) && u_sess->attr.attr_common.enable_instr_rt_percentile && g_instance.pid_cxt.PercentilePID == 0 && - pmState == PM_RUN) + pmState == PM_RUN && !SS_IN_REFORM) g_instance.pid_cxt.PercentilePID = initialize_util_thread(PERCENTILE_WORKER); if ((ENABLE_DMS && pmState == PM_RUN && g_instance.stat_cxt.stack_perf_start) || (!ENABLE_DMS && g_instance.stat_cxt.stack_perf_start)) { @@ -3433,15 +3433,15 @@ static int ServerLoop(void) } /* if workload manager is off, we still use this thread to build user hash table */ if ((ENABLE_WORKLOAD_CONTROL || !WLMIsInfoInit()) && g_instance.pid_cxt.WLMCollectPID == 0 && - pmState == PM_RUN && !dummyStandbyMode) + pmState == PM_RUN && !dummyStandbyMode && !SS_IN_REFORM) g_instance.pid_cxt.WLMCollectPID = 
initialize_util_thread(WLM_WORKER); if (ENABLE_WORKLOAD_CONTROL && (g_instance.pid_cxt.WLMMonitorPID == 0) && (pmState == PM_RUN) && - !dummyStandbyMode) + !dummyStandbyMode && !SS_IN_REFORM) g_instance.pid_cxt.WLMMonitorPID = initialize_util_thread(WLM_MONITOR); if (ENABLE_WORKLOAD_CONTROL && (g_instance.pid_cxt.WLMArbiterPID == 0) && (pmState == PM_RUN) && - !dummyStandbyMode) + !dummyStandbyMode && !SS_IN_REFORM) g_instance.pid_cxt.WLMArbiterPID = initialize_util_thread(WLM_ARBITER); if (IS_PGXC_COORDINATOR && g_instance.attr.attr_sql.max_resource_package && @@ -4273,8 +4273,9 @@ int ProcessStartupPacket(Port* port, bool SSLdone) } if (SS_IN_REFORM) { - ereport(ERROR, (errcode(ERRCODE_CANNOT_CONNECT_NOW), + ereport(DEBUG1, (errcode(ERRCODE_CANNOT_CONNECT_NOW), errmsg("cannot accept connection during SS cluster reform"))); + return STATUS_ERROR; } #endif } @@ -5172,6 +5173,10 @@ static void pmdie(SIGNAL_ARGS) pgaudit_system_stop_ok(FastShutdown); } + if (ENABLE_DMS && pmState == PM_STARTUP) { + pmState = PM_RECOVERY; + } + if (pmState == PM_STARTUP || pmState == PM_INIT) { KillGraceThreads(); WaitGraceThreadsExit(); @@ -6146,12 +6151,18 @@ static void reaper(SIGNAL_ARGS) if (!u_sess->proc_cxt.IsBinaryUpgrade && AutoVacuumingActive() && g_instance.pid_cxt.AutoVacPID == 0 && !dummyStandbyMode && u_sess->attr.attr_common.upgrade_mode != 1 && !g_instance.streaming_dr_cxt.isInSwitchover && - !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER) + !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) g_instance.pid_cxt.AutoVacPID = initialize_util_thread(AUTOVACUUM_LAUNCHER); + if (SS_REFORM_PARTNER) { + write_stderr("%s LOG: I'm still a reform partner waiting for refom finished\n", + GetReaperLogPrefix(logBuf, ReaperLogBufSize)); + continue; + } + /* Before GRAND VERSION NUM 81000, we do not support scheduled job. 
*/ if (g_instance.pid_cxt.PgJobSchdPID == 0 && - g_instance.attr.attr_sql.job_queue_processes && u_sess->attr.attr_common.upgrade_mode != 1) + g_instance.attr.attr_sql.job_queue_processes && u_sess->attr.attr_common.upgrade_mode != 1 && !SS_IN_REFORM) g_instance.pid_cxt.PgJobSchdPID = initialize_util_thread(JOB_SCHEDULER); if ((IS_PGXC_COORDINATOR) && g_instance.pid_cxt.CommPoolerCleanPID == 0 && @@ -6173,7 +6184,7 @@ static void reaper(SIGNAL_ARGS) #ifndef ENABLE_MULTIPLE_NODES if ((u_sess->attr.attr_common.upgrade_mode == 0 || pg_atomic_read_u32(&WorkingGrandVersionNum) >= PUBLICATION_VERSION_NUM) && - g_instance.pid_cxt.ApplyLauncerPID == 0 && !dummyStandbyMode) { + g_instance.pid_cxt.ApplyLauncerPID == 0 && !dummyStandbyMode && !SS_IN_REFORM) { g_instance.pid_cxt.ApplyLauncerPID = initialize_util_thread(APPLY_LAUNCHER); } #endif @@ -6207,15 +6218,15 @@ static void reaper(SIGNAL_ARGS) !dummyStandbyMode && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER) g_instance.pid_cxt.SnapshotPID = snapshot_start(); if ((IS_PGXC_COORDINATOR || IS_SINGLE_NODE) && u_sess->attr.attr_common.enable_instr_rt_percentile && - g_instance.pid_cxt.PercentilePID == 0 && !dummyStandbyMode) + g_instance.pid_cxt.PercentilePID == 0 && !dummyStandbyMode && !SS_IN_REFORM) g_instance.pid_cxt.PercentilePID = initialize_util_thread(PERCENTILE_WORKER); if (ENABLE_ASP && g_instance.pid_cxt.AshPID == 0 && !dummyStandbyMode - && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER) + && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) g_instance.pid_cxt.AshPID = initialize_util_thread(ASH_WORKER); if (ENABLE_STATEMENT_TRACK && g_instance.pid_cxt.StatementPID == 0 - && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER) + && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) g_instance.pid_cxt.StatementPID = initialize_util_thread(TRACK_STMT_WORKER); /* Database 
Security: Support database audit */ @@ -6252,7 +6263,7 @@ static void reaper(SIGNAL_ARGS) /* if workload manager is off, we still use this thread to build user hash table */ if ((ENABLE_WORKLOAD_CONTROL || !WLMIsInfoInit()) && g_instance.pid_cxt.WLMCollectPID == 0 && - !dummyStandbyMode) { + !dummyStandbyMode && !SS_IN_REFORM) { /* DN need rebuild hash when upgrade to primary */ if (IS_PGXC_DATANODE) g_instance.wlm_cxt->stat_manager.infoinit = 0; @@ -6732,7 +6743,7 @@ static void reaper(SIGNAL_ARGS) LogChildExit(LOG, _("Active session history collector process"), pid, exitstatus); if (pmState == PM_RUN && ENABLE_ASP && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER - && !SS_STANDBY_FAILOVER) + && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) g_instance.pid_cxt.AshPID = initialize_util_thread(ASH_WORKER); continue; } @@ -6745,7 +6756,7 @@ static void reaper(SIGNAL_ARGS) LogChildExit(LOG, _("full SQL statement flush process"), pid, exitstatus); if (pmState == PM_RUN && ENABLE_STATEMENT_TRACK && !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER - && !SS_STANDBY_FAILOVER) + && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) g_instance.pid_cxt.StatementPID = initialize_util_thread(TRACK_STMT_WORKER); continue; } @@ -6757,7 +6768,7 @@ static void reaper(SIGNAL_ARGS) if (!EXIT_STATUS_0(exitstatus)) LogChildExit(LOG, _("percentile collector process"), pid, exitstatus); - if (pmState == PM_RUN) + if (pmState == PM_RUN && !SS_IN_REFORM) g_instance.pid_cxt.PercentilePID = initialize_util_thread(PERCENTILE_WORKER); continue; } @@ -9301,6 +9312,7 @@ static void sigusr1_handler(SIGNAL_ARGS) } if (ENABLE_DMS && CheckPostmasterSignal(PMSIGNAL_DMS_SWITCHOVER_PROMOTE)) { + PMUpdateDBState(PROMOTING_STATE, get_cur_mode(), get_cur_repl_num()); if (ENABLE_THREAD_POOL) { g_threadPoolControler->CloseAllSessions(); /* @@ -9354,15 +9366,7 @@ static void sigusr1_handler(SIGNAL_ARGS) } if (ENABLE_DMS && CheckPostmasterSignal(PMSIGNAL_DMS_REFORM)) { - if (ENABLE_THREAD_POOL) { - 
g_threadPoolControler->CloseAllSessions(); - /* - * before pmState set to wait backends, - * threadpool cannot launch new thread by scheduler during demote. - */ - g_threadPoolControler->ShutDownScheduler(true, true); - g_threadPoolControler->ShutDownThreads(true); - } + PMUpdateDBState(STARTING_STATE, get_cur_mode(), get_cur_repl_num()); /* shut down all backends and autovac workers */ (void)SignalSomeChildren(SIGTERM, BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC); @@ -9425,15 +9429,12 @@ static void sigusr1_handler(SIGNAL_ARGS) signal_child(g_instance.pid_cxt.SnapshotPID, SIGTERM); } - if (ENABLE_THREAD_POOL) { - g_threadPoolControler->EnableAdjustPool(); - } - ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS reform] terminate backends success"))); g_instance.dms_cxt.SSRecoveryInfo.reform_ready = true; } if (ENABLE_DMS && CheckPostmasterSignal(PMSIGNAL_DMS_TRIGGERFAILOVER)) { + PMUpdateDBState(PROMOTING_STATE, get_cur_mode(), get_cur_repl_num()); if (ENABLE_THREAD_POOL) { g_threadPoolControler->CloseAllSessions(); /* @@ -11666,6 +11667,8 @@ DbState get_local_dbstate(void) db_state = WAITING_STATE; } else if (SS_STANDBY_PROMOTING || SS_STANDBY_FAILOVER) { db_state = PROMOTING_STATE; + } else if (SS_IN_REFORM) { + db_state = STARTING_STATE; } else { db_state = NORMAL_STATE; } @@ -11716,9 +11719,6 @@ const char* wal_get_db_state_string(DbState db_state) static ServerMode get_cur_mode(void) { if (ENABLE_DMS) { - if (RecoveryInProgress()) { - return RECOVERY_MODE; - } return SS_STANDBY_MODE ? 
STANDBY_MODE : PRIMARY_MODE; } return t_thrd.postmaster_cxt.HaShmData->current_mode; @@ -11846,6 +11846,10 @@ static void PMUpdateDBState(DbState db_state, ServerMode mode, int conn_num) { GaussState state; + if (ENABLE_DMS && SS_IN_REFORM && db_state == NORMAL_STATE) { + db_state = STARTING_STATE; + } + PMReadDBStateFile(&state); state.state = db_state; if (mode == STANDBY_MODE && t_thrd.postmaster_cxt.HaShmData->is_cascade_standby) { @@ -13866,8 +13870,8 @@ void set_disable_conn_mode() static bool NeedHeartbeat() { - /* heartbeat is no longer needed on DCF mode */ - if (g_instance.attr.attr_storage.dcf_attr.enable_dcf) + /* heartbeat is no longer needed in DCF/DMS mode */ + if (ENABLE_DMS || g_instance.attr.attr_storage.dcf_attr.enable_dcf) return false; if (!(IS_PGXC_DATANODE && g_instance.pid_cxt.HeartbeatPID == 0 && @@ -13971,7 +13975,7 @@ const char *GetSSServerMode() bool SSIsServerModeReadOnly() { - return SS_PERFORMING_SWITCHOVER || SS_STANDBY_MODE; + return SS_STANDBY_FAILOVER || SS_PERFORMING_SWITCHOVER || SS_STANDBY_MODE; } void SSRestartFailoverPromote() diff --git a/src/gausskernel/process/tcop/postgres.cpp b/src/gausskernel/process/tcop/postgres.cpp index 8aa0a9527..2a0938dc9 100755 --- a/src/gausskernel/process/tcop/postgres.cpp +++ b/src/gausskernel/process/tcop/postgres.cpp @@ -6388,10 +6388,14 @@ void ProcessInterrupts(void) if (u_sess->ClientAuthInProgress) { t_thrd.int_cxt.ImmediateInterruptOK = false; /* not idle anymore */ + int errlevel = ERROR; + if (ENABLE_DMS && IS_THREAD_POOL_WORKER) { + errlevel = FATAL; + } if (t_thrd.storage_cxt.cancel_from_timeout) { force_backtrace_messages = true; - ereport(ERROR, + ereport(errlevel, (errcode(ERRCODE_QUERY_CANCELED), errmsg("terminate because authentication timeout(%ds)", u_sess->attr.attr_network.PoolerConnectTimeout))); @@ -6399,7 +6403,7 @@ void ProcessInterrupts(void) if (t_thrd.postgres_cxt.whereToSendOutput == DestRemote) { t_thrd.postgres_cxt.whereToSendOutput = DestNone; } - ereport(ERROR, 
+ ereport(errlevel, (errcode(ERRCODE_QUERY_CANCELED), errmsg("terminate because cancel interrupts"))); } diff --git a/src/gausskernel/process/threadpool/knl_instance.cpp b/src/gausskernel/process/threadpool/knl_instance.cpp index c3ee37d05..94765e639 100755 --- a/src/gausskernel/process/threadpool/knl_instance.cpp +++ b/src/gausskernel/process/threadpool/knl_instance.cpp @@ -190,10 +190,14 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt) dms_cxt->SSRecoveryInfo.restart_failover_flag = false; dms_cxt->SSRecoveryInfo.reform_ready = false; dms_cxt->SSRecoveryInfo.in_failover = false; + dms_cxt->SSRecoveryInfo.in_flushcopy = false; dms_cxt->log_timezone = NULL; pg_atomic_init_u32(&dms_cxt->inDmsThreShmemInitCnt, 0); pg_atomic_init_u32(&dms_cxt->inProcExitCnt, 0); dms_cxt->dmsInited = false; + dms_cxt->ckptRedo = InvalidXLogRecPtr; + dms_cxt->resetSyscache = false; + dms_cxt->finishedRecoverOldPrimaryDWFile = false; } static void knl_g_tests_init(knl_g_tests_context* tests_cxt) diff --git a/src/gausskernel/process/threadpool/knl_session.cpp b/src/gausskernel/process/threadpool/knl_session.cpp index 9d330c745..5d16a69d3 100755 --- a/src/gausskernel/process/threadpool/knl_session.cpp +++ b/src/gausskernel/process/threadpool/knl_session.cpp @@ -1109,7 +1109,12 @@ static void knl_u_statement_init(knl_u_statement_context* statement_cxt) void knl_u_relmap_init(knl_u_relmap_context* relmap_cxt) { - relmap_cxt->shared_map = (RelMapFile*)palloc0(sizeof(RelMapFile)); + if (ENABLE_DMS) { + char *unaligned_buf = (char*)palloc0(sizeof(RelMapFile) + ALIGNOF_BUFFER); + relmap_cxt->shared_map = (RelMapFile*)BUFFERALIGN(unaligned_buf); + } else { + relmap_cxt->shared_map = (RelMapFile*)palloc0(sizeof(RelMapFile)); + } relmap_cxt->local_map = (RelMapFile*)palloc0(sizeof(RelMapFile)); relmap_cxt->active_shared_updates = (RelMapFile*)palloc0(sizeof(RelMapFile)); relmap_cxt->active_local_updates = (RelMapFile*)palloc0(sizeof(RelMapFile)); diff --git 
a/src/gausskernel/storage/access/transam/double_write.cpp b/src/gausskernel/storage/access/transam/double_write.cpp index d320e0e44..d67ebc8be 100644 --- a/src/gausskernel/storage/access/transam/double_write.cpp +++ b/src/gausskernel/storage/access/transam/double_write.cpp @@ -1308,7 +1308,9 @@ void dw_file_check_rebuild() dw_batch_meta_file batch_meta_file; if (!file_exists(DW_BUILD_FILE_NAME)) { - return; + if (!ENABLE_DMS || (ENABLE_DMS && !g_instance.dms_cxt.finishedRecoverOldPrimaryDWFile)) { + return; + } } ereport(LOG, (errmodule(MOD_DW), errmsg("Double write initializing after build"))); @@ -1803,6 +1805,10 @@ void dw_enable_init() dw_cxt_init_batch(); dw_cxt_init_single(); + if (SS_REFORM_PARTNER || g_instance.dms_cxt.finishedRecoverOldPrimaryDWFile) { + return; + } + /* recovery batch flush dw file */ dw_recover_all_partial_write_batch(batch_cxt); diff --git a/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp b/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp index 5447ec825..2806d1ffb 100755 --- a/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp +++ b/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp @@ -1583,6 +1583,8 @@ void InitReaderStateByOld(XLogReaderState *newState, XLogReaderState *oldState, newState->currRecPtr = oldState->currRecPtr; newState->readLen = oldState->readLen; newState->readBuf = oldState->readBuf; + newState->preReadStartPtr = oldState->preReadStartPtr; + newState->preReadBuf = oldState->preReadBuf; if (isNew) { newState->readRecordBuf = NULL; diff --git a/src/gausskernel/storage/access/transam/xlog.cpp b/src/gausskernel/storage/access/transam/xlog.cpp index 08dea7a71..c5ae9b5b5 100755 --- a/src/gausskernel/storage/access/transam/xlog.cpp +++ b/src/gausskernel/storage/access/transam/xlog.cpp @@ -156,6 +156,7 @@ #include "ddes/dms/ss_reform_common.h" #include "ddes/dms/ss_dms_recovery.h" +#include "ddes/dms/ss_dms_bufmgr.h" #include 
"storage/file/fio_device.h" #ifdef ENABLE_UT #define STATIC @@ -222,7 +223,7 @@ static const int ONE_SECOND = 1000000; * of them; the +1 allows boundary cases to happen without wasting a * delete/create-segment cycle. */ -#define XLOGfileslop (2 * u_sess->attr.attr_storage.CheckPointSegments + 1) +#define XLOGfileslop (2 * XLogSegmentsNum(u_sess->attr.attr_storage.CheckPointSegments) + 1) /* * GUC support @@ -2471,7 +2472,7 @@ static bool XLogCheckpointNeeded(XLogSegNo new_segno) XLByteToSeg(t_thrd.xlog_cxt.RedoRecPtr, old_segno); - if (new_segno >= old_segno + ((uint32)u_sess->attr.attr_storage.CheckPointSegments - 1)) { + if (new_segno >= old_segno + ((uint32)XLogSegmentsNum(u_sess->attr.attr_storage.CheckPointSegments) - 1)) { return true; } return false; @@ -9444,7 +9445,7 @@ void StartupXLOG(void) if (ENABLE_DMS && ENABLE_DSS) { if (SSFAILOVER_TRIGGER || SS_STANDBY_PROMOTING) { SSGetXlogPath(); - xlogreader = XLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER); + xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER); close_readFile_if_open(); // init shared memory set page empty SSCSNLOGShmemClear(); @@ -9452,7 +9453,7 @@ void StartupXLOG(void) SSMultiXactShmemClear(); SSClearSegCache(); } else { - xlogreader = XLogReaderAllocate(&XLogPageRead, &readprivate, ALIGNOF_BUFFER); + xlogreader = SSXLogReaderAllocate(&XLogPageRead, &readprivate, ALIGNOF_BUFFER); } } else { xlogreader = XLogReaderAllocate(&XLogPageRead, &readprivate); @@ -9677,7 +9678,7 @@ void StartupXLOG(void) dw_init(); } - if (SSFAILOVER_TRIGGER) { + if (SS_IN_FAILOVER && SS_REFORM_REFORMER) { ss_failover_dw_init(); } @@ -9865,9 +9866,11 @@ void StartupXLOG(void) * have been a clean shutdown and we did not have a recovery.conf file, * then assume no recovery needed. 
*/ - if (SS_STANDBY_PROMOTING) { - ereport(LOG, (errmsg("[SS switchover] Standby promote: redo shutdown checkpoint now"))); + if (SSFAILOVER_TRIGGER || SS_STANDBY_PROMOTING) { t_thrd.xlog_cxt.InRecovery = true; + if (SS_STANDBY_PROMOTING) { + ereport(LOG, (errmsg("[SS switchover] Standby promote: redo shutdown checkpoint now"))); + } } if (XLByteLT(checkPoint.redo, RecPtr)) { #ifdef ENABLE_MULTIPLE_NODES @@ -10832,6 +10835,7 @@ void StartupXLOG(void) t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL); UpdateControlFile(); LWLockRelease(ControlFileLock); + SSRecheckBufferPool(); ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS switchover/failover] standby promoting: finished start checkpoint."))); } @@ -11599,11 +11603,6 @@ void DummyStandbySetRecoveryTargetTLI(TimeLineID timeLineID) */ void ShutdownXLOG(int code, Datum arg) { - if (SS_STANDBY_PROMOTING) { - ereport(LOG, (errmsg("[SS switchover] Standby promote: skipping shutdown checkpoint"))); - return; - } - if (SS_PRIMARY_DEMOTING) { ereport(LOG, (errmsg("[SS switchover] primary demote: doing shutdown checkpoint"))); CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); @@ -11618,21 +11617,25 @@ void ShutdownXLOG(int code, Datum arg) ereport(LOG, (errmsg("shutting down"))); - if (RecoveryInProgress()) { - (void)CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { + ereport(LOG, (errmsg("[SS failover/SS switchover] Standby promote: skipping shutdown checkpoint"))); } else { - /* - * If archiving is enabled, rotate the last XLOG file so that all the - * remaining records are archived (postmaster wakes up the archiver - * process one more time at the end of shutdown). The checkpoint - * record will go to the next XLOG file and won't be archived (yet). 
- */ - if (!IsInitdb && XLogArchivingActive() && XLogArchiveCommandSet()) { - (void)RequestXLogSwitch(); - } + if (RecoveryInProgress()) { + (void)CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + } else { + /* + * If archiving is enabled, rotate the last XLOG file so that all the + * remaining records are archived (postmaster wakes up the archiver + * process one more time at the end of shutdown). The checkpoint + * record will go to the next XLOG file and won't be archived (yet). + */ + if (!IsInitdb && XLogArchivingActive() && XLogArchiveCommandSet()) { + (void)RequestXLogSwitch(); + } - if (g_instance.wal_cxt.upgradeSwitchMode != ExtremelyFast) - CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + if (g_instance.wal_cxt.upgradeSwitchMode != ExtremelyFast) + CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); + } } /* Stop DCF after primary CreateCheckPoint or standby CreateRestartPoint */ @@ -11657,7 +11660,7 @@ void ShutdownXLOG(int code, Datum arg) g_instance.bgwriter_cxt.rel_hashtbl_lock = NULL; g_instance.bgwriter_cxt.rel_one_fork_hashtbl_lock = NULL; - if (!ENABLE_DMS || !SS_STANDBY_MODE) { + if (!ENABLE_DMS || (!SS_STANDBY_MODE && !SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING)) { ShutdownCLOG(); ShutdownCSNLOG(); ShutdownMultiXact(); @@ -11846,7 +11849,8 @@ void CreateCheckPoint(int flags) (errcode(ERRCODE_INVALID_TRANSACTION_STATE), errmsg("can't create a checkpoint on SS standby node"))); } - if (ENABLE_DMS && SS_STANDBY_MODE && !SS_STANDBY_PROMOTING) { + /* allow standby do checkpoint only after it has promoted AND has finished recovery. 
*/ + if (ENABLE_DMS && SS_STANDBY_MODE && !(SS_STANDBY_PROMOTING && !RecoveryInProgress())) { return; } else if (SSFAILOVER_TRIGGER) { ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS failover] do not do CreateCheckpoint during failover"))); @@ -13189,13 +13193,14 @@ static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo, XLogRecPtr curIns XLByteToSeg(recptr, segno); /* avoid underflow, don't go below 1 */ - if (segno <= (uint64)(uint32)u_sess->attr.attr_storage.wal_keep_segments) { + if (segno <= (uint64)(uint32)XLogSegmentsNum(u_sess->attr.attr_storage.wal_keep_segments)) { /* segno = 1 show all file should be keep */ ereport(LOG, (errmsg("keep all the xlog segments, because current segno = %lu, " - "less than wal_keep_segments = %d", segno, u_sess->attr.attr_storage.wal_keep_segments))); + "less than wal_keep_segments = %d", segno, + (int)XLogSegmentsNum(u_sess->attr.attr_storage.wal_keep_segments)))); segno = 1; } else { - segno = segno - (uint32)u_sess->attr.attr_storage.wal_keep_segments; + segno = segno - (uint32)XLogSegmentsNum(u_sess->attr.attr_storage.wal_keep_segments); } wal_keep_segno = segno; @@ -17831,36 +17836,49 @@ retry: /* Read the requested page */ t_thrd.xlog_cxt.readOff = targetPageOff; + if (ENABLE_DSS && ENABLE_DMS) { + bool ss_ret = SSReadXlogInternal(xlogreader, targetPagePtr, readBuf); + if (!ss_ret) { + ereport(emode_for_corrupt_record(emode, RecPtr), + (errcode_for_file_access(), + errmsg("[ss] could not read from log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), + t_thrd.xlog_cxt.readOff))); + goto next_record_is_invalid; + } + } else { try_again: - if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { - ereport(emode_for_corrupt_record(emode, RecPtr), - (errcode_for_file_access(), - errmsg("could not seek in log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - 
if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; + if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { + ereport(emode_for_corrupt_record(emode, RecPtr), + (errcode_for_file_access(), + errmsg("could not seek in log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), + t_thrd.xlog_cxt.readOff))); + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; + } + goto next_record_is_invalid; } - goto next_record_is_invalid; - } - pgstat_report_waitevent(WAIT_EVENT_WAL_READ); - ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ); - pgstat_report_waitevent(WAIT_EVENT_END); - if (ret != XLOG_BLCKSZ) { - ereport(emode_for_corrupt_record(emode, RecPtr), - (errcode_for_file_access(), - errmsg("could not read from log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; + pgstat_report_waitevent(WAIT_EVENT_WAL_READ); + ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ); + pgstat_report_waitevent(WAIT_EVENT_END); + if (ret != XLOG_BLCKSZ) { + ereport(emode_for_corrupt_record(emode, RecPtr), + (errcode_for_file_access(), + errmsg("could not read from log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), + t_thrd.xlog_cxt.readOff))); + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; + } + goto next_record_is_invalid; } - goto next_record_is_invalid; } + Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo); Assert(targetPageOff == t_thrd.xlog_cxt.readOff); Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen); @@ -18830,7 +18848,7 @@ loop: cur_rec_lsn = compare.u64[0]; /* if we already left behind dirty array queue reclsn, do nothing */ if (!XLByteLE(current_insert_lsn, cur_rec_lsn) && - (need_immediately_update || 
current_insert_lsn - cur_rec_lsn > XLogSegSize * UPDATE_REC_XLOG_NUM)) { + (need_immediately_update || current_insert_lsn - cur_rec_lsn > XLogBaseSize * UPDATE_REC_XLOG_NUM)) { exchange.u64[0] = current_insert_lsn; exchange.u64[1] = compare.u64[1]; @@ -18878,7 +18896,7 @@ void update_dirty_page_queue_rec_lsn(XLogRecPtr current_insert_lsn, bool need_im if (!XLByteLE(current_insert_lsn, g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn) && (need_immediately_update || - current_insert_lsn - g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn > XLogSegSize * UPDATE_REC_XLOG_NUM)) { + current_insert_lsn - g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn > XLogBaseSize * UPDATE_REC_XLOG_NUM)) { g_instance.ckpt_cxt_ctl->dirty_page_queue_reclsn = current_insert_lsn; is_update = true; } diff --git a/src/gausskernel/storage/access/transam/xlogreader.cpp b/src/gausskernel/storage/access/transam/xlogreader.cpp index 155ce0878..b79ba4b8c 100644 --- a/src/gausskernel/storage/access/transam/xlogreader.cpp +++ b/src/gausskernel/storage/access/transam/xlogreader.cpp @@ -32,6 +32,7 @@ #include "access/parallel_recovery/redo_item.h" #include "utils/memutils.h" #include "utils/elog.h" +#include "ddes/dms/ss_dms_recovery.h" typedef struct XLogPageReadPrivate { const char *datadir; @@ -172,6 +173,11 @@ void XLogReaderFree(XLogReaderState *state) pfree(state->readBufOrigin); state->readBufOrigin = NULL; state->readBuf = NULL; + if (state->preReadBufOrigin) { + pfree(state->preReadBufOrigin); + state->preReadBufOrigin = NULL; + state->preReadBuf = NULL; + } /* state need to be reset NULL by caller */ pfree(state); diff --git a/src/gausskernel/storage/access/transam/xlogutils.cpp b/src/gausskernel/storage/access/transam/xlogutils.cpp index 8b5f7904d..5fab4c308 100644 --- a/src/gausskernel/storage/access/transam/xlogutils.cpp +++ b/src/gausskernel/storage/access/transam/xlogutils.cpp @@ -48,6 +48,7 @@ #include "commands/dbcommands.h" #include "postmaster/pagerepair.h" #include 
"storage/cfs/cfs_converter.h" +#include "ddes/dms/ss_dms_bufmgr.h" /* * During XLOG replay, we may see XLOG records for incremental updates of @@ -665,9 +666,12 @@ XLogRedoAction XLogReadBufferForRedoBlockExtend(RedoBufferTag *redoblock, ReadBu redobufferinfo->pageinfo.page = page; redobufferinfo->pageinfo.pagesize = pagesize; - if (XLByteLE(xloglsn, PageGetLSN(page))) + if (XLByteLE(xloglsn, PageGetLSN(page))) { + if (ENABLE_DMS) { + SSCheckBufferIfNeedMarkDirty(redobufferinfo->buf); + } return BLK_DONE; - else { + } else { if (SegmentNeedAdvancedLSNCheck(redoblock->rnode, redoblock->forknum, mode)) { /* * For segment-page storage, before returning BLK_NEEDS_REDO, we need checking LSN. Illegal LSN may be diff --git a/src/gausskernel/storage/buffer/buf_init.cpp b/src/gausskernel/storage/buffer/buf_init.cpp index 406712944..f5197ea6c 100644 --- a/src/gausskernel/storage/buffer/buf_init.cpp +++ b/src/gausskernel/storage/buffer/buf_init.cpp @@ -166,6 +166,7 @@ void InitBufferPool(void) buf->io_in_progress_lock = LWLockAssign(LWTRANCHE_BUFFER_IO_IN_PROGRESS); buf->content_lock = LWLockAssign(LWTRANCHE_BUFFER_CONTENT); pg_atomic_init_u64(&buf->rec_lsn, InvalidXLogRecPtr); + buf->aio_in_progress = false; buf->dirty_queue_loc = PG_UINT64_MAX; buf->encrypt = false; } diff --git a/src/gausskernel/storage/buffer/bufmgr.cpp b/src/gausskernel/storage/buffer/bufmgr.cpp index 57facb2de..6a3826b8a 100644 --- a/src/gausskernel/storage/buffer/bufmgr.cpp +++ b/src/gausskernel/storage/buffer/bufmgr.cpp @@ -2372,6 +2372,9 @@ found_branch: dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(bufHdr->buf_id); LWLockMode req_lock_mode = isExtend ? 
LW_EXCLUSIVE : LW_SHARED; + if (g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy && SS_REFORM_REFORMER) { + req_lock_mode = LW_EXCLUSIVE; + } if (!LockModeCompatible(buf_ctrl, req_lock_mode)) { if (!StartReadPage(bufHdr, req_lock_mode)) { TerminateBufferIO(bufHdr, false, 0); @@ -2522,13 +2525,19 @@ void SimpleMarkBufDirty(BufferDesc *buf) void PageCheckIfCanEliminate(BufferDesc *buf, uint32 *oldFlags, bool *needGetLock) { + if (SS_REFORM_REFORMER) { + Assert(XLogRecPtrIsValid(g_instance.dms_cxt.ckptRedo)); + } + Block tmpBlock = BufHdrGetBlock(buf); - if ((*oldFlags & BM_TAG_VALID) && !XLByteEQ(buf->lsn_on_disk, PageGetLSN(tmpBlock)) && !(*oldFlags & BM_DIRTY) && - RecoveryInProgress()) { + if ((*oldFlags & BM_TAG_VALID) && + !(XLByteEQ(buf->lsn_on_disk, PageGetLSN(tmpBlock)) || + (SS_REFORM_REFORMER && XLByteLT(PageGetLSN(tmpBlock), g_instance.dms_cxt.ckptRedo))) && + !(*oldFlags & BM_DIRTY) && RecoveryInProgress()) { int mode = DEBUG1; #ifdef USE_ASSERT_CHECKING - mode = PANIC; + mode = ENABLE_DMS ? WARNING : PANIC; #endif const uint32 shiftSize = 32; ereport(mode, (errmodule(MOD_INCRE_BG), @@ -2546,6 +2555,10 @@ void PageCheckIfCanEliminate(BufferDesc *buf, uint32 *oldFlags, bool *needGetLoc #ifdef USE_ASSERT_CHECKING void PageCheckWhenChosedElimination(const BufferDesc *buf, uint32 oldFlags) { + if (SS_REFORM_REFORMER) { + return; + } + if ((oldFlags & BM_TAG_VALID) && RecoveryInProgress()) { if (!XLByteEQ(buf->lsn_dirty, InvalidXLogRecPtr)) { Assert(XLByteEQ(buf->lsn_on_disk, buf->lsn_dirty)); @@ -2639,6 +2652,14 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumbe } } + /* set Physical segment file. 
*/ + if (ENABLE_DMS && pblk != NULL) { + Assert(PhyBlockIsValid(*pblk)); + buf->seg_fileno = pblk->relNode; + buf->seg_blockno = pblk->block; + MarkReadPblk(buf->buf_id, pblk); + } + return buf; } @@ -2989,7 +3010,6 @@ void InvalidateBuffer(BufferDesc *buf) old_partition_lock = BufMappingPartitionLock(old_hash); retry: - /* * Acquire exclusive mapping lock in preparation for changing the buffer's * association. @@ -3159,6 +3179,10 @@ void MarkBufferDirty(Buffer buffer) UnlockBufHdr(buf_desc, buf_state); + if (SS_REFORM_REFORMER) { + dms_buf_ctrl_t* buf_ctrl = GetDmsBufCtrl(buf_desc->buf_id); + buf_ctrl->state &= ~BUF_DIRTY_NEED_FLUSH; + } /* * If the buffer was not dirty already, do vacuum accounting. */ @@ -4015,6 +4039,7 @@ bool BgBufferSync(WritebackContext *wb_context) const int CONDITION_LOCK_RETRY_TIMES = 5; bool SyncFlushOneBuffer(int buf_id, bool get_condition_lock) { + t_thrd.dms_cxt.buf_in_aio = false; BufferDesc *buf_desc = GetBufferDescriptor(buf_id); /* * Pin it, share-lock it, write it. 
(FlushBuffer will do nothing if the @@ -4041,6 +4066,10 @@ bool SyncFlushOneBuffer(int buf_id, bool get_condition_lock) if (!BufferIsInvalid(queue_head_buffer) && (queue_head_buffer - 1 == buf_id)) { retry_times = CONDITION_LOCK_RETRY_TIMES; } + if (ENABLE_DMS) { + /* to speed the rate of flushing dirty page to disk */ + retry_times = CONDITION_LOCK_RETRY_TIMES; + } for (;;) { if (!LWLockConditionalAcquire(buf_desc->content_lock, LW_SHARED)) { i++; @@ -4057,6 +4086,12 @@ bool SyncFlushOneBuffer(int buf_id, bool get_condition_lock) (void)LWLockAcquire(buf_desc->content_lock, LW_SHARED); } + if (ENABLE_DMS && buf_desc->aio_in_progress) { + LWLockRelease(buf_desc->content_lock); + UnpinBuffer(buf_desc, true); + return false; + } + if (IsSegmentBufferID(buf_id)) { Assert(IsSegmentPhysicalRelNode(buf_desc->tag.rnode)); SegFlushBuffer(buf_desc, NULL); @@ -4064,6 +4099,10 @@ bool SyncFlushOneBuffer(int buf_id, bool get_condition_lock) FlushBuffer(buf_desc, NULL); } + if (SS_REFORM_REFORMER) { + ClearReadHint(buf_desc->buf_id); + } + LWLockRelease(buf_desc->content_lock); return true; } @@ -4126,7 +4165,11 @@ uint32 SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext* wb_c tag.rnode.relNode = buf_desc->seg_fileno; tag.blockNum = buf_desc->seg_blockno; } - UnpinBuffer(buf_desc, true); + + if (!t_thrd.dms_cxt.buf_in_aio) { + /* when enable DSS AIO, UnpinBuffer in AIO complete callback */ + UnpinBuffer(buf_desc, true); + } ScheduleBufferTagForWriteback(wb_context, &tag); @@ -4533,6 +4576,8 @@ void FlushBuffer(void *buf, SMgrRelation reln, ReadBufferMethod flushmethod, boo uint32 buf_state; RedoBufferInfo bufferinfo = {0}; + t_thrd.dms_cxt.buf_in_aio = false; + /* * Acquire the buffer's io_in_progress lock. 
If StartBufferIO returns * false, then someone else flushed the buffer before we could, so we need @@ -4629,7 +4674,39 @@ void FlushBuffer(void *buf, SMgrRelation reln, ReadBufferMethod flushmethod, boo .bucketNode = SegmentBktId, .opt = 0 }; - seg_physical_write(spc, fakenode, bufferinfo.blockinfo.forknum, bufdesc->seg_blockno, bufToWrite, false); + + if (ENABLE_DMS && t_thrd.role == PAGEWRITER_THREAD && ENABLE_DSS_AIO) { + int thread_id = t_thrd.pagewriter_cxt.pagewriter_id; + PageWriterProc *pgwr = &g_instance.ckpt_cxt_ctl->pgwr_procs.writer_proc[thread_id]; + DSSAioCxt *aio_cxt = &pgwr->aio_cxt; + int aiobuf_id = DSSAioGetIOCBIndex(aio_cxt); + char *tempBuf = (char *)(pgwr->aio_buf + aiobuf_id * BLCKSZ); + errno_t ret = memcpy_s(tempBuf, BLCKSZ, bufToWrite, BLCKSZ); + securec_check(ret, "\0", "\0"); + + struct iocb *iocb_ptr = DSSAioGetIOCB(aio_cxt); + int32 io_ret = seg_physical_aio_prep_pwrite(spc, fakenode, bufferinfo.blockinfo.forknum, + bufdesc->seg_blockno, tempBuf, (void *)iocb_ptr); + if (io_ret != DSS_SUCCESS) { + ereport(PANIC, (errmsg("dss aio failed, buffer: %d/%d/%d/%d/%d %d-%u", + fakenode.spcNode, fakenode.dbNode, fakenode.relNode, (int)fakenode.bucketNode, + (int)fakenode.opt, bufferinfo.blockinfo.forknum, bufdesc->seg_blockno))); + } + + if (bufdesc->aio_in_progress) { + ereport(PANIC, (errmsg("buffer is already in aio progress, buffer: %d/%d/%d/%d/%d %d-%u", + fakenode.spcNode, fakenode.dbNode, fakenode.relNode, (int)fakenode.bucketNode, + (int)fakenode.opt, bufferinfo.blockinfo.forknum, bufdesc->seg_blockno))); + } + + t_thrd.dms_cxt.buf_in_aio = true; + bufdesc->aio_in_progress = true; + /* should be after io_prep_pwrite, because io_prep_pwrite will memset iocb struct */ + iocb_ptr->data = (void *)bufdesc; + DSSAioAppendIOCB(aio_cxt, iocb_ptr); + } else { + seg_physical_write(spc, fakenode, bufferinfo.blockinfo.forknum, bufdesc->seg_blockno, bufToWrite, false); + } } else { SegmentCheck(!IsSegmentFileNode(bufdesc->tag.rnode)); 
smgrwrite(reln, bufferinfo.blockinfo.forknum, bufferinfo.blockinfo.blkno, bufToWrite, skipFsync); @@ -5304,7 +5381,8 @@ static void flush_wait_page_writer(BufferDesc *buf_desc, Relation rel, Oid db_id uint32 buf_state; for (;;) { buf_state = LockBufHdr(buf_desc); - if (flush_buffer_match(buf_desc, rel, db_id) && dw_buf_valid_dirty(buf_state)) { + if (flush_buffer_match(buf_desc, rel, db_id) && dw_buf_valid_aio_finished(buf_desc, buf_state) && + dw_buf_valid_dirty(buf_state)) { UnlockBufHdr(buf_desc, buf_state); pg_usleep(MILLISECOND_TO_MICROSECOND); } else { @@ -6218,6 +6296,12 @@ bool StartBufferIO(BufferDesc *buf, bool for_input) */ (void)LWLockAcquire(buf->io_in_progress_lock, LW_EXCLUSIVE); + if (buf->aio_in_progress) { + LWLockRelease(buf->io_in_progress_lock); + pg_usleep(1000L); + continue; + } + buf_state = LockBufHdr(buf); if (!(buf_state & BM_IO_IN_PROGRESS)) { break; diff --git a/src/gausskernel/storage/dss/dss_adaptor.cpp b/src/gausskernel/storage/dss/dss_adaptor.cpp index dc0e7ba75..986222e92 100644 --- a/src/gausskernel/storage/dss/dss_adaptor.cpp +++ b/src/gausskernel/storage/dss/dss_adaptor.cpp @@ -131,6 +131,8 @@ int dss_device_init(const char *conn_path, bool enable_dss) SS_RETURN_IFERR( dss_load_symbol(device_op.handle, "dss_register_log_callback", (void **)&device_op.dss_register_log_callback)); SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_get_lib_version", (void **)&device_op.dss_get_version)); + SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_aio_prep_pwrite", (void **)&device_op.dss_aio_pwrite)); + SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_aio_prep_pread", (void **)&device_op.dss_aio_pread)); int my_version = dss_get_my_version(); int lib_version = dss_get_lib_version(); @@ -153,4 +155,4 @@ int dss_device_init(const char *conn_path, bool enable_dss) void dss_register_log_callback(dss_log_output cb_log_output) { device_op.dss_register_log_callback(cb_log_output); -} \ No newline at end of file +} diff --git 
a/src/gausskernel/storage/dss/fio_dss.cpp b/src/gausskernel/storage/dss/fio_dss.cpp index 56afa05b3..5fff27857 100644 --- a/src/gausskernel/storage/dss/fio_dss.cpp +++ b/src/gausskernel/storage/dss/fio_dss.cpp @@ -707,4 +707,14 @@ int dss_remove_dev(const char *name) } else { return GS_SUCCESS; } -} \ No newline at end of file +} + +int dss_aio_prep_pwrite(void *iocb, int fd, void *buf, size_t count, long long offset) +{ + return g_dss_device_op.dss_aio_pwrite(iocb, fd, buf, count, offset); +} + +int dss_aio_prep_pread(void *iocb, int fd, void *buf, size_t count, long long offset) +{ + return g_dss_device_op.dss_aio_pread(iocb, fd, buf, count, offset); +} diff --git a/src/gausskernel/storage/ipc/procarray.cpp b/src/gausskernel/storage/ipc/procarray.cpp index f4ec88c83..daa4ece53 100755 --- a/src/gausskernel/storage/ipc/procarray.cpp +++ b/src/gausskernel/storage/ipc/procarray.cpp @@ -1914,6 +1914,10 @@ RETRY: } else { result = GetLocalSnapshotData(snapshot); snapshot->snapshotcsn = pg_atomic_read_u64(&t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo); + if (SS_IN_REFORM) { + ereport(ERROR, (errmsg("failed to request snapshot as current node is in reform!"))); + return NULL; + } } if (result) { diff --git a/src/gausskernel/storage/ipc/sinval.cpp b/src/gausskernel/storage/ipc/sinval.cpp index c14ba10d9..e30e5e551 100644 --- a/src/gausskernel/storage/ipc/sinval.cpp +++ b/src/gausskernel/storage/ipc/sinval.cpp @@ -164,8 +164,11 @@ void ReceiveSharedInvalidMessages(void (*invalFunction)(SharedInvalidationMessag * catchup signal this way avoids creating spikes in system load for what * should be just a background maintenance activity. 
*/ - if (catchupInterruptPending) { + if (catchupInterruptPending || (ENABLE_DMS && g_instance.dms_cxt.resetSyscache)) { catchupInterruptPending = false; + if (ENABLE_DMS) { + g_instance.dms_cxt.resetSyscache = false; + } ereport(DEBUG4, (errmsg("sinval catchup complete, cleaning queue"))); SICleanupQueue(false, 0); } diff --git a/src/gausskernel/storage/replication/walsender.cpp b/src/gausskernel/storage/replication/walsender.cpp index 93b0d7a6e..a67cd2451 100755 --- a/src/gausskernel/storage/replication/walsender.cpp +++ b/src/gausskernel/storage/replication/walsender.cpp @@ -7059,7 +7059,7 @@ static void WalSndRefreshPercentCountStartLsn(XLogRecPtr currentMaxLsn, XLogRecP XLogSegNo WalGetSyncCountWindow(void) { - return (XLogSegNo)(uint32)u_sess->attr.attr_storage.wal_keep_segments; + return (XLogSegNo)(uint32)XLogSegmentsNum(u_sess->attr.attr_storage.wal_keep_segments); } void add_archive_task_to_list(int archive_task_status_idx, WalSnd *walsnd) diff --git a/src/gausskernel/storage/smgr/segment/data_file.cpp b/src/gausskernel/storage/smgr/segment/data_file.cpp index be22d2c42..f20810f47 100644 --- a/src/gausskernel/storage/smgr/segment/data_file.cpp +++ b/src/gausskernel/storage/smgr/segment/data_file.cpp @@ -199,7 +199,7 @@ bool df_ss_update_segfile_size(SegLogicFile *sf, BlockNumber target_block) return true; } -static SegPhysicalFile df_get_physical_file(SegLogicFile *sf, int sliceno, BlockNumber target_block) +SegPhysicalFile df_get_physical_file(SegLogicFile *sf, int sliceno, BlockNumber target_block) { AutoMutexLock filelock(&sf->filelock); filelock.lock(); diff --git a/src/gausskernel/storage/smgr/segment/segbuffer.cpp b/src/gausskernel/storage/smgr/segment/segbuffer.cpp index b0ad1fc24..4a84a30fa 100644 --- a/src/gausskernel/storage/smgr/segment/segbuffer.cpp +++ b/src/gausskernel/storage/smgr/segment/segbuffer.cpp @@ -81,6 +81,12 @@ static bool SegStartBufferIO(BufferDesc *buf, bool forInput) while (true) { LWLockAcquire(buf->io_in_progress_lock, 
LW_EXCLUSIVE); + if (buf->aio_in_progress) { + LWLockRelease(buf->io_in_progress_lock); + pg_usleep(1000L); + continue; + } + buf_state = LockBufHdr(buf); if (!(buf_state & BM_IO_IN_PROGRESS)) { @@ -302,6 +308,8 @@ void SegMarkBufferDirty(Buffer buf) void SegFlushBuffer(BufferDesc *buf, SMgrRelation reln) { + t_thrd.dms_cxt.buf_in_aio = false; + if (!SegStartBufferIO(buf, false)) { /* Someone else flushed the buffer */ return; @@ -321,7 +329,7 @@ void SegFlushBuffer(BufferDesc *buf, SMgrRelation reln) char *buf_to_write = NULL; RedoBufferInfo buffer_info; - + SegSpace *spc; if (reln == NULL || reln->seg_space == NULL) { spc = spc_open(buf->tag.rnode.spcNode, buf->tag.rnode.dbNode, false); @@ -354,7 +362,38 @@ void SegFlushBuffer(BufferDesc *buf, SMgrRelation reln) buf_to_write = PageSetChecksumCopy((Page)buf_to_write, buffer_info.blockinfo.blkno, true); } - seg_physical_write(spc, buf->tag.rnode, buf->tag.forkNum, buf->tag.blockNum, (char *)buf_to_write, false); + if (ENABLE_DMS && t_thrd.role == PAGEWRITER_THREAD && ENABLE_DSS_AIO) { + int thread_id = t_thrd.pagewriter_cxt.pagewriter_id; + PageWriterProc *pgwr = &g_instance.ckpt_cxt_ctl->pgwr_procs.writer_proc[thread_id]; + DSSAioCxt *aio_cxt = &pgwr->aio_cxt; + int aiobuf_id = DSSAioGetIOCBIndex(aio_cxt); + char *tempBuf = (char *)(pgwr->aio_buf + aiobuf_id * BLCKSZ); + errno_t ret = memcpy_s(tempBuf, BLCKSZ, buf_to_write, BLCKSZ); + securec_check(ret, "\0", "\0"); + + struct iocb *iocb_ptr = DSSAioGetIOCB(aio_cxt); + int32 io_ret = seg_physical_aio_prep_pwrite(spc, buf->tag.rnode, buf->tag.forkNum, + buf->tag.blockNum, tempBuf, (void *)iocb_ptr); + if (io_ret != DSS_SUCCESS) { + ereport(PANIC, (errmsg("dss aio failed, buffer: %d/%d/%d/%d/%d %d-%u", + buf->tag.rnode.spcNode, buf->tag.rnode.dbNode, buf->tag.rnode.relNode, (int)buf->tag.rnode.bucketNode, + (int)buf->tag.rnode.opt, buf->tag.forkNum, buf->tag.blockNum))); + } + + if (buf->aio_in_progress) { + ereport(PANIC, (errmsg("buffer is already in aio progress, 
buffer: %d/%d/%d/%d/%d %d-%u", + buf->tag.rnode.spcNode, buf->tag.rnode.dbNode, buf->tag.rnode.relNode, (int)buf->tag.rnode.bucketNode, + (int)buf->tag.rnode.opt, buf->tag.forkNum, buf->tag.blockNum))); + } + + buf->aio_in_progress = true; + t_thrd.dms_cxt.buf_in_aio = true; + /* should be after io_prep_pwrite, because io_prep_pwrite will memset iocb struct */ + iocb_ptr->data = (void *)buf; + DSSAioAppendIOCB(aio_cxt, iocb_ptr); + } else { + seg_physical_write(spc, buf->tag.rnode, buf->tag.forkNum, buf->tag.blockNum, (char *)buf_to_write, false); + } SegTerminateBufferIO(buf, true, 0); @@ -485,8 +524,12 @@ Buffer ReadBufferFast(SegSpace *spc, RelFileNode rnode, ForkNumber forkNum, Bloc } dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(bufHdr->buf_id); - if (!LockModeCompatible(buf_ctrl, LW_SHARED)) { - if (!StartReadPage(bufHdr, LW_SHARED)) { + LWLockMode lockmode = LW_SHARED; + if (g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy && SS_REFORM_REFORMER) { + lockmode = LW_EXCLUSIVE; + } + if (!LockModeCompatible(buf_ctrl, lockmode)) { + if (!StartReadPage(bufHdr, lockmode)) { SegTerminateBufferIO((BufferDesc *)bufHdr, false, 0); // when reform fail, should return InvalidBuffer to reform proc thread if (AmDmsReformProcProcess() && dms_reform_failed()) { diff --git a/src/gausskernel/storage/smgr/segment/space.cpp b/src/gausskernel/storage/smgr/segment/space.cpp index e85cb2eca..462489b18 100644 --- a/src/gausskernel/storage/smgr/segment/space.cpp +++ b/src/gausskernel/storage/smgr/segment/space.cpp @@ -40,6 +40,8 @@ #include "utils/relfilenodemap.h" #include "pgxc/execRemote.h" #include "ddes/dms/ss_transaction.h" +#include "storage/file/fio_device.h" +#include "libaio.h" void spc_lock(SegSpace *spc) { @@ -103,6 +105,28 @@ void spc_write_block(SegSpace *spc, RelFileNode relNode, ForkNumber forknum, con df_pwrite_block(seg->segfile, buffer, blocknum); } +int32 spc_aio_prep_pwrite(SegSpace *spc, RelFileNode relNode, ForkNumber forknum, BlockNumber blocknum, + const char 
*buffer, void *iocb_ptr) +{ + int egid = EXTENT_TYPE_TO_GROUPID(relNode.relNode); + SegExtentGroup *seg = &spc->extent_group[egid][forknum]; + + off_t offset = ((off_t)blocknum) * BLCKSZ; + int sliceno = DF_OFFSET_TO_SLICENO(offset); + off_t roffset = DF_OFFSET_TO_SLICE_OFFSET(offset); + + SegPhysicalFile spf = df_get_physical_file(seg->segfile, sliceno, blocknum); + int32 ret; + if (is_dss_fd(spf.fd)) { + ret = dss_aio_prep_pwrite(iocb_ptr, spf.fd, (void *)buffer, BLCKSZ, roffset); + } else { + io_prep_pwrite((struct iocb *)iocb_ptr, spf.fd, (void *)buffer, BLCKSZ, roffset); + ret = DSS_SUCCESS; + } + + return ret; +} + void spc_writeback(SegSpace *spc, int extent_size, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { int egid = EXTENT_SIZE_TO_GROUPID(extent_size); diff --git a/src/gausskernel/storage/smgr/segstore.cpp b/src/gausskernel/storage/smgr/segstore.cpp index 9a618eab6..a8e4b9e2d 100755 --- a/src/gausskernel/storage/smgr/segstore.cpp +++ b/src/gausskernel/storage/smgr/segstore.cpp @@ -1113,14 +1113,16 @@ SegPageLocation seg_get_physical_location(RelFileNode rnode, ForkNumber forknum, reln = smgropen(rnode, InvalidBackendId); Buffer buffer = read_head_buffer(reln, forknum, false); - if (ENABLE_DMS) { - LockBuffer(buffer, BUFFER_LOCK_SHARE); - } SegmentCheck(BufferIsValid(buffer)); + volatile BufferDesc *buf = GetBufferDescriptor(buffer - 1); + bool need_lock = !LWLockHeldByMe(buf->content_lock); + if (ENABLE_DMS && need_lock) { + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } SegmentHead *head = (SegmentHead *)PageGetContents(BufferGetBlock(buffer)); SegPageLocation loc = seg_logic_to_physic_mapping(reln, head, blocknum); - if (ENABLE_DMS) { + if (ENABLE_DMS && need_lock) { LockBuffer(buffer, BUFFER_LOCK_UNLOCK); } @@ -1896,6 +1898,14 @@ void seg_physical_write(SegSpace *spc, RelFileNode &rNode, ForkNumber forknum, B spc_write_block(spc, rNode, forknum, buffer, blocknum); } +int32 seg_physical_aio_prep_pwrite(SegSpace *spc, RelFileNode &rNode, 
ForkNumber forknum, BlockNumber blocknum, + const char *buffer, void *iocb_ptr) +{ + SegmentCheck(IsSegmentPhysicalRelNode(rNode)); + SegmentCheck(spc != NULL); + + return spc_aio_prep_pwrite(spc, rNode, forknum, blocknum, buffer, iocb_ptr); +} static bool check_meta_data(BlockNumber extent, uint32 extent_size, uint32* offset_block) { if (extent < DF_MAP_HEAD_PAGE + 1 || extent_size == EXTENT_1) { diff --git a/src/include/access/double_write.h b/src/include/access/double_write.h index 09df50262..00fa8c770 100644 --- a/src/include/access/double_write.h +++ b/src/include/access/double_write.h @@ -148,6 +148,10 @@ const uint16 DW_SECOND_DATA_START_IDX = DW_SECOND_BUFTAG_START_IDX + DW_SECOND_B inline bool dw_buf_valid_dirty(uint32 buf_state) { + if (ENABLE_DMS && ENABLE_DSS_AIO) { + return true; + } + return ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)); } @@ -302,6 +306,18 @@ inline bool dw_page_writer_running() return (dw_enabled() && pg_atomic_read_u32(&g_instance.ckpt_cxt_ctl->current_page_writer_count) > 0); } +/** + * Always true unless both DMS and DSS AIO are enabled; then true only while the buffer is valid and either dirty or has aio_in_progress set.
+ */ +inline bool dw_buf_valid_aio_finished(BufferDesc *buf_desc, uint32 buf_state) +{ + if (!ENABLE_DMS || !ENABLE_DSS_AIO) { + return true; + } + + return ((buf_state & BM_VALID) && ((buf_state & BM_DIRTY) || buf_desc->aio_in_progress)); +} + extern bool free_space_enough(int buf_id); extern void dw_generate_single_file(); diff --git a/src/include/access/xlog_basic.h b/src/include/access/xlog_basic.h index 46ce4afe2..d714fa160 100644 --- a/src/include/access/xlog_basic.h +++ b/src/include/access/xlog_basic.h @@ -57,6 +57,11 @@ #define XLogSegSize XLogSegmentSize #define XLogSegmentsPerXLogId (UINT64CONST(0x100000000) / XLogSegmentSize) #define XLogRecordMaxSize ((uint32)0x3fffe000) /* 1 gigabyte - 8 kbyte */ +#define XLogBaseSize (16ULL * 1024 * 1024) +#define XLogSegmentsNum(val) (((val) * XLogBaseSize + XLogSegSize - 1) / XLogSegSize) + + +#define XLogPreReadSize 67108864 // 64MB /* Compute XLogRecPtr with segment number and offset. */ #define XLogSegNoOffsetToRecPtr(segno, offset, dest) \ @@ -84,6 +89,9 @@ #define XLByteInPrevSeg(xlrp, logSegNo) ((((xlrp)-1) / XLogSegSize) == (logSegNo)) +#define XLByteInPreReadBuf(xlrp, preReadStartPtr) \ + (((xlrp) >= (preReadStartPtr)) && ((xlrp) < (preReadStartPtr + XLogPreReadSize))) + /* Check if an XLogRecPtr value is in a plausible range */ #define XRecOffIsValid(xlrp) ((xlrp) % XLOG_BLCKSZ >= SizeOfXLogShortPHD) @@ -103,6 +111,7 @@ #define InvalidRepOriginId 0 +#define InvalidXlogPreReadStartPtr 0xFFFFFFFFFFFFFFFF /* * Block IDs used to distinguish different kinds of record fragments. 
Block @@ -300,6 +309,12 @@ struct XLogReaderState { char* readBuf; uint32 readLen; char* readBufOrigin; + + /* pre-reading for dss */ + XLogRecPtr preReadStartPtr; + char* preReadBuf; + char* preReadBufOrigin; + /* last read segment, segment offset, TLI for data currently in readBuf */ XLogSegNo readSegNo; uint32 readOff; diff --git a/src/include/ddes/dms/dms_api.h b/src/include/ddes/dms/dms_api.h index c100faa78..d97c467de 100644 --- a/src/include/ddes/dms/dms_api.h +++ b/src/include/ddes/dms/dms_api.h @@ -201,6 +201,7 @@ typedef struct st_dms_cr { unsigned long long query_scn; unsigned int ssn; char *page; + unsigned char *fb_mark; } dms_cr_t; typedef struct st_dms_opengauss_xid_csn { @@ -263,22 +264,19 @@ typedef struct st_dms_buf_ctrl { // can be discard only after latest version in other instance is cleaned volatile unsigned char is_edp; volatile unsigned char force_request; // force to request page from remote - volatile unsigned char need_chk_master; // suport owner transfer page again volatile unsigned char need_flush; // for recovery, owner is abort, copy instance should flush before release unsigned long long edp_scn; // set when become edp, lastest scn when page becomes edp unsigned long long edp_map; // records edp instance long long last_ckpt_time; // last time when local edp page is added to group.
+ unsigned long long ver; #ifdef OPENGAUSS int buf_id; unsigned int state; unsigned int pblk_relno; unsigned int pblk_blkno; unsigned long long pblk_lsn; - unsigned long long lsn_on_disk; - unsigned char seg_fileno; - unsigned int seg_blockno; #endif -}dms_buf_ctrl_t; +} dms_buf_ctrl_t; typedef enum en_dms_page_latch_mode { DMS_PAGE_LATCH_MODE_S = 1, @@ -431,6 +429,21 @@ typedef enum en_dms_role { DMS_ROLE_PARTNER = 2 } dms_role_t; +typedef enum en_reform_phase { + DMS_PHASE_START = 0, + DMS_PHASE_AFTER_RECOVERY = 1, + DMS_PHASE_BEFORE_DC_INIT = 2, + DMS_PHASE_BEFORE_ROLLBACK = 3, + DMS_PHASE_END = 4, +} reform_phase_t; + +typedef enum en_dms_status { + DMS_STATUS_OUT = 0, + DMS_STATUS_JOIN = 1, + DMS_STATUS_REFORM = 2, + DMS_STATUS_IN = 3 +} dms_status_t; // used in database startup + #define DCS_BATCH_BUF_SIZE (1024 * 30) #define DCS_RLS_OWNER_BATCH_SIZE (DCS_BATCH_BUF_SIZE / DMS_PAGEID_SIZE) typedef struct st_dcs_batch_buf { @@ -445,20 +458,21 @@ typedef int(*dms_save_list_stable)(void *db_handle, unsigned long long list_stab typedef int(*dms_get_dms_status)(void *db_handle); typedef void(*dms_set_dms_status)(void *db_handle, int status); typedef int(*dms_confirm_converting)(void *db_handle, char *pageid, unsigned char smon_chk, - unsigned char *lock_mode, unsigned long long *edp_map, unsigned long long *lsn); + unsigned char *lock_mode, unsigned long long *edp_map, unsigned long long *lsn, unsigned long long *ver); typedef int(*dms_confirm_owner)(void *db_handle, char *pageid, unsigned char *lock_mode, unsigned char *is_edp, unsigned long long *lsn); typedef int(*dms_flush_copy)(void *db_handle, char *pageid); typedef int(*dms_edp_lsn)(void *db_handle, char *pageid, unsigned long long *lsn); typedef int(*dms_disk_lsn)(void *db_handle, char *pageid, unsigned long long *lsn); -typedef int(*dms_recovery)(void *db_handle, void *recovery_list, void *remove_list, int is_reformer); +typedef int(*dms_recovery)(void *db_handle, void *recovery_list, int is_reformer); 
typedef int(*dms_opengauss_startup)(void *db_handle); typedef int(*dms_opengauss_recovery_standby)(void *db_handle, int inst_id); typedef int(*dms_opengauss_recovery_primary)(void *db_handle, int inst_id); -typedef void(*dms_reform_start_notify)(void *db_handle, dms_role_t role); +typedef void(*dms_reform_start_notify)(void *db_handle, dms_role_t role, unsigned char reform_type); typedef int(*dms_undo_init)(void *db_handle, unsigned char inst_id); typedef int(*dms_tx_area_init)(void *db_handle, unsigned char inst_id); -typedef int(*dms_tx_area_load)(void *db_handle, unsigned char inst_id); +typedef int (*dms_tx_area_load)(void *db_handle, unsigned char inst_id); +typedef int (*dms_tx_rollback_finish)(void *db_handle, unsigned char inst_id); typedef unsigned char(*dms_recovery_in_progress)(void *db_handle); typedef unsigned int(*dms_get_page_hash_val)(const char pageid[DMS_PAGEID_SIZE]); typedef unsigned long long(*dms_get_page_lsn)(const dms_buf_ctrl_t *buf_ctrl); @@ -480,7 +494,7 @@ typedef unsigned char(*dms_page_is_dirty)(dms_buf_ctrl_t *buf_ctrl); typedef void(*dms_leave_local_page)(void *db_handle, dms_buf_ctrl_t *buf_ctrl); typedef void(*dms_get_pageid)(dms_buf_ctrl_t *buf_ctrl, char **pageid, unsigned int *size); typedef char *(*dms_get_page)(dms_buf_ctrl_t *buf_ctrl); -typedef void (*dms_invalidate_page)(void *db_handle, char pageid[DMS_PAGEID_SIZE]); +typedef int (*dms_invalidate_page)(void *db_handle, char pageid[DMS_PAGEID_SIZE], unsigned long long ver); typedef void *(*dms_get_db_handle)(unsigned int *db_handle_index); typedef void *(*dms_stack_push_cr_cursor)(void *db_handle); typedef void (*dms_stack_pop_cr_cursor)(void *db_handle); @@ -492,9 +506,11 @@ typedef void(*dms_init_check_cr_cursor)(void *cr_cursor, char rowid[DMS_ROWID_SI unsigned long long query_scn, unsigned int ssn); typedef char *(*dms_get_wxid_from_cr_cursor)(void *cr_cursor); typedef unsigned char(*dms_get_instid_of_xid_from_cr_cursor)(void *db_handle, void *cr_cursor); -typedef 
int(*dms_get_page_invisible_txn_list)(void *db_handle, void *cr_cursor, void *cr_page, - unsigned char *is_empty_txn_list, unsigned char *exist_waiting_txn); -typedef int(*dms_reorganize_page_with_undo)(void *db_handle, void *cr_cursor, void *cr_page); +typedef int (*dms_get_page_invisible_txn_list)(void *db_handle, void *cr_cursor, void *cr_page, + unsigned char *is_empty_txn_list, unsigned char *exist_waiting_txn); +typedef int (*dms_reorganize_heap_page_with_undo)(void *db_handle, void *cr_cursor, void *cr_page, + unsigned char *fb_mark); +typedef int (*dms_reorganize_index_page_with_undo)(void *db_handle, void *cr_cursor, void *cr_page); typedef int(*dms_check_heap_page_visible_with_undo_snapshot)(void *db_handle, void *cr_cursor, void *page, unsigned char *is_found); typedef void(*dms_set_page_force_request)(void *db_handle, char pageid[DMS_PAGEID_SIZE]); @@ -525,7 +541,7 @@ typedef int(*dms_get_txn_snapshot)(void *db_handle, unsigned int xmap, dms_txn_s typedef int(*dms_get_opengauss_txn_snapshot)(void *db_handle, dms_opengauss_txn_snapshot_t *txn_snapshot); typedef void (*dms_log_output)(dms_log_id_t log_type, dms_log_level_t log_level, const char *code_file_name, unsigned int code_line_num, const char *module_name, const char *format, ...); -typedef void (*dms_log_flush)(void *db_handle, unsigned long long *lsn); +typedef int (*dms_log_flush)(void *db_handle, unsigned long long *lsn); typedef int(*dms_process_edp)(void *db_handle, dms_edp_info_t *pages, unsigned int count); typedef void (*dms_clean_ctrl_edp)(void *db_handle, dms_buf_ctrl_t *dms_ctrl); typedef char *(*dms_display_pageid)(char *display_buf, unsigned int count, char *pageid); @@ -537,12 +553,12 @@ typedef void (*dms_check_if_build_complete)(void *db_handle, unsigned int *build typedef int (*dms_db_is_primary)(void *db_handle); typedef void (*dms_set_switchover_result)(void *db_handle, int result); typedef void (*dms_set_db_standby)(void *db_handle); -typedef int (*dms_load_tablespace)(void 
*db_handle, unsigned int *has_offline); +typedef int (*dms_mount_to_recovery)(void *db_handle, unsigned int *has_offline); +typedef int(*dms_get_open_status)(void *db_handle); // for openGauss typedef void (*dms_thread_init_t)(unsigned char need_startup, char **reg_data); typedef int (*dms_get_db_primary_id)(void *db_handle, unsigned int *primary_id); -typedef int (*dms_set_buf_info)(dms_buf_ctrl_t *buf_ctrl); // for ssl typedef int(*dms_decrypt_pwd_t)(const char *cipher, unsigned int len, char *plain, unsigned int size); @@ -564,7 +580,9 @@ typedef int (*dms_switchover_demote)(void *db_handle); typedef int (*dms_switchover_promote)(void *db_handle); typedef int (*dms_switchover_promote_opengauss)(void *db_handle, unsigned char origPrimaryId); typedef int (*dms_failover_promote_opengauss)(void *db_handle); -typedef int (*dms_refresh_point)(void *db_handle); +typedef int (*dms_reform_done_notify)(void *db_handle); +typedef int (*dms_log_wait_flush)(void *db_handle, unsigned long long lsn); +typedef int (*dms_wait_ckpt)(void *db_handle); typedef struct st_dms_callback { // used in reform @@ -579,9 +597,11 @@ typedef struct st_dms_callback { dms_disk_lsn disk_lsn; dms_recovery recovery; dms_db_is_primary db_is_primary; + dms_get_open_status get_open_status; dms_undo_init undo_init; dms_tx_area_init tx_area_init; dms_tx_area_load tx_area_load; + dms_tx_rollback_finish tx_rollback_finish; dms_recovery_in_progress recovery_in_progress; dms_drc_buf_res_rebuild dms_reform_rebuild_buf_res; dms_check_if_build_complete check_if_build_complete; @@ -593,7 +613,6 @@ typedef struct st_dms_callback { dms_opengauss_recovery_standby opengauss_recovery_standby; dms_opengauss_recovery_primary opengauss_recovery_primary; dms_reform_start_notify reform_start_notify; - dms_set_buf_info set_buf_info; dms_get_page_hash_val get_page_hash_val; dms_get_page_lsn get_page_lsn; @@ -608,7 +627,6 @@ typedef struct st_dms_callback { dms_get_page_lfn get_page_lfn; dms_get_global_flushed_lfn 
get_global_flushed_lfn; dms_read_local_page4transfer read_local_page4transfer; - dms_try_read_local_page try_read_local_page; dms_page_is_dirty page_is_dirty; dms_leave_local_page leave_local_page; dms_get_pageid get_pageid; @@ -624,8 +642,8 @@ typedef struct st_dms_callback { dms_get_instid_of_xid_from_cr_cursor get_instid_of_xid_from_cr_cursor; dms_get_page_invisible_txn_list get_heap_invisible_txn_list; dms_get_page_invisible_txn_list get_index_invisible_txn_list; - dms_reorganize_page_with_undo reorganize_heap_page_with_undo; - dms_reorganize_page_with_undo reorganize_index_page_with_undo; + dms_reorganize_heap_page_with_undo reorganize_heap_page_with_undo; + dms_reorganize_index_page_with_undo reorganize_index_page_with_undo; dms_check_heap_page_visible_with_undo_snapshot check_heap_page_visible_with_udss; dms_set_page_force_request set_page_force_request; dms_get_entry_pageid_from_cr_cursor get_entry_pageid_from_cr_cursor; @@ -676,9 +694,11 @@ typedef struct st_dms_callback { dms_failover_promote_opengauss failover_promote_opengauss; dms_set_switchover_result set_switchover_result; dms_set_db_standby set_db_standby; - dms_load_tablespace load_tablespace; + dms_mount_to_recovery mount_to_recovery; - dms_refresh_point refresh_point; + dms_reform_done_notify reform_done_notify; + dms_log_wait_flush log_wait_flush; + dms_wait_ckpt wait_ckpt; } dms_callback_t; typedef struct st_dms_instance_net_addr { @@ -739,7 +759,7 @@ typedef struct st_dms_profile { #define DMS_LOCAL_MINOR_VER_WEIGHT 1000 #define DMS_LOCAL_MAJOR_VERSION 0 #define DMS_LOCAL_MINOR_VERSION 0 -#define DMS_LOCAL_VERSION 22 +#define DMS_LOCAL_VERSION 35 #ifdef __cplusplus } diff --git a/src/include/ddes/dms/ss_aio.h b/src/include/ddes/dms/ss_aio.h new file mode 100644 index 000000000..db41149e1 --- /dev/null +++ b/src/include/ddes/dms/ss_aio.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. 
+ * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * ss_aio.h + * aio interface. + * + * IDENTIFICATION + * src/include/ddes/dms/ss_aio.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef __SS_AIO_H__ +#define __SS_AIO_H__ + +#include "libaio.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*aio_callback)(struct io_event *event); +#define DSS_AIO_BATCH_SIZE 128 +#define DSS_AIO_UTIL_NUM 2 +typedef struct AioUtil { + io_context_t handle; + struct iocb iocbs[DSS_AIO_BATCH_SIZE]; + struct iocb *iocbs_ptr[DSS_AIO_BATCH_SIZE]; + struct io_event events[DSS_AIO_BATCH_SIZE]; + int iocount; +} AioUtil; + +typedef struct DSSAioCxt { + bool initialized; + aio_callback aiocb; + int index; + AioUtil aio[DSS_AIO_UTIL_NUM]; +} DSSAioCxt; + +void DSSAioInitialize(DSSAioCxt *aio_cxt, aio_callback callback); +void DSSAioDestroy(DSSAioCxt *aio_cxt); +struct iocb* DSSAioGetIOCB(DSSAioCxt *aio_cxt); +int DSSAioGetIOCBIndex(DSSAioCxt *aio_cxt); +void DSSAioAppendIOCB(DSSAioCxt *aio_cxt, struct iocb *iocb_ptr); +void DSSAioFlush(DSSAioCxt *aio_cxt); + +#ifdef __cplusplus +} +#endif + +#endif /* __SS_AIO_H__ */ \ No newline at end of file diff --git a/src/include/ddes/dms/ss_common_attr.h b/src/include/ddes/dms/ss_common_attr.h index c8f272381..d691b0dbf 100644 --- a/src/include/ddes/dms/ss_common_attr.h +++ b/src/include/ddes/dms/ss_common_attr.h @@ -97,6 +97,7 @@ #define BUF_IS_RELPERSISTENT 0x20 #define BUF_IS_RELPERSISTENT_TEMP 
0x40 #define BUF_READ_MODE_ZERO_LOCK 0x80 +#define BUF_DIRTY_NEED_FLUSH 0x100 #define SS_BROADCAST_FAILED_RETRYCOUNTS 4 #define SS_BROADCAST_WAIT_INFINITE (0xFFFFFFFF) @@ -135,4 +136,16 @@ typedef struct SSBroadcastCmdOnly { SSBroadcastOp type; // must be first } SSBroadcastCmdOnly; +typedef enum SSReformType { + DMS_REFORM_TYPE_FOR_NORMAL = 0, + DMS_REFORM_TYPE_FOR_BUILD, + DMS_REFORM_TYPE_FOR_FAILOVER, + DMS_REFORM_TYPE_FOR_SWITCHOVER, + DMS_REFORM_TYPE_FOR_OPENGAUSS, + DMS_REFORM_TYPE_FOR_FAILOVER_OPENGAUSS, + DMS_REFORM_TYPE_FOR_SWITCHOVER_OPENGAUSS, + DMS_REFORM_TYPE_FOR_FULL_CLEAN, + DMS_REFORM_TYPE_FOR_MAINTAIN +} SSReformType; + #endif diff --git a/src/include/ddes/dms/ss_dms.h b/src/include/ddes/dms/ss_dms.h index 247cc34bc..72d994084 100644 --- a/src/include/ddes/dms/ss_dms.h +++ b/src/include/ddes/dms/ss_dms.h @@ -72,6 +72,7 @@ typedef struct st_ss_dms_func { bool (*dms_latch_timed_s)(dms_context_t *dms_ctx, dms_drlatch_t *dlatch, unsigned int wait_ticks, unsigned char is_force); void (*dms_unlatch)(dms_context_t *dms_ctx, dms_drlatch_t *dlatch); + void (*dms_pre_uninit)(void); } ss_dms_func_t; int ss_dms_func_init(); @@ -109,6 +110,7 @@ int dms_reform_last_failed(void); bool dms_latch_timed_x(dms_context_t *dms_ctx, dms_drlatch_t *dlatch, unsigned int wait_ticks); bool dms_latch_timed_s(dms_context_t *dms_ctx, dms_drlatch_t *dlatch, unsigned int wait_ticks, unsigned char is_force); void dms_unlatch(dms_context_t *dms_ctx, dms_drlatch_t *dlatch); +void dms_pre_uninit(void); #ifdef __cplusplus } #endif diff --git a/src/include/ddes/dms/ss_dms_bufmgr.h b/src/include/ddes/dms/ss_dms_bufmgr.h index e35eae1d2..da42b7177 100644 --- a/src/include/ddes/dms/ss_dms_bufmgr.h +++ b/src/include/ddes/dms/ss_dms_bufmgr.h @@ -66,6 +66,8 @@ int SSLockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock); void SSLockReleaseAll(); void SSLockAcquireAll(); void MarkReadPblk(int buf_id, const XLogPhyBlock *pblk); +void SSCheckBufferIfNeedMarkDirty(Buffer buf); 
+void SSRecheckBufferPool(); void TransformLockTagToDmsLatch(dms_drlatch_t* dlatch, const LOCKTAG locktag); #endif diff --git a/src/include/ddes/dms/ss_dms_recovery.h b/src/include/ddes/dms/ss_dms_recovery.h index 3aa59874b..3bfb2d12a 100644 --- a/src/include/ddes/dms/ss_dms_recovery.h +++ b/src/include/ddes/dms/ss_dms_recovery.h @@ -34,6 +34,7 @@ #define SSSKIP_REDO_REPLAY (ENABLE_DMS && g_instance.dms_cxt.SSRecoveryInfo.skip_redo_replay == true) #define SS_BEFORE_RECOVERY (ENABLE_DMS && g_instance.dms_cxt.SSReformInfo.in_reform == true \ && g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag == true) +#define SS_IN_FAILOVER (ENABLE_DMS && g_instance.dms_cxt.SSRecoveryInfo.in_failover == true) typedef struct st_reformer_ctrl { uint64 list_stable; // stable instances list @@ -60,12 +61,13 @@ typedef struct ss_recovery_info { bool reform_ready; bool in_failover; // used to judge this is failover, this tag will combine with failover_triggered later // in failover Scenario,before failover_triggered become true, this node knows itself will become new primary + bool in_flushcopy; } ss_recovery_info_t; extern bool SSRecoveryNodes(); extern int SSGetPrimaryInstId(); extern void SSSavePrimaryInstId(int id); -extern void SSReadControlFile(int id); +extern void SSReadControlFile(int id, bool updateDmsCtx = false); extern void SSWriteReformerControlPages(void); extern bool SSRecoveryApplyDelay(const XLogReaderState *record); extern void SShandle_promote_signal(); diff --git a/src/include/ddes/dms/ss_reform_common.h b/src/include/ddes/dms/ss_reform_common.h index 387a6b52a..f6b3f675b 100644 --- a/src/include/ddes/dms/ss_reform_common.h +++ b/src/include/ddes/dms/ss_reform_common.h @@ -36,6 +36,8 @@ typedef struct SSBroadcastCancelTrx { int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path); +bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr 
targetPagePtr, char *buf); +XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data, Size alignedSize); void SSGetXlogPath(); void SSSaveReformerCtrl(); void SSClearSegCache(); diff --git a/src/include/knl/knl_guc/knl_instance_attr_storage.h b/src/include/knl/knl_guc/knl_instance_attr_storage.h index f5dee5b1b..fcb2ec3ec 100755 --- a/src/include/knl/knl_guc/knl_instance_attr_storage.h +++ b/src/include/knl/knl_guc/knl_instance_attr_storage.h @@ -98,6 +98,7 @@ typedef struct knl_instance_attr_dss { typedef struct knl_instance_attr_dms { bool enable_dms; bool enable_catalog_centralized; + bool enable_dss_aio; int instance_id; int recv_msg_pool_size; char* interconnect_url; diff --git a/src/include/knl/knl_instance.h b/src/include/knl/knl_instance.h index 2337f24c7..815aa6e46 100755 --- a/src/include/knl/knl_instance.h +++ b/src/include/knl/knl_instance.h @@ -112,16 +112,6 @@ enum knl_parallel_redo_state { REDO_DONE, }; -/* - * used for dms - */ -typedef enum en_dms_status { - DMS_STATUS_OUT = 0, - DMS_STATUS_JOIN = 1, - DMS_STATUS_REFORM = 2, - DMS_STATUS_IN = 3 -} dms_status_t; - /* all process level attribute which expose to user */ typedef struct knl_instance_attr { @@ -1206,7 +1196,10 @@ typedef struct knl_g_dms_context { pg_atomic_uint32 inDmsThreShmemInitCnt; // the count of threads in DmsCallbackThreadShmemInit pg_atomic_uint32 inProcExitCnt; // Post Main in proc_exit function bool dmsInited; -}knl_g_dms_context; + XLogRecPtr ckptRedo; + bool resetSyscache; + bool finishedRecoverOldPrimaryDWFile; +} knl_g_dms_context; typedef struct knl_instance_context { knl_virtual_role role; diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h index 4d017164f..bbea8249e 100755 --- a/src/include/knl/knl_thread.h +++ b/src/include/knl/knl_thread.h @@ -3312,6 +3312,7 @@ typedef struct knl_t_publication_context { typedef struct knl_t_dms_context { MemoryContext msgContext; + bool buf_in_aio; } knl_t_dms_context; /* thread 
context. */ diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 903fa1666..348787b5b 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -466,7 +466,8 @@ extern bool CheckExecDirectPrivilege(const char* query); /* check user have priv u_sess->misc_cxt.Mode = (mode); \ } while (0) -#define ENABLE_DSS (g_instance.attr.attr_storage.dss_attr.ss_enable_dss == true) +#define ENABLE_DSS (g_instance.attr.attr_storage.dss_attr.ss_enable_dss) +#define ENABLE_DSS_AIO (ENABLE_DSS && g_instance.attr.attr_storage.dms_attr.enable_dss_aio && !IsInitdb) /* * Auxiliary-process type identifiers. diff --git a/src/include/postmaster/pagewriter.h b/src/include/postmaster/pagewriter.h index 220dc7581..6d458f6cd 100644 --- a/src/include/postmaster/pagewriter.h +++ b/src/include/postmaster/pagewriter.h @@ -27,6 +27,7 @@ #include "storage/buf/buf.h" #include "storage/lock/lwlock.h" #include "catalog/pg_control.h" +#include "ddes/dms/ss_aio.h" #define ENABLE_INCRE_CKPT g_instance.attr.attr_storage.enableIncrementalCheckpoint #define NEED_CONSIDER_USECOUNT u_sess->attr.attr_storage.enable_candidate_buf_usage_count @@ -82,6 +83,10 @@ typedef struct PageWriterProc { CandidateList normal_list; CandidateList nvm_list; CandidateList seg_list; + + /* auxiliary structs for implementing AIO in DSS */ + DSSAioCxt aio_cxt; + char *aio_buf; } PageWriterProc; typedef struct PageWriterProcs { diff --git a/src/include/storage/buf/buf_internals.h b/src/include/storage/buf/buf_internals.h index b3e8dfe78..15d98e8ee 100644 --- a/src/include/storage/buf/buf_internals.h +++ b/src/include/storage/buf/buf_internals.h @@ -211,6 +211,8 @@ typedef struct BufferDesc { bool encrypt; /* enable table's level data encryption */ volatile uint64 lsn_on_disk; + + volatile bool aio_in_progress; /* indicate aio is in progress */ #ifdef USE_ASSERT_CHECKING volatile uint64 lsn_dirty; #endif diff --git a/src/include/storage/dss/dss_adaptor.h b/src/include/storage/dss/dss_adaptor.h index
4e79237f1..112220767 100644 --- a/src/include/storage/dss/dss_adaptor.h +++ b/src/include/storage/dss/dss_adaptor.h @@ -68,6 +68,8 @@ typedef void (*dss_error_info)(int *errorcode, const char **errormsg); typedef void (*dss_svr_path)(const char *conn_path); typedef void (*dss_log_callback)(dss_log_output cb_log_output); typedef int (*dss_version)(void); +typedef int (*dss_aio_prep_pwrite_device)(void *iocb, int handle, void *buf, size_t count, long long offset); +typedef int (*dss_aio_prep_pread_device)(void *iocb, int handle, void *buf, size_t count, long long offset); typedef struct st_dss_device_op_t { void *handle; dss_create_device dss_create; @@ -104,6 +106,8 @@ typedef struct st_dss_device_op_t { dss_svr_path dss_set_svr_path; dss_log_callback dss_register_log_callback; dss_version dss_get_version; + dss_aio_prep_pwrite_device dss_aio_pwrite; + dss_aio_prep_pread_device dss_aio_pread; } dss_device_op_t; void dss_register_log_callback(dss_log_output cb_log_output); diff --git a/src/include/storage/dss/fio_dss.h b/src/include/storage/dss/fio_dss.h index 7d366eb85..21f7458e3 100644 --- a/src/include/storage/dss/fio_dss.h +++ b/src/include/storage/dss/fio_dss.h @@ -33,7 +33,6 @@ #include "storage/dss/dss_adaptor.h" void dss_device_register(dss_device_op_t *dss_device_op, bool enable_dss); - void dss_set_errno(int *errcode); bool dss_exist_file(const char *file_name); int dss_access_file(const char *file_name, int mode); @@ -80,4 +79,7 @@ int dss_chmod_file(const char* path, mode_t mode); int dss_set_server_status_wrapper(bool is_master); int dss_remove_dev(const char *name); +int dss_aio_prep_pwrite(void *iocb, int fd, void *buf, size_t count, long long offset); +int dss_aio_prep_pread(void *iocb, int fd, void *buf, size_t count, long long offset); + #endif // FIO_DSS_H \ No newline at end of file diff --git a/src/include/storage/smgr/segment.h b/src/include/storage/smgr/segment.h index a9d2b001a..df04328cf 100644 --- a/src/include/storage/smgr/segment.h +++ 
b/src/include/storage/smgr/segment.h @@ -56,6 +56,9 @@ void seg_physical_write(SegSpace *spc, RelFileNode &rNode, ForkNumber forknum, B bool skipFsync); XLogRecPtr seg_get_headlsn(SegSpace *spc, BlockNumber blockNum, bool isbucket); +int32 seg_physical_aio_prep_pwrite(SegSpace *spc, RelFileNode &rNode, ForkNumber forknum, BlockNumber blocknum, + const char *buffer, void *iocb_ptr); + /* segment sync callback */ void forget_space_fsync_request(SegSpace *spc); void seg_register_dirty_file(SegLogicFile *sf, int segno); diff --git a/src/include/storage/smgr/segment_internal.h b/src/include/storage/smgr/segment_internal.h index 7d6a9e54a..4861caaaa 100644 --- a/src/include/storage/smgr/segment_internal.h +++ b/src/include/storage/smgr/segment_internal.h @@ -35,6 +35,7 @@ #include "storage/smgr/smgr.h" #include "storage/file/fio_device_com.h" #include "utils/segment_test.h" +#include "libaio.h" const int DF_MAP_GROUP_RESERVED = 3; const int DF_MAX_MAP_GROUP_CNT = 33; @@ -102,6 +103,7 @@ void df_create_file(SegLogicFile *sf, bool redo); void df_shrink(SegLogicFile *sf, BlockNumber target); void df_flush_data(SegLogicFile *sf, BlockNumber blocknum, BlockNumber nblocks); bool df_ss_update_segfile_size(SegLogicFile *sf, BlockNumber target_block); +SegPhysicalFile df_get_physical_file(SegLogicFile *sf, int sliceno, BlockNumber target_block); /* * Data files status in the segment space; @@ -412,6 +414,8 @@ BlockNumber spc_size(SegSpace *spc, BlockNumber egRelNode, ForkNumber forknum); void spc_datafile_create(SegSpace *spc, BlockNumber egRelNode, ForkNumber forknum); void spc_extend_file(SegSpace *spc, BlockNumber egRelNode, ForkNumber forknum, BlockNumber blkno); bool spc_datafile_exist(SegSpace *spc, BlockNumber egRelNode, ForkNumber forknum); +int32 spc_aio_prep_pwrite(SegSpace *spc, RelFileNode relNode, ForkNumber forknum, BlockNumber blocknum, + const char *buffer, void *iocb_ptr); extern void spc_shrink_files(SegExtentGroup *seg, BlockNumber target_size, bool redo); diff 
--git a/src/test/regress/output/recovery_2pc_tools.source b/src/test/regress/output/recovery_2pc_tools.source index 7902ae1d9..413b8f9f5 100644 --- a/src/test/regress/output/recovery_2pc_tools.source +++ b/src/test/regress/output/recovery_2pc_tools.source @@ -594,6 +594,7 @@ select name,vartype,unit,min_val,max_val from pg_settings where name <> 'qunit_c sql_use_spacelimit | integer | kB | -1 | 2147483647 ss_dss_conn_path | string | | | ss_dss_vg_name | string | | | + ss_enable_aio | bool | | | ss_enable_catalog_centralized | bool | | | ss_enable_dms | bool | | | ss_enable_dss | bool | | |