/*
 * Files
 * openGauss-server/src/include/access/double_write.h
 * 2024-06-27 14:05:49 +08:00
 *
 * 380 lines
 * 14 KiB
 * C
 */

/*
* Copyright (c) 2020 Huawei Technologies Co.,Ltd.
*
* openGauss is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* ---------------------------------------------------------------------------------------
*
* double_write.h
* Defines inline helper functions for double write and exports its interfaces.
*
*
* IDENTIFICATION
* src/include/access/double_write.h
*
* ---------------------------------------------------------------------------------------
*/
#ifndef DOUBLE_WRITE_H
#define DOUBLE_WRITE_H
#include "double_write_basic.h"
#include "storage/buf/buf_internals.h"
#include "storage/checksum_impl.h"
/*
 * Versions of the buffer tag layout known to double write.
 * The numeric values are written out explicitly; presumably they are what the
 * uint16 buftag_ver field of a batch head stores -- confirm against the writers.
 */
typedef enum BufTagVer {
    ORIGIN_TAG = 0,
    HASHBUCKET_TAG = 1,
    PAGE_COMPRESS_TAG = 2
} BufTagVer;
/*
 * Head page of a double write batch: a fixed header followed by an array of
 * BufferTag entries that locate the data pages of the batch.
 * DW_BATCH_DATA_PAGE_MAX is derived from sizeof(dw_batch_t), so the field order
 * and sizes here must not change.
 */
typedef struct st_dw_batch {
dw_page_head_t head;
/* The high bit is stolen as the hashbucket/segpage flag; read the count with GET_REL_PGAENUM. */
uint16 page_num; /* for batch head, number of data pages */
/* presumably one of the BufTagVer values -- confirm against the writers */
uint16 buftag_ver;
/* Zero-length trailing array: the tags occupy the rest of the page after this header. */
BufferTag buf_tag[0]; /* to locate the data pages in batch */
} dw_batch_t;
/*
 * Batch head variant whose tag array uses BufferTagFirstVer instead of BufferTag.
 * The fixed fields are declared in the same order and with the same types as in
 * dw_batch_t; DW_BATCH_DATA_PAGE_MAX_FOR_NOHBK is derived from this struct's size.
 */
typedef struct st_dw_batch_nohbkt {
dw_page_head_t head;
uint16 page_num; /* for batch head, number of data pages */
/* presumably one of the BufTagVer values -- confirm against the writers */
uint16 buftag_ver;
/* Zero-length trailing array of first-version tags. */
BufferTagFirstVer buf_tag[0]; /* to locate the data pages in batch */
} dw_batch_first_ver;
/*
 * Per-page record made of a double write number and the buffer tag of the page.
 * NOTE(review): judging by the name this belongs to the "first version" single
 * flush path -- confirm against its users.
 */
typedef struct dw_single_first_flush_item {
uint16 dwn; /* double write number, updated when file header changed */
BufferTag buf_tag; /* identifies the page this record refers to */
}dw_first_flush_item;
/*
 * Per-page record of the single flush path: slot index, double write number and
 * buffer tag, protected by a trailing CRC.
 * SINGLE_BLOCK_TAG_NUM is derived from sizeof(dw_single_flush_item), so the layout
 * must stay stable and crc must remain the last member.
 */
typedef struct dw_single_flush_item {
uint16 data_page_idx; /* from zero start, indicates the slot of the data page. */
uint16 dwn; /* double write number, updated when file header changed */
BufferTag buf_tag; /* identifies the page stored in the slot */
pg_crc32c crc; /* CRC of all above ... MUST BE LAST! */
}dw_single_flush_item;
/* Used by double_write to mark the buffers which are not flushed in the given buf_id array. */
static const int DW_INVALID_BUFFER_ID = -1;

/* The high bit of pagenum is stolen as the flag of hashbucket or segpage. */
#define IS_HASH_BKT_SEGPAGE_MASK (0x8000)

/*
 * Strip the hashbucket/segpage flag bit and yield the real page number.
 * The argument is parenthesized so that a compound expression such as
 * GET_REL_PGAENUM(flag | count) is masked as a whole; without the parentheses
 * '&' would bind only to the last operand.
 */
#define GET_REL_PGAENUM(pagenum) ((pagenum) & ~IS_HASH_BKT_SEGPAGE_MASK)
/**
 * Dirty data pages in one batch.
 * The limit is the number of BufferTag entries that fit in one BLCKSZ page once
 * the fixed batch header and the page tail have been subtracted.
 */
static const uint16 DW_BATCH_DATA_PAGE_MAX =
    (uint16)((BLCKSZ - sizeof(dw_batch_t) - sizeof(dw_page_tail_t)) / sizeof(BufferTag));
/* Same limit for batch heads that carry BufferTagFirstVer entries. */
static const uint16 DW_BATCH_DATA_PAGE_MAX_FOR_NOHBK =
    (uint16)((BLCKSZ - sizeof(dw_batch_first_ver) - sizeof(dw_page_tail_t)) / sizeof(BufferTagFirstVer));

/* 1 head + data + 1 tail */
static const uint16 DW_EXTRA_FOR_ONE_BATCH = 2;
/* 1 head + data + [1 tail, 2 head] + data + 2 tail */
static const uint16 DW_EXTRA_FOR_TWO_BATCH = 3;

/* Smallest batch: a single data page plus its head and tail. */
static const uint16 DW_BATCH_MIN = (DW_EXTRA_FOR_ONE_BATCH + 1);
/* Largest batch: a full set of data pages plus its head and tail. */
static const uint16 DW_BATCH_MAX = (DW_EXTRA_FOR_ONE_BATCH + DW_BATCH_DATA_PAGE_MAX);

/* 2 batches at most for one perform */
static const uint16 DW_DIRTY_PAGE_MAX = (2 * DW_BATCH_DATA_PAGE_MAX);
static const uint16 DW_BUF_MAX = (DW_EXTRA_FOR_TWO_BATCH + DW_DIRTY_PAGE_MAX);

/* The same derived limits for the BufferTagFirstVer ("no hashbucket") layout. */
static const uint16 DW_BATCH_MAX_FOR_NOHBK = (DW_EXTRA_FOR_ONE_BATCH + DW_BATCH_DATA_PAGE_MAX_FOR_NOHBK);
/* 2 batches at most for one perform */
static const uint16 DW_DIRTY_PAGE_MAX_FOR_NOHBK = (2 * DW_BATCH_DATA_PAGE_MAX_FOR_NOHBK);
static const uint16 DW_BUF_MAX_FOR_NOHBK = (DW_EXTRA_FOR_TWO_BATCH + DW_DIRTY_PAGE_MAX_FOR_NOHBK);
/*
 * Pick a limit according to whether hashbucket tags are in use: a false argument
 * selects the *_FOR_NOHBK constant, a true argument selects the regular one.
 * The argument is parenthesized so that a compound expression (for example
 * "a || b") is negated as a whole rather than only its first operand.
 */
#define GET_DW_BATCH_DATA_PAGE_MAX(contain_hashbucket) \
    (!(contain_hashbucket) ? DW_BATCH_DATA_PAGE_MAX_FOR_NOHBK : DW_BATCH_DATA_PAGE_MAX)
#define GET_DW_BATCH_MAX(contain_hashbucket) \
    (!(contain_hashbucket) ? DW_BATCH_MAX_FOR_NOHBK : DW_BATCH_MAX)
#define GET_DW_DIRTY_PAGE_MAX(contain_hashbucket) \
    (!(contain_hashbucket) ? DW_DIRTY_PAGE_MAX_FOR_NOHBK : DW_DIRTY_PAGE_MAX)
#define GET_DW_MEM_CTX_MAX_BLOCK_SIZE(contain_hashbucket) \
    (!(contain_hashbucket) ? DW_MEM_CTX_MAX_BLOCK_SIZE_FOR_NOHBK : DW_MEM_CTX_MAX_BLOCK_SIZE)
/*
 * Upper bound, in bytes, of the double write memory context block:
 * 1 block for alignment, 1 for file_head, 1 for reading data_page during recovery
 * and DW_BUF_MAX for double_write buffer.
 */
static const uint32 DW_MEM_CTX_MAX_BLOCK_SIZE =
    (1 /* alignment */ + 1 /* file_head */ + 1 /* recovery read */ + DW_BUF_MAX) * BLCKSZ;
/* Same bound for the BufferTagFirstVer ("no hashbucket") layout. */
static const uint32 DW_MEM_CTX_MAX_BLOCK_SIZE_FOR_NOHBK =
    (1 /* alignment */ + 1 /* file_head */ + 1 /* recovery read */ + DW_BUF_MAX_FOR_NOHBK) * BLCKSZ;
/* Number of dw_single_flush_item records that fit in one BLCKSZ page. */
const uint16 SINGLE_BLOCK_TAG_NUM = BLCKSZ / sizeof(dw_single_flush_item);

/* Version numbers at which individual double write features became available. */
static const uint32 DW_BOOTSTRAP_VERSION = 91261;
const uint32 DW_SUPPORT_SINGLE_FLUSH_VERSION = 92266;
const uint32 DW_SUPPORT_NEW_SINGLE_FLUSH = 92433;
const uint32 DW_SUPPORT_MULTIFILE_FLUSH = 92568;
const uint32 DW_SUPPORT_BCM_VERSION = 92550;
const uint32 DW_SUPPORT_REABLE_DOUBLE_WRITE = 92590;

/* dw single flush file information, version is DW_SUPPORT_SINGLE_FLUSH_VERSION */
/*
 * Reserve 8 bytes for bufferTag upgrade. The number of tag pages currently used is
 * 32768 * sizeof(dw_single_flush_item) / 8192.
 */
const int DW_SINGLE_BUFTAG_PAGE_NUM = 161;
const int DW_SINGLE_DIRTY_PAGE_NUM = 32768;
/* file head + storage buffer tag page + data page */
const int DW_SINGLE_FILE_SIZE = (1 + DW_SINGLE_BUFTAG_PAGE_NUM + DW_SINGLE_DIRTY_PAGE_NUM) * BLCKSZ;

/* new dw single flush file, version is DW_SUPPORT_NEW_SINGLE_FLUSH */
/* file head + first version data page + file head + storage buffer tag page + second version data page */
const uint32 DW_NEW_SINGLE_FILE_SIZE = (32768 * BLCKSZ);
const uint16 DW_SECOND_BUFTAG_PAGE_NUM = 4;
/* One second-version data page for every tag record the tag pages can hold. */
const uint16 DW_SECOND_DATA_PAGE_NUM = (DW_SECOND_BUFTAG_PAGE_NUM * SINGLE_BLOCK_TAG_NUM);
/* What is left of the 32768 pages after the second-version area and the two file heads. */
const uint16 DW_FIRST_DATA_PAGE_NUM = (32768 - 2 - DW_SECOND_BUFTAG_PAGE_NUM - DW_SECOND_DATA_PAGE_NUM);
const uint16 DW_SECOND_BUFTAG_START_IDX = 1 + DW_FIRST_DATA_PAGE_NUM + 1; /* two head */
const uint16 DW_SECOND_DATA_START_IDX = DW_SECOND_BUFTAG_PAGE_NUM + DW_SECOND_BUFTAG_START_IDX;
/**
 * Tell whether a buffer state qualifies as "valid and dirty".
 * When both ENABLE_DMS and ENABLE_DSS_AIO hold, the state bits are not inspected
 * and the answer is always true.
 * @param buf_state buffer state flags
 * @return true if both BM_VALID and BM_DIRTY are set, or if DMS with DSS AIO is enabled
 */
inline bool dw_buf_valid_dirty(uint64 buf_state)
{
    bool qualifies = true;

    if (ENABLE_DMS && ENABLE_DSS_AIO) {
        return qualifies;
    }

    /* Both bits have to be present, not just one of them. */
    qualifies = ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY));
    return qualifies;
}
/**
 * Tell whether a buffer is valid, dirty and marked as needed by the checkpoint.
 * @param buf_state buffer state flags
 * @return true only if BM_VALID, BM_DIRTY and BM_CHECKPOINT_NEEDED are all set
 */
inline bool dw_buf_ckpt_needed(uint64 buf_state)
{
    /* Every one of the three bits must be set for the buffer to qualify. */
    if ((buf_state & (BM_VALID | BM_DIRTY | BM_CHECKPOINT_NEEDED)) ==
        (BM_VALID | BM_DIRTY | BM_CHECKPOINT_NEEDED)) {
        return true;
    }
    return false;
}
/**
 * Check the checksum stored in the tail of a double write file head.
 * The stored checksum field is zeroed while the block is hashed (the same way
 * dw_calc_file_head_checksum produces it) and is restored before returning.
 * @param file_head file head to verify; modified temporarily, unchanged on return
 * @return true if the recomputed checksum equals the stored one
 */
inline bool dw_verify_file_head_checksum(dw_file_head_t* file_head)
{
    uint16 stored = file_head->tail.checksum;
    uint32 computed;

    file_head->tail.checksum = 0;
    computed = pg_checksum_block((char*)file_head, sizeof(dw_file_head_t));
    /* Put the original value back so the caller sees an untouched head. */
    file_head->tail.checksum = stored;

    return (stored == REDUCE_CKS2UINT16(computed));
}
/**
 * Verify a double write file head: the double write numbers recorded in its head
 * and tail must agree, and the checksum must match.
 * @param file_head file head to verify
 * @return true if both checks pass
 */
inline bool dw_verify_file_head(dw_file_head_t* file_head)
{
    /* A dwn mismatch rejects the head; the checksum is not evaluated in that case. */
    if (file_head->head.dwn != file_head->tail.dwn) {
        return false;
    }
    return dw_verify_file_head_checksum(file_head);
}
/**
 * Compute the checksum of a batch meta file structure and store it in place.
 * The checksum field is zeroed first so it does not contribute to its own value.
 * @param meta structure to checksum; meta->checksum is overwritten
 */
inline void dw_calc_meta_checksum(dw_batch_meta_file* meta)
{
    uint32 raw;

    meta->checksum = 0;
    raw = pg_checksum_block((char*)meta, sizeof(dw_batch_meta_file));
    meta->checksum = REDUCE_CKS2UINT16(raw);
}
/**
 * Compute the checksum of a double write file head and store it in its tail.
 * The checksum field is zeroed first so it does not contribute to its own value.
 * @param file_head file head to checksum; file_head->tail.checksum is overwritten
 */
inline void dw_calc_file_head_checksum(dw_file_head_t* file_head)
{
    uint32 raw;

    file_head->tail.checksum = 0;
    raw = pg_checksum_block((char*)file_head, sizeof(dw_file_head_t));
    file_head->tail.checksum = REDUCE_CKS2UINT16(raw);
}
/**
 * Check the checksum of one BLCKSZ double write page.
 * The stored checksum is zeroed while the page is hashed (the same way
 * dw_calc_batch_checksum produces it) and is restored before returning.
 * @param batch page to verify; modified temporarily, unchanged on return
 * @return true if the recomputed checksum equals the stored one
 */
inline bool dw_verify_batch_checksum(dw_batch_t* batch)
{
    uint16 stored = DW_PAGE_CHECKSUM(batch);
    uint32 computed;

    DW_PAGE_CHECKSUM(batch) = 0;
    computed = pg_checksum_block((char*)batch, BLCKSZ);
    /* Put the original value back so the caller sees an untouched page. */
    DW_PAGE_CHECKSUM(batch) = stored;

    return (stored == REDUCE_CKS2UINT16(computed));
}
/**
 * Verify one double write page: the double write numbers in its head and tail
 * must agree, and the page checksum must match.
 * @param page page to verify
 * @return true if both checks pass
 */
inline bool dw_verify_page(dw_batch_t* page)
{
    /* A dwn mismatch rejects the page; the checksum is not evaluated in that case. */
    if ((page)->head.dwn != DW_PAGE_TAIL(page)->dwn) {
        return false;
    }
    return dw_verify_batch_checksum(page);
}
/**
 * Compute the checksum of one BLCKSZ double write page and store it in the page.
 * The checksum slot is zeroed first so it does not contribute to its own value.
 * @param batch page to checksum; its checksum slot is overwritten
 */
inline void dw_calc_batch_checksum(dw_batch_t* batch)
{
    uint32 raw;

    DW_PAGE_CHECKSUM(batch) = 0;
    raw = pg_checksum_block((char*)batch, BLCKSZ);
    DW_PAGE_CHECKSUM(batch) = REDUCE_CKS2UINT16(raw);
}
/**
 * Locate the tail page of a batch held in contiguous memory.
 * The tail sits (page count + 1) blocks after the head, where the page count is
 * head_page->page_num with the hashbucket/segpage flag bit removed.
 * The address is computed from page_num without any bounds check.
 * @param head_page batch head
 * @return pointer to the page that follows the batch's data pages
 */
inline dw_batch_t* dw_batch_tail_page(dw_batch_t* head_page)
{
    char* batch_start = (char*)head_page;

    return (dw_batch_t*)(batch_start + BLCKSZ * (GET_REL_PGAENUM(head_page->page_num) + 1));
}
/**
 * Verify the head and tail pages of a batch, including dwn and checksum.
 * The tail page is only located and checked once the head page has passed.
 * @param head_page batch head
 * @param dwn expected double write number
 * @return true if both pages carry dwn and pass dw_verify_page
 */
inline bool dw_verify_batch(dw_batch_t* head_page, uint16 dwn)
{
    if (head_page->head.dwn != dwn || !dw_verify_page(head_page)) {
        return false;
    }

    /* page_num is read only after the head page itself has been verified. */
    dw_batch_t* tail_page = dw_batch_tail_page(head_page);
    if (tail_page->head.dwn != dwn) {
        return false;
    }
    return dw_verify_page(tail_page);
}
/**
 * Number of whole BLCKSZ blocks between two addresses.
 * The byte difference (right - left) is divided by BLCKSZ and converted to uint64;
 * callers are expected to pass right >= left.
 * @param left lower address
 * @param right higher address
 * @return distance in blocks
 */
inline uint64 dw_page_distance(void* left, void* right)
{
    char* from = (char*)left;
    char* to = (char*)right;

    return (to - from) / BLCKSZ;
}
/*
 * File I/O helpers of double write; declared here and defined elsewhere.
 * NOTE(review): the parameter lists resemble lseek/pread/pwrite (fd, buffer, size,
 * offset); error handling is not visible from this header -- confirm against the
 * implementation before relying on return values.
 */
int64 dw_seek_file(int fd, int64 offset, int32 origin);
void dw_pread_file(int fd, void* buf, int size, int64 offset);
/* fileName is presumably used only for reporting -- confirm. */
void dw_pwrite_file(int fd, const void* buf, int size, int64 offset, const char* fileName);
/**
 * Generate the double write file for the first boot of the database.
 */
void dw_bootstrap();
/**
 * Allocate memory, initialize the spin lock, assign the LWLock and run double write recovery.
 * All half-written pages should have been recovered once this returns.
 * It must finish before the XLOG module starts, because that module may replay redo log.
 */
void dw_init();
/* NOTE(review): undocumented in this header; presumably an additional initialization step -- confirm. */
void dw_ext_init();
/**
 * Double write is active only when incremental checkpoint and the
 * enable_double_write setting are both turned on.
 * @return true if both are enabled
 */
inline bool dw_enabled()
{
    const bool both_on = (ENABLE_INCRE_CKPT && g_instance.attr.attr_storage.enable_double_write);

    return both_on;
}
/**
 * Flush a list of dirty buffers to the double write file.
 * NOTE(review): the previous comment described a buf_id_arr parameter and a returned
 * token_id; the current signature has neither and returns void, so that text was stale.
 * @param size presumably the number of entries in dirty_buf_list -- confirm
 * @param dirty_buf_list the buffers to flush, given as CkptSortItem entries
 * @param thread_id presumably identifies the calling flush thread -- confirm
 * @param thrd_dw_cxt per-thread double write context
 */
void dw_perform_batch_flush(uint32 size, CkptSortItem *dirty_buf_list, int thread_id, ThrdDwCxt* thrd_dw_cxt);
/**
 * Truncate the pages in the double write file after a checkpoint or before exit.
 * It waits until all related data file flushes and fsync requests have been forwarded;
 * after that it is safe to call fsync to make sure the pages are on the data files,
 * and then safe to discard those pages from the double write file.
 */
void dw_truncate();
/**
 * Double write exits after XLOG exits.
 * Data file flushing, page writer and checkpointer threads may still be running; wait for them.
 * @param single NOTE(review): not described here; presumably selects the single flush file -- confirm
 */
void dw_exit(bool single);
/**
 * If double write is enabled and pagewriter is running,
 * the dirty pages should only be flushed by pagewriter.
 * @return true if double write is enabled and at least one page writer is counted as running
 */
inline bool dw_page_writer_running()
{
    /* The writer count is only read when double write is enabled. */
    if (!dw_enabled()) {
        return false;
    }
    return pg_atomic_read_u32(&g_instance.ckpt_cxt_ctl->current_page_writer_count) > 0;
}
/**
 * Buffer check used when DMS and DSS AIO are both enabled; otherwise always true.
 * With both enabled, the buffer must be BM_VALID and additionally either BM_DIRTY
 * or have buf_desc->extra->aio_in_progress set.
 * NOTE(review): the previous comment read "the aio_in_process should be false",
 * but the code accepts a buffer whose aio_in_progress is set -- confirm which of
 * the two is intended.
 * @param buf_desc buffer descriptor, read only when DMS and DSS AIO are enabled
 * @param buf_state buffer state flags
 */
inline bool dw_buf_valid_aio_finished(BufferDesc *buf_desc, uint64 buf_state)
{
    if (!ENABLE_DMS || !ENABLE_DSS_AIO) {
        return true;
    }

    /* An invalid buffer never qualifies, whatever the other flags say. */
    if (!(buf_state & BM_VALID)) {
        return false;
    }
    return ((buf_state & BM_DIRTY) || buf_desc->extra->aio_in_progress);
}
/*
 * Interfaces that are only declared in this header; their definitions live elsewhere.
 * The contracts are not visible from here, so consult the implementation for
 * ownership, locking and error behavior.
 */
extern bool free_space_enough(int buf_id);
extern void dw_generate_single_file();
extern void dw_recovery_partial_write_single();
extern void dw_single_file_truncate(bool is_first);
extern void dw_generate_new_single_file();
extern void dw_cxt_init_single();
extern bool dw_verify_pg_checksum(PageHeader page_header, BlockNumber blockNum, bool dw_file);
extern void dw_log_recovery_page(int elevel, const char *state, BufferTag buf_tag);
extern bool dw_read_data_page(BufferTag buf_tag, SMgrRelation reln, char* data_block);
extern void dw_log_page_header(PageHeader page);
/* The signature matches a qsort-style comparator; presumably it compares two BufferTag values -- confirm. */
extern int buftag_compare(const void *pa, const void *pb);
extern void dw_encrypt_page(BufferTag tag, char* buf);
extern uint16 first_version_dw_single_flush(BufferDesc *buf_desc);
extern void dw_single_file_recycle(bool is_first);
extern bool backend_can_flush_dirty_page();
extern void dw_force_reset_single_file();
extern void reset_dw_pos_flag();
extern void clean_proc_dw_buf();
extern void init_proc_dw_buf();
/* dw_version may be omitted and then defaults to -1; the default argument makes this header C++-only. */
extern void dw_prepare_file_head(char *file_head, uint16 start, uint16 dwn, int32 dw_version = -1);
extern void dw_set_pg_checksum(char *page, BlockNumber blockNum);
extern void dw_extend_file(int fd, const void *buf, int buf_size, int64 size,
int64 file_expect_size, bool single, char* file_name);
extern void dw_transfer_phybuffer_addr(const BufferDesc *buf_desc, BufferTag *buf_tag);
/* Declared without the extern keyword, unlike its neighbours; the linkage is the same for a function declaration. */
uint16 second_version_dw_single_flush(BufferTag tag, Block block, XLogRecPtr page_lsn,
bool encrypt, BufferTag phy_tag);
extern uint16 seg_dw_single_flush_without_buffer(BufferTag tag, Block block, bool* flush_old_file);
extern uint16 seg_dw_single_flush(BufferDesc *buf_desc, bool* flush_old_file);
extern void wait_all_single_dw_finish_flush_old();
extern void wait_all_single_dw_finish_flush(bool is_first);
extern uint16 dw_single_flush_internal_old(BufferTag tag, Block block, XLogRecPtr page_lsn,
BufferTag phy_tag, bool *dw_flush);
extern void dw_single_old_file_truncate();
extern void dw_recover_batch_meta_file(int fd, dw_batch_meta_file *batch_meta_file);
extern void dw_fetch_batch_file_name(int i, char* buf);
extern void wait_all_dw_page_finish_flush();
extern void dw_generate_meta_file(dw_batch_meta_file* batch_meta_file);
extern void dw_generate_batch_files(int batch_file_num, uint64 dw_file_size);
extern void dw_remove_batch_file(int dw_file_num);
extern void dw_remove_batch_meta_file();
extern void dw_recover_all_partial_write_batch(knl_g_dw_context *batch_cxt);
extern void dw_cxt_init_batch();
extern void dw_remove_file(const char* file_name);
extern int dw_open_file(const char* file_name);
extern int dw_create_file(const char* file_name);
extern void dw_upgrade_renable_double_write();
extern void dw_blocked_for_snapshot();
extern void dw_released_after_snapshot();
extern bool is_dw_snapshot_blocked();
#endif /* DOUBLE_WRITE_H */