From cb3fa65c635dbc2d90fa388a55746927e6f3118a Mon Sep 17 00:00:00 2001 From: Mijamind Date: Wed, 10 May 2023 14:21:59 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90=E8=B5=84=E6=BA=90=E6=B1=A0=E5=8C=96?= =?UTF-8?q?=E3=80=91openGauss=E7=AE=97=E5=AD=90=E4=B8=8B=E6=8E=A8=E7=89=B9?= =?UTF-8?q?=E6=80=A7=E5=90=88=E5=85=A5=201.opengauss=E5=86=85=E6=A0=B8?= =?UTF-8?q?=E9=80=82=E9=85=8D=202.ndpplugin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build/script/aarch64_opengauss_list | 1 + build/script/x86_64_opengauss_list | 1 + contrib/CMakeLists.txt | 2 + contrib/Makefile | 3 +- contrib/ndpplugin/CMakeLists.txt | 24 + contrib/ndpplugin/Makefile | 28 + contrib/ndpplugin/common.h | 680 +++++++ contrib/ndpplugin/component/ceph/ceph.h | 172 ++ contrib/ndpplugin/component/rpc/rpc.h | 136 ++ .../ndpplugin/component/thread/mpmcqueue.h | 207 +++ contrib/ndpplugin/dynloader.cpp | 44 + contrib/ndpplugin/ndp/ndp.h | 27 + contrib/ndpplugin/ndp/ndp_nodes.h | 135 ++ contrib/ndpplugin/ndp/ndp_req.h | 131 ++ contrib/ndpplugin/ndp_check.cpp | 548 ++++++ contrib/ndpplugin/ndp_check.h | 22 + contrib/ndpplugin/ndpam.cpp | 1139 ++++++++++++ contrib/ndpplugin/ndpam.h | 298 ++++ contrib/ndpplugin/ndpnodes.h | 23 + contrib/ndpplugin/ndpoutfuncs.cpp | 385 ++++ contrib/ndpplugin/ndpplugin--1.0.sql | 10 + contrib/ndpplugin/ndpplugin.control | 5 + contrib/ndpplugin/ndpplugin.cpp | 1570 +++++++++++++++++ contrib/ndpplugin/ndpplugin.h | 44 + contrib/ndpplugin/rpc.cpp | 615 +++++++ contrib/ndpplugin/utils/dynloader.h | 25 + src/common/backend/nodes/copyfuncs.cpp | 51 + src/gausskernel/ddes/ddes_commit_id | 2 +- .../optimizer/commands/dropcmds.cpp | 1 + .../optimizer/commands/explain.cpp | 57 + src/gausskernel/optimizer/plan/planner.cpp | 7 + src/gausskernel/runtime/executor/execScan.cpp | 12 + src/gausskernel/runtime/executor/nodeAgg.cpp | 7 +- .../runtime/executor/nodeSeqscan.cpp | 36 +- .../storage/access/hbstore/hbucket_am.cpp | 11 + 
.../storage/access/heap/heapam.cpp | 3 + src/gausskernel/storage/dss/dss_adaptor.cpp | 4 +- src/gausskernel/storage/dss/fio_dss.cpp | 19 + src/gausskernel/storage/smgr/md.cpp | 3 +- src/include/access/hbucket_am.h | 12 + src/include/access/heapam.h | 4 + src/include/access/htup.h | 2 +- src/include/executor/instrument.h | 4 + src/include/executor/tuptable.h | 1 + src/include/knl/knl_session.h | 25 + src/include/nodes/execnodes.h | 2 + src/include/nodes/nodes.h | 5 +- src/include/nodes/plannodes.h | 15 + src/include/optimizer/planner.h | 2 + src/include/storage/dss/dss_adaptor.h | 7 +- src/include/storage/dss/dss_api_def.h | 2 +- src/include/storage/dss/fio_dss.h | 6 +- src/include/storage/smgr/smgr.h | 1 + src/test/regress/pg_regress.cpp | 2 +- 54 files changed, 6555 insertions(+), 23 deletions(-) create mode 100644 contrib/ndpplugin/CMakeLists.txt create mode 100644 contrib/ndpplugin/Makefile create mode 100644 contrib/ndpplugin/common.h create mode 100644 contrib/ndpplugin/component/ceph/ceph.h create mode 100644 contrib/ndpplugin/component/rpc/rpc.h create mode 100644 contrib/ndpplugin/component/thread/mpmcqueue.h create mode 100644 contrib/ndpplugin/dynloader.cpp create mode 100644 contrib/ndpplugin/ndp/ndp.h create mode 100644 contrib/ndpplugin/ndp/ndp_nodes.h create mode 100644 contrib/ndpplugin/ndp/ndp_req.h create mode 100644 contrib/ndpplugin/ndp_check.cpp create mode 100644 contrib/ndpplugin/ndp_check.h create mode 100644 contrib/ndpplugin/ndpam.cpp create mode 100644 contrib/ndpplugin/ndpam.h create mode 100644 contrib/ndpplugin/ndpnodes.h create mode 100644 contrib/ndpplugin/ndpoutfuncs.cpp create mode 100644 contrib/ndpplugin/ndpplugin--1.0.sql create mode 100644 contrib/ndpplugin/ndpplugin.control create mode 100644 contrib/ndpplugin/ndpplugin.cpp create mode 100644 contrib/ndpplugin/ndpplugin.h create mode 100644 contrib/ndpplugin/rpc.cpp create mode 100644 contrib/ndpplugin/utils/dynloader.h diff --git a/build/script/aarch64_opengauss_list 
b/build/script/aarch64_opengauss_list index 606771d3d..7c29b3869 100644 --- a/build/script/aarch64_opengauss_list +++ b/build/script/aarch64_opengauss_list @@ -756,6 +756,7 @@ ./lib/postgresql/utf8_and_euc2004.so ./lib/postgresql/utf8_and_big5.so ./lib/postgresql/mppdb_decoding.so +./lib/postgresql/ndpplugin.so ./lib/postgresql/pg_plugin ./lib/postgresql/proc_srclib ./lib/postgresql/security_plugin.so diff --git a/build/script/x86_64_opengauss_list b/build/script/x86_64_opengauss_list index fb738d3c2..c949a2947 100644 --- a/build/script/x86_64_opengauss_list +++ b/build/script/x86_64_opengauss_list @@ -756,6 +756,7 @@ ./lib/postgresql/utf8_and_euc2004.so ./lib/postgresql/utf8_and_big5.so ./lib/postgresql/mppdb_decoding.so +./lib/postgresql/ndpplugin.so ./lib/postgresql/pg_plugin ./lib/postgresql/proc_srclib ./lib/postgresql/security_plugin.so diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 3c6fe2a4a..b6070c017 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -22,6 +22,7 @@ set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/file_fdw ${CMAKE_CURRENT_SOURCE_DIR}/log_fdw ${CMAKE_CURRENT_SOURCE_DIR}/gc_fdw + ${CMAKE_CURRENT_SOURCE_DIR}/ndpplugin ) add_subdirectory(hstore) @@ -42,4 +43,5 @@ add_subdirectory(log_fdw) if("${ENABLE_MULTIPLE_NODES}" STREQUAL "OFF") add_subdirectory(gc_fdw) endif() +add_subdirectory(ndpplugin) diff --git a/contrib/Makefile b/contrib/Makefile index 0f0088fcc..f1f57913c 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -55,7 +55,8 @@ SUBDIRS = \ tsearch2 \ unaccent \ vacuumlo \ - security_plugin + security_plugin \ + ndpplugin ifeq ($(with_openssl),yes) SUBDIRS += sslinfo diff --git a/contrib/ndpplugin/CMakeLists.txt b/contrib/ndpplugin/CMakeLists.txt new file mode 100644 index 000000000..12b46b3b8 --- /dev/null +++ b/contrib/ndpplugin/CMakeLists.txt @@ -0,0 +1,24 @@ +#This is the main CMAKE for build all ndpplugin. 
+# ndpplugin.so + +AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR} TGT_ndpplugin_SRC) + +SET(TGT_ndpplugin_INC + ${CMAKE_CURRENT_SOURCE_DIR} +) + +set(ndpplugin_DEF_OPTIONS ${MACRO_OPTIONS} -DNDP_CLIENT -DGlobalCache) +set(ndpplugin_COMPILE_OPTIONS ${OPTIMIZE_OPTIONS} ${OS_OPTIONS} ${PROTECT_OPTIONS} ${WARNING_OPTIONS} ${LIB_SECURE_OPTIONS} ${CHECK_OPTIONS}) +set(ndpplugin_LINK_OPTIONS ${LIB_LINK_OPTIONS}) + +add_shared_libtarget(ndpplugin TGT_ndpplugin_SRC TGT_ndpplugin_INC "${ndpplugin_DEF_OPTIONS}" "${ndpplugin_COMPILE_OPTIONS}" "${ndpplugin_LINK_OPTIONS}") +set_target_properties(ndpplugin PROPERTIES PREFIX "") + +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/ndpplugin.control + DESTINATION share/postgresql/extension/ +) +install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/ndpplugin--1.0.sql + DESTINATION share/postgresql/extension/ +) + +install(TARGETS ndpplugin DESTINATION lib/postgresql) diff --git a/contrib/ndpplugin/Makefile b/contrib/ndpplugin/Makefile new file mode 100644 index 000000000..4036f640a --- /dev/null +++ b/contrib/ndpplugin/Makefile @@ -0,0 +1,28 @@ +# contrib/ndpplugin/Makefile +top_builddir = ../../ + +RPC_INCLUDE = ./ + +RPC_CPPFLAGS = $(addprefix -I, $(RPC_INCLUDE)) -DNDP_CLIENT -DGlobalCache + +OBJS := ndpplugin.o dynloader.o rpc.o ndp_check.o ndpoutfuncs.o ndpam.o +CPPFLAGS += $(RPC_CPPFLAGS) + +MODULE_big = ndpplugin +EXTENSION = ndpplugin +REGRESS = ndpplugin +REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress -c 0 -d 1 --single_node + +DATA = ndpplugin--1.0.sql + +override CPPFLAGS :=$(filter-out -fPIE, $(CPPFLAGS)) -fPIC + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/ndpplugin +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/ndpplugin/common.h b/contrib/ndpplugin/common.h new file mode 100644 index 000000000..7291d0615 --- /dev/null +++ b/contrib/ndpplugin/common.h @@ -0,0 +1,680 @@ +/* 
------------------------------------------------------------------------- + * + * common.h + * Fundamental C definitions. This is included by every .c file in + * openGauss (via either postgres.h or postgres_fe.h, as appropriate). + * + * Note that the definitions here are not intended to be exposed to clients + * of the frontend interface libraries --- so we don't worry much about + * polluting the namespace with lots of stuff... + * + * + * Portions Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, openGauss Contributors + * + * src/include/common.h + * + * ------------------------------------------------------------------------- + */ +/* + * ---------------------------------------------------------------- + * TABLE OF CONTENTS + * + * When adding stuff to this file, please try to put stuff + * into the relevant section, or add new sections as appropriate. + * + * section description + * ------- ------------------------------------------------ + * 0) pg_config.h and standard system headers + * 1) hacks to cope with non-ANSI C compilers + * 2) bool, true, false, TRUE, FALSE, NULL + * 3) standard system types + * 4) IsValid macros for system types + * 5) offsetof, lengthof, endof, alignment + * 6) widely useful macros + * 7) random stuff + * 8) system-specific hacks + * 9) C++-specific stuff + * + * NOTE: since this file is included by both frontend and backend modules, it's + * almost certainly wrong to put an "extern" declaration here. typedefs and + * macros are the kind of thing that might go here. 
+ * + * ---------------------------------------------------------------- + */ + +#ifndef COMMON_H +#define COMMON_H + +#include "securec.h" + +/* + * We have to include stdlib.h here because it defines many of these macros + * on some platforms, and we only want our definitions used if stdlib.h doesn't + * have its own. The same goes for stddef and stdarg if present. + */ + +#ifndef NDP_CLIENT +#include "gs_config.h" +#endif + +#include +#include +#include +#include +#include +#ifdef HAVE_STDINT_H +#include +#endif + +#ifdef FAULT_INJECT +#include +constexpr int PERCENTAGE = 2; +constexpr int PERCENTAGE_DIV = 100; +constexpr int IOMAP_SIZE = 1024; +#endif + +#ifdef NDP_CLIENT +#include "c.h" +typedef int Status; +#else + +#ifndef likely +#define likely(x) __builtin_expect((x) != 0, 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect((x) != 0, 0) +#endif + +#define PG_INT8_MIN (-0x7F - 1) +#define PG_INT8_MAX 0x7F +#define PG_UINT8_MAX 0xFF +#define PG_INT16_MIN (-0x7FFF - 1) +#define PG_INT16_MAX 0x7FFF +#define PG_UINT16_MAX 0xFFFF +#define PG_INT32_MIN (-0x7FFFFFFF - 1) +#define PG_INT32_MAX (0x7FFFFFFF) +#define PG_UINT32_MAX 0xFFFFFFFFU +#define PG_INT64_MIN (-INT64CONST(0x7FFFFFFFFFFFFFFF) - 1) +#define PG_INT64_MAX INT64CONST(0x7FFFFFFFFFFFFFFF) +#define PG_UINT64_MAX UINT64CONST(0xFFFFFFFFFFFFFFFF) + +/* + * int128 type has 128 bits. + * INT128_MIN is (-1 * (1 << 127)) + * INT128_MAX is ((1 << 127) - 1) + */ +#define INT128_MAX (int128)(((uint128)1 << 127) - 1) +#define INT128_MIN (-INT128_MAX - 1) + +/* ---------------------------------------------------------------- + * Section 2: bool, true, false, TRUE, FALSE, NULL + * ---------------------------------------------------------------- + */ + +/* + * bool + * Boolean value, either true or false. + * + * XXX for C++ compilers, we assume the compiler has a compatible + * built-in definition of bool. + */ + +/* + * NULL + * Null pointer. 
+ */ +#ifndef NULL +#define NULL ((void*)0) +#endif + +/* ---------------------------------------------------------------- + * Section 3: standard system types + * ---------------------------------------------------------------- + */ + +/* + * Pointer + * Variable holding address of any memory resident object. + * + * XXX Pointer arithmetic is done with this, so it can't be void * + * under "true" ANSI compilers. + */ +typedef char* Pointer; + +/* + * intN + * Signed integer, EXACTLY N BITS IN SIZE, + * used for numerical computations and the + * frontend/backend protocol. + */ +#ifndef HAVE_INT8 +typedef signed char int8; /* == 8 bits */ +typedef signed short int16; /* == 16 bits */ +typedef signed int int32; /* == 32 bits */ +#endif /* not HAVE_INT8 */ + +typedef __int128 int128; +typedef unsigned __int128 uint128; + +/* + * uintN + * Unsigned integer, EXACTLY N BITS IN SIZE, + * used for numerical computations and the + * frontend/backend protocol. + */ +#ifndef HAVE_UINT8 +typedef unsigned char uint8; /* == 8 bits */ +typedef unsigned short uint16; /* == 16 bits */ +typedef unsigned int uint32; /* == 32 bits */ +#endif /* not HAVE_UINT8 */ + +typedef unsigned int uint; /* == 32 bits */ + +/* + * bitsN + * Unit of bitwise operation, AT LEAST N BITS IN SIZE. + */ +typedef uint8 bits8; /* >= 8 bits */ +typedef uint16 bits16; /* >= 16 bits */ +typedef uint32 bits32; /* >= 32 bits */ + +/* + * 64-bit integers + */ +typedef long long int int64; +typedef unsigned long long int uint64; + +/* Decide if we need to decorate 64-bit constants */ +#define INT64CONST(x) ((int64)(x)) +#define UINT64CONST(x) ((uint64)(x)) + +/* + * Size + * Size of any memory resident object, as returned by sizeof. + */ +typedef size_t Size; + +/* + * Offset + * Offset into any memory resident array. + * + * Note: + * This differs from an Index in that an Index is always + * non negative, whereas Offset may be negative. 
+ */ +typedef signed int Offset; + +/* + * Index + * Index into any memory resident array. + * + * Note: + * Indices are non negative. + */ +typedef unsigned int Index; + +/* + * Common openGauss datatype names (as used in the catalogs) + */ + +typedef int8 int1; +typedef int16 int2; +typedef int32 int4; +typedef float float4; +typedef double float8; + +typedef uint8 uint1; +typedef uint16 uint2; +typedef uint32 uint4; + +typedef int64 pg_time_t; + +typedef uintptr_t Datum; + +/* + * Oid, RegProcedure, TransactionId, SubTransactionId, MultiXactId, CommandId + */ + +typedef unsigned int Oid; + +#define InvalidOid 0 +#define InvalidBktId (-1) + +typedef Oid regproc; +typedef regproc RegProcedure; + +typedef uint64 TransactionId; + +#define TransactionIdPrecedes(id1, id2) ((id1) < (id2)) +#define TransactionIdPrecedesOrEquals(id1, id2) ((id1) <= (id2)) +#define TransactionIdFollows(id1, id2) ((id1) > (id2)) +#define TransactionIdFollowsOrEquals(id1, id2) ((id1) >= (id2)) + +#define StartTransactionIdIsValid(start_xid) ((start_xid) <= MAX_START_XID) + +typedef uint32 ShortTransactionId; + +typedef uint64 LocalTransactionId; + +typedef uint64 SubTransactionId; + + +/* Define to nothing if C supports flexible array members, and to 1 if it does + not. That way, with a declaration like `struct s { int n; double + d[FLEXIBLE_ARRAY_MEMBER]; };', the struct hack can be used with pre-C99 + compilers. When computing the size of such an object, don't use 'sizeof + (struct s)' as it overestimates the size. Use 'offsetof (struct s, d)' + instead. Don't use 'offsetof (struct s, d[0])', as this doesn't work with + MSVC and with C++ compilers. */ +#define FLEXIBLE_ARRAY_MEMBER /**/ + +struct varlena { + char vl_len_[4]; /* Do not touch this field directly! 
*/ + char vl_dat[FLEXIBLE_ARRAY_MEMBER]; +}; + +typedef struct varlena bytea; +typedef struct varlena byteawithoutorderwithequalcol; +typedef struct varlena text; +typedef struct varlena BpChar; /* blank-padded char, ie SQL char(n) */ +typedef struct varlena VarChar; /* var-length char, ie SQL varchar(n) */ +typedef struct varlena NVarChar2; /* var-length char, ie SQL nvarchar2(n) */ + +/* + * These structs describe the header of a varlena object that may have been + * TOASTed. Generally, don't reference these structs directly, but use the + * macros below. + * + * We use separate structs for the aligned and unaligned cases because the + * compiler might otherwise think it could generate code that assumes + * alignment while touching fields of a 1-byte-header varlena. + */ +typedef union { + struct /* Normal varlena (4-byte length) */ + { + uint32 va_header; + char va_data[FLEXIBLE_ARRAY_MEMBER]; + } va_4byte; + struct /* Compressed-in-line format */ + { + uint32 va_header; + uint32 va_rawsize; /* Original data size (excludes header) */ + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Compressed data */ + } va_compressed; +} varattrib_4b; + +typedef struct { + uint8 va_header; + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Data begins here */ +} varattrib_1b; + +/* inline portion of a short varlena pointing to an external resource */ +typedef struct { + uint8 va_header; /* Always 0x80 or 0x01 */ + uint8 va_tag; /* Type of datum */ + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Type-specific data */ +} varattrib_1b_e; + +typedef union { + struct { /* Normal varlena (4-byte length) */ + uint32 va_header; + char va_data[FLEXIBLE_ARRAY_MEMBER]; + } va_4byte; + struct { /* Compressed-in-line format */ + uint32 va_header; + uint32 va_rawsize; /* Original data size (excludes header) */ + char va_data[FLEXIBLE_ARRAY_MEMBER]; /* Compressed data */ + } va_compressed; +} varattrib_4b_fe; + +#define VARATT_NOT_PAD_BYTE(PTR) (*((uint8*)(PTR)) != 0) + +/* + * Endian-dependent macros. 
These are considered internal --- use the + * external macros below instead of using these directly. + * + * Note: IS_1B is true for external toast records but VARSIZE_1B will return 0 + * for such records. Hence you should usually check for IS_EXTERNAL before + * checking for IS_1B. + */ + +#ifdef WORDS_BIGENDIAN + +/* VARSIZE_4B() should only be used on known-aligned data */ +#define VARSIZE_4B(PTR) (((varattrib_4b*)(PTR))->va_4byte.va_header & 0x3FFFFFFF) +#define VARSIZE_1B(PTR) (((varattrib_1b*)(PTR))->va_header & 0x7F) +#define VARTAG_1B_E(PTR) (((varattrib_1b_e*)(PTR))->va_tag) +#define VARATT_IS_1B(PTR) ((((varattrib_1b*)(PTR))->va_header & 0x80) == 0x80) +#define VARATT_IS_1B_E(PTR) ((((varattrib_1b*)(PTR))->va_header) == 0x80) +#define VARATT_IS_4B(PTR) ((((varattrib_1b*)(PTR))->va_header & 0x80) == 0x00) +#define VARATT_IS_4B_C(PTR) ((((varattrib_1b*)(PTR))->va_header & 0xC0) == 0x40) +#define VARATT_IS_4B_U(PTR) ((((varattrib_1b*)(PTR))->va_header & 0xC0) == 0x00) +#define VARATT_IS_HUGE_TOAST_POINTER(PTR) ((((varattrib_1b*)(PTR))->va_header) == 0x80 && \ + ((((varattrib_1b_e*)(PTR))->va_tag) & 0x01) == 0x01) +#define SET_VARSIZE_4B(PTR, len) (((varattrib_4b_fe*)(PTR))->va_4byte.va_header = (((uint32)(len)) & 0x3FFFFFFF)) /* length occupies the low 30 bits, matching VARSIZE_4B above */ +#define SET_VARSIZE_1B(PTR, len) (((varattrib_1b*)(PTR))->va_header = (len) | 0x80) + +#else /* !WORDS_BIGENDIAN */ + +#define VARSIZE_4B(PTR) ((((varattrib_4b*)(PTR))->va_4byte.va_header >> 2) & 0x3FFFFFFF) +#define VARSIZE_1B(PTR) ((((varattrib_1b*)(PTR))->va_header >> 1) & 0x7F) +#define VARTAG_1B_E(PTR) (((varattrib_1b_e*)(PTR))->va_tag) +#define VARATT_IS_1B(PTR) ((((varattrib_1b*)(PTR))->va_header & 0x01) == 0x01) +#define VARATT_IS_1B_E(PTR) ((((varattrib_1b*)(PTR))->va_header) == 0x01) +#define VARATT_IS_4B(PTR) ((((varattrib_1b*)(PTR))->va_header & 0x01) == 0x00) +#define VARATT_IS_4B_C(PTR) ((((varattrib_1b*)(PTR))->va_header & 0x03) == 0x02) +#define VARATT_IS_4B_U(PTR) ((((varattrib_1b*)(PTR))->va_header & 0x03) == 0x00) +#define 
VARATT_IS_HUGE_TOAST_POINTER(PTR) ((((varattrib_1b*)(PTR))->va_header) == 0x01 && \ + ((((varattrib_1b_e*)(PTR))->va_tag) >> 7) == 0x01) +#define SET_VARSIZE_4B(PTR, len) (((varattrib_4b_fe*)(PTR))->va_4byte.va_header = (((uint32)(len)) << 2)) +#define SET_VARSIZE_1B(PTR, len) (((varattrib_1b*)(PTR))->va_header = (((uint8)(len)) << 1) | 0x01) + +#endif /* WORDS_BIGENDIAN */ + +#define VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) + +#define VARHDRSZ_EXTERNAL offsetof(varattrib_1b_e, va_data) +#define VARHDRSZ_SHORT offsetof(varattrib_1b, va_data) +#define VARHDRSZ ((int32)sizeof(int32)) +#define VARTAG_SIZE(tag) (((tag) & 0x80) == 0x00 ? \ +((tag) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ +((tag) == VARTAG_ONDISK ? sizeof(varatt_external) : \ +((tag) == VARTAG_BUCKET ? sizeof(varatt_external) + sizeof(int2) : \ +((tag) == VARTAG_LOB ? sizeof(varatt_lob_pointer) : \ +TrapMacro(true, "unknown vartag"))))) : \ +(((tag) & 0x7f) == VARTAG_INDIRECT ? sizeof(varatt_indirect) : \ +(((tag) & 0x7f) == VARTAG_ONDISK ? sizeof(varatt_lob_external) : \ +(((tag) & 0x7f) == VARTAG_BUCKET ? sizeof(varatt_lob_external) + sizeof(int2) : \ +(((tag) & 0x7f) == VARTAG_LOB ? 
sizeof(varatt_lob_pointer) : \ +TrapMacro(true, "unknown vartag")))))) + +#define VARDATA_4B(PTR) (((varattrib_4b*)(PTR))->va_4byte.va_data) +#define VARDATA_4B_C(PTR) (((varattrib_4b*)(PTR))->va_compressed.va_data) +#define VARDATA_1B(PTR) (((varattrib_1b*)(PTR))->va_data) +#define VARDATA_1B_E(PTR) (((varattrib_1b_e*)(PTR))->va_data) +#define VARDATA(PTR) VARDATA_4B(PTR) +#define VARSIZE(PTR) VARSIZE_4B(PTR) +#define VARDATA_SHORT(PTR) VARDATA_1B(PTR) +#define VARSIZE_SHORT(PTR) VARSIZE_1B(PTR) + +#define VARTAG_EXTERNAL(PTR) VARTAG_1B_E(PTR) +#define VARSIZE_EXTERNAL(PTR) (VARHDRSZ_EXTERNAL + VARTAG_SIZE(VARTAG_EXTERNAL(PTR))) +#define VARDATA_EXTERNAL(PTR) VARDATA_1B_E(PTR) + +#define VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) +#define VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) +#define VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) +#define VARATT_IS_EXTERNAL_ONDISK(PTR) (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_ONDISK) +#define VARATT_IS_EXTERNAL_INDIRECT(PTR) (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_INDIRECT) +#define VARATT_IS_EXTERNAL_BUCKET(PTR) \ + (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_BUCKET) +#define VARATT_IS_EXTERNAL_LOB(PTR) (VARATT_IS_EXTERNAL(PTR) && VARTAG_EXTERNAL(PTR) == VARTAG_LOB) +#define VARATT_IS_EXTERNAL_ONDISK_B(PTR) \ + (VARATT_IS_EXTERNAL_ONDISK(PTR) || VARATT_IS_EXTERNAL_BUCKET(PTR)) + +#define VARSIZE_ANY(PTR) \ + (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) : (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) : VARSIZE_4B(PTR))) + +#define VARSIZE_ANY_EXHDR(PTR) \ + (VARATT_IS_1B_E(PTR) ? VARSIZE_EXTERNAL(PTR) - VARHDRSZ_EXTERNAL : \ + (VARATT_IS_1B(PTR) ? VARSIZE_1B(PTR) - VARHDRSZ_SHORT : VARSIZE_4B(PTR) - VARHDRSZ)) + +#define VARDATA_ANY(PTR) (VARATT_IS_1B(PTR) ? 
VARDATA_1B(PTR) : VARDATA_4B(PTR)) + +#define SET_VARSIZE(PTR, len) SET_VARSIZE_4B(PTR, len) +#define SET_VARSIZE_SHORT(PTR, len) SET_VARSIZE_1B(PTR, len) + +#define VARATT_CONVERTED_SHORT_SIZE(PTR) (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) +#define VARATT_SHORT_MAX 0x7F +#define VARATT_CAN_MAKE_SHORT(PTR) \ + (VARATT_IS_4B_U(PTR) && (VARSIZE(PTR) - VARHDRSZ + VARHDRSZ_SHORT) <= VARATT_SHORT_MAX) + +/* ---------------- + * Special transaction ID values + * + * BootstrapTransactionId is the XID for "bootstrap" operations, and + * FrozenTransactionId is used for very old tuples. Both should + * always be considered valid. + * + * FirstNormalTransactionId is the first "normal" transaction id. + * Note: if you need to change it, you must change pg_class.h as well. + * ---------------- + */ + +#define InvalidTransactionId ((TransactionId)0) +#define BootstrapTransactionId ((TransactionId)1) +#define FrozenTransactionId ((TransactionId)2) +#define FirstNormalTransactionId ((TransactionId)3) + +/* ---------------- + * transaction ID manipulation macros + * ---------------- + */ + +#define TransactionIdIsValid(xid) ((xid) != InvalidTransactionId) +#define TransactionIdIsNormal(xid) ((xid) >= FirstNormalTransactionId) +#define TransactionIdEquals(id1, id2) ((id1) == (id2)) + +#define ShortTransactionIdToNormal(base, xid) \ + (TransactionIdIsNormal(xid) ? 
(TransactionId)(xid) + (base) : (TransactionId)(xid)) + + +typedef uint32 CommandId; + +#define NameStr(name) ((name).data) +#define NAMEDATALEN 64 +typedef struct nameData { + char data[NAMEDATALEN]; +} NameData; +typedef NameData* Name; + +#define FLOAT4OID 700 +#define FLOAT8OID 701 +#define INTERVALOID 1186 +#define BOOLOID 16 +#define INT8OID 20 +#define INT2OID 21 +#define INT4OID 23 +#define TEXTOID 25 +#define VARCHAROID 1043 +#define NUMERICOID 1700 +#define INT1OID 5545 +#define CSTRINGOID 2275 +#define FLOAT8ARRAYOID 1022 +#define BOOLARRAYOID 1000 +#define TEXTARRAYOID 1009 +#define INT4ARRAYOID 1007 +#define TIMESTAMPOID 1114 +#define BPCHAROID 1042 + +#define FirstCommandId ((CommandId)0) +#define InvalidCommandId (~(CommandId)0) + +/* + * CommitSeqNo is currently an LSN, but keep use a separate datatype for clarity. + */ +typedef uint64 CommitSeqNo; + +#define InvalidCommitSeqNo ((CommitSeqNo)0) + +/* ---------------------------------------------------------------- + * Section 4: IsValid macros for system types + * ---------------------------------------------------------------- + */ + +/* + * PointerIsValid + * True iff pointer is valid. + */ +#define PointerIsValid(pointer) ((const void*)(pointer) != NULL) + +/* ---------------------------------------------------------------- + * Section 5: offsetof, lengthof, endof, alignment + * ---------------------------------------------------------------- + */ +/* + * offsetof + * Offset of a structure/union field within that structure/union. + * + * XXX This is supposed to be part of stddef.h, but isn't on + * some systems (like SunOS 4). + */ +#ifndef offsetof +#define offsetof(type, field) ((long)&((type*)0)->field) +#endif /* offsetof */ + +/* + * lengthof + * Number of elements in an array. 
+ */ +#define lengthof(array) (sizeof(array) / sizeof((array)[0])) + +/* ---------------------------------------------------------------- + * Section 6: widely useful macros + * ---------------------------------------------------------------- + */ + +/* ---------------- + * Alignment macros: align a length or address appropriately for a given type. + * The fooALIGN() macros round up to a multiple of the required alignment, + * while the fooALIGN_DOWN() macros round down. The latter are more useful + * for problems like "how many X-sized structures will fit in a page?". + * + * NOTE: TYPEALIGN[_DOWN] will not work if ALIGNVAL is not a power of 2. + * That case seems extremely unlikely to be needed in practice, however. + * ---------------- + */ + +#define TYPEALIGN(ALIGNVAL, LEN) (((uintptr_t)(LEN) + ((ALIGNVAL) - 1)) & ~((uintptr_t)((ALIGNVAL) - 1))) +#define SHORTALIGN(LEN) TYPEALIGN(ALIGNOF_SHORT, (LEN)) +#define INTALIGN(LEN) TYPEALIGN(ALIGNOF_INT, (LEN)) +#define DOUBLEALIGN(LEN) TYPEALIGN(ALIGNOF_DOUBLE, (LEN)) +#define MAXALIGN(LEN) TYPEALIGN(MAXIMUM_ALIGNOF, (LEN)) + +/* ---------------------------------------------------------------- + * Section 7: random stuff + * ---------------------------------------------------------------- + */ + +typedef enum Status { + STATUS_ERROR = -1, + STATUS_OK = 0 +} Status; +#endif + +#define CHECK_STATUS(condition) do { \ + if (!(condition)) return STATUS_ERROR; /* returns from the calling function when the condition is false */ \ + } while (0) + +/* ---------------------------------------------------------------- + * Section 8: system-specific hacks + * + * This should be limited to things that absolutely have to be + * included in every source file. The port-specific header file + * is usually a better place for this sort of thing. + * ---------------------------------------------------------------- + */ + +/* ---------------------------------------------------------------- + * Section 9: C++-specific stuff + * + * This should be limited to stuff that is C++ language specific. 
+ * ---------------------------------------------------------------- + */ + +#define DatumGetPointer(X) ((Pointer)(X)) +#define DatumGetCString(X) ((char*)DatumGetPointer(X)) + +typedef int64 Timestamp; +typedef int32 DateADT; + +typedef struct { + int64 count; + int64 sum; +} Int8TransTypeData; + +typedef double Cost; /* execution cost (in page-access units) */ + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +#define SIZEOF_DATUM SIZEOF_VOID_P + +#ifndef AssertMacro +#define AssertMacro(condition) ((void)true) +#endif /* AssertMacro */ + +#define GET_1_BYTE(datum) (((Datum)(datum)) & 0x000000ff) +#define GET_2_BYTES(datum) (((Datum)(datum)) & 0x0000ffff) +#define GET_4_BYTES(datum) (((Datum)(datum)) & 0xffffffff) +#define GET_8_BYTES(datum) ((Datum)(datum)) + +#define SET_1_BYTE(value) (((Datum)(value)) & 0x000000ff) +#define SET_2_BYTES(value) (((Datum)(value)) & 0x0000ffff) +#define SET_4_BYTES(value) (((Datum)(value)) & 0xffffffff) +#define SET_8_BYTES(value) ((Datum)(value)) + +/* + * BoolGetDatum + * Returns datum representation for a boolean. + * + * Note: any nonzero value will be considered TRUE. + */ +#ifndef BoolGetDatum +#define BoolGetDatum(X) ((Datum)((X) ? 1 : 0)) +#endif + +/* + * PointerGetDatum + * Returns datum representation for a pointer. + */ +#ifndef PointerGetDatum +#define PointerGetDatum(X) ((Datum)(X)) +#endif + +/* + * CharGetDatum + * Returns datum representation for a character. + */ +#define CharGetDatum(X) ((Datum)SET_1_BYTE((unsigned char)(X))) + +/* + * Int16GetDatum + * Returns datum representation for a 16-bit integer. + */ +#define Int16GetDatum(X) ((Datum)SET_2_BYTES((uint16)(X))) + +/* + * Int32GetDatum + * Returns datum representation for a 32-bit integer. 
+ */ +#define Int32GetDatum(X) ((Datum)SET_4_BYTES((uint32)(X))) + +#define DatumGetUInt8(X) ((uint8)GET_1_BYTE(X)) +#define DatumGetUInt32(X) ((uint32)GET_4_BYTES(X)) +#define DatumGetInt16(X) ((int16)GET_2_BYTES(X)) +#define DatumGetInt32(X) ((int32)GET_4_BYTES(X)) +#define DatumGetInt64(X) ((int64)GET_8_BYTES(X)) +#define DatumGetChar(X) ((char)GET_1_BYTE(X)) +#define DatumGetBool(X) ((bool)(((bool)(X)) != 0)) +#define DatumGetObjectId(X) ((Oid)GET_4_BYTES(X)) +#define DatumGetName(X) ((Name)DatumGetPointer(X)) +#define DatumGetDateADT(X) ((DateADT)DatumGetInt32(X)) + +#define UInt32GetDatum(X) ((Datum)SET_4_BYTES(X)) +#define Int64GetDatum(X) ((Datum)SET_8_BYTES(X)) +#define Int64GetDatumFast(X) Int64GetDatum(X) +#define ObjectIdGetDatum(X) ((Datum)SET_4_BYTES(X)) +#define DateADTGetDatum(X) Int32GetDatum(X) + +#define HIGHBIT 0x80 +#define IS_HIGHBIT_SET(ch) ((unsigned char)(ch)&HIGHBIT) + +#define SAMESIGN(a, b) (((a) < 0) == ((b) < 0)) + +typedef locale_t pg_locale_t; + +#endif /* COMMON_H */ diff --git a/contrib/ndpplugin/component/ceph/ceph.h b/contrib/ndpplugin/component/ceph/ceph.h new file mode 100644 index 000000000..f182d7d7e --- /dev/null +++ b/contrib/ndpplugin/component/ceph/ceph.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2021 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * ------------------------------------------------------------------------- + * + * ceph.h + * + * IDENTIFICATION + * src\include\component\ceph\ceph.h + * + * ------------------------------------------------------------------------- + */ + +#ifndef LIBSMARTSCAN_CEPHINTERFACE_H +#define LIBSMARTSCAN_CEPHINTERFACE_H + +#include "component/thread/mpmcqueue.h" + +#define RbdPath "librbd.so.1" +#define RadosPath "librados.so.2" + +typedef void *cephClientCtx; +typedef void *imageHandler; +typedef void *rbd_completion_t; +typedef void (*rbd_callback_t)(rbd_completion_t cb, void *arg); + +#ifdef GlobalCache +#define NDP_ASYNC_CEPH +#endif +#define CEPH_MAX_LENGTH 64 + +typedef enum CephStatus { + CEPH_ERROR = -1, + CEPH_OK = 0, +} CephStatus; + +typedef enum { + RBD_LOCK_NONE = 0, + RBD_LOCK_EXCLUSIVE = 1, + RBD_LOCK_SHARED = 2, +} RbdLockType; + +typedef struct CephObject { + uint32_t objId; + uint64_t objOffset; + char image[CEPH_MAX_LENGTH]; + char pool[CEPH_MAX_LENGTH]; +} CephObject; + +/* client keepalive deadline, after which a client should be kicked off */ +#define CEPH_CLIENT_TIMEOUT 30 + +#define DSS_RBD_COOLIE_LEN 16 + +#define DSS_RBD_HEADER_LEN 32 + +/** + * initialize the operation context; required before any pool operation + * ctx handler of pool operation + * poolName ceph pool name + * conf path of ceph cluster conf + * timeout client keepalive timeout + * return CEPH_OK (0) on success, CEPH_ERROR on failure + */ +CephStatus CephClientCtxInit(cephClientCtx *ctx, char* poolName, const char *conf, uint64_t timeout); + +/** + * close the context once pool operations are finished + * ctx handle of pool operation + * return void + */ +void CephClientCtxClose(cephClientCtx ctx); + +/** + * open an image + * ctx handler of pool operation + * imageName image name + * fd imagehandler + * return CEPH_OK (0) on success, CEPH_ERROR on failure + */ +CephStatus CephClientImageOpen(cephClientCtx ctx, char *imageName, imageHandler *fd); + +/** + * close image + * return void + */ +void CephClientImageClose(imageHandler fd); + +/** + * read 
data from image + * fd image operation handler + * offset read from image offset + * buf read data buf + * size read data size + * return read size; + */ +int32_t CephClientImageReadUsingOffset(imageHandler fd, uint64_t offset, char *buf, uint64_t size); + +/** + * lock image + * ctx handle of pool operation + * fd handle of image operation + * type LockType exclusive/shared/none + * return 0 sucess, !0 failed + */ +CephStatus CephClientLock(cephClientCtx ctx, imageHandler fd, RbdLockType type); + +/** + * unlock image + * fd handle of image operation + * return 0 sucess, !0 failed + **/ +int CephClientUnLock(imageHandler fd); + +/* for GlobalCache */ +/** + * + **/ +int CephClientImageRead(imageHandler fd, CephObject* object, char* buf, size_t len); + +int CephClientImageAioRead(imageHandler fd, CephObject* object, + char* buf, size_t len, rbd_completion_t c); + +int CephClientAioCreateCom(void* cb_arg, rbd_callback_t complete_cb, rbd_completion_t* c); + +ssize_t CephClientAioGetRet(rbd_completion_t c); + +void CephClientAioRelease(rbd_completion_t c); + +/** + * get ceph descriptor + * ctx handler of pool operation + * poolName ceph pool name + * imageName image name + * fd imagehandler + * return 0 sucess, !0 failed + **/ +CephStatus GetCephDesc(char* poolName, char* imageName, cephClientCtx &ctx, imageHandler &fd); + +/** + * init ceph map lock + * return 0 sucess, !0 failed + **/ +CephStatus CephInit(); + +/** + * Finish Ceph + **/ +void CephUnInit(); + +typedef struct BufferPool { + MpmcBoundedQueue* bufferPool; + void* memptr; +} BufferPool; +extern BufferPool g_bufferPool; + +init_type InitCephBufferPool(); +void DestroyCephBufferPool(); + +// same with front end pool size, since frontend unlikely send request exceed memory pool size 2048 +constexpr int BUFFER_POOL_SIZE = 2048; +#endif diff --git a/contrib/ndpplugin/component/rpc/rpc.h b/contrib/ndpplugin/component/rpc/rpc.h new file mode 100644 index 000000000..5f4b27fad --- /dev/null +++ 
b/contrib/ndpplugin/component/rpc/rpc.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * ------------------------------------------------------------------------- + * + * rpc.h + * + * IDENTIFICATION + * src\include\component\rpc\rpc.h + * + * ------------------------------------------------------------------------- + */ +#ifndef __RPC_H__ +#define __RPC_H__ + +#include "ndp/ndp_req.h" + +#ifndef NDP_CLIENT +#include "component/thread/thread.h" +#include "knl/knl_instance.h" +#endif + +constexpr char* LIB_ULOG = "libulog.so"; +constexpr char* LIB_SSL = "libssl.so"; +constexpr char* LIB_RPC_UCX = "librpc_ucx.so"; +constexpr char* LIB_OPENSSL_DL = "libopenssl_dl.so"; +constexpr char* LIB_CRYPTO = "libcrypto.so"; + +typedef enum RpcStatus { + RPC_ERROR = -1, + RPC_OK = 0, +} RpcStatus; + +typedef enum RpcServiceId { + RPC_ADMIN_REQ = 0, + RPC_IO_REQ +} RpcServiceId; + +typedef uintptr_t RpcServer; +typedef uintptr_t RpcClient; +typedef uintptr_t RpcServerContext; + +/** + * @brief Message struct. + */ +typedef struct { + void *data; + size_t len; +} RpcMessage; + +/** + * @brief Message handler. + * + * User can pass the @b ctx and @b msg to other thread, it will remain valid until + * @ref OckRpcServerCleanupCtx is called. After calling the @ref OckRpcServerReply, + * user need to call @ref OckRpcServerCleanupCtx to release @b ctx. The lifetime + * of the memory that @b msg points to is same as @b ctx. 
So after invoking + * @ref OckRpcServerCleanupCtx, the @b msg is freed and can not be used anymore. + */ +typedef void(*RpcMsgHandler)(RpcServerContext ctx, RpcMessage msg); + +/** + * @brief RPC call completion callback. + * + * @b status is the result of the communication call and @b arg is specified by user. + */ +typedef void(*RpcDoneCallback)(RpcStatus status, void *arg); + +/** + * @brief RPC Service. + * + * Each service is a kind of message processing object. + */ +typedef struct { + uint16_t id; /** Message ID handled by this service. The range is [0,1024). */ + RpcMsgHandler handler; /** Message handler. */ +} RpcService; + +/** + * @brief RPC call completion handle. + * + * This structure should be allocated by the user and can be passed to communication + * primitives, such as @ref OckRpcClientCall. When the structure object is passed + * in, the communication routine changes to asynchronous mode. And if the routine + * returns success, the actual completion result will be notified through this callback. + */ +typedef struct { + RpcDoneCallback cb; /** User callback function. */ + void *arg; /** Argument of callback. 
*/ +} RpcCallDone; + +typedef struct { + const char *key; + const char *value; +} RpcConfigPair; + +typedef struct { + int size; + RpcConfigPair *pairs; +} RpcConfigs; + +typedef struct { + char *ulogPath; + char *rpcPath; + char *sslDLPath; + char *sslPath; + char *cryptoPath; +} DependencePath; + +#ifdef NDP_CLIENT +RpcStatus RpcClientInit(DependencePath& paths); + +RpcStatus RpcClientConnect(char *ip, uint16_t port, RpcClient& clientHandle); +void RpcClientDisconnect(RpcClient clientHandle); + +RpcStatus RpcSendAdminReq(NdpAdminRequest* req, NdpAdminResponse* resp, size_t size, RpcClient clientHandle); +RpcStatus RpcSendIOReq(RpcMessage* request, RpcMessage* response, RpcCallDone* done, RpcClient clientHandle); +#else +RpcStatus RpcServerInit(void); + +RpcStatus RpcIOTaskHandler(NdpIOTask* task); + +RpcStatus SendIOTaskErrReply(NdpIOTask* task, NDP_ERRNO error); +#endif + +#endif /* __RPC_H__ */ diff --git a/contrib/ndpplugin/component/thread/mpmcqueue.h b/contrib/ndpplugin/component/thread/mpmcqueue.h new file mode 100644 index 000000000..7d223e77e --- /dev/null +++ b/contrib/ndpplugin/component/thread/mpmcqueue.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2022. All rights reserved. 
#ifndef LIBSMARTSCAN_MPMCQUEUE_H
#define LIBSMARTSCAN_MPMCQUEUE_H

#include <atomic>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <type_traits>

#if defined(__aarch64__) /* 64-bit ARM */
constexpr int CACHE_LINE_SIZE = 128;
#else
constexpr int CACHE_LINE_SIZE = 64;
#endif

/** Multiple producer / multiple consumer bounded queue.
    Implementation of Dmitry Vyukov's MPMC algorithm
    http://www.1024cores.net/home/lock-free-algorithms/queues/bounded-mpmc-queue

    The ring is raw, uninitialised storage: elements are written with plain
    assignment and are never destroyed individually, so T should be a trivially
    copyable / trivially destructible type (pointers, integers, POD structs). */
template <typename T>
class MpmcBoundedQueue {
public:
    /** Constructor
    @param[in] nElems Max number of elements allowed; must be a non-zero power of 2 */
    explicit MpmcBoundedQueue(size_t nElems)
        : _mRing(reinterpret_cast<Cell *>(new Aligned[nElems])),
          _mCapacity(nElems - 1)
    {
        /* Must be a non-zero power of 2: _mCapacity is used as an index mask,
           and nElems == 0 would make the mask wrap to SIZE_MAX over an empty ring. */
        assert(nElems != 0 && (nElems & (nElems - 1)) == 0);

        /* Each cell starts with a sequence equal to its index, i.e. "empty, ready
           for the enqueue ticket with the same number". */
        for (size_t i = 0; i < nElems; ++i) {
            _mRing[i]._mPos.store(i, std::memory_order_relaxed);
        }

        _mEnqueuePos.store(0, std::memory_order_relaxed);
        _mDequeuePos.store(0, std::memory_order_relaxed);
    }

    /** Destructor */
    ~MpmcBoundedQueue()
    {
        /* The storage was obtained with new Aligned[], so it has to be released
           through the same type; delete[] on a Cell* would not match the new[]. */
        delete[] reinterpret_cast<Aligned *>(_mRing);
    }

    /** Enqueue an element
    @param[in] data Element to insert, it will be copied
    @return true on success, false if the queue is full */
    bool Enqueue(T const& data)
    {
        /* _mEnqueuePos only wraps at MAX(_mEnqueuePos), instead
        we use the capacity to convert the sequence to an array
        index. This is why the ring buffer must be a size which
        is a power of 2. This also allows the sequence to double
        as a ticket/lock. */
        size_t pos = _mEnqueuePos.load(std::memory_order_relaxed);

        Cell *cell = nullptr;

        for (;;) {
            cell = &_mRing[pos & _mCapacity];

            size_t seq = cell->_mPos.load(std::memory_order_acquire);

            intptr_t diff = (intptr_t)seq - (intptr_t)pos;

            if (diff == 0) {
                /* The cell is empty. Claim our spot by moving head. If head isn't
                the same as we last checked then that means someone beat us to the
                punch. Weak compare is faster, but can return spurious results
                which in this instance is OK, because it's in the loop */
                if (_mEnqueuePos.compare_exchange_weak(pos, pos + 1,
                                                       std::memory_order_relaxed)) {
                    break;
                }
            } else if (diff < 0) {
                /* The queue is full */
                return false;
            } else {
                /* Another producer already took this ticket; reload and retry. */
                pos = _mEnqueuePos.load(std::memory_order_relaxed);
            }
        }

        cell->_mData = data;

        /* Publish the element: bump the sequence so that consumers see the cell
           as filled. */
        cell->_mPos.store(pos + 1, std::memory_order_release);

        return true;
    }

    /** Dequeue an element
    @param[out] data Element read from the queue
    @return true on success, false if the queue is empty */
    bool Dequeue(T& data)
    {
        Cell *cell = nullptr;
        size_t pos = _mDequeuePos.load(std::memory_order_relaxed);

        for (;;) {
            cell = &_mRing[pos & _mCapacity];

            size_t seq = cell->_mPos.load(std::memory_order_acquire);

            intptr_t diff = (intptr_t)seq - (intptr_t)(pos + 1);

            if (diff == 0) {
                /* The cell is filled. Claim our spot by moving the tail. Weak
                compare is faster, but can return spurious results. Which in this
                instance is OK, because it's in the loop. */
                if (_mDequeuePos.compare_exchange_weak(pos, pos + 1,
                                                       std::memory_order_relaxed)) {
                    break;
                }
            } else if (diff < 0) {
                /* The queue is empty */
                return false;
            } else {
                /* Another consumer already took this ticket; reload and retry. */
                pos = _mDequeuePos.load(std::memory_order_relaxed);
            }
        }

        data = cell->_mData;

        /* Set the sequence to what the enqueue ticket will be the next
        time around the ring, marking the cell as empty again. */
        cell->_mPos.store(pos + _mCapacity + 1, std::memory_order_release);

        return true;
    }

    /** @return the capacity of the queue */
    size_t Capacity() const
    {
        return _mCapacity + 1;
    }

    /** @return true if the queue is empty. */
    bool Empty() const
    {
        size_t pos = _mDequeuePos.load(std::memory_order_relaxed);

        /* Same probing as Dequeue(), but without claiming the cell. */
        for (;;) {
            const Cell *cell = &_mRing[pos & _mCapacity];

            size_t seq = cell->_mPos.load(std::memory_order_acquire);

            intptr_t diff = (intptr_t)seq - (intptr_t)(pos + 1);

            if (diff == 0) {
                return false;
            } else if (diff < 0) {
                return true;
            } else {
                pos = _mDequeuePos.load(std::memory_order_relaxed);
            }
        }
    }

private:
    using Pad = char[CACHE_LINE_SIZE];

    struct Cell {
        std::atomic<size_t> _mPos; /* sequence / ticket number of this slot */
        T _mData;
    };

    /* Raw storage with the size and alignment of Cell. */
    using Aligned =
        typename std::aligned_storage<sizeof(Cell), std::alignment_of<Cell>::value>::type;

    /* Padding keeps the ring pointer, the enqueue counter and the dequeue
       counter CACHE_LINE_SIZE bytes apart from each other. */
    Pad _mPad0;
    Cell *const _mRing;
    size_t const _mCapacity;
    Pad _mPad1;
    std::atomic<size_t> _mEnqueuePos;
    Pad _mPad2;
    std::atomic<size_t> _mDequeuePos;
    Pad _mPad3;

    MpmcBoundedQueue(MpmcBoundedQueue&&) = delete;

    MpmcBoundedQueue(const MpmcBoundedQueue&) = delete;

    MpmcBoundedQueue& operator=(MpmcBoundedQueue&&) = delete;

    MpmcBoundedQueue& operator=(const MpmcBoundedQueue&) = delete;
};

#endif // LIBSMARTSCAN_MPMCQUEUE_H
+#else + ereport(WARNING, (errmsg("load dynamic lib error: %s", symbol))); +#endif + return STATUS_ERROR; + } + return STATUS_OK; +} + +void CloseDl(void *libHandle) +{ + (void)dlclose(libHandle); +} diff --git a/contrib/ndpplugin/ndp/ndp.h b/contrib/ndpplugin/ndp/ndp.h new file mode 100644 index 000000000..09d844cfd --- /dev/null +++ b/contrib/ndpplugin/ndp/ndp.h @@ -0,0 +1,27 @@ +/* ------------------------------------------------------------------------- + * + * ndp.h + * Exports from ndp/ndp.cpp + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/ndp/ndp.h + * + * ------------------------------------------------------------------------- + */ +#ifndef NDP_H_ +#define NDP_H_ + +#include "component/rpc/rpc.h" +#include "utils/config.h" + +void* NdpMain(void* arg); + +init_type NdpWorkerInit(); +void NdpWorkerUnInit(); + +Status SubmitAioReadData(NdpIOTask* task); +#endif /* NDP_H_ */ + + diff --git a/contrib/ndpplugin/ndp/ndp_nodes.h b/contrib/ndpplugin/ndp/ndp_nodes.h new file mode 100644 index 000000000..4f5e50038 --- /dev/null +++ b/contrib/ndpplugin/ndp/ndp_nodes.h @@ -0,0 +1,135 @@ +/* ------------------------------------------------------------------------- + * + * ndp_nodes.h + * Exports for plan + * + * src/include/ndp/ndp_nodes.h + * + * ------------------------------------------------------------------------- + */ + +#ifndef LIBSMARTSCAN_NDP_NODES_H +#define LIBSMARTSCAN_NDP_NODES_H + +#ifdef NDP_CLIENT +typedef uintptr_t Datum; +#include "access/attnum.h" +#include "nodes/params.h" +#include "nodes/primnodes.h" +#else +#include "nodes/params.h" +#include "nodes/primnodes.h" +#include "utils/snapshot.h" +#endif + +enum init_type { INIT_SUCCESS, INIT_FAIL}; + +/***************************for NdpScanPage*******************************/ +typedef enum NdpScanPageFlag { + NORMAL_PAGE = 0, + NDP_FILTERED_PAGE, + NDP_AGG_PAGE, + INVALID_PAGE +} 
NdpScanPageFlag; + +/***************************for PlanState*********************************/ +typedef struct NdpPGAttr { + int2 attlen; + int4 attndims; + int4 attcacheoff; + bool attbyval; + char attstorage; + char attalign; +} NdpPGAttr; + +typedef struct NdpTupleDesc { + int natts; /* number of attributes in the tuple */ + NdpPGAttr* attrs; + Oid tdtypeid; /* composite type ID for tuple type */ + int32 tdtypmod; /* typmod for tuple type */ + bool tdhasoid; /* tuple has oid attribute in its header */ + bool tdhasuids; /* tuple has uid attribute in its header */ +} NdpTupleDesc; + +typedef struct NdpRelFileNode { + uint32 spcNode; + uint32 dbNode; + uint32 relNode; + uint16 bucketNode; + uint16 opt; +} NdpRelFileNode; + +typedef struct NdpRelation { + NdpRelFileNode node; + NdpTupleDesc att; +} NdpRelation; + +typedef struct NdpSnapshot { + uint16 satisfies; + uint64 xmin; /* all XID < xmin are visible to me */ + uint64 xmax; /* all XID >= xmax are invisible to me */ + uint64 snapshotcsn; + uint32 curcid; +} NdpSnapshot; + +typedef struct NdpXact { + NdpSnapshot snapshot; + uint64 transactionId; /* my XID, or Invalid if none */ + int usedComboCids; /* number of elements in comboCids */ + uint32 *comboCids; /* An array of cmin,cmax pairs, indexed by combo command id */ + uint64 latestCompletedXid; /* newest XID that has committed or aborted */ + int CLogLen; + char* CLogPageBuffer; + int CSNLogLen; + char* CSNLogPageBuffer; +} NdpXact; + +// context for one query +typedef struct NdpQuery { + uint32 tableNum; + NdpXact xact; +} NdpQuery; + +typedef struct NdpAggState { + NdpTupleDesc aggTd; + int aggNum; + NdpTupleDesc* perAggTd; + int numCols; + unsigned int* eqFuncOid; + unsigned int* hashFuncOid; +} NdpAggState; + +typedef struct NdpParamData { + bool isnull; /* is it NULL? 
#define DSS_DEFAULT_AU_SIZE (4 * 1024 * 1024)

#define PAGE_NUM_PER_AU (DSS_DEFAULT_AU_SIZE / BLCKSZ)

#ifndef BITS_PER_BYTE
#define BITS_PER_BYTE 8
#endif

#define BITMAP_SIZE_PER_AU_BYTE (PAGE_NUM_PER_AU / BITS_PER_BYTE)
#define BITMAP_SIZE_PER_AU_U64 (BITMAP_SIZE_PER_AU_BYTE / sizeof(uint64))

/*
 * Bitmap helpers. x is the address of the bitmap, i is a bit index.
 * Every macro argument and every expansion is fully parenthesised so the
 * macros stay correct when used with compound arguments or inside larger
 * expressions. Note that i is still evaluated more than once.
 */
/* Byte of bitmap x that holds bit i (an lvalue). */
#define NDPGETBYTE(x, i) (*((char*)(x) + (int)((i) / BITS_PER_BYTE)))
/* Bit i (0..7) of the single byte value x. */
#define NDPGETBITBYTE(x, i) ((((char)(x)) >> (i)) & 0x01)
/* Clear bit i of bitmap x. */
#define NDPCLRBIT(x, i) (NDPGETBYTE(x, i) &= ~(0x01 << ((i) % BITS_PER_BYTE)))
/* Set bit i of bitmap x. */
#define NDPSETBIT(x, i) (NDPGETBYTE(x, i) |= (0x01 << ((i) % BITS_PER_BYTE)))
/* Read bit i of bitmap x; yields 0 or 1. */
#define NDPGETBIT(x, i) ((NDPGETBYTE(x, i) >> ((i) % BITS_PER_BYTE)) & 0x01)

/* Pack x into the bits above a 3-bit field holding y. */
#define NDPMERGEBIT(x, y) (((x) << 3) + (y))
/* Upper part (x) of a value built by NDPMERGEBIT. */
#define NDPGETARG1(z) ((z) >> 3)
/* Low 3-bit part (y) of a value built by NDPMERGEBIT. */
#define NDPGETARG2(z) ((z) & 0x07)

#define TABLE_MAX_NUM 128
a/contrib/ndpplugin/ndp_check.cpp b/contrib/ndpplugin/ndp_check.cpp new file mode 100644 index 000000000..b07d5cac1 --- /dev/null +++ b/contrib/ndpplugin/ndp_check.cpp @@ -0,0 +1,548 @@ +/* ------------------------------------------------------------------------- + * ndp_check.cpp + * Routines to check whether to pushdown + * + * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd. + * + * IDENTIFICATION + * contrib/ndpplugin/ndp_check.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "catalog/pg_operator.h" +#include "utils/builtins.h" +#include "ndp_check.h" + +// operations not define in pg_operator.h +#define BPCHARLEOID 1059 +#define BPCHARGEOID 1061 +#define FLOAT48PLOID 1116 +#define FLOAT48MIOID 1117 +#define FLOAT48MULOID 1119 +#define FLOAT48DIVOID 1118 +#define FLOAT84PLOID 1126 +#define FLOAT84MIOID 1127 +#define FLOAT84MULOID 1129 +#define FLOAT84DIVOID 1128 +#define TEXTNEOID 531 +#define TEXTLEOID 665 +#define TEXTGEOID 667 + +// functions not define in pg_proc.h +#define INT4BOOLOID 2557 +#define BOOLINT4OID 2558 + +const Oid g_ndp_support_data_type[] = { + BOOLOID, + INT1OID, + INT2OID, + INT4OID, + INT8OID, + FLOAT4OID, + FLOAT8OID, + VARCHAROID, + TEXTOID, + BPCHAROID, + BPCHARARRAYOID, + TIMESTAMPOID, + NUMERICOID, + INTERVALOID, + ARRAYNUMERICOID, + TEXTARRAYOID +}; + +inline static bool CheckNdpSupportDataType(Oid oid) +{ + for (int i = 0; i < (int)(sizeof(g_ndp_support_data_type) / sizeof(Oid)); ++i) { + if (oid == g_ndp_support_data_type[i]) { + return true; + } + } + return false; +} + +const Oid g_ndp_support_op_expr_type[] = { + INT48EQOID, + INT48NEOID, + INT48LTOID, + INT48GTOID, + INT48LEOID, + INT48GEOID, + INT2EQOID, + INT2LTOID, + INT4EQOID, + INT4LTOID, + TEXTEQOID, + INT8EQOID, + INT8NEOID, + INT8LTOID, + INT8GTOID, + INT8LEOID, + INT8GEOID, + INT84EQOID, + INT84NEOID, + INT84LTOID, + INT84GTOID, + INT84LEOID, + INT84GEOID, + INT4MULOID, + INT4NEOID, + 
INT2NEOID, + INT2GTOID, + INT4GTOID, + INT2LEOID, + INT4LEOID, + INT2GEOID, + INT4GEOID, + INT2MULOID, + INT2DIVOID, + INT4DIVOID, + INT24EQOID, + INT42EQOID, + INT24LTOID, + INT42LTOID, + INT24GTOID, + INT42GTOID, + INT24NEOID, + INT42NEOID, + INT24LEOID, + INT42LEOID, + INT24GEOID, + INT42GEOID, + INT24MULOID, + INT42MULOID, + INT24DIVOID, + INT42DIVOID, + INT2PLOID, + INT4PLOID, + INT24PLOID, + INT42PLOID, + INT2MIOID, + INT4MIOID, + INT24MIOID, + INT42MIOID, + FLOAT4EQOID, + FLOAT4NEOID, + FLOAT4LTOID, + FLOAT4GTOID, + FLOAT4LEOID, + FLOAT4GEOID, + TEXTLTOID, + TEXTGTOID, + FLOAT8EQOID, + FLOAT8NEOID, + FLOAT8LTOID, + FLOAT8LEOID, + FLOAT8GTOID, + FLOAT8GEOID, + INT8PLOID, + INT8MIOID, + INT8MULOID, + INT8DIVOID, + INT84PLOID, + INT84MIOID, + INT84MULOID, + INT84DIVOID, + INT48PLOID, + INT48MIOID, + INT48MULOID, + INT48DIVOID, + INT82PLOID, + INT82MIOID, + INT82MULOID, + INT82DIVOID, + INT28PLOID, + INT28MIOID, + INT28MULOID, + INT28DIVOID, + BPCHAREQOID, + BPCHARNEOID, + BPCHARLTOID, + BPCHARGTOID, + FLOAT48EQOID, + FLOAT48NEOID, + FLOAT48LTOID, + FLOAT48GTOID, + FLOAT48LEOID, + FLOAT48GEOID, + FLOAT84EQOID, + FLOAT84NEOID, + FLOAT84LTOID, + FLOAT84GTOID, + FLOAT84LEOID, + FLOAT84GEOID, + NUMERICEQOID, + NUMERICNEOID, + NUMERICLTOID, + NUMERICLEOID, + NUMERICGTOID, + NUMERICGEOID, + NUMERICADDOID, + NUMERICSUBOID, + NUMERICMULOID, + NUMERICDIVOID, + INT28EQOID, + INT28NEOID, + INT28LTOID, + INT28GTOID, + INT28LEOID, + INT28GEOID, + INT82EQOID, + INT82NEOID, + INT82LTOID, + INT82GTOID, + INT82LEOID, + INT82GEOID, + TIMESTAMPEQOID, + TIMESTAMPNEOID, + TIMESTAMPLTOID, + TIMESTAMPLEOID, + TIMESTAMPGTOID, + TIMESTAMPGEOID, + OID_TEXT_LIKE_OP, + + // operations not define in pg_operator.h + BPCHARLEOID, + BPCHARGEOID, + FLOAT48PLOID, + FLOAT48MIOID, + FLOAT48MULOID, + FLOAT48DIVOID, + FLOAT84PLOID, + FLOAT84MIOID, + FLOAT84MULOID, + FLOAT84DIVOID, + TEXTNEOID, + TEXTLEOID, + TEXTGEOID, +}; + +inline static bool CheckNdpSupportOpExprType(Oid oid) +{ + for (int i = 0; 
i < (int)(sizeof(g_ndp_support_op_expr_type)/sizeof(Oid)); ++i) { + if (oid == g_ndp_support_op_expr_type[i]) { + return true; + } + } + return false; +} + +const Oid g_ndp_support_function_type[] = { + INT4NUMERICFUNCOID, + TIMESTAMPPARTFUNCOID, + TEXTSUBSTRINGFUNCOID, + FLOAT4TOFLOAT8FUNCOID, + INT4TOFLOAT8FUNCOID, + INT2TOFLOAT8FUNCOID, + INT2TOFLOAT4FUNCOID, + RTRIM1FUNCOID, + + // functions not define in pg_proc.h + INT4BOOLOID, + BOOLINT4OID, +}; + +inline static bool CheckNdpSupportFunctionType(Oid oid) +{ + for (int i = 0; i < (int)(sizeof(g_ndp_support_function_type)/sizeof(Oid)); ++i) { + if (oid == g_ndp_support_function_type[i]) { + return true; + } + } + return false; +} + +const Oid g_ndp_support_aggfunc_type[] = { + 2102, /* avg(int2) */ + 2101, /* avg(int4) */ + 2100, /* avg(int8) */ + 2104, /* avg(float4) */ + 2105, /* avg(float8) */ + 2103, /* avg(numeric) */ + 2106, /* avg(interval) */ + 2109, /* sum(int2) */ + 2108, /* sum(int4) */ + 2107, /* sum(int8) */ + 2110, /* sum(float4) */ + 2111, /* sum(float8) */ + 2114, /* sum(numeric) */ + 2113, /* sum(interval) */ + 2134, /* min(oid) */ + 2118, /* max(oid) */ + 2133, /* min(int2) */ + 2117, /* max(int2) */ + 2132, /* min(int4) */ + 2116, /* max(int4) */ + 2131, /* min(int8) */ + 2115, /* max(int8) */ + 2135, /* min(float4) */ + 2136, /* min(float8) */ + 2119, /* max(float4) */ + 2120, /* max(float8) */ + 2146, /* min(numeric) */ + 2130, /* max(numeric) */ + 2144, /* min(interval) */ + 2128, /* max(interval) */ + 2138, /* min(date) */ + 2122, /* max(date) */ + 2142, /* min(timestamp) */ + 2126, /* max(timestamp) */ + 2245, /* min(bpchar) */ + 2244, /* max(bpchar) */ + 2145, /* min(text) */ + 2129, /* max(text) */ + 2147, /* count(expr) */ + 2803, /* count(*) */ +}; + +inline static bool CheckNdpSupportAggFuncType(Oid oid) +{ + for (int i = 0; i < (int)(sizeof(g_ndp_support_aggfunc_type)/sizeof(Oid)); ++i) { + if (oid == g_ndp_support_aggfunc_type[i]) { + return true; + } + } + return false; +} + 
+static bool CheckNdpSupportVar(Var *var) +{ + if (!CheckNdpSupportDataType(var->vartype)) { + return false; + } + + if (var->vartype == NUMERICOID) { + unsigned int typemod = (unsigned int)(var->vartypmod - VARHDRSZ); + if (var->vartypmod != -1) { + int precision = (typemod >> 16) & 0xffff; + if (precision <= 0 || precision > 38) { + return false; + } + } + } + return true; +} +static bool CheckNdpSupportParam(Param *param) +{ + if (param->paramkind != PARAM_EXTERN) { + return false; + } + return CheckNdpSupportDataType(param->paramtype); +} + +static bool CheckNdpSupportListType(const List* exprs); + +static bool CheckNdpSupportNodeType(Node* node) +{ + if (!node) return true; + + switch (node->type) { + case T_TargetEntry: + return CheckNdpSupportNodeType(castNode(Node, (castNode(TargetEntry, node)->expr))); + case T_Var: + return CheckNdpSupportVar(castNode(Var, node)); + case T_Param: + return CheckNdpSupportParam(castNode(Param, node)); + case T_OpExpr: { + OpExpr* op = castNode(OpExpr, node); + // check OpExpr::opfuncid in future + if ((!CheckNdpSupportDataType(op->opresulttype)) || (!CheckNdpSupportOpExprType(op->opno))) { + return false; + } + return CheckNdpSupportListType(op->args); + } case T_Const: + return CheckNdpSupportDataType(castNode(Const, node)->consttype); + case T_RelabelType: + return CheckNdpSupportDataType(castNode(RelabelType, node)->resulttype); + case T_Aggref: { + Aggref* agg = castNode(Aggref, node); + if ((!CheckNdpSupportDataType(agg->aggtype)) || (!CheckNdpSupportAggFuncType(agg->aggfnoid))) { + return false; + } + if ((!CheckNdpSupportListType(agg->aggorder)) || (!CheckNdpSupportListType(agg->aggdistinct))) { + return false; + } + return CheckNdpSupportListType(agg->args); + } + case T_FuncExpr: { + FuncExpr* func = castNode(FuncExpr, node); + if ((!CheckNdpSupportDataType(func->funcresulttype)) || (!CheckNdpSupportFunctionType(func->funcid))) { + return false; + } + return CheckNdpSupportListType(func->args); + } + case 
T_BoolExpr: { + BoolExpr *boolExpr = castNode(BoolExpr, node); + return CheckNdpSupportListType(boolExpr->args); + } + case T_CaseExpr: { + CaseExpr *caseExpr = castNode(CaseExpr, node); + if ((!CheckNdpSupportDataType(caseExpr->casetype)) || + (!CheckNdpSupportNodeType(castNode(Node, caseExpr->defresult)))) { + return false; + } + return CheckNdpSupportListType(caseExpr->args); + } + case T_CaseWhen: { + CaseWhen *caseWhen = castNode(CaseWhen, node); + return CheckNdpSupportNodeType(castNode(Node, caseWhen->expr)); + } + case T_CaseTestExpr: { + CaseTestExpr *caseTestExpr = castNode(CaseTestExpr, node); + return CheckNdpSupportDataType(caseTestExpr->typeId); + } + case T_ScalarArrayOpExpr: { + ScalarArrayOpExpr *scalarArrayOpExpr = castNode(ScalarArrayOpExpr, node); + if (!CheckNdpSupportOpExprType(scalarArrayOpExpr->opno)) { + return false; + } + return CheckNdpSupportListType(scalarArrayOpExpr->args); + } + default: + return false; + } +} + +static bool CheckNdpSupportListType(const List* exprs) +{ + if (!exprs) return true; + + foreach_cell (l, exprs) { + Node* node = (Node*)lfirst(l); + if (!CheckNdpSupportNodeType(node)) { + return false; + } + } + return true; +} + +// check Plan, remove stmt if no use in future +Plan* CheckAndGetNdpPlan(PlannedStmt* stmt, SeqScan* scan, Plan* parent) +{ + Plan* node = nullptr; + + // 1. check scan, should check Scan::tableRows or Plan::plan_rows? + if (scan->plan.exec_type != EXEC_ON_DATANODES + || !CheckNdpSupportListType(scan->plan.targetlist) + || !CheckNdpSupportListType(scan->plan.qual)) { + return nullptr; + } + node = (Plan*)scan; + + // 2. check agg + if (parent && IsA(parent, Agg)) { + Agg* agg = (Agg*)parent; + if (agg->aggstrategy != AGG_SORTED && // don't support distinct sort agg + agg->groupingSets == nullptr && agg->chain == nullptr && // don't support grouping set + CheckNdpSupportListType(parent->targetlist) && + CheckNdpSupportListType(parent->qual)) { + node = parent; + } + } + + // 3. 
not push down if not agg and scan has no filter + if (node == (Plan*)scan && scan->plan.qual == nullptr) { + return nullptr; + } + + // 4. if seqscan or agg is righttree of nestloop, do not push down + if (parent && IsA(parent, NestLoop) && parent->righttree == node) { + return nullptr; + } + + // 5. if seqscan is under merge join, do not push down + if (parent && IsA(parent, MergeJoin)) { + return nullptr; + } + + return node; +} + +static bool CheckNdpSupportHint(HintState* hint) +{ + return true; +} + +static bool CheckNdpSupportXact(void) +{ + if (IsolationIsSerializable()) { + return false; + } + + if (!IsTransactionState() || IsSubTransaction()) { + return false; + } + + return true; +} + +static bool CheckNdpPreloadSupport(const char* libraries) +{ + char* rawstring = NULL; + List* elemlist = NULL; + ListCell* l = NULL; + + if (libraries == NULL || libraries[0] == '\0') { + return false; /* nothing to do */ + } + + /* Need a modifiable copy of string */ + rawstring = pstrdup(libraries); + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawstring, ',', &elemlist)) { + /* syntax error in list */ + pfree(rawstring); + list_free(elemlist); + ereport(LOG, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("invalid syntax in parameter shared_preload_libraries"))); + return false; + } + + foreach (l, elemlist) { + char* tok = (char*)lfirst(l); + char* filename = NULL; + + filename = pstrdup(tok); + if (strcmp(filename, "ndpplugin") == 0) { + pfree(filename); + pfree(rawstring); + list_free(elemlist); + return true; + } else { + pfree(filename); + } + } + + pfree(rawstring); + list_free(elemlist); + ereport(LOG, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("ndpplugin is not preloaded"))); + return false; +} + +// check Query/PlanedStmt/conf +bool CheckNdpSupport(Query* querytree, PlannedStmt *stmt) +{ + if (!u_sess->ndp_cxt.enable_ndp) { + return false; + } + + /* only plain relations are supported */ + if (!stmt || stmt->commandType != CMD_SELECT) { + return 
false; + } + + if (!CheckNdpSupportXact()) { + return false; + } + + if (!CheckNdpSupportHint(querytree->hintState)) { + return false; + } + + auto libraries = g_instance.attr.attr_common.shared_preload_libraries_string; + if (stmt->num_streams > 0 && !CheckNdpPreloadSupport(libraries)) { + return false; + } + return true; +} diff --git a/contrib/ndpplugin/ndp_check.h b/contrib/ndpplugin/ndp_check.h new file mode 100644 index 000000000..ba5a7a2d4 --- /dev/null +++ b/contrib/ndpplugin/ndp_check.h @@ -0,0 +1,22 @@ +/* ------------------------------------------------------------------------- + * ndp_check.h + * prototypes for functions in contrib/ndpplugin/ndp_check.cpp + * + * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd. + * + * IDENTIFICATION + * contrib/ndpplugin/ndp_check.h + * + * ------------------------------------------------------------------------- + */ + +#ifndef NDP_CHECK_H +#define NDP_CHECK_H + +#include "pgstat.h" +#include "catalog/pg_proc.h" + +Plan* CheckAndGetNdpPlan(PlannedStmt* stmt, SeqScan* scan, Plan* parent); +bool CheckNdpSupport(Query* querytree, PlannedStmt *stmt); + +#endif // NDP_CHECK_H diff --git a/contrib/ndpplugin/ndpam.cpp b/contrib/ndpplugin/ndpam.cpp new file mode 100644 index 000000000..4a6dc60e4 --- /dev/null +++ b/contrib/ndpplugin/ndpam.cpp @@ -0,0 +1,1139 @@ +/* ------------------------------------------------------------------------- + * ndpam.cpp + * Routines to handle ndp page + * + * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd. 
+ *
+ * IDENTIFICATION
+ *    contrib/ndpplugin/ndpam.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "access/csnlog.h"
+#include "access/slru.h"
+#include "executor/node/nodeAgg.h"
+
+#include "component/rpc/rpc.h"
+#include "storage/smgr/segment_internal.h"
+#include "ddes/dms/ss_transaction.h"
+#include "algorithm"
+#include "storage/dss/fio_dss.h"
+#include "storage/smgr/segment.h"
+
+#include "ndpnodes.h"
+#include "ndpam.h"
+
+/* capacities handed to the response queue and to the normal page id queue */
+#define NDP_PAGE_QUEUE_SIZE (1u << 10)
+#define NDP_NORMAL_QUEUE_SIZE (1u << 12)
+
+/* SLRU control block of the partition that holds page n */
+#define ClogCtl(n) (&t_thrd.shemem_ptr_cxt.ClogCtl[CBufHashPartition(n)])
+#define CsnlogCtl(n) (&t_thrd.shemem_ptr_cxt.CsnlogCtlPtr[CSNBufHashPartition(n)])
+#define CSNLOG_XACTS_PER_PAGE (BLCKSZ / sizeof(CommitSeqNo))
+#define CSN_LWLOCK_ACQUIRE(pageno, lockmode) ((void)LWLockAcquire(CSNBufMappingPartitionLock(pageno), lockmode))
+#define CSN_LWLOCK_RELEASE(pageno) (LWLockRelease(CSNBufMappingPartitionLock(pageno)))
+
+/* csnlog page number that holds the entry of xid */
+#define TransactionIdToCSNPage(xid) ((xid) / (TransactionId)CSNLOG_XACTS_PER_PAGE)
+
+/* failure counts after which a channel / table stops retrying */
+constexpr int RPC_FAILED_LIMIT = 3;
+constexpr int SEND_FAILED_LIMIT = 10;
+
+/*
+ * Compute the offset of a logical page inside its extent.
+ * The logical id is rebased against the EXT_SIZE_*_TOTAL_PAGES threshold it
+ * falls under and then reduced modulo the matching EXT_SIZE_* value.
+ */
+inline static void SegLogicPageIdToExtentOffset(BlockNumber logicId, uint32* offset)
+{
+    if (logicId < EXT_SIZE_8_TOTAL_PAGES) {
+        *offset = logicId % EXT_SIZE_8;
+    } else if (logicId < EXT_SIZE_128_TOTAL_PAGES) {
+        logicId -= EXT_SIZE_8_TOTAL_PAGES;
+        *offset = logicId % EXT_SIZE_128;
+    } else if (logicId < EXT_SIZE_1024_TOTAL_PAGES) {
+        logicId -= EXT_SIZE_128_TOTAL_PAGES;
+        *offset = logicId % EXT_SIZE_1024;
+    } else {
+        logicId -= EXT_SIZE_1024_TOTAL_PAGES;
+        *offset = logicId % EXT_SIZE_8192;
+    }
+}
+
+/*
+ * Resolve a block of an md (non segment) relation to the file descriptor of
+ * the file segment that stores it and the byte offset inside that slice.
+ */
+static void md_get_physical_info(Relation rel, ForkNumber forknum, BlockNumber blocknum,
+    int *handle, off_t *offset)
+{
+    SMgrRelation reln = rel->rd_smgr;
+    MdfdVec *v = NULL;
+    v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
+    Assert(v != NULL);
+
+    *handle = FileFd(v->mdfd_vfd);
+    *offset =
DF_OFFSET_TO_SLICE_OFFSET(((off_t)blocknum) * BLCKSZ);
+}
+
+/*
+ * Page info provider for md relations.
+ * Outputs: end is the first block after the batch that starts at
+ * ndpScan->handledBlock (capped by nBlock and by one AU), phyStartBlockNum
+ * is the batch start merged with 0 through NDPMERGEBIT. object and ip are
+ * only filled when GlobalCache is defined.
+ */
+void md_get_pageinfo(NdpScanDesc ndpScan, BlockNumber page, CephObject *object, char *ip,
+    BlockNumber& end, uint32& phyStartBlockNum)
+{
+    // step1:get object and ip.
+    int handle;
+    off_t offset;
+    HeapScanDesc scan = (HeapScanDesc)ndpScan->scan;
+    /* NOTE(review): the literal 0 is passed as fork number, presumably MAIN_FORKNUM - confirm */
+    md_get_physical_info(scan->rs_base.rs_rd, 0, page, &handle, &offset);
+
+#ifdef GlobalCache
+    dss_get_addr(handle, offset, object->pool, object->image, ip, &object->objId, &object->objOffset);
+#endif
+
+    // step2:slice pages, get end and phyStartBlockNum.
+    BlockNumber start = ndpScan->handledBlock;
+#ifdef GlobalCache
+    Assert(object->objOffset >= 0);
+    /* the batch must not run past the AU: subtract the pages already consumed inside the object */
+    end = Min((uint64_t)ndpScan->nBlock, (uint64_t)(start + PAGE_NUM_PER_AU - object->objOffset/BLCKSZ));
+#else
+    end = Min((uint64_t)ndpScan->nBlock, (uint64_t)(start + PAGE_NUM_PER_AU));
+#endif
+    phyStartBlockNum = NDPMERGEBIT(start, 0);
+}
+
+/*
+ * Resolve a logical block of a segment-page relation: loc receives the
+ * physical location, handle the fd of the physical file and offset the byte
+ * offset inside the slice. The segment head partition lock is held in shared
+ * mode while the physical file is looked up.
+ */
+static void seg_get_physical_info(Relation rel, ForkNumber forknum, BlockNumber blocknum,
+    SegPageLocation &loc, int *handle, off_t *offset)
+{
+    SMgrRelation reln = rel->rd_smgr;
+    /* NOTE(review): the location lookup uses MAIN_FORKNUM while the lock below uses forknum - confirm */
+    loc = seg_get_physical_location(rel->rd_node, MAIN_FORKNUM, blocknum);
+    SegmentCheck(loc.blocknum != InvalidBlockNumber);
+
+    LockSegmentHeadPartition(reln->seg_space->spcNode, reln->seg_space->dbNode,
+        reln->seg_desc[forknum]->head_blocknum, LW_SHARED);
+
+    SegSpace *spc = reln->seg_space;
+
+    RelFileNode relNode = {.spcNode = spc->spcNode,
+                           .dbNode = spc->dbNode,
+                           .relNode = EXTENT_SIZE_TO_TYPE(loc.extent_size),
+                           .bucketNode = SegmentBktId,
+                           .opt = 0};
+    int egid = EXTENT_TYPE_TO_GROUPID(relNode.relNode);
+    SegExtentGroup *seg = &spc->extent_group[egid][forknum];
+
+    off_t beginoff = ((off_t)loc.blocknum) * BLCKSZ;
+    int sliceno = DF_OFFSET_TO_SLICENO(beginoff);
+    SegPhysicalFile spf = df_get_physical_file(seg->segfile, sliceno, loc.blocknum);
+    *handle = spf.fd;
+
+    *offset = DF_OFFSET_TO_SLICE_OFFSET(beginoff);
+
+    UnlockSegmentHeadPartition(reln->seg_space->spcNode,
reln->seg_space->dbNode,
+        reln->seg_desc[forknum]->head_blocknum);
+}
+
+/*
+ * Page info provider for segment-page relations.
+ * The batch starts at ndpScan->handledBlock and is cut at the end of the
+ * extent, at one AU and at the end of the relation, whichever comes first.
+ * phyStartBlockNum is the physical block merged with (extent group id + 1)
+ * through NDPMERGEBIT. object and ip are only filled when GlobalCache is defined.
+ */
+void seg_get_pageinfo(NdpScanDesc ndpScan, BlockNumber page, CephObject *object, char *ip,
+    BlockNumber& end, uint32& phyStartBlockNum)
+{
+    // step1:get object and ip.
+    int handle;
+    off_t offset;
+    SegPageLocation loc;
+    HeapScanDesc scan = (HeapScanDesc)ndpScan->scan;
+    seg_get_physical_info(scan->rs_base.rs_rd, 0, page, loc, &handle, &offset);
+
+#ifdef GlobalCache
+    dss_get_addr(handle, offset, object->pool, object->image, ip, &object->objId, &object->objOffset);
+#endif
+
+    // step2:slice pages, get end and phyStartBlockNum.
+    uint32 extentOff;
+    BlockNumber start = ndpScan->handledBlock;
+    SegLogicPageIdToExtentOffset(ndpScan->handledBlock, &extentOff);
+    /* pages left in the extent, capped by one AU and by the pages left in the relation */
+    uint64_t blks = Min(loc.extent_size - extentOff,
+        Min(uint32(PAGE_NUM_PER_AU), uint32(ndpScan->nBlock - ndpScan->handledBlock)));
+#ifdef GlobalCache
+    Assert(object->objOffset >= 0);
+    end = start + Min(blks, (uint64_t)(PAGE_NUM_PER_AU - object->objOffset/BLCKSZ));
+#else
+    end = start + blks;
+#endif
+
+    phyStartBlockNum = NDPMERGEBIT(loc.blocknum, EXTENT_SIZE_TO_GROUPID(loc.extent_size) + 1);
+}
+
+/*
+ * Dispatch to the page info provider that matches the storage manager of the
+ * scanned relation, then clip the batch for parallel scans.
+ */
+void pm_get_pageinfo(NdpScanDesc ndpScan, BlockNumber page, CephObject *object, char *ip,
+    BlockNumber& end, uint32& phyStartBlockNum)
+{
+    HeapScanDesc scan = (HeapScanDesc)ndpScan->scan;
+    int which = scan->rs_base.rs_rd->rd_smgr->smgr_which;
+    /* smgr_which 0 selects the md provider, 2 the segment provider */
+    FileType filetype = which == 0 ? MDFILE : (which == 2 ?
SEGFILE : INVALIDFILE);
+    Assert(filetype != INVALIDFILE);
+    PAGEMETHOD[filetype].get_pageinfo(ndpScan, page, object, ip, end, phyStartBlockNum);
+
+    /* with dop > 1 a batch never crosses a PARALLEL_SCAN_GAP_AU_ALIGNED boundary */
+    uint64 curAUAligned = ndpScan->handledBlock / PARALLEL_SCAN_GAP_AU_ALIGNED;
+    if (scan->dop > 1 && end > (curAUAligned + 1) * PARALLEL_SCAN_GAP_AU_ALIGNED) {
+        end = (curAUAligned + 1) * PARALLEL_SCAN_GAP_AU_ALIGNED;
+    }
+}
+
+/*
+ * Returns true when the main fork block `page` of `node` has an entry in the
+ * buffer mapping table, false otherwise. The lookup is made under the
+ * mapping partition lock in shared mode; the buffer itself is not pinned.
+ */
+bool IsPageHitDms(RelFileNode& node, BlockNumber page)
+{
+    int bufId = 0;
+    BufferTag newTag;
+
+    INIT_BUFFERTAG(newTag, node, MAIN_FORKNUM, page);
+    uint32 new_hash = BufTableHashCode(&newTag);
+    LWLock *new_partition_lock = BufMappingPartitionLock(new_hash);
+    /* see if the block is in the buffer pool already */
+    (void)LWLockAcquire(new_partition_lock, LW_SHARED);
+    bufId = BufTableLookup(&newTag, new_hash);
+    LWLockRelease(new_partition_lock);
+    if (bufId != -1) {
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Copy one clog page (BLCKSZ bytes) into pageBuffer.
+ * pageBuffer must provide at least BLCKSZ bytes.
+ */
+void CopyCLog(int64 pageno, char *pageBuffer)
+{
+    int slotno;
+    errno_t rc = EOK;
+
+    /* lock is acquired by SimpleLruReadPage_ReadOnly */
+    slotno = SimpleLruReadPage_ReadOnly(ClogCtl(pageno), pageno, FirstNormalTransactionId);
+    rc = memcpy_s(pageBuffer, BLCKSZ, ClogCtl(pageno)->shared->page_buffer[slotno], BLCKSZ);
+    securec_check(rc, "", "");
+
+    LWLockRelease(ClogCtl(pageno)->shared->control_lock);
+}
+
+/*
+ * Copy one csnlog page (BLCKSZ bytes) into pageBuffer while holding the
+ * csnlog partition lock in shared mode.
+ */
+void CopyCSNLog(int64 pageno, char *pageBuffer)
+{
+    int slotno;
+    errno_t rc = EOK;
+
+    CSN_LWLOCK_ACQUIRE(pageno, LW_SHARED);
+    slotno = SimpleLruReadPage_ReadOnly_Locked(CsnlogCtl(pageno), pageno, FirstNormalTransactionId);
+    rc = memcpy_s(pageBuffer, BLCKSZ, CsnlogCtl(pageno)->shared->page_buffer[slotno], BLCKSZ);
+    securec_check(rc, "", "");
+    CSN_LWLOCK_RELEASE(pageno);
+}
+
+/*
+ * Fill the IO request of this slot for one AU and build its page bitmap.
+ * Returns the number of bits set, i.e. the number of pages of the AU that
+ * are not found in the buffer pool. Reads startBlockNum, so the start block
+ * has to be set on the slot beforehand.
+ */
+int NdpIoSlot::SetReq(RelFileNode& node, uint16 taskId, uint16 tableId, AuInfo& auinfo)
+{
+    int bitCount = 0;
+
+    reqMsg.data = &req;
+    reqMsg.len = sizeof(NdpIORequest);
+    respMsg.data = nullptr;
+    respMsg.len = 0;
+
+    req.taskId = taskId;
+    req.tableId = tableId;
+    req.auInfos[0].phyStartBlockNum =
auinfo.phyStartBlockNum;
+    req.auInfos[0].pageNum = auinfo.pageNum;
+#ifdef GlobalCache
+    errno_t rc2 = memcpy_s(&req.auInfos[0].object, sizeof(CephObject), &auinfo.object, sizeof(CephObject));
+    securec_check(rc2, "", "");
+#endif
+    errno_t rc = memset_s(req.pageMap, BITMAP_SIZE_PER_AU_BYTE, 0, BITMAP_SIZE_PER_AU_BYTE);
+    securec_check(rc, "", "");
+
+    if (SS_STANDBY_MODE) {
+        /* the standby lookup is disabled: the bitmap stays empty and 0 is returned */
+        // SSIsPageHitDms(node, startBlockNum, auinfo.pageNum, req.pageMap, &bitCount);
+    } else {
+        /* one bit per page of the AU, set when the page is NOT cached locally */
+        for (uint32 i = startBlockNum, offset = 0; i != startBlockNum + auinfo.pageNum; ++i, ++offset) {
+            bool cached = IsPageHitDms(node, i);
+            if (!cached) {
+                NDPSETBIT(req.pageMap, offset);
+                ++bitCount;
+            }
+        }
+    }
+    return bitCount;
+}
+
+/*
+ * Prepare the response message of the slot.
+ * With ENABLE_SSL no buffer is attached here; otherwise one buffer of
+ * DSS_DEFAULT_AU_SIZE is taken from the shared page queue.
+ * Returns ALLOC_RESPONSE_MEMORY_FAILED when the queue has no buffer left.
+ * pageNum is currently unused.
+ */
+NdpRetCode NdpIoSlot::SetResp(int pageNum)
+{
+#ifdef ENABLE_SSL
+    respMsg.len = 0;
+    respMsg.data = nullptr;
+    return NdpRetCode::NDP_OK;
+#else
+    respMsg.len = DSS_DEFAULT_AU_SIZE;
+    if (g_ndp_instance.pageContext->Dequeue(respMsg.data)) {
+        resp = reinterpret_cast<NdpIOResponse*>(respMsg.data);
+        return NdpRetCode::NDP_OK;
+    }
+    return NdpRetCode::ALLOC_RESPONSE_MEMORY_FAILED;
+#endif
+}
+
+/*
+ * Decode the response stored in the slot.
+ * start always receives the start block of the slot. map, pages and pageNum
+ * are reset first and only filled when the response is usable; pages stays
+ * nullptr when the server returned no ndp page.
+ * Returns NDP_OK, the recorded callback error, NDP_RETURN_FAILED when no
+ * response buffer exists, or NDP_RETURN_STATUS_ERROR when the server
+ * reported a non zero status.
+ */
+NdpRetCode NdpIoSlot::GetResp(NdpPageHeader& pages, int& pageNum, BlockNumber& start, uint64*& map)
+{
+    start = startBlockNum;
+    map = nullptr;
+    pages = nullptr;
+    pageNum = 0;
+
+    if (respRet != NdpRetCode::NDP_OK) {
+        ereport(WARNING, (errmsg("rpc response status is illegal %d, RPC status %d",
+            static_cast<int>(respRet), static_cast<int>(rpcStatus))));
+        return respRet;
+    }
+
+    auto rpcResp = reinterpret_cast<NdpIOResponse*>(respMsg.data);
+    if (rpcResp == nullptr) {
+        ereport(WARNING, (errmsg("rpc response is null.")));
+        return NdpRetCode::NDP_RETURN_FAILED;
+    }
+#ifdef ENABLE_SSL
+    /* remember the buffer so that FreeResp releases it */
+    resp = reinterpret_cast<NdpIOResponse*>(respMsg.data);
+#endif
+    if (rpcResp->status != 0) {
+        /* report through the pointer that was checked above */
+        ereport(WARNING, (errmsg("backend handle IO failed, status is %u.", rpcResp->status)));
+        return NdpRetCode::NDP_RETURN_STATUS_ERROR;
+    }
+
+    map = rpcResp->pageMap;
+    pageNum = rpcResp->ndpPageNums;
+    if (pageNum) {
+        /* the ndp pages follow the fixed size response header */
+        pages = (NdpPageHeader)((char*)rpcResp +
sizeof(NdpIOResponse));
+    }
+
+    return NdpRetCode::NDP_OK;
+}
+
+/*
+ * Give the response buffer back: free() with ENABLE_SSL, otherwise return it
+ * to the shared page queue. resp and respMsg.data refer to the same buffer
+ * once a response was attached, so both are cleared together; a second call
+ * is then a no-op instead of a double release.
+ */
+void NdpIoSlot::FreeResp()
+{
+#ifdef ENABLE_SSL
+    if (resp != nullptr) {
+        if ((void*)respMsg.data == (void*)resp) {
+            respMsg.data = nullptr;
+        }
+        free(resp);
+        resp = nullptr;
+        return;
+    }
+    if (respMsg.data) {
+        free(respMsg.data);
+        respMsg.data = nullptr;
+        return;
+    }
+#else
+    bool enqueued;
+    Assert(g_ndp_instance.pageContext);
+    if (resp) {
+        if ((void*)respMsg.data == (void*)resp) {
+            respMsg.data = nullptr;
+        }
+        enqueued = g_ndp_instance.pageContext->Enqueue(resp);
+        if (!enqueued) {
+            ereport(WARNING, (errmsg("try enqueue slot memory to queue failed")));
+        }
+        resp = nullptr;
+        return;
+    }
+    if (respMsg.data) {
+        enqueued = g_ndp_instance.pageContext->Enqueue(respMsg.data);
+        if (!enqueued) {
+            ereport(WARNING, (errmsg("try enqueue slot memory to queue failed")));
+        }
+        respMsg.data = nullptr;
+        return;
+    }
+#endif
+}
+
+/*
+ * Set up a scan descriptor for one scan thread: the async queues (only with
+ * NDP_ASYNC_RPC), a private memory context and the links to the scan and
+ * its plan condition.
+ * Returns NDP_OK, ALLOC_MQ_FAILED or ALLOC_MC_FAILED. Nothing is released
+ * here on failure; the destructor does that.
+ */
+NdpRetCode NdpScanDescData::Init(ScanState* sstate, TableScanDesc sscan)
+{
+    curIO = nullptr;  // necessary, because rescan may be before get next
+
+    // free everything in destructor if alloc failed
+#ifdef NDP_ASYNC_RPC
+    pg_atomic_init_u32(&reqCount, 0);
+    pg_atomic_init_u32(&respCount, 0);
+
+    respIO = new MpmcBoundedQueue<NdpIoSlot*>(NDP_PAGE_QUEUE_SIZE);
+    if (respIO == nullptr) {
+        ereport(WARNING, (errmsg("Alloc NpdPage Queue failed, size %d", NDP_PAGE_QUEUE_SIZE)));
+        return NdpRetCode::ALLOC_MQ_FAILED;
+    }
+    normalPagesId = new MpmcBoundedQueue<int>(NDP_NORMAL_QUEUE_SIZE);
+    if (normalPagesId == nullptr) {
+        ereport(WARNING, (errmsg("Alloc Normal Queue failed, size %d", NDP_NORMAL_QUEUE_SIZE)));
+        return NdpRetCode::ALLOC_MQ_FAILED;
+    }
+#endif
+
+    memCtx = AllocSetContextCreate(CurrentMemoryContext, "ThreadNdpScanContext", 0, (4u << 10), (4u << 10));
+    if (!memCtx) {
+        ereport(WARNING, (errmsg("Create ThreadNdpScanContext failed!")));
+        return NdpRetCode::ALLOC_MC_FAILED;
+    }
+
+#ifdef FAULT_INJECT
+    if ((rand() % PERCENTAGE_DIV) < PERCENTAGE) {
+        ereport(WARNING, (errmsg("Fault inject -- Create ThreadNdpScanContext failed")));
+        return NdpRetCode::ALLOC_MC_FAILED;
+    }
+#endif
+
+    cond =
(NdpScanCondition*)sstate->ps.plan->ndp_pushdown_condition;
+    scan = sscan;
+    scanState = sstate;
+    aggState = NULL;
+    aggSlot = NULL;
+
+    return NdpRetCode::NDP_OK;
+}
+
+/*
+ * Release the queues (deleting every slot still waiting in the response
+ * queue) and the private memory context.
+ */
+NdpScanDescData::~NdpScanDescData()
+{
+#ifdef NDP_ASYNC_RPC
+    if (respIO) {
+        NdpIoSlot* tmpIO = nullptr;
+        while (respIO->Dequeue(tmpIO)) {
+            delete tmpIO;
+        }
+        delete respIO;
+    }
+    if (normalPagesId) {
+        delete normalPagesId;
+    }
+#endif
+    if (memCtx) {
+        MemoryContextDelete(memCtx);
+    }
+}
+
+/*
+ * Bring the descriptor back to its start state (used for rescan).
+ * With NDP_ASYNC_RPC this first waits until every request sent so far has
+ * been answered, then drops the current slot, all queued responses and all
+ * queued normal page ids.
+ */
+void NdpScanDescData::Reset()
+{
+    handledBlock = 0;
+    curLinesNum = 0;
+    nextLineIndex = 0;
+
+#ifdef NDP_ASYNC_RPC
+    // add timestamp if we don't want to wait, and remember release all while deleting.
+    /* busy-wait: the callbacks still reference this descriptor until respCount catches up */
+    while (pg_atomic_read_u32(&respCount) < pg_atomic_read_u32(&reqCount)) {
+        pg_usleep(NDP_RPC_WAIT_USEC);
+    }
+#endif
+
+    FreeCurSlot();
+
+#ifdef NDP_ASYNC_RPC
+    NdpIoSlot* tmpIO = nullptr;
+    int tmpId;
+
+    while (respIO->Dequeue(tmpIO)) {
+        delete tmpIO;
+    }
+    /* empty loop body on purpose: just drain the queue */
+    while (normalPagesId->Dequeue(tmpId));
+#else
+    normalPagesNum = 0;
+#endif
+
+    MemoryContextReset(memCtx);
+}
+
+/* Add every block of [start, end) to the normal page list; both bounds are clamped to nBlock. */
+void NdpScanDescData::AddToNormal(uint32 start, uint32 end)
+{
+    end = end > nBlock ? nBlock : end;
+    start = start > nBlock ? nBlock : start;
+    for (uint32 i = start; i < end; ++i) {
+        AddToNormal(i);
+    }
+}
+
+// return true if set current io slot;
+bool NdpScanDescData::HandleSlot(NdpIoSlot* slot)
+{
+    uint32 start;
+    uint64* pageMap;
+    NdpPageHeader pages;
+    int pageNum;
+
+    NdpRetCode ret = slot->GetResp(pages, pageNum, start, pageMap);
+    if (ret != NdpRetCode::NDP_OK) {
+        ++failedIoN;
+    }
+
+    if (start > nBlock) {
+        ereport(ERROR, (errmsg("can not happen start %u is cross the border.", start)));
+        return false;
+    }
+
+    uint32 end = start + slot->GetPushDownPageNum();
+    if (pageMap == nullptr) {  // slot is invalid, add to normal list
+        AddToNormal(start, end);
+        return false;
+    }
+
+    // put unhandled page to normal queue
+    int count = 0;
+    end = end > nBlock ? nBlock : end;
+    start = start > nBlock ?
nBlock : start;
+    /* a cleared bit in the returned map means the server did not handle that page */
+    for (uint32 offset = start, i = 0; offset < end; ++i, ++offset) {
+        int flag = NDPGETBIT(pageMap, i);
+        if (!flag) {
+            AddToNormal(offset);
+            count++;
+        }
+    }
+    sendBackPageN += count;
+
+    if (scanState->ps.instrument) {
+        scanState->ps.instrument->ndp_sendback_page += count;
+    }
+
+    if (pages) {
+        /* the slot becomes the current one and owns the pages until it is freed */
+        curIO = slot;
+        curNdpPages = pages;
+        curNdpPagesNum = pageNum;
+        nextNdpPageIndex = 0;
+        if (pages->pd_flags == NDP_FILTERED_PAGE) {
+            ndpPageScanN += pageNum;
+        } else {
+            ndpPageAggN += pageNum;
+        }
+        if (scanState->ps.instrument) {
+            scanState->ps.instrument->ndp_handled += pageNum;
+        }
+        return true;
+    } else {
+        return false;
+    }
+}
+
+#ifdef NDP_ASYNC_RPC
+/*
+ * Take one answered slot from the response queue and handle it.
+ * Returns true when the slot became the current slot; a slot that carries no
+ * ndp page is deleted here and false is returned, as it is when the queue is empty.
+ */
+bool NdpScanDescData::GetNextSlot(void)
+{
+    NdpIoSlot* slot = nullptr;
+    if (respIO->Dequeue(slot)) {
+        if (HandleSlot(slot)) {
+            return true;
+        } else {
+            delete slot;
+            return false;
+        }
+    }
+    return false;
+}
+#endif
+
+#ifdef NDP_ASYNC_RPC
+/*
+ * Completion callback of an asynchronous IO request.
+ * arg is the NdpIoSlot that was sent; its private pointer is the owning scan
+ * descriptor. A failed status is recorded on the slot, the slot is pushed to
+ * the response queue of the descriptor (retrying while the queue is full)
+ * and respCount is incremented.
+ */
+void NdpIoSlotCallDone(RpcStatus status, void *arg)
+{
+    NdpIoSlot* cbArg = reinterpret_cast<NdpIoSlot*>(arg);
+    if (!cbArg) {
+        return;
+    }
+
+    NdpScanDesc ndpScan = reinterpret_cast<NdpScanDesc>(cbArg->GetPriv());
+    if (ndpScan == nullptr) {
+        return;
+    }
+
+#ifdef FAULT_INJECT
+    if ((rand() % PERCENTAGE_DIV) < PERCENTAGE) {
+        status = RPC_ERROR;
+    }
+#endif
+
+    if (status != RPC_OK) {
+        cbArg->SetRespRet(NdpRetCode::RPC_IO_CALLBACK_ERROR);
+        cbArg->SetRPCRet(status);
+    }
+
+    while (!ndpScan->respIO->Enqueue(cbArg)) {
+        pg_usleep(NDP_RPC_WAIT_USEC);
+    }
+
+    /* counted only after the slot is queued, Reset() waits on this counter */
+    pg_atomic_add_fetch_u32(&ndpScan->respCount, 1);
+}
+#endif
+
+/*
+ * Initialise a channel towards one ndp server.
+ * id: rpc id of the channel, ip: server address (copied into rpcIp),
+ * tableN: number of per table managers to allocate.
+ * Returns false when the table managers cannot be allocated.
+ */
+bool NdpScanChannel::Init(uint32 id, char* ip, uint32 tableN)
+{
+    rpcId = id;
+    errno_t rc = strcpy_s(rpcIp, NDP_RPC_IP_LEN, ip);
+    securec_check(rc, "", "");
+
+    status = NdpScanChannelStatus::UNCONNECTED;
+    pthread_mutex_init(&mutex, nullptr);
+
+    rpcClient = 0;
+    queryId = 0;
+    tableNum = tableN;
+    tableMgr = New(CurrentMemoryContext) NdpTableMgr[tableNum]();
+    if (tableMgr == nullptr) {
+        return false;
+    }
+    connFailed = 0;
+    cmdFailed = 0;
+    return true;
+}
+
+/*
+ * Send one IO request through this channel, bringing the channel up first
+ * when needed (connect, then send the query context).
+ * The set-up is guarded by a trylock: a caller that does not get the mutex
+ * gets CONNECT_UNUSABLE instead of waiting.
+ */
+NdpRetCode NdpScanChannel::SendRequest(NdpIoSlot* req, NdpScanDesc ndpScan)
+{
+    if (status == NdpScanChannelStatus::CLOSED) {
+        return NdpRetCode::CONNECT_UNUSABLE;
+    }
+    if (req->GetReqTableId() >= tableNum) {
+        ereport(WARNING, (errmsg("table id %u should be littler then %u.", req->GetReqTableId(), tableNum)));
+        return NdpRetCode::TABLE_ID_INVALID;
+    }
+
+    /* fast path without the mutex once the query context is on the server */
+    if (status == NdpScanChannelStatus::QUERYSENT) {
+        return SendReq(req, ndpScan);
+    }
+
+    if (pthread_mutex_trylock(&mutex) != 0) {
+        return NdpRetCode::CONNECT_UNUSABLE;
+    }
+
+    NdpRetCode ret = NdpRetCode::NDP_OK;
+    /* the cases below fall through on success: UNCONNECTED -> CONNECTED -> QUERYSENT */
+    switch (status) {
+        case NdpScanChannelStatus::UNCONNECTED: {
+            // free old connection
+            DisconnectRpc();
+            // call rpc connect
+            RpcStatus connectRet = RpcClientConnect(rpcIp, u_sess->ndp_cxt.ndp_port, rpcClient);
+            if (SECUREC_UNLIKELY(connectRet != RPC_OK)) {
+                ++connFailed;
+                DisconnectRpc();
+                /* give up on this channel after RPC_FAILED_LIMIT failed connects */
+                if (connFailed >= RPC_FAILED_LIMIT) {
+                    status = NdpScanChannelStatus::CLOSED;
+                }
+                ereport(LOG, (errmsg("rpc connect (count:%d) failed, ip:port %s:%d.
rpc status: %d",
+                    connFailed, rpcIp, u_sess->ndp_cxt.ndp_port, connectRet)));
+                ret = NdpRetCode::CONNECT_FAILED;
+                break;
+            }
+            status = NdpScanChannelStatus::CONNECTED;
+        }
+        /* fallthrough */
+        case NdpScanChannelStatus::CONNECTED: {
+            PG_TRY();
+            {
+                ret = SendQuery(ndpScan);
+            }
+            PG_CATCH();
+            {
+                /* do not leave the channel mutex locked when the error propagates */
+                ereport(WARNING, (errmsg("send query failed, it is possible a palloc failed.")));
+                pthread_mutex_unlock(&mutex);
+                PG_RE_THROW();
+            }
+            PG_END_TRY();
+
+            if (SECUREC_UNLIKELY(ret != NdpRetCode::NDP_OK)) {
+                ++cmdFailed;
+                if (cmdFailed >= RPC_FAILED_LIMIT) {
+                    status = NdpScanChannelStatus::CLOSED;
+                }
+                break;
+            }
+            status = NdpScanChannelStatus::QUERYSENT;
+        }
+        /* fallthrough */
+        case NdpScanChannelStatus::QUERYSENT: {
+            pthread_mutex_unlock(&mutex);
+            return SendReq(req, ndpScan);
+        }
+        case NdpScanChannelStatus::CLOSED:
+        default: {
+            ret = NdpRetCode::CONNECT_UNUSABLE;
+            break;
+        }
+    }
+    pthread_mutex_unlock(&mutex);
+    return ret;
+}
+
+/*
+ * Tell the server that the query is finished (NDP_TERMINATE) and mark the
+ * channel CLOSED. Only done for a channel in QUERYSENT state; any other
+ * state yields CONNECT_UNUSABLE.
+ */
+NdpRetCode NdpScanChannel::SendEnd()
+{
+    NdpRetCode retCode = NdpRetCode::NDP_OK;
+    if (status != NdpScanChannelStatus::QUERYSENT) {
+        /* If all pages are cached in BufferPool, the channel might be UNCONNECTED */
+        ereport(LOG, (errmsg("channel %s is not QUERY_SENT.", rpcIp)));
+        return NdpRetCode::CONNECT_UNUSABLE;
+    }
+
+    pthread_mutex_lock(&mutex);
+    /* re-check under the mutex, the state may have changed in between */
+    if (status == NdpScanChannelStatus::QUERYSENT) {
+        NdpAdminRequest req;
+        NdpAdminResponse resp;
+        req.head.command = NDP_TERMINATE;
+        req.head.size = sizeof(NdpAdminRequest);
+        req.taskId = queryId;
+        req.tableId = 0;
+        NdpRetCode ret = SendAdminReq(&req, &resp, sizeof(resp.ret));
+        if (ret != NdpRetCode::NDP_OK) {
+            retCode = NdpRetCode::RPC_ADMIN_SEND_TERMINATE_FAILED;
+        } else {
+            status = NdpScanChannelStatus::CLOSED;
+        }
+    }
+    pthread_mutex_unlock(&mutex);
+    return retCode;
+}
+
+/*
+ * Send an IO request for one table. When the table state has not reached the
+ * server yet, the plan and plan state are sent first under the per table
+ * mutex (trylock, TABLE_MGR_UNUSABLE when it is busy).
+ */
+NdpRetCode NdpScanChannel::SendReq(NdpIoSlot* req, NdpScanDesc ndpScan)
+{
+    NdpTableMgr* mgr = &tableMgr[req->GetReqTableId()];
+    if (mgr->status == NdpTableStatus::CONSTRUCTFAIL) {
+        return NdpRetCode::NDP_CONSTRUCT_FAILED;
+    }
+
if (mgr->status == NdpTableStatus::STATESENT) {
+        return SendIo(req, ndpScan);
+    }
+    if (mgr->cmdNdpFailed >= RPC_FAILED_LIMIT) {
+        return NdpRetCode::RPC_ADMIN_SEND_FAIL;
+    }
+
+    if (pthread_mutex_trylock(&mgr->mutex) != 0) {
+        return NdpRetCode::TABLE_MGR_UNUSABLE;
+    }
+
+    /* set before the try block so that every path leaves a defined value */
+    NdpRetCode ret = NdpRetCode::NDP_ERROR;
+    MemoryContext oldCtx = CurrentMemoryContext;
+    PG_TRY();
+    {
+        ret = SendAdmin(mgr, req, ndpScan);
+    }
+    PG_CATCH();
+    {
+        /*
+         * The error is absorbed here instead of being re-thrown, so the
+         * memory context is restored and the error stack is cleared.
+         */
+        (void)MemoryContextSwitchTo(oldCtx);
+        FlushErrorState();
+        ereport(WARNING, (errmsg("send failed, it is possible a palloc failed.")));
+        ret = NdpRetCode::NDP_ERROR;
+    }
+    PG_END_TRY();
+
+    pthread_mutex_unlock(&mgr->mutex);
+    if (mgr->status == NdpTableStatus::STATESENT) {
+        return SendIo(req, ndpScan);
+    }
+
+    return ret;
+}
+
+/*
+ * Drive the per table set-up: INITIAL -> PLANSENT -> STATESENT.
+ * The cases fall through on success. A construct failure moves the table to
+ * CONSTRUCTFAIL; any other failure keeps the state and bumps cmdNdpFailed.
+ * req is not used here.
+ */
+NdpRetCode NdpScanChannel::SendAdmin(NdpTableMgr* mgr, NdpIoSlot* req, NdpScanDesc ndpScan)
+{
+    NdpRetCode ret = NdpRetCode::NDP_OK;
+    switch (mgr->status) {
+        case NdpTableStatus::INITIAL: {
+            ret = SendPlan(ndpScan);
+            if (ret != NdpRetCode::NDP_OK) {
+                mgr->status = (ret == NdpRetCode::NDP_CONSTRUCT_FAILED) ?
+                    NdpTableStatus::CONSTRUCTFAIL : mgr->status;
+                break;
+            }
+            mgr->status = NdpTableStatus::PLANSENT;
+        }
+        /* fallthrough */
+        case NdpTableStatus::PLANSENT: {
+            ret = SendState(ndpScan);
+            if (ret != NdpRetCode::NDP_OK) {
+                mgr->status = (ret == NdpRetCode::NDP_CONSTRUCT_FAILED) ?
+                    NdpTableStatus::CONSTRUCTFAIL : mgr->status;
+                break;
+            }
+            mgr->status = NdpTableStatus::STATESENT;
+        }
+        /* fallthrough */
+        case NdpTableStatus::STATESENT: {
+            ret = NdpRetCode::NDP_OK;
+            break;
+        }
+        default: {
+            ret = NdpRetCode::NDP_ERROR;
+            break;
+        }
+    }
+    if (ret != NdpRetCode::NDP_OK) {
+        ++mgr->cmdNdpFailed;
+    }
+    return ret;
+}
+
+/*
+ * Handshake with the server: send the version request, then the query
+ * context. On success queryId is taken from the response.
+ * A failed version request disconnects and closes the channel.
+ */
+NdpRetCode NdpScanChannel::SendQuery(NdpScanDesc ndpScan)
+{
+    NdpAdminRequest *v = ConstructVersion();
+    NdpAdminResponse resp;
+    NdpRetCode versionRet = SendAdminReq(v, &resp, sizeof(NdpAdminResponse));
+    pfree(v);
+    if (versionRet != NdpRetCode::NDP_OK) {
+        DisconnectRpc();
+        status = NdpScanChannelStatus::CLOSED;
+        ereport(LOG, (errmsg("send version admin (count:%d) request failed, ip:port %s:%d.",
+            cmdFailed, rpcIp, u_sess->ndp_cxt.ndp_port)));
+        return NdpRetCode::RPC_ADMIN_SEND_VERSION_FAILED;
+    }
+
+    NdpAdminRequest *query;
+    resp = {};
+    query = ConstructQuery(ndpScan);
+    NdpRetCode queryRet = SendAdminReq(query, &resp, sizeof(NdpAdminResponse));
+    pfree(query);
+    if (queryRet != NdpRetCode::NDP_OK) {
+        ereport(LOG, (errmsg("send admin (count:%d) request failed, ip:port %s:%d.",
+            cmdFailed, rpcIp, u_sess->ndp_cxt.ndp_port)));
+        return NdpRetCode::RPC_ADMIN_SEND_CTX_FAILED;
+    }
+    queryId = (uint16)resp.queryId;
+    return NdpRetCode::NDP_OK;
+}
+
+/*
+ * Serialize the pushed down plan and send it.
+ * Returns NDP_CONSTRUCT_FAILED when the request cannot be built and
+ * RPC_ADMIN_SEND_PLAN_FAILED for every send or server side failure.
+ */
+NdpRetCode NdpScanChannel::SendPlan(NdpScanDesc ndpScan)
+{
+    NdpAdminRequest* planReq = ConstructPlanReq(ndpScan);
+    if (planReq == nullptr) {
+        return NdpRetCode::NDP_CONSTRUCT_FAILED;
+    }
+    NdpAdminResponse resp;
+    NdpRetCode ret = SendAdminReq(planReq, &resp, sizeof(resp.ret));
+    pfree(planReq);
+    if (ret != NdpRetCode::NDP_OK) {
+        ret = NdpRetCode::RPC_ADMIN_SEND_PLAN_FAILED;
+    }
+    return ret;
+}
+
+/*
+ * Send one admin request and wait for its response.
+ * Returns RPC_ADMIN_SEND_FAIL when the rpc layer fails and NDP_ERROR when the
+ * server answered with a return code other than NDP_OK.
+ */
+NdpRetCode NdpScanChannel::SendAdminReq(NdpAdminRequest* req, NdpAdminResponse* resp, size_t size)
+{
+    /* note: this local hides the channel member of the same name */
+    RpcStatus status = RpcSendAdminReq(req, resp, size, rpcClient);
+    if (status != RPC_OK) {
+        ereport(LOG, (errmsg("RpcSendAdminReq failed.
CMD code:%d, Rpc code: %d", req->head.command, status)));
+        return NdpRetCode::RPC_ADMIN_SEND_FAIL;
+    } else if (resp->ret != NDP_OK) {
+        ereport(LOG, (errmsg("AdminReq handle failed.")));
+        return NdpRetCode::NDP_ERROR;
+    }
+
+    return NdpRetCode::NDP_OK;
+}
+
+/*
+ * Serialize the executor state needed by the server and send it.
+ * Returns NDP_CONSTRUCT_FAILED when the request cannot be built and
+ * RPC_ADMIN_SEND_STATE_FAILED for every send or server side failure.
+ */
+NdpRetCode NdpScanChannel::SendState(NdpScanDesc ndpScan)
+{
+    NdpAdminRequest* state = ConstructPlanState(ndpScan);
+    if (state == nullptr) {
+        return NdpRetCode::NDP_CONSTRUCT_FAILED;
+    }
+    NdpAdminResponse resp;
+    NdpRetCode ret = SendAdminReq(state, &resp, sizeof(resp.ret));
+    pfree(state);
+    if (ret != NdpRetCode::NDP_OK) {
+        ret = NdpRetCode::RPC_ADMIN_SEND_STATE_FAILED;
+    }
+    return ret;
+}
+
+/*
+ * Send the IO request of a slot. The task id of the slot is replaced by the
+ * query id of this channel first. Once SEND_FAILED_LIMIT sends have failed
+ * for the table nothing is sent any more.
+ * With NDP_ASYNC_RPC the answer arrives through NdpIoSlotCallDone.
+ */
+NdpRetCode NdpScanChannel::SendIo(NdpIoSlot* req, NdpScanDesc ndpScan)
+{
+    req->SetReq(queryId);
+    NdpTableMgr* mgr = &tableMgr[req->GetReqTableId()];
+    if (mgr->ioFailed >= SEND_FAILED_LIMIT) {
+        return NdpRetCode::RPC_IO_SEND_FAILED;
+    }
+
+#ifdef NDP_ASYNC_RPC
+    RpcCallDone callDone = {.cb = &NdpIoSlotCallDone, .arg = (void*)req };
+    pg_atomic_add_fetch_u32(&ndpScan->reqCount, 1);  // before send
+    RpcStatus ret = RpcSendIOReq(req->GetReqMsg(), req->GetRespMsg(), &callDone, rpcClient);
+    if (ret != RPC_OK) {
+        pg_atomic_sub_fetch_u32(&ndpScan->reqCount, 1);  // nothing was sent, take the count back
+        mgr->ioFailed++;
+        return NdpRetCode::RPC_IO_SEND_FAILED;
+    }
+
+#else
+    RpcStatus ret = RpcSendIOReq(req->GetReqMsg(), req->GetRespMsg(), NULL, rpcClient);
+    if (ret != RPC_OK) {
+        mgr->ioFailed++;
+        return NdpRetCode::RPC_IO_SEND_FAILED;
+    }
+#endif
+    return NdpRetCode::NDP_OK;
+}
+
+/*
+ * Build the NDP_PLAN request: the request header followed by the plan
+ * serialized with nodeToString, terminator included.
+ * Returns a palloc'd request owned by the caller, or nullptr when the
+ * serialized plan is empty.
+ */
+NdpAdminRequest* NdpScanChannel::ConstructPlanReq(NdpScanDesc ndpScan)
+{
+    char* str = nodeToString(ndpScan->cond->plan);
+    int len = strlen(str) + 1;
+    if (len == 1) {
+        pfree(str);
+        return nullptr;
+    }
+
+    NdpAdminRequest* req = reinterpret_cast<NdpAdminRequest*>(palloc(sizeof(NdpAdminRequest) + len));
+    req->head.command = NDP_PLAN;
+    req->head.size = sizeof(NdpAdminRequest) + len;
+    req->taskId = queryId;
+    req->tableId = ndpScan->cond->tableId;
+    errno_t rc =
memcpy_s(reinterpret_cast<char*>(req + 1), len, str, len);
+    securec_check(rc, "", "");
+    /* the serialized plan now lives inside the request */
+    pfree(str);
+    return req;
+}
+
+/*
+ * Copy the parts of a tuple descriptor that travel to the server.
+ * td->attrs is palloc'd (NULL for a descriptor without attributes) and is
+ * owned by the caller. Always returns true.
+ */
+bool NdpScanChannel::ExtractTupleDesc(TupleDesc desc, NdpTupleDesc* td)
+{
+    td->natts = desc->natts;
+    if (td->natts == 0) {
+        td->attrs = NULL;
+        return true;
+    }
+    td->attrs = (NdpPGAttr *)palloc(sizeof(NdpPGAttr) * td->natts);
+    for (int i = 0; i < desc->natts; ++i) {
+        td->attrs[i].attlen = desc->attrs[i].attlen;
+        td->attrs[i].attbyval = desc->attrs[i].attbyval;
+        td->attrs[i].attcacheoff = desc->attrs[i].attcacheoff;
+        td->attrs[i].attalign = desc->attrs[i].attalign;
+        td->attrs[i].attndims = desc->attrs[i].attndims;
+        td->attrs[i].attstorage = desc->attrs[i].attstorage;
+    }
+    td->tdhasoid = desc->tdhasoid;
+    td->tdhasuids = desc->tdhasuids;
+    td->tdtypeid = desc->tdtypeid;
+    td->tdtypmod = desc->tdtypmod;
+    return true;
+}
+
+/* Copy the relfilenode and the tuple descriptor of the scanned relation. */
+bool NdpScanChannel::ExtractRelation(TableScanDesc scan, NdpRelation* rel)
+{
+    rel->node.spcNode = scan->rs_rd->rd_node.spcNode;
+    rel->node.dbNode = scan->rs_rd->rd_node.dbNode;
+    rel->node.relNode = scan->rs_rd->rd_node.relNode;
+    rel->node.bucketNode = scan->rs_rd->rd_node.bucketNode;
+    rel->node.opt = scan->rs_rd->rd_node.opt;
+
+    return ExtractTupleDesc(scan->rs_rd->rd_att, &rel->att);
+}
+
+/*
+ * Collect the visibility information the server needs: the snapshot of the
+ * scan, the current transaction id, the combo cid pairs and copies of clog
+ * and csnlog pages. The three buffers are palloc'd and owned by the caller.
+ */
+bool NdpScanChannel::ExtractXact(TableScanDesc scan, NdpXact* xact)
+{
+    xact->comboCids = NULL;
+    xact->CLogPageBuffer = NULL;
+    xact->CSNLogPageBuffer = NULL;
+
+    /* snapshot */
+    NdpSnapshot* snapshot = &xact->snapshot;
+
+    snapshot->satisfies = scan->rs_snapshot->satisfies;
+    snapshot->xmin = scan->rs_snapshot->xmin; /* all XID < xmin are visible to me */
+    snapshot->xmax = scan->rs_snapshot->xmax; /* all XID >= xmax are invisible to me */
+    snapshot->snapshotcsn = scan->rs_snapshot->snapshotcsn;
+    snapshot->curcid = scan->rs_snapshot->curcid;
+
+    /* TransactionState */
+    xact->transactionId = GetCurrentTransactionIdIfAny();
+
+    /* comboCids */
+    xact->usedComboCids = u_sess->utils_cxt.usedComboCids;
+    if (xact->usedComboCids > 0) {
+
xact->comboCids = (uint32 *)palloc(xact->usedComboCids * 2 * sizeof(uint32));
+        /* every combo cid entry is copied as two uint32 values */
+        uint32 *comboCids = (uint32 *)u_sess->utils_cxt.comboCids;
+        for (int i = 0; i < xact->usedComboCids; i++) {
+            xact->comboCids[i * 2] = comboCids[i * 2];
+            xact->comboCids[i * 2 + 1] = comboCids[i * 2 + 1];
+        }
+    } else {
+        xact->comboCids = NULL;
+    }
+
+    /* clog & csnlog */
+    /* It's okay to read latestCompletedXid without acquiring ProcArrayLock shared lock
+     * because we don't care if we get a slightly stale value
+     */
+    xact->latestCompletedXid = t_thrd.xact_cxt.ShmemVariableCache->latestCompletedXid;
+    int64 pagenoStart, pagenoEnd, pageno;
+
+    /* clog: every page from the first normal xid up to latestCompletedXid is copied */
+    pagenoStart = TransactionIdToPage(FirstNormalTransactionId);
+    pagenoEnd = TransactionIdToPage(xact->latestCompletedXid);
+
+    xact->CLogLen = (pagenoEnd - pagenoStart + 1) * BLCKSZ;
+    xact->CLogPageBuffer = (char *)palloc(xact->CLogLen);
+    for (pageno = pagenoStart; pageno <= pagenoEnd; pageno++) {
+        CopyCLog(pageno, xact->CLogPageBuffer + (pageno - pagenoStart) * BLCKSZ);
+    }
+
+    /*
+     * csnlog: start and end are both the page of latestCompletedXid, so exactly one page is copied.
+     * NOTE(review): unlike the clog range above - confirm this is intended.
+     */
+    pagenoStart = TransactionIdToCSNPage(xact->latestCompletedXid);
+    pagenoEnd = TransactionIdToCSNPage(xact->latestCompletedXid);
+
+    xact->CSNLogLen = (pagenoEnd - pagenoStart + 1) * BLCKSZ;
+    xact->CSNLogPageBuffer = (char *)palloc(xact->CSNLogLen);
+    for (pageno = pagenoStart; pageno <= pagenoEnd; pageno++) {
+        CopyCSNLog(pageno, xact->CSNLogPageBuffer + (pageno - pagenoStart) * BLCKSZ);
+    }
+
+    return true;
+}
+
+/*
+ * Fill the aggregate part of the plan state. Every field is reset first, so
+ * the structure is valid (and empty) when the scan has no aggregate attached.
+ * Always returns true.
+ */
+bool NdpScanChannel::ExtractAggState(NdpScanDesc ndpScan, NdpAggState* aggS)
+{
+    aggS->aggTd.natts = 0;
+    aggS->aggTd.attrs = nullptr;
+    aggS->aggNum = 0;
+    aggS->perAggTd = nullptr;
+    aggS->numCols = 0;
+    aggS->eqFuncOid = nullptr;
+    aggS->hashFuncOid = nullptr;
+
+    if (ndpScan->aggState) {
+        Assert(ndpScan->aggSlot != nullptr);
+        ExtractTupleDesc(ndpScan->aggSlot->tts_tupleDescriptor, &aggS->aggTd);
+
+        aggS->aggNum = ndpScan->aggState->numaggs;
+        aggS->perAggTd = (NdpTupleDesc*)palloc(aggS->aggNum * sizeof(NdpTupleDesc));
+        for (int aggNo = 0; aggNo <
aggS->aggNum; ++aggNo) {
+            /* the same evaldesc is copied for every aggregate */
+            ExtractTupleDesc(ndpScan->aggState->evaldesc, &aggS->perAggTd[aggNo]);
+        }
+        Agg* agg = (Agg*)ndpScan->aggState->ss.ps.plan;
+        /* the equality / hash function oids are only sent for hashed aggregation with grouping columns */
+        if ((agg->aggstrategy == AGG_HASHED) && (agg->numCols > 0)) {
+            aggS->numCols = agg->numCols;
+            aggS->eqFuncOid = (unsigned int*)palloc(aggS->numCols * sizeof(unsigned int));
+            aggS->hashFuncOid = (unsigned int*)palloc(aggS->numCols * sizeof(unsigned int));
+            for (int colNo = 0; colNo < aggS->numCols; ++colNo) {
+                aggS->eqFuncOid[colNo] = ndpScan->aggState->phases[0].eqfunctions[colNo].fn_oid;
+                aggS->hashFuncOid[colNo] = ndpScan->aggState->hashfunctions[colNo].fn_oid;
+            }
+        }
+    }
+    return true;
+}
+
+/*
+ * Copy the external parameters of an expression context into pList.
+ * Returns false, with pList left empty (numParams 0, params nullptr), when
+ * the parameters cannot be shipped: a paramFetch hook is installed, or a
+ * parameter is a nested table while plpgsql_estate is set.
+ * On success pList->params is palloc'd and owned by the caller.
+ * econtext->ecxt_param_list_info must not be NULL.
+ */
+bool CheckExprContext(ExprContext* econtext, NdpParamList* pList)
+{
+    ParamListInfo paramInfo = econtext->ecxt_param_list_info;
+    /* callers free pList->params after a failure, so never leave it undefined */
+    pList->numParams = 0;
+    pList->params = nullptr;
+    if (paramInfo->paramFetch != NULL) {
+        return false;
+    }
+    pList->numParams = paramInfo->numParams;
+    pList->params = (NdpParamData*)palloc(pList->numParams * sizeof(NdpParamData));
+    for (int i = 0; i < pList->numParams; i++) {
+        ParamExternData* from = &paramInfo->params[i];
+        NdpParamData* to = &pList->params[i];
+        if (from->tabInfo && from->tabInfo->isnestedtable && plpgsql_estate) {
+            pfree(pList->params);
+            pList->params = nullptr;
+            pList->numParams = 0;
+            return false;
+        }
+        to->isnull = from->isnull;
+        to->ptype = from->ptype;
+        to->value = from->value;
+        get_typlenbyval(to->ptype, &to->typlen, &to->typbyval);
+        /* take a private copy of the datum */
+        to->value = datumCopy(to->value, to->typbyval, to->typlen);
+    }
+    return true;
+}
+
+/*
+ * Fill pList from the expression context of the scan. A scan without
+ * expression context or without parameters gets an empty list and true.
+ */
+bool ExtractParamList(NdpScanDesc ndpScan, NdpParamList* pList)
+{
+    ExprContext* econtext = ndpScan->scanState->ps.ps_ExprContext;
+    if (econtext == nullptr || econtext->ecxt_param_list_info == nullptr ||
+        econtext->ecxt_param_list_info->numParams == 0) {
+        pList->numParams = 0;
+        pList->params = nullptr;
+        return true;
+    }
+    return CheckExprContext(econtext, pList);
+}
+
+/* Copy the session settings sent to the server; returns false when there is no session. */
+bool ExtractKnlSessionContext(NdpSessionContext* sess)
+{
+    if (u_sess != nullptr) {
+        sess->sql_compatibility =
u_sess->attr.attr_sql.sql_compatibility;
+        sess->behavior_compat_flags = u_sess->utils_cxt.behavior_compat_flags;
+        sess->encoding = u_sess->mb_cxt.DatabaseEncoding->encoding;
+        return true;
+    }
+    return false;
+}
+
+/*
+ * Gather everything that describes the executor state of the scan: relation,
+ * scan tuple descriptor, aggregate state, parameters and session settings.
+ * Returns a palloc'd state that has to be released with DestroyPlanState, or
+ * nullptr when one of the parts cannot be extracted.
+ */
+NdpPlanState* NdpScanChannel::CreatePlanState(NdpScanDesc ndpScan)
+{
+    /* zeroed, so that DestroyPlanState only sees NULL for parts that were never filled */
+    NdpPlanState* state = (NdpPlanState*)palloc0(sizeof(NdpPlanState));
+
+    TableScanDesc scan = ndpScan->scan;
+    ExtractRelation(scan, &state->rel);
+
+    /* the projected descriptor is used when the scan projects, the scan slot descriptor otherwise */
+    ProjectionInfo* proj_info = ndpScan->scanState->ps.ps_ProjInfo;
+    if (proj_info != NULL) {
+        ExtractTupleDesc(proj_info->pi_slot->tts_tupleDescriptor, &state->scanTd);
+    } else {
+        ExtractTupleDesc(ndpScan->scanState->ss_ScanTupleSlot->tts_tupleDescriptor, &state->scanTd);
+    }
+    if (!ExtractAggState(ndpScan, &state->aggState)) {
+        DestroyPlanState(state);
+        return nullptr;
+    }
+    if (!ExtractParamList(ndpScan, &state->paramList)) {
+        DestroyPlanState(state);
+        return nullptr;
+    }
+    if (!ExtractKnlSessionContext(&state->sess)) {
+        DestroyPlanState(state);
+        return nullptr;
+    }
+    return state;
+}
+
+/* Release a plan state and every array hanging off it; nullptr is accepted. */
+void NdpScanChannel::DestroyPlanState(NdpPlanState* state)
+{
+    if (state == nullptr) return;
+    if (state->rel.att.attrs) pfree(state->rel.att.attrs);
+    if (state->scanTd.attrs) pfree(state->scanTd.attrs);
+    if (state->paramList.params != nullptr) {
+        pfree(state->paramList.params);
+    }
+
+    NdpAggState* aggS = &state->aggState;
+    if (aggS->aggTd.attrs) pfree(aggS->aggTd.attrs);
+    if (aggS->perAggTd) {
+        for (int i = 0; i < aggS->aggNum; ++i) {
+            if (aggS->perAggTd[i].attrs) pfree(aggS->perAggTd[i].attrs);
+        }
+        pfree(aggS->perAggTd);
+    }
+    if (aggS->eqFuncOid) pfree(aggS->eqFuncOid);
+    if (aggS->hashFuncOid) pfree(aggS->hashFuncOid);
+
+    pfree(state);
+}
+
+/*
+ * Build the NDP_PLANSTATE request: a request header followed by the
+ * serialized plan state. Returns the buffer of the string builder, owned by
+ * the caller, or nullptr when the plan state cannot be created.
+ */
+NdpAdminRequest* NdpScanChannel::ConstructPlanState(NdpScanDesc ndpScan)
+{
+    NdpPlanState* state = CreatePlanState(ndpScan);
+
+    if (state == nullptr) {
+        return nullptr;
+    }
+
+    StringInfoData str;
+    initStringInfo(&str);
+    /* placeholder header, copied byte for byte: keep it zeroed */
+    NdpAdminRequest head = {};
+    appendBinaryStringInfo(&str, (const
char*)&head, (int)sizeof(head));
+    stateToString(state, &str);
+    /* fill the header in place, the buffer may have moved while it grew */
+    NdpAdminRequest* ptr = (NdpAdminRequest*)str.data;
+    ptr->head.command = NDP_PLANSTATE;
+    ptr->head.size = str.len;
+    ptr->taskId = queryId;
+    ptr->tableId = ndpScan->cond->tableId;
+
+    DestroyPlanState(state);
+    return ptr;
+}
+
+/*
+ * Build the NDP_QUERY request: a request header followed by the serialized
+ * query context (table count and transaction information).
+ * Only command and size of the header are set; the remaining header bytes
+ * are zero. Returns the buffer of the string builder, owned by the caller.
+ */
+NdpAdminRequest* NdpScanChannel::ConstructQuery(NdpScanDesc ndpScan)
+{
+    NdpQuery* query = (NdpQuery*)palloc(sizeof(NdpQuery));
+    NdpContext* context = static_cast<NdpContext*>(ndpScan->cond->ctx);
+    query->tableNum = context->tableCount;
+    ExtractXact(ndpScan->scan, &query->xact);
+
+    StringInfoData str;
+    initStringInfo(&str);
+    /* zeroed so that no uninitialized stack bytes (taskId, tableId) go on the wire */
+    NdpAdminRequest head = {};
+    appendBinaryStringInfo(&str, (const char*)&head, (int)sizeof(head));
+    queryToString(query, &str);
+    NdpAdminRequest* ptr = (NdpAdminRequest*)str.data;
+    ptr->head.command = NDP_QUERY;
+    ptr->head.size = str.len;
+
+    if (query->xact.comboCids) pfree(query->xact.comboCids);
+    if (query->xact.CLogPageBuffer) pfree(query->xact.CLogPageBuffer);
+    if (query->xact.CSNLogPageBuffer) pfree(query->xact.CSNLogPageBuffer);
+    pfree(query);
+    return ptr;
+}
+
+/*
+ * Build the NDP_VERSION request. The payload is one uint64 with
+ * GRAND_VERSION_NUM in the upper 32 bits and NDP_LOCAL_VERSION_NUM in the
+ * lower bits. taskId and tableId are set to -1.
+ * Returns a palloc'd request owned by the caller.
+ */
+NdpAdminRequest* NdpScanChannel::ConstructVersion()
+{
+    int len = sizeof(uint64);
+    uint64 version = (((uint64)GRAND_VERSION_NUM) << 32) | NDP_LOCAL_VERSION_NUM;
+    NdpAdminRequest* req = reinterpret_cast<NdpAdminRequest*>(palloc(sizeof(NdpAdminRequest) + len));
+    req->head.command = NDP_VERSION;
+    req->head.size = sizeof(NdpAdminRequest) + len;
+    req->taskId = -1;
+    req->tableId = -1;
+    errno_t rc = memcpy_s(reinterpret_cast<char*>(req + 1), len, &version, len);
+    securec_check(rc, "", "");
+    return req;
+}
diff --git a/contrib/ndpplugin/ndpam.h b/contrib/ndpplugin/ndpam.h
new file mode 100644
index 000000000..3bb275861
--- /dev/null
+++ b/contrib/ndpplugin/ndpam.h
@@ -0,0 +1,298 @@
+/* -------------------------------------------------------------------------
+ * ndpam.h
+ *    prototypes for functions in contrib/ndpplugin/ndpam.cpp
+ *
+ * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd.
+ * + * IDENTIFICATION + * contrib/ndpplugin/ndpam.h + * + * ------------------------------------------------------------------------- + */ + +#ifndef NDPAM_H +#define NDPAM_H + +#include "component/thread/mpmcqueue.h" + +#include "ndpplugin.h" // for global instance config + +#define PARALLEL_SCAN_GAP_AU_ALIGNED ((unsigned)PAGE_NUM_PER_AU << 1) + +enum class NdpRetCode { + // ok + NDP_OK = 0, + // for desc + ALLOC_MC_FAILED, + ALLOC_MQ_FAILED, + // for channel + CONNECT_FAILED, + CONNECT_UNUSABLE, + RPC_ADMIN_SEND_FAIL, + TABLE_ID_INVALID, + TABLE_MGR_UNUSABLE, + RPC_ADMIN_SEND_CTX_FAILED, + RPC_ADMIN_SEND_PLAN_FAILED, + RPC_ADMIN_SEND_STATE_FAILED, + RPC_ADMIN_SEND_TERMINATE_FAILED, + RPC_ADMIN_SEND_VERSION_FAILED, + RPC_IO_SEND_FAILED, + RPC_IO_CALLBACK_ERROR, + // for memory alloc + ALLOC_RESPONSE_MEMORY_FAILED, + NDP_RETURN_FAILED, + NDP_RETURN_STATUS_ERROR, + NDP_ERROR, + NDP_CONSTRUCT_FAILED +}; + +typedef PageHeaderData NdpPageHeaderData; +typedef PageHeaderData* NdpPageHeader; + +class NdpIoSlot : public BaseObject { +public: + explicit NdpIoSlot(void* privateData) + : priv(privateData), resp(nullptr), respRet(NdpRetCode::NDP_OK), rpcStatus(RPC_OK) + { + reqMsg.data = respMsg.data = nullptr; + reqMsg.len = respMsg.len = 0; + } + ~NdpIoSlot() { FreeResp(); } + + int SetReq(RelFileNode& node, uint16 taskId, uint16 tableId, AuInfo& auinfo); + void SetReq(uint16 taskId) { req.taskId = taskId; } + NdpRetCode SetResp(int pageNum); + void SetRespRet(NdpRetCode code) { respRet = code; } + void SetStartBlockNum(uint32 start) { startBlockNum = start; } + void* GetPriv(void) { return priv; } + uint16 GetReqTableId(void) { return req.tableId; } + NdpRetCode GetResp(NdpPageHeader& pages, int& pageNum, BlockNumber& start, uint64*& map); + + RpcMessage* GetReqMsg(void) { return &reqMsg; } + RpcMessage* GetRespMsg(void) { return &respMsg; } + uint32 GetStartBlockNum(void) { return startBlockNum; } + int GetPushDownPageNum(void) {return req.auInfos[0].pageNum;}; + void 
SetRPCRet(RpcStatus rpccode) { rpcStatus = rpccode; } +private: + void FreeResp(void); + + void* priv; + RpcMessage reqMsg; + RpcMessage respMsg; + NdpIORequest req; + NdpIOResponse* resp; + NdpRetCode respRet; + uint32 startBlockNum; + RpcStatus rpcStatus; +}; + +/* + * plan <-> NdpContext + * |----ScanNode <-> NdpScanCondition + * |----Scan producer <-> NdpScanDescData + * |----Scan producer <-> NdpScanDescData + * |----ScanNode <-> NdpScanCondition + */ +typedef struct NdpContext { // for each plan tree + MemoryContext ccMem; + pthread_rwlock_t ccLock; + struct HTAB* channelCache; // for connector <=> rpc + + uint32 rpcCount; + uint32 tableCount; // for oid <=> tableId, also can do in server + knl_session_context* u_sess; // for statistics without ENABLE_THREAD_POOL +} NdpContext; + +class NdpScanDescData : public BaseObject { // for each scan thread +public: + TableScanDesc scan; + + int curPageType; + int curLinesNum; + + NdpIoSlot* curIO; // for free + NdpPageHeader curNdpPages; + int curNdpPagesNum; + int nextNdpPageIndex; + + NdpPageHeader curNdpPage; // can unify to id, if pushdown page store after global shared memory + int curNormalPageId; + + int nextTupleOffset; // for access tuple in pushdown page + int nextLineIndex; + +#ifdef NDP_ASYNC_RPC + pg_atomic_uint32 reqCount; + pg_atomic_uint32 respCount; + MpmcBoundedQueue* respIO{nullptr}; + MpmcBoundedQueue* normalPagesId{nullptr}; +#else + int normalPagesNum; + int normalPagesId[PAGE_NUM_PER_AU]; +#endif + + BlockNumber handledBlock; // number of handled block + BlockNumber nBlock; // block's number of the scan relation + + // for statistics + int sendFailedN{0}; + int failedIoN{0}; + int normalPageN{0}; + int pushDownPageN{0}; + int sendBackPageN{0}; + int ndpPageAggN{0}; + int ndpPageScanN{0}; + int rev{0}; + + NdpScanCondition* cond; // for Plan + ScanState* scanState; + AggState* aggState; + TupleTableSlot* aggSlot; + + MemoryContext memCtx{nullptr}; + + NdpScanDescData() = default; + 
~NdpScanDescData();
+    NdpRetCode Init(ScanState* sstate, TableScanDesc sscan);
+    void Reset(void);
+    void AddToNormal(uint32 start, uint32 end);
+    /*
+     * Record one block in the normal-page list and bump the normalPageN statistic.
+     * Raises ERROR when the list is full, in both the queue-based (NDP_ASYNC_RPC) and the
+     * array-based build; the array holds PAGE_NUM_PER_AU entries.
+     */
+    void AddToNormal(uint32 block)
+    {
+#ifdef NDP_ASYNC_RPC
+        if (!normalPagesId->Enqueue(block)) {
+            ereport(ERROR, (errmsg("normal page exceed limit.")));
+        }
+#else
+        /* Same limit check as the async branch: never write past normalPagesId[]. */
+        if (normalPagesNum >= (int)PAGE_NUM_PER_AU) {
+            ereport(ERROR, (errmsg("normal page exceed limit.")));
+        }
+        normalPagesId[normalPagesNum] = block;
+        normalPagesNum++;
+#endif
+        normalPageN++;
+    }
+
+    bool HandleSlot(NdpIoSlot* slot);
+#ifdef NDP_ASYNC_RPC
+    bool GetNextSlot(void);
+#endif
+    /* Delete the IO slot currently being consumed, if any, and clear the pointer. */
+    void FreeCurSlot(void)
+    {
+        if (curIO) {
+            delete curIO;
+            curIO = nullptr;
+        }
+    }
+};
+typedef NdpScanDescData* NdpScanDesc;
+
+enum class NdpScanChannelStatus{
+    UNCONNECTED = 0,
+    CONNECTED,
+    QUERYSENT,
+    CLOSED
+};
+
+enum class NdpTableStatus {
+    INITIAL = 0,
+    PLANSENT,
+    STATESENT,
+    CONSTRUCTFAIL
+};
+
+struct NdpTableMgr : public BaseObject {
+    volatile NdpTableStatus status = NdpTableStatus::INITIAL;
+    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;  // protect status
+    uint16 ioFailed = 0;
+    uint16 cmdNdpFailed = 0;
+};
+
+// connector <=> channel
+struct NdpScanChannel {
+    char rpcIp[NDP_RPC_IP_LEN];
+    uint32 rpcId;  // handle of rpc, support multi-thread
+
+    volatile NdpScanChannelStatus status;  // atomic access, called by multi-thread
+    pthread_mutex_t mutex;
+
+    RpcClient rpcClient;
+    uint16 queryId;
+
+    uint32 tableNum;
+    NdpTableMgr* tableMgr;  // status to know if condition sent
+
+    uint32 connFailed;
+    uint16 cmdFailed;
+
+    /* Status of one table on this channel; out-of-range ids report CONSTRUCTFAIL. */
+    NdpTableStatus GetTableStatus(uint16 tableId)
+    {
+        if (tableId >= tableNum) {
+            return NdpTableStatus::CONSTRUCTFAIL;
+        }
+        return tableMgr[tableId].status;
+    }
+
+    /*
+     * Tear the channel down: disconnect the RPC client, mark the channel CLOSED, release
+     * the per-table managers and destroy the channel mutex.
+     */
+    void DestroyChannel()
+    {
+        DisconnectRpc();
+        status = NdpScanChannelStatus::CLOSED;
+        if (tableMgr) {
+            delete []tableMgr;
+            /* Clear the pointer so a repeated call cannot free the array twice. */
+            tableMgr = nullptr;
+        }
+        tableNum = 0;
+        pthread_mutex_destroy(&mutex);
+    }
+    // do initialize in init instead of constructor, because allocated by Hash insert
+    bool Init(uint32 id, char* ip, uint32 tableN);
+    NdpRetCode SendRequest(NdpIoSlot* req, NdpScanDesc
ndpScan); // should support multi-thread + NdpRetCode SendEnd(); + NdpRetCode SendAdminReq(NdpAdminRequest* req, NdpAdminResponse* resp, size_t size); + + // this function can only be called under mutex locked, need atomic write status + void DisconnectRpc() + { + if (rpcClient) { + RpcClientDisconnect(rpcClient); + rpcClient = 0; + status = NdpScanChannelStatus::UNCONNECTED; + } + } + + NdpRetCode SendReq(NdpIoSlot* req, NdpScanDesc ndpScan); + NdpRetCode SendAdmin(NdpTableMgr* mgr, NdpIoSlot* req, NdpScanDesc ndpScan); + NdpRetCode SendIo(NdpIoSlot* req, NdpScanDesc ndpScan); + NdpRetCode SendQuery(NdpScanDesc ndpScan); + NdpRetCode SendPlan(NdpScanDesc ndpScan); + NdpRetCode SendState(NdpScanDesc ndpScan); + + NdpAdminRequest* ConstructPlanReq(NdpScanDesc ndpScan); + bool ExtractTupleDesc(TupleDesc desc, NdpTupleDesc* td); + bool ExtractRelation(TableScanDesc scan, NdpRelation* rel); + bool ExtractXact(TableScanDesc scan, NdpXact* xact); + bool ExtractAggState(NdpScanDesc ndpScan, NdpAggState* aggS); + NdpPlanState* CreatePlanState(NdpScanDesc ndpScan); + void DestroyPlanState(NdpPlanState* state); + NdpAdminRequest* ConstructPlanState(NdpScanDesc ndpScan); + NdpAdminRequest* ConstructQuery(NdpScanDesc ndpScan); + NdpAdminRequest* ConstructVersion(); +}; + +struct NdpPageMethod { + void (*get_pageinfo)(NdpScanDesc ndpScan, BlockNumber page, CephObject *object, char *ip, + BlockNumber& end, uint32& phyStartBlockNum); +}; + +void pm_get_pageinfo(NdpScanDesc ndpScan, BlockNumber page, CephObject *object, char *ip, + BlockNumber& end, uint32& phyStartBlockNum); +void md_get_pageinfo(NdpScanDesc ndpScan, BlockNumber page, CephObject *object, char *ip, + BlockNumber& end, uint32& phyStartBlockNum); +void seg_get_pageinfo(NdpScanDesc ndpScan, BlockNumber page, CephObject *object, char *ip, + BlockNumber& end, uint32& phyStartBlockNum); + +static const NdpPageMethod PAGEMETHOD[] { + { + md_get_pageinfo, + }, + { + seg_get_pageinfo, + } +}; + +#endif // NDPAM_H diff 
--git a/contrib/ndpplugin/ndpnodes.h b/contrib/ndpplugin/ndpnodes.h new file mode 100644 index 000000000..03b2bea30 --- /dev/null +++ b/contrib/ndpplugin/ndpnodes.h @@ -0,0 +1,23 @@ +/* ------------------------------------------------------------------------- + * ndpnodes.h + * prototypes for functions in contrib/ndpplugin/ndpoutfuncs.cpp + * + * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd. + * + * IDENTIFICATION + * contrib/ndpplugin/ndpnodes.h + * + * ------------------------------------------------------------------------- + */ + +#ifndef NDPNODES_H +#define NDPNODES_H + +#include "postgres.h" +#include "commands/extension.h" +#include "ndp/ndp_nodes.h" + +void stateToString(NdpPlanState* node, StringInfo str); +void queryToString(NdpQuery* node, StringInfo str); + +#endif // NDPNODES_H diff --git a/contrib/ndpplugin/ndpoutfuncs.cpp b/contrib/ndpplugin/ndpoutfuncs.cpp new file mode 100644 index 000000000..4c243d53c --- /dev/null +++ b/contrib/ndpplugin/ndpoutfuncs.cpp @@ -0,0 +1,385 @@ +/* ------------------------------------------------------------------------- + * ndpoutfuncs.cpp + * Routine to serialize PlanState information + * + * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd. + * + * IDENTIFICATION + * contrib/ndpplugin/ndpoutfuncs.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "ndpnodes.h" + +enum class NdpStateType { + // BaseType + BOOL, + CHAR, + INT32, + UINT32, + INT64, + UINT64, + + // NodeType + RELATION, + RELFILENODE, + TUPLEDESC, + PGATTR, + XACT, + AGGSTATE, + PARAMLIST, + PARAMDATA, + SESSIONCONTEXT, + ILLEGAL +}; + +static const char* NdpNodeNames[] { + // BaseType + "BOOL", + "CHAR", + "INT32", + "UINT32", + "INT64", + "UINT64", + + // NodeType + "RELATION", + "RELFILENODE", + "TUPLEDESC", + "PGATTR", + "XACT", + "AGGSTATE", + "PARAMLIST", + "PARAMDATA", + "SESSIONCONTEXT" +}; + +#define booltostr(x) ((x) ? 
"true" : "false") + +// for performance +#define WRITE_BOOL_FIELD(fldname) appendStringInfo(str, " :" CppAsString(fldname) " %s", booltostr(node->fldname)) + +/* Write a char field (ie, one ascii character) */ +#define WRITE_CHAR_FIELD(fldname) appendStringInfo(str, " :" CppAsString(fldname) " %c", node->fldname) + +/* Write an integer field (anything written as ":fldname %d") */ +#define WRITE_INT_FIELD(fldname) appendStringInfo(str, " :" CppAsString(fldname) " %d", node->fldname) + +/* Write an unsigned integer field (anything written as ":fldname %u") */ +#define WRITE_UINT_FIELD(fldname) appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) + +/* Write an 64bit unsigned integer field (anything written as ":fldname %lu") */ +#define WRITE_UINT64_FIELD(fldname) appendStringInfo(str, " :" CppAsString(fldname) " %lu", node->fldname) + +/* Write an OID field (don't hard-wire assumption that OID is same as uint) */ +#define WRITE_OID_FIELD(fldname) appendStringInfo(str, " :" CppAsString(fldname) " %u", node->fldname) + +#define WRITE_BASE_ARRAY(fldname, fldlen, fldtype) \ + (appendStringInfo(str, " :" CppAsString(fldname) " "), _outBaseArray(str, node->fldname, node->fldlen, fldtype)) + +#define WRITE_UINT_ARRAY_LEN(fldname, len) \ + (appendStringInfo(str, " :" CppAsString(fldname) " "), \ + _outBaseArray(str, node->fldname, len, NdpStateType::UINT32)) + +#define WRITE_CHAR_ARRAY(fldname, fldlen) \ + (appendStringInfo(str, " :" CppAsString(fldname) " "), appendBinaryStringInfo(str, node->fldname, node->fldlen)) + +/* Write a Node field */ +#define WRITE_NODE_FIELD(fldname, fldtype) \ + (appendStringInfo(str, " :" CppAsString(fldname) " "), _outNode(str, &node->fldname, fldtype)) + +#define WRITE_NODE_ARRAY(fldname, fldlen, fldtype) \ + (appendStringInfo(str, " :" CppAsString(fldname) " "), _outNodeArray(str, node->fldname, node->fldlen, fldtype)) + +static void _outNode(StringInfo str, void* obj, NdpStateType type); +static void 
_outNodeArray(StringInfo str, void* node, int len, NdpStateType type); + +static void _outBaseArray(StringInfo str, void* node, int len, NdpStateType type) +{ + appendStringInfoChar(str, '('); + + for (int i = 0; i < len; ++i) { + switch (type) { + case NdpStateType::BOOL: + appendStringInfo(str, "%s", booltostr(((bool*)node)[i])); + break; + case NdpStateType::INT32: + appendStringInfo(str, "%d", ((int32*)node)[i]); + break; + case NdpStateType::UINT32: + appendStringInfo(str, "%u", ((uint32*)node)[i]); + break; + case NdpStateType::INT64: + appendStringInfo(str, "%ld", ((int64*)node)[i]); + break; + case NdpStateType::UINT64: + appendStringInfo(str, "%lu", ((uint64*)node)[i]); + break; + default: + break; + } + + if (i + 1 != len) { + appendStringInfoChar(str, ' '); + } + } + + appendStringInfoChar(str, ')'); +} + +static void _outNdpRelFileNode(StringInfo str, NdpRelFileNode* node) +{ + WRITE_UINT_FIELD(spcNode); + WRITE_UINT_FIELD(dbNode); + WRITE_UINT_FIELD(relNode); + WRITE_UINT_FIELD(bucketNode); + WRITE_UINT_FIELD(opt); +} + +static void _outNdpPGAttr(StringInfo str, NdpPGAttr* node) +{ + WRITE_INT_FIELD(attlen); + WRITE_BOOL_FIELD(attbyval); + WRITE_INT_FIELD(attcacheoff); + WRITE_CHAR_FIELD(attalign); + WRITE_INT_FIELD(attndims); + WRITE_CHAR_FIELD(attstorage); +} + +static void _outNdpTupleDesc(StringInfo str, NdpTupleDesc* node) +{ + WRITE_INT_FIELD(natts); + WRITE_NODE_ARRAY(attrs, natts, NdpStateType::PGATTR); + WRITE_BOOL_FIELD(tdhasoid); + WRITE_BOOL_FIELD(tdhasuids); + WRITE_OID_FIELD(tdtypeid); + WRITE_INT_FIELD(tdtypmod); +} + +static void _outNdpRelation(StringInfo str, NdpRelation* node) +{ + WRITE_NODE_FIELD(node, NdpStateType::RELFILENODE); + WRITE_NODE_FIELD(att, NdpStateType::TUPLEDESC); +} + +static void _outNdpSnapshot(StringInfo str, NdpSnapshot* node) +{ + WRITE_UINT_FIELD(satisfies); + WRITE_UINT64_FIELD(xmin); + WRITE_UINT64_FIELD(xmax); + WRITE_UINT64_FIELD(snapshotcsn); + WRITE_UINT_FIELD(curcid); +} + +static void 
_outNdpXact(StringInfo str, NdpXact* node) +{ + _outNdpSnapshot(str, &node->snapshot); + WRITE_UINT64_FIELD(transactionId); + WRITE_INT_FIELD(usedComboCids); + WRITE_UINT_ARRAY_LEN(comboCids, node->usedComboCids * 2); + WRITE_UINT64_FIELD(latestCompletedXid); + WRITE_INT_FIELD(CLogLen); + WRITE_CHAR_ARRAY(CLogPageBuffer, CLogLen); + WRITE_INT_FIELD(CSNLogLen); + WRITE_CHAR_ARRAY(CSNLogPageBuffer, CSNLogLen); +} + +static void _outNdpAggState(StringInfo str, NdpAggState* node) +{ + WRITE_NODE_FIELD(aggTd, NdpStateType::TUPLEDESC); + WRITE_INT_FIELD(aggNum); + WRITE_NODE_ARRAY(perAggTd, aggNum, NdpStateType::TUPLEDESC); + WRITE_INT_FIELD(numCols); + WRITE_BASE_ARRAY(eqFuncOid, numCols, NdpStateType::UINT32); + WRITE_BASE_ARRAY(hashFuncOid, numCols, NdpStateType::UINT32); +} +static void _outNdpParamList(StringInfo str, NdpParamList* node) +{ + WRITE_INT_FIELD(numParams); + WRITE_NODE_ARRAY(params, numParams, NdpStateType::PARAMDATA); +} + +static void _outNdpSessionContext(StringInfo str, NdpSessionContext* node) +{ + WRITE_INT_FIELD(sql_compatibility); + WRITE_BOOL_FIELD(behavior_compat_flags); + WRITE_INT_FIELD(encoding); +} + +Size datumGetSize(Datum value, bool typByVal, int typLen) +{ + Size size; + + if (typByVal) { + /* Pass-by-value types are always fixed-length */ + Assert(typLen > 0 && (unsigned int)(typLen) <= sizeof(Datum)); + size = (Size)typLen; + } else { + if (typLen > 0) { + /* Fixed-length pass-by-ref type */ + size = (Size)typLen; + } else if (typLen == -1) { + /* It is a varlena datatype */ + struct varlena* s = (struct varlena*)DatumGetPointer(value); + + if (!PointerIsValid(s)) + ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid Datum pointer"))); + + size = (Size)VARSIZE_ANY(s); + } else if (typLen == -2) { + /* It is a cstring datatype */ + char* s = (char*)DatumGetPointer(value); + + if (!PointerIsValid(s)) + ereport(ERROR, (errcode(ERRCODE_DATA_EXCEPTION), errmsg("invalid Datum pointer"))); + + size = (Size)(strlen(s) + 1); + 
} else {
+            ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("invalid typLen: %d", typLen)));
+            size = 0; /* keep compiler quiet */
+        }
+    }
+
+    return size;
+}
+/*
+ * Print the value of a Datum given its type.
+ *
+ * Output is "<length> [ b0 b1 ... ]" with every byte written as a signed decimal.
+ * Pass-by-value datums always dump sizeof(Datum) bytes; pass-by-reference datums dump
+ * `length` bytes from the pointed-to data.
+ */
+static void _outDatum(StringInfo str, Datum value, int typlen, bool typbyval)
+{
+    Size length, i;
+    char* s = NULL;
+
+    length = datumGetSize(value, typbyval, typlen);
+
+    if (typbyval) {
+        s = (char*)(&value);
+        appendStringInfo(str, "%u [ ", (unsigned int)length);
+        for (i = 0; i < (Size)sizeof(Datum); i++) {
+            appendStringInfo(str, "%d ", (int)(s[i]));
+        }
+        appendStringInfo(str, "]");
+    } else {
+        s = (char*)DatumGetPointer(value);
+        if (!PointerIsValid(s)) {
+            appendStringInfo(str, "0 [ ]");
+        } else {
+            appendStringInfo(str, "%u [ ", (unsigned int)length);
+            for (i = 0; i < length; i++) {
+                appendStringInfo(str, "%d ", (int)(s[i]));
+            }
+            appendStringInfo(str, "]");
+        }
+    }
+}
+/* Serialize one parameter: null flag, type OID, typlen, typbyval and the value ("<>" if null). */
+static void _outNdpParam(StringInfo str, NdpParamData* node)
+{
+    WRITE_BOOL_FIELD(isnull);
+    WRITE_OID_FIELD(ptype);
+    WRITE_INT_FIELD(typlen);
+    WRITE_BOOL_FIELD(typbyval);
+    appendStringInfo(str, " :value ");
+    if (node->isnull) {
+        /* null value */
+        appendStringInfo(str, "<>");
+    } else {
+        _outDatum(str, node->value, node->typlen, node->typbyval);
+    }
+}
+/*
+ * Serialize one object as "{TYPENAME ...fields...}".
+ *
+ * A null object is written as "<>". So is a type tag without an entry in NdpNodeNames
+ * (ILLEGAL or anything outside the enum): the name table has no slot for it, and emitting
+ * a lone '{' would leave the output unbalanced.
+ * Base-type tags have a name but no field writer, so they produce just "{NAME}".
+ */
+static void _outNode(StringInfo str, void* obj, NdpStateType type)
+{
+    if (obj == nullptr || type < NdpStateType::BOOL || type >= NdpStateType::ILLEGAL) {
+        appendStringInfo(str, "<>");
+        return;
+    }
+
+    appendStringInfoChar(str, '{');
+    appendStringInfoString(str, NdpNodeNames[static_cast<int>(type)]);
+    switch (type) {
+        case NdpStateType::RELATION:
+            _outNdpRelation(str, reinterpret_cast<NdpRelation*>(obj));
+            break;
+        case NdpStateType::RELFILENODE:
+            _outNdpRelFileNode(str, reinterpret_cast<NdpRelFileNode*>(obj));
+            break;
+        case NdpStateType::TUPLEDESC:
+            _outNdpTupleDesc(str, reinterpret_cast<NdpTupleDesc*>(obj));
+            break;
+        case NdpStateType::PGATTR:
+            _outNdpPGAttr(str, reinterpret_cast<NdpPGAttr*>(obj));
+            break;
+        case NdpStateType::XACT:
+            _outNdpXact(str, reinterpret_cast<NdpXact*>(obj));
+            break;
+        case NdpStateType::AGGSTATE:
+            _outNdpAggState(str, reinterpret_cast<NdpAggState*>(obj));
+            break;
+        case NdpStateType::PARAMLIST:
+            _outNdpParamList(str, reinterpret_cast<NdpParamList*>(obj));
+            break;
+        case NdpStateType::PARAMDATA:
+            _outNdpParam(str, reinterpret_cast<NdpParamData*>(obj));
+            break;
+        case NdpStateType::SESSIONCONTEXT:
+            _outNdpSessionContext(str, reinterpret_cast<NdpSessionContext*>(obj));
+            break;
+        default:
+            break;
+    }
+    appendStringInfoChar(str, '}');
+}
+
+/*
+ * Serialize `len` consecutive objects of the given type as "(item item ...)".
+ * The element stride comes from the type tag; a tag without a known struct type cannot be
+ * stepped over, so element 0 is written for every position in that case.
+ */
+static void _outNodeArray(StringInfo str, void* node, int len, NdpStateType type)
+{
+    appendStringInfoChar(str, '(');
+
+    void* item = node;
+    for (int i = 0; i < len; ++i) {
+        switch (type) {
+            case NdpStateType::RELATION:
+                item = reinterpret_cast<NdpRelation*>(node) + i;
+                break;
+            case NdpStateType::RELFILENODE:
+                item = reinterpret_cast<NdpRelFileNode*>(node) + i;
+                break;
+            case NdpStateType::TUPLEDESC:
+                item = reinterpret_cast<NdpTupleDesc*>(node) + i;
+                break;
+            case NdpStateType::PGATTR:
+                item = reinterpret_cast<NdpPGAttr*>(node) + i;
+                break;
+            case NdpStateType::XACT:
+                item = reinterpret_cast<NdpXact*>(node) + i;
+                break;
+            case NdpStateType::AGGSTATE:
+                item = reinterpret_cast<NdpAggState*>(node) + i;
+                break;
+            case NdpStateType::PARAMLIST:
+                item = reinterpret_cast<NdpParamList*>(node) + i;
+                break;
+            case NdpStateType::PARAMDATA:
+                item = reinterpret_cast<NdpParamData*>(node) + i;
+                break;
+            case NdpStateType::SESSIONCONTEXT:
+                item = reinterpret_cast<NdpSessionContext*>(node) + i;
+                break;
+            default:
+                break;
+        }
+        _outNode(str, item, type);
+        if (i + 1 != len) {
+            appendStringInfoChar(str, ' ');
+        }
+    }
+
+    appendStringInfoChar(str, ')');
+}
+
+/* Append the serialized plan state: relation, scan tuple desc, agg state, params, session. */
+void stateToString(NdpPlanState* node, StringInfo str)
+{
+    _outNode(str, &node->rel, NdpStateType::RELATION);
+    _outNode(str, &node->scanTd, NdpStateType::TUPLEDESC);
+    _outNode(str, &node->aggState, NdpStateType::AGGSTATE);
+    _outNode(str, &node->paramList, NdpStateType::PARAMLIST);
+    _outNode(str, &node->sess, NdpStateType::SESSIONCONTEXT);
+}
+
+/* Append the serialized query: the table count followed by the transaction data. */
+void queryToString(NdpQuery* node, StringInfo str)
+{
+    WRITE_INT_FIELD(tableNum);
+    _outNode(str, &node->xact, NdpStateType::XACT);
+}
diff --git a/contrib/ndpplugin/ndpplugin--1.0.sql b/contrib/ndpplugin/ndpplugin--1.0.sql
new file mode 100644
index 000000000..de10d50e6
--- /dev/null
+++ b/contrib/ndpplugin/ndpplugin--1.0.sql
@@ -0,0 +1,10 @@
+-- complain if script is sourced
in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION ndpplugin" to load this file. \quit + +CREATE FUNCTION pg_catalog.ndpplugin_invoke() + RETURNS VOID AS '$libdir/ndpplugin','ndpplugin_invoke' LANGUAGE C STRICT; + +CREATE FUNCTION pg_catalog.pushdown_statistics() +RETURNS TABLE (query bigint, total_pushdown_page bigint, back_to_gauss bigint, received_scan bigint, received_agg bigint, failed_backend_handle bigint, failed_sendback bigint) +AS '$libdir/ndpplugin', 'pushdown_statistics' +LANGUAGE C STRICT; \ No newline at end of file diff --git a/contrib/ndpplugin/ndpplugin.control b/contrib/ndpplugin/ndpplugin.control new file mode 100644 index 000000000..bd14f1b79 --- /dev/null +++ b/contrib/ndpplugin/ndpplugin.control @@ -0,0 +1,5 @@ +# ndpplugin extension +comment = 'example implementation for openGauss server smart executor pushdown interface' +default_version = '1.0' +module_pathname = '$libdir/ndpplugin' +relocatable = true diff --git a/contrib/ndpplugin/ndpplugin.cpp b/contrib/ndpplugin/ndpplugin.cpp new file mode 100644 index 000000000..597aeb968 --- /dev/null +++ b/contrib/ndpplugin/ndpplugin.cpp @@ -0,0 +1,1570 @@ +/* ------------------------------------------------------------------------- + * ndpplugin.cpp + * Routines to support ndp executor smart pushdown + * + * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd.
+ * + * IDENTIFICATION + * contrib/ndpplugin/ndpplugin.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "access/valid.h" +#include "access/tableam.h" +#include "executor/node/nodeAgg.h" +#include "component/rpc/rpc.h" +#include "storage/file/fio_device.h" +#include "storage/smgr/segment_internal.h" +#include "funcapi.h" +#include "ndpplugin.h" +#include "ndp_check.h" +#include "ndpam.h" +#include "storage/ipc.h" + +PG_MODULE_MAGIC; +PG_FUNCTION_INFO_V1(ndpplugin_invoke); +PG_FUNCTION_INFO_V1(pushdown_statistics); + +#define IS_AU_ALIGNED(start) (!((start) & ((unsigned)PAGE_NUM_PER_AU - 1))) +#define NDP_SCAN_CHANNEL_DEFAULT_MAX 128 +#define NDP_SCAN_CEPH_DEFAULT_MAX 128 +#define NDP_SCAN_TABLE_DEFAULT_MAX 128 + +void TransitionFunction(AggState* aggstate, + AggStatePerAgg peraggstate, + AggStatePerGroup pergroupstate, + FunctionCallInfoData* fcinfo); +static void NdpAggSlotAppend(AggState* state, AggStatePerGroup pergroup, TupleTableSlot* slot); +static void knl_u_ndp_init(knl_u_ndp_context* ndp_cxt); +static void NdpReInitConetxt(); + +THR_LOCAL ndp_pushdown_hook_type backup_ndp_pushdown_hook_type = NULL; +THR_LOCAL TableAmNdpRoutine_hook_type backup_ndp_tableam = NULL; + +THR_LOCAL ExecutorStart_hook_type ndp_hook_ExecutorStart = NULL; +THR_LOCAL ExecutorEnd_hook_type ndp_hook_ExecutorEnd = NULL; +THR_LOCAL bool HOOK_INIT = false; + +constexpr int NDP_MAX_AWAIT_REQUEST = 64; +constexpr int NDP_MEMORY_POOL_SIZE = 2048; + +NdpInstanceContext g_ndp_instance = { + .mutex = PTHREAD_MUTEX_INITIALIZER, + .status = UNINITIALIZED, + .pageContext = new MpmcBoundedQueue(NDP_MEMORY_POOL_SIZE) +}; + +void NdpSharedMemoryAlloc() +{ + Assert(g_ndp_instance.pageContext); + if (!g_ndp_instance.pageContext) { + pthread_mutex_unlock(&g_ndp_instance.mutex); + ereport(ERROR, (errmsg("memory pool haven't been init or already released."))); + } + MpmcBoundedQueue* pageContext = g_ndp_instance.pageContext; + size_t blockSize = 
DSS_DEFAULT_AU_SIZE; + void* ptr = malloc(blockSize * NDP_MEMORY_POOL_SIZE); + + if (!ptr) { + pthread_mutex_unlock(&g_ndp_instance.mutex); + ereport(ERROR, (errmsg("ndpplugin try alloc memory failed."))); + } + g_ndp_instance.pageContextPtr = ptr; + + uintptr_t ptrval = reinterpret_cast(ptr); + for (int i = 0; i < NDP_MEMORY_POOL_SIZE; ++i) { + pageContext->Enqueue(reinterpret_cast(ptrval + (i * blockSize))); + } +} + +/* + * proc_exit callback to free g_ndp_instance + */ +static void NdpInstanceUninit(int status, Datum arg) +{ + pthread_mutex_lock(&g_ndp_instance.mutex); + if (g_ndp_instance.pageContextPtr) { + free(g_ndp_instance.pageContextPtr); + g_ndp_instance.pageContextPtr = nullptr; + delete g_ndp_instance.pageContext; + g_ndp_instance.pageContext = nullptr; + } + g_ndp_instance.status = UNINITIALIZED; + pthread_mutex_unlock(&g_ndp_instance.mutex); +} + +void NdpInstanceInit() +{ + if (g_ndp_instance.status == INITIALIZED) { + return; + } + +#ifndef ENABLE_SSL + // if not using ssl, use memory pool + NdpSharedMemoryAlloc(); +#endif + + g_ndp_instance.status = INITIALIZED; + /* PostmasterMain(process_shared_preload_libraries) inits g_ndp_instance first */ + on_proc_exit(NdpInstanceUninit, 0); +} + +#define IndexGetBuffer(pages, i) ((char*)(pages) + i * BLCKSZ) +#define NdpTupleOffset(t) ((t)->len + offsetof(NdpTupleHeaderData, tuple)) + +typedef struct NdpTupleHeaderData { + uint64 len; + HeapTupleHeaderData tuple; +} NdpTupleHeaderData; +typedef NdpTupleHeaderData* NdpTupleHeader; + +/* + * return channel if rpc channel is ok, otherwise return null; + */ +NdpScanChannel* NdpScanGetChannel(NdpContext* ctx, char* connIp) +{ + bool found = false; + NdpScanChannel* channel; + + // no need to think about remove, do it when plan is over + pthread_rwlock_rdlock(&ctx->ccLock); + channel = (NdpScanChannel*)hash_search(ctx->channelCache, connIp, HASH_FIND, NULL); + pthread_rwlock_unlock(&ctx->ccLock); + + if (!channel) { + MemoryContext old = 
MemoryContextSwitchTo(ctx->ccMem); + pthread_rwlock_wrlock(&ctx->ccLock); + channel = (NdpScanChannel*)hash_search(ctx->channelCache, connIp, HASH_ENTER, &found); + if (!found) { + if (!channel->Init(ctx->rpcCount++, connIp, ctx->tableCount)) { + hash_search(ctx->channelCache, connIp, HASH_REMOVE, NULL); + channel = nullptr; + } + } + pthread_rwlock_unlock(&ctx->ccLock); + (void)MemoryContextSwitchTo(old); + } + + return channel; +} + +void NdpScanTryPushDownScan(HeapScanDesc scan, NdpScanDesc ndpScan) +{ + BlockNumber start, end, phyStart; + int bitCount = 0; + NdpIoSlot* slot; + NdpRetCode ret; + AuInfo auinfo; + char connIp[NDP_RPC_IP_LEN]; + + start = ndpScan->handledBlock; + + pm_get_pageinfo(ndpScan, start, &auinfo.object, connIp, end, phyStart); + NdpContext* context = static_cast(ndpScan->cond->ctx); + NdpScanChannel* channel = NdpScanGetChannel(context, connIp); + if (!channel || channel->GetTableStatus(ndpScan->cond->tableId) == NdpTableStatus::CONSTRUCTFAIL) { + ndpScan->AddToNormal(start, end); + goto next; + } + + slot = New(CurrentMemoryContext) NdpIoSlot(ndpScan); + slot->SetStartBlockNum(start); + auinfo.phyStartBlockNum = phyStart; + auinfo.pageNum = end - start; + + bitCount = slot->SetReq(scan->rs_base.rs_rd->rd_smgr->smgr_rnode.node, 0, ndpScan->cond->tableId, auinfo); + // set a threshold int the future + if (bitCount == 0) { + delete slot; + ndpScan->AddToNormal(start, end); + goto next; + } + + /* Numbers of Ndp page may greater than pageNum, because of NdpTupleHeader has len. + * And sometimes there are too much agg. 
Fix it in the future and SetResp; + */ + if (slot->SetResp(bitCount) != NdpRetCode::NDP_OK) { + delete slot; + ndpScan->AddToNormal(start, end); + goto next; + } + + ret = channel->SendRequest(slot, ndpScan); + if (ret != NdpRetCode::NDP_OK) { + delete slot; + ndpScan->AddToNormal(start, end); + ndpScan->sendFailedN++; + ereport(DEBUG2, (errmsg("send request failed, error code %d.", + static_cast(ret)))); + } else { +#ifndef NDP_ASYNC_RPC + if (!ndpScan->HandleSlot(slot)) { + delete slot; + } +#endif + ndpScan->pushDownPageN += bitCount; + if (ndpScan->scanState->ps.instrument) { + ndpScan->scanState->ps.instrument->ndp_pushdown_page += bitCount; + } + } + +next: + ndpScan->handledBlock = end; + if (scan->dop > 1 + && ((ndpScan->handledBlock - scan->rs_base.rs_startblock) % PARALLEL_SCAN_GAP_AU_ALIGNED == 0)) { + ndpScan->handledBlock += (scan->dop - 1) * PARALLEL_SCAN_GAP_AU_ALIGNED; + } +} + +static bool NdpScanGetPageIO(NdpScanDesc ndpScan) +{ + for (;;) { + if (ndpScan->nextNdpPageIndex < ndpScan->curNdpPagesNum) { + ndpScan->curNdpPage = (NdpPageHeader)IndexGetBuffer(ndpScan->curNdpPages, ndpScan->nextNdpPageIndex); + Assert(((uintptr_t)ndpScan->curNdpPage & 0x7) == 0); + ndpScan->nextNdpPageIndex++; + + ndpScan->curPageType = ndpScan->curNdpPage->pd_flags; + + ndpScan->curLinesNum = PageGetMaxOffsetNumber((Page)(ndpScan->curNdpPage)); + ndpScan->nextLineIndex = 0; + return true; + } else { + // all Ndp page has been handled, can be free + ndpScan->FreeCurSlot(); + + // get next rpc page list +#ifdef NDP_ASYNC_RPC + if (!ndpScan->GetNextSlot()) { + return false; + } +#else + return false; +#endif + } + } +} + +static bool NdpScanGetPageLocal(NdpScanDesc ndpScan) +{ +#ifdef NDP_ASYNC_RPC + if (ndpScan->normalPagesId->Dequeue(ndpScan->curNormalPageId)) { +#else + if (ndpScan->normalPagesNum) { + ndpScan->curNormalPageId = ndpScan->normalPagesId[ndpScan->normalPagesNum - 1]; + ndpScan->normalPagesNum--; +#endif + ndpScan->curPageType = NORMAL_PAGE; + + 
heapgetpage(ndpScan->scan, ndpScan->curNormalPageId); + + ndpScan->curLinesNum = ndpScan->scan->rs_ntuples; + ndpScan->nextLineIndex = 0; + return true; + } + return false; +} + +static bool NdpScanGetPageQueue(NdpScanDesc ndpScan) +{ + /* + * If get page from IO queue first, and most pages of IO was failed, + * this pages will add to normal queue then will cause normal queue full quickly. + * Get page from normal queue first. + * If IO fail a lot but get page from normal first, plugin will not send IO request at a period of time. + */ + // from normal page + // normalPagesId depends on NdpIO(resp) + if (NdpScanGetPageLocal(ndpScan)) { + return true; + } + // from Ndp page + if (NdpScanGetPageIO(ndpScan)) { + return true; + } + return false; +} + +// return false if finished +static bool NdpScanGetPage(NdpScanDesc ndpScan) +{ + for (;;) { + bool found = NdpScanGetPageQueue(ndpScan); + if (found) { + return true; + } + +#ifdef NDP_ASYNC_RPC + uint32 req = pg_atomic_read_u32(&ndpScan->reqCount); + uint32 resp = pg_atomic_read_u32(&ndpScan->respCount); + Assert(req >= resp); + + if (ndpScan->handledBlock < ndpScan->nBlock) { +#ifdef ENABLE_SSL + if ((req - resp) >= NDP_MAX_AWAIT_REQUEST) { +#else + if ((req - resp) >= NDP_MAX_AWAIT_REQUEST || g_ndp_instance.pageContext->Empty()) { +#endif + pg_usleep(NDP_RPC_WAIT_USEC); + } else { + NdpScanTryPushDownScan((HeapScanDesc)ndpScan->scan, ndpScan); + } + continue; + } + + // wait request + if (resp < req) { + pg_usleep(NDP_RPC_WAIT_USEC); + // if normal page finish, io request failed, pages been added to normal queue, can't return directly. 
+ } else if (ndpScan->normalPagesId->Empty() && !NdpScanGetPageQueue(ndpScan)) { + return false; + } +#else + if (ndpScan->handledBlock < ndpScan->nBlock) { + NdpScanTryPushDownScan((HeapScanDesc)ndpScan->scan, ndpScan); + } else { + return false; + } +#endif + } +} + +static void NdpScanHandleFilteredTuple(ScanState* scanState, HeapTuple tuple) +{ + ProjectionInfo* proj_info = scanState->ps.ps_ProjInfo; + if (proj_info) { + heap_slot_store_heap_tuple(tuple, proj_info->pi_slot, + InvalidBuffer, false, false); + tableam_tslot_getsomeattrs(proj_info->pi_slot, proj_info->pi_slot->tts_tupleDescriptor->natts); + proj_info->pi_slot->tts_flags &= ~TTS_FLAG_EMPTY; + } +} + +static void initialize_aggregate(AggState* aggstate, AggStatePerAgg peraggstate, AggStatePerGroup pergroupstate) +{ + Plan* plan = aggstate->ss.ps.plan; + int64 local_work_mem = SET_NODEMEM(plan->operatorMemKB[0], plan->dop); + int64 max_mem = (plan->operatorMaxMem > 0) ? SET_NODEMEM(plan->operatorMaxMem, plan->dop) : 0; + + if (peraggstate->numSortCols > 0) { + /* + * In case of rescan, maybe there could be an uncompleted sort + * operation? Clean it up if so. + */ + if (peraggstate->sortstates[aggstate->current_set]) + tuplesort_end(peraggstate->sortstates[aggstate->current_set]); + + if (peraggstate->numInputs == 1) { + peraggstate->sortstates[aggstate->current_set] = + tuplesort_begin_datum(peraggstate->evaldesc->attrs[0].atttypid, + peraggstate->sortOperators[0], + peraggstate->sortCollations[0], + peraggstate->sortNullsFirst[0], + local_work_mem, + false); + } else { + peraggstate->sortstates[aggstate->current_set] = + tuplesort_begin_heap(peraggstate->evaldesc, + peraggstate->numSortCols, + peraggstate->sortColIdx, + peraggstate->sortOperators, + peraggstate->sortCollations, + peraggstate->sortNullsFirst, + local_work_mem, + false, + max_mem, + plan->plan_node_id, + SET_DOP(plan->dop)); + } + } + + /* + * (Re)set transValue to the initial value. 
+ * + * Note that when the initial value is pass-by-ref, we must copy it + * (into the aggcontext) since we will pfree the transValue later. + */ + if (peraggstate->initValueIsNull) { + pergroupstate->transValue = peraggstate->initValue; + } else { + pergroupstate->transValue = + datumCopy(peraggstate->initValue, peraggstate->transtypeByVal, peraggstate->transtypeLen); + } + pergroupstate->transValueIsNull = peraggstate->initValueIsNull; + + pergroupstate->noTransValue = peraggstate->initValueIsNull; + + /* + * (Re)set collectValue to the initial value. + * + * Note that when the initial value is pass-by-ref, we must copy it + * (into the aggcontext) since we will pfree the collectValue later. + * collection type is same as transition type. + */ + if (peraggstate->initCollectValueIsNull) { + pergroupstate->collectValue = peraggstate->initCollectValue; + } else { + pergroupstate->collectValue = + datumCopy(peraggstate->initCollectValue, peraggstate->transtypeByVal, peraggstate->transtypeLen); + } + pergroupstate->collectValueIsNull = peraggstate->initCollectValueIsNull; + + pergroupstate->noCollectValue = peraggstate->initCollectValueIsNull; +} + +static void initialize_aggregates(AggState* aggstate, AggStatePerAgg peragg, AggStatePerGroup pergroup) +{ + int numReset = Max(aggstate->phase->numsets, 1); + + for (int aggno = 0; aggno < aggstate->numaggs; aggno++) { + AggStatePerAgg peraggstate = &peragg[aggno]; + + for (int setno = 0; setno < numReset; setno++) { + AggStatePerGroup pergroupstate = &pergroup[aggno + (setno * (aggstate->numaggs))]; + + aggstate->current_set = setno; + + initialize_aggregate(aggstate, peraggstate, pergroupstate); + } + } +} + +/** + * look for a hash entry + * @param state agg state of current plan + * @param slot slot load from backend + * @return found hash entry + */ +static AggHashEntry LookForHashEntry(AggState* state, TupleTableSlot* slot) +{ + TupleTableSlot* hashslot = state->hashslot; + ListCell* l = NULL; + AggHashEntry entry; 
+ bool isnew = false; + AggWriteFileControl* TempFileControl = (AggWriteFileControl*)state->aggTempFileControl; + + if (hashslot->tts_tupleDescriptor == NULL) { + ExecSetSlotDescriptor(hashslot, state->ss.ss_ScanTupleSlot->tts_tupleDescriptor); + // Make sure all unused columns are NULLs + ExecStoreAllNullTuple(hashslot); + } + + // init hash slot + tableam_tslot_getsomeattrs(slot, linitial_int(state->hash_needed)); + int counter = slot->tts_nvalid - 1; + foreach (l, state->hash_needed) { + int varNumber = lfirst_int(l) - 1; + + hashslot->tts_values[varNumber] = slot->tts_values[counter]; + hashslot->tts_isnull[varNumber] = slot->tts_isnull[counter]; + counter--; + } + + if (TempFileControl->spillToDisk == false || TempFileControl->finishwrite == true) { + entry = (AggHashEntry)LookupTupleHashEntry(state->hashtable, hashslot, &isnew, true); + } else { + entry = (AggHashEntry)LookupTupleHashEntry(state->hashtable, hashslot, &isnew, false); + } + + if (!isnew) { + if (((Agg *)state->ss.ps.plan)->unique_check) { + ereport(ERROR, (errcode(ERRCODE_UNEXPECTED_NODE_STATE), errmsg("find a duplicate plan"))); + } + return entry; + } + + // is a new entry + if (entry) { + initialize_aggregates(state, state->peragg, entry->pergroup); + agg_spill_to_disk(TempFileControl, state->hashtable, state->hashslot, + ((Agg*)state->ss.ps.plan)->numGroups, true, state->ss.ps.plan->plan_node_id, + SET_DOP(state->ss.ps.plan->dop), state->ss.ps.instrument); + + if (TempFileControl->filesource && state->ss.ps.instrument) { + TempFileControl->filesource->m_spill_size = &state->ss.ps.instrument->sorthashinfo.spill_size; + } + } else { + // find a new entry, but memory is not enough, write tuple to temp file + Assert(TempFileControl->spillToDisk == true && TempFileControl->finishwrite == false); + MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot); + /* + * Here need switch memorycontext to ecxt_per_tuple_memory, so memory which be applyed in function + * ComputeHashValue is freed. 
+ */ + MemoryContext oldContext = MemoryContextSwitchTo(state->tmpcontext->ecxt_per_tuple_memory); + uint32 hashvalue = ComputeHashValue(state->hashtable); + MemoryContextSwitchTo(oldContext); + TempFileControl->filesource->writeTup(tuple, hashvalue & (TempFileControl->filenum - 1)); + } + + return entry; +} + +static void NdpHashAgg(AggState* state, TupleTableSlot* slot) +{ + AggHashEntry entry = LookForHashEntry(state, slot); + + if (entry != NULL) { + // accumulate slot to tuple + NdpAggSlotAppend(state, entry->pergroup, slot); + } +} + +static void NdpAggSlotAppend(AggState* state, AggStatePerGroup pergroup, TupleTableSlot* slot) +{ + int numGroupingSets = Max(state->phase->numsets, 1); + int numAggs = state->numaggs; + + int aggno; + int setno = 0; + int counter = 0; + + for (aggno = 0; aggno < state->numaggs; aggno++) { + AggStatePerAgg peraggstate = &state->peragg[aggno]; + + AggStatePerGroup pergroupstate = &pergroup[aggno]; + + if (pergroupstate->transValueIsNull) { + if (((Agg*)(state->ss.ps.plan))->aggstrategy == AGG_PLAIN) { + peraggstate->initValue = + datumCopy(slot->tts_values[counter], peraggstate->transtypeByVal, peraggstate->transtypeLen); + peraggstate->initValueIsNull = slot->tts_isnull[counter]; + + pergroupstate->transValue = peraggstate->initValue; + pergroupstate->transValueIsNull = slot->tts_isnull[counter]; + pergroupstate->noTransValue = false; + + counter++; + continue; + } + } + + FunctionCallInfoData fcinfo; + + // init the number of arguments to a function. 
+ // fn_nargs = 2, since we only have two inputs, 0th is transvalue, 1th is resultvalue from backend + InitFunctionCallInfoArgs(fcinfo, 2, 1); + + // add slot value to fcinfo + fcinfo.arg[1] = slot->tts_values[counter]; + fcinfo.argnull[1] = slot->tts_isnull[counter]; + fcinfo.argTypes[1] = InvalidOid; + counter++; + + // normally numGroupingSets = 1 + for (setno = 0; setno < numGroupingSets; setno++) { + AggStatePerGroup pergroupstate = &pergroup[aggno + (setno * numAggs)]; + state->current_set = setno; + + TransitionFunction(state, peraggstate, pergroupstate, &fcinfo); + } + } +} + +/* + * check and fill transition value, then call the function + * */ +void TransitionFunction(AggState* aggstate, + AggStatePerAgg peraggstate, + AggStatePerGroup pergroupstate, + FunctionCallInfoData* fcinfo) +{ + Datum newVal; + if (peraggstate->transfn.fn_strict) { + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. + */ + for (int i = 1; i <= peraggstate->numTransInputs; i++) { + if (fcinfo->argnull[i]) + return; + } + if (pergroupstate->noTransValue) { + /* + * transValue has not been initialized. This is the first non-NULL + * input value. We use it as the initial value for transValue. (We + * already checked that the agg's input type is binary-compatible + * with its transtype, so straight copy here is OK.) + * + * We must copy the datum into aggcontext if it is pass-by-ref. We + * do not need to pfree the old transValue, since it's NULL. + */ + pergroupstate->transValue = + datumCopy(fcinfo->arg[1], peraggstate->transtypeByVal, peraggstate->transtypeLen); + pergroupstate->transValueIsNull = false; + pergroupstate->noTransValue = false; + return; + } + if (pergroupstate->transValueIsNull) { + /* + * Don't call a strict function with NULL inputs. Note it is + * possible to get here despite the above tests, if the transfn is + * strict *and* returned a NULL on a prior cycle. 
If that happens + * we will propagate the NULL all the way to the end. + */ + return; + } + } + + // set up aggstate->curperagg to allow get aggref + aggstate->curperagg = peraggstate; + + /* + * OK to call the collection function + * fn_nargs = 2, since we only have two inputs, 0th is transvalue, 1th is resultvalue from backend + */ + InitFunctionCallInfoData( + *fcinfo, &(peraggstate->collectfn), 2, peraggstate->aggCollation, (Node*)aggstate, NULL); + fcinfo->arg[0] = pergroupstate->transValue; + fcinfo->argnull[0] = pergroupstate->transValueIsNull; + fcinfo->argTypes[0] = InvalidOid; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + Node* origin_fcxt = fcinfo->context; + if (peraggstate->is_avg) { + Node* fcontext = (Node*)palloc0(sizeof(Node)); +#ifdef FAULT_INJECT + if ((rand() % PERCENTAGE_DIV) < PERCENTAGE) { + ereport(ERROR, (errmsg("Fault inject -- palloc fail"))); + } +#endif + fcontext->type = (NodeTag)(peraggstate->is_avg); + fcinfo->context = fcontext; + } + newVal = FunctionCallInvoke(fcinfo); + aggstate->curperagg = NULL; + fcinfo->context = origin_fcxt; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * pfree the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. 
+ */ + if (!peraggstate->transtypeByVal && DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue)) { + if (!fcinfo->isnull) { + newVal = datumCopy(newVal, peraggstate->transtypeByVal, peraggstate->transtypeLen); + } + if (!pergroupstate->transValueIsNull) + pfree(DatumGetPointer(pergroupstate->transValue)); + } + + if (((Agg*)(aggstate->ss.ps.plan))->aggstrategy == AGG_PLAIN) { + peraggstate->initValue = newVal; + peraggstate->initValueIsNull = fcinfo->isnull; + } + + pergroupstate->transValue = newVal; + pergroupstate->transValueIsNull = fcinfo->isnull; + pergroupstate->noTransValue = pergroupstate->transValueIsNull; +} + +static TupleDesc NdpAggTupleDescCreate(AggState* aggState) +{ + Assert(aggState->ss.ps.plan->type == T_Agg); + Agg* agg = (Agg*)aggState->ss.ps.plan; + int len = aggState->numaggs + agg->numCols; + TupleDesc typeInfo = CreateTemplateTupleDesc(len, false, TableAmHeap); + int curResno = 1; + + for (int aggno = 0; aggno < aggState->numaggs; ++aggno) { + AggStatePerAgg perAgg = &aggState->peragg[aggno]; + + // we don't rely on Aggref::aggtrantype, which is defined in PGXC + Oid aggTransType = ((FuncExpr*)perAgg->transfn.fn_expr)->funcresulttype; + int32 typmod = -1; + int attdim = 0; + Oid collationid = 0; + + // get from pg_type + HeapTuple tp; + tp = SearchSysCache1(TYPEOID, ObjectIdGetDatum(aggTransType)); + if (HeapTupleIsValid(tp)) { + Form_pg_type typtup = (Form_pg_type)GETSTRUCT(tp); + typmod = typtup->typtypmod; + attdim = typtup->typndims; + collationid = typtup->typcollation; + ReleaseSysCache(tp); + } + + TupleDescInitEntry(typeInfo, curResno, NULL, aggTransType, typmod, attdim); + TupleDescInitEntryCollation(typeInfo, curResno, collationid); + + curResno++; + } + + for (int i = 0; i < agg->numCols; ++i) { + AttrNumber att = agg->grpColIdx[i]; + Node* node = (Node*)list_nth(agg->plan.lefttree->targetlist, att - 1); + Assert(node->type == T_TargetEntry); + Node* expr = (Node*)(((TargetEntry*)node)->expr); + + 
TupleDescInitEntry(typeInfo, curResno, ((TargetEntry*)node)->resname, + exprType(expr), exprTypmod(expr), 0); + TupleDescInitEntryCollation(typeInfo, curResno, exprCollation(expr)); + + curResno++; + } + + return typeInfo; +} + +static void NdpScanHandleAggTuple(AggState* aggState, TupleTableSlot* slot, HeapTuple tuple) +{ + if (aggState == NULL) { + ereport(WARNING, (errmsg("Can't happen, ndp page flag is wrong!"))); + return; + } + heap_slot_store_heap_tuple(tuple, slot, InvalidBuffer, false, false); + tableam_tslot_getsomeattrs(slot, slot->tts_tupleDescriptor->natts); // read tuple + + if (((Agg*)aggState->ss.ps.plan)->aggstrategy == AGG_HASHED) { + NdpHashAgg(aggState, slot); + } else { + NdpAggSlotAppend(aggState, aggState->pergroup, slot); + } +} + +static inline bool NdpScanCheckKey(HeapScanDesc scan) +{ + HeapTuple tuple = &(scan->rs_ctup); + int nkeys = scan->rs_base.rs_nkeys; + ScanKey key = scan->rs_base.rs_key; + + if (key != NULL) { + bool valid = false; + HeapKeyTest(tuple, (scan->rs_tupdesc), nkeys, key, valid); + if (valid) { + return true; + } + } else { + return true; + } + return false; +} + +static bool NdpScanGetTupleFromStocked(HeapScanDesc scan, NdpScanDesc ndpScan) +{ + HeapTuple tuple = &(scan->rs_ctup); + + while (ndpScan->nextLineIndex < ndpScan->curLinesNum) { + int curLineIndex = ndpScan->nextLineIndex; + ndpScan->nextLineIndex++; + Assert(ndpScan->curPageType != INVALID_PAGE); + if (ndpScan->curPageType == NORMAL_PAGE) { + BlockNumber page = ndpScan->curNormalPageId; + Page dp = (Page)BufferGetPage(scan->rs_base.rs_cbuf); + OffsetNumber line_off = scan->rs_base.rs_vistuples[curLineIndex]; + ItemId lpp = HeapPageGetItemId(dp, line_off); + Assert(ItemIdIsNormal(lpp)); + + // set tuple + tuple->t_data = (HeapTupleHeader)PageGetItem((Page)dp, lpp); + tuple->t_len = ItemIdGetLength(lpp); + ItemPointerSet(&(tuple->t_self), page, line_off); + HeapTupleCopyBaseFromPage(tuple, dp); + if (NdpScanCheckKey(scan)) { + scan->rs_base.rs_cindex = 
curLineIndex; + return true; + } + } else { + ItemId lpp = PageGetItemId((Page)ndpScan->curNdpPage, curLineIndex + 1); + HeapTupleHeader pushDownTuple = (HeapTupleHeader)((char*)ndpScan->curNdpPage + lpp->lp_off); + + tuple->t_data = pushDownTuple; + tuple->t_len = (uint32)lpp->lp_len; + + if (ndpScan->curPageType == NDP_FILTERED_PAGE) { + if (NdpScanCheckKey(scan)) { + scan->rs_base.rs_cindex = curLineIndex; + NdpScanHandleFilteredTuple(ndpScan->scanState, tuple); + return true; + } + } else if (ndpScan->curPageType == NDP_AGG_PAGE) { + NdpScanHandleAggTuple(ndpScan->aggState, ndpScan->aggSlot, tuple); + } + } + } + return false; +} + +static void NdpScanGetCachedTuple(HeapScanDesc scan, NdpScanDesc ndpScan) +{ + CHECK_FOR_INTERRUPTS(); + for(;;) { + // 1. scan stocked page + if (NdpScanGetTupleFromStocked(scan, ndpScan)) { + return; + } + + // 2. get new page + if (!NdpScanGetPage(ndpScan)) { + // free buffer + if (BufferIsValid(scan->rs_base.rs_cbuf)) { + ReleaseBuffer(scan->rs_base.rs_cbuf); + } + ndpScan->FreeCurSlot(); + scan->rs_base.rs_cbuf = InvalidBuffer; + scan->rs_base.rs_cblock = InvalidBlockNumber; + scan->rs_base.rs_inited = false; + scan->rs_ctup.t_data = NULL; + return; + } + } +} + +Tuple NdpScanGetTuple(TableScanDesc sscan, ScanDirection dir, TupleTableSlot* slot) +{ + HeapScanDesc scan = (HeapScanDesc)sscan; + HeapTuple tuple = &(scan->rs_ctup); + + Assert(ScanDirectionIsForward(dir)); + + NdpScanDesc ndpScan = (NdpScanDesc)sscan->ndp_ctx; + + MemoryContext oldMct = MemoryContextSwitchTo(ndpScan->memCtx); + + if (!scan->rs_base.rs_inited) { + if (scan->rs_base.rs_nblocks == 0) { + Assert(!BufferIsValid(scan->rs_base.rs_cbuf)); + tuple->t_data = NULL; + goto out; + } + + // doesn't support rs_parallel and rs_syncscan + if (scan->rs_parallel != NULL || scan->rs_base.rs_syncscan) { + ereport(WARNING, (errmsg("parallel not support %p, syncscan not support %d in NDP scene.", + scan->rs_parallel, scan->rs_base.rs_syncscan))); + } + + // init 
NdpScanDesc, rs_startblock must AU aligned in begin_scan + Assert(IS_AU_ALIGNED(scan->rs_base.rs_startblock)); + ndpScan->handledBlock = scan->rs_base.rs_startblock; + ndpScan->nBlock = scan->rs_base.rs_nblocks; + ndpScan->curPageType = INVALID_PAGE; + ndpScan->curIO = nullptr; + + ndpScan->curLinesNum = 0; + ndpScan->nextLineIndex = 0; + ndpScan->curNdpPagesNum = 0; + ndpScan->nextNdpPageIndex = 0; + +#ifndef NDP_ASYNC_RPC + ndpScan->normalPagesNum = 0; +#endif + scan->rs_base.rs_inited = true; + } + + PG_TRY(); + { + NdpScanGetCachedTuple(scan, ndpScan); + } + PG_CATCH(); + { + // wait all callback return + while (pg_atomic_read_u32(&ndpScan->reqCount) != pg_atomic_read_u32(&ndpScan->respCount)) { + pg_usleep(NDP_RPC_WAIT_USEC); + } + delete ndpScan; + sscan->ndp_ctx = nullptr; + PG_RE_THROW(); + } + PG_END_TRY(); + + out: + (void)MemoryContextSwitchTo(oldMct); + + if (scan->rs_ctup.t_data == NULL) { + ereport(DEBUG2, (errmsg("heap_getnext returning EOS"))); + return NULL; // Upper doesn't judge t_data, so tuple must return NULL if t_data is NULL. + } + return tuple; +} + +void NdpScanParallelInit(TableScanDesc sscan, int32 dop, ScanDirection dir) +{ + HeapScanDesc scan = (HeapScanDesc) sscan; + + Assert(!ScanDirectionIsBackward(dir)); + if (!scan || scan->rs_base.rs_nblocks == 0) { + return; + } + if (dop <= 1) { + return; + } + scan->dop = dop; + + uint32 paral_blocks = u_sess->stream_cxt.smp_id * PARALLEL_SCAN_GAP_AU_ALIGNED; + /* If not enough pages to divide into every worker. 
*/ + if (scan->rs_base.rs_nblocks <= paral_blocks) { + scan->rs_base.rs_startblock = 0; + scan->rs_base.rs_nblocks = 0; + return; + } + scan->rs_base.rs_startblock = paral_blocks; +} + +// check state after planstate inited +void CheckAndSetNdpScan(Relation relation, Snapshot snapshot, ScanState* sstate, TableScanDesc desc) +{ + if (relation->rd_tam_ops != TableAmHeap) return; // only support astore currently + if (IsSystemRelation(relation) || IsCatalogRelation(relation) || + IsToastRelation(relation) || RelationIsToast(relation) || + isAnyTempNamespace(RelationGetNamespace(relation)) || RELATION_IS_TEMP(relation) || + RelationGetRelPersistence(relation) == RELPERSISTENCE_UNLOGGED) return; + if (RowRelationIsCompressed(relation)) return; + if (relation->is_compressed) return; + + // check TableScanDesc + if (desc->rs_snapshot->satisfies != SNAPSHOT_MVCC) { + return; + } + if (!desc->rs_pageatatime) return; + if (desc->rs_nblocks < (unsigned int)u_sess->ndp_cxt.pushdown_min_blocks) return; + HeapScanDesc scan = (HeapScanDesc)desc; + if (scan->rs_parallel != nullptr) { + ereport(NOTICE, (errmsg("parallel are not supported in NDP scene"))); + return; + } + + // recheck + if (sstate->ps.plan->ndp_pushdown_condition == nullptr) { + ereport(WARNING, (errmsg("Ndp condition should not be NULL"))); + return; + } + + NdpScanDesc ndpScanDesc = New(CurrentMemoryContext) NdpScanDescData; + NdpRetCode ret = ndpScanDesc->Init(sstate, desc); + if (ret != NdpRetCode::NDP_OK) { + delete ndpScanDesc; + ereport(ERROR, (errmsg("NdpScanDesc init failed, code %d", static_cast(ret)))); + return; + } + + desc->ndp_pushdown_optimized = true; + desc->ndp_ctx = ndpScanDesc; + desc->rs_syncscan = false; +} + +TableScanDesc hook_ndp_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, + ScanState* sstate, RangeScanInRedis rangeScanInRedis) +{ + TableScanDesc scanDesc = tableam_scan_begin(relation, snapshot, nkeys, key, rangeScanInRedis); + if (scanDesc) { + 
CheckAndSetNdpScan(relation, snapshot, sstate, scanDesc); + } + return scanDesc; +} + +void hook_ndp_init_parallel(TableScanDesc sscan, int32 dop, ScanDirection dir) +{ + if (!ScanDirectionIsBackward(dir)) { + return NdpScanParallelInit(sscan, dop, dir); + } else { + return tableam_scan_init_parallel_seqscan(sscan, dop, dir); + } +} + +void hook_ndp_rescan(TableScanDesc sscan, ScanKey key) +{ + tableam_scan_rescan(sscan, key); + sscan->rs_syncscan = false; + sscan->ndp_pushdown_optimized = true; + NdpScanDesc ndpScan = (NdpScanDesc)sscan->ndp_ctx; + if (ndpScan) { + ndpScan->Reset(); + } +} +static void SendTerminate(NdpContext* context) +{ + if (context == nullptr) { + return; + } + HASH_SEQ_STATUS status; + NdpScanChannel* channel; + + // notify rpc server to release query resource + hash_seq_init(&status, context->channelCache); + while ((channel = (NdpScanChannel*)hash_seq_search(&status)) != nullptr) { + NdpRetCode retCode = channel->SendEnd(); + if (retCode != NdpRetCode::NDP_OK) { + ereport(DEBUG2, (errmsg("SendEnd %s fail code[%d].", channel->rpcIp, static_cast(retCode)))); + } + channel->DestroyChannel(); + hash_search(context->channelCache, channel->rpcIp, HASH_REMOVE, NULL); + } +} + +void NdpDestroyContext(NdpContext* context) +{ + if (context == nullptr) { + return; + } + hash_destroy(context->channelCache); + context->channelCache = nullptr; +} + +static void NdpReInitConetxt() +{ + if (u_sess->ndp_cxt.cxt == nullptr) { + return; + } + NdpContext* context = (NdpContext*)u_sess->ndp_cxt.cxt; + SendTerminate(context); + NdpDestroyContext(context); + + MemoryContext oldContext = MemoryContextSwitchTo(u_sess->ndp_cxt.mem_cxt); + context->rpcCount = 0; + context->tableCount = 0; + context->u_sess = u_sess; + HASHCTL ctlConn; + ctlConn.keysize = NDP_RPC_IP_LEN; + ctlConn.entrysize = sizeof(NdpScanChannel); + ctlConn.hash = string_hash; + context->channelCache = hash_create("Ndp Connector to IPC Channel", + NDP_SCAN_CHANNEL_DEFAULT_MAX, &ctlConn, + HASH_ELEM 
| HASH_FUNCTION); + if (context->channelCache == nullptr) { + pfree(context); + u_sess->ndp_cxt.cxt = nullptr; + } + MemoryContextSwitchTo(oldContext); +} + +void hook_ndp_endscan(TableScanDesc sscan) +{ + NdpScanDesc ndpScan = (NdpScanDesc)sscan->ndp_ctx; + if (ndpScan == nullptr || !sscan->ndp_pushdown_optimized || ndpScan->cond == nullptr) { + return tableam_scan_end(sscan); + } + NdpContext* context = static_cast(ndpScan->cond->ctx); + if (context == nullptr || context->u_sess == nullptr) { + delete ndpScan; + return tableam_scan_end(sscan); + } + knl_session_context* sess = context->u_sess; + __atomic_add_fetch(&sess->ndp_cxt.stats->sendFailed, ndpScan->sendFailedN, __ATOMIC_RELAXED); + __atomic_add_fetch(&sess->ndp_cxt.stats->failedIO, ndpScan->failedIoN, __ATOMIC_RELAXED); + __atomic_add_fetch(&sess->ndp_cxt.stats->pushDownPage, ndpScan->pushDownPageN, __ATOMIC_RELAXED); + __atomic_add_fetch(&sess->ndp_cxt.stats->sendBackPage, ndpScan->sendBackPageN, __ATOMIC_RELAXED); + __atomic_add_fetch(&sess->ndp_cxt.stats->ndpPageAgg, ndpScan->ndpPageAggN, __ATOMIC_RELAXED); + __atomic_add_fetch(&sess->ndp_cxt.stats->ndpPageScan, ndpScan->ndpPageScanN, __ATOMIC_RELAXED); + if (!StreamThreadAmI()) { + __atomic_add_fetch(&sess->ndp_cxt.stats->queryCounter, 1, __ATOMIC_RELAXED); + } + delete ndpScan; + sscan->ndp_ctx = NULL; + + return tableam_scan_end(sscan); +} + +Tuple hook_ndp_getnexttuple(TableScanDesc sscan, ScanDirection direction, TupleTableSlot* slot) +{ + if (ScanDirectionIsForward(direction)) { + return NdpScanGetTuple(sscan, direction, slot); + } else { + return heap_getnext(sscan, direction); + } +} + +void hook_ndp_handle_hashaggtuple(AggState* aggstate, HeapTupleData *tts_minhdr) +{ + TupleTableSlot *aggSlot = (TupleTableSlot *)aggstate->ndp_slot; + if (aggSlot) { + NdpScanHandleAggTuple(aggstate, aggSlot, tts_minhdr); + } +} + +// shared by multi thread +static TableAmNdpRoutine_hook ndp_tableam_apply = { + .scan_begin = hook_ndp_beginscan, + 
.scan_init_parallel_seqscan = hook_ndp_init_parallel, + .scan_rescan = hook_ndp_rescan, + .scan_end = hook_ndp_endscan, + .scan_getnexttuple = hook_ndp_getnexttuple, + .handle_hashaggslot = hook_ndp_handle_hashaggtuple +}; + +void ndpplugin_invoke(void) +{ + ereport(DEBUG2, (errmsg("dummy function to let process load this library."))); + return; +} + +NdpScanCondition* NdpCreateScanCondition(Plan* node) +{ + NdpScanCondition* cond = makeNode(NdpScanCondition); + cond->plan = node; + return cond; +} + +void NdpDestroyScanCondition(NdpScanCondition* cond) +{ + if (!cond) { + return; + } + pfree((void*)cond); +} + +NdpContext* NdpCreateContext() +{ + NdpContext* context = (NdpContext*)palloc(sizeof(NdpContext)); +#ifdef FAULT_INJECT + if ((rand() % PERCENTAGE_DIV) < PERCENTAGE) { + ereport(ERROR, (errmsg("Fault inject -- palloc fail"))); + } +#endif + pthread_rwlock_init(&context->ccLock, NULL); + + context->ccMem = CurrentMemoryContext; + HASHCTL ctlConn; + ctlConn.keysize = NDP_RPC_IP_LEN; + ctlConn.entrysize = sizeof(NdpScanChannel); + ctlConn.hash = string_hash; + context->channelCache = hash_create("Ndp Connector to IPC Channel", + NDP_SCAN_CHANNEL_DEFAULT_MAX, &ctlConn, + HASH_ELEM | HASH_FUNCTION); + if (context->channelCache == NULL) { + pfree(context); + return NULL; + } + + context->rpcCount = 0; + context->tableCount = 0; + context->u_sess = u_sess; + + DependencePath paths = { + .ulogPath = LIB_ULOG, + .rpcPath = LIB_RPC_UCX, + .sslDLPath = LIB_OPENSSL_DL, + .sslPath = LIB_SSL, + .cryptoPath = LIB_CRYPTO + }; + RpcStatus status = RpcClientInit(paths); + if (status != RPC_OK) { + hash_destroy(context->channelCache); + pfree(context); + return NULL; + } + return context; +} +NdpContext* GetNdpContext() +{ + if (u_sess->ndp_cxt.cxt == nullptr) { + MemoryContext oldContext = nullptr; + oldContext = MemoryContextSwitchTo(u_sess->ndp_cxt.mem_cxt); + u_sess->ndp_cxt.cxt = NdpCreateContext(); + MemoryContextSwitchTo(oldContext); + } else { + NdpReInitConetxt(); + 
} + return (NdpContext*)u_sess->ndp_cxt.cxt; +} +// check after create plan +static void CheckAndSetNdpScanPlan(PlannedStmt* stmt, SeqScan* scan, Plan* parent, NdpContext** context) +{ + Plan* pushDownPlan = CheckAndGetNdpPlan(stmt, scan, parent); + if (pushDownPlan == NULL) { + return; + } + + NdpScanCondition* cond = NdpCreateScanCondition(pushDownPlan); + if (cond == NULL) { + scan->plan.ndp_pushdown_optimized = false; + scan->plan.ndp_pushdown_condition = NULL; + ereport(WARNING, (errmsg("NdpCreateScanCondition failed"))); + } else { + // store ndp context in ndp_pushdown_condition + if (*context) { + cond->ctx = *context; + cond->tableId = ((*context)->tableCount)++; + scan->plan.ndp_pushdown_optimized = true; + scan->plan.ndp_pushdown_condition = (Node*)cond; + } else { + *context = GetNdpContext(); + if (!*context) { + NdpDestroyScanCondition(cond); + scan->plan.ndp_pushdown_optimized = false; + scan->plan.ndp_pushdown_condition = NULL; + } else { + cond->ctx = *context; + cond->tableId = ((*context)->tableCount)++; + scan->plan.ndp_pushdown_optimized = true; + scan->plan.ndp_pushdown_condition = (Node*)cond; + } + } + } +} + +static void TraversePlan(PlannedStmt* stmt, Plan* plan, Plan* parent, NdpContext** context) +{ + if (!plan) return; + + /* filter out the lefttree and righttree of T_MergeJoin which is T_Sort */ + if (IsA(plan, MergeJoin)) { + TraversePlan(stmt, outerPlan(outerPlan(plan)), plan, context); + TraversePlan(stmt, outerPlan(innerPlan(plan)), plan, context); + return; + } else if (IsA(plan, SeqScan)) { + CheckAndSetNdpScanPlan(stmt, castNode(SeqScan, plan), parent, context); + } else if (IsA(plan, SubqueryScan)) { + TraversePlan(stmt, castNode(SubqueryScan, plan)->subplan, plan, context); + } else if (IsA(plan, Append)) { + ListCell* lc = NULL; + foreach (lc, castNode(Append, plan)->appendplans) { + Plan* appendPlans = (Plan*)lfirst(lc); + TraversePlan(stmt, appendPlans, plan, context); + } + } + + TraversePlan(stmt, outerPlan(plan), plan, 
context); + TraversePlan(stmt, innerPlan(plan), plan, context); +} + +static void CheckAndSetNdpPlan(Query* querytree, PlannedStmt* stmt) +{ + knl_u_ndp_init(&u_sess->ndp_cxt); + if (!CheckNdpSupport(querytree, stmt)) { + return; + } + // travel plan to find scan node + NdpContext* context = NULL; + TraversePlan(stmt, stmt->planTree, NULL, &context); + ListCell *l = NULL; + foreach (l, stmt->subplans) { + Plan *subplan = (Plan *)lfirst(l); + TraversePlan(stmt, subplan, NULL, &context); + } +} + +static void NdpAggInitCollect(AggState* node) +{ + for (int i = 0; i < node->numaggs; i++) { + AggStatePerAgg peragg = &(node->peragg[i]); + Aggref* aggref = peragg->aggref; + Oid collectfn_oid; + Expr* collectfnexpr = NULL; + + if (OidIsValid(peragg->collectfn_oid)) { + continue; + } + + /* Fetch the pg_aggregate row */ + HeapTuple aggTuple = SearchSysCache1(AGGFNOID, ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(aggTuple)) { + ereport(ERROR, + (errcode(ERRCODE_CACHE_LOOKUP_FAILED), + errmodule(MOD_EXECUTOR), + errmsg("cache lookup failed for aggregate %u", aggref->aggfnoid))); + } + Form_pg_aggregate aggform = (Form_pg_aggregate)GETSTRUCT(aggTuple); + + /* Check permission to call aggregate function */ + AclResult aclresult = pg_proc_aclcheck(aggref->aggfnoid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, ACL_KIND_PROC, get_func_name(aggref->aggfnoid)); + + peragg->collectfn_oid = collectfn_oid = aggform->aggcollectfn; + Oid aggtranstype = aggform->aggtranstype; + + if (OidIsValid(collectfn_oid)) { + /* we expect final function expression to be NULL in call to + * build_aggregate_fnexprs below, since InvalidOid is passed for + * finalfn_oid argument. Use a dummy expression to accept that. + */ + Expr* dummyexpr = NULL; + /* + * for XC, we need to setup the collection function expression as well. 
+ * Use build_aggregate_fnexpr() with invalid final function oid, and collection + * function information instead of transition function information. + * We should really be adding this step inside + * build_aggregate_fnexprs() but this way it becomes easy to merge. + */ + build_aggregate_fnexprs(&aggtranstype, + 1, + aggtranstype, + aggref->aggtype, + aggref->inputcollid, + collectfn_oid, + InvalidOid, + &collectfnexpr, + &dummyexpr); + Assert(!dummyexpr); + } + fmgr_info(collectfn_oid, &peragg->collectfn); + peragg->collectfn.fn_expr = (Node*)collectfnexpr; + + ReleaseSysCache(aggTuple); + } +} + +static void NdpAggInit(AggState* node) +{ + Agg* plan = reinterpret_cast(node->ss.ps.plan); + if (plan->aggstrategy == AGG_PLAIN) { + for (int i = 0; i < node->numaggs; i++) { + AggStatePerGroup pergroup = &(node->pergroup[i]); + AggStatePerAgg peragg = &(node->peragg[i]); + + pergroup->transValueIsNull = peragg->initValueIsNull; + if (!peragg->initValueIsNull) { + pergroup->transValue = + datumCopy(peragg->initValue, peragg->transtypeByVal, peragg->transtypeLen); + pergroup->noTransValue = false; + } else { + pergroup->noTransValue = true; + } + } + } + + /* + * we currently rely on collect function from gaussdb + */ + if (IS_STREAM_PLAN || StreamThreadAmI()) { + NdpAggInitCollect(node); + } +} + +static void TraverseState(PlanState* state, PlanState* parent) +{ + if (!state) return; + + if (IsA(state, SeqScanState)) { + auto seq = reinterpret_cast(state); + if (!seq->ss_currentScanDesc || !seq->ss_currentScanDesc->ndp_pushdown_optimized) { + return; + } + auto ndpScan = reinterpret_cast(seq->ss_currentScanDesc->ndp_ctx); + Assert(ndpScan); + + if (IsA(ndpScan->cond->plan, Agg)) { + Assert(parent && IsA(parent, AggState)); + ndpScan->aggState = reinterpret_cast(parent); + TupleDesc desc = NdpAggTupleDescCreate(ndpScan->aggState); + // should use ExecInitExtraTupleSlot to put in estate->es_tupleTable? 
+ TupleTableSlot* slot = MakeTupleTableSlot(false, TableAmHeap); + ExecSetSlotDescriptor(slot, desc); + ndpScan->aggSlot = slot; + NdpAggInit(ndpScan->aggState); + ndpScan->aggState->ndp_slot = slot; + } + } else if (IsA(state, SubqueryScanState)) { + TraverseState(castNode(SubqueryScanState, state)->subplan, state); + } else if (IsA(state, AppendState)) { + AppendState* appState = reinterpret_cast(state); + for (int i = 0; i < appState->as_nplans; i++) { + TraverseState(*(appState->appendplans + i), state); + } + } + + TraverseState(outerPlanState(state), state); + TraverseState(innerPlanState(state), state); +} + +static void NdpExecutorStart(QueryDesc* queryDesc, int eflags) +{ + knl_u_ndp_init(&u_sess->ndp_cxt); + if (ndp_hook_ExecutorStart) + ndp_hook_ExecutorStart(queryDesc, eflags); + else + standard_ExecutorStart(queryDesc, eflags); + + TraverseState(queryDesc->planstate, NULL); + ListCell *l = NULL; + foreach (l, queryDesc->estate->es_subplanstates) { + PlanState* subplanstate = (PlanState*)lfirst(l); + TraverseState(subplanstate, NULL); + } +} + +static void NdpExecutorEnd(QueryDesc* queryDesc) +{ + if (ndp_hook_ExecutorEnd) + ndp_hook_ExecutorEnd(queryDesc); + else + standard_ExecutorEnd(queryDesc); + if (!StreamThreadAmI()) { + NdpReInitConetxt(); + } +} + +static void InitializeNdpGUCOptions() +{ + DefineCustomBoolVariable("ndpplugin.enable_ndp", + "Enable NDP engine", + NULL, + &u_sess->ndp_cxt.enable_ndp, + false, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + DefineCustomIntVariable("ndpplugin.pushdown_min_blocks", + "Sets the lower limit of pushdown pages..", + NULL, + &u_sess->ndp_cxt.pushdown_min_blocks, + 0, + 0, + INT_MAX / 1000, + PGC_USERSET, + GUC_CUSTOM_PLACEHOLDER, + NULL, + NULL, + NULL); + DefineCustomIntVariable("ndpplugin.ndp_port", + "Sets the ndp_port of ndp", + NULL, + &u_sess->ndp_cxt.ndp_port, + 8000, + 0, + 65535, + PGC_USERSET, + GUC_CUSTOM_PLACEHOLDER, + NULL, + NULL, + NULL); +#ifdef ENABLE_SSL + 
DefineCustomStringVariable("ndpplugin.ca_path", + "Client CA path", + NULL, + &u_sess->ndp_cxt.ca_path, + "./", + PGC_USERSET, + GUC_LIST_INPUT, + NULL, + NULL, + NULL); + DefineCustomStringVariable("ndpplugin.crl_path", + "Client crl path", + NULL, + &u_sess->ndp_cxt.crl_path, + "./", + PGC_USERSET, + GUC_LIST_INPUT, + NULL, + NULL, + NULL); +#endif +} + +static void knl_u_ndp_init(knl_u_ndp_context* ndp_cxt) +{ + if (ndp_cxt->mem_cxt != nullptr) { + return; + } + ndp_cxt->mem_cxt = AllocSetContextCreate(u_sess->top_mem_cxt, + "NdpSelfMemoryContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + MemoryContext oldContext = MemoryContextSwitchTo(u_sess->ndp_cxt.mem_cxt); + ndp_cxt->stats = (NdpStats*)palloc0(sizeof(NdpStats)); + ndp_cxt->cxt = nullptr; + InitializeNdpGUCOptions(); + MemoryContextSwitchTo(oldContext); +} + +/* + * Entrypoint of this extension + */ +void _PG_init(void) +{ + ereport(DEBUG2, (errmsg("init ndpplugin."))); + + pthread_mutex_lock(&g_ndp_instance.mutex); + +#ifdef GlobalCache + long long au_size; + const char *vg_name = g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name + 1; + int ret = dss_compare_size(vg_name, &au_size); + if (ret != 0 || au_size != DSS_DEFAULT_AU_SIZE) { + pthread_mutex_unlock(&g_ndp_instance.mutex); + ereport(WARNING, (errmsg("init ndpplugin failed, inconsistency between dss_ausize and ndpplugin_ausize!"))); + return; + } +#endif + NdpInstanceInit(); + pthread_mutex_unlock(&g_ndp_instance.mutex); + + if (HOOK_INIT == false) { + backup_ndp_pushdown_hook_type = ndp_pushdown_hook; + ndp_pushdown_hook = CheckAndSetNdpPlan; + + backup_ndp_tableam = ndp_tableam; + ndp_tableam = &ndp_tableam_apply; + + ndp_hook_ExecutorStart = ExecutorStart_hook; + ExecutorStart_hook = NdpExecutorStart; + ndp_hook_ExecutorEnd = ExecutorEnd_hook; + ExecutorEnd_hook = NdpExecutorEnd; + } + HOOK_INIT = true; + knl_u_ndp_init(&u_sess->ndp_cxt); +} + +void _PG_fini(void) +{ + ndp_pushdown_hook = 
backup_ndp_pushdown_hook_type;
+    ndp_tableam = backup_ndp_tableam;
+
+    ExecutorStart_hook = ndp_hook_ExecutorStart;
+    ExecutorEnd_hook = ndp_hook_ExecutorEnd;
+    /* _PG_init installs the hooks only while HOOK_INIT is false, so clear it for a later re-init */
+    HOOK_INIT = false;
+
+    /*
+     * stats is allocated inside mem_cxt, so both pointers dangle once the
+     * context is deleted. Reset them so knl_u_ndp_init re-creates the context
+     * and pushdown_statistics sees stats == NULL instead of freed memory.
+     */
+    if (u_sess->ndp_cxt.mem_cxt != nullptr) {
+        MemoryContextDelete(u_sess->ndp_cxt.mem_cxt);
+        u_sess->ndp_cxt.mem_cxt = nullptr;
+        u_sess->ndp_cxt.stats = nullptr;
+    }
+
+    pthread_mutex_lock(&g_ndp_instance.mutex);
+    if (g_ndp_instance.pageContextPtr) {
+        free(g_ndp_instance.pageContextPtr);
+        g_ndp_instance.pageContextPtr = nullptr;
+        delete g_ndp_instance.pageContext;
+        g_ndp_instance.pageContext = nullptr;
+    }
+    g_ndp_instance.status = UNINITIALIZED;
+    pthread_mutex_unlock(&g_ndp_instance.mutex);
+}
+
+/*
+ * For test ndpplugin push down functionality
+ *
+ * Returns one composite value with seven int8 columns filled from the counters
+ * in u_sess->ndp_cxt.stats. Emits a WARNING and returns SQL NULL when the
+ * session has no stats object.
+ */
+Datum pushdown_statistics(PG_FUNCTION_ARGS)
+{
+    if (u_sess->ndp_cxt.stats == NULL) {
+        ereport(WARNING, (errmsg("ndp init failed, the pushdown statistics can not be viewed")));
+        PG_RETURN_NULL();
+    }
+
+    const int cols = 7;
+
+    TupleDesc tupdesc = CreateTemplateTupleDesc(cols, true);
+
+    TupleDescInitEntry(tupdesc, (AttrNumber)1, "query", INT8OID, -1, 0);
+    TupleDescInitEntry(tupdesc, (AttrNumber)2, "total_pushdown_page", INT8OID, -1, 0);
+    TupleDescInitEntry(tupdesc, (AttrNumber)3, "back_to_gauss", INT8OID, -1, 0);
+    TupleDescInitEntry(tupdesc, (AttrNumber)4, "received_scan", INT8OID, -1, 0);
+    TupleDescInitEntry(tupdesc, (AttrNumber)5, "received_agg", INT8OID, -1, 0);
+    TupleDescInitEntry(tupdesc, (AttrNumber)6, "failed_backend_handle", INT8OID, -1, 0);
+    TupleDescInitEntry(tupdesc, (AttrNumber)7, "failed_sendback", INT8OID, -1, 0);
+
+    BlessTupleDesc(tupdesc);
+
+    /* values[] follows the column order declared above */
+    Datum values[cols];
+    values[0] = UInt64GetDatum(u_sess->ndp_cxt.stats->queryCounter);
+    values[1] = UInt64GetDatum(u_sess->ndp_cxt.stats->pushDownPage);
+    values[2] = UInt64GetDatum(u_sess->ndp_cxt.stats->sendBackPage);
+    values[3] = UInt64GetDatum(u_sess->ndp_cxt.stats->ndpPageScan);
+    values[4] = UInt64GetDatum(u_sess->ndp_cxt.stats->ndpPageAgg);
+    values[5] = UInt64GetDatum(u_sess->ndp_cxt.stats->failedIO);
+    values[6] = UInt64GetDatum(u_sess->ndp_cxt.stats->sendFailed);
+
+    bool nulls[cols] = {false,
false, false, false, false, false, false};
+
+    HeapTuple tuple = heap_form_tuple(tupdesc, values, nulls);
+    HeapTupleHeader result = (HeapTupleHeader)palloc(tuple->t_len);
+    int rc = memcpy_s(result, tuple->t_len, tuple->t_data, tuple->t_len);
+    /* memcpy_s reports failure through errno-style codes, which securec_check (not the _ss variant) inspects */
+    securec_check(rc, "\0", "\0");
+    ReleaseTupleDesc(tupdesc);
+
+    PG_RETURN_HEAPTUPLEHEADER(result);
+}
+
+/* test section end */
diff --git a/contrib/ndpplugin/ndpplugin.h b/contrib/ndpplugin/ndpplugin.h
new file mode 100644
index 000000000..dd4ab0966
--- /dev/null
+++ b/contrib/ndpplugin/ndpplugin.h
@@ -0,0 +1,45 @@
+/* -------------------------------------------------------------------------
+ * ndpplugin.h
+ *	  prototypes for functions in contrib/ndpplugin/ndpplugin.cpp
+ *
+ * Portions Copyright (c) 2022 Huawei Technologies Co.,Ltd.
+ *
+ * IDENTIFICATION
+ *	  contrib/ndpplugin/ndpplugin.h
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifndef NDPPLUGIN_NDPPLUGIN_H
+#define NDPPLUGIN_NDPPLUGIN_H
+
+/* NOTE(review): the header name of the following #include is missing from this patch text -- restore it. */
+#include 
+#include "utils/palloc.h"
+
+extern "C" void _PG_init(void);
+extern "C" void _PG_fini(void);
+extern "C" void ndpplugin_invoke(void);
+extern "C" Datum pushdown_statistics(PG_FUNCTION_ARGS);
+
+#define NDP_ASYNC_RPC
+#define NDP_RPC_IP_LEN 16
+#define NDP_RPC_WAIT_USEC 10
+
+typedef enum NdpInstanceContextStatus {
+    UNINITIALIZED,
+    INITIALIZED
+} NdpInstanceContextStatus;
+
+
+
+typedef struct NdpInstanceContext {
+    pthread_mutex_t mutex;
+    volatile NdpInstanceContextStatus status;
+    MpmcBoundedQueue* pageContext;
+    void* pageContextPtr;
+} NdpInstanceContext;
+
+extern NdpInstanceContext g_ndp_instance;
+
+#endif //NDPPLUGIN_NDPPLUGIN_H
diff --git a/contrib/ndpplugin/rpc.cpp b/contrib/ndpplugin/rpc.cpp
new file mode 100644
index 000000000..f6072ed52
--- /dev/null
+++ b/contrib/ndpplugin/rpc.cpp
@@ -0,0 +1,615 @@
+/*
+ * Copyright (c) 2021 Huawei Technologies Co.,Ltd.
+ *
+ * openGauss is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *
+ *          http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * -------------------------------------------------------------------------
+ *
+ * rpc.cpp
+ *
+ * IDENTIFICATION
+ *	  contrib/ndpplugin/rpc.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+/* NOTE(review): the header name of the following #include is missing from this patch text -- restore it. */
+#include 
+#include "utils/dynloader.h"
+#include "component/rpc/rpc.h"
+
+#ifdef NDP_CLIENT
+#include "utils/elog.h"
+#include "knl/knl_session.h"
+#else
+#include "utils/log.h"
+#include "utils/config.h"
+#include "ndp/ndp.h"
+#include "securec_check.h"
+#endif
+
+/*
+ * Early-return helpers: leave the calling function with RPC_ERROR unless the
+ * expression evaluates to STATUS_OK / RPC_OK. Wrapped in do { } while (0) so
+ * they behave as one statement next to if/else.
+ */
+#define CHECK_RPC_STATUS(status)        \
+    do {                                \
+        if ((status) != STATUS_OK) {    \
+            return RPC_ERROR;           \
+        }                               \
+    } while (0)
+#define CHECK_NDP_RPC_STATUS(status)    \
+    do {                                \
+        if ((status) != RPC_OK) {       \
+            return RPC_ERROR;           \
+        }                               \
+    } while (0)
+
+#ifdef ENABLE_SSL
+#define OCK_RPC_CONFIG_USE_SSL_CALLBACK (1ul << (2))
+typedef uintptr_t OckRpcServerContext;
+using OckRpcServerCtxBuilderHandler = OckRpcServerContext (*)(RpcServer server);
+using OckRpcServerCtxCleanupHandler = void (*)(RpcServer server, OckRpcServerContext ctx);
+
+/** @brief TLS callbacks */
+/**
+ * @brief Keypass erase function
+ * @param keypass the memory address of keypass
+ */
+using OckRpcTlsKeypassErase = void (*)(char *keypass);
+
+/**
+ * @brief Get private key file's path and length, and get the keypass
+ * @param priKeyPath the path of private key
+ * @param keypass the keypass
+ * @param erase the erase function
+ */
+using OckRpcTlsGetPrivateKey = void (*)(const char **priKeyPath, char **keypass, OckRpcTlsKeypassErase *erase);
+/**
+ * @brief Get the certificate file of public key
+ * @param certPath the path of certificate
+ */
+using OckRpcTlsGetCert = void (*)(const char **certPath);
+
+/**
+ * @brief The cert verify function
+ * @param x509 the X509_STORE_CTX object of CA
+ * @param crlPath the crl file path
+ *
+ * @return -1 for failed, and 1 for success
+ */
+using OckRpcTlsCertVerify = int (*)(void *x509, const char *crlPath);
+/**
+ * @brief Get the CA and verify
+ * @param caPath the path of CA file
+ * @param crlPath the crl file path
+ * @param verify the verify function
+ */
+using OckRpcTlsGetCAAndVerify = void (*)(const char **caPath, const char **crlPath, OckRpcTlsCertVerify *verify);
+
+typedef struct {
+    /* Must enable special bit before you set config value OckRpcCreateConfigMask */
+    uint64_t mask;
+
+    /* Set Key-Value mode to config, must enable OCK_RPC_CONFIG_USE_RPC_CONFIGS */
+    RpcConfigs configs;
+
+    /* Set user define Server Ctx build and cleanup handler, must enable OCK_RPC_CONFIG_USE_SERVER_CTX_BUILD */
+    OckRpcServerCtxBuilderHandler serverCtxbuilder;
+    OckRpcServerCtxCleanupHandler serverCtxCleanup;
+
+    /**
+     * Set SSL handler, must enable OCK_RPC_CONFIG_USE_SSL_CALLBACK
+     *
+     * In Server side getCert and getPriKey can't be nullptr
+     * In Client side getCaAndVerify can't be nullptr
+     */
+    OckRpcTlsGetCAAndVerify getCaAndVerify; /* get the CA path and verify callback. */
+    OckRpcTlsGetCert getCert;               /* get the certificate file of public key */
+    OckRpcTlsGetPrivateKey getPriKey;       /* get the private key and keypass */
+} OckRpcCreateConfig;
+
+#ifdef NDP_CLIENT
+using ClientConnectWithCfg = RpcStatus (*)(const char* ip, uint16_t port, RpcClient* client, OckRpcCreateConfig* cfg);
+#else
+using ServerCreateWithCfg = RpcStatus (*)(const char* ip, uint16_t port, RpcServer* server, OckRpcCreateConfig* cfg);
+#endif
+
+#endif
+
+#ifdef NDP_CLIENT
+/* Signatures of the client entry points stored in g_rpcUcxFunc */
+using ClientConnect = RpcStatus (*)(const char *ip, uint16_t port, RpcClient *client);
+using ClientDisconnect = void (*)(RpcClient client);
+using ClientCall = RpcStatus (*)(RpcClient client, uint16_t msgId, RpcMessage *request, RpcMessage *response,
+    RpcCallDone *done);
+using ClientSetTimeout = void (*)(RpcClient client, int64_t timeout);
+typedef struct RpcUcxFunc {
+    ClientConnect clientConnect;
+#ifdef ENABLE_SSL
+    ClientConnectWithCfg clientConnectWithCfg;
+#endif
+    ClientDisconnect clientDisconnect;
+    ClientCall clientCall;
+    ClientSetTimeout clientSetTimeout;
+} RpcUcxFunc;
+
+#else
+/* Signatures of the server entry points stored in g_rpcUcxFunc */
+using ServerCreate = RpcStatus (*)(const char *ip, uint16_t port, RpcServer *server);
+using ServerAddService = RpcStatus (*)(RpcServer server, RpcService *service);
+using ServerStart = RpcStatus (*)(RpcServer server);
+using ServerDestroy = void (*)(RpcServer server);
+using ServerReply = RpcStatus (*)(RpcServerContext ctx, uint16_t msgId, RpcMessage *reply, RpcCallDone *done);
+using ServerCleanupCtx = void (*)(RpcServerContext ctx);
+typedef struct RpcUcxFunc {
+    ServerCreate serverCreate;
+#ifdef ENABLE_SSL
+    ServerCreateWithCfg serverCreateWithCfg;
+#endif
+    ServerAddService serverAddService;
+    ServerStart serverStart;
+    ServerDestroy serverDestroy;
+    ServerReply serverReply;
+    ServerCleanupCtx serverCleanCtx;
+} RpcUcxFunc;
+#endif
+
+/* Signatures of the "ULOG_Init" and "SetOpensslDLopenLibPath" symbols looked up by name */
+using ULOG_Init = void (*)(int x, int y, std::nullptr_t ptr, int z, int i);
+using SetOpensslDLopenLibPath = int (*)(const char *ssl, const char *crypto);
+
+/* value handed to the library's SetTimeout entry point after a client connects; unit not visible here */
+constexpr int64_t REPLY_TIMEOUT = 60000;
+
+/* handle of the opened rpc library and the entry points resolved from it */
+void *g_rpcUcxDl = nullptr;
+RpcUcxFunc g_rpcUcxFunc;
+
+#ifdef ENABLE_SSL
+#ifdef NDP_CLIENT
+/* Certificate verify callback given to the RPC library; always reports success (1). */
+int tlsCertVerify(void *x509, const char *crlPath)
+{
+    // rpc has basic verify, we don't add extra verify process, so return true directly
+    return 1;
+}
+
+/* Client TLS callback: hands out the session's CA path, CRL path and the verify callback. */
+void GetCAAndVerify(const char **caPath, const char **crlPath, OckRpcTlsCertVerify *verify)
+{
+    *caPath = u_sess->ndp_cxt.ca_path;
+    *crlPath = u_sess->ndp_cxt.crl_path;
+    *verify = tlsCertVerify;
+    return;
+}
+#else
+/* Erase callback paired with GetPrivateKey: releases the malloc'ed keypass copy. */
+void KeypassErase(char *keypass)
+{
+    if (keypass != nullptr) {
+        free(keypass);
+    }
+}
+/* Server TLS callback: hands out the configured certificate path. */
+void GetCert(const char **certPath)
+{
+    *certPath = configSets->certPath.c_str();
+}
+/*
+ * Server TLS callback: hands out the configured private-key path, a malloc'ed
+ * copy of the keypass and the function that frees that copy. When the
+ * allocation fails, *keypass is left null and nothing is copied.
+ */
+void GetPrivateKey(const char **priKeyPath, char **keypass, OckRpcTlsKeypassErase *erase)
+{
+    *priKeyPath = configSets->priKeyPath.c_str();
+    *erase = KeypassErase;
+    size_t keypassSize = configSets->keypass.length() + 1; /* include the terminating NUL */
+    *keypass = (char*)malloc(keypassSize);
+    if (*keypass == nullptr) {
+        LOG_ERROR << "malloc failed, keypass copy failed.";
+        /* there is no buffer to copy into; do not hand a null destination to memcpy_s */
+        return;
+    }
+    // keypass need encrypt further
+    errno_t rc = memcpy_s(*keypass, keypassSize, configSets->keypass.c_str(), keypassSize);
+    securec_check(rc, "", "");
+}
+#endif
+
+/*
+ * Open the library at sslDlPath and call its SetOpensslDLopenLibPath entry
+ * point with the ssl and crypto library paths.
+ *
+ * Returns RPC_ERROR when any path is null or the library or symbol cannot be
+ * loaded. Returns RPC_OK without doing anything when g_rpcUcxDl is already set.
+ */
+RpcStatus InitSslDl(char *sslDlPath, char* sslPath, char* cryptoPath)
+{
+    if (sslDlPath == NULL || sslPath == NULL || cryptoPath == NULL) {
+#ifdef NDP_CLIENT
+        ereport(WARNING, (errmsg("InitRpcDl failed, path is null")));
+#else
+        LOG_ERROR << "InitRpcDl failed, path is null";
+#endif
+        return RPC_ERROR;
+    }
+
+    if (g_rpcUcxDl != NULL) {
+        return RPC_OK;
+    }
+
+    /* load the openssl loader library */
+    void *sslDl;
+    CHECK_RPC_STATUS(OpenDl(&sslDl, sslDlPath));
+
+    /* look up the setter; close the library again when the symbol is missing */
+    SetOpensslDLopenLibPath setSSLDlPath;
+    if (LoadSymbol(sslDl, "SetOpensslDLopenLibPath", (void **)&setSSLDlPath) != STATUS_OK) {
+        CloseDl(sslDl);
+        return RPC_ERROR;
+    }
+    /* pass the caller's paths, not string literals carrying the parameter names */
+    /* NOTE(review): the int result is still ignored; its success convention is not visible here */
+    setSSLDlPath(sslPath, cryptoPath);
+
+    return RPC_OK;
+}
+
+#endif
+
+/* Open the rpc library at `path` into g_rpcUcxDl; a no-op when it is already open. */
+RpcStatus InitRpcDl(char *path)
+{
+    if (path == nullptr) {
+#ifdef NDP_CLIENT
+        ereport(WARNING,
(errmsg("dlopen rpc_ucx path is nullptr")));
+#else
+        LOG_ERROR << "dlopen rpc_ucx path is nullptr";
+#endif
+        return RPC_ERROR;
+    }
+
+    if (g_rpcUcxDl != nullptr) {
+        return RPC_OK;
+    }
+
+    CHECK_RPC_STATUS(OpenDl(&g_rpcUcxDl, path));
+
+    return RPC_OK;
+}
+
+/**
+ * load ulog from so, only need to use once, before InitRpcDl
+ *
+ * Opens the library, calls its ULOG_Init entry point and closes the library
+ * again, also when the entry point cannot be found.
+ * @param ulogPath path of the ulog shared library
+ * @return RPC_OK on success, RPC_ERROR when the path is null or loading fails
+ */
+RpcStatus LoadUlog(char* ulogPath)
+{
+    if (ulogPath == nullptr) {
+#ifdef NDP_CLIENT
+        ereport(WARNING, (errmsg("dlopen ulog path is nullptr")));
+#else
+        LOG_ERROR << "dlopen ulog path is nullptr";
+#endif
+        return RPC_ERROR;
+    }
+
+    /* load ulog */
+    void *ulog;
+    CHECK_RPC_STATUS(OpenDl(&ulog, ulogPath));
+
+    /* init ulog; release the handle before bailing out so it is not leaked */
+    ULOG_Init ulogInit;
+    if (LoadSymbol(ulog, "ULOG_Init", (void **)&ulogInit) != STATUS_OK) {
+        CloseDl(ulog);
+        return RPC_ERROR;
+    }
+
+    ulogInit(0, 3, nullptr, 0, 0);
+    CloseDl(ulog);
+    return RPC_OK;
+}
+
+/*
+ * Load everything the RPC layer depends on, in order: ulog, the openssl loader
+ * (ENABLE_SSL builds only) and the rpc library itself. Stops with RPC_ERROR at
+ * the first step that fails.
+ */
+RpcStatus InitRpcEnv(DependencePath paths)
+{
+    CHECK_NDP_RPC_STATUS(LoadUlog(paths.ulogPath));
+
+#ifdef ENABLE_SSL
+    CHECK_NDP_RPC_STATUS(InitSslDl(paths.sslDLPath, paths.sslPath, paths.cryptoPath));
+#endif
+
+    CHECK_NDP_RPC_STATUS(InitRpcDl(paths.rpcPath));
+
+    return RPC_OK;
+}
+
+#ifndef NDP_CLIENT
+/* Resolve every server entry point from g_rpcUcxDl into g_rpcUcxFunc; RPC_ERROR on the first miss. */
+static RpcStatus RpcServerDlsym(void)
+{
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcServerCreate", (void **)&g_rpcUcxFunc.serverCreate));
+
+#ifdef ENABLE_SSL
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcServerCreateWithCfg", (void **)&g_rpcUcxFunc.serverCreateWithCfg));
+#endif
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcServerAddService", (void **)&g_rpcUcxFunc.serverAddService));
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcServerStart", (void **)&g_rpcUcxFunc.serverStart));
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcServerDestroy", (void **)&g_rpcUcxFunc.serverDestroy));
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcServerReply", (void **)&g_rpcUcxFunc.serverReply));
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcServerCleanupCtx", (void **)&g_rpcUcxFunc.serverCleanCtx));
+
+    return RPC_OK;
+}
+
+/* Placeholder for server-side configuration; currently always succeeds. */
+RpcStatus InitRpcServerConfig()
+{
+    return RPC_OK;
+}
+
+/*
+ * Load the RPC libraries, resolve the server entry points and create the
+ * server on ctx.ip / ctx.port, storing the new handle in ctx.serverHandle.
+ * A handle already present in ctx is destroyed first. On failure after the
+ * library was opened, the library is closed again and RPC_ERROR is returned.
+ */
+RpcStatus InitRpcServer(KnlRpcContext& ctx, DependencePath paths)
+{
+    // load dl
+    CHECK_NDP_RPC_STATUS(InitRpcEnv(paths));
+
+    // load server functions
+    if (RpcServerDlsym() != RPC_OK) {
+        LOG_ERROR << "dlsym rpc server func, path";
+        CloseDl(g_rpcUcxDl);
+        g_rpcUcxDl = nullptr;
+        return RPC_ERROR;
+    }
+
+    if (ctx.serverHandle != 0) {
+        g_rpcUcxFunc.serverDestroy(ctx.serverHandle);
+        /* do not keep the destroyed handle around if the create below fails */
+        ctx.serverHandle = 0;
+    }
+
+    CHECK_NDP_RPC_STATUS(InitRpcServerConfig());
+
+#ifdef ENABLE_SSL
+    /* value-initialize so the members not assigned below are not passed on with indeterminate contents */
+    OckRpcCreateConfig cfg = {};
+    cfg.mask = OCK_RPC_CONFIG_USE_SSL_CALLBACK;
+    cfg.getCaAndVerify = nullptr;
+    cfg.getCert = GetCert;
+    cfg.getPriKey = GetPrivateKey;
+    RpcStatus status = g_rpcUcxFunc.serverCreateWithCfg(ctx.ip, ctx.port, &ctx.serverHandle, &cfg);
+#else
+    RpcStatus status = g_rpcUcxFunc.serverCreate(ctx.ip, ctx.port, &ctx.serverHandle);
+#endif
+    if (status != RPC_OK) {
+        LOG_ERROR << "OckRpcServerCreate failed, ip " << ctx.ip << " port " << ctx.port;
+        CloseDl(g_rpcUcxDl);
+        g_rpcUcxDl = nullptr;
+        return RPC_ERROR;
+    }
+    return RPC_OK;
+}
+
+/*
+ * Service handler for RPC_ADMIN_REQ.
+ *
+ * Passes the request to NdpAdminProc and replies with an NdpAdminResponse
+ * whose ret is NDP_OK, NDP_ERR (error caught) or NDP_ILLEGAL (default, also
+ * used when the message is too short to hold the request head). The server
+ * context is cleaned up in every case.
+ */
+static void RpcAdminProc(RpcServerContext handle, RpcMessage msg)
+{
+    NdpAdminRequest *header = (NdpAdminRequest *)msg.data;
+    NdpAdminResponse resp;
+    size_t size = offsetof(NdpAdminResponse, queryId); // just send ret default
+    resp.ret = NDP_ILLEGAL;
+
+    if (msg.data == nullptr || msg.len < offsetof(NdpAdminRequest, head) + sizeof(header->head)) {
+        /* the peer sent fewer bytes than the request head; do not read head.command from it */
+        LOG_ERROR << "rpc admin message is too short, len " << msg.len;
+    } else {
+        NDP_PG_TRY();
+        {
+            if (!NdpAdminProc(header, resp, size)) {
+                LOG_DEBUG << "rpc admin message is received successfully, "
+                          << "admin command is " << (int)(header->head.command);
+                resp.ret = NDP_OK;
+            }
+        }
+        NDP_PG_CATCH();
+        {
+            LOG_INFO << "rpc admin message is received failed, "
+                     << "admin command is " << (int)(header->head.command);
+            resp.ret = NDP_ERR;
+        }
+        NDP_PG_END_TRY();
+    }
+
+    RpcMessage reply = {.data = (void*)&resp, .len = size};
+
+    if (g_rpcUcxFunc.serverReply(handle, RPC_ADMIN_REQ, &reply, nullptr) != RPC_OK) {
+        LOG_ERROR << "send reply failed";
+    }
+
+    g_rpcUcxFunc.serverCleanCtx(handle);
+}
+
+RpcStatus
SendIOTaskErrReply(NdpIOTask* task, NDP_ERRNO error)
+{
+    /*
+     * Reply to an IO request with a bare NdpIOResponse carrying `error`, clean
+     * up the server context and delete the task. Returns the status of the
+     * reply call.
+     */
+    NdpIOResponse res;
+    res.status = error;
+    /* send the whole response header; with len 0 the status would never reach the peer */
+    RpcMessage reply = {.data = &res, .len = sizeof(NdpIOResponse)};
+    RpcStatus status = g_rpcUcxFunc.serverReply(task->handle, RPC_IO_REQ, &reply, nullptr);
+    if (status != RPC_OK) {
+        LOG_WARN << "send reply failed";
+    }
+    g_rpcUcxFunc.serverCleanCtx(task->handle);
+    delete task;
+    return status;
+}
+#ifdef FAULT_INJECT
+/*
+ * Fault injection: bump the IO counter of the task's plan when it is tracked,
+ * sleep for a random time with a configured probability, then fail the task
+ * with ERR_AIO_FAILED. The task is deleted by SendIOTaskErrReply.
+ */
+static void IOInject(NdpIOTask* &task)
+{
+    auto iter = injectPlanVarMap.find(task->header->taskId);
+    if (iter != injectPlanVarMap.end()) {
+        iter->second->ioCount.fetch_add(1, std::memory_order_relaxed);
+    }
+    // timeout inject
+    if ((rand() % PERCENTAGE_DIV) < PERCENTAGE) {
+        sleep((rand() % PERCENTAGE_DIV));
+    }
+    SendIOTaskErrReply(task, ERR_AIO_FAILED);
+}
+#endif
+
+/*
+ * Service handler for RPC_IO_REQ: wraps the message in a new NdpIOTask and
+ * either submits it for asynchronous read (NDP_ASYNC_CEPH) or queues it on the
+ * global worker manager.
+ */
+static void RpcIOProc(RpcServerContext handle, RpcMessage msg)
+{
+    NdpIOTask* task = new NdpIOTask(handle, reinterpret_cast<NdpIORequest*>(msg.data));
+#ifdef NDP_ASYNC_CEPH
+    if (!SubmitAioReadData(task)) {
+        LOG_DEBUG << "rpc IO message is received successfully.";
+    } else {
+        SendIOTaskErrReply(task, ERR_AIO_FAILED);
+    }
+#else
+    globalWorkerManager->AddTask(task);
+#endif
+}
+
+/*
+ * Process one queued IO task: run NdpIOProc on its request, send the reply,
+ * clean up the server context and delete the task. Returns the status of the
+ * reply call.
+ */
+RpcStatus RpcIOTaskHandler(NdpIOTask* task)
+{
+#ifdef FAULT_INJECT
+    if ((rand() % PERCENTAGE_DIV) < PERCENTAGE) {
+        IOInject(task);
+        return RPC_ERROR;
+    }
+#endif
+    RpcServerContext handle = task->handle;
+    NdpIORequest *header = task->header;
+#ifdef NDP_ASYNC_CEPH
+    t_thrd.ndpWorkerCtx->scanPages = task->aioDesc->readBuf;
+#endif
+
+    /* fallback response used when NdpIOProc produced no reply buffer */
+    NdpIOResponse res;
+    res.status = NDP_ILLEGAL;
+    Status ioStatus;
+
+    RpcMessage reply = {.data = nullptr, .len = 0};
+
+    NDP_PG_TRY();
+    {
+        ioStatus = NdpIOProc(header, &reply);
+        if (reply.data) {
+            LOG_DEBUG << "ndpworker " << pthread_self() << " successful handle "
+                      << reinterpret_cast<NdpIOResponse*>(reply.data)->ndpPageNums << " ndppages.";
+            reinterpret_cast<NdpIOResponse*>(reply.data)->status = NDP_OK;
+        } else {
+            LOG_DEBUG << "ndpworker " << pthread_self() << " handle 0 pages";
+        }
+    }
+    NDP_PG_CATCH();
+    {
+        ioStatus = STATUS_ERROR;
+    }
+    NDP_PG_END_TRY();
+
+    if (ioStatus != STATUS_OK) {
+        
reply.len = sizeof(NdpIOResponse);
+        if (reply.data == nullptr) {
+            res.status = NDP_ERR;
+            reply.data = &res;
+        } else {
+            reinterpret_cast<NdpIOResponse*>(reply.data)->status = NDP_ERR;
+        }
+    }
+
+    RpcStatus status = g_rpcUcxFunc.serverReply(handle, RPC_IO_REQ, &reply, nullptr);
+    if (status != RPC_OK) {
+        LOG_WARN << "send reply failed";
+    }
+
+    g_rpcUcxFunc.serverCleanCtx(handle);
+    delete task;
+    return status;
+}
+
+/*
+ * Register the admin and IO service handlers on the server stored in
+ * ndp_instance.rpcContext. Returns RPC_ERROR when there is no server handle or
+ * a registration fails.
+ */
+static RpcStatus RegisterRpcProcFunc(void)
+{
+    RpcServer server = ndp_instance.rpcContext.serverHandle;
+    if (server == 0) {
+        LOG_ERROR << "register rpc proc func failed, server handler:" << server;
+        return RPC_ERROR;
+    }
+
+    RpcService adminService = {.id = RPC_ADMIN_REQ, .handler = RpcAdminProc};
+    RpcStatus rpcStatus = g_rpcUcxFunc.serverAddService(server, &adminService);
+    if (rpcStatus != RPC_OK) {
+        LOG_ERROR << "add service RPC_ADMIN_REQ failed, status = " << rpcStatus;
+        return RPC_ERROR;
+    }
+
+    RpcService ioService = {.id = RPC_IO_REQ, .handler = RpcIOProc};
+    rpcStatus = g_rpcUcxFunc.serverAddService(server, &ioService);
+    if (rpcStatus != RPC_OK) {
+        LOG_ERROR << "add service RPC_IO_REQ failed, status = " << rpcStatus;
+        return RPC_ERROR;
+    }
+
+    return RPC_OK;
+}
+
+/*
+ * Bring up the RPC server: reset ndp_instance, take ip/port from configSets,
+ * create the server, register the handlers and start it.
+ * Returns RPC_OK on success; RPC_ERROR or the start status otherwise.
+ */
+RpcStatus RpcServerInit(void)
+{
+    RpcStatus rpcStatus;
+
+    memset(&ndp_instance, 0, sizeof(ndp_instance));
+    DependencePath paths;
+
+    paths.ulogPath = LIB_ULOG;
+    paths.rpcPath = LIB_RPC_UCX;
+    paths.sslDLPath = LIB_OPENSSL_DL;
+    paths.sslPath = LIB_SSL;
+    paths.cryptoPath = LIB_CRYPTO;
+
+    /* refuse an address that does not fit the destination instead of overflowing it */
+    const char *cfgIp = configSets->ip.c_str();
+    size_t cfgIpLen = strlen(cfgIp);
+    if (cfgIpLen >= sizeof(ndp_instance.rpcContext.ip)) {
+        LOG_ERROR << "rpc server ip is too long: " << cfgIp;
+        return RPC_ERROR;
+    }
+    memcpy(ndp_instance.rpcContext.ip, cfgIp, cfgIpLen + 1); /* + 1 copies the terminating NUL */
+    ndp_instance.rpcContext.port = configSets->port;
+
+    CHECK_NDP_RPC_STATUS(InitRpcServer(ndp_instance.rpcContext, paths));
+
+    CHECK_NDP_RPC_STATUS(RegisterRpcProcFunc());
+
+    rpcStatus = g_rpcUcxFunc.serverStart(ndp_instance.rpcContext.serverHandle);
+    if (rpcStatus != RPC_OK) {
+        LOG_ERROR << "RpcServerStart failed";
+        return rpcStatus;
+    }
+
+    return rpcStatus;
+}
+#else
+
+/* Resolve every client entry point from g_rpcUcxDl into g_rpcUcxFunc; RPC_ERROR on the first miss. */
+static RpcStatus RpcClientDlsym(void)
+{
+    
CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcClientConnect", (void **)&g_rpcUcxFunc.clientConnect));
+
+#ifdef ENABLE_SSL
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcClientConnectWithCfg", (void **)&g_rpcUcxFunc.clientConnectWithCfg));
+#endif
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcClientDisconnect", (void **)&g_rpcUcxFunc.clientDisconnect));
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcClientCall", (void **)&g_rpcUcxFunc.clientCall));
+
+    CHECK_RPC_STATUS(LoadSymbol(g_rpcUcxDl, "OckRpcClientSetTimeout", (void **)&g_rpcUcxFunc.clientSetTimeout));
+
+    return RPC_OK;
+}
+
+/*
+ * Load the RPC libraries and resolve the client entry points. When a symbol is
+ * missing the rpc library is closed again and RPC_ERROR is returned.
+ */
+RpcStatus RpcClientInit(DependencePath& paths)
+{
+    // load dl
+    CHECK_NDP_RPC_STATUS(InitRpcEnv(paths));
+
+    if (RpcClientDlsym() != RPC_OK) {
+        CloseDl(g_rpcUcxDl);
+        g_rpcUcxDl = nullptr;
+        return RPC_ERROR;
+    }
+
+    return RPC_OK;
+}
+
+/*
+ * Connect to ip:port and store the connection in clientHandle. On success the
+ * reply timeout of the new connection is set to REPLY_TIMEOUT; on failure the
+ * library's status is logged and returned.
+ */
+RpcStatus RpcClientConnect(char *ip, uint16_t port, RpcClient& clientHandle)
+{
+#ifdef ENABLE_SSL
+    /* value-initialize so the members not assigned below are not passed on with indeterminate contents */
+    OckRpcCreateConfig cfg = {};
+    cfg.mask = OCK_RPC_CONFIG_USE_SSL_CALLBACK;
+    cfg.getCaAndVerify = GetCAAndVerify;
+    cfg.getCert = nullptr;
+    cfg.getPriKey = nullptr;
+    RpcStatus rpcStatus = g_rpcUcxFunc.clientConnectWithCfg(ip, port, &clientHandle, &cfg);
+#else
+    RpcStatus rpcStatus = g_rpcUcxFunc.clientConnect(ip, port, &clientHandle);
+#endif
+    if (rpcStatus != RPC_OK) {
+        ereport(LOG, (errmsg("RpcClientConnect failed, ip: %s, port: %d", ip, port)));
+        return rpcStatus;
+    }
+    g_rpcUcxFunc.clientSetTimeout(clientHandle, REPLY_TIMEOUT);
+
+    return RPC_OK;
+}
+
+/* Close the connection behind clientHandle and log the completion. */
+void RpcClientDisconnect(RpcClient clientHandle)
+{
+    g_rpcUcxFunc.clientDisconnect(clientHandle);
+    ereport(LOG, (errmsg("RpcClientDisconnect complete.")));
+}
+
+// size is for expand NdpAdminResponse
+// Sends req (req->head.size bytes) as RPC_ADMIN_REQ; resp->ret is preset to NDP_ILLEGAL before the call.
+RpcStatus RpcSendAdminReq(NdpAdminRequest* req, NdpAdminResponse* resp, size_t size, RpcClient clientHandle)
+{
+    RpcMessage request = {.data = (void*)req, .len = req->head.size};
+    RpcMessage response = {.data = (void*)resp, .len = size};
+
+    resp->ret = NDP_ILLEGAL;
+    RpcStatus
rpcStatus = g_rpcUcxFunc.clientCall(clientHandle, RPC_ADMIN_REQ, &request, &response, nullptr);
+
+    return rpcStatus;
+}
+
+/* Issue an RPC_IO_REQ call with the given messages and completion; a failing status is logged and returned. */
+RpcStatus RpcSendIOReq(RpcMessage* request, RpcMessage* response, RpcCallDone* done, RpcClient clientHandle)
+{
+    RpcStatus rpcStatus = g_rpcUcxFunc.clientCall(clientHandle, RPC_IO_REQ, request, response, done);
+    if (rpcStatus != RPC_OK) {
+        ereport(WARNING, (errmsg("RpcSendIOReq failed. Error code: %d", rpcStatus)));
+    }
+    return rpcStatus;
+}
+
+#endif
diff --git a/contrib/ndpplugin/utils/dynloader.h b/contrib/ndpplugin/utils/dynloader.h
new file mode 100644
index 000000000..0233ef9ad
--- /dev/null
+++ b/contrib/ndpplugin/utils/dynloader.h
@@ -0,0 +1,25 @@
+/* -------------------------------------------------------------------------
+ *
+ * dynloader.h
+ *	  Helpers for opening a shared library, resolving a symbol in it and
+ *	  closing it again. OpenDl and LoadSymbol report through Status; their
+ *	  callers compare the result with STATUS_OK.
+ *
+ * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * contrib/ndpplugin/utils/dynloader.h
+ *
+ * -------------------------------------------------------------------------
+ */
+#ifndef DYNAMIC_LOADER_H
+#define DYNAMIC_LOADER_H
+
+#include "common.h"
+
+Status LoadSymbol(void *libHandle, char *symbol, void **symbolHandle);
+
+Status OpenDl(void **libHandle, char *symbol);
+
+void CloseDl(void *libHandle);
+
+#endif /* DYNAMIC_LOADER_H */
diff --git a/src/common/backend/nodes/copyfuncs.cpp b/src/common/backend/nodes/copyfuncs.cpp
index 15acdadf4..07fbbb484 100644
--- a/src/common/backend/nodes/copyfuncs.cpp
+++ b/src/common/backend/nodes/copyfuncs.cpp
@@ -191,6 +191,41 @@ static PlannedStmt* _copyPlannedStmt(const PlannedStmt* from)
     return newnode;
 }
 
+/*
+ * CopyNdpPlan - copy the NDP pushdown fields of a plan node and point the copied condition's plan link at the copy
+ */
+static void CopyNdpPlan(const Plan* from, Plan* newnode)
+{
+    COPY_SCALAR_FIELD(ndp_pushdown_optimized);
+    COPY_NODE_FIELD(ndp_pushdown_condition);
+    NdpScanCondition* newcond = (NdpScanCondition*)newnode->ndp_pushdown_condition;
+    NdpScanCondition* fromcond = (NdpScanCondition*)from->ndp_pushdown_condition;
+    if (IsA(newnode, SeqScan)) {
+        if (newcond != nullptr) {
+            newcond->plan = newnode;
+        }
+        return;
+    }
+    if (!IsA(from, Agg) || !IsA(newnode, Agg)) {
+        return;
+    }
+    if ((from->lefttree == nullptr || from->righttree != nullptr) ||
+        (newnode->lefttree == nullptr || newnode->righttree !=nullptr)) {
+        return;
+    }
+    if (!IsA(from->lefttree, SeqScan) || !IsA(newnode->lefttree, SeqScan)) {
+        return;
+    }
+    newcond = (NdpScanCondition *)newnode->lefttree->ndp_pushdown_condition;
+    fromcond = (NdpScanCondition *)from->lefttree->ndp_pushdown_condition;
+    if (fromcond == nullptr || newcond == nullptr) {
+        return;
+    }
+    if (fromcond->plan == from) {
+        newcond->plan = newnode;
+    }
+}
+
 /*
  * CopyPlanFields
  *
@@ -241,6 +276,7 @@ static void CopyPlanFields(const Plan* from, Plan* newnode)
     COPY_SCALAR_FIELD(pred_startup_time);
     COPY_SCALAR_FIELD(pred_total_time);
     COPY_SCALAR_FIELD(pred_max_memory);
+    CopyNdpPlan(from, newnode);
     newnode->rightRefState = CopyRightRefState(from->rightRefState);
 }
 
@@ -260,6 +296,18 @@ static Plan* _copyPlan(const Plan* from)
     return newnode;
 }
 
+/*
+ * _copyNdp - copy tableId and ctx of an NdpScanCondition; the plan link is left null
+ */
+static NdpScanCondition* _copyNdp(const NdpScanCondition* from)
+{
+    NdpScanCondition* newnode = makeNode(NdpScanCondition);
+    COPY_SCALAR_FIELD(tableId);
+    COPY_SCALAR_FIELD(ctx);
+    newnode->plan = nullptr;
+    return newnode;
+}
+
 /*
  * _copyResult
  */
@@ -7397,6 +7445,9 @@ void* copyObject(const void* from)
         case T_Plan:
             retval = _copyPlan((Plan*)from);
             break;
+        case T_NdpScanCondition:
+            retval = _copyNdp((NdpScanCondition*)from);
+            break;
         case T_BaseResult:
             retval = _copyResult((BaseResult*)from);
             break;
diff --git a/src/gausskernel/ddes/ddes_commit_id b/src/gausskernel/ddes/ddes_commit_id
index 088e13fa5..448edc022 100644
--- a/src/gausskernel/ddes/ddes_commit_id
+++ b/src/gausskernel/ddes/ddes_commit_id
@@ -1,2 +1,2 @@
 dms_commit_id=384f13a3f0d080c85259b2c3a47def7d3c18c0a6
-dss_commit_id=0624291f4495300ada7375b57b49ff33be84f1b2
\ No newline at end of file
+dss_commit_id=2db80e7f65b63c8412f97086cbd64340505075aa diff --git a/src/gausskernel/optimizer/commands/dropcmds.cpp b/src/gausskernel/optimizer/commands/dropcmds.cpp index daeba1379..28a0d4aa5 100644 --- a/src/gausskernel/optimizer/commands/dropcmds.cpp +++ b/src/gausskernel/optimizer/commands/dropcmds.cpp @@ -78,6 +78,7 @@ static void DropExtensionInListIsSupported(List* objname) "drop", "postgis", "packages", + "ndpplugin", #ifndef ENABLE_MULTIPLE_NODES "mysql_fdw", "oracle_fdw", diff --git a/src/gausskernel/optimizer/commands/explain.cpp b/src/gausskernel/optimizer/commands/explain.cpp index f639fe019..6fdb8293a 100755 --- a/src/gausskernel/optimizer/commands/explain.cpp +++ b/src/gausskernel/optimizer/commands/explain.cpp @@ -269,6 +269,7 @@ static bool show_scan_distributekey(const Plan* plan) } #endif /* ENABLE_MULTIPLE_NODES */ static void show_unique_check_info(PlanState *planstate, ExplainState *es); +static void show_ndpplugin_statistic(ExplainState *es, PlanState* planstate); /* * ExplainQuery - @@ -2496,6 +2497,11 @@ static void ExplainNode( } } + /* explain ndpplugin activities */ + if (ndp_pushdown_hook) { + show_ndpplugin_statistic(es, planstate); + } + /* * We have to forcibly clean up the instrumentation state because we * haven't done ExecutorEnd yet. This is pretty grotty ... 
@@ -10919,6 +10925,57 @@ static void show_unique_check_info(PlanState *planstate, ExplainState *es) } } +static void show_ndpplugin_statistic(ExplainState *es, PlanState* planstate) +{ + Plan* plan = planstate->plan; + if (!plan->ndp_pushdown_optimized && + !(plan->lefttree && plan->type == T_Agg && plan->lefttree->ndp_pushdown_optimized && + reinterpret_cast(plan->lefttree->ndp_pushdown_condition)->plan == plan)) { + return; + } + + TableScanDesc desc = reinterpret_cast(planstate)->ss_currentScanDesc; + if (desc && !desc->ndp_pushdown_optimized) { + return; + } + appendStringInfo(es->str, " NDPpushdown"); + + if (!es->analyze) { + return; + } + + Instrumentation* instr; + int pushdownPage = 0; + int normalPage = 0; + int ndpPage = 0; + + int dop = planstate->plan->parallel_enabled ? planstate->plan->dop : 1; + if (planstate->plan->plan_node_id > 0 && u_sess->instr_cxt.global_instr && + u_sess->instr_cxt.global_instr->isFromDataNode(planstate->plan->plan_node_id)) { + for (int i = 0; i < u_sess->instr_cxt.global_instr->getInstruNodeNum(); i++) { + ThreadInstrumentation* threadinstr = + u_sess->instr_cxt.global_instr->getThreadInstrumentation(i, planstate->plan->plan_node_id, 0); + if (threadinstr == NULL) + continue; + for (int j = 0; j < dop; j++) { + instr = u_sess->instr_cxt.global_instr->getInstrSlot(i, planstate->plan->plan_node_id, j); + if (instr != NULL && instr->nloops > 0) { + pushdownPage += instr->ndp_pushdown_page; + normalPage += instr->ndp_sendback_page; + ndpPage += instr->ndp_handled; + } + } + } + } + + instr = planstate->instrument; + pushdownPage += instr->ndp_pushdown_page; + normalPage += instr->ndp_sendback_page; + ndpPage += instr->ndp_handled; + + appendStringInfo(es->str, " (total page: %d, back to normal page: %d, ndp handled: %d)", pushdownPage, normalPage, ndpPage); +} + void ExplainDatumProperty(char const *name, Datum const value, Oid const type, ExplainState* es) { Datum output_datum = 0; diff --git 
a/src/gausskernel/optimizer/plan/planner.cpp b/src/gausskernel/optimizer/plan/planner.cpp index d94dc470b..7c0b6e979 100755 --- a/src/gausskernel/optimizer/plan/planner.cpp +++ b/src/gausskernel/optimizer/plan/planner.cpp @@ -90,6 +90,9 @@ #include "optimizer/gplanmgr.h" #include "instruments/instr_statement.h" +/* Hook for plugins to get control in planner() */ +THR_LOCAL ndp_pushdown_hook_type ndp_pushdown_hook = NULL; + #ifndef MIN #define MIN(A, B) ((B) < (A) ? (B) : (A)) #endif @@ -397,6 +400,10 @@ PlannedStmt* planner(Query* parse, int cursorOptions, ParamListInfo boundParams) GetRemoteQuery(result, NULL); } + if (ndp_pushdown_hook) { + (*ndp_pushdown_hook)(parse, result); + } + return result; } diff --git a/src/gausskernel/runtime/executor/execScan.cpp b/src/gausskernel/runtime/executor/execScan.cpp index a2e36800d..c767070b7 100755 --- a/src/gausskernel/runtime/executor/execScan.cpp +++ b/src/gausskernel/runtime/executor/execScan.cpp @@ -190,6 +190,18 @@ TupleTableSlot* ExecScan(ScanState* node, ExecScanAccessMtd access_mtd, /* funct } } + /* place to filter Ndp page */ + if (node->ss_currentScanDesc && node->ss_currentScanDesc->ndp_pushdown_optimized) { + HeapTuple tuple = (HeapTuple)slot->tts_tuple; + if (tuple && tuple->t_data && (tuple->t_data->t_infomask & NDP_HANDLED_TUPLE)) { + if (proj_info != NULL) { + return proj_info->pi_slot; + } else { + return slot; + } + } + } + /* * place the current tuple into the expr context */ diff --git a/src/gausskernel/runtime/executor/nodeAgg.cpp b/src/gausskernel/runtime/executor/nodeAgg.cpp index ed4dc8aa7..4895cac7b 100644 --- a/src/gausskernel/runtime/executor/nodeAgg.cpp +++ b/src/gausskernel/runtime/executor/nodeAgg.cpp @@ -1897,6 +1897,11 @@ static void agg_fill_hash_table(AggState* aggstate) break; } + if (aggstate->ndp_slot && outerslot->tts_mintuple && (outerslot->tts_mintuple->t_infomask & NDP_HANDLED_TUPLE)) { + ndp_tableam->handle_hashaggslot(aggstate, &outerslot->tts_minhdr); + continue; + } + /* set up 
for advance_aggregates call */ tmpcontext->ecxt_outertuple = outerslot; @@ -4360,4 +4365,4 @@ static void exec_lookups_agg_flattened(AggState *aggstate, Agg *node, EState *es phase->evaltrans = ExecBuildAggTrans(aggstate, phase, dosort, dohash); } -} \ No newline at end of file +} diff --git a/src/gausskernel/runtime/executor/nodeSeqscan.cpp b/src/gausskernel/runtime/executor/nodeSeqscan.cpp index 276506a85..d91ad8c19 100644 --- a/src/gausskernel/runtime/executor/nodeSeqscan.cpp +++ b/src/gausskernel/runtime/executor/nodeSeqscan.cpp @@ -190,7 +190,7 @@ static TupleTableSlot* SeqNext(SeqScanState* node); static void ExecInitNextPartitionForSeqScan(SeqScanState* node); -template +template FORCE_INLINE void seq_scan_getnext_template(TableScanDesc scan, TupleTableSlot* slot, ScanDirection direction, bool* has_cur_xact_write) @@ -199,6 +199,8 @@ void seq_scan_getnext_template(TableScanDesc scan, TupleTableSlot* slot, ScanDi if(hashBucket) { /* fall back to orign slow function. */ tuple = scan_handler_tbl_getnext(scan, direction, NULL, has_cur_xact_write); + } else if (pushdown) { + tuple = ndp_tableam->scan_getnexttuple(scan, direction, slot); } else if(type == TAM_HEAP) { tuple = (Tuple)heap_getnext(scan, direction, has_cur_xact_write); } else { @@ -841,16 +843,30 @@ static inline void InitSeqNextMtd(SeqScan* node, SeqScanState* scanstate) { if (!node->tablesample) { scanstate->ScanNextMtd = SeqNext; - if(RELATION_OWN_BUCKET(scanstate->ss_currentRelation)) { - if(scanstate->ss_currentRelation->rd_tam_ops == TableAmHeap) - scanstate->fillNextSlotFunc = seq_scan_getnext_template; - else - scanstate->fillNextSlotFunc = seq_scan_getnext_template; + if (scanstate->ss_currentScanDesc != NULL && scanstate->ss_currentScanDesc->ndp_pushdown_optimized) { + if (RELATION_OWN_BUCKET(scanstate->ss_currentRelation)) { + if (scanstate->ss_currentRelation->rd_tam_ops == TableAmHeap) + scanstate->fillNextSlotFunc = seq_scan_getnext_template; + else + scanstate->fillNextSlotFunc = 
seq_scan_getnext_template; + } else { + if (scanstate->ss_currentRelation->rd_tam_ops == TableAmHeap) + scanstate->fillNextSlotFunc = seq_scan_getnext_template; + else + scanstate->fillNextSlotFunc = seq_scan_getnext_template; + } } else { - if(scanstate->ss_currentRelation->rd_tam_ops == TableAmHeap) - scanstate->fillNextSlotFunc = seq_scan_getnext; - else - scanstate->fillNextSlotFunc = seq_scan_getnext_template; + if (RELATION_OWN_BUCKET(scanstate->ss_currentRelation)) { + if (scanstate->ss_currentRelation->rd_tam_ops == TableAmHeap) + scanstate->fillNextSlotFunc = seq_scan_getnext_template; + else + scanstate->fillNextSlotFunc = seq_scan_getnext_template; + } else { + if (scanstate->ss_currentRelation->rd_tam_ops == TableAmHeap) + scanstate->fillNextSlotFunc = seq_scan_getnext; + else + scanstate->fillNextSlotFunc = seq_scan_getnext_template; + } } } else { if (RELATION_OWN_BUCKET(scanstate->ss_currentRelation)) { diff --git a/src/gausskernel/storage/access/hbstore/hbucket_am.cpp b/src/gausskernel/storage/access/hbstore/hbucket_am.cpp index cf60aa991..c0eb28cb5 100644 --- a/src/gausskernel/storage/access/hbstore/hbucket_am.cpp +++ b/src/gausskernel/storage/access/hbstore/hbucket_am.cpp @@ -809,6 +809,7 @@ RedisMergeItem *search_redis_merge_item(const RedisMergeItemOrderArray *merge_it * Reconstruct these functions into the hook API in the future. * ------------------------------------------------------------------------ */ +TableAmNdpRoutine_hook_type ndp_tableam = NULL; /* Create HBktTblScanDesc with scan state. 
For hash-bucket table, it scans the * specified buckets in ScanState */ @@ -819,6 +820,10 @@ TableScanDesc scan_handler_tbl_beginscan(Relation relation, Snapshot snapshot, return (TableScanDesc)hbkt_tbl_beginscan(relation, snapshot, nkeys, key, sstate, isRangeScanInRedis); } RangeScanInRedis rangeScanInRedis = reset_scan_qual(relation, sstate, isRangeScanInRedis); + + if (sstate != NULL && sstate->ps.plan->ndp_pushdown_optimized) { + return ndp_tableam->scan_begin(relation, snapshot, nkeys, key, sstate, rangeScanInRedis); + } return tableam_scan_begin(relation, snapshot, nkeys, key, rangeScanInRedis); } @@ -863,6 +868,8 @@ void scan_handler_tbl_init_parallel_seqscan(TableScanDesc scan, int32 dop, ScanD { if (unlikely(RELATION_OWN_BUCKET(scan->rs_rd))) { tableam_scan_init_parallel_seqscan(((HBktTblScanDesc)scan)->currBktScan, dop, dir); + } else if (scan->ndp_pushdown_optimized) { + ndp_tableam->scan_init_parallel_seqscan(scan, dop, dir); } else { tableam_scan_init_parallel_seqscan(scan, dop, dir); } @@ -921,6 +928,8 @@ void scan_handler_tbl_endscan(TableScanDesc scan) } pfree_ext(hp_scan->hBktList); pfree(hp_scan); + } else if (scan->ndp_pushdown_optimized) { + ndp_tableam->scan_end(scan); } else { tableam_scan_end(scan); } @@ -930,6 +939,8 @@ void scan_handler_tbl_rescan(TableScanDesc scan, struct ScanKeyData* key, Relati { if (unlikely(RELATION_OWN_BUCKET(scan->rs_rd))) { hbkt_tbl_rescan(scan, key, is_bitmap_rescan); + } else if (scan->ndp_pushdown_optimized) { + ndp_tableam->scan_rescan(scan, key); } else { tableam_scan_rescan(scan, key); } diff --git a/src/gausskernel/storage/access/heap/heapam.cpp b/src/gausskernel/storage/access/heap/heapam.cpp index e520de787..2372c0740 100755 --- a/src/gausskernel/storage/access/heap/heapam.cpp +++ b/src/gausskernel/storage/access/heap/heapam.cpp @@ -271,6 +271,9 @@ static void initscan(HeapScanDesc scan, ScanKey key, bool is_rescan) scan->rs_base.rs_ss_accessor = NULL; scan->dop = 1; + /* ndp args init */ + 
scan->rs_base.ndp_pushdown_optimized = false; + /* we don't have a marked position... */ ItemPointerSetInvalid(&(scan->rs_mctid)); diff --git a/src/gausskernel/storage/dss/dss_adaptor.cpp b/src/gausskernel/storage/dss/dss_adaptor.cpp index 1ee5d875e..6ca556a17 100644 --- a/src/gausskernel/storage/dss/dss_adaptor.cpp +++ b/src/gausskernel/storage/dss/dss_adaptor.cpp @@ -130,6 +130,8 @@ int dss_device_init(const char *conn_path, bool enable_dss) SS_RETURN_IFERR( dss_load_symbol(device_op.handle, "dss_register_log_callback", (void **)&device_op.dss_register_log_callback)); SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_get_lib_version", (void **)&device_op.dss_get_version)); + SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_get_addr", (void **)&device_op.dss_get_addr)); + SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_compare_size_equal", (void **)&device_op.dss_compare_size)); SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_aio_prep_pwrite", (void **)&device_op.dss_aio_pwrite)); SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_aio_prep_pread", (void **)&device_op.dss_aio_pread)); SS_RETURN_IFERR(dss_load_symbol(device_op.handle, "dss_init_logger", (void **)&device_op.dss_init_logger)); @@ -169,4 +171,4 @@ void dss_call_refresh_logger(char * log_field, unsigned long long *value) if (device_op.inited) { device_op.dss_refresh_logger(log_field, value); } -} \ No newline at end of file +} diff --git a/src/gausskernel/storage/dss/fio_dss.cpp b/src/gausskernel/storage/dss/fio_dss.cpp index b0000366b..ae7673ccd 100644 --- a/src/gausskernel/storage/dss/fio_dss.cpp +++ b/src/gausskernel/storage/dss/fio_dss.cpp @@ -705,6 +705,25 @@ int dss_remove_dev(const char *name) } } +int dss_get_addr(int handle, long long offset, char *poolname, char *imagename, char *objAddr, + unsigned int *objId, unsigned long int *objOffset) +{ + if (g_dss_device_op.dss_get_addr(handle, offset, poolname, imagename, objAddr, objId, objOffset) != DSS_SUCCESS) { + 
dss_set_errno(NULL); + return -1; + } + return GS_SUCCESS; +} + +int dss_compare_size(const char *vg_name, long long *au_size) +{ + if (g_dss_device_op.dss_compare_size(vg_name, au_size) != DSS_SUCCESS) { + dss_set_errno(NULL); + return -1; + } + return GS_SUCCESS; +} + int dss_aio_prep_pwrite(void *iocb, int fd, void *buf, size_t count, long long offset) { return g_dss_device_op.dss_aio_pwrite(iocb, fd, buf, count, offset); diff --git a/src/gausskernel/storage/smgr/md.cpp b/src/gausskernel/storage/smgr/md.cpp index 885b60265..a7e378ee5 100644 --- a/src/gausskernel/storage/smgr/md.cpp +++ b/src/gausskernel/storage/smgr/md.cpp @@ -115,7 +115,6 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior static MdfdVec *_fdvec_alloc(void); static char *_mdfd_segpath(const SMgrRelation reln, ForkNumber forknum, BlockNumber segno); static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno, BlockNumber segno, int oflags); -static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno, BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior); static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, const MdfdVec *seg); static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, const MdfdVec *seg); static void register_unlink_segment(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno); @@ -1827,7 +1826,7 @@ static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber * segment, according to "behavior". Note: skipFsync is only used in the * EXTENSION_CREATE case. 
*/ -static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, +MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior) { MdfdVec *v = mdopen(reln, forknum, behavior); diff --git a/src/include/access/hbucket_am.h b/src/include/access/hbucket_am.h index 6910b25b8..4e3c734a1 100644 --- a/src/include/access/hbucket_am.h +++ b/src/include/access/hbucket_am.h @@ -31,6 +31,7 @@ #include "utils/relcache.h" #include "optimizer/bucketinfo.h" #include "access/tableam.h" +#include "nodes/execnodes.h" /* * redis need merge item @@ -86,4 +87,15 @@ extern void scan_handler_tbl_rescan(TableScanDesc scan, struct ScanKeyData* key, bool is_bitmap_rescan = false); #define NOT_EXIST_MERGE_LIST "not_exist_merge_list" +typedef struct TableAmNdpRoutine_hook { + TableScanDesc (*scan_begin)(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, + ScanState* sstate, RangeScanInRedis rangeScanInRedis); + void (*scan_init_parallel_seqscan)(TableScanDesc sscan, int32 dop, ScanDirection dir); + void (*scan_rescan)(TableScanDesc sscan, ScanKey key); + void (*scan_end)(TableScanDesc sscan); + Tuple (*scan_getnexttuple)(TableScanDesc sscan, ScanDirection direction, TupleTableSlot* slot); + void (*handle_hashaggslot)(AggState* aggstate, HeapTupleData* tts_minhdr); +} TableAmNdpRoutine_hook, *TableAmNdpRoutine_hook_type; +extern PGDLLIMPORT TableAmNdpRoutine_hook_type ndp_tableam; + #endif /* HASHPART_AM_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index db4bd717b..296b63346 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -101,6 +101,10 @@ typedef struct TableScanDescData /* variables for batch mode scan */ int rs_ctupRows; int rs_maxScanRows; + + /* variables for ndp pushdown scan */ + bool ndp_pushdown_optimized; + void *ndp_ctx; } TableScanDescData; /* struct definition appears in relscan.h */ diff --git 
a/src/include/access/htup.h b/src/include/access/htup.h index 5c45c4abd..50799eef3 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -207,7 +207,7 @@ typedef HeapTupleHeaderData* HeapTupleHeader; #define HEAP_HAS_8BYTE_UID (0x4000) /* tuple has 8 bytes uid */ #define HEAP_UID_MASK (0x4000) -#define HEAP_RESERVED_BIT (0x8000) /* tuple uid related bits */ +#define NDP_HANDLED_TUPLE (0x8000) /* tuple is from ndp backend */ #define HEAP_XACT_MASK (0x3FE0) /* visibility-related bits */ diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 4f278b187..16cc2a6d4 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -325,6 +325,10 @@ typedef struct Instrumentation { int ec_libodbc_type; /* ec execute libodbc_type*/ int64 ec_fetch_count; /* ec fetch count*/ RecursiveInfo recursiveInfo; + + int ndp_pushdown_page; + int ndp_sendback_page; + int ndp_handled; } Instrumentation; /* instrumentation data */ diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h index 2837be3f4..6dec954e0 100644 --- a/src/include/executor/tuptable.h +++ b/src/include/executor/tuptable.h @@ -164,6 +164,7 @@ typedef struct TupleTableSlot { MemoryContext tts_per_tuple_mcxt; #endif TableAmType tts_tupslotTableAm; /* slots's tuple table type */ + bool tts_ndpAggHandled; /* slot is from ndp backend, handled by aggregate */ } TupleTableSlot; #define TTS_HAS_PHYSICAL_TUPLE(slot) ((slot)->tts_tuple != NULL && (slot)->tts_tuple != &((slot)->tts_minhdr)) diff --git a/src/include/knl/knl_session.h b/src/include/knl/knl_session.h index e5f01ff38..efc24f9b1 100644 --- a/src/include/knl/knl_session.h +++ b/src/include/knl/knl_session.h @@ -2758,6 +2758,29 @@ typedef enum { EXECUTE_MESSAGE_QUERY, EXECUTE_BATCH_MESSAGE_QUERY } PBEMessage; + +/* statistics counters recorded by the ndp plugin */ +typedef struct NdpStats { + unsigned long queryCounter; + unsigned long sendFailed; + unsigned long failedIO; + unsigned 
long pushDownPage; + unsigned long sendBackPage; + unsigned long ndpPageAgg; + unsigned long ndpPageScan; +} NdpStats; + +typedef struct knl_u_ndp_context { + NdpStats *stats; + MemoryContext mem_cxt; + void *cxt; + bool enable_ndp; + int pushdown_min_blocks; + int ndp_port; + char *ca_path; + char *crl_path; +} knl_u_ndp_context; + typedef struct knl_session_context { volatile knl_session_status status; /* used for threadworker, elem in m_readySessionList */ @@ -2887,6 +2910,8 @@ typedef struct knl_session_context { */ struct knl_u_clientConnTime_context clientConnTime_cxt; + knl_u_ndp_context ndp_cxt; + knl_u_hook_context hook_cxt; /* The datetime cache in current transaction. */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 5f8d42c65..682754fac 100755 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2499,6 +2499,8 @@ typedef struct AggState { int num_hashes; AggStatePerGroup hash_pergroup; /* grouping set indexed array of* per-group pointers */ AggStatePerGroup all_pergroups; /* array of first ->pergroups, than * ->hash_pergroup */ + + TupleTableSlot* ndp_slot; /* slot for load ndp data */ } AggState; /* ---------------- diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 8517edf4b..242004bde 100755 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -828,7 +828,10 @@ typedef enum NodeTag { T_UserSetElem, T_UserVar, T_CharsetCollateOptions, - T_FunctionSources + T_FunctionSources, + + /* ndpplugin tag */ + T_NdpScanCondition } NodeTag; /* if you add to NodeTag also need to add nodeTagToString */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 33c10b3c3..705ad3dcb 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -358,8 +358,23 @@ typedef struct Plan { List* flatList = NULL; /* flattened targetlist representing columns in query */ RightRefState* rightRefState; + bool ndp_pushdown_optimized; + /* 
normally used to save the ndp condition + * caution: ndp_pushdown_condition under an Agg node is used to save the ndp-handled aggslot + * -> SeqScan saves the ndp condition + * -> Agg saves the ndp aggslot + * -> SeqScan->ndp_pushdown_condition saves the ndp condition + * */ + Node* ndp_pushdown_condition; } Plan; +typedef struct NdpScanCondition { // for each scan node + NodeTag type; + uint16 tableId; + void* ctx; + Plan* plan; // plan tree to push down +} NdpScanCondition; + /* ---------------- * these are defined to avoid confusion problems with "left" * and "right" and "inner" and "outer". The convention is that diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 67a5e821e..e5c7a8f4d 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -54,6 +54,8 @@ extern PlannedStmt* planner(Query* parse, int cursorOptions, ParamListInfo bound extern PlannedStmt* standard_planner(Query* parse, int cursorOptions, ParamListInfo boundParams); typedef void (*planner_hook_type) (Query* parse, int cursorOptions, ParamListInfo boundParams); +typedef void (*ndp_pushdown_hook_type) (Query* querytree, PlannedStmt *stmt); +extern THR_LOCAL PGDLLIMPORT ndp_pushdown_hook_type ndp_pushdown_hook; extern Plan* subquery_planner(PlannerGlobal* glob, Query* parse, PlannerInfo* parent_root, bool hasRecursion, double tuple_fraction, PlannerInfo** subroot, int options = SUBQUERY_NORMAL, ItstDisKey* diskeys = NULL, diff --git a/src/include/storage/dss/dss_adaptor.h b/src/include/storage/dss/dss_adaptor.h index d386382cf..d8bac4902 100644 --- a/src/include/storage/dss/dss_adaptor.h +++ b/src/include/storage/dss/dss_adaptor.h @@ -67,6 +67,9 @@ typedef void (*dss_error_info)(int *errorcode, const char **errormsg); typedef void (*dss_svr_path)(const char *conn_path); typedef void (*dss_log_callback)(dss_log_output cb_log_output); typedef int (*dss_version)(void); +typedef int (*dss_get_storage_addr)(int handle, long long offset, char *poolname, char 
*imagename, char *objAddr, + unsigned int *objId, unsigned long int *objOffset); +typedef int (*dss_compare_size_equal)(const char *vg_name, long long *au_size); typedef int (*dss_aio_prep_pwrite_device)(void *iocb, int handle, void *buf, size_t count, long long offset); typedef int (*dss_aio_prep_pread_device)(void *iocb, int handle, void *buf, size_t count, long long offset); typedef int (*dss_init_logger_t)(char *log_home, unsigned int log_level, unsigned int log_backup_file_count, unsigned long long log_max_file_size); @@ -108,6 +111,8 @@ typedef struct st_dss_device_op_t { dss_svr_path dss_set_svr_path; dss_log_callback dss_register_log_callback; dss_version dss_get_version; + dss_get_storage_addr dss_get_addr; + dss_compare_size_equal dss_compare_size; dss_aio_prep_pwrite_device dss_aio_pwrite; dss_aio_prep_pread_device dss_aio_pread; dss_init_logger_t dss_init_logger; @@ -119,4 +124,4 @@ void dss_register_log_callback(dss_log_output cb_log_output); int dss_call_init_logger(char *log_home, unsigned int log_level, unsigned int log_backup_file_count, unsigned long long log_max_file_size); void dss_call_refresh_logger(char *log_field, unsigned long long *value); -#endif // DSS_ADAPTOR_H \ No newline at end of file +#endif // DSS_ADAPTOR_H diff --git a/src/include/storage/dss/dss_api_def.h b/src/include/storage/dss/dss_api_def.h index b615db70e..7e9b62920 100644 --- a/src/include/storage/dss/dss_api_def.h +++ b/src/include/storage/dss/dss_api_def.h @@ -84,7 +84,7 @@ typedef void (*dss_log_output)(dss_log_id_t log_type, dss_log_level_t log_level, #define DSS_LOCAL_MINOR_VER_WEIGHT 1000 #define DSS_LOCAL_MAJOR_VERSION 0 #define DSS_LOCAL_MINOR_VERSION 0 -#define DSS_LOCAL_VERSION 3 +#define DSS_LOCAL_VERSION 5 #define DSS_SUCCESS 0 #define DSS_ERROR (-1) diff --git a/src/include/storage/dss/fio_dss.h b/src/include/storage/dss/fio_dss.h index ba46ae09b..585047dfd 100644 --- a/src/include/storage/dss/fio_dss.h +++ b/src/include/storage/dss/fio_dss.h @@ -78,8 +78,10 
@@ int dss_fstat_file(int handle, struct stat *buf); int dss_chmod_file(const char* path, mode_t mode); int dss_set_server_status_wrapper(); int dss_remove_dev(const char *name); - +int dss_get_addr(int handle, long long offset, char *poolname, char *imagename, char *objAddr, + unsigned int *objId, unsigned long int *objOffset); +int dss_compare_size(const char *vg_name, long long *au_size); int dss_aio_prep_pwrite(void *iocb, int fd, void *buf, size_t count, long long offset); int dss_aio_prep_pread(void *iocb, int fd, void *buf, size_t count, long long offset); -#endif // FIO_DSS_H \ No newline at end of file +#endif // FIO_DSS_H diff --git a/src/include/storage/smgr/smgr.h b/src/include/storage/smgr/smgr.h index c55456e00..e7ace5e4e 100644 --- a/src/include/storage/smgr/smgr.h +++ b/src/include/storage/smgr/smgr.h @@ -197,6 +197,7 @@ extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblock extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum); extern char* mdsegpath(const RelFileNode& rnode, ForkNumber forknum, BlockNumber blkno); extern void md_register_forget_request(RelFileNode rnode, ForkNumber forknum, BlockNumber segno); +extern MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior); /* md sync callbacks */ extern void mdForgetDatabaseFsyncRequests(Oid dbid); diff --git a/src/test/regress/pg_regress.cpp b/src/test/regress/pg_regress.cpp index a89937d0b..caffcfeb0 100644 --- a/src/test/regress/pg_regress.cpp +++ b/src/test/regress/pg_regress.cpp @@ -5412,7 +5412,7 @@ static void CheckCleanCodeWarningInfo(const int baseNum, const int currentNum, return; } -#define BASE_GLOBAL_VARIABLE_NUM 220 +#define BASE_GLOBAL_VARIABLE_NUM 221 #define CMAKE_CMD_BUF_LEN 1000