/* -------------------------------------------------------------------------
 *
 * execRemote.c
 *
 *    Functions to execute commands on remote Datanodes
 *
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 2010-2012 Postgres-XC Development Group
 * Portions Copyright (c) 2021, openGauss Contributors
 *
 *
 * IDENTIFICATION
 *    src/backend/pgxc/pool/execRemote.c
 *
 * -------------------------------------------------------------------------
 */
|
|
|
|
#include "postgres.h"
|
|
#include "knl/knl_variable.h"
|
|
|
|
#include <arpa/inet.h>
|
|
#include "access/twophase.h"
|
|
#include "access/gtm.h"
|
|
#include "access/sysattr.h"
|
|
#include "access/tableam.h"
|
|
#include "access/transam.h"
|
|
#include "access/xact.h"
|
|
#include "access/relscan.h"
|
|
#include "access/multixact.h"
|
|
#include "catalog/pg_namespace.h"
|
|
#include "catalog/pg_proc.h"
|
|
#include "catalog/pg_type.h"
|
|
#include "catalog/pg_statistic_ext.h"
|
|
#include "catalog/pgxc_node.h"
|
|
#include "commands/tablespace.h"
|
|
#include "commands/prepare.h"
|
|
#include "commands/tablecmds.h"
|
|
#include "postmaster/autovacuum.h"
|
|
#ifdef PGXC
|
|
#include "commands/trigger.h"
|
|
#endif
|
|
#include "executor/executor.h"
|
|
#include "executor/lightProxy.h"
|
|
#include "foreign/dummyserver.h"
|
|
#include "gtm/gtm_c.h"
|
|
#include "libpq/libpq.h"
|
|
#include "libpq/pqformat.h"
|
|
#include "miscadmin.h"
|
|
#include "pgxc/execRemote.h"
|
|
#include "pgxc/pgFdwRemote.h"
|
|
#include "pgxc/pgxcXact.h"
|
|
#include "nodes/nodes.h"
|
|
#include "nodes/nodeFuncs.h"
|
|
#include "optimizer/var.h"
|
|
#include "pgxc/copyops.h"
|
|
#include "pgxc/nodemgr.h"
|
|
#include "pgxc/pgxcnode.h"
|
|
#include "pgxc/poolmgr.h"
|
|
#include "storage/ipc.h"
|
|
#include "storage/procarray.h"
|
|
#include "storage/lmgr.h"
|
|
#include "tcop/tcopprot.h"
|
|
#include "utils/datum.h"
|
|
#include "utils/extended_statistics.h"
|
|
#include "utils/lsyscache.h"
|
|
#include "utils/memutils.h"
|
|
#include "utils/tuplesort.h"
|
|
#include "utils/snapmgr.h"
|
|
#include "utils/builtins.h"
|
|
#include "pgxc/locator.h"
|
|
#include "pgxc/pgxc.h"
|
|
#include "parser/parse_type.h"
|
|
#include "parser/parsetree.h"
|
|
#include "parser/parse_relation.h"
|
|
#include "pgstat.h"
|
|
#include "optimizer/streamplan.h"
|
|
#include "tcop/utility.h"
|
|
#include "utils/syscache.h"
|
|
#include "utils/rel.h"
|
|
#include "utils/rel_gs.h"
|
|
#include "access/heapam.h"
|
|
#include "utils/fmgroids.h"
|
|
#include "catalog/catalog.h"
|
|
#include "catalog/pg_statistic.h"
|
|
#include "catalog/namespace.h"
|
|
#include "catalog/pg_class.h"
|
|
#include "catalog/indexing.h"
|
|
#include "catalog/pg_inherits.h"
|
|
#include "catalog/pg_inherits_fn.h"
|
|
#include "funcapi.h"
|
|
#include "commands/vacuum.h"
|
|
#include "utils/distribute_test.h"
|
|
#include "utils/batchsort.h"
|
|
#include "vecexecutor/vectorbatch.h"
|
|
#include "access/hash.h"
|
|
#include "mb/pg_wchar.h"
|
|
#include "workload/cpwlm.h"
|
|
#include "instruments/instr_unique_sql.h"
|
|
#include "utils/elog.h"
|
|
#include "utils/globalplancore.h"
|
|
#include "executor/node/nodeModifyTable.h"
|
|
|
|
#ifndef MIN
|
|
#define MIN(A, B) (((B) < (A)) ? (B) : (A))
|
|
#endif
|
|
|
|
#ifdef ENABLE_UT
|
|
#define static
|
|
#endif
|
|
|
|
#pragma GCC diagnostic ignored "-Wunused-function"
|
|
|
|
extern bool IsAnalyzeTempRel(VacuumStmt* stmt);
|
|
|
|
#define ROLLBACK_RESP_LEN 9
|
|
|
|
#define DFS_PRIVATE_ITEM "DfsPrivateItem"
|
|
|
|
/*
 * Buffer size does not affect performance significantly, just do not allow
 * the connection buffer to grow infinitely
 */
|
|
#define COPY_BUFFER_SIZE 8192
|
|
#define PRIMARY_NODE_WRITEAHEAD (1024 * 1024)
|
|
|
|
#define PROTO_TCP 1
|
|
|
|
/* refer to ESTIMATE_BLOCK_FACTOR in src/backend/commands/analyze.cpp */
|
|
#define ESTIMATE_BLOCK_FACTOR 0.65
|
|
|
|
static int compute_node_begin(int conn_count, PGXCNodeHandle** connections, GlobalTransactionId gxid);
|
|
|
|
static void close_node_cursors(PGXCNodeHandle** connections, int conn_count, const char* cursor);
|
|
static void ExecRemoteFunctionInParallel(
|
|
ParallelFunctionState* state, RemoteQueryExecType exec_remote_type, bool non_check_count = false);
|
|
|
|
static int pgxc_get_transaction_nodes(PGXCNodeHandle* connections[], int size, bool writeOnly);
|
|
static int GetNodeIdFromNodesDef(NodeDefinition* node_def, Oid nodeoid);
|
|
static int pgxc_get_connections(PGXCNodeHandle* connections[], int size, List* connlist);
|
|
static void pgxc_node_send_queryid_with_sync(PGXCNodeHandle** connections, int conn_count, uint64 queryId);
|
|
static TupleTableSlot* RemoteQueryNext(ScanState* node);
|
|
static bool RemoteQueryRecheck(RemoteQueryState* node, TupleTableSlot* slot);
|
|
|
|
static char* pgxc_node_get_nodelist(bool localNode);
|
|
|
|
static void ExecClearTempObjectIncluded(void);
|
|
static void init_RemoteXactState(bool preparedLocalNode);
|
|
static void clear_RemoteXactState(void);
|
|
static bool IsReturningDMLOnReplicatedTable(RemoteQuery* rq);
|
|
static void SetDataRowForIntParams(
|
|
JunkFilter* junkfilter, TupleTableSlot* sourceSlot, TupleTableSlot* newSlot, RemoteQueryState* rq_state);
|
|
static void pgxc_append_param_val(StringInfo buf, Datum val, Oid valtype);
|
|
static void pgxc_append_param_junkval(
|
|
TupleTableSlot* slot, AttrNumber attno, Oid valtype, StringInfo buf, bool allow_dummy_junkfilter);
|
|
static void pgxc_rq_fire_bstriggers(RemoteQueryState* node);
|
|
static void pgxc_rq_fire_astriggers(RemoteQueryState* node);
|
|
static void pgxc_check_and_update_nodedef(PlannedStmt* planstmt, PGXCNodeHandle** connections, int regular_conn_count);
|
|
|
|
bool FetchTupleSimple(RemoteQueryState* combiner, TupleTableSlot* slot);
|
|
|
|
typedef enum StatisticKind {
|
|
StatisticNone,
|
|
StatisticPageAndTuple,
|
|
StatisticHistogram,
|
|
StatisticMultiHistogram,
|
|
StatisticPartitionPageAndTuple
|
|
} StatisticKind;
|
|
|
|
// parentRel is only valid for dfs delta table.
|
|
static void FetchStatisticsInternal(const char* schemaname, const char* relname, List* va_cols, StatisticKind kind,
|
|
RangeVar* parentRel, VacuumStmt* stmt = NULL, bool isReplication = false);
|
|
static void FetchGlobalRelationStatistics(VacuumStmt* stmt, Oid relid, RangeVar* parentRel, bool isReplication = false);
|
|
static void FetchGlobalStatisticsInternal(const char* schemaname, const char* relname, List* va_cols,
|
|
StatisticKind kind, RangeVar* parentRel, VacuumStmt* stmt = NULL);
|
|
static void ReceiveHistogram(
|
|
Oid relid, TupleTableSlot* slot, bool isReplication = false, PGFDWTableAnalyze* info = NULL);
|
|
static void ReceiveHistogramMultiColStats(
|
|
Oid relid, TupleTableSlot* slot, bool isReplication = false, PGFDWTableAnalyze* info = NULL);
|
|
static void ReceivePageAndTuple(Oid relid, TupleTableSlot* slot, VacuumStmt* stmt = NULL);
|
|
static void ReceivePartitionPageAndTuple(Oid relid, TupleTableSlot* slot);
|
|
|
|
static bool clean_splitmap(Plan* plan);
|
|
static List* reassign_splitmap(Plan* plan, int dn_num);
|
|
static void resetRelidOfRTE(PlannedStmt* ps);
|
|
static PGXCNodeAllHandles* connect_compute_pool_for_OBS();
|
|
static PGXCNodeAllHandles* connect_compute_pool_for_HDFS();
|
|
static PGXCNodeAllHandles* make_cp_conn(ComputePoolConfig** config, int num, int srvtype, const char* dbname = NULL);
|
|
static PGXCNodeAllHandles* try_make_cp_conn(
|
|
const char* cpip, ComputePoolConfig* config, int srvtype, const char* dbname = NULL);
|
|
|
|
extern void cancel_query_without_read();
|
|
extern void destroy_handles();
|
|
extern void pgxc_node_init(PGXCNodeHandle* handle, int sock);
|
|
extern void CheckGetServerIpAndPort(const char* Address, List** AddrList, bool IsCheck, int real_addr_max);
|
|
extern int32 get_relation_data_width(Oid relid, Oid partitionid, int32* attr_widths, bool vectorized = false);
|
|
|
|
void CheckRemoteUtiltyConn(PGXCNodeHandle* conn, Snapshot snapshot);
|
|
extern void ReorganizeSqlStatement(
|
|
ExplainState* es, RemoteQuery* rq, const char* queryString, const char* explainsql, Oid nodeoid);
|
|
|
|
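/*
 * Format a socket-level error into t_thrd.pgxc_cxt.socket_buffer. If msg is NULL the
 * current SOCK_ERRNO text is used; on a Datanode the local and remote node names are
 * appended to help locate the failing connection.
 */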
void setSocketError(const char* msg, const char* node_name)
|
|
{
|
|
StringInfoData str;
|
|
errno_t rc = EOK;
|
|
|
|
if (msg != NULL) {
|
|
rc = strncpy_s(t_thrd.pgxc_cxt.socket_buffer,
|
|
sizeof(t_thrd.pgxc_cxt.socket_buffer),
|
|
msg,
|
|
sizeof(t_thrd.pgxc_cxt.socket_buffer) - 1);
|
|
securec_check(rc, "", "");
|
|
} else {
|
|
(void)SOCK_STRERROR(SOCK_ERRNO, t_thrd.pgxc_cxt.socket_buffer, sizeof(t_thrd.pgxc_cxt.socket_buffer));
|
|
}
|
|
|
|
if (IS_PGXC_DATANODE && node_name) {
|
|
initStringInfo(&str);
|
|
appendStringInfo(&str,
|
|
"%s. Local: %s Remote: %s.",
|
|
t_thrd.pgxc_cxt.socket_buffer,
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
node_name);
|
|
rc = strncpy_s(t_thrd.pgxc_cxt.socket_buffer, sizeof(t_thrd.pgxc_cxt.socket_buffer), str.data, str.len);
|
|
securec_check(rc, "", "");
|
|
resetStringInfo(&str);
|
|
}
|
|
}
|
|
|
|
#define CONN_RESET_BY_PEER "Connection reset by peer"
|
|
#define CONN_TIMED_OUT "Connection timed out"
|
|
#define CONN_REMOTE_CLOSE "Remote close socket unexpectedly"
|
|
#define CONN_SCTP_ERR_1 "1002 Memeory allocate error"
|
|
#define CONN_SCTP_ERR_2 "1041 No data in buffer"
|
|
#define CONN_SCTP_ERR_3 "1046 Close because release memory"
|
|
#define CONN_SCTP_ERR_4 "1047 TCP disconnect"
|
|
#define CONN_SCTP_ERR_5 "1048 SCTP disconnect"
|
|
#define CONN_SCTP_ERR_6 "1049 Stream closed by remote"
|
|
#define CONN_SCTP_ERR_7 "1059 Wait poll unknow error"
|
|
|
|
int getStreamSocketError(const char* str)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
}
|
|
|
|
char* getSocketError(int* err_code)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * @Description: Check whether the other error message(s) need to be examined when an
 * error is received that may have been caused by a socket closed normally in a
 * Stream thread.
 *
 * @param[IN] error_code: error code
 * @return: bool, true if the other error message(s) need to be checked
 */
|
|
static bool IsErrorNeedCheck(int error_code)
|
|
{
|
|
if (error_code == ERRCODE_SCTP_REMOTE_CLOSE || error_code == ERRCODE_STREAM_REMOTE_CLOSE_SOCKET ||
|
|
error_code == ERRCODE_STREAM_CONNECTION_RESET_BY_PEER || error_code == ERRCODE_RU_STOP_QUERY ||
|
|
error_code == ERRCODE_QUERY_INTERNAL_CANCEL)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Create a structure to store parameters needed to combine responses from
|
|
* multiple connections as well as state information
|
|
*/
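/*
 * The returned state is typically released with CloseCombiner() or
 * ValidateAndCloseCombiner() once all responses have been consumed.
 */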
|
|
RemoteQueryState* CreateResponseCombiner(int node_count, CombineType combine_type)
|
|
{
|
|
RemoteQueryState* combiner = NULL;
|
|
|
|
/* RemoteQueryState is used as the response combiner node */
|
|
combiner = makeNode(RemoteQueryState);
|
|
combiner->node_count = node_count;
|
|
combiner->connections = NULL;
|
|
combiner->conn_count = 0;
|
|
combiner->combine_type = combine_type;
|
|
combiner->command_complete_count = 0;
|
|
combiner->request_type = REQUEST_TYPE_NOT_DEFINED;
|
|
combiner->tuple_desc = NULL;
|
|
combiner->description_count = 0;
|
|
combiner->copy_in_count = 0;
|
|
combiner->copy_out_count = 0;
|
|
combiner->errorCode = 0;
|
|
combiner->errorMessage = NULL;
|
|
combiner->errorDetail = NULL;
|
|
combiner->errorContext = NULL;
|
|
combiner->hint = NULL;
|
|
combiner->query = NULL;
|
|
combiner->cursorpos = 0;
|
|
combiner->query_Done = false;
|
|
combiner->is_fatal_error = false;
|
|
combiner->currentRow.msg = NULL;
|
|
combiner->currentRow.msglen = 0;
|
|
combiner->currentRow.msgnode = 0;
|
|
combiner->row_store = RowStoreAlloc(CurrentMemoryContext, ROW_STORE_MAX_MEM,
|
|
t_thrd.utils_cxt.CurrentResourceOwner);
|
|
combiner->maxCSN = InvalidCommitSeqNo;
|
|
combiner->hadrMainStandby = false;
|
|
combiner->tapenodes = NULL;
|
|
combiner->remoteCopyType = REMOTE_COPY_NONE;
|
|
combiner->copy_file = NULL;
|
|
combiner->rqs_cmd_id = FirstCommandId;
|
|
combiner->rqs_processed = 0;
|
|
combiner->rqs_cur_processed = 0;
|
|
combiner->need_error_check = false;
|
|
combiner->valid_command_complete_count = 0;
|
|
combiner->pbe_run_status = PBE_NONE;
|
|
|
|
return combiner;
|
|
}
|
|
|
|
RemoteQueryState* CreateResponseCombinerForBarrier(int nodeCount, CombineType combineType)
|
|
{
|
|
return CreateResponseCombiner(nodeCount, combineType);
|
|
}
|
|
|
|
/*
 * Parse out the row count from the command status response and convert it to an integer
 */
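/* For example, the command tag "UPDATE 42" yields *rowcount = 42 and returns 2 digits. */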
|
|
static int parse_row_count(const char* message, size_t len, uint64* rowcount)
|
|
{
|
|
int digits = 0;
|
|
size_t pos;
|
|
|
|
*rowcount = 0;
|
|
/* skip \0 string terminator */
|
|
for (pos = 0; pos < len - 1; pos++) {
|
|
if (message[pos] >= '0' && message[pos] <= '9') {
|
|
int num = message[pos] - '0';
|
|
if (*rowcount <= (PG_UINT64_MAX - num) / 10) {
|
|
*rowcount = *rowcount * 10 + num;
|
|
digits++;
|
|
} else {
|
|
ereport(ERROR, (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), errmsg("message is out of range")));
|
|
}
|
|
} else {
|
|
*rowcount = 0;
|
|
digits = 0;
|
|
}
|
|
}
|
|
return digits;
|
|
}
|
|
|
|
/*
|
|
* Convert RowDescription message to a TupleDesc
|
|
*/
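/*
 * As decoded below, the message body carries an int16 attribute count followed, for each
 * attribute, by: name (null-terminated), table OID, column number, type OID, type length,
 * type modifier, an optional type name (present only when the type OID is beyond the
 * bootstrap OID range) and a format code.
 */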
|
|
static TupleDesc create_tuple_desc(char* msg_body)
|
|
{
|
|
TupleDesc result;
|
|
int i, nattr;
|
|
uint16 n16;
|
|
|
|
/* get number of attributes */
|
|
errno_t rc = 0;
|
|
rc = memcpy_s(&n16, sizeof(uint16), msg_body, sizeof(uint16));
|
|
securec_check(rc, "\0", "\0");
|
|
nattr = ntohs(n16);
|
|
msg_body += 2;
|
|
|
|
result = CreateTemplateTupleDesc(nattr, false);
|
|
|
|
/* decode attributes */
|
|
for (i = 1; i <= nattr; i++) {
|
|
AttrNumber attnum;
|
|
char* attname = NULL;
|
|
char* typname = NULL;
|
|
Oid oidtypeid;
|
|
int32 typemode, typmod, oidtypeidint;
|
|
|
|
attnum = (AttrNumber)i;
|
|
|
|
/* attribute name */
|
|
attname = msg_body;
|
|
msg_body += strlen(attname) + 1;
|
|
|
|
/* table OID, ignored */
|
|
msg_body += 4;
|
|
|
|
/* column no, ignored */
|
|
msg_body += 2;
|
|
|
|
/* data type OID, ignored */
|
|
rc = memcpy_s(&oidtypeidint, sizeof(int32), msg_body, sizeof(int32));
|
|
securec_check(rc, "\0", "\0");
|
|
oidtypeid = ntohl(oidtypeidint);
|
|
msg_body += 4;
|
|
|
|
/* type len, ignored */
|
|
msg_body += 2;
|
|
|
|
/* type mod */
|
|
rc = memcpy_s(&typemode, sizeof(int32), msg_body, sizeof(int32));
|
|
securec_check(rc, "\0", "\0");
|
|
typmod = ntohl(typemode);
|
|
msg_body += 4;
|
|
|
|
/* Get the OID type and mode type from typename */
|
|
if (oidtypeid >= FirstBootstrapObjectId) {
|
|
/* type name */
|
|
typname = msg_body;
|
|
msg_body += strlen(typname) + 1;
|
|
|
|
oidtypeid = get_typeoid_with_namespace(typname);
|
|
} else
|
|
typname = "";
|
|
|
|
msg_body += 2;
|
|
|
|
TupleDescInitEntry(result, attnum, attname, oidtypeid, typmod, 0);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
|
|
* Handle CopyOutCommandComplete ('c') message from a Datanode connection
|
|
*/
|
|
static void HandleCopyOutComplete(RemoteQueryState* combiner)
|
|
{
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
combiner->request_type = REQUEST_TYPE_COPY_OUT;
|
|
if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
|
|
/* Inconsistent responses */
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Unexpected response from the Datanodes for 'c' message, current request type %d",
|
|
combiner->request_type)));
|
|
/* Just do nothing, close message is managed by the Coordinator */
|
|
combiner->copy_out_count++;
|
|
}
|
|
|
|
/*
|
|
* Handle CommandComplete ('C') message from a Datanode connection
|
|
*/
|
|
static void HandleCommandComplete(
|
|
RemoteQueryState* combiner, const char* msg_body, size_t len, PGXCNodeHandle* conn, bool isdummy)
|
|
{
|
|
int digits = 0;
|
|
bool non_fqs_dml = false;
|
|
|
|
/* Is this a DML query that is not FQSed ? */
|
|
non_fqs_dml = (combiner->ss.ps.plan && ((RemoteQuery*)combiner->ss.ps.plan)->rq_params_internal);
|
|
/*
 * If we did not receive a description we have a rowcount or OK response
 */
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
combiner->request_type = REQUEST_TYPE_COMMAND;
|
|
/* Extract rowcount */
|
|
if (combiner->combine_type != COMBINE_TYPE_NONE) {
|
|
uint64 rowcount;
|
|
digits = parse_row_count(msg_body, len, &rowcount);
|
|
if (digits > 0) {
|
|
/*
|
|
* Need to completely remove the dependency on whether
|
|
* it's an FQS or non-FQS DML query in future. For this, command_complete_count
|
|
* needs to be better handled. Currently this field is being updated
|
|
* for each iteration of FetchTuple by re-using the same combiner
|
|
* for each iteration, whereas it seems it should be updated only
|
|
* for each node execution, not for each tuple fetched.
|
|
*/
|
|
|
|
/* Replicated write, make sure they are the same */
|
|
if (combiner->combine_type == COMBINE_TYPE_SAME) {
|
|
if (combiner->valid_command_complete_count) {
|
|
/* For FQS, check if there is a consistency issue with replicated table. */
|
|
if (rowcount != combiner->rqs_processed && !isdummy && !non_fqs_dml) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Write to replicated table returned "
|
|
"different results from the Datanodes on current DN:%s and previous DN:%s.",
|
|
conn->remoteNodeName,
|
|
combiner->previousNodeName)));
|
|
}
|
|
}
|
|
/* Always update the row count. We have initialized it to 0 */
|
|
if (!isdummy)
|
|
combiner->rqs_processed = rowcount;
|
|
} else {
|
|
combiner->rqs_processed += rowcount;
|
|
combiner->rqs_cur_processed = rowcount;
|
|
}
|
|
combiner->previousNodeName = conn->remoteNodeName;
|
|
|
|
/*
 * This rowcount will be used to increment estate->es_processed,
 * either in ExecInsert/Update/Delete for a non-FQS query, or it will
 * be used in RemoteQueryNext() for an FQS query.
 */
|
|
} else {
|
|
combiner->combine_type = COMBINE_TYPE_NONE;
|
|
}
|
|
}
|
|
|
|
/* Do further processing only if response checking is enabled */
|
|
|
|
if (conn->ck_resp_rollback == RESP_ROLLBACK_CHECK) {
|
|
conn->ck_resp_rollback = RESP_ROLLBACK_NOT_RECEIVED;
|
|
if (len == ROLLBACK_RESP_LEN) { /* No need to do string comparison otherwise */
|
|
if (strcmp(msg_body, "ROLLBACK") == 0)
|
|
conn->ck_resp_rollback = RESP_ROLLBACK_RECEIVED;
|
|
}
|
|
}
|
|
|
|
if (!isdummy)
|
|
combiner->valid_command_complete_count++;
|
|
combiner->command_complete_count++;
|
|
}
|
|
|
|
/*
|
|
* Handle RowDescription ('T') message from a Datanode connection
|
|
*/
|
|
static bool HandleRowDescription(RemoteQueryState* combiner, char* msg_body)
|
|
{
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
combiner->request_type = REQUEST_TYPE_QUERY;
|
|
if (combiner->request_type != REQUEST_TYPE_QUERY) {
|
|
/* Inconsistent responses */
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Unexpected response from the Datanodes for 'T' message, current request type %d",
|
|
combiner->request_type)));
|
|
}
|
|
/* Increment counter and check if it was first */
|
|
if (combiner->description_count++ == 0) {
|
|
combiner->tuple_desc = create_tuple_desc(msg_body);
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
#ifdef NOT_USED
|
|
/*
|
|
* Handle ParameterStatus ('S') message from a Datanode connection (SET command)
|
|
*/
|
|
static void HandleParameterStatus(RemoteQueryState* combiner, char* msg_body, size_t len)
|
|
{
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
combiner->request_type = REQUEST_TYPE_QUERY;
|
|
if (combiner->request_type != REQUEST_TYPE_QUERY) {
|
|
/* Inconsistent responses */
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Unexpected response from the Datanodes for 'S' message, current request type %d",
|
|
combiner->request_type)));
|
|
}
|
|
/* Proxy last */
|
|
if (++combiner->description_count == combiner->node_count) {
|
|
pq_putmessage('S', msg_body, len);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Handle CopyInResponse ('G') message from a Datanode connection
|
|
*/
|
|
static void HandleCopyIn(RemoteQueryState* combiner)
|
|
{
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
combiner->request_type = REQUEST_TYPE_COPY_IN;
|
|
if (combiner->request_type != REQUEST_TYPE_COPY_IN) {
|
|
/* Inconsistent responses */
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Unexpected response from the Datanodes for 'G' message, current request type %d",
|
|
combiner->request_type)));
|
|
}
|
|
/*
 * The normal PG code will output a 'G' message when it runs in the
 * Coordinator, so do not proxy the message here, just count it.
 */
|
|
combiner->copy_in_count++;
|
|
}
|
|
|
|
/*
|
|
* Handle CopyOutResponse ('H') message from a Datanode connection
|
|
*/
|
|
static void HandleCopyOut(RemoteQueryState* combiner)
|
|
{
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
combiner->request_type = REQUEST_TYPE_COPY_OUT;
|
|
if (combiner->request_type != REQUEST_TYPE_COPY_OUT) {
|
|
/* Inconsistent responses */
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Unexpected response from the Datanodes for 'H' message, current request type %d",
|
|
combiner->request_type)));
|
|
}
|
|
/*
|
|
* The normal PG code will output an H message when it runs in the
|
|
* Coordinator, so do not proxy message here, just count it.
|
|
*/
|
|
combiner->copy_out_count++;
|
|
}
|
|
|
|
/*
|
|
* Handle CopyOutDataRow ('d') message from a Datanode connection
|
|
*/
|
|
static void HandleCopyDataRow(RemoteQueryState* combiner, char* msg_body, size_t len)
|
|
{
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
combiner->request_type = REQUEST_TYPE_COPY_OUT;
|
|
|
|
/* Inconsistent responses */
|
|
if (combiner->request_type != REQUEST_TYPE_COPY_OUT)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Unexpected response from the Datanodes for 'd' message, current request type %d",
|
|
combiner->request_type)));
|
|
|
|
/* count the row */
|
|
combiner->processed++;
|
|
|
|
/* Output remote COPY operation to correct location */
|
|
switch (combiner->remoteCopyType) {
|
|
case REMOTE_COPY_FILE: {
|
|
/* Write data directly to file */
|
|
|
|
/*
 * This is supposed to be a temporary patch for our file_encoding/client_encoding issue
 * for COPY and FDWs with gsmpp_server. Only a "minimal modification" was asked for here,
 * so this chunk of code only fixes COPY TO FILE (but not the others) in a rather
 * superficial way.
 */
|
|
char* transcoding_for_file = NULL;
|
|
if (u_sess->cmd_cxt.need_transcoding_for_copytofile &&
|
|
WillTranscodingBePerformed(u_sess->cmd_cxt.dest_encoding_for_copytofile)) {
|
|
transcoding_for_file = pg_server_to_any(msg_body, len, u_sess->cmd_cxt.dest_encoding_for_copytofile);
|
|
Assert(transcoding_for_file != msg_body);
|
|
fwrite(transcoding_for_file, 1, strlen(transcoding_for_file), combiner->copy_file);
|
|
pfree_ext(transcoding_for_file);
|
|
} else
|
|
fwrite(msg_body, 1, len, combiner->copy_file);
|
|
break;
|
|
}
|
|
case REMOTE_COPY_STDOUT:
|
|
/* Send back data to client */
|
|
pq_putmessage('d', msg_body, len);
|
|
break;
|
|
case REMOTE_COPY_TUPLESTORE: {
|
|
Datum* values = NULL;
|
|
bool* nulls = NULL;
|
|
TupleDesc tupdesc = combiner->tuple_desc;
|
|
int i, dropped;
|
|
FormData_pg_attribute* attr = tupdesc->attrs;
|
|
FmgrInfo* in_functions = NULL;
|
|
Oid* typioparams = NULL;
|
|
char** fields = NULL;
|
|
|
|
values = (Datum*)palloc(tupdesc->natts * sizeof(Datum));
|
|
nulls = (bool*)palloc(tupdesc->natts * sizeof(bool));
|
|
in_functions = (FmgrInfo*)palloc(tupdesc->natts * sizeof(FmgrInfo));
|
|
typioparams = (Oid*)palloc(tupdesc->natts * sizeof(Oid));
|
|
|
|
/* Calculate the Oids of input functions */
|
|
for (i = 0; i < tupdesc->natts; i++) {
|
|
Oid in_func_oid;
|
|
|
|
/* Do not need any information for dropped attributes */
|
|
if (attr[i].attisdropped)
|
|
continue;
|
|
|
|
getTypeInputInfo(attr[i].atttypid, &in_func_oid, &typioparams[i]);
|
|
fmgr_info(in_func_oid, &in_functions[i]);
|
|
}
|
|
|
|
/*
|
|
* Convert message into an array of fields.
|
|
* Last \n is not included in converted message.
|
|
*/
|
|
fields = CopyOps_RawDataToArrayField(tupdesc, msg_body, len - 1);
|
|
|
|
/* Fill in the array values */
|
|
dropped = 0;
|
|
for (i = 0; i < tupdesc->natts; i++) {
|
|
char* string = fields[i - dropped];
|
|
/* Do not need any information for dropped attributes */
|
|
if (attr[i].attisdropped) {
|
|
dropped++;
|
|
nulls[i] = true; /* Consider dropped parameter as NULL */
|
|
continue;
|
|
}
|
|
|
|
/* Find value */
|
|
values[i] = InputFunctionCall(&in_functions[i], string, typioparams[i], attr[i].atttypmod);
|
|
/* Setup value with NULL flag if necessary */
|
|
if (string == NULL)
|
|
nulls[i] = true;
|
|
else
|
|
nulls[i] = false;
|
|
}
|
|
|
|
/* Then insert the values into tuplestore */
|
|
tuplestore_putvalues(combiner->tuplestorestate, combiner->tuple_desc, values, nulls);
|
|
|
|
/* Clean up everything */
|
|
if (*fields)
|
|
pfree_ext(*fields);
|
|
pfree_ext(fields);
|
|
pfree_ext(values);
|
|
pfree_ext(nulls);
|
|
pfree_ext(in_functions);
|
|
pfree_ext(typioparams);
|
|
} break;
|
|
case REMOTE_COPY_NONE:
|
|
default:
|
|
Assert(0); /* Should not happen */
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
 * Handle DataRow ('D') message from a Datanode connection.
 * The row is copied into combiner->currentRow; the previous row must have been
 * consumed before the next one arrives.
 */
|
|
static void HandleDataRow(
|
|
RemoteQueryState* combiner, char* msg_body, size_t len, Oid nodeoid, const char* remoteNodeName)
|
|
{
|
|
/* We expect previous message is consumed */
|
|
Assert(combiner->currentRow.msg == NULL);
|
|
|
|
if (combiner->request_type != REQUEST_TYPE_QUERY) {
|
|
/* Inconsistent responses */
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg("Unexpected response from the Datanodes for 'D' message, current request type %d",
|
|
combiner->request_type)));
|
|
}
|
|
|
|
/*
 * If we already got an error, ignore incoming data rows from other nodes.
 * Still we want to continue reading until CommandComplete is received.
 */
|
|
if (combiner->errorMessage)
|
|
return;
|
|
|
|
/* Check messages from DN. */
|
|
if (IS_PGXC_COORDINATOR) {
|
|
#ifdef USE_ASSERT_CHECKING
|
|
if (strcmp(remoteNodeName, g_instance.attr.attr_common.PGXCNodeName) != 0) {
|
|
CheckMessages(0, 0, msg_body, len, false);
|
|
msg_body += REMOTE_CHECKMSG_LEN;
|
|
len -= REMOTE_CHECKMSG_LEN;
|
|
}
|
|
#else
|
|
|
|
if (unlikely(anls_opt_is_on(ANLS_STREAM_DATA_CHECK) &&
|
|
strcmp(remoteNodeName, g_instance.attr.attr_common.PGXCNodeName) != 0)) {
|
|
CheckMessages(0, 0, msg_body, len, false);
|
|
msg_body += REMOTE_CHECKMSG_LEN;
|
|
len -= REMOTE_CHECKMSG_LEN;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
 * We copy the message because it points into the connection buffer and
 * will be overwritten on the next socket read.
 */
|
|
combiner->currentRow.msg = (char*)palloc(len);
|
|
errno_t rc = 0;
|
|
rc = memcpy_s(combiner->currentRow.msg, len, msg_body, len);
|
|
securec_check(rc, "\0", "\0");
|
|
combiner->currentRow.msglen = len;
|
|
combiner->currentRow.msgnode = nodeoid;
|
|
}
|
|
|
|
/*
|
|
* Handle ErrorResponse ('E') message from a Datanode connection
|
|
*/
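/*
 * The body is a sequence of (field-type byte, null-terminated string) pairs as defined by
 * the frontend/backend protocol, e.g. 'C' "42601" followed by 'M' "syntax error ...".
 */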
|
|
static void HandleError(RemoteQueryState* combiner, char* msg_body, size_t len)
|
|
{
|
|
/* parse error message */
|
|
char* code = NULL;
|
|
char* message = NULL;
|
|
char* detail = NULL;
|
|
char* context = NULL;
|
|
size_t offset = 0;
|
|
char* realerrcode = NULL;
|
|
char* funcname = NULL;
|
|
char* filename = NULL;
|
|
char* lineno = NULL;
|
|
int error_code = 0;
|
|
char* mod_id = NULL;
|
|
char* hint = NULL;
|
|
char* query = NULL;
|
|
char* cursorpos = NULL;
|
|
|
|
/*
|
|
* Scan until point to terminating \0
|
|
*/
|
|
while (offset + 1 < len) {
|
|
/* pointer to the field message */
|
|
char* str = msg_body + offset + 1;
|
|
|
|
switch (msg_body[offset]) {
|
|
case 'c':
|
|
realerrcode = str;
|
|
break;
|
|
case 'C': /* code */
|
|
code = str;
|
|
|
|
/* Error Code is exactly 5 significant bytes */
|
|
if (code != NULL)
|
|
error_code = MAKE_SQLSTATE((unsigned char)code[0],
|
|
(unsigned char)code[1],
|
|
(unsigned char)code[2],
|
|
(unsigned char)code[3],
|
|
(unsigned char)code[4]);
|
|
break;
|
|
case 'M': /* message */
|
|
message = str;
|
|
break;
|
|
case 'D': /* details */
|
|
detail = str;
|
|
break;
|
|
case 'd': /* mod_id */
|
|
mod_id = str;
|
|
break;
|
|
case 'W': /* where */
|
|
context = str;
|
|
break;
|
|
case 'S': /* severity */
|
|
if (pg_strncasecmp(str, "FATAL", 5) == 0)
|
|
combiner->is_fatal_error = true;
|
|
break;
|
|
case 'R': /* routine */
|
|
funcname = str;
|
|
break;
|
|
case 'F': /* file */
|
|
filename = str;
|
|
break;
|
|
case 'L': /* line */
|
|
lineno = str;
|
|
break;
|
|
|
|
case 'H': /* hint */
|
|
hint = str;
|
|
break;
|
|
|
|
case 'q': /* int query */
|
|
query = str;
|
|
break;
|
|
|
|
case 'p': /* position int */
|
|
cursorpos = str;
|
|
break;
|
|
|
|
/* Fields not yet in use */
|
|
case 'P': /* position string */
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* code, message and \0 */
|
|
offset += strlen(str) + 2;
|
|
}
|
|
|
|
if (!IsErrorNeedCheck(error_code)) {
|
|
combiner->need_error_check = false;
|
|
|
|
/*
|
|
* If the error comes after some communication error(s), we should free the
|
|
* former cached one.
|
|
*/
|
|
combiner->errorCode = 0;
|
|
|
|
if (combiner->errorMessage) {
|
|
pfree_ext(combiner->errorMessage);
|
|
combiner->errorMessage = NULL;
|
|
}
|
|
|
|
if (combiner->errorDetail) {
|
|
pfree_ext(combiner->errorDetail);
|
|
combiner->errorDetail = NULL;
|
|
}
|
|
|
|
if (combiner->errorContext) {
|
|
pfree_ext(combiner->errorContext);
|
|
combiner->errorContext = NULL;
|
|
}
|
|
|
|
if (combiner->hint) {
|
|
pfree_ext(combiner->hint);
|
|
combiner->hint = NULL;
|
|
}
|
|
|
|
if (combiner->query) {
|
|
pfree_ext(combiner->query);
|
|
combiner->query = NULL;
|
|
}
|
|
|
|
combiner->cursorpos = 0;
|
|
combiner->remoteErrData.internalerrcode = 0;
|
|
combiner->remoteErrData.lineno = 0;
|
|
|
|
if (combiner->remoteErrData.filename) {
|
|
pfree_ext(combiner->remoteErrData.filename);
|
|
combiner->remoteErrData.filename = NULL;
|
|
}
|
|
|
|
if (combiner->remoteErrData.errorfuncname) {
|
|
pfree_ext(combiner->remoteErrData.errorfuncname);
|
|
combiner->remoteErrData.errorfuncname = NULL;
|
|
}
|
|
} else {
|
|
if (!combiner->need_error_check) {
|
|
/*
 * If this is the first time a communication error is met that may be caused by a
 * normal connection close, cache it in the combiner (RemoteQueryState) and set the
 * flag 'need_error_check' to true. The other errors coming from the Datanodes need
 * to be checked to figure out the real one.
 */
|
|
combiner->need_error_check = true;
|
|
} else {
|
|
/* If still getting a communication error, just increment the counter and return. */
|
|
combiner->command_complete_count++;
|
|
return;
|
|
}
|
|
}
|
|
|
|
/*
 * We may have special handling for some errors; the default handling is to
 * throw out an error with the same message. We cannot ereport immediately
 * because we should read from this and other connections until
 * ReadyForQuery is received, so we just store the error message.
 * If multiple connections return errors only the first one is reported.
 */
|
|
if (combiner->errorMessage == NULL) {
|
|
if (message != NULL) {
|
|
combiner->errorMessage = pstrdup(message);
|
|
|
|
if (code != NULL)
|
|
combiner->errorCode = error_code;
|
|
|
|
if (realerrcode != NULL)
|
|
combiner->remoteErrData.internalerrcode = pg_strtoint32(realerrcode);
|
|
}
|
|
|
|
if (hint != NULL)
|
|
combiner->hint = pstrdup(hint);
|
|
else
|
|
combiner->hint = NULL;
|
|
|
|
if (query != NULL)
|
|
combiner->query = pstrdup(query);
|
|
else
|
|
combiner->query = NULL;
|
|
|
|
if (detail != NULL)
|
|
combiner->errorDetail = pstrdup(detail);
|
|
else
|
|
combiner->errorDetail = NULL;
|
|
|
|
if (context != NULL)
|
|
combiner->errorContext = pstrdup(context);
|
|
else
|
|
combiner->errorContext = NULL;
|
|
|
|
if (filename != NULL)
|
|
combiner->remoteErrData.filename = pstrdup(filename);
|
|
else
|
|
combiner->remoteErrData.filename = NULL;
|
|
|
|
if (funcname != NULL)
|
|
combiner->remoteErrData.errorfuncname = pstrdup(funcname);
|
|
else
|
|
combiner->remoteErrData.errorfuncname = NULL;
|
|
|
|
if (lineno != NULL)
|
|
combiner->remoteErrData.lineno = pg_strtoint32(lineno);
|
|
else
|
|
combiner->remoteErrData.lineno = 0;
|
|
|
|
if (cursorpos != NULL)
|
|
combiner->cursorpos = pg_strtoint32(cursorpos);
|
|
else
|
|
combiner->cursorpos = 0;
|
|
|
|
if (mod_id != NULL)
|
|
combiner->remoteErrData.mod_id = get_module_id(mod_id);
|
|
}
|
|
|
|
/*
 * If a Datanode has sent ErrorResponse it will never send CommandComplete.
 * Increment the counter to prevent endless waiting for it.
 */
|
|
combiner->command_complete_count++;
|
|
}
|
|
|
|
/*
|
|
* Handle NoticeResponse ('N') message from a Datanode connection
|
|
*/
|
|
static void HandleNotice(RemoteQueryState* combiner, char* msg_body, size_t len)
|
|
{
|
|
/* parse error message */
|
|
char* message = NULL;
|
|
char* detail = NULL;
|
|
char* hint = NULL;
|
|
size_t offset = 0;
|
|
int elevel = NOTICE;
|
|
|
|
/*
|
|
* Scan until point to terminating \0
|
|
*/
|
|
while (offset + 1 < len) {
|
|
/* pointer to the field message */
|
|
char* str = msg_body + offset + 1;
|
|
|
|
switch (msg_body[offset]) {
|
|
case 'M': /* message */
|
|
message = str;
|
|
break;
|
|
case 'D': /* details */
|
|
detail = str;
|
|
break;
|
|
|
|
/* Fields not yet in use */
|
|
case 'S': /* severity */
|
|
if (pg_strncasecmp(str, "WARNING", strlen("WARNING")) == 0) {
|
|
elevel = WARNING;
|
|
}
|
|
break;
|
|
|
|
case 'H': /* hint */
|
|
hint = str;
|
|
break;
|
|
|
|
case 'C': /* code */
|
|
case 'R': /* routine */
|
|
case 'P': /* position string */
|
|
case 'p': /* position int */
|
|
case 'q': /* int query */
|
|
case 'W': /* where */
|
|
case 'F': /* file */
|
|
case 'L': /* line */
|
|
default:
|
|
break;
|
|
}
|
|
|
|
/* code, message and \0 */
|
|
offset += strlen(str) + 2;
|
|
}
|
|
|
|
if (message != NULL) {
|
|
if ((detail != NULL) && (hint != NULL))
|
|
ereport(
|
|
elevel, (errmsg("%s", message), errdetail("%s", detail), errhint("%s", hint), handle_in_client(true)));
|
|
else if (detail != NULL)
|
|
ereport(elevel, (errmsg("%s", message), errdetail("%s", detail), handle_in_client(true)));
|
|
else if (hint != NULL)
|
|
ereport(elevel, (errmsg("%s", message), errhint("%s", hint), handle_in_client(true)));
|
|
else
|
|
ereport(elevel, (errmsg("%s", message), handle_in_client(true)));
|
|
}
|
|
}
|
|
|
|
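/*
 * Merge a command-completion tag received from a Datanode into the running CombineTag,
 * e.g. combining a stored "INSERT 0 3" with an incoming "INSERT 0 5" produces "INSERT 0 8".
 */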
void HandleCmdComplete(CmdType commandType, CombineTag* combine, const char* msg_body, size_t len)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
int digits = 0;
|
|
uint64 originrowcount = 0;
|
|
uint64 rowcount = 0;
|
|
uint64 total = 0;
|
|
errno_t rc = EOK;
|
|
|
|
if (msg_body == NULL)
|
|
return;
|
|
|
|
/* if there's nothing in combine, just copy the msg_body */
|
|
if (strlen(combine->data) == 0) {
|
|
errno_t ret = strcpy_s(combine->data, COMPLETION_TAG_BUFSIZE, msg_body);
|
|
securec_check(ret, "\0", "\0");
|
|
combine->cmdType = commandType;
|
|
return;
|
|
} else {
|
|
/* conflicting commandType; nothing to combine */
|
|
if (combine->cmdType != commandType)
|
|
return;
|
|
|
|
/* get the processed row number from msg_body */
|
|
digits = parse_row_count(msg_body, len + 1, &rowcount);
|
|
elog(DEBUG1, "digits is %d\n", digits);
|
|
Assert(digits >= 0);
|
|
|
|
/* no need to combine */
|
|
if (digits == 0)
|
|
return;
|
|
|
|
/* combine the processed row number */
|
|
parse_row_count(combine->data, strlen(combine->data) + 1, &originrowcount);
|
|
elog(DEBUG1, "originrowcount is %lu, rowcount is %lu\n", originrowcount, rowcount);
|
|
total = originrowcount + rowcount;
|
|
}
|
|
|
|
/* output command completion tag */
|
|
switch (commandType) {
|
|
case CMD_SELECT:
|
|
rc = strcpy_s(combine->data, COMPLETION_TAG_BUFSIZE, "SELECT");
|
|
securec_check(rc, "", "");
|
|
break;
|
|
case CMD_INSERT:
|
|
rc = snprintf_s(
|
|
combine->data, COMPLETION_TAG_BUFSIZE, COMPLETION_TAG_BUFSIZE - 1, "INSERT %u %lu", 0, total);
|
|
securec_check_ss(rc, "", "");
|
|
break;
|
|
case CMD_UPDATE:
|
|
rc = snprintf_s(combine->data, COMPLETION_TAG_BUFSIZE, COMPLETION_TAG_BUFSIZE - 1, "UPDATE %lu", total);
|
|
securec_check_ss(rc, "", "");
|
|
break;
|
|
case CMD_DELETE:
|
|
rc = snprintf_s(combine->data, COMPLETION_TAG_BUFSIZE, COMPLETION_TAG_BUFSIZE - 1, "DELETE %lu", total);
|
|
securec_check_ss(rc, "", "");
|
|
break;
|
|
default:
|
|
rc = strcpy_s(combine->data, COMPLETION_TAG_BUFSIZE, "");
|
|
securec_check(rc, "", "");
|
|
break;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* HandleDatanodeCommandId ('M') message from a Datanode connection
|
|
*/
|
|
static void HandleDatanodeCommandId(RemoteQueryState* combiner, const char* msg_body, size_t len)
|
|
{
|
|
uint32 n32;
|
|
CommandId cid;
|
|
|
|
Assert(msg_body != NULL);
|
|
Assert(len >= 2);
|
|
|
|
/* Get the command Id */
|
|
errno_t rc = 0;
|
|
rc = memcpy_s(&n32, sizeof(uint32), &msg_body[0], sizeof(uint32));
|
|
securec_check(rc, "\0", "\0");
|
|
cid = ntohl(n32);
|
|
|
|
/* If received command Id is higher than current one, set it to a new value */
|
|
if (cid > GetReceivedCommandId())
|
|
SetReceivedCommandId(cid);
|
|
}
|
|
|
|
/*
 * Description: handle the TotalRowCount ('P') message, which reports the total row
 * count received from a Datanode while it is being analyzed.
 *
 * Parameters:
 * @in combiner: Connection info for a remote node.
 * @in msg_body: Represents a TotalRowCount message received from a remote node.
 * @in len: The message length.
 * Returns: void
 */
|
|
static void HandleAnalyzeTotalRow(RemoteQueryState* combiner, const char* msg_body, size_t len)
|
|
{
|
|
StringInfoData buf;
|
|
|
|
Assert(msg_body != NULL);
|
|
|
|
initStringInfo(&buf);
|
|
appendBinaryStringInfo(&buf, &msg_body[0], len);
|
|
|
|
if (2 * sizeof(int64) == len) {
|
|
combiner->analyze_totalrowcnt[ANALYZENORMAL] = pq_getmsgint64(&buf);
|
|
combiner->analyze_memsize[ANALYZENORMAL] = pq_getmsgint64(&buf);
|
|
} else {
|
|
Assert(len == sizeof(int64) * 3);
|
|
combiner->analyze_totalrowcnt[ANALYZEDELTA - 1] = pq_getmsgint64(&buf);
|
|
combiner->analyze_memsize[ANALYZEDELTA - 1] = 0;
|
|
}
|
|
|
|
pfree_ext(buf.data);
|
|
}
|
|
|
|
/*
|
|
* Examine the specified combiner state and determine if command was completed
|
|
* successfully
|
|
*/
|
|
bool validate_combiner(RemoteQueryState* combiner)
|
|
{
|
|
/* There was error message while combining */
|
|
if (combiner->errorMessage)
|
|
return false;
|
|
/* Check if state is defined */
|
|
if (combiner->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
return false;
|
|
|
|
/* Check all nodes completed */
|
|
if ((combiner->request_type == REQUEST_TYPE_COMMAND || combiner->request_type == REQUEST_TYPE_QUERY) &&
|
|
combiner->command_complete_count != combiner->node_count)
|
|
return false;
|
|
|
|
/* Check count of description responses */
|
|
if (combiner->request_type == REQUEST_TYPE_QUERY && combiner->description_count != combiner->node_count)
|
|
return false;
|
|
|
|
/* Check count of copy-in responses */
|
|
if (combiner->request_type == REQUEST_TYPE_COPY_IN && combiner->copy_in_count != combiner->node_count)
|
|
return false;
|
|
|
|
/* Check count of copy-out responses */
|
|
if (combiner->request_type == REQUEST_TYPE_COPY_OUT && combiner->copy_out_count != combiner->node_count)
|
|
return false;
|
|
|
|
/* Add other checks here as needed */
|
|
|
|
/* All is good if we are here */
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Close combiner and free allocated memory, if it is not needed
|
|
*/
|
|
void CloseCombiner(RemoteQueryState* combiner)
|
|
{
|
|
if (combiner != NULL) {
|
|
if (combiner->connections)
|
|
pfree_ext(combiner->connections);
|
|
if (combiner->tuple_desc) {
|
|
/*
 * In the case of a remote COPY with tuplestore, the combiner is not
 * responsible for freeing the tuple store. This is done at an upper
 * level once data redistribution is completed.
 */
|
|
if (combiner->remoteCopyType != REMOTE_COPY_TUPLESTORE)
|
|
FreeTupleDesc(combiner->tuple_desc);
|
|
}
|
|
if (combiner->errorMessage)
|
|
pfree_ext(combiner->errorMessage);
|
|
if (combiner->errorDetail)
|
|
pfree_ext(combiner->errorDetail);
|
|
if (combiner->cursor_connections)
|
|
pfree_ext(combiner->cursor_connections);
|
|
if (combiner->tapenodes)
|
|
pfree_ext(combiner->tapenodes);
|
|
if (combiner->errorContext)
|
|
pfree_ext(combiner->errorContext);
|
|
if (combiner->switch_connection)
|
|
pfree_ext(combiner->switch_connection);
|
|
if (combiner->nodeidxinfo)
|
|
pfree_ext(combiner->nodeidxinfo);
|
|
if (combiner->row_store) {
|
|
RowStoreDestory(combiner->row_store);
|
|
}
|
|
pfree_ext(combiner);
|
|
}
|
|
}
|
|
|
|
void CloseCombinerForBarrier(RemoteQueryState* combiner)
|
|
{
|
|
CloseCombiner(combiner);
|
|
}
|
|
|
|
/*
|
|
* Validate combiner and release storage freeing allocated memory
|
|
*/
|
|
bool ValidateAndCloseCombiner(RemoteQueryState* combiner)
|
|
{
|
|
bool valid = validate_combiner(combiner);
|
|
|
|
CloseCombiner(combiner);
|
|
|
|
return valid;
|
|
}
|
|
|
|
/*
|
|
* @Description: get actual connection idx from combiner;
|
|
* @ in conn -- current connection info
|
|
* @return - connection idx
|
|
*/
|
|
static int GetConnIdx(PGXCNodeHandle* conn)
|
|
{
|
|
int idx = 0;
|
|
RemoteQueryState* combiner = conn->combiner;
|
|
for (int j = 0; j < combiner->node_count; j++) {
|
|
if (conn->nodeoid == combiner->nodeidxinfo[j].nodeoid) {
|
|
idx = combiner->nodeidxinfo[j].nodeidx;
|
|
break;
|
|
}
|
|
}
|
|
|
|
return idx;
|
|
}
|
|
|
|
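/*
 * Detach the given connection from its current combiner by buffering everything it still
 * has to deliver: the pending currentRow and any further data rows are moved into the
 * combiner's rowBuffer until the connection leaves the DN_CONNECTION_STATE_QUERY state,
 * after which another RemoteQueryState may safely use the connection.
 */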
void BufferConnection(PGXCNodeHandle* conn, bool cachedata)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
RemoteQueryState* combiner = conn->combiner;
|
|
MemoryContext oldcontext;
|
|
|
|
if (combiner == NULL || conn->state != DN_CONNECTION_STATE_QUERY)
|
|
return;
|
|
|
|
/*
 * When BufferConnection is invoked, CurrentMemoryContext belongs to another
 * portal, which is trying to take control of the connection.
 */
|
|
oldcontext = MemoryContextSwitchTo(combiner->ss.ss_ScanTupleSlot->tts_mcxt);
|
|
|
|
/* Verify the connection is in use by the combiner */
|
|
combiner->current_conn = 0;
|
|
while (combiner->current_conn < combiner->conn_count) {
|
|
if (combiner->connections[combiner->current_conn] == conn)
|
|
break;
|
|
combiner->current_conn++;
|
|
}
|
|
Assert(combiner->current_conn < combiner->conn_count);
|
|
|
|
/*
 * Buffer data rows until the Datanode returns the number of rows specified by the
 * fetch_size parameter of the last Execute message (PortalSuspended message)
 * or the end of the result set is reached (CommandComplete message).
 */
|
|
while (conn->state == DN_CONNECTION_STATE_QUERY) {
|
|
int res;
|
|
|
|
/* Move to buffer currentRow (received from the Datanode) */
|
|
if (combiner->currentRow.msg) {
|
|
RemoteDataRow dataRow = (RemoteDataRow)palloc(sizeof(RemoteDataRowData));
|
|
*dataRow = combiner->currentRow;
|
|
combiner->currentRow.msg = NULL;
|
|
combiner->currentRow.msglen = 0;
|
|
combiner->currentRow.msgnode = 0;
|
|
combiner->rowBuffer = lappend(combiner->rowBuffer, dataRow);
|
|
}
|
|
|
|
res = handle_response(conn, combiner);
|
|
/*
 * If the response message is a DataRow it will be handled on the next
 * iteration.
 * PortalSuspended will cause a connection state change and break the loop.
 * The same is true for CommandComplete, but we need additional handling -
 * remove the connection from the list of active connections.
 * We may need to add handling of error responses.
 */
|
|
if (res == RESPONSE_EOF) {
|
|
/* incomplete message, read more */
|
|
if (pgxc_node_receive(1, &conn, NULL)) {
|
|
conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
add_error_message(conn, "%s", "Failed to fetch from Datanode");
|
|
}
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
/* Remove current connection, move last in-place, adjust current_conn */
|
|
if (combiner->current_conn < --combiner->conn_count)
|
|
combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
|
|
else
|
|
combiner->current_conn = 0;
|
|
}
|
|
/*
 * Before returning RESPONSE_COMPLETE or PORTAL_SUSPENDED, handle_response()
 * changes the connection state to DN_CONNECTION_STATE_IDLE, breaking the
 * loop. We do not need to do anything specific in the case of
 * PORTAL_SUSPENDED, so the "else if" block for that case is skipped.
 */
|
|
}
|
|
MemoryContextSwitchTo(oldcontext);
|
|
conn->combiner = NULL;
|
|
#endif
|
|
}
|
|
|
|
void CopyDataRowToBatch(RemoteQueryState* node, VectorBatch* batch)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* copy the datarow from combiner to the given slot, in the slot's memory
|
|
* context
|
|
*/
|
|
void CopyDataRowTupleToSlot(RemoteQueryState* combiner, TupleTableSlot* slot)
|
|
{
|
|
char* msg = NULL;
|
|
MemoryContext oldcontext;
|
|
oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
|
|
msg = (char*)palloc(combiner->currentRow.msglen);
|
|
errno_t rc = 0;
|
|
rc = memcpy_s(msg, combiner->currentRow.msglen, combiner->currentRow.msg, combiner->currentRow.msglen);
|
|
securec_check(rc, "\0", "\0");
|
|
ExecStoreDataRowTuple(msg, combiner->currentRow.msglen, combiner->currentRow.msgnode, slot, true);
|
|
pfree_ext(combiner->currentRow.msg);
|
|
combiner->currentRow.msg = NULL;
|
|
combiner->currentRow.msglen = 0;
|
|
combiner->currentRow.msgnode = 0;
|
|
MemoryContextSwitchTo(oldcontext);
|
|
}
|
|
|
|
bool FetchTuple(RemoteQueryState* combiner, TupleTableSlot* slot, ParallelFunctionState* parallelfunctionstate)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return false;
|
|
#else
|
|
bool have_tuple = false;
|
|
|
|
/* If we have message in the buffer, consume it */
|
|
if (combiner->currentRow.msg) {
|
|
CopyDataRowTupleToSlot(combiner, slot);
|
|
have_tuple = true;
|
|
}
|
|
|
|
/*
 * Note: If we are fetching unsorted results we cannot have both
 * currentRow and buffered rows. When a connection is buffered, currentRow
 * is moved to the buffer, and then it is cleared after buffering is
 * completed. Afterwards rows are taken from the buffer, bypassing
 * currentRow, until the buffer is empty, and only after that is data read
 * from a connection.
 * PGXCTODO: the message should be allocated in the same memory context as
 * that of the slot. Are we sure of that in the call to
 * ExecStoreDataRowTuple below? If one fixes this memory issue, please
 * consider using CopyDataRowTupleToSlot() for the same.
 */
|
|
if (list_length(combiner->rowBuffer) > 0) {
|
|
RemoteDataRow dataRow = (RemoteDataRow)linitial(combiner->rowBuffer);
|
|
combiner->rowBuffer = list_delete_first(combiner->rowBuffer);
|
|
ExecStoreDataRowTuple(dataRow->msg, dataRow->msglen, dataRow->msgnode, slot, true);
|
|
pfree(dataRow);
|
|
return true;
|
|
}
|
|
|
|
while (combiner->conn_count > 0) {
|
|
int res;
|
|
PGXCNodeHandle* conn = combiner->connections[combiner->current_conn];
|
|
|
|
/* Going to use a connection, buffer it if needed */
|
|
if (conn->state == DN_CONNECTION_STATE_QUERY && conn->combiner != NULL && conn->combiner != combiner)
|
|
BufferConnection(conn);
|
|
|
|
/*
 * If the current connection is idle it means the portal on the Datanode is
 * suspended. If we have a tuple, do not hurry to request more rows and
 * leave the connection clean for other RemoteQueries.
 * If we do not have one, request more and try to get it.
 */
|
|
if (conn->state == DN_CONNECTION_STATE_IDLE) {
|
|
/*
|
|
* If we have tuple to return do not hurry to request more, keep
|
|
* connection clean
|
|
*/
|
|
if (have_tuple)
|
|
return true;
|
|
else {
|
|
if (pgxc_node_send_execute(conn, combiner->cursor, 1) != 0)
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to fetch from Datanode")));
|
|
if (pgxc_node_send_sync(conn) != 0)
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to fetch from Datanode")));
|
|
if (pgxc_node_receive(1, &conn, NULL))
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to fetch from Datanode")));
|
|
conn->combiner = combiner;
|
|
}
|
|
}
|
|
|
|
/* read messages */
|
|
res = handle_response(conn, combiner);
|
|
if (res == RESPONSE_EOF) {
|
|
/* incomplete message, read more */
|
|
if (pgxc_node_receive(1, &conn, NULL))
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to fetch from Datanode")));
|
|
continue;
|
|
} else if (res == RESPONSE_SUSPENDED) {
|
|
/* Make next connection current */
|
|
if (++combiner->current_conn >= combiner->conn_count)
|
|
combiner->current_conn = 0;
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
/* Remove current connection, move last in-place, adjust current_conn */
|
|
if (combiner->current_conn < --combiner->conn_count)
|
|
combiner->connections[combiner->current_conn] = combiner->connections[combiner->conn_count];
|
|
else
|
|
combiner->current_conn = 0;
|
|
} else if (res == RESPONSE_DATAROW && have_tuple) {
|
|
/*
|
|
* We already have a tuple and received another one, leave it till
|
|
* next fetch
|
|
*/
|
|
return true;
|
|
}
|
|
|
|
/* If we have message in the buffer, consume it */
|
|
if (combiner->currentRow.msg) {
|
|
CopyDataRowTupleToSlot(combiner, slot);
|
|
have_tuple = true;
|
|
}
|
|
}
|
|
|
|
/* report end of data to the caller */
|
|
if (!have_tuple)
|
|
(void)ExecClearTuple(slot);
|
|
|
|
return have_tuple;
|
|
#endif
|
|
}
|
|
|
|
/*
 * Read responses from the active connections and place the next data row into the
 * provided slot (or batch). Returns true once a row has been stored; returns false
 * when all connections have completed or when more data must be received first (in
 * the latter case need_more_data is set).
 */
|
|
|
|
template <bool BatchFormat>
|
|
static bool GetTupleFromConn(RemoteQueryState* node, void* slot, ParallelFunctionState* parallelfunctionstate)
|
|
{
|
|
int connIdx = 0;
|
|
PGXCNodeHandle** connections = NULL;
|
|
|
|
connIdx = node->current_conn;
|
|
connections = node->connections;
|
|
|
|
// handle data from all connection.
|
|
while (connIdx < node->conn_count) {
|
|
int res = handle_response(connections[connIdx], node);
|
|
Assert(connections[connIdx]->combiner == NULL || connections[connIdx]->combiner == node);
|
|
switch (res) {
|
|
case RESPONSE_EOF: // try next run.
|
|
{
|
|
connIdx++;
|
|
} break;
|
|
case RESPONSE_COMPLETE: // finish one connection.
|
|
{
|
|
node->conn_count = node->conn_count - 1;
|
|
|
|
// all finished
|
|
if (node->conn_count == 0) {
|
|
if (BatchFormat == false)
|
|
(void)ExecClearTuple((TupleTableSlot*)slot);
|
|
|
|
node->need_more_data = false;
|
|
return false;
|
|
}
|
|
|
|
if (connIdx < node->conn_count) {
|
|
connections[connIdx] =
|
|
connections[node->conn_count]; // shrink for one size as one connection has finished
|
|
}
|
|
} break;
|
|
case RESPONSE_TUPDESC: {
|
|
/* when used in ParallelFunction we should get tupledesc by ourself. */
|
|
if (parallelfunctionstate != NULL) {
|
|
ExecSetSlotDescriptor((TupleTableSlot*)slot, node->tuple_desc);
|
|
parallelfunctionstate->tupdesc = node->tuple_desc;
|
|
} else {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Unexpected TUPDESC response from Datanode")));
|
|
}
|
|
} break;
|
|
case RESPONSE_DATAROW: {
|
|
/* If we have message in the buffer, consume it */
|
|
if (node->currentRow.msg != 0) {
|
|
if (BatchFormat)
|
|
CopyDataRowToBatch(node, (VectorBatch*)slot);
|
|
else
|
|
CopyDataRowTupleToSlot(node, (TupleTableSlot*)slot);
|
|
node->need_more_data = false;
|
|
node->current_conn = connIdx; // remember the current connection index.
|
|
}
|
|
|
|
if (parallelfunctionstate != NULL) {
|
|
if (parallelfunctionstate->tupstore != NULL && !TupIsNull((TupleTableSlot*)slot)) {
|
|
/*
 * Store the tuple received from each node into the tuplestore in
 * ParallelFunction; we will not return for each tuple received here.
 */
|
|
tuplestore_puttupleslot(parallelfunctionstate->tupstore, (TupleTableSlot*)slot);
|
|
(void)ExecClearTuple((TupleTableSlot*)slot);
|
|
}
|
|
} else {
|
|
return true;
|
|
}
|
|
} break;
|
|
default:
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Unexpected response from Datanode")));
|
|
break;
|
|
}
|
|
}
|
|
|
|
Assert(connIdx == node->conn_count);
|
|
node->current_conn = 0;
|
|
node->need_more_data = true;
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Get next data row from the combiner's buffer into provided slot
|
|
* Just clear slot and return false if buffer is empty, that means end of result
|
|
* set is reached
|
|
*/
|
|
template <bool BatchFormat, bool ForParallelFunction>
|
|
bool FetchTupleByMultiChannel(
|
|
RemoteQueryState* combiner, TupleTableSlot* slot, ParallelFunctionState* parallelfunctionstate)
|
|
{
|
|
if (!ForParallelFunction) {
|
|
/* If we have message in the buffer, consume it */
|
|
if (combiner->currentRow.msg) {
|
|
if (BatchFormat)
|
|
CopyDataRowToBatch(combiner, (VectorBatch*)slot);
|
|
else
|
|
CopyDataRowTupleToSlot(combiner, slot);
|
|
|
|
combiner->current_conn = 0;
|
|
return true;
|
|
}
|
|
|
|
if (((RemoteQuery*)combiner->ss.ps.plan) != NULL && ((RemoteQuery*)combiner->ss.ps.plan)->sort)
|
|
return false;
|
|
|
|
if (RowStoreLen(combiner->row_store) > 0) {
|
|
RemoteDataRowData dataRow;
|
|
|
|
RowStoreFetch(combiner->row_store, &dataRow);
|
|
|
|
if (BatchFormat) {
|
|
((VectorBatch*)slot)->DeserializeWithLZ4Decompress(dataRow.msg, dataRow.msglen);
|
|
} else {
|
|
NetWorkTimeDeserializeStart(t_thrd.pgxc_cxt.GlobalNetInstr);
|
|
ExecStoreDataRowTuple(dataRow.msg, dataRow.msglen, dataRow.msgnode, slot, true);
|
|
NetWorkTimeDeserializeEnd(t_thrd.pgxc_cxt.GlobalNetInstr);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
}
|
|
|
|
while (combiner->conn_count > 0) {
|
|
if (combiner->need_more_data) {
|
|
struct timeval timeout;
|
|
timeout.tv_sec = ERROR_CHECK_TIMEOUT;
|
|
timeout.tv_usec = 0;
|
|
|
|
/*
 * If the other errors need to be checked after getting a normal communication
 * error, set a timeout first when coming to receive data again. If any poll
 * error is then hit, report the formerly cached error in the combiner
 * (RemoteQueryState).
 */
|
|
if (pgxc_node_receive(
|
|
combiner->conn_count, combiner->connections, combiner->need_error_check ? &timeout : NULL)) {
|
|
if (!combiner->need_error_check) {
|
|
int error_code;
|
|
char* error_msg = getSocketError(&error_code);
|
|
|
|
ereport(ERROR,
|
|
(errcode(error_code),
|
|
errmsg("Failed to read response from Datanodes Detail: %s\n", error_msg)));
|
|
} else {
|
|
combiner->need_error_check = false;
|
|
pgxc_node_report_error(combiner);
|
|
}
|
|
}
|
|
}
|
|
if (!ForParallelFunction) {
|
|
if (GetTupleFromConn<BatchFormat>(combiner, slot, NULL))
|
|
return true;
|
|
else {
|
|
if (combiner->need_more_data == false) {
|
|
if (BatchFormat == false)
|
|
(void)ExecClearTuple(slot);
|
|
return false;
|
|
}
|
|
}
|
|
} else {
|
|
/* no need to check the function's return value as it will always be false in ParallelFunction. */
|
|
(void)GetTupleFromConn<false>(combiner, slot, parallelfunctionstate);
|
|
|
|
/* report error if any. */
|
|
pgxc_node_report_error(combiner);
|
|
|
|
if (combiner->need_more_data == false)
|
|
return true;
|
|
}
|
|
}
|
|
|
|
if (BatchFormat == false && !ForParallelFunction)
|
|
(void)ExecClearTuple(slot);
|
|
|
|
return false;
|
|
}
|
|
|
|
bool FetchBatch(RemoteQueryState* combiner, VectorBatch* batch)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Handle responses from the Datanode connections
|
|
*/
|
|
int pgxc_node_receive_responses(const int conn_count, PGXCNodeHandle** connections, struct timeval* timeout,
|
|
RemoteQueryState* combiner, bool checkerror)
|
|
{
|
|
int count = conn_count;
|
|
errno_t rc = EOK;
|
|
PGXCNodeHandle** to_receive = NULL;
|
|
|
|
if (conn_count > 0) {
|
|
to_receive = (PGXCNodeHandle**)palloc(conn_count * sizeof(PGXCNodeHandle*));
|
|
|
|
/* make a copy of the pointers to the connections */
|
|
rc = memcpy_s(
|
|
to_receive, conn_count * sizeof(PGXCNodeHandle*), connections, conn_count * sizeof(PGXCNodeHandle*));
|
|
securec_check(rc, "", "");
|
|
}
|
|
|
|
/*
 * Read results.
 * Note we try to read from the Datanode connections even if there is an error on one,
 * so as to avoid reading incorrect results on the next statement.
 * Other safeguards exist to avoid this, however.
 */
|
|
while (count > 0) {
|
|
int i = 0;
|
|
|
|
if (pgxc_node_receive(count, to_receive, timeout)) {
|
|
pfree_ext(to_receive);
|
|
return EOF;
|
|
}
|
|
while (i < count) {
|
|
int result = handle_response(to_receive[i], combiner);
|
|
switch (result) {
|
|
case RESPONSE_EOF: /* have something to read, keep receiving */
|
|
i++;
|
|
break;
|
|
case RESPONSE_COMPLETE:
|
|
case RESPONSE_COPY:
|
|
/* Handling is done, do not track this connection */
|
|
count--;
|
|
/* Move last connection in place */
|
|
if (i < count)
|
|
to_receive[i] = to_receive[count];
|
|
break;
|
|
default:
|
|
/* Inconsistent responses */
|
|
add_error_message(to_receive[i], "%s", "Unexpected response from the Datanodes");
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from the Datanodes, result = %d, request type %d",
|
|
result, combiner->request_type)));
|
|
/* Stop tracking and move last connection in place */
|
|
count--;
|
|
if (i < count)
|
|
to_receive[i] = to_receive[count];
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
/*
* For pgxc_node_remote_prepare, pgxc_node_remote_commit and pgxc_node_remote_abort
* we don't report errors here, because we have not set remoteXactState.status yet.
*/
|
|
if (checkerror)
|
|
pgxc_node_report_error(combiner);
|
|
|
|
if (conn_count > 0)
|
|
pfree_ext(to_receive);
|
|
return 0;
|
|
}
|
|
|
|
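/*
* light_node_report_error
* Stub for the light-proxy error reporting path: not supported in this build,
* it only asserts and reports DISTRIBUTED_FEATURE_NOT_SUPPORTED.
*/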
void light_node_report_error(lightProxyErrData* combiner)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* For Light proxy: like HandleError
|
|
*/
|
|
void light_handle_error(lightProxyErrData* combiner, char* msg_body, size_t len)
|
|
{
|
|
/* parse error message */
|
|
char* code = NULL;
|
|
char* message = NULL;
|
|
char* detail = NULL;
|
|
char* context = NULL;
|
|
size_t offset = 0;
|
|
char* realerrcode = NULL;
|
|
char* funcname = NULL;
|
|
char* filename = NULL;
|
|
char* lineno = NULL;
|
|
int error_code = 0;
|
|
|
|
/*
* Scan the message until we reach the terminating \0
*/
|
|
while (offset + 1 < len) {
|
|
/* pointer to the field message */
|
|
char* str = msg_body + offset + 1;
|
|
|
|
switch (msg_body[offset]) {
|
|
case 'c':
|
|
realerrcode = str;
|
|
break;
|
|
case 'C': /* code */
|
|
code = str;
|
|
/* Error Code is exactly 5 significant bytes */
|
|
if (code != NULL)
|
|
error_code = MAKE_SQLSTATE((unsigned char)code[0],
|
|
(unsigned char)code[1],
|
|
(unsigned char)code[2],
|
|
(unsigned char)code[3],
|
|
(unsigned char)code[4]);
|
|
break;
|
|
case 'M': /* message */
|
|
message = str;
|
|
break;
|
|
case 'D': /* details */
|
|
detail = str;
|
|
break;
|
|
case 'W': /* where */
|
|
context = str;
|
|
break;
|
|
case 'S': /* severity */
|
|
if (pg_strncasecmp(str, "FATAL", 5) == 0)
|
|
combiner->is_fatal_error = true;
|
|
break;
|
|
case 'R': /* routine */
|
|
funcname = str;
|
|
break;
|
|
case 'F': /* file */
|
|
filename = str;
|
|
break;
|
|
case 'L': /* line */
|
|
lineno = str;
|
|
break;
|
|
/* Fields not yet in use */
|
|
case 'H': /* hint */
|
|
case 'P': /* position string */
|
|
case 'p': /* position int */
|
|
case 'q': /* int query */
|
|
default:
|
|
break;
|
|
}
|
|
/* code, message and \0 */
|
|
offset += strlen(str) + 2;
|
|
}
|
|
|
|
if (message != NULL) {
|
|
combiner->errorMessage = pstrdup(message);
|
|
|
|
if (code != NULL)
|
|
combiner->errorCode = error_code;
|
|
|
|
if (realerrcode != NULL)
|
|
combiner->remoteErrData.internalerrcode = pg_strtoint32(realerrcode);
|
|
}
|
|
|
|
if (detail != NULL)
|
|
combiner->errorDetail = pstrdup(detail);
|
|
|
|
if (context != NULL)
|
|
combiner->errorContext = pstrdup(context);
|
|
|
|
if (filename != NULL)
|
|
combiner->remoteErrData.filename = pstrdup(filename);
|
|
|
|
if (funcname != NULL)
|
|
combiner->remoteErrData.errorfuncname = pstrdup(funcname);
|
|
|
|
if (lineno != NULL)
|
|
combiner->remoteErrData.lineno = pg_strtoint32(lineno);
|
|
}
|
|
|
|
int light_handle_response(PGXCNodeHandle* conn, lightProxyMsgCtl* msgctl, lightProxy* lp)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
}
|
|
|
|
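/*
* handle_response
* Consume whatever is currently buffered on the given connection and update the
* combiner state. Returns RESPONSE_EOF when more data must be read, RESPONSE_TUPDESC,
* RESPONSE_DATAROW or RESPONSE_COPY for the corresponding protocol messages, and
* RESPONSE_COMPLETE (or RESPONSE_SUSPENDED after a PortalSuspended) once ReadyForQuery
* is seen or the connection is in a fatal error state.
*/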
int handle_response(PGXCNodeHandle* conn, RemoteQueryState* combiner, bool isdummy)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
#else
|
|
char* msg;
|
|
int msg_len;
|
|
char msg_type;
|
|
bool suspended = false;
|
|
|
|
for (;;) {
|
|
Assert(conn->state != DN_CONNECTION_STATE_IDLE);
|
|
|
|
/*
|
|
* If we are in the process of shutting down, we
|
|
* may be rolling back, and the buffer may contain other messages.
|
|
* We want to avoid a procarray exception
|
|
* as well as an error stack overflow.
|
|
*/
|
|
if (proc_exit_inprogress)
|
|
conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
|
|
/* don't read from the connection if there is a fatal error */
|
|
if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL)
|
|
return RESPONSE_COMPLETE;
|
|
|
|
/* No data available, exit */
|
|
if (!HAS_MESSAGE_BUFFERED(conn))
|
|
return RESPONSE_EOF;
|
|
|
|
Assert(conn->combiner == combiner || conn->combiner == NULL);
|
|
|
|
msg_type = get_message(conn, &msg_len, &msg);
|
|
switch (msg_type) {
|
|
case '\0': /* Not enough data in the buffer */
|
|
return RESPONSE_EOF;
|
|
case 'c': /* CopyToCommandComplete */
|
|
HandleCopyOutComplete(combiner);
|
|
break;
|
|
case 'C': /* CommandComplete */
|
|
HandleCommandComplete(combiner, msg, msg_len, conn);
|
|
break;
|
|
case 'T': /* RowDescription */
|
|
#ifdef DN_CONNECTION_DEBUG
|
|
Assert(!conn->have_row_desc);
|
|
conn->have_row_desc = true;
|
|
#endif
|
|
if (HandleRowDescription(combiner, msg))
|
|
return RESPONSE_TUPDESC;
|
|
break;
|
|
case 'D': /* DataRow */
|
|
#ifdef DN_CONNECTION_DEBUG
|
|
Assert(conn->have_row_desc);
|
|
#endif
|
|
HandleDataRow(combiner, msg, msg_len, conn->nodeoid);
|
|
return RESPONSE_DATAROW;
|
|
case 's': /* PortalSuspended */
|
|
suspended = true;
|
|
break;
|
|
case '1': /* ParseComplete */
|
|
case '2': /* BindComplete */
|
|
case '3': /* CloseComplete */
|
|
case 'n': /* NoData */
|
|
/* simple notifications, continue reading */
|
|
break;
|
|
case 'G': /* CopyInResponse */
|
|
conn->state = DN_CONNECTION_STATE_COPY_IN;
|
|
HandleCopyIn(combiner);
|
|
/* Done, return to caller to let it know the data can be passed in */
|
|
return RESPONSE_COPY;
|
|
case 'H': /* CopyOutResponse */
|
|
conn->state = DN_CONNECTION_STATE_COPY_OUT;
|
|
HandleCopyOut(combiner);
|
|
return RESPONSE_COPY;
|
|
case 'd': /* CopyOutDataRow */
|
|
conn->state = DN_CONNECTION_STATE_COPY_OUT;
|
|
HandleCopyDataRow(combiner, msg, msg_len);
|
|
break;
|
|
case 'E': /* ErrorResponse */
|
|
HandleError(combiner, msg, msg_len);
|
|
add_error_message(conn, "%s", combiner->errorMessage);
|
|
/*
|
|
* Do not return with an error, we still need to consume Z,
|
|
* ready-for-query
|
|
*/
|
|
break;
|
|
case 'A': /* NotificationResponse */
|
|
case 'N': /* NoticeResponse */
|
|
case 'S': /* SetCommandComplete */
|
|
/*
|
|
* Ignore these to prevent multiple messages, one from each
|
|
* node. Coordinator will send one for DDL anyway
|
|
*/
|
|
break;
|
|
case 'Z': /* ReadyForQuery */
|
|
{
|
|
/*
* The return value depends on the previous connection state.
* If it was PORTAL_SUSPENDED the Coordinator wants to send down
* another EXECUTE to fetch more rows, otherwise it is done
* with the connection.
*/
|
|
int result = suspended ? RESPONSE_SUSPENDED : RESPONSE_COMPLETE;
|
|
conn->transaction_status = msg[0];
|
|
conn->state = DN_CONNECTION_STATE_IDLE;
|
|
conn->combiner = NULL;
|
|
#ifdef DN_CONNECTION_DEBUG
|
|
conn->have_row_desc = false;
|
|
#endif
|
|
return result;
|
|
}
|
|
case 'M': /* Command Id */
|
|
HandleDatanodeCommandId(combiner, msg, msg_len);
|
|
break;
|
|
case 'b':
|
|
conn->state = DN_CONNECTION_STATE_IDLE;
|
|
return RESPONSE_BARRIER_OK;
|
|
case 'I': /* EmptyQuery */
|
|
default:
|
|
/* sync lost? */
|
|
elog(WARNING, "Received unsupported message type: %c", msg_type);
|
|
conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
/* stop reading */
|
|
return RESPONSE_COMPLETE;
|
|
}
|
|
}
|
|
/* should never happen, but keep the compiler quiet */
|
|
return RESPONSE_EOF;
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Has the Datanode sent Ready For Query
|
|
*/
|
|
|
|
bool is_data_node_ready(PGXCNodeHandle* conn)
|
|
{
|
|
char* msg = NULL;
|
|
int msg_len;
|
|
char msg_type;
|
|
|
|
for (;;) {
|
|
/*
|
|
* If we are in the process of shutting down, we
|
|
* may be rolling back, and the buffer may contain other messages.
|
|
* We want to avoid a procarray exception
|
|
* as well as an error stack overflow.
|
|
*/
|
|
if (t_thrd.proc_cxt.proc_exit_inprogress) {
|
|
conn->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
ereport(DEBUG2,
|
|
(errmsg("DN_CONNECTION_STATE_ERROR_FATAL1 is set for connection to node %u when proc_exit_inprogress",
|
|
conn->nodeoid)));
|
|
}
|
|
|
|
/* don't read from the connection if there is a fatal error */
|
|
if (conn->state == DN_CONNECTION_STATE_ERROR_FATAL) {
|
|
ereport(DEBUG2,
|
|
(errmsg("is_data_node_ready returned with DN_CONNECTION_STATE_ERROR_FATAL for connection to node %u",
|
|
conn->nodeoid)));
|
|
return true;
|
|
}
|
|
|
|
/* No data available, exit */
|
|
if (!HAS_MESSAGE_BUFFERED(conn))
|
|
return false;
|
|
|
|
msg_type = get_message(conn, &msg_len, &msg);
|
|
switch (msg_type) {
|
|
case 's': /* PortalSuspended */
|
|
break;
|
|
|
|
case 'Z': /* ReadyForQuery */
|
|
/*
* The return value depends on the previous connection state.
* If it was PORTAL_SUSPENDED the Coordinator wants to send down
* another EXECUTE to fetch more rows, otherwise it is done
* with the connection.
*/
|
|
conn->transaction_status = msg[0];
|
|
conn->state = DN_CONNECTION_STATE_IDLE;
|
|
conn->combiner = NULL;
|
|
return true;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
/* should never happen, but keep the compiler quiet */
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Construct a BEGIN TRANSACTION command after taking into account the
|
|
* current options. The returned string is not palloced and is valid only until
|
|
* the next call to the function.
|
|
*/
|
|
char* generate_begin_command(void)
|
|
{
|
|
const char* read_only = NULL;
|
|
const char* isolation_level = NULL;
|
|
int rcs = 0;
|
|
|
|
/*
|
|
* First get the READ ONLY status because the next call to GetConfigOption
|
|
* will overwrite the return buffer
|
|
*/
|
|
if (strcmp(GetConfigOption("transaction_read_only", false, false), "on") == 0)
|
|
read_only = "READ ONLY";
|
|
else
|
|
read_only = "READ WRITE";
|
|
|
|
/* Now get the isolation_level for the transaction */
|
|
isolation_level = GetConfigOption("transaction_isolation", false, false);
|
|
if (strcmp(isolation_level, "default") == 0)
|
|
isolation_level = GetConfigOption("default_transaction_isolation", false, false);
|
|
|
|
/* Finally build a START TRANSACTION command */
|
|
rcs = sprintf_s(t_thrd.pgxc_cxt.begin_cmd,
|
|
BEGIN_CMD_BUFF_SIZE,
|
|
"START TRANSACTION ISOLATION LEVEL %s %s",
|
|
isolation_level,
|
|
read_only);
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
|
|
return t_thrd.pgxc_cxt.begin_cmd;
|
|
}
|
|
|
|
/*
|
|
* Send BEGIN command to the compute Datanodes.
|
|
*/
|
|
static int compute_node_begin(int conn_count, PGXCNodeHandle** connections, GlobalTransactionId gxid)
|
|
{
|
|
int i;
|
|
TimestampTz gtmstart_timestamp = GetCurrentGTMStartTimestamp();
|
|
TimestampTz stmtsys_timestamp = GetCurrentStmtsysTimestamp();
|
|
/*
|
|
* If no remote connections, we don't have anything to do
|
|
*/
|
|
if (conn_count == 0) {
|
|
return 0;
|
|
}
|
|
|
|
for (i = 0; i < conn_count; i++) {
|
|
if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(connections[i]);
|
|
|
|
/*
* Send the GXID and check for errors whenever the gxid is valid.
* For a subtransaction, if the datanode has no next_xid to assign, bogus behavior can occur.
*/
|
|
if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid, false))
|
|
return EOF;
|
|
|
|
/*
|
|
* If the node is already a participant in the transaction, skip it
|
|
*/
|
|
if (list_member(u_sess->pgxc_cxt.XactReadNodes, connections[i]) ||
|
|
list_member(u_sess->pgxc_cxt.XactWriteNodes, connections[i])) {
|
|
continue;
|
|
}
|
|
|
|
/* Send timestamp and check for errors */
|
|
if (GlobalTimestampIsValid(gtmstart_timestamp) && GlobalTimestampIsValid(stmtsys_timestamp) &&
|
|
pgxc_node_send_timestamp(connections[i], gtmstart_timestamp, stmtsys_timestamp))
|
|
return EOF;
|
|
}
|
|
|
|
/* No problem, let's get going */
|
|
return 0;
|
|
}
|
|
|
|
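/*
* light_xactnodes_member
* Check whether the given connection is already registered in the session's
* transaction write-node list (write == true) or read-node list (write == false).
*/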
bool light_xactnodes_member(bool write, const void* datum)
|
|
{
|
|
if (write)
|
|
return list_member(u_sess->pgxc_cxt.XactWriteNodes, datum);
|
|
else
|
|
return list_member(u_sess->pgxc_cxt.XactReadNodes, datum);
|
|
}
|
|
|
|
/*
|
|
* Send BEGIN command to the Datanodes or Coordinators and receive responses.
|
|
* Also send the GXID for the transaction.
|
|
*/
|
|
int pgxc_node_begin(int conn_count, PGXCNodeHandle** connections, GlobalTransactionId gxid, bool need_tran_block,
|
|
bool readOnly, char node_type, bool need_send_queryid)
|
|
{
|
|
int i;
|
|
struct timeval* timeout = NULL;
|
|
RemoteQueryState* combiner = NULL;
|
|
TimestampTz gtmstart_timestamp = GetCurrentGTMStartTimestamp();
|
|
TimestampTz stmtsys_timestamp = GetCurrentStmtsysTimestamp();
|
|
int new_count = 0;
|
|
int j = 0;
|
|
|
|
/*
|
|
* If no remote connections, we don't have anything to do
|
|
*/
|
|
if (conn_count == 0) {
|
|
return 0;
|
|
}
|
|
|
|
PGXCNodeHandle** new_connections = (PGXCNodeHandle**)palloc(conn_count * sizeof(PGXCNodeHandle*));
|
|
int* con = (int*)palloc(conn_count * sizeof(int));
|
|
|
|
for (i = 0; i < conn_count; i++) {
|
|
if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(connections[i]);
|
|
|
|
if (need_send_queryid) {
|
|
if (pgxc_node_send_queryid(connections[i], u_sess->debug_query_id))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to send queryid to %s", connections[i]->remoteNodeName)));
|
|
}
|
|
|
|
if (connections[i]->state != DN_CONNECTION_STATE_IDLE)
|
|
LIBCOMM_DEBUG_LOG("pgxc_node_begin to node:%s[nid:%d,sid:%d] with abnormal state:%d",
|
|
connections[i]->remoteNodeName,
|
|
connections[i]->gsock.idx,
|
|
connections[i]->gsock.sid,
|
|
connections[i]->state);
|
|
|
|
/*
* Send the GXID and check for errors whenever the gxid is valid.
* For a subtransaction, if the datanode has no next_xid to assign, bogus behavior can occur.
*/
|
|
|
|
if (GlobalTransactionIdIsValid(gxid) && pgxc_node_send_gxid(connections[i], gxid, false))
|
|
return EOF;
|
|
|
|
/*
|
|
* If the node is already a participant in the transaction, skip it
|
|
*/
|
|
if (list_member(u_sess->pgxc_cxt.XactReadNodes, connections[i]) ||
|
|
list_member(u_sess->pgxc_cxt.XactWriteNodes, connections[i])) {
|
|
/*
|
|
* If we are doing a write operation, we may need to shift the node
|
|
* to the write-list. RegisterTransactionNodes does that for us
|
|
*/
|
|
if (!readOnly)
|
|
RegisterTransactionNodes(1, (void**)&connections[i], true);
|
|
continue;
|
|
}
|
|
|
|
/* Send timestamp and check for errors */
|
|
if (GlobalTimestampIsValid(gtmstart_timestamp) && GlobalTimestampIsValid(stmtsys_timestamp) &&
|
|
pgxc_node_send_timestamp(connections[i], gtmstart_timestamp, stmtsys_timestamp))
|
|
return EOF;
|
|
|
|
/* Send BEGIN */
|
|
if (need_tran_block) {
|
|
WaitStatePhase oldPhase = pgstat_report_waitstatus_phase(PHASE_BEGIN);
|
|
/* Send the BEGIN TRANSACTION command and check for errors */
|
|
if (pgxc_node_send_query(connections[i],
|
|
generate_begin_command(),
|
|
false,
|
|
false,
|
|
false,
|
|
g_instance.attr.attr_storage.enable_gtm_free)) {
|
|
pgstat_report_waitstatus_phase(oldPhase);
|
|
return EOF;
|
|
}
|
|
pgstat_report_waitstatus_phase(oldPhase);
|
|
|
|
con[j++] = PGXCNodeGetNodeId(connections[i]->nodeoid, node_type);
|
|
/*
* Register the node as a participant in the transaction. The
* caller should tell us whether the node may do any write activity.
*
* XXX This is a bit tricky since it would be difficult to know if a
* statement has any side effect on the Datanode. A SELECT
* statement may invoke a function on the Datanode which may end up
* modifying the data at the Datanode. We can possibly rely on the
* function qualification to decide whether a statement is read-only or
* read-write.
*/
|
|
RegisterTransactionNodes(1, (void**)&connections[i], !readOnly);
|
|
new_connections[new_count++] = connections[i];
|
|
}
|
|
}
|
|
|
|
/* list the read nodes and write nodes */
|
|
PrintRegisteredTransactionNodes();
|
|
|
|
/*
|
|
* If we did not send a BEGIN command to any node, we are done. Otherwise,
|
|
* we need to check for any errors and report them
|
|
*/
|
|
if (new_count == 0) {
|
|
pfree_ext(new_connections);
|
|
pfree_ext(con);
|
|
return 0;
|
|
}
|
|
|
|
combiner = CreateResponseCombiner(new_count, COMBINE_TYPE_NONE);
|
|
|
|
/* Receive responses */
|
|
if (pgxc_node_receive_responses(new_count, new_connections, timeout, combiner)) {
|
|
pfree_ext(new_connections);
|
|
pfree_ext(con);
|
|
return EOF;
|
|
}
|
|
/* Verify status */
|
|
if (!ValidateAndCloseCombiner(combiner)) {
|
|
pfree_ext(new_connections);
|
|
pfree_ext(con);
|
|
return EOF;
|
|
}
|
|
|
|
/*
* Ask the pooler to send commands (if any) to the nodes involved in the transaction
* to alter the behavior of the current transaction. This fires all transaction-level
* commands before issuing any DDL, DML or SELECT within the current transaction block.
*/
|
|
if (GetCurrentLocalParamStatus()) {
|
|
int res;
|
|
if (node_type == PGXC_NODE_DATANODE)
|
|
res = PoolManagerSendLocalCommand(j, con, 0, NULL);
|
|
else
|
|
res = PoolManagerSendLocalCommand(0, NULL, j, con);
|
|
|
|
if (res != 0) {
|
|
pfree_ext(new_connections);
|
|
pfree_ext(con);
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
/* No problem, let's get going */
|
|
pfree_ext(new_connections);
|
|
pfree_ext(con);
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* RemoteXactNodeStatusAsString
|
|
* for pgxc prepare comm status info support
|
|
*/
|
|
static const char* RemoteXactNodeStatusAsString(RemoteXactNodeStatus status)
|
|
{
|
|
switch (status) {
|
|
case RXACT_NODE_NONE:
|
|
return "RXACT_NODE_NONE";
|
|
case RXACT_NODE_PREPARE_SENT:
|
|
return "RXACT_NODE_PREPARE_SENT";
|
|
case RXACT_NODE_PREPARE_FAILED:
|
|
return "RXACT_NODE_PREPARE_FAILED";
|
|
case RXACT_NODE_PREPARED:
|
|
return "RXACT_NODE_PREPARED";
|
|
case RXACT_NODE_COMMIT_SENT:
|
|
return "RXACT_NODE_COMMIT_SENT";
|
|
case RXACT_NODE_COMMIT_FAILED:
|
|
return "RXACT_NODE_COMMIT_FAILED";
|
|
case RXACT_NODE_COMMITTED:
|
|
return "RXACT_NODE_COMMITTED";
|
|
case RXACT_NODE_ABORT_SENT:
|
|
return "RXACT_NODE_ABORT_SENT";
|
|
case RXACT_NODE_ABORT_FAILED:
|
|
return "RXACT_NODE_ABORT_FAILED";
|
|
case RXACT_NODE_ABORTED:
|
|
return "RXACT_NODE_ABORTED";
|
|
default:
|
|
break;
|
|
}
|
|
return "UNRECOGNIZED RXACT_NODE_STATUS";
|
|
}
|
|
|
|
/*
|
|
* RemoteXactStatusAsString
|
|
* for readable error msg support
|
|
*/
|
|
static const char* RemoteXactStatusAsString(RemoteXactStatus status)
|
|
{
|
|
switch (status) {
|
|
case RXACT_NONE:
|
|
return "RXACT_NONE:Initial state";
|
|
case RXACT_PREPARE_FAILED:
|
|
return "RXACT_PREPARE_FAILED:PREPARE failed";
|
|
case RXACT_PREPARED:
|
|
return "RXACT_PREPARED:PREPARED succeeded on all nodes";
|
|
case RXACT_COMMIT_FAILED:
|
|
return "RXACT_COMMIT_FAILED:COMMIT failed on all the nodes";
|
|
case RXACT_PART_COMMITTED:
|
|
return "RXACT_PART_COMMITTED:COMMIT failed on some and succeeded on other nodes";
|
|
case RXACT_COMMITTED:
|
|
return "RXACT_COMMITTED:COMMIT succeeded on all the nodes";
|
|
case RXACT_ABORT_FAILED:
|
|
return "RXACT_ABORT_FAILED:ABORT failed on all the nodes";
|
|
case RXACT_PART_ABORTED:
|
|
return "RXACT_PART_ABORTED:ABORT failed on some and succeeded on other nodes";
|
|
case RXACT_ABORTED:
|
|
return "RXACT_ABORTED:ABORT succeeded on all the nodes";
|
|
default:
|
|
break;
|
|
}
|
|
return "UNRECOGNIZED";
|
|
}
|
|
|
|
/*
|
|
* Prepare all remote nodes involved in this transaction. The local node is
|
|
* handled separately and prepared first in xact.c. If there is any error
|
|
* during this phase, it will be reported via ereport() and the transaction
|
|
* will be aborted on the local as well as remote nodes
|
|
*
|
|
* prepareGID is created and passed from xact.c
|
|
*/
|
|
bool pgxc_node_remote_prepare(const char* prepareGID, bool WriteCnLocalNode)
|
|
{
|
|
int result = 0;
|
|
int write_conn_count = u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes;
|
|
char prepare_cmd[256];
|
|
int i;
|
|
PGXCNodeHandle** connections = u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles;
|
|
RemoteQueryState* combiner = NULL;
|
|
errno_t errorno = EOK;
|
|
|
|
t_thrd.xact_cxt.XactPrepareSent = false;
|
|
|
|
/*
|
|
* If there is NO write activity or the caller does not want us to run a
|
|
* 2PC protocol, we don't need to do anything special
|
|
*/
|
|
if ((write_conn_count == 0) || (prepareGID == NULL)) {
|
|
/* Only one node is involved, so 2PC is not needed; notify GTM first when the xact ends */
|
|
if (!g_instance.attr.attr_storage.enable_gtm_free) {
|
|
/* Notify DN to set csn at commit in progress */
|
|
if (TransactionIdIsValid(GetTopTransactionIdIfAny())) {
|
|
NotifyDNSetCSN2CommitInProgress();
|
|
}
|
|
|
|
if (!AtEOXact_GlobalTxn(true)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to receive GTM commit transaction response.")));
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
t_thrd.pgxact->prepare_xid = GetCurrentTransactionIdIfAny();
|
|
|
|
SetSendCommandId(false);
|
|
|
|
/* Save the prepareGID in the global state information */
|
|
const int slen = 256;
|
|
errorno = snprintf_s(u_sess->pgxc_cxt.remoteXactState->prepareGID, slen, slen - 1, "%s", prepareGID);
|
|
securec_check_ss(errorno, "\0", "\0");
|
|
|
|
/* Generate the PREPARE TRANSACTION command */
|
|
errorno = snprintf_s(
|
|
prepare_cmd, slen, slen - 1, "PREPARE TRANSACTION '%s'", u_sess->pgxc_cxt.remoteXactState->prepareGID);
|
|
securec_check_ss(errorno, "\0", "\0");
|
|
|
|
for (i = 0; i < write_conn_count; i++) {
|
|
/*
|
|
* We should actually make sure that the connection state is
|
|
* IDLE when we reach here. The executor should have guaranteed that
|
|
* before the transaction gets to the commit point. For now, consume
|
|
* the pending data on the connection
|
|
*/
|
|
if (connections[i]->state != DN_CONNECTION_STATE_IDLE)
|
|
BufferConnection(connections[i]);
|
|
|
|
/* Clean the previous errors, if any */
|
|
pfree_ext(connections[i]->error);
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_PREPARED_SEND_ALL_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: send %s failed, all failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
prepare_cmd)));
|
|
}
|
|
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, u_sess->pgxc_cxt.remoteXactState->prepareGID, WHITEBOX_CORE, 0.1)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("WHITE_BOX TEST %s: prepare send all to remote failed",
|
|
g_instance.attr.attr_common.PGXCNodeName)));
|
|
}
|
|
/* white box test end */
|
|
|
|
if (i == write_conn_count - 1) {
|
|
if (TEST_STUB(CN_PREPARED_SEND_PART_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: send %s failed, part failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
prepare_cmd)));
|
|
}
|
|
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, u_sess->pgxc_cxt.remoteXactState->prepareGID, WHITEBOX_CORE, 0.1)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("WHITE_BOX TEST %s: prepare send part to remote failed",
|
|
g_instance.attr.attr_common.PGXCNodeName)));
|
|
}
|
|
/* white box test end */
|
|
}
|
|
#endif
|
|
/*
|
|
* Send queryid to all the participants
|
|
*/
|
|
if (pgxc_node_send_queryid(connections[i], u_sess->debug_query_id))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to send queryid to %s before PREPARE command", connections[i]->remoteNodeName)));
|
|
|
|
/*
|
|
* Now we are ready to PREPARE the transaction. Any error at this point
|
|
* can be safely ereport-ed and the transaction will be aborted.
|
|
*/
|
|
if (pgxc_node_send_query(connections[i], prepare_cmd)) {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_PREPARE_FAILED;
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_PREPARE_FAILED;
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("failed to send PREPARE TRANSACTION command to "
|
|
"the node %u",
|
|
connections[i]->nodeoid)));
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_PREPARE_SENT;
|
|
/* Let HandleCommandComplete know that response checking is enabled */
|
|
connections[i]->ck_resp_rollback = RESP_ROLLBACK_CHECK;
|
|
|
|
t_thrd.xact_cxt.XactPrepareSent = true;
|
|
}
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_PREPARED_MESSAGE_REPEAT, twophase_default_error_emit)) {
|
|
if (pgxc_node_send_query(connections[i], prepare_cmd))
|
|
ereport(LOG, (errmsg("Failed to send query in 2pc TEST : CN_PREPARED_MESSAGE_REPEAT.")));
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: repeate message %s", g_instance.attr.attr_common.PGXCNodeName, prepare_cmd)));
|
|
}
|
|
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, NULL, WHITEBOX_REPEAT, 0.1)) {
|
|
(void)pgxc_node_send_query(connections[i], prepare_cmd);
|
|
}
|
|
/* white box test end */
|
|
#endif
|
|
}
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_PREPARED_RESPONSE_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg(
|
|
"GTM_TEST %s: wait response of %s failed", g_instance.attr.attr_common.PGXCNodeName, prepare_cmd)));
|
|
}
|
|
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, NULL, WHITEBOX_CORE, 0.1)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg(
|
|
"WHITE_BOX TEST %s: wait prepare remote response failed", g_instance.attr.attr_common.PGXCNodeName)));
|
|
}
|
|
/* white box test end */
|
|
#endif
|
|
|
|
/*
|
|
* Receive and check for any errors. In case of errors, we don't bail out
|
|
* just yet. We first go through the list of connections and look for
|
|
* errors on each connection. This is important to ensure that we run
|
|
* an appropriate ROLLBACK command later on (prepared transactions must be
|
|
* rolled back with ROLLBACK PREPARED commands).
|
|
*
|
|
* There doesn't seem to be a solid mechanism to track errors on
|
|
* individual connections. The transaction_status field doesn't get set
|
|
* every time there is an error on the connection. The combiner mechanism is
|
|
* good for parallel processing, but I think we should have a leak-proof
|
|
* mechanism to track connection status
|
|
*/
|
|
if (write_conn_count) {
|
|
combiner = CreateResponseCombiner(write_conn_count, COMBINE_TYPE_NONE);
|
|
/* Receive responses */
|
|
result = pgxc_node_receive_responses(write_conn_count, connections, NULL, combiner, false);
|
|
if (result || !validate_combiner(combiner))
|
|
result = EOF;
|
|
else {
|
|
CloseCombiner(combiner);
|
|
combiner = NULL;
|
|
}
|
|
|
|
for (i = 0; i < write_conn_count; i++) {
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] == RXACT_NODE_PREPARE_SENT) {
|
|
if (connections[i]->error) {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_PREPARE_FAILED;
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_PREPARE_FAILED;
|
|
ereport(LOG,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
|
|
errmsg("Failed to PREPARE the transaction on node: %u, state: %s, result: %d, %s",
|
|
connections[i]->nodeoid,
|
|
RemoteXactNodeStatusAsString(u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i]),
|
|
result,
|
|
connections[i]->error)));
|
|
} else {
|
|
/* Did we receive ROLLBACK in response to PREPARE TRANSACTION? */
|
|
if (connections[i]->ck_resp_rollback == RESP_ROLLBACK_RECEIVED) {
|
|
/* If yes, it means PREPARE TRANSACTION failed */
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_PREPARE_FAILED;
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_PREPARE_FAILED;
|
|
ereport(LOG,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
|
|
errmsg("Failed to PREPARE the transaction on node: %u, state : %s, result: %d",
|
|
connections[i]->nodeoid,
|
|
RemoteXactNodeStatusAsString(u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i]),
|
|
result)));
|
|
result = 0;
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_PREPARED;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If we failed to PREPARE on one or more nodes, report an error and let
|
|
* the normal abort processing take charge of aborting the transaction
|
|
*/
|
|
if (result) {
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_PREPARE_FAILED;
|
|
if (combiner != NULL)
|
|
pgxc_node_report_error(combiner);
|
|
|
|
elog(LOG, "failed to PREPARE transaction on one or more nodes - result %d", result);
|
|
}
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_PREPARE_FAILED)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to PREPARE the transaction on one or more nodes")));
|
|
|
|
/* Everything went OK. */
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_PREPARED;
|
|
TwoPhaseCommit = true;
|
|
|
|
/* Set csn to commit-in-progress on the CN if the CN does not write. */
|
|
if (!WriteCnLocalNode)
|
|
SetXact2CommitInProgress(GetTopTransactionIdIfAny(), 0);
|
|
|
|
if (!g_instance.attr.attr_storage.enable_gtm_free) {
|
|
/*
* Notify GTM first at xact end when the local CN node is not a write node.
* If the local CN node needs to write (e.g. DDL), notify GTM after the local node finishes its prepare.
*/
|
|
if (!WriteCnLocalNode && !AtEOXact_GlobalTxn(true)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to receive GTM commit transaction response after %s.", prepare_cmd)));
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/*
* Commit a running or a previously PREPARED transaction on the remote nodes.
* The local transaction is handled separately in xact.c
*
* Once a COMMIT command is sent to any node, the transaction must eventually
* be committed. But we still report errors via ereport and let
* AbortTransaction take care of handling partly committed transactions.
*
* For 2PC transactions: if the local node is involved in the transaction, it is
* already prepared locally and we are already in the context of a different
* transaction (we call it the auxiliary transaction). So AbortTransaction will
* actually abort the auxiliary transaction, which is OK. On the other hand, if
* the local node is not involved in the main transaction, then we don't care
* much if it is rolled back on the local node as part of abort processing.
*
* When 2PC is not used, for instance because the transaction has accessed
* some temporary objects, we are already exposed to the risk of committing on
* one node and aborting on some other node. So such cases need no extra
* attention.
*/
|
|
void pgxc_node_remote_commit(bool barrierLockHeld)
|
|
{
|
|
int result = 0;
|
|
int rc = 0;
|
|
char commitPrepCmd[256];
|
|
char commitCmd[256];
|
|
char errMsg[256];
|
|
int write_conn_count = u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes;
|
|
int read_conn_count = u_sess->pgxc_cxt.remoteXactState->numReadRemoteNodes;
|
|
PGXCNodeHandle** connections = u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles;
|
|
PGXCNodeHandle* new_connections[write_conn_count + read_conn_count];
|
|
int new_conn_count = 0;
|
|
int i;
|
|
RemoteQueryState* combiner = NULL;
|
|
int rcs = 0;
|
|
StringInfoData str;
|
|
|
|
/*
|
|
* We must handle reader and writer connections both since the transaction
|
|
* must be closed even on a read-only node
|
|
*/
|
|
if (read_conn_count + write_conn_count == 0) {
|
|
return;
|
|
}
|
|
|
|
SetSendCommandId(false);
|
|
|
|
/*
* Barrier:
*
* We should acquire the BarrierLock in SHARE mode here to ensure that
* there is no in-progress barrier at this point. This mechanism works
* as long as the LWLock mechanism does not starve an EXCLUSIVE lock
* requester.
*
* When the local CN is involved in 2PC xacts, we take the barrier lock at an
* earlier stage, just before the CN's local commit, to avoid backup inconsistency.
*/
|
|
if (!barrierLockHeld)
|
|
LWLockAcquire(BarrierLock, LW_SHARED);
|
|
|
|
/*
|
|
* The readers can be committed with a simple COMMIT command. We still need
|
|
* this to close the transaction block
|
|
*/
|
|
rcs = sprintf_s(commitCmd, sizeof(commitCmd), "COMMIT TRANSACTION");
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
|
|
/*
|
|
* If we are running 2PC, construct a COMMIT command to commit the prepared
|
|
* transactions
|
|
*/
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_PREPARED) {
|
|
rcs = sprintf_s(
|
|
commitPrepCmd, sizeof(commitPrepCmd), "COMMIT PREPARED '%s'", u_sess->pgxc_cxt.remoteXactState->prepareGID);
|
|
securec_check_ss(rcs, "\0", "\0");
|
|
}
|
|
/*
|
|
* Now send the COMMIT command to all the participants
|
|
*/
|
|
WaitStatePhase oldPhase = pgstat_report_waitstatus_phase(PHASE_COMMIT);
|
|
for (i = 0; i < write_conn_count + read_conn_count; i++) {
|
|
const char* command = NULL;
|
|
|
|
Assert(u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] == RXACT_NODE_PREPARED ||
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] == RXACT_NODE_NONE);
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] == RXACT_NODE_PREPARED)
|
|
command = commitPrepCmd;
|
|
else
|
|
command = commitCmd;
|
|
|
|
/* Clean the previous errors, if any */
|
|
pfree_ext(connections[i]->error);
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_COMMIT_PREPARED_SEND_ALL_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg(
|
|
"GTM_TEST %s: send %s failed, all failed", g_instance.attr.attr_common.PGXCNodeName, command)));
|
|
}
|
|
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, u_sess->pgxc_cxt.remoteXactState->prepareGID, WHITEBOX_CORE, 0.1)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("WHITE_BOX TEST %s: send remote commit msg %s all failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
command)));
|
|
}
|
|
/* white box test end */
|
|
|
|
if (i == write_conn_count - 1) {
|
|
if (TEST_STUB(CN_COMMIT_PREPARED_SEND_PART_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: send %s failed, part failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
command)));
|
|
}
|
|
|
|
/* white-box test inject start */
|
|
if (execute_whitebox(WHITEBOX_LOC, u_sess->pgxc_cxt.remoteXactState->prepareGID, WHITEBOX_CORE, 0.1)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("WHITE_BOX TEST %s: send remote commit msg %s part failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
command)));
|
|
}
|
|
/* white-box test inject end */
|
|
}
|
|
#endif
|
|
if (pgxc_node_send_queryid(connections[i], u_sess->debug_query_id) != 0) {
|
|
const int dest_max = 256;
|
|
rc = sprintf_s(errMsg,
|
|
dest_max,
|
|
"failed to send queryid "
|
|
"to node %s before COMMIT command(2PC)",
|
|
connections[i]->remoteNodeName);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
add_error_message(connections[i], "%s", errMsg);
|
|
}
|
|
|
|
if (pgxc_node_send_query(connections[i], command)) {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_COMMIT_FAILED;
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_COMMIT_FAILED;
|
|
|
|
/*
* If an error occurred during two-phase commit, gs_clean will clean up the node
* later, so only emit a warning message; otherwise report an error.
*/
|
|
if (TwoPhaseCommit) {
|
|
ereport(WARNING,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("failed to send %s command to node %u", command, connections[i]->nodeoid)));
|
|
|
|
const int dest_max = 256;
|
|
rc = sprintf_s(errMsg,
|
|
dest_max,
|
|
"failed to send COMMIT PREPARED "
|
|
"command to node %s",
|
|
connections[i]->remoteNodeName);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
add_error_message(connections[i], "%s", errMsg);
|
|
} else {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("failed to send %s command to node %u", command, connections[i]->nodeoid)));
|
|
}
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_COMMIT_SENT;
|
|
new_connections[new_conn_count++] = connections[i];
|
|
}
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_COMMIT_PREPARED_MESSAGE_REPEAT, twophase_default_error_emit)) {
|
|
(void)pgxc_node_send_queryid(connections[i], u_sess->debug_query_id);
|
|
(void)pgxc_node_send_query(connections[i], command);
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: repeate send %s", g_instance.attr.attr_common.PGXCNodeName, command)));
|
|
}
|
|
|
|
/* white-box test inject start */
|
|
if (execute_whitebox(WHITEBOX_LOC, u_sess->pgxc_cxt.remoteXactState->prepareGID, WHITEBOX_REPEAT, 0.1)) {
|
|
(void)pgxc_node_send_queryid(connections[i], u_sess->debug_query_id);
|
|
(void)pgxc_node_send_query(connections[i], command);
|
|
}
|
|
/* white-box test inject end */
|
|
#endif
|
|
}
|
|
pgstat_report_waitstatus_phase(oldPhase);
|
|
|
|
/*
|
|
* Release the BarrierLock.
|
|
*/
|
|
LWLockRelease(BarrierLock);
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_COMMIT_PREPARED_RESPONSE_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg(
|
|
"GTM_TEST %s: wait response of %s failed", g_instance.attr.attr_common.PGXCNodeName, commitPrepCmd)));
|
|
}
|
|
|
|
/* white-box test inject start */
|
|
if (execute_whitebox(WHITEBOX_LOC, NULL, WHITEBOX_CORE, 0.1)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg(
|
|
"WHITE_BOX TEST %s: wait remote commit response failed", g_instance.attr.attr_common.PGXCNodeName)));
|
|
}
|
|
/* white-box test inject end */
|
|
|
|
#endif
|
|
|
|
if (new_conn_count) {
|
|
initStringInfo(&str);
|
|
combiner = CreateResponseCombiner(new_conn_count, COMBINE_TYPE_NONE);
|
|
/* Receive responses */
|
|
result = pgxc_node_receive_responses(new_conn_count, new_connections, NULL, combiner, false);
|
|
if (result || !validate_combiner(combiner))
|
|
result = EOF;
|
|
else {
|
|
CloseCombiner(combiner);
|
|
combiner = NULL;
|
|
}
|
|
/*
|
|
* Even if the command failed on some node, don't throw an error just
|
|
* yet. That gives a chance to look for individual connection status
|
|
* and record appropriate information for later recovery
|
|
*
|
|
* XXX A node once prepared must be able to either COMMIT or ABORT. So a
|
|
* COMMIT can fail only because of either communication error or because
|
|
* the node went down. Even if one node commits, the transaction must be
|
|
* eventually committed on all the nodes.
|
|
*/
|
|
|
|
/* At this point, we must be in one of the following states */
|
|
Assert(u_sess->pgxc_cxt.remoteXactState->status == RXACT_COMMIT_FAILED ||
|
|
u_sess->pgxc_cxt.remoteXactState->status == RXACT_PREPARED ||
|
|
u_sess->pgxc_cxt.remoteXactState->status == RXACT_NONE);
|
|
|
|
/*
|
|
* Go through every connection and check if COMMIT succeeded or failed on
|
|
* that connection. If the COMMIT has failed on one node, but succeeded on
|
|
* some other, such transactions need special attention (by the
|
|
* administrator for now)
|
|
*/
|
|
for (i = 0; i < write_conn_count + read_conn_count; i++) {
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] == RXACT_NODE_COMMIT_SENT) {
|
|
if (connections[i]->error) {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_COMMIT_FAILED;
|
|
if (u_sess->pgxc_cxt.remoteXactState->status != RXACT_PART_COMMITTED)
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_COMMIT_FAILED;
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_COMMITTED;
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_COMMIT_FAILED)
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_PART_COMMITTED;
|
|
}
|
|
}
|
|
|
|
/* collect the oids of the nodes on which COMMIT did not succeed, for the error message below */
|
|
if (RXACT_NODE_COMMITTED != u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i]) {
|
|
if (i > 0)
|
|
appendStringInfoChar(&str, ',');
|
|
appendStringInfo(&str, "%u", connections[i]->nodeoid);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
if (NULL != combiner) {
|
|
// for cm_agent sql, return error
|
|
if (strcmp(u_sess->attr.attr_common.application_name, "cm_agent") == 0) {
|
|
pgxc_node_report_error(combiner, ERROR);
|
|
} else {
|
|
pgxc_node_report_error(combiner, TwoPhaseCommit ? WARNING : ERROR);
|
|
}
|
|
} else {
|
|
if (strcmp(u_sess->attr.attr_common.application_name, "cm_agent") == 0) {
|
|
ereport(
|
|
ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg(
|
|
"Connection error with Datanode, so failed to COMMIT the transaction on one or more nodes")));
|
|
} else {
|
|
ereport(
|
|
TwoPhaseCommit ? WARNING : ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg(
|
|
"Connection error with Datanode, so failed to COMMIT the transaction on one or more nodes")));
|
|
}
|
|
}
|
|
}
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_COMMIT_FAILED ||
|
|
u_sess->pgxc_cxt.remoteXactState->status == RXACT_PART_COMMITTED) {
|
|
if (strcmp(u_sess->attr.attr_common.application_name, "cm_agent") == 0) {
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to COMMIT the transaction on nodes: %s.", str.data)));
|
|
} else {
|
|
ereport(TwoPhaseCommit ? WARNING : ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to COMMIT the transaction on nodes: %s.", str.data)));
|
|
}
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_COMMITTED;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Abort the current transaction on the local and remote nodes. If the
|
|
* transaction is prepared on the remote node, we send a ROLLBACK PREPARED
|
|
* command, otherwise a ROLLBACK command is sent.
|
|
*
|
|
* Note that if the local node was involved and prepared successfully, we are
|
|
* running in a separate transaction context right now
|
|
*/
|
|
int pgxc_node_remote_abort(void)
|
|
{
|
|
#define ERRMSG_BUFF_SIZE 256
|
|
int rc = 0;
|
|
int result = 0;
|
|
const char* rollbackCmd = "ROLLBACK TRANSACTION";
|
|
char rollbackPrepCmd[256];
|
|
char errMsg[ERRMSG_BUFF_SIZE];
|
|
int write_conn_count = u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes;
|
|
int read_conn_count = u_sess->pgxc_cxt.remoteXactState->numReadRemoteNodes;
|
|
int i;
|
|
PGXCNodeHandle** connections = u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles;
|
|
PGXCNodeHandle* new_connections[u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes +
|
|
u_sess->pgxc_cxt.remoteXactState->numReadRemoteNodes];
|
|
int new_conn_count = 0;
|
|
RemoteQueryState* combiner = NULL;
|
|
|
|
SetSendCommandId(false);
|
|
|
|
/* Send COMMIT/ROLLBACK PREPARED TRANSACTION to the remote nodes */
|
|
WaitStatePhase oldPhase = pgstat_report_waitstatus_phase(PHASE_ROLLBACK);
|
|
for (i = 0; i < write_conn_count + read_conn_count; i++) {
|
|
RemoteXactNodeStatus status = u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i];
|
|
|
|
/* We should buffer all messages before reusing the connections to send TRANSACTION commands */
|
|
if (connections[i]->state == DN_CONNECTION_STATE_QUERY) {
|
|
BufferConnection(connections[i]);
|
|
}
|
|
|
|
/* Clean the previous errors, if any */
|
|
pfree_ext(connections[i]->error);
|
|
|
|
if ((status == RXACT_NODE_PREPARED) || (status == RXACT_NODE_PREPARE_SENT)) {
|
|
rc = sprintf_s(rollbackPrepCmd,
|
|
sizeof(rollbackPrepCmd),
|
|
"ROLLBACK PREPARED '%s'",
|
|
u_sess->pgxc_cxt.remoteXactState->prepareGID);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_ABORT_PREPARED_SEND_ALL_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: send %s failed, all failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
rollbackPrepCmd)));
|
|
}
|
|
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, u_sess->pgxc_cxt.remoteXactState->prepareGID, WHITEBOX_CORE, 0.1)) {
|
|
ereport(LOG,
|
|
(errmsg("WHITE_BOX TEST %s: send abort prepared msg to remote all failed",
|
|
g_instance.attr.attr_common.PGXCNodeName)));
|
|
}
|
|
/* white box test end */
|
|
|
|
if (i == write_conn_count - 1) {
|
|
if (TEST_STUB(CN_ABORT_PREPARED_SEND_PART_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: send %s failed, part failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
rollbackPrepCmd)));
|
|
}
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, u_sess->pgxc_cxt.remoteXactState->prepareGID, WHITEBOX_CORE, 0.1)) {
|
|
ereport(LOG,
|
|
(errmsg("WHITE_BOX TEST %s: send abort prepared msg to remote part failed",
|
|
g_instance.attr.attr_common.PGXCNodeName)));
|
|
}
|
|
/* white box test end */
|
|
}
|
|
#endif
|
|
if (pgxc_node_send_query(connections[i], rollbackPrepCmd)) {
|
|
rc = sprintf_s(errMsg,
|
|
ERRMSG_BUFF_SIZE,
|
|
"failed to send ROLLBACK PREPARED "
|
|
"TRANSACTION command to node %s",
|
|
connections[i]->remoteNodeName);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
add_error_message(connections[i], "%s", errMsg);
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_ABORT_FAILED;
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_ABORT_FAILED;
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_ABORT_SENT;
|
|
new_connections[new_conn_count++] = connections[i];
|
|
}
|
|
} else {
|
|
if (pgxc_node_send_query(connections[i], rollbackCmd)) {
|
|
rc = sprintf_s(errMsg,
|
|
ERRMSG_BUFF_SIZE,
|
|
"failed to send ROLLBACK "
|
|
"TRANSACTION command to node %s",
|
|
connections[i]->remoteNodeName);
|
|
securec_check_ss(rc, "\0", "\0");
|
|
add_error_message(connections[i], "%s", errMsg);
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_ABORT_FAILED;
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_ABORT_FAILED;
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_ABORT_SENT;
|
|
new_connections[new_conn_count++] = connections[i];
|
|
}
|
|
}
|
|
}
|
|
pgstat_report_waitstatus_phase(oldPhase);
|
|
|
|
#ifdef ENABLE_DISTRIBUTE_TEST
|
|
if (TEST_STUB(CN_ABORT_PREPARED_RESPONSE_FAILED, twophase_default_error_emit)) {
|
|
ereport(g_instance.distribute_test_param_instance->elevel,
|
|
(errmsg("GTM_TEST %s: wait response of %s failed",
|
|
g_instance.attr.attr_common.PGXCNodeName,
|
|
rollbackPrepCmd)));
|
|
}
|
|
|
|
/* white box test start */
|
|
if (execute_whitebox(WHITEBOX_LOC, NULL, WHITEBOX_CORE, 0.1)) {
|
|
ereport(LOG,
|
|
(errmsg("WHITE_BOX TEST %s: wait remote abort prepared response failed",
|
|
g_instance.attr.attr_common.PGXCNodeName)));
|
|
}
|
|
/* white box test end */
|
|
#endif
|
|
|
|
if (new_conn_count) {
|
|
struct timeval abort_timeout = {0};
|
|
|
|
abort_timeout.tv_sec = u_sess->attr.attr_network.PoolerCancelTimeout; /* seconds */
|
|
abort_timeout.tv_usec = 0; /* microseconds */
|
|
|
|
combiner = CreateResponseCombiner(new_conn_count, COMBINE_TYPE_NONE);
|
|
|
|
/*
* Receive responses.
* We use the pooler cancel timeout here because when we abort the transaction
* during proc-die processing, we should not wait forever.
*/
|
|
result = pgxc_node_receive_responses(new_conn_count,
|
|
new_connections,
|
|
u_sess->attr.attr_network.PoolerCancelTimeout ? &abort_timeout : NULL,
|
|
combiner,
|
|
false);
|
|
if (result || !validate_combiner(combiner))
|
|
result = EOF;
|
|
else {
|
|
CloseCombiner(combiner);
|
|
combiner = NULL;
|
|
}
|
|
|
|
for (i = 0; i < write_conn_count + read_conn_count; i++) {
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] == RXACT_NODE_ABORT_SENT) {
|
|
if (connections[i]->error) {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_ABORT_FAILED;
|
|
if (u_sess->pgxc_cxt.remoteXactState->status != RXACT_PART_ABORTED)
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_ABORT_FAILED;
|
|
elog(LOG, "Failed to ABORT at node %u\nDetail: %s", connections[i]->nodeoid, connections[i]->error);
|
|
} else {
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_ABORTED;
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_ABORT_FAILED)
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_PART_ABORTED;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
if (combiner != NULL)
|
|
pgxc_node_report_error(combiner, WARNING);
|
|
else {
|
|
elog(LOG,
|
|
"Failed to ABORT an implicitly PREPARED "
|
|
"transaction - result %d",
|
|
result);
|
|
}
|
|
}
|
|
|
|
/*
* Don't ereport, because we might already be in abort processing and any
* error at this point can lead to infinite recursion.
*
* XXX How do we handle errors reported by internal functions used to
* communicate with remote nodes?
*/
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_ABORT_FAILED ||
|
|
u_sess->pgxc_cxt.remoteXactState->status == RXACT_PART_ABORTED) {
|
|
result = EOF;
|
|
elog(LOG,
|
|
"Failed to ABORT an implicitly PREPARED transaction "
|
|
"status - %s.",
|
|
RemoteXactStatusAsString(u_sess->pgxc_cxt.remoteXactState->status));
|
|
} else
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_ABORTED;
|
|
|
|
return result;
|
|
}
|
|
|
|
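/*
* pgxc_node_remote_savepoint
* Stub: propagating savepoint commands to remote nodes is not supported in this
* build; it only asserts and reports DISTRIBUTED_FEATURE_NOT_SUPPORTED.
*/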
void pgxc_node_remote_savepoint(const char* cmdString, RemoteQueryExecType exec_type, bool bNeedXid, bool bNeedBegin,
|
|
GlobalTransactionId transactionId)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
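/*
* DataNodeCopyBegin
* Acquire Datanode connections for a COPY: begin transactions where needed, send
* the snapshot (if any) and the COPY query, and wait for the CopyInResponse.
* Returns an array of connections indexed by node number (unused slots are NULL),
* or NULL on failure.
*/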
PGXCNodeHandle** DataNodeCopyBegin(const char* query, List* nodelist, Snapshot snapshot)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
#else
|
|
int i;
|
|
int conn_count = list_length(nodelist) == 0 ? NumDataNodes : list_length(nodelist);
|
|
struct timeval* timeout = NULL;
|
|
PGXCNodeAllHandles* pgxc_handles = NULL;
|
|
PGXCNodeHandle** connections = NULL;
|
|
PGXCNodeHandle** copy_connections = NULL;
|
|
ListCell* nodeitem = NULL;
|
|
bool need_tran_block = false;
|
|
GlobalTransactionId gxid;
|
|
RemoteQueryState* combiner = NULL;
|
|
|
|
if (conn_count == 0)
|
|
return NULL;
|
|
|
|
/* Get needed Datanode connections */
|
|
pgxc_handles = get_handles(nodelist, NULL, false);
|
|
connections = pgxc_handles->datanode_handles;
|
|
|
|
if (!connections)
|
|
return NULL;
|
|
|
|
/*
* If more than one node is involved, or if we are already in a transaction
* block, we must run the remote statements in a transaction block.
*/
|
|
need_tran_block = (conn_count > 1) || (TransactionBlockStatusCode() == 'T');
|
|
|
|
elog(DEBUG1, "conn_count = %d, need_tran_block = %s", conn_count, need_tran_block ? "true" : "false");
|
|
|
|
/*
* We need to be able to quickly find a connection handle for a specified node
* number, so store connections in an array where the index is node - 1.
* Unused items in the array should be NULL.
*/
|
|
copy_connections = (PGXCNodeHandle**)palloc0(NumDataNodes * sizeof(PGXCNodeHandle*));
|
|
i = 0;
|
|
foreach (nodeitem, nodelist)
|
|
copy_connections[lfirst_int(nodeitem)] = connections[i++];
|
|
|
|
gxid = GetCurrentTransactionId();
|
|
|
|
if (!GlobalTransactionIdIsValid(gxid)) {
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree(copy_connections);
|
|
return NULL;
|
|
}
|
|
|
|
/* Start transaction on connections where it is not started */
|
|
if (pgxc_node_begin(conn_count, connections, gxid, need_tran_block, false, PGXC_NODE_DATANODE)) {
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree(copy_connections);
|
|
return NULL;
|
|
}
|
|
|
|
/* Send query to nodes */
|
|
for (i = 0; i < conn_count; i++) {
|
|
if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(connections[i]);
|
|
|
|
if (snapshot && pgxc_node_send_snapshot(connections[i], snapshot)) {
|
|
add_error_message(connections[i], "%s", "Can not send request");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree(copy_connections);
|
|
return NULL;
|
|
}
|
|
if (pgxc_node_send_query(connections[i], query) != 0) {
|
|
add_error_message(connections[i], "%s", "Can not send request");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree(copy_connections);
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
/*
* We are expecting a CopyIn response, but we do not want to send it to the
* client; the caller should take care of this, because here we do not know
* whether the client is running a console or file copy.
*/
|
|
combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE);
|
|
|
|
/* Receive responses */
|
|
if (pgxc_node_receive_responses(conn_count, connections, timeout, combiner) ||
|
|
!ValidateAndCloseCombiner(combiner)) {
|
|
DataNodeCopyFinish(connections, -1, COMBINE_TYPE_NONE);
|
|
pfree(connections);
|
|
pfree(copy_connections);
|
|
return NULL;
|
|
}
|
|
pfree(connections);
|
|
return copy_connections;
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* Send a data row to the specified nodes
|
|
*/
|
|
|
|
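/*
* The is_binary template parameter controls the COPY framing: in binary mode the
* row data is sent as-is, otherwise an end-of-line terminator (eol, or '\n' when
* eol is NULL) is appended to each row.
*/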
template <bool is_binary>
|
|
static int DataNodeCopyInT(
|
|
const char* data_row, int len, const char* eol, ExecNodes* exec_nodes, PGXCNodeHandle** copy_connections)
|
|
{
|
|
PGXCNodeHandle* primary_handle = NULL;
|
|
ListCell* nodeitem = NULL;
|
|
errno_t rc = EOK;
|
|
|
|
/* size + data row + \n */
|
|
int msgLen = 0;
|
|
if (is_binary) {
|
|
msgLen = 4 + len;
|
|
} else {
|
|
msgLen = 4 + len + ((eol == NULL) ? 1 : strlen(eol));
|
|
}
|
|
int nLen = htonl(msgLen);
|
|
|
|
if (exec_nodes->primarynodelist != NIL) {
|
|
primary_handle = copy_connections[lfirst_int(list_head(exec_nodes->primarynodelist))];
|
|
}
|
|
|
|
if (primary_handle != NULL) {
|
|
if (primary_handle->state == DN_CONNECTION_STATE_COPY_IN) {
|
|
/* precalculate to speed up access */
|
|
const int bytes_needed = 1 + msgLen;
|
|
|
|
/* flush buffer if it is almost full */
|
|
if (bytes_needed + primary_handle->outEnd > COPY_BUFFER_SIZE) {
|
|
int read_status = -1;
|
|
/* First check whether the Datanode has sent an error message */
|
|
if (primary_handle->is_logic_conn)
|
|
/* for logic connection between cn & dn */
|
|
read_status = pgxc_node_read_data_from_logic_conn(primary_handle, true);
|
|
else
|
|
read_status = pgxc_node_read_data(primary_handle, true);
|
|
if (read_status == EOF || read_status < 0) {
|
|
add_error_message(primary_handle, "%s", "failed to read data from Datanode");
|
|
return EOF;
|
|
}
|
|
|
|
if (primary_handle->inStart < primary_handle->inEnd) {
|
|
RemoteQueryState* combiner = CreateResponseCombiner(1, COMBINE_TYPE_NONE);
|
|
(void)handle_response(primary_handle, combiner);
|
|
if (!ValidateAndCloseCombiner(combiner))
|
|
return EOF;
|
|
}
|
|
|
|
if (DN_CONNECTION_STATE_ERROR(primary_handle))
|
|
return EOF;
|
|
|
|
if (pgxc_node_flush(primary_handle) < 0) {
|
|
add_error_message(primary_handle, "%s", "failed to send data to Datanode");
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
ensure_out_buffer_capacity(bytes_needed, primary_handle);
|
|
Assert(primary_handle->outBuffer != NULL);
|
|
primary_handle->outBuffer[primary_handle->outEnd++] = 'd';
|
|
const int memcpy_len = 4;
|
|
rc = memcpy_s(primary_handle->outBuffer + primary_handle->outEnd, memcpy_len, &nLen, memcpy_len);
|
|
securec_check(rc, "", "");
|
|
primary_handle->outEnd += 4;
|
|
if (len != 0) {
|
|
rc = memcpy_s(primary_handle->outBuffer + primary_handle->outEnd, len, data_row, len);
|
|
securec_check(rc, "", "");
|
|
}
|
|
primary_handle->outEnd += len;
|
|
if (!is_binary) {
|
|
if (eol == NULL) {
|
|
primary_handle->outBuffer[primary_handle->outEnd++] = '\n';
|
|
} else {
|
|
rc = memcpy_s(primary_handle->outBuffer + primary_handle->outEnd, strlen(eol), eol, strlen(eol));
|
|
securec_check(rc, "", "");
|
|
primary_handle->outEnd += strlen(eol);
|
|
}
|
|
}
|
|
} else {
|
|
add_error_message(primary_handle, "%s", "Invalid Datanode connection");
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
foreach (nodeitem, exec_nodes->nodeList) {
|
|
PGXCNodeHandle* handle = copy_connections[lfirst_int(nodeitem)];
|
|
if (handle != NULL && handle->state == DN_CONNECTION_STATE_COPY_IN) {
|
|
/* precalculate to speed up access */
|
|
const int bytes_needed = 1 + msgLen;
|
|
|
|
/* flush buffer if it is almost full */
|
|
if ((primary_handle != NULL && bytes_needed + handle->outEnd > PRIMARY_NODE_WRITEAHEAD) ||
|
|
(primary_handle == NULL && bytes_needed + handle->outEnd > COPY_BUFFER_SIZE)) {
|
|
int to_send = handle->outEnd;
|
|
int read_status = -1;
|
|
/* First check whether the Datanode has sent an error message */
|
|
if (handle->is_logic_conn)
|
|
/* for logic connection between cn & dn */
|
|
read_status = pgxc_node_read_data_from_logic_conn(handle, true);
|
|
else
|
|
read_status = pgxc_node_read_data(handle, true);
|
|
if (read_status == EOF || read_status < 0) {
|
|
add_error_message(handle, "%s", "failed to read data from Datanode");
|
|
return EOF;
|
|
}
|
|
|
|
if (handle->inStart < handle->inEnd) {
|
|
RemoteQueryState* combiner = CreateResponseCombiner(1, COMBINE_TYPE_NONE);
|
|
(void)handle_response(handle, combiner);
|
|
if (combiner->errorMessage)
|
|
pgxc_node_report_error(combiner);
|
|
if (!ValidateAndCloseCombiner(combiner))
|
|
return EOF;
|
|
}
|
|
|
|
if (DN_CONNECTION_STATE_ERROR(handle))
|
|
return EOF;
|
|
|
|
/*
|
|
* Allow primary node to write out data before others.
|
|
* If primary node was blocked it would not accept copy data.
|
|
* So buffer at least PRIMARY_NODE_WRITEAHEAD at the other nodes.
|
|
* If primary node is blocked and is buffering, other buffers will
|
|
* grow accordingly.
|
|
*/
|
|
if (primary_handle != NULL) {
|
|
if (primary_handle->outEnd + PRIMARY_NODE_WRITEAHEAD < handle->outEnd)
|
|
to_send = handle->outEnd - primary_handle->outEnd - PRIMARY_NODE_WRITEAHEAD;
|
|
else
|
|
to_send = 0;
|
|
}
|
|
|
|
/*
|
|
* Try to send down any buffered data we have
|
|
*/
|
|
if (to_send && pgxc_node_flush(handle) < 0) {
|
|
add_error_message(handle, "%s", "failed to send data to Datanode");
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
ensure_out_buffer_capacity(bytes_needed, handle);
|
|
Assert(handle->outBuffer != NULL);
|
|
handle->outBuffer[handle->outEnd++] = 'd';
|
|
const int mem_len = 4;
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, mem_len, &nLen, mem_len);
|
|
securec_check(rc, "", "");
|
|
handle->outEnd += 4;
|
|
if (len != 0) {
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, len, data_row, len);
|
|
securec_check(rc, "", "");
|
|
}
|
|
handle->outEnd += len;
|
|
handle->outNum += 1;
|
|
if (!is_binary) {
|
|
if (eol == NULL) {
|
|
handle->outBuffer[handle->outEnd++] = '\n';
|
|
} else {
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, strlen(eol), eol, strlen(eol));
|
|
securec_check(rc, "", "");
|
|
handle->outEnd += strlen(eol);
|
|
}
|
|
}
|
|
} else {
|
|
if (handle != NULL)
|
|
add_error_message(handle, "%s", "Invalid Datanode connection");
|
|
return EOF;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
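/*
* Send a data row to the specified nodes during COPY FROM.
* Returns 0 on success and EOF if a connection could not accept the data.
*/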
int DataNodeCopyIn(const char* data_row, int len, const char* eol, ExecNodes* exec_nodes,
|
|
PGXCNodeHandle** copy_connections, bool is_binary)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
#else
|
|
PGXCNodeHandle* primary_handle = NULL;
|
|
ListCell* nodeitem = NULL;
|
|
/* size + data row + \n */
|
|
int msgLen = 4 + len + 1;
|
|
int nLen = htonl(msgLen);
errno_t rc = EOK;
|
|
|
|
if (exec_nodes->primarynodelist) {
|
|
primary_handle = copy_connections[lfirst_int(list_head(exec_nodes->primarynodelist))];
|
|
}
|
|
|
|
if (primary_handle) {
|
|
if (primary_handle->state == DN_CONNECTION_STATE_COPY_IN) {
|
|
/* precalculate to speed up access */
|
|
int bytes_needed = primary_handle->outEnd + 1 + msgLen;
|
|
|
|
/* flush buffer if it is almost full */
|
|
if (bytes_needed > COPY_BUFFER_SIZE) {
|
|
/* First look if the Datanode has sent an error message */
|
|
int read_status = pgxc_node_read_data(primary_handle, true);
|
|
if (read_status == EOF || read_status < 0) {
|
|
add_error_message(primary_handle, "%s", "failed to read data from Datanode");
|
|
return EOF;
|
|
}
|
|
|
|
if (primary_handle->inStart < primary_handle->inEnd) {
|
|
RemoteQueryState* combiner = CreateResponseCombiner(1, COMBINE_TYPE_NONE);
|
|
handle_response(primary_handle, combiner);
|
|
if (!ValidateAndCloseCombiner(combiner))
|
|
return EOF;
|
|
}
|
|
|
|
if (DN_CONNECTION_STATE_ERROR(primary_handle))
|
|
return EOF;
|
|
|
|
if (send_some(primary_handle, primary_handle->outEnd) < 0) {
|
|
add_error_message(primary_handle, "%s", "failed to send data to Datanode");
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
if (ensure_out_buffer_capacity(bytes_needed, primary_handle) != 0) {
|
|
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
|
|
}
|
|
|
|
primary_handle->outBuffer[primary_handle->outEnd++] = 'd';
|
|
rc = memcpy_s(primary_handle->outBuffer + primary_handle->outEnd, 4, &nLen, 4);
|
|
securec_check(rc, "", "");
|
|
primary_handle->outEnd += 4;
|
|
rc = memcpy_s(primary_handle->outBuffer + primary_handle->outEnd, len, data_row, len);
|
|
securec_check(rc, "", "");
|
|
primary_handle->outEnd += len;
|
|
primary_handle->outBuffer[primary_handle->outEnd++] = '\n';
|
|
} else {
|
|
add_error_message(primary_handle, "%s", "Invalid Datanode connection");
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
foreach (nodeitem, exec_nodes->nodeList) {
|
|
PGXCNodeHandle* handle = copy_connections[lfirst_int(nodeitem)];
|
|
if (handle && handle->state == DN_CONNECTION_STATE_COPY_IN) {
|
|
/* precalculate to speed up access */
|
|
int bytes_needed = handle->outEnd + 1 + msgLen;
|
|
|
|
/* flush buffer if it is almost full */
|
|
if ((primary_handle && bytes_needed > PRIMARY_NODE_WRITEAHEAD) ||
|
|
(!primary_handle && bytes_needed > COPY_BUFFER_SIZE)) {
|
|
int to_send = handle->outEnd;
|
|
|
|
/* First look if the Datanode has sent an error message */
|
|
int read_status = pgxc_node_read_data(handle, true);
|
|
if (read_status == EOF || read_status < 0) {
|
|
add_error_message(handle, "%s", "failed to read data from Datanode");
|
|
return EOF;
|
|
}
|
|
|
|
if (handle->inStart < handle->inEnd) {
|
|
RemoteQueryState* combiner = CreateResponseCombiner(1, COMBINE_TYPE_NONE);
|
|
handle_response(handle, combiner);
|
|
if (!ValidateAndCloseCombiner(combiner))
|
|
return EOF;
|
|
}
|
|
|
|
if (DN_CONNECTION_STATE_ERROR(handle))
|
|
return EOF;
|
|
|
|
/*
|
|
* Allow primary node to write out data before others.
|
|
* If primary node was blocked it would not accept copy data.
|
|
* So buffer at least PRIMARY_NODE_WRITEAHEAD at the other nodes.
|
|
* If primary node is blocked and is buffering, other buffers will
|
|
* grow accordingly.
|
|
*/
|
|
if (primary_handle) {
|
|
if (primary_handle->outEnd + PRIMARY_NODE_WRITEAHEAD < handle->outEnd)
|
|
to_send = handle->outEnd - primary_handle->outEnd - PRIMARY_NODE_WRITEAHEAD;
|
|
else
|
|
to_send = 0;
|
|
}
|
|
|
|
/*
|
|
* Try to send down any buffered data we have
|
|
*/
|
|
if (to_send && send_some(handle, to_send) < 0) {
|
|
add_error_message(handle, "%s", "failed to send data to Datanode");
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
if (ensure_out_buffer_capacity(bytes_needed, handle) != 0) {
|
|
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
|
|
}
|
|
|
|
handle->outBuffer[handle->outEnd++] = 'd';
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, 4, &nLen, 4);
|
|
securec_check(rc, "", "");
|
|
handle->outEnd += 4;
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, len, data_row, len);
|
|
securec_check(rc, "", "");
|
|
handle->outEnd += len;
|
|
handle->outBuffer[handle->outEnd++] = '\n';
|
|
} else {
|
|
add_error_message(handle, "%s", "Invalid Datanode connection");
|
|
return EOF;
|
|
}
|
|
}
|
|
return 0;
|
|
#endif
|
|
}
|
|
|
|
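/*
* Collect COPY TO output from the Datanode connections. Depending on
* remoteCopyType the rows are written to copy_file, appended to the given
* tuplestore, or otherwise handled by the combiner; returns the number of
* rows processed.
*/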
uint64 DataNodeCopyOut(ExecNodes* exec_nodes, PGXCNodeHandle** copy_connections, TupleDesc tupleDesc, FILE* copy_file,
|
|
Tuplestorestate* store, RemoteCopyType remoteCopyType)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
#else
|
|
RemoteQueryState* combiner;
|
|
int conn_count = (list_length(exec_nodes->nodeList) == 0) ? NumDataNodes : list_length(exec_nodes->nodeList);
|
|
ListCell* nodeitem = NULL;
|
|
uint64 processed;
|
|
|
|
combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_SUM);
|
|
combiner->processed = 0;
|
|
combiner->remoteCopyType = remoteCopyType;
|
|
|
|
/*
|
|
* If there is an existing file where to copy data,
|
|
* pass it to combiner when remote COPY output is sent back to file.
|
|
*/
|
|
if (copy_file && remoteCopyType == REMOTE_COPY_FILE)
|
|
combiner->copy_file = copy_file;
|
|
if (store && remoteCopyType == REMOTE_COPY_TUPLESTORE) {
|
|
combiner->tuplestorestate = store;
|
|
combiner->tuple_desc = tupleDesc;
|
|
}
|
|
|
|
foreach (nodeitem, exec_nodes->nodeList) {
|
|
PGXCNodeHandle* handle = copy_connections[lfirst_int(nodeitem)];
|
|
int read_status = 0;
|
|
|
|
Assert(handle && handle->state == DN_CONNECTION_STATE_COPY_OUT);
|
|
|
|
/*
|
|
* H message has been consumed, continue to manage data row messages.
|
|
* Continue to read as long as there is data.
|
|
*/
|
|
while (read_status >= 0 && handle->state == DN_CONNECTION_STATE_COPY_OUT) {
|
|
if (handle_response(handle, combiner) == RESPONSE_EOF) {
|
|
/* read some extra-data */
|
|
read_status = pgxc_node_read_data(handle, true);
|
|
if (read_status < 0)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_FAILURE), errmsg("unexpected EOF on datanode connection")));
|
|
else
|
|
/*
|
|
* Set proper connection status - handle_response
|
|
* has changed it to DN_CONNECTION_STATE_QUERY
|
|
*/
|
|
handle->state = DN_CONNECTION_STATE_COPY_OUT;
|
|
}
|
|
/* There is no more data that can be read from connection */
|
|
}
|
|
}
|
|
|
|
processed = combiner->processed;
|
|
|
|
if (!ValidateAndCloseCombiner(combiner)) {
|
|
if (!PersistentConnections)
|
|
release_handles();
|
|
pfree(copy_connections);
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_DATA_CORRUPTED),
|
|
errmsg(
|
|
"Unexpected response from the Datanodes when combining, request type %d", combiner->request_type)));
|
|
}
|
|
|
|
return processed;
|
|
#endif
|
|
}
|
|
|
|
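/*
* Report (or resolve) the table-skewness alarm for the given table by
* filling an AlarmAdditionalParam and handing it to AlarmReporter.
*/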
void report_table_skewness_alarm(AlarmType alarmType, const char* tableName)
|
|
{
|
|
Alarm AlarmTableSkewness[1];
|
|
AlarmItemInitialize(&(AlarmTableSkewness[0]), ALM_AI_AbnormalTableSkewness, ALM_AS_Normal, NULL);
|
|
AlarmAdditionalParam tempAdditionalParam;
|
|
// fill the alarm message
|
|
WriteAlarmAdditionalInfo(
|
|
&tempAdditionalParam, "", const_cast<char*>(tableName), "", AlarmTableSkewness, alarmType, tableName);
|
|
// report the alarm
|
|
AlarmReporter(AlarmTableSkewness, alarmType, &tempAdditionalParam);
|
|
}
|
|
|
|
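/*
* Finish the COPY on all involved Datanode connections: end the COPY on
* the primary connection first, then on the remaining ones, and consume
* the responses. Raises an error if any connection reported a problem.
*/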
void DataNodeCopyFinish(PGXCNodeHandle** copy_connections, int n_copy_connections, int primary_dn_index,
|
|
CombineType combine_type, Relation rel)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
int i;
|
|
RemoteQueryState* combiner = NULL;
|
|
bool error = false;
|
|
struct timeval* timeout = NULL; /* wait forever */
|
|
PGXCNodeHandle* connections[NumDataNodes];
|
|
PGXCNodeHandle* primary_handle = NULL;
|
|
int conn_count = 0;
|
|
|
|
for (i = 0; i < NumDataNodes; i++) {
|
|
PGXCNodeHandle* handle = copy_connections[i];
|
|
|
|
if (!handle)
|
|
continue;
|
|
|
|
if (i == primary_dn_index)
|
|
primary_handle = handle;
|
|
else
|
|
connections[conn_count++] = handle;
|
|
}
|
|
|
|
if (primary_handle) {
|
|
error = true;
|
|
if (primary_handle->state == DN_CONNECTION_STATE_COPY_IN ||
|
|
primary_handle->state == DN_CONNECTION_STATE_COPY_OUT)
|
|
error = DataNodeCopyEnd(primary_handle, false);
|
|
|
|
combiner = CreateResponseCombiner(conn_count + 1, combine_type);
|
|
error = (pgxc_node_receive_responses(1, &primary_handle, timeout, combiner) != 0) || error;
|
|
}
|
|
|
|
for (i = 0; i < conn_count; i++) {
|
|
PGXCNodeHandle* handle = connections[i];
|
|
|
|
error = true;
|
|
if (handle->state == DN_CONNECTION_STATE_COPY_IN || handle->state == DN_CONNECTION_STATE_COPY_OUT)
|
|
error = DataNodeCopyEnd(handle, false);
|
|
}
|
|
|
|
if (!combiner)
|
|
combiner = CreateResponseCombiner(conn_count, combine_type);
|
|
error = (pgxc_node_receive_responses(conn_count, connections, timeout, combiner) != 0) || error;
|
|
|
|
if (!ValidateAndCloseCombiner(combiner) || error)
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Error while running COPY")));
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* End copy process on a connection
|
|
*/
|
|
bool DataNodeCopyEnd(PGXCNodeHandle* handle, bool is_error)
|
|
{
|
|
int nLen = htonl(4);
|
|
errno_t rc = EOK;
|
|
|
|
if (handle == NULL)
|
|
return true;
|
|
|
|
/* msgType + msgLen */
|
|
ensure_out_buffer_capacity(1 + 4, handle);
|
|
Assert(handle->outBuffer != NULL);
|
|
if (is_error)
|
|
handle->outBuffer[handle->outEnd++] = 'f';
|
|
else
|
|
handle->outBuffer[handle->outEnd++] = 'c';
|
|
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd - 1, &nLen, sizeof(int));
|
|
securec_check(rc, "", "");
|
|
handle->outEnd += 4;
|
|
|
|
/* We need response right away, so send immediately */
|
|
if (pgxc_node_flush(handle) < 0)
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
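/*
* Initialize a RemoteQueryState for a RemoteQuery plan node: set up the
* expression context, result and scan tuple slots, external parameters
* and projection info, much like the initialization of a leaf scan node.
*/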
RemoteQueryState* ExecInitRemoteQuery(RemoteQuery* node, EState* estate, int eflags, bool row_plan)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
#else
|
|
RemoteQueryState* remotestate = NULL;
|
|
TupleDesc scan_type;
|
|
|
|
/* RemoteQuery node is the leaf node in the plan tree, just like seqscan */
|
|
Assert(innerPlan(node) == NULL);
|
|
Assert(outerPlan(node) == NULL);
|
|
|
|
remotestate = CreateResponseCombiner(0, node->combine_type);
|
|
remotestate->ss.ps.plan = (Plan*)node;
|
|
remotestate->ss.ps.state = estate;
|
|
|
|
/*
|
|
* Miscellaneous initialisation
|
|
*
|
|
* create expression context for node
|
|
*/
|
|
ExecAssignExprContext(estate, &remotestate->ss.ps);
|
|
|
|
/* Initialise child expressions */
|
|
if (estate->es_is_flt_frame) {
|
|
remotestate->ss.ps.qual = (List*)ExecInitQualByFlatten(node->scan.plan.qual, (PlanState*)remotestate);
|
|
} else {
|
|
remotestate->ss.ps.targetlist = (List*)ExecInitExprByRecursion((Expr*)node->scan.plan.targetlist, (PlanState*)remotestate);
|
|
remotestate->ss.ps.qual = (List*)ExecInitExprByRecursion((Expr*)node->scan.plan.qual, (PlanState*)remotestate);
|
|
}
|
|
|
|
/* check for unsupported flags */
|
|
Assert(!(eflags & (EXEC_FLAG_MARK)));
|
|
|
|
/* Extract the eflags bits that are relevant for tuplestorestate */
|
|
remotestate->eflags = (eflags & (EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD));
|
|
|
|
/* We have to support REWIND in any case, for ReScan */
|
|
remotestate->eflags |= EXEC_FLAG_REWIND;
|
|
|
|
remotestate->eof_underlying = false;
|
|
remotestate->tuplestorestate = NULL;
|
|
|
|
ExecInitResultTupleSlot(estate, &remotestate->ss.ps);
|
|
ExecInitScanTupleSlot(estate, &remotestate->ss);
|
|
scan_type = ExecTypeFromTL(node->base_tlist, false);
|
|
ExecAssignScanType(&remotestate->ss, scan_type);
|
|
remotestate->ss.ps.ps_vec_TupFromTlist = false;
|
|
/*
|
|
* If there are parameters supplied, get them into a form to be sent to the
|
|
* Datanodes with bind message. We should not have had done this before.
|
|
*/
|
|
SetDataRowForExtParams(estate->es_param_list_info, remotestate);
|
|
|
|
/*
|
|
* Initialize result tuple type and projection info.
|
|
*/
|
|
ExecAssignResultTypeFromTL(&remotestate->ss.ps);
|
|
ExecAssignScanProjectionInfo(&remotestate->ss);
|
|
|
|
if (node->rq_save_command_id) {
|
|
/* Save command id to be used in some special cases */
|
|
remotestate->rqs_cmd_id = GetCurrentCommandId(false);
|
|
}
|
|
|
|
return remotestate;
|
|
#endif
|
|
}
|
|
/*
* GetNodeIdFromNodesDef
* Return the position of the node with the given OID in the node
* definition array, or -1 if it is not found.
*/
|
|
static int GetNodeIdFromNodesDef(NodeDefinition* node_def, Oid nodeoid)
|
|
{
|
|
int i;
|
|
int res = -1;
|
|
|
|
/* Look into the handles and return correct position in array */
|
|
for (i = 0; i < u_sess->pgxc_cxt.NumDataNodes; i++) {
|
|
if (node_def[i].nodeoid == nodeoid) {
|
|
res = i;
|
|
break;
|
|
}
|
|
}
|
|
return res;
|
|
}
|
|
|
|
/*
|
|
* Get Node connections depending on the connection type:
|
|
* Datanodes Only, Coordinators only or both types
|
|
*/
|
|
PGXCNodeAllHandles* get_exec_connections(
|
|
RemoteQueryState* planstate, ExecNodes* exec_nodes, RemoteQueryExecType exec_type)
|
|
{
|
|
List* nodelist = NIL;
|
|
List* primarynode = NIL;
|
|
List* coordlist = NIL;
|
|
PGXCNodeHandle* primaryconnection = NULL;
|
|
int co_conn_count, dn_conn_count;
|
|
bool is_query_coord_only = false;
|
|
PGXCNodeAllHandles* pgxc_handles = NULL;
|
|
RelationLocInfo* rel_loc_info = NULL;
|
|
bool isFreeNodeList = true;
|
|
List* dummynodelist = NIL;
|
|
|
|
/*
|
|
* If query is launched only on Coordinators, we have to inform get_handles
|
|
* not to ask for Datanode connections even if list of Datanodes is NIL.
|
|
*/
|
|
if (exec_type == EXEC_ON_COORDS)
|
|
is_query_coord_only = true;
|
|
|
|
if (exec_nodes != NULL) {
|
|
if (exec_nodes->en_expr) {
|
|
/* determine the target Datanodes at execution time */
|
|
bool isnull = false;
|
|
MemoryContext oldContext;
|
|
|
|
rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
|
|
if (unlikely(rel_loc_info == NULL)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
|
|
errmsg("Can not find location info for relation oid: %u", exec_nodes->en_relid)));
|
|
}
|
|
|
|
int len = list_length(rel_loc_info->partAttrNum);
|
|
/* It should switch memctx to ExprContext for makenode in ExecInitExpr */
|
|
Datum* values = (Datum*)palloc(len * sizeof(Datum));
|
|
bool* null = (bool*)palloc(len * sizeof(bool));
|
|
Oid* typOid = (Oid*)palloc(len * sizeof(Oid));
|
|
List* dist_col = NULL;
|
|
int i = 0;
|
|
|
|
ListCell* cell = NULL;
|
|
foreach (cell, exec_nodes->en_expr) {
|
|
Expr* expr = (Expr*)lfirst(cell);
|
|
oldContext = MemoryContextSwitchTo(planstate->ss.ps.ps_ExprContext->ecxt_per_tuple_memory);
|
|
|
|
ExprState* estate = ExecInitExpr(expr, (PlanState*)planstate);
|
|
|
|
Datum partvalue = ExecEvalExpr(estate, planstate->ss.ps.ps_ExprContext, &isnull, NULL);
|
|
MemoryContextSwitchTo(oldContext);
|
|
|
|
values[i] = partvalue;
|
|
null[i] = isnull;
|
|
typOid[i] = exprType((Node*)expr);
|
|
dist_col = lappend_int(dist_col, i);
|
|
i++;
|
|
}
|
|
ExecNodes* nodes = GetRelationNodes(rel_loc_info, values, null, typOid, dist_col, exec_nodes->accesstype);
|
|
|
|
if (nodes != NULL) {
|
|
nodelist = nodes->nodeList;
|
|
primarynode = nodes->primarynodelist;
|
|
|
|
/* for explain analyze to show the datanode which really runs */
|
|
if (t_thrd.postgres_cxt.mark_explain_analyze) {
|
|
if (planstate->pbe_run_status == PBE_NONE) {
|
|
/* first run */
|
|
planstate->pbe_run_status = PBE_ON_ONE_NODE;
|
|
/* keep original nodeList if needed */
|
|
if (!exec_nodes->nodelist_is_nil && !exec_nodes->original_nodeList)
|
|
exec_nodes->original_nodeList = exec_nodes->nodeList;
|
|
|
|
exec_nodes->nodeList = nodelist;
|
|
isFreeNodeList = false;
|
|
} else if (planstate->pbe_run_status == PBE_ON_ONE_NODE) {
|
|
/* second run or always same nodelist before */
|
|
if (list_difference_int(exec_nodes->nodeList, nodelist)) {
|
|
planstate->pbe_run_status = PBE_ON_MULTI_NODES;
|
|
if (exec_nodes->nodeList)
|
|
list_free_ext(exec_nodes->nodeList);
|
|
|
|
exec_nodes->nodeList = exec_nodes->original_nodeList;
|
|
}
|
|
}
|
|
}
|
|
|
|
bms_free_ext(nodes->distribution.bms_data_nodeids);
|
|
pfree_ext(nodes);
|
|
}
|
|
/*
|
|
* en_expr is set by pgxc_set_en_expr only for distributed
|
|
* relations while planning DMLs, hence a select for update
|
|
* on a replicated table here is an assertion
|
|
*/
|
|
Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE && IsRelationReplicated(rel_loc_info)));
|
|
pfree_ext(values);
|
|
pfree_ext(null);
|
|
pfree_ext(typOid);
|
|
} else if (OidIsValid(exec_nodes->en_relid)) {
|
|
rel_loc_info = GetRelationLocInfo(exec_nodes->en_relid);
|
|
if (unlikely(rel_loc_info == NULL)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
|
|
errmsg("Can not find location info for relation oid: %u", exec_nodes->en_relid)));
|
|
}
|
|
|
|
ExecNodes* nodes = GetRelationNodes(rel_loc_info, NULL, NULL, NULL, NULL, exec_nodes->accesstype);
|
|
|
|
/*
|
|
* en_relid is set only for DMLs, hence a select for update on a
|
|
* replicated table here is an assertion
|
|
*/
|
|
Assert(!(exec_nodes->accesstype == RELATION_ACCESS_READ_FOR_UPDATE && IsRelationReplicated(rel_loc_info)));
|
|
|
|
/* Use the obtained list for given table */
|
|
if (nodes != NULL) {
|
|
bms_free_ext(nodes->distribution.bms_data_nodeids);
|
|
nodelist = nodes->nodeList;
|
|
}
|
|
|
|
/*
|
|
* Special handling for ROUND ROBIN distributed tables. The target
|
|
* node must be determined at the execution time
|
|
*/
|
|
if (rel_loc_info->locatorType == LOCATOR_TYPE_RROBIN && nodes) {
|
|
nodelist = nodes->nodeList;
|
|
primarynode = nodes->primarynodelist;
|
|
} else if (nodes != NULL) {
|
|
if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) {
|
|
isFreeNodeList = true;
|
|
if (exec_nodes->nodeList || exec_nodes->primarynodelist) {
|
|
nodelist = exec_nodes->nodeList;
|
|
primarynode = exec_nodes->primarynodelist;
|
|
isFreeNodeList = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Must free nodelist if it does not belong to exec_nodes */
|
|
if (nodes && !isFreeNodeList) {
|
|
if (nodes->nodeList)
|
|
list_free_ext(nodes->nodeList);
|
|
if (nodes->primarynodelist)
|
|
list_free_ext(nodes->primarynodelist);
|
|
|
|
pfree_ext(nodes);
|
|
nodes = NULL;
|
|
}
|
|
} else {
|
|
if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES)
|
|
nodelist = exec_nodes->nodeList;
|
|
else if (exec_type == EXEC_ON_COORDS)
|
|
coordlist = exec_nodes->nodeList;
|
|
|
|
primarynode = exec_nodes->primarynodelist;
|
|
isFreeNodeList = false;
|
|
}
|
|
}
|
|
|
|
/* Set datanode list and DN number */
|
|
if (list_length(nodelist) == 0 && (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_DATANODES)) {
|
|
/* Primary connection is included in this number of connections if it exists */
|
|
dn_conn_count = u_sess->pgxc_cxt.NumDataNodes;
|
|
} else {
|
|
if (exec_type == EXEC_ON_DATANODES || exec_type == EXEC_ON_ALL_NODES) {
|
|
if (primarynode != NULL)
|
|
dn_conn_count = list_length(nodelist) + 1;
|
|
else
|
|
dn_conn_count = list_length(nodelist);
|
|
} else
|
|
dn_conn_count = 0;
|
|
}
|
|
|
|
/* Set Coordinator list and Coordinator number */
|
|
if (exec_type == EXEC_ON_ALL_NODES || (list_length(coordlist) == 0 && exec_type == EXEC_ON_COORDS)) {
|
|
coordlist = GetAllCoordNodes();
|
|
co_conn_count = list_length(coordlist);
|
|
} else {
|
|
if (exec_type == EXEC_ON_COORDS)
|
|
co_conn_count = list_length(coordlist);
|
|
else
|
|
co_conn_count = 0;
|
|
}
|
|
|
|
/*
* For a multi-nodegroup cluster, the exec_nodes of a replicated-table DML may differ from the maximal datanode list.
* Therefore, we should get the valid datanodes from the subplan and check that the results are the same for them.
*/
|
|
if (planstate != NULL && planstate->combine_type == COMBINE_TYPE_SAME && exec_type == EXEC_ON_DATANODES) {
|
|
if (planstate->ss.ps.plan->lefttree != NULL && planstate->ss.ps.plan->lefttree->exec_nodes != NULL)
|
|
dummynodelist = list_difference_int(nodelist, planstate->ss.ps.plan->lefttree->exec_nodes->nodeList);
|
|
}
|
|
|
|
/* Get other connections (non-primary) */
|
|
pgxc_handles = get_handles(nodelist, coordlist, is_query_coord_only, dummynodelist);
|
|
|
|
/* Get connection for primary node, if used */
|
|
if (primarynode != NULL) {
|
|
/* Let's assume primary connection is always a Datanode connection for the moment */
|
|
PGXCNodeAllHandles* pgxc_conn_res = NULL;
|
|
pgxc_conn_res = get_handles(primarynode, NULL, false);
|
|
|
|
/* primary connection is unique */
|
|
primaryconnection = pgxc_conn_res->datanode_handles[0];
|
|
|
|
pfree_ext(pgxc_conn_res);
|
|
|
|
if (primaryconnection == NULL)
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("could not obtain connection from pool")));
|
|
pgxc_handles->primary_handle = primaryconnection;
|
|
}
|
|
|
|
/* Depending on the execution type, we still need to save the initial node counts */
|
|
pgxc_handles->dn_conn_count = dn_conn_count;
|
|
pgxc_handles->co_conn_count = co_conn_count;
|
|
|
|
if (rel_loc_info != NULL)
|
|
FreeRelationLocInfo(rel_loc_info);
|
|
|
|
/* Must free nodelist if it does not belong to exec_nodes */
|
|
if (isFreeNodeList) {
|
|
if (nodelist != NULL)
|
|
list_free_ext(nodelist);
|
|
if (primarynode != NULL)
|
|
list_free_ext(primarynode);
|
|
}
|
|
|
|
return pgxc_handles;
|
|
}
|
|
|
|
/*
|
|
* @Description: Send the queryId down to the PGXC node with sync
|
|
*
|
|
* @param[IN] connections: all connection handle with Datanode
|
|
* @param[IN] conn_count: number of connections
|
|
* @param[IN] queryId: query ID of current simple query
|
|
* @return: void
|
|
*/
|
|
static void pgxc_node_send_queryid_with_sync(PGXCNodeHandle** connections, int conn_count, uint64 queryId)
|
|
{
|
|
PGXCNodeHandle** temp_connections = NULL;
|
|
RemoteQueryState* combiner = NULL;
|
|
int i = 0;
|
|
errno_t ss_rc = 0;
|
|
|
|
/* use temp connections instead */
|
|
temp_connections = (PGXCNodeHandle**)palloc(conn_count * sizeof(PGXCNodeHandle*));
|
|
for (i = 0; i < conn_count; i++)
|
|
temp_connections[i] = connections[i];
|
|
|
|
Assert(queryId != 0);
|
|
|
|
for (i = 0; i < conn_count; i++) {
|
|
int msglen = 12;
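/* 'r' message layout: 4-byte self-inclusive length (12 = 4 + 8) followed by the 8-byte query id */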
|
|
|
|
if (temp_connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(temp_connections[i]);
|
|
|
|
if (connections[i]->state != DN_CONNECTION_STATE_IDLE)
|
|
LIBCOMM_DEBUG_LOG("send_queryid to node:%s[nid:%hu,sid:%hu] with abnormal state:%d",
|
|
temp_connections[i]->remoteNodeName,
|
|
temp_connections[i]->gsock.idx,
|
|
temp_connections[i]->gsock.sid,
|
|
temp_connections[i]->state);
|
|
|
|
/* msgType + msgLen */
|
|
ensure_out_buffer_capacity(1 + msglen, temp_connections[i]);
|
|
Assert(temp_connections[i]->outBuffer != NULL);
|
|
temp_connections[i]->outBuffer[temp_connections[i]->outEnd++] = 'r';
|
|
msglen = htonl(msglen);
|
|
ss_rc = memcpy_s(temp_connections[i]->outBuffer + temp_connections[i]->outEnd,
|
|
temp_connections[i]->outSize - temp_connections[i]->outEnd - 1,
|
|
&msglen,
|
|
sizeof(int));
|
|
securec_check(ss_rc, "\0", "\0");
|
|
temp_connections[i]->outEnd += 4;
|
|
ss_rc = memcpy_s(temp_connections[i]->outBuffer + temp_connections[i]->outEnd,
|
|
temp_connections[i]->outSize - temp_connections[i]->outEnd,
|
|
&queryId,
|
|
sizeof(uint64));
|
|
securec_check(ss_rc, "\0", "\0");
|
|
temp_connections[i]->outEnd += sizeof(uint64);
|
|
|
|
if (pgxc_node_flush(temp_connections[i]) != 0) {
|
|
temp_connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to send query ID to %s while sending query ID with sync",
|
|
temp_connections[i]->remoteNodeName)));
|
|
}
|
|
|
|
temp_connections[i]->state = DN_CONNECTION_STATE_QUERY;
|
|
}
|
|
|
|
combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE);
|
|
|
|
while (conn_count > 0) {
|
|
if (pgxc_node_receive(conn_count, temp_connections, NULL)) {
|
|
int error_code;
|
|
char* error_msg = getSocketError(&error_code);
|
|
|
|
ereport(ERROR,
|
|
(errcode(error_code),
|
|
errmsg("Failed to read response from Datanodes while sending query ID with sync. Detail: %s\n",
|
|
error_msg)));
|
|
}
|
|
i = 0;
|
|
while (i < conn_count) {
|
|
int res = handle_response(temp_connections[i], combiner);
|
|
if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else if (res == RESPONSE_PLANID_OK) {
|
|
if (--conn_count > i)
|
|
temp_connections[i] = temp_connections[conn_count];
|
|
} else {
|
|
temp_connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from %s while sending query ID with sync",
|
|
temp_connections[i]->remoteNodeName),
|
|
errdetail("%s", (combiner->errorMessage == NULL) ? "none" : combiner->errorMessage)));
|
|
}
|
|
}
|
|
/* report error if any */
|
|
pgxc_node_report_error(combiner);
|
|
}
|
|
|
|
ValidateAndCloseCombiner(combiner);
|
|
pfree_ext(temp_connections);
|
|
}
|
|
|
|
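/*
* Send everything needed to start a command on one Datanode connection:
* command id, snapshot, workload-management info, query id, unique SQL id
* and finally the query itself (simple or extended protocol, depending on
* whether a prepared statement, cursor or parameters are involved).
* Returns false if any send fails.
*/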
bool pgxc_start_command_on_connection(
|
|
PGXCNodeHandle* connection, RemoteQueryState* remotestate, Snapshot snapshot,
|
|
const char* compressedPlan, int cLen)
|
|
{
|
|
CommandId cid;
|
|
RemoteQuery* step = (RemoteQuery*)remotestate->ss.ps.plan;
|
|
bool trigger_ship = false;
|
|
|
|
/*
* When enable_stream_operator = off, the CN may split a complex SQL statement into multiple
* simple statements and send them to the DN for execution. To ensure data consistency, the DN
* needs to use the same snapshot for visibility judgment of all of them, so the CN sends an
* identifier for such statements to the DN.
* PlannedStmt->max_push_sql_num records the maximum number of statements a single SQL
* statement may be split into.
*/
|
|
int max_push_sqls = remotestate->ss.ps.state->es_plannedstmt->max_push_sql_num; /* max SQLs may send one DN */
|
|
|
|
if (connection->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(connection);
|
|
|
|
/*
|
|
* Scan descriptor would be valid and would contain a valid snapshot
|
|
* in cases when we need to send out of order command id to data node
|
|
* e.g. in case of a fetch
|
|
*/
|
|
TableScanDesc scanDesc = GetTableScanDesc(remotestate->ss.ss_currentScanDesc, remotestate->ss.ss_currentRelation);
|
|
|
|
if (remotestate->cursor != NULL && remotestate->cursor[0] != '\0' && scanDesc != NULL &&
|
|
scanDesc->rs_snapshot != NULL)
|
|
cid = scanDesc->rs_snapshot->curcid;
|
|
else {
|
|
/*
|
|
* An insert into a child by selecting from its parent gets translated
|
|
* into a multi-statement transaction in which first we select from parent
|
|
* and then insert into child, then select from child and insert into child.
|
|
* The select from child should not see the just inserted rows.
|
|
* The command id of the select from child is therefore set to
|
|
* the command id of the insert-select query saved earlier.
|
|
* Similarly a WITH query that updates a table in main query
|
|
* and inserts a row in the same table in the WITH query
|
|
* needs to make sure that the row inserted by the WITH query does
|
|
* not get updated by the main query.
|
|
*/
|
|
/* step->exec_nodes will be null in plan router/scan gather nodes */
|
|
if (step->exec_nodes && step->exec_nodes->accesstype == RELATION_ACCESS_READ && step->rq_save_command_id)
|
|
cid = remotestate->rqs_cmd_id;
|
|
else
|
|
// This "false" as passed-in parameter remains questionable, as it might affect the
|
|
// future updates of CommandId in someway we are not aware of yet.
|
|
cid = GetCurrentCommandId(false);
|
|
}
|
|
|
|
if (pgxc_node_send_cmd_id(connection, cid) < 0)
|
|
return false;
|
|
|
|
/* snapshot is not necessary to be sent to the compute pool */
|
|
if (snapshot && pgxc_node_send_snapshot(connection, snapshot, max_push_sqls) && IS_PGXC_COORDINATOR)
|
|
return false;
|
|
|
|
/* wlm_cgroup is not necessary to be sent to the compute pool */
|
|
if (ENABLE_WORKLOAD_CONTROL && *u_sess->wlm_cxt->control_group && IS_PGXC_COORDINATOR &&
|
|
pgxc_node_send_wlm_cgroup(connection))
|
|
return false;
|
|
|
|
if (ENABLE_WORKLOAD_CONTROL && u_sess->attr.attr_resource.resource_track_level == RESOURCE_TRACK_OPERATOR &&
|
|
pgxc_node_send_threadid(connection, t_thrd.proc_cxt.MyProcPid))
|
|
return false;
|
|
|
|
if (pgxc_node_send_queryid(connection, u_sess->debug_query_id))
|
|
return false;
|
|
|
|
// Instrumentation/Unique SQL: send unique sql id to DN node
|
|
if (is_unique_sql_enabled() && pgxc_node_send_unique_sql_id(connection))
|
|
return false;
|
|
|
|
if (step->remote_query && step->remote_query->isRowTriggerShippable)
|
|
trigger_ship = true;
|
|
|
|
if ((step->statement && step->statement[0] != '\0') || step->cursor || remotestate->rqs_num_params) {
|
|
/* need to use Extended Query Protocol */
|
|
int fetch = 0;
|
|
bool prepared = false;
|
|
bool send_desc = false;
|
|
|
|
if (step->base_tlist != NULL || step->has_row_marks ||
|
|
(step->exec_nodes && step->exec_nodes->accesstype == RELATION_ACCESS_READ)) {
|
|
send_desc = true;
|
|
}
|
|
|
|
/* if prepared statement is referenced see if it is already exist */
|
|
if (step->statement && step->statement[0] != '\0') {
|
|
if (step->is_simple) {
|
|
prepared = HaveActiveCoordinatorPreparedStatement(step->statement);
|
|
|
|
if (!prepared)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNDEFINED_PSTATEMENT),
|
|
errmsg("prepared statement \"%s\" does not exist", step->statement)));
|
|
} else
|
|
prepared = ActivateDatanodeStatementOnNode(
|
|
step->statement, PGXCNodeGetNodeId(connection->nodeoid, PGXC_NODE_DATANODE));
|
|
}
|
|
|
|
/*
|
|
* execute and fetch rows only if they will be consumed
|
|
* immediately by the sorter
|
|
*/
|
|
if (step->cursor)
|
|
fetch = 1;
|
|
|
|
if (step->is_simple) {
|
|
if (pgxc_node_send_plan_with_params(connection,
|
|
step->sql_statement,
|
|
remotestate->rqs_num_params,
|
|
remotestate->rqs_param_types,
|
|
remotestate->paramval_len,
|
|
remotestate->paramval_data,
|
|
fetch) != 0)
|
|
return false;
|
|
} else {
|
|
if (pgxc_node_send_query_extended(connection,
|
|
prepared ? NULL : step->sql_statement,
|
|
step->statement,
|
|
step->cursor,
|
|
remotestate->rqs_num_params,
|
|
remotestate->rqs_param_types,
|
|
remotestate->paramval_len,
|
|
remotestate->paramval_data,
|
|
send_desc,
|
|
fetch) != 0)
|
|
return false;
|
|
}
|
|
} else {
|
|
if (pgxc_node_send_query(connection, step->sql_statement, false, false, trigger_ship) != 0)
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* IsReturningDMLOnReplicatedTable
|
|
*
|
|
* This function returns true if the passed RemoteQuery
|
|
* 1. Operates on a table that is replicated
|
|
* 2. Represents a DML
|
|
* 3. Has a RETURNING clause in it
|
|
*
|
|
* A non-null base_tlist in the passed RemoteQuery
* means that the DML has a RETURNING clause.
|
|
*/
|
|
|
|
static bool IsReturningDMLOnReplicatedTable(RemoteQuery* rq)
|
|
{
|
|
if (IsExecNodesReplicated(rq->exec_nodes) && rq->base_tlist != NULL && /* Means DML has RETURNING */
|
|
(rq->exec_nodes->accesstype == RELATION_ACCESS_UPDATE || rq->exec_nodes->accesstype == RELATION_ACCESS_INSERT))
|
|
return true;
|
|
|
|
return false;
|
|
}
|
|
|
|
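/*
* Sum up the total data size (and optionally the file count) of the DN
* tasks stored in the DFS private item of the foreign scan under 'plan'.
*/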
static uint64 get_datasize_for_hdfsfdw(Plan* plan, int* filenum)
|
|
{
|
|
int fnum = 0;
|
|
|
|
Plan* fsplan = get_foreign_scan(plan);
|
|
|
|
/* get private data from foreign scan node */
|
|
ForeignScan* foreignScan = (ForeignScan*)fsplan;
|
|
|
|
List* foreignPrivateList = (List*)foreignScan->fdw_private;
|
|
|
|
if (0 == list_length(foreignPrivateList)) {
|
|
return 0;
|
|
}
|
|
|
|
DfsPrivateItem* item = NULL;
|
|
ListCell* lc = NULL;
|
|
foreach (lc, foreignPrivateList) {
|
|
DefElem* def = (DefElem*)lfirst(lc);
|
|
|
|
if (0 == pg_strcasecmp(def->defname, DFS_PRIVATE_ITEM)) {
|
|
item = (DfsPrivateItem*)def->arg;
|
|
}
|
|
}
|
|
|
|
if (item == NULL) {
|
|
return 0;
|
|
} else if (!item->dnTask) {
|
|
return 0;
|
|
}
|
|
|
|
uint64 totalSize = 0;
|
|
|
|
foreach (lc, item->dnTask) {
|
|
SplitMap* tmp_map = (SplitMap*)lfirst(lc);
|
|
totalSize += tmp_map->totalSize;
|
|
fnum += tmp_map->fileNums;
|
|
}
|
|
|
|
if (filenum != NULL)
|
|
*filenum = fnum;
|
|
|
|
return totalSize;
|
|
}
|
|
|
|
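/*
* Sum up the object sizes (and optionally the file count) of the file
* segments recorded in the optTaskList of the foreign scan under 'plan'.
*/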
static uint64 get_datasize_for_gdsfdw(Plan* plan, int* filenum)
|
|
{
|
|
uint64 totalSize = 0;
|
|
int fnum = 0;
|
|
|
|
Plan* fsplan = get_foreign_scan(plan);
|
|
|
|
/* get private data from foreign scan node */
|
|
ForeignScan* foreignScan = (ForeignScan*)fsplan;
|
|
|
|
List* foreignPrivateList = (List*)foreignScan->fdw_private;
|
|
|
|
if (0 == list_length(foreignPrivateList)) {
|
|
return 0;
|
|
}
|
|
|
|
ListCell *lc = NULL, *lc1 = NULL, *lc2 = NULL;
|
|
foreach (lc, foreignPrivateList) {
|
|
DefElem* def = (DefElem*)lfirst(lc);
|
|
|
|
if (0 == pg_strcasecmp(def->defname, optTaskList)) {
|
|
List* oldDnTask = (List*)def->arg;
|
|
foreach (lc1, oldDnTask) {
|
|
DistFdwDataNodeTask* task = (DistFdwDataNodeTask*)lfirst(lc1);
|
|
foreach (lc2, task->task) {
|
|
DistFdwFileSegment* segment = (DistFdwFileSegment*)lfirst(lc2);
|
|
if (segment->ObjectSize) {
|
|
totalSize += segment->ObjectSize;
|
|
fnum++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if (filenum != NULL)
|
|
*filenum = fnum;
|
|
|
|
return totalSize;
|
|
}
|
|
|
|
uint64 get_datasize(Plan* plan, int srvtype, int* filenum)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
}
|
|
|
|
/**
* @Description: delete the other datanode tasks for the current data node.
* @in item, the given item, it represents all node tasks.
* @return true if we find the current data node task, otherwise
* return false.
*/
|
|
bool handleDfsPrivateItemForComputPool(DfsPrivateItem* item)
|
|
{
|
|
ListCell* lc = NULL;
|
|
SplitMap* self_map = NULL;
|
|
|
|
/* reset splitmap for this data node */
|
|
List* dnTask = item->dnTask;
|
|
item->dnTask = NIL;
|
|
|
|
foreach (lc, dnTask) {
|
|
SplitMap* tmp_map = (SplitMap*)lfirst(lc);
|
|
|
|
if ((u_sess->pgxc_cxt.PGXCNodeId == tmp_map->nodeId || LOCATOR_TYPE_REPLICATED == tmp_map->locatorType) &&
|
|
NIL != tmp_map->splits) {
|
|
self_map = tmp_map;
|
|
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
* In some scenarios fileList would be NIL. For example:
* in a cluster with M DNs reading N table files from
* OBS/HDFS, when there are fewer files than DNs, some DNs
* are not assigned any files to scan. This if statement
* bypasses the following code in that case. Whoever
* changes this should take care of this condition.
*/
|
|
if (self_map == NULL) {
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* When producerDOP is not set, it defaults to 1. In this case
* fileList should not be split into pieces shared between producer
|
|
* threads.
|
|
*/
|
|
if (u_sess->stream_cxt.producer_dop == 1) {
|
|
item->dnTask = lappend(item->dnTask, self_map);
|
|
} else {
|
|
SplitMap* new_map = (SplitMap*)makeNode(SplitMap);
|
|
|
|
// query on OBS; more comments are needed here to explain why this flag is set.
|
|
new_map->locatorType = LOCATOR_TYPE_REPLICATED;
|
|
|
|
List* splits = list_copy(self_map->splits);
|
|
int fileCnt = 0;
|
|
foreach (lc, splits) {
|
|
SplitInfo* si = (SplitInfo*)lfirst(lc);
|
|
if ((fileCnt % u_sess->stream_cxt.producer_dop) != u_sess->stream_cxt.smp_id) {
|
|
self_map->splits = list_delete(self_map->splits, si);
|
|
}
|
|
|
|
fileCnt++;
|
|
}
|
|
|
|
new_map->splits = list_copy(self_map->splits);
|
|
new_map->fileNums = list_length(new_map->splits);
|
|
|
|
if (new_map->splits == NIL)
|
|
return false;
|
|
|
|
item->dnTask = lappend(item->dnTask, new_map);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
* @Description: delete the other datanode tasks for the current data node.
* @in dnTask, the given dnTask, it represents all node tasks.
* @return the task list containing only the current data node's task,
* or NIL if it is not found.
*/
|
|
List* handleDistFdwDataNodeTaskForComputePool(List* dnTask)
|
|
{
|
|
DistFdwDataNodeTask* task = NULL;
|
|
ListCell* lc = NULL;
|
|
List* taskList = NIL;
|
|
foreach (lc, dnTask) {
|
|
task = (DistFdwDataNodeTask*)lfirst(lc);
|
|
if (0 == pg_strcasecmp(task->dnName, g_instance.attr.attr_common.PGXCNodeName)) {
|
|
taskList = lappend(taskList, copyObject(task));
|
|
break;
|
|
}
|
|
}
|
|
|
|
return taskList;
|
|
}
|
|
|
|
/*
|
|
* @Description: Just keep the splitmap which belongs to the data node, and
|
|
* remove all others. This function should be called in DataNode.
|
|
*
|
|
* @param[IN] plan : the foreign scan node.
|
|
* @return: false if this data node does nothing.
|
|
*/
|
|
static bool clean_splitmap(Plan* plan)
|
|
{
|
|
Assert(plan);
|
|
|
|
Assert(T_ForeignScan == nodeTag(plan) || T_VecForeignScan == nodeTag(plan));
|
|
|
|
ereport(DEBUG5, (errmodule(MOD_ACCELERATE), errmsg("in %s", __FUNCTION__)));
|
|
|
|
/* get private data from foreign scan node */
|
|
ForeignScan* foreignScan = (ForeignScan*)plan;
|
|
List* foreignPrivateList = (List*)foreignScan->fdw_private;
|
|
|
|
if (0 == list_length(foreignPrivateList)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNEXPECTED_NULL_VALUE), errmodule(MOD_ACCELERATE), errmsg("foreignPrivateList is NULL")));
|
|
}
|
|
|
|
/*
|
|
* find the splitmap which belongs to the data node, and reset
|
|
* DfsPrivateItem for DFS fdw.
|
|
* For dist_fdw, fdw_private holds three list cells:
* convert_selectively, optTaskList and gt_sessionId. This function only
* executes on a datanode. Only the current datanode's dnTask needs to be
* pushed down to the coordinator node of the computing pool.
|
|
*/
|
|
bool has_splits = false;
|
|
|
|
ListCell* lc = NULL;
|
|
foreach (lc, foreignPrivateList) {
|
|
DefElem* def = (DefElem*)lfirst(lc);
|
|
|
|
if (0 == pg_strcasecmp(def->defname, DFS_PRIVATE_ITEM)) {
|
|
DfsPrivateItem* item = (DfsPrivateItem*)def->arg;
|
|
has_splits = handleDfsPrivateItemForComputPool(item);
|
|
} else if (0 == pg_strcasecmp(def->defname, optTaskList)) {
|
|
List* dnTask = (List*)def->arg;
|
|
def->arg = (Node*)handleDistFdwDataNodeTaskForComputePool(dnTask);
|
|
list_free_ext(dnTask);
|
|
has_splits = list_length(((List*)def->arg)) ? true : false;
|
|
}
|
|
}
|
|
|
|
return has_splits;
|
|
}
|
|
|
|
/**
|
|
* @Description: need to assign dnTask on coordinator node of computing pool
|
|
* for dfs_fdw foreign table.
|
|
* @in item, the given item, which includes all the files coming from the main
* cluster.
* @in dn_num, the number of data nodes in the compute pool.
* @return the new dn task list.
|
|
*/
|
|
List* computePoolAssignDnTaskForFt(DfsPrivateItem* item, int dn_num)
|
|
{
|
|
List* dnTask = item->dnTask;
|
|
item->dnTask = NIL;
|
|
|
|
/* make ONE empty SplitMap for each DN */
|
|
List* new_maps = NIL;
|
|
for (int i = 0; i < dn_num; i++) {
|
|
SplitMap* new_map = (SplitMap*)makeNode(SplitMap);
|
|
|
|
new_map->locatorType = LOCATOR_TYPE_REPLICATED;
|
|
|
|
new_maps = lappend(new_maps, new_map);
|
|
}
|
|
|
|
/*
|
|
* There is exactly ONE splitmap in dnTask, because dnTask has been
|
|
* cleaned up in DWS DN by calling clean_splitmap()
|
|
*/
|
|
SplitMap* old_map = (SplitMap*)linitial(dnTask);
|
|
|
|
/* assign each file (SplitInfo) in old_map to the DNs (SplitMaps in new_maps) */
|
|
int index = 0;
|
|
ListCell* lc = NULL;
|
|
foreach (lc, old_map->splits) {
|
|
SplitInfo* si = (SplitInfo*)lfirst(lc);
|
|
|
|
if (index >= dn_num)
|
|
index = 0;
|
|
|
|
Assert(new_maps);
|
|
|
|
SplitMap* map = (SplitMap*)list_nth(new_maps, index);
|
|
|
|
map->splits = lappend(map->splits, si);
|
|
|
|
index++;
|
|
}
|
|
|
|
return new_maps;
|
|
}
|
|
|
|
/**
|
|
* @Description: need to assign dnTask on coordinator node of computing pool
|
|
* for dfs_fdw foreign table.
|
|
* @in oldDnTask, the given oldDnTask, which includes all the files coming from
* the main cluster.
* @return the new dn task list.
|
|
*/
|
|
List* computePoolAssignDnTaskForImportFt(List* oldDnTask)
|
|
{
|
|
Assert(list_length(oldDnTask) == 1);
|
|
|
|
List* dnNameList = PgxcNodeGetAllDataNodeNames();
|
|
DistFdwDataNodeTask* oldDataNodetask = (DistFdwDataNodeTask*)linitial(oldDnTask);
|
|
int nTasks = 0;
|
|
DistFdwDataNodeTask* task = NULL;
|
|
List* totalTask = NULL;
|
|
ListCell* lc = NULL;
|
|
int dnNum = list_length(dnNameList);
|
|
|
|
if (list_length(oldDataNodetask->task) > dnNum) {
|
|
nTasks = dnNum;
|
|
} else {
|
|
nTasks = list_length(oldDataNodetask->task);
|
|
}
|
|
|
|
for (int i = 0; i < nTasks; i++) {
|
|
task = makeNode(DistFdwDataNodeTask);
|
|
task->dnName = pstrdup((char*)list_nth(dnNameList, (i % dnNum)));
|
|
totalTask = lappend(totalTask, task);
|
|
}
|
|
|
|
int num_processed = 0;
|
|
foreach (lc, oldDataNodetask->task) {
|
|
DistFdwFileSegment* segment = (DistFdwFileSegment*)lfirst(lc);
|
|
|
|
task = (DistFdwDataNodeTask*)list_nth(totalTask, (num_processed % nTasks));
|
|
task->task = lappend(task->task, segment);
|
|
|
|
num_processed++;
|
|
}
|
|
|
|
return totalTask;
|
|
}
|
|
|
|
List* try_get_needed_dnnum(uint64 dnneeded)
|
|
{
|
|
return get_dnlist(dnneeded);
|
|
}
|
|
|
|
/*
|
|
* @Description: Reassign the files in the file list to all DNs in the compute pool.
* This function should be called in the CN of the compute pool.
|
|
*
|
|
* @param[IN] plan : the foreign scan node;
|
|
* @param[IN] dn_num : the number of the DN in the compute pool;
|
|
* @return: the list of the SplitMap, the number of SplitMap == the number of DN
|
|
*/
|
|
static List* reassign_splitmap(Plan* plan, int dn_num)
|
|
{
|
|
ListCell* lc = NULL;
|
|
List* new_maps = NULL;
|
|
Assert(plan && dn_num > 0);
|
|
|
|
Assert(T_ForeignScan == nodeTag(plan) || T_VecForeignScan == nodeTag(plan));
|
|
|
|
ereport(DEBUG5, (errmodule(MOD_ACCELERATE), errmsg("in %s", __FUNCTION__)));
|
|
|
|
/* get private data from foreign scan node */
|
|
ForeignScan* foreignScan = (ForeignScan*)plan;
|
|
List* foreignPrivateList = (List*)foreignScan->fdw_private;
|
|
|
|
if (0 == list_length(foreignPrivateList)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNEXPECTED_NULL_VALUE), errmodule(MOD_ACCELERATE), errmsg("foreignPrivateList is NULL")));
|
|
}
|
|
|
|
foreach (lc, foreignPrivateList) {
|
|
DefElem* def = (DefElem*)lfirst(lc);
|
|
|
|
if (0 == pg_strcasecmp(def->defname, DFS_PRIVATE_ITEM)) {
|
|
DfsPrivateItem* item = (DfsPrivateItem*)def->arg;
|
|
new_maps = computePoolAssignDnTaskForFt(item, dn_num);
|
|
} else if (0 == pg_strcasecmp(def->defname, optTaskList)) {
|
|
List* dnTask = (List*)def->arg;
|
|
new_maps = computePoolAssignDnTaskForImportFt(dnTask);
|
|
def->arg = (Node*)new_maps;
|
|
}
|
|
}
|
|
|
|
return new_maps;
|
|
}
|
|
|
|
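/*
* Replace the dnTask of the foreign scan's DFS private item with the
* single SplitMap passed in; TXT/CSV OBS servers are left untouched.
*/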
static void make_new_spiltmap(Plan* plan, SplitMap* map)
|
|
{
|
|
Assert(T_ForeignScan == nodeTag(plan) || T_VecForeignScan == nodeTag(plan));
|
|
|
|
ForeignScan* foreignScan = (ForeignScan*)plan;
|
|
|
|
if (foreignScan->options->stype == T_TXT_CSV_OBS_SERVER) {
|
|
return;
|
|
}
|
|
|
|
List* foreignPrivateList = (List*)foreignScan->fdw_private;
|
|
if (0 == list_length(foreignPrivateList)) {
|
|
ereport(
|
|
ERROR, (errcode(ERRCODE_UNEXPECTED_NULL_VALUE), errmodule(MOD_HDFS), errmsg("foreignPrivateList is NULL")));
|
|
}
|
|
|
|
DfsPrivateItem* item = (DfsPrivateItem*)((DefElem*)linitial(foreignPrivateList))->arg;
|
|
|
|
item->dnTask = NIL;
|
|
item->dnTask = lappend(item->dnTask, map);
|
|
}
|
|
|
|
/*
|
|
* @Description: set relid of each RTE in PlannedStmt to InvalidOid.
|
|
*
|
|
* @param[IN] ps : PlannedStmt*
|
|
* @return: void
|
|
*/
|
|
static void resetRelidOfRTE(PlannedStmt* ps)
|
|
{
|
|
ListCell* lc = NULL;
|
|
|
|
foreach (lc, ps->rtable) {
|
|
RangeTblEntry* rte = (RangeTblEntry*)lfirst(lc);
|
|
|
|
if (rte->rtekind == RTE_RELATION) {
|
|
rte->relid = InvalidOid;
|
|
rte->ignoreResetRelid = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
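/*
* Pick MIN(fnum, NumDataNodes) datanodes in a round-robin fashion,
* remembering where the previous call stopped. Returns NIL when all
* datanodes would be used anyway.
*/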
List* get_dnlist_for_hdfs(int fnum)
|
|
{
|
|
static int origin_number = 0;
|
|
|
|
List* dnlist = NIL;
|
|
|
|
int dnnum = MIN(fnum, u_sess->pgxc_cxt.NumDataNodes);
|
|
|
|
if (dnnum == u_sess->pgxc_cxt.NumDataNodes)
|
|
return NIL;
|
|
|
|
int start_dn_number = origin_number;
|
|
|
|
if (start_dn_number >= u_sess->pgxc_cxt.NumDataNodes)
|
|
start_dn_number %= u_sess->pgxc_cxt.NumDataNodes;
|
|
|
|
int dnindex = 0;
|
|
for (int i = 1; i <= dnnum; i++) {
|
|
dnindex = start_dn_number + i;
|
|
|
|
if (dnindex >= u_sess->pgxc_cxt.NumDataNodes)
|
|
dnindex -= u_sess->pgxc_cxt.NumDataNodes;
|
|
|
|
dnlist = lappend_int(dnlist, dnindex);
|
|
}
|
|
|
|
origin_number = dnindex;
|
|
|
|
return dnlist;
|
|
}
|
|
|
|
void do_query_for_scangather(RemoteQueryState* node, bool vectorized)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* @Description: fill the BloomFilter information into the foreignscan node.
* We need to serialize it and store it in the foreignscan.
|
|
* @in node, the RemoteQueryState.
|
|
* @return none.
|
|
*/
|
|
void addBloomFilterSet(RemoteQueryState* node)
|
|
{
|
|
EState* estate = node->ss.ps.state;
|
|
RemoteQuery* step = (RemoteQuery*)node->ss.ps.plan;
|
|
ForeignScan* fScan = NULL;
|
|
List* bfIndex = NULL;
|
|
int bfCount = 0;
|
|
|
|
Plan* plan = step->scan.plan.lefttree;
|
|
while (plan->lefttree) {
|
|
plan = plan->lefttree;
|
|
}
|
|
|
|
Assert(IsA(plan, ForeignScan) || IsA(plan, VecForeignScan));
|
|
|
|
fScan = (ForeignScan*)plan;
|
|
|
|
bfIndex = ((Plan*)fScan)->filterIndexList;
|
|
bfCount = list_length(((Plan*)fScan)->var_list);
|
|
|
|
fScan->bfNum = bfCount;
|
|
fScan->bloomFilterSet = (BloomFilterSet**)palloc0(sizeof(BloomFilterSet*) * fScan->bfNum);
|
|
|
|
for (int bfNum = 0; bfNum < fScan->bfNum; bfNum++) {
|
|
int idx = list_nth_int(bfIndex, bfNum);
|
|
if (NULL != estate->es_bloom_filter.bfarray[idx]) {
|
|
fScan->bloomFilterSet[bfNum] = estate->es_bloom_filter.bfarray[idx]->makeBloomFilterSet();
|
|
}
|
|
}
|
|
}
|
|
|
|
/* return true if there is nothing to do */
|
|
static bool internal_do_query_for_planrouter(RemoteQueryState* node, bool vectorized)
|
|
{
|
|
Assert(IS_PGXC_DATANODE);
|
|
|
|
ereport(DEBUG5, (errmodule(MOD_ACCELERATE), errmsg("in %s", __FUNCTION__)));
|
|
|
|
RemoteQuery* step = (RemoteQuery*)node->ss.ps.plan;
|
|
|
|
Assert(step->position == PLAN_ROUTER);
|
|
|
|
PlannedStmt* planstmt = node->ss.ps.state->es_plannedstmt;
|
|
GlobalTransactionId gxid = InvalidGlobalTransactionId;
|
|
|
|
PGXCNodeAllHandles* pgxc_connections = NULL;
|
|
PGXCNodeHandle** connections = NULL;
|
|
int regular_conn_count = 0;
|
|
|
|
/*
|
|
* An openGauss node cannot run transactions while in recovery as
|
|
* this operation needs transaction IDs. This is more a safety guard than anything else.
|
|
*/
|
|
if (RecoveryInProgress())
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_RUN_TRANSACTION_DURING_RECOVERY),
|
|
errmsg("cannot run transaction to remote nodes during recovery")));
|
|
|
|
/*
|
|
* Consider a test case
|
|
*
|
|
* create table rf(a int, b int) distributed by replication;
|
|
* insert into rf values(1,2),(3,4) returning ctid;
|
|
*
|
|
* While inserting the first row do_query works fine, receives the returned
|
|
* row from the first connection and returns it. In this iteration the other
|
|
* datanodes also had returned rows but they have not yet been read from the
|
|
* network buffers. On Next Iteration do_query does not enter the data
|
|
* receiving loop because it finds that node->connections is not null.
|
|
* It is therefore required to set node->connections to null here.
|
|
*/
|
|
if (node->conn_count == 0)
|
|
node->connections = NULL;
|
|
|
|
/* just keep the files for the node itself */
|
|
Plan* fsplan = get_foreign_scan(step->scan.plan.lefttree);
|
|
if (!clean_splitmap(fsplan)) {
|
|
return true;
|
|
}
|
|
|
|
addBloomFilterSet(node);
|
|
|
|
ForeignOptions* options = ((ForeignScan*)fsplan)->options;
|
|
ServerTypeOption srvtype = options->stype;
|
|
|
|
/* collect coordinator get connection time */
|
|
TRACK_START(node->ss.ps.plan->plan_node_id, GET_COMPUTE_POOL_CONNECTION);
|
|
pgxc_connections = connect_compute_pool(srvtype);
|
|
TRACK_END(node->ss.ps.plan->plan_node_id, GET_COMPUTE_POOL_CONNECTION);
|
|
|
|
pgxc_palloc_net_ctl(1);
|
|
|
|
Assert(pgxc_connections);
|
|
connections = pgxc_connections->datanode_handles;
|
|
regular_conn_count = 1;
|
|
|
|
pfree_ext(pgxc_connections);
|
|
|
|
/*
|
|
* We save only regular connections, at the time we exit the function
|
|
* we finish with the primary connection and deal only with regular
|
|
* connections on subsequent invocations
|
|
*/
|
|
node->node_count = regular_conn_count;
|
|
|
|
/* assign gxid */
|
|
gxid = GetCurrentTransactionIdIfAny();
|
|
|
|
#ifdef STREAMPLAN
|
|
if (step->is_simple) {
|
|
StringInfoData str_remoteplan;
|
|
initStringInfo(&str_remoteplan);
|
|
elog(DEBUG5,
|
|
"DN Node Id %d, Thread ID:%lu, queryId: %lu, query: %s",
|
|
u_sess->pgxc_cxt.PGXCNodeId,
|
|
gs_thread_self(),
|
|
planstmt->queryId,
|
|
t_thrd.postgres_cxt.debug_query_string ? t_thrd.postgres_cxt.debug_query_string : "");
|
|
|
|
planstmt->query_string =
|
|
const_cast<char*>(t_thrd.postgres_cxt.debug_query_string ? t_thrd.postgres_cxt.debug_query_string : "");
|
|
|
|
PlannedStmt* ps = (PlannedStmt*)copyObject(planstmt);
|
|
resetRelidOfRTE(ps);
|
|
|
|
ps->in_compute_pool = true;
|
|
|
|
Plan* pushdown_plan = (Plan*)copyObject(step->scan.plan.lefttree);
|
|
ForeignScan* scan_plan = (ForeignScan*)get_foreign_scan(pushdown_plan);
|
|
scan_plan->in_compute_pool = true;
|
|
scan_plan->errCache = NULL;
|
|
|
|
SerializePlan(pushdown_plan, ps, &str_remoteplan, step->num_stream, step->num_gather, false);
|
|
step->sql_statement = str_remoteplan.data;
|
|
}
|
|
#endif
|
|
|
|
Assert(u_sess->debug_query_id != 0);
|
|
pgstat_report_queryid(u_sess->debug_query_id);
|
|
|
|
for (int i = 0; i < regular_conn_count; i++) {
|
|
if (srvtype == T_OBS_SERVER || srvtype == T_TXT_CSV_OBS_SERVER) {
|
|
ComputePoolConfig** conf = get_cp_conninfo();
|
|
|
|
char* file_format = getFTOptionValue(options->fOptions, OPTION_NAME_FORMAT);
|
|
Assert(file_format);
|
|
|
|
uint64 pl_size = conf[0]->pl;
|
|
pl_size *= 1024;
|
|
|
|
Index index = ((ForeignScan*)fsplan)->scan.scanrelid;
|
|
RangeTblEntry* rte = (RangeTblEntry*)list_nth(planstmt->rtable, index - 1);
|
|
|
|
if (!pg_strcasecmp(file_format, "orc"))
|
|
pl_size = adjust_plsize(rte->relid, (uint64)fsplan->plan_width, pl_size, NULL);
|
|
|
|
if (pgxc_node_send_userpl(connections[i], int64(pl_size)))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Could not send user pl to CN of the compute pool: %s.",
|
|
connections[i]->connInfo.host.data)));
|
|
}
|
|
|
|
if (compute_node_begin(1, &connections[i], gxid))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Could not begin transaction on Datanodes %u.", connections[i]->nodeoid)));
|
|
|
|
if (!pgxc_start_command_on_connection(connections[i], node, NULL)) {
|
|
pfree_ext(connections);
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to send command to Datanodes")));
|
|
}
|
|
connections[i]->combiner = node;
|
|
|
|
// If the command was sent to the Datanodes successfully, outEnd must be 0.
|
|
// So the outBuffer can be freed here and reset to default buffer size 16K.
|
|
//
|
|
Assert(connections[i]->outEnd == 0);
|
|
ResetHandleOutBuffer(connections[i]);
|
|
}
|
|
|
|
do_query_for_first_tuple(node, vectorized, regular_conn_count, connections, NULL, NIL);
|
|
|
|
/* reset */
|
|
if (step->is_simple)
|
|
step->sql_statement = NULL;
|
|
|
|
return false;
|
|
}
|
|
|
|
bool do_query_for_planrouter(RemoteQueryState* node, bool vectorized)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return false;
|
|
}
|
|
|
|
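/*
* Main workhorse of remote query execution on the Coordinator: acquire
* Datanode/Coordinator connections for the step, begin transactions where
* needed, send the query (primary node first) and start consuming
* responses into the combiner.
*/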
void do_query(RemoteQueryState* node, bool vectorized)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
RemoteQuery* step = (RemoteQuery*)node->ss.ps.plan;
|
|
TupleTableSlot* scanslot = node->ss.ss_ScanTupleSlot;
|
|
bool force_autocommit = step->force_autocommit;
|
|
bool is_read_only = step->read_only;
|
|
GlobalTransactionId gxid = InvalidGlobalTransactionId;
|
|
Snapshot snapshot = GetActiveSnapshot();
|
|
PGXCNodeHandle** connections = NULL;
|
|
PGXCNodeHandle* primaryconnection = NULL;
|
|
int i;
|
|
int regular_conn_count = 0;
|
|
bool need_tran_block = false;
|
|
PGXCNodeAllHandles* pgxc_connections = NULL;
|
|
|
|
/*
|
|
* An openGauss node cannot run transactions while in recovery as
|
|
* this operation needs transaction IDs. This is more a safety guard than anything else.
|
|
*/
|
|
if (RecoveryInProgress())
|
|
elog(ERROR, "cannot run transaction to remote nodes during recovery");
|
|
|
|
/*
|
|
* Remember if the remote query is accessing a temp object
|
|
*
|
|
* !! PGXC TODO Check if the is_temp flag is propagated correctly when a
|
|
* remote join is reduced
|
|
*/
|
|
if (step->is_temp)
|
|
ExecSetTempObjectIncluded();
|
|
|
|
/*
|
|
* Consider a test case
|
|
*
|
|
* create table rf(a int, b int) distributed by replication;
|
|
* insert into rf values(1,2),(3,4) returning ctid;
|
|
*
|
|
* While inserting the first row do_query works fine, receives the returned
|
|
* row from the first connection and returns it. In this iteration the other
|
|
* datanodes also had returned rows but they have not yet been read from the
|
|
* network buffers. On Next Iteration do_query does not enter the data
|
|
* receiving loop because it finds that node->connections is not null.
|
|
* It is therefore required to set node->connections to null here.
|
|
*/
|
|
if (node->conn_count == 0)
|
|
node->connections = NULL;
|
|
|
|
/*
|
|
* Get connections for Datanodes only, utilities and DDLs
|
|
* are launched in ExecRemoteUtility
|
|
*/
|
|
pgxc_connections = get_exec_connections(node, step->exec_nodes, step->exec_type);
|
|
|
|
if (step->exec_type == EXEC_ON_DATANODES) {
|
|
connections = pgxc_connections->datanode_handles;
|
|
regular_conn_count = pgxc_connections->dn_conn_count;
|
|
} else if (step->exec_type == EXEC_ON_COORDS) {
|
|
connections = pgxc_connections->coord_handles;
|
|
regular_conn_count = pgxc_connections->co_conn_count;
|
|
}
|
|
|
|
primaryconnection = pgxc_connections->primary_handle;
|
|
|
|
/* Primary connection is counted separately */
|
|
if (primaryconnection)
|
|
regular_conn_count--;
|
|
|
|
pfree(pgxc_connections);
|
|
|
|
/*
|
|
 * We save only regular connections; by the time we exit the function we
 * have finished with the primary connection and deal only with regular
 * connections on subsequent invocations.
|
|
*/
|
|
node->node_count = regular_conn_count;
|
|
|
|
if (force_autocommit || is_read_only)
|
|
need_tran_block = false;
|
|
else
|
|
need_tran_block = true;
|
|
/*
|
|
 * XXX We are forcing a transaction block for every non-read-only remote query. We can
|
|
* get smarter here and avoid a transaction block if all of the following
|
|
* conditions are true:
|
|
*
|
|
* - there is only one writer node involved in the transaction (including
|
|
* the local node)
|
|
* - the statement being executed on the remote writer node is a single
|
|
* step statement. IOW, Coordinator must not send multiple queries to the
|
|
* remote node.
|
|
*
|
|
* Once we have leak-proof mechanism to enforce these constraints, we
|
|
* should relax the transaction block requirement.
|
|
*
|
|
need_tran_block = (!is_read_only && total_conn_count > 1) ||
|
|
(TransactionBlockStatusCode() == 'T');
|
|
*/
|
|
|
|
elog(DEBUG1,
|
|
"has primary = %s, regular_conn_count = %d, "
|
|
"need_tran_block = %s",
|
|
primaryconnection ? "true" : "false",
|
|
regular_conn_count,
|
|
need_tran_block ? "true" : "false");
|
|
|
|
gxid = GetCurrentTransactionId();
|
|
|
|
if (!GlobalTransactionIdIsValid(gxid)) {
|
|
if (primaryconnection)
|
|
pfree(primaryconnection);
|
|
pfree(connections);
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to get next transaction ID")));
|
|
}
|
|
|
|
/* See if we have a primary node, execute on it first before the others */
|
|
if (primaryconnection) {
|
|
if (pgxc_node_begin(1, &primaryconnection, gxid, need_tran_block, is_read_only, PGXC_NODE_DATANODE))
|
|
ereport(
|
|
ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on primary Datanode.")));
|
|
|
|
if (!pgxc_start_command_on_connection(primaryconnection, node, snapshot)) {
|
|
pfree(connections);
|
|
pfree(primaryconnection);
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to Datanodes")));
|
|
}
|
|
Assert(node->combine_type == COMBINE_TYPE_SAME);
|
|
|
|
/* Make sure the command is completed on the primary node */
|
|
while (true) {
|
|
int res;
|
|
if (pgxc_node_receive(1, &primaryconnection, NULL))
|
|
break;
|
|
|
|
res = handle_response(primaryconnection, node);
|
|
if (res == RESPONSE_COMPLETE)
|
|
break;
|
|
else if (res == RESPONSE_TUPDESC) {
|
|
ExecSetSlotDescriptor(scanslot, node->tuple_desc);
|
|
/*
|
|
* Now tuple table slot is responsible for freeing the
|
|
* descriptor
|
|
*/
|
|
node->tuple_desc = NULL;
|
|
/*
|
|
 * RemoteQuery node doesn't support backward scans, so
 * randomAccess is false; nor do we want this tuple store
 * to persist across transactions.
|
|
*/
|
|
node->tuplestorestate = tuplestore_begin_heap(false, false, work_mem);
|
|
tuplestore_set_eflags(node->tuplestorestate, node->eflags);
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
pfree(node->currentRow.msg);
|
|
node->currentRow.msg = NULL;
|
|
node->currentRow.msglen = 0;
|
|
node->currentRow.msgnode = 0;
|
|
continue;
|
|
} else if (res == RESPONSE_EOF)
|
|
continue;
|
|
else
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Unexpected response from Datanode")));
|
|
}
|
|
/* report error if any */
|
|
pgxc_node_report_error(node);
|
|
}
|
|
|
|
for (i = 0; i < regular_conn_count; i++) {
|
|
if (pgxc_node_begin(1, &connections[i], gxid, need_tran_block, is_read_only, PGXC_NODE_DATANODE))
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on Datanodes.")));
|
|
|
|
if (!pgxc_start_command_on_connection(connections[i], node, snapshot)) {
|
|
pfree(connections);
|
|
if (primaryconnection)
|
|
pfree(primaryconnection);
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to Datanodes")));
|
|
}
|
|
connections[i]->combiner = node;
|
|
}
|
|
|
|
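    /*
     * If the step declares a cursor, remember the connections it uses so they
     * can be restored for later fetches and closed when the query ends.
     */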
if (step->cursor) {
|
|
node->cursor_count = regular_conn_count;
|
|
node->cursor_connections = (PGXCNodeHandle**)palloc(regular_conn_count * sizeof(PGXCNodeHandle*));
|
|
memcpy(node->cursor_connections, connections, regular_conn_count * sizeof(PGXCNodeHandle*));
|
|
}
|
|
|
|
/*
|
|
* Stop if all commands are completed or we got a data row and
|
|
* initialized state node for subsequent invocations
|
|
*/
|
|
while (regular_conn_count > 0 && node->connections == NULL) {
|
|
int i = 0;
|
|
|
|
if (pgxc_node_receive(regular_conn_count, connections, NULL)) {
|
|
pfree(connections);
|
|
if (primaryconnection)
|
|
pfree(primaryconnection);
|
|
if (node->cursor_connections)
|
|
pfree(node->cursor_connections);
|
|
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to read response from Datanodes")));
|
|
}
|
|
/*
|
|
* Handle input from the Datanodes.
|
|
* If we got a RESPONSE_DATAROW we can break handling to wrap
|
|
* it into a tuple and return. Handling will be continued upon
|
|
* subsequent invocations.
|
|
* If we got 0, we exclude connection from the list. We do not
|
|
* expect more input from it. In case of non-SELECT query we quit
|
|
* the loop when all nodes finish their work and send ReadyForQuery
|
|
* with empty connections array.
|
|
* If we got EOF, move to the next connection, will receive more
|
|
* data on the next iteration.
|
|
*/
|
|
while (i < regular_conn_count) {
|
|
int res = handle_response(connections[i], node);
|
|
if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --regular_conn_count)
|
|
connections[i] = connections[regular_conn_count];
|
|
} else if (res == RESPONSE_TUPDESC) {
|
|
ExecSetSlotDescriptor(scanslot, node->tuple_desc);
|
|
/*
|
|
* Now tuple table slot is responsible for freeing the
|
|
* descriptor
|
|
*/
|
|
node->tuple_desc = NULL;
|
|
/*
|
|
 * RemoteQuery node doesn't support backward scans, so
 * randomAccess is false; nor do we want this tuple store
 * to persist across transactions.
|
|
*/
|
|
node->tuplestorestate = tuplestore_begin_heap(false, false, work_mem);
|
|
tuplestore_set_eflags(node->tuplestorestate, node->eflags);
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
/*
|
|
* Got first data row, quit the loop
|
|
*/
|
|
node->connections = connections;
|
|
node->conn_count = regular_conn_count;
|
|
node->current_conn = i;
|
|
break;
|
|
} else
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Unexpected response from Datanode")));
|
|
}
|
|
/* report error if any */
|
|
pgxc_node_report_error(node);
|
|
}
|
|
|
|
if (node->cursor_count) {
|
|
node->conn_count = node->cursor_count;
|
|
memcpy(connections, node->cursor_connections, node->cursor_count * sizeof(PGXCNodeHandle*));
|
|
node->connections = connections;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
 * This code previously lived in do_query(). The old do_query() has been
 * split into do_query() and do_query_for_first_tuple() to reduce complexity.
 */
|
|
void do_query_for_first_tuple(RemoteQueryState* node, bool vectorized, int regular_conn_count,
|
|
PGXCNodeHandle** connections, PGXCNodeHandle* primaryconnection, List* dummy_connections)
|
|
{
|
|
RemoteQuery* step = (RemoteQuery*)node->ss.ps.plan;
|
|
TupleTableSlot* scanslot = node->ss.ss_ScanTupleSlot;
|
|
PlannedStmt* planstmt = node->ss.ps.state->es_plannedstmt;
|
|
uint64 max_processed = 0;
|
|
uint64 min_processed = PG_UINT64_MAX;
|
|
Oid relid = (planstmt->resultRelations == NULL)
|
|
? InvalidOid
|
|
: getrelid(list_nth_int((List*)linitial2(planstmt->resultRelations), 0), planstmt->rtable);
|
|
bool compute_dn_oids = true;
|
|
List* oids = NULL;
|
|
errno_t rc = 0;
|
|
|
|
/*
|
|
* Stop if all commands are completed or we got a data row and
|
|
* initialized state node for subsequent invocations
|
|
*/
|
|
bool isFreeConn = true;
|
|
while (regular_conn_count > 0 && node->connections == NULL) {
|
|
int i = 0;
|
|
struct timeval timeout;
|
|
timeout.tv_sec = ERROR_CHECK_TIMEOUT;
|
|
timeout.tv_usec = 0;
|
|
|
|
/*
 * If other errors still need to be checked after a normal communication
 * error, set a timeout before receiving data again. If a poll error then
 * occurs, report the error previously cached in the combiner (RemoteQueryState).
 */
|
|
if (pgxc_node_receive(regular_conn_count, connections, node->need_error_check ? &timeout : NULL)) {
|
|
pfree_ext(connections);
|
|
if (primaryconnection != NULL)
|
|
pfree_ext(primaryconnection);
|
|
if (node->cursor_connections)
|
|
pfree_ext(node->cursor_connections);
|
|
|
|
if (!node->need_error_check) {
|
|
int error_code = 0;
|
|
char* error_msg = getSocketError(&error_code);
|
|
|
|
ereport(ERROR,
|
|
(errcode(error_code), errmsg("Failed to read response from Datanodes Detail: %s\n", error_msg == NULL ? "null" : error_msg)));
|
|
} else {
|
|
node->need_error_check = false;
|
|
pgxc_node_report_error(node);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Handle input from the Datanodes.
|
|
* If we got a RESPONSE_DATAROW we can break handling to wrap
|
|
* it into a tuple and return. Handling will be continued upon
|
|
* subsequent invocations.
|
|
* If we got 0, we exclude connection from the list. We do not
|
|
* expect more input from it. In case of non-SELECT query we quit
|
|
* the loop when all nodes finish their work and send ReadyForQuery
|
|
* with empty connections array.
|
|
* If we got EOF, move to the next connection, will receive more
|
|
* data on the next iteration.
|
|
*/
|
|
while (i < regular_conn_count) {
|
|
int res = handle_response(connections[i],
|
|
node,
|
|
(dummy_connections != NIL) ? list_member_ptr(dummy_connections, connections[i]) : false);
|
|
if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
/* the response is complete but the connection state is fatal */
|
|
if (unlikely(connections[i]->state == DN_CONNECTION_STATE_ERROR_FATAL))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("FATAL state of connection to datanode %u", connections[i]->nodeoid)));
|
|
|
|
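            /*
             * For INSERT statements, track the per-Datanode processed row counts
             * so that data skew can be reported once all connections complete.
             */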
if (u_sess->attr.attr_sql.table_skewness_warning_threshold < 1 && planstmt->commandType == CMD_INSERT &&
|
|
node->combine_type != COMBINE_TYPE_SAME) {
|
|
if (compute_dn_oids) {
|
|
RelationLocInfo* rlc = (relid == InvalidOid) ? NULL : GetRelationLocInfo(relid);
|
|
oids = (rlc == NULL) ? NULL : PgxcNodeGetDataNodeOids(rlc->nodeList);
|
|
compute_dn_oids = false;
|
|
}
|
|
|
|
if (node->rqs_cur_processed > max_processed)
|
|
max_processed = node->rqs_cur_processed;
|
|
if (node->rqs_cur_processed < min_processed) {
|
|
if (node->rqs_cur_processed > 0)
|
|
min_processed = node->rqs_cur_processed;
|
|
else if (list_member_int(oids, connections[i]->nodeoid))
|
|
min_processed = 0;
|
|
}
|
|
}
|
|
|
|
if (i < --regular_conn_count)
|
|
connections[i] = connections[regular_conn_count];
|
|
} else if (res == RESPONSE_TUPDESC) {
|
|
ExecSetSlotDescriptor(scanslot, node->tuple_desc);
|
|
/*
|
|
* Now tuple table slot is responsible for freeing the
|
|
* descriptor
|
|
*/
|
|
node->tuple_desc = NULL;
|
|
/*
|
|
 * RemoteQuery node doesn't support backward scans, so
 * randomAccess is false; nor do we want this tuple store
 * to persist across transactions.
|
|
*/
|
|
node->tuplestorestate = tuplestore_begin_heap(false, false, u_sess->attr.attr_memory.work_mem);
|
|
tuplestore_set_eflags(node->tuplestorestate, node->eflags);
|
|
|
|
if (step->sort) {
|
|
SimpleSort* sort = step->sort;
|
|
node->connections = connections;
|
|
isFreeConn = false;
|
|
node->conn_count = regular_conn_count;
|
|
|
|
if (!vectorized)
|
|
node->tuplesortstate = tuplesort_begin_merge(scanslot->tts_tupleDescriptor,
|
|
sort->numCols,
|
|
sort->sortColIdx,
|
|
sort->sortOperators,
|
|
sort->sortCollations,
|
|
sort->nullsFirst,
|
|
node,
|
|
u_sess->attr.attr_memory.work_mem);
|
|
else
|
|
node->batchsortstate = batchsort_begin_merge(scanslot->tts_tupleDescriptor,
|
|
sort->numCols,
|
|
sort->sortColIdx,
|
|
sort->sortOperators,
|
|
sort->sortCollations,
|
|
sort->nullsFirst,
|
|
node,
|
|
u_sess->attr.attr_memory.work_mem);
|
|
|
|
/*
|
|
 * Break out of the loop and do not wait for the first row.
 * The tuplesort module wants to control which node it fetches
 * rows from, whereas in this loop the first row would come
 * from a random node.
|
|
*/
|
|
break;
|
|
}
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
/*
|
|
* Got first data row, quit the loop
|
|
*/
|
|
node->connections = connections;
|
|
isFreeConn = false;
|
|
node->conn_count = regular_conn_count;
|
|
node->current_conn = i;
|
|
break;
|
|
} else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from Datanode %u", connections[i]->nodeoid)));
|
|
}
|
|
|
|
/* report error if any */
|
|
pgxc_node_report_error(node);
|
|
}
|
|
|
|
/* Raise a table skewness warning alarm if skew is detected. */
|
|
if (u_sess->attr.attr_sql.table_skewness_warning_threshold < 1 && planstmt->commandType == CMD_INSERT &&
|
|
node->combine_type != COMBINE_TYPE_SAME &&
|
|
node->rqs_processed > (uint64)(u_sess->attr.attr_sql.table_skewness_warning_rows * planstmt->num_nodes) &&
|
|
(max_processed - min_processed) >
|
|
node->rqs_processed * u_sess->attr.attr_sql.table_skewness_warning_threshold) {
|
|
char tableInfo[256] = {'\0'};
|
|
char* dbName = NULL;
|
|
errno_t ret = EOK;
|
|
if (u_sess->proc_cxt.MyProcPort)
|
|
dbName = u_sess->proc_cxt.MyProcPort->database_name;
|
|
else
|
|
dbName = "[unknown]";
|
|
ret = sprintf_s(tableInfo, sizeof(tableInfo), "%s.%s", dbName, get_nsp_relname(relid));
|
|
securec_check_ss_c(ret, "\0", "\0");
|
|
if (get_rel_relkind(relid) != RELKIND_FOREIGN_TABLE && get_rel_relkind(relid) != RELKIND_STREAM)
|
|
ereport(WARNING,
|
|
(errmsg("Skewness occurs, table name: %s, min value: %lu, max value: %lu, sum value: %lu, avg value: "
|
|
"%lu, skew ratio: %.3lf",
|
|
tableInfo,
|
|
min_processed,
|
|
max_processed,
|
|
node->rqs_processed,
|
|
(list_length(oids) != 0) ? (node->rqs_processed / list_length(oids)) : 0,
|
|
(double)(max_processed - min_processed) / (double)(node->rqs_processed)),
|
|
errhint("Please check data distribution or modify warning threshold")));
|
|
for (int j = 0; j < ALARM_RETRY_COUNT; j++) {
|
|
report_table_skewness_alarm(ALM_AT_Fault, tableInfo);
|
|
}
|
|
}
|
|
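    /*
     * If a cursor was opened, restore the saved cursor connections into the
     * working array so subsequent invocations see all of them.
     */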
if (node->cursor_count && node->cursor_connections) {
|
|
node->conn_count = node->cursor_count;
|
|
rc = memcpy_s(connections,
|
|
node->cursor_count * sizeof(PGXCNodeHandle*),
|
|
node->cursor_connections,
|
|
node->cursor_count * sizeof(PGXCNodeHandle*));
|
|
securec_check(rc, "\0", "\0");
|
|
node->connections = connections;
|
|
}
|
|
|
|
/* Must free connections if they are not used */
|
|
if (isFreeConn) {
|
|
pfree_ext(connections);
|
|
connections = NULL;
|
|
}
|
|
if (dummy_connections != NULL)
|
|
list_free_ext(dummy_connections);
|
|
}
|
|
|
|
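/*
 * ExecRemoteQuery
 *    Executor entry point for the RemoteQuery node. Only meaningful in
 *    multiple-nodes builds, where it drives the scan through RemoteQueryNext.
 */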
TupleTableSlot* ExecRemoteQuery(RemoteQueryState* step)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
#else
|
|
return ExecScan(&(node->ss), (ExecScanAccessMtd)RemoteQueryNext, (ExecScanRecheckMtd)RemoteQueryRecheck);
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* RemoteQueryRecheck -- remote query routine to recheck a tuple in EvalPlanQual
|
|
*/
|
|
static bool RemoteQueryRecheck(RemoteQueryState* node, TupleTableSlot* slot)
|
|
{
|
|
/*
 * Note that unlike IndexScan, RemoteQueryScan never uses keys in
 * tableam_scan_begin (and this is very bad), so we do not check here
 * whether the keys are satisfied.
 */
|
|
return true;
|
|
}
|
|
/*
|
|
* Execute step of PGXC plan.
|
|
* The step specifies a command to be executed on specified nodes.
|
|
* On first invocation connections to the Datanodes are initialized and
|
|
* command is executed. Further, as well as within subsequent invocations,
|
|
* responses are received until step is completed or there is a tuple to emit.
|
|
 * If there is a tuple it is returned, otherwise NULL is returned. A NULL
 * result from the function indicates that the step has completed.
|
|
* The function returns at most one tuple per invocation.
|
|
*/
|
|
static TupleTableSlot* RemoteQueryNext(ScanState* scan_node)
|
|
{
|
|
PlanState* outerNode = NULL;
|
|
|
|
RemoteQueryState* node = (RemoteQueryState*)scan_node;
|
|
TupleTableSlot* scanslot = scan_node->ss_ScanTupleSlot;
|
|
RemoteQuery* rq = (RemoteQuery*)node->ss.ps.plan;
|
|
EState* estate = node->ss.ps.state;
|
|
|
|
/*
|
|
* Initialize tuples processed to 0, to make sure we don't re-use the
|
|
* values from the earlier iteration of RemoteQueryNext(). For an FQS'ed
|
|
 * DML returning query, it may not get updated for subsequent calls
 * because there won't be a HandleCommandComplete() call to update this
|
|
* field.
|
|
*/
|
|
node->rqs_processed = 0;
|
|
|
|
if (!node->query_Done) {
|
|
/* Fire BEFORE STATEMENT triggers just before the query execution */
|
|
if (rq->remote_query)
|
|
pgxc_rq_fire_bstriggers(node);
|
|
|
|
if (rq->position == PLAN_ROUTER) {
|
|
if (do_query_for_planrouter(node))
|
|
return NULL;
|
|
} else if (rq->position == SCAN_GATHER) {
|
|
do_query_for_scangather(node);
|
|
} else {
|
|
do_query(node);
|
|
}
|
|
|
|
node->query_Done = true;
|
|
}
|
|
|
|
if (rq->position == PLAN_ROUTER && node->resource_error == true) {
|
|
outerNode = outerPlanState(outerPlanState(node)); /* skip scangather node */
|
|
scanslot = ExecProcNode(outerNode);
|
|
} else if (node->update_cursor) {
|
|
PGXCNodeAllHandles* all_dn_handles = get_exec_connections(node, NULL, EXEC_ON_DATANODES);
|
|
close_node_cursors(all_dn_handles->datanode_handles, all_dn_handles->dn_conn_count, node->update_cursor);
|
|
pfree_ext(node->update_cursor);
|
|
node->update_cursor = NULL;
|
|
pfree_pgxc_all_handles(all_dn_handles);
|
|
} else if (rq->is_simple && node->tuplesortstate) {
|
|
if (rq->sort->sortToStore) {
|
|
Assert(node->tuplestorestate);
|
|
Tuplestorestate* tuplestorestate = node->tuplestorestate;
|
|
bool eof_tuplestore = tuplestore_ateof(tuplestorestate);
|
|
|
|
/*
|
|
* If we can fetch another tuple from the tuplestore, return it.
|
|
*/
|
|
if (!eof_tuplestore) {
|
|
/* RemoteQuery node doesn't support backward scans */
|
|
if (!tuplestore_gettupleslot(tuplestorestate, true, false, scanslot))
|
|
eof_tuplestore = true;
|
|
}
|
|
|
|
/*
|
|
 * If there is no tuple in the tuplestore but there are tuples that have not
 * been processed yet, sort them and put them into the tuplestore too.
|
|
*/
|
|
if (eof_tuplestore && (!node->eof_underlying || (node->currentRow.msg != NULL))) {
|
|
if (!tuplesort_gettupleslot_into_tuplestore(
|
|
(Tuplesortstate*)node->tuplesortstate, true, scanslot, NULL, node->tuplestorestate)) {
|
|
(void)ExecClearTuple(scanslot);
|
|
node->eof_underlying = true;
|
|
}
|
|
}
|
|
|
|
if (eof_tuplestore && node->eof_underlying)
|
|
(void)ExecClearTuple(scanslot);
|
|
} else
|
|
(void)tuplesort_gettupleslot((Tuplesortstate*)node->tuplesortstate, true, scanslot, NULL);
|
|
} else if (rq->spool_no_data == true) { // for simple remotequery, we just pass the data, no need to spool
|
|
node->fetchTuple(node, scanslot, NULL);
|
|
} else if (node->tuplestorestate) {
|
|
/*
|
|
* If we are not at the end of the tuplestore, try
|
|
* to fetch a tuple from tuplestore.
|
|
*/
|
|
Tuplestorestate* tuplestorestate = node->tuplestorestate;
|
|
bool eof_tuplestore = tuplestore_ateof(tuplestorestate);
|
|
|
|
/*
|
|
* If we can fetch another tuple from the tuplestore, return it.
|
|
*/
|
|
if (!eof_tuplestore) {
|
|
/* RemoteQuery node doesn't support backward scans */
|
|
if (!tuplestore_gettupleslot(tuplestorestate, true, false, scanslot))
|
|
eof_tuplestore = true;
|
|
}
|
|
|
|
/*
|
|
* Consider a test case
|
|
*
|
|
* create table ta1 (v1 int, v2 int);
|
|
* insert into ta1 values(1,2),(2,3),(3,4);
|
|
*
|
|
* create table ta2 (v1 int, v2 int);
|
|
* insert into ta2 values(1,2),(2,3),(3,4);
|
|
*
|
|
* select t1.ctid, t2.ctid,* from ta1 t1, ta2 t2
|
|
* where t2.v2<=3 order by t1.v1;
|
|
* ctid | ctid | v1 | v2 | v1 | v2
|
|
* -------+-------+----+----+----+----
|
|
* Row_1 (0,1) | (0,1) | 1 | 2 | 1 | 2
|
|
* Row_2 (0,1) | (0,2) | 1 | 2 | 2 | 3
|
|
* Row_3 (0,2) | (0,1) | 2 | 3 | 1 | 2
|
|
* Row_4 (0,2) | (0,2) | 2 | 3 | 2 | 3
|
|
* Row_5 (0,1) | (0,1) | 3 | 4 | 1 | 2
|
|
* Row_6 (0,1) | (0,2) | 3 | 4 | 2 | 3
|
|
* (6 rows)
|
|
*
|
|
* Note that in the resulting join, we are getting one row of ta1 twice,
|
|
* as shown by the ctid's in the results. Now consider this update
|
|
*
|
|
* update ta1 t1 set v2=t1.v2+10 from ta2 t2
|
|
* where t2.v2<=3 returning t1.ctid,t1.v1 t1_v1, t1.v2 t1_v2;
|
|
*
|
|
* The first iteration of the update runs for Row_1, succeeds and
|
|
* updates its ctid to say (0,3). In the second iteration for Row_2,
|
|
* since the ctid of the row has already changed, fails to update any
|
|
* row and hence do_query does not return any tuple. The FetchTuple
|
|
* call in RemoteQueryNext hence fails and eof_underlying is set to true.
|
|
* However in the third iteration for Row_3, the update succeeds and
|
|
* returns a row, but since the eof_underlying is already set to true,
|
|
* the RemoteQueryNext does not bother calling FetchTuple, we therefore
|
|
* do not get more than one row returned as a result of the update
|
|
* returning query. It is therefore required in RemoteQueryNext to call
|
|
* FetchTuple in case do_query has copied a row in node->currentRow.msg.
|
|
* Also we have to reset the eof_underlying flag every time
|
|
* FetchTuple succeeds to clear any previously set status.
|
|
*/
|
|
if (eof_tuplestore && (!node->eof_underlying || (node->currentRow.msg != NULL))) {
|
|
/*
|
|
* If tuplestore has reached its end but the underlying RemoteQueryNext() hasn't
|
|
* finished yet, try to fetch another row.
|
|
*/
|
|
if (node->fetchTuple(node, scanslot, NULL)) {
|
|
/* See comments a couple of lines above */
|
|
node->eof_underlying = false;
|
|
/*
|
|
* Append a copy of the returned tuple to tuplestore. NOTE: because
|
|
* the tuplestore is certainly in EOF state, its read position will
|
|
* move forward over the added tuple. This is what we want.
|
|
*/
|
|
if (tuplestorestate && !TupIsNull(scanslot))
|
|
tuplestore_puttupleslot(tuplestorestate, scanslot);
|
|
} else
|
|
node->eof_underlying = true;
|
|
}
|
|
|
|
if (eof_tuplestore && node->eof_underlying)
|
|
(void)ExecClearTuple(scanslot);
|
|
} else
|
|
(void)ExecClearTuple(scanslot);
|
|
|
|
/* The remote query has finished, so reset the flag. */
|
|
if (TupIsNull(scanslot))
|
|
node->need_error_check = false;
|
|
|
|
/* report error if any */
|
|
pgxc_node_report_error(node);
|
|
|
|
/*
|
|
* Now we know the query is successful. Fire AFTER STATEMENT triggers. Make
|
|
* sure this is the last iteration of the query. If an FQS query has
|
|
* RETURNING clause, this function can be called multiple times until we
|
|
* return NULL.
|
|
*/
|
|
if (TupIsNull(scanslot) && rq->remote_query)
|
|
pgxc_rq_fire_astriggers(node);
|
|
|
|
/*
|
|
* If it's an FQSed DML query for which command tag is to be set,
|
|
* then update estate->es_processed. For other queries, the standard
|
|
 * executor takes care of it; namely, in ExecModifyTable for DML queries
|
|
* and ExecutePlan for SELECT queries.
|
|
*/
|
|
if (rq->remote_query != NULL && rq->remote_query->canSetTag && !rq->rq_params_internal &&
|
|
(rq->remote_query->commandType == CMD_INSERT || rq->remote_query->commandType == CMD_UPDATE ||
|
|
rq->remote_query->commandType == CMD_DELETE || rq->remote_query->commandType == CMD_MERGE))
|
|
estate->es_processed += node->rqs_processed;
|
|
|
|
/*
|
|
* We only handle stream plan && RemoteQuery as root && DML's tag here.
|
|
* Other are handled by ExecModifyTable and ExecutePlan
|
|
*/
|
|
if (rq->is_simple && (Plan*)rq == estate->es_plannedstmt->planTree &&
|
|
(estate->es_plannedstmt->commandType == CMD_INSERT || estate->es_plannedstmt->commandType == CMD_UPDATE ||
|
|
estate->es_plannedstmt->commandType == CMD_DELETE || estate->es_plannedstmt->commandType == CMD_MERGE))
|
|
estate->es_processed += node->rqs_processed;
|
|
|
|
/* early free the connection to the compute pool */
|
|
if (TupIsNull(scanslot) && PLAN_ROUTER == rq->position) {
|
|
release_conn_to_compute_pool();
|
|
}
|
|
|
|
return scanslot;
|
|
}
|
|
|
|
/*
|
|
 * @Description: Pack all uncompleted connections together
|
|
* @in node - remotequerystate info
|
|
*
|
|
* @out - int, active count
|
|
*/
|
|
int PackConnections(RemoteQueryState* node)
|
|
{
|
|
int active_count = 0;
|
|
RemoteQuery* step = (RemoteQuery*)node->ss.ps.plan;
|
|
PGXCNodeAllHandles* pgxc_handles = NULL;
|
|
|
|
/* refresh the connections due to connections switch */
|
|
pgxc_handles = get_handles(step->exec_nodes->nodeList, NULL, false);
|
|
node->connections = pgxc_handles->datanode_handles;
|
|
|
|
for (int i = 0; i < pgxc_handles->dn_conn_count; i++) {
|
|
if (node->connections[i] == NULL)
|
|
continue;
|
|
|
|
if (node->connections[i]->state == DN_CONNECTION_STATE_QUERY) {
|
|
/* Buffer the connection first if its combiner differs from the current one */
|
|
if (node->connections[i]->combiner != node) {
|
|
BufferConnection(node->connections[i]);
|
|
continue;
|
|
} else {
|
|
node->connections[active_count] = node->connections[i];
|
|
active_count++;
|
|
}
|
|
}
|
|
}
|
|
|
|
return active_count;
|
|
}
|
|
|
|
inline static uint64 GetQueryidFromRemoteQuery(RemoteQueryState* node)
|
|
{
|
|
if (node->ss.ps.state != NULL && node->ss.ps.state->es_plannedstmt != NULL) {
|
|
return node->ss.ps.state->es_plannedstmt->queryId;
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
void ExecEndRemoteQuery(RemoteQueryState* node, bool pre_end)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
ListCell* lc = NULL;
|
|
|
|
/* clean up the buffer */
|
|
foreach (lc, node->rowBuffer) {
|
|
RemoteDataRow dataRow = (RemoteDataRow)lfirst(lc);
|
|
pfree(dataRow->msg);
|
|
}
|
|
list_free_deep(node->rowBuffer);
|
|
|
|
node->current_conn = 0;
|
|
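    /*
     * Drain any remaining responses from the Datanode connections so they are
     * left in a clean state before the combiner is closed.
     */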
while (node->conn_count > 0) {
|
|
int res;
|
|
PGXCNodeHandle* conn = node->connections[node->current_conn];
|
|
|
|
/* throw away message */
|
|
if (node->currentRow.msg) {
|
|
pfree(node->currentRow.msg);
|
|
node->currentRow.msg = NULL;
|
|
}
|
|
|
|
if (conn == NULL) {
|
|
node->conn_count--;
|
|
continue;
|
|
}
|
|
|
|
/* no data is expected */
|
|
if (conn->state == DN_CONNECTION_STATE_IDLE || conn->state == DN_CONNECTION_STATE_ERROR_FATAL) {
|
|
if (node->current_conn < --node->conn_count)
|
|
node->connections[node->current_conn] = node->connections[node->conn_count];
|
|
continue;
|
|
}
|
|
res = handle_response(conn, node);
|
|
if (res == RESPONSE_EOF) {
|
|
struct timeval timeout;
|
|
timeout.tv_sec = END_QUERY_TIMEOUT;
|
|
timeout.tv_usec = 0;
|
|
|
|
if (pgxc_node_receive(1, &conn, &timeout))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INTERNAL_ERROR),
|
|
errmsg("Failed to read response from Datanodes when ending query")));
|
|
}
|
|
}
|
|
|
|
if (node->tuplestorestate != NULL)
|
|
(void)ExecClearTuple(node->ss.ss_ScanTupleSlot);
|
|
/*
|
|
* Release tuplestore resources
|
|
*/
|
|
if (node->tuplestorestate != NULL)
|
|
tuplestore_end(node->tuplestorestate);
|
|
node->tuplestorestate = NULL;
|
|
|
|
/*
|
|
* If there are active cursors close them
|
|
*/
|
|
if (node->cursor || node->update_cursor) {
|
|
PGXCNodeAllHandles* all_handles = NULL;
|
|
PGXCNodeHandle** cur_handles;
|
|
bool bFree = false;
|
|
int nCount;
|
|
int i;
|
|
|
|
cur_handles = node->cursor_connections;
|
|
nCount = node->cursor_count;
|
|
|
|
for (i = 0; i < node->cursor_count; i++) {
|
|
if (node->cursor_connections == NULL || node->cursor_connections[i]->sock == -1) {
|
|
bFree = true;
|
|
all_handles = get_exec_connections(node, NULL, EXEC_ON_DATANODES);
|
|
cur_handles = all_handles->datanode_handles;
|
|
nCount = all_handles->dn_conn_count;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (node->cursor) {
|
|
close_node_cursors(cur_handles, nCount, node->cursor);
|
|
pfree(node->cursor);
|
|
node->cursor = NULL;
|
|
}
|
|
|
|
if (node->update_cursor) {
|
|
close_node_cursors(cur_handles, nCount, node->update_cursor);
|
|
pfree(node->update_cursor);
|
|
node->update_cursor = NULL;
|
|
}
|
|
|
|
if (bFree)
|
|
pfree_pgxc_all_handles(all_handles);
|
|
}
|
|
|
|
/*
|
|
* Clean up parameters if they were set
|
|
*/
|
|
if (node->paramval_data) {
|
|
pfree(node->paramval_data);
|
|
node->paramval_data = NULL;
|
|
node->paramval_len = 0;
|
|
}
|
|
|
|
/* Free the param types if they are newly allocated */
|
|
if (node->rqs_param_types && node->rqs_param_types != ((RemoteQuery*)node->ss.ps.plan)->rq_param_types) {
|
|
pfree(node->rqs_param_types);
|
|
node->rqs_param_types = NULL;
|
|
node->rqs_num_params = 0;
|
|
}
|
|
|
|
if (node->ss.ss_currentRelation)
|
|
ExecCloseScanRelation(node->ss.ss_currentRelation);
|
|
|
|
CloseCombiner(node);
|
|
#endif
|
|
}
|
|
|
|
static void close_node_cursors(PGXCNodeHandle** connections, int conn_count, const char* cursor)
|
|
{
|
|
int i;
|
|
RemoteQueryState* combiner = NULL;
|
|
|
|
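    /*
     * Send a Close message for the named cursor followed by Sync on every
     * connection; failures are reported as warnings and handling continues.
     */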
for (i = 0; i < conn_count; i++) {
|
|
if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(connections[i]);
|
|
if (pgxc_node_send_close(connections[i], false, cursor) != 0)
|
|
ereport(WARNING,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to close Datanode %u cursor", connections[i]->nodeoid)));
|
|
if (pgxc_node_send_sync(connections[i]) != 0)
|
|
ereport(WARNING,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to close Datanode %u cursor", connections[i]->nodeoid)));
|
|
}
|
|
|
|
combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE);
|
|
|
|
while (conn_count > 0) {
|
|
if (pgxc_node_receive(conn_count, connections, NULL))
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to close Datanode cursor")));
|
|
i = 0;
|
|
while (i < conn_count) {
|
|
int res = handle_response(connections[i], combiner);
|
|
if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (--conn_count > i)
|
|
connections[i] = connections[conn_count];
|
|
} else {
|
|
// Unexpected response, ignore?
|
|
}
|
|
}
|
|
}
|
|
|
|
(void)ValidateAndCloseCombiner(combiner);
|
|
}
|
|
|
|
/*
|
|
* Encode parameter values to format of DataRow message (the same format is
|
|
* used in Bind) to prepare for sending down to Datanodes.
|
|
* The data row is copied to RemoteQueryState.paramval_data.
|
|
*/
|
|
void SetDataRowForExtParams(ParamListInfo paraminfo, RemoteQueryState* rq_state)
|
|
{
|
|
StringInfoData buf;
|
|
uint16 n16;
|
|
int i;
|
|
int real_num_params = 0;
|
|
RemoteQuery* node = (RemoteQuery*)rq_state->ss.ps.plan;
|
|
|
|
/* If there are no parameters, there is no data to BIND. */
|
|
if (!paraminfo)
|
|
return;
|
|
|
|
/*
|
|
* If this query has been generated internally as a part of two-step DML
|
|
* statement, it uses only the internal parameters for input values taken
|
|
* from the source data, and it never uses external parameters. So even if
|
|
* parameters were being set externally, they won't be present in this
|
|
* statement (they might be present in the source data query). In such
|
|
* case where parameters refer to the values returned by SELECT query, the
|
|
* parameter data and parameter types would be set in SetDataRowForIntParams().
|
|
*/
|
|
if (node->rq_params_internal)
|
|
return;
|
|
|
|
Assert(!rq_state->paramval_data);
|
|
|
|
/*
|
|
* It is necessary to fetch parameters
|
|
* before looking at the output value.
|
|
*/
|
|
for (i = 0; i < paraminfo->numParams; i++) {
|
|
ParamExternData* param = NULL;
|
|
|
|
param = ¶minfo->params[i];
|
|
|
|
if (!OidIsValid(param->ptype) && paraminfo->paramFetch != NULL)
|
|
(*paraminfo->paramFetch)(paraminfo, i + 1);
|
|
|
|
/*
|
|
 * This is the last parameter found to be useful, so all the preceding
 * parameters must be included to keep the remote nodes quiet. Any
 * parameter before the last usable one that has no type available is
 * treated as a NULL entry.
|
|
*/
|
|
if (OidIsValid(param->ptype))
|
|
real_num_params = i + 1;
|
|
}
|
|
|
|
/*
|
|
* If there are no parameters available, simply leave.
|
|
* This is possible in the case of a query called through SPI
|
|
* and using no parameters.
|
|
*/
|
|
if (real_num_params == 0) {
|
|
rq_state->paramval_data = NULL;
|
|
rq_state->paramval_len = 0;
|
|
return;
|
|
}
|
|
|
|
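    /*
     * Build the parameter data in DataRow/Bind wire format: a 16-bit parameter
     * count followed by, for each parameter, a 32-bit length (-1 for NULL) and
     * the value rendered through its type output function.
     */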
initStringInfo(&buf);
|
|
|
|
/* Number of parameter values */
|
|
n16 = htons(real_num_params);
|
|
appendBinaryStringInfo(&buf, (char*)&n16, 2);
|
|
|
|
/* Parameter values */
|
|
for (i = 0; i < real_num_params; i++) {
|
|
ParamExternData* param = ¶minfo->params[i];
|
|
uint32 n32;
|
|
|
|
/*
|
|
 * Parameters with no type are considered NULL and treated as integers.
 * The same trick is used for dropped columns in remote DML generation.
|
|
*/
|
|
if (param->isnull || !OidIsValid(param->ptype)) {
|
|
n32 = htonl(~0);
|
|
appendBinaryStringInfo(&buf, (char*)&n32, 4);
|
|
} else {
|
|
Oid typOutput;
|
|
bool typIsVarlena = false;
|
|
Datum pval;
|
|
char* pstring = NULL;
|
|
int len;
|
|
|
|
/* Get info needed to output the value */
|
|
getTypeOutputInfo(param->ptype, &typOutput, &typIsVarlena);
|
|
|
|
/*
|
|
* If we have a toasted datum, forcibly detoast it here to avoid
|
|
* memory leakage inside the type's output routine.
|
|
*/
|
|
if (typIsVarlena)
|
|
pval = PointerGetDatum(PG_DETOAST_DATUM(param->value));
|
|
else
|
|
pval = param->value;
|
|
|
|
/* Convert Datum to string */
|
|
pstring = OidOutputFunctionCall(typOutput, pval);
|
|
|
|
/* copy data to the buffer */
|
|
len = strlen(pstring);
|
|
n32 = htonl(len);
|
|
appendBinaryStringInfo(&buf, (char*)&n32, 4);
|
|
appendBinaryStringInfo(&buf, pstring, len);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* If parameter types are not already set, infer them from
|
|
* the paraminfo.
|
|
*/
|
|
if (node->rq_num_params > 0) {
|
|
/*
|
|
* Use the already known param types for BIND. Parameter types
|
|
* can be already known when the same plan is executed multiple
|
|
* times.
|
|
*/
|
|
if (node->rq_num_params != real_num_params)
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
errmsg("Number of user-supplied parameters do not match the number of remote parameters")));
|
|
rq_state->rqs_num_params = node->rq_num_params;
|
|
rq_state->rqs_param_types = node->rq_param_types;
|
|
} else {
|
|
rq_state->rqs_num_params = real_num_params;
|
|
rq_state->rqs_param_types = (Oid*)palloc(sizeof(Oid) * real_num_params);
|
|
for (i = 0; i < real_num_params; i++)
|
|
rq_state->rqs_param_types[i] = paraminfo->params[i].ptype;
|
|
}
|
|
|
|
/* Assign the newly allocated data row to paramval */
|
|
rq_state->paramval_data = buf.data;
|
|
rq_state->paramval_len = buf.len;
|
|
}
|
|
|
|
/* ----------------------------------------------------------------
|
|
* ExecRemoteQueryReScan
|
|
*
|
|
* Rescans the relation.
|
|
* ----------------------------------------------------------------
|
|
*/
|
|
void ExecRemoteQueryReScan(RemoteQueryState* node, ExprContext* exprCtxt)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
/*
|
|
* If the materialized store is not empty, just rewind the stored output.
|
|
*/
|
|
(void)ExecClearTuple(node->ss.ps.ps_ResultTupleSlot);
|
|
|
|
if (!node->tuplestorestate)
|
|
return;
|
|
|
|
tuplestore_rescan(node->tuplestorestate);
|
|
#endif
|
|
}
|
|
|
|
void FreeParallelFunctionState(ParallelFunctionState* state)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
void StrategyFuncSum(ParallelFunctionState* state)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
ParallelFunctionState* RemoteFunctionResultHandler(char* sql_statement, ExecNodes* exec_nodes, strategy_func function,
|
|
bool read_only, RemoteQueryExecType exec_type, bool non_check_count,
|
|
bool need_tran_block, bool need_transform_anyarray, bool active_nodes_only)
|
|
{
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * ExecRemoteFunctionInParallel executes a function statement on all DN/CN
 * nodes in parallel and fetches the results from all of them.
 *
 * This is called during the relfilenode swap at the end of the resizing stage
 * of a local table.
 */
|
|
static void ExecRemoteFunctionInParallel(
|
|
ParallelFunctionState* state, RemoteQueryExecType exec_remote_type, bool non_check_count)
|
|
{
|
|
GlobalTransactionId gxid = InvalidGlobalTransactionId;
|
|
Snapshot snapshot = GetActiveSnapshot();
|
|
RemoteQueryState* remotestate = NULL;
|
|
PGXCNodeAllHandles* pgxc_connections = NULL;
|
|
PGXCNodeHandle** dn_connections = NULL;
|
|
PGXCNodeHandle** cn_connections = NULL;
|
|
int dn_conn_count = 0;
|
|
int cn_conn_count = 0;
|
|
int i = 0;
|
|
|
|
#define exec_on_datanode(type) (EXEC_ON_ALL_NODES == (type) || EXEC_ON_DATANODES == (type))
|
|
#define exec_on_coordinator(type) (EXEC_ON_ALL_NODES == (type) || EXEC_ON_COORDS == (type))
|
|
|
|
if (!state->read_only)
|
|
gxid = GetCurrentTransactionId();
|
|
|
|
/* If no Datanodes defined, the query cannot be launched */
|
|
if (u_sess->pgxc_cxt.NumDataNodes == 0 && exec_on_datanode(exec_remote_type) && !IS_SINGLE_NODE) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNDEFINED_OBJECT),
|
|
errmsg("No Datanode defined in cluster"),
|
|
errhint("Need to define at least 1 Datanode with CREATE NODE.")));
|
|
}
|
|
|
|
if (u_sess->pgxc_cxt.NumCoords == 0 && exec_on_coordinator(exec_remote_type)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_UNDEFINED_OBJECT),
|
|
errmsg("No coordinator nodes defined in cluster"),
|
|
errhint("Need to define at least 1 Coordinator with CREATE NODE.")));
|
|
}
|
|
|
|
remotestate = CreateResponseCombiner(0, non_check_count ? COMBINE_TYPE_NONE : COMBINE_TYPE_SAME);
|
|
pgxc_connections = get_exec_connections(NULL, state->exec_nodes, exec_remote_type);
|
|
dn_connections = pgxc_connections->datanode_handles;
|
|
dn_conn_count = pgxc_connections->dn_conn_count;
|
|
/* cn_connections maybe NULL, because we may have no other cn. */
|
|
cn_connections = pgxc_connections->coord_handles;
|
|
cn_conn_count = pgxc_connections->co_conn_count;
|
|
|
|
state->tupstore = tuplestore_begin_heap(false, false, u_sess->attr.attr_memory.work_mem);
|
|
|
|
if (exec_on_datanode(exec_remote_type)) {
|
|
/* send execute info to datanodes */
|
|
/* The query_id parameter is added to facilitate DFX fault locating. */
|
|
if (pgxc_node_begin(dn_conn_count, dn_connections, gxid, false, state->read_only, PGXC_NODE_DATANODE, true)) {
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Could not begin transaction on Datanodes")));
|
|
}
|
|
|
|
for (i = 0; i < dn_conn_count; i++) {
|
|
if (dn_connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(dn_connections[i]);
|
|
|
|
if (snapshot && pgxc_node_send_snapshot(dn_connections[i], snapshot)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to send snapshot to %s", dn_connections[i]->remoteNodeName)));
|
|
}
|
|
|
|
if (pgxc_node_send_query(dn_connections[i], state->sql_statement) != 0) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to send command to %s", dn_connections[i]->remoteNodeName)));
|
|
}
|
|
}
|
|
}
|
|
|
|
if (exec_on_coordinator(exec_remote_type)) {
|
|
/* send execute info to coordinators */
|
|
if (pgxc_node_begin(cn_conn_count, cn_connections, gxid, false, state->read_only, PGXC_NODE_COORDINATOR)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Could not begin transaction on Coordinator nodes")));
|
|
}
|
|
|
|
for (i = 0; i < cn_conn_count; i++) {
|
|
if (cn_connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(cn_connections[i]);
|
|
|
|
if (snapshot && pgxc_node_send_snapshot(cn_connections[i], snapshot)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to send snapshot to %s", cn_connections[i]->remoteNodeName)));
|
|
}
|
|
|
|
if (pgxc_node_send_query(cn_connections[i], state->sql_statement) != 0) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to send command to %s", cn_connections[i]->remoteNodeName)));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Collect data from the Datanodes in parallel, fetching tuples over multiple channels. */
|
|
TupleTableSlot* slot = MakeTupleTableSlot();
|
|
remotestate->conn_count = dn_conn_count;
|
|
remotestate->connections = dn_connections;
|
|
(void)FetchTupleByMultiChannel<false, true>(remotestate, slot, state);
|
|
|
|
/* Collect data from the Coordinators in parallel, fetching tuples over multiple channels. */
|
|
remotestate->conn_count = cn_conn_count;
|
|
remotestate->connections = cn_connections;
|
|
(void)FetchTupleByMultiChannel<false, true>(remotestate, slot, state);
|
|
}
|
|
|
|
/*
|
|
* Execute utility statement on multiple Datanodes
|
|
* It does approximately the same as
|
|
*
|
|
* RemoteQueryState *state = ExecInitRemoteQuery(plan, estate, flags);
|
|
* Assert(TupIsNull(ExecRemoteQuery(state));
|
|
* ExecEndRemoteQuery(state)
|
|
*
|
|
* But does not need an Estate instance and does not do some unnecessary work,
|
|
* like allocating tuple slots.
|
|
*/
|
|
void ExecRemoteUtility(RemoteQuery* node)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
|
|
RemoteQueryState* remotestate = NULL;
|
|
bool force_autocommit = node->force_autocommit;
|
|
RemoteQueryExecType exec_type = node->exec_type;
|
|
GlobalTransactionId gxid = InvalidGlobalTransactionId;
|
|
Snapshot snapshot = GetActiveSnapshot();
|
|
PGXCNodeAllHandles* pgxc_connections;
|
|
int co_conn_count;
|
|
int dn_conn_count;
|
|
bool need_tran_block = false;
|
|
ExecDirectType exec_direct_type = node->exec_direct_type;
|
|
int i;
|
|
|
|
if (!force_autocommit)
|
|
RegisterTransactionLocalNode(true);
|
|
|
|
/*
|
|
* It is possible to invoke create table with inheritance on
|
|
* temporary objects. Remember that we might have accessed a temp object
|
|
*/
|
|
if (node->is_temp)
|
|
ExecSetTempObjectIncluded();
|
|
|
|
remotestate = CreateResponseCombiner(0, node->combine_type);
|
|
|
|
pgxc_connections = get_exec_connections(NULL, node->exec_nodes, exec_type);
|
|
|
|
dn_conn_count = pgxc_connections->dn_conn_count;
|
|
co_conn_count = pgxc_connections->co_conn_count;
|
|
|
|
if (force_autocommit)
|
|
need_tran_block = false;
|
|
else
|
|
need_tran_block = true;
|
|
|
|
/* Commands launched through EXECUTE DIRECT do not need to start a transaction */
|
|
if (exec_direct_type == EXEC_DIRECT_UTILITY) {
|
|
need_tran_block = false;
|
|
|
|
/* This check is not done when analyzing to limit dependencies */
|
|
if (IsTransactionBlock())
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
|
|
errmsg("cannot run EXECUTE DIRECT with utility inside a transaction block")));
|
|
}
|
|
|
|
gxid = GetCurrentTransactionId();
|
|
if (!GlobalTransactionIdIsValid(gxid))
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to get next transaction ID")));
|
|
|
|
if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_DATANODES) {
|
|
if (pgxc_node_begin(
|
|
dn_conn_count, pgxc_connections->datanode_handles, gxid, need_tran_block, false, PGXC_NODE_DATANODE))
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on Datanodes")));
|
|
for (i = 0; i < dn_conn_count; i++) {
|
|
PGXCNodeHandle* conn = pgxc_connections->datanode_handles[i];
|
|
|
|
if (conn->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(conn);
|
|
if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to Datanodes")));
|
|
}
|
|
if (pgxc_node_send_query(conn, node->sql_statement) != 0) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to Datanodes")));
|
|
}
|
|
}
|
|
}
|
|
|
|
if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_COORDS) {
|
|
if (pgxc_node_begin(
|
|
co_conn_count, pgxc_connections->coord_handles, gxid, need_tran_block, false, PGXC_NODE_COORDINATOR))
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Could not begin transaction on coordinators")));
|
|
/* Now send it to Coordinators if necessary */
|
|
for (i = 0; i < co_conn_count; i++) {
|
|
if (snapshot && pgxc_node_send_snapshot(pgxc_connections->coord_handles[i], snapshot)) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to coordinators")));
|
|
}
|
|
if (pgxc_node_send_query(pgxc_connections->coord_handles[i], node->sql_statement) != 0) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to send command to coordinators")));
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Stop if all commands are completed or we got a data row and
|
|
* initialized state node for subsequent invocations
|
|
*/
|
|
if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_DATANODES) {
|
|
while (dn_conn_count > 0) {
|
|
int i = 0;
|
|
|
|
if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL))
|
|
break;
|
|
/*
|
|
* Handle input from the Datanodes.
|
|
 * We do not expect Datanodes to return tuples when running a
 * utility command.
|
|
* If we got EOF, move to the next connection, will receive more
|
|
* data on the next iteration.
|
|
*/
|
|
while (i < dn_conn_count) {
|
|
PGXCNodeHandle* conn = pgxc_connections->datanode_handles[i];
|
|
int res = handle_response(conn, remotestate);
|
|
if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --dn_conn_count)
|
|
pgxc_connections->datanode_handles[i] = pgxc_connections->datanode_handles[dn_conn_count];
|
|
} else if (res == RESPONSE_TUPDESC) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Unexpected response from Datanode")));
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Unexpected response from Datanode")));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Do the same for Coordinators */
|
|
if (exec_type == EXEC_ON_ALL_NODES || exec_type == EXEC_ON_COORDS) {
|
|
while (co_conn_count > 0) {
|
|
int i = 0;
|
|
|
|
if (pgxc_node_receive(co_conn_count, pgxc_connections->coord_handles, NULL))
|
|
break;
|
|
|
|
while (i < co_conn_count) {
|
|
int res = handle_response(pgxc_connections->coord_handles[i], remotestate);
|
|
if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --co_conn_count)
|
|
pgxc_connections->coord_handles[i] = pgxc_connections->coord_handles[co_conn_count];
|
|
} else if (res == RESPONSE_TUPDESC) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Unexpected response from coordinator")));
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Unexpected response from coordinator")));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
/*
|
|
 * We have processed all responses from the nodes; if an error message is
 * pending we can report it now. All connections should be in a consistent
 * state and can be released to the pool after ROLLBACK.
|
|
*/
|
|
pgxc_node_report_error(remotestate);
|
|
#endif
|
|
}
|
|
void ExecRemoteUtility_ParallelDDLMode(RemoteQuery* node, const char* FirstExecNode)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
void ExecRemoteUtilityParallelBarrier(const RemoteQuery* node, const char* firstExecNode)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
|
|
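/*
 * Send the current command id, snapshot, workload cgroup and query id to a
 * remote connection before a utility command is issued on it.
 */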
void CheckRemoteUtiltyConn(PGXCNodeHandle* conn, Snapshot snapshot)
|
|
{
|
|
if (pgxc_node_send_cmd_id(conn, GetCurrentCommandId(true)) < 0) {
|
|
ereport(
|
|
ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to send cid to Datanode %u", conn->nodeoid)));
|
|
}
|
|
|
|
if (snapshot && pgxc_node_send_snapshot(conn, snapshot)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to send snapshot to Datanode %u", conn->nodeoid)));
|
|
}
|
|
|
|
if (ENABLE_WORKLOAD_CONTROL && *u_sess->wlm_cxt->control_group && pgxc_node_send_wlm_cgroup(conn)) {
|
|
ereport(WARNING,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to send cgroup to Datanode %u", conn->nodeoid)));
|
|
}
|
|
|
|
if (pgxc_node_send_queryid(conn, u_sess->debug_query_id) != 0) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Failed to send queryid to Datanode %u", conn->nodeoid)));
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @global stats
|
|
* @Description: receive estimate or real total row count from dn.
|
|
*
|
|
* @in dn_conn_count - count of target datanodes connection
|
|
* @in pgxc_connections - target datanodes connection information
|
|
* @in remotestate - state of remote query node
|
|
* @out totalRowCnts - receive estimate total row count, save in it and return.
|
|
* @return: void
|
|
*
|
|
*/
|
|
static void recv_totalrowcnt_from_dn(int dn_conn_count, PGXCNodeAllHandles* pgxc_connections,
|
|
RemoteQueryState* remotestate, ANALYZE_RQTYPE arq_type, VacuumStmt* stmt, AnalyzeMode eAnalyzeMode)
|
|
{
|
|
/*
|
|
* Stop if all commands are completed or we got a data row and
|
|
* initialized state node for subsequent invocations
|
|
*/
|
|
int i;
|
|
int deadblock_dn_num = 0;
|
|
|
|
/* Reset totalRowCnts */
|
|
if (eAnalyzeMode == ANALYZENORMAL) {
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].totalRowCnts = 0;
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].topRowCnts = 0;
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].topMemSize = 0;
|
|
}
|
|
|
|
if (remotestate->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
remotestate->request_type = REQUEST_TYPE_QUERY;
|
|
|
|
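    /*
     * Keep receiving until every Datanode reports completion, accumulating the
     * row counts delivered via RESPONSE_ANALYZE_ROWCNT into the statement.
     */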
while (dn_conn_count > 0) {
|
|
i = 0;
|
|
|
|
if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL)) {
|
|
int error_code;
|
|
char* error_msg = getSocketError(&error_code);
|
|
pfree_ext(pgxc_connections->datanode_handles);
|
|
|
|
ereport(
|
|
ERROR, (errcode(error_code), errmsg("Failed to read response from Datanodes Detail: %s\n", error_msg)));
|
|
}
|
|
|
|
while (i < dn_conn_count) {
|
|
PGXCNodeHandle* conn = pgxc_connections->datanode_handles[i];
|
|
int res = handle_response(conn, remotestate);
|
|
if (res == RESPONSE_ANALYZE_ROWCNT) {
|
|
NameData nodename = {{0}};
|
|
if (eAnalyzeMode == ANALYZENORMAL) {
|
|
if ((double)remotestate->analyze_totalrowcnt[ANALYZENORMAL] >= 0) {
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].totalRowCnts +=
|
|
(double)remotestate->analyze_totalrowcnt[ANALYZENORMAL];
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].topRowCnts =
|
|
Max(stmt->pstGlobalStatEx[ANALYZENORMAL].topRowCnts,
|
|
remotestate->analyze_totalrowcnt[ANALYZENORMAL]);
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].topMemSize =
|
|
Max(stmt->pstGlobalStatEx[ANALYZENORMAL].topMemSize,
|
|
remotestate->analyze_memsize[ANALYZENORMAL]);
|
|
} else if (arq_type == ARQ_TYPE_TOTALROWCNTS) {
|
|
deadblock_dn_num++;
|
|
}
|
|
|
|
elog(DEBUG1,
|
|
"%s total row count[%lf] for %s.",
|
|
(arq_type == ARQ_TYPE_SAMPLE) ? "Step 4-1: Get real" : "Step 1-1: Get estimate",
|
|
(double)remotestate->analyze_totalrowcnt[ANALYZENORMAL],
|
|
get_pgxc_nodename(conn->nodeoid, &nodename));
|
|
}
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --dn_conn_count) {
|
|
pgxc_connections->datanode_handles[i] = pgxc_connections->datanode_handles[dn_conn_count];
|
|
}
|
|
} else if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from Datanode %u", conn->nodeoid)));
|
|
}
|
|
}
|
|
|
|
/*
 * If some or all datanodes contain dead blocks (their estimated totalrows is
 * invalid) and no valid totalrows was received, we may not have found a valid
 * block to sample. Mark totalrows as invalid so that the default sample rate
 * of 2% is used.
 */
|
|
if (arq_type == ARQ_TYPE_TOTALROWCNTS && eAnalyzeMode == ANALYZENORMAL && 0 < deadblock_dn_num &&
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].totalRowCnts == 0) {
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].totalRowCnts = INVALID_ESTTOTALROWS;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @global stats
|
|
* @Description: receive sample rows from dn.
|
|
*
|
|
* @in dn_conn_count - count of target datanodes connection
|
|
* @in pgxc_connections - target datanodes connection information
|
|
* @in remotestate - state of remote query node
|
|
* @in stmt - analyze or vacuum statement, we will receive sample rows and save in it
|
|
* @in eAnalyzeMode - identify which type the table, normal table/dfs table/delta table
|
|
* @return: HeapTuple*
|
|
*
|
|
*/
|
|
static HeapTuple* recv_samplerows_from_dn(int dn_conn_count, PGXCNodeAllHandles* pgxc_connections,
|
|
RemoteQueryState* remotestate, VacuumStmt* stmt, AnalyzeMode eAnalyzeMode)
|
|
{
|
|
#define RECV_DFSTUPLE_TUPDESCNO 1
|
|
#define RECV_DELTATUPLE_TUPDESCNO 2
|
|
|
|
int i, numRows = 0;
|
|
int* tupdescno = NULL;
|
|
TupleTableSlot** scanslot = NULL;
|
|
HeapTuple* results = NULL;
|
|
int estrows = 0;
|
|
double rstate = 0;
|
|
|
|
if (dn_conn_count > 0) {
|
|
tupdescno = (int*)palloc0(dn_conn_count * sizeof(int));
|
|
scanslot = (TupleTableSlot**)palloc0(dn_conn_count * sizeof(TupleTableSlot*));
|
|
}
|
|
|
|
/* Sample rows will be received from the Datanodes, so allocate enough memory to hold them. */
|
|
if (eAnalyzeMode == ANALYZENORMAL) {
|
|
int targetrows =
|
|
(int)(stmt->pstGlobalStatEx[ANALYZENORMAL].totalRowCnts * stmt->pstGlobalStatEx[ANALYZENORMAL].sampleRate);
|
|
|
|
stmt->tupleDesc = NULL;
|
|
|
|
/* get estimate total sample rows for normal table. */
|
|
estrows = dn_conn_count * Max(DEFAULT_SAMPLE_ROWCNT, targetrows);
|
|
if (SUPPORT_PRETTY_ANALYZE) {
|
|
estrows = Max(DEFAULT_SAMPLE_ROWCNT, targetrows);
|
|
rstate = anl_init_selection_state(estrows);
|
|
}
|
|
|
|
results = (HeapTuple*)palloc0(estrows * sizeof(HeapTuple));
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].totalRowCnts = 0;
|
|
}
|
|
|
|
if (remotestate->request_type == REQUEST_TYPE_NOT_DEFINED)
|
|
remotestate->request_type = REQUEST_TYPE_QUERY;
|
|
|
|
while (dn_conn_count > 0) {
|
|
i = 0;
|
|
|
|
if (pgxc_node_receive(dn_conn_count, pgxc_connections->datanode_handles, NULL)) {
|
|
int error_code;
|
|
char* error_msg = getSocketError(&error_code);
|
|
pfree_ext(pgxc_connections->datanode_handles);
|
|
|
|
ereport(
|
|
ERROR, (errcode(error_code), errmsg("Failed to read response from Datanodes Detail: %s\n", error_msg)));
|
|
}
|
|
|
|
while (i < dn_conn_count) {
|
|
PGXCNodeHandle* conn = pgxc_connections->datanode_handles[i];
|
|
int res = handle_response(conn, remotestate);
|
|
if (res == RESPONSE_TUPDESC) {
|
|
scanslot[i] = MakeSingleTupleTableSlot(remotestate->tuple_desc);
|
|
|
|
/* receive next tupdesc for hdfs table. */
|
|
tupdescno[i]++;
|
|
Assert(tupdescno[i] < ANALYZE_MODE_MAX_NUM);
|
|
remotestate->description_count = 0;
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
/*
 * If scanslot[i] is NULL, some error may have confused the ordering of
 * TUPDESC and DATAROW messages, so make sure the sample rows can still
 * be processed correctly.
 */
|
|
if (scanslot[i] == NULL) {
|
|
if (remotestate->tuple_desc == NULL) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("TUPDESC message has not been received before DATAROW message from %s",
|
|
conn->remoteNodeName),
|
|
errdetail("Maybe datanode cause some error result in confusion between TUPDESC and "
|
|
"DATAROW.")));
|
|
} else {
|
|
scanslot[i] = MakeSingleTupleTableSlot(remotestate->tuple_desc);
|
|
}
|
|
}
|
|
|
|
FetchTuple(remotestate, scanslot[i]);
|
|
/* report error message come from datanode if there is error message. */
|
|
if (remotestate->errorMessage)
|
|
pgxc_node_report_error(remotestate);
|
|
|
|
/* Make sure the tuple is fully deconstructed */
|
|
tableam_tslot_getallattrs(scanslot[i]);
|
|
|
|
/* receive and save sample rows for normal table. */
|
|
if (eAnalyzeMode == ANALYZENORMAL) {
|
|
/* save tuple descriptor for the sample rows if it is NULL. */
|
|
if (stmt->tupleDesc == NULL)
|
|
stmt->tupleDesc = CreateTupleDescCopy(scanslot[i]->tts_tupleDescriptor);
|
|
|
|
/* Don't save the received sample row if we have already received more rows than the target count. */
|
|
if (numRows < estrows) {
|
|
results[numRows++] = heap_form_tuple(
|
|
scanslot[i]->tts_tupleDescriptor, scanslot[i]->tts_values, scanslot[i]->tts_isnull);
|
|
} else if (SUPPORT_PRETTY_ANALYZE) {
|
|
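                        /*
                         * Reservoir sampling: the reservoir of estrows tuples is full,
                         * so use the skip counter from anl_get_next_S() to decide whether
                         * this tuple should replace a randomly chosen existing sample,
                         * keeping the overall sample uniform.
                         */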
if (0 >= anl_get_next_S(numRows, estrows, &rstate)) {
|
|
/* Found a suitable tuple, so save it, replacing one old tuple at random */
|
|
int64 k = (int64)(estrows * anl_random_fract());
|
|
|
|
Assert(k >= 0 && k < estrows);
|
|
heap_freetuple_ext(results[k]);
|
|
results[k] = heap_form_tuple(
|
|
scanslot[i]->tts_tupleDescriptor, scanslot[i]->tts_values, scanslot[i]->tts_isnull);
|
|
}
|
|
numRows++;
|
|
}
|
|
}
|
|
|
|
(void)ExecClearTuple(scanslot[i]);
|
|
} else if (res == RESPONSE_ANALYZE_ROWCNT) {
|
|
NameData nodename = {{0}};
|
|
if (eAnalyzeMode == ANALYZENORMAL) {
|
|
elog(DEBUG1,
|
|
"Step 4-1: Get real total row count[%lf] for %s.",
|
|
(double)remotestate->analyze_totalrowcnt[ANALYZENORMAL],
|
|
get_pgxc_nodename(conn->nodeoid, &nodename));
|
|
|
|
stmt->pstGlobalStatEx[ANALYZENORMAL].totalRowCnts +=
|
|
(double)remotestate->analyze_totalrowcnt[ANALYZENORMAL];
|
|
}
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --dn_conn_count) {
|
|
pgxc_connections->datanode_handles[i] = pgxc_connections->datanode_handles[dn_conn_count];
|
|
|
|
/*
 * We are done receiving the tupdesc and data rows for this connection;
 * carry over the per-connection state of the slot being moved here.
 */
|
|
tupdescno[i] = tupdescno[dn_conn_count];
|
|
scanslot[i] = scanslot[dn_conn_count];
|
|
}
|
|
} else if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from Datanode %u", conn->nodeoid)));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* save num of sample rows for normal table. */
|
|
if (eAnalyzeMode == ANALYZENORMAL) {
|
|
stmt->num_samples = (estrows > numRows) ? numRows : estrows;
|
|
elog(DEBUG1,
|
|
"Step 3: Get sample rows from DNs finished, receive [%d] tuples, sample [%d] tuples",
|
|
numRows,
|
|
stmt->num_samples);
|
|
}
|
|
|
|
return results;
|
|
}
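/*
 * Editor's note: the receive loop above keeps at most estrows sample rows and,
 * once the reservoir is full, replaces a randomly chosen saved row using
 * PostgreSQL's two-stage sampler (anl_get_next_S / anl_random_fract). As a
 * sketch of the underlying idea only, the simpler "Algorithm R" form of
 * reservoir sampling looks like this (hypothetical helper, not used here):
 *
 *     void reservoir_add(HeapTuple reservoir[], int64 k, int64 rows_seen, HeapTuple row)
 *     {
 *         if (rows_seen < k) {
 *             reservoir[rows_seen] = row;              // reservoir not yet full
 *         } else {
 *             int64 j = random() % (rows_seen + 1);    // uniform in [0, rows_seen]
 *             if (j < k)
 *                 reservoir[j] = row;                  // replace a random saved row
 *         }
 *     }
 */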
|
|
|
|
HeapTuple* RecvRemoteSampleMessage(
|
|
VacuumStmt* stmt, RemoteQuery* node, ANALYZE_RQTYPE arq_type, AnalyzeMode eAnalyzeMode)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Called when the backend is ending.
|
|
*/
|
|
void PGXCNodeCleanAndRelease(int code, Datum arg)
|
|
{
|
|
/* Clean up prepared transactions before releasing connections */
|
|
DropAllPreparedStatements();
|
|
|
|
/* clean saved plan but not save into gpc */
|
|
GPCCleanUpSessionSavedPlan();
|
|
|
|
/* Release Datanode connections */
|
|
release_handles();
|
|
|
|
/* Disconnect from Pooler */
|
|
if (IsPoolHandle())
|
|
PoolManagerDisconnect();
|
|
|
|
/* Close connection with GTM */
|
|
CloseGTM();
|
|
|
|
/* Free remote xact state */
|
|
free_RemoteXactState();
|
|
|
|
/* Free gxip */
|
|
UnsetGlobalSnapshotData();
|
|
}
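/*
 * Editor's note: this routine has the pg_on_exit_callback shape (int code,
 * Datum arg). A minimal sketch of how such a callback is registered with
 * on_proc_exit() is shown below; the actual registration site lives elsewhere
 * in the codebase, so treat this strictly as an illustration.
 *
 *     // during backend initialization (illustration only):
 *     on_proc_exit(PGXCNodeCleanAndRelease, 0);
 */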
|
|
|
|
/*
|
|
* Called when the session is ending.
|
|
*/
|
|
void PGXCConnClean(int code, Datum arg)
|
|
{
|
|
/* Clean up prepared transactions before releasing connections */
|
|
DropAllPreparedStatements();
|
|
|
|
/* Release Datanode connections */
|
|
release_handles();
|
|
|
|
/* Disconnect from Pooler */
|
|
if (IsPoolHandle())
|
|
PoolManagerDisconnect();
|
|
|
|
/* Free remote xact state */
|
|
free_RemoteXactState();
|
|
}
|
|
|
|
static int pgxc_get_connections(PGXCNodeHandle* connections[], int size, List* connlist)
|
|
{
|
|
ListCell* lc = NULL;
|
|
int count = 0;
|
|
|
|
foreach (lc, connlist) {
|
|
PGXCNodeHandle* conn = (PGXCNodeHandle*)lfirst(lc);
|
|
Assert(count < size);
|
|
connections[count++] = conn;
|
|
}
|
|
return count;
|
|
}
|
|
/*
|
|
* Get all connections for which we have an open transaction,
|
|
* for both Datanodes and Coordinators
|
|
*/
|
|
static int pgxc_get_transaction_nodes(PGXCNodeHandle* connections[], int size, bool write)
|
|
{
|
|
return pgxc_get_connections(
|
|
connections, size, write ? u_sess->pgxc_cxt.XactWriteNodes : u_sess->pgxc_cxt.XactReadNodes);
|
|
}
|
|
|
|
void ExecCloseRemoteStatement(const char* stmt_name, List* nodelist)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
PGXCNodeAllHandles* all_handles = NULL;
|
|
PGXCNodeHandle** connections = NULL;
|
|
RemoteQueryState* combiner = NULL;
|
|
int conn_count;
|
|
int i;
|
|
|
|
/* Exit if nodelist is empty */
|
|
if (list_length(nodelist) == 0)
|
|
return;
|
|
|
|
/* get needed Datanode connections */
|
|
all_handles = get_handles(nodelist, NIL, false);
|
|
conn_count = all_handles->dn_conn_count;
|
|
connections = all_handles->datanode_handles;
|
|
|
|
for (i = 0; i < conn_count; i++) {
|
|
if (connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(connections[i]);
|
|
|
|
if (pgxc_node_send_close(connections[i], true, stmt_name) != 0) {
|
|
/*
 * Prepared statements are not affected by transaction end, so treat an
 * unclosed statement on the Datanode as a fatal issue and force the
 * connection to be discarded.
 */
|
|
connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to close Datanode statement")));
|
|
}
|
|
if (pgxc_node_send_sync(connections[i]) != 0) {
|
|
connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
ereport(WARNING, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to close Datanode statement")));
|
|
}
|
|
}
|
|
|
|
combiner = CreateResponseCombiner(conn_count, COMBINE_TYPE_NONE);
|
|
|
|
while (conn_count > 0) {
|
|
if (pgxc_node_receive(conn_count, connections, NULL)) {
|
|
for (i = 0; i <= conn_count; i++)
|
|
connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
|
|
ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to close Datanode statement")));
|
|
}
|
|
i = 0;
|
|
while (i < conn_count) {
|
|
int res = handle_response(connections[i], combiner);
|
|
if (res == RESPONSE_EOF) {
|
|
i++;
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (--conn_count > i)
|
|
connections[i] = connections[conn_count];
|
|
} else {
|
|
connections[i]->state = DN_CONNECTION_STATE_ERROR_FATAL;
|
|
}
|
|
}
|
|
}
|
|
|
|
ValidateAndCloseCombiner(combiner);
|
|
pfree_pgxc_all_handles(all_handles);
|
|
#endif
|
|
}
|
|
|
|
int DataNodeCopyInBinaryForAll(const char* msg_buf, int len, PGXCNodeHandle** copy_connections)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
#else
|
|
int i;
|
|
int conn_count = 0;
|
|
PGXCNodeHandle* connections[NumDataNodes];
|
|
int msgLen = 4 + len + 1;
|
|
int nLen = htonl(msgLen);
|
|
errno_t rc = EOK;
|
|
|
|
for (i = 0; i < NumDataNodes; i++) {
|
|
PGXCNodeHandle* handle = copy_connections[i];
|
|
|
|
if (!handle)
|
|
continue;
|
|
|
|
connections[conn_count++] = handle;
|
|
}
|
|
|
|
for (i = 0; i < conn_count; i++) {
|
|
PGXCNodeHandle* handle = connections[i];
|
|
if (handle->state == DN_CONNECTION_STATE_COPY_IN) {
|
|
/* msgType + msgLen */
|
|
if (ensure_out_buffer_capacity(handle->outEnd + 1 + msgLen, handle) != 0) {
|
|
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
|
|
}
|
|
|
|
handle->outBuffer[handle->outEnd++] = 'd';
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd - 1, &nLen, sizeof(int));
|
|
securec_check(rc, "", "");
|
|
handle->outEnd += 4;
|
|
rc = memcpy_s(handle->outBuffer + handle->outEnd, handle->outSize - handle->outEnd - 1, msg_buf, len);
|
|
securec_check(rc, "", "");
|
|
handle->outEnd += len;
|
|
handle->outBuffer[handle->outEnd++] = '\n';
|
|
} else {
|
|
add_error_message(handle, "%s", "Invalid Datanode connection");
|
|
return EOF;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
#endif
|
|
}
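/*
 * Editor's note: the function above frames each payload as a libpq CopyData
 * message: a 'd' type byte, a 4-byte network-order length that covers the
 * length field itself plus the payload plus the trailing newline, then the
 * payload bytes. A self-contained sketch of the same framing into a plain
 * buffer (hypothetical helper, not the code path used above):
 *
 *     // dst must have room for 1 + 4 + len + 1 bytes
 *     size_t frame_copy_data(char* dst, const char* payload, int len)
 *     {
 *         uint32 msg_len = htonl(4 + len + 1);   // length field + payload + '\n'
 *         size_t off = 0;
 *         dst[off++] = 'd';                      // CopyData message type
 *         memcpy(dst + off, &msg_len, 4);        // network-order length
 *         off += 4;
 *         memcpy(dst + off, payload, len);       // payload bytes
 *         off += len;
 *         dst[off++] = '\n';                     // trailing newline, as above
 *         return off;
 *     }
 */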
|
|
|
|
/*
|
|
* ExecSetTempObjectIncluded
|
|
*
|
|
* Remember that we have accessed a temporary object.
|
|
*/
|
|
void ExecSetTempObjectIncluded(void)
|
|
{
|
|
t_thrd.pgxc_cxt.temp_object_included = true;
|
|
}
|
|
|
|
/*
|
|
* ExecClearTempObjectIncluded
|
|
*
|
|
* Forget about temporary objects
|
|
*/
|
|
static void ExecClearTempObjectIncluded(void)
|
|
{
|
|
t_thrd.pgxc_cxt.temp_object_included = false;
|
|
}
|
|
|
|
/* ExecIsTempObjectIncluded
|
|
*
|
|
* Check if a temporary object has been accessed
|
|
*/
|
|
bool ExecIsTempObjectIncluded(void)
|
|
{
|
|
return t_thrd.pgxc_cxt.temp_object_included;
|
|
}
|
|
|
|
/*
|
|
* ExecProcNodeDMLInXC
|
|
*
|
|
* This function is used by ExecInsert/Update/Delete to execute the
|
|
* Insert/Update/Delete on the datanode using RemoteQuery plan.
|
|
*
|
|
* In XC, a non-FQSed UPDATE/DELETE is planned as a two step process
|
|
* The first step selects the ctid & node id of the row to be modified and the
|
|
* second step creates a parameterized query that is supposed to take the data
|
|
* row returned by the lower plan node as the parameters to modify the affected
|
|
* row. In case of an INSERT however the first step is used to get the new
|
|
* column values to be inserted in the target table and the second step uses
|
|
* those values as parameters of the INSERT query.
|
|
*
|
|
* We use extended query protocol to avoid repeated planning of the query and
|
|
* pass the column values(in case of an INSERT) and ctid & xc_node_id
|
|
* (in case of UPDATE/DELETE) as parameters while executing the query.
|
|
*
|
|
* Parameters:
|
|
* resultRemoteRel: The RemoteQueryState containing DML statement to be
|
|
* executed
|
|
* sourceDataSlot: The tuple returned by the first step (described above)
|
|
* to be used as parameters in the second step.
|
|
* newDataSlot: This has all the junk attributes stripped off from
|
|
* sourceDataSlot, plus BEFORE triggers may have modified the
|
|
* originally fetched data values. In other words, this has
|
|
* the final values that are to be sent to datanode through BIND.
|
|
*
|
|
* Returns the result of RETURNING clause if any
|
|
*/
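/*
 * Editor's illustration of the two-step plan described above. The SQL below is
 * hypothetical (table, columns and parameter numbering invented); the real
 * statements are generated by the planner:
 *
 *     -- step 1 (lower plan node): fetch the new/old values plus row location
 *     SELECT t.val, t.ctid, t.xc_node_id FROM t WHERE t.key = 10;
 *
 *     -- step 2 (this RemoteQuery): parameterized DML bound via the BIND message
 *     UPDATE t SET val = $1 WHERE ctid = $2 AND xc_node_id = $3;
 */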
|
|
TupleTableSlot* ExecProcNodeDMLInXC(EState* estate, TupleTableSlot* sourceDataSlot, TupleTableSlot* newDataSlot)
|
|
{
|
|
ResultRelInfo* resultRelInfo = estate->es_result_relation_info;
|
|
RemoteQueryState* resultRemoteRel = (RemoteQueryState*)estate->es_result_remoterel;
|
|
ExprContext* econtext = resultRemoteRel->ss.ps.ps_ExprContext;
|
|
TupleTableSlot* returningResultSlot = NULL; /* RETURNING clause result */
|
|
TupleTableSlot* temp_slot = NULL;
|
|
bool dml_returning_on_replicated = false;
|
|
RemoteQuery* step = (RemoteQuery*)resultRemoteRel->ss.ps.plan;
|
|
uint64 saved_rqs_processed = 0;
|
|
|
|
/*
|
|
* If the tuple returned by the previous step was null,
|
|
* simply return null tuple, no need to execute the DML
|
|
*/
|
|
if (TupIsNull(sourceDataSlot))
|
|
return NULL;
|
|
|
|
/*
|
|
* The current implementation of DMLs with RETURNING when run on replicated
|
|
* tables returns row from one of the datanodes. In order to achieve this
|
|
* ExecProcNode is repeatedly called saving one tuple and rejecting the rest.
|
|
* Do we have a DML on replicated table with RETURNING?
|
|
*/
|
|
dml_returning_on_replicated = IsReturningDMLOnReplicatedTable(step);
|
|
|
|
/*
|
|
* Use data row returned by the previous step as parameter for
|
|
* the DML to be executed in this step.
|
|
*/
|
|
SetDataRowForIntParams(resultRelInfo->ri_junkFilter, sourceDataSlot, newDataSlot, resultRemoteRel);
|
|
|
|
/*
|
|
* do_query calls get_exec_connections to determine target nodes
|
|
* at execution time. The function get_exec_connections can decide
|
|
* to evaluate en_expr to determine the target nodes. To evaluate en_expr,
|
|
* ExecEvalVar is called which picks up values from ecxt_scantuple if Var
|
|
* does not refer either OUTER or INNER varno. Hence we should copy the
|
|
* tuple returned by previous step in ecxt_scantuple if econtext is set.
|
|
* The econtext is set only when en_expr is set for execution time
|
|
* determination of the target nodes.
|
|
*/
|
|
|
|
if (econtext != NULL)
|
|
econtext->ecxt_scantuple = newDataSlot;
|
|
|
|
/*
|
|
* This loop would be required to reject tuples received from datanodes
|
|
* when a DML with RETURNING is run on a replicated table otherwise it
|
|
* would run once.
|
|
* PGXC need to: This approach is error prone if the DML statement constructed
|
|
* by the planner is such that it updates more than one row (even in case of
|
|
* non-replicated data). Fix it.
|
|
*/
|
|
do {
|
|
temp_slot = ExecProcNode((PlanState*)resultRemoteRel);
|
|
if (!TupIsNull(temp_slot)) {
|
|
/* Have we already copied the returned tuple? */
|
|
if (returningResultSlot == NULL) {
|
|
/* Copy the received tuple to be returned later */
|
|
returningResultSlot = MakeSingleTupleTableSlot(temp_slot->tts_tupleDescriptor);
|
|
returningResultSlot = ExecCopySlot(returningResultSlot, temp_slot);
|
|
saved_rqs_processed = resultRemoteRel->rqs_processed;
|
|
}
|
|
|
|
/* we should never get a failure here */
|
|
Assert(ExecIsTempObjectIncluded() || (saved_rqs_processed == resultRemoteRel->rqs_processed));
|
|
|
|
/* Clear the received tuple, the copy required has already been saved */
|
|
(void)ExecClearTuple(temp_slot);
|
|
} else {
|
|
if (dml_returning_on_replicated) {
|
|
resultRemoteRel->rqs_processed = saved_rqs_processed;
|
|
}
|
|
/* Null tuple received, so break the loop */
|
|
(void)ExecClearTuple(temp_slot);
|
|
break;
|
|
}
|
|
} while (dml_returning_on_replicated);
|
|
|
|
/*
|
|
* A DML can impact more than one row, e.g. an update without any where
|
|
* clause on a table with more than one row. We need to make sure that
|
|
* RemoteQueryNext calls do_query for each affected row, hence we reset
|
|
* the flag here and finish the DML being executed only when we return
|
|
* NULL from ExecModifyTable
|
|
*/
|
|
resultRemoteRel->query_Done = false;
|
|
|
|
return returningResultSlot;
|
|
}
|
|
|
|
void RegisterTransactionNodes(int count, void** connections, bool write)
|
|
{
|
|
int i;
|
|
MemoryContext oldcontext = MemoryContextSwitchTo(SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_EXECUTOR));
|
|
|
|
for (i = 0; i < count; i++) {
|
|
/*
|
|
* Add the node to either read or write participants. If a node is
|
|
* already in the write participant's list, don't add it to the read
|
|
* participant's list. OTOH if a node is currently in the read
|
|
* participant's list, but we are now initiating a write operation on
|
|
* the node, move it to the write participant's list
|
|
*/
|
|
if (write) {
|
|
u_sess->pgxc_cxt.XactWriteNodes = list_append_unique(u_sess->pgxc_cxt.XactWriteNodes, connections[i]);
|
|
u_sess->pgxc_cxt.XactReadNodes = list_delete(u_sess->pgxc_cxt.XactReadNodes, connections[i]);
|
|
} else {
|
|
if (!list_member(u_sess->pgxc_cxt.XactWriteNodes, connections[i]))
|
|
u_sess->pgxc_cxt.XactReadNodes = list_append_unique(u_sess->pgxc_cxt.XactReadNodes, connections[i]);
|
|
}
|
|
}
|
|
|
|
MemoryContextSwitchTo(oldcontext);
|
|
}
|
|
|
|
void PrintRegisteredTransactionNodes(void)
|
|
{
|
|
ListCell* cell = NULL;
|
|
|
|
if (module_logging_is_on(MOD_TRANS_HANDLE)) {
|
|
foreach (cell, u_sess->pgxc_cxt.XactReadNodes) {
|
|
PGXCNodeHandle* handle = (PGXCNodeHandle*)lfirst(cell);
|
|
ereport(LOG,
|
|
(errmodule(MOD_TRANS_HANDLE),
|
|
errmsg("u_sess->pgxc_cxt.XactReadNodes list : nodeoid = %u, nodeIdx = %d",
|
|
handle->nodeoid,
|
|
handle->nodeIdx)));
|
|
}
|
|
|
|
foreach (cell, u_sess->pgxc_cxt.XactWriteNodes) {
|
|
PGXCNodeHandle* handle = (PGXCNodeHandle*)lfirst(cell);
|
|
ereport(LOG,
|
|
(errmodule(MOD_TRANS_HANDLE),
|
|
errmsg("u_sess->pgxc_cxt.XactWriteNodes list : nodeoid = %u, nodeIdx = %d",
|
|
handle->nodeoid,
|
|
handle->nodeIdx)));
|
|
}
|
|
}
|
|
}
|
|
|
|
void ForgetTransactionNodes(void)
|
|
{
|
|
list_free_ext(u_sess->pgxc_cxt.XactReadNodes);
|
|
u_sess->pgxc_cxt.XactReadNodes = NIL;
|
|
|
|
list_free_ext(u_sess->pgxc_cxt.XactWriteNodes);
|
|
u_sess->pgxc_cxt.XactWriteNodes = NIL;
|
|
}
|
|
|
|
/*
|
|
* Clear per transaction remote information
|
|
*/
|
|
void AtEOXact_Remote(void)
|
|
{
|
|
ExecClearTempObjectIncluded();
|
|
ForgetTransactionNodes();
|
|
clear_RemoteXactState();
|
|
|
|
#ifdef PGXC
|
|
/* for "analyze table"
|
|
*
|
|
* cn1 | cn2
|
|
* create db; | drop database db;
|
|
* create t1; | WARNING: Clean connections not completed
|
|
* analyze; |
|
|
*
|
|
* analyze create a openGauss on cn2, the openGauss do NOT give conn back
|
|
* to pooler, so code here do this.
|
|
*/
|
|
if (IS_PGXC_COORDINATOR && IsConnFromCoord())
|
|
destroy_handles();
|
|
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* For 2PC, we do the following steps:
|
|
*
|
|
* 1. PREPARE the transaction locally if the local node is involved in the
|
|
* transaction. If local node is not involved, skip this step and go to the
|
|
* next step
|
|
* 2. PREPARE the transaction on all the remote nodes. If any node fails to
|
|
* PREPARE, directly go to step 6
|
|
* 3. Now that all the involved nodes are PREPAREd, we can commit the
|
|
* transaction. We first inform the GTM that the transaction is fully
|
|
* PREPARED and also supply the list of the nodes involved in the
|
|
* transaction
|
|
 * 4. Start a new transaction so that normal commit processing
 * works unchanged. COMMIT PREPARED the transaction on the local node first if it is involved in the
 * transaction, and then COMMIT PREPARED on all the remote nodes. Go to step 5.
|
|
* 5. Return and let the normal commit processing resume
|
|
* 6. Abort by ereporting the error and let normal abort-processing take
|
|
* charge.
|
|
*/
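/*
 * Editor's illustration of the steps above as seen by one remote node. The GID
 * is invented; the real statements are issued by pgxc_node_remote_prepare and
 * pgxc_node_remote_commit / pgxc_node_remote_abort:
 *
 *     PREPARE TRANSACTION 'T123_cn1';   -- steps 1/2, sent to every involved node
 *     -- coordinator records the involved nodes and the commit CSN (step 3)
 *     COMMIT PREPARED 'T123_cn1';       -- step 4, local node first, then remote nodes
 *     -- if any node failed to PREPARE, the abort path runs instead (step 6):
 *     ROLLBACK PREPARED 'T123_cn1';
 */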
|
|
|
|
/*
|
|
 * Do commit-prepared processing for remote nodes, including Datanodes and other Coordinators.
|
|
*/
|
|
void PreCommit_Remote(char* prepareGID, bool barrierLockHeld)
|
|
{
|
|
MemoryContext current_context = NULL;
|
|
/* send the commit csn before send the commit command to remote. */
|
|
if (!(useLocalXid || !IsPostmasterEnvironment || g_instance.attr.attr_storage.enable_gtm_free))
|
|
SendPGXCNodeCommitCsn(GetCommitCsn());
|
|
|
|
/*
|
|
* OK, everything went fine. At least one remote node is in PREPARED state
|
|
* and the transaction is successfully prepared on all the involved nodes.
|
|
* Now we are ready to commit the transaction. We need a new GXID to send
|
|
* down the remote nodes to execute the forthcoming COMMIT PREPARED
|
|
* command. So grab one from the GTM and track it. It will be closed along
|
|
* with the main transaction at the end.
|
|
*/
|
|
if (TwoPhaseCommit) {
|
|
/*
 * In a two-phase transaction, after the CN has committed,
 * pgxc_node_remote_commit must not ereport ERROR when it receives a signal.
 */
|
|
current_context = CurrentMemoryContext;
|
|
PG_TRY();
|
|
{
|
|
pgxc_node_remote_commit(barrierLockHeld);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
(void)MemoryContextSwitchTo(current_context);
|
|
ErrorData* edata = CopyErrorData();
|
|
FlushErrorState();
|
|
|
|
if (!barrierLockHeld && LWLockHeldByMe(BarrierLock)) {
|
|
/* match the upcoming RESUME_INTERRUPTS */
|
|
HOLD_INTERRUPTS();
|
|
LWLockRelease(BarrierLock);
|
|
}
|
|
|
|
/*
 * remoteXactState.status may be RXACT_COMMIT_FAILED or
 * RXACT_PART_COMMITTED; we set it to RXACT_COMMIT_FAILED here,
 * which is acceptable either way.
 */
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_COMMIT_FAILED;
|
|
ereport(WARNING, (errmsg("Failed during commit prepared transaction: %s", edata->message)));
|
|
}
|
|
PG_END_TRY();
|
|
} else
|
|
pgxc_node_remote_commit(barrierLockHeld);
|
|
|
|
Assert(u_sess->pgxc_cxt.remoteXactState->status == RXACT_COMMITTED ||
|
|
u_sess->pgxc_cxt.remoteXactState->status == RXACT_COMMIT_FAILED ||
|
|
u_sess->pgxc_cxt.remoteXactState->status == RXACT_PART_COMMITTED ||
|
|
u_sess->pgxc_cxt.remoteXactState->status == RXACT_NONE);
|
|
/*
 * We cannot ereport ERROR after the CN has committed in a two-phase transaction.
 */
|
|
if (TwoPhaseCommit)
|
|
START_CRIT_SECTION();
|
|
|
|
clear_RemoteXactState();
|
|
|
|
if (TwoPhaseCommit) {
|
|
END_CRIT_SECTION();
|
|
}
|
|
|
|
/*
|
|
* The transaction is now successfully committed on all the remote nodes.
|
|
* (XXX How about the local node ?). It can now be cleaned up from the GTM
|
|
* as well.
|
|
*
|
|
* During inplace or online upgrade, we should never put the connection
|
|
* back to the pooler, since they may be reused to communicate with CNs
|
|
* having different t_thrd.proc->workingVersionNum values, which may lead to
|
|
* incompatibility.
|
|
*/
|
|
if (!u_sess->attr.attr_common.pooler_cache_connection || isInLargeUpgrade() ||
|
|
(g_instance.attr.attr_common.enable_thread_pool && !u_sess->attr.attr_common.PersistentConnections &&
|
|
(u_sess->proc_cxt.IsInnerMaintenanceTools || IsHAPort(u_sess->proc_cxt.MyProcPort))))
|
|
destroy_handles();
|
|
else if (!u_sess->attr.attr_common.PersistentConnections)
|
|
release_handles();
|
|
}
|
|
|
|
void SubXactCancel_Remote(void)
|
|
{
|
|
cancel_query();
|
|
clear_RemoteXactState();
|
|
}
|
|
|
|
/*
|
|
* Do abort processing for the transaction. We must abort the transaction on
|
|
* all the involved nodes. If a node has already prepared a transaction, we run
|
|
* ROLLBACK PREPARED command on the node. Otherwise, a simple ROLLBACK command
|
|
* is sufficient.
|
|
*
|
|
 * We must guard against the case where a transaction is prepared successfully on
 * all the nodes and some error occurs after we send a COMMIT PREPARED message
 * to at least one node. Such a transaction must not be aborted, to preserve
 * global consistency. We handle this case by recording the nodes involved in
 * the transaction at the GTM and keeping the transaction open at the GTM so that
 * it is reported as "in-progress" on all the nodes until resolved.
|
|
*/
|
|
bool PreAbort_Remote(bool PerfectRollback)
|
|
{
|
|
int has_error = 0;
|
|
// If there is any communication failure during cancel, we just drop the connection.
|
|
if (IS_PGXC_COORDINATOR && !IsConnFromCoord() && !PerfectRollback)
|
|
cancel_query_without_read();
|
|
|
|
if (!t_thrd.xact_cxt.XactLocalNodeCanAbort)
|
|
return false;
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_COMMITTED)
|
|
return false;
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_NONE && !IsNormalProcessingMode())
|
|
return true;
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_PART_COMMITTED)
|
|
return false;
|
|
else {
|
|
/*
 * The transaction is neither partly nor fully committed. We can safely
 * abort such a transaction.
 */
|
|
if (u_sess->pgxc_cxt.remoteXactState->status == RXACT_NONE)
|
|
init_RemoteXactState(false);
|
|
|
|
has_error = pgxc_node_remote_abort();
|
|
}
|
|
|
|
clear_RemoteXactState();
|
|
|
|
// Here we destroy the connection unconditionally, to avoid possible message
|
|
// disorder, which may cause the message of this query be received by the next query.
|
|
if (!PerfectRollback || has_error || !u_sess->attr.attr_common.pooler_cache_connection ||
|
|
isInLargeUpgrade() || (g_instance.attr.attr_common.enable_thread_pool &&
|
|
(u_sess->proc_cxt.IsInnerMaintenanceTools || IsHAPort(u_sess->proc_cxt.MyProcPort))))
|
|
destroy_handles();
|
|
else if (!u_sess->attr.attr_common.PersistentConnections)
|
|
release_handles();
|
|
|
|
return has_error;
|
|
}
|
|
|
|
/* For versions that contain the backend_version parameter, that parameter is
 * used to determine whether the handle is used to update the catalog.
 * For a version that does not contain the backend_version parameter,
 * check the working version number to decide whether the handle is being used
 * during the upgrade process. */
|
|
bool isInLargeUpgrade()
|
|
{
|
|
if (contain_backend_version(t_thrd.proc->workingVersionNum)) {
|
|
return u_sess->attr.attr_common.IsInplaceUpgrade;
|
|
} else {
|
|
return (t_thrd.proc->workingVersionNum != GRAND_VERSION_NUM || u_sess->attr.attr_common.IsInplaceUpgrade);
|
|
}
|
|
}
|
|
|
|
char* PrePrepare_Remote(const char* prepareGID, bool implicit, bool WriteCnLocalNode)
|
|
{
|
|
init_RemoteXactState(false);
|
|
|
|
/*
|
|
* PREPARE the transaction on all nodes including remote nodes as well as
|
|
* local node. Any errors will be reported via ereport and the transaction
|
|
* will be aborted accordingly.
|
|
*/
|
|
(void)pgxc_node_remote_prepare(prepareGID, WriteCnLocalNode);
|
|
|
|
if (u_sess->pgxc_cxt.preparedNodes)
|
|
pfree_ext(u_sess->pgxc_cxt.preparedNodes);
|
|
u_sess->pgxc_cxt.preparedNodes = NULL;
|
|
|
|
if (!implicit)
|
|
u_sess->pgxc_cxt.preparedNodes = pgxc_node_get_nodelist(true);
|
|
|
|
return u_sess->pgxc_cxt.preparedNodes;
|
|
}
|
|
|
|
void PostPrepare_Remote(char* prepareGID, char* nodestring, bool implicit)
|
|
{
|
|
u_sess->pgxc_cxt.remoteXactState->preparedLocalNode = true;
|
|
|
|
/*
|
|
* If this is an explicit PREPARE request by the client, we must also save
|
|
* the list of nodes involved in this transaction on the GTM for later use
|
|
*/
|
|
|
|
/* Now forget the transaction nodes */
|
|
ForgetTransactionNodes();
|
|
}
|
|
|
|
/*
|
|
* Return the list of nodes where the prepared transaction is not yet committed
|
|
*/
|
|
static char* pgxc_node_get_nodelist(bool localNode)
|
|
{
|
|
int i;
|
|
char* nodestring = NULL;
|
|
errno_t rc = EOK;
|
|
|
|
for (i = 0; i < u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes; i++) {
|
|
RemoteXactNodeStatus status = u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i];
|
|
PGXCNodeHandle* conn = u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles[i];
|
|
|
|
if (status != RXACT_NODE_COMMITTED) {
|
|
NameData nodename = {{0}};
|
|
(void)get_pgxc_nodename(conn->nodeoid, &nodename);
|
|
if (nodestring == NULL) {
|
|
nodestring = (char*)MemoryContextAlloc(
|
|
SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_EXECUTOR), strlen(nodename.data) + 1);
|
|
rc = sprintf_s(nodestring, strlen(nodename.data) + 1, "%s", nodename.data);
|
|
securec_check_ss(rc, "", "");
|
|
} else {
|
|
nodestring = (char*)repalloc(nodestring, strlen(nodename.data) + strlen(nodestring) + 2);
|
|
rc = sprintf_s(
|
|
nodestring, strlen(nodename.data) + strlen(nodestring) + 2, "%s,%s", nodestring, nodename.data);
|
|
securec_check_ss(rc, "", "");
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Case of a single Coordinator */
|
|
if (localNode && u_sess->pgxc_cxt.PGXCNodeId >= 0) {
|
|
if (nodestring == NULL) {
|
|
nodestring = (char*)MemoryContextAlloc(SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_EXECUTOR),
|
|
strlen(g_instance.attr.attr_common.PGXCNodeName) + 1);
|
|
rc = sprintf_s(nodestring,
|
|
strlen(g_instance.attr.attr_common.PGXCNodeName) + 1,
|
|
"%s",
|
|
g_instance.attr.attr_common.PGXCNodeName);
|
|
securec_check_ss(rc, "", "");
|
|
} else {
|
|
nodestring =
|
|
(char*)repalloc(nodestring, strlen(g_instance.attr.attr_common.PGXCNodeName) + strlen(nodestring) + 2);
|
|
rc = sprintf_s(nodestring,
|
|
strlen(g_instance.attr.attr_common.PGXCNodeName) + strlen(nodestring) + 2,
|
|
"%s,%s",
|
|
nodestring,
|
|
g_instance.attr.attr_common.PGXCNodeName);
|
|
securec_check_ss(rc, "", "");
|
|
}
|
|
}
|
|
|
|
return nodestring;
|
|
}
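/*
 * Editor's note: the function above grows the result with repalloc and
 * re-formats the whole string each time. For comparison only, a StringInfo
 * based sketch of the same accumulation (this is not the code used above):
 *
 *     StringInfoData buf;
 *     initStringInfo(&buf);
 *     // for every node whose prepared transaction is not yet committed:
 *     if (buf.len > 0)
 *         appendStringInfoChar(&buf, ',');
 *     appendStringInfoString(&buf, nodename.data);
 *     // buf.data now holds e.g. "datanode1,datanode3,coordinator1"
 */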
|
|
|
|
bool IsTwoPhaseCommitRequired(bool localWrite)
|
|
{
|
|
/*
 * Under GTM-free mode, disable 2PC DML when enable_twophase_commit is off.
 * Always allow 2PC commit during upgrade or cluster expansion, or when a
 * replicated table is involved.
 */
|
|
if (g_instance.attr.attr_storage.enable_gtm_free && (u_sess->attr.attr_common.upgrade_mode == 0) &&
|
|
!t_thrd.xact_cxt.MyXactAccessedRepRel && !u_sess->attr.attr_storage.enable_twophase_commit &&
|
|
!u_sess->attr.attr_sql.enable_cluster_resize && !localWrite &&
|
|
(list_length(u_sess->pgxc_cxt.XactWriteNodes) > 1)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
errmodule(MOD_TRANS_XACT),
|
|
errmsg("Unsupport DML two phase commit under gtm free mode."),
|
|
errhint("Set enable_twophase_commit to on if need to use DML two phase commit.")));
|
|
}
|
|
if ((list_length(u_sess->pgxc_cxt.XactWriteNodes) > 1) ||
|
|
((list_length(u_sess->pgxc_cxt.XactWriteNodes) == 1) && localWrite)) {
|
|
// temp table can use 2PC, so just return true here.
|
|
return true;
|
|
} else
|
|
return false;
|
|
}
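/*
 * Editor's worked examples of the rule above (illustration only):
 *
 *     remote write nodes = 2, localWrite = false  -> true  (2PC required)
 *     remote write nodes = 1, localWrite = true   -> true  (2PC required)
 *     remote write nodes = 1, localWrite = false  -> false (plain commit)
 *     remote write nodes = 0, localWrite = true   -> false
 */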
|
|
|
|
static void clear_RemoteXactState(void)
|
|
{
|
|
/* Clear the previous state */
|
|
u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes = 0;
|
|
u_sess->pgxc_cxt.remoteXactState->numReadRemoteNodes = 0;
|
|
u_sess->pgxc_cxt.remoteXactState->status = RXACT_NONE;
|
|
u_sess->pgxc_cxt.remoteXactState->prepareGID[0] = '\0';
|
|
errno_t rc = EOK;
|
|
|
|
if ((u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles == NULL) ||
|
|
(u_sess->pgxc_cxt.remoteXactState->maxRemoteNodes <
|
|
(u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords))) {
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles != NULL)
|
|
free(u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles);
|
|
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles = (PGXCNodeHandle**)malloc(
|
|
sizeof(PGXCNodeHandle*) *
|
|
(g_instance.attr.attr_common.MaxDataNodes + g_instance.attr.attr_network.MaxCoords));
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus != NULL)
|
|
free(u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus);
|
|
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus = (RemoteXactNodeStatus*)malloc(
|
|
sizeof(RemoteXactNodeStatus) *
|
|
(g_instance.attr.attr_common.MaxDataNodes + g_instance.attr.attr_network.MaxCoords));
|
|
|
|
u_sess->pgxc_cxt.remoteXactState->maxRemoteNodes = u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords;
|
|
}
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles != NULL) {
|
|
rc = memset_s(u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles,
|
|
sizeof(PGXCNodeHandle*) * (u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords),
|
|
0,
|
|
sizeof(PGXCNodeHandle*) * (u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords));
|
|
securec_check(rc, "\0", "\0");
|
|
} else
|
|
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
|
|
|
|
if (u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords <= 0)
|
|
return;
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus != NULL) {
|
|
rc = memset_s(u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus,
|
|
sizeof(RemoteXactNodeStatus) * (u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords),
|
|
0,
|
|
sizeof(RemoteXactNodeStatus) * (u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords));
|
|
securec_check(rc, "\0", "\0");
|
|
} else
|
|
ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
|
|
}
|
|
|
|
static void init_RemoteXactState(bool preparedLocalNode)
|
|
{
|
|
int write_conn_count, read_conn_count;
|
|
PGXCNodeHandle** connections = NULL;
|
|
|
|
clear_RemoteXactState();
|
|
|
|
u_sess->pgxc_cxt.remoteXactState->preparedLocalNode = preparedLocalNode;
|
|
connections = u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles;
|
|
|
|
Assert(connections);
|
|
|
|
/*
|
|
* First get information about all the nodes involved in this transaction
|
|
*/
|
|
write_conn_count =
|
|
pgxc_get_transaction_nodes(connections, u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords, true);
|
|
u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes = write_conn_count;
|
|
|
|
read_conn_count = pgxc_get_transaction_nodes(connections + write_conn_count,
|
|
u_sess->pgxc_cxt.NumDataNodes + u_sess->pgxc_cxt.NumCoords - write_conn_count,
|
|
false);
|
|
u_sess->pgxc_cxt.remoteXactState->numReadRemoteNodes = read_conn_count;
|
|
}
|
|
|
|
void free_RemoteXactState(void)
|
|
{
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles != NULL)
|
|
free(u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles);
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles = NULL;
|
|
|
|
if (u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus != NULL)
|
|
free(u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus);
|
|
u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus = NULL;
|
|
}
|
|
|
|
/*
|
|
* pgxc_node_report_error
|
|
* Throw error if any.
|
|
*/
|
|
void pgxc_node_report_error(RemoteQueryState* combiner, int elevel)
|
|
{
|
|
#define REPORT_ERROR ((elevel > 0) ? elevel : ERROR)
|
|
|
|
/* If no combiner, nothing to do */
|
|
if (combiner == NULL)
|
|
return;
|
|
|
|
if (combiner->need_error_check)
|
|
return;
|
|
|
|
if (combiner->errorMessage) {
|
|
// For internal cancel error message, do not bother reporting it.
|
|
//
|
|
if (combiner->errorCode == ERRCODE_QUERY_INTERNAL_CANCEL && t_thrd.xact_cxt.bInAbortTransaction)
|
|
return;
|
|
|
|
char* errMsg = combiner->errorMessage;
|
|
|
|
if (combiner->position == PLAN_ROUTER) {
|
|
StringInfo si = makeStringInfo();
|
|
appendStringInfo(si, "from the Compute Pool: \"%s\"", errMsg);
|
|
errMsg = si->data;
|
|
}
|
|
|
|
if (combiner->errorDetail != NULL && combiner->query != NULL && combiner->errorContext != NULL &&
|
|
combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail),
|
|
errquery("%s", combiner->query),
|
|
errcontext("%s", combiner->errorContext),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->errorDetail != NULL && combiner->query != NULL && combiner->errorContext != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail),
|
|
errquery("%s", combiner->query),
|
|
errcontext("%s", combiner->errorContext)));
|
|
else if (combiner->errorDetail != NULL && combiner->errorContext != NULL && combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail),
|
|
errcontext("%s", combiner->errorContext),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->errorDetail != NULL && combiner->query != NULL && combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail),
|
|
errquery("%s", combiner->query),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->query && combiner->errorContext != NULL && combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errquery("%s", combiner->query),
|
|
errcontext("%s", combiner->errorContext),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->errorDetail != NULL && combiner->query != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail),
|
|
errquery("%s", combiner->query)));
|
|
else if (combiner->errorDetail != NULL && combiner->errorContext != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail),
|
|
errcontext("%s", combiner->errorContext)));
|
|
else if (combiner->errorDetail != NULL && combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->query != NULL && combiner->errorContext != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errquery("%s", combiner->query),
|
|
errcontext("%s", combiner->errorContext)));
|
|
else if (combiner->query != NULL && combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errquery("%s", combiner->query),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->errorContext != NULL && combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errcontext("%s", combiner->errorContext),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->errorDetail != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errdetail("%s", combiner->errorDetail)));
|
|
else if (combiner->errorContext != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errcontext("%s", combiner->errorContext)));
|
|
else if (combiner->hint != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errhint("%s", combiner->hint)));
|
|
else if (combiner->query != NULL)
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos),
|
|
errquery("%s", combiner->query)));
|
|
else
|
|
ereport(REPORT_ERROR,
|
|
(errcode(combiner->errorCode),
|
|
combiner_errdata(&combiner->remoteErrData),
|
|
errmsg("%s", errMsg),
|
|
internalerrposition(combiner->cursorpos)));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* get_success_nodes:
|
|
 * Currently called to print a user-friendly message about
 * which nodes the query failed on.
|
|
* Gets all the nodes where no 'E' (error) messages were received; i.e. where the
|
|
* query ran successfully.
|
|
*/
|
|
static ExecNodes* get_success_nodes(int node_count, PGXCNodeHandle** handles, char node_type, StringInfo failednodes)
|
|
{
|
|
ExecNodes* success_nodes = NULL;
|
|
int i;
|
|
|
|
for (i = 0; i < node_count; i++) {
|
|
PGXCNodeHandle* handle = handles[i];
|
|
int nodenum = PGXCNodeGetNodeId(handle->nodeoid, node_type);
|
|
|
|
if (!handle->error) {
|
|
if (success_nodes == NULL)
|
|
success_nodes = makeNode(ExecNodes);
|
|
success_nodes->nodeList = lappend_int(success_nodes->nodeList, nodenum);
|
|
} else {
|
|
if (failednodes->len == 0)
|
|
appendStringInfo(failednodes, "Error message received from nodes:");
|
|
appendStringInfo(failednodes, " %s", handle->remoteNodeName);
|
|
}
|
|
}
|
|
return success_nodes;
|
|
}
|
|
|
|
void pgxc_all_success_nodes(ExecNodes** d_nodes, ExecNodes** c_nodes, char** failednodes_msg)
|
|
{
|
|
#ifndef ENABLE_MULTIPLE_NODES
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
#else
|
|
PGXCNodeAllHandles* connections = get_exec_connections(NULL, NULL, EXEC_ON_ALL_NODES);
|
|
StringInfoData failednodes;
|
|
initStringInfo(&failednodes);
|
|
|
|
*d_nodes =
|
|
get_success_nodes(connections->dn_conn_count, connections->datanode_handles, PGXC_NODE_DATANODE, &failednodes);
|
|
|
|
*c_nodes =
|
|
get_success_nodes(connections->co_conn_count, connections->coord_handles, PGXC_NODE_COORDINATOR, &failednodes);
|
|
|
|
if (failednodes.len == 0)
|
|
*failednodes_msg = NULL;
|
|
else
|
|
*failednodes_msg = failednodes.data;
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* set_dbcleanup_callback:
|
|
* Register a callback function which does some non-critical cleanup tasks
|
|
* on xact success or abort, such as tablespace/database directory cleanup.
|
|
*/
|
|
void set_dbcleanup_callback(xact_callback function, const void* paraminfo, int paraminfo_size)
|
|
{
|
|
AutoContextSwitch dbcleanupCxt(SESS_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_EXECUTOR));
|
|
|
|
void* fparams = NULL;
|
|
errno_t rc = EOK;
|
|
|
|
abort_callback_type* dbcleanupInfo = (abort_callback_type*)palloc(sizeof(abort_callback_type));
|
|
fparams = palloc(paraminfo_size);
|
|
rc = memcpy_s(fparams, paraminfo_size, paraminfo, paraminfo_size);
|
|
securec_check(rc, "", "");
|
|
|
|
dbcleanupInfo->fparams = fparams;
|
|
dbcleanupInfo->function = function;
|
|
|
|
u_sess->xact_cxt.dbcleanupInfoList = lappend(u_sess->xact_cxt.dbcleanupInfoList, dbcleanupInfo);
|
|
}
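/*
 * Editor's usage sketch for this registration API. The callback and parameter
 * struct below are hypothetical (real callers define their own); the callback
 * signature follows the xact_callback typedef used by this file:
 *
 *     typedef struct { Oid dbOid; Oid tsOid; } example_cleanup_params;
 *
 *     static void example_cleanup(bool isCommit, const void* arg)
 *     {
 *         const example_cleanup_params* p = (const example_cleanup_params*)arg;
 *         // remove the directory created for (p->dbOid, p->tsOid), on commit or abort
 *     }
 *
 *     example_cleanup_params params = { dbOid, tsOid };
 *     set_dbcleanup_callback(example_cleanup, &params, sizeof(params));
 */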
|
|
|
|
/*
|
|
* AtEOXact_DBCleanup: To be called at post-commit or pre-abort.
|
|
* Calls the cleanup function registered during this transaction, if any.
|
|
*/
|
|
void AtEOXact_DBCleanup(bool isCommit)
|
|
{
|
|
bool doFunc = isCommit || t_thrd.xact_cxt.XactLocalNodeCanAbort;
|
|
ListCell *lc = NULL;
|
|
if (u_sess->xact_cxt.dbcleanupInfoList && doFunc) {
|
|
foreach (lc, u_sess->xact_cxt.dbcleanupInfoList) {
|
|
abort_callback_type *dbcleanupInfo = (abort_callback_type*)lfirst(lc);
|
|
if (dbcleanupInfo->function) {
|
|
(*dbcleanupInfo->function)(isCommit, dbcleanupInfo->fparams);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Just reset the callbackinfo. We anyway don't want this to be called again,
|
|
* until explicitly set.
|
|
*/
|
|
foreach (lc, u_sess->xact_cxt.dbcleanupInfoList) {
|
|
abort_callback_type *dbcleanupInfo = (abort_callback_type*)lfirst(lc);
|
|
pfree_ext(dbcleanupInfo->fparams);
|
|
dbcleanupInfo->function = NULL;
|
|
}
|
|
list_free_deep(u_sess->xact_cxt.dbcleanupInfoList);
|
|
u_sess->xact_cxt.dbcleanupInfoList = NIL;
|
|
}
|
|
|
|
/*
|
|
* SetDataRowForIntParams: Form a BIND data row for internal parameters.
|
|
* This function is called when the data for the parameters of remote
|
|
* statement resides in some plan slot of an internally generated remote
|
|
* statement rather than from some extern params supplied by the caller of the
|
|
* query. Currently DML is the only case where we generate a query with
|
|
* internal parameters.
|
|
* The parameter data is constructed from the slot data, and stored in
|
|
* RemoteQueryState.paramval_data.
|
|
* At the same time, remote parameter types are inferred from the slot
|
|
* tuple descriptor, and stored in RemoteQueryState.rqs_param_types.
|
|
* On subsequent calls, these param types are re-used.
|
|
* The data to be BOUND consists of table column data to be inserted/updated
|
|
* and the ctid/nodeid values to be supplied for the WHERE clause of the
|
|
* query. The data values are present in dataSlot whereas the ctid/nodeid
|
|
* are available in sourceSlot as junk attributes.
|
|
* For DELETEs, the dataSlot is NULL.
|
|
* sourceSlot is used only to retrieve ctid/nodeid, so it does not get
|
|
* used for INSERTs, although it will never be NULL.
|
|
* The slots themselves are undisturbed.
|
|
*/
|
|
static void SetDataRowForIntParams(
|
|
JunkFilter* junkfilter, TupleTableSlot* sourceSlot, TupleTableSlot* dataSlot, RemoteQueryState* rq_state)
|
|
{
|
|
StringInfoData buf;
|
|
uint16 numparams = 0;
|
|
RemoteQuery* step = (RemoteQuery*)rq_state->ss.ps.plan;
|
|
errno_t rc = EOK;
|
|
|
|
Assert(sourceSlot);
|
|
|
|
/* Calculate the total number of parameters */
|
|
if (dataSlot != NULL)
|
|
numparams = dataSlot->tts_tupleDescriptor->natts;
|
|
/* Add number of junk attributes */
|
|
if (junkfilter != NULL) {
|
|
if (junkfilter->jf_primary_keys != NIL)
|
|
numparams += list_length(junkfilter->jf_primary_keys);
|
|
if (junkfilter->jf_junkAttNo)
|
|
numparams++;
|
|
if (junkfilter->jf_xc_node_id)
|
|
numparams++;
|
|
if (junkfilter->jf_xc_part_id)
|
|
numparams++;
|
|
}
|
|
|
|
/*
|
|
* Infer param types from the slot tupledesc and junk attributes. But we
|
|
 * have to do it only the first time: the internal parameters remain the same
|
|
* while processing all the source data rows because the data slot tupdesc
|
|
* never changes. Even though we can determine the internal param types
|
|
* during planning, we want to do it here: we don't want to set the param
|
|
* types and param data at two different places. Doing them together here
|
|
* helps us to make sure that the param types are in sync with the param
|
|
* data.
|
|
*/
|
|
|
|
/*
|
|
* We know the numparams, now initialize the param types if not already
|
|
* done. Once set, this will be re-used for each source data row.
|
|
*/
|
|
if (rq_state->rqs_num_params == 0) {
|
|
int attindex = 0;
|
|
TupleDesc tdesc;
|
|
|
|
rq_state->rqs_num_params = numparams;
|
|
rq_state->rqs_param_types = (Oid*)palloc(sizeof(Oid) * rq_state->rqs_num_params);
|
|
|
|
if (dataSlot != NULL) /* We have table attributes to bind */
|
|
{
|
|
tdesc = dataSlot->tts_tupleDescriptor;
|
|
int numatts = tdesc->natts;
|
|
for (attindex = 0; attindex < numatts; attindex++) {
|
|
rq_state->rqs_param_types[attindex] = tdesc->attrs[attindex].atttypid;
|
|
|
|
/* For unknown param type(maybe a const), we need to convert it to text */
|
|
if (tdesc->attrs[attindex].atttypid == UNKNOWNOID) {
|
|
rq_state->rqs_param_types[attindex] = TEXTOID;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (junkfilter != NULL) /* Param types for specific junk attributes if present */
|
|
{
|
|
ListCell* lc = NULL;
|
|
|
|
foreach (lc, junkfilter->jf_primary_keys) {
|
|
Var* var = (Var*)lfirst(lc);
|
|
Assert(IsA(var, Var));
|
|
|
|
rq_state->rqs_param_types[attindex++] = var->vartype;
|
|
}
|
|
|
|
/* jf_junkAttNo always contains ctid */
|
|
if (AttributeNumberIsValid(junkfilter->jf_junkAttNo))
|
|
rq_state->rqs_param_types[attindex++] = TIDOID;
|
|
|
|
if (AttributeNumberIsValid(junkfilter->jf_xc_node_id))
|
|
rq_state->rqs_param_types[attindex++] = INT4OID;
|
|
|
|
if (AttributeNumberIsValid(junkfilter->jf_xc_part_id))
|
|
rq_state->rqs_param_types[attindex++] = OIDOID;
|
|
}
|
|
} else {
|
|
Assert(rq_state->rqs_num_params == numparams);
|
|
}
|
|
|
|
/*
|
|
* If we already have the data row, just copy that, and we are done. One
|
|
* scenario where we can have the data row is for INSERT ... SELECT.
|
|
* Effectively, in this case, we just re-use the data row from SELECT as-is
|
|
* for BIND row of INSERT. But just make sure all of the data required to
|
|
* bind is available in the slot. If there are junk attributes to be added
|
|
* in the BIND row, we cannot re-use the data row as-is.
|
|
*/
|
|
if (junkfilter == NULL && dataSlot != NULL && dataSlot->tts_dataRow != NULL) {
|
|
if (rq_state->paramval_data != NULL) {
|
|
pfree_ext(rq_state->paramval_data);
|
|
rq_state->paramval_data = NULL;
|
|
}
|
|
rq_state->paramval_data = (char*)palloc(dataSlot->tts_dataLen);
|
|
rc = memcpy_s(rq_state->paramval_data, dataSlot->tts_dataLen, dataSlot->tts_dataRow, dataSlot->tts_dataLen);
|
|
securec_check(rc, "", "");
|
|
rq_state->paramval_len = dataSlot->tts_dataLen;
|
|
return;
|
|
}
|
|
|
|
initStringInfo(&buf);
|
|
|
|
{
|
|
uint16 params_nbo = htons(numparams); /* Network byte order */
|
|
appendBinaryStringInfo(&buf, (char*)¶ms_nbo, sizeof(params_nbo));
|
|
}
|
|
|
|
/*
|
|
* The data attributes would not be present for DELETE. In such case,
|
|
* dataSlot will be NULL.
|
|
*/
|
|
if (dataSlot != NULL) {
|
|
TupleDesc tdesc = dataSlot->tts_tupleDescriptor;
|
|
int attindex;
|
|
|
|
/* Append the data attributes */
|
|
|
|
/* ensure we have all values */
|
|
tableam_tslot_getallattrs(dataSlot);
|
|
for (attindex = 0; attindex < tdesc->natts; attindex++) {
|
|
uint32 n32;
|
|
Assert(attindex < numparams);
|
|
|
|
if (dataSlot->tts_isnull[attindex]) {
|
|
n32 = htonl(~0);
|
|
appendBinaryStringInfo(&buf, (char*)&n32, 4);
|
|
} else
|
|
/* It should switch memctx to ExprContext for makenode in ExecInitExpr */
|
|
pgxc_append_param_val(&buf, dataSlot->tts_values[attindex], tdesc->attrs[attindex].atttypid);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* From the source data, fetch the junk attribute values to be appended in
|
|
* the end of the data buffer. The junk attribute vals like ctid and
|
|
* xc_node_id are used in the WHERE clause parameters.
|
|
* These attributes would not be present for INSERT.
|
|
*
|
|
 * For a MERGE INTO query there are junk filters, but we do not need them for INSERT
|
|
*/
|
|
if (junkfilter != NULL) {
|
|
ListCell* lc = NULL;
|
|
int attindex = junkfilter->jf_junkAttNo - list_length(junkfilter->jf_primary_keys);
|
|
bool allow_dummy_junkfilter = step->remote_query->commandType == CMD_INSERT;
|
|
|
|
foreach (lc, junkfilter->jf_primary_keys) {
|
|
Var* var = (Var*)lfirst(lc);
|
|
|
|
Assert(IsA(var, Var));
|
|
pgxc_append_param_junkval(sourceSlot, attindex++, var->vartype, &buf, allow_dummy_junkfilter);
|
|
}
|
|
|
|
/* First one - jf_junkAttNo - always represents ctid */
|
|
pgxc_append_param_junkval(sourceSlot, junkfilter->jf_junkAttNo, TIDOID, &buf, allow_dummy_junkfilter);
|
|
pgxc_append_param_junkval(sourceSlot, junkfilter->jf_xc_node_id, INT4OID, &buf, allow_dummy_junkfilter);
|
|
pgxc_append_param_junkval(sourceSlot, junkfilter->jf_xc_part_id, OIDOID, &buf, allow_dummy_junkfilter);
|
|
}
|
|
|
|
/* Assign the newly allocated data row to paramval */
|
|
if (rq_state->paramval_data != NULL) {
|
|
pfree_ext(rq_state->paramval_data);
|
|
rq_state->paramval_data = NULL;
|
|
}
|
|
rq_state->paramval_data = buf.data;
|
|
rq_state->paramval_len = buf.len;
|
|
}
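/*
 * Editor's note: the data row built above follows the libpq text-format BIND
 * layout: a uint16 parameter count in network byte order, then for each
 * parameter a 4-byte network-order length (0xFFFFFFFF, i.e. -1, marks NULL)
 * followed by that many bytes of the value's text form. A self-contained
 * sketch of the same encoding (hypothetical helper, not used by the code above):
 *
 *     static void append_text_param(StringInfo buf, const char* value)
 *     {
 *         uint32 n32;
 *         if (value == NULL) {
 *             n32 = htonl(~0);                          // -1 length marks NULL
 *             appendBinaryStringInfo(buf, (char*)&n32, 4);
 *         } else {
 *             int len = (int)strlen(value);
 *             n32 = htonl((uint32)len);                 // byte length of the text form
 *             appendBinaryStringInfo(buf, (char*)&n32, 4);
 *             appendBinaryStringInfo(buf, value, len);
 *         }
 *     }
 *
 *     // building a two-parameter row, e.g. the value '42' and ctid '(0,1)':
 *     uint16 nparams = htons(2);
 *     appendBinaryStringInfo(&buf, (char*)&nparams, sizeof(nparams));
 *     append_text_param(&buf, "42");
 *     append_text_param(&buf, "(0,1)");
 */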
|
|
|
|
/*
|
|
* pgxc_append_param_junkval:
|
|
 * Append into the data row the parameter whose value corresponds to the junk
|
|
* attributes in the source slot, namely ctid or node_id.
|
|
*/
|
|
static void pgxc_append_param_junkval(
|
|
TupleTableSlot* slot, AttrNumber attno, Oid valtype, StringInfo buf, bool allow_dummy_junkfilter)
|
|
{
|
|
bool isNull = false;
|
|
|
|
if (slot != NULL && attno != InvalidAttrNumber) {
|
|
/* Junk attribute positions are saved by ExecFindJunkAttribute() */
|
|
Datum val = ExecGetJunkAttribute(slot, attno, &isNull);
|
|
|
|
/* shouldn't ever get a null result... */
|
|
if (isNull && !allow_dummy_junkfilter)
|
|
ereport(ERROR, (errcode(ERRCODE_NULL_JUNK_ATTRIBUTE), errmsg("NULL junk attribute")));
|
|
|
|
/* for a MERGE INTO query, we allow a NULL junk attribute value */
|
|
if (isNull && allow_dummy_junkfilter) {
|
|
char* pstring = NULL;
|
|
|
|
if (valtype == TIDOID) {
|
|
pstring = "(0,0)";
|
|
int len = strlen(pstring);
|
|
uint32 n32 = htonl(len);
|
|
|
|
appendBinaryStringInfo(buf, (char*)&n32, 4);
|
|
appendBinaryStringInfo(buf, pstring, len);
|
|
} else {
|
|
pstring = "0";
|
|
int len = strlen(pstring);
|
|
uint32 n32 = htonl(len);
|
|
|
|
appendBinaryStringInfo(buf, (char*)&n32, 4);
|
|
appendBinaryStringInfo(buf, pstring, len);
|
|
}
|
|
}
|
|
|
|
if (!isNull)
|
|
pgxc_append_param_val(buf, val, valtype);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pgxc_append_param_val:
|
|
* Append the parameter value for the SET clauses of the UPDATE statement.
|
|
* These values are the table attribute values from the dataSlot.
|
|
*/
|
|
static void pgxc_append_param_val(StringInfo buf, Datum val, Oid valtype)
|
|
{
|
|
/* Convert Datum to string */
|
|
char* pstring = NULL;
|
|
int len;
|
|
uint32 n32;
|
|
Oid typOutput;
|
|
bool typIsVarlena = false;
|
|
Datum newval = 0;
|
|
|
|
/* Get info needed to output the value */
|
|
getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
|
|
/*
|
|
* If we have a toasted datum, forcibly detoast it here to avoid
|
|
* memory leakage inside the type's output routine.
|
|
*/
|
|
if (typIsVarlena)
|
|
newval = PointerGetDatum(PG_DETOAST_DATUM(val));
|
|
|
|
pstring = OidOutputFunctionCall(typOutput, ((typIsVarlena == true) ? newval : val));
|
|
|
|
/* copy data to the buffer */
|
|
len = strlen(pstring);
|
|
n32 = htonl(len);
|
|
appendBinaryStringInfo(buf, (char*)&n32, 4);
|
|
appendBinaryStringInfo(buf, pstring, len);
|
|
|
|
/*
|
|
* pstring should never be null according to input param 'val'.
|
|
* This function is only invoked when val is not null.
|
|
*/
|
|
pfree_ext(pstring);
|
|
if (typIsVarlena && (val != newval))
|
|
pfree(DatumGetPointer(newval));
|
|
}
|
|
|
|
/*
|
|
* pgxc_rq_fire_bstriggers:
|
|
* BEFORE STATEMENT triggers to be fired for a user-supplied DML query.
|
|
* For non-FQS query, we internally generate remote DML query to be executed
|
|
* for each row to be processed. But we do not want to explicitly fire triggers
|
|
* for such a query; ExecModifyTable does that for us. It is the FQS DML query
|
|
* where we need to explicitly fire statement triggers on coordinator. We
|
|
* cannot run stmt triggers on datanode. While we can fire stmt trigger on
|
|
* datanode versus coordinator based on the function shippability, we cannot
|
|
* do the same for FQS query. The datanode has no knowledge that the trigger
|
|
* being fired is due to a non-FQS query or an FQS query. Even though it can
|
|
* find that all the triggers are shippable, it won't know whether the stmt
|
|
* itself has been FQSed. Even though all triggers were shippable, the stmt
|
|
* might have been planned on coordinator due to some other non-shippable
|
|
* clauses. So the idea here is to *always* fire stmt triggers on coordinator.
|
|
* Note that this does not prevent the query itself from being FQSed. This is
|
|
* because we separately fire stmt triggers on coordinator.
|
|
*/
|
|
static void pgxc_rq_fire_bstriggers(RemoteQueryState* node)
|
|
{
|
|
RemoteQuery* rq = (RemoteQuery*)node->ss.ps.plan;
|
|
EState* estate = node->ss.ps.state;
|
|
|
|
/* If it's not an internally generated query, fire BS triggers */
|
|
if (!rq->rq_params_internal && estate->es_result_relations) {
|
|
Assert(rq->remote_query);
|
|
switch (rq->remote_query->commandType) {
|
|
case CMD_INSERT:
|
|
ExecBSInsertTriggers(estate, estate->es_result_relations);
|
|
break;
|
|
case CMD_UPDATE:
|
|
ExecBSUpdateTriggers(estate, estate->es_result_relations);
|
|
break;
|
|
case CMD_DELETE:
|
|
ExecBSDeleteTriggers(estate, estate->es_result_relations);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* pgxc_rq_fire_astriggers:
|
|
* AFTER STATEMENT triggers to be fired for a user-supplied DML query.
|
|
 * See comments in pgxc_rq_fire_bstriggers()
|
|
*/
|
|
static void pgxc_rq_fire_astriggers(RemoteQueryState* node)
|
|
{
|
|
RemoteQuery* rq = (RemoteQuery*)node->ss.ps.plan;
|
|
EState* estate = node->ss.ps.state;
|
|
|
|
/* If it's not an internally generated query, fire AS triggers */
|
|
if (!rq->rq_params_internal && estate->es_result_relations) {
|
|
Assert(rq->remote_query);
|
|
switch (rq->remote_query->commandType) {
|
|
case CMD_INSERT:
|
|
ExecASInsertTriggers(estate, estate->es_result_relations);
|
|
break;
|
|
case CMD_UPDATE:
|
|
ExecASUpdateTriggers(estate, estate->es_result_relations);
|
|
break;
|
|
case CMD_DELETE:
|
|
ExecASDeleteTriggers(estate, estate->es_result_relations);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
bool IsInheritor(Oid relid)
|
|
{
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("10655")));
|
|
Relation pginherits;
|
|
SysScanDesc scan;
|
|
ScanKeyData key[1];
|
|
bool ret = false;
|
|
|
|
if (InvalidOid == relid)
|
|
return ret;
|
|
|
|
pginherits = heap_open(InheritsRelationId, AccessShareLock);
|
|
ScanKeyInit(&key[0], Anum_pg_inherits_inhrelid, BTEqualStrategyNumber, F_OIDEQ, relid);
|
|
scan = systable_beginscan(pginherits, InheritsRelidSeqnoIndexId, true, NULL, 1, key);
|
|
if (HeapTupleIsValid(systable_getnext(scan))) {
|
|
ret = true;
|
|
}
|
|
systable_endscan(scan);
|
|
heap_close(pginherits, AccessShareLock);
|
|
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Description: Construct query string for fetch statistics from system table of data nodes.
|
|
*
|
|
* Parameters:
|
|
 * @in stmt: the statement for the analyze or vacuum command
|
|
* @in schemaname: the schema name of the relation for analyze or vacuum
|
|
* @in relname: the relname for analyze or vacuum command
|
|
* @in va_cols: the columns of the relation for analyze or vacuum
|
|
* @in kind: which type of statistic we will get from DN, pg_class/pg_statistic/pg_partition
|
|
* @in relid: relation oid for analyze command
|
|
* @in parentRel: the parent relation's stmt for delta table, this is NULL for non delta table
|
|
*
|
|
* @return: void
|
|
*/
|
|
static char* construct_fetch_statistics_query(const char* schemaname, const char* relname, List* va_cols,
|
|
StatisticKind kind, VacuumStmt* stmt, Oid relid, RangeVar* parentRel)
|
|
{
|
|
/* there are no tuples in pg_class for a complex table, but it does have pg_statistic entries. */
|
|
if (stmt && IS_PGXC_COORDINATOR && !IsConnFromCoord() && (StatisticPageAndTuple == kind)) {
|
|
return NULL;
|
|
}
|
|
|
|
#define relLiteral(a) repairObjectName(a)
|
|
#define nspLiteral(a) (strcasecmp(a, "pg_temp") ? repairObjectName(a) : repairTempNamespaceName(a))
|
|
|
|
char* tablename = (char*)relname;
|
|
char* nspname = (char*)schemaname;
|
|
Oid namespaceId = LookupNamespaceNoError(schemaname);
|
|
|
|
StringInfo query = makeStringInfo();
|
|
|
|
/* set GUC parameters to force index scans on all tables */
|
|
appendStringInfo(query, "set enable_seqscan = off;set enable_index_nestloop = on;set enable_indexscan = on;");
|
|
|
|
switch (kind) {
|
|
case StatisticPageAndTuple:
|
|
/* Check for a delta table. If it is a delta table, we need to look it up in
 * pg_class through its parent relation when building the query.
 */
|
|
if (parentRel && IsCStoreNamespace(namespaceId) &&
|
|
pg_strncasecmp(relname, "pg_delta", strlen("pg_delta")) == 0) {
|
|
appendStringInfo(query,
|
|
"select /*+ nestloop(p g) nestloop(p c) nestloop(c n) indexscan(g pg_namespace_nspname_index) "
|
|
"indexscan(n pg_namespace_nspname_index) indexscan(c pg_class_oid_index) indexscan(p "
|
|
"pg_class_relname_nsp_index)*/ c.relpages,c.reltuples,c.relallvisible,c.relhasindex from pg_class "
|
|
"c "
|
|
"join pg_namespace n on n.oid=c.relnamespace and n.nspname='%s' "
|
|
"inner join pg_class p on c.oid=p.reldeltarelid and p.relname ='%s' "
|
|
"inner join pg_namespace g on p.relnamespace= g.oid and g.nspname = '%s';",
|
|
nspLiteral(nspname),
|
|
relLiteral(parentRel->relname),
|
|
nspLiteral(parentRel->schemaname));
|
|
} else {
|
|
appendStringInfo(query,
|
|
"select /*+ nestloop(c n) indexscan(n pg_namespace_nspname_index) indexscan(c "
|
|
"pg_class_relname_nsp_index)*/ relpages,reltuples,relallvisible,relhasindex from pg_class c "
|
|
"join pg_namespace n on n.oid=c.relnamespace where c.relname='%s' and n.nspname='%s';",
|
|
relLiteral(tablename),
|
|
nspLiteral(nspname));
|
|
}
|
|
break;
|
|
|
|
case StatisticHistogram:
|
|
case StatisticMultiHistogram: {
|
|
const char* pgstat_name = (kind == StatisticHistogram) ? "pg_statistic" : "pg_statistic_ext";
|
|
|
|
if (parentRel && IsCStoreNamespace(namespaceId) &&
|
|
pg_strncasecmp(relname, "pg_delta", strlen("pg_delta")) == 0) {
|
|
appendStringInfo(query,
|
|
"select /*+ nestloop(p g) nestloop(p c) nestloop(n c) nestloop(s c) indexscan(p "
|
|
"pg_class_relname_nsp_index) "
|
|
"indexscan(c pg_class_oid_index) indexscan(s %s_relid_kind_att_inh_index) indexscan(n "
|
|
"pg_namespace_oid_index) indexscan(g pg_namespace_nspname_index) */ "
|
|
"s.* from %s s join pg_class c on s.starelid=c.oid "
|
|
"join pg_namespace n on n.oid=c.relnamespace and n.nspname='%s' and s.stainherit=false "
|
|
"inner join pg_class p on c.oid = p.reldeltarelid and p.relname ='%s' "
|
|
"inner join pg_namespace g on g.oid=p.relnamespace and g.nspname = '%s' ",
|
|
pgstat_name,
|
|
pgstat_name,
|
|
nspLiteral(nspname),
|
|
relLiteral(parentRel->relname),
|
|
nspLiteral(parentRel->schemaname));
|
|
} else {
|
|
appendStringInfo(query,
|
|
"select /*+ nestloop(s c) nestloop(n c) indexscan(c pg_class_relname_nsp_index) indexscan(n "
|
|
"pg_namespace_nspname_index) indexscan(s %s_relid_kind_att_inh_index)*/ "
|
|
"s.* from %s s join pg_class c on s.starelid=c.oid "
|
|
"join pg_namespace n on n.oid=c.relnamespace where relname='%s' and "
|
|
"n.nspname='%s' ",
|
|
pgstat_name,
|
|
pgstat_name,
|
|
relLiteral(tablename),
|
|
nspLiteral(nspname));
|
|
}
|
|
|
|
/* stainherit is false when selecting pg_statistic rows for a DFS or delta table, and true for a complex table. */
|
|
if (stmt && IS_PGXC_COORDINATOR && !IsConnFromCoord() &&
|
|
(stmt->pstGlobalStatEx[stmt->tableidx].eAnalyzeMode != ANALYZENORMAL)) {
|
|
switch (stmt->pstGlobalStatEx[stmt->tableidx].eAnalyzeMode) {
|
|
case ANALYZEDELTA:
|
|
appendStringInfoString(query, "and s.stainherit=false ");
|
|
break;
|
|
default:
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if (list_length(va_cols) > 0) {
|
|
Relation rel = relation_open(relid, ShareUpdateExclusiveLock);
|
|
if (kind == StatisticHistogram) {
|
|
Bitmapset* bms_single_cols = NULL;
|
|
StringInfo single_cols = makeStringInfo();
|
|
|
|
ListCell* col = NULL;
|
|
foreach (col, va_cols) {
|
|
Node* col_node = (Node*)lfirst(col);
|
|
|
|
Assert(IsA(col_node, String));
|
|
|
|
int attnum = attnameAttNum(rel, strVal(col_node), false);
|
|
|
|
if (attnum == InvalidAttrNumber || bms_is_member(attnum, bms_single_cols))
|
|
continue;
|
|
|
|
bms_single_cols = bms_add_member(bms_single_cols, attnum);
|
|
|
|
if (0 < single_cols->len)
|
|
appendStringInfoString(single_cols, ",");
|
|
|
|
appendStringInfo(single_cols, "%d", attnum);
|
|
}
|
|
if (single_cols->len > 0)
|
|
appendStringInfo(query, " and s.staattnum in (%s) order by staattnum", single_cols->data);
|
|
bms_free(bms_single_cols);
|
|
pfree(single_cols->data);
|
|
} else if (kind == StatisticMultiHistogram) {
|
|
List* bmslist_multi_cols = NIL;
|
|
StringInfo ext_info = makeStringInfo();
|
|
Bitmapset* bms_multi_attnum = NULL;
|
|
|
|
ListCell* col = NULL;
|
|
foreach (col, va_cols) {
|
|
Node* col_node = (Node*)lfirst(col);
|
|
|
|
Assert(IsA(col_node, List));
|
|
|
|
if (!RelationIsRelation(rel))
|
|
continue;
|
|
|
|
ListCell* lc = NULL;
|
|
foreach (lc, (List*)col_node) {
|
|
Node* m_attname = (Node*)lfirst(lc);
|
|
int attnum = -1;
|
|
|
|
Assert(IsA(m_attname, String));
|
|
attnum = attnameAttNum(rel, strVal(m_attname), false);
|
|
bms_multi_attnum = bms_add_member(bms_multi_attnum, attnum);
|
|
}
|
|
|
|
int ori_size = list_length(bmslist_multi_cols);
|
|
bmslist_multi_cols = es_attnum_bmslist_add_unique_item(bmslist_multi_cols, bms_multi_attnum);
|
|
|
|
if (ori_size == list_length(bmslist_multi_cols))
|
|
continue;
|
|
|
|
int2 attnum = -1;
|
|
StringInfoData ext_info_str;
|
|
{
|
|
initStringInfo(&ext_info_str);
|
|
appendStringInfo(&ext_info_str, "'");
|
|
|
|
for (int i = 0; (attnum = bms_next_member(bms_multi_attnum, attnum)) > 0; ++i) {
|
|
if (i != 0) {
|
|
appendStringInfo(&ext_info_str, " ");
|
|
}
|
|
appendStringInfo(&ext_info_str, " %d", attnum);
|
|
}
|
|
|
|
appendStringInfo(&ext_info_str, "'::int2vector");
|
|
}
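/*
 * Illustrative note: for a multi-column statistics group whose attribute numbers
 * are 1 and 3, ext_info_str now holds roughly ' 1  3'::int2vector (the extra
 * whitespace is accepted by the int2vector input routine); the literal is later
 * matched against s.stakey in the generated query.
 */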
|
|
|
|
if (ext_info->len > 0)
|
|
appendStringInfoString(ext_info, ",");
|
|
|
|
appendStringInfo(ext_info, "%s", ext_info_str.data);
|
|
}
|
|
if (ext_info->len > 0)
|
|
appendStringInfo(query, " and s.stakey in (%s) order by stakey", ext_info->data);
|
|
list_free_deep(bmslist_multi_cols);
|
|
pfree(ext_info->data);
|
|
}
|
|
relation_close(rel, ShareUpdateExclusiveLock);
|
|
}
|
|
appendStringInfoString(query, ";");
|
|
} break;
|
|
|
|
case StatisticPartitionPageAndTuple:
|
|
appendStringInfo(query,
|
|
"select /*+ nestloop(c p) nestloop(n c) indexscan(c pg_class_relname_nsp_index) indexscan(p "
|
|
"pg_partition_partoid_index) indexscan(n pg_namespace_nspname_index)*/ "
|
|
"p.relname,p.parttype,p.relpages,p.reltuples,p.relallvisible from "
|
|
"pg_partition p join pg_class c on c.oid=p.parentid join pg_namespace n "
|
|
"on n.oid=c.relnamespace where (p.parttype='p' or p.parttype='x') and "
|
|
"c.relname='%s' and n.nspname='%s';",
|
|
relLiteral(tablename),
|
|
nspLiteral(nspname));
|
|
break;
|
|
|
|
default:
|
|
return NULL;
|
|
}
|
|
|
|
elog(DEBUG1, "Fetch statistics from 1st datanode with query:%s", query->data);
|
|
|
|
return query->data;
|
|
}
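/*
 * Illustrative example (not emitted verbatim; the planner hints are elided here
 * because they would close this comment): for kind = StatisticPageAndTuple on a
 * plain table "public"."t1", the query built above is roughly
 *
 *   set enable_seqscan = off;set enable_index_nestloop = on;set enable_indexscan = on;
 *   select ... relpages,reltuples,relallvisible,relhasindex from pg_class c
 *   join pg_namespace n on n.oid=c.relnamespace
 *   where c.relname='t1' and n.nspname='public';
 *
 * with "t1" and "public" standing in for the relname/nspname arguments.
 */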
|
|
|
|
/*
|
|
 * These statistics come from the system tables of the data nodes.
 * We can't simply run a normal "select", because the optimizer would intercept it.
 * So we send the query directly here, skipping the long pipeline on the coordinator,
 * including parse, analyze, rewrite, optimize and execute.
 * @in stmt - the statement for the analyze or vacuum command
 * @in schemaname - the schema name of the relation for analyze or vacuum
 * @in relname - the relname for the analyze or vacuum command
 * @in va_cols - the columns of the relation for analyze or vacuum
 * @in kind - which type of statistic we will get from the DN: pg_class/pg_statistic/pg_partition
 * @in reltuples - estimated reltuples for an HDFS foreign table under global stats
 * @in isReplication - true if the relation is a replication table; used to decide whether dirty data from datanode1 matters
 * @in parentRel - the parent relation's stmt for a delta table; NULL for a non-delta table
 * @return: void
|
|
*/
|
|
static void FetchStatisticsInternal(const char* schemaname, const char* relname, List* va_cols, StatisticKind kind,
|
|
RangeVar* parentRel, VacuumStmt* stmt, bool isReplication)
|
|
{
|
|
List* nodeList = NIL;
|
|
int dn_conn_count, i;
|
|
PGXCNodeAllHandles* pgxc_handles = NULL;
|
|
PGXCNodeHandle** pgxc_connections = NULL;
|
|
RemoteQueryState* remotestate = NULL;
|
|
TupleTableSlot* scanslot = NULL;
|
|
char* query_string = NULL;
|
|
GlobalTransactionId gxid = GetCurrentTransactionId();
|
|
Snapshot snapshot = GetActiveSnapshot();
|
|
Oid namespaceId = LookupNamespaceNoError(schemaname);
|
|
Oid relid = get_relname_relid(relname, namespaceId);
|
|
|
|
/* Construct query string for fetch statistics from system table of data nodes. */
|
|
query_string = construct_fetch_statistics_query(schemaname, relname, va_cols, kind, stmt, relid, parentRel);
|
|
if (query_string == NULL) {
|
|
return;
|
|
}
|
|
|
|
if (!stmt->isForeignTables) /* get global stats from dn1 for replication table. */
|
|
{
|
|
ExecNodes* nodes = RelidGetExecNodes(relid, false);
|
|
nodeList = lappend_int(NIL, linitial_int(nodes->nodeList));
|
|
} else /* @hdfs foreign table: fetch from the nodeNo Data Node */
|
|
{
|
|
/*
|
|
 * We should use nodeNo to identify the DN node when the local CN gets global stats for replication;
 * otherwise, we should use orgCnNodeNo to identify the original CN node
 * when another CN gets global stats from the original CN.
|
|
*/
|
|
nodeList = !IsConnFromCoord() ? lappend_int(nodeList, stmt->nodeNo) : lappend_int(nodeList, stmt->orgCnNodeNo);
|
|
}
|
|
|
|
pgxc_handles = get_handles(nodeList, NULL, false);
|
|
pgxc_connections = pgxc_handles->datanode_handles;
|
|
dn_conn_count = pgxc_handles->dn_conn_count;
|
|
if (pgxc_connections == NULL)
|
|
return;
|
|
|
|
if (pgxc_node_begin(dn_conn_count, pgxc_connections, gxid, false, false, PGXC_NODE_DATANODE))
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Could not begin transaction on Datanodes")));
|
|
for (i = 0; i < dn_conn_count; i++) {
|
|
if (pgxc_connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(pgxc_connections[i]);
|
|
|
|
if (snapshot && pgxc_node_send_snapshot(pgxc_connections[i], snapshot)) {
|
|
if (i < --dn_conn_count) {
|
|
pgxc_connections[i] = pgxc_connections[dn_conn_count];
|
|
i--;
|
|
}
|
|
continue;
|
|
}
|
|
if (pgxc_node_send_queryid(pgxc_connections[i], u_sess->debug_query_id) != 0) {
|
|
add_error_message(pgxc_connections[i], "%s", "Can not send query ID");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree_ext(query_string);
|
|
return;
|
|
}
|
|
if (pgxc_node_send_query(pgxc_connections[i], query_string) != 0) {
|
|
add_error_message(pgxc_connections[i], "%s", "Can not send request");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree_ext(query_string);
|
|
return;
|
|
}
|
|
}
|
|
pfree_ext(query_string);
|
|
|
|
if (dn_conn_count == 0) {
|
|
return;
|
|
}
|
|
|
|
if (pgxc_node_receive(dn_conn_count, pgxc_connections, NULL)) {
|
|
int error_code;
|
|
char* error_msg = getSocketError(&error_code);
|
|
|
|
pfree_ext(pgxc_connections);
|
|
|
|
ereport(ERROR, (errcode(error_code), errmsg("Failed to read response from Datanodes. Detail: %s\n", error_msg)));
|
|
}
|
|
|
|
// For now, assume the statistics are the same on every datanode.
|
|
//
|
|
remotestate = CreateResponseCombiner(0, COMBINE_TYPE_SAME);
|
|
remotestate->request_type = REQUEST_TYPE_QUERY;
|
|
scanslot = remotestate->ss.ss_ScanTupleSlot;
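/*
 * Response-draining loop: handle_response() yields RESPONSE_TUPDESC once for
 * the result descriptor, RESPONSE_DATAROW for each statistics row (dispatched
 * below by 'kind'), RESPONSE_COMPLETE when a connection has finished (it is
 * then dropped from the working set), and RESPONSE_EOF when more bytes must
 * first be read from that connection.
 */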
|
|
|
|
while (dn_conn_count > 0) {
|
|
i = 0;
|
|
|
|
while (i < dn_conn_count) {
|
|
int res = handle_response(pgxc_connections[i], remotestate);
|
|
|
|
if (res == RESPONSE_TUPDESC) {
|
|
if (scanslot == NULL)
|
|
scanslot = MakeSingleTupleTableSlot(remotestate->tuple_desc);
|
|
else
|
|
ExecSetSlotDescriptor(scanslot, remotestate->tuple_desc);
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
FetchTuple(remotestate, scanslot);
|
|
tableam_tslot_getallattrs(scanslot);
|
|
switch (kind) {
|
|
case StatisticPageAndTuple:
|
|
ReceivePageAndTuple(relid, scanslot, stmt);
|
|
break;
|
|
case StatisticHistogram:
|
|
ReceiveHistogram(relid, scanslot, isReplication);
|
|
break;
|
|
case StatisticMultiHistogram:
|
|
ReceiveHistogramMultiColStats(relid, scanslot, isReplication);
|
|
break;
|
|
case StatisticPartitionPageAndTuple:
|
|
ReceivePartitionPageAndTuple(relid, scanslot);
|
|
break;
|
|
default:
|
|
return;
|
|
}
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --dn_conn_count)
|
|
pgxc_handles->datanode_handles[i] = pgxc_handles->datanode_handles[dn_conn_count];
|
|
} else if (res == RESPONSE_EOF) {
|
|
if (pgxc_node_receive(1, &pgxc_handles->datanode_handles[i], NULL))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to read response from Datanode %u when ending query",
|
|
(pgxc_handles->datanode_handles[i])->nodeoid)));
|
|
} else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from Datanode %u", pgxc_connections[i]->nodeoid)));
|
|
}
|
|
}
|
|
|
|
pgxc_node_report_error(remotestate);
|
|
}
|
|
|
|
void FetchGlobalStatistics(VacuumStmt* stmt, Oid relid, RangeVar* parentRel, bool isReplication)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* @global stats
|
|
 * Get stadndistinct from DN1 if the local node is a CN that received the analyze command from the client.
 * Fetch global statistics when another CN has been told to fetch this local table's statistics.
 * @in parentRel: the parent relation's stmt for a delta table; NULL for a non-delta table
|
|
*/
|
|
static void FetchGlobalRelationStatistics(VacuumStmt* stmt, Oid relid, RangeVar* parentRel, bool isReplication)
|
|
{
|
|
Relation rel = NULL;
|
|
Oid indid;
|
|
List* indexList = NIL;
|
|
ListCell* indexId = NULL;
|
|
char* indname = NULL;
|
|
bool shouldfree = false;
|
|
|
|
/* get schemaname and relation name */
|
|
char* schemaname = NULL;
|
|
char* relname = NULL;
|
|
|
|
if (!OidIsValid(relid)) {
|
|
Assert(stmt->relation != NULL);
|
|
relname = stmt->relation->relname;
|
|
if (stmt->relation->schemaname) {
|
|
Oid namespaceId;
|
|
|
|
schemaname = stmt->relation->schemaname;
|
|
namespaceId = LookupNamespaceNoError(schemaname);
|
|
relid = get_relname_relid(relname, namespaceId);
|
|
} else {
|
|
relid = RelnameGetRelid(stmt->relation->relname);
|
|
}
|
|
}
|
|
|
|
rel = relation_open(relid, ShareUpdateExclusiveLock);
|
|
|
|
if (IsSystemRelation(rel) || !check_analyze_permission(relid) || RelationIsContquery(rel) ||
|
|
RelationIsView(rel) || RelationIsIndex(rel)) {
|
|
/*
|
|
* Don't fetch statistics if
|
|
 * 1) it is a system table, since system tables are analyzed locally
 * 2) the user does not have permission to analyze it
 * 3) it is a view, since views have no stats info
 * 4) it is a sequence, since sequences have no stats info
|
|
*/
|
|
relation_close(rel, ShareUpdateExclusiveLock);
|
|
return;
|
|
}
|
|
|
|
if (relname == NULL) {
|
|
relname = RelationGetRelationName(rel);
|
|
}
|
|
|
|
if (schemaname == NULL) {
|
|
if (stmt->relation != NULL && stmt->relation->schemaname != NULL) {
|
|
schemaname = stmt->relation->schemaname;
|
|
} else {
|
|
schemaname = get_namespace_name(RelationGetNamespace(rel));
|
|
if (schemaname == NULL) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CACHE_LOOKUP_FAILED),
|
|
errmsg("cache lookup failed for namespace %u", RelationGetNamespace(rel))));
|
|
}
|
|
shouldfree = true;
|
|
}
|
|
}
|
|
|
|
List *va_cols = NIL, *va_cols_multi = NIL;
|
|
es_split_multi_column_stats(stmt->va_cols, &va_cols, &va_cols_multi);
|
|
|
|
#define FETCH_STATS_REPLICATION(relname, single_cols, multi_cols, has_cols) \
|
|
do { \
|
|
FetchStatisticsInternal(schemaname, relname, NIL, StatisticPageAndTuple, parentRel, stmt, true); \
|
|
if (!(has_cols) || (single_cols) != NIL) \
|
|
FetchStatisticsInternal(schemaname, relname, single_cols, StatisticHistogram, parentRel, stmt, true); \
|
|
if (!(has_cols) || (multi_cols) != NIL) \
|
|
FetchStatisticsInternal(schemaname, relname, multi_cols, StatisticMultiHistogram, parentRel, stmt, true); \
|
|
} while (0)
|
|
|
|
#define FETCH_GLOBAL_STATS(relname, single_cols, multi_cols, has_cols) \
|
|
do { \
|
|
FetchGlobalStatisticsInternal(schemaname, relname, NIL, StatisticPageAndTuple, parentRel, stmt); \
|
|
if (!(has_cols) || (single_cols) != NIL) \
|
|
FetchGlobalStatisticsInternal(schemaname, relname, single_cols, StatisticHistogram, parentRel, stmt); \
|
|
if (!(has_cols) || (multi_cols) != NIL) \
|
|
FetchGlobalStatisticsInternal(schemaname, relname, multi_cols, StatisticMultiHistogram, parentRel, stmt); \
|
|
} while (0)
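/*
 * Both macros fetch the pg_class stats unconditionally; the single-column and
 * multi-column histogram fetches are issued either when the user listed no
 * columns at all (has_cols is false) or when the corresponding column list is
 * non-empty, so an explicit column list only pulls the histogram kinds it needs.
 */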
|
|
|
|
/* get info from DN1 for global stats. */
|
|
if (!IsConnFromCoord()) {
|
|
if (!isReplication) {
|
|
FetchGlobalStatisticsInternal(schemaname, relname, stmt->va_cols, StatisticPageAndTuple, parentRel, stmt);
|
|
|
|
/*
|
|
 * We don't need to get dndistinct for a foreign table, because the scheduler
 * assigns a different file list to each datanode, so dndistinct is meaningless there.
 * dndistinct should instead be estimated with the Poisson model.
|
|
*/
|
|
if (!stmt->isForeignTables) {
|
|
if (stmt->va_cols == NIL || va_cols != NIL)
|
|
FetchGlobalStatisticsInternal(schemaname, relname, va_cols, StatisticHistogram, parentRel, stmt);
|
|
if (stmt->va_cols == NIL || va_cols_multi != NIL)
|
|
FetchGlobalStatisticsInternal(
|
|
schemaname, relname, va_cols_multi, StatisticMultiHistogram, parentRel, stmt);
|
|
}
|
|
} else /* for a replication table, get stats from DN1; stmt->totalRowCnts was set to 0 before this point. */
|
|
{
|
|
FETCH_STATS_REPLICATION(relname, va_cols, va_cols_multi, (stmt->va_cols != NIL));
|
|
|
|
indexList = RelationGetIndexList(rel);
|
|
foreach (indexId, indexList) {
|
|
indid = lfirst_oid(indexId);
|
|
indname = GetIndexNameForStat(indid, relname);
|
|
if (indname == NULL) {
|
|
continue;
|
|
}
|
|
FETCH_STATS_REPLICATION(indname, va_cols, va_cols_multi, (stmt->va_cols != NIL));
|
|
pfree_ext(indname);
|
|
}
|
|
}
|
|
|
|
if (RelationIsPartitioned(rel))
|
|
FetchStatisticsInternal(
|
|
schemaname, relname, stmt->va_cols, StatisticPartitionPageAndTuple, parentRel, stmt, isReplication);
|
|
} else /* other CNs get stats */
|
|
{
|
|
stmt->tableidx = ANALYZENORMAL;
|
|
stmt->pstGlobalStatEx[stmt->tableidx].eAnalyzeMode = ANALYZENORMAL;
|
|
|
|
/* for global stats, other CNs get stats from the CN that did the analyze. */
|
|
FETCH_GLOBAL_STATS(relname, va_cols, va_cols_multi, (stmt->va_cols != NIL));
|
|
|
|
if (RelationIsPartitioned(rel))
|
|
FetchGlobalStatisticsInternal(
|
|
schemaname, relname, stmt->va_cols, StatisticPartitionPageAndTuple, parentRel, stmt);
|
|
|
|
indexList = RelationGetIndexList(rel);
|
|
foreach (indexId, indexList) {
|
|
indid = lfirst_oid(indexId);
|
|
indname = GetIndexNameForStat(indid, relname);
|
|
if (indname == NULL) {
|
|
continue;
|
|
}
|
|
FETCH_GLOBAL_STATS(indname, va_cols, va_cols_multi, (stmt->va_cols != NIL));
|
|
pfree_ext(indname);
|
|
}
|
|
}
|
|
|
|
if (shouldfree) {
|
|
pfree_ext(schemaname);
|
|
}
|
|
list_free_ext(va_cols);
|
|
list_free_ext(va_cols_multi);
|
|
|
|
relation_close(rel, NoLock);
|
|
}
|
|
|
|
void FetchGlobalPgfdwStatistics(VacuumStmt* stmt, bool has_var, PGFDWTableAnalyze* info)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* @global stats
|
|
 * @Description: send the query string to the target CN or DN to get Page/Tuple/Histogram
|
|
* @in conn_count - count of target connection
|
|
* @in pgxc_connections - target connection information
|
|
* @in pgxc_handles - all the handles involved in a transaction
|
|
* @in gxid - global transaction id
|
|
* @in snapshot - snapshot for transaction
|
|
* @in query - query string for get Page/Tuple/Histogram
|
|
* @in exec_type - execute on datanodes or coordinator
|
|
* @return: int - 0: success; -1: fail
|
|
*
|
|
 * Build connections with the execution nodes (datanodes or coordinator) by sending BEGIN and the query string.
|
|
*/
|
|
static int SendQueryToExecNode(int conn_count, PGXCNodeHandle** pgxc_connections, PGXCNodeAllHandles* pgxc_handles,
|
|
GlobalTransactionId gxid, Snapshot snapshot, char* query, RemoteQueryExecType exec_type)
|
|
{
|
|
if (pgxc_node_begin(conn_count, pgxc_connections, gxid, false, false, PGXC_NODE_DATANODE)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Could not begin transaction on %s",
|
|
(exec_type == EXEC_ON_DATANODES) ? "Datanodes" : "Coordinator")));
|
|
}
|
|
|
|
for (int i = 0; i < conn_count; i++) {
|
|
if (pgxc_connections[i]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(pgxc_connections[i]);
|
|
|
|
if (snapshot && pgxc_node_send_snapshot(pgxc_connections[i], snapshot)) {
|
|
if (i < --conn_count) {
|
|
pgxc_connections[i] = pgxc_connections[conn_count];
|
|
i--;
|
|
}
|
|
continue;
|
|
}
|
|
if (pgxc_node_send_queryid(pgxc_connections[i], u_sess->debug_query_id) != 0) {
|
|
add_error_message(pgxc_connections[i], "%s", "Can not send query ID");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree_ext(query);
|
|
return -1;
|
|
}
|
|
if (pgxc_node_send_query(pgxc_connections[i], query) != 0) {
|
|
add_error_message(pgxc_connections[i], "%s", "Can not send request");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
pfree_ext(query);
|
|
return -1;
|
|
}
|
|
}
|
|
pfree_ext(query);
|
|
query = NULL;
|
|
|
|
if (conn_count == 0) {
|
|
return -1;
|
|
}
|
|
|
|
if (pgxc_node_receive(conn_count, pgxc_connections, NULL)) {
|
|
int error_code;
|
|
char* error_msg = getSocketError(&error_code);
|
|
|
|
pfree_ext(pgxc_connections);
|
|
|
|
ereport(ERROR, (errcode(error_code), errmsg("Failed to read response from Datanodes. Detail: %s\n", error_msg)));
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* @global stats
|
|
* @Description: other CNs will get statistic info(Page/Tuple/Histogram) from the original CN which
|
|
 * received the analyze command.
|
|
* @in cn_conn_count - count of target coordinator connection
|
|
* @in pgxc_connections - target coordinator connection information
|
|
* @in remotestate - state of remote query node
|
|
* @in kind - identify which statistic info we should get, relpage/reltuple/histogram
|
|
* @in relid - relation oid which table we should get statistic
|
|
* @return: void
|
|
*
|
|
*/
|
|
static void FetchGlobalStatisticsFromCN(int cn_conn_count, PGXCNodeHandle** pgxc_connections,
|
|
RemoteQueryState* remotestate, StatisticKind kind, VacuumStmt* stmt, Oid relid, PGFDWTableAnalyze* info)
|
|
{
|
|
int i;
|
|
TupleTableSlot* scanslot = NULL;
|
|
|
|
while (cn_conn_count > 0) {
|
|
i = 0;
|
|
|
|
if (pgxc_node_receive(cn_conn_count, pgxc_connections, NULL))
|
|
break;
|
|
|
|
while (i < cn_conn_count) {
|
|
/* read messages */
|
|
int res = handle_response(pgxc_connections[i], remotestate);
|
|
|
|
if (res == RESPONSE_TUPDESC) {
|
|
/*
|
|
* Now tuple table slot is responsible for freeing the descriptor
|
|
*/
|
|
if (scanslot == NULL)
|
|
scanslot = MakeSingleTupleTableSlot(remotestate->tuple_desc);
|
|
else
|
|
ExecSetSlotDescriptor(scanslot, remotestate->tuple_desc);
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
/*
|
|
* We already have a tuple and received another one.
|
|
*/
|
|
FetchTuple(remotestate, scanslot);
|
|
tableam_tslot_getallattrs(scanslot);
|
|
/* We already get statistic info(Page/Tuple/Histogram) and update in local. */
|
|
switch (kind) {
|
|
case StatisticPageAndTuple:
|
|
ReceivePageAndTuple(relid, scanslot, stmt);
|
|
break;
|
|
case StatisticHistogram:
|
|
ReceiveHistogram(relid, scanslot, false, info);
|
|
break;
|
|
case StatisticMultiHistogram:
|
|
ReceiveHistogramMultiColStats(relid, scanslot, false, info);
|
|
break;
|
|
case StatisticPartitionPageAndTuple:
|
|
ReceivePartitionPageAndTuple(relid, scanslot);
|
|
break;
|
|
default:
|
|
return;
|
|
}
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --cn_conn_count)
|
|
pgxc_connections[i] = pgxc_connections[cn_conn_count];
|
|
} else if (res == RESPONSE_EOF) {
|
|
if (pgxc_node_receive(1, &pgxc_connections[i], NULL))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to read response from CN %u when ending query",
|
|
(pgxc_connections[i])->nodeoid)));
|
|
} else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from CN %u", pgxc_connections[i]->nodeoid)));
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* @global stats
|
|
* @Description: Fetch relpage and reltuple from pg_class in DN1 using to estimate global relpages.
|
|
* Fetch stadndistinct from pg_statistic in DN1 and update local.
|
|
* @in dn_conn_count - count of target datanodes connection
|
|
* @in pgxc_connections - target datanodes connection information
|
|
* @in remotestate - state of remote query node
|
|
* @in kind - identify which statistic info we should get, relpage/reltuple/histogram
|
|
* @in stmt - analyze or vacuum statement, we will use totalRowCnts to estimate global relpages
|
|
* @in relid - relation oid which table we should get statistic
|
|
* @return: void
|
|
*
|
|
*/
|
|
static void FetchGlobalStatisticsFromDN(int dn_conn_count, PGXCNodeHandle** pgxc_connections,
|
|
RemoteQueryState* remotestate, StatisticKind kind, VacuumStmt* stmt, Oid relid)
|
|
{
|
|
int i, colno;
|
|
TupleTableSlot* scanslot = NULL;
|
|
bool* bRecvedAttnum = (bool*)palloc0(stmt->pstGlobalStatEx[stmt->tableidx].attnum * sizeof(bool));
|
|
|
|
while (dn_conn_count > 0) {
|
|
i = 0;
|
|
if (pgxc_node_receive(dn_conn_count, pgxc_connections, NULL))
|
|
break;
|
|
|
|
while (i < dn_conn_count) {
|
|
/* read messages */
|
|
int res = handle_response(pgxc_connections[i], remotestate);
|
|
|
|
if (res == RESPONSE_TUPDESC) {
|
|
/*
|
|
* Now tuple table slot is responsible for freeing the descriptor
|
|
*/
|
|
if (scanslot == NULL)
|
|
scanslot = MakeSingleTupleTableSlot(remotestate->tuple_desc);
|
|
else
|
|
ExecSetSlotDescriptor(scanslot, remotestate->tuple_desc);
|
|
} else if (res == RESPONSE_DATAROW) {
|
|
/*
|
|
* We already have a tuple and received another one.
|
|
*/
|
|
FetchTuple(remotestate, scanslot);
|
|
tableam_tslot_getallattrs(scanslot);
|
|
switch (kind) {
|
|
case StatisticPageAndTuple: {
|
|
double reltuples = 0;
|
|
double relpages = 0;
|
|
/* Received relpage and reltuple from pg_class in DN1 */
|
|
RelPageType dn_pages = (RelPageType)DatumGetFloat8(scanslot->tts_values[0]);
|
|
double dn_tuples = (double)DatumGetFloat8(scanslot->tts_values[1]);
|
|
stmt->pstGlobalStatEx[stmt->tableidx].dn1totalRowCnts = dn_tuples;
|
|
reltuples = stmt->pstGlobalStatEx[stmt->tableidx].totalRowCnts;
|
|
|
|
if (0 < dn_pages && 0 < dn_tuples) {
|
|
/* Estimate global relpages. */
|
|
relpages = ceil(stmt->pstGlobalStatEx[stmt->tableidx].totalRowCnts / dn_tuples * dn_pages);
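/*
 * Worked example (illustrative numbers): if DN1 reports 1,000 pages and
 * 100,000 tuples while the cluster-wide estimate is 400,000 tuples, the
 * global estimate becomes ceil(400,000 / 100,000 * 1,000) = 4,000 pages.
 */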
|
|
} else {
|
|
elog(LOG,
|
|
"fetch stats info from first datanode: reltuples = %lf relpages = %lf",
|
|
dn_tuples,
|
|
dn_pages);
|
|
if (dn_pages > 0) {
|
|
relpages = dn_pages * *t_thrd.pgxc_cxt.shmemNumDataNodes;
|
|
} else if (reltuples > 0) {
|
|
double est_cu_num = 0;
|
|
double est_cu_pages = 0;
|
|
Relation rel = heap_open(relid, NoLock);
|
|
int32 relwidth = get_relation_data_width(relid, InvalidOid, NULL);
|
|
|
|
if (RelationIsColStore(rel)) {
|
|
/* refer to estimate_cstore_blocks */
|
|
est_cu_num = ceil(reltuples / RelDefaultFullCuSize);
|
|
est_cu_pages = (double)(RelDefaultFullCuSize * relwidth / BLCKSZ);
|
|
relpages = ceil(est_cu_num * est_cu_pages);
|
|
} else if (RelationIsPAXFormat(rel)) {
|
|
/* refer to estimate_cstore_blocks */
|
|
est_cu_num = (reltuples * relwidth) / BLCKSZ;
|
|
est_cu_pages = ESTIMATE_BLOCK_FACTOR;
|
|
relpages = ceil(est_cu_num * est_cu_pages);
|
|
} else {
|
|
/* Estimate global pages as normal. */
|
|
relpages = ceil(reltuples * relwidth / BLCKSZ);
|
|
}
|
|
heap_close(rel, NoLock);
|
|
}
|
|
}
|
|
BlockNumber relallvisible = (BlockNumber)DatumGetInt32(scanslot->tts_values[2]);
|
|
relallvisible = (BlockNumber)floor(stmt->DnCnt * relallvisible + 0.5);
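/*
 * Illustrative scaling: with DnCnt = 4 and 120 all-visible blocks reported by
 * DN1, the global estimate becomes 480; the check below then clamps it to
 * relpages if it overshoots.
 */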
|
|
if (relallvisible > relpages) {
|
|
relallvisible = (BlockNumber)relpages;
|
|
}
|
|
scanslot->tts_values[0] = Float8GetDatum(relpages);
|
|
scanslot->tts_values[1] = Float8GetDatum(reltuples);
|
|
scanslot->tts_values[2] = UInt32GetDatum(relallvisible);
|
|
/* Update pg_class in local. */
|
|
ReceivePageAndTuple(relid, scanslot, stmt);
|
|
break;
|
|
}
|
|
case StatisticHistogram: {
|
|
colno = scanslot->tts_values[Anum_pg_statistic_staattnum - 1];
|
|
|
|
/*
|
|
 * If the attnum received from pg_statistic on DN1 is greater than the attribute count
 * of the table being analyzed, a column was added or dropped concurrently while
 * analyzing. In that case we must realloc the dndistinct and bRecvedAttnum arrays.
|
|
*/
|
|
if (colno > stmt->pstGlobalStatEx[stmt->tableidx].attnum) {
|
|
bRecvedAttnum = (bool*)repalloc(bRecvedAttnum, colno * sizeof(bool));
|
|
bRecvedAttnum[colno - 1] = false;
|
|
stmt->pstGlobalStatEx[stmt->tableidx].dndistinct = (double*)repalloc(
|
|
stmt->pstGlobalStatEx[stmt->tableidx].dndistinct, colno * sizeof(double));
|
|
stmt->pstGlobalStatEx[stmt->tableidx].correlations = (double*)repalloc(
|
|
stmt->pstGlobalStatEx[stmt->tableidx].correlations, colno * sizeof(double));
|
|
stmt->pstGlobalStatEx[stmt->tableidx].dndistinct[colno - 1] = 1;
|
|
stmt->pstGlobalStatEx[stmt->tableidx].correlations[colno - 1] = 1;
|
|
stmt->pstGlobalStatEx[stmt->tableidx].attnum = colno;
|
|
}
|
|
|
|
if (!bRecvedAttnum[colno - 1]) {
|
|
if (!scanslot->tts_isnull[Anum_pg_statistic_stadistinct - 1]) {
|
|
stmt->pstGlobalStatEx[stmt->tableidx].dndistinct[colno - 1] =
|
|
DatumGetFloat4(scanslot->tts_values[Anum_pg_statistic_stadistinct - 1]);
|
|
if (stmt->pstGlobalStatEx[stmt->tableidx].dndistinct[colno - 1] == 0) {
|
|
stmt->pstGlobalStatEx[stmt->tableidx].dndistinct[colno - 1] = 1;
|
|
elog(LOG, "the column[%d] has no distinct on DN1", colno);
|
|
}
|
|
}
|
|
|
|
/*
|
|
 * Get the correlation of each column from DN1, because the correlation value
 * cannot be computed accurately when analyzing with a sample table.
|
|
*/
|
|
if (stmt->sampleTableRequired) {
|
|
for (int stakindno = 0; stakindno < STATISTIC_NUM_SLOTS; stakindno++) {
|
|
int j = Anum_pg_statistic_stakind1 - 1 + stakindno;
|
|
int k = Anum_pg_statistic_stanumbers1 - 1 + stakindno;
|
|
|
|
if (!scanslot->tts_isnull[j] &&
|
|
STATISTIC_KIND_CORRELATION == DatumGetInt16(scanslot->tts_values[j])) {
|
|
Oid foutoid;
|
|
bool typisvarlena = false;
|
|
char *corrs = NULL, *tmp = NULL;
|
|
getTypeOutputInfo(
|
|
scanslot->tts_tupleDescriptor->attrs[k].atttypid, &foutoid, &typisvarlena);
|
|
corrs = OidOutputFunctionCall(foutoid, scanslot->tts_values[k]);
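/*
 * corrs now holds the array's text form, e.g. "{0.9876}" (illustrative value);
 * the loop below strips the surrounding braces so that atof() sees only the
 * leading number.
 */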
|
|
while (corrs != NULL) {
|
|
if (*corrs == '{')
|
|
tmp = corrs + 1;
|
|
else if (*corrs == '}') {
|
|
*corrs = '\0';
|
|
break;
|
|
}
|
|
|
|
corrs++;
|
|
}
|
|
|
|
stmt->pstGlobalStatEx[stmt->tableidx].correlations[colno - 1] = atof(tmp);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
bRecvedAttnum[colno - 1] = true;
|
|
}
|
|
|
|
break;
|
|
}
|
|
case StatisticMultiHistogram:
|
|
break;
|
|
case StatisticPartitionPageAndTuple:
|
|
ReceivePartitionPageAndTuple(relid, scanslot);
|
|
/* fall through */
|
|
default:
|
|
return;
|
|
}
|
|
} else if (res == RESPONSE_COMPLETE) {
|
|
if (i < --dn_conn_count)
|
|
pgxc_connections[i] = pgxc_connections[dn_conn_count];
|
|
} else if (res == RESPONSE_EOF) {
|
|
/* incomplete message, read more */
|
|
if (pgxc_node_receive(1, &pgxc_connections[i], NULL))
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Failed to read response from DN %u when ending query",
|
|
(pgxc_connections[i])->nodeoid)));
|
|
} else
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CONNECTION_EXCEPTION),
|
|
errmsg("Unexpected response from DN %u", pgxc_connections[i]->nodeoid)));
|
|
}
|
|
}
|
|
|
|
pfree_ext(bRecvedAttnum);
|
|
bRecvedAttnum = NULL;
|
|
}
|
|
|
|
typedef struct {
|
|
Oid nodeoid;
|
|
StringInfo explain;
|
|
} DnExplain;
|
|
|
|
/*
|
|
* check whether prepared statement is ready in DN exec node.
|
|
*/
|
|
bool CheckPrepared(RemoteQuery* rq, Oid nodeoid)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return false;
|
|
}
|
|
|
|
void FindExecNodesInPBE(RemoteQueryState* planstate, ExecNodes* exec_nodes, RemoteQueryExecType exec_type)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return;
|
|
}
|
|
|
|
/**
|
|
* @Description: send query string to DN.
|
|
* @in pgxc_connections - target connection information
|
|
* @in pgxc_handles - all the handles involved in a transaction
|
|
* @in query - explain/explain verbose query string
|
|
* @return: int - 0: success; -1: fail
|
|
*
|
|
 * Build connections with the execution datanodes by sending the query string.
|
|
*/
|
|
static int SendQueryToExecDN(PGXCNodeHandle** pgxc_connections, PGXCNodeAllHandles* pgxc_handles, const char* query)
|
|
{
|
|
GlobalTransactionId gxid = GetCurrentTransactionId();
|
|
Snapshot snapshot = GetActiveSnapshot();
|
|
|
|
/* begin transaction for all datanodes */
|
|
if (pgxc_node_begin(1, pgxc_connections, gxid, false, false, PGXC_NODE_DATANODE)) {
|
|
ereport(ERROR, (errcode(ERRCODE_CONNECTION_EXCEPTION), errmsg("Could not begin transaction on Datanode.")));
|
|
}
|
|
|
|
if (pgxc_connections[0]->state == DN_CONNECTION_STATE_QUERY)
|
|
BufferConnection(pgxc_connections[0]);
|
|
|
|
if (snapshot && pgxc_node_send_snapshot(pgxc_connections[0], snapshot)) {
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Send queryid and query, but should not free query,
|
|
 * because the query needs to be used multiple times.
|
|
*/
|
|
if (pgxc_node_send_queryid(pgxc_connections[0], u_sess->debug_query_id) != 0) {
|
|
add_error_message(pgxc_connections[0], "%s", "Can not send query ID");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
return -1;
|
|
}
|
|
if (pgxc_node_send_query(pgxc_connections[0], query) != 0) {
|
|
add_error_message(pgxc_connections[0], "%s", "Can not send request");
|
|
pfree_pgxc_all_handles(pgxc_handles);
|
|
return -1;
|
|
}
|
|
|
|
if (pgxc_node_receive(1, pgxc_connections, NULL)) {
|
|
int error_code;
|
|
char* error_msg = getSocketError(&error_code);
|
|
|
|
pfree_ext(pgxc_connections);
|
|
|
|
ereport(ERROR, (errcode(error_code), errmsg("Failed to read response from Datanodes. Detail: %s\n", error_msg)));
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
StringInfo* SendExplainToDNs(ExplainState*, RemoteQuery*, int*, const char*)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* @global stats
|
|
 * Get stadndistinct from DN1 if the local node is a CN that received the analyze command from the client.
 * Fetch global statistics when another CN has been told to fetch this local table's statistics.
 * @in parentRel: the parent relation's stmt for a delta table; NULL for a non-delta table
|
|
*/
|
|
static void FetchGlobalStatisticsInternal(const char* schemaname, const char* relname, List* va_cols,
|
|
StatisticKind kind, RangeVar* parentRel, VacuumStmt* stmt)
|
|
{
|
|
List* nodeList = NIL;
|
|
int co_conn_count = 0, dn_conn_count = 0;
|
|
PGXCNodeAllHandles* pgxc_handles = NULL;
|
|
PGXCNodeHandle** pgxc_connections = NULL;
|
|
RemoteQueryState* remotestate = NULL;
|
|
RemoteQueryExecType exec_type;
|
|
char* query_string = NULL;
|
|
GlobalTransactionId gxid = GetCurrentTransactionId();
|
|
Snapshot snapshot = GetActiveSnapshot();
|
|
Oid namespaceId = LookupNamespaceNoError(schemaname);
|
|
Oid relid = get_relname_relid(relname, namespaceId);
|
|
if (!OidIsValid(relid)) {
|
|
ereport(ERROR, (errmsg("%s.%s not found while fetching global statistics for analyze.", schemaname, relname)));
|
|
return;
|
|
}
|
|
|
|
/* Construct query string for fetch statistics from system table of data nodes. */
|
|
query_string = construct_fetch_statistics_query(schemaname, relname, va_cols, kind, stmt, relid, parentRel);
|
|
if (query_string == NULL) {
|
|
return;
|
|
}
|
|
|
|
if (IS_PGXC_COORDINATOR && !IsConnFromCoord()) {
|
|
/*
|
|
 * for an ordinary (non-HDFS foreign) table: fetch the pg_class and pg_statistic stats from DN1;
 * otherwise, fetch the stats from the DN determined by the scheduler.
|
|
*/
|
|
ExecNodes* exec_nodes = getRelationExecNodes(relid);
|
|
nodeList = stmt->isForeignTables ? lappend_int(nodeList, stmt->nodeNo)
|
|
: lappend_int(nodeList, linitial_int(exec_nodes->nodeList));
|
|
pgxc_handles = get_handles(nodeList, NULL, false);
|
|
pgxc_connections = pgxc_handles->datanode_handles;
|
|
dn_conn_count = pgxc_handles->dn_conn_count;
|
|
exec_type = EXEC_ON_DATANODES;
|
|
} else {
|
|
/*
|
|
* for foreign tables: fetch stats from nodeNo Data Node
|
|
 * global stats: other coordinators will get statistics from the coordinator node identified by orgCnNodeNo.
|
|
*/
|
|
Assert(stmt->orgCnNodeNo >= 0);
|
|
|
|
nodeList = lappend_int(nodeList, stmt->orgCnNodeNo);
|
|
pgxc_handles = get_handles(NULL, nodeList, true);
|
|
pgxc_connections = pgxc_handles->coord_handles;
|
|
co_conn_count = pgxc_handles->co_conn_count;
|
|
exec_type = EXEC_ON_COORDS;
|
|
|
|
if (strcmp(pgxc_connections[0]->remoteNodeName, g_instance.attr.attr_common.PGXCNodeName) == 0) {
|
|
ereport(ERROR, (errmsg("Fetching statistics from the local node is unexpected. Maybe a switchover happened.")));
|
|
return;
|
|
}
|
|
}
|
|
if (pgxc_connections == NULL)
|
|
return;
|
|
|
|
/* Send query string to target CN or DN for get Page/Tuple/Histogram. */
|
|
int ret = SendQueryToExecNode((exec_type == EXEC_ON_DATANODES) ? dn_conn_count : co_conn_count,
|
|
pgxc_connections,
|
|
pgxc_handles,
|
|
gxid,
|
|
snapshot,
|
|
query_string,
|
|
exec_type);
|
|
/* It should return if encounter some error. */
|
|
if (ret != 0) {
|
|
return;
|
|
}
|
|
|
|
remotestate = CreateResponseCombiner(0, COMBINE_TYPE_SAME);
|
|
remotestate->request_type = REQUEST_TYPE_QUERY;
|
|
|
|
if (exec_type == EXEC_ON_DATANODES) {
|
|
/*
|
|
* Fetch relpage and reltuple from pg_class in DN1 using to estimate global relpages.
|
|
* Fetch stadndistinct from pg_statistic in DN1.
|
|
*/
|
|
FetchGlobalStatisticsFromDN(dn_conn_count, pgxc_connections, remotestate, kind, stmt, relid);
|
|
}
|
|
|
|
else /* Do the same for Coordinators */
|
|
{
|
|
/* for global stats, other CNs will get statistic info from the original CN that did the analyze. */
|
|
FetchGlobalStatisticsFromCN(co_conn_count, pgxc_connections, remotestate, kind, stmt, relid, NULL);
|
|
}
|
|
|
|
pgxc_node_report_error(remotestate);
|
|
}
|
|
|
|
/*
|
|
* @cooperation analysis
|
|
 * get the real attnum
|
|
*/
|
|
bool PgfdwGetRelAttnum(int2vector* keys, PGFDWTableAnalyze* info)
|
|
{
|
|
int attnum = 0;
|
|
char** att_name = (char**)palloc0(keys->dim1 * sizeof(char*));
|
|
|
|
for (int i = 0; i < info->natts; i++) {
|
|
for (int j = 0; j < keys->dim1; j++) {
|
|
if (info->attnum[i] == keys->values[j]) {
|
|
att_name[attnum] = info->attname[i];
|
|
attnum++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (keys->dim1 != attnum) {
|
|
return false;
|
|
}
|
|
|
|
Relation rel = relation_open(info->relid, AccessShareLock);
|
|
TupleDesc tupdesc = RelationGetDescr(rel);
|
|
char* tup_attname = NULL;
|
|
int* real_attnum = (int*)palloc0(attnum * sizeof(int));
|
|
int total = 0;
|
|
|
|
for (int i = 0; i < tupdesc->natts; i++) {
|
|
for (int j = 0; j < attnum; j++) {
|
|
tup_attname = tupdesc->attrs[i].attname.data;
|
|
if (tup_attname && strcmp(tup_attname, att_name[j]) == 0) {
|
|
real_attnum[total] = tupdesc->attrs[i].attnum;
|
|
total++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
relation_close(rel, AccessShareLock);
|
|
|
|
if (total != attnum) {
|
|
return false;
|
|
}
|
|
|
|
for (int i = 0; i < keys->dim1; i++) {
|
|
keys->values[i] = (int2)real_attnum[i];
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* @cooperation analysis
|
|
 * get the real attnum
|
|
*/
|
|
bool PgfdwGetRelAttnum(TupleTableSlot* slot, PGFDWTableAnalyze* info)
|
|
{
|
|
int attnum = slot->tts_values[Anum_pg_statistic_staattnum - 1];
|
|
char* tup_attname = NULL;
|
|
char* att_name = NULL;
|
|
int real_attnum = -1;
|
|
|
|
for (int i = 0; i < info->natts; i++) {
|
|
if (info->attnum[i] == attnum) {
|
|
att_name = info->attname[i];
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (att_name == NULL) {
|
|
return false;
|
|
}
|
|
|
|
Relation rel = relation_open(info->relid, AccessShareLock);
|
|
TupleDesc tupdesc = RelationGetDescr(rel);
|
|
|
|
for (int i = 0; i < tupdesc->natts; i++) {
|
|
tup_attname = tupdesc->attrs[i].attname.data;
|
|
if (tup_attname && strcmp(tup_attname, att_name) == 0) {
|
|
real_attnum = tupdesc->attrs[i].attnum;
|
|
break;
|
|
}
|
|
}
|
|
|
|
relation_close(rel, AccessShareLock);
|
|
|
|
if (real_attnum == -1) {
|
|
return false;
|
|
}
|
|
|
|
slot->tts_values[Anum_pg_statistic_staattnum - 1] = real_attnum;
|
|
|
|
return true;
|
|
}
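/*
 * Illustrative example for the two PgfdwGetRelAttnum() variants above (hypothetical
 * numbers): if the remote side reports statistics for attnum 2 whose name is "b",
 * and the local table stores column "b" at attnum 4, the received stakey/staattnum
 * value 2 is rewritten to 4 so the statistics land on the correct local column.
 */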
|
|
|
|
/* update pages, tuples, etc. in pg_class */
|
|
static void ReceivePageAndTuple(Oid relid, TupleTableSlot* slot, VacuumStmt* stmt)
|
|
{
|
|
Relation rel;
|
|
Relation classRel;
|
|
RelPageType relpages;
|
|
double reltuples;
|
|
BlockNumber relallvisible;
|
|
bool hasindex = false;
|
|
|
|
relpages = (RelPageType)DatumGetFloat8(slot->tts_values[0]);
|
|
reltuples = (double)DatumGetFloat8(slot->tts_values[1]);
|
|
|
|
relallvisible = (BlockNumber)DatumGetInt32(slot->tts_values[2]);
|
|
hasindex = DatumGetBool(slot->tts_values[3]);
|
|
rel = relation_open(relid, ShareUpdateExclusiveLock);
|
|
classRel = heap_open(RelationRelationId, RowExclusiveLock);
|
|
|
|
vac_update_relstats(rel, classRel, relpages, reltuples, relallvisible,
|
|
hasindex, BootstrapTransactionId, InvalidMultiXactId);
|
|
|
|
/* Save the flag identify is there dirty data in the relation. */
|
|
if (stmt != NULL) {
|
|
stmt->pstGlobalStatEx[stmt->tableidx].totalRowCnts = reltuples;
|
|
}
|
|
|
|
/*
|
|
 * We do not fetch dead-tuple info from the remote DN/CN, so just set deadtuples to 0. It does
 * not matter, because we would have to fetch dead-tuple info from all datanodes to calculate a
 * user-defined table's dead-tuple count anyway.
|
|
*/
|
|
if (!IS_PGXC_COORDINATOR || IsConnFromCoord())
|
|
pgstat_report_analyze(rel, (PgStat_Counter)reltuples, (PgStat_Counter)0);
|
|
|
|
heap_close(classRel, NoLock);
|
|
relation_close(rel, NoLock);
|
|
}
|
|
|
|
/*
|
|
* Handle statistics fetched from datanode.
|
|
 * @in isReplication - true if the relation is a replication table.
|
|
*/
|
|
static void ReceiveHistogram(Oid relid, TupleTableSlot* slot, bool isReplication, PGFDWTableAnalyze* info)
|
|
{
|
|
MemoryContext oldcontext;
|
|
Relation sd;
|
|
Form_pg_attribute attForm;
|
|
HeapTuple attTup, stup, oldtup;
|
|
Oid atttypid;
|
|
int attnum, atttypmod, i, j, k;
|
|
bool replaces[Natts_pg_statistic];
|
|
|
|
for (i = 0; i < Natts_pg_statistic; i++) {
|
|
replaces[i] = true;
|
|
}
|
|
replaces[Anum_pg_statistic_starelid - 1] = false;
|
|
replaces[Anum_pg_statistic_staattnum - 1] = false;
|
|
|
|
/*
|
|
* Step : Get true type of attribute
|
|
*/
|
|
if (info != NULL) {
|
|
Assert(relid == info->relid);
|
|
|
|
if (Natts_pg_statistic != info->natts_pg_statistic) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROTOCOL_VIOLATION),
|
|
errmsg("cooperation analysis: please update to the same version")));
|
|
}
|
|
|
|
if (!PgfdwGetRelAttnum(slot, info)) {
|
|
return;
|
|
}
|
|
|
|
info->has_analyze = true;
|
|
}
|
|
attnum = slot->tts_values[Anum_pg_statistic_staattnum - 1];
|
|
|
|
attTup = SearchSysCache2(ATTNUM, ObjectIdGetDatum(relid), Int32GetDatum(attnum));
|
|
if (!HeapTupleIsValid(attTup)) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_CACHE_LOOKUP_FAILED),
|
|
errmsg("cache lookup failed for attribute %d of relation %u", attnum, relid)));
|
|
}
|
|
|
|
attForm = (Form_pg_attribute)GETSTRUCT(attTup);
|
|
|
|
/*
|
|
* If a drop column operation happened between fetching sample rows and updating
|
|
 * pg_statistic, we don't need to process the dropped column's stats
 * update any further, so just skip it.
|
|
*/
|
|
if (attForm->attisdropped) {
|
|
elog(WARNING,
"relation:%s's attnum:%d was dropped during ANALYZE, so it is skipped.",
|
|
get_rel_name(attForm->attrelid),
|
|
attForm->attnum);
|
|
|
|
ReleaseSysCache(attTup);
|
|
return;
|
|
}
|
|
|
|
atttypid = attForm->atttypid;
|
|
atttypmod = attForm->atttypmod;
|
|
|
|
ReleaseSysCache(attTup);
|
|
|
|
/*
|
|
* Step : Reconstruct staValues
|
|
*/
|
|
oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
|
|
|
|
slot->tts_values[Anum_pg_statistic_starelid - 1] = relid;
|
|
if (slot->tts_attinmeta == NULL)
|
|
slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor);
|
|
|
|
/* if the relation is a replication table, for global stats we set stadndistinct to the same value as stadistinct from DN1. */
|
|
if (u_sess->attr.attr_sql.enable_global_stats && isReplication && IS_PGXC_COORDINATOR && !IsConnFromCoord()) {
|
|
/*
|
|
 * We should set stadndistinct to 1 if there is no global distinct,
 * because stadndistinct = 0 is interpreted as single (non-global) stats.
|
|
*/
|
|
if (slot->tts_values[Anum_pg_statistic_stadistinct - 1] == 0)
|
|
slot->tts_values[Anum_pg_statistic_stadndistinct - 1] = Float4GetDatum(1);
|
|
else
|
|
slot->tts_values[Anum_pg_statistic_stadndistinct - 1] = slot->tts_values[Anum_pg_statistic_stadistinct - 1];
|
|
}
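/*
 * The stavaluesN slots arrive from the remote node in their textual (anyarray)
 * form; the loop below rebuilds each of them as an array whose element type is
 * the analyzed attribute's own type (TEXT for MCELEM slots) by invoking the
 * column's input function with that element type and typmod.
 */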
|
|
|
|
for (i = 0; i < STATISTIC_NUM_SLOTS; i++) {
|
|
j = Anum_pg_statistic_stakind1 - 1 + i;
|
|
k = Anum_pg_statistic_stavalues1 - 1 + i;
|
|
|
|
if (!slot->tts_isnull[j]) {
|
|
int t = DatumGetInt16(slot->tts_values[j]);
|
|
|
|
if (0 != t && STATISTIC_KIND_HISTOGRAM != t) {
|
|
Assert(!slot->tts_isnull[Anum_pg_statistic_stanumbers1 - 1 + i]);
|
|
}
|
|
|
|
if (STATISTIC_KIND_MCV == t || STATISTIC_KIND_HISTOGRAM == t || STATISTIC_KIND_MCELEM == t) {
|
|
Assert(!slot->tts_isnull[k]);
|
|
|
|
/* When stakindN = STATISTIC_KIND_MCELEM, element type of staValuesN is text by now */
|
|
if (STATISTIC_KIND_MCELEM == t) {
|
|
atttypid = TEXTOID;
|
|
atttypmod = -1;
|
|
}
|
|
|
|
slot->tts_values[k] = FunctionCall3Coll(slot->tts_attinmeta->attinfuncs + k,
|
|
InvalidOid,
|
|
(Datum)slot->tts_values[k],
|
|
UInt32GetDatum(atttypid),
|
|
Int32GetDatum(atttypmod));
|
|
}
|
|
}
|
|
}
|
|
|
|
// Is there already a pg_statistic tuple for this attribute?
|
|
//
|
|
sd = heap_open(StatisticRelationId, RowExclusiveLock);
|
|
MemoryContext current_context = MemoryContextSwitchTo(oldcontext);
|
|
ResourceOwner asOwner, oldOwner1;
|
|
/*
|
|
* Create a resource owner to keep track of resources
|
|
* in order to release resources when catch the exception.
|
|
*/
|
|
asOwner = ResourceOwnerCreate(t_thrd.utils_cxt.CurrentResourceOwner, "update_stats",
|
|
THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_OPTIMIZER));
|
|
oldOwner1 = t_thrd.utils_cxt.CurrentResourceOwner;
|
|
t_thrd.utils_cxt.CurrentResourceOwner = asOwner;
|
|
|
|
PG_TRY();
|
|
{
|
|
oldtup = SearchSysCache4(STATRELKINDATTINH,
|
|
ObjectIdGetDatum(slot->tts_values[Anum_pg_statistic_starelid - 1]),
|
|
CharGetDatum(slot->tts_values[Anum_pg_statistic_starelkind - 1]),
|
|
Int16GetDatum(slot->tts_values[Anum_pg_statistic_staattnum - 1]),
|
|
BoolGetDatum(slot->tts_values[Anum_pg_statistic_stainherit - 1]));
|
|
|
|
if (HeapTupleIsValid(oldtup)) {
|
|
// Yes, replace it
|
|
//
|
|
stup = heap_modify_tuple(oldtup, RelationGetDescr(sd), slot->tts_values, slot->tts_isnull, replaces);
|
|
ReleaseSysCache(oldtup);
|
|
(void)simple_heap_update(sd, &stup->t_self, stup);
|
|
} else {
|
|
// No, insert new tuple
|
|
//
|
|
stup = heap_form_tuple(RelationGetDescr(sd), slot->tts_values, slot->tts_isnull);
|
|
simple_heap_insert(sd, stup);
|
|
}
|
|
|
|
// update indexes too
|
|
//
|
|
CatalogUpdateIndexes(sd, stup);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
analyze_concurrency_process(slot->tts_values[Anum_pg_statistic_starelid - 1],
|
|
slot->tts_values[Anum_pg_statistic_staattnum - 1],
|
|
current_context,
|
|
PG_FUNCNAME_MACRO);
|
|
}
|
|
PG_END_TRY();
|
|
|
|
/* Release everything */
|
|
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, false);
|
|
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_LOCKS, false, false);
|
|
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_AFTER_LOCKS, false, false);
|
|
t_thrd.utils_cxt.CurrentResourceOwner = oldOwner1;
|
|
ResourceOwnerDelete(asOwner);
|
|
heap_close(sd, RowExclusiveLock);
|
|
}
|
|
|
|
/*
|
|
* Handle Extended Statistics fetched from datanode.
|
|
 * @in isReplication - true if the relation is a replication table.
|
|
*/
|
|
static void ReceiveHistogramMultiColStats(Oid relid, TupleTableSlot* slot, bool isReplication, PGFDWTableAnalyze* info)
|
|
{
|
|
MemoryContext oldcontext;
|
|
HeapTuple stup, oldtup;
|
|
int i, j, k;
|
|
bool replaces[Natts_pg_statistic_ext];
|
|
|
|
for (i = 0; i < Natts_pg_statistic_ext; i++) {
|
|
replaces[i] = true;
|
|
}
|
|
replaces[Anum_pg_statistic_ext_starelid - 1] = false;
|
|
replaces[Anum_pg_statistic_ext_stakey - 1] = false;
|
|
|
|
/*
|
|
* Step : Get true type of attribute
|
|
*/
|
|
Oid atttypid;
|
|
int atttypmod;
|
|
Oid* atttypid_array = NULL;
|
|
int* atttypmod_array = NULL;
|
|
unsigned int num_column = 1;
|
|
|
|
/* Mark as anyarray */
|
|
atttypid = ANYARRAYOID;
|
|
atttypmod = -1;
|
|
|
|
int2vector* keys = (int2vector*)DatumGetPointer(slot->tts_values[Anum_pg_statistic_ext_stakey - 1]);
|
|
if (info != NULL) {
|
|
Assert(relid == info->relid);
|
|
|
|
if (Natts_pg_statistic_ext != info->natts_pg_statistic_ext) {
|
|
ereport(ERROR,
|
|
(errcode(ERRCODE_PROTOCOL_VIOLATION),
|
|
errmsg("cooperation analysis: please update to the same version")));
|
|
}
|
|
|
|
if (!PgfdwGetRelAttnum(keys, info)) {
|
|
return;
|
|
}
|
|
}
|
|
es_get_columns_typid_typmod(relid, keys, &atttypid_array, &atttypmod_array, &num_column);
|
|
|
|
/*
|
|
* Step : Reconstruct staValues
|
|
*/
|
|
oldcontext = MemoryContextSwitchTo(slot->tts_mcxt);
|
|
|
|
slot->tts_values[Anum_pg_statistic_ext_starelid - 1] = relid;
|
|
if (slot->tts_attinmeta == NULL)
|
|
slot->tts_attinmeta = TupleDescGetAttInMetadata(slot->tts_tupleDescriptor);
|
|
|
|
/* if the relation is a replication table, for global stats we set stadndistinct to the same value as stadistinct from DN1. */
|
|
if (u_sess->attr.attr_sql.enable_global_stats && isReplication && IS_PGXC_COORDINATOR && !IsConnFromCoord()) {
|
|
/*
|
|
 * We should set stadndistinct to 1 if there is no global distinct,
 * because stadndistinct = 0 is interpreted as single (non-global) stats.
|
|
*/
|
|
if (0 == slot->tts_values[Anum_pg_statistic_ext_stadistinct - 1])
|
|
slot->tts_values[Anum_pg_statistic_ext_stadndistinct - 1] = Float4GetDatum(1);
|
|
else
|
|
slot->tts_values[Anum_pg_statistic_ext_stadndistinct - 1] =
|
|
slot->tts_values[Anum_pg_statistic_ext_stadistinct - 1];
|
|
}
|
|
|
|
for (i = 0; i < STATISTIC_NUM_SLOTS; i++) {
|
|
j = Anum_pg_statistic_ext_stakind1 - 1 + i;
|
|
k = Anum_pg_statistic_ext_stavalues1 - 1 + i;
|
|
|
|
if (!slot->tts_isnull[j]) {
|
|
int t = DatumGetInt16(slot->tts_values[j]);
|
|
|
|
if (0 != t && STATISTIC_KIND_HISTOGRAM != t) {
|
|
Assert(!slot->tts_isnull[Anum_pg_statistic_ext_stanumbers1 - 1 + i]);
|
|
}
|
|
|
|
if (STATISTIC_KIND_MCV == t || STATISTIC_KIND_NULL_MCV == t || STATISTIC_KIND_HISTOGRAM == t ||
|
|
STATISTIC_KIND_MCELEM == t) {
|
|
Assert(!slot->tts_isnull[k]);
|
|
Oid atttypid_temp = atttypid;
|
|
int atttypmod_temp = atttypmod;
|
|
|
|
/* When stakindN = STATISTIC_KIND_MCELEM, element type of staValuesN is text by now */
|
|
if (STATISTIC_KIND_MCELEM == t) {
|
|
atttypid_temp = TEXTOID;
|
|
atttypmod_temp = -1;
|
|
}
|
|
|
|
if (STATISTIC_KIND_NULL_MCV == t || STATISTIC_KIND_MCV == t) {
|
|
atttypid_temp = CSTRINGOID;
|
|
atttypmod_temp = -1;
|
|
}
|
|
|
|
slot->tts_values[k] = FunctionCall3Coll(slot->tts_attinmeta->attinfuncs + k,
|
|
InvalidOid,
|
|
(Datum)slot->tts_values[k],
|
|
UInt32GetDatum(atttypid_temp),
|
|
Int32GetDatum(atttypmod_temp));
|
|
|
|
if (STATISTIC_KIND_NULL_MCV == t || STATISTIC_KIND_MCV == t) {
|
|
slot->tts_values[k] = es_mcv_slot_cstring_array_to_array_array(
|
|
(Datum)slot->tts_values[k], num_column, atttypid_array, atttypmod_array);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* In multi-column statistic, we are going to check & free the array of typid/typmode */
|
|
if (atttypid_array != NULL) {
|
|
pfree_ext(atttypid_array);
|
|
atttypid_array = NULL;
|
|
}
|
|
|
|
if (atttypmod_array != NULL) {
|
|
pfree_ext(atttypmod_array);
|
|
atttypmod_array = NULL;
|
|
}
|
|
|
|
// Is there already a pg_statistic_ext tuple for this column group?
|
|
//
|
|
Relation sd = heap_open(StatisticExtRelationId, RowExclusiveLock);
|
|
MemoryContext current_context = MemoryContextSwitchTo(oldcontext);
|
|
ResourceOwner asOwner, oldOwner1;
|
|
/*
|
|
* Create a resource owner to keep track of resources
|
|
* in order to release resources when catch the exception.
|
|
*/
|
|
asOwner = ResourceOwnerCreate(t_thrd.utils_cxt.CurrentResourceOwner, "update_stats",
|
|
THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_OPTIMIZER));
|
|
oldOwner1 = t_thrd.utils_cxt.CurrentResourceOwner;
|
|
t_thrd.utils_cxt.CurrentResourceOwner = asOwner;
|
|
|
|
PG_TRY();
|
|
{
|
|
oldtup = SearchSysCache4(STATRELKINDKEYINH,
|
|
ObjectIdGetDatum(slot->tts_values[Anum_pg_statistic_ext_starelid - 1]),
|
|
CharGetDatum(slot->tts_values[Anum_pg_statistic_ext_starelkind - 1]),
|
|
BoolGetDatum(slot->tts_values[Anum_pg_statistic_ext_stainherit - 1]),
|
|
slot->tts_values[Anum_pg_statistic_ext_stakey - 1]);
|
|
|
|
if (HeapTupleIsValid(oldtup)) {
|
|
// Yes, replace it
|
|
//
|
|
stup = heap_modify_tuple(oldtup, RelationGetDescr(sd), slot->tts_values, slot->tts_isnull, replaces);
|
|
ReleaseSysCache(oldtup);
|
|
(void)simple_heap_update(sd, &stup->t_self, stup);
|
|
} else {
|
|
// No, insert new tuple
|
|
//
|
|
stup = heap_form_tuple(RelationGetDescr(sd), slot->tts_values, slot->tts_isnull);
|
|
simple_heap_insert(sd, stup);
|
|
}
|
|
|
|
// update indexes too
|
|
//
|
|
CatalogUpdateIndexes(sd, stup);
|
|
}
|
|
PG_CATCH();
|
|
{
|
|
analyze_concurrency_process(slot->tts_values[Anum_pg_statistic_ext_starelid - 1],
|
|
ES_MULTI_COLUMN_STATS_ATTNUM,
|
|
current_context,
|
|
PG_FUNCNAME_MACRO);
|
|
}
|
|
PG_END_TRY();
|
|
|
|
/* Release everything */
|
|
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_BEFORE_LOCKS, false, false);
|
|
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_LOCKS, false, false);
|
|
ResourceOwnerRelease(asOwner, RESOURCE_RELEASE_AFTER_LOCKS, false, false);
|
|
t_thrd.utils_cxt.CurrentResourceOwner = oldOwner1;
|
|
ResourceOwnerDelete(asOwner);
|
|
heap_close(sd, RowExclusiveLock);
|
|
}
|
|
|
|
/* Handle partition pages&tuples fetched from datanode. */
|
|
static void ReceivePartitionPageAndTuple(Oid relid, TupleTableSlot* slot)
|
|
{
|
|
Relation rel;
|
|
Relation fakerel;
|
|
Partition partrel;
|
|
Name partname;
|
|
char parttype;
|
|
Oid partitionid = InvalidOid;
|
|
LOCKMODE part_lock;
|
|
RelPageType relpages;
|
|
double reltuples;
|
|
BlockNumber relallvisible;
|
|
|
|
partname = DatumGetName(slot->tts_values[0]);
|
|
parttype = DatumGetChar(slot->tts_values[1]);
|
|
relpages = (RelPageType)DatumGetFloat8(slot->tts_values[2]);
|
|
reltuples = (double)DatumGetFloat8(slot->tts_values[3]);
|
|
relallvisible = (BlockNumber)DatumGetInt32(slot->tts_values[4]);
|
|
|
|
if (parttype == PART_OBJ_TYPE_TABLE_PARTITION) {
|
|
part_lock = ShareUpdateExclusiveLock;
|
|
} else if (parttype == PART_OBJ_TYPE_INDEX_PARTITION) {
|
|
part_lock = RowExclusiveLock;
|
|
} else {
|
|
/* should not happen */
|
|
Assert(0);
|
|
ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized LOCKMODE type.")));
|
|
part_lock = 0; /* keep the compiler quiet */
|
|
}
|
|
|
|
rel = relation_open(relid, ShareUpdateExclusiveLock);
|
|
partitionid = PartitionNameGetPartitionOid(
|
|
relid, (const char*)partname->data, parttype, part_lock, true, false, NULL, NULL, NoLock);
|
|
|
|
if (!OidIsValid(partitionid)) {
|
|
relation_close(rel, NoLock);
|
|
return;
|
|
}
|
|
partrel = partitionOpen(rel, partitionid, NoLock);
|
|
|
|
vac_update_partstats(partrel, (BlockNumber)relpages, reltuples, relallvisible,
|
|
BootstrapTransactionId, RelationIsColStore(rel) ? InvalidMultiXactId : FirstMultiXactId);
|
|
|
|
/*
|
|
 * We do not fetch dead-tuple info from the remote DN/CN, so just set deadtuples to 0. It does
 * not matter, because we would have to fetch dead-tuple info from all datanodes to calculate a
 * user-defined table's dead-tuple count anyway.
 *
 * For now we only analyze whole tables and keep no stats info per partition; we handle partitions
 * here because analyzing a single partition may be supported one day.
|
|
*/
|
|
fakerel = partitionGetRelation(rel, partrel);
|
|
pgstat_report_analyze(fakerel, (PgStat_Counter)reltuples, (PgStat_Counter)0);
|
|
releaseDummyRelation(&fakerel);
|
|
|
|
partitionClose(rel, partrel, NoLock);
|
|
relation_close(rel, NoLock);
|
|
}
|
|
|
|
char* repairObjectName(const char* relname)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
}
|
|
|
|
// @Temp Table: when pg_temp.table_name is used, we should change the schema name
// to the actual temp schema name when generating SQL that uses the name.
|
|
char* repairTempNamespaceName(char* name)
|
|
{
|
|
Assert(name != NULL);
|
|
Assert(strcasecmp(name, "pg_temp") == 0);
|
|
|
|
return pstrdup(get_namespace_name(u_sess->catalog_cxt.myTempNamespace));
|
|
}
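/*
 * Illustrative example: a query that names "pg_temp.t1" is rewritten to use the
 * session's concrete temp schema, e.g. something like "pg_temp_3" (the exact name
 * is backend-specific), so the remote node resolves the same temporary table.
 */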
|
|
|
|
int FetchStatistics4WLM(const char* sql, void* info, Size size, strategy_func func)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return 0;
|
|
}
|
|
|
|
PGXCNodeAllHandles* connect_compute_pool(int srvtype)
|
|
{
|
|
Assert(false);
|
|
DISTRIBUTED_FEATURE_NOT_SUPPORTED();
|
|
return NULL;
|
|
}
|
|
|
|
/*
 * @Description: connect to the compute pool for OBS foreign table.
 *               conninfo is from the cp_client.conf which is in the
 *               data directory of the DWS CN.
 *
 * @return: conn handle to the compute pool.
 */
static PGXCNodeAllHandles* connect_compute_pool_for_OBS()
{
    int cnum = 0;
    ComputePoolConfig** confs = get_cp_conninfo(&cnum);

    return make_cp_conn(confs, cnum, T_OBS_SERVER);
}

/*
 * @Description: connect to the compute pool for HDFS foreign table.
 *               conninfo is from dummy server.
 *
 * @return: conn handle to the compute pool.
 */
static PGXCNodeAllHandles* connect_compute_pool_for_HDFS()
{
    errno_t rt;
    int ret;

    char address[NAMEDATALEN] = {0};

    struct addrinfo* gai_result = NULL;

    /* get all connection info from the options of "dummy server" */
    DummyServerOptions* options = getDummyServerOption();

    Assert(options);

    rt = memcpy_s(address, NAMEDATALEN, options->address, strlen(options->address));
    securec_check(rt, "\0", "\0");

    /* parse "ip:port" into the binary format of IP and Port */
    char* port = strchr(address, ':');
    if (port == NULL)
        ereport(ERROR,
            (errcode(ERRCODE_UNEXPECTED_NULL_VALUE),
                errmodule(MOD_ACCELERATE),
                errmsg("invalid address for the compute pool: %s", address)));

    *port = '\0';
    port++;

    ret = getaddrinfo(address, NULL, NULL, &gai_result);
    if (ret != 0)
        ereport(ERROR,
            (errcode(ERRCODE_INVALID_OBJECT_DEFINITION),
                errmodule(MOD_ACCELERATE),
                errmsg("could not translate host name \"%s\" to address: %s", address, gai_strerror(ret))));

    struct sockaddr_in* h = (struct sockaddr_in*)gai_result->ai_addr;
    char ip[16] = {0};
    rt = strcpy_s(ip, sizeof(ip), inet_ntoa(h->sin_addr));
    securec_check(rt, "\0", "\0");

    freeaddrinfo(gai_result);

    elog(DEBUG1, "compute pool ip: %s, port: %s", ip, port);

    ComputePoolConfig config;
    config.cpip = ip;
    config.cpport = port;
    config.username = options->userName;
    config.password = options->passWord;

    ComputePoolConfig** configs = (ComputePoolConfig**)palloc0(sizeof(ComputePoolConfig*));
    configs[0] = &config;

    return make_cp_conn(configs, 1, T_HDFS_SERVER, options->dbname);
}

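/*
 * Example of the address handling above (values purely illustrative): an
 * options->address of "192.168.0.11:25308" is split at the ':' into host
 * "192.168.0.11" and port "25308"; the host part is then resolved with
 * getaddrinfo() and turned back into a dotted-quad string for the conn string.
 */
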
/*
 * @Description: connect to the compute pool.
 *
 * @param[IN] : configs, necessary conn info such as ip, port, username and password.
 *
 * @return: conn handle to the compute pool.
 */
static PGXCNodeAllHandles* make_cp_conn(ComputePoolConfig** configs, int cnum, int srvtype, const char* dbname)
{
    PGXCNodeAllHandles* handles = NULL;

    for (int i = 0; i < cnum - 1; i++) {
        MemoryContext current_ctx = CurrentMemoryContext;

        PG_TRY();
        {
            handles = try_make_cp_conn(configs[i]->cpip, configs[i], srvtype, dbname);

            return handles;
        }
        PG_CATCH();
        {
            /*
             * This compute pool is unavailable, so reset the memory context and clear
             * the error stack.
             */
            MemoryContextSwitchTo(current_ctx);

            /* Save error info */
            ErrorData* edata = CopyErrorData();

            ereport(LOG,
                (errcode(ERRCODE_CONNECTION_FAILURE),
                    errmodule(MOD_ACCELERATE),
                    errmsg("Fail to connect to the compute pool: %s, cause: %s", configs[i]->cpip, edata->message)));

            FlushErrorState();

            FreeErrorData(edata);
        }
        PG_END_TRY();
    }

    handles = try_make_cp_conn(configs[cnum - 1]->cpip, configs[cnum - 1], srvtype, dbname);

    return handles;
}

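/*
 * Note on the loop above: the first cnum - 1 candidate compute pools are tried inside
 * PG_TRY/PG_CATCH, so a connection failure is merely logged and the next candidate is
 * attempted; the last candidate is tried outside the loop so that its error, if any,
 * propagates to the caller.
 */
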
/*
 * @Description: do the real job of connecting to the compute pool.
 *
 * @param[IN] : config, necessary conn info such as ip, port, username and password.
 *
 * @return: conn handle to the compute pool.
 */
static PGXCNodeAllHandles* try_make_cp_conn(
    const char* cpip, ComputePoolConfig* config, int srvtype, const char* dbname)
{
#define PASSWORD_LEN 128

    /* make connection string */
    PGXCNodeHandle** handles = NULL;
    PGXCNodeAllHandles* pgxc_handles = NULL;

    if (dbname == NULL) {
        dbname = "postgres";
    }

    char* pgoptions = session_options();
    char* tmp_str =
        PGXCNodeConnStr(cpip, atoi(config->cpport), dbname, config->username, pgoptions, "application", PROTO_TCP, "");

    StringInfo conn_str = makeStringInfo();

    if (srvtype == T_OBS_SERVER) {
        char password[PASSWORD_LEN] = {0};
        errno_t rc = 0;
        decryptOBS(config->password, password, PASSWORD_LEN);
        appendStringInfo(conn_str, "%s, password='%s'", tmp_str, password);
        rc = memset_s(password, PASSWORD_LEN, 0, PASSWORD_LEN);
        securec_check(rc, "\0", "\0");
    } else
        appendStringInfo(conn_str, "%s, password='%s'", tmp_str, config->password);

    /* prepare result */
    pgxc_handles = (PGXCNodeAllHandles*)palloc0(sizeof(PGXCNodeAllHandles));
    pgxc_handles->dn_conn_count = 1;

    handles = (PGXCNodeHandle**)palloc0(sizeof(PGXCNodeHandle*));

    pgxc_handles->datanode_handles = handles;

    /* connect to the compute pool */
    connect_server(conn_str->data, &handles[0], cpip, atoi(config->cpport), tmp_str);

    /* clear the connection string, which contains the password */
    int passWordLength = strlen(conn_str->data);
    errno_t rc = memset_s(conn_str->data, passWordLength, 0, passWordLength);
    securec_check(rc, "\0", "\0");

    return pgxc_handles;
}

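/*
 * The result built above is shaped like a one-node cluster handle set: dn_conn_count
 * is 1 and datanode_handles[0] is the single compute-pool connection, which presumably
 * lets callers reuse the ordinary datanode send/receive paths on it.
 */
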
void PgFdwSendSnapshot(StringInfo buf, Snapshot snapshot)
{
    Assert(false);
    DISTRIBUTED_FEATURE_NOT_SUPPORTED();
    return;
}

void PgFdwSendSnapshot(StringInfo buf, Snapshot snapshot, Size snap_size)
{
    Assert(false);
    DISTRIBUTED_FEATURE_NOT_SUPPORTED();
    return;
}

void PgFdwRemoteReply(StringInfo msg)
{
    DISTRIBUTED_FEATURE_NOT_SUPPORTED();
    return;
}

/* PG_VERSION_STR */

/**
 * @Description: the CN is about to commit on GTM; first notify the DN to set its csn to commit-in-progress.
 * @return - no return
 */
void NotifyDNSetCSN2CommitInProgress()
{
    PGXCNodeHandle** connections = u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles;

    /* A two-phase transaction does not need this notification */
    if (u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes != 1)
        return;

    /*
     * Send the queryid to the participant
     */
    if (pgxc_node_send_queryid(connections[0], u_sess->debug_query_id))
        ereport(ERROR,
            (errcode(ERRCODE_CONNECTION_EXCEPTION),
                errmsg("Failed to send queryid to %s before COMMIT command(1PC)", connections[0]->remoteNodeName)));
    /*
     * Now notify the participant that the commit is about to happen
     */
    if (pgxc_node_notify_commit(connections[0])) {
        u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[0] = RXACT_NODE_COMMIT_FAILED;
        u_sess->pgxc_cxt.remoteXactState->status = RXACT_COMMIT_FAILED;

        /*
         * If an error occurred we cannot abort, because the local commit happened first.
         * Just report the error message.
         */
        ereport(ERROR,
            (errcode(ERRCODE_CONNECTION_EXCEPTION),
                errmsg("failed to notify node %u to commit", connections[0]->nodeoid)));
    }

    /* wait for the response */
    {
        RemoteQueryState* combiner = CreateResponseCombiner(1, COMBINE_TYPE_NONE);
        /* Receive responses */
        int result = pgxc_node_receive_responses(1, connections, NULL, combiner, false);
        if (result || !validate_combiner(combiner)) {
            ereport(ERROR,
                (errcode(ERRCODE_CONNECTION_EXCEPTION),
                    errmsg("failed to receive response from node %u after notify commit", connections[0]->nodeoid)));
        } else {
            CloseCombiner(combiner);
            combiner = NULL;
        }
    }
}

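/*
 * Note on the numWriteRemoteNodes check above: when more than one remote node has
 * written, the transaction goes through the two-phase commit path and does not need
 * this notification; only the single-writer (one-phase) case tells its DN to mark the
 * csn as commit-in-progress before the CN commits on GTM.
 */
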
/**
 * @Description: send the commit csn to the other pgxc nodes.
 * @in commit_csn - the csn to be sent
 * @return - no return
 */
void SendPGXCNodeCommitCsn(uint64 commit_csn)
{
#define ERRMSG_BUFF_SIZE 256
    int rc = 0;
    char errMsg[ERRMSG_BUFF_SIZE];
    int write_conn_count = u_sess->pgxc_cxt.remoteXactState->numWriteRemoteNodes;
    PGXCNodeHandle** connections = u_sess->pgxc_cxt.remoteXactState->remoteNodeHandles;

    /*
     * We must handle both reader and writer connections since the transaction
     * must be closed even on a read-only node
     */
    if (IS_PGXC_DATANODE || write_conn_count == 0)
        return;

    /*
     * Now send the commit csn to all the participants
     */
    for (int i = 0; i < write_conn_count; i++) {
        if (pgxc_node_send_commit_csn(connections[i], commit_csn)) {
            u_sess->pgxc_cxt.remoteXactState->remoteNodeStatus[i] = RXACT_NODE_COMMIT_FAILED;
            u_sess->pgxc_cxt.remoteXactState->status = RXACT_COMMIT_FAILED;

            /*
             * If an error occurred we cannot abort, because the local commit happened first.
             * Just record the error message.
             */
            if (i == 0) {
                ereport(ERROR,
                    (errcode(ERRCODE_CONNECTION_EXCEPTION),
                        errmsg("failed to send commit csn to node %u", connections[i]->nodeoid)));
            } else {
                rc = sprintf_s(errMsg,
                    ERRMSG_BUFF_SIZE,
                    "failed to send commit csn "
                    "command to node %s",
                    connections[i]->remoteNodeName);
                securec_check_ss(rc, "\0", "\0");
                add_error_message(connections[i], "%s", errMsg);
            }
        }
    }
}

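/*
 * Note on the error handling above: a failure on the first writer raises an ERROR
 * immediately, while failures on later writers are only recorded on the handle via
 * add_error_message(), because the local commit has already happened and the
 * transaction can no longer be aborted.
 */
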
/**
 * @Description: get the List of PGXCNodeHandle tracking the writers involved in the
 *               current transaction.
 * @return - List *u_sess->pgxc_cxt.XactWriteNodes
 */
List* GetWriterHandles()
{
    return u_sess->pgxc_cxt.XactWriteNodes;
}

/**
 * @Description: get the List of PGXCNodeHandle tracking the readers involved in the
 *               current transaction.
 * @return - List *u_sess->pgxc_cxt.XactReadNodes
 */
List* GetReaderHandles()
{
    return u_sess->pgxc_cxt.XactReadNodes;
}

/**
 * @Description: check whether the hosts of the connections match the node definitions of the current plan.
 *               If not, a switchover has happened and the address and port information of the current plan
 *               needs to be updated.
 * @planstmt - current plan statement
 * @connections - connection info from the pooler, which reflects the current primary nodes
 * @regular_conn_count - number of nodes in the plan
 */
void pgxc_check_and_update_nodedef(PlannedStmt* planstmt, PGXCNodeHandle** connections, int regular_conn_count)
{
    int i, node_id, rc;
    NameData temp_nodehost;
    int temp_nodeport;
    int temp_nodectlport;
    int temp_nodesctpport;
    char node_type;

    if (unlikely(planstmt == NULL || connections == NULL)) {
        return;
    }

    for (i = 0; i < regular_conn_count; i++) {
        /*
         * For primary/standby/dummy mode the nodedefs of primary and standby are in the same row
         * of pgxc_node; just switch the upper part and the lower part.
         */
        if (IS_DN_DUMMY_STANDYS_MODE()) {
            node_id = GetNodeIdFromNodesDef(planstmt->nodesDefinition, connections[i]->nodeoid);
            if (unlikely(node_id < 0)) {
                ereport(ERROR,
                    (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Failed to get valid node id from node definitions")));
            }

            if (unlikely(strcmp(NameStr(connections[i]->connInfo.host),
                    NameStr(planstmt->nodesDefinition[node_id].nodehost)) != 0)) {
                temp_nodehost = planstmt->nodesDefinition[node_id].nodehost;
                temp_nodeport = planstmt->nodesDefinition[node_id].nodeport;

                planstmt->nodesDefinition[node_id].nodehost = planstmt->nodesDefinition[node_id].nodehost1;
                planstmt->nodesDefinition[node_id].nodeport = planstmt->nodesDefinition[node_id].nodeport1;
                planstmt->nodesDefinition[node_id].nodehost1 = temp_nodehost;
                planstmt->nodesDefinition[node_id].nodeport1 = temp_nodeport;

                /* sctp mode information of the standby datanode */
                temp_nodectlport = planstmt->nodesDefinition[node_id].nodectlport;
                temp_nodesctpport = planstmt->nodesDefinition[node_id].nodesctpport;

                planstmt->nodesDefinition[node_id].nodectlport = planstmt->nodesDefinition[node_id].nodectlport1;
                planstmt->nodesDefinition[node_id].nodesctpport = planstmt->nodesDefinition[node_id].nodesctpport1;
                planstmt->nodesDefinition[node_id].nodectlport1 = temp_nodectlport;
                planstmt->nodesDefinition[node_id].nodesctpport1 = temp_nodesctpport;
            }
        }
        /*
         * For multi-standby mode the nodedefs of primary and standby are different rows;
         * get the primary node oid from pgxc_node and update the nodedef to the current primary node.
         */
        else {
            if (get_pgxc_nodetype(connections[i]->nodeoid) == PGXC_NODE_COORDINATOR) {
                node_type = PGXC_NODE_COORDINATOR;
            } else {
                node_type = PGXC_NODE_DATANODE;
            }

            node_id = PGXCNodeGetNodeId(connections[i]->nodeoid, node_type);
            if (unlikely(strcmp(NameStr(connections[i]->connInfo.host),
                    NameStr(planstmt->nodesDefinition[node_id].nodehost)) != 0)) {
                Oid current_primary_oid = PgxcNodeGetPrimaryDNFromMatric(planstmt->nodesDefinition[node_id].nodeoid);
                NodeDefinition* res = PgxcNodeGetDefinition(current_primary_oid);

                elog(WARNING,
                    "nodesDefinition of planstmt is wrong, [node:%s,oid:%u] planstmt host:%s, expected host:%s, has "
                    "been fixed to:%s.",
                    connections[i]->remoteNodeName,
                    connections[i]->nodeoid,
                    NameStr(planstmt->nodesDefinition[node_id].nodehost),
                    NameStr(connections[i]->connInfo.host),
                    (res == NULL) ? "" : NameStr(res->nodehost));
                if (res != NULL) {
                    rc = memcpy_s(
                        &planstmt->nodesDefinition[node_id], sizeof(NodeDefinition), res, sizeof(NodeDefinition));
                    securec_check(rc, "\0", "\0");
                    pfree(res);
                }
            }
        }
    }
}

/* Split a delimiter-separated node-name list in place and collect pointers to each name. */
static void parse_nodes_name(char* node_list, char** target_nodes, int* node_num)
{
    char* p = node_list;
    int num = *node_num;
    int node_list_len = strlen(node_list);
    while (((p - node_list) <= node_list_len) && *p != '\0') {
        char* node_name = p;
        int node_name_lenth = 0;
        while (*p != '\0' && (*p != MOD_DELIMITER)) {
            p++;
            node_name_lenth++;
        }
        if (num >= (u_sess->pgxc_cxt.NumCoords + u_sess->pgxc_cxt.NumDataNodes))
            ereport(ERROR,
                (errcode(ERRCODE_STRING_DATA_LENGTH_MISMATCH),
                    errmsg("invalid node_list, coor_num:%d, datanode_num:%d",
                        u_sess->pgxc_cxt.NumCoords,
                        u_sess->pgxc_cxt.NumDataNodes)));
        target_nodes[num] = node_name;
        num++;
        if (*p == MOD_DELIMITER) {
            /* terminate the current name and skip the ',' delimiter */
            *p = '\0';
            p++;
        }
    }
    *node_num = num;
}

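/*
 * Example (a sketch; the names are illustrative and MOD_DELIMITER is assumed to be the
 * ',' mentioned in the comment above): with node_list = "datanode1,datanode2,coordinator1",
 * the call leaves target_nodes[0..2] pointing at "datanode1", "datanode2" and
 * "coordinator1" (the delimiters are overwritten with '\0' in place) and sets
 * *node_num to 3.
 */
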
/* Pick out the connection handles whose node names appear in target_nodes; returns how many were found. */
static int get_target_conn(
    int node_num, char** target_nodes, PGXCNodeHandle** connections, PGXCNodeAllHandles* pgxc_connections)
{
    PGXCNodeHandle** dn_connections = NULL;
    PGXCNodeHandle** cn_connections = NULL;
    int dn_conn_count = 0;
    int cn_conn_count = 0;
    int conn_idx = 0;
    int i = 0;
    int j = 0;
    bool find_node = false;

    pgxc_connections = get_exec_connections(NULL, NULL, EXEC_ON_ALL_NODES);
    dn_connections = pgxc_connections->datanode_handles;
    dn_conn_count = pgxc_connections->dn_conn_count;
    cn_connections = pgxc_connections->coord_handles;
    cn_conn_count = pgxc_connections->co_conn_count;

    for (i = 0; i < node_num; i++) {
        /* reset for each target so a missing node is reported even after earlier matches */
        find_node = false;
        for (j = 0; j < dn_conn_count; j++) {
            if (strcmp(dn_connections[j]->remoteNodeName, target_nodes[i]) == 0) {
                connections[conn_idx] = dn_connections[j];
                conn_idx++;
                find_node = true;
            }
        }
        for (j = 0; j < cn_conn_count; j++) {
            Oid node_oid = get_pgxc_nodeoid(target_nodes[i]);
            bool nodeis_active = true;
            nodeis_active = is_pgxc_nodeactive(node_oid);
            if ((strcmp(cn_connections[j]->remoteNodeName, target_nodes[i]) == 0) && OidIsValid(node_oid) &&
                nodeis_active) {
                connections[conn_idx] = cn_connections[j];
                conn_idx++;
                find_node = true;
            }
        }
        if (!find_node) {
            ereport(WARNING, (errmsg("target node %s is not in cluster nodes.", target_nodes[i])));
        }
    }
    return conn_idx;
}

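/*
 * Note on the matching rules in get_target_conn above: datanode handles are matched by
 * name alone, while coordinator handles must additionally refer to a valid and active
 * pgxc_node entry; a target name that matches nothing only raises a WARNING and is skipped.
 */
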
/* Send the given statement to each connection and report any per-node send or execution failures. */
static void send_remote_node_query(int conn_num, PGXCNodeHandle** connections, const char* sql)
{
    int new_conn_count = 0;
    int i = 0;
    int result = 0;
    RemoteQueryState* combiner = NULL;
    StringInfoData send_fail_node;
    StringInfoData exec_fail_node;
    bool clean_success = true;
    PGXCNodeHandle** new_connections = (PGXCNodeHandle**)palloc0(sizeof(PGXCNodeHandle*) * conn_num);

    initStringInfo(&send_fail_node);
    initStringInfo(&exec_fail_node);

    for (i = 0; i < conn_num; i++) {
        /* Clean the previous errors, if any */
        connections[i]->error = NULL;
        if (pgxc_node_send_query(connections[i], sql, false, false, false,
                g_instance.attr.attr_storage.enable_gtm_free)) {
            /* record the name of the node whose send failed */
            clean_success = false;
            appendStringInfo(&send_fail_node, "%s ", connections[i]->remoteNodeName);
        } else {
            new_connections[new_conn_count++] = connections[i];
        }
    }

    /* get the responses */
    if (new_conn_count) {
        combiner = CreateResponseCombiner(new_conn_count, COMBINE_TYPE_NONE);
        /* Receive responses */
        result = pgxc_node_receive_responses(new_conn_count, new_connections, NULL, combiner, false);
        if (result || !validate_combiner(combiner)) {
            result = EOF;
        } else {
            CloseCombiner(combiner);
            combiner = NULL;
        }

        for (i = 0; i < new_conn_count; i++) {
            if (new_connections[i]->error) {
                clean_success = false;
                appendStringInfo(&exec_fail_node, "%s ", new_connections[i]->remoteNodeName);
            }
        }
    }
    pfree(new_connections);

    /* report the detailed failure message */
    if (clean_success == false)
        ereport(ERROR,
            (errcode(ERRCODE_CONNECTION_EXCEPTION),
                errmsg("Failed to send COMMIT/ROLLBACK on nodes: %s. Failed to COMMIT/ROLLBACK the transaction on "
                       "nodes: %s.",
                    send_fail_node.data,
                    exec_fail_node.data)));

    if (result) {
        if (combiner != NULL)
            pgxc_node_report_error(combiner, TwoPhaseCommit ? WARNING : ERROR);
        else
            ereport(TwoPhaseCommit ? WARNING : ERROR,
                (errcode(ERRCODE_CONNECTION_EXCEPTION),
                    errmsg(
                        "Connection error with Datanode, so failed to COMMIT the transaction on one or more nodes")));
    }
}

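/*
 * A minimal sketch (not code from this file; the node names and the SQL text are purely
 * illustrative) of how the three static helpers above fit together:
 *
 *   char* list = pstrdup("datanode1,datanode2");
 *   int total = u_sess->pgxc_cxt.NumCoords + u_sess->pgxc_cxt.NumDataNodes;
 *   char** names = (char**)palloc0(sizeof(char*) * total);
 *   int node_num = 0;
 *   parse_nodes_name(list, names, &node_num);                       // split the list in place
 *
 *   PGXCNodeHandle** conns = (PGXCNodeHandle**)palloc0(sizeof(PGXCNodeHandle*) * total);
 *   int conn_num = get_target_conn(node_num, names, conns, NULL);   // resolve handles
 *
 *   send_remote_node_query(conn_num, conns, "COMMIT PREPARED 'gid'");  // run it and check the responses
 */
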
Datum global_clean_prepared_xacts(PG_FUNCTION_ARGS)
{
    ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("unsupported proc in single node mode.")));

    PG_RETURN_NULL();
}

bool check_receive_buffer(RemoteQueryState* combiner, int tapenum, bool* has_checked, int* has_err_idx)
{
    Assert(false);
    DISTRIBUTED_FEATURE_NOT_SUPPORTED();
    return false;
}
