Files
openGauss-server/src/common/backend/libpq/pqcomm.cpp
2020-08-25 21:27:33 +08:00

2523 lines
85 KiB
C++

/* -------------------------------------------------------------------------
*
* pqcomm.cpp
* Communication functions between the Frontend and the Backend
*
* These routines handle the low-level details of communication between
* frontend and backend. They just shove data across the communication
* channel, and are ignorant of the semantics of the data --- or would be,
* except for major brain damage in the design of the old COPY OUT protocol.
* Unfortunately, COPY OUT was designed to commandeer the communication
* channel (it just transfers data without wrapping it into messages).
* No other messages can be sent while COPY OUT is in progress; and if the
* copy is aborted by an ereport(ERROR), we need to close out the copy so that
* the frontend gets back into sync. Therefore, these routines have to be
* aware of COPY OUT state. (New COPY-OUT is message-based and does *not*
* set the DoingCopyOut flag.)
*
* NOTE: generally, it's a bad idea to emit outgoing messages directly with
* pq_putbytes(), especially if the message would require multiple calls
* to send. Instead, use the routines in pqformat.c to construct the message
* in a buffer and then emit it in one call to pq_putmessage. This ensures
* that the channel will not be clogged by an incomplete message if execution
* is aborted by ereport(ERROR) partway through the message. The only
* non-libpq code that should call pq_putbytes directly is old-style COPY OUT.
*
* At one time, libpq was shared between frontend and backend, but now
* the backend's "backend/libpq" is quite separate from "interfaces/libpq".
* All that remains is similarities of names to trap the unwary...
*
* Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/common/backend/libpq/pqcomm.cpp
*
* -------------------------------------------------------------------------
*/
/* ------------------------
* INTERFACE ROUTINES
*
* setup/teardown:
* StreamServerPort - Open postmaster's server port
* StreamConnection - Create new connection with client
* StreamClose - Close a client/backend connection
* TouchSocketFile - Protect socket file against /tmp cleaners
* pq_init - initialize libpq at backend startup
* pq_comm_reset - reset libpq during error recovery
* pq_close - shutdown libpq at backend exit
*
* low-level I/O:
* pq_getbytes - get a known number of bytes from connection
* pq_getstring - get a null terminated string from connection
* pq_getmessage - get a message with length word from connection
* pq_getbyte - get next byte from connection
* pq_peekbyte - peek at next byte from connection
* pq_putbytes - send bytes to connection (not flushed until pq_flush)
* pq_flush - flush pending output
* pq_flush_if_writable - flush pending output if writable without blocking
* pq_getbyte_if_available - get a byte if available without blocking
*
* message-level I/O (and old-style-COPY-OUT cruft):
* pq_putmessage - send a normal message (suppressed in COPY OUT mode)
* pq_putmessage_noblock - buffer a normal message (suppressed in COPY OUT)
* pq_startcopyout - inform libpq that a COPY OUT transfer is beginning
* pq_endcopyout - end a COPY OUT transfer
*
* ------------------------
*/
#include "postgres.h"
#include "knl/knl_variable.h"
#include <fcntl.h>
#include <grp.h>
#include <sys/file.h>
#include <sys/time.h>
#ifdef HAVE_NETINET_TCP_H
#include <netinet/tcp.h>
#endif
#include <arpa/inet.h>
#ifdef HAVE_UTIME_H
#include <utime.h>
#endif
#ifdef WIN32_ONLY_COMPILER /* mstcpip.h is missing on mingw */
#include <mstcpip.h>
#endif
#ifdef HAVE_POLL_H
#include <poll.h>
#endif
#include "pgxc/pgxc.h"
#include "libpq/ip.h"
#include "libpq/libpq.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/ipc.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "libpq/pqcomm.h"
#include "replication/replicainternal.h"
#include "utils/timestamp.h"
#include "postmaster/postmaster.h"
#include "libcomm/libcomm.h"
#include "libpq/pqformat.h"
#include "pgxc/nodemgr.h"
#include "storage/lz4_file.h"
#include "tcop/stmt_retry.h"
#include "distributelayer/streamProducer.h"
#define MAXLISTEN 64
#define IP_LEN 64
#define CRC_HEADER 12 // uint32 sequence number + uint32 data length + uint32 crc checksum.
extern bool StreamThreadAmI();
extern GlobalNodeDefinition* global_node_definition;
/*
* Buffers for low-level I/O.
*
* The receive buffer is fixed size. Send buffer is usually 8k, but can be
* enlarged by pq_putmessage_noblock() if the message doesn't fit otherwise.
*/
#define PQ_BUFFER_SIZE 8192
#define PQ_SEND_BUFFER_SIZE PQ_BUFFER_SIZE
#ifdef USE_RETRY_STUB
#define PQ_RECV_BUFFER_SIZE 16
#else
#define PQ_RECV_BUFFER_SIZE PQ_BUFFER_SIZE
#endif
#define NAPTIME_PER_SEND_RETRY 100 /* max sleep between two send try (100ms) */
#define NAPTIME_PER_SEND 10 /* max sleep before sending next batch of data (10ms) */
void pq_close(int code, Datum arg);
/* Internal functions */
static int internal_putbytes(const char* s, size_t len);
static int internal_flush(void);
static void pq_set_nonblocking(bool nonblocking);
static void pq_disk_generate_checking_header(
const char* src_data, StringInfo dest_data, uint32 data_len, uint32 seq_num);
static size_t pq_disk_read_data_block(
LZ4File* file_handle, char* src_data, char* dest_data, uint32 data_len, uint32 seq_num);
#ifdef HAVE_UNIX_SOCKETS
static int Lock_AF_UNIX(unsigned short portNumber, const char* unixSocketName, bool is_create_psql_sock);
static int Setup_AF_UNIX(bool is_create_psql_sock);
#endif /* HAVE_UNIX_SOCKETS */
static void socket_comm_reset(void);
static int socket_flush(void);
static int socket_flush_if_writable(void);
static bool socket_is_send_pending(void);
static int socket_putmessage(char msgtype, const char *s, size_t len);
static int socket_putmessage_noblock(char msgtype, const char *s, size_t len);
static void socket_startcopyout(void);
static void socket_endcopyout(bool errorAbort);
/*
 * Dispatch table binding the socket-based implementations declared above to
 * the PQcommMethods interface.
 * NOTE(review): this is a positional initializer, so the entry order must
 * match the member order of PQcommMethods (declared elsewhere) -- confirm
 * before reordering or inserting entries.
 */
static PQcommMethods PqCommSocketMethods = {
socket_comm_reset,
socket_flush,
socket_flush_if_writable,
socket_is_send_pending,
socket_putmessage,
socket_putmessage_noblock,
socket_startcopyout,
socket_endcopyout
};
/* Per-thread pointer to the active method table; starts out on the socket implementation. */
THR_LOCAL PQcommMethods *PqCommMethods = &PqCommSocketMethods;
extern bool FencedUDFMasterMode;
/* --------------------------------
* usages for temp file operations
* --------------------------------
*/
/*
 * Bookkeeping for the temp file that buffers query results on disk
 * (see the pq_disk_* routines below).
 */
typedef struct TempFileContextInfo {
LZ4File* file_handle; /* open temp file, NULL when no file has been created */
TempFileState file_state; /* TEMPFILE_* lifecycle state of the file */
size_t file_size; /* payload bytes written so far, CRC headers excluded */
uint32 seq_count; /* count number of PqRecvBuffers */
StringInfoData crc_buffer; /* store stringinfo with crc header */
} TempFileContextInfo;
/*
 * Allocate a zero-filled TempFileContextInfo in the current memory context
 * and attach it to the given libpq thread context.
 */
void temp_file_context_init(knl_t_libpq_context* libpq_cxt)
{
    TempFileContextInfo* fresh_info = (TempFileContextInfo*)palloc0(sizeof(TempFileContextInfo));

    libpq_cxt->PqTempFileContextInfo = fresh_info;
}
/*
 * @Description: switch on buffering of query results in a temp file.
 * The flag is only raised on a coordinator and only when
 * max_cn_temp_file_size is positive; otherwise the call is a no-op.
 */
void pq_disk_enable_temp_file(void)
{
    if (u_sess->attr.attr_sql.max_cn_temp_file_size <= 0) {
        return;
    }
    if (!IS_PGXC_COORDINATOR) {
        return;
    }
    t_thrd.libpq_cxt.save_query_result_to_disk = true;
}
/*
 * @Description: switch off buffering of query results in a temp file.
 * Only the per-thread flag is cleared; an already created file is not touched here.
 */
void pq_disk_disable_temp_file(void)
{
    t_thrd.libpq_cxt.save_query_result_to_disk = false;
}
/*
 * @Description: report whether query results are currently being saved to a temp file.
 * @return - true if temp file buffering is enabled for this thread, false otherwise.
 */
bool pq_disk_is_temp_file_enabled(void)
{
    if (t_thrd.libpq_cxt.save_query_result_to_disk) {
        return true;
    }
    return false;
}
/*
* @Description: check temp file created or not.
* @return - true, tempfile is created.
*/
bool pq_disk_is_temp_file_created(void)
{
return t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle != NULL;
}
/*
* @Description: create temp file for saving query result. which will update PqTempFileContextInfo at the same time.
*/
static inline void pq_disk_create_tempfile(void)
{
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(u_sess->top_mem_cxt);
t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle = LZ4FileCreate(true);
t_thrd.libpq_cxt.PqTempFileContextInfo->file_state = TEMPFILE_CREATED;
t_thrd.libpq_cxt.PqTempFileContextInfo->file_size = 0;
t_thrd.libpq_cxt.PqTempFileContextInfo->seq_count = 0;
initStringInfo(&(t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer));
(void)MemoryContextSwitchTo(oldcontext);
}
/*
* @Description: write query result to temp file instead of saving query result to PqSendBuffer.
* @in - data, data pointer.
* @in - size, size of data to be written.
* @return - written size, EOF if error happens.
*/
static size_t pq_disk_write_tempfile(const void* data, size_t size)
{
size_t nwritten = 0;
if (t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer.data != NULL) {
resetStringInfo(&(t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer));
} else {
ereport(LOG, (errmodule(MOD_CN_RETRY), errmsg("invalid crc buffer for temp file.")));
return EOF;
}
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(u_sess->top_mem_cxt);
/* generate a crc header and append data after it */
pq_disk_generate_checking_header((char*)data,
&(t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer),
size,
t_thrd.libpq_cxt.PqTempFileContextInfo->seq_count++);
nwritten = LZ4FileWrite(t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle,
t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer.data,
size + CRC_HEADER);
(void)MemoryContextSwitchTo(oldcontext);
if (nwritten < CRC_HEADER) {
return 0;
}
nwritten -= CRC_HEADER; // don't calculate file size with crc header
Assert(nwritten == size);
t_thrd.libpq_cxt.PqTempFileContextInfo->file_state = TEMPFILE_FLUSHED;
t_thrd.libpq_cxt.PqTempFileContextInfo->file_size += nwritten;
/* be ware to write file first before flush file data to frontend */
size_t file_size = t_thrd.libpq_cxt.PqTempFileContextInfo->file_size / (GUC_UNIT_KB);
if (file_size > (size_t)u_sess->attr.attr_sql.max_cn_temp_file_size) {
/*
* if here, meaning temp file size has exceeded, in order to continue
* query execution,
* 1. we need to send current query result in temp file to
* the frontend.
* 2. but current query can't be retried any more, since we can't
* protect total query result stay in temp file.
* 3. close current file and create a new one
*/
ereport(LOG,
(errmodule(MOD_CN_RETRY),
errmsg(" %s temp file exceeded, max temp file size : %d KB, current result size : %lu KB",
PRINT_PREFIX_TYPE_ALERT,
u_sess->attr.attr_sql.max_cn_temp_file_size,
file_size)));
StmtRetrySetFileExceededFlag();
pq_disk_send_to_frontend();
pq_disk_create_tempfile();
}
return nwritten;
}
/*
 * @Description: extract send buffer data to temp file.
 *
 * The data handed to the temp file starts at PqSendBuffer + PqSendStart, so
 * the number of pending bytes is PqSendPointer - PqSendStart, not
 * PqSendPointer. Using PqSendPointer as the length would read past the
 * pending data whenever PqSendStart is non-zero. When PqSendStart is 0 the
 * behavior is identical to the previous implementation.
 * After the move both cursors are reset so the buffer is consistently empty.
 */
void pq_disk_extract_sendbuffer(void)
{
    if (t_thrd.libpq_cxt.PqSendPointer > t_thrd.libpq_cxt.PqSendStart) {
        size_t pending = (size_t)(t_thrd.libpq_cxt.PqSendPointer - t_thrd.libpq_cxt.PqSendStart);

        (void)pq_disk_write_tempfile(t_thrd.libpq_cxt.PqSendBuffer + t_thrd.libpq_cxt.PqSendStart, pending);
    }
    t_thrd.libpq_cxt.PqSendStart = 0;
    t_thrd.libpq_cxt.PqSendPointer = 0;
}
/*
* @Description: reset PqTempFileContextInfo.
*/
void pq_disk_reset_tempfile_contextinfo(void)
{
t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle = NULL;
t_thrd.libpq_cxt.PqTempFileContextInfo->file_state = TEMPFILE_DEFAULT;
t_thrd.libpq_cxt.PqTempFileContextInfo->file_size = 0;
t_thrd.libpq_cxt.PqTempFileContextInfo->seq_count = 0;
if (t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer.data != NULL) {
pfree_ext(t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer.data);
}
t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer.len = 0;
t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer.maxlen = 0;
t_thrd.libpq_cxt.PqTempFileContextInfo->crc_buffer.cursor = 0;
}
/*
* @Description: check whether temp file is once flushed before.
* @return - true, temp file had been flushed. false, temp file never flushed.
*/
bool pq_disk_is_flushed(void)
{
return (t_thrd.libpq_cxt.PqTempFileContextInfo->file_state == TEMPFILE_FLUSHED);
}
/*
 * @Description: send query result data in tempfile to frontend.
 * The file is rewound and read back block by block into PqSendBuffer; each
 * block is pushed out with internal_flush(). The temp file is always
 * discarded before returning.
 * Takes no parameters.
 * @return - 0 if OK, EOF if trouble
 * Raises FATAL when the number of blocks read back differs from the number
 * of blocks written (seq_count).
 */
int pq_disk_send_to_frontend(void)
{
Assert(t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle);
int res = 0;
ereport(DEBUG2,
(errmodule(MOD_CN_RETRY),
errmsg("data in tempfile \"%zu\". ", t_thrd.libpq_cxt.PqTempFileContextInfo->file_size)));
/* start reading from the beginning of the file */
LZ4FileRewind(t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle);
size_t total_read = 0;
size_t read_size = 0;
uint32 read_seq = 0;
t_thrd.libpq_cxt.PqTempFileContextInfo->file_state = TEMPFILE_ON_SENDING;
/* scratch area for one raw block: payload of up to PqSendBufferSize bytes plus its crc header */
char* read_data = NULL;
read_data = (char*)palloc0(t_thrd.libpq_cxt.PqSendBufferSize + CRC_HEADER);
do {
/* exclude a crc header and extract data after it */
read_size = pq_disk_read_data_block(t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle,
read_data,
t_thrd.libpq_cxt.PqSendBuffer,
t_thrd.libpq_cxt.PqSendBufferSize,
read_seq);
if (read_size == 0) {
break;
}
read_seq++;
/* the payload now sits in PqSendBuffer; mark it as pending and push it out */
t_thrd.libpq_cxt.PqSendPointer = read_size;
total_read += read_size;
res = internal_flush();
if (res == EOF) {
ereport(LOG, (errmodule(MOD_CN_RETRY), errmsg("get EOF while flushing data.")));
t_thrd.libpq_cxt.PqTempFileContextInfo->file_state = TEMPFILE_ERROR_SEND;
break;
}
/*
 * NOTE(review): the loop only continues after a block that filled the whole
 * send buffer; a shorter block ends the read-back. Presumably only the last
 * block written can be short -- confirm against the writers.
 */
} while (read_size == (size_t)t_thrd.libpq_cxt.PqSendBufferSize);
pfree_ext(read_data);
/*
 * NOTE(review): this check also runs after the EOF break above, where
 * read_seq may legitimately be smaller than seq_count; in that case the
 * FATAL below fires before the temp file is discarded -- confirm intended.
 */
if (read_seq != t_thrd.libpq_cxt.PqTempFileContextInfo->seq_count) {
ereport(FATAL,
(errmodule(MOD_CN_RETRY),
errcode(ERRCODE_DATA_EXCEPTION),
errmsg("Last read message sequence %u is not equal to the max written message sequence %u",
read_seq,
t_thrd.libpq_cxt.PqTempFileContextInfo->seq_count)));
}
t_thrd.libpq_cxt.PqTempFileContextInfo->file_size -= total_read;
/* keep the error state if a flush failed, otherwise mark the file as sent */
if (t_thrd.libpq_cxt.PqTempFileContextInfo->file_state != TEMPFILE_ERROR_SEND) {
t_thrd.libpq_cxt.PqTempFileContextInfo->file_state = TEMPFILE_SENDED;
}
ereport(DEBUG2,
(errmodule(MOD_CN_RETRY),
errmsg("remaining data in tempfile \"%zu\", total read data \"%zu\" ",
t_thrd.libpq_cxt.PqTempFileContextInfo->file_size,
total_read)));
/* since we can not reuse tempfile discard it */
pq_disk_discard_temp_file();
return res;
}
/*
* @Description: discard temp file if created, which will close and delete temp file at same time.
*/
void pq_disk_discard_temp_file(void)
{
LZ4File* file_handle = t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle;
pq_disk_reset_tempfile_contextinfo();
if (file_handle != NULL) {
LZ4FileClose(file_handle);
}
}
/*
 * @Description: read the current lifecycle state of the temp file.
 * @return - the TempFileState stored in PqTempFileContextInfo.
 */
TempFileState pq_disk_file_state(void)
{
    TempFileState current_state = t_thrd.libpq_cxt.PqTempFileContextInfo->file_state;

    return current_state;
}
/* --------------------------------
 * pq_init - initialize libpq at backend startup
 *
 * Sizes and allocates the per-thread send and receive buffers, clears all
 * buffer cursors and state flags, resets the temp file bookkeeping, and
 * registers pq_close to run at process exit.
 * --------------------------------
 */
void pq_init(void)
{
    /* the retry stub build uses a tiny fixed send buffer instead of the configured size */
#ifdef USE_RETRY_STUB
    t_thrd.libpq_cxt.PqSendBufferSize = 64;
#else
    t_thrd.libpq_cxt.PqSendBufferSize = g_instance.attr.attr_network.cn_send_buffer_size * (GUC_UNIT_KB);
#endif

    t_thrd.libpq_cxt.PqSendBuffer = (char*)MemoryContextAlloc(t_thrd.top_mem_cxt, t_thrd.libpq_cxt.PqSendBufferSize);

    t_thrd.libpq_cxt.PqRecvBufferSize = PQ_RECV_BUFFER_SIZE;
    t_thrd.libpq_cxt.PqRecvBuffer = (char*)MemoryContextAlloc(t_thrd.top_mem_cxt, PQ_RECV_BUFFER_SIZE);

    /* both buffers start out empty */
    t_thrd.libpq_cxt.PqRecvLength = 0;
    t_thrd.libpq_cxt.PqRecvPointer = 0;
    t_thrd.libpq_cxt.PqSendStart = 0;
    t_thrd.libpq_cxt.PqSendPointer = 0;

    t_thrd.libpq_cxt.DoingCopyOut = false;
    t_thrd.libpq_cxt.PqCommBusy = false;

    pq_disk_reset_tempfile_contextinfo();
    on_proc_exit(pq_close, 0);
}
/* --------------------------------
 * socket_comm_reset - reset libpq during error recovery
 * (socket implementation installed as the first entry of PqCommSocketMethods)
 *
 * This is called from error recovery at the outer idle loop. It's
 * just to get us out of trouble if we somehow manage to elog() from
 * inside a pqcomm.c routine (which ideally will never happen, but...)
 * --------------------------------
 */
static void socket_comm_reset(void)
{
/* Do not throw away pending data, but do reset the busy flag */
t_thrd.libpq_cxt.PqCommBusy = false;
/*
 * We can abort any old-style COPY OUT, too
 * NOTE(review): the busy flag is cleared before this call; presumably the
 * abort path needs to send data and would be blocked otherwise -- confirm
 * before reordering.
 */
pq_endcopyout(true);
}
/* --------------------------------
* pq_close - shutdown libpq at backend exit
*
* Note: in a standalone backend u_sess->proc_cxt.MyProcPort will be null,
* don't crash during exit...
* --------------------------------
*/
void pq_close(int code, Datum arg)
{
if (t_thrd.postmaster_cxt.KeepSocketOpenForStream) {
return;
}
if (u_sess->proc_cxt.MyProcPort != NULL) {
if (u_sess->proc_cxt.MyProcPort->gss != NULL) {
#if defined(ENABLE_GSS) || defined(ENABLE_SSPI)
#ifdef ENABLE_GSS
OM_uint32 min_s;
/* Shutdown GSSAPI layer */
if (u_sess->proc_cxt.MyProcPort->gss->ctx != GSS_C_NO_CONTEXT) {
gss_delete_sec_context(&min_s, &u_sess->proc_cxt.MyProcPort->gss->ctx, NULL);
}
if (u_sess->proc_cxt.MyProcPort->gss->cred != GSS_C_NO_CREDENTIAL) {
gss_release_cred(&min_s, &u_sess->proc_cxt.MyProcPort->gss->cred);
}
#endif /* ENABLE_GSS */
/* GSS and SSPI share the port->gss struct */
pfree_ext(u_sess->proc_cxt.MyProcPort->gss);
#endif /* ENABLE_GSS || ENABLE_SSPI */
}
/* Cleanly shut down SSL layer */
secure_close(u_sess->proc_cxt.MyProcPort);
/*
* Formerly we did an explicit close() here, but it seems better to
* leave the socket open until the process dies. This allows clients
* to perform a "synchronous close" if they care --- wait till the
* transport layer reports connection closure, and you can be sure the
* backend has exited.
*
* We do set sock to PGINVALID_SOCKET to prevent any further I/O,
* though.
*/
if (u_sess->proc_cxt.MyProcPort && u_sess->proc_cxt.MyProcPort->is_logic_conn) {
gs_close_gsocket(&(u_sess->proc_cxt.MyProcPort->gs_sock));
} else if (u_sess->proc_cxt.MyProcPort) {
if (u_sess->proc_cxt.MyProcPort->sock != PGINVALID_SOCKET) {
closesocket(u_sess->proc_cxt.MyProcPort->sock);
}
u_sess->proc_cxt.MyProcPort->sock = PGINVALID_SOCKET;
}
}
}
/*
* Streams -- wrapper around Unix socket system calls
*
* Stream functions are used for vanilla TCP connection protocol.
*/
/* StreamDoUnlink()
* Shutdown routine for backend connection
* If a Unix socket is used for communication, explicitly close it.
*/
#ifdef HAVE_UNIX_SOCKETS
/*
 * on_proc_exit callback: remove the Unix-domain socket file selected by arg.
 * arg is PSQL_LISTEN_SOCKET for the client socket path or HA_LISTEN_SOCKET
 * for the HA socket path; any other value does nothing.
 */
static void StreamDoUnlink(int code, Datum arg)
{
    const int sock_kind = (int)arg;

    if (sock_kind == PSQL_LISTEN_SOCKET) {
        Assert(t_thrd.libpq_cxt.sock_path[0]);
        unlink(t_thrd.libpq_cxt.sock_path);
        return;
    }
    if (sock_kind == HA_LISTEN_SOCKET) {
        Assert(t_thrd.libpq_cxt.ha_sock_path[0]);
        unlink(t_thrd.libpq_cxt.ha_sock_path);
    }
}
#endif /* HAVE_UNIX_SOCKETS */
/*
 * StreamServerPort -- open a "listening" port to accept connections.
 *
 * Resolves hostName/portNumber (or the Unix socket path) and, for every
 * resulting address, creates, configures, binds and listens on a socket.
 * When SCTP_CN_DN_CONN is set, a companion SCTP socket is opened for each
 * non-Unix, non-IPv6 address and stored in SctpListenSocket[].
 *
 * Successfully opened sockets are added to the ListenSocket[] array,
 * at the first position that isn't PGINVALID_SOCKET.
 *
 * family                 - address family to listen on (AF_UNIX, AF_INET, ...)
 * hostName               - host to resolve, may be NULL
 * portNumber             - TCP port, or the port embedded in the Unix socket name
 * unixSocketName         - directory/name input for the Unix socket path
 * ListenSocket           - array receiving the listening sockets
 * SctpListenSocket       - array receiving the SCTP listening sockets
 * MaxListen              - capacity of both arrays
 * add_localaddr_flag     - if true, record the bound IP in LocalAddrList
 * is_create_psql_sock    - true for the client socket, false for the HA socket
 * is_create_libcomm_sock - if true, remember the fd for the recv flow control thread
 *
 * RETURNS: STATUS_OK if at least one socket was added, else STATUS_ERROR
 */
int StreamServerPort(int family, char* hostName, unsigned short portNumber, const char* unixSocketName,
    pgsocket ListenSocket[], pgsocket SctpListenSocket[], int MaxListen, bool add_localaddr_flag,
    bool is_create_psql_sock, bool is_create_libcomm_sock)
{
#define RETRY_SLEEP_TIME 1000000L
    pgsocket fd = PGINVALID_SOCKET;
    pgsocket fd_sctp = PGINVALID_SOCKET;
    int err;
    int maxconn;
    int ret;
    char portNumberStr[32];
    const char* familyDesc = NULL;
    char familyDescBuf[64];
    char* service = NULL;
    struct addrinfo* addrs = NULL;
    struct addrinfo* addr = NULL;
    struct addrinfo hint;
    int listen_index = 0;
    int sctp_listen_index = 0;
    int added = 0;
    const int tryBindNum = 3;
    int i = 0;
    errno_t rc = EOK;
#if !defined(WIN32) || defined(IPV6_V6ONLY)
    int one = 1;
#endif
    /* Initialize hint structure */
    rc = memset_s(&hint, sizeof(hint), 0, sizeof(hint));
    securec_check(rc, "\0", "\0");
    hint.ai_family = family;
    hint.ai_flags = AI_PASSIVE;
    hint.ai_socktype = SOCK_STREAM;
#ifdef HAVE_UNIX_SOCKETS
    if (family == AF_UNIX) {
        /* Lock_AF_UNIX will also fill in sock_path. */
        if (Lock_AF_UNIX(portNumber, unixSocketName, is_create_psql_sock) != STATUS_OK) {
            return STATUS_ERROR;
        }
        service = (is_create_psql_sock ? t_thrd.libpq_cxt.sock_path : t_thrd.libpq_cxt.ha_sock_path);
    } else
#endif /* HAVE_UNIX_SOCKETS */
    {
        rc = snprintf_s(portNumberStr, sizeof(portNumberStr), sizeof(portNumberStr) - 1, "%hu", portNumber);
        securec_check_ss(rc, "\0", "\0");
        service = portNumberStr;
    }
    ret = pg_getaddrinfo_all(hostName, service, &hint, &addrs);
    if (ret || addrs == NULL) {
        if (hostName != NULL) {
            ereport(LOG,
                (errmsg("could not translate host name \"%s\", service \"%s\" to address: %s",
                    hostName,
                    service,
                    gai_strerror(ret))));
        } else {
            ereport(LOG, (errmsg("could not translate service \"%s\" to address: %s", service, gai_strerror(ret))));
        }
        if (addrs != NULL) {
            pg_freeaddrinfo_all(hint.ai_family, addrs);
        }
        return STATUS_ERROR;
    }
    for (addr = addrs; addr; addr = addr->ai_next) {
        /* init value of fd */
        fd = PGINVALID_SOCKET;
        fd_sctp = PGINVALID_SOCKET;
        if (!IS_AF_UNIX(family) && IS_AF_UNIX(addr->ai_family)) {
            /*
             * Only set up a unix domain socket when they really asked for it.
             * The service/port is different in that case.
             */
            continue;
        }
        /* See if there is still room to add 1 more socket. */
        for (; listen_index < MaxListen; listen_index++) {
            if (ListenSocket[listen_index] == PGINVALID_SOCKET) {
                break;
            }
        }
        if (listen_index >= MaxListen) {
            ereport(LOG, (errmsg("could not bind to all requested addresses: MAXLISTEN (%d) exceeded", MaxListen)));
            break;
        }
        if (SCTP_CN_DN_CONN) {
            for (; sctp_listen_index < MaxListen; sctp_listen_index++) {
                if (SctpListenSocket[sctp_listen_index] == PGINVALID_SOCKET) {
                    break;
                }
            }
            if (sctp_listen_index >= MaxListen) {
                ereport(LOG,
                    (errmsg("sctp could not bind to all requested addresses: MAXLISTEN (%d) exceeded", MaxListen)));
                break;
            }
        }
        /* set up family name for possible error messages */
        switch (addr->ai_family) {
            case AF_INET:
                familyDesc = _("IPv4");
                break;
#ifdef HAVE_IPV6
            case AF_INET6:
                familyDesc = _("IPv6");
                break;
#endif
#ifdef HAVE_UNIX_SOCKETS
            case AF_UNIX:
                familyDesc = _("Unix");
                break;
#endif
            default:
                rc = snprintf_s(familyDescBuf,
                    sizeof(familyDescBuf),
                    sizeof(familyDescBuf) - 1,
                    _("unrecognized address family %d"),
                    addr->ai_family);
                securec_check_ss(rc, "\0", "\0");
                familyDesc = familyDescBuf;
                break;
        }
        if ((fd = socket(addr->ai_family, SOCK_STREAM, 0)) < 0) {
            ereport(LOG,
                (errcode_for_socket_access(),
                    /* translator: %s is IPv4, IPv6, or Unix */
                    errmsg("could not create %s socket: %m", familyDesc)));
            goto errhandle;
        }
        if (!IS_AF_UNIX(addr->ai_family)) {
            if (SCTP_CN_DN_CONN && !(addr->ai_family == AF_INET6)) {
                if ((fd_sctp = socket(addr->ai_family, SOCK_STREAM, IPPROTO_SCTP)) < 0) {
                    ereport(LOG,
                        (errcode_for_socket_access(),
                            /* translator: %s is IPv4, IPv6, or Unix */
                            errmsg("could not create SCTP %s socket: %m", familyDesc)));
                    goto errhandle;
                }
            }
        }
        /*
         * save unix domain sock, thus we will know when
         * rece flow ctrl thread send the libcomm addr to server loop
         */
        if (is_create_libcomm_sock) {
            t_thrd.libpq_cxt.listen_fd_for_recv_flow_ctrl = fd;
        }
#ifdef F_SETFD
        if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1) {
            ereport(LOG, (errcode_for_socket_access(), errmsg("setsockopt(FD_CLOEXEC) failed: %m")));
            goto errhandle;
        }
        if (!IS_AF_UNIX(addr->ai_family)) {
            if (SCTP_CN_DN_CONN && !(addr->ai_family == AF_INET6)) {
                if (fcntl(fd_sctp, F_SETFD, FD_CLOEXEC) == -1) {
                    ereport(LOG, (errcode_for_socket_access(), errmsg("sctp setsockopt(FD_CLOEXEC) failed: %m")));
                    goto errhandle;
                }
            }
        }
#endif /* F_SETFD */
#ifndef WIN32
        /*
         * Without the SO_REUSEADDR flag, a new postmaster can't be started
         * right away after a stop or crash, giving "address already in use"
         * error on TCP ports.
         *
         * On win32, however, this behavior only happens if the
         * SO_EXLUSIVEADDRUSE is set. With SO_REUSEADDR, win32 allows multiple
         * servers to listen on the same address, resulting in unpredictable
         * behavior. With no flags at all, win32 behaves as Unix with
         * SO_REUSEADDR.
         */
        if (!IS_AF_UNIX(addr->ai_family)) {
            if ((setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&one, sizeof(one))) == -1) {
                ereport(LOG, (errcode_for_socket_access(), errmsg("setsockopt(SO_REUSEADDR) failed: %m")));
                goto errhandle;
            }
            if (SCTP_CN_DN_CONN && !(addr->ai_family == AF_INET6)) {
                if ((setsockopt(fd_sctp, SOL_SOCKET, SO_REUSEADDR, (char*)&one, sizeof(one))) == -1) {
                    ereport(LOG, (errcode_for_socket_access(), errmsg("sctp setsockopt(SO_REUSEADDR) failed: %m")));
                    goto errhandle;
                }
            }
        }
#endif
#ifdef IPV6_V6ONLY
        if (addr->ai_family == AF_INET6) {
            if (setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, (char*)&one, sizeof(one)) == -1) {
                ereport(LOG, (errcode_for_socket_access(), errmsg("setsockopt(IPV6_V6ONLY) failed: %m")));
                goto errhandle;
            }
        }
#endif
        /*
         * Note: This might fail on some OS's, like Linux older than
         * 2.4.21-pre3, that don't have the IPV6_V6ONLY socket option, and map
         * ipv4 addresses to ipv6. It will show ::ffff:ipv4 for all ipv4
         * connections. If the bind fails, we sleep for RETRY_SLEEP_TIME
         * microseconds (1 second) and try again, for at most tryBindNum (3)
         * attempts, because the OS may be slow to clean up the address.
         * No retry is made once some address has already been added.
         */
        for (i = 0; i != tryBindNum; ++i) {
            err = bind(fd, addr->ai_addr, addr->ai_addrlen);
            if (err < 0) {
                /* need not retry when a addr is added before */
                if (added != 0) {
                    i = tryBindNum;
                    break;
                }
                ereport(LOG,
                    (errcode_for_socket_access(),
                        /* translator: %s is IPv4, IPv6, or Unix */
                        errmsg("could not bind %s socket at the %d time: %m", familyDesc, i),
                        (IS_AF_UNIX(addr->ai_family))
                            ? errhint("Is another postmaster already running on port %d?"
                                      " If not, remove socket file \"%s\" and retry.",
                                  (int)portNumber, service)
                            : errhint("Port %u is used, run 'netstat -anop|grep %u' or "
                                      "'lsof -i:%u'(need root) to see who is using this port.",
                                  portNumber, portNumber, portNumber)));
                pg_usleep(RETRY_SLEEP_TIME);
                continue;
            } else {
                break;
            }
        }
        /* i reaches tryBindNum only when every attempt failed (or retry was skipped) */
        if (i == tryBindNum) {
            goto errhandle;
        }
        if (!IS_AF_UNIX(addr->ai_family)) {
            if (SCTP_CN_DN_CONN && !(addr->ai_family == AF_INET6)) {
                for (i = 0; i != tryBindNum; ++i) {
                    err = bind(fd_sctp, addr->ai_addr, addr->ai_addrlen);
                    if (err < 0) {
                        /* need not retry when a addr is added before */
                        if (added != 0) {
                            i = tryBindNum;
                            break;
                        }
                        ereport(LOG,
                            (errcode_for_socket_access(),
                                // translator: %s is IPv4, IPv6, or Unix
                                errmsg("sctp could not bind %s socket at the %d time: %m", familyDesc, i),
                                (IS_AF_UNIX(addr->ai_family))
                                    ? errhint("sctp Is another postmaster already running on port %d?"
                                              " If not, remove socket file \"%s\" and retry.",
                                          (int)portNumber, service)
                                    : errhint("sctp Is another postmaster already running on port %d?"
                                              " If not, wait a few seconds and retry.", (int)portNumber)));
                        pg_usleep(RETRY_SLEEP_TIME);
                        continue;
                    } else {
                        break;
                    }
                }
                if (i == tryBindNum) {
                    goto errhandle;
                }
            }
        }
#ifdef HAVE_UNIX_SOCKETS
        if (addr->ai_family == AF_UNIX) {
            if (Setup_AF_UNIX(is_create_psql_sock) != STATUS_OK) {
                goto errhandle;
            }
        }
#endif
        /*
         * Select appropriate accept-queue length limit. PG_SOMAXCONN is only
         * intended to provide a clamp on the request on platforms where an
         * overly large request provokes a kernel error (are there any?).
         */
        maxconn = g_instance.attr.attr_network.MaxConnections * 6;
        maxconn = Max(maxconn, PG_SOMINCONN);
        maxconn = Min(maxconn, PG_SOMAXCONN);
        err = listen(fd, maxconn);
        if (err < 0) {
            ereport(LOG,
                (errcode_for_socket_access(),
                    /* translator: %s is IPv4, IPv6, or Unix */
                    errmsg("could not listen on %s socket: %m", familyDesc)));
            goto errhandle;
        }
        ListenSocket[listen_index] = fd;
        if (!IS_AF_UNIX(addr->ai_family)) {
            if (SCTP_CN_DN_CONN && !(addr->ai_family == AF_INET6)) {
                err = listen(fd_sctp, maxconn);
                if (err < 0) {
                    ereport(LOG,
                        (errcode_for_socket_access(),
                            /* translator: %s is IPv4, IPv6, or Unix */
                            errmsg("sctp could not listen on %s socket: %m", familyDesc)));
                    /*
                     * NOTE(review): fd stays registered in ListenSocket[] here, but
                     * 'added' is not incremented and listen_sock_type is not set for
                     * this slot -- confirm this partial registration is intended.
                     */
                    closesocket(fd_sctp);
                    continue;
                }
                SctpListenSocket[sctp_listen_index] = fd_sctp;
            }
        }
        added++;
        if (add_localaddr_flag == true) {
            struct sockaddr* sinp = NULL;
            char* result = NULL;
            sinp = (struct sockaddr*)(addr->ai_addr);
            /*
             * NOTE(review): LocalIpNum is not checked against the capacity of
             * LocalAddrList (declared elsewhere) -- confirm callers bound it.
             */
            if (addr->ai_family == AF_INET6) {
                /*
                 * An IPv6 address lives in sockaddr_in6::sin6_addr. Reading it
                 * through sockaddr_in::sin_addr would start at the wrong offset
                 * and yield a bogus address string.
                 */
                result = inet_net_ntop(AF_INET6,
                    &((struct sockaddr_in6*)sinp)->sin6_addr,
                    128,
                    t_thrd.postmaster_cxt.LocalAddrList[t_thrd.postmaster_cxt.LocalIpNum],
                    IP_LEN);
            } else if (addr->ai_family == AF_INET) {
                result = inet_net_ntop(AF_INET,
                    &((struct sockaddr_in*)sinp)->sin_addr,
                    32,
                    t_thrd.postmaster_cxt.LocalAddrList[t_thrd.postmaster_cxt.LocalIpNum],
                    IP_LEN);
            }
            if (result == NULL) {
                ereport(WARNING, (errmsg("inet_net_ntop failed, error: %d", EAFNOSUPPORT)));
            } else {
                t_thrd.postmaster_cxt.LocalIpNum++;
            }
        }
        if (is_create_psql_sock) {
            t_thrd.postmaster_cxt.listen_sock_type[listen_index] = PSQL_LISTEN_SOCKET;
        } else {
            t_thrd.postmaster_cxt.listen_sock_type[listen_index] = HA_LISTEN_SOCKET;
        }
        continue;
    errhandle:
        /* release whatever was opened for this address and move on to the next one */
        if (fd != PGINVALID_SOCKET) {
            closesocket(fd);
        }
        if (fd_sctp != PGINVALID_SOCKET) {
            closesocket(fd_sctp);
        }
    }
    pg_freeaddrinfo_all(hint.ai_family, addrs);
    if (!added) {
        return STATUS_ERROR;
    }
    return STATUS_OK;
}
#ifdef HAVE_UNIX_SOCKETS
/*
 * Lock_AF_UNIX -- configure unix socket file path
 *
 * Builds the socket path for the client socket (is_create_psql_sock) or the
 * HA socket into the matching per-thread buffer, takes the socket lock file
 * and removes any stale socket file.
 *
 * RETURNS: STATUS_OK, or STATUS_ERROR if the path is too long
 */
static int Lock_AF_UNIX(unsigned short portNumber, const char* unixSocketName, bool is_create_psql_sock)
{
    /* the path macro is handed the destination array itself, so each buffer gets its own invocation */
    if (is_create_psql_sock) {
        UNIXSOCK_PATH(t_thrd.libpq_cxt.sock_path, portNumber, unixSocketName);
    } else {
        UNIXSOCK_PATH(t_thrd.libpq_cxt.ha_sock_path, portNumber, unixSocketName);
    }
    char* chosen_path = is_create_psql_sock ? t_thrd.libpq_cxt.sock_path : t_thrd.libpq_cxt.ha_sock_path;

    if (strlen(chosen_path) >= UNIXSOCK_PATH_BUFLEN) {
        ereport(LOG,
            (errmsg("Unix-domain socket path \"%s\" is too long (maximum %d bytes)",
                chosen_path,
                (int)(UNIXSOCK_PATH_BUFLEN - 1))));
        return STATUS_ERROR;
    }

    /*
     * Take the interlock file that belongs to the socket file.
     *
     * A separate lock file is used instead of locking the socket itself
     * because it is more portable and because it allows a pre-existing
     * socket file to be removed without race conditions.
     */
    CreateSocketLockFile(chosen_path, true, is_create_psql_sock);

    /*
     * With the interlock held, any left-over socket file can be deleted
     * safely so that bind() does not fail on it.
     */
    unlink(chosen_path);
    return STATUS_OK;
}
/*
* Setup_AF_UNIX -- configure unix socket permissions
*/
static int Setup_AF_UNIX(bool is_create_psql_sock)
{
/* Arrange to unlink the socket file at exit */
on_proc_exit(StreamDoUnlink, is_create_psql_sock ? (Datum)PSQL_LISTEN_SOCKET : (Datum)HA_LISTEN_SOCKET);
const char* sock_path = (is_create_psql_sock ? t_thrd.libpq_cxt.sock_path : t_thrd.libpq_cxt.ha_sock_path);
/*
* Fix socket ownership/permission if requested. Note we must do this
* before we listen() to avoid a window where unwanted connections could
* get accepted.
*/
AssertEreport(g_instance.attr.attr_network.Unix_socket_group, MOD_OPT, "");
if (g_instance.attr.attr_network.Unix_socket_group[0] != '\0') {
#ifdef WIN32
ereport(WARNING, (errmsg("configuration item unix_socket_group is not supported on this platform")));
#else
char* endptr = NULL;
unsigned long val;
gid_t gid;
val = strtoul(g_instance.attr.attr_network.Unix_socket_group, &endptr, 10);
if (*endptr == '\0') { /* numeric group id */
gid = val;
} else { /* convert group name to id */
// use the getgrnam_r to guarantee thread safe
struct group grp;
struct group* grpptr = &grp;
struct group* tmpGrpPtr = NULL;
char grpBuffer[200];
int grpLineLen = sizeof(grpBuffer);
int ret;
if ((ret = getgrnam_r(
g_instance.attr.attr_network.Unix_socket_group, grpptr, grpBuffer, grpLineLen, &tmpGrpPtr)) != 0) {
ereport(LOG, (errmsg("getgrnam_r() error, error num is %d", ret)));
return STATUS_ERROR;
}
if (tmpGrpPtr == NULL) {
ereport(LOG, (errmsg("group \"%s\" does not exist", g_instance.attr.attr_network.Unix_socket_group)));
return STATUS_ERROR;
}
gid = grp.gr_gid;
}
if (chown(sock_path, -1, gid) == -1) {
ereport(LOG, (errcode_for_file_access(), errmsg("could not set group of file \"%s\": %m", sock_path)));
return STATUS_ERROR;
}
#endif
}
if (chmod(sock_path, g_instance.attr.attr_network.Unix_socket_permissions) == -1) {
ereport(LOG, (errcode_for_file_access(), errmsg("could not set permissions of file \"%s\": %m", sock_path)));
return STATUS_ERROR;
}
return STATUS_OK;
}
#endif /* HAVE_UNIX_SOCKETS */
/*
* StreamConnection -- create a new connection with client using
* server port. Set port->sock to the FD of the new connection.
*
* ASSUME: that this doesn't need to be non-blocking because
* the Postmaster uses select() to tell when the server master
* socket is ready for accept().
*
* RETURNS: STATUS_OK or STATUS_ERROR
*/
int StreamConnection(pgsocket server_fd, Port* port)
{
    /* Accept the pending connection; the peer (remote) address is stored in port->raddr. */
    port->raddr.salen = sizeof(port->raddr.addr);
    port->sock = accept4(server_fd, (struct sockaddr*)&port->raddr.addr, &port->raddr.salen, SOCK_CLOEXEC);
    if (port->sock < 0) {
        ereport(LOG, (errcode_for_socket_access(), errmsg("could not accept new connection: %m")));
        /*
         * After a failed accept() the listening socket still looks
         * read-ready to the caller, which will retry immediately. Nap
         * briefly so a persistent failure (most likely exhausted kernel
         * file table slots) does not turn into a busy loop.
         */
        pg_usleep(100000L); /* 0.1 sec */
        return STATUS_ERROR;
    }

#ifdef SCO_ACCEPT_BUG
    /*
     * UnixWare 7+ and OpenServer 5.0.4 are known to leave the family unset;
     * patching it up is harmless on every version of those platforms.
     */
    if (port->raddr.addr.ss_family == 0) {
        port->raddr.addr.ss_family = AF_UNIX;
    }
#endif

    /* Record the server (local) address of the new socket. */
    port->laddr.salen = sizeof(port->laddr.addr);
    if (getsockname(port->sock, (struct sockaddr*)&port->laddr.addr, &port->laddr.salen) < 0) {
        ereport(LOG, (errmsg("getsockname() failed: %m")));
        return STATUS_ERROR;
    }

    /* Unix-domain sockets take none of the options below. */
    if (IS_AF_UNIX(port->laddr.addr.ss_family)) {
        return STATUS_OK;
    }

    /* Network socket: pick NODELAY, KEEPALIVE and SO_RCVTIMEO according to the protocol. */
    int enable = 1;
    int protocol = 0;
    socklen_t protocol_len = sizeof(protocol);

    if (getsockopt(port->sock, SOL_SOCKET, SO_PROTOCOL, &protocol, &protocol_len) < 0) {
        ereport(LOG, (errmsg("getsockopt(SO_PROTOCOL) failed: %m")));
        return STATUS_ERROR;
    }

    if (protocol == IPPROTO_SCTP) {
        if (setsockopt(port->sock, IPPROTO_SCTP, COMM_NO_DELAY, (char*)&enable, sizeof(enable)) < 0) {
            ereport(LOG, (errmsg("setsockopt(COMM_NO_DELAY) failed: %m")));
            return STATUS_ERROR;
        }
    } else {
        if (setsockopt(port->sock, IPPROTO_TCP, TCP_NODELAY, (char*)&enable, sizeof(enable)) < 0) {
            ereport(LOG, (errmsg("setsockopt(TCP_NODELAY) failed: %m")));
            return STATUS_ERROR;
        }
        if (setsockopt(port->sock, SOL_SOCKET, SO_KEEPALIVE, (char*)&enable, sizeof(enable)) < 0) {
            ereport(LOG, (errmsg("setsockopt(SO_KEEPALIVE) failed: %m")));
            return STATUS_ERROR;
        }
        struct timeval recv_timeout = {u_sess->attr.attr_common.tcpRecvTimeout, 0};
        if (setsockopt(port->sock, SOL_SOCKET, SO_RCVTIMEO, &recv_timeout, sizeof(struct timeval)) < 0) {
            ereport(LOG, (errmsg("setsockopt(SO_RCVTIMEO) failed: %m")));
            return STATUS_ERROR;
        }
    }

#ifdef WIN32
    /*
     * This is a Win32 socket optimization. The ideal size is 32k.
     * http://support.microsoft.com/kb/823764/EN-US/
     */
    int sndbuf = PQ_SEND_BUFFER_SIZE * 4;
    if (setsockopt(port->sock, SOL_SOCKET, SO_SNDBUF, (char*)&sndbuf, sizeof(sndbuf)) < 0) {
        ereport(LOG, (errmsg("setsockopt(SO_SNDBUF) failed: %m")));
        return STATUS_ERROR;
    }
#endif

    /*
     * Apply the session's keepalive settings as well. Failures here are
     * deliberately ignored because these options are not universally
     * supported.
     */
    if (protocol == IPPROTO_TCP) {
        (void)pq_setkeepalivesidle(u_sess->attr.attr_common.tcp_keepalives_idle, port);
        (void)pq_setkeepalivesinterval(u_sess->attr.attr_common.tcp_keepalives_interval, port);
        (void)pq_setkeepalivescount(u_sess->attr.attr_common.tcp_keepalives_count, port);
    }

    return STATUS_OK;
}
/*
* StreamClose -- close a client/backend connection
*
* NOTE: this is NOT used to terminate a session; it is just used to release
* the file descriptor in a process that should no longer have the socket
* open. (For example, the postmaster calls this after passing ownership
* of the connection to a child process.) It is expected that someone else
* still has the socket open. So, we only want to close the descriptor,
* we do NOT want to send anything to the far end.
*/
void StreamClose(pgsocket sock)
{
    /* Only release our descriptor; nothing is sent to the peer. */
    (void)closesocket(sock);
}
/*
* TouchSocketFileInternel & TouchSocketFile -- mark socket file as recently accessed
*
* This routine should be called every so often to ensure that the socket
* file has a recent mod date (ordinary operations on sockets usually won't
* change the mod date). That saves it from being removed by
* overenthusiastic /tmp-directory-cleaner daemons. (Another reason we should
* never have put the socket file in /tmp...)
*/
void TouchSocketFileInternel(const char* sock_path)
{
    /*
     * Nothing to touch if no socket file was created. A NULL path is treated
     * the same as an empty one so that a caller without a configured socket
     * cannot crash this best-effort routine.
     */
    if (sock_path == NULL || sock_path[0] == '\0') {
        return;
    }

    /*
     * utime() is POSIX standard, utimes() is a common alternative. If we
     * have neither, there's no way to affect the mod or access time of
     * the socket.
     *
     * Errors are ignored on purpose; there's no point in complaining.
     */
#ifdef HAVE_UTIME
    (void)utime(sock_path, NULL);
#else /* !HAVE_UTIME */
#ifdef HAVE_UTIMES
    (void)utimes(sock_path, NULL);
#endif /* HAVE_UTIMES */
#endif /* HAVE_UTIME */
}
void TouchSocketFile(void)
{
    /* Refresh the timestamps of both socket files, in the same order as before. */
    const char* socket_paths[] = {t_thrd.libpq_cxt.sock_path, t_thrd.libpq_cxt.ha_sock_path};

    for (size_t idx = 0; idx < sizeof(socket_paths) / sizeof(socket_paths[0]); idx++) {
        TouchSocketFileInternel(socket_paths[idx]);
    }
}
/* --------------------------------
* Low-level I/O routines begin here.
*
* These routines communicate with a frontend client across a connection
* already established by the preceding routines.
* --------------------------------
*/
/* --------------------------------
* pq_set_nonblocking - set socket blocking/non-blocking
*
* Sets the socket non-blocking if nonblocking is TRUE, or sets it
* blocking otherwise.
* --------------------------------
*/
static void pq_set_nonblocking(bool nonblocking)
{
    /* Already in the requested mode: nothing to do. */
    if (nonblocking == u_sess->proc_cxt.MyProcPort->noblock) {
        return;
    }

#ifdef WIN32
    pgwin32_noblock = nonblocking ? 1 : 0;
#else
    /*
     * Failures are reported with COMMERROR rather than ERROR: an ERROR would
     * be sent to the client, which might need another mode switch and so
     * recurse without end.
     */
    if (nonblocking && !pg_set_noblock(u_sess->proc_cxt.MyProcPort->sock)) {
        ereport(COMMERROR, (errmsg("could not set socket to non-blocking mode: %m")));
    } else if (!nonblocking && !pg_set_block(u_sess->proc_cxt.MyProcPort->sock)) {
        ereport(COMMERROR, (errmsg("could not set socket to blocking mode: %m")));
    }
#endif

    /* Remember the requested mode (recorded even if the switch above failed). */
    u_sess->proc_cxt.MyProcPort->noblock = nonblocking;
}
/* --------------------------------
* pq_recvbuf - load some bytes into the input buffer
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int pq_recvbuf(void)
{
    /* Compact the buffer so that any unread bytes start at offset 0. */
    if (t_thrd.libpq_cxt.PqRecvPointer > 0) {
        if (t_thrd.libpq_cxt.PqRecvLength > t_thrd.libpq_cxt.PqRecvPointer) {
            /*
             * Still some unread data, left-justify it in the buffer. The
             * destination capacity passed to memmove_s is the real buffer
             * size, so the bounds check inside memmove_s is meaningful.
             */
            errno_t rc = memmove_s(t_thrd.libpq_cxt.PqRecvBuffer,
                PQ_RECV_BUFFER_SIZE,
                t_thrd.libpq_cxt.PqRecvBuffer + t_thrd.libpq_cxt.PqRecvPointer,
                t_thrd.libpq_cxt.PqRecvLength - t_thrd.libpq_cxt.PqRecvPointer);
            securec_check(rc, "\0", "\0");
            t_thrd.libpq_cxt.PqRecvLength -= t_thrd.libpq_cxt.PqRecvPointer;
            t_thrd.libpq_cxt.PqRecvPointer = 0;
        } else {
            t_thrd.libpq_cxt.PqRecvLength = t_thrd.libpq_cxt.PqRecvPointer = 0;
        }
    }

    /* Ensure that we're in blocking mode */
    pq_set_nonblocking(false);

    /* Can fill buffer from PqRecvLength and upwards */
    for (;;) {
        int r;
        int save_errno;
        WaitState oldStatus = pgstat_report_waitstatus(STATE_WAIT_COMM);

        r = secure_read(u_sess->proc_cxt.MyProcPort,
            t_thrd.libpq_cxt.PqRecvBuffer + t_thrd.libpq_cxt.PqRecvLength,
            PQ_RECV_BUFFER_SIZE - t_thrd.libpq_cxt.PqRecvLength);

        /*
         * Capture errno right after the read and put it back once the wait
         * status has been restored, so the checks below always see the
         * read's own error code.
         */
        save_errno = errno;
        (void)pgstat_report_waitstatus(oldStatus);
        errno = save_errno;

        if (r < 0) {
            if (errno == EINTR) {
                continue; /* Ok if interrupted */
            }
            /*
             * Careful: an ereport() that tries to write to the client would
             * cause recursion to here, leading to stack overflow and core
             * dump! This message must go *only* to the postmaster log.
             */
            ereport(COMMERROR,
                (errcode_for_socket_access(), errmsg("could not receive data from client: %s", gs_comm_strerror())));
            return EOF;
        }

        if (r == 0) {
            /*
             * EOF detected. No log message here; the ultimate caller is
             * expected to report it.
             */
            return EOF;
        }

        /* r contains number of bytes read, so just incr length */
        t_thrd.libpq_cxt.PqRecvLength += r;
        return 0;
    }
}
/* --------------------------------
* pq_getbyte - get a single byte from connection, or return EOF
* --------------------------------
*/
int pq_getbyte(void)
{
    /* Refill the receive buffer until at least one unread byte is present. */
    for (;;) {
        if (t_thrd.libpq_cxt.PqRecvPointer < t_thrd.libpq_cxt.PqRecvLength) {
            break;
        }
        if (pq_recvbuf() != 0) {
            return EOF; /* Failed to recv data */
        }
    }

    /* Hand back the byte and consume it. */
    const unsigned char next = (unsigned char)t_thrd.libpq_cxt.PqRecvBuffer[t_thrd.libpq_cxt.PqRecvPointer];
    t_thrd.libpq_cxt.PqRecvPointer++;
    return next;
}
/* --------------------------------
* pq_peekbyte - peek at next byte from connection
*
* Same as pq_getbyte() except we don't advance the pointer.
* --------------------------------
*/
int pq_peekbyte(void)
{
    /* Refill the receive buffer until at least one unread byte is present. */
    for (;;) {
        if (t_thrd.libpq_cxt.PqRecvPointer < t_thrd.libpq_cxt.PqRecvLength) {
            break;
        }
        if (pq_recvbuf() != 0) {
            return EOF; /* Failed to recv data */
        }
    }

    /* Report the byte but leave the read position where it is. */
    const unsigned char next = (unsigned char)t_thrd.libpq_cxt.PqRecvBuffer[t_thrd.libpq_cxt.PqRecvPointer];
    return next;
}
/* --------------------------------
* pq_getbyte_if_available - get a single byte from connection,
* if available
*
* The received byte is stored in *c. Returns 1 if a byte was read,
* 0 if no data was available, or EOF if trouble.
* --------------------------------
*/
int pq_getbyte_if_available(unsigned char* c)
{
    /* Serve from the receive buffer when it still holds unread data. */
    if (t_thrd.libpq_cxt.PqRecvPointer < t_thrd.libpq_cxt.PqRecvLength) {
        *c = t_thrd.libpq_cxt.PqRecvBuffer[t_thrd.libpq_cxt.PqRecvPointer++];
        return 1;
    }

    /* Otherwise try a one-byte read with the socket in non-blocking mode. */
    pq_set_nonblocking(true);
    const int nread = secure_read(u_sess->proc_cxt.MyProcPort, c, 1);

    if (nread > 0) {
        return nread;
    }
    if (nread == 0) {
        /* EOF detected */
        return EOF;
    }

    /*
     * Not an error if no data was available without blocking, or if we were
     * interrupted (though EINTR really shouldn't happen with a non-blocking
     * socket).
     */
    if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) {
        return 0;
    }

    /*
     * Careful: an ereport() that tries to write to the client would
     * cause recursion to here, leading to stack overflow and core
     * dump! This message must go *only* to the postmaster log.
     */
    ereport(COMMERROR,
        (errcode_for_socket_access(), errmsg("could not receive data from client: %s", gs_comm_strerror())));
    return EOF;
}
/* --------------------------------
* pq_getbytes - get a known number of bytes from connection
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
int pq_getbytes(char* s, size_t len)
{
    char* dest = s;
    size_t remaining = len;

    while (remaining > 0) {
        /* Make sure the receive buffer has something to hand out. */
        while (t_thrd.libpq_cxt.PqRecvPointer >= t_thrd.libpq_cxt.PqRecvLength) {
            if (pq_recvbuf() != 0) {
                return EOF; /* Failed to recv data */
            }
        }

        /* Copy as much as is buffered, but no more than the caller still wants. */
        size_t chunk = t_thrd.libpq_cxt.PqRecvLength - t_thrd.libpq_cxt.PqRecvPointer;
        if (chunk > remaining) {
            chunk = remaining;
        }
        errno_t rc = memcpy_s(dest, chunk, t_thrd.libpq_cxt.PqRecvBuffer + t_thrd.libpq_cxt.PqRecvPointer, chunk);
        securec_check(rc, "\0", "\0");

        t_thrd.libpq_cxt.PqRecvPointer += chunk;
        dest += chunk;
        remaining -= chunk;
    }
    return 0;
}
/* --------------------------------
* pq_discardbytes - throw away a known number of bytes
*
* same as pq_getbytes except we do not copy the data to anyplace.
* this is used for resynchronizing after read errors.
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int pq_discardbytes(size_t len)
{
    size_t remaining = len;

    while (remaining > 0) {
        /* Make sure the receive buffer has something to skip over. */
        while (t_thrd.libpq_cxt.PqRecvPointer >= t_thrd.libpq_cxt.PqRecvLength) {
            if (pq_recvbuf() != 0) {
                return EOF; /* Failed to recv data */
            }
        }

        /* Skip whatever is buffered, capped at what is still to be dropped. */
        size_t chunk = t_thrd.libpq_cxt.PqRecvLength - t_thrd.libpq_cxt.PqRecvPointer;
        if (chunk > remaining) {
            chunk = remaining;
        }
        t_thrd.libpq_cxt.PqRecvPointer += chunk;
        remaining -= chunk;
    }
    return 0;
}
/* --------------------------------
* pq_getstring - get a null terminated string from connection
*
* The return value is placed in an expansible StringInfo, which has
* already been initialized by the caller.
*
* This is used only for dealing with old-protocol clients. The idea
* is to produce a StringInfo that looks the same as we would get from
* pq_getmessage() with a newer client; we will then process it with
* pq_getmsgstring. Therefore, no character set conversion is done here,
* even though this is presumably useful only for text.
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
int pq_getstring(StringInfo s)
{
    resetStringInfo(s);

    /* Keep consuming buffered data until the terminating '\0' shows up. */
    for (;;) {
        while (t_thrd.libpq_cxt.PqRecvPointer >= t_thrd.libpq_cxt.PqRecvLength) {
            if (pq_recvbuf() != 0) {
                return EOF; /* Failed to recv data */
            }
        }

        /* Look for the terminator within the unread part of the buffer. */
        int scan = t_thrd.libpq_cxt.PqRecvPointer;
        while (scan < t_thrd.libpq_cxt.PqRecvLength && t_thrd.libpq_cxt.PqRecvBuffer[scan] != '\0') {
            scan++;
        }

        if (scan < t_thrd.libpq_cxt.PqRecvLength) {
            /* Found it: copy up to and including the '\0', then step past it. */
            appendBinaryStringInfo(s,
                t_thrd.libpq_cxt.PqRecvBuffer + t_thrd.libpq_cxt.PqRecvPointer,
                scan - t_thrd.libpq_cxt.PqRecvPointer + 1);
            t_thrd.libpq_cxt.PqRecvPointer = scan + 1;
            return 0;
        }

        /* No terminator yet: take everything buffered and go read more. */
        appendBinaryStringInfo(s,
            t_thrd.libpq_cxt.PqRecvBuffer + t_thrd.libpq_cxt.PqRecvPointer,
            t_thrd.libpq_cxt.PqRecvLength - t_thrd.libpq_cxt.PqRecvPointer);
        t_thrd.libpq_cxt.PqRecvPointer = t_thrd.libpq_cxt.PqRecvLength;
    }
}
/* --------------------------------
* pq_getmessage - get a message with length word from connection
*
* The return value is placed in an expansible StringInfo, which has
* already been initialized by the caller.
* Only the message body is placed in the StringInfo; the length word
* is removed. Also, s->cursor is initialized to zero for convenience
* in scanning the message contents.
*
* If maxlen is greater than zero, it is an upper limit on the length
* of the message we are willing to accept. We abort the connection
* (by returning EOF) if client tries to send more than that.
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
int pq_getmessage(StringInfo s, int maxlen)
{
    int32 wire_len;
    int32 body_len;

    resetStringInfo(s);

    /* Fetch the 4-byte length word that precedes the message body. */
    if (pq_getbytes((char*)&wire_len, 4) == EOF) {
        ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("unexpected EOF within message length word")));
        return EOF;
    }

    /* The length counts itself, so anything below 4 is invalid. */
    body_len = ntohl(wire_len);
    if (body_len < 4 || (maxlen > 0 && body_len > maxlen)) {
        ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid message length")));
        return EOF;
    }

    body_len -= 4; /* discount length itself */
    if (body_len <= 0) {
        /* Empty body: nothing more to read. */
        return 0;
    }

    /*
     * Make room for the body. If the allocation fails, swallow the body
     * before re-throwing so the protocol stream stays in sync.
     */
    PG_TRY();
    {
        enlargeStringInfo(s, body_len);
    }
    PG_CATCH();
    {
        if (pq_discardbytes(body_len) == EOF) {
            ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete message from client")));
        }
        PG_RE_THROW();
    }
    PG_END_TRY();

    /* Read the body itself. */
    if (pq_getbytes(s->data, body_len) == EOF) {
        ereport(COMMERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("incomplete message from client")));
        return EOF;
    }

    /* Record the length and null-terminate per StringInfo convention. */
    s->len = body_len;
    s->data[body_len] = '\0';
    return 0;
}
/* --------------------------------
* pq_putbytes - send bytes to connection (not flushed until pq_flush)
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
int pq_putbytes(const char* s, size_t len)
{
    /* Should only be called by old-style COPY OUT */
    Assert(t_thrd.libpq_cxt.DoingCopyOut);

    /* A reentrant call is silently treated as success. */
    if (t_thrd.libpq_cxt.PqCommBusy) {
        return 0;
    }

    /* Mark the channel busy for the duration of the buffered write. */
    t_thrd.libpq_cxt.PqCommBusy = true;
    const int status = internal_putbytes(s, len);
    t_thrd.libpq_cxt.PqCommBusy = false;

    return status;
}
static int internal_putbytes(const char* s, size_t len)
{
size_t amount;
while (len > 0) {
/* If buffer is full, then flush it out */
if (t_thrd.libpq_cxt.PqSendPointer >= t_thrd.libpq_cxt.PqSendBufferSize) {
if (pq_disk_is_temp_file_enabled()) {
/* create temp file to store the result, it is caller's responsibility
* to close the file done */
if (!t_thrd.libpq_cxt.PqTempFileContextInfo->file_handle) {
pq_disk_create_tempfile();
}
if ((int)pq_disk_write_tempfile(
t_thrd.libpq_cxt.PqSendBuffer, ((size_t)t_thrd.libpq_cxt.PqSendPointer)) == EOF) {
return EOF;
}
t_thrd.libpq_cxt.PqSendPointer = 0;
} else {
StmtRetrySetFileExceededFlag(); /* once flush data to frontend, can not retry this query anymore */
pq_set_nonblocking(false);
if (internal_flush()) {
return EOF;
}
}
}
amount = t_thrd.libpq_cxt.PqSendBufferSize - t_thrd.libpq_cxt.PqSendPointer;
if (amount > len) {
amount = len;
}
errno_t rc = memcpy_s(t_thrd.libpq_cxt.PqSendBuffer + t_thrd.libpq_cxt.PqSendPointer, amount, s, amount);
securec_check(rc, "\0", "\0");
t_thrd.libpq_cxt.PqSendPointer += amount;
s += amount;
len -= amount;
}
return 0;
}
/* --------------------------------
* pq_flush - flush pending output
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int socket_flush(void)
{
int res = 0;
/* No-op if reentrant call */
if (t_thrd.libpq_cxt.PqCommBusy) {
return res;
}
t_thrd.libpq_cxt.PqCommBusy = true;
pq_set_nonblocking(false);
if (t_thrd.libpq_cxt.save_query_result_to_disk &&
(t_thrd.libpq_cxt.PqTempFileContextInfo->file_state == TEMPFILE_FLUSHED)) {
if (!u_sess->wlm_cxt->spill_limit_error) {
MemoryContext oldMemory;
oldMemory = MemoryContextSwitchTo(u_sess->top_mem_cxt);
/*
* read query result from temp file, then flush to client.
* extract remaining data in send buffer to disk,
* in order to send total query result together through temp file
*/
pq_disk_extract_sendbuffer();
res = pq_disk_send_to_frontend();
(void)MemoryContextSwitchTo(oldMemory);
} else {
pq_disk_discard_temp_file();
}
} else {
res = internal_flush();
}
t_thrd.libpq_cxt.PqCommBusy = false;
return res;
}
/* --------------------------------
* internal_flush - flush pending output
*
* Returns 0 if OK (meaning everything was sent, or operation would block
* and the socket is in non-blocking mode), or EOF if trouble.
* --------------------------------
*/
static int internal_flush(void)
{
/* errno of the most recently logged send failure; used below to avoid logging the same error repeatedly */
static THR_LOCAL int last_reported_send_errno = 0;
/* The unsent region of the send buffer is [PqSendStart, PqSendPointer). */
char* bufptr = t_thrd.libpq_cxt.PqSendBuffer + t_thrd.libpq_cxt.PqSendStart;
char* bufend = t_thrd.libpq_cxt.PqSendBuffer + t_thrd.libpq_cxt.PqSendPointer;
/*
 * NOTE(review): the value assigned here is overwritten by both branches of
 * the if/else that follows; what this first call is for cannot be told from
 * this function alone -- confirm against pgstat_report_waitstatus().
 */
WaitState oldStatus = pgstat_report_waitstatus(STATE_WAIT_UNDEFINED, true);
/* Report "flushing data" as the wait status; oldStatus is restored on every exit path below. */
if (StreamThreadAmI() == false) {
oldStatus = pgstat_report_waitstatus(STATE_WAIT_FLUSH_DATA);
} else {
/* Add node name to mark where to flush data for SCTP */
oldStatus = pgstat_report_waitstatus_comm(STATE_WAIT_FLUSH_DATA,
u_sess->proc_cxt.MyProcPort->libcomm_addrinfo->nodeIdx,
-1,
u_sess->stream_cxt.producer_obj->getParentPlanNodeId(),
global_node_definition ? global_node_definition->num_nodes : -1);
}
/* Keep writing until the whole unsent region has gone out. */
while (bufptr < bufend) {
int r;
r = secure_write(u_sess->proc_cxt.MyProcPort, bufptr, bufend - bufptr);
/*
 * A zero-byte write on a stream thread or logic connection is retried
 * (the loop continues) unless a query cancel is pending, in which case
 * the cancel is logged and EOF is returned.
 */
if (unlikely(r == 0 && (StreamThreadAmI() == true || u_sess->proc_cxt.MyProcPort->is_logic_conn))) {
/* Stop the query when a cancel has happened */
if (t_thrd.int_cxt.QueryCancelPending) {
if (t_thrd.storage_cxt.cancel_from_timeout) {
ereport(LOG,
(errcode(ERRCODE_QUERY_CANCELED),
errmsg("canceling statement due to statement timeout"),
ignore_interrupt(true)));
} else {
ereport(LOG,
(errcode(ERRCODE_QUERY_CANCELED),
errmsg("canceling statement due to %s request", IS_PGXC_DATANODE ? "coordinator" : "user"),
ignore_interrupt(true)));
}
(void)pgstat_report_waitstatus(oldStatus);
return EOF;
} else {
continue;
}
}
/* Nothing was written: classify the failure by errno. */
if (unlikely(r <= 0)) {
if (errno == EINTR) {
continue; /* Ok if we were interrupted */
}
/*
* Ok if no data writable without blocking, and the socket is in
* non-blocking mode.
*/
if (errno == EAGAIN || errno == EWOULDBLOCK) {
(void)pgstat_report_waitstatus(oldStatus);
return 0;
}
/*
* Careful: an ereport() that tries to write to the client would
* cause recursion to here, leading to stack overflow and core
* dump! This message must go *only* to the postmaster log.
*
* If a client disconnects while we're in the midst of output, we
* might write quite a bit of data before we get to a safe query
* abort point. So, suppress duplicate log messages.
*/
// if it is stream thread, suppress the error message.
if (errno != last_reported_send_errno && StreamThreadAmI() == false) {
last_reported_send_errno = errno;
ereport(COMMERROR,
(errcode_for_socket_access(),
errmsg("could not send data to client [ Remote IP: %s PORT: %s]. Detail: %m",
u_sess->proc_cxt.MyProcPort->remote_host,
(u_sess->proc_cxt.MyProcPort->remote_port != NULL &&
u_sess->proc_cxt.MyProcPort->remote_port[0] != '\0')
? u_sess->proc_cxt.MyProcPort->remote_port
: "")));
}
/*
* We drop the buffered data anyway so that processing can
* continue, even though we'll probably quit soon. We also set a
* flag that'll cause the next CHECK_FOR_INTERRUPTS to terminate
* the connection.
*/
t_thrd.libpq_cxt.PqSendStart = t_thrd.libpq_cxt.PqSendPointer = 0;
/* Non-stream threads flag a lost client (unless already exiting); stream threads flag a lost stream connection. */
if ((StreamThreadAmI() == false) && (!t_thrd.proc_cxt.proc_exit_inprogress)) {
t_thrd.int_cxt.ClientConnectionLost = 1;
InterruptPending = 1;
} else if (StreamThreadAmI()) {
t_thrd.int_cxt.StreamConnectionLost = 1;
}
(void)pgstat_report_waitstatus(oldStatus);
return EOF;
}
last_reported_send_errno = 0; /* reset after any successful send */
/* Advance past the r bytes just written, both locally and in PqSendStart. */
bufptr += r;
t_thrd.libpq_cxt.PqSendStart += r;
}
/* Everything was sent: mark the send buffer empty. */
t_thrd.libpq_cxt.PqSendStart = t_thrd.libpq_cxt.PqSendPointer = 0;
(void)pgstat_report_waitstatus(oldStatus);
return 0;
}
/* --------------------------------
* pq_flush_if_writable - flush pending output if writable without blocking
*
* Returns 0 if OK, or EOF if trouble.
* --------------------------------
*/
static int socket_flush_if_writable(void)
{
    /* Return at once if nothing is pending, or if this is a reentrant call. */
    if (t_thrd.libpq_cxt.PqSendPointer == t_thrd.libpq_cxt.PqSendStart || t_thrd.libpq_cxt.PqCommBusy) {
        return 0;
    }

    /* Flush with the socket in non-blocking mode so that we never wait. */
    pq_set_nonblocking(true);

    t_thrd.libpq_cxt.PqCommBusy = true;
    const int status = internal_flush();
    t_thrd.libpq_cxt.PqCommBusy = false;

    return status;
}
/* --------------------------------
* pq_flush_timedwait - flush pending output, giving up after a timeout.
* While data is pending, call the existing non-blocking flush function.
* If all data has been flushed (PqSendStart is back to 0), return.
* Otherwise, if at least some bytes were flushed (PqSendStart changed
* across the call), restart the timeout window; if nothing could be
* flushed for the maximum configured timeout, report the failure and
* terminate the process.
* --------------------------------
*/
void pq_flush_timedwait(int timeout)
{
    /*
     * timeout is compared against millisecond counts below; a value <= 0
     * disables the timeout check.
     *
     * sleeptime accumulates the nominal nap time since the last progress;
     * start_time is the wall-clock start of the same window and is used to
     * re-check the real elapsed time before giving up.
     */
    int sleeptime = 0;
    int send_start_before_flush = 0;
    TimestampTz start_time = GetCurrentTimestamp();

    for (;;) {
        /* Check if still some data is pending to be sent */
        if (!pq_is_send_pending()) {
            break;
        }

        send_start_before_flush = t_thrd.libpq_cxt.PqSendStart;
        if (pq_flush_if_writable()) {
            ereport(COMMERROR, (errmsg("could not send data due to connection reset, terminating process")));
            proc_exit(0);
        }

        if (t_thrd.libpq_cxt.PqSendStart == 0) {
            /*
             * Either nothing was flushed or everything was. If nothing is
             * pending any more we are done; otherwise fall through to the
             * timeout handling without resetting the window.
             */
            if (!pq_is_send_pending()) {
                break;
            }
        } else if (send_start_before_flush != t_thrd.libpq_cxt.PqSendStart) {
            /*
             * Some more data has been flushed: restart the timeout window.
             * Both the nominal counter and the wall-clock origin are reset,
             * so the elapsed-time re-check below measures from the last
             * progress rather than from function entry.
             */
            sleeptime = 0;
            start_time = GetCurrentTimestamp();
            pg_usleep(NAPTIME_PER_SEND * 1000);
            continue;
        }

        if (timeout > 0 && sleeptime >= timeout) {
            long secs;
            int usecs;
            TimestampTz stop_time = GetCurrentTimestamp();

            TimestampDifference(start_time, stop_time, &secs, &usecs);
            sleeptime = secs * 1000 + usecs / 1000 + 1;
            /*
             * By checking the delayed time again, it ensures we won't delay
             * less than the specified time if pg_usleep is interrupted by other
             * signals such as SIGHUP.
             */
            if (stop_time < start_time || sleeptime >= timeout) {
                ereport(COMMERROR, (errmsg("could not send data during maximum timeout, terminating process")));
                proc_exit(0);
            }
        }

        pg_usleep(NAPTIME_PER_SEND_RETRY * 1000);
        sleeptime += NAPTIME_PER_SEND_RETRY;
    }
}
/* --------------------------------
* pq_is_send_pending - is there any pending data in the output buffer?
* --------------------------------
*/
static bool socket_is_send_pending(void)
{
    /* Data is pending while the write position is ahead of the already-sent position. */
    return t_thrd.libpq_cxt.PqSendPointer > t_thrd.libpq_cxt.PqSendStart;
}
/* --------------------------------
* Message-level I/O routines begin here.
*
* These routines understand about the old-style COPY OUT protocol.
* --------------------------------
*/
/* --------------------------------
* pq_putmessage - send a normal message (suppressed in COPY OUT mode)
*
* If msgtype is not '\0', it is a message type code to place before
* the message body. If msgtype is '\0', then the message has no type
* code (this is only valid in pre-3.0 protocols).
*
* len is the length of the message body data at *s. In protocol 3.0
* and later, a message length word (equal to len+4 because it counts
* itself too) is inserted by this routine.
*
* All normal messages are suppressed while old-style COPY OUT is in
* progress. (In practice only a few notice messages might get emitted
* then; dropping them is annoying, but at least they will still appear
* in the postmaster log.)
*
* We also suppress messages generated while pqcomm.c is busy. This
* avoids any possibility of messages being inserted within other
* messages. The only known trouble case arises if SIGQUIT occurs
* during a pqcomm.c routine --- quickdie() will try to send a warning
* message, and the most reasonable approach seems to be to drop it.
*
* returns 0 if OK, EOF if trouble
* --------------------------------
*/
static int socket_putmessage(char msgtype, const char* s, size_t len)
{
if (t_thrd.libpq_cxt.DoingCopyOut || t_thrd.libpq_cxt.PqCommBusy) {
return 0;
}
t_thrd.libpq_cxt.PqCommBusy = true;
if (msgtype) {
if (internal_putbytes(&msgtype, 1)) {
goto fail;
}
}
if (PG_PROTOCOL_MAJOR(FrontendProtocol) >= 3) {
uint32 n32;
n32 = htonl((uint32)(len + 4));
if (internal_putbytes((char*)&n32, 4)) {
goto fail;
}
}
if (internal_putbytes(s, len)) {
goto fail;
}
t_thrd.libpq_cxt.PqCommBusy = false;
return 0;
fail:
t_thrd.libpq_cxt.PqCommBusy = false;
return EOF;
}
/* --------------------------------
* pq_putmessage_noblock - like pq_putmessage, but never blocks
*
* If the output buffer is too small to hold the message, the buffer
* is enlarged.
*/
static int socket_putmessage_noblock(char msgtype, const char* s, size_t len)
{
    /* One type byte plus the 4-byte length word precede the body. */
    const size_t header_len = 1 + 4;
    const size_t alloc_limit = (size_t)MaxBuildAllocSize;
    const size_t used = (size_t)t_thrd.libpq_cxt.PqSendPointer;

    /*
     * Ensure we have enough space in the output buffer for the message header
     * as well as the message itself.
     *
     * The capacity check is done in size_t, one subtraction at a time, so a
     * very large len can neither be truncated to int nor wrap around. The
     * buffer is enlarged only if used + header_len + len still fits within
     * MaxBuildAllocSize; otherwise the message is passed on unchanged.
     */
    Assert((unsigned int)t_thrd.libpq_cxt.PqSendPointer <= MaxBuildAllocSize);
    if (used <= alloc_limit && len <= alloc_limit - used && header_len <= alloc_limit - used - len) {
        const int required = (int)(used + header_len + len);
        if (required > t_thrd.libpq_cxt.PqSendBufferSize) {
            t_thrd.libpq_cxt.PqSendBuffer = (char*)repalloc(t_thrd.libpq_cxt.PqSendBuffer, required);
            t_thrd.libpq_cxt.PqSendBufferSize = required;
        }
    }

    return pq_putmessage(msgtype, s, len);
}
/* --------------------------------
* pq_startcopyout - inform libpq that an old-style COPY OUT transfer
* is beginning
* --------------------------------
*/
static void socket_startcopyout(void)
{
/*
 * Raise the old-style COPY OUT flag. While it is set, socket_putmessage()
 * returns without sending anything, and pq_putbytes() asserts that it is set.
 * socket_endcopyout() clears it again.
 */
t_thrd.libpq_cxt.DoingCopyOut = true;
}
/* --------------------------------
* pq_endcopyout - end an old-style COPY OUT transfer
*
* If errorAbort is indicated, we are aborting a COPY OUT due to an error,
* and must send a terminator line. Since a partial data line might have
* been emitted, send a couple of newlines first (the first one could
* get absorbed by a backslash...) Note that old-style COPY OUT does
* not allow binary transfers, so a textual terminator is always correct.
* --------------------------------
*/
static void socket_endcopyout(bool errorAbort)
{
    if (t_thrd.libpq_cxt.DoingCopyOut) {
        /*
         * On an error abort, emit two newlines followed by the textual
         * terminator line; in the non-error case copy.c will already have
         * emitted the terminator.
         */
        if (errorAbort) {
            (void)pq_putbytes("\n\n\\.\n", 5);
        }
        t_thrd.libpq_cxt.DoingCopyOut = false;
    }
}
/* --------------------------------
* pq_select - Wait until we can read data, or timeout.
* Returns true if data has become available for reading, false if timed out
* or interrupted by signal.
* This is based on libpq_select of libpq_walreceiver.cpp.
* --------------------------------
*/
bool pq_select(int timeout_ms)
{
    int nready;

    /* We use poll(2) if available, otherwise select(2) */
#ifdef HAVE_POLL
    struct pollfd pfd;

    pfd.fd = u_sess->proc_cxt.MyProcPort->sock;
    pfd.events = POLLIN | POLLERR;
    pfd.revents = 0;
    nready = poll(&pfd, 1, timeout_ms);
#else  /* !HAVE_POLL */
    fd_set read_set;
    struct timeval tv;
    struct timeval* tv_ptr = NULL; /* NULL means wait without a time limit */

    FD_ZERO(&read_set);
    FD_SET(u_sess->proc_cxt.MyProcPort->sock, &read_set);
    if (timeout_ms >= 0) {
        tv.tv_sec = timeout_ms / 1000;
        tv.tv_usec = (timeout_ms % 1000) * 1000;
        tv_ptr = &tv;
    }
    nready = select(u_sess->proc_cxt.MyProcPort->sock + 1, &read_set, NULL, NULL, tv_ptr);
#endif /* HAVE_POLL */

    /* The socket became ready. */
    if (nready > 0) {
        return true;
    }

    /* Timed out, or the wait was interrupted by a signal. */
    if (nready == 0 || errno == EINTR) {
        return false;
    }

    /* Any other failure is reported as an error. */
    ereport(ERROR, (errcode_for_socket_access(), errmsg("select() failed: %m")));
    return true;
}
/*
* Support for TCP Keepalive parameters
*/
/*
* On Windows, we need to set both idle and interval at the same time.
* We also cannot reset them to the default (setting to zero will
* actually set them to zero, not default), therefore we fall back to
* the out-of-the-box default instead.
*/
#if defined(WIN32) && defined(SIO_KEEPALIVE_VALS)
static int pq_setkeepaliveswin32(Port* port, int idle, int interval)
{
    /*
     * Set the keepalive idle time and interval (both in seconds) on
     * port->sock with a single SIO_KEEPALIVE_VALS request. Values <= 0 are
     * replaced by the defaults below. On success the values actually applied
     * are recorded in the Port.
     *
     * Returns STATUS_OK, or STATUS_ERROR if WSAIoctl() fails.
     */
    struct tcp_keepalive ka;
    DWORD retsize;

    if (idle <= 0) {
        idle = 2 * 60 * 60; /* default = 2 hours */
    }
    if (interval <= 0) {
        interval = 1; /* default = 1 second */
    }

    /* The structure takes milliseconds. */
    ka.onoff = 1;
    ka.keepalivetime = idle * 1000;
    ka.keepaliveinterval = interval * 1000;

    if (WSAIoctl(port->sock, SIO_KEEPALIVE_VALS, (LPVOID)&ka, sizeof(ka), NULL, 0, &retsize, NULL, NULL) != 0) {
        /* WSAGetLastError() returns an int, so print it with %d. */
        ereport(LOG, (errmsg("WSAIoctl(SIO_KEEPALIVE_VALS) failed: %d", WSAGetLastError())));
        return STATUS_ERROR;
    }

    port->keepalives_idle = idle;
    port->keepalives_interval = interval;
    return STATUS_OK;
}
#endif
int pq_getkeepalivesidle(Port* port)
{
#if defined(TCP_KEEPIDLE) || defined(TCP_KEEPALIVE) || defined(WIN32)
    /* Keepalives do not apply without a port or on a Unix-domain socket. */
    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) {
        return 0;
    }

    /* An explicitly set value takes precedence. */
    if (port->keepalives_idle != 0) {
        return port->keepalives_idle;
    }

    /* Use the stored default if there is one, or if there is no socket to ask. */
    if (port->default_keepalives_idle != 0 || port->sock == NO_SOCKET) {
        return port->default_keepalives_idle;
    }

#ifdef WIN32
    /* We can't get the defaults on Windows, so return "don't know" */
    port->default_keepalives_idle = -1;
#else
    ACCEPT_TYPE_ARG3 optlen = sizeof(port->default_keepalives_idle);
#ifdef TCP_KEEPIDLE
    if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE, (char*)&port->default_keepalives_idle, &optlen) < 0) {
        ereport(LOG, (errmsg("getsockopt(TCP_KEEPIDLE) failed: %m")));
        port->default_keepalives_idle = -1; /* don't know */
    }
#else
    if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPALIVE, (char*)&port->default_keepalives_idle, &optlen) < 0) {
        ereport(LOG, (errmsg("getsockopt(TCP_KEEPALIVE) failed: %m")));
        port->default_keepalives_idle = -1; /* don't know */
    }
#endif /* TCP_KEEPIDLE */
#endif /* WIN32 */

    return port->default_keepalives_idle;
#else
    return 0;
#endif
}
int pq_setkeepalivesidle(int idle, Port* port)
{
    /* Nothing to configure without a port or on a Unix-domain socket. */
    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) {
        return STATUS_OK;
    }

#if defined(TCP_KEEPIDLE) || defined(TCP_KEEPALIVE) || defined(SIO_KEEPALIVE_VALS)
    /* The requested value is already in effect. */
    if (port->keepalives_idle == idle) {
        return STATUS_OK;
    }

#ifndef WIN32
    /*
     * Make sure the default is known. If it cannot be determined, a request
     * for the default (idle == 0) is accepted as is; any other value fails.
     */
    if (port->default_keepalives_idle <= 0 && pq_getkeepalivesidle(port) < 0) {
        return (idle == 0) ? STATUS_OK : STATUS_ERROR;
    }

    /* Zero means "use the default". */
    if (idle == 0) {
        idle = port->default_keepalives_idle;
    }

    if (port->sock != NO_SOCKET) {
#ifdef TCP_KEEPIDLE
        if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPIDLE, (char*)&idle, sizeof(idle)) < 0) {
            ereport(LOG, (errmsg("setsockopt(TCP_KEEPIDLE) failed: %m")));
            return STATUS_ERROR;
        }
#else
        if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPALIVE, (char*)&idle, sizeof(idle)) < 0) {
            ereport(LOG, (errmsg("setsockopt(TCP_KEEPALIVE) failed: %m")));
            return STATUS_ERROR;
        }
#endif
        /* Remember what was applied to the socket. */
        port->keepalives_idle = idle;
    }
#else  /* WIN32 */
    return pq_setkeepaliveswin32(port, idle, port->keepalives_interval);
#endif
#else  /* TCP_KEEPIDLE || SIO_KEEPALIVE_VALS */
    if (idle != 0) {
        ereport(LOG, (errmsg("setting the keepalive idle time is not supported")));
        return STATUS_ERROR;
    }
#endif

    return STATUS_OK;
}
/*
 * pq_getkeepalivesinterval
 *      Report the TCP keepalive probe interval that applies to a connection.
 *
 * Returns 0 when port is NULL, when the local address family satisfies
 * IS_AF_UNIX, or when neither TCP_KEEPINTVL nor SIO_KEEPALIVE_VALS is
 * defined.  Otherwise returns port->keepalives_interval if that is
 * non-zero, else the cached default, which is read from the socket on first
 * use and recorded as -1 ("don't know") when it cannot be obtained.
 */
int pq_getkeepalivesinterval(Port* port)
{
#if defined(TCP_KEEPINTVL) || defined(SIO_KEEPALIVE_VALS)
    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) {
        return 0;
    }

    /* An explicitly configured value wins over the default. */
    if (port->keepalives_interval != 0) {
        return port->keepalives_interval;
    }

    /* Default already cached (possibly as -1), or there is no socket to query. */
    if (port->default_keepalives_interval != 0 || port->sock == NO_SOCKET) {
        return port->default_keepalives_interval;
    }

#ifdef WIN32
    /* We can't get the defaults on Windows, so return "don't know" */
    port->default_keepalives_interval = -1;
#else
    ACCEPT_TYPE_ARG3 optlen = sizeof(port->default_keepalives_interval);
    if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL, (char*)&port->default_keepalives_interval, &optlen) < 0) {
        ereport(LOG, (errmsg("getsockopt(TCP_KEEPINTVL) failed: %m")));
        port->default_keepalives_interval = -1; /* don't know */
    }
#endif /* WIN32 */

    return port->default_keepalives_interval;
#else
    return 0;
#endif
}
/*
 * pq_setkeepalivesinterval
 *      Apply a TCP keepalive probe interval to a connection's socket.
 *
 * interval == 0 requests the socket's default value.  Returns STATUS_OK
 * when port is NULL, when the local address family satisfies IS_AF_UNIX,
 * when the requested value is already recorded in
 * port->keepalives_interval, or when the option was applied; returns
 * STATUS_ERROR when the option cannot be set.  On WIN32 the work is
 * delegated to pq_setkeepaliveswin32().  Without TCP_KEEPINTVL or
 * SIO_KEEPALIVE_VALS only interval == 0 succeeds.
 */
int pq_setkeepalivesinterval(int interval, Port* port)
{
    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) {
        return STATUS_OK;
    }

#if defined(TCP_KEEPINTVL) || defined(SIO_KEEPALIVE_VALS)
    /* Nothing to do if the requested value is already recorded. */
    if (port->keepalives_interval == interval) {
        return STATUS_OK;
    }

#ifdef WIN32
    return pq_setkeepaliveswin32(port, port->keepalives_idle, interval);
#else
    /* Make sure the default is cached; bail out if it cannot be determined. */
    if (port->default_keepalives_interval <= 0 && pq_getkeepalivesinterval(port) < 0) {
        /* default is set but unknown: asking for the default is a no-op, anything else fails */
        return (interval == 0) ? STATUS_OK : STATUS_ERROR;
    }

    int new_interval = (interval == 0) ? port->default_keepalives_interval : interval;

    if (port->sock != NO_SOCKET) {
        if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPINTVL, (char*)&new_interval, sizeof(new_interval)) < 0) {
            ereport(LOG, (errmsg("setsockopt(TCP_KEEPINTVL) failed: %m")));
            return STATUS_ERROR;
        }
        /* Remember the value only after the kernel accepted it. */
        port->keepalives_interval = new_interval;
    }
#endif /* WIN32 */
#else
    if (interval != 0) {
        ereport(LOG, (errmsg("setsockopt(TCP_KEEPINTVL) not supported")));
        return STATUS_ERROR;
    }
#endif

    return STATUS_OK;
}
/*
 * pq_getkeepalivescount
 *      Report the TCP keepalive probe count that applies to a connection.
 *
 * Returns 0 when port is NULL, when the local address family satisfies
 * IS_AF_UNIX, or when TCP_KEEPCNT is not defined.  Otherwise returns
 * port->keepalives_count if that is non-zero, else the cached default,
 * which is read from the socket on first use and recorded as -1
 * ("don't know") when it cannot be obtained.
 */
int pq_getkeepalivescount(Port* port)
{
#ifdef TCP_KEEPCNT
    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) {
        return 0;
    }

    /* An explicitly configured value wins over the default. */
    if (port->keepalives_count != 0) {
        return port->keepalives_count;
    }

    /* Default already cached (possibly as -1), or there is no socket to query. */
    if (port->default_keepalives_count != 0 || port->sock == NO_SOCKET) {
        return port->default_keepalives_count;
    }

    ACCEPT_TYPE_ARG3 optlen = sizeof(port->default_keepalives_count);
    if (getsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT, (char*)&port->default_keepalives_count, &optlen) < 0) {
        ereport(LOG, (errmsg("getsockopt(TCP_KEEPCNT) failed: %m")));
        port->default_keepalives_count = -1; /* don't know */
    }

    return port->default_keepalives_count;
#else
    return 0;
#endif
}
/*
 * pq_setkeepalivescount
 *      Apply a TCP keepalive probe count to a connection's socket.
 *
 * count == 0 requests the socket's default value.  Returns STATUS_OK when
 * port is NULL, when the local address family satisfies IS_AF_UNIX, when
 * the requested value is already recorded in port->keepalives_count, or
 * when the option was applied; returns STATUS_ERROR when the option cannot
 * be set.  Without TCP_KEEPCNT only count == 0 succeeds.
 */
int pq_setkeepalivescount(int count, Port* port)
{
    if (port == NULL || IS_AF_UNIX(port->laddr.addr.ss_family)) {
        return STATUS_OK;
    }

#ifdef TCP_KEEPCNT
    /* Nothing to do if the requested value is already recorded. */
    if (port->keepalives_count == count) {
        return STATUS_OK;
    }

    /* Make sure the default is cached; bail out if it cannot be determined. */
    if (port->default_keepalives_count <= 0 && pq_getkeepalivescount(port) < 0) {
        /* default is set but unknown: asking for the default is a no-op, anything else fails */
        return (count == 0) ? STATUS_OK : STATUS_ERROR;
    }

    int new_count = (count == 0) ? port->default_keepalives_count : count;

    if (port->sock != NO_SOCKET) {
        if (setsockopt(port->sock, IPPROTO_TCP, TCP_KEEPCNT, (char*)&new_count, sizeof(new_count)) < 0) {
            ereport(LOG, (errmsg("setsockopt(TCP_KEEPCNT) failed: %m")));
            return STATUS_ERROR;
        }
        /* Remember the value only after the kernel accepted it. */
        port->keepalives_count = new_count;
    }
#else
    if (count != 0) {
        ereport(LOG, (errmsg("setsockopt(TCP_KEEPCNT) not supported")));
        return STATUS_ERROR;
    }
#endif

    return STATUS_OK;
}
/*
 * @Description: rewind both send-buffer cursors in t_thrd.libpq_cxt
 *               (PqSendStart and PqSendPointer) to zero.
 */
void pq_abandon_sendbuffer(void)
{
    t_thrd.libpq_cxt.PqSendStart = 0;
    t_thrd.libpq_cxt.PqSendPointer = 0;
}
/*
 * @Description: rewind the receive-buffer state in t_thrd.libpq_cxt by
 *               setting both PqRecvLength and PqRecvPointer to zero.
 */
void pq_abandon_recvbuffer(void)
{
    t_thrd.libpq_cxt.PqRecvLength = 0;
    t_thrd.libpq_cxt.PqRecvPointer = 0;
}
/*
 * @Description: replace PqRecvBuffer with a newly allocated buffer
 * @in size - byte size of the new buffer
 *
 * The new buffer is allocated from t_thrd.top_mem_cxt.  The contents of the
 * previous buffer are NOT copied over; the receive cursors are reset to
 * zero, so the buffer is empty after this call.
 */
void pq_resize_recvbuffer(int size)
{
#ifdef USE_RETRY_STUB
    elog(LOG,
        "%s %s resize pqrecvbuffer from %d to %d",
        STUB_PRINT_PREFIX,
        STUB_PRINT_PREFIX_TYPE_S,
        t_thrd.libpq_cxt.PqRecvBufferSize,
        size);
#endif

    /*
     * Allocate the replacement before releasing the old buffer, so the old
     * buffer is still in place if MemoryContextAlloc fails.
     */
    char* replacement = (char*)MemoryContextAlloc(t_thrd.top_mem_cxt, size);

    if (t_thrd.libpq_cxt.PqRecvBuffer != NULL) {
        pfree(t_thrd.libpq_cxt.PqRecvBuffer);
    }

    /* Install the new buffer and mark it empty. */
    t_thrd.libpq_cxt.PqRecvBuffer = replacement;
    t_thrd.libpq_cxt.PqRecvBufferSize = size;
    t_thrd.libpq_cxt.PqRecvLength = 0;
    t_thrd.libpq_cxt.PqRecvPointer = 0;
}
/*
* @Description: revert PqRecvBuffer to given data
*/
void pq_revert_recvbuffer(const char* data, int len)
{
if (unlikely(data == NULL || len < 0 || t_thrd.libpq_cxt.PqRecvBufferSize < len)) {
ereport(ERROR,
(errmodule(MOD_CN_RETRY),
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("alert, failed in revert command buffer, invalid params data len %d pq buffer size %d",
len,
t_thrd.libpq_cxt.PqRecvBufferSize)));
}
errno_t rc = memcpy_s(t_thrd.libpq_cxt.PqRecvBuffer, t_thrd.libpq_cxt.PqRecvBufferSize, data, len);
securec_check(rc, "", "");
t_thrd.libpq_cxt.PqRecvPointer = 0;
t_thrd.libpq_cxt.PqRecvLength = len;
}
/*
* @Description: generate crc checking header
* @in - src data and length and sequence number
* @return - int seqnum + int datalength + pg_crc32 crc as char *.
*/
static void pq_disk_generate_checking_header(
const char* src_data, StringInfo dest_data, uint32 data_len, uint32 seq_num)
{
Assert(src_data != NULL);
pq_sendint(dest_data, seq_num, 4);
pq_sendint(dest_data, data_len, 4);
/* Add CRC check. */
pg_crc32 val_crc;
INIT_CRC32(val_crc);
#ifdef USE_ASSERT_CHECKING
COMP_CRC32(val_crc, src_data, data_len);
#endif
FIN_CRC32(val_crc);
pq_sendint(dest_data, val_crc, 4);
appendBinaryStringInfo(dest_data, src_data, data_len);
return;
}
/*
 * @Description: read one data block from file and validate its checking header
 * @in file_handle - LZ4 file to read from; must not be NULL
 * @in src_data    - scratch buffer that receives header + payload; must not be NULL
 * @in dest_data   - destination that receives the payload bytes
 * @in data_len    - payload length requested from the file (header excluded)
 * @in seq_num     - sequence number the block is expected to carry
 * @return - number of payload bytes copied into dest_data, or 0 when fewer
 *           than CRC_HEADER bytes could be read.
 *
 * The block starts with three 4-byte network-order fields: sequence number,
 * payload length and CRC.  A sequence or length mismatch (and, when
 * USE_ASSERT_CHECKING is defined, a CRC mismatch) frees src_data and raises
 * FATAL.
 *
 * NOTE(review): memcpy_s is given read_len as both destination capacity and
 * copy count, so it cannot detect an undersized dest_data; the caller has to
 * provide at least read_len bytes - confirm against callers.
 */
static size_t pq_disk_read_data_block(
    LZ4File* file_handle, char* src_data, char* dest_data, uint32 data_len, uint32 seq_num)
{
    Assert(file_handle != NULL && src_data != NULL);
    errno_t rc = EOK;
    uint32 actual_crc_val;
    uint32 actual_seq_num = 0;
    uint32 actual_msg_len = 0;

    /*
     * src_data keeps pointing at the start of the scratch buffer so it can be
     * freed directly on error; cursor walks over the header fields.
     */
    char* cursor = src_data;

    size_t read_len = LZ4FileRead(file_handle, src_data, data_len + CRC_HEADER);
    if (read_len < CRC_HEADER) {
        return 0;
    }
    /* From here on read_len is the number of payload bytes actually read. */
    read_len -= CRC_HEADER;

    /* Header field 1: sequence number. */
    rc = memcpy_s(&actual_seq_num, sizeof(actual_seq_num), cursor, sizeof(actual_seq_num));
    securec_check(rc, "\0", "\0");
    actual_seq_num = ntohl(actual_seq_num);
    cursor += sizeof(actual_seq_num);
    Assert(actual_seq_num == seq_num);
    if (actual_seq_num != seq_num) {
        pfree_ext(src_data);
        ereport(FATAL,
            (errmodule(MOD_CN_RETRY),
                errcode(ERRCODE_DATA_EXCEPTION),
                errmsg("expected message sequnce is %u, actual message sequence is %u", seq_num, actual_seq_num)));
    }

    /* Header field 2: payload length, which must match what was actually read. */
    rc = memcpy_s(&actual_msg_len, sizeof(actual_msg_len), cursor, sizeof(actual_msg_len));
    securec_check(rc, "\0", "\0");
    actual_msg_len = ntohl(actual_msg_len);
    cursor += sizeof(actual_msg_len);
    Assert(actual_msg_len == read_len);
    if (actual_msg_len != read_len) {
        pfree_ext(src_data);
        /* Report the two values that were compared: header length vs. bytes read. */
        ereport(FATAL,
            (errmodule(MOD_CN_RETRY),
                errcode(ERRCODE_STRING_DATA_LENGTH_MISMATCH),
                errmsg("expected message length is %u, actual message length is %u",
                    actual_msg_len,
                    (uint32)read_len)));
    }

    /* Header field 3: CRC (only verified in assert-enabled builds). */
    rc = memcpy_s(&actual_crc_val, sizeof(actual_crc_val), cursor, sizeof(actual_crc_val));
    securec_check(rc, "\0", "\0");
    actual_crc_val = ntohl(actual_crc_val);
    cursor += sizeof(actual_crc_val);
#ifdef USE_ASSERT_CHECKING
    pg_crc32 valcrc;
    INIT_CRC32(valcrc);
    COMP_CRC32(valcrc, cursor, actual_msg_len);
    FIN_CRC32(valcrc);
    if (!EQ_CRC32(valcrc, actual_crc_val)) {
        pfree_ext(src_data);
        ereport(FATAL,
            (errmodule(MOD_CN_RETRY),
                errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                errmsg("expected crc is %u, actual crc is %u", actual_crc_val, valcrc)));
    }
#endif

    /* cursor now points at the payload; hand it to the caller. */
    errno_t err_rc = memcpy_s(dest_data, read_len, cursor, read_len);
    securec_check(err_rc, "\0", "\0");
    return read_len;
}