/*
 * Copyright (c) 2020 Huawei Technologies Co.,Ltd.
 *
 * openGauss is licensed under Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *
 *          http://license.coscl.org.cn/MulanPSL2
 *
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PSL v2 for more details.
 * -------------------------------------------------------------------------
 *
 * threadpool_controler.cpp
 *     Controller for the thread pool. Class ThreadPoolControler initializes
 *     the thread pool's worker and listener threads, and dispatches new
 *     sessions from the postmaster thread to a suitable thread group.
 *
 * IDENTIFICATION
 *     src/gausskernel/process/threadpool/threadpool_controler.cpp
 *
 * ---------------------------------------------------------------------------------------
 */
#ifdef __USE_NUMA
#include <numa.h>
#endif
#include "postgres.h"
#include "knl/knl_variable.h"

#include "threadpool/threadpool.h"

#include "access/xact.h"
#include "catalog/pg_collation.h"
#include "commands/copy.h"
#include "gssignal/gs_signal.h"
#include "lib/dllist.h"
#include "libpq/ip.h"
#include "libpq/libpq.h"
#include "libpq/pqformat.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "postmaster/postmaster.h"
#include "pgxc/pgxc.h"
#include "storage/pmsignal.h"
#include "tcop/dest.h"
#include "utils/atomic.h"
#include "utils/builtins.h"
#include "utils/formatting.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/palloc.h"
#include "utils/ps_status.h"
#include "executor/executor.h"

#include "communication/commproxy_interface.h"

#ifdef HAVE_POLL_H
#include <poll.h>
#endif

ThreadPoolControler* g_threadPoolControler = NULL;

#define BUFSIZE 128

#define IS_NULL_STR(str) ((str) == NULL || (str)[0] == '\0')
#define INVALID_ATTR_ERROR(detail) \
    ereport(FATAL, (errcode(ERRCODE_OPERATE_INVALID_PARAM), errmsg("Invalid attribute for thread pool."), detail))

ThreadPoolControler::ThreadPoolControler()
{
    m_threadPoolContext = NULL;
    m_sessCtrl = NULL;
    m_groups = NULL;
    m_scheduler = NULL;
    m_groupNum = 1;
    m_threadNum = 0;
    m_maxPoolSize = 0;
}

ThreadPoolControler::~ThreadPoolControler()
{
    delete m_scheduler;
    delete m_sessCtrl;
    MemoryContextDelete(m_threadPoolContext);
    m_threadPoolContext = NULL;
    m_groups = NULL;
    m_sessCtrl = NULL;
}

void ThreadPoolControler::Init(bool enableNumaDistribute)
{
    m_threadPoolContext = AllocSetContextCreate(g_instance.instance_context,
        "ThreadPoolContext",
        ALLOCSET_DEFAULT_MINSIZE,
        ALLOCSET_DEFAULT_INITSIZE,
        ALLOCSET_DEFAULT_MAXSIZE,
        SHARED_CONTEXT);

    AutoContextSwitch memSwitch(m_threadPoolContext);

    m_groups = (ThreadPoolGroup**)palloc(sizeof(ThreadPoolGroup*) * m_groupNum);
    m_sessCtrl = New(CurrentMemoryContext) ThreadPoolSessControl(CurrentMemoryContext);

    bool bindCpu = CheckCpuBind();
    int maxThreadNum = 0;
    int expectThreadNum = 0;
    int maxStreamNum = 0;
    int numaId = 0;
    int cpuNum = 0;
    int *cpuArr = NULL;

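    /*
     * Size each group: when CPU binding is enabled, give a group a share of
     * threads proportional to its NUMA node's share of active CPUs; otherwise
     * split the configured thread and pool sizes evenly across the groups.
     */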
    for (int i = 0; i < m_groupNum; i++) {
        if (bindCpu) {
            while (m_cpuInfo.cpuArrSize[numaId] == 0)
                numaId++;

            /*
             * Invoke numa_set_preferred before starting worker threads so that
             * more of their memory allocations are local to the worker thread.
             */
#ifdef __USE_NUMA
            if (enableNumaDistribute) {
                numa_set_preferred(numaId);
            }
#endif

            Assert(numaId < m_cpuInfo.totalNumaNum);
            expectThreadNum = (int)round(
                (double)m_threadNum * ((double)m_cpuInfo.cpuArrSize[numaId] / (double)m_cpuInfo.activeCpuNum));
            maxThreadNum = (int)round(
                (double)m_maxPoolSize * ((double)m_cpuInfo.cpuArrSize[numaId] / (double)m_cpuInfo.activeCpuNum));
            maxStreamNum = (int)round(
                (double)m_maxPoolSize * ((double)m_cpuInfo.cpuArrSize[numaId] / (double)m_cpuInfo.activeCpuNum));
            cpuNum = m_cpuInfo.cpuArrSize[numaId];
            cpuArr = m_cpuInfo.cpuArr[numaId];

            numaId++;
        } else {
            expectThreadNum = m_threadNum / m_groupNum;
            maxThreadNum = m_maxPoolSize / m_groupNum;
            maxStreamNum = m_maxPoolSize / m_groupNum;
            numaId = -1;
        }

        m_groups[i] = New(CurrentMemoryContext) ThreadPoolGroup(maxThreadNum, expectThreadNum,
            maxStreamNum, i, numaId, cpuNum, cpuArr);
        m_groups[i]->Init(enableNumaDistribute);
    }

    for (int i = 0; i < m_groupNum; i++) {
        m_groups[i]->WaitReady();
    }

#ifdef __USE_NUMA
    if (enableNumaDistribute) {
        /* Set interleave mode for threads other than the workers. */
        numa_set_interleave_mask(numa_all_nodes_ptr);
    }
#endif

    m_scheduler = New(CurrentMemoryContext) ThreadPoolScheduler(m_groupNum, m_groups);
    m_scheduler->StartUp();
}

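/*
 * Collect CPU/NUMA topology and parse the thread_pool_attr GUC. The order
 * matters: binding and topology information must be gathered before the group
 * and thread numbers are derived from it.
 */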
void ThreadPoolControler::SetThreadPoolInfo()
{
    InitCpuInfo();

    GetInstanceBind();

    GetCpuAndNumaNum();

    ParseAttr();

    GetSysCpuInfo();

    SetGroupAndThreadNum();
}

void ThreadPoolControler::GetInstanceBind()
{
    /* Check whether the instance has been attached to some specific CPUs. */
    int ret = pthread_getaffinity_np(PostmasterPid, sizeof(cpu_set_t), &m_cpuset);
    if (ret == 0) {
        return;
    } else {
        errno_t rc = memset_s(&m_cpuset, sizeof(m_cpuset), 0, sizeof(m_cpuset));
        securec_check(rc, "\0", "\0");
    }
}

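/*
 * Parse the thread_pool_attr GUC. As parsed below, the expected form is
 * "thread_num, group_num, (bind_info)", where bind_info starts with one of
 * 'nobind', 'allbind', 'cpubind' or 'nodebind'; for example (illustrative
 * value only): "16, 2, (cpubind: 0-7,16-23)".
 */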
void ThreadPoolControler::ParseAttr()
{
    m_attr.threadNum = DEFAULT_THREAD_POOL_SIZE;
    m_attr.groupNum = DEFAULT_THREAD_POOL_GROUPS;
    m_attr.bindCpu = NULL;

    /* Copy the string and remove spaces. */
    char* attr = TrimStr(g_instance.attr.attr_common.thread_pool_attr);
    if (IS_NULL_STR(attr))
        return;

    char* ptoken = NULL;
    char* psave = NULL;
    const char* pdelimiter = ",";

    /* Get thread num */
    ptoken = TrimStr(strtok_r(attr, pdelimiter, &psave));
    if (!IS_NULL_STR(ptoken))
        m_attr.threadNum = pg_strtoint32(ptoken);

    /* Get group num */
    ptoken = TrimStr(strtok_r(NULL, pdelimiter, &psave));
    if (!IS_NULL_STR(ptoken))
        m_attr.groupNum = pg_strtoint32(ptoken);

    if (m_attr.threadNum < 0 || m_attr.threadNum > MAX_THREAD_POOL_SIZE)
        INVALID_ATTR_ERROR(
            errdetail("Current thread num %d is out of range [%d, %d].", m_attr.threadNum, 0, MAX_THREAD_POOL_SIZE));
    if (m_attr.groupNum < 0 || m_attr.groupNum > MAX_THREAD_POOL_GROUPS)
        INVALID_ATTR_ERROR(
            errdetail("Current group num %d is out of range [%d, %d].", m_attr.groupNum, 0, MAX_THREAD_POOL_GROUPS));

    /* Get CPU bind info */
    m_attr.bindCpu = TrimStr(psave);
    ParseBindCpu();
}

void ThreadPoolControler::ParseBindCpu()
{
    if (IS_NULL_STR(m_attr.bindCpu)) {
        m_cpuInfo.bindType = NO_CPU_BIND;
        return;
    }

    char* scpu = pstrdup(m_attr.bindCpu);
    char* ptoken = NULL;
    char* psave = NULL;
    const char* pdelimiter = ":";
    int bindNum = 0;

    if (scpu[0] != '(' || scpu[strlen(scpu) - 1] != ')')
        INVALID_ATTR_ERROR(errdetail("Use '(' ')' to indicate cpu bind info."));

    scpu++;
    scpu[strlen(scpu) - 1] = '\0';

    ptoken = TrimStr(strtok_r(scpu, pdelimiter, &psave));
    ptoken = pg_strtolower(ptoken);

    if (strncmp("nobind", ptoken, strlen("nobind")) == 0) {
        m_cpuInfo.bindType = NO_CPU_BIND;
        return;
    } else if (strncmp("allbind", ptoken, strlen("allbind")) == 0) {
        m_cpuInfo.bindType = ALL_CPU_BIND;
        return;
    } else if (strncmp("cpubind", ptoken, strlen("cpubind")) == 0) {
        m_cpuInfo.bindType = CPU_BIND;
        m_cpuInfo.isBindCpuArr = (bool*)palloc0(sizeof(bool) * m_cpuInfo.totalCpuNum);
        bindNum = ParseRangeStr(psave, m_cpuInfo.isBindCpuArr, m_cpuInfo.totalCpuNum, "cpubind");
    } else if (strncmp("nodebind", ptoken, strlen("nodebind")) == 0) {
        m_cpuInfo.bindType = NODE_BIND;
        m_cpuInfo.isBindNumaArr = (bool*)palloc0(sizeof(bool) * m_cpuInfo.totalNumaNum);
        bindNum = ParseRangeStr(psave, m_cpuInfo.isBindNumaArr, m_cpuInfo.totalNumaNum, "nodebind");
    } else {
        INVALID_ATTR_ERROR(errdetail("Only 'nobind', 'allbind', 'cpubind', and 'nodebind' are valid attributes."));
    }

    if (bindNum == 0)
        INVALID_ATTR_ERROR(
            errdetail("Can not find valid CPU for thread binding, there are two possible reasons:\n"
                "1. These CPUs are not active, use lscpu to check the On-line CPU(s) list.\n"
                "2. The process has been bound to other CPUs and there is no intersection, "
                "use taskset -pc to check the process CPU bind info.\n"));
}

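/*
 * Parse a comma-separated list of ids and dash ranges (e.g. "0-3,7") and mark
 * the corresponding entries of 'arr'. Returns the number of entries newly
 * marked; ids beyond totalNum raise a FATAL error.
 */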
int ThreadPoolControler::ParseRangeStr(char* attr, bool* arr, int totalNum, char* bindtype)
{
    char* ptoken = NULL;
    char* psave = NULL;
    const char* pdelimiter = ",";
    int retNum = 0;

    ptoken = TrimStr(strtok_r(attr, pdelimiter, &psave));

    while (!IS_NULL_STR(ptoken)) {
        char* pt = NULL;
        char* ps = NULL;
        const char* pd = "-";
        int startid = -1;
        int endid = -1;

        pt = TrimStr(strtok_r(ptoken, pd, &ps));
        if (!IS_NULL_STR(pt))
            startid = pg_strtoint32(pt);
        if (!IS_NULL_STR(ps))
            endid = pg_strtoint32(ps);

        if (startid < 0 && endid < 0)
            INVALID_ATTR_ERROR(errdetail("Can not parse attribute %s", pt));
        if (startid >= totalNum)
            INVALID_ATTR_ERROR(
                errdetail("The %s attribute %d is out of valid range [%d, %d]", bindtype, startid, 0, totalNum - 1));
        if (endid >= totalNum)
            INVALID_ATTR_ERROR(
                errdetail("The %s attribute %d is out of valid range [%d, %d]", bindtype, endid, 0, totalNum - 1));

        if (endid == -1) {
            retNum += arr[startid] ? 0 : 1;
            arr[startid] = true;
        } else {
            if (startid > endid) {
                int tmpid = startid;
                startid = endid;
                endid = tmpid;
            }

            for (int i = startid; i <= endid; i++) {
                /* Count only entries that were not already marked. */
                retNum += arr[i] ? 0 : 1;
                arr[i] = true;
            }
        }

        ptoken = TrimStr(strtok_r(NULL, pdelimiter, &psave));
    }

    return retNum;
}

void ThreadPoolControler::GetMcsCpuInfo()
{
    FILE* fp = NULL;
    char buf[BUFSIZE];

    m_cpuInfo.isMcsCpuArr = (bool*)palloc0(sizeof(bool) * m_cpuInfo.totalCpuNum);

    /*
     * When the database is deployed on MCS, we need to read cpuset.cpus to find
     * the CPUs available to this MCS. If we cannot read this file, we assume all
     * CPUs are available.
     */
    fp = fopen("/sys/fs/cgroup/cpuset/cpuset.cpus", "r");
    if (fp == NULL) {
        ereport(WARNING, (errcode(ERRCODE_OPERATE_INVALID_PARAM),
            errmsg("Failed to open file /sys/fs/cgroup/cpuset/cpuset.cpus")));
        errno_t rc = memset_s(m_cpuInfo.isMcsCpuArr, m_cpuInfo.totalCpuNum, 1, m_cpuInfo.totalCpuNum);
        securec_check(rc, "", "");
        return;
    }

    if (fgets(buf, BUFSIZE, fp) == NULL) {
        ereport(WARNING, (errcode(ERRCODE_OPERATE_INVALID_PARAM),
            errmsg("Failed to read file /sys/fs/cgroup/cpuset/cpuset.cpus")));
        errno_t rc = memset_s(m_cpuInfo.isMcsCpuArr, m_cpuInfo.totalCpuNum, 1, m_cpuInfo.totalCpuNum);
        securec_check(rc, "", "");
        fclose(fp);
        return;
    }

    int mcsNum = ParseRangeStr(buf, m_cpuInfo.isMcsCpuArr, m_cpuInfo.totalCpuNum, "Mcs Cpu set");
    if (mcsNum == 0) {
        ereport(WARNING, (errcode(ERRCODE_OPERATE_INVALID_PARAM),
            errmsg("No available CPUs in /sys/fs/cgroup/cpuset/cpuset.cpus")));
        errno_t rc = memset_s(m_cpuInfo.isMcsCpuArr, m_cpuInfo.totalCpuNum, 1, m_cpuInfo.totalCpuNum);
        securec_check(rc, "", "");
    }
    fclose(fp);
}

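/*
 * Build the per-NUMA-node arrays of usable CPU ids by parsing the output of
 * 'lscpu -b -e=cpu,node' and filtering each CPU through IsActiveCpu().
 */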
void ThreadPoolControler::GetSysCpuInfo()
{
    FILE* fp = NULL;
    char buf[BUFSIZE];

    if (m_cpuInfo.totalNumaNum == 0 || m_cpuInfo.totalCpuNum == 0) {
        ereport(WARNING, (errmsg("Failed to read CPU num or NUMA num.")));
        return;
    }

    GetMcsCpuInfo();

    m_cpuInfo.cpuArr = (int**)palloc0(sizeof(int*) * m_cpuInfo.totalNumaNum);
    m_cpuInfo.cpuArrSize = (int*)palloc0(sizeof(int) * m_cpuInfo.totalNumaNum);
    int cpu_per_numa = m_cpuInfo.totalCpuNum / m_cpuInfo.totalNumaNum;
    for (int i = 0; i < m_cpuInfo.totalNumaNum; i++)
        m_cpuInfo.cpuArr[i] = (int*)palloc0(sizeof(int) * cpu_per_numa);

    /* Use lscpu to get the active CPU info. */
    fp = popen("lscpu -b -e=cpu,node", "r");

    if (fp == NULL) {
        ereport(WARNING, (errmsg("Unable to use 'lscpu' to read CPU info.")));
        return;
    }

    char* ptoken = NULL;
    char* psave = NULL;
    const char* pdelimiter = " ";
    int cpuid = 0;
    int numaid = 0;
    /* Read the header line first, then one "cpu node" pair per line. */
    if (fgets(buf, sizeof(buf), fp) != NULL) {
        m_cpuInfo.activeCpuNum = 0;
        while (fgets(buf, sizeof(buf), fp) != NULL) {
            ptoken = strtok_r(buf, pdelimiter, &psave);
            if (!IS_NULL_STR(ptoken))
                cpuid = pg_strtoint32(ptoken);

            ptoken = strtok_r(NULL, pdelimiter, &psave);
            if (!IS_NULL_STR(ptoken))
                numaid = pg_strtoint32(ptoken);

            if (IsActiveCpu(cpuid, numaid)) {
                m_cpuInfo.cpuArr[numaid][m_cpuInfo.cpuArrSize[numaid]] = cpuid;
                m_cpuInfo.cpuArrSize[numaid]++;
                m_cpuInfo.activeCpuNum++;
            }
        }
    }

    for (int i = 0; i < m_cpuInfo.totalNumaNum; i++) {
        if (m_cpuInfo.cpuArrSize[i] > 0)
            m_cpuInfo.activeNumaNum++;
    }

    pclose(fp);
}

void ThreadPoolControler::InitCpuInfo()
{
    m_cpuInfo.totalCpuNum = 0;
    m_cpuInfo.activeCpuNum = 0;
    m_cpuInfo.totalNumaNum = 0;
    m_cpuInfo.activeNumaNum = 0;
    m_cpuInfo.cpuArrSize = NULL;
    m_cpuInfo.cpuArr = NULL;

    m_cpuInfo.bindType = NO_CPU_BIND;
    m_cpuInfo.isBindCpuArr = NULL;
    m_cpuInfo.isBindNumaArr = NULL;
    m_cpuInfo.isMcsCpuArr = NULL;
}

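/*
 * Read the total CPU and NUMA node counts from 'lscpu' output, looking for
 * the "CPU(s)" and "NUMA node(s)" lines.
 */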
void ThreadPoolControler::GetCpuAndNumaNum()
{
    char buf[BUFSIZE];

    FILE* fp = NULL;

    if ((fp = popen("LANG=en_US.UTF-8;lscpu", "r")) != NULL) {
        while (fgets(buf, sizeof(buf), fp) != NULL) {
            if (strncmp("CPU(s)", buf, strlen("CPU(s)")) == 0 &&
                strncmp("On-line CPU(s) list", buf, strlen("On-line CPU(s) list")) != 0 &&
                strncmp("NUMA node", buf, strlen("NUMA node")) != 0) {
                char* loc = strchr(buf, ':');
                m_cpuInfo.totalCpuNum = pg_strtoint32(loc + 1);
            } else if (strncmp("NUMA node(s)", buf, strlen("NUMA node(s)")) == 0) {
                char* loc = strchr(buf, ':');
                m_cpuInfo.totalNumaNum = pg_strtoint32(loc + 1);
            }
        }
        pclose(fp);
    }
}

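/*
 * A CPU counts as active when it is allowed by the MCS cpuset, by the process
 * affinity mask, and by the configured bind type (CPU or NUMA node binding,
 * if any).
 */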
bool ThreadPoolControler::IsActiveCpu(int cpuid, int numaid)
{
    switch (m_cpuInfo.bindType) {
        case NO_CPU_BIND:
        case ALL_CPU_BIND:
            return (m_cpuInfo.isMcsCpuArr[cpuid] && CPU_ISSET(cpuid, &m_cpuset));
        case NODE_BIND:
            return (m_cpuInfo.isBindNumaArr[numaid] && m_cpuInfo.isMcsCpuArr[cpuid] && CPU_ISSET(cpuid, &m_cpuset));
        case CPU_BIND:
            return (m_cpuInfo.isBindCpuArr[cpuid] && m_cpuInfo.isMcsCpuArr[cpuid] && CPU_ISSET(cpuid, &m_cpuset));
    }
    return false;
}

bool ThreadPoolControler::CheckCpuBind() const
{
    if (m_cpuInfo.bindType == NO_CPU_BIND)
        return false;
    if (m_groupNum != m_cpuInfo.activeNumaNum) {
        ereport(WARNING,
            (errmsg("Can not bind worker thread to CPU because the "
                    "thread group num must equal the active NUMA num.")));
        return false;
    }
    if (m_cpuInfo.activeCpuNum == 0 || m_cpuInfo.cpuArr == NULL) {
        ereport(WARNING, (errmsg("Can not bind worker thread to CPU because there are no valid CPUs.")));
        return false;
    }

    return true;
}

bool ThreadPoolControler::CheckNumaDistribute(int numaNodeNum) const
{
    if (m_cpuInfo.bindType == NO_CPU_BIND) {
        ereport(WARNING,
            (errmsg("allbind should be used to replace nobind in thread_pool_attr when NUMA is activated.")));
        return false;
    }

    if (!CheckCpuBind()) {
        return false;
    }

    if (m_cpuInfo.totalNumaNum != numaNodeNum || !m_cpuInfo.cpuArrSize) {
        ereport(WARNING,
            (errmsg("Can not activate NUMA distribute because no multiple NUMA nodes or CPUs are available.")));
        return false;
    }

    for (int i = 0; i < m_cpuInfo.totalNumaNum; ++i) {
        if (m_cpuInfo.cpuArrSize[i] <= 0) {
            ereport(WARNING,
                (errmsg("Can not activate NUMA distribute because no available cpu in node %d.", i)));
            return false;
        }
    }

    return true;
}

CPUBindType ThreadPoolControler::GetCpuBindType() const
{
    return m_cpuInfo.bindType;
}

void ThreadPoolControler::SetGroupAndThreadNum()
{
    if (m_attr.groupNum == 0) {
        if (m_cpuInfo.totalNumaNum > 0)
            m_groupNum = m_cpuInfo.activeNumaNum;
        else
            m_groupNum = DEFAULT_THREAD_POOL_GROUPS;
    } else {
        m_groupNum = m_attr.groupNum;
    }

    if (m_attr.threadNum == 0) {
        if (m_cpuInfo.activeCpuNum > 0)
            m_threadNum = m_cpuInfo.activeCpuNum * THREAD_CORE_RATIO;
        else
            m_threadNum = DEFAULT_THREAD_POOL_SIZE;
    } else {
        m_threadNum = m_attr.threadNum;
    }

    ConstrainThreadNum();
}

void ThreadPoolControler::ConstrainThreadNum()
{
    /* The thread pool size should not be larger than max_connections. */
    if (MAX_THREAD_POOL_SIZE > g_instance.attr.attr_network.MaxConnections) {
        ereport(LOG, (errcode(ERRCODE_OPERATE_INVALID_PARAM),
            errmsg("Max thread pool size %d should not be larger than max_connections %d, "
                   "so reduce max thread pool size to max_connections",
                MAX_THREAD_POOL_SIZE, g_instance.attr.attr_network.MaxConnections)));
    }

    m_maxPoolSize = Min(MAX_THREAD_POOL_SIZE, g_instance.attr.attr_network.MaxConnections);
    m_threadNum = Min(m_threadNum, m_maxPoolSize);
}

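/* Note: returns the maximum pool size (m_maxPoolSize) rather than m_threadNum. */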
int ThreadPoolControler::GetThreadNum()
{
    return m_maxPoolSize;
}

ThreadPoolStat* ThreadPoolControler::GetThreadPoolStat(uint32* num)
{
    ThreadPoolStat* result = (ThreadPoolStat*)palloc(m_groupNum * sizeof(ThreadPoolStat));
    int i;

    for (i = 0; i < m_groupNum; i++) {
        m_groups[i]->GetThreadPoolGroupStat(&result[i]);
    }

    *num = m_groupNum;
    return result;
}

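/*
 * Close all pooled sessions: mark every session as closed, cancel running
 * backends, ask each group's listener to shut sessions down, then poll until
 * every group reports that its sessions are closed.
 */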
void ThreadPoolControler::CloseAllSessions()
{
    m_sessCtrl->MarkAllSessionClose();
    (void)SignalCancelAllBackEnd();

    for (int i = 0; i < m_groupNum; i++) {
        m_groups[i]->GetListener()->SendShutDown();
    }

    /* Check until all groups have closed their sessions. */
    bool allclose = false;
    while (!allclose) {
        if (m_sessCtrl->IsActiveListEmpty()) {
            break;
        }

        allclose = true;
        for (int i = 0; i < m_groupNum; i++) {
            allclose = (m_groups[i]->AllSessionClosed() && allclose);
        }
        pg_usleep(100);
    }
}

void ThreadPoolControler::ShutDownThreads(bool forceWait)
{
    for (int i = 0; i < m_groupNum; i++) {
        m_groups[i]->ShutDownThreads();
    }

    if (forceWait) {
        /* Check until all groups have shut down their workers. */
        bool allshut = false;
        while (!allshut) {
            allshut = true;
            for (int i = 0; i < m_groupNum; i++) {
                allshut = (m_groups[i]->AllThreadShutDown() && allshut);
            }
            pg_usleep(100);
        }
    }
}

void ThreadPoolControler::ShutDownListeners(bool forceWait)
{
    for (int i = 0; i < m_groupNum; i++) {
        m_groups[i]->GetListener()->ShutDown();
    }
    if (forceWait) {
        bool allshut = false;
        while (!allshut) {
            allshut = true;
            for (int i = 0; i < m_groupNum; i++) {
                allshut = (m_groups[i]->GetListener()->GetThreadId() == 0) && allshut;
            }
            pg_usleep(100);
        }
    }
}

void ThreadPoolControler::ShutDownScheduler(bool forceWait)
{
    m_scheduler->ShutDown();
    if (forceWait) {
        bool allshut = false;
        while (!allshut) {
            allshut = m_scheduler->HasShutDown();
            pg_usleep(100);
        }
    }
}

void ThreadPoolControler::AddWorkerIfNecessary()
{
    for (int i = 0; i < m_groupNum; i++)
        m_groups[i]->AddWorkerIfNecessary();
    if (m_scheduler->HasShutDown()) {
        m_scheduler->StartUp();
        m_scheduler->SetShutDown(false);
    }
}

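/* Pick the group with the lowest sessions-per-thread ratio. */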
ThreadPoolGroup* ThreadPoolControler::FindThreadGroupWithLeastSession()
{
    int idx = 0;
    float4 least_session = 0.0;
    float4 session_per_thread = 0.0;

    least_session = m_groups[0]->GetSessionPerThread();
    for (int i = 1; i < m_groupNum; i++) {
        session_per_thread = m_groups[i]->GetSessionPerThread();
        if (session_per_thread < least_session) {
            least_session = session_per_thread;
            idx = i;
        }
    }

    return m_groups[idx];
}

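/* Return true while the number of active sessions is below the worker thread count. */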
bool ThreadPoolControler::StayInAttachMode()
{
    return m_sessCtrl->GetActiveSessionCount() < m_threadNum;
}

int ThreadPoolControler::DispatchSession(Port* port)
{
    ThreadPoolGroup* grp = NULL;
    knl_session_context* sc = NULL;

    /*
     * In comm_proxy mode, each accepted fd is bound to a fixed communicator thread in one NUMA group,
     * so we no longer distribute it with the old "group with least sessions" method;
     * we simply return the communicator's NUMA group.
     *
     * Note: We assume that connected user sessions are distributed to communicators with roughly equal
     * probability; fortunately, it looks like Euler OS can guarantee this (proved),
     * otherwise we need to revisit it.
     *
     * This is a performance optimization for comm_proxy when the thread pool's m_groupNum equals the
     * number of comm_proxy NUMA groups.
     */
    if (AmIProxyModeSockfd(port->sock) && m_groupNum == g_comm_proxy_config.s_numa_num) {
        CommSockDesc* comm_sock = g_comm_controller->FdGetCommSockDesc(port->sock);
        grp = m_groups[comm_sock->m_group_id];
    } else {
        grp = FindThreadGroupWithLeastSession();
    }

    if (grp == NULL) {
        Assert(false);
        return STATUS_ERROR;
    }
    /* If this group is hung, do not accept new sessions. */
    if (grp->IsGroupHanged()) {
        ereport(WARNING,
            (errmodule(MOD_THREAD_POOL),
                errmsg("Group[%d] is too busy to add new session for now.", grp->GetGroupId())));
        return STATUS_ERROR;
    }

    sc = m_sessCtrl->CreateSession(port);
    if (sc == NULL)
        return STATUS_ERROR;

    grp->GetListener()->AddNewSession(sc);
    return STATUS_OK;
}

/*
 * Bind the specified thread to all the available CPUs.
 * This is invoked by auxiliary threads, such as the WAL sender.
 */
void ThreadPoolControler::BindThreadToAllAvailCpu(ThreadId thread) const
{
    if (!CheckCpuBind()) {
        return;
    }

    if (m_cpuInfo.bindType == ALL_CPU_BIND) {
        return;
    }

    cpu_set_t availCpuSet;
    CPU_ZERO(&availCpuSet);
    for (int numaNo = 0; numaNo < m_cpuInfo.totalNumaNum; ++numaNo) {
        int cpuNumber = m_cpuInfo.cpuArrSize[numaNo];
        for (int i = 0; i < cpuNumber; ++i) {
            CPU_SET(m_cpuInfo.cpuArr[numaNo][i], &availCpuSet);
        }
    }
    int ret = pthread_setaffinity_np(thread, sizeof(cpu_set_t), &availCpuSet);
    if (ret != 0)
        ereport(WARNING, (errmsg("BindThreadToAllAvailCpu fail to bind thread %lu, errno: %d", thread, ret)));
}