/**
 * Copyright (c) 2021 OceanBase
 * OceanBase CE is licensed under Mulan PubL v2.
 * You can use this software according to the terms and conditions of the Mulan PubL v2.
 * You may obtain a copy of Mulan PubL v2 at:
 *          http://license.coscl.org.cn/MulanPubL-2.0
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
 * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
 * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
 * See the Mulan PubL v2 for more details.
 */

#include "lib/oblog/ob_trace_log.h"
#include "common/ob_queue_thread.h"
#include "lib/utility/utility.h"

namespace oceanbase
{
namespace common
{
ObCond::ObCond(const int64_t spin_wait_num)
    : spin_wait_num_(spin_wait_num),
      bcond_(false),
      last_waked_time_(0)
{
  pthread_mutex_init(&mutex_, NULL);
  if (0 != pthread_cond_init(&cond_, NULL)) {
    _OB_LOG_RET(ERROR, common::OB_ERR_SYS, "pthread_cond_init failed");
  }
}

ObCond::~ObCond()
{
  pthread_mutex_destroy(&mutex_);
  pthread_cond_destroy(&cond_);
}

void ObCond::signal()
{
  // Fast path: only grab the mutex and signal when this call flips bcond_ from false to true.
  if (false == ATOMIC_CAS(&bcond_, false, true)) {
    __sync_synchronize();
    (void)pthread_mutex_lock(&mutex_);
    (void)pthread_cond_signal(&cond_);
    (void)pthread_mutex_unlock(&mutex_);
  }
}

int ObCond::wait()
{
  int ret = OB_SUCCESS;
  bool need_wait = true;
  if ((last_waked_time_ + BUSY_INTERVAL) > ::oceanbase::common::ObTimeUtility::current_time()) {
    //return OB_SUCCESS;
    // Waked recently: spin for a while before falling back to a blocking wait.
    for (int64_t i = 0; i < spin_wait_num_; i++) {
      if (true == ATOMIC_CAS(&bcond_, true, false)) {
        need_wait = false;
        break;
      }
      PAUSE();
    }
  }
  if (need_wait) {
    pthread_mutex_lock(&mutex_);
    while (OB_SUCC(ret) && false == ATOMIC_CAS(&bcond_, true, false)) {
      int tmp_ret = ob_pthread_cond_wait(&cond_, &mutex_);
      if (ETIMEDOUT == tmp_ret) {
        ret = OB_TIMEOUT;
        break;
      }
    }
    pthread_mutex_unlock(&mutex_);
  }
  if (OB_SUCC(ret)) {
    last_waked_time_ = ::oceanbase::common::ObTimeUtility::current_time();
  }
  return ret;
}

int ObCond::timedwait(const int64_t time_us)
{
  int ret = OB_SUCCESS;
  bool need_wait = true;
  if ((last_waked_time_ + BUSY_INTERVAL) > ::oceanbase::common::ObTimeUtility::current_time()) {
    for (int64_t i = 0; i < spin_wait_num_; i++) {
      if (true == ATOMIC_CAS(&bcond_, true, false)) {
        need_wait = false;
        break;
      }
      PAUSE();
    }
  }
  if (need_wait) {
    int64_t abs_time = time_us + ::oceanbase::common::ObTimeUtility::current_time();
    struct timespec ts;
    ts.tv_sec = abs_time / 1000000;
    ts.tv_nsec = (abs_time % 1000000) * 1000;
    pthread_mutex_lock(&mutex_);
    while (OB_SUCC(ret) && false == ATOMIC_CAS(&bcond_, true, false)) {
      int tmp_ret = ob_pthread_cond_timedwait(&cond_, &mutex_, &ts);
      if (ETIMEDOUT == tmp_ret) {
        ret = OB_TIMEOUT;
        break;
      }
    }
    pthread_mutex_unlock(&mutex_);
  }
  if (OB_SUCC(ret)) {
    last_waked_time_ = ::oceanbase::common::ObTimeUtility::current_time();
  }
  return ret;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

S2MQueueThread::S2MQueueThread()
    : thread_num_(0),
      thread_conf_iter_(0),
      thread_conf_lock_(ObLatchIds::DEFAULT_DRW_LOCK),
      queued_num_(),
      queue_rebalance_(false)
{
  memset((void *)each_queue_len_, 0, sizeof(each_queue_len_));
}

S2MQueueThread::~S2MQueueThread()
{
  destroy();
}

int S2MQueueThread::init(const int64_t thread_num,
                         const int64_t task_num_limit,
                         const bool queue_rebalance,
                         const bool dynamic_rebalance)
{
  int ret = OB_SUCCESS;
  queue_rebalance_ = queue_rebalance;
  if (0 != thread_num_) {
    ret = OB_INIT_TWICE;
  } else if (0 >= thread_num || MAX_THREAD_NUM < thread_num || 0 >= task_num_limit) {
    _OB_LOG(WARN, "invalid param, thread_num=%ld task_num_limit=%ld", thread_num, task_num_limit);
    ret = OB_INVALID_ARGUMENT;
  } else if (OB_SUCCESS != (ret = launch_thread_(thread_num, task_num_limit))) {
    _OB_LOG(WARN, "launch thread fail, ret=%d thread_num=%ld task_num_limit=%ld", ret, thread_num, task_num_limit);
  } else if (OB_SUCCESS != (ret = balance_filter_.init(thread_num_, dynamic_rebalance))) {
    _OB_LOG(WARN, "init balance_filter fail, ret=%d thread_num=%ld", ret, thread_num_);
  } else {
    // do nothing
  }
  if (OB_FAIL(ret)) {
    destroy();
  } else {
    thread_conf_iter_ = 0;
  }
  return ret;
}

void S2MQueueThread::destroy()
{
  for (int64_t i = 0; i < thread_num_; i++) {
    ThreadConf &tc = thread_conf_array_[i];
    tc.run_flag = false;
    tc.stop_flag = true;
    tc.queue_cond.signal();
    pthread_join(tc.pd, NULL);
  }
  for (int64_t i = 0; i < thread_num_; i++) {
    ThreadConf &tc = thread_conf_array_[i];
    tc.high_prio_task_queue.destroy();
    tc.spec_task_queue.destroy();
    tc.comm_task_queue.destroy();
    tc.low_prio_task_queue.destroy();
  }
  balance_filter_.destroy();
  thread_num_ = 0;
}

int64_t S2MQueueThread::get_queued_num() const
{
  return queued_num_.value();
}

int S2MQueueThread::wakeup()
{
  int err = OB_SUCCESS;
  if (thread_num_ > 0) {
    DRWLock::RDLockGuard guard(thread_conf_lock_);
    for (int64_t i = 0; i < thread_num_; i++) {
      thread_conf_array_[i % thread_num_].queue_cond.signal();
    }
  }
  return err;
}

int S2MQueueThread::push(void *task, const int64_t prio)
{
  int ret = OB_SUCCESS;
  int queue_id = -1;
  if (0 >= thread_num_) {
    ret = OB_NOT_INIT;
  } else if (NULL == task) {
    ret = OB_INVALID_ARGUMENT;
  } else {
    // Round-robin dispatch: pick the next worker and push into the queue of the requested priority.
    RDLockGuard guard(thread_conf_lock_);
    uint64_t i = ATOMIC_AAF(&thread_conf_iter_, 1);
    ThreadConf &tc = thread_conf_array_[i % thread_num_];
    switch (prio) {
      case HIGH_PRIV:
        ret = tc.high_prio_task_queue.push(task);
        queue_id = HIGH_PRIO_QUEUE;
        break;
      case NORMAL_PRIV:
        ret = tc.comm_task_queue.push(task);
        queue_id = NORMAL_PRIO_QUEUE;
        break;
      case LOW_PRIV:
        ret = tc.low_prio_task_queue.push(task);
        queue_id = LOW_PRIO_QUEUE;
        break;
      default:
        ret = OB_NOT_SUPPORTED;
    }
    if (OB_SUCC(ret)) {
      tc.queue_cond.signal();
    }
    if (OB_SIZE_OVERFLOW == ret) {
      ret = OB_EAGAIN;
    }
  }
  if (OB_SUCC(ret)) {
    if (queue_id >= 0 && queue_id < QUEUE_COUNT) {
      each_queue_len_[queue_id].inc(1);
    }
    queued_num_.inc(1);
  }
  return ret;
}

int S2MQueueThread::push(void *task, const uint64_t task_sign, const int64_t prio)
{
  int ret = OB_SUCCESS;
  if (OB_INVALID_ID == task_sign) {
    ret = push(task, prio);
  } else if (0 >= thread_num_) {
    ret = OB_NOT_INIT;
  } else if (NULL == task) {
    ret = OB_INVALID_ARGUMENT;
  } else {
    // Route by task_sign so tasks with the same sign always land on the same worker's spec queue.
    uint64_t filted_task_sign = balance_filter_.filt(task_sign);
    RDLockGuard guard(thread_conf_lock_);
    ThreadConf &tc = thread_conf_array_[filted_task_sign % thread_num_];
    _OB_LOG(TRACE, "req_sign=%lu,%lu hash=%lu", filted_task_sign % thread_num_, filted_task_sign, task_sign);
    if (OB_SUCCESS != (ret = tc.spec_task_queue.push(task))) {
      //_OB_LOG(WARN, "push task to queue fail, ret=%d", ret);
      if (OB_SIZE_OVERFLOW == ret) {
        ret = OB_EAGAIN;
      }
    } else {
      tc.queue_cond.signal();
      queued_num_.inc(1);
      each_queue_len_[HOTSPOT_QUEUE].inc(1);
    }
  }
  return ret;
}

int S2MQueueThread::set_prio_quota(v4si &quota)
{
  int ret = OB_SUCCESS;
  if (0 >= thread_num_) {
    ret = OB_NOT_INIT;
  } else {
    WRLockGuard guard(thread_conf_lock_);
    for (int64_t i = 0; i < thread_num_; i++) {
      ThreadConf &tc = thread_conf_array_[i];
      tc.scheduler_.set_quota(quota);
    }
  }
  return ret;
}

int S2MQueueThread::add_thread(const int64_t thread_num, const int64_t task_num_limit)
{
  int ret = OB_SUCCESS;
  if (0 >= thread_num_) {
    ret = OB_NOT_INIT;
  } else if (0 >= thread_num) {
    _OB_LOG(WARN, "invalid param, thread_num=%ld", thread_num);
    ret = OB_INVALID_ARGUMENT;
  } else {
    WRLockGuard guard(thread_conf_lock_);
    int64_t prev_thread_num = thread_num_;
    ret = launch_thread_(thread_num, task_num_limit);
    balance_filter_.add_thread(thread_num_ - prev_thread_num);
  }
  return ret;
}

int S2MQueueThread::sub_thread(const int64_t thread_num)
{
  int ret = OB_SUCCESS;
  int64_t prev_thread_num = 0;
  int64_t cur_thread_num = 0;
  if (0 >= thread_num_) {
    ret = OB_NOT_INIT;
  } else {
    WRLockGuard guard(thread_conf_lock_);
    if (0 >= thread_num || thread_num >= thread_num_) {
      _OB_LOG(WARN, "invalid param, thread_num=%ld thread_num_=%ld", thread_num, thread_num_);
      ret = OB_INVALID_ARGUMENT;
    } else {
      prev_thread_num = thread_num_;
      thread_num_ -= thread_num;
      cur_thread_num = thread_num_;
    }
  }
  if (OB_SUCC(ret)) {
    for (int64_t i = cur_thread_num; i < prev_thread_num; i++) {
      balance_filter_.sub_thread(1);
      ThreadConf &tc = thread_conf_array_[i];
      tc.run_flag = false;
      tc.stop_flag = true;
      tc.queue_cond.signal();
      pthread_join(tc.pd, NULL);
      tc.high_prio_task_queue.destroy();
      tc.spec_task_queue.destroy();
      tc.comm_task_queue.destroy();
      tc.low_prio_task_queue.destroy();
      _OB_LOG(INFO, "join thread succ, index=%ld", i);
    }
  }
  return ret;
}

void *S2MQueueThread::rebalance_(int64_t &idx, const ThreadConf &cur_thread)
{
  void *ret = NULL;
  // Note: debug only
  RLOCAL(int64_t, rebalance_counter);
  // Try to steal a pending task from another worker's high- or normal-priority queue.
  for (uint64_t i = 1; (int64_t)i <= thread_num_; i++) {
    RDLockGuard guard(thread_conf_lock_);
    uint64_t balance_idx = (cur_thread.index + i) % thread_num_;
    ThreadConf &tc = thread_conf_array_[balance_idx];
    int64_t try_queue_idx = -1;
    if (tc.using_flag) { //&& (tc.last_active_time + THREAD_BUSY_TIME_LIMIT) < ::oceanbase::common::ObTimeUtility::current_time())
      continue;
    }
    if (NULL == ret) {
      try_queue_idx = HIGH_PRIO_QUEUE;
      IGNORE_RETURN tc.high_prio_task_queue.pop(ret);
    }
    if (NULL == ret) {
      try_queue_idx = NORMAL_PRIO_QUEUE;
      IGNORE_RETURN tc.comm_task_queue.pop(ret);
    }
    if (NULL != ret) {
      idx = try_queue_idx;
      each_queue_len_[idx].inc(-1);
      if (0 == (rebalance_counter++ % 10000)) {
        _OB_LOG(INFO, "task has been rebalance between threads rebalance_counter=%ld cur_idx=%ld balance_idx=%ld",
                *(&rebalance_counter), cur_thread.index, balance_idx);
      }
      break;
    }
  }
  return ret;
}

int S2MQueueThread::launch_thread_(const int64_t thread_num, const int64_t task_num_limit)
{
  int ret = OB_SUCCESS;
  int64_t thread_num2launch = std::min(MAX_THREAD_NUM - thread_num_, thread_num);
  for (int64_t i = 0; OB_SUCC(ret) && i < thread_num2launch; i++) {
    ThreadConf &tc = thread_conf_array_[thread_num_];
    tc.index = thread_num_;
    tc.run_flag = true;
    tc.stop_flag = false;
    tc.using_flag = false;
    tc.last_active_time = 0;
    tc.host = this;
    if (OB_SUCCESS != (ret = tc.high_prio_task_queue.init(task_num_limit))) {
      _OB_LOG(WARN, "high prio task queue init fail, task_num_limit=%ld", task_num_limit);
      break;
    }
    if (OB_SUCCESS != (ret = tc.spec_task_queue.init(task_num_limit))) {
      _OB_LOG(WARN, "spec task queue init fail, task_num_limit=%ld", task_num_limit);
      break;
    }
    if (OB_SUCCESS != (ret = tc.comm_task_queue.init(task_num_limit))) {
      _OB_LOG(WARN, "comm task queue init fail, task_num_limit=%ld", task_num_limit);
      break;
    }
    if (OB_SUCCESS != (ret = tc.low_prio_task_queue.init(task_num_limit))) {
      _OB_LOG(WARN, "low prio task queue init fail, task_num_limit=%ld", task_num_limit);
      break;
    }
    int tmp_ret = 0;
    if (0 != (tmp_ret = pthread_create(&(tc.pd), NULL, thread_func_, &tc))) {
      _OB_LOG(WARN, "pthread_create fail, ret=%d", tmp_ret);
      ret = OB_ERR_UNEXPECTED;
      tc.high_prio_task_queue.destroy();
      tc.spec_task_queue.destroy();
      tc.comm_task_queue.destroy();
      tc.low_prio_task_queue.destroy();
      break;
    }
    //cpu_set_t cpu_set;
    //CPU_ZERO(&cpu_set);
    //CPU_SET(thread_num_ % get_cpu_num(), &cpu_set);
    //tmp_ret = pthread_setaffinity_np(tc.pd, sizeof(cpu_set), &cpu_set);
    _OB_LOG(INFO, "create thread succ, index=%ld tmp_ret=%d", thread_num_, tmp_ret);
    thread_num_ += 1;
  }
  return ret;
}

void *S2MQueueThread::thread_func_(void *data)
{
  ThreadConf *const tc = (ThreadConf *)data;
  if (NULL == tc || NULL == tc->host) {
    _OB_LOG_RET(WARN, common::OB_INVALID_ARGUMENT, "thread_func param null pointer");
  } else {
    tc->host->thread_index() = tc->index;
    void *pdata = tc->host->on_begin();
    bool last_rebalance_got = false;
    while (tc->run_flag)
           // && (0 != tc->high_prio_task_queue.get_total()
           //     || 0 != tc->spec_task_queue.get_total()
           //     || 0 != tc->comm_task_queue.get_total()
           //     || 0 != tc->low_prio_task_queue.get_total()))
    {
      void *task = NULL;
      int64_t start_time = ::oceanbase::common::ObTimeUtility::current_time();
      int64_t idx = -1;
      tc->using_flag = true;
      switch (tc->scheduler_.get()) {
        case 0:
          IGNORE_RETURN tc->high_prio_task_queue.pop(task);
          if (NULL != task) {
            idx = 0;
          }
          break;
        case 1:
          IGNORE_RETURN tc->spec_task_queue.pop(task);
          if (NULL != task) {
            idx = 1;
          }
          break;
        case 2:
          IGNORE_RETURN tc->comm_task_queue.pop(task);
          if (NULL != task) {
            idx = 2;
          }
          break;
        case 3:
          IGNORE_RETURN tc->low_prio_task_queue.pop(task);
          if (NULL != task) {
            idx = 3;
          }
          break;
        default:
          ;
      };
      // Fall back to scanning all queues in priority order when the scheduled queue yields no task.
      if (NULL == task) {
        IGNORE_RETURN tc->high_prio_task_queue.pop(task);
        if (NULL != task) {
          idx = 0;
        }
      }
      if (NULL == task) {
        IGNORE_RETURN tc->spec_task_queue.pop(task);
        if (NULL != task) {
          idx = 1;
        }
      }
      if (NULL == task) {
        IGNORE_RETURN tc->comm_task_queue.pop(task);
        if (NULL != task) {
          idx = 2;
        }
      }
      if (NULL == task) {
        IGNORE_RETURN tc->low_prio_task_queue.pop(task);
        if (NULL != task) {
          idx = 3;
        }
      }
      tc->using_flag = false;
      // strict consistency is not needed here, so no barrier is used
      if (idx >= 0 && idx < QUEUE_COUNT) {
        tc->host->each_queue_len_[idx].inc(-1);
      }
      if (NULL != task
          || (tc->host->queue_rebalance_
              && (last_rebalance_got || TC_REACH_TIME_INTERVAL(QUEUE_WAIT_TIME))
              && (last_rebalance_got = (NULL != (task = tc->host->rebalance_(idx, *tc)))))) {
        tc->host->queued_num_.inc(-1);
        tc->host->handle_with_stopflag(task, pdata, tc->stop_flag);
        tc->last_active_time = ::oceanbase::common::ObTimeUtility::current_time();
      } else {
        tc->queue_cond.timedwait(QUEUE_WAIT_TIME);
      }
      tc->host->on_iter();
      v4si queue_len = {
        tc->high_prio_task_queue.get_total(),
        tc->spec_task_queue.get_total(),
        tc->comm_task_queue.get_total(),
        tc->low_prio_task_queue.get_total(),
      };
      tc->scheduler_.update(idx, ::oceanbase::common::ObTimeUtility::current_time() - start_time, queue_len);
    }
    tc->host->on_end(pdata);
  }
  return NULL;
}

int64_t S2MQueueThread::get_thread_index() const
{
  int64_t ret = const_cast<S2MQueueThread &>(*this).thread_index();
  return ret;
}

////////////////////////////////////////////////////////////////////////////////////////////////////

const int64_t M2SQueueThread::QUEUE_WAIT_TIME = 100 * 1000;

M2SQueueThread::M2SQueueThread()
    : inited_(false),
      pd_(0),
      run_flag_(true),
      queue_cond_(),
      task_queue_(),
      idle_interval_(INT64_MAX),
      last_idle_time_(0)
{
}

M2SQueueThread::~M2SQueueThread()
{
  destroy();
}

int M2SQueueThread::init(const int64_t task_num_limit, const int64_t idle_interval)
{
  int ret = OB_SUCCESS;
  int tmp_ret = 0;
  run_flag_ = true;
  if (inited_) {
    ret = OB_INIT_TWICE;
  } else if (OB_SUCCESS != (ret = task_queue_.init(task_num_limit))) {
    _OB_LOG(WARN, "task_queue init fail, ret=%d task_num_limit=%ld", ret, task_num_limit);
  } else if (0 != (tmp_ret = pthread_create(&pd_, NULL, thread_func_, this))) {
    ret = OB_ERR_UNEXPECTED;
    _OB_LOG(WARN, "pthread_create fail, ret=%d", tmp_ret);
  } else {
    inited_ = true;
    idle_interval_ = idle_interval;
    last_idle_time_ = 0;
  }
  if (OB_SUCCESS != ret && OB_INIT_TWICE != ret) {
    destroy();
  }
  return ret;
}

void M2SQueueThread::destroy()
{
  if (0 != pd_) {
    run_flag_ = false;
    queue_cond_.signal();
    pthread_join(pd_, NULL);
    pd_ = 0;
  }
  task_queue_.destroy();
  inited_ = false;
}

int M2SQueueThread::push(void *task)
{
  int ret = OB_SUCCESS;
  if (!inited_) {
    ret = OB_NOT_INIT;
  } else if (NULL == task) {
    ret = OB_INVALID_ARGUMENT;
  } else {
    if (OB_SUCCESS != (ret = task_queue_.push(task))) {
      //_OB_LOG(WARN, "push task to queue fail, ret=%d", ret);
      if (OB_SIZE_OVERFLOW == ret) {
        ret = OB_EAGAIN;
      }
    } else {
      queue_cond_.signal();
    }
  }
  return ret;
}

int64_t M2SQueueThread::get_queued_num() const
{
  return task_queue_.get_total();
}

void *M2SQueueThread::thread_func_(void *data)
{
  M2SQueueThread *const host = (M2SQueueThread *)data;
  if (NULL == host) {
    _OB_LOG_RET(WARN, common::OB_INVALID_ARGUMENT, "thread_func param null pointer");
  } else {
    void *pdata = host->on_begin();
    while (host->run_flag_)
           //&& 0 != host->task_queue_.get_total())
    {
      void *task = NULL;
      IGNORE_RETURN host->task_queue_.pop(task);
      if (NULL != task) {
        host->handle(task, pdata);
      } else if ((host->last_idle_time_ + host->idle_interval_)
                 <= ::oceanbase::common::ObTimeUtility::current_time()) {
        host->on_idle();
        host->last_idle_time_ = ::oceanbase::common::ObTimeUtility::current_time();
      } else {
        host->queue_cond_.timedwait(std::min(QUEUE_WAIT_TIME, host->idle_interval_));
      }
    }
    host->on_end(pdata);
  }
  return NULL;
}
}
}