add defensive log & fix wrong tid in pthread_join
@@ -1902,7 +1902,7 @@ int dump_thread_info(lua_State *L)
       struct iovec remote_iov = {thread_base + rpc_dest_addr_offset, sizeof(ObAddr)};
       wait_event[0] = '\0';
       if (0 != join_addr) {
-        IGNORE_RETURN snprintf(wait_event, BUF_LEN, "thread %u %ld", *(uint32_t*)(thread_base + tid_offset), tid_offset);
+        IGNORE_RETURN snprintf(wait_event, BUF_LEN, "thread %u %ld", *(uint32_t*)(join_addr + tid_offset), tid_offset);
       } else if (OB_NOT_NULL(wait_addr)) {
         uint32_t val = 0;
         struct iovec local_iov = {&val, sizeof(val)};
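This hunk, together with the matching ObAllVirtualThread hunk at the end of the diff, is the "wrong tid" fix: when a thread is blocked in pthread_join, join_addr points at the control block of the thread being joined, so the wait event should report that thread's tid (join_addr + tid_offset) rather than the waiting thread's own tid (thread_base + tid_offset). A minimal standalone sketch of the idea, with a hypothetical ThreadBlock layout standing in for the real per-thread structure and its offsets:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for the per-thread control block that thread_base and
    // join_addr point into in the real code; only the tid field matters here.
    struct ThreadBlock {
      uint32_t tid;   // kernel thread id of this thread
    };

    // thread_base: block of the thread being inspected (blocked in pthread_join).
    // join_addr:   block of the thread it is waiting on, or nullptr if not joining.
    static void format_wait_event(const char *thread_base, const char *join_addr,
                                  char *buf, size_t buf_len)
    {
      const size_t tid_offset = offsetof(ThreadBlock, tid);
      if (nullptr != join_addr) {
        // The fix: report the tid of the thread being joined (join_addr + tid_offset),
        // not the waiting thread's own tid (thread_base + tid_offset).
        std::snprintf(buf, buf_len, "thread %u", *(const uint32_t *)(join_addr + tid_offset));
      } else {
        buf[0] = '\0';
        (void)thread_base;  // other wait states would be decoded from thread_base here
      }
    }

    int main()
    {
      ThreadBlock joined = {1234};
      ThreadBlock waiter = {1111};
      char buf[64];
      format_wait_event(reinterpret_cast<const char *>(&waiter),
                        reinterpret_cast<const char *>(&joined), buf, sizeof(buf));
      std::printf("%s\n", buf);   // prints "thread 1234", the joined thread's tid
      return 0;
    }
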
@@ -584,7 +584,7 @@ ObTenant::ObTenant(const int64_t id,
       shrink_(0),
       total_worker_cnt_(0),
       gc_thread_(0),
-      stopped_(true),
+      stopped_(0),
       wait_mtl_finished_(false),
       req_queue_(),
       multi_level_queue_(nullptr),
@@ -697,7 +697,7 @@ int ObTenant::init(const ObTenantMeta &meta)
   if (OB_FAIL(ret)) {
     LOG_ERROR("fail to create tenant module", K(ret));
   } else {
-    ATOMIC_STORE(&stopped_, false);
+    start();
   }

   return ret;
@@ -858,6 +858,15 @@ int ObTenant::create_tenant_module()
   return ret;
 }

+void ObTenant::sleep_and_warn(ObTenant* tenant)
+{
+  ob_usleep(10_ms);
+  const int64_t ts = ObTimeUtility::current_time() - tenant->stopped_;
+  if (ts >= 3_min && TC_REACH_TIME_INTERVAL(3_min)) {
+    LOG_ERROR_RET(OB_SUCCESS, "tenant destructed for too long time.", K_(tenant->id), K(ts));
+  }
+}
+
 void* ObTenant::wait(void* t)
 {
   ObStackHeaderGuard stack_header_guard;
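The new sleep_and_warn helper is the "defensive log" half of the commit: since stopped_ now records when stop() was called (see the header hunks below), every 10 ms pause in the shutdown loops can also check how long tenant destruction has been running and, after three minutes, raise an error throttled to once per interval. A minimal standalone sketch of that pattern, with std::chrono and a local throttle standing in for ObTimeUtility::current_time(), ob_usleep, and TC_REACH_TIME_INTERVAL:

    #include <atomic>
    #include <chrono>
    #include <cstdio>
    #include <thread>

    using Clock = std::chrono::steady_clock;

    // stop() stores the time it was called; 0 (the epoch) means "still running".
    static std::atomic<Clock::rep> g_stop_ts{0};

    // Sleep briefly between polls of the shutdown loops; once shutdown has run for
    // three minutes, warn at most once every three minutes.
    static void sleep_and_warn()
    {
      static Clock::time_point last_warn{};  // throttle, stands in for TC_REACH_TIME_INTERVAL
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
      const Clock::time_point stop_ts{Clock::duration{g_stop_ts.load()}};
      const auto elapsed = Clock::now() - stop_ts;
      if (elapsed >= std::chrono::minutes(3) && Clock::now() - last_warn >= std::chrono::minutes(3)) {
        last_warn = Clock::now();
        std::fprintf(stderr, "tenant destruction has been running for %lld s\n",
                     static_cast<long long>(
                         std::chrono::duration_cast<std::chrono::seconds>(elapsed).count()));
      }
    }

    int main()
    {
      g_stop_ts.store(Clock::now().time_since_epoch().count());  // what stop() would do
      // In ObTenant::wait() this is called while request queues drain and workers exit;
      // a few iterations are enough to show the shape here.
      for (int i = 0; i < 3; ++i) {
        sleep_and_warn();
      }
      return 0;
    }
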
@@ -869,7 +878,7 @@ void* ObTenant::wait(void* t)
   tenant->handle_retry_req(true);
   while (tenant->req_queue_.size() > 0
          || (tenant->multi_level_queue_ != nullptr && tenant->multi_level_queue_->get_total_size() > 0)) {
-    ob_usleep(10L * 1000L);
+    sleep_and_warn(tenant);
   }
   while (tenant->workers_.get_size() > 0) {
     if (OB_SUCC(tenant->workers_lock_.trylock())) {
@@ -879,16 +888,16 @@ void* ObTenant::wait(void* t)
         destroy_worker(w);
       }
       IGNORE_RETURN tenant->workers_lock_.unlock();
-      if (REACH_TIME_INTERVAL(10 * 1000L * 1000L)) {
+      if (REACH_TIME_INTERVAL(10_s)) {
         LOG_INFO(
             "Tenant has some workers need stop", K_(tenant->id),
             "workers", tenant->workers_.get_size(),
             K_(tenant->req_queue));
       }
     }
-    ob_usleep(10L * 1000L);
+    sleep_and_warn(tenant);
   }
-  LOG_WARN_RET(OB_SUCCESS,"start remove nesting", K(tenant->nesting_workers_.get_size()), K_(tenant->id));
+  LOG_INFO("start remove nesting", K(tenant->nesting_workers_.get_size()), K_(tenant->id));
   while (tenant->nesting_workers_.get_size() > 0) {
     int ret = OB_SUCCESS;
     if (OB_SUCC(tenant->workers_lock_.trylock())) {
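The hunks above and below also swap raw microsecond arithmetic such as REACH_TIME_INTERVAL(10 * 1000L * 1000L) and ob_usleep(10L * 1000L) for duration literals (10_s, 3_min, 10_ms). These read like user-defined literal suffixes that evaluate to microsecond counts; assuming that convention, a minimal sketch of how such suffixes can be declared:

    #include <cstdint>
    #include <cstdio>

    // Assumed convention: the *_ms / *_s / *_min suffixes evaluate to microseconds,
    // matching call sites such as ob_usleep(10_ms) and REACH_TIME_INTERVAL(10_s).
    constexpr int64_t operator"" _ms(unsigned long long v) { return static_cast<int64_t>(v) * 1000; }
    constexpr int64_t operator"" _s(unsigned long long v)  { return static_cast<int64_t>(v) * 1000 * 1000; }
    constexpr int64_t operator"" _min(unsigned long long v) { return static_cast<int64_t>(v) * 60 * 1000 * 1000; }

    int main()
    {
      static_assert(10_ms == 10000, "10 ms in microseconds");
      static_assert(10_s == 10000000, "10 s in microseconds");
      static_assert(3_min == 180000000, "3 min in microseconds");
      std::printf("10_s = %lld us\n", static_cast<long long>(10_s));
      return 0;
    }
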
@@ -898,7 +907,7 @@ void* ObTenant::wait(void* t)
         destroy_worker(w);
       }
       IGNORE_RETURN tenant->workers_lock_.unlock();
-      if (REACH_TIME_INTERVAL(10 * 1000L * 1000L)) {
+      if (REACH_TIME_INTERVAL(10_s)) {
         LOG_INFO(
             "Tenant has some nesting workers need stop",
             K_(tenant->id),
@@ -906,12 +915,12 @@ void* ObTenant::wait(void* t)
             K_(tenant->req_queue));
       }
     }
-    ob_usleep(10L * 1000L);
+    sleep_and_warn(tenant);
   }
-  LOG_WARN_RET(OB_SUCCESS, "finish remove nesting", K(tenant->nesting_workers_.get_size()), K_(tenant->id));
-  LOG_WARN_RET(OB_SUCCESS, "start remove group_map", K_(tenant->id));
+  LOG_INFO("finish remove nesting", K(tenant->nesting_workers_.get_size()), K_(tenant->id));
+  LOG_INFO("start remove group_map", K_(tenant->id));
   tenant->group_map_.wait_group();
-  LOG_WARN_RET(OB_SUCCESS, "finish remove group_map", K_(tenant->id));
+  LOG_INFO("finish remove group_map", K_(tenant->id));
   if (!is_virtual_tenant_id(tenant->id_) && !tenant->wait_mtl_finished_) {
     ObTenantSwitchGuard guard(tenant);
     tenant->stop_mtl_module();
@@ -920,6 +929,7 @@ void* ObTenant::wait(void* t)
     tenant->wait_mtl_module();
     tenant->wait_mtl_finished_ = true;
   }
+  LOG_INFO("finish waiting", K_(tenant->id));
   return nullptr;
 }

@@ -1190,7 +1200,7 @@ int ObTenant::recv_request(ObRequest &req)
 {
   int ret = OB_SUCCESS;
   int req_level = 0;
-  if (ATOMIC_LOAD(&stopped_)) {
+  if (has_stopped()) {
     ret = OB_TENANT_NOT_IN_SERVER;
     LOG_WARN("receive request but tenant has already stopped", K(ret), K(id_));
   } else if (0 != req.get_group_id()) {
@@ -1325,7 +1335,7 @@ int ObTenant::recv_large_request(rpc::ObRequest &req)
 int ObTenant::push_retry_queue(rpc::ObRequest &req, const uint64_t timestamp)
 {
   int ret = OB_SUCCESS;
-  if (ATOMIC_LOAD(&stopped_)) {
+  if (has_stopped()) {
     ret = OB_IN_STOP_STATE;
     LOG_WARN("receive retry request but tenant has already stopped", K(ret), K(id_));
   } else if (OB_FAIL(retry_queue_.push(req, timestamp))) {
@@ -1338,9 +1348,9 @@ int ObTenant::timeup()
 {
   int ret = OB_SUCCESS;
   ObLDHandle handle;
-  if (!stopped_ && OB_SUCC(try_rdlock(handle))) {
+  if (!has_stopped() && OB_SUCC(try_rdlock(handle))) {
     // it may fail during drop tenant, try next time.
-    if (!stopped_) {
+    if (!has_stopped()) {
       check_group_worker_count();
       check_worker_count();
       update_token_usage();
@@ -383,11 +383,11 @@ public:
   int init_ctx();
   int init_multi_level_queue();
   int init(const ObTenantMeta &meta);
-  void stop() { ATOMIC_STORE(&stopped_, true); }
-  void start() { ATOMIC_STORE(&stopped_, false); }
+  void stop() { ATOMIC_STORE(&stopped_, ObTimeUtility::current_time()); }
+  void start() { ATOMIC_STORE(&stopped_, 0); }
   int try_wait();
   void destroy();
-  bool has_stopped() const { return ATOMIC_LOAD(&stopped_); }
+  bool has_stopped() const { return ATOMIC_LOAD(&stopped_) != 0; }

   ObTenantMeta get_tenant_meta();
   bool is_hidden();
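This header change is what makes the defensive log work: stopped_ becomes an int64_t, stop() stores the stop timestamp, start() stores 0, and has_stopped() treats any non-zero value as "stopped", which is why the call sites above are rewritten from raw reads of stopped_ to has_stopped(). A minimal sketch of the same flag-doubles-as-timestamp idea, with std::atomic standing in for ATOMIC_STORE/ATOMIC_LOAD and a hypothetical now_us() in place of ObTimeUtility::current_time():

    #include <atomic>
    #include <chrono>
    #include <cstdint>
    #include <cstdio>

    // Hypothetical replacement for ObTimeUtility::current_time(): microseconds since epoch.
    static int64_t now_us()
    {
      return std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::system_clock::now().time_since_epoch()).count();
    }

    class Tenant {
    public:
      // Zero means "running"; any non-zero value is the time stop() was called,
      // so one field answers both "are we stopped?" and "since when?".
      void stop() { stopped_.store(now_us(), std::memory_order_release); }
      void start() { stopped_.store(0, std::memory_order_release); }
      bool has_stopped() const { return stopped_.load(std::memory_order_acquire) != 0; }
      int64_t stopped_since_us() const { return stopped_.load(std::memory_order_acquire); }

    private:
      std::atomic<int64_t> stopped_{0};
    };

    int main()
    {
      Tenant t;
      t.stop();
      if (t.has_stopped()) {
        std::printf("stopped %lld us ago\n",
                    static_cast<long long>(now_us() - t.stopped_since_us()));
      }
      return 0;
    }
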
@@ -498,6 +498,7 @@ public:
   // OB_INLINE bool has_normal_request() const { return req_queue_.size() != 0; }
   // OB_INLINE bool has_level_request() const { return OB_NOT_NULL(multi_level_queue_) && multi_level_queue_->get_total_size() != 0; }
 private:
+  static void sleep_and_warn(ObTenant* tenant);
   static void* wait(void* tenant);
   // update CPU usage
   void update_token_usage();
@@ -528,7 +529,6 @@ private:
   int construct_mtl_init_ctx(const ObTenantMeta &meta, share::ObTenantModuleInitCtx *&ctx);

   int recv_group_request(rpc::ObRequest &req, int64_t group_id);

 protected:

   mutable common::TCRWLock meta_lock_;
@@ -540,7 +540,7 @@ protected:
   volatile bool shrink_ CACHE_ALIGNED;
   int64_t total_worker_cnt_;
   pthread_t gc_thread_;
-  bool stopped_;
+  int64_t stopped_;
   bool wait_mtl_finished_;

   /// tenant task queue,
@@ -130,7 +130,7 @@ int ObAllVirtualThread::inner_get_next_row(common::ObNewRow *&row)
       struct iovec remote_iov = {thread_base + rpc_dest_addr_offset, sizeof(ObAddr)};
       wait_event_[0] = '\0';
       if (0 != join_addr) {
-        IGNORE_RETURN snprintf(wait_event_, 64, "thread %u", *(uint32_t*)(thread_base + tid_offset));
+        IGNORE_RETURN snprintf(wait_event_, 64, "thread %u", *(uint32_t*)(join_addr + tid_offset));
       } else if (OB_NOT_NULL(wait_addr)) {
         uint32_t val = 0;
         struct iovec local_iov = {&val, sizeof(val)};
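The context around both wait-event hunks builds local/remote iovec pairs, which is the usual shape of a process_vm_readv call for safely pulling a small value out of a possibly unmapped address. Assuming that is what the elided code does, a minimal sketch of reading a 4-byte value through an iovec pair (from our own process, so it runs standalone):

    #ifndef _GNU_SOURCE
    #define _GNU_SOURCE 1
    #endif
    #include <cstdint>
    #include <cstdio>
    #include <sys/uio.h>   // process_vm_readv (Linux-specific)
    #include <unistd.h>    // getpid

    int main()
    {
      uint32_t target = 42;   // value we pretend lives at a "remote" address
      uint32_t val = 0;

      struct iovec local_iov = {&val, sizeof(val)};
      struct iovec remote_iov = {&target, sizeof(target)};

      // Reading from our own pid keeps the sketch self-contained; the real code would
      // pass the inspected process's pid and an address such as wait_addr or
      // join_addr + tid_offset.
      ssize_t n = process_vm_readv(getpid(), &local_iov, 1, &remote_iov, 1, 0);
      if (n == static_cast<ssize_t>(sizeof(val))) {
        std::printf("read remote value: %u\n", val);
      } else {
        std::perror("process_vm_readv");
      }
      return 0;
    }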