[DeadLock] fix detector bug
@@ -211,8 +211,9 @@ ObDependencyResource& ObDependencyResource::operator=(const ObDependencyResource
uint64_t ObDependencyResource::hash() const
{
  uint64_t hash_val = 0;
  hash_val = murmurhash(&addr_, sizeof(addr_), hash_val);
  hash_val = murmurhash(&user_key_, sizeof(user_key_), hash_val);
  hash_val = addr_.hash();
  uint64_t key_hash = user_key_.hash();
  hash_val = murmurhash(&key_hash, sizeof(key_hash), hash_val);
  return hash_val;
}

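The hunk above is the core of the fix: ObDependencyResource::hash() used to run murmurhash over the raw bytes of addr_ and user_key_, while the comparison operators (see the operator< hunk below) compare the members logically, so padding or non-canonical bytes could make equal resources hash differently. The new code asks each member for its own hash() and only mixes the resulting integers. Below is a minimal self-contained sketch of the same idea; the Addr/UserKey types and the mix() helper are illustrative stand-ins, not the OceanBase implementations.

#include <cstdint>
#include <functional>
#include <string>

// Hypothetical stand-ins for ObAddr / UserBinaryKey; names and fields are illustrative only.
struct Addr {
  std::string ip;
  uint16_t port = 0;
  // hash the logical value, not the raw object bytes
  uint64_t hash() const { return std::hash<std::string>{}(ip) ^ (uint64_t(port) << 1); }
};

struct UserKey {
  std::string payload;
  uint64_t hash() const { return std::hash<std::string>{}(payload); }
};

// Simple mixer standing in for murmurhash(&v, sizeof(v), seed).
inline uint64_t mix(uint64_t seed, uint64_t v)
{
  return seed ^ (v + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

// Combine per-member hashes; hashing the structs' memory directly could fold in
// padding bytes and break the hash/equality contract.
inline uint64_t resource_hash(const Addr &addr, const UserKey &key)
{
  return mix(addr.hash(), key.hash());
}
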
@@ -225,6 +226,8 @@ bool ObDependencyResource::operator<(const ObDependencyResource &rhs) const
{
  if (addr_ < rhs.addr_) {
    return true;
  } else if (addr_ > rhs.addr_) {
    return false;
  } else {
    if (user_key_ < rhs.user_key_) {
      return true;

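operator< orders resources lexicographically: addr_ decides first and user_key_ only breaks ties, which keeps the ordering consistent with equality on both members. A hedged standalone sketch of the same ordering via std::tie, with a simplified Resource type standing in for ObDependencyResource:

#include <string>
#include <tuple>

// Hypothetical simplified resource; the real ObDependencyResource holds ObAddr and UserBinaryKey.
struct Resource {
  std::string addr;
  std::string user_key;
};

// Lexicographic ordering: compare addr first, then user_key as the tie-breaker,
// mirroring the hand-written operator< in the hunk above.
inline bool operator<(const Resource &lhs, const Resource &rhs)
{
  return std::tie(lhs.addr, lhs.user_key) < std::tie(rhs.addr, rhs.user_key);
}
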
@@ -32,6 +32,11 @@ namespace share
{
namespace detector
{
// If the number of messages in the map is below LCL_MSG_CACHE_LIMIT/2, every pending message is accepted.
// If the count is between LCL_MSG_CACHE_LIMIT/2 and LCL_MSG_CACHE_LIMIT, newly appended messages are
// dropped at random; the drop probability grows with the number of messages already kept in the map.
// Once the count reaches LCL_MSG_CACHE_LIMIT, the drop probability is 100% and no more messages are accepted.
constexpr int64_t LCL_MSG_CACHE_LIMIT = 4096;

class ObLCLMessage;
class ObDependencyResource;

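The comment above describes a three-band load-shedding policy: accept everything below half of LCL_MSG_CACHE_LIMIT, drop new messages with a probability that rises linearly between half and the full limit, and reject everything at the limit. A minimal sketch of that policy, assuming a std::mt19937 generator and a std::uniform_int_distribution<>(0, 100) like the members added to ObLCLBatchSenderThread further down:

#include <cstdint>
#include <ctime>
#include <random>

constexpr int64_t kCacheLimit = 4096;  // stands in for LCL_MSG_CACHE_LIMIT

// Returns true if a newly arriving message should be kept, given how many
// messages the map already holds. Mirrors the policy described above.
bool should_keep(int64_t msg_count, std::mt19937 &gen)
{
  std::uniform_int_distribution<> dist(0, 100);
  if (msg_count < kCacheLimit / 2) {
    return true;                                  // plenty of room: always keep
  } else if (msg_count < kCacheLimit) {
    const int64_t keep_threshold = kCacheLimit / 2;
    // 0% drop at the threshold, approaching 100% as the map fills up
    const int64_t drop_percentage = (msg_count - keep_threshold) * 100 / keep_threshold;
    return dist(gen) > drop_percentage;
  }
  return false;                                   // at the limit: always drop
}

// Usage: std::mt19937 gen(std::time(nullptr)); bool keep = should_keep(3000, gen);
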
@@ -105,6 +105,8 @@ public:
                   const KeyType2 &parent_key);
  template<typename KeyType>
  int set_timeout(const KeyType &key, const int64_t timeout);
  template<typename KeyType>
  int check_detector_exist(const KeyType &key, bool &exist);
  // unregister resource operation
  template<typename KeyType>
  int unregister_key(const KeyType &key);
@@ -281,6 +283,28 @@ int ObDeadLockDetectorMgr::register_key(const KeyType &key,
  return ret;
#undef PRINT_WRAPPER
}
template<typename KeyType>
int ObDeadLockDetectorMgr::check_detector_exist(const KeyType &key, bool &exist)
{
  CHECK_INIT();
  CHECK_ARGS(key);
#define PRINT_WRAPPER KR(ret), K(key)
  int ret = common::OB_SUCCESS;
  UserBinaryKey user_key;
  DetectorRefGuard ref_guard;
  if (OB_FAIL(user_key.set_user_key(key))) {
    DETECT_LOG(WARN, "user key serialization failed", PRINT_WRAPPER);
  } else if (OB_FAIL(get_detector_(user_key, ref_guard))) {
    if (OB_ENTRY_NOT_EXIST == ret) {
      exist = false;
      ret = OB_SUCCESS;
    }
  } else {
    exist = true;
  }
  return ret;
#undef PRINT_WRAPPER
}
// unregister a user-specified key
// unregister action means:
// 1. the detector instance associated with the user-specified key will be released
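The new check_detector_exist() treats a lookup miss as a normal answer rather than an error: OB_ENTRY_NOT_EXIST from get_detector_() is converted into exist = false plus OB_SUCCESS, and only unexpected failures propagate to the caller. A standalone sketch of that pattern; the map and error codes here are simplified stand-ins, not the OceanBase ones:

#include <map>
#include <string>

// Illustrative error codes; the real code uses common::OB_SUCCESS / OB_ENTRY_NOT_EXIST.
constexpr int kSuccess = 0;
constexpr int kEntryNotExist = -1;

// A miss is reported as (kSuccess, exist = false); only unexpected errors
// are returned to the caller, mirroring the new check_detector_exist().
int check_exist(const std::map<std::string, int> &detectors,
                const std::string &key, bool &exist)
{
  int ret = kSuccess;
  exist = false;
  if (detectors.find(key) == detectors.end()) {
    ret = kEntryNotExist;
  }
  if (kEntryNotExist == ret) {
    ret = kSuccess;       // the detector simply is not registered yet
  } else if (kSuccess == ret) {
    exist = true;
  }
  return ret;
}
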
@@ -295,7 +319,6 @@ template<typename KeyType>
int ObDeadLockDetectorMgr::unregister_key(const KeyType &key)
{
  CHECK_INIT();
  CHECK_ENABLED();
  CHECK_ARGS(key);
#define PRINT_WRAPPER KR(ret), K(key)
  int ret = common::OB_SUCCESS;

@@ -10,6 +10,7 @@
 * See the Mulan PubL v2 for more details.
 */

#include "lib/ob_errno.h"
#include "share/ob_occam_time_guard.h"
#include "ob_lcl_batch_sender_thread.h"
#include "lib/atomic/ob_atomic.h"
@@ -19,6 +20,8 @@
#include "ob_lcl_parameters.h"
#include "share/deadlock/ob_deadlock_arg_checker.h"
#include "share/deadlock/ob_deadlock_detector_rpc.h"
#include <cstdlib>
#include <exception>

namespace oceanbase
{
@@ -38,10 +41,15 @@ bool ObLCLBatchSenderThread::RemoveIfOp::operator()(const ObDependencyResource &
  int temp_ret = OB_SUCCESS;

  DETECT_TIME_GUARD(100_ms);
  if (OB_SUCCESS != (temp_ret = lcl_message_list_.push_back(lcl_msg))) {
  if (lcl_message_list_.count() >= LCL_MSG_CACHE_LIMIT) {
    temp_ret = OB_BUF_NOT_ENOUGH;
    ret = false;
    DETECT_LOG_RET(WARN, temp_ret, "LCL message fetch failed",
                   KR(temp_ret), K(lcl_msg));
  } else if (OB_SUCCESS != (temp_ret = lcl_message_list_.push_back(lcl_msg))) {
    ret = false;
    DETECT_LOG_RET(WARN, temp_ret, "push lcl message to lcl_message_list failed",
                   KR(temp_ret), K(lcl_msg));
                   KR(temp_ret), K(lcl_msg));
  }
  return ret;
}
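RemoveIfOp is the functor handed to lcl_msg_map_.remove_if(): it copies each cached message into a local list and returns true so the entry is removed from the map, but once the list has reached LCL_MSG_CACHE_LIMIT it reports OB_BUF_NOT_ENOUGH and returns false so the entry stays put. A standalone sketch of that drain-with-cap shape, with std::vector standing in for ObArray and the key/logging details omitted:

#include <cstdint>
#include <vector>

constexpr int64_t kCacheLimit = 4096;  // stands in for LCL_MSG_CACHE_LIMIT

struct Message { /* payload omitted */ };

// Functor in the spirit of RemoveIfOp: copy the entry into `out` and return true
// (remove it from the map), unless the local buffer is already full, in which
// case return false so the entry stays in the map for the next round.
struct DrainWithCap {
  explicit DrainWithCap(std::vector<Message> &out) : out_(out) {}
  bool operator()(const Message &msg)
  {
    if (static_cast<int64_t>(out_.size()) >= kCacheLimit) {
      return false;                 // buffer full: keep the entry in the map
    }
    out_.push_back(msg);            // drained entries will be sent by the caller
    return true;
  }
  std::vector<Message> &out_;
};
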
@@ -64,7 +72,7 @@ int ObLCLBatchSenderThread::init()
  int ret = OB_SUCCESS;
  if (OB_FAIL(share::ObThreadPool::init())) {
    DETECT_LOG(WARN, "init thread failed", K(ret), KP(this), K(MTL_ID()));
  } else if (OB_FAIL(lcl_msg_map_.init(MEMORY_LABEL, MTL_ID()))) {
  } else if (OB_FAIL(lcl_msg_map_.init("LCLSender", MTL_ID()))) {
    DETECT_LOG(WARN, "init thread failed", K(ret), KP(this), K(MTL_ID()));
  } else {
    is_inited_ = true;
@@ -85,28 +93,64 @@ int ObLCLBatchSenderThread::start()
  return ret;
}

int ObLCLBatchSenderThread::cache_msg(const ObDependencyResource &key,
                                      const ObLCLMessage &lcl_msg)
int ObLCLBatchSenderThread::cache_msg(const ObDependencyResource &key, const ObLCLMessage &lcl_msg)
{
  CHECK_INIT_AND_START();
#define PRINT_WRAPPER KR(ret), K(key), K(lcl_msg)
#define PRINT_WRAPPER KR(ret), K(key), K(lcl_msg), K(can_insert), K(random_drop_percentage)
  int ret = OB_SUCCESS;
  CHECK_INIT_AND_START();
  ObLCLBatchSenderThread::MergeOp op(lcl_msg);

  int64_t random_drop_percentage = 0;
  DETECT_TIME_GUARD(100_ms);
  do {
  int64_t msg_count = lcl_msg_map_.count();
  bool can_insert = false;
  if (msg_count < LCL_MSG_CACHE_LIMIT / 2) {// always keep
    can_insert = true;
  } else if (msg_count < LCL_MSG_CACHE_LIMIT) {// random drop
    int64_t keep_threshold = LCL_MSG_CACHE_LIMIT / 2;
    // the more messages are kept, the higher the probability of dropping a newly appended one;
    // once LCL_MSG_CACHE_LIMIT is reached, the message is definitely dropped
    random_drop_percentage = (msg_count - keep_threshold) * 100 / keep_threshold;
    can_insert = distribution_(random_generator_) > random_drop_percentage;
  } else {// always drop
    can_insert = false;
    random_drop_percentage = 100;
  }
  if (OB_FAIL(insert_or_merge_(key, lcl_msg, can_insert))) {
    DETECT_LOG(WARN, "lcl message is dropped", PRINT_WRAPPER);
  }
  return ret;
#undef PRINT_WRAPPER
}

int ObLCLBatchSenderThread::insert_or_merge_(const ObDependencyResource &key,
                                             const ObLCLMessage &lcl_message,
                                             const bool can_insert)
{
#define PRINT_WRAPPER KR(ret), K(key), K(lcl_message), K(can_insert), K(msg_count)
  DETECT_TIME_GUARD(100_ms);
  int ret = OB_SUCCESS;
  ObLCLBatchSenderThread::MergeOp op(lcl_message);
  int64_t msg_count = lcl_msg_map_.count();
  do {// there may be a concurrency problem, so retry until success or an unrecoverable failure
    if (OB_SUCCESS != ret) {
      DETECT_LOG(INFO, "try again", PRINT_WRAPPER);
    }
    if (OB_SUCC(lcl_msg_map_.insert(key, lcl_msg))) {
      // do nothing
    } else if (OB_ENTRY_EXIST != ret) {
      DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
      break;
    } else if (OB_SUCC(lcl_msg_map_.operate(key, op))) {
      // do nothing
    } else if (OB_ENTRY_NOT_EXIST != ret) {
      DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
    if (can_insert) {// try to insert first; if the entry exists, try to merge-update instead
      if (OB_SUCC(lcl_msg_map_.insert(key, lcl_message))) {
      } else if (OB_ENTRY_EXIST != ret) {
        DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
        break;
      } else if (OB_SUCC(lcl_msg_map_.operate(key, op))) {
      } else if (OB_ENTRY_NOT_EXIST != ret) {
        DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
      }
    } else {// just try to merge-update
      if (OB_FAIL(lcl_msg_map_.operate(key, op))) {
        if (OB_ENTRY_NOT_EXIST == ret) {
          ret = OB_BUF_NOT_ENOUGH;
        }
      }
      break;// no retry, regardless of success
    }
  } while (CLICK() && (OB_ENTRY_NOT_EXIST == ret) && ATOMIC_LOAD(&is_running_));
  return ret;
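insert_or_merge_() races with the sender thread, which drains entries concurrently: an insert can fail with OB_ENTRY_EXIST, and the follow-up merge can then fail with OB_ENTRY_NOT_EXIST if the entry was removed in between, so the loop retries until one of the two paths succeeds or an unrecoverable error appears. The sketch below shows only the insert-then-merge shape over a mutex-guarded std::map; the retry disappears there because the lock removes the race that ObLinearHashMap has to tolerate:

#include <map>
#include <mutex>
#include <string>

struct Msg { long send_ts = 0; };

// Keep the newer of two messages for the same key; stands in for MergeOp.
static void merge(Msg &stored, const Msg &incoming)
{
  if (incoming.send_ts > stored.send_ts) {
    stored = incoming;
  }
}

// Insert first; if the key already exists, merge into the stored value.
// The real code interleaves with concurrent removal, which is why it retries
// on "entry not exist" after a failed merge.
void insert_or_merge(std::map<std::string, Msg> &cache, std::mutex &lock,
                     const std::string &key, const Msg &msg)
{
  std::lock_guard<std::mutex> guard(lock);
  auto result = cache.insert({key, msg});
  if (!result.second) {        // entry already exists: merge instead
    merge(result.first->second, msg);
  }
}
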
@@ -149,18 +193,16 @@ void ObLCLBatchSenderThread::record_summary_info_and_logout_when_necessary_(int6
  total_busy_time_ += diff;

  if (total_record_time_ > 5L * 1000L * 1000L) {// 5s
    double duty_ratio = double(total_busy_time_) / total_record_time_ * 100;
    int duty_ratio_percentage = double(total_busy_time_) / total_record_time_ * 100;
    int64_t total_constructed_detector = ATOMIC_LOAD(&ObIDeadLockDetector::total_constructed_count);
    int64_t total_destructed_detector = ATOMIC_LOAD(&ObIDeadLockDetector::total_destructed_count);
    int64_t total_alived_detector = total_constructed_detector - total_destructed_detector;
    DETECT_LOG(INFO, "ObLCLBatchSenderThread periodic report summary info",
    DETECT_LOG(INFO, "ObLCLBatchSenderThread periodic report summary info", K(duty_ratio_percentage),
               K(total_constructed_detector), K(total_destructed_detector),
               K(total_alived_detector), K(duty_ratio),
               K(int64_t(ObServerConfig::get_instance()._lcl_op_interval)), K(*this));
               K(total_alived_detector), K(_lcl_op_interval), K(lcl_msg_map_.count()), K(*this));
    total_record_time_ = 0;
    total_busy_time_ = 0;
    over_night_times_ = 0;
    duty_ratio = 0;
  }
}

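The summary log now carries an integer duty-ratio percentage instead of a double: busy time divided by the roughly 5 s record window, scaled to 0..100 and truncated. A small sketch of that calculation with illustrative names:

#include <cstdint>
#include <cstdio>

// Busy/elapsed ratio over a record window, reported as a whole percentage.
// The double intermediate avoids int64 truncation before the *100 scaling.
int duty_ratio_percentage(int64_t busy_us, int64_t total_us)
{
  if (total_us <= 0) {
    return 0;
  }
  return static_cast<int>(static_cast<double>(busy_us) / total_us * 100);
}

int main()
{
  // e.g. 1.2 s busy within a 5 s window -> 24%
  std::printf("%d%%\n", duty_ratio_percentage(1200000, 5000000));
  return 0;
}
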
@@ -173,6 +215,7 @@ void ObLCLBatchSenderThread::run1()
  int64_t diff = 0;

  ObArray<ObLCLMessage> mock_lcl_message_list;
  mock_lcl_message_list.set_label("LCLArray");
  ObLCLBatchSenderThread::RemoveIfOp op(mock_lcl_message_list);
  lib::set_thread_name("LCLSender");
  while(ATOMIC_LOAD(&is_running_)) {
@@ -190,17 +233,21 @@ void ObLCLBatchSenderThread::run1()
    DETECT_TIME_GUARD(50_ms < _lcl_op_interval ? 50_ms : _lcl_op_interval);
    begin_ts = ObClockGenerator::getRealClock();
    mock_lcl_message_list.reset();
    if (OB_FAIL(lcl_msg_map_.remove_if(op))) {
      DETECT_LOG(WARN, "can't fill mock_lcl_message_list", KR(ret));
    }
    CLICK();
    for (int64_t idx = 0; idx < mock_lcl_message_list.count(); ++idx) {
      const ObLCLMessage &msg = mock_lcl_message_list.at(idx);
      if (OB_FAIL(mgr_->get_rpc().post_lcl_message(msg.get_addr(), msg))) {
        DETECT_LOG(WARN, "send LCL msg failed", KR(ret), K(msg));
        CLICK();
      } else {
        DETECT_LOG(DEBUG, "send LCL msg success", K(msg));
    if (ATOMIC_LOAD(&allow_send_)) {
      if (OB_FAIL(lcl_msg_map_.remove_if(op))) {
        DETECT_LOG(WARN, "can't fill mock_lcl_message_list", KR(ret));
        lcl_msg_map_.reset();// if fetching failed, remove all cached entries
      }
      CLICK();
      for (int64_t idx = 0; idx < mock_lcl_message_list.count(); ++idx) {
        const ObLCLMessage &msg = mock_lcl_message_list.at(idx);
        if (OB_ISNULL(mgr_)) {
        } else if (OB_FAIL(mgr_->get_rpc().post_lcl_message(msg.get_addr(), msg))) {
          DETECT_LOG(WARN, "send LCL msg failed", KR(ret), K(msg));
          CLICK();
        } else {
          DETECT_LOG(DEBUG, "send LCL msg success", K(msg));
        }
      }
    }
  }

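The reworked loop gates the whole drain-and-send cycle behind allow_send_ (the flag the header marks as a unit-test hook), null-checks mgr_ before posting, and clears the map if draining fails so stale messages cannot accumulate. The overall shape is: periodically move everything from the shared map into a local batch, then post each message over RPC outside the map. A standalone sketch of that batching loop, with the map, RPC, and flags reduced to simple stand-ins:

#include <atomic>
#include <chrono>
#include <map>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

struct Msg { std::string addr; };

std::mutex g_lock;
std::map<std::string, Msg> g_cache;          // stands in for lcl_msg_map_
std::atomic<bool> g_running{true};
std::atomic<bool> g_allow_send{true};        // unit tests can switch sending off

// Pretend RPC; the real code posts through mgr_->get_rpc().
static void post(const Msg &) {}

void sender_loop()
{
  std::vector<Msg> batch;                    // stands in for mock_lcl_message_list
  while (g_running.load()) {
    batch.clear();
    if (g_allow_send.load()) {
      {
        std::lock_guard<std::mutex> guard(g_lock);
        for (auto &kv : g_cache) {           // drain the shared map into the local batch
          batch.push_back(kv.second);
        }
        g_cache.clear();
      }
      for (const Msg &msg : batch) {         // send outside the lock
        post(msg);
      }
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
  }
}
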
@@ -20,6 +20,8 @@
#include "lib/container/ob_array.h"
#include "lib/hash/ob_linear_hash_map.h"
#include "share/deadlock/ob_deadlock_detector_common_define.h"
#include <random>
#include <ctime>

namespace oceanbase
{
@@ -31,14 +33,19 @@ class ObDeadLockDetectorMgr;

class ObLCLBatchSenderThread : public share::ObThreadPool
{
  using RandomGenerator = std::mt19937;// high-quality random generator recommended by cppreference
  using RandomDistribution = std::uniform_int_distribution<>;// random range
public:
  ObLCLBatchSenderThread(ObDeadLockDetectorMgr *mgr) :
    is_inited_(false),
    is_running_(false),
    allow_send_(true),
    total_record_time_(0),
    total_busy_time_(0),
    over_night_times_(0),
    mgr_(mgr) {}
    mgr_(mgr),
    random_generator_(std::time(nullptr)),
    distribution_(0, 100) {}
  ~ObLCLBatchSenderThread() { destroy(); }
  int init();
  int start();
@@ -47,8 +54,7 @@ public:
  void destroy();
  void run1();
public:
  int cache_msg(const ObDependencyResource &key,
                const ObLCLMessage &lcl_msg);
  int cache_msg(const ObDependencyResource &key, const ObLCLMessage &lcl_msg);
  TO_STRING_KV(KP(this), K_(is_inited), K_(is_running), K_(total_record_time), K_(over_night_times));
private:
  class RemoveIfOp
@@ -68,16 +74,22 @@ private:
    const ObLCLMessage &lcl_message_;
  };
private:
  int insert_or_merge_(const ObDependencyResource &key,
                       const ObLCLMessage &lcl_message,
                       const bool can_insert);
  int64_t update_and_get_lcl_op_interval_();
  void record_summary_info_and_logout_when_necessary_(int64_t, int64_t, int64_t);
private:
  bool is_inited_;
  bool is_running_;
  bool allow_send_; // used by unit tests to mock sending
  int64_t total_record_time_;
  int64_t total_busy_time_;
  int64_t over_night_times_;
  ObDeadLockDetectorMgr* mgr_;
  common::ObLinearHashMap<ObDependencyResource, ObLCLMessage> lcl_msg_map_;
  RandomGenerator random_generator_;
  RandomDistribution distribution_;
};

}

@@ -496,9 +496,7 @@ int ObLCLNode::broadcast_(const BlockList &list,
                          lclv,
                          public_label,
                          ObClockGenerator::getRealClock());
    if (CLICK() && OB_FAIL(MTL(ObDeadLockDetectorMgr*)->sender_thread_.cache_msg(list.at(idx), msg))) {
      DETECT_LOG_(WARN, "cache message failed", KR(ret), K(msg), K(list), K(list), K(*this), K(lbt()));
    }
    MTL(ObDeadLockDetectorMgr*)->sender_thread_.cache_msg(list.at(idx), msg);
  }

  return ret;
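With the new cache_msg(), being dropped under load is an expected outcome rather than a failure, so broadcast_() now calls it best-effort and no longer turns a drop into an error for the whole broadcast. A tiny sketch of that pattern, with a hypothetical cache_msg() stand-in:

#include <cstdio>

// Hypothetical stand-in: returns a non-zero code when the message is shed under load.
static int cache_msg(int key) { return (key % 2 == 0) ? 0 : -4; }

// Best-effort enqueue: a dropped message is tolerated by design, so the return
// code is deliberately ignored and the caller's own ret stays untouched.
int broadcast(const int *keys, int count)
{
  int ret = 0;
  for (int i = 0; i < count; ++i) {
    (void)cache_msg(keys[i]);   // drop under load is acceptable; do not fail the broadcast
  }
  return ret;
}

int main()
{
  const int keys[] = {1, 2, 3};
  std::printf("ret=%d\n", broadcast(keys, 3));
  return 0;
}
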
@@ -843,7 +841,7 @@ int ObLCLNode::push_state_to_downstreams_with_lock_()
void ObLCLNode::update_lcl_period_if_necessary_with_lock_()
{
  int ret = OB_SUCCESS;
  DETECT_TIME_GUARD(100_us);
  DETECT_TIME_GUARD(10_ms);
  int64_t current_ts = ObClockGenerator::getRealClock();
  int64_t new_period_ = current_ts / PERIOD;
  int64_t timeout_ts = 0;
@@ -869,7 +867,7 @@ bool ObLCLNode::if_phase_match_(const int64_t ts,
  int64_t my_phase = ts / PHASE_TIME;
  int64_t msg_phase = msg.get_send_ts() / PHASE_TIME;

  DETECT_TIME_GUARD(100_us);
  DETECT_TIME_GUARD(10_ms);
  if (my_phase != msg_phase) {
    ret = false;
  }
