[DeadLock] fix detector bug

This commit is contained in:
obdev
2023-02-20 12:47:38 +00:00
committed by ob-robot
parent 2e954397ec
commit b1faf451eb
13 changed files with 428 additions and 86 deletions

View File

@ -211,8 +211,9 @@ ObDependencyResource& ObDependencyResource::operator=(const ObDependencyResource
// Hash of this dependency resource.
// The value is built only from the members' own hash() results: addr_.hash() is used as the
// murmurhash seed, and the 64-bit value returned by user_key_.hash() is the data being hashed.
// @return combined hash of addr_ and user_key_
uint64_t ObDependencyResource::hash() const
{
  // seed with the address hash
  uint64_t hash_val = addr_.hash();
  // fold in the user key through its own hash() value
  const uint64_t key_hash = user_key_.hash();
  hash_val = murmurhash(&key_hash, sizeof(key_hash), hash_val);
  return hash_val;
}
@ -225,6 +226,8 @@ bool ObDependencyResource::operator<(const ObDependencyResource &rhs) const
{
if (addr_ < rhs.addr_) {
return true;
} else if (addr_ > rhs.addr_) {
return false;
} else {
if (user_key_ < rhs.user_key_) {
return true;

View File

@ -32,6 +32,11 @@ namespace share
{
namespace detector
{
// Admission policy for the LCL message cache, based on how many messages the map currently holds:
// - fewer than LCL_MSG_CACHE_LIMIT/2: every new message is accepted;
// - at least LCL_MSG_CACHE_LIMIT/2 but fewer than LCL_MSG_CACHE_LIMIT: new messages are dropped at random,
//   and the drop probability grows with the number of messages already kept in the map;
// - LCL_MSG_CACHE_LIMIT or more: the drop probability is 100%, no more new messages are accepted.
constexpr int64_t LCL_MSG_CACHE_LIMIT = 4096;
class ObLCLMessage;
class ObDependencyResource;

View File

@ -105,6 +105,8 @@ public:
const KeyType2 &parent_key);
template<typename KeyType>
int set_timeout(const KeyType &key, const int64_t timeout);
template<typename KeyType>
int check_detector_exist(const KeyType &key, bool &exist);
// unregister resource operation
template<typename KeyType>
int unregister_key(const KeyType &key);
@ -281,6 +283,28 @@ int ObDeadLockDetectorMgr::register_key(const KeyType &key,
return ret;
#undef PRINT_WRAPPER
}
// Check whether a detector is currently registered for a user specified key.
// @param [in]  key   user specified key; it is serialized into a UserBinaryKey for the lookup
// @param [out] exist true only when get_detector_() finds a detector for the key;
//                    false when the key is not found or when any step fails
// @return OB_SUCCESS when the lookup finished (the key being absent is not an error),
//         otherwise the error from key serialization or from get_detector_().
//         NOTE(review): CHECK_INIT()/CHECK_ARGS() are macros defined elsewhere and presumably
//         return early with their own error code — confirm.
template<typename KeyType>
int ObDeadLockDetectorMgr::check_detector_exist(const KeyType &key, bool &exist)
{
  // give the out-parameter a defined value on every path, including early returns
  exist = false;
  CHECK_INIT();
  CHECK_ARGS(key);
  #define PRINT_WRAPPER KR(ret), K(key)
  int ret = common::OB_SUCCESS;
  UserBinaryKey user_key;
  DetectorRefGuard ref_guard;
  if (OB_FAIL(user_key.set_user_key(key))) {
    DETECT_LOG(WARN, "user key serialization failed", PRINT_WRAPPER);
  } else if (OB_FAIL(get_detector_(user_key, ref_guard))) {
    if (OB_ENTRY_NOT_EXIST == ret) {
      // "not registered" is a valid answer, not a failure
      ret = OB_SUCCESS;
    } else {
      DETECT_LOG(WARN, "get detector failed", PRINT_WRAPPER);
    }
  } else {
    exist = true;
  }
  return ret;
  #undef PRINT_WRAPPER
}
// unregister a user specified key
// unregister action means:
// 1. the detector instance associated with user specified key will be released
@ -295,7 +319,6 @@ template<typename KeyType>
int ObDeadLockDetectorMgr::unregister_key(const KeyType &key)
{
CHECK_INIT();
CHECK_ENABLED();
CHECK_ARGS(key);
#define PRINT_WRAPPER KR(ret), K(key)
int ret = common::OB_SUCCESS;

View File

@ -10,6 +10,7 @@
* See the Mulan PubL v2 for more details.
*/
#include "lib/ob_errno.h"
#include "share/ob_occam_time_guard.h"
#include "ob_lcl_batch_sender_thread.h"
#include "lib/atomic/ob_atomic.h"
@ -19,6 +20,8 @@
#include "ob_lcl_parameters.h"
#include "share/deadlock/ob_deadlock_arg_checker.h"
#include "share/deadlock/ob_deadlock_detector_rpc.h"
#include <cstdlib>
#include <exception>
namespace oceanbase
{
@ -38,10 +41,15 @@ bool ObLCLBatchSenderThread::RemoveIfOp::operator()(const ObDependencyResource &
int temp_ret = OB_SUCCESS;
DETECT_TIME_GUARD(100_ms);
if (OB_SUCCESS != (temp_ret = lcl_message_list_.push_back(lcl_msg))) {
if (lcl_message_list_.count() >= LCL_MSG_CACHE_LIMIT) {
temp_ret = OB_BUF_NOT_ENOUGH;
ret = false;
DETECT_LOG_RET(WARN, temp_ret, "LCL message fetch failed",
KR(temp_ret), K(lcl_msg));
} else if (OB_SUCCESS != (temp_ret = lcl_message_list_.push_back(lcl_msg))) {
ret = false;
DETECT_LOG_RET(WARN, temp_ret, "push lcl message to lcl_message_list failed",
KR(temp_ret), K(lcl_msg));
KR(temp_ret), K(lcl_msg));
}
return ret;
}
@ -64,7 +72,7 @@ int ObLCLBatchSenderThread::init()
int ret = OB_SUCCESS;
if (OB_FAIL(share::ObThreadPool::init())) {
DETECT_LOG(WARN, "init thread failed", K(ret), KP(this), K(MTL_ID()));
} else if (OB_FAIL(lcl_msg_map_.init(MEMORY_LABEL, MTL_ID()))) {
} else if (OB_FAIL(lcl_msg_map_.init("LCLSender", MTL_ID()))) {
DETECT_LOG(WARN, "init thread failed", K(ret), KP(this), K(MTL_ID()));
} else {
is_inited_ = true;
@ -85,28 +93,64 @@ int ObLCLBatchSenderThread::start()
return ret;
}
int ObLCLBatchSenderThread::cache_msg(const ObDependencyResource &key,
const ObLCLMessage &lcl_msg)
int ObLCLBatchSenderThread::cache_msg(const ObDependencyResource &key, const ObLCLMessage &lcl_msg)
{
CHECK_INIT_AND_START();
#define PRINT_WRAPPER KR(ret), K(key), K(lcl_msg)
#define PRINT_WRAPPER KR(ret), K(key), K(lcl_msg), K(can_insert), K(random_drop_percentage)
int ret = OB_SUCCESS;
CHECK_INIT_AND_START();
ObLCLBatchSenderThread::MergeOp op(lcl_msg);
int64_t random_drop_percentage = 0;
DETECT_TIME_GUARD(100_ms);
do {
int64_t msg_count = lcl_msg_map_.count();
bool can_insert = false;
if (msg_count < LCL_MSG_CACHE_LIMIT / 2) {// always keep
can_insert = true;
} else if (msg_count < LCL_MSG_CACHE_LIMIT) {// random drop
int64_t keep_threshold = LCL_MSG_CACHE_LIMIT / 2;
// more keeping messages means higher probability to drop new appended one
// if reach LCL_MSG_CACHE_LIMIT, definitely drop
random_drop_percentage = (msg_count - keep_threshold) * 100 / keep_threshold;
can_insert = distribution_(random_generator_) > random_drop_percentage;
} else {// always drop
can_insert = false;
random_drop_percentage = 100;
}
if (OB_FAIL(insert_or_merge_(key, lcl_msg, can_insert))) {
DETECT_LOG(WARN, "lcl message is droped", PRINT_WRAPPER);
}
return ret;
#undef PRINT_WRAPPER
}
int ObLCLBatchSenderThread::insert_or_merge_(const ObDependencyResource &key,
const ObLCLMessage &lcl_message,
const bool can_insert)
{
#define PRINT_WRAPPER KR(ret), K(key), K(lcl_message), K(can_insert), K(msg_count)
DETECT_TIME_GUARD(100_ms);
int ret = OB_SUCCESS;
ObLCLBatchSenderThread::MergeOp op(lcl_message);
int64_t msg_count = lcl_msg_map_.count();
do {// there may be concurrent problem, so need retry until success or meet can't handle failure
if (OB_SUCCESS != ret) {
DETECT_LOG(INFO, "try again", PRINT_WRAPPER);
}
if (OB_SUCC(lcl_msg_map_.insert(key, lcl_msg))) {
// do nothing
} else if (OB_ENTRY_EXIST != ret) {
DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
break;
} else if (OB_SUCC(lcl_msg_map_.operate(key, op))) {
// do nothing
} else if (OB_ENTRY_NOT_EXIST != ret) {
DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
if (can_insert) {// try insert first, if exist, try update merge then
if (OB_SUCC(lcl_msg_map_.insert(key, lcl_message))) {
} else if (OB_ENTRY_EXIST != ret) {
DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
break;
} else if (OB_SUCC(lcl_msg_map_.operate(key, op))) {
} else if (OB_ENTRY_NOT_EXIST != ret) {
DETECT_LOG(WARN, "this error can't handle", PRINT_WRAPPER);
}
} else {// just try update merge
if (OB_FAIL(lcl_msg_map_.operate(key, op))) {
if (OB_ENTRY_NOT_EXIST == ret) {
ret = OB_BUF_NOT_ENOUGH;
}
}
break;// no matter success or not, no retry
}
} while (CLICK() && (OB_ENTRY_NOT_EXIST == ret) && ATOMIC_LOAD(&is_running_));
return ret;
@ -149,18 +193,16 @@ void ObLCLBatchSenderThread::record_summary_info_and_logout_when_necessary_(int6
total_busy_time_ += diff;
if (total_record_time_ > 5L * 1000L * 1000L) {// 5s
double duty_ratio = double(total_busy_time_) / total_record_time_ * 100;
int duty_ratio_percentage = double(total_busy_time_) / total_record_time_ * 100;
int64_t total_constructed_detector = ATOMIC_LOAD(&ObIDeadLockDetector::total_constructed_count);
int64_t total_destructed_detector = ATOMIC_LOAD(&ObIDeadLockDetector::total_destructed_count);
int64_t total_alived_detector = total_constructed_detector - total_destructed_detector;
DETECT_LOG(INFO, "ObLCLBatchSenderThread periodic report summary info",
DETECT_LOG(INFO, "ObLCLBatchSenderThread periodic report summary info", K(duty_ratio_percentage),
K(total_constructed_detector), K(total_destructed_detector),
K(total_alived_detector), K(duty_ratio),
K(int64_t(ObServerConfig::get_instance()._lcl_op_interval)), K(*this));
K(total_alived_detector), K(_lcl_op_interval), K(lcl_msg_map_.count()), K(*this));
total_record_time_ = 0;
total_busy_time_ = 0;
over_night_times_ = 0;
duty_ratio = 0;
}
}
@ -173,6 +215,7 @@ void ObLCLBatchSenderThread::run1()
int64_t diff = 0;
ObArray<ObLCLMessage> mock_lcl_message_list;
mock_lcl_message_list.set_label("LCLArray");
ObLCLBatchSenderThread::RemoveIfOp op(mock_lcl_message_list);
lib::set_thread_name("LCLSender");
while(ATOMIC_LOAD(&is_running_)) {
@ -190,17 +233,21 @@ void ObLCLBatchSenderThread::run1()
DETECT_TIME_GUARD(50_ms < _lcl_op_interval ? 50_ms : _lcl_op_interval);
begin_ts = ObClockGenerator::getRealClock();
mock_lcl_message_list.reset();
if (OB_FAIL(lcl_msg_map_.remove_if(op))) {
DETECT_LOG(WARN, "can't fill mock_lcl_message_list", KR(ret));
}
CLICK();
for (int64_t idx = 0; idx < mock_lcl_message_list.count(); ++idx) {
const ObLCLMessage &msg = mock_lcl_message_list.at(idx);
if (OB_FAIL(mgr_->get_rpc().post_lcl_message(msg.get_addr(), msg))) {
DETECT_LOG(WARN, "send LCL msg failed", KR(ret), K(msg));
CLICK();
} else {
DETECT_LOG(DEBUG, "send LCL msg success", K(msg));
if (ATOMIC_LOAD(&allow_send_)) {
if (OB_FAIL(lcl_msg_map_.remove_if(op))) {
DETECT_LOG(WARN, "can't fill mock_lcl_message_list", KR(ret));
lcl_msg_map_.reset();// if fetch failed, remove all
}
CLICK();
for (int64_t idx = 0; idx < mock_lcl_message_list.count(); ++idx) {
const ObLCLMessage &msg = mock_lcl_message_list.at(idx);
if (OB_ISNULL(mgr_)) {
} else if (OB_FAIL(mgr_->get_rpc().post_lcl_message(msg.get_addr(), msg))) {
DETECT_LOG(WARN, "send LCL msg failed", KR(ret), K(msg));
CLICK();
} else {
DETECT_LOG(DEBUG, "send LCL msg success", K(msg));
}
}
}
}

View File

@ -20,6 +20,8 @@
#include "lib/container/ob_array.h"
#include "lib/hash/ob_linear_hash_map.h"
#include "share/deadlock/ob_deadlock_detector_common_define.h"
#include <random>
#include <ctime>
namespace oceanbase
{
@ -31,14 +33,19 @@ class ObDeadLockDetectorMgr;
class ObLCLBatchSenderThread : public share::ObThreadPool
{
using RandomGenerator = std::mt19937;// high-quality random generator recommended by cppreference
using RandomDistribution = std::uniform_int_distribution<>;// uniform integer distribution (default result type int); the range is chosen where the member is constructed
public:
ObLCLBatchSenderThread(ObDeadLockDetectorMgr *mgr) :
is_inited_(false),
is_running_(false),
allow_send_(true),
total_record_time_(0),
total_busy_time_(0),
over_night_times_(0),
mgr_(mgr) {}
mgr_(mgr),
random_generator_(std::time(nullptr)),
distribution_(0, 100) {}
~ObLCLBatchSenderThread() { destroy(); }
int init();
int start();
@ -47,8 +54,7 @@ public:
void destroy();
void run1();
public:
int cache_msg(const ObDependencyResource &key,
const ObLCLMessage &lcl_msg);
int cache_msg(const ObDependencyResource &key, const ObLCLMessage &lcl_msg);
TO_STRING_KV(KP(this), K_(is_inited), K_(is_running), K_(total_record_time), K_(over_night_times));
private:
class RemoveIfOp
@ -68,16 +74,22 @@ private:
const ObLCLMessage &lcl_message_;
};
private:
int insert_or_merge_(const ObDependencyResource &key,
const ObLCLMessage &lcl_message,
const bool can_insert);
int64_t update_and_get_lcl_op_interval_();
void record_summary_info_and_logout_when_necessary_(int64_t, int64_t, int64_t);
private:
bool is_inited_;
bool is_running_;
bool allow_send_; // for unittest mock used
int64_t total_record_time_;
int64_t total_busy_time_;
int64_t over_night_times_;
ObDeadLockDetectorMgr* mgr_;
common::ObLinearHashMap<ObDependencyResource, ObLCLMessage> lcl_msg_map_;
RandomGenerator random_generator_;
RandomDistribution distribution_;
};
}

View File

@ -496,9 +496,7 @@ int ObLCLNode::broadcast_(const BlockList &list,
lclv,
public_label,
ObClockGenerator::getRealClock());
if (CLICK() && OB_FAIL(MTL(ObDeadLockDetectorMgr*)->sender_thread_.cache_msg(list.at(idx), msg))) {
DETECT_LOG_(WARN, "cache message failed", KR(ret), K(msg), K(list), K(list), K(*this), K(lbt()));
}
MTL(ObDeadLockDetectorMgr*)->sender_thread_.cache_msg(list.at(idx), msg);
}
return ret;
@ -843,7 +841,7 @@ int ObLCLNode::push_state_to_downstreams_with_lock_()
void ObLCLNode::update_lcl_period_if_necessary_with_lock_()
{
int ret = OB_SUCCESS;
DETECT_TIME_GUARD(100_us);
DETECT_TIME_GUARD(10_ms);
int64_t current_ts = ObClockGenerator::getRealClock();
int64_t new_period_ = current_ts / PERIOD;
int64_t timeout_ts = 0;
@ -869,7 +867,7 @@ bool ObLCLNode::if_phase_match_(const int64_t ts,
int64_t my_phase = ts / PHASE_TIME;
int64_t msg_phase = msg.get_send_ts() / PHASE_TIME;
DETECT_TIME_GUARD(100_us);
DETECT_TIME_GUARD(10_ms);
if (my_phase != msg_phase) {
ret = false;
}