Fix I/O hang and amplification

This commit is contained in:
obdev 2024-12-20 07:45:32 +00:00 committed by ob-robot
parent 24ef025b73
commit 451044fb83
6 changed files with 32 additions and 3 deletions

View File

@ -66,7 +66,17 @@ int QSchedCallback::handle(TCRequest* tc_req)
LOG_INFO("submit_request cost too much time", K(ret), K(time_guard), K(req));
}
if (OB_FAIL(ret)) {
io_req_finish(req, ObIORetCode(ret));
if (ret == OB_EAGAIN) {
if (REACH_TIME_INTERVAL(1 * 1000L * 1000L)) {
LOG_INFO("device channel eagain", K(ret));
}
if (OB_FAIL(req.retry_io())) {
LOG_WARN("retry io failed", K(ret), K(req));
io_req_finish(req, ObIORetCode(ret));
}
} else {
io_req_finish(req, ObIORetCode(ret));
}
}
req.dec_ref("phyqueue_dec"); // ref for io queue
return ret;

View File

@ -1514,6 +1514,17 @@ int ObIORequest::re_prepare()
return ret;
}
int ObIORequest::retry_io()
{
int ret = OB_SUCCESS;
if(OB_ISNULL(tenant_io_mgr_.get_ptr())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("tenant io mgr is null", K(ret), K(*this));
} else if (OB_FAIL(tenant_io_mgr_.get_ptr()->retry_io(*this))) {
LOG_WARN("retry io failed", K(ret), K(*this));
}
return ret;
}
int ObIORequest::try_alloc_buf_until_timeout(char *&io_buf)
{
int ret = OB_SUCCESS;

View File

@ -619,6 +619,7 @@ public:
int prepare(char *next_buffer = nullptr, int64_t next_size = 0, int64_t next_offset = 0);
int recycle_buffer();
int re_prepare();
int retry_io();
int try_alloc_buf_until_timeout(char *&io_buf);
bool can_callback() const;
void free_io_buffer();

View File

@ -1936,6 +1936,10 @@ int ObTenantIOManager::retry_io(ObIORequest &req)
} else if (OB_UNLIKELY(!is_working())) {
ret = OB_STATE_NOT_MATCH;
LOG_WARN("tenant not working", K(ret), K(tenant_id_));
} else if (GCONF._enable_tree_based_io_scheduler) {
if (OB_FAIL(qsched_.schedule_request(req))) {
LOG_WARN("retry io request failed", K(ret), K(req));
}
} else if (OB_FAIL(io_scheduler_->retry_request(req))) {
LOG_WARN("retry io request into sender failed", K(ret), K(req));
}

View File

@ -2134,7 +2134,9 @@ int ObAsyncIOChannel::submit(ObIORequest &req)
} else if (OB_UNLIKELY(device_handle_ != req.fd_.device_handle_)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("invalid argument", K(ret), K(req), KP(device_handle_));
} else if (submit_count_ >= MAX_AIO_EVENT_CNT) {
} else if (OB_ISNULL(req.io_result_)) {
ret = OB_INVALID_ARGUMENT;
} else if ((!req.get_flag().is_detect()) && (submit_count_ >= MAX_AIO_EVENT_CNT - MAX_DETECT_DISK_HUNG_IO_CNT)) {
ret = OB_EAGAIN;
if (REACH_TIME_INTERVAL(1000000L)) {
LOG_WARN("too many io requests", K(ret), K(submit_count_));
@ -2142,7 +2144,7 @@ int ObAsyncIOChannel::submit(ObIORequest &req)
} else if (OB_UNLIKELY(current_ts > req.timeout_ts())) {
ret = OB_TIMEOUT;
LOG_WARN("io timeout because current time is larger than timeout timestamp", K(ret), K(current_ts), K(req));
} else if (device_channel_->used_io_depth_ > device_channel_->max_io_depth_) {
} else if ((!req.get_flag().is_detect()) && (device_channel_->used_io_depth_ > device_channel_->max_io_depth_ - MAX_DETECT_DISK_HUNG_IO_CNT)) {
ret = OB_EAGAIN;
FLOG_INFO("reach max io depth", K(ret), K(device_channel_->used_io_depth_), K(device_channel_->max_io_depth_));
} else {

View File

@ -457,6 +457,7 @@ private:
private:
// Cap on outstanding AIO events. NOTE(review): presumably the limit that
// submit() compares submit_count_ against before returning OB_EAGAIN — confirm
// this header belongs to the same channel class.
static const int32_t MAX_AIO_EVENT_CNT = 512;
// Head-room kept free for disk-hung detect requests. NOTE(review): submit()
// appears to subtract this from the event/depth limits for non-detect requests
// so that detect IO can still be submitted when the channel is saturated — confirm.
static const int32_t MAX_DETECT_DISK_HUNG_IO_CNT = 10;
static const int64_t AIO_POLLING_TIMEOUT_NS = 1000L * 1000L * 1000L - 1L; // almost 1s, for timespec_valid check
private:
bool is_inited_;