From 898d0d42f133914dbcd24dbbb6ec4a3d1d2086ae Mon Sep 17 00:00:00 2001 From: AlexYue Date: Tue, 29 Nov 2022 22:28:39 +0800 Subject: [PATCH] [improvement](load)add more log for better bug tracing experience for be write (#14424) Recently when tracing one bug happened in version 1.1.4 I found out there were some places we can add more log for a better tracing. --- be/src/exec/tablet_sink.cpp | 13 ++++++++++++- be/src/olap/delta_writer.cpp | 10 +++++++--- be/src/runtime/tablets_channel.cpp | 5 +++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/be/src/exec/tablet_sink.cpp b/be/src/exec/tablet_sink.cpp index fa9584a060..7c0051c191 100644 --- a/be/src/exec/tablet_sink.cpp +++ b/be/src/exec/tablet_sink.cpp @@ -216,6 +216,7 @@ Status NodeChannel::open_wait() { // if this is last rpc, will must set _add_batches_finished. otherwise, node channel's close_wait // will be blocked. _add_batches_finished = true; + VLOG_PROGRESS << "node channel " << channel_info() << "add_batches_finished"; } }); @@ -271,6 +272,9 @@ Status NodeChannel::open_wait() { } } _add_batches_finished = true; + VLOG_PROGRESS << "node channel " << channel_info() + << "add_batches_finished and handled " + << result.tablet_errors().size() << " tablets errors"; } } else { _cancel_with_msg( @@ -408,7 +412,8 @@ void NodeChannel::mark_close() { DCHECK(_pending_batches.back().second.eos()); _close_time_ms = UnixMillis(); LOG(INFO) << channel_info() - << " mark closed, left pending batch size: " << _pending_batches.size(); + << " mark closed, left pending batch size: " << _pending_batches.size() + << " left pending batch size: " << _pending_batches_bytes; } _eos_is_produced = true; @@ -718,6 +723,8 @@ Status IndexChannel::init(RuntimeState* state, const std::vectorload_job_id() << " index: " << index->index_id + << " would open 0 tablet"; + } _channels.emplace_back(new IndexChannel(this, index->index_id, use_vec)); RETURN_IF_ERROR(_channels.back()->init(state, tablets)); } diff --git a/be/src/olap/delta_writer.cpp b/be/src/olap/delta_writer.cpp index c1bb34f1d6..d330e430fe 100644 --- a/be/src/olap/delta_writer.cpp +++ b/be/src/olap/delta_writer.cpp @@ -223,7 +223,7 @@ Status DeltaWriter::write(const vectorized::Block* block, const std::vector if (_mem_table->is_flush()) { auto s = _flush_memtable_async(); _reset_mem_table(); - if (OLAP_UNLIKELY(!s.ok())) { + if (UNLIKELY(!s.ok())) { return s; } } @@ -255,7 +255,7 @@ Status DeltaWriter::flush_memtable_and_wait(bool need_wait) { << ", load id: " << print_id(_req.load_id); auto s = _flush_memtable_async(); _reset_mem_table(); - if (OLAP_UNLIKELY(!s.ok())) { + if (UNLIKELY(!s.ok())) { return s; } @@ -346,7 +346,11 @@ Status DeltaWriter::close_wait(const PSlaveTabletNodes& slave_tablet_nodes, return _cancel_status; } // return error if previous flush failed - RETURN_NOT_OK(_flush_token->wait()); + auto st = _flush_token->wait(); + if (UNLIKELY(!st.ok())) { + LOG(WARNING) << "previous flush failed tablet " << _tablet->tablet_id(); + return st; + } _mem_table.reset(); diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index 85ab150203..d6c81fb845 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -130,6 +130,8 @@ Status TabletsChannel::close( // just skip this tablet(writer) and continue to close others continue; } + VLOG_PROGRESS << "cancel tablet writer successfully, tablet_id=" << it.first + << ", transaction_id=" << _txn_id; } } @@ -189,6 +191,8 @@ void TabletsChannel::_close_wait(DeltaWriter* writer, PTabletError* tablet_error = tablet_errors->Add(); tablet_error->set_tablet_id(writer->tablet_id()); tablet_error->set_msg(st.get_error_msg()); + VLOG_PROGRESS << "close wait failed tablet " << writer->tablet_id() << " transaction_id " + << _txn_id << "err msg " << st.get_error_msg(); } } @@ -443,6 +447,7 @@ Status TabletsChannel::add_batch(const TabletWriterAddRequest& request, int64_t tablet_id = request.tablet_ids(i); if (_broken_tablets.find(tablet_id) != _broken_tablets.end()) { // skip broken tablets + VLOG_PROGRESS << "skip broken tablet tablet=" << tablet_id; continue; } auto it = tablet_to_rowidxs.find(tablet_id);