Files
doris/be/src/exec/tablet_sink.h
Xinyi Zou b34ed43ec9 [feature-wip] (memory tracker) (step6, End) Fix some details (#9301)
1. Fix LoadTask, ChunkAllocator, TabletMeta, Brpc, the accuracy of memory track.
2. Modified some MemTracker names, deleted some unnecessary trackers, and improved readability.
3. More powerful MemTracker debugging capabilities.
4. Avoid creating TabletColumn temporary objects and improve BE startup time by 8%.
5. Fix some other details.
2022-05-10 18:17:09 +08:00

552 lines
21 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <fmt/format.h>
#include <memory>
#include <queue>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "common/object_pool.h"
#include "common/status.h"
#include "exec/data_sink.h"
#include "exec/tablet_info.h"
#include "gen_cpp/Types_types.h"
#include "gen_cpp/internal_service.pb.h"
#include "runtime/thread_context.h"
#include "util/bitmap.h"
#include "util/countdown_latch.h"
#include "util/ref_count_closure.h"
#include "util/spinlock.h"
#include "util/thread.h"
#include "util/thrift_util.h"
namespace doris {
class Bitmap;
class MemTracker;
class RuntimeProfile;
class RowDescriptor;
class ThreadPool;
class ThreadPoolToken;
class Tuple;
class TupleDescriptor;
class ExprContext;
class TExpr;
namespace vectorized {
class Block;
class MutableBlock;
} // namespace vectorized
namespace stream_load {
class OlapTableSink;
// The counter of add_batch rpc of a single node
struct AddBatchCounter {
// total execution time of a add_batch rpc
int64_t add_batch_execution_time_us = 0;
// lock waiting time in a add_batch rpc
int64_t add_batch_wait_execution_time_us = 0;
// number of add_batch call
int64_t add_batch_num = 0;
// time passed between marked close and finish close
int64_t close_wait_time_ms = 0;
AddBatchCounter& operator+=(const AddBatchCounter& rhs) {
add_batch_execution_time_us += rhs.add_batch_execution_time_us;
add_batch_wait_execution_time_us += rhs.add_batch_wait_execution_time_us;
add_batch_num += rhs.add_batch_num;
close_wait_time_ms += rhs.close_wait_time_ms;
return *this;
}
friend AddBatchCounter operator+(const AddBatchCounter& lhs, const AddBatchCounter& rhs) {
AddBatchCounter sum = lhs;
sum += rhs;
return sum;
}
};
// It's very error-prone to guarantee the handler capture vars' & this closure's destruct sequence.
// So using create() to get the closure pointer is recommended. We can delete the closure ptr before the capture vars destruction.
// Delete this point is safe, don't worry about RPC callback will run after ReusableClosure deleted.
template <typename T>
class ReusableClosure final : public google::protobuf::Closure {
public:
ReusableClosure() : cid(INVALID_BTHREAD_ID) {}
~ReusableClosure() override {
// shouldn't delete when Run() is calling or going to be called, wait for current Run() done.
join();
}
static ReusableClosure<T>* create() { return new ReusableClosure<T>(); }
void addFailedHandler(const std::function<void(bool)>& fn) { failed_handler = fn; }
void addSuccessHandler(const std::function<void(const T&, bool)>& fn) { success_handler = fn; }
void join() {
// We rely on in_flight to assure one rpc is running,
// while cid is not reliable due to memory order.
// in_flight is written before getting callid,
// so we can not use memory fence to synchronize.
while (_packet_in_flight) {
// cid here is complicated
if (cid != INVALID_BTHREAD_ID) {
// actually cid may be the last rpc call id.
brpc::Join(cid);
}
if (_packet_in_flight) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
}
// plz follow this order: reset() -> set_in_flight() -> send brpc batch
void reset() {
cntl.Reset();
cid = cntl.call_id();
}
bool try_set_in_flight() {
bool value = false;
return _packet_in_flight.compare_exchange_strong(value, true);
}
void clear_in_flight() { _packet_in_flight = false; }
bool is_packet_in_flight() { return _packet_in_flight; }
void end_mark() {
DCHECK(_is_last_rpc == false);
_is_last_rpc = true;
}
void Run() override {
DCHECK(_packet_in_flight);
if (cntl.Failed()) {
LOG(WARNING) << "failed to send brpc batch, error=" << berror(cntl.ErrorCode())
<< ", error_text=" << cntl.ErrorText();
failed_handler(_is_last_rpc);
} else {
success_handler(result, _is_last_rpc);
}
clear_in_flight();
}
brpc::Controller cntl;
T result;
private:
brpc::CallId cid;
std::atomic<bool> _packet_in_flight {false};
std::atomic<bool> _is_last_rpc {false};
std::function<void(bool)> failed_handler;
std::function<void(const T&, bool)> success_handler;
};
class IndexChannel;
class NodeChannel {
public:
NodeChannel(OlapTableSink* parent, IndexChannel* index_channel, int64_t node_id);
virtual ~NodeChannel() noexcept;
// called before open, used to add tablet located in this backend
void add_tablet(const TTabletWithPartition& tablet) { _all_tablets.emplace_back(tablet); }
virtual Status init(RuntimeState* state);
// we use open/open_wait to parallel
void open();
virtual Status open_wait();
Status add_row(Tuple* tuple, int64_t tablet_id);
virtual Status add_row(const BlockRow& block_row, int64_t tablet_id) {
LOG(FATAL) << "add block row to NodeChannel not supported";
return Status::OK();
}
// two ways to stop channel:
// 1. mark_close()->close_wait() PS. close_wait() will block waiting for the last AddBatch rpc response.
// 2. just cancel()
virtual void mark_close();
Status close_wait(RuntimeState* state);
void cancel(const std::string& cancel_msg);
// return:
// 0: stopped, send finished(eos request has been sent), or any internal error;
// 1: running, haven't reach eos.
// only allow 1 rpc in flight
// plz make sure, this func should be called after open_wait().
virtual int try_send_and_fetch_status(RuntimeState* state,
std::unique_ptr<ThreadPoolToken>& thread_pool_token);
void try_send_batch(RuntimeState* state);
void time_report(std::unordered_map<int64_t, AddBatchCounter>* add_batch_counter_map,
int64_t* serialize_batch_ns, int64_t* mem_exceeded_block_ns,
int64_t* queue_push_lock_ns, int64_t* actual_consume_ns,
int64_t* total_add_batch_exec_time_ns, int64_t* add_batch_exec_time_ns,
int64_t* total_add_batch_num) {
(*add_batch_counter_map)[_node_id] += _add_batch_counter;
(*add_batch_counter_map)[_node_id].close_wait_time_ms = _close_time_ms;
*serialize_batch_ns += _serialize_batch_ns;
*mem_exceeded_block_ns += _mem_exceeded_block_ns;
*queue_push_lock_ns += _queue_push_lock_ns;
*actual_consume_ns += _actual_consume_ns;
*add_batch_exec_time_ns = (_add_batch_counter.add_batch_execution_time_us * 1000);
*total_add_batch_exec_time_ns += *add_batch_exec_time_ns;
*total_add_batch_num += _add_batch_counter.add_batch_num;
}
int64_t node_id() const { return _node_id; }
std::string host() const { return _node_info.host; }
std::string name() const { return _name; }
Status none_of(std::initializer_list<bool> vars);
void clear_all_batches();
virtual void clear_all_blocks() { LOG(FATAL) << "NodeChannel::clear_all_blocks not supported"; }
std::string channel_info() const {
return fmt::format("{}, {}, node={}:{}", _name, _load_info, _node_info.host,
_node_info.brpc_port);
}
protected:
void _cancel_with_msg(const std::string& msg);
virtual void _close_check();
protected:
bool _is_vectorized = false;
OlapTableSink* _parent = nullptr;
IndexChannel* _index_channel = nullptr;
int64_t _node_id = -1;
std::string _load_info;
std::string _name;
std::shared_ptr<MemTracker> _node_channel_tracker;
TupleDescriptor* _tuple_desc = nullptr;
NodeInfo _node_info;
// this should be set in init() using config
int _rpc_timeout_ms = 60000;
int64_t _next_packet_seq = 0;
MonotonicStopWatch _timeout_watch;
// the timestamp when this node channel be marked closed and finished closed
uint64_t _close_time_ms = 0;
// user cancel or get some errors
std::atomic<bool> _cancelled {false};
SpinLock _cancel_msg_lock;
std::string _cancel_msg = "";
// send finished means the consumer thread which send the rpc can exit
std::atomic<bool> _send_finished {false};
// add batches finished means the last rpc has be response, used to check whether this channel can be closed
std::atomic<bool> _add_batches_finished {false}; // reuse for vectorized
bool _eos_is_produced {false}; // only for restricting producer behaviors
std::unique_ptr<RowDescriptor> _row_desc;
int _batch_size = 0;
// limit _pending_batches size
std::atomic<size_t> _pending_batches_bytes {0};
size_t _max_pending_batches_bytes {10 * 1024 * 1024};
std::mutex _pending_batches_lock; // reuse for vectorized
std::atomic<int> _pending_batches_num {0}; // reuse for vectorized
std::shared_ptr<PBackendService_Stub> _stub = nullptr;
RefCountClosure<PTabletWriterOpenResult>* _open_closure = nullptr;
std::vector<TTabletWithPartition> _all_tablets;
std::vector<TTabletCommitInfo> _tablet_commit_infos;
AddBatchCounter _add_batch_counter;
std::atomic<int64_t> _serialize_batch_ns {0};
std::atomic<int64_t> _mem_exceeded_block_ns {0};
std::atomic<int64_t> _queue_push_lock_ns {0};
std::atomic<int64_t> _actual_consume_ns {0};
// lock to protect _is_closed.
// The methods in the IndexChannel are called back in the RpcClosure in the NodeChannel.
// However, this rpc callback may occur after the whole task is finished (e.g. due to network latency),
// and by that time the IndexChannel may have been destructured, so we should not call the
// IndexChannel methods anymore, otherwise the BE will crash.
// Therefore, we use the _is_closed and _closed_lock to ensure that the RPC callback
// function will not call the IndexChannel method after the NodeChannel is closed.
// The IndexChannel is definitely accessible until the NodeChannel is closed.
std::mutex _closed_lock;
bool _is_closed = false;
private:
// buffer for saving serialized row batch data.
// In the non-attachment approach, we need to use two PRowBatch structures alternately
// so that when one PRowBatch is sent, the other PRowBatch can be used for the serialization of the next RowBatch.
// This is not necessary with the attachment approach, because the memory structures
// are already copied into attachment memory before sending, and will wait for
// the previous RPC to be fully completed before the next copy.
std::string _tuple_data_buffer;
std::string* _tuple_data_buffer_ptr = nullptr;
std::unique_ptr<RowBatch> _cur_batch;
PTabletWriterAddBatchRequest _cur_add_batch_request;
using AddBatchReq = std::pair<std::unique_ptr<RowBatch>, PTabletWriterAddBatchRequest>;
std::queue<AddBatchReq> _pending_batches;
ReusableClosure<PTabletWriterAddBatchResult>* _add_batch_closure = nullptr;
};
class IndexChannel {
public:
IndexChannel(OlapTableSink* parent, int64_t index_id, bool is_vec)
: _parent(parent), _index_id(index_id), _is_vectorized(is_vec) {
_index_channel_tracker =
MemTracker::create_tracker(-1, "IndexChannel:indexID=" + std::to_string(_index_id));
}
~IndexChannel() = default;
Status init(RuntimeState* state, const std::vector<TTabletWithPartition>& tablets);
template <typename Row>
void add_row(const Row& tuple, int64_t tablet_id);
void for_each_node_channel(
const std::function<void(const std::shared_ptr<NodeChannel>&)>& func) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_index_channel_tracker);
for (auto& it : _node_channels) {
func(it.second);
}
}
void mark_as_failed(int64_t node_id, const std::string& host, const std::string& err,
int64_t tablet_id = -1);
Status check_intolerable_failure();
// set error tablet info in runtime state, so that it can be returned to FE.
void set_error_tablet_in_state(RuntimeState* state);
size_t num_node_channels() const { return _node_channels.size(); }
private:
friend class NodeChannel;
friend class VNodeChannel;
OlapTableSink* _parent;
int64_t _index_id;
bool _is_vectorized = false;
// from backend channel to tablet_id
// ATTN: must be placed before `_node_channels` and `_channels_by_tablet`.
// Because the destruct order of objects is opposite to the creation order.
// So NodeChannel will be destructured first.
// And the destructor function of NodeChannel waits for all RPCs to finish.
// This ensures that it is safe to use `_tablets_by_channel` in the callback function for the end of the RPC.
std::unordered_map<int64_t, std::unordered_set<int64_t>> _tablets_by_channel;
// BeId -> channel
std::unordered_map<int64_t, std::shared_ptr<NodeChannel>> _node_channels;
// from tablet_id to backend channel
std::unordered_map<int64_t, std::vector<std::shared_ptr<NodeChannel>>> _channels_by_tablet;
// lock to protect _failed_channels and _failed_channels_msgs
mutable SpinLock _fail_lock;
// key is tablet_id, value is a set of failed node id
std::unordered_map<int64_t, std::unordered_set<int64_t>> _failed_channels;
// key is tablet_id, value is error message
std::unordered_map<int64_t, std::string> _failed_channels_msgs;
Status _intolerable_failure_status = Status::OK();
std::shared_ptr<MemTracker> _index_channel_tracker;
};
template <typename Row>
void IndexChannel::add_row(const Row& tuple, int64_t tablet_id) {
SCOPED_SWITCH_THREAD_LOCAL_MEM_TRACKER(_index_channel_tracker);
auto it = _channels_by_tablet.find(tablet_id);
DCHECK(it != _channels_by_tablet.end()) << "unknown tablet, tablet_id=" << tablet_id;
for (const auto& channel : it->second) {
// if this node channel is already failed, this add_row will be skipped
auto st = channel->add_row(tuple, tablet_id);
if (!st.ok()) {
mark_as_failed(channel->node_id(), channel->host(), st.get_error_msg(), tablet_id);
// continue add row to other node, the error will be checked for every batch outside
}
}
}
// Write data to Olap Table.
// When OlapTableSink::open() called, there will be a consumer thread running in the background.
// When you call OlapTableSink::send(), you will be the producer who products pending batches.
// Join the consumer thread in close().
class OlapTableSink : public DataSink {
public:
// Construct from thrift struct which is generated by FE.
OlapTableSink(ObjectPool* pool, const RowDescriptor& row_desc, const std::vector<TExpr>& texprs,
Status* status);
~OlapTableSink() override;
Status init(const TDataSink& sink) override;
Status prepare(RuntimeState* state) override;
Status open(RuntimeState* state) override;
Status send(RuntimeState* state, RowBatch* batch) override;
// close() will send RPCs too. If RPCs failed, return error.
Status close(RuntimeState* state, Status close_status) override;
// Returns the runtime profile for the sink.
RuntimeProfile* profile() override { return _profile; }
private:
// convert input batch to output batch which will be loaded into OLAP table.
// this is only used in insert statement.
Status _convert_batch(RuntimeState* state, RowBatch* input_batch, RowBatch* output_batch);
// make input data valid for OLAP table
// return number of invalid/filtered rows.
// invalid row number is set in Bitmap
// set stop_processing is we want to stop the whole process now.
Status _validate_data(RuntimeState* state, RowBatch* batch, Bitmap* filter_bitmap,
int* filtered_rows, bool* stop_processing);
// the consumer func of sending pending batches in every NodeChannel.
// use polling & NodeChannel::try_send_and_fetch_status() to achieve nonblocking sending.
// only focus on pending batches and channel status, the internal errors of NodeChannels will be handled by the producer
void _send_batch_process(RuntimeState* state);
protected:
friend class NodeChannel;
friend class VNodeChannel;
friend class IndexChannel;
bool _is_vectorized = false;
std::shared_ptr<MemTracker> _mem_tracker;
ObjectPool* _pool;
const RowDescriptor& _input_row_desc;
// unique load id
PUniqueId _load_id;
int64_t _txn_id = -1;
int _num_replicas = -1;
int _tuple_desc_id = -1;
// this is tuple descriptor of destination OLAP table
TupleDescriptor* _output_tuple_desc = nullptr;
RowDescriptor* _output_row_desc = nullptr;
bool _need_validate_data = false;
// number of senders used to insert into OlapTable, if we only support single node insert,
// all data from select should collectted and then send to OlapTable.
// To support multiple senders, we maintain a channel for each sender.
int _sender_id = -1;
int _num_senders = -1;
bool _is_high_priority = false;
// TODO(zc): think about cache this data
std::shared_ptr<OlapTableSchemaParam> _schema;
OlapTableLocationParam* _location = nullptr;
DorisNodesInfo* _nodes_info = nullptr;
RuntimeProfile* _profile = nullptr;
std::set<int64_t> _partition_ids;
// only used for partition with random distribution
std::map<int64_t, int64_t> _partition_to_tablet_map;
Bitmap _filter_bitmap;
// index_channel
std::vector<std::shared_ptr<IndexChannel>> _channels;
CountDownLatch _stop_background_threads_latch;
scoped_refptr<Thread> _sender_thread;
std::unique_ptr<ThreadPoolToken> _send_batch_thread_pool_token;
std::vector<DecimalV2Value> _max_decimalv2_val;
std::vector<DecimalV2Value> _min_decimalv2_val;
// Stats for this
int64_t _convert_batch_ns = 0;
int64_t _validate_data_ns = 0;
int64_t _send_data_ns = 0;
int64_t _number_input_rows = 0;
int64_t _number_output_rows = 0;
int64_t _number_filtered_rows = 0;
RuntimeProfile::Counter* _input_rows_counter = nullptr;
RuntimeProfile::Counter* _output_rows_counter = nullptr;
RuntimeProfile::Counter* _filtered_rows_counter = nullptr;
RuntimeProfile::Counter* _send_data_timer = nullptr;
RuntimeProfile::Counter* _wait_mem_limit_timer = nullptr;
RuntimeProfile::Counter* _convert_batch_timer = nullptr;
RuntimeProfile::Counter* _validate_data_timer = nullptr;
RuntimeProfile::Counter* _open_timer = nullptr;
RuntimeProfile::Counter* _close_timer = nullptr;
RuntimeProfile::Counter* _non_blocking_send_timer = nullptr;
RuntimeProfile::Counter* _non_blocking_send_work_timer = nullptr;
RuntimeProfile::Counter* _serialize_batch_timer = nullptr;
RuntimeProfile::Counter* _total_add_batch_exec_timer = nullptr;
RuntimeProfile::Counter* _max_add_batch_exec_timer = nullptr;
RuntimeProfile::Counter* _add_batch_number = nullptr;
RuntimeProfile::Counter* _num_node_channels = nullptr;
// load mem limit is for remote load channel
int64_t _load_mem_limit = -1;
// the timeout of load channels opened by this tablet sink. in second
int64_t _load_channel_timeout_s = 0;
int32_t _send_batch_parallelism = 1;
// Save the status of close() method
Status _close_status;
// TODO(cmy): this should be removed after we switch to rpc attachment by default.
bool _transfer_data_by_brpc_attachment = false;
// FIND_TABLET_EVERY_ROW is used for both hash and random distribution info, which indicates that we
// should compute tablet index for every row
// FIND_TABLET_EVERY_BATCH is only used for random distribution info, which indicates that we should
// compute tablet index for every row batch
// FIND_TABLET_EVERY_SINK is only used for random distribution info, which indicates that we should
// only compute tablet index in the corresponding partition once for the whole time in olap table sink
enum FindTabletMode { FIND_TABLET_EVERY_ROW, FIND_TABLET_EVERY_BATCH, FIND_TABLET_EVERY_SINK };
FindTabletMode findTabletMode = FindTabletMode::FIND_TABLET_EVERY_ROW;
private:
OlapTablePartitionParam* _partition = nullptr;
std::vector<ExprContext*> _output_expr_ctxs;
std::unique_ptr<RowBatch> _output_batch;
};
} // namespace stream_load
} // namespace doris