# Proposed changes

Issue Number: close #6238

Co-authored-by: HappenLee <happenlee@hotmail.com>
Co-authored-by: stdpain <34912776+stdpain@users.noreply.github.com>
Co-authored-by: Zhengguo Yang <yangzhgg@gmail.com>
Co-authored-by: wangbo <506340561@qq.com>
Co-authored-by: emmymiao87 <522274284@qq.com>
Co-authored-by: Pxl <952130278@qq.com>
Co-authored-by: zhangstar333 <87313068+zhangstar333@users.noreply.github.com>
Co-authored-by: thinker <zchw100@qq.com>
Co-authored-by: Zeno Yang <1521564989@qq.com>
Co-authored-by: Wang Shuo <wangshuo128@gmail.com>
Co-authored-by: zhoubintao <35688959+zbtzbtzbt@users.noreply.github.com>
Co-authored-by: Gabriel <gabrielleebuaa@gmail.com>
Co-authored-by: xinghuayu007 <1450306854@qq.com>
Co-authored-by: weizuo93 <weizuo@apache.org>
Co-authored-by: yiguolei <guoleiyi@tencent.com>
Co-authored-by: anneji-dev <85534151+anneji-dev@users.noreply.github.com>
Co-authored-by: awakeljw <993007281@qq.com>
Co-authored-by: taberylyang <95272637+taberylyang@users.noreply.github.com>
Co-authored-by: Cui Kaifeng <48012748+azurenake@users.noreply.github.com>

## Problem Summary

### 1. Some code from ClickHouse

**ClickHouse is an excellent vectorized execution engine database, so we have referenced and learned a lot from its data structures and function implementations. Our work is based on ClickHouse v19.16.2.2, and we would like to thank the ClickHouse community and its developers.**

The following comment has been added to code taken from ClickHouse, e.g.:

    // This file is copied from
    // https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/AggregationCommon.h
    // and modified by Doris

### 2. Supported exec nodes and queries

* vaggregation_node
* vanalytic_eval_node
* vassert_num_rows_node
* vblocking_join_node
* vcross_join_node
* vempty_set_node
* ves_http_scan_node
* vexcept_node
* vexchange_node
* vintersect_node
* vmysql_scan_node
* vodbc_scan_node
* volap_scan_node
* vrepeat_node
* vschema_scan_node
* vselect_node
* vset_operation_node
* vsort_node
* vunion_node
* vhash_join_node

With the vectorized exec engine you can run the SSB and TPC-H query sets, and about 70% of the TPC-DS standard query set.

### 3. Data Model

The vectorized exec engine supports the **Dup/Agg/Unq** table models, and the Block reader is vectorized. Vectorized segment reading is a work in progress.

### 4. How to use

1. Set the session variable `set enable_vectorized_engine = true;` (required)
2. Set the session variable `set batch_size = 4096;` (recommended)

A complete session is sketched below.

### 5. Some differences from the original exec engine

https://github.com/doris-vectorized/doris-vectorized/issues/294

## Checklist (Required)

1. Does it affect the original behavior: (No)
2. Have unit tests been added: (Yes)
3. Has documentation been added or modified: (No)
4. Does it need to update dependencies: (No)
5. Are there any changes that cannot be rolled back: (Yes)
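To make section 4 concrete, here is a minimal session sketch, assuming a MySQL client connected to the Doris FE; the table and column names in the final query are illustrative only:

```sql
-- Enable the vectorized engine for the current session (required).
set enable_vectorized_engine = true;

-- Use a larger batch size so each vectorized block carries more rows (recommended).
set batch_size = 4096;

-- Subsequent queries in this session run on the vectorized engine,
-- e.g. an SSB-style aggregation (lineorder is an illustrative table name):
select sum(lo_extendedprice * lo_discount) as revenue
from lineorder
where lo_discount between 1 and 3;
```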
For reference, the row-based `odbc_scan_node.cpp`:

```cpp
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "odbc_scan_node.h"

#include <sstream>

#include "exec/text_converter.hpp"
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/row_batch.h"
#include "runtime/runtime_state.h"
#include "runtime/string_value.h"
#include "runtime/tuple_row.h"
#include "util/runtime_profile.h"

namespace doris {
OdbcScanNode::OdbcScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs,
                           std::string scan_node_type)
        : ScanNode(pool, tnode, descs),
          _is_init(false),
          _scan_node_type(scan_node_type),
          _table_name(tnode.odbc_scan_node.table_name),
          _connect_string(std::move(tnode.odbc_scan_node.connect_string)),
          _query_string(std::move(tnode.odbc_scan_node.query_string)),
          _tuple_id(tnode.odbc_scan_node.tuple_id),
          _tuple_desc(nullptr),
          _slot_num(0) {}

OdbcScanNode::~OdbcScanNode() {}
Status OdbcScanNode::prepare(RuntimeState* state) {
    VLOG_CRITICAL << _scan_node_type << "::Prepare";

    if (_is_init) {
        return Status::OK();
    }

    if (nullptr == state) {
        return Status::InternalError("input pointer is null.");
    }

    RETURN_IF_ERROR(ScanNode::prepare(state));
    // get tuple desc
    _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id);

    if (nullptr == _tuple_desc) {
        return Status::InternalError("Failed to get tuple descriptor.");
    }

    _slot_num = _tuple_desc->slots().size();

    _odbc_param.connect_string = std::move(_connect_string);
    _odbc_param.query_string = std::move(_query_string);
    _odbc_param.tuple_desc = _tuple_desc;

    _odbc_scanner.reset(new (std::nothrow) ODBCConnector(_odbc_param));

    if (_odbc_scanner.get() == nullptr) {
        return Status::InternalError("Failed to create ODBC scanner.");
    }

    _tuple_pool.reset(new (std::nothrow) MemPool(mem_tracker().get()));

    if (_tuple_pool.get() == nullptr) {
        return Status::InternalError("Failed to create mem pool.");
    }

    _text_converter.reset(new (std::nothrow) TextConverter('\\'));

    if (_text_converter.get() == nullptr) {
        return Status::InternalError("Failed to create text converter.");
    }

    _is_init = true;

    return Status::OK();
}
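// Lifecycle: prepare() -> open() -> get_next() (repeatedly, until *eos) -> close().
// open() establishes the ODBC connection and issues the query; get_next() then
// pulls rows from the result set one batch at a time.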
Status OdbcScanNode::open(RuntimeState* state) {
    RETURN_IF_ERROR(ExecNode::open(state));
    VLOG_CRITICAL << _scan_node_type << "::Open";

    if (nullptr == state) {
        return Status::InternalError("input pointer is null.");
    }

    if (!_is_init) {
        return Status::InternalError("used before initialize.");
    }

    RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN));
    RETURN_IF_CANCELLED(state);
    SCOPED_TIMER(_runtime_profile->total_time_counter());
    RETURN_IF_ERROR(_odbc_scanner->open());
    RETURN_IF_ERROR(_odbc_scanner->query());
    // check materialize slot num

    return Status::OK();
}
Status OdbcScanNode::write_text_slot(char* value, int value_length, SlotDescriptor* slot,
                                     RuntimeState* state) {
    if (!_text_converter->write_slot(slot, _tuple, value, value_length, true, false,
                                     _tuple_pool.get())) {
        std::stringstream ss;
        ss << "Failed to convert ODBC value:'" << value << "' to " << slot->type()
           << " on column:`" << slot->col_name() << "`";
        return Status::InternalError(ss.str());
    }

    return Status::OK();
}
Status OdbcScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) {
    VLOG_CRITICAL << _scan_node_type << "::GetNext";

    if (nullptr == state || nullptr == row_batch || nullptr == eos) {
        return Status::InternalError("input is nullptr pointer");
    }

    if (!_is_init) {
        return Status::InternalError("used before initialize.");
    }

    RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT));
    RETURN_IF_CANCELLED(state);
    SCOPED_TIMER(_runtime_profile->total_time_counter());

    if (reached_limit()) {
        *eos = true;
        return Status::OK();
    }

    // create a new tuple buffer for row_batch
    int tuple_buffer_size = row_batch->capacity() * _tuple_desc->byte_size();
    void* tuple_buffer = _tuple_pool->allocate(tuple_buffer_size);

    if (nullptr == tuple_buffer) {
        return Status::InternalError("Allocate memory failed.");
    }

    _tuple = reinterpret_cast<Tuple*>(tuple_buffer);
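    // Each loop iteration fetches one row from the ODBC result set, materializes
    // it into _tuple, and commits it to row_batch if it survives the conjuncts.
    // _tuple strides forward through the buffer allocated above only after a row
    // is committed, so rejected rows are overwritten in place and accepted tuples
    // end up contiguous in _tuple_pool memory.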
    // Indicates whether there are more rows to process. Set in _odbc_scanner.next().
    bool odbc_eos = false;

    while (true) {
        RETURN_IF_CANCELLED(state);

        if (reached_limit() || row_batch->is_full()) {
            // hang on to last allocated chunk in pool, we'll keep writing into it in the
            // next get_next() call
            row_batch->tuple_data_pool()->acquire_data(_tuple_pool.get(), !reached_limit());
            *eos = reached_limit();
            return Status::OK();
        }

        RETURN_IF_ERROR(_odbc_scanner->get_next_row(&odbc_eos));

        if (odbc_eos) {
            row_batch->tuple_data_pool()->acquire_data(_tuple_pool.get(), false);
            *eos = true;
            return Status::OK();
        }

        int row_idx = row_batch->add_row();
        TupleRow* row = row_batch->get_row(row_idx);
        // the scan node's tuple is the first tuple of the tuple row
        row->set_tuple(0, _tuple);
        // clear only the null-indicator bytes; slot values are written below
        memset(_tuple, 0, _tuple_desc->num_null_bytes());
        int j = 0;

        for (int i = 0; i < _slot_num; ++i) {
            auto slot_desc = _tuple_desc->slots()[i];
            // the fe planner filters out the non-materialized columns
            if (!slot_desc->is_materialized()) {
                continue;
            }

            const auto& column_data = _odbc_scanner->get_column_data(j);
            if (column_data.strlen_or_ind == SQL_NULL_DATA) {
                if (slot_desc->is_nullable()) {
                    _tuple->set_null(slot_desc->null_indicator_offset());
                } else {
                    std::stringstream ss;
                    ss << "non-nullable column contains NULL. table=" << _table_name
                       << ", column=" << slot_desc->col_name();
                    return Status::InternalError(ss.str());
                }
            } else if (column_data.strlen_or_ind > column_data.buffer_length) {
                std::stringstream ss;
                ss << "column value length exceeds buffer length. table=" << _table_name
                   << ", column=" << slot_desc->col_name();
                return Status::InternalError(ss.str());
            } else {
                RETURN_IF_ERROR(write_text_slot(static_cast<char*>(column_data.target_value_ptr),
                                                column_data.strlen_or_ind, slot_desc, state));
            }
            j++;
        }

        ExprContext* const* ctxs = &_conjunct_ctxs[0];
        int num_ctxs = _conjunct_ctxs.size();

        // The ODBC scanner cannot push down conjuncts containing functions,
        // so re-evaluate the conjuncts here.
        if (ExecNode::eval_conjuncts(ctxs, num_ctxs, row)) {
            row_batch->commit_last_row();
            ++_num_rows_returned;
            COUNTER_SET(_rows_returned_counter, _num_rows_returned);
            char* new_tuple = reinterpret_cast<char*>(_tuple);
            new_tuple += _tuple_desc->byte_size();
            _tuple = reinterpret_cast<Tuple*>(new_tuple);
        }
    }
}
Status OdbcScanNode::close(RuntimeState* state) {
    if (is_closed()) {
        return Status::OK();
    }
    RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE));
    SCOPED_TIMER(_runtime_profile->total_time_counter());

    _tuple_pool.reset();

    return ExecNode::close(state);
}
void OdbcScanNode::debug_string(int indentation_level, std::stringstream* out) const {
    *out << std::string(indentation_level * 2, ' ');
    *out << _scan_node_type << "(tupleid=" << _tuple_id << " table=" << _table_name;
    *out << ")" << std::endl;

    for (int i = 0; i < _children.size(); ++i) {
        _children[i]->debug_string(indentation_level + 1, out);
    }
}
Status OdbcScanNode::set_scan_ranges(const std::vector<TScanRangeParams>& scan_ranges) {
    return Status::OK();
}

} // namespace doris
```