[Refactor][Bug-Fix][Load Vec] Refactor code of basescanner and vjson/vparquet/vbroker scanner (#9666)
* [Refactor][Bug-Fix][Load Vec] Refactor code of basescanner and vjson/vparquet/vbroker scanner 1. fix bug: vjson scanner does not support `range_from_file_path` 2. fix bug: vjson/vbroker scanner core dumps when the nullability of the src and dest slots differs 3. fix bug: vparquet filter_block fails when the reference count of a column is not 1 4. refactor to simplify the code. This only changes the vectorized load, not the original row-based load. Co-authored-by: lihaopeng <lihaopeng@baidu.com>
This commit is contained in:
@ -28,14 +28,20 @@
|
||||
#include "runtime/raw_value.h"
|
||||
#include "runtime/runtime_state.h"
|
||||
#include "runtime/tuple.h"
|
||||
#include "vec/data_types/data_type_factory.hpp"
|
||||
|
||||
namespace doris {
|
||||
|
||||
BaseScanner::BaseScanner(RuntimeState* state, RuntimeProfile* profile,
|
||||
const TBrokerScanRangeParams& params,
|
||||
const std::vector<TBrokerRangeDesc>& ranges,
|
||||
const std::vector<TNetworkAddress>& broker_addresses,
|
||||
const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter)
|
||||
: _state(state),
|
||||
_params(params),
|
||||
_ranges(ranges),
|
||||
_broker_addresses(broker_addresses),
|
||||
_next_range(0),
|
||||
_counter(counter),
|
||||
_src_tuple(nullptr),
|
||||
_src_tuple_row(nullptr),
|
||||
@ -71,6 +77,22 @@ Status BaseScanner::open() {
|
||||
_rows_read_counter = ADD_COUNTER(_profile, "RowsRead", TUnit::UNIT);
|
||||
_read_timer = ADD_TIMER(_profile, "TotalRawReadTime(*)");
|
||||
_materialize_timer = ADD_TIMER(_profile, "MaterializeTupleTime(*)");
|
||||
|
||||
DCHECK(!_ranges.empty());
|
||||
const auto& range = _ranges[0];
|
||||
_num_of_columns_from_file = range.__isset.num_of_columns_from_file
|
||||
? implicit_cast<int>(range.num_of_columns_from_file)
|
||||
: implicit_cast<int>(_src_slot_descs.size());
|
||||
|
||||
// check consistency
|
||||
if (range.__isset.num_of_columns_from_file) {
|
||||
int size = range.columns_from_path.size();
|
||||
for (const auto& r : _ranges) {
|
||||
if (r.columns_from_path.size() != size) {
|
||||
return Status::InternalError("ranges have different number of columns.");
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -272,59 +294,135 @@ Status BaseScanner::_fill_dest_tuple(Tuple* dest_tuple, MemPool* mem_pool) {
|
||||
}
|
||||
void* slot = dest_tuple->get_slot(slot_desc->tuple_offset());
|
||||
RawValue::write(value, slot, slot_desc->type(), mem_pool);
|
||||
continue;
|
||||
}
|
||||
_success = true;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BaseScanner::filter_block(vectorized::Block* temp_block, size_t slot_num) {
|
||||
// Runs every pre-filter expression context in _vpre_filter_ctxs over _src_block and adds
// the number of rows each one removes to _counter->num_rows_unselected.
// Returns Status::OK() unless a filter_block() call fails (propagated by RETURN_IF_ERROR).
//
// NOTE(review): this span interleaves two variants of the loop body. One uses
// temp_block / slot_num, which are not declared in this function (they are the parameters
// of the filter_block(temp_block, slot_num) signature on the line just above this
// definition); the other uses _src_block / origin_column_num. This looks like removed and
// added diff lines merged together -- confirm which lines are live before editing.
Status BaseScanner::_filter_src_block() {
|
||||
// Column count of the source block before filtering; passed to filter_block() below.
auto origin_column_num = _src_block.columns();
|
||||
// Apply the pre-filters, if any were configured.
|
||||
if (!_vpre_filter_ctxs.empty()) {
|
||||
for (auto _vpre_filter_ctx : _vpre_filter_ctxs) {
|
||||
// Variant operating on temp_block (see NOTE(review) above).
auto old_rows = temp_block->rows();
|
||||
RETURN_IF_ERROR(
|
||||
vectorized::VExprContext::filter_block(_vpre_filter_ctx, temp_block, slot_num));
|
||||
_counter->num_rows_unselected += old_rows - temp_block->rows();
|
||||
// Variant operating on _src_block: remember the row count, filter in place, then
// account for the rows that were dropped.
auto old_rows = _src_block.rows();
|
||||
RETURN_IF_ERROR(vectorized::VExprContext::filter_block(_vpre_filter_ctx, &_src_block,
|
||||
origin_column_num));
|
||||
_counter->num_rows_unselected += old_rows - _src_block.rows();
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BaseScanner::execute_exprs(vectorized::Block* output_block, vectorized::Block* temp_block) {
|
||||
// Builds dest_block from _src_block.
//
// For every materialized slot of _dest_tuple_desc, the matching expression context in
// _dest_vexpr_ctx is executed against _src_block and the resulting column is inserted into
// dest_block under the slot's data type and column name. While doing so a per-row UInt8
// filter column is maintained: a row is marked false when the expression result is NULL and
// either (a) strict mode is on, a source slot exists for this dest slot and the checked
// source value is not NULL, or (b) the dest slot is not nullable. Each such row is also
// reported through _state->append_error_msg_to_file(). Finally _src_block is cleared, the
// filter column is appended to dest_block, Block::filter_block() is applied, and the number
// of removed rows is added to _counter->num_rows_filtered.
//
// Returns Status::OK(), or the first error propagated by RETURN_IF_ERROR.
Status BaseScanner::_materialize_dest_block(vectorized::Block* dest_block) {
|
||||
// Evaluate the vectorized dest expressions here.
|
||||
// NOTE(review): the next five code lines (from `Status status;` through `return status;`)
// refer to output_block / temp_block, which are not declared in this function, and open
// two `if` blocks that are never closed. They look like removed lines of the former
// execute_exprs() body left in by the diff view -- confirm they are not live code.
Status status;
|
||||
if (!_dest_vexpr_ctx.empty()) {
|
||||
*output_block = vectorized::VExprContext::get_output_block_after_execute_exprs(
|
||||
_dest_vexpr_ctx, *temp_block, status);
|
||||
if (UNLIKELY(output_block->rows() == 0)) {
|
||||
return status;
|
||||
// Index of the next dest expression context; advanced once per materialized slot.
int ctx_idx = 0;
|
||||
size_t rows = _src_block.rows();
|
||||
// One UInt8 entry per source row, created with value 1; entries are set to false below
// for rows that must be dropped.
auto filter_column = vectorized::ColumnUInt8::create(rows, 1);
|
||||
auto& filter_map = filter_column->get_data();
|
||||
|
||||
for (auto slot_desc : _dest_tuple_desc->slots()) {
|
||||
if (!slot_desc->is_materialized()) {
|
||||
continue;
|
||||
}
|
||||
// dest_index is this slot's position among the materialized slots; ctx_idx now already
// points at the NEXT slot.
int dest_index = ctx_idx++;
|
||||
|
||||
auto* ctx = _dest_vexpr_ctx[dest_index];
|
||||
int result_column_id = -1;
|
||||
// PT1 => dest primitive type
|
||||
// execute() reports the position of its result column inside _src_block through
// result_column_id.
RETURN_IF_ERROR(ctx->execute(&_src_block, &result_column_id));
|
||||
auto column_ptr = _src_block.get_by_position(result_column_id).column;
|
||||
|
||||
// Source slots are always nullable, so the column produced by the dest expression
|
||||
// is likely to be nullable as well.
|
||||
if (LIKELY(column_ptr->is_nullable())) {
|
||||
auto nullable_column =
|
||||
reinterpret_cast<const vectorized::ColumnNullable*>(column_ptr.get());
|
||||
for (int i = 0; i < rows; ++i) {
|
||||
// Only rows that are still kept and whose expression result is NULL need checking.
if (filter_map[i] && nullable_column->is_null_at(i)) {
|
||||
// Strict-mode case: a source slot exists for this dest slot and the checked source
// value is not NULL, yet the expression produced NULL.
// NOTE(review): _src_block is indexed with dest_index here, i.e. a position in the
// dest expression list -- presumably this assumes source and dest columns share the
// same order; confirm.
if (_strict_mode && (_src_slot_descs_order_by_dest[dest_index]) &&
|
||||
!_src_block.get_by_position(dest_index).column->is_null_at(i)) {
|
||||
RETURN_IF_ERROR(_state->append_error_msg_to_file(
|
||||
[&]() -> std::string {
|
||||
return _src_block.dump_one_line(i, _num_of_columns_from_file);
|
||||
},
|
||||
[&]() -> std::string {
|
||||
// NOTE(review): this reads position ctx_idx, which was already incremented when
// dest_index was taken, so it is dest_index + 1 -- a different column from the one
// tested in the condition above, and possibly out of range for the last slot.
// Looks like an off-by-one; confirm the intended source column index.
auto raw_value =
|
||||
_src_block.get_by_position(ctx_idx).column->get_data_at(
|
||||
i);
|
||||
std::string raw_string = raw_value.to_string();
|
||||
fmt::memory_buffer error_msg;
|
||||
fmt::format_to(error_msg,
|
||||
"column({}) value is incorrect while strict "
|
||||
"mode is {}, "
|
||||
"src value is {}",
|
||||
slot_desc->col_name(), _strict_mode, raw_string);
|
||||
return fmt::to_string(error_msg);
|
||||
},
|
||||
&_scanner_eof));
|
||||
// Drop the offending row.
filter_map[i] = false;
|
||||
// Otherwise a NULL result is only an error when the dest slot cannot hold NULL.
} else if (!slot_desc->is_nullable()) {
|
||||
RETURN_IF_ERROR(_state->append_error_msg_to_file(
|
||||
[&]() -> std::string {
|
||||
return _src_block.dump_one_line(i, _num_of_columns_from_file);
|
||||
},
|
||||
[&]() -> std::string {
|
||||
fmt::memory_buffer error_msg;
|
||||
fmt::format_to(error_msg,
|
||||
"column({}) values is null while columns is not "
|
||||
"nullable",
|
||||
slot_desc->col_name());
|
||||
return fmt::to_string(error_msg);
|
||||
},
|
||||
&_scanner_eof));
|
||||
// Drop the offending row.
filter_map[i] = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
// A non-nullable dest slot receives the nested (non-nullable) column.
if (!slot_desc->is_nullable()) column_ptr = nullable_column->get_nested_column_ptr();
|
||||
// Result is not nullable but the dest slot is: wrap the column so it is nullable.
} else if (slot_desc->is_nullable()) {
|
||||
column_ptr = vectorized::make_nullable(column_ptr);
|
||||
}
|
||||
dest_block->insert(vectorized::ColumnWithTypeAndName(
|
||||
std::move(column_ptr), slot_desc->get_data_type_ptr(), slot_desc->col_name()));
|
||||
}
|
||||
|
||||
// After the dest block has been populated, clear _src_block to drop its references to the original columns.
|
||||
_src_block.clear();
|
||||
|
||||
// Number of real dest columns; the filter column appended next sits at this position.
size_t dest_size = dest_block->columns();
|
||||
// Apply the row filter.
|
||||
dest_block->insert(vectorized::ColumnWithTypeAndName(
|
||||
std::move(filter_column), std::make_shared<vectorized::DataTypeUInt8>(),
|
||||
"filter column"));
|
||||
// Presumably the two dest_size arguments are the filter column position and the number of
// columns to keep -- TODO confirm against Block::filter_block.
RETURN_IF_ERROR(vectorized::Block::filter_block(dest_block, dest_size, dest_size));
|
||||
// Rows removed by the filter are accounted as filtered rows.
_counter->num_rows_filtered += rows - dest_block->rows();
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// TODO: opt the reuse of src_block or dest_block column. some case we have to
|
||||
// shallow copy the column of src_block to dest block
|
||||
Status BaseScanner::_init_src_block() {
|
||||
DCHECK(_src_block.columns() == 0);
|
||||
for (auto i = 0; i < _num_of_columns_from_file; ++i) {
|
||||
SlotDescriptor* slot_desc = _src_slot_descs[i];
|
||||
if (slot_desc == nullptr) {
|
||||
continue;
|
||||
}
|
||||
auto data_type = slot_desc->get_data_type_ptr();
|
||||
_src_block.insert(vectorized::ColumnWithTypeAndName(
|
||||
data_type->create_column(), slot_desc->get_data_type_ptr(), slot_desc->col_name()));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BaseScanner::fill_dest_block(vectorized::Block* dest_block,
|
||||
std::vector<vectorized::MutableColumnPtr>& columns) {
|
||||
if (columns.empty() || columns[0]->size() == 0) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::unique_ptr<vectorized::Block> temp_block(new vectorized::Block());
|
||||
auto n_columns = 0;
|
||||
for (const auto slot_desc : _src_slot_descs) {
|
||||
temp_block->insert(vectorized::ColumnWithTypeAndName(std::move(columns[n_columns++]),
|
||||
slot_desc->get_data_type_ptr(),
|
||||
slot_desc->col_name()));
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(BaseScanner::filter_block(temp_block.get(), _dest_tuple_desc->slots().size()));
|
||||
|
||||
if (_dest_vexpr_ctx.empty()) {
|
||||
*dest_block = *temp_block;
|
||||
} else {
|
||||
RETURN_IF_ERROR(BaseScanner::execute_exprs(dest_block, temp_block.get()));
|
||||
Status BaseScanner::_fill_dest_block(vectorized::Block* dest_block, bool* eof) {
|
||||
*eof = _scanner_eof;
|
||||
_fill_columns_from_path();
|
||||
if (LIKELY(_src_block.rows() > 0)) {
|
||||
RETURN_IF_ERROR(BaseScanner::_filter_src_block());
|
||||
RETURN_IF_ERROR(BaseScanner::_materialize_dest_block(dest_block));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
@ -337,7 +435,7 @@ void BaseScanner::fill_slots_of_columns_from_path(
|
||||
auto slot_desc = _src_slot_descs.at(i + start);
|
||||
_src_tuple->set_not_null(slot_desc->null_indicator_offset());
|
||||
void* slot = _src_tuple->get_slot(slot_desc->tuple_offset());
|
||||
StringValue* str_slot = reinterpret_cast<StringValue*>(slot);
|
||||
auto* str_slot = reinterpret_cast<StringValue*>(slot);
|
||||
const std::string& column_from_path = columns_from_path[i];
|
||||
str_slot->ptr = const_cast<char*>(column_from_path.c_str());
|
||||
str_slot->len = column_from_path.size();
|
||||
@ -360,4 +458,28 @@ void BaseScanner::close() {
|
||||
}
|
||||
}
|
||||
|
||||
void BaseScanner::_fill_columns_from_path() {
|
||||
const TBrokerRangeDesc& range = _ranges.at(_next_range - 1);
|
||||
if (range.__isset.num_of_columns_from_file) {
|
||||
size_t start = range.num_of_columns_from_file;
|
||||
size_t rows = _src_block.rows();
|
||||
|
||||
for (size_t i = 0; i < range.columns_from_path.size(); ++i) {
|
||||
auto slot_desc = _src_slot_descs.at(i + start);
|
||||
if (slot_desc == nullptr) continue;
|
||||
auto is_nullable = slot_desc->is_nullable();
|
||||
auto data_type = vectorized::DataTypeFactory::instance().create_data_type(TYPE_VARCHAR,
|
||||
is_nullable);
|
||||
auto data_column = data_type->create_column();
|
||||
const std::string& column_from_path = range.columns_from_path[i];
|
||||
for (size_t j = 0; j < rows; ++j) {
|
||||
data_column->insert_data(const_cast<char*>(column_from_path.c_str()),
|
||||
column_from_path.size());
|
||||
}
|
||||
_src_block.insert(vectorized::ColumnWithTypeAndName(std::move(data_column), data_type,
|
||||
slot_desc->col_name()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace doris
|
||||
|
||||
Reference in New Issue
Block a user