Files
doris/be/src/exec/base_scanner.h
lihangyu 37d1519316 [WIP](dynamic-table) support dynamic schema table (#16335)
Issue Number: close #16351

A dynamic schema table is a special type of table whose schema changes during the loading procedure. We implemented this feature mainly for semi-structured data such as JSON: since JSON is self-describing, we can extract schema information from the original documents and infer the final type information. This special table reduces manual schema-change operations, makes it easy to import semi-structured data, and extends its schema automatically.
2023-02-11 13:37:50 +08:00

141 lines
4.8 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include "common/status.h"
#include "util/runtime_profile.h"
#include "vec/common/schema_util.h"
#include "vec/exprs/vexpr.h"
#include "vec/exprs/vexpr_context.h"
namespace doris {
// Forward declarations of types referenced by the declarations below.
class TupleDescriptor;
class RowDescriptor;
class RuntimeState;
namespace vectorized {
class VExprContext;
class IColumn;
// Alias for IColumn::MutablePtr.
// NOTE(review): naming a nested type requires IColumn to be complete at this
// point; presumably the vec/ headers included above provide it -- confirm.
using MutableColumnPtr = IColumn::MutablePtr;
} // namespace vectorized
// Row statistics handed to each scanner.
// This struct is not thread safe: if concurrent scanning is supported in the
// future, it has to be reworked.
struct ScannerCounter {
    ScannerCounter() = default;

    // Unqualified rows (did not match the dest schema, or had no partition).
    int64_t num_rows_filtered = 0;
    // Rows filtered out by predicates.
    int64_t num_rows_unselected = 0;
};
// Base class for scanners built from a broker scan range description
// (params, ranges, broker addresses). It exposes a block-based get_next()
// interface and keeps the source/destination slot bookkeeping shared by
// derived scanners.
//
// Data members carry in-class default initializers where the type allows it.
// Any value set in the constructor's member-initializer list takes precedence;
// the defaults only guarantee that no scalar or raw pointer is ever left
// indeterminate.
class BaseScanner {
public:
    BaseScanner(RuntimeState* state, RuntimeProfile* profile, const TBrokerScanRangeParams& params,
                const std::vector<TBrokerRangeDesc>& ranges,
                const std::vector<TNetworkAddress>& broker_addresses,
                const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter);

    // Closes the destination expression contexts.
    virtual ~BaseScanner() { vectorized::VExpr::close(_dest_vexpr_ctx, _state); }

    virtual Status init_expr_ctxes();

    // Open this scanner and initialize the information it needs.
    virtual Status open();

    // Get the next block. The base implementation always reports NotSupported.
    virtual Status get_next(vectorized::Block* block, bool* eof) {
        return Status::NotSupported("Not Implemented get block");
    }

    // Close this scanner.
    virtual void close() = 0;

    bool is_dynamic_schema() const { return _is_dynamic_schema; }

protected:
    Status _fill_dest_block(vectorized::Block* dest_block, bool* eof);
    virtual Status _init_src_block();

    bool is_null(const Slice& slice);
    bool is_array(const Slice& slice);
    bool check_array_format(std::vector<Slice>& split_values);

    RuntimeState* _state = nullptr;
    const TBrokerScanRangeParams& _params;
    const std::vector<TBrokerRangeDesc>& _ranges;
    const std::vector<TNetworkAddress>& _broker_addresses;
    int _next_range = 0;

    // Used for process stat.
    ScannerCounter* _counter = nullptr;

    // Used for constructing tuple.
    // Slots for values read from the broker file.
    std::vector<SlotDescriptor*> _src_slot_descs;
    std::unique_ptr<RowDescriptor> _row_desc;

    // Dest tuple descriptor and dest expr context.
    const TupleDescriptor* _dest_tuple_desc = nullptr;

    // Maps each dest slot id to its src slot desc.
    // If there is no key for a dest slot id in dest_sid_to_src_sid_without_trans,
    // the entry is set to nullptr.
    std::vector<SlotDescriptor*> _src_slot_descs_order_by_dest;

    // Dest slot desc index to src slot desc index.
    std::unordered_map<int, int> _dest_slot_to_src_slot_index;

    // Used to filter the src tuple directly.
    // `_pre_filter_texprs` holds the original thrift exprs passed from the scan node.
    const std::vector<TExpr> _pre_filter_texprs;

    bool _strict_mode = false;
    int32_t _line_counter = 0;

    // Profile
    RuntimeProfile* _profile = nullptr;
    RuntimeProfile::Counter* _rows_read_counter = nullptr;
    RuntimeProfile::Counter* _read_timer = nullptr;
    RuntimeProfile::Counter* _materialize_timer = nullptr;

    // Used to record whether a row of data is successfully read.
    bool _success = false;
    bool _scanner_eof = false;

    // For vectorized load.
    std::vector<vectorized::VExprContext*> _dest_vexpr_ctx;
    std::unique_ptr<vectorized::VExprContext*> _vpre_filter_ctx_ptr;
    vectorized::Block _src_block;
    bool _src_block_mem_reuse = false;
    int _num_of_columns_from_file = 0;

    // slot_ids for parquet predicate push down are in tuple desc.
    TupleId _tupleId = -1;

    bool _is_dynamic_schema = false;
    // For tracing dynamic schema.
    std::unique_ptr<vectorized::schema_util::FullBaseSchemaView> _full_base_schema_view;

private:
    Status _filter_src_block();
    void _fill_columns_from_path();
    Status _materialize_dest_block(vectorized::Block* output_block);
};
} /* namespace doris */