diff --git a/be/src/exec/exec_node.cpp b/be/src/exec/exec_node.cpp index cf21e7ae91..62dc3a2acb 100644 --- a/be/src/exec/exec_node.cpp +++ b/be/src/exec/exec_node.cpp @@ -46,7 +46,6 @@ #include "vec/exec/vaggregation_node.h" #include "vec/exec/vanalytic_eval_node.h" #include "vec/exec/vassert_num_rows_node.h" -#include "vec/exec/vbroker_scan_node.h" #include "vec/exec/vdata_gen_scan_node.h" #include "vec/exec/vempty_set_node.h" #include "vec/exec/vexchange_node.h" @@ -340,7 +339,6 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN case TPlanNodeType::SELECT_NODE: case TPlanNodeType::REPEAT_NODE: case TPlanNodeType::TABLE_FUNCTION_NODE: - case TPlanNodeType::BROKER_SCAN_NODE: case TPlanNodeType::DATA_GEN_SCAN_NODE: case TPlanNodeType::FILE_SCAN_NODE: case TPlanNodeType::JDBC_SCAN_NODE: @@ -445,10 +443,6 @@ Status ExecNode::create_node(RuntimeState* state, ObjectPool* pool, const TPlanN *node = pool->add(new vectorized::VExceptNode(pool, tnode, descs)); return Status::OK(); - case TPlanNodeType::BROKER_SCAN_NODE: - *node = pool->add(new vectorized::VBrokerScanNode(pool, tnode, descs)); - return Status::OK(); - case TPlanNodeType::FILE_SCAN_NODE: *node = pool->add(new vectorized::NewFileScanNode(pool, tnode, descs)); return Status::OK(); @@ -529,7 +523,6 @@ void ExecNode::collect_nodes(TPlanNodeType::type node_type, std::vector* nodes) { collect_nodes(TPlanNodeType::OLAP_SCAN_NODE, nodes); - collect_nodes(TPlanNodeType::BROKER_SCAN_NODE, nodes); collect_nodes(TPlanNodeType::ES_HTTP_SCAN_NODE, nodes); collect_nodes(TPlanNodeType::DATA_GEN_SCAN_NODE, nodes); collect_nodes(TPlanNodeType::FILE_SCAN_NODE, nodes); diff --git a/be/src/pipeline/exec/broker_scan_operator.h b/be/src/pipeline/exec/broker_scan_operator.h deleted file mode 100644 index 584ad8c472..0000000000 --- a/be/src/pipeline/exec/broker_scan_operator.h +++ /dev/null @@ -1,59 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include "operator.h" -#include "vec/exec/vbroker_scan_node.h" - -namespace doris::pipeline { - -class BrokerScanOperatorBuilder : public OperatorBuilder { -public: - BrokerScanOperatorBuilder(int32_t id, ExecNode* node) - : OperatorBuilder(id, "BrokerScanOperator", node) {} - bool is_source() const override { return true; } - OperatorPtr build_operator() override; -}; - -class BrokerScanOperator : public SourceOperator { -public: - BrokerScanOperator(OperatorBuilderBase* operator_builder, ExecNode* scan_node) - : SourceOperator(operator_builder, scan_node) {} - - bool can_read() override { return _node->can_read(); } - - bool is_pending_finish() const override { return !_node->can_finish(); } - - Status open(RuntimeState* state) override { - SCOPED_TIMER(_runtime_profile->total_time_counter()); - RETURN_IF_ERROR(SourceOperator::open(state)); - return _node->open(state); - } - - Status close(RuntimeState* state) override { - RETURN_IF_ERROR(SourceOperator::close(state)); - _node->close(state); - return Status::OK(); - } -}; - -OperatorPtr BrokerScanOperatorBuilder::build_operator() { - return std::make_shared(this, _node); -} - -} // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index 51aac3c7a7..7704628da0 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -29,7 +29,6 @@ #include "pipeline/exec/analytic_sink_operator.h" #include "pipeline/exec/analytic_source_operator.h" #include "pipeline/exec/assert_num_rows_operator.h" -#include "pipeline/exec/broker_scan_operator.h" #include "pipeline/exec/const_value_operator.h" #include "pipeline/exec/data_queue.h" #include "pipeline/exec/datagen_operator.h" @@ -306,12 +305,6 @@ Status PipelineFragmentContext::_build_pipelines(ExecNode* node, PipelinePtr cur auto node_type = node->type(); switch (node_type) { // for source - case TPlanNodeType::BROKER_SCAN_NODE: { - OperatorBuilderPtr operator_t = - std::make_shared(next_operator_builder_id(), node); - RETURN_IF_ERROR(cur_pipe->add_operator(operator_t)); - break; - } case TPlanNodeType::OLAP_SCAN_NODE: case TPlanNodeType::JDBC_SCAN_NODE: case TPlanNodeType::ODBC_SCAN_NODE: diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index ac3b9ff0e6..3ef7199ee4 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -840,10 +840,9 @@ void FragmentMgr::_set_scan_concurrency(const TExecPlanFragmentParams& params, bool FragmentMgr::_is_scan_node(const TPlanNodeType::type& type) { return type == TPlanNodeType::OLAP_SCAN_NODE || type == TPlanNodeType::MYSQL_SCAN_NODE || type == TPlanNodeType::SCHEMA_SCAN_NODE || type == TPlanNodeType::META_SCAN_NODE || - type == TPlanNodeType::BROKER_SCAN_NODE || type == TPlanNodeType::ES_SCAN_NODE || - type == TPlanNodeType::ES_HTTP_SCAN_NODE || type == TPlanNodeType::ODBC_SCAN_NODE || - type == TPlanNodeType::DATA_GEN_SCAN_NODE || type == TPlanNodeType::FILE_SCAN_NODE || - type == TPlanNodeType::JDBC_SCAN_NODE; + type == TPlanNodeType::ES_SCAN_NODE || type == TPlanNodeType::ES_HTTP_SCAN_NODE || + type == TPlanNodeType::ODBC_SCAN_NODE || type == TPlanNodeType::DATA_GEN_SCAN_NODE || + type == TPlanNodeType::FILE_SCAN_NODE || type == TPlanNodeType::JDBC_SCAN_NODE; } void FragmentMgr::cancel(const TUniqueId& fragment_id, const PPlanFragmentCancelReason& reason, diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index b2c6e36c5c..fa9a783dc5 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -108,12 +108,8 @@ set(VEC_FILES exec/vassert_num_rows_node.cpp exec/vrepeat_node.cpp exec/vtable_function_node.cpp - exec/vbroker_scan_node.cpp - exec/vbroker_scanner.cpp - exec/vjson_scanner.cpp exec/vjdbc_connector.cpp exec/vparquet_scanner.cpp - exec/vorc_scanner.cpp exec/join/vhash_join_node.cpp exec/join/vjoin_node_base.cpp exec/join/vnested_loop_join_node.cpp diff --git a/be/src/vec/exec/varrow_scanner.cpp b/be/src/vec/exec/varrow_scanner.cpp index 6a8da8d40a..236c336a09 100644 --- a/be/src/vec/exec/varrow_scanner.cpp +++ b/be/src/vec/exec/varrow_scanner.cpp @@ -15,13 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include "vec/exec/varrow_scanner.h" + #include "exec/arrow/parquet_reader.h" #include "exprs/expr.h" #include "io/file_factory.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "vec/data_types/data_type_factory.hpp" -#include "vec/exec/vorc_scanner.h" #include "vec/functions/simple_function_factory.h" #include "vec/utils/arrow_column_to_doris_column.h" diff --git a/be/src/vec/exec/vbroker_scan_node.cpp b/be/src/vec/exec/vbroker_scan_node.cpp deleted file mode 100644 index d0ebe5c6cb..0000000000 --- a/be/src/vec/exec/vbroker_scan_node.cpp +++ /dev/null @@ -1,347 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/vbroker_scan_node.h" - -#include "gen_cpp/PlanNodes_types.h" -#include "runtime/memory/mem_tracker.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple.h" -#include "runtime/tuple_row.h" -#include "util/runtime_profile.h" -#include "util/thread.h" -#include "util/types.h" -#include "vec/common/string_ref.h" -#include "vec/exec/vbroker_scanner.h" -#include "vec/exec/vjson_scanner.h" -#include "vec/exec/vorc_scanner.h" -#include "vec/exec/vparquet_scanner.h" -#include "vec/exprs/vexpr_context.h" - -namespace doris::vectorized { - -VBrokerScanNode::VBrokerScanNode(ObjectPool* pool, const TPlanNode& tnode, - const DescriptorTbl& descs) - : ScanNode(pool, tnode, descs), - _tuple_id(tnode.broker_scan_node.tuple_id), - _runtime_state(nullptr), - _tuple_desc(nullptr), - _num_running_scanners(0), - _scan_finished(false), - _max_buffered_batches(32), - _wait_scanner_timer(nullptr) {} - -Status VBrokerScanNode::init(const TPlanNode& tnode, RuntimeState* state) { - RETURN_IF_ERROR(ScanNode::init(tnode, state)); - auto& broker_scan_node = tnode.broker_scan_node; - - if (broker_scan_node.__isset.pre_filter_exprs) { - _pre_filter_texprs = broker_scan_node.pre_filter_exprs; - } - - return Status::OK(); -} - -Status VBrokerScanNode::prepare(RuntimeState* state) { - VLOG_QUERY << "VBrokerScanNode prepare"; - RETURN_IF_ERROR(ScanNode::prepare(state)); - // get tuple desc - _runtime_state = state; - _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); - if (_tuple_desc == nullptr) { - return Status::InternalError("Failed to get tuple descriptor, _tuple_id={}", _tuple_id); - } - - // Initialize slots map - for (auto slot : _tuple_desc->slots()) { - auto pair = _slots_map.emplace(slot->col_name(), slot); - if (!pair.second) { - return Status::InternalError("Failed to insert slot, col_name={}", slot->col_name()); - } - } - - // Profile - _wait_scanner_timer = ADD_TIMER(runtime_profile(), "WaitScannerTime"); - - return Status::OK(); -} - -Status VBrokerScanNode::open(RuntimeState* state) { - START_AND_SCOPE_SPAN(state->get_tracer(), span, "VBrokerScanNode::open"); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - RETURN_IF_ERROR(ExecNode::open(state)); - RETURN_IF_CANCELLED(state); - - RETURN_IF_ERROR(start_scanners()); - - return Status::OK(); -} - -Status VBrokerScanNode::start_scanners() { - { - std::unique_lock l(_batch_queue_lock); - _num_running_scanners = 1; - } - _scanner_threads.emplace_back([this, size = 0, length = _scan_ranges.size(), - parent_span = opentelemetry::trace::Tracer::GetCurrentSpan()] { - OpentelemetryScope scope {parent_span}; - this->scanner_worker(size, length); - }); - return Status::OK(); -} - -Status VBrokerScanNode::get_next(RuntimeState* state, vectorized::Block* block, bool* eos) { - INIT_AND_SCOPE_GET_NEXT_SPAN(state->get_tracer(), _get_next_span, "VBrokerScanNode::get_next"); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - // check if CANCELLED. - if (state->is_cancelled()) { - std::unique_lock l(_batch_queue_lock); - if (update_status(Status::Cancelled("Cancelled"))) { - // Notify all scanners - _queue_writer_cond.notify_all(); - } - } - - if (_scan_finished.load()) { - *eos = true; - return Status::OK(); - } - - const int batch_size = _runtime_state->batch_size(); - while (true) { - std::shared_ptr scanner_block; - { - std::unique_lock l(_batch_queue_lock); - while (_process_status.ok() && !_runtime_state->is_cancelled() && - _num_running_scanners > 0 && _block_queue.empty()) { - SCOPED_TIMER(_wait_scanner_timer); - _queue_reader_cond.wait_for(l, std::chrono::seconds(1)); - } - if (!_process_status.ok()) { - // Some scanner process failed. - return _process_status; - } - if (_runtime_state->is_cancelled()) { - if (update_status(Status::Cancelled("Cancelled"))) { - _queue_writer_cond.notify_all(); - } - return _process_status; - } - if (!_block_queue.empty()) { - scanner_block = _block_queue.front(); - _block_queue.pop_front(); - } - } - - // All scanner has been finished, and all cached batch has been read - if (!scanner_block) { - if (_mutable_block && !_mutable_block->empty()) { - *block = _mutable_block->to_block(); - reached_limit(block, eos); - LOG_IF(INFO, *eos) << "VBrokerScanNode ReachedLimit."; - } - _scan_finished.store(true); - *eos = true; - return Status::OK(); - } - // notify one scanner - _queue_writer_cond.notify_one(); - - if (UNLIKELY(!_mutable_block)) { - _mutable_block.reset(new MutableBlock(scanner_block->clone_empty())); - } - - if (_mutable_block->rows() + scanner_block->rows() < batch_size) { - // merge scanner_block into _mutable_block - _mutable_block->add_rows(scanner_block.get(), 0, scanner_block->rows()); - continue; - } else { - if (_mutable_block->empty()) { - // directly use scanner_block - *block = *scanner_block; - } else { - // copy _mutable_block firstly, then merge scanner_block into _mutable_block for next. - *block = _mutable_block->to_block(); - _mutable_block->set_muatable_columns(scanner_block->clone_empty_columns()); - _mutable_block->add_rows(scanner_block.get(), 0, scanner_block->rows()); - } - break; - } - } - - reached_limit(block, eos); - if (*eos) { - _scan_finished.store(true); - _queue_writer_cond.notify_all(); - LOG(INFO) << "VBrokerScanNode ReachedLimit."; - } else { - *eos = false; - } - - return Status::OK(); -} - -Status VBrokerScanNode::close(RuntimeState* state) { - if (is_closed()) { - return Status::OK(); - } - START_AND_SCOPE_SPAN(state->get_tracer(), span, "VBrokerScanNode::close"); - SCOPED_TIMER(_runtime_profile->total_time_counter()); - _scan_finished.store(true); - _queue_writer_cond.notify_all(); - _queue_reader_cond.notify_all(); - for (int i = 0; i < _scanner_threads.size(); ++i) { - _scanner_threads[i].join(); - } - - return ExecNode::close(state); -} - -Status VBrokerScanNode::scanner_scan(const TBrokerScanRange& scan_range, ScannerCounter* counter) { - //create scanner object and open - std::unique_ptr scanner = create_scanner(scan_range, counter); - RETURN_IF_ERROR(scanner->open()); - bool scanner_eof = false; - while (!scanner_eof) { - RETURN_IF_CANCELLED(_runtime_state); - // If we have finished all works - if (_scan_finished.load() || !_process_status.ok()) { - return Status::OK(); - } - - std::shared_ptr block(new vectorized::Block()); - RETURN_IF_ERROR(scanner->get_next(block.get(), &scanner_eof)); - if (block->rows() == 0) { - continue; - } - auto old_rows = block->rows(); - RETURN_IF_ERROR(VExprContext::filter_block(_vconjunct_ctx_ptr, block.get(), - _tuple_desc->slots().size())); - counter->num_rows_unselected += old_rows - block->rows(); - if (block->rows() == 0) { - continue; - } - - std::unique_lock l(_batch_queue_lock); - while (_process_status.ok() && !_scan_finished.load() && !_runtime_state->is_cancelled() && - // stop pushing more batch if - // 1. too many batches in queue, or - // 2. at least one batch in queue and memory exceed limit. - (_block_queue.size() >= _max_buffered_batches || !_block_queue.empty())) { - _queue_writer_cond.wait_for(l, std::chrono::seconds(1)); - } - // Process already set failed, so we just return OK - if (!_process_status.ok()) { - return Status::OK(); - } - // Scan already finished, just return - if (_scan_finished.load()) { - return Status::OK(); - } - // Runtime state is canceled, just return cancel - if (_runtime_state->is_cancelled()) { - return Status::Cancelled("Cancelled"); - } - // Queue size Must be smaller than _max_buffered_batches - _block_queue.push_back(std::move(block)); - - // Notify reader to process - _queue_reader_cond.notify_one(); - } - return Status::OK(); -} - -void VBrokerScanNode::scanner_worker(int start_idx, int length) { - START_AND_SCOPE_SPAN(_runtime_state->get_tracer(), span, "VBrokerScanNode::scanner_worker"); - SCOPED_ATTACH_TASK(_runtime_state); - Thread::set_self_name("vbroker_scanner"); - Status status = Status::OK(); - ScannerCounter counter; - for (int i = 0; i < length && status.ok(); ++i) { - const TBrokerScanRange& scan_range = - _scan_ranges[start_idx + i].scan_range.broker_scan_range; - status = scanner_scan(scan_range, &counter); - if (!status.ok()) { - LOG(WARNING) << "Scanner[" << start_idx + i << "] process failed. status=" << status; - } - } - - // Update stats - _runtime_state->update_num_rows_load_filtered(counter.num_rows_filtered); - _runtime_state->update_num_rows_load_unselected(counter.num_rows_unselected); - - // scanner is going to finish - { - std::lock_guard l(_batch_queue_lock); - if (!status.ok()) { - update_status(status); - } - // This scanner will finish - _num_running_scanners--; - } - _queue_reader_cond.notify_all(); - // If one scanner failed, others don't need scan any more - if (!status.ok()) { - _queue_writer_cond.notify_all(); - } -} - -std::unique_ptr VBrokerScanNode::create_scanner(const TBrokerScanRange& scan_range, - ScannerCounter* counter) { - BaseScanner* scan = nullptr; - switch (scan_range.ranges[0].format_type) { - case TFileFormatType::FORMAT_PARQUET: - scan = new vectorized::VParquetScanner(_runtime_state, runtime_profile(), scan_range.params, - scan_range.ranges, scan_range.broker_addresses, - _pre_filter_texprs, counter); - break; - case TFileFormatType::FORMAT_ORC: - scan = new vectorized::VORCScanner(_runtime_state, runtime_profile(), scan_range.params, - scan_range.ranges, scan_range.broker_addresses, - _pre_filter_texprs, counter); - break; - case TFileFormatType::FORMAT_JSON: - if (config::enable_simdjson_reader) { - scan = new vectorized::VJsonScanner( - _runtime_state, runtime_profile(), scan_range.params, scan_range.ranges, - scan_range.broker_addresses, _pre_filter_texprs, counter); - } else { - scan = new vectorized::VJsonScanner( - _runtime_state, runtime_profile(), scan_range.params, scan_range.ranges, - scan_range.broker_addresses, _pre_filter_texprs, counter); - } - break; - default: - scan = new vectorized::VBrokerScanner(_runtime_state, runtime_profile(), scan_range.params, - scan_range.ranges, scan_range.broker_addresses, - _pre_filter_texprs, counter); - } - std::unique_ptr scanner(scan); - return scanner; -} - -// This function is called after plan node has been prepared. -Status VBrokerScanNode::set_scan_ranges(const std::vector& scan_ranges) { - _scan_ranges = scan_ranges; - return Status::OK(); -} - -void VBrokerScanNode::debug_string(int ident_level, std::stringstream* out) const { - (*out) << "VBrokerScanNode"; -} - -} // namespace doris::vectorized diff --git a/be/src/vec/exec/vbroker_scan_node.h b/be/src/vec/exec/vbroker_scan_node.h deleted file mode 100644 index bad1bd93c7..0000000000 --- a/be/src/vec/exec/vbroker_scan_node.h +++ /dev/null @@ -1,115 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include - -#include "common/status.h" -#include "exec/base_scanner.h" -#include "exec/scan_node.h" -#include "gen_cpp/PaloInternalService_types.h" -#include "runtime/descriptors.h" - -namespace doris { - -class RuntimeState; -class Status; - -namespace vectorized { -class VBrokerScanNode final : public ScanNode { -public: - VBrokerScanNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs); - ~VBrokerScanNode() override = default; - - // Called after create this scan node - Status init(const TPlanNode& tnode, RuntimeState* state = nullptr) override; - - // Prepare partition infos & set up timer - Status prepare(RuntimeState* state) override; - - // Start broker scan using ParquetScanner or BrokerScanner. - Status open(RuntimeState* state) override; - - Status get_next(RuntimeState* state, vectorized::Block* block, bool* eos) override; - - // Close the scanner, and report errors. - Status close(RuntimeState* state) override; - - // No use - Status set_scan_ranges(const std::vector& scan_ranges) override; - - bool can_read() { return true; } - bool can_finish() const { return _num_running_scanners == 0; } - -private: - // Write debug string of this into out. - void debug_string(int indentation_level, std::stringstream* out) const override; - - // Update process status to one failed status, - // NOTE: Must hold the mutex of this scan node - bool update_status(const Status& new_status) { - if (_process_status.ok()) { - _process_status = new_status; - return true; - } - return false; - } - - std::unique_ptr create_scanner(const TBrokerScanRange& scan_range, - ScannerCounter* counter); - - Status start_scanners(); - - void scanner_worker(int start_idx, int length); - // Scan one range - Status scanner_scan(const TBrokerScanRange& scan_range, ScannerCounter* counter); - - TupleId _tuple_id; - RuntimeState* _runtime_state; - TupleDescriptor* _tuple_desc; - std::map _slots_map; - std::vector _scan_ranges; - - std::mutex _batch_queue_lock; - std::condition_variable _queue_reader_cond; - std::condition_variable _queue_writer_cond; - - std::atomic _num_running_scanners; - - std::atomic _scan_finished; - - Status _process_status; - - std::vector _scanner_threads; - - int _max_buffered_batches; - - // The origin preceding filter exprs. - // These exprs will be converted to expr context - // in XXXScanner. - // Because the row descriptor used for these exprs is `src_row_desc`, - // which is initialized in XXXScanner. - std::vector _pre_filter_texprs; - - RuntimeProfile::Counter* _wait_scanner_timer; - - std::deque> _block_queue; - std::unique_ptr _mutable_block; -}; -} // namespace vectorized -} // namespace doris diff --git a/be/src/vec/exec/vbroker_scanner.cpp b/be/src/vec/exec/vbroker_scanner.cpp deleted file mode 100644 index c39b2003fb..0000000000 --- a/be/src/vec/exec/vbroker_scanner.cpp +++ /dev/null @@ -1,471 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/vbroker_scanner.h" - -#include - -#include "common/consts.h" -#include "exec/line_reader.h" -#include "exec/plain_binary_line_reader.h" -#include "exec/plain_text_line_reader.h" -#include "exec/text_converter.h" -#include "exec/text_converter.hpp" -#include "gen_cpp/internal_service.pb.h" -#include "io/file_factory.h" -#include "util/utf8_check.h" - -namespace doris::vectorized { - -VBrokerScanner::VBrokerScanner(RuntimeState* state, RuntimeProfile* profile, - const TBrokerScanRangeParams& params, - const std::vector& ranges, - const std::vector& broker_addresses, - const std::vector& pre_filter_texprs, ScannerCounter* counter) - : BaseScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs, counter), - _cur_file_reader(nullptr), - _cur_line_reader(nullptr), - _cur_decompressor(nullptr), - _cur_line_reader_eof(false), - _skip_lines(0) { - if (params.__isset.column_separator_length && params.column_separator_length > 1) { - _value_separator = params.column_separator_str; - _value_separator_length = params.column_separator_length; - } else { - _value_separator.push_back(static_cast(params.column_separator)); - _value_separator_length = 1; - } - if (params.__isset.line_delimiter_length && params.line_delimiter_length > 1) { - _line_delimiter = params.line_delimiter_str; - _line_delimiter_length = params.line_delimiter_length; - } else { - _line_delimiter.push_back(static_cast(params.line_delimiter)); - _line_delimiter_length = 1; - } - if (params.__isset.trim_double_quotes) { - _trim_double_quotes = params.trim_double_quotes; - } - _split_values.reserve(sizeof(Slice) * params.src_slot_ids.size()); - _text_converter.reset(new (std::nothrow) TextConverter('\\')); - _src_block_mem_reuse = true; -} - -VBrokerScanner::~VBrokerScanner() { - close(); -} - -Status VBrokerScanner::open() { - RETURN_IF_ERROR(BaseScanner::open()); // base default function - return Status::OK(); -} - -Status VBrokerScanner::_open_file_reader() { - const TBrokerRangeDesc& range = _ranges[_next_range]; - int64_t start_offset = range.start_offset; - if (start_offset != 0) { - start_offset -= 1; - } - //means first range, skip - if (start_offset == 0 && range.header_type.size() > 0) { - std::string header_type = to_lower(range.header_type); - if (header_type == BeConsts::CSV_WITH_NAMES) { - _skip_lines = 1; - } else if (header_type == BeConsts::CSV_WITH_NAMES_AND_TYPES) { - _skip_lines = 2; - } - } - - if (range.file_type == TFileType::FILE_STREAM) { - RETURN_IF_ERROR(FileFactory::create_pipe_reader(range.load_id, _cur_file_reader_s)); - _real_reader = _cur_file_reader_s.get(); - } else { - RETURN_IF_ERROR(FileFactory::create_file_reader( - range.file_type, _state->exec_env(), _profile, _broker_addresses, - _params.properties, range, start_offset, _cur_file_reader)); - _real_reader = _cur_file_reader.get(); - } - return _real_reader->open(); -} - -Status VBrokerScanner::_create_decompressor(TFileFormatType::type type) { - if (_cur_decompressor != nullptr) { - delete _cur_decompressor; - _cur_decompressor = nullptr; - } - - CompressType compress_type; - switch (type) { - case TFileFormatType::FORMAT_CSV_PLAIN: - case TFileFormatType::FORMAT_JSON: - case TFileFormatType::FORMAT_PROTO: - compress_type = CompressType::UNCOMPRESSED; - break; - case TFileFormatType::FORMAT_CSV_GZ: - compress_type = CompressType::GZIP; - break; - case TFileFormatType::FORMAT_CSV_BZ2: - compress_type = CompressType::BZIP2; - break; - case TFileFormatType::FORMAT_CSV_LZ4FRAME: - compress_type = CompressType::LZ4FRAME; - break; - case TFileFormatType::FORMAT_CSV_LZOP: - compress_type = CompressType::LZOP; - break; - case TFileFormatType::FORMAT_CSV_DEFLATE: - compress_type = CompressType::DEFLATE; - break; - default: { - return Status::InternalError("Unknown format type, cannot inference compress type, type={}", - type); - } - } - RETURN_IF_ERROR(Decompressor::create_decompressor(compress_type, &_cur_decompressor)); - - return Status::OK(); -} - -Status VBrokerScanner::_open_line_reader() { - if (_cur_decompressor != nullptr) { - delete _cur_decompressor; - _cur_decompressor = nullptr; - } - - if (_cur_line_reader != nullptr) { - delete _cur_line_reader; - _cur_line_reader = nullptr; - } - - const TBrokerRangeDesc& range = _ranges[_next_range]; - int64_t size = range.size; - if (range.start_offset != 0) { - if (range.format_type != TFileFormatType::FORMAT_CSV_PLAIN) { - return Status::InternalError("For now we do not support split compressed file"); - } - size += 1; - // not first range will always skip one line - _skip_lines = 1; - } - - // create decompressor. - // _decompressor may be nullptr if this is not a compressed file - RETURN_IF_ERROR(_create_decompressor(range.format_type)); - - _file_format_type = range.format_type; - // open line reader - switch (range.format_type) { - case TFileFormatType::FORMAT_CSV_PLAIN: - case TFileFormatType::FORMAT_CSV_GZ: - case TFileFormatType::FORMAT_CSV_BZ2: - case TFileFormatType::FORMAT_CSV_LZ4FRAME: - case TFileFormatType::FORMAT_CSV_LZOP: - case TFileFormatType::FORMAT_CSV_DEFLATE: - _cur_line_reader = new PlainTextLineReader(_profile, _real_reader, _cur_decompressor, size, - _line_delimiter, _line_delimiter_length); - break; - case TFileFormatType::FORMAT_PROTO: - _cur_line_reader = new PlainBinaryLineReader(_real_reader); - break; - default: { - return Status::InternalError("Unknown format type, cannot init line reader, type={}", - range.format_type); - } - } - - _cur_line_reader_eof = false; - - return Status::OK(); -} - -void VBrokerScanner::close() { - BaseScanner::close(); - if (_cur_decompressor != nullptr) { - delete _cur_decompressor; - _cur_decompressor = nullptr; - } - - if (_cur_line_reader != nullptr) { - delete _cur_line_reader; - _cur_line_reader = nullptr; - } -} - -Status VBrokerScanner::_open_next_reader() { - if (_next_range >= _ranges.size()) { - _scanner_eof = true; - return Status::OK(); - } - - RETURN_IF_ERROR(_open_file_reader()); - RETURN_IF_ERROR(_open_line_reader()); - _next_range++; - - return Status::OK(); -} - -Status VBrokerScanner::_line_to_src_tuple(const Slice& line) { - RETURN_IF_ERROR(_line_split_to_values(line)); - if (!_success) { - return Status::OK(); - } - - for (int i = 0; i < _split_values.size(); ++i) { - auto slot_desc = _src_slot_descs[i]; - const Slice& value = _split_values[i]; - if (slot_desc->is_nullable() && is_null(value)) { - _src_tuple->set_null(slot_desc->null_indicator_offset()); - continue; - } - _src_tuple->set_not_null(slot_desc->null_indicator_offset()); - void* slot = _src_tuple->get_slot(slot_desc->tuple_offset()); - Slice* str_slot = reinterpret_cast(slot); - str_slot->data = value.data; - str_slot->size = value.size; - } - - const TBrokerRangeDesc& range = _ranges.at(_next_range - 1); - if (range.__isset.num_of_columns_from_file) { - fill_slots_of_columns_from_path(range.num_of_columns_from_file, range.columns_from_path); - } - - return Status::OK(); -} - -void VBrokerScanner::split_line(const Slice& line) { - _split_values.clear(); - if (_file_format_type == TFileFormatType::FORMAT_PROTO) { - PDataRow** ptr = reinterpret_cast(line.data); - PDataRow* row = *ptr; - for (const PDataColumn& col : (row)->col()) { - int len = col.value().size(); - uint8_t* buf = new uint8_t[len]; - memcpy(buf, col.value().c_str(), len); - _split_values.emplace_back(buf, len); - } - delete row; - delete[] ptr; - } else { - const char* value = line.data; - size_t start = 0; // point to the start pos of next col value. - size_t curpos = 0; // point to the start pos of separator matching sequence. - size_t p1 = 0; // point to the current pos of separator matching sequence. - size_t non_space = 0; // point to the last pos of non_space character. - - // Separator: AAAA - // - // p1 - // ▼ - // AAAA - // 1000AAAA2000AAAA - // ▲ ▲ - // Start │ - // curpos - - while (curpos < line.size) { - if (curpos + p1 == line.size || *(value + curpos + p1) != _value_separator[p1]) { - // Not match, move forward: - curpos += (p1 == 0 ? 1 : p1); - p1 = 0; - } else { - p1++; - if (p1 == _value_separator_length) { - // Match a separator - non_space = curpos; - // Trim tailing spaces. Be consistent with hive and trino's behavior. - if (_state->trim_tailing_spaces_for_external_table_query()) { - while (non_space > start && *(value + non_space - 1) == ' ') { - non_space--; - } - } - if (_trim_double_quotes && (non_space - 1) > start && - *(value + start) == '\"' && *(value + non_space - 1) == '\"') { - start++; - non_space--; - } - _split_values.emplace_back(value + start, non_space - start); - start = curpos + _value_separator_length; - curpos = start; - p1 = 0; - non_space = 0; - } - } - } - - CHECK(curpos == line.size) << curpos << " vs " << line.size; - non_space = curpos; - if (_state->trim_tailing_spaces_for_external_table_query()) { - while (non_space > start && *(value + non_space - 1) == ' ') { - non_space--; - } - } - if (_trim_double_quotes && (non_space - 1) > start && *(value + start) == '\"' && - *(value + non_space - 1) == '\"') { - start++; - non_space--; - } - _split_values.emplace_back(value + start, non_space - start); - } -} - -Status VBrokerScanner::_line_split_to_values(const Slice& line) { - bool is_proto_format = _file_format_type == TFileFormatType::FORMAT_PROTO; - if (!is_proto_format && !validate_utf8(line.data, line.size)) { - RETURN_IF_ERROR(_state->append_error_msg_to_file( - []() -> std::string { return "Unable to display"; }, - []() -> std::string { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", "Unable to display"); - return fmt::to_string(error_msg); - }, - &_scanner_eof)); - _counter->num_rows_filtered++; - _success = false; - return Status::OK(); - } - - split_line(line); - - // range of current file - const TBrokerRangeDesc& range = _ranges.at(_next_range - 1); - bool read_by_column_def = false; - if (range.__isset.read_by_column_def) { - read_by_column_def = range.read_by_column_def; - } - const std::vector& columns_from_path = range.columns_from_path; - // read data by column definition, resize _split_values to _src_solt_size - if (read_by_column_def) { - // fill slots by NULL - while (_split_values.size() + columns_from_path.size() < _src_slot_descs.size()) { - _split_values.emplace_back(_split_values.back().get_data(), 0); - } - // remove redundant slots - while (_split_values.size() + columns_from_path.size() > _src_slot_descs.size()) { - _split_values.pop_back(); - } - } else { - if (_split_values.size() + columns_from_path.size() < _src_slot_descs.size()) { - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { - return is_proto_format ? "" : std::string(line.data, line.size); - }, - [&]() -> std::string { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", - "actual column number is less than schema column number."); - fmt::format_to(error_msg, "actual number: {}, column separator: [{}], ", - _split_values.size(), _value_separator); - fmt::format_to(error_msg, "line delimiter: [{}], schema number: {}; ", - _line_delimiter, _src_slot_descs.size()); - return fmt::to_string(error_msg); - }, - &_scanner_eof)); - _counter->num_rows_filtered++; - _success = false; - return Status::OK(); - } else if (_split_values.size() + columns_from_path.size() > _src_slot_descs.size()) { - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { - return is_proto_format ? "" : std::string(line.data, line.size); - }, - [&]() -> std::string { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", - "actual column number is more than schema column number."); - fmt::format_to(error_msg, "actual number: {}, column separator: [{}], ", - _split_values.size(), _value_separator); - fmt::format_to(error_msg, "line delimiter: [{}], schema number: {}; ", - _line_delimiter, _src_slot_descs.size()); - return fmt::to_string(error_msg); - }, - &_scanner_eof)); - _counter->num_rows_filtered++; - _success = false; - return Status::OK(); - } - } - - _success = true; - return Status::OK(); -} - -Status VBrokerScanner::get_next(Block* output_block, bool* eof) { - SCOPED_TIMER(_read_timer); - RETURN_IF_ERROR(_init_src_block()); - - const int batch_size = _state->batch_size(); - auto columns = _src_block.mutate_columns(); - - while (columns[0]->size() < batch_size && !_scanner_eof) { - if (_cur_line_reader == nullptr || _cur_line_reader_eof) { - RETURN_IF_ERROR(_open_next_reader()); - // If there isn't any more reader, break this - if (_scanner_eof) { - continue; - } - } - const uint8_t* ptr = nullptr; - size_t size = 0; - RETURN_IF_ERROR(_cur_line_reader->read_line(&ptr, &size, &_cur_line_reader_eof)); - if (_skip_lines > 0) { - _skip_lines--; - continue; - } - if (size == 0) { - // Read empty row, just continue - continue; - } - { - COUNTER_UPDATE(_rows_read_counter, 1); - SCOPED_TIMER(_materialize_timer); - RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), columns)); - if (_success) { - free_expr_local_allocations(); - } - } - } - columns.clear(); - - return _fill_dest_block(output_block, eof); -} - -Status VBrokerScanner::_fill_dest_columns(const Slice& line, - std::vector& columns) { - RETURN_IF_ERROR(_line_split_to_values(line)); - if (UNLIKELY(!_success)) { - // If not success, which means we met an invalid row, return. - return Status::OK(); - } - - // This check is meaningless, should be removed - // if (!check_array_format(_split_values)) { - // return Status::OK(); - // } - - int idx = 0; - for (int i = 0; i < _split_values.size(); ++i) { - int dest_index = idx++; - - auto src_slot_desc = _src_slot_descs[i]; - - const Slice& value = _split_values[i]; - _text_converter->write_string_column(src_slot_desc, &columns[dest_index], value.data, - value.size); - } - - return Status::OK(); -} -} // namespace doris::vectorized diff --git a/be/src/vec/exec/vbroker_scanner.h b/be/src/vec/exec/vbroker_scanner.h deleted file mode 100644 index 34f3d75b72..0000000000 --- a/be/src/vec/exec/vbroker_scanner.h +++ /dev/null @@ -1,94 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include "common/status.h" -#include "exec/base_scanner.h" -#include "exec/decompressor.h" -#include "exec/line_reader.h" -#include "exec/text_converter.h" -#include "gen_cpp/PlanNodes_types.h" -#include "gen_cpp/Types_types.h" -#include "io/file_reader.h" -#include "runtime/mem_pool.h" -#include "util/runtime_profile.h" -#include "util/slice.h" - -namespace doris::vectorized { -class VBrokerScanner final : public BaseScanner { -public: - VBrokerScanner(RuntimeState* state, RuntimeProfile* profile, - const TBrokerScanRangeParams& params, - const std::vector& ranges, - const std::vector& broker_addresses, - const std::vector& pre_filter_texprs, ScannerCounter* counter); - ~VBrokerScanner() override; - - Status open() override; - - Status get_next(Block* block, bool* eof) override; - - void close() override; - -private: - Status _open_file_reader(); - Status _create_decompressor(TFileFormatType::type type); - Status _open_line_reader(); - // Read next buffer from reader - Status _open_next_reader(); - Status _line_to_src_tuple(const Slice& line); - Status _line_split_to_values(const Slice& line); - // Split one text line to values - void split_line(const Slice& line); - - std::unique_ptr _text_converter; - - Status _fill_dest_columns(const Slice& line, std::vector& columns); - - std::string _value_separator; - std::string _line_delimiter; - TFileFormatType::type _file_format_type; - int _value_separator_length; - int _line_delimiter_length; - - // Reader - // _cur_file_reader_s is for stream load pipe reader, - // and _cur_file_reader is for other file reader. - // TODO: refactor this to use only shared_ptr or unique_ptr - std::unique_ptr _cur_file_reader; - std::shared_ptr _cur_file_reader_s; - FileReader* _real_reader; - LineReader* _cur_line_reader; - Decompressor* _cur_decompressor; - bool _cur_line_reader_eof; - - // When we fetch range start from 0, header_type="csv_with_names" skip first line - // When we fetch range start from 0, header_type="csv_with_names_and_types" skip first two line - // When we fetch range doesn't start from 0 will always skip the first line - int _skip_lines; - - std::vector _split_values; - bool _trim_double_quotes = false; -}; -} // namespace doris::vectorized diff --git a/be/src/vec/exec/vjson_scanner.cpp b/be/src/vec/exec/vjson_scanner.cpp deleted file mode 100644 index 7585af4d98..0000000000 --- a/be/src/vec/exec/vjson_scanner.cpp +++ /dev/null @@ -1,1611 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/vjson_scanner.h" - -#include - -#include - -#include "exec/line_reader.h" -#include "exec/plain_text_line_reader.h" -#include "exprs/json_functions.h" -#include "io/file_factory.h" -#include "runtime/runtime_state.h" -#include "vec/data_types/data_type_string.h" - -namespace doris::vectorized { -using namespace ErrorCode; - -template -VJsonScanner::VJsonScanner(RuntimeState* state, RuntimeProfile* profile, - const TBrokerScanRangeParams& params, - const std::vector& ranges, - const std::vector& broker_addresses, - const std::vector& pre_filter_texprs, - ScannerCounter* counter) - : BaseScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs, counter), - _cur_file_reader(nullptr), - _cur_file_reader_s(nullptr), - _real_reader(nullptr), - _cur_line_reader(nullptr), - _cur_json_reader(nullptr), - _cur_reader_eof(false), - _read_json_by_line(false) { - if (params.__isset.line_delimiter_length && params.line_delimiter_length > 1) { - _line_delimiter = params.line_delimiter_str; - _line_delimiter_length = params.line_delimiter_length; - } else { - _line_delimiter.push_back(static_cast(params.line_delimiter)); - _line_delimiter_length = 1; - } -} - -template -VJsonScanner::~VJsonScanner() { - close(); -} - -template -Status VJsonScanner::open() { - return BaseScanner::open(); -} - -template -void VJsonScanner::close() { - BaseScanner::close(); - if (_cur_json_reader != nullptr) { - delete _cur_json_reader; - _cur_json_reader = nullptr; - } - if (_cur_line_reader != nullptr) { - delete _cur_line_reader; - _cur_line_reader = nullptr; - } -} - -template -Status VJsonScanner::get_next(vectorized::Block* output_block, bool* eof) { - SCOPED_TIMER(_read_timer); - RETURN_IF_ERROR(_init_src_block()); - const int batch_size = _state->batch_size(); - - auto columns = _src_block.mutate_columns(); - // Get one line - while (columns[0]->size() < batch_size && !_scanner_eof) { - if (_real_reader == nullptr || _cur_reader_eof) { - RETURN_IF_ERROR(_open_next_reader()); - // If there isn't any more reader, break this - if (_scanner_eof) { - break; - } - } - - if (_read_json_by_line && _skip_next_line) { - size_t size = 0; - const uint8_t* line_ptr = nullptr; - RETURN_IF_ERROR(_cur_line_reader->read_line(&line_ptr, &size, &_cur_reader_eof)); - _skip_next_line = false; - continue; - } - - bool is_empty_row = false; - if constexpr (std::is_same_v) { - RETURN_IF_ERROR(_cur_vjson_reader->read_json_column(_src_block, _src_slot_descs, - &is_empty_row, &_cur_reader_eof)); - } else { - RETURN_IF_ERROR(_cur_vjson_reader->read_json_column(columns, _src_slot_descs, - &is_empty_row, &_cur_reader_eof)); - } - if (is_empty_row) { - // Read empty row, just continue - continue; - } - } - - COUNTER_UPDATE(_rows_read_counter, columns[0]->size()); - SCOPED_TIMER(_materialize_timer); - - return _fill_dest_block(output_block, eof); -} - -template -Status VJsonScanner::_open_next_reader() { - if (_next_range >= _ranges.size()) { - _scanner_eof = true; - return Status::OK(); - } - RETURN_IF_ERROR(_open_based_reader()); - RETURN_IF_ERROR(_open_vjson_reader()); - _next_range++; - return Status::OK(); -} - -template -Status VJsonScanner::_open_vjson_reader() { - if (_cur_vjson_reader != nullptr) { - _cur_vjson_reader.reset(); - } - std::string json_root; - std::string jsonpath; - bool strip_outer_array = false; - bool num_as_string = false; - bool fuzzy_parse = false; - - RETURN_IF_ERROR( - _get_range_params(jsonpath, json_root, strip_outer_array, num_as_string, fuzzy_parse)); - _cur_vjson_reader.reset(new JsonReader(_state, _counter, _profile, strip_outer_array, - num_as_string, fuzzy_parse, &_scanner_eof, - _read_json_by_line ? nullptr : _real_reader, - _read_json_by_line ? _cur_line_reader : nullptr)); - - RETURN_IF_ERROR(_cur_vjson_reader->init(jsonpath, json_root)); - return Status::OK(); -} - -template -Status VJsonScanner::_open_based_reader() { - RETURN_IF_ERROR(_open_file_reader()); - if (_read_json_by_line) { - RETURN_IF_ERROR(_open_line_reader()); - } - return Status::OK(); -} - -template -Status VJsonScanner::_open_file_reader() { - const TBrokerRangeDesc& range = _ranges[_next_range]; - int64_t start_offset = range.start_offset; - if (start_offset != 0) { - start_offset -= 1; - } - if (range.__isset.read_json_by_line) { - _read_json_by_line = range.read_json_by_line; - } - - if (range.file_type == TFileType::FILE_STREAM) { - RETURN_IF_ERROR(FileFactory::create_pipe_reader(range.load_id, _cur_file_reader_s)); - _real_reader = _cur_file_reader_s.get(); - } else { - RETURN_IF_ERROR(FileFactory::create_file_reader( - range.file_type, _state->exec_env(), _profile, _broker_addresses, - _params.properties, range, start_offset, _cur_file_reader)); - _real_reader = _cur_file_reader.get(); - } - _cur_reader_eof = false; - return _real_reader->open(); -} - -template -Status VJsonScanner::_open_line_reader() { - if (_cur_line_reader != nullptr) { - delete _cur_line_reader; - _cur_line_reader = nullptr; - } - - const TBrokerRangeDesc& range = _ranges[_next_range]; - int64_t size = range.size; - if (range.start_offset != 0) { - size += 1; - _skip_next_line = true; - } else { - _skip_next_line = false; - } - _cur_line_reader = new PlainTextLineReader(_profile, _real_reader, nullptr, size, - _line_delimiter, _line_delimiter_length); - _cur_reader_eof = false; - return Status::OK(); -} - -template -Status VJsonScanner::_open_json_reader() { - if (_cur_json_reader != nullptr) { - delete _cur_json_reader; - _cur_json_reader = nullptr; - } - - std::string json_root = ""; - std::string jsonpath = ""; - bool strip_outer_array = false; - bool num_as_string = false; - bool fuzzy_parse = false; - - RETURN_IF_ERROR( - _get_range_params(jsonpath, json_root, strip_outer_array, num_as_string, fuzzy_parse)); - if (_read_json_by_line) { - _cur_json_reader = - new JsonReader(_state, _counter, _profile, strip_outer_array, num_as_string, - fuzzy_parse, &_scanner_eof, nullptr, _cur_line_reader); - } else { - _cur_json_reader = new JsonReader(_state, _counter, _profile, strip_outer_array, - num_as_string, fuzzy_parse, &_scanner_eof, _real_reader); - } - - RETURN_IF_ERROR(_cur_json_reader->init(jsonpath, json_root)); - return Status::OK(); -} - -template -Status VJsonScanner::_get_range_params(std::string& jsonpath, std::string& json_root, - bool& strip_outer_array, bool& num_as_string, - bool& fuzzy_parse) { - const TBrokerRangeDesc& range = _ranges[_next_range]; - - if (range.__isset.jsonpaths) { - jsonpath = range.jsonpaths; - } - if (range.__isset.json_root) { - json_root = range.json_root; - } - if (range.__isset.strip_outer_array) { - strip_outer_array = range.strip_outer_array; - } - if (range.__isset.num_as_string) { - num_as_string = range.num_as_string; - } - if (range.__isset.fuzzy_parse) { - fuzzy_parse = range.fuzzy_parse; - } - return Status::OK(); -} - -VJsonReader::VJsonReader(RuntimeState* state, ScannerCounter* counter, RuntimeProfile* profile, - bool strip_outer_array, bool num_as_string, bool fuzzy_parse, - bool* scanner_eof, FileReader* file_reader, LineReader* line_reader) - : _vhandle_json_callback(nullptr), - _next_line(0), - _total_lines(0), - _state(state), - _counter(counter), - _profile(profile), - _file_reader(file_reader), - _line_reader(line_reader), - _closed(false), - _strip_outer_array(strip_outer_array), - _num_as_string(num_as_string), - _fuzzy_parse(fuzzy_parse), - _value_allocator(_value_buffer, sizeof(_value_buffer)), - _parse_allocator(_parse_buffer, sizeof(_parse_buffer)), - _origin_json_doc(&_value_allocator, sizeof(_parse_buffer), &_parse_allocator), - _json_doc(nullptr), - _scanner_eof(scanner_eof) { - _bytes_read_counter = ADD_COUNTER(_profile, "BytesRead", TUnit::BYTES); - _read_timer = ADD_TIMER(_profile, "ReadTime"); - _file_read_timer = ADD_TIMER(_profile, "FileReadTime"); -} - -VJsonReader::~VJsonReader() { - _close(); -} - -Status VJsonReader::init(const std::string& jsonpath, const std::string& json_root) { - // generate _parsed_jsonpaths and _parsed_json_root - RETURN_IF_ERROR(_parse_jsonpath_and_json_root(jsonpath, json_root)); - - //improve performance - if (_parsed_jsonpaths.empty()) { // input is a simple json-string - _vhandle_json_callback = &VJsonReader::_vhandle_simple_json; - } else { // input is a complex json-string and a json-path - if (_strip_outer_array) { - _vhandle_json_callback = &VJsonReader::_vhandle_flat_array_complex_json; - } else { - _vhandle_json_callback = &VJsonReader::_vhandle_nested_complex_json; - } - } - - return Status::OK(); -} - -Status VJsonReader::read_json_column(std::vector& columns, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof) { - return (this->*_vhandle_json_callback)(columns, slot_descs, is_empty_row, eof); -} - -Status VJsonReader::_vhandle_simple_json(std::vector& columns, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof) { - do { - bool valid = false; - if (_next_line >= _total_lines) { // parse json and generic document - Status st = _parse_json(is_empty_row, eof); - if (st.is()) { - continue; // continue to read next - } - RETURN_IF_ERROR(st); - if (*is_empty_row == true) { - return Status::OK(); - } - _name_map.clear(); - rapidjson::Value* objectValue = nullptr; - if (_json_doc->IsArray()) { - _total_lines = _json_doc->Size(); - if (_total_lines == 0) { - // may be passing an empty json, such as "[]" - RETURN_IF_ERROR(_append_error_msg(*_json_doc, "Empty json line", "", nullptr)); - if (*_scanner_eof) { - *is_empty_row = true; - return Status::OK(); - } - continue; - } - objectValue = &(*_json_doc)[0]; - } else { - _total_lines = 1; // only one row - objectValue = _json_doc; - } - _next_line = 0; - if (_fuzzy_parse) { - for (auto v : slot_descs) { - for (int i = 0; i < objectValue->MemberCount(); ++i) { - auto it = objectValue->MemberBegin() + i; - if (v->col_name() == it->name.GetString()) { - _name_map[v->col_name()] = i; - break; - } - } - } - } - } - - if (_json_doc->IsArray()) { // handle case 1 - rapidjson::Value& objectValue = (*_json_doc)[_next_line]; // json object - RETURN_IF_ERROR(_set_column_value(objectValue, columns, slot_descs, &valid)); - } else { // handle case 2 - RETURN_IF_ERROR(_set_column_value(*_json_doc, columns, slot_descs, &valid)); - } - _next_line++; - if (!valid) { - if (*_scanner_eof) { - // When _scanner_eof is true and valid is false, it means that we have encountered - // unqualified data and decided to stop the scan. - *is_empty_row = true; - return Status::OK(); - } - continue; - } - *is_empty_row = false; - break; // get a valid row, then break - } while (_next_line <= _total_lines); - return Status::OK(); -} - -// for simple format json -// set valid to true and return OK if succeed. -// set valid to false and return OK if we met an invalid row. -// return other status if encounter other problems. -Status VJsonReader::_set_column_value(rapidjson::Value& objectValue, - std::vector& columns, - const std::vector& slot_descs, bool* valid) { - if (!objectValue.IsObject()) { - // Here we expect the incoming `objectValue` to be a Json Object, such as {"key" : "value"}, - // not other type of Json format. - RETURN_IF_ERROR(_append_error_msg(objectValue, "Expect json object value", "", valid)); - return Status::OK(); - } - - int ctx_idx = 0; - bool has_valid_value = false; - size_t cur_row_count = columns[0]->size(); - for (auto slot_desc : slot_descs) { - if (!slot_desc->is_materialized()) { - continue; - } - - int dest_index = ctx_idx++; - auto* column_ptr = columns[dest_index].get(); - rapidjson::Value::ConstMemberIterator it = objectValue.MemberEnd(); - - if (_fuzzy_parse) { - auto idx_it = _name_map.find(slot_desc->col_name()); - if (idx_it != _name_map.end() && idx_it->second < objectValue.MemberCount()) { - it = objectValue.MemberBegin() + idx_it->second; - } - } else { - it = objectValue.FindMember( - rapidjson::Value(slot_desc->col_name().c_str(), slot_desc->col_name().size())); - } - - if (it != objectValue.MemberEnd()) { - const rapidjson::Value& value = it->value; - RETURN_IF_ERROR(_write_data_to_column(&value, slot_desc, column_ptr, valid)); - if (!(*valid)) { - return Status::OK(); - } - has_valid_value = true; - } else { // not found - // When the entire row has no valid value, this row should be filtered, - // so the default value cannot be directly inserted here - if (!slot_desc->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - objectValue, - "The column `{}` is not nullable, but it's not found in jsondata.", - slot_desc->col_name(), valid)); - break; - } - } - } - if (!has_valid_value) { - RETURN_IF_ERROR(_append_error_msg(objectValue, "All fields is null, this is a invalid row.", - "", valid)); - return Status::OK(); - } - ctx_idx = 0; - int nullcount = 0; - // fill missing slot - for (auto slot_desc : slot_descs) { - if (!slot_desc->is_materialized()) { - continue; - } - int dest_index = ctx_idx++; - auto* column_ptr = columns[dest_index].get(); - if (column_ptr->size() < cur_row_count + 1) { - DCHECK(column_ptr->size() == cur_row_count); - column_ptr->assume_mutable()->insert_default(); - ++nullcount; - } - DCHECK(column_ptr->size() == cur_row_count + 1); - } - // There is at least one valid value here - DCHECK(nullcount < columns.size()); - *valid = true; - return Status::OK(); -} - -Status VJsonReader::_write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, - vectorized::IColumn* column_ptr, bool* valid) { - const char* str_value = nullptr; - char tmp_buf[128] = {0}; - int32_t wbytes = 0; - std::string json_str; - - vectorized::ColumnNullable* nullable_column = nullptr; - if (slot_desc->is_nullable()) { - nullable_column = reinterpret_cast(column_ptr); - // kNullType will put 1 into the Null map, so there is no need to push 0 for kNullType. - if (value->GetType() != rapidjson::Type::kNullType) { - nullable_column->get_null_map_data().push_back(0); - } else { - nullable_column->insert_default(); - } - column_ptr = &nullable_column->get_nested_column(); - } - - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - wbytes = strlen(str_value); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - } else if (value->IsInt()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - } else if (value->IsUint64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - } else if (value->IsInt64()) { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - } else { - wbytes = snprintf(tmp_buf, sizeof(tmp_buf), "%f", value->GetDouble()); - } - str_value = tmp_buf; - break; - case rapidjson::Type::kFalseType: - wbytes = 1; - str_value = (char*)"0"; - break; - case rapidjson::Type::kTrueType: - wbytes = 1; - str_value = (char*)"1"; - break; - case rapidjson::Type::kNullType: - if (!slot_desc->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - *value, "Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); - return Status::OK(); - } - // return immediately to prevent from repeatedly insert_data - *valid = true; - return Status::OK(); - default: - // for other type like array or object. we convert it to string to save - json_str = _print_json_value(*value); - wbytes = json_str.size(); - str_value = json_str.c_str(); - break; - } - - // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR', - // we need use a function to support these types to insert data in columns. - DCHECK(slot_desc->type().type == TYPE_VARCHAR); - assert_cast(column_ptr)->insert_data(str_value, wbytes); - - *valid = true; - return Status::OK(); -} - -Status VJsonReader::_vhandle_flat_array_complex_json(std::vector& columns, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof) { - do { - if (_next_line >= _total_lines) { - Status st = _parse_json(is_empty_row, eof); - if (st.is()) { - continue; // continue to read next - } - RETURN_IF_ERROR(st); - if (*is_empty_row == true) { - if (st == Status::OK()) { - return Status::OK(); - } - if (_total_lines == 0) { - continue; - } - } - } - rapidjson::Value& objectValue = (*_json_doc)[_next_line++]; - bool valid = true; - RETURN_IF_ERROR(_write_columns_by_jsonpath(objectValue, slot_descs, columns, &valid)); - if (!valid) { - continue; // process next line - } - *is_empty_row = false; - break; // get a valid row, then break - } while (_next_line <= _total_lines); - return Status::OK(); -} - -Status VJsonReader::_vhandle_nested_complex_json(std::vector& columns, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof) { - while (true) { - Status st = _parse_json(is_empty_row, eof); - if (st.is()) { - continue; // continue to read next - } - RETURN_IF_ERROR(st); - if (*is_empty_row == true) { - return Status::OK(); - } - *is_empty_row = false; - break; // read a valid row - } - bool valid = true; - RETURN_IF_ERROR(_write_columns_by_jsonpath(*_json_doc, slot_descs, columns, &valid)); - if (!valid) { - // there is only one line in this case, so if it return false, just set is_empty_row true - // so that the caller will continue reading next line. - *is_empty_row = true; - } - return Status::OK(); -} - -Status VJsonReader::_write_columns_by_jsonpath(rapidjson::Value& objectValue, - const std::vector& slot_descs, - std::vector& columns, - bool* valid) { - int ctx_idx = 0; - bool has_valid_value = false; - size_t cur_row_count = columns[0]->size(); - for (auto slot_desc : slot_descs) { - if (!slot_desc->is_materialized()) { - continue; - } - int i = ctx_idx++; - auto* column_ptr = columns[i].get(); - rapidjson::Value* json_values = nullptr; - bool wrap_explicitly = false; - if (LIKELY(i < _parsed_jsonpaths.size())) { - json_values = JsonFunctions::get_json_array_from_parsed_json( - _parsed_jsonpaths[i], &objectValue, _origin_json_doc.GetAllocator(), - &wrap_explicitly); - } - - if (json_values == nullptr) { - // not match in jsondata. - if (!slot_descs[i]->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - objectValue, - "The column `{}` is not nullable, but it's not found in jsondata.", - slot_descs[i]->col_name(), valid)); - return Status::OK(); - } - } else { - CHECK(json_values->IsArray()); - if (json_values->Size() == 1 && wrap_explicitly) { - // NOTICE1: JsonFunctions::get_json_array_from_parsed_json() will wrap the single json object with an array. - // so here we unwrap the array to get the real element. - // if json_values' size > 1, it means we just match an array, not a wrapped one, so no need to unwrap. - json_values = &((*json_values)[0]); - } - RETURN_IF_ERROR(_write_data_to_column(json_values, slot_descs[i], column_ptr, valid)); - if (!(*valid)) { - return Status::OK(); - } - has_valid_value = true; - } - } - if (!has_valid_value) { - RETURN_IF_ERROR(_append_error_msg( - objectValue, "All fields is null or not matched, this is a invalid row.", "", - valid)); - return Status::OK(); - } - ctx_idx = 0; - for (auto slot_desc : slot_descs) { - if (!slot_desc->is_materialized()) { - continue; - } - int dest_index = ctx_idx++; - auto* column_ptr = columns[dest_index].get(); - if (column_ptr->size() < cur_row_count + 1) { - DCHECK(column_ptr->size() == cur_row_count); - column_ptr->assume_mutable()->insert_default(); - } - DCHECK(column_ptr->size() == cur_row_count + 1); - } - return Status::OK(); -} - -Status VJsonReader::_parse_json(bool* is_empty_row, bool* eof) { - size_t size = 0; - Status st = _parse_json_doc(&size, eof); - // terminate if encounter other errors - RETURN_IF_ERROR(st); - - // read all data, then return - if (size == 0 || *eof) { - *is_empty_row = true; - return Status::OK(); - } - - if (!_parsed_jsonpaths.empty() && _strip_outer_array) { - _total_lines = _json_doc->Size(); - _next_line = 0; - - if (_total_lines == 0) { - // meet an empty json array. - *is_empty_row = true; - } - } - return Status::OK(); -} - -Status VJsonReader::_append_error_msg(const rapidjson::Value& objectValue, std::string error_msg, - std::string col_name, bool* valid) { - std::string err_msg; - if (!col_name.empty()) { - fmt::memory_buffer error_buf; - fmt::format_to(error_buf, error_msg, col_name); - err_msg = fmt::to_string(error_buf); - } else { - err_msg = error_msg; - } - - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(objectValue); }, - [&]() -> std::string { return err_msg; }, _scanner_eof)); - - _counter->num_rows_filtered++; - if (valid != nullptr) { - // current row is invalid - *valid = false; - } - return Status::OK(); -} - -// simdjson -VSIMDJsonReader::VSIMDJsonReader(doris::RuntimeState* state, doris::ScannerCounter* counter, - RuntimeProfile* profile, bool strip_outer_array, - bool num_as_string, bool fuzzy_parse, bool* scanner_eof, - FileReader* file_reader, LineReader* line_reader) - : _vhandle_json_callback(nullptr), - _next_line(0), - _total_lines(0), - _state(state), - _counter(counter), - _profile(profile), - _file_reader(file_reader), - _line_reader(line_reader), - _strip_outer_array(strip_outer_array), - _scanner_eof(scanner_eof) { - _bytes_read_counter = ADD_COUNTER(_profile, "BytesRead", TUnit::BYTES); - _read_timer = ADD_TIMER(_profile, "ReadTime"); - _file_read_timer = ADD_TIMER(_profile, "FileReadTime"); - _json_parser = std::make_unique(); -} - -VSIMDJsonReader::~VSIMDJsonReader() {} - -Status VSIMDJsonReader::init(const std::string& jsonpath, const std::string& json_root) { - // generate _parsed_jsonpaths and _parsed_json_root - RETURN_IF_ERROR(_parse_jsonpath_and_json_root(jsonpath, json_root)); - - // improve performance - if (_parsed_jsonpaths.empty()) { // input is a simple json-string - _vhandle_json_callback = &VSIMDJsonReader::_vhandle_simple_json; - } else { // input is a complex json-string and a json-path - if (_strip_outer_array) { - _vhandle_json_callback = &VSIMDJsonReader::_vhandle_flat_array_complex_json; - } else { - _vhandle_json_callback = &VSIMDJsonReader::_vhandle_nested_complex_json; - } - } - - return Status::OK(); -} - -Status VSIMDJsonReader::read_json_column(Block& block, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof) { - return (this->*_vhandle_json_callback)(block, slot_descs, is_empty_row, eof); -} - -#define RETURN_IF_SIMDJSON_ERROR(error, col_name, valid) \ - if (UNLIKELY(error)) { \ - RETURN_IF_ERROR(_append_error_msg("Encounter error while iterate json", col_name, valid)); \ - Status::DataQualityError("Encounter error while iterate json"); \ - } - -// for simple format json -// set valid to true and return OK if succeed. -// set valid to false and return OK if we met an invalid row. -// return other status if encounter other problems. -Status VSIMDJsonReader::_set_column_value(simdjson::ondemand::value objectValue, Block& block, - const std::vector& slot_descs, - bool* valid) { - if (objectValue.type() != simdjson::ondemand::json_type::object) { - // Here we expect the incoming `objectValue` to be a Json Object, such as {"key" : "value"}, - // not other type of Json format. - RETURN_IF_ERROR(_append_error_msg("Expect json object value", "", valid)); - return Status::OK(); - } - - auto object_val = objectValue.get_object(); - size_t cur_row_count = block.rows(); - bool has_valid_value = false; - // iterate through object, simdjson::ondemond will parsing on the fly - for (auto field : object_val) { - std::string_view key; - RETURN_IF_SIMDJSON_ERROR(field.unescaped_key().get(key), "", valid) - auto column_type_and_name = block.try_get_by_name(std::string(key)); - if (!column_type_and_name) { - continue; - } - _write_data_to_column(field.value(), nullptr, - column_type_and_name->column->assume_mutable().get(), valid); - if (!(*valid)) { - return Status::OK(); - } - has_valid_value = true; - } - if (!has_valid_value) { - RETURN_IF_ERROR(_append_error_msg("All fields is null, this is a invalid row.", "", valid)); - return Status::OK(); - } - - int nullcount = 0; - // fill missing slot - for (const auto& column_type_name : block) { - auto column = column_type_name.column; - if (column->size() < cur_row_count + 1) { - DCHECK(column->size() == cur_row_count); - column->assume_mutable()->insert_default(); - ++nullcount; - } - DCHECK(column->size() == cur_row_count + 1); - } - if (nullcount == block.columns()) { - RETURN_IF_ERROR(_append_error_msg("All fields is null, this is a invalid row.", "", valid)); - return Status::OK(); - } - - *valid = true; - return Status::OK(); -} - -Status VSIMDJsonReader::_write_data_to_column(simdjson::ondemand::value value, - SlotDescriptor* slot_desc, - vectorized::IColumn* column, bool* valid) { - vectorized::ColumnNullable* nullable_column = nullptr; - vectorized::IColumn* column_ptr = nullptr; - if (column->is_nullable()) { - nullable_column = assert_cast(column); - column_ptr = &nullable_column->get_nested_column(); - } - - // TODO: if the vexpr can support another 'slot_desc type' than 'TYPE_VARCHAR', - // we need use a function to support these types to insert data in columns. - ColumnString* column_string = assert_cast(column_ptr); - if (value.is_null()) { - if (column->is_nullable()) { - // insert_default already push 1 to null_map - nullable_column->insert_default(); - } else { - RETURN_IF_ERROR( - _append_error_msg("Json value is null, but the column `{}` is not nullable.", - slot_desc->col_name(), valid)); - return Status::OK(); - } - } else if (value.type() == simdjson::ondemand::json_type::boolean) { - nullable_column->get_null_map_data().push_back(0); - if (value.get_bool()) { - column_string->insert_data("1", 1); - } else { - column_string->insert_data("0", 1); - } - } else { - // just return it's str representation - auto str_view = simdjson::to_json_string(value).value(); - if (str_view[0] == '\"' || str_view[0] == '\'') { - str_view = str_view.substr(1, str_view.length() - 2); - } - nullable_column->get_null_map_data().push_back(0); - column_string->insert_data(str_view.data(), str_view.length()); - } - - *valid = true; - return Status::OK(); -} - -Status VSIMDJsonReader::_parse_json(bool* is_empty_row, bool* eof) { - size_t size = 0; - Status st = _parse_json_doc(&size, eof); - // terminate if encounter other errors - RETURN_IF_ERROR(st); - - // read all data, then return - if (size == 0 || *eof) { - *is_empty_row = true; - return Status::OK(); - } - - if (!_parsed_jsonpaths.empty() && _strip_outer_array) { - _total_lines = _json_value.count_elements(); - _next_line = 0; - - if (_total_lines == 0) { - // meet an empty json array. - *is_empty_row = true; - } - } - return Status::OK(); -} - -// read one json string from line reader or file reader and parse it to json doc. -// return Status::DataQualityError() if data has quality error. -// return other error if encounter other problems. -// return Status::OK() if parse succeed or reach EOF. -Status VSIMDJsonReader::_parse_json_doc(size_t* size, bool* eof) { - // read a whole message - SCOPED_TIMER(_file_read_timer); - const uint8_t* json_str = nullptr; - std::unique_ptr json_str_ptr; - if (_line_reader != nullptr) { - RETURN_IF_ERROR(_line_reader->read_line(&json_str, size, eof)); - } else { - int64_t length = 0; - RETURN_IF_ERROR(_file_reader->read_one_message(&json_str_ptr, &length)); - json_str = json_str_ptr.get(); - *size = length; - if (length == 0) { - *eof = true; - } - } - - _bytes_read_counter += *size; - if (*eof) { - return Status::OK(); - } - memcpy(_simdjson_ondemand_padding_buffer, json_str, *size); - auto error = _json_parser - ->iterate(std::string_view(reinterpret_cast( - _simdjson_ondemand_padding_buffer), - *size), - _padded_size) - .get(_original_json_doc); - if (error != simdjson::error_code::SUCCESS) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "Parse json data for JsonDoc failed. code: {}, error info: {}", - error, simdjson::error_message(error)); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return std::string((char*)json_str, *size); }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Case A: if _scanner_eof is set to true in "append_error_msg_to_file", which means - // we meet enough invalid rows and the scanner should be stopped. - // So we set eof to true and return OK, the caller will stop the process as we meet the end of file. - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - // set json root - if (_parsed_json_root.size() != 0) { - auto real_doc = _original_json_doc.at_pointer( - std::string_view {_parsed_json_root.data(), _parsed_json_root.size()}); - if (real_doc.error() != simdjson::error_code::SUCCESS) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", "JSON Root not found."); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { - return std::string(simdjson::to_json_string(_original_json_doc).value()); - }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Same as Case A - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - RETURN_IF_SIMDJSON_ERROR(real_doc.get(_json_value), "", nullptr); - } else { - RETURN_IF_SIMDJSON_ERROR(_original_json_doc.get(_json_value), "", nullptr); - } - - if (_json_value.type() == simdjson::ondemand::json_type::array && !_strip_outer_array) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", - "JSON data is array-object, `strip_outer_array` must be TRUE."); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { - return std::string(simdjson::to_json_string(_json_value).value()); - }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Same as Case A - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - - if (_json_value.type() != simdjson::ondemand::json_type::array && _strip_outer_array) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", - "JSON data is not an array-object, `strip_outer_array` must be FALSE."); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { - return std::string(simdjson::to_json_string(_json_value).value()); - }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Same as Case A - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - return Status::OK(); -} - -Status VSIMDJsonReader::_append_error_msg(std::string error_msg, std::string col_name, - bool* valid) { - std::string err_msg; - if (!col_name.empty()) { - fmt::memory_buffer error_buf; - fmt::format_to(error_buf, error_msg, col_name); - err_msg = fmt::to_string(error_buf); - } else { - err_msg = error_msg; - } - - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { - return std::string(simdjson::to_json_string(_original_json_doc).value()); - }, - [&]() -> std::string { return err_msg; }, _scanner_eof)); - - _counter->num_rows_filtered++; - if (valid != nullptr) { - // current row is invalid - *valid = false; - } - return Status::OK(); -} - -Status VSIMDJsonReader::_vhandle_simple_json(Block& block, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof) { - simdjson::ondemand::value objectValue; - simdjson::ondemand::array array; - do { - bool valid = false; - try { - if (_next_line >= _total_lines) { // parse json and generic document - Status st = _parse_json(is_empty_row, eof); - if (st.is()) { - continue; // continue to read next - } - RETURN_IF_ERROR(st); - if (*is_empty_row == true) { - return Status::OK(); - } - if (_json_value.type() == simdjson::ondemand::json_type::array) { - array = _json_value.get_array(); - _array_iter = array.begin(); - - _total_lines = array.count_elements(); - if (_total_lines == 0) { - // may be passing an empty json, such as "[]" - RETURN_IF_ERROR(_append_error_msg("Empty json line", "", nullptr)); - if (*_scanner_eof) { - *is_empty_row = true; - return Status::OK(); - } - continue; - } - } else { - _total_lines = 1; // only one row - objectValue = _json_value; - } - _next_line = 0; - } - - if (_json_value.type() == simdjson::ondemand::json_type::array) { // handle case 1 - objectValue = *_array_iter; - RETURN_IF_ERROR(_set_column_value(objectValue, block, slot_descs, &valid)); - ++_array_iter; - } else { // handle case 2 - RETURN_IF_ERROR(_set_column_value(_json_value, block, slot_descs, &valid)); - } - _next_line++; - if (!valid) { - if (*_scanner_eof) { - // When _scanner_eof is true and valid is false, it means that we have encountered - // unqualified data and decided to stop the scan. - *is_empty_row = true; - return Status::OK(); - } - continue; - } - *is_empty_row = false; - break; // get a valid row, then break - } catch (simdjson::simdjson_error& e) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "Parse json data for array failed. code: {}, error info: {}", - e.error(), e.what()); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return ""; }, - [&]() -> std::string { return fmt::to_string(error_msg); }, eof)); - _counter->num_rows_filtered++; - RETURN_IF_ERROR(_append_error_msg("Empty json line", "", nullptr)); - if (!valid) { - if (*_scanner_eof) { - // When _scanner_eof is true and valid is false, it means that we have encountered - // unqualified data and decided to stop the scan. - *is_empty_row = true; - return Status::OK(); - } - continue; - } - continue; - } - } while (_next_line <= _total_lines); - return Status::OK(); -} - -Status VSIMDJsonReader::_vhandle_flat_array_complex_json( - Block& block, const std::vector& slot_descs, bool* is_empty_row, - bool* eof) { - do { - try { - if (_next_line >= _total_lines) { - Status st = _parse_json(is_empty_row, eof); - if (st.is()) { - continue; // continue to read next - } - RETURN_IF_ERROR(st); - if (*is_empty_row == true) { - if (st == Status::OK()) { - return Status::OK(); - } - if (_total_lines == 0) { - continue; - } - } - simdjson::ondemand::array array; - RETURN_IF_SIMDJSON_ERROR(_json_value.get(array), "", nullptr); - _array_iter = array.begin(); - } - bool valid = true; - RETURN_IF_ERROR(_write_columns_by_jsonpath(*_array_iter, slot_descs, block, &valid)); - ++_array_iter; - ++_next_line; - if (!valid) { - continue; // process next line - } - *is_empty_row = false; - break; // get a valid row, then break - } catch (simdjson::simdjson_error& e) { - RETURN_IF_SIMDJSON_ERROR(e.error(), "", nullptr); - } - } while (_next_line <= _total_lines); - return Status::OK(); -} - -Status VSIMDJsonReader::_vhandle_nested_complex_json(Block& block, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof) { - while (true) { - try { - Status st = _parse_json(is_empty_row, eof); - if (st.is()) { - continue; // continue to read next - } - RETURN_IF_ERROR(st); - if (*is_empty_row == true) { - return Status::OK(); - } - *is_empty_row = false; - break; // read a valid row - } catch (simdjson::simdjson_error& e) { - RETURN_IF_SIMDJSON_ERROR(e.error(), "", nullptr); - } - } - bool valid = true; - RETURN_IF_ERROR(_write_columns_by_jsonpath(_json_value, slot_descs, block, &valid)); - if (!valid) { - // there is only one line in this case, so if it return false, just set is_empty_row true - // so that the caller will continue reading next line. - *is_empty_row = true; - } - return Status::OK(); -} - -// convert `["$.k1[0]", "$.k2.a"]` -> ["/k1/0", "/k2/a"] -static std::optional convert_to_simdjson_path( - const std::vector& parsed_paths) { - std::stringstream read_path; - bool is_valid = true; - std::for_each(parsed_paths.begin() + 1, parsed_paths.end(), - [&read_path, &is_valid](const auto& path) { - if (is_valid) { - read_path << path.to_simdjson_pointer(&is_valid); - } - }); - if (!is_valid) { - return {}; - } - return read_path.str(); -} - -Status VSIMDJsonReader::_parse_jsonpath_and_json_root(const std::string& jsonpath, - const std::string& json_root) { - // parse jsonpath - if (!jsonpath.empty()) { - RETURN_IF_ERROR(_generate_json_paths(jsonpath, &_parsed_jsonpaths)); - } - if (!json_root.empty()) { - std::vector parsed_json_root; - JsonFunctions::parse_json_paths(json_root, &parsed_json_root); - auto json_root_path = convert_to_simdjson_path(parsed_json_root); - if (!json_root_path) { - return Status::InvalidArgument("Invalid json root: " + json_root); - } - _parsed_json_root = json_root_path.value(); - } - return Status::OK(); -} - -Status VSIMDJsonReader::_generate_json_paths(const std::string& jsonpath, - std::vector* vect) { - memcpy(_simdjson_ondemand_padding_buffer, jsonpath.data(), jsonpath.size()); - simdjson::ondemand::document path_doc; - auto error = _json_parser - ->iterate(std::string_view(reinterpret_cast( - _simdjson_ondemand_padding_buffer), - jsonpath.size()), - _padded_size) - .get(path_doc); - if (error || path_doc.type() != simdjson::ondemand::json_type::array) { - return Status::InvalidArgument("Invalid json path: " + jsonpath); - } - for (auto item : path_doc) { - if (item.type() != simdjson::ondemand::json_type::string) { - return Status::InvalidArgument("Invalid json path: " + jsonpath); - } - std::vector parsed_paths; - JsonFunctions::parse_json_paths(std::string(item.get_string().value()), &parsed_paths); - - auto simdjson_path = convert_to_simdjson_path(parsed_paths); - if (!simdjson_path) { - return Status::InvalidArgument("Invalid json path: " + jsonpath); - } - vect->push_back(simdjson_path.value()); - } - return Status::OK(); -} - -Status VSIMDJsonReader::_write_columns_by_jsonpath(simdjson::ondemand::value value, - const std::vector& slot_descs, - Block& block, bool* valid) { - size_t column_num = slot_descs.size(); - auto object_value = value.get_object(); - bool has_valid_value = false; - size_t cur_row_count = block.rows(); - for (size_t i = 0; i < column_num; i++) { - auto* column_ptr = block.get_by_position(i).column->assume_mutable().get(); - simdjson::simdjson_result json_value; - if (i < _parsed_jsonpaths.size()) { - json_value = object_value.at_pointer( - std::string_view {_parsed_jsonpaths[i].data(), _parsed_jsonpaths[i].size()}); - } - if (i >= _parsed_jsonpaths.size() || json_value.error() != simdjson::error_code::SUCCESS) { - // not match in jsondata. - if (!slot_descs[i]->is_nullable()) { - RETURN_IF_ERROR(_append_error_msg( - "The column `{}` is not nullable, but it's not found in jsondata.", - slot_descs[i]->col_name(), valid)); - return Status::OK(); - } - } else { - RETURN_IF_ERROR( - _write_data_to_column(json_value.value(), slot_descs[i], column_ptr, valid)); - if (!(*valid)) { - return Status::OK(); - } - has_valid_value = true; - } - object_value.reset(); - } - if (!has_valid_value) { - RETURN_IF_ERROR(_append_error_msg("All fields is null, this is a invalid row.", "", valid)); - return Status::OK(); - } - - // fill missing slot - for (const auto& column_type_name : block) { - auto column = column_type_name.column; - if (column->size() < cur_row_count + 1) { - DCHECK(column->size() == cur_row_count); - column->assume_mutable()->insert_default(); - } - DCHECK(column->size() == cur_row_count + 1); - } - return Status::OK(); -} - -Status VJsonReader::_parse_jsonpath_and_json_root(const std::string& jsonpath, - const std::string& json_root) { - // parse jsonpath - if (!jsonpath.empty()) { - RETURN_IF_ERROR(_generate_json_paths(jsonpath, &_parsed_jsonpaths)); - } - if (!json_root.empty()) { - JsonFunctions::parse_json_paths(json_root, &_parsed_json_root); - } - return Status::OK(); -} - -Status VJsonReader::_generate_json_paths(const std::string& jsonpath, - std::vector>* vect) { - rapidjson::Document jsonpaths_doc; - if (!jsonpaths_doc.Parse(jsonpath.c_str(), jsonpath.length()).HasParseError()) { - if (!jsonpaths_doc.IsArray()) { - return Status::InvalidArgument("Invalid json path: {}", jsonpath); - } else { - for (int i = 0; i < jsonpaths_doc.Size(); i++) { - const rapidjson::Value& path = jsonpaths_doc[i]; - if (!path.IsString()) { - return Status::InvalidArgument("Invalid json path: {}", jsonpath); - } - std::vector parsed_paths; - JsonFunctions::parse_json_paths(path.GetString(), &parsed_paths); - vect->push_back(std::move(parsed_paths)); - } - return Status::OK(); - } - } else { - return Status::InvalidArgument("Invalid json path: {}", jsonpath); - } -} - -void VJsonReader::_close() { - if (_closed) { - return; - } - _closed = true; -} - -// read one json string from line reader or file reader and parse it to json doc. -// return Status::DataQualityError() if data has quality error. -// return other error if encounter other problems. -// return Status::OK() if parse succeed or reach EOF. -Status VJsonReader::_parse_json_doc(size_t* size, bool* eof) { - // read a whole message - SCOPED_TIMER(_file_read_timer); - const uint8_t* json_str = nullptr; - std::unique_ptr json_str_ptr; - if (_line_reader != nullptr) { - RETURN_IF_ERROR(_line_reader->read_line(&json_str, size, eof)); - } else { - int64_t length = 0; - RETURN_IF_ERROR(_file_reader->read_one_message(&json_str_ptr, &length)); - json_str = json_str_ptr.get(); - *size = length; - if (length == 0) { - *eof = true; - } - } - - _bytes_read_counter += *size; - if (*eof) { - return Status::OK(); - } - - // clear memory here. - _value_allocator.Clear(); - _parse_allocator.Clear(); - bool has_parse_error = false; - // parse jsondata to JsonDoc - - // As the issue: https://github.com/Tencent/rapidjson/issues/1458 - // Now, rapidjson only support uint64_t, So lagreint load cause bug. We use kParseNumbersAsStringsFlag. - if (_num_as_string) { - has_parse_error = - _origin_json_doc - .Parse((char*)json_str, *size) - .HasParseError(); - } else { - has_parse_error = _origin_json_doc.Parse((char*)json_str, *size).HasParseError(); - } - - if (has_parse_error) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "Parse json data for JsonDoc failed. code: {}, error info: {}", - _origin_json_doc.GetParseError(), - rapidjson::GetParseError_En(_origin_json_doc.GetParseError())); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return std::string((char*)json_str, *size); }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Case A: if _scanner_eof is set to true in "append_error_msg_to_file", which means - // we meet enough invalid rows and the scanner should be stopped. - // So we set eof to true and return OK, the caller will stop the process as we meet the end of file. - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - - // set json root - if (_parsed_json_root.size() != 0) { - _json_doc = JsonFunctions::get_json_object_from_parsed_json( - _parsed_json_root, &_origin_json_doc, _origin_json_doc.GetAllocator()); - if (_json_doc == nullptr) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", "JSON Root not found."); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(_origin_json_doc); }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Same as Case A - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - } else { - _json_doc = &_origin_json_doc; - } - - if (_json_doc->IsArray() && !_strip_outer_array) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", - "JSON data is array-object, `strip_outer_array` must be TRUE."); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(_origin_json_doc); }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Same as Case A - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - - if (!_json_doc->IsArray() && _strip_outer_array) { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, "{}", - "JSON data is not an array-object, `strip_outer_array` must be FALSE."); - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(_origin_json_doc); }, - [&]() -> std::string { return fmt::to_string(error_msg); }, _scanner_eof)); - _counter->num_rows_filtered++; - if (*_scanner_eof) { - // Same as Case A - *eof = true; - return Status::OK(); - } - return Status::DataQualityError(fmt::to_string(error_msg)); - } - - return Status::OK(); -} - -std::string VJsonReader::_print_json_value(const rapidjson::Value& value) { - rapidjson::StringBuffer buffer; - buffer.Clear(); - rapidjson::Writer writer(buffer); - value.Accept(writer); - return std::string(buffer.GetString()); -} - -// TODO: NEED TO REWRITE COMPLETELY. the way writing now is WRONG. -// StringRef shouldn't managing exclusive memory cause it will break RAII. -// besides, accessing object which is essentially const by non-const object -// is UB! -void VJsonReader::_fill_slot(doris::Tuple* tuple, SlotDescriptor* slot_desc, MemPool* mem_pool, - const uint8_t* value, int32_t len) { - tuple->set_not_null(slot_desc->null_indicator_offset()); - void* slot = tuple->get_slot(slot_desc->tuple_offset()); - StringRef* str_slot = reinterpret_cast(slot); - str_slot->data = reinterpret_cast(mem_pool->allocate(len)); - memcpy(const_cast(str_slot->data), value, len); - str_slot->size = len; -} - -Status VJsonReader::_write_data_to_tuple(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* desc, doris::Tuple* tuple, - MemPool* tuple_pool, bool* valid) { - const char* str_value = nullptr; - uint8_t tmp_buf[128] = {0}; - int32_t wbytes = 0; - switch (value->GetType()) { - case rapidjson::Type::kStringType: - str_value = value->GetString(); - _fill_slot(tuple, desc, tuple_pool, (uint8_t*)str_value, strlen(str_value)); - break; - case rapidjson::Type::kNumberType: - if (value->IsUint()) { - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%u", value->GetUint()); - _fill_slot(tuple, desc, tuple_pool, tmp_buf, wbytes); - } else if (value->IsInt()) { - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%d", value->GetInt()); - _fill_slot(tuple, desc, tuple_pool, tmp_buf, wbytes); - } else if (value->IsUint64()) { - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%" PRIu64, value->GetUint64()); - _fill_slot(tuple, desc, tuple_pool, tmp_buf, wbytes); - } else if (value->IsInt64()) { - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%" PRId64, value->GetInt64()); - _fill_slot(tuple, desc, tuple_pool, tmp_buf, wbytes); - } else { - wbytes = snprintf((char*)tmp_buf, sizeof(tmp_buf), "%f", value->GetDouble()); - _fill_slot(tuple, desc, tuple_pool, tmp_buf, wbytes); - } - break; - case rapidjson::Type::kFalseType: - _fill_slot(tuple, desc, tuple_pool, (uint8_t*)"0", 1); - break; - case rapidjson::Type::kTrueType: - _fill_slot(tuple, desc, tuple_pool, (uint8_t*)"1", 1); - break; - case rapidjson::Type::kNullType: - if (desc->is_nullable()) { - tuple->set_null(desc->null_indicator_offset()); - } else { - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(*value); }, - [&]() -> std::string { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, - "Json value is null, but the column `{}` is not nullable.", - desc->col_name()); - return fmt::to_string(error_msg); - }, - _scanner_eof)); - _counter->num_rows_filtered++; - *valid = false; - return Status::OK(); - } - break; - default: - // for other type like array or object. we convert it to string to save - std::string json_str = _print_json_value(*value); - _fill_slot(tuple, desc, tuple_pool, (uint8_t*)json_str.c_str(), json_str.length()); - break; - } - *valid = true; - return Status::OK(); -} - -// for simple format json -// set valid to true and return OK if succeed. -// set valid to false and return OK if we met an invalid row. -// return other status if encounter other problmes. -Status VJsonReader::_set_tuple_value(rapidjson::Value& objectValue, doris::Tuple* tuple, - const std::vector& slot_descs, - MemPool* tuple_pool, bool* valid) { - if (!objectValue.IsObject()) { - // Here we expect the incoming `objectValue` to be a Json Object, such as {"key" : "value"}, - // not other type of Json format. - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(objectValue); }, - [&]() -> std::string { return "Expect json object value"; }, _scanner_eof)); - _counter->num_rows_filtered++; - *valid = false; // current row is invalid - return Status::OK(); - } - - int nullcount = 0; - for (auto v : slot_descs) { - rapidjson::Value::ConstMemberIterator it = objectValue.MemberEnd(); - if (_fuzzy_parse) { - auto idx_it = _name_map.find(v->col_name()); - if (idx_it != _name_map.end() && idx_it->second < objectValue.MemberCount()) { - it = objectValue.MemberBegin() + idx_it->second; - } - } else { - it = objectValue.FindMember( - rapidjson::Value(v->col_name().c_str(), v->col_name().size())); - } - if (it != objectValue.MemberEnd()) { - const rapidjson::Value& value = it->value; - RETURN_IF_ERROR(_write_data_to_tuple(&value, v, tuple, tuple_pool, valid)); - if (!(*valid)) { - return Status::OK(); - } - } else { // not found - if (v->is_nullable()) { - tuple->set_null(v->null_indicator_offset()); - nullcount++; - } else { - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(objectValue); }, - [&]() -> std::string { - fmt::memory_buffer error_msg; - fmt::format_to(error_msg, - "The column `{}` is not nullable, but it's not found in " - "jsondata.", - v->col_name()); - return fmt::to_string(error_msg); - }, - _scanner_eof)); - _counter->num_rows_filtered++; - *valid = false; // current row is invalid - break; - } - } - } - - if (nullcount == slot_descs.size()) { - RETURN_IF_ERROR(_state->append_error_msg_to_file( - [&]() -> std::string { return _print_json_value(objectValue); }, - [&]() -> std::string { return "All fields is null, this is a invalid row."; }, - _scanner_eof)); - _counter->num_rows_filtered++; - *valid = false; - return Status::OK(); - } - *valid = true; - return Status::OK(); -} - -template class VJsonScanner; -template class VJsonScanner; -} // namespace doris::vectorized diff --git a/be/src/vec/exec/vjson_scanner.h b/be/src/vec/exec/vjson_scanner.h deleted file mode 100644 index 3f32648f29..0000000000 --- a/be/src/vec/exec/vjson_scanner.h +++ /dev/null @@ -1,279 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "common/status.h" -#include "exec/base_scanner.h" -#include "exec/line_reader.h" -#include "exprs/json_functions.h" -#include "io/file_reader.h" -#include "runtime/descriptors.h" -#include "util/runtime_profile.h" - -namespace doris { -class ExprContext; -class RuntimeState; -struct ScannerCounter; - -namespace vectorized { -class VJsonReader; - -template -class VJsonScanner : public BaseScanner { -public: - VJsonScanner(RuntimeState* state, RuntimeProfile* profile, const TBrokerScanRangeParams& params, - const std::vector& ranges, - const std::vector& broker_addresses, - const std::vector& pre_filter_texprs, ScannerCounter* counter); - - ~VJsonScanner() override; - - // Open this scanner, will initialize information needed - Status open() override; - - Status get_next(vectorized::Block* output_block, bool* eof) override; - - void close() override; - -private: - Status _open_vjson_reader(); - Status _open_next_reader(); - - Status _open_file_reader(); - Status _open_line_reader(); - Status _open_json_reader(); - - Status _open_based_reader(); - Status _get_range_params(std::string& jsonpath, std::string& json_root, bool& strip_outer_array, - bool& num_as_string, bool& fuzzy_parse); - std::string _jsonpath; - std::string _jsonpath_file; - - std::string _line_delimiter; - int _line_delimiter_length; - - // Reader - // _cur_file_reader_s is for stream load pipe reader, - // and _cur_file_reader is for other file reader. - // TODO: refactor this to use only shared_ptr or unique_ptr - std::unique_ptr _cur_file_reader; - std::shared_ptr _cur_file_reader_s; - FileReader* _real_reader; - LineReader* _cur_line_reader; - JsonReader* _cur_json_reader; - bool _cur_reader_eof; - bool _read_json_by_line; - - // When we fetch range doesn't start from 0, - // we will read to one ahead, and skip the first line - bool _skip_next_line; - std::unique_ptr _cur_vjson_reader = nullptr; -}; - -class VJsonReader { -public: - VJsonReader(RuntimeState* state, ScannerCounter* counter, RuntimeProfile* profile, - bool strip_outer_array, bool num_as_string, bool fuzzy_parse, bool* scanner_eof, - FileReader* file_reader = nullptr, LineReader* line_reader = nullptr); - - ~VJsonReader(); - - Status init(const std::string& jsonpath, const std::string& json_root); - - Status read_json_column(std::vector& columns, - const std::vector& slot_descs, bool* is_empty_row, - bool* eof); - -private: - Status (VJsonReader::*_vhandle_json_callback)( - std::vector& columns, - const std::vector& slot_descs, bool* is_empty_row, bool* eof); - - Status _vhandle_simple_json(std::vector& columns, - const std::vector& slot_descs, bool* is_empty_row, - bool* eof); - - Status _vhandle_flat_array_complex_json(std::vector& columns, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof); - - Status _vhandle_nested_complex_json(std::vector& columns, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof); - - Status _write_columns_by_jsonpath(rapidjson::Value& objectValue, - const std::vector& slot_descs, - std::vector& columns, bool* valid); - - Status _set_column_value(rapidjson::Value& objectValue, std::vector& columns, - const std::vector& slot_descs, bool* valid); - - Status _write_data_to_column(rapidjson::Value::ConstValueIterator value, - SlotDescriptor* slot_desc, vectorized::IColumn* column_ptr, - bool* valid); - - Status _parse_json(bool* is_empty_row, bool* eof); - - Status _append_error_msg(const rapidjson::Value& objectValue, std::string error_msg, - std::string col_name, bool* valid); - - void _fill_slot(doris::Tuple* tuple, SlotDescriptor* slot_desc, MemPool* mem_pool, - const uint8_t* value, int32_t len); - Status _parse_json_doc(size_t* size, bool* eof); - Status _set_tuple_value(rapidjson::Value& objectValue, doris::Tuple* tuple, - const std::vector& slot_descs, MemPool* tuple_pool, - bool* valid); - Status _write_data_to_tuple(rapidjson::Value::ConstValueIterator value, SlotDescriptor* desc, - doris::Tuple* tuple, MemPool* tuple_pool, bool* valid); - std::string _print_json_value(const rapidjson::Value& value); - - void _close(); - Status _generate_json_paths(const std::string& jsonpath, - std::vector>* vect); - Status _parse_jsonpath_and_json_root(const std::string& jsonpath, const std::string& json_root); - - int _next_line; - int _total_lines; - RuntimeState* _state; - ScannerCounter* _counter; - RuntimeProfile* _profile; - FileReader* _file_reader; - LineReader* _line_reader; - bool _closed; - bool _strip_outer_array; - bool _num_as_string; - bool _fuzzy_parse; - RuntimeProfile::Counter* _bytes_read_counter; - RuntimeProfile::Counter* _read_timer; - RuntimeProfile::Counter* _file_read_timer; - - std::vector> _parsed_jsonpaths; - std::vector _parsed_json_root; - - char _value_buffer[4 * 1024 * 1024]; - char _parse_buffer[512 * 1024]; - - using Document = rapidjson::GenericDocument, rapidjson::MemoryPoolAllocator<>, - rapidjson::MemoryPoolAllocator<>>; - rapidjson::MemoryPoolAllocator<> _value_allocator; - rapidjson::MemoryPoolAllocator<> _parse_allocator; - Document _origin_json_doc; // origin json document object from parsed json string - rapidjson::Value* _json_doc; // _json_doc equals _final_json_doc iff not set `json_root` - std::unordered_map _name_map; - - // point to the _scanner_eof of JsonScanner - bool* _scanner_eof; -}; - -class VSIMDJsonReader { -public: - VSIMDJsonReader(RuntimeState* state, ScannerCounter* counter, RuntimeProfile* profile, - bool strip_outer_array, bool num_as_string, bool fuzzy_parse, bool* scanner_eof, - FileReader* file_reader = nullptr, LineReader* line_reader = nullptr); - - ~VSIMDJsonReader(); - - Status init(const std::string& jsonpath, const std::string& json_root); - - Status read_json_column(Block& block, const std::vector& slot_descs, - bool* is_empty_row, bool* eof); - -private: - Status (VSIMDJsonReader::*_vhandle_json_callback)( - Block& block, const std::vector& slot_descs, bool* is_empty_row, - bool* eof); - - Status _vhandle_simple_json(Block& block, const std::vector& slot_descs, - bool* is_empty_row, bool* eof); - - Status _vhandle_flat_array_complex_json(Block& block, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof); - - Status _vhandle_nested_complex_json(Block& block, - const std::vector& slot_descs, - bool* is_empty_row, bool* eof); - - Status _write_columns_by_jsonpath(simdjson::ondemand::value value, - const std::vector& slot_descs, Block& block, - bool* valid); - - Status _set_column_value(simdjson::ondemand::value value, Block& block, - const std::vector& slot_descs, bool* valid); - - Status _write_data_to_column(simdjson::ondemand::value value, SlotDescriptor* slot_desc, - vectorized::IColumn* column_ptr, bool* valid); - - Status _parse_json(bool* is_empty_row, bool* eof); - - Status _parse_json_doc(size_t* size, bool* eof); - - Status _parse_jsonpath_and_json_root(const std::string& jsonpath, const std::string& json_root); - - Status _generate_json_paths(const std::string& jsonpath, std::vector* vect); - - Status _append_error_msg(std::string error_msg, std::string col_name, bool* valid); - - std::unique_ptr _json_parser = nullptr; - simdjson::ondemand::document _original_json_doc; - simdjson::ondemand::value _json_value; - // for strip outer array - simdjson::ondemand::array_iterator _array_iter; - - int _next_line; - int _total_lines; - doris::RuntimeState* _state; - doris::ScannerCounter* _counter; - RuntimeProfile* _profile; - FileReader* _file_reader; - LineReader* _line_reader; - bool _strip_outer_array; - RuntimeProfile::Counter* _bytes_read_counter; - RuntimeProfile::Counter* _read_timer; - RuntimeProfile::Counter* _file_read_timer; - - // simdjson pointer string, eg. - // jsonpath simdjson pointer - // `["$.k1[0]", "$.k2.a"]` -> ["/k1/0", "/k2/a"] - // notice array index not support `*` - // so we are not fully compatible with previous implementation by rapidjson - std::vector _parsed_jsonpaths; - std::string _parsed_json_root; - - bool* _scanner_eof; - - static constexpr size_t _buffer_size = 1024 * 1024 * 8; - static constexpr size_t _padded_size = _buffer_size + simdjson::SIMDJSON_PADDING; - char _simdjson_ondemand_padding_buffer[_padded_size]; -}; - -} // namespace vectorized -} // namespace doris diff --git a/be/src/vec/exec/vorc_scanner.cpp b/be/src/vec/exec/vorc_scanner.cpp deleted file mode 100644 index 4c7c80d731..0000000000 --- a/be/src/vec/exec/vorc_scanner.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/vorc_scanner.h" - -#include - -namespace doris::vectorized { - -VORCScanner::VORCScanner(RuntimeState* state, RuntimeProfile* profile, - const TBrokerScanRangeParams& params, - const std::vector& ranges, - const std::vector& broker_addresses, - const std::vector& pre_filter_texprs, ScannerCounter* counter) - : VArrowScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs, - counter) {} - -ArrowReaderWrap* VORCScanner::_new_arrow_reader(const std::vector& file_slot_descs, - FileReader* file_reader, - int32_t num_of_columns_from_file, - int64_t range_start_offset, int64_t range_size) { - return new ORCReaderWrap(_state, file_slot_descs, file_reader, num_of_columns_from_file, - range_start_offset, range_size); -} - -} // namespace doris::vectorized diff --git a/be/src/vec/exec/vorc_scanner.h b/be/src/vec/exec/vorc_scanner.h deleted file mode 100644 index 78002e8a43..0000000000 --- a/be/src/vec/exec/vorc_scanner.h +++ /dev/null @@ -1,54 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "common/status.h" -#include "exec/base_scanner.h" -#include "gen_cpp/Types_types.h" -#include "runtime/mem_pool.h" -#include "util/runtime_profile.h" - -namespace doris::vectorized { - -// VOrc scanner convert the data read from Orc to doris's columns. -class VORCScanner final : public VArrowScanner { -public: - VORCScanner(RuntimeState* state, RuntimeProfile* profile, const TBrokerScanRangeParams& params, - const std::vector& ranges, - const std::vector& broker_addresses, - const std::vector& pre_filter_texprs, ScannerCounter* counter); - - ~VORCScanner() override = default; - -protected: - ArrowReaderWrap* _new_arrow_reader(const std::vector& file_slot_descs, - FileReader* file_reader, int32_t num_of_columns_from_file, - int64_t range_start_offset, int64_t range_size) override; -}; - -} // namespace doris::vectorized diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index 4697a729c5..214ed42032 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -273,12 +273,7 @@ set(VEC_TEST_FILES vec/core/column_nullable_test.cpp vec/core/column_vector_test.cpp vec/exec/vgeneric_iterators_test.cpp - vec/exec/vbroker_scan_node_test.cpp - vec/exec/vbroker_scanner_test.cpp - vec/exec/vjson_scanner_test.cpp vec/exec/vtablet_sink_test.cpp - vec/exec/vorc_scanner_test.cpp - vec/exec/vparquet_scanner_test.cpp vec/exprs/vexpr_test.cpp vec/function/function_array_aggregation_test.cpp vec/function/function_array_element_test.cpp diff --git a/be/test/vec/exec/vbroker_scan_node_test.cpp b/be/test/vec/exec/vbroker_scan_node_test.cpp deleted file mode 100644 index dc237c7103..0000000000 --- a/be/test/vec/exec/vbroker_scan_node_test.cpp +++ /dev/null @@ -1,637 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#include "vec/exec/vbroker_scan_node.h" - -#include - -#include -#include -#include - -#include "common/object_pool.h" -#include "exprs/binary_predicate.h" -#include "exprs/cast_functions.h" -#include "exprs/literal.h" -#include "exprs/slot_ref.h" -#include "gen_cpp/Descriptors_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "io/local_file_reader.h" -#include "runtime/descriptors.h" -#include "runtime/memory/mem_tracker.h" -#include "runtime/primitive_type.h" -#include "runtime/runtime_state.h" -#include "runtime/user_function_cache.h" - -namespace doris { - -Expr* create_literal(ObjectPool* pool, PrimitiveType type, const void* data); - -namespace vectorized { -class VBrokerScanNodeTest : public testing::Test { -public: - VBrokerScanNodeTest() : _runtime_state(TQueryGlobals()) { - init(); - _runtime_state.init_mem_trackers(); - } - void init(); - static void SetUpTestCase() { - UserFunctionCache::instance()->init( - "./be/test/runtime/test_data/user_function_cache/normal"); - CastFunctions::init(); - } - -protected: - virtual void SetUp() {} - virtual void TearDown() {} - -private: - void init_desc_table(); - RuntimeState _runtime_state; - ObjectPool _obj_pool; - std::map _slots_map; - TBrokerScanRangeParams _params; - DescriptorTbl* _desc_tbl; - TPlanNode _tnode; -}; - -void VBrokerScanNodeTest::init_desc_table() { - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::OLAP_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - - int next_slot_id = 1; - // TSlotDescriptor - // int offset = 1; - // int i = 0; - // k1 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 0; - slot_desc.byteOffset = 0; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = -1; - slot_desc.colName = "k1"; - slot_desc.slotIdx = 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k2 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = 4; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = -1; - slot_desc.colName = "k2"; - slot_desc.slotIdx = 2; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k3 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = 8; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = -1; - slot_desc.colName = "k3"; - slot_desc.slotIdx = 3; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k4(partitioned column) - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = 12; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = -1; - slot_desc.colName = "k4"; - slot_desc.slotIdx = 4; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - t_desc_table.__isset.slotDescriptors = true; - { - // TTupleDescriptor dest - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = 0; - t_tuple_desc.byteSize = 16; - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - - // source tuple descriptor - // TSlotDescriptor - // int offset = 1; - // int i = 0; - // k1 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 0; - slot_desc.byteOffset = 0; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 0; - slot_desc.colName = "k1"; - slot_desc.slotIdx = 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k2 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = 16; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 1; - slot_desc.colName = "k2"; - slot_desc.slotIdx = 2; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k3 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 2; - slot_desc.byteOffset = 32; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 2; - slot_desc.colName = "k3"; - slot_desc.slotIdx = 3; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k4(partitioned column) - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 3; - slot_desc.byteOffset = 48; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 3; - slot_desc.colName = "k4"; - slot_desc.slotIdx = 4; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - { - // TTupleDescriptor source - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = 1; - t_tuple_desc.byteSize = 64; - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - - _runtime_state.set_desc_tbl(_desc_tbl); -} - -void VBrokerScanNodeTest::init() { - _params.column_separator = ','; - _params.line_delimiter = '\n'; - - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(5000); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - - for (int i = 0; i < 4; ++i) { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttoint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 5 + i; - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(i + 1, expr); - _params.src_slot_ids.push_back(5 + i); - } - // _params.__isset.expr_of_dest_slot = true; - _params.__set_dest_tuple_id(0); - _params.__set_src_tuple_id(1); - - init_desc_table(); - - // Node Id - _tnode.node_id = 0; - _tnode.node_type = TPlanNodeType::BROKER_SCAN_NODE; - _tnode.num_children = 0; - _tnode.limit = -1; - _tnode.row_tuples.push_back(0); - _tnode.nullable_tuples.push_back(false); - _tnode.broker_scan_node.tuple_id = 0; - _tnode.__isset.broker_scan_node = true; -} - -TEST_F(VBrokerScanNodeTest, normal) { - VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - scan_node.init(_tnode); - auto status = scan_node.prepare(&_runtime_state); - ASSERT_TRUE(status.ok()); - - // set scan range - std::vector scan_ranges; - - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - - TBrokerRangeDesc range; - range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; - range.start_offset = 0; - range.size = -1; - range.file_type = TFileType::FILE_LOCAL; - range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; - range.splittable = true; - std::vector columns_from_path {"1"}; - range.__set_columns_from_path(columns_from_path); - range.__set_num_of_columns_from_file(3); - broker_scan_range.ranges.push_back(range); - - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - - scan_ranges.push_back(scan_range_params); - } - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - - TBrokerRangeDesc range; - range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; - range.start_offset = 1; - range.size = 7; - range.file_type = TFileType::FILE_LOCAL; - range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; - range.splittable = true; - std::vector columns_from_path {"2"}; - range.__set_columns_from_path(columns_from_path); - range.__set_num_of_columns_from_file(3); - broker_scan_range.ranges.push_back(range); - - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - - status = scan_node.open(&_runtime_state); - ASSERT_TRUE(status.ok()); - - doris::vectorized::Block block; - bool eos = false; - status = scan_node.get_next(&_runtime_state, &block, &eos); - ASSERT_EQ(4, block.rows()); - ASSERT_EQ(4, block.columns()); - ASSERT_TRUE(eos); - - auto columns = block.get_columns_with_type_and_name(); - ASSERT_EQ(columns.size(), 4); - ASSERT_EQ(columns[0].to_string(0), "1"); - ASSERT_EQ(columns[0].to_string(1), "4"); - ASSERT_EQ(columns[0].to_string(2), "8"); - ASSERT_EQ(columns[0].to_string(3), "4"); - - ASSERT_EQ(columns[1].to_string(0), "2"); - ASSERT_EQ(columns[1].to_string(1), "5"); - ASSERT_EQ(columns[1].to_string(2), "9"); - ASSERT_EQ(columns[1].to_string(3), "5"); - - ASSERT_EQ(columns[2].to_string(0), "3"); - ASSERT_EQ(columns[2].to_string(1), "6"); - ASSERT_EQ(columns[2].to_string(2), "10"); - ASSERT_EQ(columns[2].to_string(3), "6"); - - ASSERT_EQ(columns[3].to_string(0), "1"); - ASSERT_EQ(columns[3].to_string(1), "1"); - ASSERT_EQ(columns[3].to_string(2), "1"); - ASSERT_EQ(columns[3].to_string(3), "2"); - - block.clear(); - status = scan_node.get_next(&_runtime_state, &block, &eos); - ASSERT_EQ(0, block.rows()); - ASSERT_TRUE(eos); - - scan_node.close(&_runtime_state); - { - std::stringstream ss; - scan_node.runtime_profile()->pretty_print(&ss); - LOG(INFO) << ss.str(); - } -} - -TEST_F(VBrokerScanNodeTest, where_binary_pre) { - TPlanNode _tnode_ = _tnode; - - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TExpr expr; - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::BINARY_PRED); - expr_node.type = gen_type_desc(TPrimitiveType::BOOLEAN); - expr_node.__set_num_children(2); - expr_node.__isset.opcode = true; - expr_node.__set_opcode(TExprOpcode::LT); - expr_node.__isset.vector_opcode = true; - expr_node.__set_vector_opcode(TExprOpcode::LT); - expr_node.__isset.fn = true; - expr_node.fn.name.function_name = "lt"; - expr_node.fn.binary_type = TFunctionBinaryType::BUILTIN; - expr_node.fn.ret_type = int_type; - expr_node.fn.has_var_args = false; - expr.nodes.push_back(expr_node); - } - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::SLOT_REF); - expr_node.type = int_type; - expr_node.__set_num_children(0); - expr_node.__isset.slot_ref = true; - TSlotRef slot_ref; - slot_ref.__set_slot_id(1); - slot_ref.__set_tuple_id(0); - expr_node.__set_slot_ref(slot_ref); - expr_node.__isset.output_column = true; - expr_node.__set_output_column(0); - expr.nodes.push_back(expr_node); - } - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::INT_LITERAL); - expr_node.type = int_type; - expr_node.__set_num_children(0); - expr_node.__isset.int_literal = true; - TIntLiteral int_literal; - int_literal.__set_value(8); - expr_node.__set_int_literal(int_literal); - expr.nodes.push_back(expr_node); - } - _tnode_.__set_vconjunct(expr); - - VBrokerScanNode scan_node(&_obj_pool, _tnode_, *_desc_tbl); - auto status = scan_node.init(_tnode_); - ASSERT_TRUE(status.ok()); - status = scan_node.prepare(&_runtime_state); - ASSERT_TRUE(status.ok()); - - // set scan range - std::vector scan_ranges; - - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - - TBrokerRangeDesc range; - range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; - range.start_offset = 0; - range.size = -1; - range.file_type = TFileType::FILE_LOCAL; - range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; - range.splittable = true; - std::vector columns_from_path {"1"}; - range.__set_columns_from_path(columns_from_path); - range.__set_num_of_columns_from_file(3); - broker_scan_range.ranges.push_back(range); - - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - - status = scan_node.open(&_runtime_state); - ASSERT_TRUE(status.ok()); - - doris::vectorized::Block block; - bool eos = false; - status = scan_node.get_next(&_runtime_state, &block, &eos); - ASSERT_EQ(2, block.rows()); - ASSERT_EQ(4, block.columns()); - - auto columns = block.get_columns_with_type_and_name(); - ASSERT_EQ(columns.size(), 4); - ASSERT_EQ(columns[0].to_string(0), "1"); - ASSERT_EQ(columns[0].to_string(1), "4"); - - ASSERT_EQ(columns[1].to_string(0), "2"); - ASSERT_EQ(columns[1].to_string(1), "5"); - - ASSERT_EQ(columns[2].to_string(0), "3"); - ASSERT_EQ(columns[2].to_string(1), "6"); - - ASSERT_EQ(columns[3].to_string(0), "1"); - ASSERT_EQ(columns[3].to_string(1), "1"); - - ASSERT_TRUE(eos); - - block.clear(); - status = scan_node.get_next(&_runtime_state, &block, &eos); - ASSERT_EQ(0, block.rows()); - ASSERT_TRUE(eos); - - scan_node.close(&_runtime_state); - { - std::stringstream ss; - scan_node.runtime_profile()->pretty_print(&ss); - LOG(INFO) << ss.str(); - } -} - -} // namespace vectorized -} // namespace doris diff --git a/be/test/vec/exec/vbroker_scanner_test.cpp b/be/test/vec/exec/vbroker_scanner_test.cpp deleted file mode 100644 index 179b54bafb..0000000000 --- a/be/test/vec/exec/vbroker_scanner_test.cpp +++ /dev/null @@ -1,554 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. -#include "vec/exec/vbroker_scanner.h" - -#include - -#include -#include -#include - -#include "common/object_pool.h" -#include "exprs/cast_functions.h" -#include "gen_cpp/Descriptors_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "io/local_file_reader.h" -#include "runtime/descriptors.h" -#include "runtime/memory/mem_tracker.h" -#include "runtime/runtime_state.h" -#include "runtime/user_function_cache.h" - -namespace doris { - -namespace vectorized { -class VBrokerScannerTest : public testing::Test { -public: - VBrokerScannerTest() : _runtime_state(TQueryGlobals()) { - init(); - _profile = _runtime_state.runtime_profile(); - _runtime_state.init_mem_trackers(); - - TUniqueId unique_id; - TQueryOptions query_options; - TQueryGlobals query_globals; - - _runtime_state.init(unique_id, query_options, query_globals, nullptr); - } - void init(); - - static void SetUpTestCase() { - UserFunctionCache::instance()->init( - "./be/test/runtime/test_data/user_function_cache/normal"); - CastFunctions::init(); - } - -protected: - virtual void SetUp() {} - virtual void TearDown() {} - -private: - void init_desc_table(); - void init_params(); - - TupleId _dst_tuple_id = 0; - TupleId _src_tuple_id = 1; - RuntimeState _runtime_state; - RuntimeProfile* _profile; - ObjectPool _obj_pool; - TBrokerScanRangeParams _params; - DescriptorTbl* _desc_tbl; - std::vector _addresses; - ScannerCounter _counter; - std::vector _pre_filter; -}; - -void VBrokerScannerTest::init_desc_table() { - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::OLAP_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - - int next_slot_id = 1; - // TSlotDescriptor - // int offset = 1; - // int i = 0; - // k1 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 0; - slot_desc.byteOffset = 0; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = -1; - slot_desc.colName = "k1"; - slot_desc.slotIdx = 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k2 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = 4; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = -1; - slot_desc.colName = "k2"; - slot_desc.slotIdx = 2; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k3 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 2; - slot_desc.byteOffset = 8; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = -1; - slot_desc.colName = "k3"; - slot_desc.slotIdx = 3; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - t_desc_table.__isset.slotDescriptors = true; - { - // TTupleDescriptor dest - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = 0; - t_tuple_desc.byteSize = 12; - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - - // source tuple descriptor - // TSlotDescriptor - // int offset = 1; - // int i = 0; - // k1 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 0; - slot_desc.byteOffset = 0; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 0; - slot_desc.colName = "k1"; - slot_desc.slotIdx = 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k2 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = 16; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 1; - slot_desc.colName = "k2"; - slot_desc.slotIdx = 2; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - // k3 - { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 2; - slot_desc.byteOffset = 32; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 2; - slot_desc.colName = "k3"; - slot_desc.slotIdx = 3; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - { - // TTupleDescriptor source - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = 1; - t_tuple_desc.byteSize = 48; - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - - _runtime_state.set_desc_tbl(_desc_tbl); -} - -void VBrokerScannerTest::init_params() { - _params.column_separator = ','; - _params.line_delimiter = '\n'; - - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(5000); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - - for (int i = 0; i < 3; ++i) { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttoint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 4 + i; - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(i + 1, expr); - _params.src_slot_ids.push_back(4 + i); - } - // _params.__isset.expr_of_dest_slot = true; - _params.__set_dest_tuple_id(_dst_tuple_id); - _params.__set_src_tuple_id(_src_tuple_id); -} - -void VBrokerScannerTest::init() { - init_desc_table(); - init_params(); -} - -TEST_F(VBrokerScannerTest, normal) { - std::vector ranges; - TBrokerRangeDesc range; - range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; - range.start_offset = 0; - range.size = -1; - range.splittable = true; - range.file_type = TFileType::FILE_LOCAL; - range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; - ranges.push_back(range); - VBrokerScanner scanner(&_runtime_state, _profile, _params, ranges, _addresses, _pre_filter, - &_counter); - auto st = scanner.open(); - ASSERT_TRUE(st.ok()); - - std::unique_ptr block(new vectorized::Block()); - bool eof = false; - st = scanner.get_next(block.get(), &eof); - ASSERT_TRUE(st.ok()); - ASSERT_TRUE(eof); - auto columns = block->get_columns(); - ASSERT_EQ(columns.size(), 3); - ASSERT_EQ(columns[0]->get_int(0), 1); - ASSERT_EQ(columns[0]->get_int(1), 4); - ASSERT_EQ(columns[0]->get_int(2), 8); - - ASSERT_EQ(columns[1]->get_int(0), 2); - ASSERT_EQ(columns[1]->get_int(1), 5); - ASSERT_EQ(columns[1]->get_int(2), 9); - - ASSERT_EQ(columns[2]->get_int(0), 3); - ASSERT_EQ(columns[2]->get_int(1), 6); - ASSERT_EQ(columns[2]->get_int(2), 10); -} - -TEST_F(VBrokerScannerTest, normal_with_pre_filter) { - std::vector ranges; - TBrokerRangeDesc range; - range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; - range.start_offset = 0; - range.size = -1; - range.splittable = true; - range.file_type = TFileType::FILE_LOCAL; - range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; - ranges.push_back(range); - - // init pre_filter expr: k1 < '8' - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(5000); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - - TExpr filter_expr; - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::BINARY_PRED); - expr_node.type = gen_type_desc(TPrimitiveType::BOOLEAN); - expr_node.__set_num_children(2); - expr_node.__isset.opcode = true; - expr_node.__set_opcode(TExprOpcode::LT); - expr_node.__isset.vector_opcode = true; - expr_node.__set_vector_opcode(TExprOpcode::LT); - expr_node.__isset.fn = true; - expr_node.fn.name.function_name = "lt"; - expr_node.fn.binary_type = TFunctionBinaryType::BUILTIN; - expr_node.fn.ret_type = int_type; - expr_node.fn.has_var_args = false; - filter_expr.nodes.push_back(expr_node); - } - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::SLOT_REF); - expr_node.type = varchar_type; - expr_node.__set_num_children(0); - expr_node.__isset.slot_ref = true; - TSlotRef slot_ref; - slot_ref.__set_slot_id(4); - slot_ref.__set_tuple_id(1); - expr_node.__set_slot_ref(slot_ref); - expr_node.__isset.output_column = true; - expr_node.__set_output_column(0); - filter_expr.nodes.push_back(expr_node); - } - { - TExprNode expr_node; - expr_node.__set_node_type(TExprNodeType::STRING_LITERAL); - expr_node.type = varchar_type; - expr_node.__set_num_children(0); - expr_node.__isset.string_literal = true; - TStringLiteral string_literal; - string_literal.__set_value("8"); - expr_node.__set_string_literal(string_literal); - filter_expr.nodes.push_back(expr_node); - } - _pre_filter.push_back(filter_expr); - VBrokerScanner scanner(&_runtime_state, _profile, _params, ranges, _addresses, _pre_filter, - &_counter); - auto st = scanner.open(); - ASSERT_TRUE(st.ok()); - - std::unique_ptr block(new vectorized::Block()); - bool eof = false; - // end of file - st = scanner.get_next(block.get(), &eof); - ASSERT_TRUE(st.ok()); - ASSERT_TRUE(eof); - auto columns = block->get_columns(); - ASSERT_EQ(columns.size(), 3); - - ASSERT_EQ(columns[0]->get_int(0), 1); - ASSERT_EQ(columns[0]->get_int(1), 4); - - ASSERT_EQ(columns[1]->get_int(0), 2); - ASSERT_EQ(columns[1]->get_int(1), 5); - - ASSERT_EQ(columns[2]->get_int(0), 3); - ASSERT_EQ(columns[2]->get_int(1), 6); -} - -TEST_F(VBrokerScannerTest, normal2) { - std::vector ranges; - - TBrokerRangeDesc range; - range.path = "./be/test/exec/test_data/broker_scanner/normal2_1.csv"; - range.start_offset = 0; - range.size = 7; - range.splittable = true; - range.file_type = TFileType::FILE_LOCAL; - range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; - ranges.push_back(range); - - range.path = "./be/test/exec/test_data/broker_scanner/normal2_2.csv"; - range.start_offset = 0; - range.size = 4; - ranges.push_back(range); - VBrokerScanner scanner(&_runtime_state, _profile, _params, ranges, _addresses, _pre_filter, - &_counter); - auto st = scanner.open(); - ASSERT_TRUE(st.ok()); - - std::unique_ptr block(new vectorized::Block()); - bool eof = false; - st = scanner.get_next(block.get(), &eof); - ASSERT_TRUE(st.ok()); - ASSERT_TRUE(eof); - auto columns = block->get_columns(); - ASSERT_EQ(columns.size(), 3); - - ASSERT_EQ(columns[0]->get_int(0), 1); - ASSERT_EQ(columns[0]->get_int(1), 3); - - ASSERT_EQ(columns[1]->get_int(0), 2); - ASSERT_EQ(columns[1]->get_int(1), 4); - - ASSERT_EQ(columns[2]->get_int(0), 3); - ASSERT_EQ(columns[2]->get_int(1), 5); -} - -TEST_F(VBrokerScannerTest, normal5) { - std::vector ranges; - TBrokerRangeDesc range; - range.path = "./be/test/exec/test_data/broker_scanner/normal.csv"; - range.start_offset = 0; - range.size = 0; - range.splittable = true; - range.file_type = TFileType::FILE_LOCAL; - range.format_type = TFileFormatType::FORMAT_CSV_PLAIN; - ranges.push_back(range); - VBrokerScanner scanner(&_runtime_state, _profile, _params, ranges, _addresses, _pre_filter, - &_counter); - auto st = scanner.open(); - ASSERT_TRUE(st.ok()); - - std::unique_ptr block(new vectorized::Block()); - bool eof = false; - // end of file - st = scanner.get_next(block.get(), &eof); - ASSERT_TRUE(st.ok()); - ASSERT_TRUE(eof); - auto columns = block->get_columns(); - ASSERT_EQ(columns.size(), 0); -} - -} // namespace vectorized -} // namespace doris diff --git a/be/test/vec/exec/vjson_scanner_test.cpp b/be/test/vec/exec/vjson_scanner_test.cpp deleted file mode 100644 index 362a4fb94b..0000000000 --- a/be/test/vec/exec/vjson_scanner_test.cpp +++ /dev/null @@ -1,861 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/vjson_scanner.h" - -#include -#include - -#include -#include -#include - -#include "common/object_pool.h" -#include "exprs/cast_functions.h" -#include "exprs/decimalv2_operators.h" -#include "gen_cpp/Descriptors_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "io/local_file_reader.h" -#include "runtime/descriptors.h" -#include "runtime/exec_env.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple.h" -#include "runtime/user_function_cache.h" -#include "util/defer_op.h" -#include "vec/exec/vbroker_scan_node.h" - -namespace doris { -namespace vectorized { - -class VJsonScannerTest : public testing::Test { -public: - VJsonScannerTest() : _runtime_state(TQueryGlobals()) { - init(); - _runtime_state.init_mem_trackers(); - - TUniqueId unique_id; - TQueryOptions query_options; - TQueryGlobals query_globals; - - _runtime_state.init(unique_id, query_options, query_globals, nullptr); - } - void init(); - static void SetUpTestCase() { - config::enable_simdjson_reader = true; - UserFunctionCache::instance()->init( - "./be/test/runtime/test_data/user_function_cache/normal"); - CastFunctions::init(); - DecimalV2Operators::init(); - } - -protected: - virtual void SetUp() {} - virtual void TearDown() {} - -private: - int create_src_tuple(TDescriptorTable& t_desc_table, int next_slot_id); - int create_dst_tuple(TDescriptorTable& t_desc_table, int next_slot_id); - void create_expr_info(); - void init_desc_table(); - RuntimeState _runtime_state; - ObjectPool _obj_pool; - std::map _slots_map; - TBrokerScanRangeParams _params; - DescriptorTbl* _desc_tbl; - TPlanNode _tnode; -}; - -#define TUPLE_ID_DST 0 -#define TUPLE_ID_SRC 1 -#define COLUMN_NUMBERS 6 -#define DST_TUPLE_SLOT_ID_START 1 -#define SRC_TUPLE_SLOT_ID_START 7 - -int VJsonScannerTest::create_src_tuple(TDescriptorTable& t_desc_table, int next_slot_id) { - const char* columnNames[] = {"category", "author", "title", "price", "largeint", "decimal"}; - for (int i = 0; i < COLUMN_NUMBERS; i++) { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = i; - slot_desc.byteOffset = i * 16 + 8; - slot_desc.nullIndicatorByte = i / 8; - slot_desc.nullIndicatorBit = i % 8; - slot_desc.colName = columnNames[i]; - slot_desc.slotIdx = i + 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - { - // TTupleDescriptor source - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = TUPLE_ID_SRC; - t_tuple_desc.byteSize = COLUMN_NUMBERS * 16 + 8; - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - return next_slot_id; -} - -int VJsonScannerTest::create_dst_tuple(TDescriptorTable& t_desc_table, int next_slot_id) { - int32_t byteOffset = 8; - { //category - TSlotDescriptor slot_desc; - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 0; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 0; - slot_desc.colName = "category"; - slot_desc.slotIdx = 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 16; - { // author - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 1; - slot_desc.colName = "author"; - slot_desc.slotIdx = 2; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 16; - { // title - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 2; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 2; - slot_desc.colName = "title"; - slot_desc.slotIdx = 3; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 16; - { // price - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::DOUBLE); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 3; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 3; - slot_desc.colName = "price"; - slot_desc.slotIdx = 4; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 8; - { // lagreint - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::LARGEINT); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 4; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 4; - slot_desc.colName = "lagreint"; - slot_desc.slotIdx = 5; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 16; - { // decimal - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__isset.precision = true; - scalar_type.__isset.scale = true; - scalar_type.__set_precision(-1); - scalar_type.__set_scale(-1); - scalar_type.__set_type(TPrimitiveType::DECIMALV2); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 5; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 5; - slot_desc.colName = "decimal"; - slot_desc.slotIdx = 6; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - t_desc_table.__isset.slotDescriptors = true; - { - // TTupleDescriptor dest - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = TUPLE_ID_DST; - t_tuple_desc.byteSize = byteOffset + 8; - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - return next_slot_id; -} - -void VJsonScannerTest::init_desc_table() { - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::BROKER_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - - int next_slot_id = 1; - - next_slot_id = create_dst_tuple(t_desc_table, next_slot_id); - - next_slot_id = create_src_tuple(t_desc_table, next_slot_id); - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - - _runtime_state.set_desc_tbl(_desc_tbl); -} - -void VJsonScannerTest::create_expr_info() { - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(5000); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - // category VARCHAR --> VARCHAR - { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START; // category id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START); - } - // author VARCHAR --> VARCHAR - { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + 1; // author id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + 1, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + 1); - } - // title VARCHAR --> VARCHAR - { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + 2; // log_time id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + 2, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + 2); - } - - // price VARCHAR --> DOUBLE - { - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::DOUBLE); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttodouble"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttodouble(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_double_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + 3; // price id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + 3, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + 3); - } - // largeint VARCHAR --> LargeInt - { - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::LARGEINT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttolargeint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttolargeint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_large_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + 4; // price id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + 4, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + 4); - } - // decimal VARCHAR --> Decimal - { - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__isset.precision = true; - scalar_type.__isset.scale = true; - scalar_type.__set_precision(-1); - scalar_type.__set_scale(-1); - scalar_type.__set_type(TPrimitiveType::DECIMALV2); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttodecimalv2"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttodecimalv2(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::DecimalV2Operators::cast_to_decimalv2_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + 5; // price id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + 5, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + 5); - } - // _params.__isset.expr_of_dest_slot = true; - _params.__set_dest_tuple_id(TUPLE_ID_DST); - _params.__set_src_tuple_id(TUPLE_ID_SRC); -} - -void VJsonScannerTest::init() { - create_expr_info(); - init_desc_table(); - - // Node Id - _tnode.node_id = 0; - _tnode.node_type = TPlanNodeType::SCHEMA_SCAN_NODE; - _tnode.num_children = 0; - _tnode.limit = -1; - _tnode.row_tuples.push_back(0); - _tnode.nullable_tuples.push_back(false); - _tnode.broker_scan_node.tuple_id = 0; - _tnode.__isset.broker_scan_node = true; -} - -TEST_F(VJsonScannerTest, simple_array_json) { - auto test_fn = [&](bool using_simdjson) { - bool saved_flag = config::enable_simdjson_reader; - if (using_simdjson) { - config::enable_simdjson_reader = true; - } - Defer __defer([&] { config::enable_simdjson_reader = saved_flag; }); - VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - scan_node.init(_tnode); - auto status = scan_node.prepare(&_runtime_state); - EXPECT_TRUE(status.ok()); - - // set scan range - std::vector scan_ranges; - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - TBrokerRangeDesc range; - range.start_offset = 0; - range.size = -1; - range.format_type = TFileFormatType::FORMAT_JSON; - range.strip_outer_array = true; - range.__isset.strip_outer_array = true; - range.__set_num_as_string(true); - range.splittable = true; - range.path = "./be/test/exec/test_data/json_scanner/test_simple2.json"; - range.file_type = TFileType::FILE_LOCAL; - broker_scan_range.ranges.push_back(range); - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - status = scan_node.open(&_runtime_state); - EXPECT_TRUE(status.ok()); - - bool eof = false; - vectorized::Block block; - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(2, block.rows()); - EXPECT_EQ(6, block.columns()); - - auto columns = block.get_columns_with_type_and_name(); - ASSERT_EQ(columns.size(), 6); - ASSERT_EQ(columns[0].to_string(0), "reference"); - ASSERT_EQ(columns[0].to_string(1), "fiction"); - ASSERT_EQ(columns[1].to_string(0), "NigelRees"); - ASSERT_EQ(columns[1].to_string(1), "EvelynWaugh"); - ASSERT_EQ(columns[2].to_string(0), "SayingsoftheCentury"); - ASSERT_EQ(columns[2].to_string(1), "SwordofHonour"); - ASSERT_EQ(columns[3].to_string(0), "8.95"); - ASSERT_EQ(columns[3].to_string(1), "12.99"); - ASSERT_EQ(columns[4].to_string(0), "1234"); - ASSERT_EQ(columns[4].to_string(1), "1180591620717411303424"); - ASSERT_EQ(columns[5].to_string(0), "1234.123400000"); - ASSERT_EQ(columns[5].to_string(1), "9999999999999.999999000"); - - block.clear(); - status = scan_node.get_next(&_runtime_state, &block, &eof); - ASSERT_EQ(0, block.rows()); - ASSERT_TRUE(eof); - scan_node.close(&_runtime_state); - }; - test_fn(true); - test_fn(false); -} - -TEST_F(VJsonScannerTest, use_jsonpaths_with_file_reader) { - auto test_fn = [&](bool using_simdjson) { - bool saved_flag = config::enable_simdjson_reader; - if (using_simdjson) { - config::enable_simdjson_reader = true; - } - Defer __defer([&] { config::enable_simdjson_reader = saved_flag; }); - VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - scan_node.init(_tnode); - auto status = scan_node.prepare(&_runtime_state); - EXPECT_TRUE(status.ok()); - - // set scan range - std::vector scan_ranges; - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - TBrokerRangeDesc range; - range.start_offset = 0; - range.size = -1; - range.format_type = TFileFormatType::FORMAT_JSON; - range.strip_outer_array = true; - range.__isset.strip_outer_array = true; - range.splittable = true; - range.path = "./be/test/exec/test_data/json_scanner/test_simple2.json"; - range.file_type = TFileType::FILE_LOCAL; - range.jsonpaths = - "[\"$.category\", \"$.author\", \"$.title\", \"$.price\", \"$.largeint\", " - "\"$.decimal\"]"; - range.__isset.jsonpaths = true; - broker_scan_range.ranges.push_back(range); - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - status = scan_node.open(&_runtime_state); - EXPECT_TRUE(status.ok()); - - bool eof = false; - vectorized::Block block; - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(2, block.rows()); - EXPECT_EQ(6, block.columns()); - - auto columns = block.get_columns_with_type_and_name(); - ASSERT_EQ(columns.size(), 6); - ASSERT_EQ(columns[0].to_string(0), "reference"); - ASSERT_EQ(columns[0].to_string(1), "fiction"); - ASSERT_EQ(columns[1].to_string(0), "NigelRees"); - ASSERT_EQ(columns[1].to_string(1), "EvelynWaugh"); - ASSERT_EQ(columns[2].to_string(0), "SayingsoftheCentury"); - ASSERT_EQ(columns[2].to_string(1), "SwordofHonour"); - - block.clear(); - status = scan_node.get_next(&_runtime_state, &block, &eof); - ASSERT_EQ(0, block.rows()); - ASSERT_TRUE(eof); - scan_node.close(&_runtime_state); - }; - test_fn(true); - test_fn(false); -} - -TEST_F(VJsonScannerTest, use_jsonpaths_with_line_reader) { - auto test_fn = [&](bool using_simdjson) { - bool saved_flag = config::enable_simdjson_reader; - if (using_simdjson) { - config::enable_simdjson_reader = true; - } - Defer __defer([&] { config::enable_simdjson_reader = saved_flag; }); - VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - scan_node.init(_tnode); - auto status = scan_node.prepare(&_runtime_state); - EXPECT_TRUE(status.ok()); - - std::vector scan_ranges; - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - TBrokerRangeDesc range; - range.start_offset = 0; - range.size = -1; - range.format_type = TFileFormatType::FORMAT_JSON; - range.splittable = true; - range.strip_outer_array = true; - range.__isset.strip_outer_array = true; - range.path = "./be/test/exec/test_data/json_scanner/test_simple2.json"; - range.file_type = TFileType::FILE_LOCAL; - range.jsonpaths = - "[\"$.category\", \"$.author\", \"$.title\", \"$.price\", \"$.largeint\", " - "\"$.decimal\"]"; - range.__isset.jsonpaths = true; - range.read_json_by_line = true; - range.__isset.read_json_by_line = true; - broker_scan_range.ranges.push_back(range); - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - status = scan_node.open(&_runtime_state); - EXPECT_TRUE(status.ok()); - - bool eof = false; - vectorized::Block block; - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(2, block.rows()); - EXPECT_EQ(6, block.columns()); - - auto columns = block.get_columns_with_type_and_name(); - ASSERT_EQ(columns.size(), 6); - ASSERT_EQ(columns[0].to_string(0), "reference"); - ASSERT_EQ(columns[0].to_string(1), "fiction"); - ASSERT_EQ(columns[1].to_string(0), "NigelRees"); - ASSERT_EQ(columns[1].to_string(1), "EvelynWaugh"); - ASSERT_EQ(columns[2].to_string(0), "SayingsoftheCentury"); - ASSERT_EQ(columns[2].to_string(1), "SwordofHonour"); - - block.clear(); - status = scan_node.get_next(&_runtime_state, &block, &eof); - ASSERT_EQ(0, block.rows()); - ASSERT_TRUE(eof); - scan_node.close(&_runtime_state); - }; - test_fn(true); - test_fn(false); -} - -TEST_F(VJsonScannerTest, use_jsonpaths_mismatch) { - auto test_fn = [&](bool using_simdjson) { - bool saved_flag = config::enable_simdjson_reader; - if (using_simdjson) { - config::enable_simdjson_reader = true; - } - Defer __defer([&] { config::enable_simdjson_reader = saved_flag; }); - VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - scan_node.init(_tnode); - auto status = scan_node.prepare(&_runtime_state); - EXPECT_TRUE(status.ok()); - - // set scan range - std::vector scan_ranges; - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - TBrokerRangeDesc range; - range.start_offset = 0; - range.size = -1; - range.format_type = TFileFormatType::FORMAT_JSON; - range.strip_outer_array = true; - range.__isset.strip_outer_array = true; - range.splittable = true; - range.path = "./be/test/exec/test_data/json_scanner/test_simple2.json"; - range.file_type = TFileType::FILE_LOCAL; - range.jsonpaths = "[\"$.k1\", \"$.k2\", \"$.k3\", \"$.k4\", \"$.k5\", \"$.k6\"]"; - range.__isset.jsonpaths = true; - broker_scan_range.ranges.push_back(range); - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - status = scan_node.open(&_runtime_state); - EXPECT_TRUE(status.ok()); - - bool eof = false; - vectorized::Block block; - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(0, block.rows()); - EXPECT_EQ(0, block.columns()); - block.clear(); - scan_node.close(&_runtime_state); - }; - test_fn(true); - test_fn(false); -} - -TEST_F(VJsonScannerTest, use_nested_with_jsonpath) { - auto test_fn = [&](bool using_simdjson) { - bool saved_flag = config::enable_simdjson_reader; - if (using_simdjson) { - config::enable_simdjson_reader = true; - } - Defer __defer([&] { config::enable_simdjson_reader = saved_flag; }); - VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - scan_node.init(_tnode); - auto status = scan_node.prepare(&_runtime_state); - EXPECT_TRUE(status.ok()); - - // set scan range - std::vector scan_ranges; - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - TBrokerRangeDesc range; - range.start_offset = 0; - range.size = -1; - range.format_type = TFileFormatType::FORMAT_JSON; - range.strip_outer_array = true; - range.__isset.strip_outer_array = true; - range.splittable = true; - range.path = "./be/test/exec/test_data/json_scanner/test_nested.json"; - range.file_type = TFileType::FILE_LOCAL; - range.jsonpaths = "[\"$.qid\", \"$.tag\", \"$.creationDate\", \"$.answers[0].user\"]"; - range.__isset.jsonpaths = true; - broker_scan_range.ranges.push_back(range); - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - status = scan_node.open(&_runtime_state); - EXPECT_TRUE(status.ok()); - - bool eof = false; - vectorized::Block block; - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(2048, block.rows()); - EXPECT_EQ(6, block.columns()); - - auto columns = block.get_columns_with_type_and_name(); - ASSERT_EQ(columns.size(), 6); - EXPECT_EQ(columns[0].to_string(0), "1000000"); - EXPECT_EQ(columns[0].to_string(1), "10000005"); - EXPECT_EQ(columns[1].to_string(0), "[\"vb6\", \"progress-bar\"]"); - EXPECT_EQ(columns[1].to_string(1), "[\"php\", \"arrays\", \"sorting\"]"); - EXPECT_EQ(columns[2].to_string(0), "2009-06-16T07:28:42.770"); - EXPECT_EQ(columns[2].to_string(1), "2012-04-03T19:25:46.213"); - block.clear(); - scan_node.close(&_runtime_state); - }; - test_fn(true); - test_fn(false); -} - -} // namespace vectorized -} // namespace doris diff --git a/be/test/vec/exec/vorc_scanner_test.cpp b/be/test/vec/exec/vorc_scanner_test.cpp deleted file mode 100644 index 9ab5bd2d83..0000000000 --- a/be/test/vec/exec/vorc_scanner_test.cpp +++ /dev/null @@ -1,884 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/vorc_scanner.h" - -#include -#include -#include - -#include -#include -#include - -#include "common/object_pool.h" -#include "exprs/cast_functions.h" -#include "exprs/decimalv2_operators.h" -#include "gen_cpp/Descriptors_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "io/local_file_reader.h" -#include "runtime/descriptors.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple.h" -#include "runtime/user_function_cache.h" -#include "vec/exec/vbroker_scan_node.h" - -namespace doris { -namespace vectorized { - -class VOrcScannerTest : public testing::Test { -public: - VOrcScannerTest() : _runtime_state(TQueryGlobals()) { - _profile = _runtime_state.runtime_profile(); - _runtime_state.init_mem_trackers(); - } - ~VOrcScannerTest() {} - - static void SetUpTestCase() { - UserFunctionCache::instance()->init( - "./be/test/runtime/test_data/user_function_cache/normal"); - CastFunctions::init(); - DecimalV2Operators::init(); - } - -protected: - virtual void SetUp() {} - - virtual void TearDown() {} - -private: - RuntimeState _runtime_state; - RuntimeProfile* _profile; - ObjectPool _obj_pool; - DescriptorTbl* _desc_tbl; - std::vector _addresses; - ScannerCounter _counter; - std::vector _pre_filter; -}; - -TEST_F(VOrcScannerTest, normal) { - TBrokerScanRangeParams params; - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - - TTypeDesc big_int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::BIGINT); - node.__set_scalar_type(scalar_type); - big_int_type.types.push_back(node); - } - - TTypeDesc float_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::FLOAT); - node.__set_scalar_type(scalar_type); - float_type.types.push_back(node); - } - - TTypeDesc double_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::DOUBLE); - node.__set_scalar_type(scalar_type); - double_type.types.push_back(node); - } - - TTypeDesc date_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::DATE); - node.__set_scalar_type(scalar_type); - date_type.types.push_back(node); - } - - //col1 varchar -> bigint - { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = big_int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttobigint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = big_int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_big_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 0; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(8, expr); - params.src_slot_ids.push_back(0); - } - //col2, col3 - for (int i = 1; i <= 2; i++) { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = i; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(8 + i, expr); - params.src_slot_ids.push_back(i); - } - - //col5 varchar -> double - { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = double_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttodouble"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = double_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_double_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 3; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(11, expr); - params.src_slot_ids.push_back(3); - } - - //col6 varchar -> float - { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = float_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttofloat"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = float_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_float_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 4; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(12, expr); - params.src_slot_ids.push_back(4); - } - //col7,col8 - for (int i = 5; i <= 6; i++) { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttoint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = i; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(8 + i, expr); - params.src_slot_ids.push_back(i); - } - - //col9 varchar -> var - { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 7; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(15, expr); - params.src_slot_ids.push_back(7); - } - - params.__set_src_tuple_id(0); - params.__set_dest_tuple_id(1); - - //init_desc_table - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::BROKER_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - - TDescriptorTableBuilder dtb; - TTupleDescriptorBuilder src_tuple_builder; - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col1") - .column_pos(1) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col2") - .column_pos(2) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col3") - .column_pos(3) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col5") - .column_pos(4) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col6") - .column_pos(5) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col7") - .column_pos(6) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col8") - .column_pos(7) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col9") - .column_pos(8) - .build()); - src_tuple_builder.build(&dtb); - - TTupleDescriptorBuilder dest_tuple_builder; - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().type(TYPE_BIGINT).column_name("col1").column_pos(1).build()); - dest_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col2") - .column_pos(2) - .build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().string_type(65535).column_name("col3").column_pos(3).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().type(TYPE_DOUBLE).column_name("col5").column_pos(4).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().type(TYPE_FLOAT).column_name("col6").column_pos(5).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().type(TYPE_INT).column_name("col7").column_pos(6).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().type(TYPE_INT).column_name("col8").column_pos(7).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().string_type(65535).column_name("col9").column_pos(8).build()); - dest_tuple_builder.build(&dtb); - t_desc_table = dtb.desc_tbl(); - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - _runtime_state.set_desc_tbl(_desc_tbl); - - std::vector ranges; - TBrokerRangeDesc rangeDesc; - rangeDesc.start_offset = 0; - rangeDesc.size = -1; - rangeDesc.format_type = TFileFormatType::FORMAT_ORC; - rangeDesc.splittable = false; - - rangeDesc.path = "./be/test/exec/test_data/orc_scanner/my-file.orc"; - rangeDesc.file_type = TFileType::FILE_LOCAL; - ranges.push_back(rangeDesc); - - VORCScanner scanner(&_runtime_state, _profile, params, ranges, _addresses, _pre_filter, - &_counter); - EXPECT_TRUE(scanner.open().ok()); - - vectorized::Block block; - bool eof = false; - - EXPECT_TRUE(scanner.get_next(&block, &eof).ok()); - EXPECT_TRUE(eof); - EXPECT_TRUE(scanner.get_next(&block, &eof).ok()); - EXPECT_TRUE(eof); - scanner.close(); -} - -TEST_F(VOrcScannerTest, normal2) { - TBrokerScanRangeParams params; - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::INT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - - { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 1; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(3, expr); - params.src_slot_ids.push_back(0); - params.src_slot_ids.push_back(1); - params.src_slot_ids.push_back(2); - } - params.__set_src_tuple_id(0); - params.__set_dest_tuple_id(1); - - //init_desc_table - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::BROKER_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - - TDescriptorTableBuilder dtb; - TTupleDescriptorBuilder src_tuple_builder; - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col1") - .column_pos(1) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col2") - .column_pos(2) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col3") - .column_pos(3) - .build()); - src_tuple_builder.build(&dtb); - TTupleDescriptorBuilder dest_tuple_builder; - dest_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .column_name("value_from_col2") - .column_pos(1) - .build()); - - dest_tuple_builder.build(&dtb); - t_desc_table = dtb.desc_tbl(); - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - _runtime_state.set_desc_tbl(_desc_tbl); - - std::vector ranges; - TBrokerRangeDesc rangeDesc; - rangeDesc.start_offset = 0; - rangeDesc.size = -1; - rangeDesc.format_type = TFileFormatType::FORMAT_ORC; - rangeDesc.splittable = false; - - rangeDesc.path = "./be/test/exec/test_data/orc_scanner/my-file.orc"; - rangeDesc.file_type = TFileType::FILE_LOCAL; - ranges.push_back(rangeDesc); - - VORCScanner scanner(&_runtime_state, _profile, params, ranges, _addresses, _pre_filter, - &_counter); - EXPECT_TRUE(scanner.open().ok()); - - bool eof = false; - vectorized::Block block; - EXPECT_TRUE(scanner.get_next(&block, &eof).ok()); - EXPECT_EQ(10, block.rows()); - EXPECT_TRUE(eof); - scanner.close(); -} - -TEST_F(VOrcScannerTest, normal3) { - TBrokerScanRangeParams params; - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - - TTypeDesc decimal_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::DECIMALV2); - scalar_type.__set_precision(64); - scalar_type.__set_scale(64); - node.__set_scalar_type(scalar_type); - decimal_type.types.push_back(node); - } - - TTypeDesc tinyint_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::TINYINT); - node.__set_scalar_type(scalar_type); - tinyint_type.types.push_back(node); - } - - TTypeDesc datetime_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::DATETIME); - node.__set_scalar_type(scalar_type); - datetime_type.types.push_back(node); - } - - TTypeDesc date_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::DATE); - node.__set_scalar_type(scalar_type); - date_type.types.push_back(node); - } - - { - for (int i = 0; i < 5; ++i) { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = decimal_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttodecimalv2"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = decimal_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("cast_to_decimalv2_val(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::DecimalV2Operators::cast_to_decimalv2_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = i; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(9 + i, expr); - params.src_slot_ids.push_back(i); - } - - { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = tinyint_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttotinyint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = tinyint_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("cast_to_tiny_int_val(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_tiny_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 5; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(14, expr); - params.src_slot_ids.push_back(5); - } - - { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = datetime_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttodatetime"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = datetime_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("cast_to_datetime_val(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_datetime_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 6; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(15, expr); - params.src_slot_ids.push_back(6); - } - { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = date_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttodate"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = date_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_date_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 7; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(16, expr); - params.src_slot_ids.push_back(7); - } - { - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = decimal_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttodecimalv2"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = decimal_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("cast_to_decimalv2_val(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::DecimalV2Operators::cast_to_decimalv2_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = 8; - slot_ref.slot_ref.tuple_id = 0; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - params.expr_of_dest_slot.emplace(17, expr); - params.src_slot_ids.push_back(8); - } - } - params.__set_src_tuple_id(0); - params.__set_dest_tuple_id(1); - - //init_desc_table - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::BROKER_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - - TDescriptorTableBuilder dtb; - TTupleDescriptorBuilder src_tuple_builder; - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col1") - .column_pos(1) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col2") - .column_pos(2) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col3") - .column_pos(3) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col4") - .column_pos(4) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col5") - .column_pos(5) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col6") - .column_pos(6) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col7") - .column_pos(7) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col8") - .column_pos(8) - .build()); - src_tuple_builder.add_slot(TSlotDescriptorBuilder() - .string_type(65535) - .nullable(true) - .column_name("col9") - .column_pos(9) - .build()); - src_tuple_builder.build(&dtb); - - TTupleDescriptorBuilder dest_tuple_builder; - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().decimal_type(10, 9).column_name("col1").column_pos(1).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().decimal_type(7, 5).column_name("col2").column_pos(2).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().decimal_type(10, 9).column_name("col3").column_pos(3).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().decimal_type(10, 5).column_name("col4").column_pos(4).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().decimal_type(10, 5).column_name("col5").column_pos(5).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().type(TYPE_TINYINT).column_name("col6").column_pos(6).build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().type(TYPE_DATETIME).column_name("col7").column_pos(7).build()); - dest_tuple_builder.add_slot(TSlotDescriptorBuilder() - .type(TYPE_DATE) - .nullable(true) - .column_name("col8") - .column_pos(8) - .build()); - dest_tuple_builder.add_slot( - TSlotDescriptorBuilder().decimal_type(27, 9).column_name("col9").column_pos(9).build()); - - dest_tuple_builder.build(&dtb); - t_desc_table = dtb.desc_tbl(); - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - _runtime_state.set_desc_tbl(_desc_tbl); - - std::vector ranges; - TBrokerRangeDesc rangeDesc; - rangeDesc.start_offset = 0; - rangeDesc.size = -1; - rangeDesc.format_type = TFileFormatType::FORMAT_ORC; - rangeDesc.splittable = false; - - rangeDesc.path = "./be/test/exec/test_data/orc_scanner/decimal_and_timestamp.orc"; - rangeDesc.file_type = TFileType::FILE_LOCAL; - ranges.push_back(rangeDesc); - - VORCScanner scanner(&_runtime_state, _profile, params, ranges, _addresses, _pre_filter, - &_counter); - EXPECT_TRUE(scanner.open().ok()); - - bool eof = false; - vectorized::Block block; - EXPECT_TRUE(scanner.get_next(&block, &eof).ok()); - EXPECT_EQ(1, block.rows()); - EXPECT_TRUE(eof); - scanner.close(); -} - -} // namespace vectorized -} // namespace doris diff --git a/be/test/vec/exec/vparquet_scanner_test.cpp b/be/test/vec/exec/vparquet_scanner_test.cpp deleted file mode 100644 index b175ec130a..0000000000 --- a/be/test/vec/exec/vparquet_scanner_test.cpp +++ /dev/null @@ -1,497 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include - -#include -#include -#include - -#include "common/object_pool.h" -#include "exprs/cast_functions.h" -#include "gen_cpp/Descriptors_types.h" -#include "gen_cpp/PlanNodes_types.h" -#include "io/local_file_reader.h" -#include "runtime/descriptors.h" -#include "runtime/runtime_state.h" -#include "runtime/tuple.h" -#include "runtime/user_function_cache.h" -#include "vec/exec/vbroker_scan_node.h" - -namespace doris { -namespace vectorized { - -class VParquetScannerTest : public testing::Test { -public: - VParquetScannerTest() : _runtime_state(TQueryGlobals()) { - init(); - _runtime_state.init_mem_trackers(); - } - ~VParquetScannerTest() {} - void init(); - static void SetUpTestCase() { - UserFunctionCache::instance()->init( - "./be/test/runtime/test_data/user_function_cache/normal"); - CastFunctions::init(); - } - -protected: - virtual void SetUp() {} - virtual void TearDown() {} - -private: - int create_src_tuple(TDescriptorTable& t_desc_table, int next_slot_id); - int create_dst_tuple(TDescriptorTable& t_desc_table, int next_slot_id); - void create_expr_info(); - void init_desc_table(); - RuntimeState _runtime_state; - ObjectPool _obj_pool; - std::map _slots_map; - TBrokerScanRangeParams _params; - DescriptorTbl* _desc_tbl; - TPlanNode _tnode; -}; - -#define TUPLE_ID_DST 0 -#define TUPLE_ID_SRC 1 -#define COLUMN_NUMBERS 20 -#define DST_TUPLE_SLOT_ID_START 1 -#define SRC_TUPLE_SLOT_ID_START 21 -int VParquetScannerTest::create_src_tuple(TDescriptorTable& t_desc_table, int next_slot_id) { - const char* columnNames[] = { - "log_version", "log_time", "log_time_stamp", "js_version", - "vst_cookie", "vst_ip", "vst_user_id", "vst_user_agent", - "device_resolution", "page_url", "page_refer_url", "page_yyid", - "page_type", "pos_type", "content_id", "media_id", - "spm_cnt", "spm_pre", "scm_cnt", "partition_column"}; - for (int i = 0; i < COLUMN_NUMBERS; i++) { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 1; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = i; - // Skip the first 8 bytes These 8 bytes are used to indicate whether the field is a null value - slot_desc.byteOffset = i * 16 + 8; - slot_desc.nullIndicatorByte = i / 8; - slot_desc.nullIndicatorBit = i % 8; - slot_desc.colName = columnNames[i]; - slot_desc.slotIdx = i + 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - { - // TTupleDescriptor source - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = TUPLE_ID_SRC; - //Here 8 bytes in order to handle null values - t_tuple_desc.byteSize = COLUMN_NUMBERS * 16 + 8; - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - return next_slot_id; -} - -int VParquetScannerTest::create_dst_tuple(TDescriptorTable& t_desc_table, int next_slot_id) { - int32_t byteOffset = - 8; // Skip the first 8 bytes These 8 bytes are used to indicate whether the field is a null value - { //log_version - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); //parquet::Type::BYTE - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 0; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 0; - slot_desc.colName = "log_version"; - slot_desc.slotIdx = 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 16; - { // log_time - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::BIGINT); //parquet::Type::INT64 - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 1; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 1; - slot_desc.colName = "log_time"; - slot_desc.slotIdx = 2; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 8; - { // log_time_stamp - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::BIGINT); //parquet::Type::INT32 - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = 2; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = 0; - slot_desc.nullIndicatorBit = 2; - slot_desc.colName = "log_time_stamp"; - slot_desc.slotIdx = 3; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - byteOffset += 8; - const char* columnNames[] = { - "log_version", "log_time", "log_time_stamp", "js_version", - "vst_cookie", "vst_ip", "vst_user_id", "vst_user_agent", - "device_resolution", "page_url", "page_refer_url", "page_yyid", - "page_type", "pos_type", "content_id", "media_id", - "spm_cnt", "spm_pre", "scm_cnt", "partition_column"}; - for (int i = 3; i < COLUMN_NUMBERS; i++, byteOffset += 16) { - TSlotDescriptor slot_desc; - - slot_desc.id = next_slot_id++; - slot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); //parquet::Type::BYTE - scalar_type.__set_len(65535); - node.__set_scalar_type(scalar_type); - type.types.push_back(node); - } - slot_desc.slotType = type; - slot_desc.columnPos = i; - slot_desc.byteOffset = byteOffset; - slot_desc.nullIndicatorByte = i / 8; - slot_desc.nullIndicatorBit = i % 8; - slot_desc.colName = columnNames[i]; - slot_desc.slotIdx = i + 1; - slot_desc.isMaterialized = true; - - t_desc_table.slotDescriptors.push_back(slot_desc); - } - - t_desc_table.__isset.slotDescriptors = true; - { - // TTupleDescriptor dest - TTupleDescriptor t_tuple_desc; - t_tuple_desc.id = TUPLE_ID_DST; - t_tuple_desc.byteSize = byteOffset + 8; //Here 8 bytes in order to handle null values - t_tuple_desc.numNullBytes = 0; - t_tuple_desc.tableId = 0; - t_tuple_desc.__isset.tableId = true; - t_desc_table.tupleDescriptors.push_back(t_tuple_desc); - } - return next_slot_id; -} - -void VParquetScannerTest::init_desc_table() { - TDescriptorTable t_desc_table; - - // table descriptors - TTableDescriptor t_table_desc; - - t_table_desc.id = 0; - t_table_desc.tableType = TTableType::BROKER_TABLE; - t_table_desc.numCols = 0; - t_table_desc.numClusteringCols = 0; - t_desc_table.tableDescriptors.push_back(t_table_desc); - t_desc_table.__isset.tableDescriptors = true; - - int next_slot_id = 1; - - next_slot_id = create_dst_tuple(t_desc_table, next_slot_id); - - next_slot_id = create_src_tuple(t_desc_table, next_slot_id); - - DescriptorTbl::create(&_obj_pool, t_desc_table, &_desc_tbl); - - _runtime_state.set_desc_tbl(_desc_tbl); -} - -void VParquetScannerTest::create_expr_info() { - TTypeDesc varchar_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::VARCHAR); - scalar_type.__set_len(5000); - node.__set_scalar_type(scalar_type); - varchar_type.types.push_back(node); - } - // log_version VARCHAR --> VARCHAR - { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START; // log_time id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START); - } - // log_time VARCHAR --> BIGINT - { - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::BIGINT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttoint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_big_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + 1; // log_time id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + 1, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + 1); - } - // log_time_stamp VARCHAR --> BIGINT - { - TTypeDesc int_type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::BIGINT); - node.__set_scalar_type(scalar_type); - int_type.types.push_back(node); - } - TExprNode cast_expr; - cast_expr.node_type = TExprNodeType::CAST_EXPR; - cast_expr.type = int_type; - cast_expr.__set_opcode(TExprOpcode::CAST); - cast_expr.__set_num_children(1); - cast_expr.__set_output_scale(-1); - cast_expr.__isset.fn = true; - cast_expr.fn.name.function_name = "casttoint"; - cast_expr.fn.binary_type = TFunctionBinaryType::BUILTIN; - cast_expr.fn.arg_types.push_back(varchar_type); - cast_expr.fn.ret_type = int_type; - cast_expr.fn.has_var_args = false; - cast_expr.fn.__set_signature("casttoint(VARCHAR(*))"); - cast_expr.fn.__isset.scalar_fn = true; - cast_expr.fn.scalar_fn.symbol = "doris::CastFunctions::cast_to_big_int_val"; - - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + 2; - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(cast_expr); - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + 2, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + 2); - } - // couldn't convert type - for (int i = 3; i < COLUMN_NUMBERS; i++) { - TExprNode slot_ref; - slot_ref.node_type = TExprNodeType::SLOT_REF; - slot_ref.type = varchar_type; - slot_ref.num_children = 0; - slot_ref.__isset.slot_ref = true; - slot_ref.slot_ref.slot_id = SRC_TUPLE_SLOT_ID_START + i; // log_time id in src tuple - slot_ref.slot_ref.tuple_id = 1; - - TExpr expr; - expr.nodes.push_back(slot_ref); - - _params.expr_of_dest_slot.emplace(DST_TUPLE_SLOT_ID_START + i, expr); - _params.src_slot_ids.push_back(SRC_TUPLE_SLOT_ID_START + i); - } - - // _params.__isset.expr_of_dest_slot = true; - _params.__set_dest_tuple_id(TUPLE_ID_DST); - _params.__set_src_tuple_id(TUPLE_ID_SRC); -} - -void VParquetScannerTest::init() { - create_expr_info(); - init_desc_table(); - - // Node Id - _tnode.node_id = 0; - _tnode.node_type = TPlanNodeType::SCHEMA_SCAN_NODE; - _tnode.num_children = 0; - _tnode.limit = -1; - _tnode.row_tuples.push_back(0); - _tnode.nullable_tuples.push_back(false); - _tnode.broker_scan_node.tuple_id = 0; - _tnode.__isset.broker_scan_node = true; -} - -TEST_F(VParquetScannerTest, normal) { - VBrokerScanNode scan_node(&_obj_pool, _tnode, *_desc_tbl); - scan_node.init(_tnode); - auto status = scan_node.prepare(&_runtime_state); - EXPECT_TRUE(status.ok()); - - // set scan range - std::vector scan_ranges; - { - TScanRangeParams scan_range_params; - - TBrokerScanRange broker_scan_range; - broker_scan_range.params = _params; - TBrokerRangeDesc range; - range.start_offset = 0; - range.size = -1; - range.format_type = TFileFormatType::FORMAT_PARQUET; - range.splittable = true; - - std::vector columns_from_path {"value"}; - range.__set_columns_from_path(columns_from_path); - range.__set_num_of_columns_from_file(19); -#if 1 - range.path = "./be/test/exec/test_data/parquet_scanner/localfile.parquet"; - range.file_type = TFileType::FILE_LOCAL; -#else - range.path = "hdfs://ip:8020/user/xxxx.parq"; - range.file_type = TFileType::FILE_BROKER; - TNetworkAddress addr; - addr.__set_hostname("127.0.0.1"); - addr.__set_port(8000); - broker_scan_range.broker_addresses.push_back(addr); -#endif - broker_scan_range.ranges.push_back(range); - scan_range_params.scan_range.__set_broker_scan_range(broker_scan_range); - scan_ranges.push_back(scan_range_params); - } - - scan_node.set_scan_ranges(scan_ranges); - status = scan_node.open(&_runtime_state); - EXPECT_TRUE(status.ok()); - - // Get block - vectorized::Block block; - bool eof = false; - for (int i = 0; i < 14; i++) { - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(2048, block.rows()); - EXPECT_FALSE(eof); - block.clear(); - } - - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(1328, block.rows()); - EXPECT_TRUE(eof); - block.clear(); - status = scan_node.get_next(&_runtime_state, &block, &eof); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(0, block.rows()); - EXPECT_TRUE(eof); - - scan_node.close(&_runtime_state); - { - std::stringstream ss; - scan_node.runtime_profile()->pretty_print(&ss); - LOG(INFO) << ss.str(); - } -} - -} // namespace vectorized -} // namespace doris \ No newline at end of file diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadingTaskPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadingTaskPlanner.java index 69e971abe1..dad51c44f7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadingTaskPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/loadv2/LoadingTaskPlanner.java @@ -34,7 +34,6 @@ import org.apache.doris.common.UserException; import org.apache.doris.common.util.DebugUtil; import org.apache.doris.load.BrokerFileGroup; import org.apache.doris.mysql.privilege.PrivPredicate; -import org.apache.doris.planner.BrokerScanNode; import org.apache.doris.planner.DataPartition; import org.apache.doris.planner.OlapTableSink; import org.apache.doris.planner.PlanFragment; @@ -147,17 +146,9 @@ public class LoadingTaskPlanner { // Generate plan trees // 1. Broker scan node ScanNode scanNode; - boolean useNewScanNode = Config.enable_new_load_scan_node || useNewLoadScanNode; - if (useNewScanNode) { - scanNode = new ExternalFileScanNode(new PlanNodeId(nextNodeId++), scanTupleDesc); - ((ExternalFileScanNode) scanNode).setLoadInfo(loadJobId, txnId, table, brokerDesc, fileGroups, - fileStatusesList, filesAdded, strictMode, loadParallelism, userInfo); - } else { - scanNode = new BrokerScanNode(new PlanNodeId(nextNodeId++), scanTupleDesc, "BrokerScanNode", - fileStatusesList, filesAdded); - ((BrokerScanNode) scanNode).setLoadInfo(loadJobId, txnId, table, brokerDesc, fileGroups, strictMode, - loadParallelism, userInfo); - } + scanNode = new ExternalFileScanNode(new PlanNodeId(nextNodeId++), scanTupleDesc); + ((ExternalFileScanNode) scanNode).setLoadInfo(loadJobId, txnId, table, brokerDesc, fileGroups, + fileStatusesList, filesAdded, strictMode, loadParallelism, userInfo); scanNode.init(analyzer); scanNode.finalize(analyzer); if (Config.enable_vectorized_load) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/BrokerScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/BrokerScanNode.java deleted file mode 100644 index 800011d31f..0000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/BrokerScanNode.java +++ /dev/null @@ -1,654 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.planner; - -import org.apache.doris.analysis.Analyzer; -import org.apache.doris.analysis.BrokerDesc; -import org.apache.doris.analysis.Expr; -import org.apache.doris.analysis.ImportColumnDesc; -import org.apache.doris.analysis.IntLiteral; -import org.apache.doris.analysis.SlotDescriptor; -import org.apache.doris.analysis.SlotRef; -import org.apache.doris.analysis.StorageBackend; -import org.apache.doris.analysis.TupleDescriptor; -import org.apache.doris.analysis.UserIdentity; -import org.apache.doris.catalog.BrokerTable; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.FsBroker; -import org.apache.doris.catalog.HdfsResource; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.Table; -import org.apache.doris.common.AnalysisException; -import org.apache.doris.common.Config; -import org.apache.doris.common.DdlException; -import org.apache.doris.common.FeConstants; -import org.apache.doris.common.UserException; -import org.apache.doris.common.util.BrokerUtil; -import org.apache.doris.common.util.Util; -import org.apache.doris.common.util.VectorizedUtil; -import org.apache.doris.load.BrokerFileGroup; -import org.apache.doris.load.Load; -import org.apache.doris.load.loadv2.LoadTask; -import org.apache.doris.mysql.privilege.UserProperty; -import org.apache.doris.qe.ConnectContext; -import org.apache.doris.resource.Tag; -import org.apache.doris.statistics.StatisticalType; -import org.apache.doris.system.Backend; -import org.apache.doris.system.BeSelectionPolicy; -import org.apache.doris.task.LoadTaskInfo; -import org.apache.doris.thrift.TBrokerFileStatus; -import org.apache.doris.thrift.TBrokerRangeDesc; -import org.apache.doris.thrift.TBrokerScanRange; -import org.apache.doris.thrift.TBrokerScanRangeParams; -import org.apache.doris.thrift.TExplainLevel; -import org.apache.doris.thrift.TFileFormatType; -import org.apache.doris.thrift.TFileType; -import org.apache.doris.thrift.THdfsParams; -import org.apache.doris.thrift.TNetworkAddress; -import org.apache.doris.thrift.TScanRange; -import org.apache.doris.thrift.TScanRangeLocation; -import org.apache.doris.thrift.TScanRangeLocations; - -import com.google.common.base.Joiner; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.nio.charset.Charset; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -/** - * Broker scan node - * - * Since https://github.com/apache/doris/pull/5686, Doris can read data from HDFS without broker by - * broker scan node. - * Broker scan node is more likely a file scan node for now. - * With this feature, we can extend BrokerScanNode to query external table which data is stored in HDFS, such as - * Hive and Iceberg, etc. - */ -public class BrokerScanNode extends LoadScanNode { - private static final Logger LOG = LogManager.getLogger(BrokerScanNode.class); - private static final TBrokerFileStatusComparator T_BROKER_FILE_STATUS_COMPARATOR - = new TBrokerFileStatusComparator(); - - public static class TBrokerFileStatusComparator implements Comparator { - @Override - public int compare(TBrokerFileStatus o1, TBrokerFileStatus o2) { - if (o1.size < o2.size) { - return -1; - } else if (o1.size > o2.size) { - return 1; - } - return 0; - } - } - - // File groups need to - private List locationsList; - - // used both for load statement and select statement - private long totalBytes; - private long bytesPerInstance; - - // Parameters need to process - private long loadJobId = -1; // -1 means this scan node is not for a load job - private long txnId = -1; - protected Table targetTable; - protected BrokerDesc brokerDesc; - protected List fileGroups; - private boolean strictMode = false; - private int loadParallelism = 1; - private UserIdentity userIdentity; - - protected List> fileStatusesList; - // file num - protected int filesAdded; - - // Only used for external table in select statement - private List backends; - private int nextBe = 0; - - private Analyzer analyzer; - - protected static class ParamCreateContext { - public BrokerFileGroup fileGroup; - public TBrokerScanRangeParams params; - public TupleDescriptor srcTupleDescriptor; - public Map exprMap; - public Map slotDescByName; - public String timezone; - } - - private List paramCreateContexts; - - // For broker load and external broker table - public BrokerScanNode(PlanNodeId id, TupleDescriptor destTupleDesc, String planNodeName, - List> fileStatusesList, int filesAdded) { - super(id, destTupleDesc, planNodeName, StatisticalType.BROKER_SCAN_NODE); - this.fileStatusesList = fileStatusesList; - this.filesAdded = filesAdded; - if (ConnectContext.get() != null) { - this.userIdentity = ConnectContext.get().getCurrentUserIdentity(); - } - } - - // For hive and iceberg scan node - public BrokerScanNode(PlanNodeId id, TupleDescriptor destTupleDesc, String planNodeName, - List> fileStatusesList, int filesAdded, StatisticalType statisticalType) { - super(id, destTupleDesc, planNodeName, statisticalType); - this.fileStatusesList = fileStatusesList; - this.filesAdded = filesAdded; - if (ConnectContext.get() != null) { - this.userIdentity = ConnectContext.get().getCurrentUserIdentity(); - } - } - - @Override - public void init(Analyzer analyzer) throws UserException { - super.init(analyzer); - - this.analyzer = analyzer; - if (desc.getTable() != null) { - this.initFileGroup(); - } - - // Get all broker file status - assignBackends(); - getFileStatusAndCalcInstance(); - - paramCreateContexts = Lists.newArrayList(); - for (BrokerFileGroup fileGroup : fileGroups) { - ParamCreateContext context = new ParamCreateContext(); - context.fileGroup = fileGroup; - context.timezone = analyzer.getTimezone(); - initParams(context); - paramCreateContexts.add(context); - } - } - - public List getParamCreateContexts() { - return paramCreateContexts; - } - - protected void initFileGroup() throws UserException { - BrokerTable brokerTable = (BrokerTable) desc.getTable(); - try { - fileGroups = Lists.newArrayList(new BrokerFileGroup(brokerTable)); - } catch (AnalysisException e) { - throw new UserException(e.getMessage()); - } - brokerDesc = new BrokerDesc(brokerTable.getBrokerName(), brokerTable.getBrokerProperties()); - targetTable = brokerTable; - } - - protected boolean isLoad() { - return desc.getTable() == null; - } - - public void setLoadInfo(long loadJobId, - long txnId, - Table targetTable, - BrokerDesc brokerDesc, - List fileGroups, - boolean strictMode, - int loadParallelism, - UserIdentity userIdentity) { - this.loadJobId = loadJobId; - this.txnId = txnId; - this.targetTable = targetTable; - this.brokerDesc = brokerDesc; - this.fileGroups = fileGroups; - this.strictMode = strictMode; - this.loadParallelism = loadParallelism; - this.userIdentity = userIdentity; - } - - // Called from init, construct source tuple information - private void initParams(ParamCreateContext context) - throws UserException { - TBrokerScanRangeParams params = new TBrokerScanRangeParams(); - context.params = params; - - BrokerFileGroup fileGroup = context.fileGroup; - params.setColumnSeparator(fileGroup.getColumnSeparator().getBytes(Charset.forName("UTF-8"))[0]); - params.setLineDelimiter(fileGroup.getLineDelimiter().getBytes(Charset.forName("UTF-8"))[0]); - params.setColumnSeparatorStr(fileGroup.getColumnSeparator()); - params.setLineDelimiterStr(fileGroup.getLineDelimiter()); - params.setColumnSeparatorLength(fileGroup.getColumnSeparator().getBytes(Charset.forName("UTF-8")).length); - params.setLineDelimiterLength(fileGroup.getLineDelimiter().getBytes(Charset.forName("UTF-8")).length); - params.setStrictMode(strictMode); - params.setProperties(brokerDesc.getProperties()); - if (params.getSrcSlotIds() == null) { - params.setSrcSlotIds(new java.util.ArrayList()); - } - deleteCondition = fileGroup.getDeleteCondition(); - mergeType = fileGroup.getMergeType(); - initColumns(context); - initAndSetPrecedingFilter(fileGroup.getPrecedingFilterExpr(), context.srcTupleDescriptor, analyzer); - initAndSetWhereExpr(fileGroup.getWhereExpr(), this.desc, analyzer); - } - - /** - * This method is used to calculate the slotDescByName and exprMap. - * The expr in exprMap is analyzed in this function. - * The smap of slot which belongs to expr will be analyzed by src desc. - * slotDescByName: the single slot from columns in load stmt - * exprMap: the expr from column mapping in load stmt. - * - * @param context - * @throws UserException - */ - private void initColumns(ParamCreateContext context) throws UserException { - context.srcTupleDescriptor = analyzer.getDescTbl().createTupleDescriptor(); - context.slotDescByName = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); - context.exprMap = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); - - // for load job, column exprs is got from file group - // for query, there is no column exprs, they will be got from table's schema in "Load.initColumns" - LoadTaskInfo.ImportColumnDescs columnDescs = new LoadTaskInfo.ImportColumnDescs(); - if (isLoad()) { - columnDescs.descs = context.fileGroup.getColumnExprList(); - if (mergeType == LoadTask.MergeType.MERGE) { - columnDescs.descs.add(ImportColumnDesc.newDeleteSignImportColumnDesc(deleteCondition)); - } else if (mergeType == LoadTask.MergeType.DELETE) { - columnDescs.descs.add(ImportColumnDesc.newDeleteSignImportColumnDesc(new IntLiteral(1))); - } - // add columnExpr for sequence column - if (targetTable instanceof OlapTable && ((OlapTable) targetTable).hasSequenceCol()) { - String sequenceCol = ((OlapTable) targetTable).getSequenceMapCol(); - if (sequenceCol == null) { - sequenceCol = context.fileGroup.getSequenceCol(); - } - columnDescs.descs.add(new ImportColumnDesc(Column.SEQUENCE_COL, - new SlotRef(null, sequenceCol))); - } - } - - if (targetTable != null) { - Load.initColumns(targetTable, columnDescs, context.fileGroup.getColumnToHadoopFunction(), context.exprMap, - analyzer, context.srcTupleDescriptor, context.slotDescByName, context.params.getSrcSlotIds(), - formatType(context.fileGroup.getFileFormat(), ""), null, VectorizedUtil.isVectorized()); - } - } - - protected TScanRangeLocations newLocations(TBrokerScanRangeParams params, BrokerDesc brokerDesc) - throws UserException { - - Backend selectedBackend; - if (brokerDesc.isMultiLoadBroker()) { - if (!brokerDesc.getProperties().containsKey(BrokerDesc.MULTI_LOAD_BROKER_BACKEND_KEY)) { - throw new DdlException("backend not found for multi load."); - } - String backendId = brokerDesc.getProperties().get(BrokerDesc.MULTI_LOAD_BROKER_BACKEND_KEY); - selectedBackend = Env.getCurrentSystemInfo().getBackend(Long.valueOf(backendId)); - if (selectedBackend == null) { - throw new DdlException("backend " + backendId + " not found for multi load."); - } - } else { - selectedBackend = backends.get(nextBe++); - nextBe = nextBe % backends.size(); - } - - // Generate on broker scan range - TBrokerScanRange brokerScanRange = new TBrokerScanRange(); - brokerScanRange.setParams(params); - if (brokerDesc.getStorageType() == StorageBackend.StorageType.BROKER) { - FsBroker broker = null; - try { - broker = Env.getCurrentEnv().getBrokerMgr() - .getBroker(brokerDesc.getName(), selectedBackend.getHost()); - } catch (AnalysisException e) { - throw new UserException(e.getMessage()); - } - brokerScanRange.addToBrokerAddresses(new TNetworkAddress(broker.ip, broker.port)); - } else if (brokerDesc.getStorageType() == StorageBackend.StorageType.OFS - || brokerDesc.getStorageType() == StorageBackend.StorageType.JFS) { - FsBroker broker = Env.getCurrentEnv().getBrokerMgr().getAnyAliveBroker(); - if (broker == null) { - throw new UserException("No alive broker."); - } - brokerScanRange.addToBrokerAddresses(new TNetworkAddress(broker.ip, broker.port)); - } else { - brokerScanRange.setBrokerAddresses(new ArrayList<>()); - } - - // Scan range - TScanRange scanRange = new TScanRange(); - scanRange.setBrokerScanRange(brokerScanRange); - - // Locations - TScanRangeLocations locations = new TScanRangeLocations(); - locations.setScanRange(scanRange); - - TScanRangeLocation location = new TScanRangeLocation(); - location.setBackendId(selectedBackend.getId()); - location.setServer(new TNetworkAddress(selectedBackend.getHost(), selectedBackend.getBePort())); - locations.addToLocations(location); - - return locations; - } - - private void getFileStatusAndCalcInstance() throws UserException { - if (fileStatusesList == null || filesAdded == -1) { - // FIXME(cmy): fileStatusesList and filesAdded can be set out of db lock when doing pull load, - // but for now it is very difficult to set them out of db lock when doing broker query. - // So we leave this code block here. - // This will be fixed later. - fileStatusesList = Lists.newArrayList(); - filesAdded = 0; - this.getFileStatus(); - } - - if (isLoad() && filesAdded == 0) { - throw new UserException("No source file in this table(" + targetTable.getName() + ")."); - } - - totalBytes = 0; - for (List fileStatuses : fileStatusesList) { - if (!brokerDesc.isMultiLoadBroker()) { - Collections.sort(fileStatuses, T_BROKER_FILE_STATUS_COMPARATOR); - } - for (TBrokerFileStatus fileStatus : fileStatuses) { - totalBytes += fileStatus.size; - } - } - numInstances = 1; - if (!brokerDesc.isMultiLoadBroker()) { - numInstances = (int) (totalBytes / Config.min_bytes_per_broker_scanner); - int totalLoadParallelism = loadParallelism * backends.size(); - numInstances = Math.min(totalLoadParallelism, numInstances); - numInstances = Math.min(numInstances, Config.max_broker_concurrency); - numInstances = Math.max(1, numInstances); - } - - bytesPerInstance = totalBytes / numInstances + 1; - - if (bytesPerInstance > Config.max_bytes_per_broker_scanner) { - throw new UserException( - "Scan bytes per broker scanner exceed limit: " + Config.max_bytes_per_broker_scanner); - } - LOG.info("number instance of broker scan node is: {}, bytes per instance: {}", numInstances, bytesPerInstance); - } - - protected void getFileStatus() throws UserException { - for (BrokerFileGroup fileGroup : fileGroups) { - boolean isBinaryFileFormat = fileGroup.isBinaryFileFormat(); - List fileStatuses = Lists.newArrayList(); - for (int i = 0; i < fileGroup.getFilePaths().size(); i++) { - if (brokerDesc.isMultiLoadBroker()) { - TBrokerFileStatus fileStatus = new TBrokerFileStatus(fileGroup.getFilePaths().get(i), - false, fileGroup.getFileSize().get(i), false); - fileStatuses.add(fileStatus); - } else { - BrokerUtil.parseFile(fileGroup.getFilePaths().get(i), brokerDesc, fileStatuses); - } - } - - // only get non-empty file or non-binary file - fileStatuses = fileStatuses.stream().filter(f -> { - return f.getSize() > 0 || !isBinaryFileFormat; - }).collect(Collectors.toList()); - - fileStatusesList.add(fileStatuses); - filesAdded += fileStatuses.size(); - for (TBrokerFileStatus fstatus : fileStatuses) { - LOG.info("Add file status is {}", fstatus); - } - } - } - - private void assignBackends() throws UserException { - Set tags = Sets.newHashSet(); - if (userIdentity != null) { - tags = Env.getCurrentEnv().getAuth().getResourceTags(userIdentity.getQualifiedUser()); - if (tags == UserProperty.INVALID_RESOURCE_TAGS) { - throw new UserException("No valid resource tag for user: " + userIdentity.getQualifiedUser()); - } - } else { - LOG.debug("user info in BrokerScanNode should not be null, add log to observer"); - } - backends = Lists.newArrayList(); - // broker scan node is used for query or load - BeSelectionPolicy policy = new BeSelectionPolicy.Builder().needQueryAvailable().needLoadAvailable() - .addTags(tags).build(); - backends.addAll(policy.getCandidateBackends(Env.getCurrentSystemInfo().getIdToBackend().values())); - if (backends.isEmpty()) { - throw new UserException("No available backends"); - } - } - - private TFileFormatType formatType(String fileFormat, String path) throws UserException { - if (fileFormat != null) { - if (fileFormat.toLowerCase().equals("parquet")) { - return TFileFormatType.FORMAT_PARQUET; - } else if (fileFormat.toLowerCase().equals("orc")) { - return TFileFormatType.FORMAT_ORC; - } else if (fileFormat.toLowerCase().equals("json")) { - return TFileFormatType.FORMAT_JSON; - // csv/csv_with_name/csv_with_names_and_types treat as csv format - } else if (fileFormat.toLowerCase().equals(FeConstants.csv) - || fileFormat.toLowerCase().equals(FeConstants.csv_with_names) - || fileFormat.toLowerCase().equals(FeConstants.csv_with_names_and_types) - // TODO: Add TEXTFILE to TFileFormatType to Support hive text file format. - || fileFormat.toLowerCase().equals(FeConstants.text)) { - return TFileFormatType.FORMAT_CSV_PLAIN; - } else { - throw new UserException("Not supported file format: " + fileFormat); - } - } - - return Util.getFileFormatType(path); - } - - public String getHostUri() throws UserException { - return ""; - } - - private String getHeaderType(String formatType) { - if (formatType != null) { - if (formatType.toLowerCase().equals(FeConstants.csv_with_names) - || formatType.toLowerCase().equals(FeConstants.csv_with_names_and_types)) { - return formatType; - } - } - return ""; - } - - // If fileFormat is not null, we use fileFormat instead of check file's suffix - private void processFileGroup( - ParamCreateContext context, - List fileStatuses) - throws UserException { - if (fileStatuses == null || fileStatuses.isEmpty()) { - return; - } - // set hdfs params, used to Hive and Iceberg scan - THdfsParams tHdfsParams = new THdfsParams(); - String fsName = getHostUri(); - tHdfsParams.setFsName(fsName); - - TScanRangeLocations curLocations = newLocations(context.params, brokerDesc); - long curInstanceBytes = 0; - long curFileOffset = 0; - for (int i = 0; i < fileStatuses.size(); ) { - TBrokerFileStatus fileStatus = fileStatuses.get(i); - long leftBytes = fileStatus.size - curFileOffset; - long tmpBytes = curInstanceBytes + leftBytes; - //header_type - String headerType = getHeaderType(context.fileGroup.getFileFormat()); - TFileFormatType formatType = formatType(context.fileGroup.getFileFormat(), fileStatus.path); - List columnsFromPath = BrokerUtil.parseColumnsFromPath(fileStatus.path, - context.fileGroup.getColumnNamesFromPath()); - int numberOfColumnsFromFile = context.slotDescByName.size() - columnsFromPath.size(); - if (tmpBytes > bytesPerInstance) { - // Now only support split plain text - if ((formatType == TFileFormatType.FORMAT_CSV_PLAIN && fileStatus.isSplitable) - || formatType == TFileFormatType.FORMAT_JSON) { - long rangeBytes = bytesPerInstance - curInstanceBytes; - TBrokerRangeDesc rangeDesc = createBrokerRangeDesc(curFileOffset, fileStatus, formatType, - rangeBytes, columnsFromPath, numberOfColumnsFromFile, brokerDesc, headerType); - if (formatType == TFileFormatType.FORMAT_JSON) { - rangeDesc.setStripOuterArray(context.fileGroup.isStripOuterArray()); - rangeDesc.setJsonpaths(context.fileGroup.getJsonPaths()); - rangeDesc.setJsonRoot(context.fileGroup.getJsonRoot()); - rangeDesc.setFuzzyParse(context.fileGroup.isFuzzyParse()); - rangeDesc.setNumAsString(context.fileGroup.isNumAsString()); - rangeDesc.setReadJsonByLine(context.fileGroup.isReadJsonByLine()); - } - curLocations.getScanRange().getBrokerScanRange().addToRanges(rangeDesc); - curFileOffset += rangeBytes; - - } else { - TBrokerRangeDesc rangeDesc = createBrokerRangeDesc(curFileOffset, fileStatus, formatType, - leftBytes, columnsFromPath, numberOfColumnsFromFile, brokerDesc, headerType); - if (rangeDesc.hdfs_params != null && rangeDesc.hdfs_params.getFsName() == null) { - rangeDesc.hdfs_params.setFsName(fsName); - } else if (rangeDesc.hdfs_params == null) { - rangeDesc.setHdfsParams(tHdfsParams); - } - - rangeDesc.setReadByColumnDef(true); - curLocations.getScanRange().getBrokerScanRange().addToRanges(rangeDesc); - curFileOffset = 0; - i++; - } - - // New one scan - locationsList.add(curLocations); - curLocations = newLocations(context.params, brokerDesc); - curInstanceBytes = 0; - - } else { - TBrokerRangeDesc rangeDesc = createBrokerRangeDesc(curFileOffset, fileStatus, formatType, - leftBytes, columnsFromPath, numberOfColumnsFromFile, brokerDesc, headerType); - if (formatType == TFileFormatType.FORMAT_JSON) { - rangeDesc.setStripOuterArray(context.fileGroup.isStripOuterArray()); - rangeDesc.setJsonpaths(context.fileGroup.getJsonPaths()); - rangeDesc.setJsonRoot(context.fileGroup.getJsonRoot()); - rangeDesc.setFuzzyParse(context.fileGroup.isFuzzyParse()); - rangeDesc.setNumAsString(context.fileGroup.isNumAsString()); - rangeDesc.setReadJsonByLine(context.fileGroup.isReadJsonByLine()); - } - if (rangeDesc.hdfs_params != null && rangeDesc.hdfs_params.getFsName() == null) { - rangeDesc.hdfs_params.setFsName(fsName); - } else if (rangeDesc.hdfs_params == null) { - rangeDesc.setHdfsParams(tHdfsParams); - } - - rangeDesc.setReadByColumnDef(true); - curLocations.getScanRange().getBrokerScanRange().addToRanges(rangeDesc); - curFileOffset = 0; - curInstanceBytes += leftBytes; - i++; - } - } - - // Put the last file - if (curLocations.getScanRange().getBrokerScanRange().isSetRanges()) { - locationsList.add(curLocations); - } - } - - private TBrokerRangeDesc createBrokerRangeDesc(long curFileOffset, TBrokerFileStatus fileStatus, - TFileFormatType formatType, long rangeBytes, - List columnsFromPath, int numberOfColumnsFromFile, - BrokerDesc brokerDesc, String headerType) { - TBrokerRangeDesc rangeDesc = new TBrokerRangeDesc(); - rangeDesc.setFileType(brokerDesc.getFileType()); - rangeDesc.setFormatType(formatType); - rangeDesc.setPath(fileStatus.path); - rangeDesc.setSplittable(fileStatus.isSplitable); - rangeDesc.setStartOffset(curFileOffset); - rangeDesc.setSize(rangeBytes); - // fileSize only be used when format is orc or parquet and TFileType is broker - // When TFileType is other type, it is not necessary - rangeDesc.setFileSize(fileStatus.size); - // In Backend, will append columnsFromPath to the end of row after data scanned from file. - rangeDesc.setNumOfColumnsFromFile(numberOfColumnsFromFile); - rangeDesc.setColumnsFromPath(columnsFromPath); - rangeDesc.setHeaderType(headerType); - // set hdfs params for hdfs file type. - if (brokerDesc.getFileType() == TFileType.FILE_HDFS) { - THdfsParams tHdfsParams = HdfsResource.generateHdfsParam(brokerDesc.getProperties()); - rangeDesc.setHdfsParams(tHdfsParams); - } - return rangeDesc; - } - - //TODO(wx):support quantile state column or forbidden it. - @Override - public void finalize(Analyzer analyzer) throws UserException { - locationsList = Lists.newArrayList(); - - for (int i = 0; i < fileGroups.size(); ++i) { - List fileStatuses = fileStatusesList.get(i); - if (fileStatuses.isEmpty()) { - continue; - } - ParamCreateContext context = paramCreateContexts.get(i); - try { - finalizeParams(context.slotDescByName, context.exprMap, context.params, - context.srcTupleDescriptor, strictMode, context.fileGroup.isNegative(), analyzer); - } catch (AnalysisException e) { - throw new UserException(e.getMessage()); - } - processFileGroup(context, fileStatuses); - } - if (LOG.isDebugEnabled()) { - for (TScanRangeLocations locations : locationsList) { - LOG.debug("Scan range is {}", locations); - } - } - - if (loadJobId != -1) { - LOG.info("broker load job {} with txn {} has {} scan range: {}", - loadJobId, txnId, locationsList.size(), - brokerDesc.isMultiLoadBroker() ? "local" - : locationsList.stream().map(loc -> loc.locations.get(0).backend_id).toArray()); - } - } - - @Override - public List getScanRangeLocations(long maxScanRangeLength) { - return locationsList; - } - - @Override - public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { - StringBuilder output = new StringBuilder(); - if (!isLoad()) { - BrokerTable brokerTable = (BrokerTable) targetTable; - output.append(prefix).append("TABLE: ").append(brokerTable.getName()).append("\n"); - if (detailLevel != TExplainLevel.BRIEF) { - output.append(prefix).append("PATH: ") - .append(Joiner.on(",").join(brokerTable.getPaths())).append("\",\n"); - } - } - output.append(prefix).append("BROKER: ").append(brokerDesc.getName()).append("\n"); - return output.toString(); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/HiveScanNode.java deleted file mode 100644 index c963ff0ea5..0000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/HiveScanNode.java +++ /dev/null @@ -1,198 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.planner; - -import org.apache.doris.analysis.Analyzer; -import org.apache.doris.analysis.BrokerDesc; -import org.apache.doris.analysis.ImportColumnDesc; -import org.apache.doris.analysis.StorageBackend; -import org.apache.doris.analysis.TupleDescriptor; -import org.apache.doris.catalog.HMSResource; -import org.apache.doris.catalog.HiveMetaStoreClientHelper; -import org.apache.doris.catalog.HiveTable; -import org.apache.doris.common.FeConstants; -import org.apache.doris.common.UserException; -import org.apache.doris.common.util.Util; -import org.apache.doris.load.BrokerFileGroup; -import org.apache.doris.statistics.StatisticalType; -import org.apache.doris.thrift.TBrokerFileStatus; -import org.apache.doris.thrift.TExplainLevel; - -import com.google.common.base.Strings; -import com.google.common.collect.Lists; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.hive.metastore.api.FieldSchema; -import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -public class HiveScanNode extends BrokerScanNode { - private static final Logger LOG = LogManager.getLogger(HiveScanNode.class); - - private static final String HIVE_DEFAULT_COLUMN_SEPARATOR = "\001"; - private static final String HIVE_DEFAULT_LINE_DELIMITER = "\n"; - - private HiveTable hiveTable; - // partition column predicates of hive table - private ExprNodeGenericFuncDesc hivePartitionPredicate; - private List parsedColumnExprList = new ArrayList<>(); - private String hdfsUri; - - private Table remoteHiveTable; - - /* hive table properties */ - private String columnSeparator; - private String lineDelimiter; - private String fileFormat; - private String path; - private List partitionKeys = new ArrayList<>(); - private StorageBackend.StorageType storageType; - /* hive table properties */ - - public String getHostUri() { - return hdfsUri; - } - - public List getParsedColumnExprList() { - return parsedColumnExprList; - } - - public String getColumnSeparator() { - return columnSeparator; - } - - public String getLineDelimiter() { - return lineDelimiter; - } - - public String getFileFormat() { - return fileFormat; - } - - public String getPath() { - return path; - } - - public List getPartitionKeys() { - return partitionKeys; - } - - public HiveScanNode(PlanNodeId id, TupleDescriptor destTupleDesc, String planNodeName, - List> fileStatusesList, int filesAdded) { - super(id, destTupleDesc, planNodeName, fileStatusesList, filesAdded, StatisticalType.HIVE_SCAN_NODE); - this.hiveTable = (HiveTable) destTupleDesc.getTable(); - } - - @Override - public void init(Analyzer analyzer) throws UserException { - super.init(analyzer); - } - - @Override - protected void initFileGroup() throws UserException { - initHiveTblProperties(); - analyzeColumnFromPath(); - - HiveTable hiveTable = (HiveTable) desc.getTable(); - fileGroups = Lists.newArrayList( - new BrokerFileGroup(hiveTable.getId(), - getColumnSeparator(), - getLineDelimiter(), - getPath(), - getFileFormat(), - getPartitionKeys(), - getParsedColumnExprList())); - brokerDesc = new BrokerDesc("HiveTableDesc", storageType, hiveTable.getHiveProperties()); - targetTable = hiveTable; - } - - private void setStorageType(String location) throws UserException { - String[] strings = StringUtils.split(location, "/"); - String storagePrefix = strings[0].split(":")[0]; - if (Util.isS3CompatibleStorageSchema(storagePrefix)) { - this.storageType = StorageBackend.StorageType.S3; - } else if (storagePrefix.equalsIgnoreCase("hdfs")) { - this.storageType = StorageBackend.StorageType.HDFS; - } else if (storagePrefix.equalsIgnoreCase(FeConstants.FS_PREFIX_OFS)) { - this.storageType = StorageBackend.StorageType.OFS; - } else if (storagePrefix.equalsIgnoreCase(FeConstants.FS_PREFIX_JFS)) { - this.storageType = StorageBackend.StorageType.JFS; - } else { - throw new UserException("Not supported storage type: " + storagePrefix); - } - } - - private void initHiveTblProperties() throws UserException { - this.remoteHiveTable = HiveMetaStoreClientHelper.getTable(hiveTable); - this.fileFormat = HiveMetaStoreClientHelper.HiveFileFormat.getFormat(remoteHiveTable.getSd().getInputFormat()); - this.setStorageType(remoteHiveTable.getSd().getLocation()); - - Map serDeInfoParams = remoteHiveTable.getSd().getSerdeInfo().getParameters(); - this.columnSeparator = Strings.isNullOrEmpty(serDeInfoParams.get("field.delim")) - ? HIVE_DEFAULT_COLUMN_SEPARATOR : serDeInfoParams.get("field.delim"); - this.lineDelimiter = Strings.isNullOrEmpty(serDeInfoParams.get("line.delim")) - ? HIVE_DEFAULT_LINE_DELIMITER : serDeInfoParams.get("line.delim"); - this.path = remoteHiveTable.getSd().getLocation(); - for (FieldSchema fieldSchema : remoteHiveTable.getPartitionKeys()) { - this.partitionKeys.add(fieldSchema.getName()); - } - } - - @Override - protected void getFileStatus() throws UserException { - if (partitionKeys.size() > 0) { - // Hive Table is no longer supported. - // So there we just create an empty predicate - hivePartitionPredicate = new ExprNodeGenericFuncDesc(); - } - List fileStatuses = new ArrayList<>(); - this.hdfsUri = HiveMetaStoreClientHelper.getHiveDataFiles(hiveTable, hivePartitionPredicate, - fileStatuses, remoteHiveTable, storageType); - fileStatusesList.add(fileStatuses); - filesAdded += fileStatuses.size(); - for (TBrokerFileStatus fstatus : fileStatuses) { - LOG.debug("Add file status is {}", fstatus); - } - } - - @Override - public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { - StringBuilder output = new StringBuilder(); - if (!isLoad()) { - output.append(prefix).append("TABLE: ").append(hiveTable.getName()).append("\n"); - output.append(prefix).append("PATH: ") - .append(hiveTable.getHiveProperties().get(HMSResource.HIVE_METASTORE_URIS)).append("\n"); - } - return output.toString(); - } - - /** - * Analyze columns from path, the partition columns - */ - private void analyzeColumnFromPath() { - for (String colName : partitionKeys) { - ImportColumnDesc importColumnDesc = new ImportColumnDesc(colName, null); - parsedColumnExprList.add(importColumnDesc); - } - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/IcebergScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/IcebergScanNode.java deleted file mode 100644 index 4afc4305bd..0000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/IcebergScanNode.java +++ /dev/null @@ -1,112 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.planner; - -import org.apache.doris.analysis.Analyzer; -import org.apache.doris.analysis.BrokerDesc; -import org.apache.doris.analysis.Expr; -import org.apache.doris.analysis.TupleDescriptor; -import org.apache.doris.catalog.IcebergProperty; -import org.apache.doris.catalog.IcebergTable; -import org.apache.doris.common.UserException; -import org.apache.doris.external.iceberg.util.IcebergUtils; -import org.apache.doris.load.BrokerFileGroup; -import org.apache.doris.statistics.StatisticalType; -import org.apache.doris.thrift.TBrokerFileStatus; -import org.apache.doris.thrift.TExplainLevel; - -import com.google.common.collect.Lists; -import org.apache.iceberg.expressions.Expression; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.ArrayList; -import java.util.List; -import java.util.ListIterator; - -public class IcebergScanNode extends BrokerScanNode { - private static final Logger LOG = LogManager.getLogger(IcebergScanNode.class); - - private IcebergTable icebergTable; - private final List icebergPredicates = new ArrayList<>(); - - public IcebergScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName, - List> fileStatusesList, int filesAdded) { - super(id, desc, planNodeName, fileStatusesList, filesAdded, StatisticalType.ICEBERG_SCAN_NODE); - icebergTable = (IcebergTable) desc.getTable(); - } - - @Override - public void init(Analyzer analyzer) throws UserException { - super.init(analyzer); - } - - @Override - protected void initFileGroup() throws UserException { - fileGroups = Lists.newArrayList( - new BrokerFileGroup(icebergTable.getId(), - null, - icebergTable.getFileFormat())); - brokerDesc = new BrokerDesc("IcebergTableDesc", icebergTable.getStorageType(), - icebergTable.getIcebergProperties()); - targetTable = icebergTable; - } - - @Override - public String getHostUri() throws UserException { - return icebergTable.getHostUri(); - } - - @Override - protected void getFileStatus() throws UserException { - // extract iceberg conjuncts - ListIterator it = conjuncts.listIterator(); - while (it.hasNext()) { - Expression expression = IcebergUtils.convertToIcebergExpr(it.next()); - if (expression != null) { - icebergPredicates.add(expression); - } - } - // get iceberg file status - List fileStatuses; - try { - fileStatuses = icebergTable.getIcebergDataFiles(icebergPredicates); - } catch (Exception e) { - LOG.warn("errors while load iceberg table {} data files.", icebergTable.getName(), e); - throw new UserException("errors while load Iceberg table [" - + icebergTable.getName() + "] data files."); - } - fileStatusesList.add(fileStatuses); - filesAdded += fileStatuses.size(); - for (TBrokerFileStatus fstatus : fileStatuses) { - LOG.debug("Add file status is {}", fstatus); - } - } - - @Override - public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { - StringBuilder output = new StringBuilder(); - if (!isLoad()) { - output.append(prefix).append("TABLE: ").append(icebergTable.getName()).append("\n"); - output.append(prefix).append("PATH: ") - .append(icebergTable.getIcebergProperties().get(IcebergProperty.ICEBERG_HIVE_METASTORE_URIS)) - .append("\n"); - } - return output.toString(); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/LoadScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/LoadScanNode.java deleted file mode 100644 index bf4c66dccb..0000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/LoadScanNode.java +++ /dev/null @@ -1,256 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.planner; - -import org.apache.doris.analysis.Analyzer; -import org.apache.doris.analysis.ArithmeticExpr; -import org.apache.doris.analysis.Expr; -import org.apache.doris.analysis.ExprSubstitutionMap; -import org.apache.doris.analysis.FunctionCallExpr; -import org.apache.doris.analysis.IntLiteral; -import org.apache.doris.analysis.NullLiteral; -import org.apache.doris.analysis.SlotDescriptor; -import org.apache.doris.analysis.SlotRef; -import org.apache.doris.analysis.StringLiteral; -import org.apache.doris.analysis.TupleDescriptor; -import org.apache.doris.catalog.AggregateType; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.FunctionSet; -import org.apache.doris.catalog.PrimitiveType; -import org.apache.doris.catalog.Type; -import org.apache.doris.common.AnalysisException; -import org.apache.doris.common.Config; -import org.apache.doris.common.UserException; -import org.apache.doris.load.loadv2.LoadTask; -import org.apache.doris.rewrite.ExprRewriter; -import org.apache.doris.statistics.StatisticalType; -import org.apache.doris.thrift.TBrokerScanNode; -import org.apache.doris.thrift.TBrokerScanRangeParams; -import org.apache.doris.thrift.TPlanNode; -import org.apache.doris.thrift.TPlanNodeType; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import java.util.List; -import java.util.Map; - -public abstract class LoadScanNode extends ScanNode { - - protected Expr deleteCondition; - protected LoadTask.MergeType mergeType = LoadTask.MergeType.APPEND; - - public LoadScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName) { - super(id, desc, planNodeName, StatisticalType.LOAD_SCAN_NODE); - } - - public LoadScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName, StatisticalType statisticalType) { - super(id, desc, planNodeName, statisticalType); - } - - protected void initAndSetWhereExpr(Expr whereExpr, TupleDescriptor tupleDesc, - Analyzer analyzer) throws UserException { - Expr newWhereExpr = initWhereExpr(whereExpr, tupleDesc, analyzer); - if (newWhereExpr != null) { - addConjuncts(newWhereExpr.getConjuncts()); - } - } - - protected void initAndSetPrecedingFilter(Expr whereExpr, - TupleDescriptor tupleDesc, Analyzer analyzer) throws UserException { - Expr newWhereExpr = initWhereExpr(whereExpr, tupleDesc, analyzer); - if (newWhereExpr != null) { - addPreFilterConjuncts(newWhereExpr.getConjuncts()); - } - } - - private Expr initWhereExpr(Expr whereExpr, TupleDescriptor tupleDesc, Analyzer analyzer) throws UserException { - if (whereExpr == null) { - return null; - } - - Map dstDescMap = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); - for (SlotDescriptor slotDescriptor : tupleDesc.getSlots()) { - dstDescMap.put(slotDescriptor.getColumn().getName(), slotDescriptor); - } - - // substitute SlotRef in filter expression - // where expr must be equal first to transfer some predicates(eg: BetweenPredicate to BinaryPredicate) - Expr newWhereExpr = analyzer.getExprRewriter() - .rewrite(whereExpr, analyzer, ExprRewriter.ClauseType.WHERE_CLAUSE); - List slots = Lists.newArrayList(); - newWhereExpr.collect(SlotRef.class, slots); - - ExprSubstitutionMap smap = new ExprSubstitutionMap(); - for (SlotRef slot : slots) { - SlotDescriptor slotDesc = dstDescMap.get(slot.getColumnName()); - if (slotDesc == null) { - throw new UserException("unknown column reference in where statement, reference=" - + slot.getColumnName()); - } - smap.getLhs().add(slot); - smap.getRhs().add(new SlotRef(slotDesc)); - } - newWhereExpr = newWhereExpr.clone(smap); - newWhereExpr.analyze(analyzer); - if (!newWhereExpr.getType().equals(Type.BOOLEAN)) { - throw new UserException("where statement is not a valid statement return bool"); - } - return newWhereExpr; - } - - protected void checkBitmapCompatibility(Analyzer analyzer, - SlotDescriptor slotDesc, Expr expr) throws AnalysisException { - if (slotDesc.getColumn().getAggregationType() == AggregateType.BITMAP_UNION) { - expr.analyze(analyzer); - if (!expr.getType().isBitmapType()) { - String errorMsg = String.format("bitmap column %s require the function return type is BITMAP", - slotDesc.getColumn().getName()); - throw new AnalysisException(errorMsg); - } - } - } - - protected void checkQuantileStateCompatibility(Analyzer analyzer, - SlotDescriptor slotDesc, Expr expr) throws AnalysisException { - if (slotDesc.getColumn().getAggregationType() == AggregateType.QUANTILE_UNION) { - expr.analyze(analyzer); - if (!expr.getType().isQuantileStateType()) { - String errorMsg = "quantile_state column %s require the function return type is QUANTILE_STATE"; - throw new AnalysisException(errorMsg); - } - } - } - - protected void finalizeParams(Map slotDescByName, - Map exprMap, - TBrokerScanRangeParams params, - TupleDescriptor srcTupleDesc, - boolean strictMode, - boolean negative, - Analyzer analyzer) throws UserException { - Map destSidToSrcSidWithoutTrans = Maps.newHashMap(); - for (SlotDescriptor destSlotDesc : desc.getSlots()) { - if (!destSlotDesc.isMaterialized()) { - continue; - } - Expr expr = null; - if (exprMap != null) { - expr = exprMap.get(destSlotDesc.getColumn().getName()); - } - if (expr == null) { - SlotDescriptor srcSlotDesc = slotDescByName.get(destSlotDesc.getColumn().getName()); - if (srcSlotDesc != null) { - destSidToSrcSidWithoutTrans.put(destSlotDesc.getId().asInt(), srcSlotDesc.getId().asInt()); - // If dest is allow null, we set source to nullable - if (destSlotDesc.getColumn().isAllowNull()) { - srcSlotDesc.setIsNullable(true); - } - expr = new SlotRef(srcSlotDesc); - } else { - Column column = destSlotDesc.getColumn(); - if (column.getDefaultValue() != null) { - if (column.getDefaultValueExprDef() != null) { - expr = column.getDefaultValueExpr(); - } else { - expr = new StringLiteral(destSlotDesc.getColumn().getDefaultValue()); - } - } else { - if (column.isAllowNull()) { - expr = NullLiteral.create(column.getType()); - } else { - throw new AnalysisException("column has no source field, column=" + column.getName()); - } - } - } - } - - // check hll_hash - if (destSlotDesc.getType().getPrimitiveType() == PrimitiveType.HLL) { - if (!(expr instanceof FunctionCallExpr)) { - throw new AnalysisException("HLL column must use " + FunctionSet.HLL_HASH + " function, like " - + destSlotDesc.getColumn().getName() + "=" + FunctionSet.HLL_HASH + "(xxx)"); - } - FunctionCallExpr fn = (FunctionCallExpr) expr; - if (!fn.getFnName().getFunction().equalsIgnoreCase(FunctionSet.HLL_HASH) - && !fn.getFnName().getFunction().equalsIgnoreCase("hll_empty")) { - throw new AnalysisException("HLL column must use " + FunctionSet.HLL_HASH + " function, like " - + destSlotDesc.getColumn().getName() + "=" + FunctionSet.HLL_HASH - + "(xxx) or " + destSlotDesc.getColumn().getName() + "=hll_empty()"); - } - expr.setType(Type.HLL); - } - - checkBitmapCompatibility(analyzer, destSlotDesc, expr); - - checkQuantileStateCompatibility(analyzer, destSlotDesc, expr); - - // check quantile_state - - if (negative && destSlotDesc.getColumn().getAggregationType() == AggregateType.SUM) { - expr = new ArithmeticExpr(ArithmeticExpr.Operator.MULTIPLY, expr, new IntLiteral(-1)); - expr.analyze(analyzer); - } - - // for jsonb type, use jsonb_parse_xxx to parse src string to jsonb. - // and if input string is not a valid json string, return null. - PrimitiveType dstType = destSlotDesc.getType().getPrimitiveType(); - PrimitiveType srcType = expr.getType().getPrimitiveType(); - if (dstType == PrimitiveType.JSONB - && (srcType == PrimitiveType.VARCHAR || srcType == PrimitiveType.STRING)) { - List args = Lists.newArrayList(); - args.add(expr); - String nullable = "notnull"; - if (destSlotDesc.getIsNullable() || expr.isNullable()) { - nullable = "nullable"; - } - String name = "jsonb_parse_" + nullable + "_error_to_null"; - expr = new FunctionCallExpr(name, args); - expr.analyze(analyzer); - } else { - expr = castToSlot(destSlotDesc, expr); - } - params.putToExprOfDestSlot(destSlotDesc.getId().asInt(), expr.treeToThrift()); - } - params.setDestSidToSrcSidWithoutTrans(destSidToSrcSidWithoutTrans); - params.setDestTupleId(desc.getId().asInt()); - params.setStrictMode(strictMode); - params.setSrcTupleId(srcTupleDesc.getId().asInt()); - // LOG.info("brokerScanRange is {}", brokerScanRange); - - // Need re compute memory layout after set some slot descriptor to nullable - srcTupleDesc.computeStatAndMemLayout(); - } - - @Override - protected void toThrift(TPlanNode planNode) { - planNode.setNodeType(TPlanNodeType.BROKER_SCAN_NODE); - TBrokerScanNode brokerScanNode = new TBrokerScanNode(desc.getId().asInt()); - if (!preFilterConjuncts.isEmpty()) { - if (Config.enable_vectorized_load && vpreFilterConjunct != null) { - brokerScanNode.addToPreFilterExprs(vpreFilterConjunct.treeToThrift()); - } else { - for (Expr e : preFilterConjuncts) { - brokerScanNode.addToPreFilterExprs(e.treeToThrift()); - } - } - } - planNode.setBrokerScanNode(brokerScanNode); - } -} - diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java index 59d9400060..c8b9f12d04 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java @@ -1934,16 +1934,12 @@ public class SingleNodePlanner { } break; case BROKER: - scanNode = new BrokerScanNode(ctx.getNextNodeId(), tblRef.getDesc(), "BrokerScanNode", - null, -1); - break; + throw new RuntimeException("Broker external table is not supported, try to use table function please"); case ELASTICSEARCH: scanNode = new EsScanNode(ctx.getNextNodeId(), tblRef.getDesc(), "EsScanNode"); break; case HIVE: - scanNode = new HiveScanNode(ctx.getNextNodeId(), tblRef.getDesc(), "HiveScanNode", - null, -1); - break; + throw new RuntimeException("Hive external table is not supported, try to use hive catalog please"); case ICEBERG: scanNode = new ExternalFileScanNode(ctx.getNextNodeId(), tblRef.getDesc()); break; @@ -1968,7 +1964,7 @@ public class SingleNodePlanner { default: break; } - if (scanNode instanceof OlapScanNode || scanNode instanceof EsScanNode || scanNode instanceof HiveScanNode + if (scanNode instanceof OlapScanNode || scanNode instanceof EsScanNode || scanNode instanceof ExternalFileScanNode) { if (analyzer.enableInferPredicate()) { PredicatePushDown.visitScanNode(scanNode, tblRef.getJoinOp(), analyzer); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java index 86a06980ef..1128342804 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadPlanner.java @@ -172,30 +172,26 @@ public class StreamLoadPlanner { } // create scan node - if (Config.enable_new_load_scan_node && Config.enable_vectorized_load) { - ExternalFileScanNode fileScanNode = new ExternalFileScanNode(new PlanNodeId(0), scanTupleDesc); - // 1. create file group - DataDescription dataDescription = new DataDescription(destTable.getName(), taskInfo); - dataDescription.analyzeWithoutCheckPriv(db.getFullName()); - BrokerFileGroup fileGroup = new BrokerFileGroup(dataDescription); - fileGroup.parse(db, dataDescription); - // 2. create dummy file status - TBrokerFileStatus fileStatus = new TBrokerFileStatus(); - if (taskInfo.getFileType() == TFileType.FILE_LOCAL) { - fileStatus.setPath(taskInfo.getPath()); - fileStatus.setIsDir(false); - fileStatus.setSize(taskInfo.getFileSize()); // must set to -1, means stream. - } else { - fileStatus.setPath(""); - fileStatus.setIsDir(false); - fileStatus.setSize(-1); // must set to -1, means stream. - } - fileScanNode.setLoadInfo(loadId, taskInfo.getTxnId(), destTable, BrokerDesc.createForStreamLoad(), - fileGroup, fileStatus, taskInfo.isStrictMode(), taskInfo.getFileType()); - scanNode = fileScanNode; + ExternalFileScanNode fileScanNode = new ExternalFileScanNode(new PlanNodeId(0), scanTupleDesc); + // 1. create file group + DataDescription dataDescription = new DataDescription(destTable.getName(), taskInfo); + dataDescription.analyzeWithoutCheckPriv(db.getFullName()); + BrokerFileGroup fileGroup = new BrokerFileGroup(dataDescription); + fileGroup.parse(db, dataDescription); + // 2. create dummy file status + TBrokerFileStatus fileStatus = new TBrokerFileStatus(); + if (taskInfo.getFileType() == TFileType.FILE_LOCAL) { + fileStatus.setPath(taskInfo.getPath()); + fileStatus.setIsDir(false); + fileStatus.setSize(taskInfo.getFileSize()); // must set to -1, means stream. } else { - scanNode = new StreamLoadScanNode(loadId, new PlanNodeId(0), scanTupleDesc, destTable, taskInfo); + fileStatus.setPath(""); + fileStatus.setIsDir(false); + fileStatus.setSize(-1); // must set to -1, means stream. } + fileScanNode.setLoadInfo(loadId, taskInfo.getTxnId(), destTable, BrokerDesc.createForStreamLoad(), + fileGroup, fileStatus, taskInfo.isStrictMode(), taskInfo.getFileType()); + scanNode = fileScanNode; scanNode.init(analyzer); scanNode.finalize(analyzer); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadScanNode.java deleted file mode 100644 index 621c7a9306..0000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/StreamLoadScanNode.java +++ /dev/null @@ -1,215 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.planner; - -import org.apache.doris.analysis.Analyzer; -import org.apache.doris.analysis.Expr; -import org.apache.doris.analysis.ImportColumnDesc; -import org.apache.doris.analysis.IntLiteral; -import org.apache.doris.analysis.SlotDescriptor; -import org.apache.doris.analysis.SlotRef; -import org.apache.doris.analysis.TupleDescriptor; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.Table; -import org.apache.doris.common.UserException; -import org.apache.doris.common.util.VectorizedUtil; -import org.apache.doris.load.Load; -import org.apache.doris.load.loadv2.LoadTask; -import org.apache.doris.statistics.StatisticalType; -import org.apache.doris.task.LoadTaskInfo; -import org.apache.doris.thrift.TBrokerRangeDesc; -import org.apache.doris.thrift.TBrokerScanRange; -import org.apache.doris.thrift.TBrokerScanRangeParams; -import org.apache.doris.thrift.TExplainLevel; -import org.apache.doris.thrift.TFileFormatType; -import org.apache.doris.thrift.TScanRange; -import org.apache.doris.thrift.TScanRangeLocations; -import org.apache.doris.thrift.TUniqueId; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.nio.charset.Charset; -import java.util.List; -import java.util.Map; - -/** - * used to scan from stream - */ -public class StreamLoadScanNode extends LoadScanNode { - private static final Logger LOG = LogManager.getLogger(StreamLoadScanNode.class); - - private TUniqueId loadId; - // TODO(zc): now we use scanRange - // input parameter - private Table dstTable; - private LoadTaskInfo taskInfo; - - // helper - private Analyzer analyzer; - private TupleDescriptor srcTupleDesc; - private TBrokerScanRange brokerScanRange; - - // If use case sensitive map, for example, - // the column name 「A」 in the table and the mapping '(a) set (A = a)' in load sql, - // Slotdescbyname stores「a」, later will use 「a」to get table's 「A」 column info, will throw exception. - private final Map slotDescByName = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); - private final Map exprsByName = Maps.newTreeMap(String.CASE_INSENSITIVE_ORDER); - - // used to construct for streaming loading - public StreamLoadScanNode( - TUniqueId loadId, PlanNodeId id, TupleDescriptor tupleDesc, Table dstTable, LoadTaskInfo taskInfo) { - super(id, tupleDesc, "StreamLoadScanNode", StatisticalType.STREAM_LOAD_SCAN_NODE); - this.loadId = loadId; - this.dstTable = dstTable; - this.taskInfo = taskInfo; - this.numInstances = 1; - } - - @Override - public void init(Analyzer analyzer) throws UserException { - // can't call super.init(), because after super.init, conjuncts would be null - assignConjuncts(analyzer); - - this.analyzer = analyzer; - brokerScanRange = new TBrokerScanRange(); - - deleteCondition = taskInfo.getDeleteCondition(); - mergeType = taskInfo.getMergeType(); - - TBrokerRangeDesc rangeDesc = new TBrokerRangeDesc(); - rangeDesc.file_type = taskInfo.getFileType(); - rangeDesc.format_type = taskInfo.getFormatType(); - if (rangeDesc.format_type == TFileFormatType.FORMAT_JSON) { - if (!taskInfo.getJsonPaths().isEmpty()) { - rangeDesc.setJsonpaths(taskInfo.getJsonPaths()); - } - if (!taskInfo.getJsonRoot().isEmpty()) { - rangeDesc.setJsonRoot(taskInfo.getJsonRoot()); - } - rangeDesc.setStripOuterArray(taskInfo.isStripOuterArray()); - rangeDesc.setNumAsString(taskInfo.isNumAsString()); - rangeDesc.setFuzzyParse(taskInfo.isFuzzyParse()); - rangeDesc.setReadJsonByLine(taskInfo.isReadJsonByLine()); - } - rangeDesc.splittable = false; - switch (taskInfo.getFileType()) { - case FILE_LOCAL: - rangeDesc.path = taskInfo.getPath(); - break; - case FILE_STREAM: - rangeDesc.path = "Invalid Path"; - rangeDesc.load_id = loadId; - break; - default: - throw new UserException("unsupported file type, type=" + taskInfo.getFileType()); - } - rangeDesc.start_offset = 0; - rangeDesc.setHeaderType(taskInfo.getHeaderType()); - rangeDesc.size = -1; - brokerScanRange.addToRanges(rangeDesc); - - srcTupleDesc = analyzer.getDescTbl().createTupleDescriptor("StreamLoadScanNode"); - - TBrokerScanRangeParams params = new TBrokerScanRangeParams(); - LoadTaskInfo.ImportColumnDescs columnExprDescs = taskInfo.getColumnExprDescs(); - if (!columnExprDescs.isColumnDescsRewrited) { - if (mergeType == LoadTask.MergeType.MERGE) { - columnExprDescs.descs.add(ImportColumnDesc.newDeleteSignImportColumnDesc(deleteCondition)); - } else if (mergeType == LoadTask.MergeType.DELETE) { - columnExprDescs.descs.add(ImportColumnDesc.newDeleteSignImportColumnDesc(new IntLiteral(1))); - } - if (dstTable instanceof OlapTable && ((OlapTable) dstTable).hasSequenceCol()) { - String sequenceCol = ((OlapTable) dstTable).getSequenceMapCol(); - if (sequenceCol == null) { - sequenceCol = taskInfo.getSequenceCol(); - } - columnExprDescs.descs.add(new ImportColumnDesc(Column.SEQUENCE_COL, - new SlotRef(null, sequenceCol))); - } - } - - if (params.getSrcSlotIds() == null) { - params.setSrcSlotIds(Lists.newArrayList()); - } - Load.initColumns(dstTable, columnExprDescs, null /* no hadoop function */, exprsByName, analyzer, srcTupleDesc, - slotDescByName, params.getSrcSlotIds(), taskInfo.getFormatType(), taskInfo.getHiddenColumns(), - VectorizedUtil.isVectorized()); - - // analyze where statement - initAndSetPrecedingFilter(taskInfo.getPrecedingFilter(), this.srcTupleDesc, analyzer); - initAndSetWhereExpr(taskInfo.getWhereExpr(), this.desc, analyzer); - - createDefaultSmap(analyzer); - - if (taskInfo.getColumnSeparator() != null) { - String sep = taskInfo.getColumnSeparator().getSeparator(); - params.setColumnSeparatorStr(sep); - params.setColumnSeparatorLength(sep.getBytes(Charset.forName("UTF-8")).length); - params.setColumnSeparator(sep.getBytes(Charset.forName("UTF-8"))[0]); - } else { - params.setColumnSeparator((byte) '\t'); - params.setColumnSeparatorLength(1); - params.setColumnSeparatorStr("\t"); - } - if (taskInfo.getLineDelimiter() != null) { - String sep = taskInfo.getLineDelimiter().getSeparator(); - params.setLineDelimiterStr(sep); - params.setLineDelimiterLength(sep.getBytes(Charset.forName("UTF-8")).length); - params.setLineDelimiter(sep.getBytes(Charset.forName("UTF-8"))[0]); - } else { - params.setLineDelimiter((byte) '\n'); - params.setLineDelimiterLength(1); - } - params.setTrimDoubleQuotes(taskInfo.getTrimDoubleQuotes()); - params.setDestTupleId(desc.getId().asInt()); - brokerScanRange.setParams(params); - - brokerScanRange.setBrokerAddresses(Lists.newArrayList()); - computeStats(analyzer); - } - - @Override - public void finalize(Analyzer analyzer) throws UserException { - finalizeParams(slotDescByName, exprsByName, brokerScanRange.params, srcTupleDesc, - taskInfo.isStrictMode(), taskInfo.getNegative(), analyzer); - } - - @Override - public List getScanRangeLocations(long maxScanRangeLength) { - TScanRangeLocations locations = new TScanRangeLocations(); - TScanRange scanRange = new TScanRange(); - scanRange.setBrokerScanRange(brokerScanRange); - locations.setScanRange(scanRange); - locations.setLocations(Lists.newArrayList()); - return Lists.newArrayList(locations); - } - - @Override - public int getNumInstances() { - return 1; - } - - @Override - public String getNodeExplainString(String prefix, TExplainLevel detailLevel) { - return "StreamLoadScanNode"; - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/InsertStreamTxnExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/InsertStreamTxnExecutor.java index 033d013ecf..34340da8cf 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/InsertStreamTxnExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/InsertStreamTxnExecutor.java @@ -19,7 +19,6 @@ package org.apache.doris.qe; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.OlapTable; -import org.apache.doris.common.Config; import org.apache.doris.common.UserException; import org.apache.doris.planner.StreamLoadPlanner; import org.apache.doris.proto.InternalService; @@ -29,7 +28,6 @@ import org.apache.doris.rpc.RpcException; import org.apache.doris.system.Backend; import org.apache.doris.system.BeSelectionPolicy; import org.apache.doris.task.StreamLoadTask; -import org.apache.doris.thrift.TBrokerRangeDesc; import org.apache.doris.thrift.TExecPlanFragmentParams; import org.apache.doris.thrift.TExecPlanFragmentParamsList; import org.apache.doris.thrift.TFileCompressType; @@ -77,16 +75,10 @@ public class InsertStreamTxnExecutor { tRequest.setTxnConf(txnConf).setImportLabel(txnEntry.getLabel()); for (Map.Entry> entry : tRequest.params.per_node_scan_ranges.entrySet()) { for (TScanRangeParams scanRangeParams : entry.getValue()) { - if (Config.enable_new_load_scan_node && Config.enable_vectorized_load) { - scanRangeParams.scan_range.ext_scan_range.file_scan_range.params.setFormatType( - TFileFormatType.FORMAT_PROTO); - scanRangeParams.scan_range.ext_scan_range.file_scan_range.params.setCompressType( - TFileCompressType.PLAIN); - } else { - for (TBrokerRangeDesc desc : scanRangeParams.scan_range.broker_scan_range.ranges) { - desc.setFormatType(TFileFormatType.FORMAT_PROTO); - } - } + scanRangeParams.scan_range.ext_scan_range.file_scan_range.params.setFormatType( + TFileFormatType.FORMAT_PROTO); + scanRangeParams.scan_range.ext_scan_range.file_scan_range.params.setCompressType( + TFileCompressType.PLAIN); } } txnConf.setFragmentInstanceId(tRequest.params.fragment_instance_id); diff --git a/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/BrokerLoadJobTest.java b/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/BrokerLoadJobTest.java index 0b2dc7682a..526fad2fd7 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/BrokerLoadJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/load/loadv2/BrokerLoadJobTest.java @@ -43,7 +43,6 @@ import org.apache.doris.load.EtlStatus; import org.apache.doris.load.Load; import org.apache.doris.load.Source; import org.apache.doris.metric.MetricRepo; -import org.apache.doris.planner.BrokerScanNode; import org.apache.doris.planner.OlapTableSink; import org.apache.doris.planner.PlanFragment; import org.apache.doris.task.MasterTaskExecutor; @@ -344,8 +343,7 @@ public class BrokerLoadJobTest { @Injectable FileGroupAggKey aggKey, @Mocked OlapTable olapTable, @Mocked PlanFragment sinkFragment, - @Mocked OlapTableSink olapTableSink, - @Mocked BrokerScanNode scanNode) throws Exception { + @Mocked OlapTableSink olapTableSink) throws Exception { List schema = new ArrayList<>(); schema.add(new Column("a", PrimitiveType.BIGINT)); Map properties = new HashMap<>(); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/StreamLoadScanNodeTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/StreamLoadScanNodeTest.java deleted file mode 100644 index 30abd06ca8..0000000000 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/StreamLoadScanNodeTest.java +++ /dev/null @@ -1,824 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.planner; - -import org.apache.doris.analysis.Analyzer; -import org.apache.doris.analysis.CastExpr; -import org.apache.doris.analysis.DescriptorTable; -import org.apache.doris.analysis.FunctionName; -import org.apache.doris.analysis.SlotDescriptor; -import org.apache.doris.analysis.TupleDescriptor; -import org.apache.doris.catalog.AggregateType; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.Database; -import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.Function; -import org.apache.doris.catalog.FunctionSet; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.PrimitiveType; -import org.apache.doris.catalog.ScalarFunction; -import org.apache.doris.catalog.ScalarType; -import org.apache.doris.catalog.Type; -import org.apache.doris.common.AnalysisException; -import org.apache.doris.common.UserException; -import org.apache.doris.qe.ConnectContext; -import org.apache.doris.task.StreamLoadTask; -import org.apache.doris.thrift.TExplainLevel; -import org.apache.doris.thrift.TFileFormatType; -import org.apache.doris.thrift.TFileType; -import org.apache.doris.thrift.TPlanNode; -import org.apache.doris.thrift.TStreamLoadPutRequest; - -import com.google.common.collect.Lists; -import mockit.Expectations; -import mockit.Injectable; -import mockit.Mocked; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.junit.Assert; -import org.junit.Test; - -import java.util.List; - -public class StreamLoadScanNodeTest { - private static final Logger LOG = LogManager.getLogger(StreamLoadScanNodeTest.class); - - @Mocked - Env env; - - @Injectable - ConnectContext connectContext; - - @Injectable - Database db; - - @Injectable - OlapTable dstTable; - - @Mocked - CastExpr castExpr; - - TStreamLoadPutRequest getBaseRequest() { - TStreamLoadPutRequest request = new TStreamLoadPutRequest(); - request.setFileType(TFileType.FILE_STREAM); - request.setFormatType(TFileFormatType.FORMAT_CSV_PLAIN); - return request; - } - - List getBaseSchema() { - List columns = Lists.newArrayList(); - - Column k1 = new Column("k1", PrimitiveType.BIGINT); - k1.setIsKey(true); - k1.setIsAllowNull(false); - columns.add(k1); - - Column k2 = new Column("k2", ScalarType.createVarchar(25)); - k2.setIsKey(true); - k2.setIsAllowNull(true); - columns.add(k2); - - Column v1 = new Column("v1", PrimitiveType.BIGINT); - v1.setIsKey(false); - v1.setIsAllowNull(true); - v1.setAggregationType(AggregateType.SUM, false); - - columns.add(v1); - - Column v2 = new Column("v2", ScalarType.createVarchar(25)); - v2.setIsKey(false); - v2.setAggregationType(AggregateType.REPLACE, false); - v2.setIsAllowNull(false); - columns.add(v2); - - return columns; - } - - List getHllSchema() { - List columns = Lists.newArrayList(); - - Column k1 = new Column("k1", PrimitiveType.BIGINT); - k1.setIsKey(true); - k1.setIsAllowNull(false); - columns.add(k1); - - Column v1 = new Column("v1", PrimitiveType.HLL); - v1.setIsKey(false); - v1.setIsAllowNull(true); - v1.setAggregationType(AggregateType.HLL_UNION, false); - - columns.add(v1); - - return columns; - } - - List getSequenceColSchema() { - List columns = Lists.newArrayList(); - - Column k1 = new Column("k1", PrimitiveType.BIGINT); - k1.setIsKey(true); - k1.setIsAllowNull(false); - columns.add(k1); - - Column k2 = new Column("k2", ScalarType.createVarchar(25)); - k2.setIsKey(true); - k2.setIsAllowNull(true); - columns.add(k2); - - // sequence column, it's hidden column - Column sequenceCol = new Column(Column.SEQUENCE_COL, PrimitiveType.BIGINT); - sequenceCol.setIsKey(false); - sequenceCol.setAggregationType(AggregateType.REPLACE, false); - sequenceCol.setIsAllowNull(false); - sequenceCol.setIsVisible(false); - columns.add(sequenceCol); - - // sequence column, it's visible column for user, it's equals to the hidden column - Column visibleSequenceCol = new Column("visible_sequence_col", PrimitiveType.BIGINT); - visibleSequenceCol.setIsKey(false); - visibleSequenceCol.setAggregationType(AggregateType.REPLACE, false); - visibleSequenceCol.setIsAllowNull(true); - columns.add(visibleSequenceCol); - - Column v1 = new Column("v1", ScalarType.createVarchar(25)); - v1.setIsKey(false); - v1.setAggregationType(AggregateType.REPLACE, false); - v1.setIsAllowNull(false); - columns.add(v1); - - return columns; - } - - private StreamLoadScanNode getStreamLoadScanNode(TupleDescriptor dstDesc, TStreamLoadPutRequest request) - throws UserException { - StreamLoadTask streamLoadTask = StreamLoadTask.fromTStreamLoadPutRequest(request); - return new StreamLoadScanNode(streamLoadTask.getId(), new PlanNodeId(1), dstDesc, dstTable, streamLoadTask); - } - - @Test - public void testNormal() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - TStreamLoadPutRequest request = getBaseRequest(); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - new Expectations() { - { - dstTable.getBaseSchema(); - result = columns; - dstTable.getBaseSchema(anyBoolean); - result = columns; - dstTable.getFullSchema(); - result = columns; - dstTable.getColumn("k1"); - result = columns.get(0); - dstTable.getColumn("k2"); - result = columns.get(1); - dstTable.getColumn("v1"); - result = columns.get(2); - dstTable.getColumn("v2"); - result = columns.get(3); - } - }; - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - - Assert.assertEquals(1, scanNode.getNumInstances()); - Assert.assertEquals(1, scanNode.getScanRangeLocations(0).size()); - } - - @Test(expected = AnalysisException.class) - public void testLostV2() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1, k2, v1"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = AnalysisException.class) - public void testBadColumns() throws UserException, UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1 k2 v1"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test - public void testColumnsNormal() throws UserException, UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - dstTable.getColumn("k1"); - result = columns.stream().filter(c -> c.getName().equals("k1")).findFirst().get(); - - dstTable.getColumn("k2"); - result = columns.stream().filter(c -> c.getName().equals("k2")).findFirst().get(); - - dstTable.getColumn("v1"); - result = columns.stream().filter(c -> c.getName().equals("v1")).findFirst().get(); - - dstTable.getColumn("v2"); - result = columns.stream().filter(c -> c.getName().equals("v2")).findFirst().get(); - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1,k2,v1, v2=k2"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test - public void testHllColumnsNormal() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getHllSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - env.getFunction((Function) any, (Function.CompareMode) any); - result = new ScalarFunction(new FunctionName(FunctionSet.HLL_HASH), - Lists.newArrayList(), Type.BIGINT, false, true); - - dstTable.getColumn("k1"); - result = columns.stream().filter(c -> c.getName().equals("k1")).findFirst().get(); - - dstTable.getColumn("k2"); - result = null; - - dstTable.getColumn("v1"); - result = columns.stream().filter(c -> c.getName().equals("v1")).findFirst().get(); - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setFileType(TFileType.FILE_STREAM); - - request.setColumns("k1,k2, v1=" + FunctionSet.HLL_HASH + "(k2)"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = UserException.class) - public void testHllColumnsNoHllHash() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getHllSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - env.getFunction((Function) any, (Function.CompareMode) any); - result = new ScalarFunction(new FunctionName("hll_hash1"), Lists.newArrayList(), - Type.BIGINT, false, true); - minTimes = 0; - } - }; - - new Expectations() { - { - dstTable.getColumn("k1"); - result = columns.stream().filter(c -> c.getName().equals("k1")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("k2"); - result = null; - minTimes = 0; - - dstTable.getColumn("v1"); - result = columns.stream().filter(c -> c.getName().equals("v1")).findFirst().get(); - minTimes = 0; - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setFileType(TFileType.FILE_LOCAL); - request.setColumns("k1,k2, v1=hll_hash1(k2)"); - StreamLoadTask.fromTStreamLoadPutRequest(request); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = UserException.class) - public void testHllColumnsFail() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getHllSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - TStreamLoadPutRequest request = getBaseRequest(); - request.setFileType(TFileType.FILE_LOCAL); - request.setColumns("k1,k2, v1=k2"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = UserException.class) - public void testUnsupportedFType() throws UserException, UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - TStreamLoadPutRequest request = getBaseRequest(); - request.setFileType(TFileType.FILE_BROKER); - request.setColumns("k1,k2,v1, v2=k2"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = UserException.class) - public void testColumnsUnknownRef() throws UserException, UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - dstTable.getBaseSchema(); - minTimes = 0; - result = columns; - dstTable.getBaseSchema(anyBoolean); - minTimes = 0; - result = columns; - dstTable.getFullSchema(); - minTimes = 0; - result = columns; - dstTable.getColumn("k1"); - minTimes = 0; - result = columns.get(0); - dstTable.getColumn("k2"); - minTimes = 0; - result = columns.get(1); - dstTable.getColumn("v1"); - minTimes = 0; - result = columns.get(2); - dstTable.getColumn("v2"); - minTimes = 0; - result = columns.get(3); - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1,k2,v1, v2=k3"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test - public void testWhereNormal() throws UserException, UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - dstTable.getColumn("k1"); - result = columns.stream().filter(c -> c.getName().equals("k1")).findFirst().get(); - - dstTable.getColumn("k2"); - result = columns.stream().filter(c -> c.getName().equals("k2")).findFirst().get(); - - dstTable.getColumn("v1"); - result = columns.stream().filter(c -> c.getName().equals("v1")).findFirst().get(); - - dstTable.getColumn("v2"); - result = columns.stream().filter(c -> c.getName().equals("v2")).findFirst().get(); - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1,k2,v1, v2=k1"); - request.setWhere("k1 = 1"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = AnalysisException.class) - public void testWhereBad() throws UserException, UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - dstTable.getColumn("k1"); - result = columns.stream().filter(c -> c.getName().equals("k1")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("k2"); - result = columns.stream().filter(c -> c.getName().equals("k2")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("v1"); - result = columns.stream().filter(c -> c.getName().equals("v1")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("v2"); - result = columns.stream().filter(c -> c.getName().equals("v2")).findFirst().get(); - minTimes = 0; - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1,k2,v1, v2=k2"); - request.setWhere("k1 1"); - StreamLoadTask streamLoadTask = StreamLoadTask.fromTStreamLoadPutRequest(request); - StreamLoadScanNode scanNode = new StreamLoadScanNode(streamLoadTask.getId(), new PlanNodeId(1), - dstDesc, dstTable, streamLoadTask); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = UserException.class) - public void testWhereUnknownRef() throws UserException, UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - dstTable.getBaseSchema(); - minTimes = 0; - result = columns; - dstTable.getBaseSchema(anyBoolean); - minTimes = 0; - result = columns; - dstTable.getFullSchema(); - minTimes = 0; - result = columns; - dstTable.getColumn("k1"); - minTimes = 0; - result = columns.get(0); - dstTable.getColumn("k2"); - minTimes = 0; - result = columns.get(1); - dstTable.getColumn("v1"); - minTimes = 0; - result = columns.get(2); - dstTable.getColumn("v2"); - minTimes = 0; - result = columns.get(3); - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1,k2,v1, v2=k1"); - request.setWhere("k5 = 1"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test(expected = UserException.class) - public void testWhereNotBool() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getBaseSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - dstTable.getBaseSchema(); - minTimes = 0; - result = columns; - dstTable.getBaseSchema(anyBoolean); - minTimes = 0; - result = columns; - dstTable.getFullSchema(); - minTimes = 0; - result = columns; - dstTable.getColumn("k1"); - minTimes = 0; - result = columns.get(0); - dstTable.getColumn("k2"); - minTimes = 0; - result = columns.get(1); - dstTable.getColumn("v1"); - minTimes = 0; - result = columns.get(2); - dstTable.getColumn("v2"); - minTimes = 0; - result = columns.get(3); - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1,k2,v1, v2=k1"); - request.setWhere("k1 + v2"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test - public void testSequenceColumnWithSetColumns() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getSequenceColSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - System.out.println(column); - slot.setColumn(column); - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - db.getTableNullable(anyInt); - result = dstTable; - minTimes = 0; - dstTable.hasSequenceCol(); - result = true; - - dstTable.getColumn("k1"); - result = columns.stream().filter(c -> c.getName().equals("k1")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("k2"); - result = columns.stream().filter(c -> c.getName().equals("k2")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn(Column.SEQUENCE_COL); - result = columns.stream().filter(c -> c.getName().equals(Column.SEQUENCE_COL)).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("visible_sequence_col"); - result = columns.stream().filter(c -> c.getName().equals("visible_sequence_col")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("v1"); - result = columns.stream().filter(c -> c.getName().equals("v1")).findFirst().get(); - minTimes = 0; - // there is no "source_sequence" column in the Table - dstTable.getColumn("source_sequence"); - result = null; - minTimes = 0; - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setColumns("k1,k2,source_sequence,v1"); - request.setFileType(TFileType.FILE_STREAM); - request.setSequenceCol("source_sequence"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } - - @Test - public void testSequenceColumnWithoutSetColumns() throws UserException { - Analyzer analyzer = new Analyzer(env, connectContext); - DescriptorTable descTbl = analyzer.getDescTbl(); - - List columns = getSequenceColSchema(); - TupleDescriptor dstDesc = descTbl.createTupleDescriptor("DstTableDesc"); - for (Column column : columns) { - SlotDescriptor slot = descTbl.addSlotDescriptor(dstDesc); - slot.setColumn(column); - - slot.setIsMaterialized(true); - slot.setIsNullable(column.isAllowNull()); - } - - new Expectations() { - { - db.getTableNullable(anyInt); - result = dstTable; - minTimes = 0; - dstTable.hasSequenceCol(); - result = true; - - dstTable.getBaseSchema(anyBoolean); - result = columns; - dstTable.getFullSchema(); - result = columns; - - dstTable.getColumn("k1"); - result = columns.stream().filter(c -> c.getName().equals("k1")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("k2"); - result = columns.stream().filter(c -> c.getName().equals("k2")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn(Column.SEQUENCE_COL); - result = columns.stream().filter(c -> c.getName().equals(Column.SEQUENCE_COL)).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("visible_sequence_col"); - result = columns.stream().filter(c -> c.getName().equals("visible_sequence_col")).findFirst().get(); - minTimes = 0; - - dstTable.getColumn("v1"); - result = columns.stream().filter(c -> c.getName().equals("v1")).findFirst().get(); - minTimes = 0; - - dstTable.hasSequenceCol(); - result = true; - minTimes = 0; - } - }; - - TStreamLoadPutRequest request = getBaseRequest(); - request.setFileType(TFileType.FILE_STREAM); - request.setSequenceCol("visible_sequence_col"); - StreamLoadScanNode scanNode = getStreamLoadScanNode(dstDesc, request); - - scanNode.init(analyzer); - scanNode.finalize(analyzer); - scanNode.getNodeExplainString("", TExplainLevel.NORMAL); - TPlanNode planNode = new TPlanNode(); - scanNode.toThrift(planNode); - } -}