// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "csv_scan_node.h" #include #include #include #include "exec/text_converter.hpp" #include "exprs/hll_hash_function.h" #include "gen_cpp/PlanNodes_types.h" #include "runtime/runtime_state.h" #include "runtime/row_batch.h" #include "runtime/string_value.h" #include "runtime/tuple_row.h" #include "util/file_utils.h" #include "util/runtime_profile.h" #include "util/debug_util.h" #include "util/hash_util.hpp" #include "olap/olap_common.h" #include "olap/utils.h" namespace doris { class StringRef { public: StringRef(char const* const begin, int const size) : _begin(begin), _size(size) { } ~StringRef() { // No need to delete _begin, because it only record the index in a std::string. // The c-string will be released along with the std::string object. } int size() const { return _size; } int length() const { return _size; } char const* c_str() const { return _begin; } char const* begin() const { return _begin; } char const* end() const { return _begin + _size; } private: char const* _begin; int _size; }; void split_line(const std::string& str, char delimiter, std::vector& result) { enum State { IN_DELIM = 1, IN_TOKEN = 0 }; // line-begin char and line-end char are considered to be 'delimeter' State state = IN_DELIM; char const* p_begin = str.c_str(); // Begin of either a token or a delimiter for (string::const_iterator it = str.begin(); it != str.end(); ++it) { State const new_state = (*it == delimiter? IN_DELIM : IN_TOKEN); if (new_state != state) { if (new_state == IN_DELIM) { result.push_back(StringRef(p_begin, &*it - p_begin)); } p_begin = &*it; } else if (new_state == IN_DELIM) { result.push_back(StringRef(&*p_begin, 0)); p_begin = &*it; } state = new_state; } result.push_back(StringRef(p_begin, (&*str.end() - p_begin) - state)); } CsvScanNode::CsvScanNode( ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : ScanNode(pool, tnode, descs), _tuple_id(tnode.csv_scan_node.tuple_id), _file_paths(tnode.csv_scan_node.file_paths), _column_separator(tnode.csv_scan_node.column_separator), _column_type_map(tnode.csv_scan_node.column_type_mapping), _column_function_map(tnode.csv_scan_node.column_function_mapping), _columns(tnode.csv_scan_node.columns), _unspecified_columns(tnode.csv_scan_node.unspecified_columns), _default_values(tnode.csv_scan_node.default_values), _is_init(false), _tuple_desc(nullptr), _tuple_pool(nullptr), _text_converter(nullptr), _hll_column_num(0) { // do nothing LOG(INFO) << "csv scan node: " << apache::thrift::ThriftDebugString(tnode).c_str(); } CsvScanNode::~CsvScanNode() { // do nothing } Status CsvScanNode::init(const TPlanNode& tnode) { return ExecNode::init(tnode); } Status CsvScanNode::prepare(RuntimeState* state) { VLOG(1) << "CsvScanNode::Prepare"; if (_is_init) { return Status::OK(); } if (nullptr == state) { return Status::InternalError("input runtime_state pointer is nullptr."); } RETURN_IF_ERROR(ScanNode::prepare(state)); // add timer _split_check_timer = ADD_TIMER(_runtime_profile, "split check timer"); _split_line_timer = ADD_TIMER(_runtime_profile, "split line timer"); _tuple_desc = state->desc_tbl().get_tuple_descriptor(_tuple_id); if (nullptr == _tuple_desc) { return Status::InternalError("Failed to get tuple descriptor."); } _slot_num = _tuple_desc->slots().size(); const OlapTableDescriptor* csv_table = static_cast(_tuple_desc->table_desc()); if (nullptr == csv_table) { return Status::InternalError("csv table pointer is nullptr."); } // for (int i = 0; i < _slot_num; ++i) { SlotDescriptor* slot = _tuple_desc->slots()[i]; const std::string& column_name = slot->col_name(); if (slot->type().type == TYPE_HLL) { TMiniLoadEtlFunction& function = _column_function_map[column_name]; if (check_hll_function(function) == false) { return Status::InternalError("Function name or param error."); } _hll_column_num++; } // NOTE: not all the columns in '_columns' is exist in table schema if (_columns.end() != std::find(_columns.begin(), _columns.end(), column_name)) { _column_slot_map[column_name] = slot; } else { _column_slot_map[column_name] = nullptr; } // add 'unspecified_columns' which have default values if (_unspecified_columns.end() != std::find( _unspecified_columns.begin(), _unspecified_columns.end(), column_name)) { _column_slot_map[column_name] = slot; } } _column_type_vec.resize(_columns.size()); for (int i = 0; i < _columns.size(); ++i) { const std::string& column_name = _columns[i]; SlotDescriptor* slot = _column_slot_map[column_name]; _column_slot_vec.push_back(slot); if (slot != nullptr) { _column_type_vec[i] = _column_type_map[column_name]; } } for (int i = 0; i < _default_values.size(); ++i) { const std::string& column_name = _unspecified_columns[i]; SlotDescriptor* slot = _column_slot_map[column_name]; _unspecified_colomn_slot_vec.push_back(slot); _unspecified_colomn_type_vec.push_back(_column_type_map[column_name]); } // new one scanner _csv_scanner.reset(new(std::nothrow) CsvScanner(_file_paths)); if (_csv_scanner.get() == nullptr) { return Status::InternalError("new a csv scanner failed."); } _tuple_pool.reset(new(std::nothrow) MemPool(state->instance_mem_tracker())); if (_tuple_pool.get() == nullptr) { return Status::InternalError("new a mem pool failed."); } _text_converter.reset(new(std::nothrow) TextConverter('\\')); if (_text_converter.get() == nullptr) { return Status::InternalError("new a text convertor failed."); } _is_init = true; return Status::OK(); } Status CsvScanNode::open(RuntimeState* state) { RETURN_IF_ERROR(ExecNode::open(state)); VLOG(1) << "CsvScanNode::Open"; if (nullptr == state) { return Status::InternalError("input pointer is nullptr."); } if (!_is_init) { return Status::InternalError("used before initialize."); } _runtime_state = state; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::OPEN)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(_csv_scanner->open()); return Status::OK(); } Status CsvScanNode::get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) { VLOG(1) << "CsvScanNode::GetNext"; if (nullptr == state || nullptr == row_batch || nullptr == eos) { return Status::InternalError("input is nullptr pointer"); } if (!_is_init) { return Status::InternalError("used before initialize."); } RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::GETNEXT)); RETURN_IF_CANCELLED(state); SCOPED_TIMER(_runtime_profile->total_time_counter()); SCOPED_TIMER(materialize_tuple_timer()); if (reached_limit()) { *eos = true; return Status::OK(); } // create new tuple buffer for row_batch int tuple_buffer_size = row_batch->capacity() * _tuple_desc->byte_size(); void* tuple_buffer = _tuple_pool->allocate(tuple_buffer_size); if (nullptr == tuple_buffer) { return Status::InternalError("Allocate memory failed."); } _tuple = reinterpret_cast(tuple_buffer); memset(_tuple, 0, _tuple_desc->num_null_bytes()); // Indicates whether there are more rows to process. bool csv_eos = false; // NOTE: not like Mysql, we need check correctness. while (!csv_eos) { RETURN_IF_CANCELLED(state); if (reached_limit() || row_batch->is_full()) { // hang on to last allocated chunk in pool, we'll keep writing into it in the // next get_next() call row_batch->tuple_data_pool()->acquire_data(_tuple_pool.get(), !reached_limit()); *eos = reached_limit(); return Status::OK(); } // read csv std::string line; RETURN_IF_ERROR(_csv_scanner->get_next_row(&line, &csv_eos)); //VLOG_ROW << "line readed: [" << line << "]"; if (line.empty()) { continue; } // split & check line & fill default value bool is_success = split_check_fill(line, state); ++_num_rows_load_total; if (!is_success) { ++_num_rows_load_filtered; continue; } int row_idx = row_batch->add_row(); TupleRow* row = row_batch->get_row(row_idx); // scan node is the first tuple of tuple row row->set_tuple(0, _tuple); { row_batch->commit_last_row(); ++_num_rows_returned; COUNTER_SET(_rows_returned_counter, _num_rows_returned); char* new_tuple = reinterpret_cast(_tuple); new_tuple += _tuple_desc->byte_size(); _tuple = reinterpret_cast(new_tuple); } } state->update_num_rows_load_total(_num_rows_load_total); state->update_num_rows_load_filtered(_num_rows_load_filtered); VLOG_ROW << "normal_row_number: " << state->num_rows_load_success() << "; error_row_number: " << state->num_rows_load_filtered() << std::endl; row_batch->tuple_data_pool()->acquire_data(_tuple_pool.get(), false); *eos = csv_eos; return Status::OK(); } Status CsvScanNode::close(RuntimeState* state) { if (is_closed()) { return Status::OK(); } VLOG(1) << "CsvScanNode::Close"; RETURN_IF_ERROR(exec_debug_action(TExecNodePhase::CLOSE)); SCOPED_TIMER(_runtime_profile->total_time_counter()); if (memory_used_counter() != nullptr && _tuple_pool.get() != nullptr) { COUNTER_UPDATE(memory_used_counter(), _tuple_pool->peak_allocated_bytes()); } RETURN_IF_ERROR(ExecNode::close(state)); if (state->num_rows_load_success() == 0) { std::stringstream error_msg; error_msg << "Read zero normal line file. "; state->append_error_msg_to_file("", error_msg.str(), true); return Status::InternalError(error_msg.str()); } // only write summary line if there are error lines if (_num_rows_load_filtered > 0) { // Summary normal line and error line number info std::stringstream summary_msg; summary_msg << "error line: " << _num_rows_load_filtered << "; normal line: " << state->num_rows_load_success(); state->append_error_msg_to_file("", summary_msg.str(), true); } return Status::OK(); } void CsvScanNode::debug_string(int indentation_level, stringstream* out) const { *out << string(indentation_level * 2, ' '); *out << "csvScanNode(tupleid=" << _tuple_id; *out << ")" << std::endl; for (int i = 0; i < _children.size(); ++i) { _children[i]->debug_string(indentation_level + 1, out); } } Status CsvScanNode::set_scan_ranges(const vector& scan_ranges) { return Status::OK(); } void CsvScanNode::fill_fix_length_string( const char* value, const int value_length, MemPool* pool, char** new_value_p, const int new_value_length) { if (new_value_length != 0 && value_length < new_value_length) { DCHECK(pool != nullptr); *new_value_p = reinterpret_cast(pool->allocate(new_value_length)); // 'value' is guaranteed not to be nullptr memcpy(*new_value_p, value, value_length); for (int i = value_length; i < new_value_length; ++i) { (*new_value_p)[i] = '\0'; } VLOG_ROW << "Fill fix length string. " << "value: [" << std::string(value, value_length) << "]; " << "value_length: " << value_length << "; " << "*new_value_p: [" << *new_value_p << "]; " << "new value length: " << new_value_length << std::endl; } } // Following format are included. // .123 1.23 123. -1.23 // ATTN: The decimal point and (for negative numbers) the "-" sign are not counted. // like '.123', it will be regarded as '0.123', but it match decimal(3, 3) bool CsvScanNode::check_decimal_input( const char* value, const int value_length, const int precision, const int scale, std::stringstream* error_msg) { if (value_length > (precision + 2)) { (*error_msg) << "the length of decimal value is overflow. " << "precision in schema: (" << precision << ", " << scale << "); " << "value: [" << std::string(value, value_length) << "]; " << "str actual length: " << value_length << ";"; return false; } // ignore leading spaces and trailing spaces int begin_index = 0; while (begin_index < value_length && std::isspace(value[begin_index])) { ++begin_index; } int end_index = value_length - 1; while (end_index >= begin_index && std::isspace(value[end_index])) { --end_index; } if (value[begin_index] == '+' || value[begin_index] == '-') { ++begin_index; } int point_index = -1; for (int i = begin_index; i <= end_index; ++i) { if (value[i] == '.') { point_index = i; } } int value_int_len = 0; int value_frac_len = 0; value_int_len = point_index - begin_index; value_frac_len = end_index- point_index; if (point_index == -1) { // an int value: like 123 value_int_len = end_index - begin_index + 1; value_frac_len = 0; } else { value_int_len = point_index - begin_index; value_frac_len = end_index- point_index; } if (value_int_len > (precision - scale)) { (*error_msg) << "the int part length longer than schema precision [" << precision << "]. " << "value [" << std::string(value, value_length) << "]. "; return false; } else if (value_frac_len > scale) { (*error_msg) << "the frac part length longer than schema scale [" << scale << "]. " << "value [" << std::string(value, value_length) << "]. "; return false; } return true; } static bool is_null(const char* value, int value_length) { return value_length == 2 && value[0] == '\\' && value[1] == 'N'; } // Writes a slot in _tuple from an value containing text data. bool CsvScanNode::check_and_write_text_slot( const std::string& column_name, const TColumnType& column_type, const char* value, int value_length, const SlotDescriptor* slot, RuntimeState* state, std::stringstream* error_msg) { if (value_length == 0 && !slot->type().is_string_type()) { (*error_msg) << "the length of input should not be 0. " << "column_name: " << column_name << "; " << "type: " << slot->type() << "; " << "input_str: [" << std::string(value, value_length) << "]."; return false; } if (is_null(value, value_length)) { if (slot->is_nullable()) { _tuple->set_null(slot->null_indicator_offset()); return true; } else { (*error_msg) << "value cannot be null. column name: " << column_name << "; type: " << slot->type() << "; input_str: [" << std::string(value, value_length) << "]."; return false; } } if (!slot->is_nullable() && is_null(value, value_length)) { (*error_msg) << "value cannot be null. column name: " << column_name << "; type: " << slot->type() << "; input_str: [" << std::string(value, value_length) << "]."; return false; } char* value_to_convert = const_cast(value); int value_to_convert_length = value_length; // Fill all the spaces if it is 'TYPE_CHAR' type if (slot->type().is_string_type()) { int char_len = column_type.len; if (slot->type().type != TYPE_HLL && value_length > char_len) { (*error_msg) << "the length of input is too long than schema. " << "column_name: " << column_name << "; " << "input_str: [" << std::string(value, value_length) << "] " << "type: " << slot->type() << "; " << "schema length: " << char_len << "; " << "actual length: " << value_length << "; "; return false; } if (slot->type().type == TYPE_CHAR && value_length < char_len) { fill_fix_length_string( value, value_length, _tuple_pool.get(), &value_to_convert, char_len); value_to_convert_length = char_len; } } else if (slot->type().is_decimal_type()) { int precision = column_type.precision; int scale = column_type.scale; bool is_success = check_decimal_input(value, value_length, precision, scale, error_msg); if (is_success == false) { return false; } } if (!_text_converter->write_slot(slot, _tuple, value_to_convert, value_to_convert_length, true, false, _tuple_pool.get())) { (*error_msg) << "convert csv string to " << slot->type() << " failed. " << "column_name: " << column_name << "; " << "input_str: [" << std::string(value, value_length) << "]; "; return false; } return true; } bool CsvScanNode::split_check_fill(const std::string& line, RuntimeState* state) { SCOPED_TIMER(_split_check_timer); std::stringstream error_msg; // std::vector fields; std::vector fields; { SCOPED_TIMER(_split_line_timer); split_line(line, _column_separator[0], fields); } if (_hll_column_num == 0 && fields.size() < _columns.size()) { error_msg << "actual column number is less than schema column number. " << "actual number: " << fields.size() << " ," << "schema number: " << _columns.size() << "; "; _runtime_state->append_error_msg_to_file(line, error_msg.str()); return false; } else if (_hll_column_num == 0 && fields.size() > _columns.size()) { error_msg << "actual column number is more than schema column number. " << "actual number: " << fields.size() << " ," << "schema number: " << _columns.size() << "; "; _runtime_state->append_error_msg_to_file(line, error_msg.str()); return false; } for (int i = 0; i < _columns.size(); ++i) { const std::string& column_name = _columns[i]; const SlotDescriptor* slot = _column_slot_vec[i]; // ignore unspecified columns if (slot == nullptr) { continue; } if (!slot->is_materialized()) { continue; } if (slot->type().type == TYPE_HLL) { continue; } const TColumnType& column_type = _column_type_vec[i]; bool flag = check_and_write_text_slot( column_name, column_type, fields[i].c_str(), fields[i].length(), slot, state, &error_msg); if (flag == false) { _runtime_state->append_error_msg_to_file(line, error_msg.str()); return false; } } for (int i = 0; i < _unspecified_columns.size(); ++i) { const std::string& column_name = _unspecified_columns[i]; const SlotDescriptor* slot = _unspecified_colomn_slot_vec[i]; if (slot == nullptr) { continue; } if (!slot->is_materialized()) { continue; } if (slot->type().type == TYPE_HLL) { continue; } const TColumnType& column_type = _unspecified_colomn_type_vec[i]; bool flag = check_and_write_text_slot( column_name, column_type, _default_values[i].c_str(), _default_values[i].length(), slot, state, &error_msg); if (flag == false) { _runtime_state->append_error_msg_to_file(line, error_msg.str()); return false; } } for (std::map::iterator iter = _column_function_map.begin(); iter != _column_function_map.end(); iter++) { TMiniLoadEtlFunction& function = iter->second; const std::string& column_name = iter->first; const SlotDescriptor* slot = _column_slot_map[column_name]; const TColumnType& column_type = _column_type_map[column_name]; std::string column_string = ""; const char* src = fields[function.param_column_index].c_str(); int src_column_len = fields[function.param_column_index].length(); hll_hash(src, src_column_len, &column_string); bool flag = check_and_write_text_slot( column_name, column_type, column_string.c_str(), column_string.length(), slot, state, &error_msg); if (flag == false) { _runtime_state->append_error_msg_to_file(line, error_msg.str()); return false; } } return true; } bool CsvScanNode::check_hll_function(TMiniLoadEtlFunction& function) { if (function.function_name.empty() || function.function_name != "hll_hash" || function.param_column_index < 0) { return false; } return true; } void CsvScanNode::hll_hash(const char* src, int len, std::string* result) { std::string str(src, len); if (str != "\\N") { uint64_t hash = HashUtil::murmur_hash64A(src, len, HashUtil::MURMUR_SEED); char buf[10]; // expliclit set buf[0] = HLL_DATA_EXPLICIT; buf[1] = 1; *((uint64_t*)(buf + 2)) = hash; *result = std::string(buf, sizeof(buf)); } else { char buf[1]; // empty set buf[0] = HLL_DATA_EMPTY; *result = std::string(buf, sizeof(buf)); } } } // end namespace doris