Add new lib, Backend can read data from hdfs without broker, this patch include libhdfs3.a which can read file on hdfs. This patch will make reading the data from hdfs with parquet possible. By this, we will support more format of file on hdfs in the future, and we will support other metadata in the future.
508 lines
17 KiB
C++
508 lines
17 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "exec/broker_scanner.h"
|
|
|
|
#include <iostream>
|
|
#include <sstream>
|
|
|
|
#include "exec/broker_reader.h"
|
|
#include "exec/decompressor.h"
|
|
#include "exec/exec_node.h"
|
|
#include "exec/local_file_reader.h"
|
|
#include "exec/plain_text_line_reader.h"
|
|
#include "exec/s3_reader.h"
|
|
#include "exec/text_converter.h"
|
|
#include "exec/text_converter.hpp"
|
|
#include "exprs/expr.h"
|
|
#include "runtime/descriptors.h"
|
|
#include "runtime/exec_env.h"
|
|
#include "runtime/mem_tracker.h"
|
|
#include "runtime/raw_value.h"
|
|
#include "runtime/stream_load/load_stream_mgr.h"
|
|
#include "runtime/stream_load/stream_load_pipe.h"
|
|
#include "runtime/tuple.h"
|
|
#include "exprs/expr.h"
|
|
#include "exec/text_converter.h"
|
|
#include "exec/text_converter.hpp"
|
|
#include "exec/plain_text_line_reader.h"
|
|
#include "exec/hdfs_file_reader.h"
|
|
#include "exec/local_file_reader.h"
|
|
#include "exec/broker_reader.h"
|
|
#include "exec/decompressor.h"
|
|
#include "util/utf8_check.h"
|
|
|
|
namespace doris {
|
|
|
|
BrokerScanner::BrokerScanner(RuntimeState* state, RuntimeProfile* profile,
|
|
const TBrokerScanRangeParams& params,
|
|
const std::vector<TBrokerRangeDesc>& ranges,
|
|
const std::vector<TNetworkAddress>& broker_addresses,
|
|
const std::vector<ExprContext*>& pre_filter_ctxs,
|
|
ScannerCounter* counter)
|
|
: BaseScanner(state, profile, params, pre_filter_ctxs, counter),
|
|
_ranges(ranges),
|
|
_broker_addresses(broker_addresses),
|
|
_cur_file_reader(nullptr),
|
|
_cur_line_reader(nullptr),
|
|
_cur_decompressor(nullptr),
|
|
_next_range(0),
|
|
_cur_line_reader_eof(false),
|
|
_scanner_eof(false),
|
|
_skip_next_line(false) {
|
|
if (params.__isset.column_separator_length && params.column_separator_length > 1) {
|
|
_value_separator = params.column_separator_str;
|
|
_value_separator_length = params.column_separator_length;
|
|
} else {
|
|
_value_separator.push_back(static_cast<char>(params.column_separator));
|
|
_value_separator_length = 1;
|
|
}
|
|
if (params.__isset.line_delimiter_length && params.line_delimiter_length > 1) {
|
|
_line_delimiter = params.line_delimiter_str;
|
|
_line_delimiter_length = params.line_delimiter_length;
|
|
} else {
|
|
_line_delimiter.push_back(static_cast<char>(params.line_delimiter));
|
|
_line_delimiter_length = 1;
|
|
}
|
|
}
|
|
|
|
BrokerScanner::~BrokerScanner() {
|
|
close();
|
|
}
|
|
|
|
Status BrokerScanner::open() {
|
|
RETURN_IF_ERROR(BaseScanner::open()); // base default function
|
|
_text_converter.reset(new (std::nothrow) TextConverter('\\'));
|
|
if (_text_converter == nullptr) {
|
|
return Status::InternalError("No memory error.");
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
Status BrokerScanner::get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof) {
|
|
SCOPED_TIMER(_read_timer);
|
|
// Get one line
|
|
while (!_scanner_eof) {
|
|
if (_cur_line_reader == nullptr || _cur_line_reader_eof) {
|
|
RETURN_IF_ERROR(open_next_reader());
|
|
// If there isn't any more reader, break this
|
|
if (_scanner_eof) {
|
|
continue;
|
|
}
|
|
}
|
|
const uint8_t* ptr = nullptr;
|
|
size_t size = 0;
|
|
RETURN_IF_ERROR(_cur_line_reader->read_line(&ptr, &size, &_cur_line_reader_eof));
|
|
if (_skip_next_line) {
|
|
_skip_next_line = false;
|
|
continue;
|
|
}
|
|
if (size == 0) {
|
|
// Read empty row, just continue
|
|
continue;
|
|
}
|
|
{
|
|
COUNTER_UPDATE(_rows_read_counter, 1);
|
|
SCOPED_TIMER(_materialize_timer);
|
|
if (convert_one_row(Slice(ptr, size), tuple, tuple_pool)) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (_scanner_eof) {
|
|
*eof = true;
|
|
} else {
|
|
*eof = false;
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
Status BrokerScanner::open_next_reader() {
|
|
if (_next_range >= _ranges.size()) {
|
|
_scanner_eof = true;
|
|
return Status::OK();
|
|
}
|
|
|
|
RETURN_IF_ERROR(open_file_reader());
|
|
RETURN_IF_ERROR(open_line_reader());
|
|
_next_range++;
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
Status BrokerScanner::open_file_reader() {
|
|
if (_cur_file_reader != nullptr) {
|
|
if (_stream_load_pipe != nullptr) {
|
|
_stream_load_pipe.reset();
|
|
_cur_file_reader = nullptr;
|
|
} else {
|
|
delete _cur_file_reader;
|
|
_cur_file_reader = nullptr;
|
|
}
|
|
}
|
|
|
|
const TBrokerRangeDesc& range = _ranges[_next_range];
|
|
int64_t start_offset = range.start_offset;
|
|
if (start_offset != 0) {
|
|
start_offset -= 1;
|
|
}
|
|
switch (range.file_type) {
|
|
case TFileType::FILE_LOCAL: {
|
|
LocalFileReader* file_reader = new LocalFileReader(range.path, start_offset);
|
|
RETURN_IF_ERROR(file_reader->open());
|
|
_cur_file_reader = file_reader;
|
|
break;
|
|
}
|
|
case TFileType::FILE_HDFS: {
|
|
HdfsFileReader* file_reader = new HdfsFileReader(
|
|
range.hdfs_params, range.path, start_offset);
|
|
RETURN_IF_ERROR(file_reader->open());
|
|
_cur_file_reader = file_reader;
|
|
break;
|
|
}
|
|
case TFileType::FILE_BROKER: {
|
|
BrokerReader* broker_reader =
|
|
new BrokerReader(_state->exec_env(), _broker_addresses, _params.properties,
|
|
range.path, start_offset);
|
|
RETURN_IF_ERROR(broker_reader->open());
|
|
_cur_file_reader = broker_reader;
|
|
break;
|
|
}
|
|
case TFileType::FILE_S3: {
|
|
S3Reader* s3_reader = new S3Reader(_params.properties, range.path, start_offset);
|
|
RETURN_IF_ERROR(s3_reader->open());
|
|
_cur_file_reader = s3_reader;
|
|
break;
|
|
}
|
|
case TFileType::FILE_STREAM: {
|
|
_stream_load_pipe = _state->exec_env()->load_stream_mgr()->get(range.load_id);
|
|
if (_stream_load_pipe == nullptr) {
|
|
VLOG_NOTICE << "unknown stream load id: " << UniqueId(range.load_id);
|
|
return Status::InternalError("unknown stream load id");
|
|
}
|
|
_cur_file_reader = _stream_load_pipe.get();
|
|
break;
|
|
}
|
|
default: {
|
|
std::stringstream ss;
|
|
ss << "Unknown file type, type=" << range.file_type;
|
|
return Status::InternalError(ss.str());
|
|
}
|
|
}
|
|
return Status::OK();
|
|
}
|
|
|
|
Status BrokerScanner::create_decompressor(TFileFormatType::type type) {
|
|
if (_cur_decompressor != nullptr) {
|
|
delete _cur_decompressor;
|
|
_cur_decompressor = nullptr;
|
|
}
|
|
|
|
CompressType compress_type;
|
|
switch (type) {
|
|
case TFileFormatType::FORMAT_CSV_PLAIN:
|
|
case TFileFormatType::FORMAT_JSON:
|
|
compress_type = CompressType::UNCOMPRESSED;
|
|
break;
|
|
case TFileFormatType::FORMAT_CSV_GZ:
|
|
compress_type = CompressType::GZIP;
|
|
break;
|
|
case TFileFormatType::FORMAT_CSV_BZ2:
|
|
compress_type = CompressType::BZIP2;
|
|
break;
|
|
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
|
|
compress_type = CompressType::LZ4FRAME;
|
|
break;
|
|
case TFileFormatType::FORMAT_CSV_LZOP:
|
|
compress_type = CompressType::LZOP;
|
|
break;
|
|
case TFileFormatType::FORMAT_CSV_DEFLATE:
|
|
compress_type = CompressType::DEFLATE;
|
|
break;
|
|
default: {
|
|
std::stringstream ss;
|
|
ss << "Unknown format type, cannot inference compress type, type=" << type;
|
|
return Status::InternalError(ss.str());
|
|
}
|
|
}
|
|
RETURN_IF_ERROR(Decompressor::create_decompressor(compress_type, &_cur_decompressor));
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
Status BrokerScanner::open_line_reader() {
|
|
if (_cur_decompressor != nullptr) {
|
|
delete _cur_decompressor;
|
|
_cur_decompressor = nullptr;
|
|
}
|
|
|
|
if (_cur_line_reader != nullptr) {
|
|
delete _cur_line_reader;
|
|
_cur_line_reader = nullptr;
|
|
}
|
|
|
|
const TBrokerRangeDesc& range = _ranges[_next_range];
|
|
int64_t size = range.size;
|
|
if (range.start_offset != 0) {
|
|
if (range.format_type != TFileFormatType::FORMAT_CSV_PLAIN) {
|
|
std::stringstream ss;
|
|
ss << "For now we do not support split compressed file";
|
|
return Status::InternalError(ss.str());
|
|
}
|
|
size += 1;
|
|
_skip_next_line = true;
|
|
} else {
|
|
_skip_next_line = false;
|
|
}
|
|
|
|
// create decompressor.
|
|
// _decompressor may be NULL if this is not a compressed file
|
|
RETURN_IF_ERROR(create_decompressor(range.format_type));
|
|
|
|
// open line reader
|
|
switch (range.format_type) {
|
|
case TFileFormatType::FORMAT_CSV_PLAIN:
|
|
case TFileFormatType::FORMAT_CSV_GZ:
|
|
case TFileFormatType::FORMAT_CSV_BZ2:
|
|
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
|
|
case TFileFormatType::FORMAT_CSV_LZOP:
|
|
case TFileFormatType::FORMAT_CSV_DEFLATE:
|
|
_cur_line_reader = new PlainTextLineReader(_profile, _cur_file_reader, _cur_decompressor,
|
|
size, _line_delimiter, _line_delimiter_length);
|
|
break;
|
|
default: {
|
|
std::stringstream ss;
|
|
ss << "Unknown format type, cannot init line reader, type=" << range.format_type;
|
|
return Status::InternalError(ss.str());
|
|
}
|
|
}
|
|
|
|
_cur_line_reader_eof = false;
|
|
|
|
return Status::OK();
|
|
}
|
|
|
|
void BrokerScanner::close() {
|
|
if (_cur_decompressor != nullptr) {
|
|
delete _cur_decompressor;
|
|
_cur_decompressor = nullptr;
|
|
}
|
|
|
|
if (_cur_line_reader != nullptr) {
|
|
delete _cur_line_reader;
|
|
_cur_line_reader = nullptr;
|
|
}
|
|
|
|
if (_cur_file_reader != nullptr) {
|
|
if (_stream_load_pipe != nullptr) {
|
|
_stream_load_pipe.reset();
|
|
_cur_file_reader = nullptr;
|
|
} else {
|
|
delete _cur_file_reader;
|
|
_cur_file_reader = nullptr;
|
|
}
|
|
}
|
|
}
|
|
|
|
void BrokerScanner::split_line(const Slice& line, std::vector<Slice>* values) {
|
|
const char* value = line.data;
|
|
size_t start = 0; // point to the start pos of next col value.
|
|
size_t curpos= 0; // point to the start pos of separator matching sequence.
|
|
size_t p1 = 0; // point to the current pos of separator matching sequence.
|
|
|
|
// Separator: AAAA
|
|
//
|
|
// curpos
|
|
// ▼
|
|
// AAAA
|
|
// 1000AAAA2000AAAA
|
|
// ▲ ▲
|
|
// Start │
|
|
// p1
|
|
|
|
while (curpos < line.size) {
|
|
if (*(value + curpos + p1) != _value_separator[p1]) {
|
|
// Not match, move forward:
|
|
curpos += (p1 == 0 ? 1 : p1);
|
|
p1 = 0;
|
|
} else {
|
|
p1++;
|
|
if (p1 == _value_separator_length) {
|
|
// Match a separator
|
|
values->emplace_back(value + start, curpos - start);
|
|
start = curpos + _value_separator_length;
|
|
curpos = start;
|
|
p1 = 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
CHECK(curpos == line.size) << curpos << " vs " << line.size;
|
|
values->emplace_back(value + start, curpos - start);
|
|
}
|
|
|
|
void BrokerScanner::fill_fix_length_string(const Slice& value, MemPool* pool, char** new_value_p,
|
|
const int new_value_length) {
|
|
if (new_value_length != 0 && value.size < new_value_length) {
|
|
*new_value_p = reinterpret_cast<char*>(pool->allocate(new_value_length));
|
|
|
|
// 'value' is guaranteed not to be nullptr
|
|
memcpy(*new_value_p, value.data, value.size);
|
|
for (int i = value.size; i < new_value_length; ++i) {
|
|
(*new_value_p)[i] = '\0';
|
|
}
|
|
}
|
|
}
|
|
|
|
// Following format are included.
|
|
// .123 1.23 123. -1.23
|
|
// ATTN: The decimal point and (for negative numbers) the "-" sign are not counted.
|
|
// like '.123', it will be regarded as '0.123', but it match decimal(3, 3)
|
|
bool BrokerScanner::check_decimal_input(const Slice& slice, int precision, int scale,
|
|
std::stringstream* error_msg) {
|
|
const char* value = slice.data;
|
|
size_t value_length = slice.size;
|
|
|
|
if (value_length > (precision + 2)) {
|
|
(*error_msg) << "the length of decimal value is overflow. "
|
|
<< "precision in schema: (" << precision << ", " << scale << "); "
|
|
<< "value: [" << slice.to_string() << "]; "
|
|
<< "str actual length: " << value_length << ";";
|
|
return false;
|
|
}
|
|
|
|
// ignore leading spaces and trailing spaces
|
|
int begin_index = 0;
|
|
while (begin_index < value_length && std::isspace(value[begin_index])) {
|
|
++begin_index;
|
|
}
|
|
int end_index = value_length - 1;
|
|
while (end_index >= begin_index && std::isspace(value[end_index])) {
|
|
--end_index;
|
|
}
|
|
|
|
if (value[begin_index] == '+' || value[begin_index] == '-') {
|
|
++begin_index;
|
|
}
|
|
|
|
int point_index = -1;
|
|
for (int i = begin_index; i <= end_index; ++i) {
|
|
if (value[i] == '.') {
|
|
point_index = i;
|
|
}
|
|
}
|
|
|
|
int value_int_len = 0;
|
|
int value_frac_len = 0;
|
|
value_int_len = point_index - begin_index;
|
|
value_frac_len = end_index - point_index;
|
|
|
|
if (point_index == -1) {
|
|
// an int value: like 123
|
|
value_int_len = end_index - begin_index + 1;
|
|
value_frac_len = 0;
|
|
} else {
|
|
value_int_len = point_index - begin_index;
|
|
value_frac_len = end_index - point_index;
|
|
}
|
|
|
|
if (value_int_len > (precision - scale)) {
|
|
(*error_msg) << "the int part length longer than schema precision [" << precision << "]. "
|
|
<< "value [" << slice.to_string() << "]. ";
|
|
return false;
|
|
} else if (value_frac_len > scale) {
|
|
(*error_msg) << "the frac part length longer than schema scale [" << scale << "]. "
|
|
<< "value [" << slice.to_string() << "]. ";
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
bool is_null(const Slice& slice) {
|
|
return slice.size == 2 && slice.data[0] == '\\' && slice.data[1] == 'N';
|
|
}
|
|
|
|
// Convert one row to this tuple
|
|
bool BrokerScanner::convert_one_row(const Slice& line, Tuple* tuple, MemPool* tuple_pool) {
|
|
if (!line_to_src_tuple(line)) {
|
|
return false;
|
|
}
|
|
|
|
return fill_dest_tuple(tuple, tuple_pool);
|
|
}
|
|
|
|
// Convert one row to this tuple
|
|
bool BrokerScanner::line_to_src_tuple(const Slice& line) {
|
|
if (!validate_utf8(line.data, line.size)) {
|
|
std::stringstream error_msg;
|
|
error_msg << "data is not encoded by UTF-8";
|
|
_state->append_error_msg_to_file("Unable to display", error_msg.str());
|
|
_counter->num_rows_filtered++;
|
|
return false;
|
|
}
|
|
|
|
std::vector<Slice> values;
|
|
{ split_line(line, &values); }
|
|
|
|
// range of current file
|
|
const TBrokerRangeDesc& range = _ranges.at(_next_range - 1);
|
|
const std::vector<std::string>& columns_from_path = range.columns_from_path;
|
|
if (values.size() + columns_from_path.size() < _src_slot_descs.size()) {
|
|
std::stringstream error_msg;
|
|
error_msg << "actual column number is less than schema column number. "
|
|
<< "actual number: " << values.size() << " column separator: ["
|
|
<< _value_separator << "], "
|
|
<< "line delimiter: [" << _line_delimiter << "], "
|
|
<< "schema number: " << _src_slot_descs.size() << "; ";
|
|
_state->append_error_msg_to_file(std::string(line.data, line.size), error_msg.str());
|
|
_counter->num_rows_filtered++;
|
|
return false;
|
|
} else if (values.size() + columns_from_path.size() > _src_slot_descs.size()) {
|
|
std::stringstream error_msg;
|
|
error_msg << "actual column number is more than schema column number. "
|
|
<< "actual number: " << values.size() << " column separator: ["
|
|
<< _value_separator << "], "
|
|
<< "line delimiter: [" << _line_delimiter << "], "
|
|
<< "schema number: " << _src_slot_descs.size() << "; ";
|
|
_state->append_error_msg_to_file(std::string(line.data, line.size), error_msg.str());
|
|
_counter->num_rows_filtered++;
|
|
return false;
|
|
}
|
|
|
|
for (int i = 0; i < values.size(); ++i) {
|
|
auto slot_desc = _src_slot_descs[i];
|
|
const Slice& value = values[i];
|
|
if (slot_desc->is_nullable() && is_null(value)) {
|
|
_src_tuple->set_null(slot_desc->null_indicator_offset());
|
|
continue;
|
|
}
|
|
_src_tuple->set_not_null(slot_desc->null_indicator_offset());
|
|
void* slot = _src_tuple->get_slot(slot_desc->tuple_offset());
|
|
StringValue* str_slot = reinterpret_cast<StringValue*>(slot);
|
|
str_slot->ptr = value.data;
|
|
str_slot->len = value.size;
|
|
}
|
|
|
|
if (range.__isset.num_of_columns_from_file) {
|
|
fill_slots_of_columns_from_path(range.num_of_columns_from_file, columns_from_path);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
} // namespace doris
|