[feature-wip](new-scan) Support stream load with csv in new scan framework (#13354)

1. Refactor the file reader creation in FileFactory for simplicity.
    Previously, FileFactory had too many `create_file_reader` interfaces.
    They are now unified into two categories: the interface used by the old BrokerScanNode
    and the interface used by the new FileScanNode.
    In addition, readers that read from a `StreamLoadPipe` are now created by a separate method from readers that read files (see the sketch after this list).

2. Modify the StreamLoadPlanner on the FE side to support using ExternalFileScanNode.

3. For generic readers, the file reader is now created inside the reader rather than passed in from outside.

4. Add test cases for CSV stream load; the behavior is the same as with the old broker scanner.
Mingyu Chen
2022-10-17 23:33:41 +08:00
committed by GitHub
parent c114d87d13
commit dbf71ed3be
58 changed files with 3671 additions and 566 deletions

.gitignore vendored
View File

@ -5,7 +5,6 @@
*.iml
*.swp
*.jar
*.gz
*.log
*.so.tmp
*.flattened-pom.xml

View File

@ -136,10 +136,16 @@ Status BrokerScanner::open_file_reader() {
}
}
RETURN_IF_ERROR(FileFactory::create_file_reader(range.file_type, _state->exec_env(), _profile,
_broker_addresses, _params.properties, range,
start_offset, _cur_file_reader));
return _cur_file_reader->open();
if (range.file_type == TFileType::FILE_STREAM) {
RETURN_IF_ERROR(FileFactory::create_pipe_reader(range.load_id, _cur_file_reader_s));
_real_reader = _cur_file_reader_s.get();
} else {
RETURN_IF_ERROR(FileFactory::create_file_reader(
range.file_type, _state->exec_env(), _profile, _broker_addresses,
_params.properties, range, start_offset, _cur_file_reader));
_real_reader = _cur_file_reader.get();
}
return _real_reader->open();
}
Status BrokerScanner::create_decompressor(TFileFormatType::type type) {
@ -215,12 +221,11 @@ Status BrokerScanner::open_line_reader() {
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
case TFileFormatType::FORMAT_CSV_LZOP:
case TFileFormatType::FORMAT_CSV_DEFLATE:
_cur_line_reader =
new PlainTextLineReader(_profile, _cur_file_reader.get(), _cur_decompressor, size,
_line_delimiter, _line_delimiter_length);
_cur_line_reader = new PlainTextLineReader(_profile, _real_reader, _cur_decompressor, size,
_line_delimiter, _line_delimiter_length);
break;
case TFileFormatType::FORMAT_PROTO:
_cur_line_reader = new PlainBinaryLineReader(_cur_file_reader.get());
_cur_line_reader = new PlainBinaryLineReader(_real_reader);
break;
default: {
return Status::InternalError("Unknown format type, cannot init line reader, type={}",

View File

@ -106,7 +106,12 @@ protected:
int _line_delimiter_length;
// Reader
std::shared_ptr<FileReader> _cur_file_reader;
// _cur_file_reader_s is for stream load pipe reader,
// and _cur_file_reader is for other file reader.
// TODO: refactor this to use only shared_ptr or unique_ptr
std::unique_ptr<FileReader> _cur_file_reader;
std::shared_ptr<FileReader> _cur_file_reader_s;
FileReader* _real_reader;
LineReader* _cur_line_reader;
Decompressor* _cur_decompressor;
bool _cur_line_reader_eof;

View File

@ -36,6 +36,8 @@ JsonScanner::JsonScanner(RuntimeState* state, RuntimeProfile* profile,
const std::vector<TExpr>& pre_filter_texprs, ScannerCounter* counter)
: BaseScanner(state, profile, params, ranges, broker_addresses, pre_filter_texprs, counter),
_cur_file_reader(nullptr),
_cur_file_reader_s(nullptr),
_real_reader(nullptr),
_cur_line_reader(nullptr),
_cur_json_reader(nullptr),
_cur_reader_eof(false),
@ -61,7 +63,7 @@ Status JsonScanner::get_next(Tuple* tuple, MemPool* tuple_pool, bool* eof, bool*
SCOPED_TIMER(_read_timer);
// Get one line
while (!_scanner_eof) {
if (!_cur_file_reader || _cur_reader_eof) {
if (!_real_reader || _cur_reader_eof) {
RETURN_IF_ERROR(open_next_reader());
// If there isn't any more reader, break this
if (_scanner_eof) {
@ -127,11 +129,17 @@ Status JsonScanner::open_file_reader() {
_read_json_by_line = range.read_json_by_line;
}
RETURN_IF_ERROR(FileFactory::create_file_reader(range.file_type, _state->exec_env(), _profile,
_broker_addresses, _params.properties, range,
start_offset, _cur_file_reader));
if (range.file_type == TFileType::FILE_STREAM) {
RETURN_IF_ERROR(FileFactory::create_pipe_reader(range.load_id, _cur_file_reader_s));
_real_reader = _cur_file_reader_s.get();
} else {
RETURN_IF_ERROR(FileFactory::create_file_reader(
range.file_type, _state->exec_env(), _profile, _broker_addresses,
_params.properties, range, start_offset, _cur_file_reader));
_real_reader = _cur_file_reader.get();
}
_cur_reader_eof = false;
return _cur_file_reader->open();
return _real_reader->open();
}
Status JsonScanner::open_line_reader() {
@ -148,7 +156,7 @@ Status JsonScanner::open_line_reader() {
} else {
_skip_next_line = false;
}
_cur_line_reader = new PlainTextLineReader(_profile, _cur_file_reader.get(), nullptr, size,
_cur_line_reader = new PlainTextLineReader(_profile, _real_reader, nullptr, size,
_line_delimiter, _line_delimiter_length);
_cur_reader_eof = false;
return Status::OK();
@ -173,9 +181,8 @@ Status JsonScanner::open_json_reader() {
new JsonReader(_state, _counter, _profile, strip_outer_array, num_as_string,
fuzzy_parse, &_scanner_eof, nullptr, _cur_line_reader);
} else {
_cur_json_reader =
new JsonReader(_state, _counter, _profile, strip_outer_array, num_as_string,
fuzzy_parse, &_scanner_eof, _cur_file_reader.get());
_cur_json_reader = new JsonReader(_state, _counter, _profile, strip_outer_array,
num_as_string, fuzzy_parse, &_scanner_eof, _real_reader);
}
RETURN_IF_ERROR(_cur_json_reader->init(jsonpath, json_root));

View File

@ -87,7 +87,12 @@ protected:
int _line_delimiter_length;
// Reader
std::shared_ptr<FileReader> _cur_file_reader;
// _cur_file_reader_s is for stream load pipe reader,
// and _cur_file_reader is for other file reader.
// TODO: refactor this to use only shared_ptr or unique_ptr
std::unique_ptr<FileReader> _cur_file_reader;
std::shared_ptr<FileReader> _cur_file_reader_s;
FileReader* _real_reader;
LineReader* _cur_line_reader;
JsonReader* _cur_json_reader;
bool _cur_reader_eof;

View File

@ -60,6 +60,9 @@ private:
RuntimeProfile* _profile;
FileReader* _file_reader;
Decompressor* _decompressor;
// the min length that should be read.
// -1 means endless (for stream load),
// and is only valid if the content is uncompressed
size_t _min_length;
size_t _total_read_bytes;
std::string _line_delimiter;

View File

@ -71,39 +71,47 @@ DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(streaming_load_current_processing, MetricUnit
TStreamLoadPutResult k_stream_load_put_result;
#endif
static TFileFormatType::type parse_format(const std::string& format_str,
const std::string& compress_type) {
static void parse_format(const std::string& format_str, const std::string& compress_type_str,
TFileFormatType::type* format_type,
TFileCompressType::type* compress_type) {
if (format_str.empty()) {
return parse_format("CSV", compress_type);
parse_format("CSV", compress_type_str, format_type, compress_type);
return;
}
TFileFormatType::type format_type = TFileFormatType::FORMAT_UNKNOWN;
*compress_type = TFileCompressType::PLAIN;
*format_type = TFileFormatType::FORMAT_UNKNOWN;
if (iequal(format_str, "CSV")) {
if (compress_type.empty()) {
format_type = TFileFormatType::FORMAT_CSV_PLAIN;
}
if (iequal(compress_type, "GZ")) {
format_type = TFileFormatType::FORMAT_CSV_GZ;
} else if (iequal(compress_type, "LZO")) {
format_type = TFileFormatType::FORMAT_CSV_LZO;
} else if (iequal(compress_type, "BZ2")) {
format_type = TFileFormatType::FORMAT_CSV_BZ2;
} else if (iequal(compress_type, "LZ4FRAME")) {
format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME;
} else if (iequal(compress_type, "LZOP")) {
format_type = TFileFormatType::FORMAT_CSV_LZOP;
} else if (iequal(compress_type, "DEFLATE")) {
format_type = TFileFormatType::FORMAT_CSV_DEFLATE;
if (compress_type_str.empty()) {
*format_type = TFileFormatType::FORMAT_CSV_PLAIN;
} else if (iequal(compress_type_str, "GZ")) {
*format_type = TFileFormatType::FORMAT_CSV_GZ;
*compress_type = TFileCompressType::GZ;
} else if (iequal(compress_type_str, "LZO")) {
*format_type = TFileFormatType::FORMAT_CSV_LZO;
*compress_type = TFileCompressType::LZO;
} else if (iequal(compress_type_str, "BZ2")) {
*format_type = TFileFormatType::FORMAT_CSV_BZ2;
*compress_type = TFileCompressType::BZ2;
} else if (iequal(compress_type_str, "LZ4")) {
*format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME;
*compress_type = TFileCompressType::LZ4FRAME;
} else if (iequal(compress_type_str, "LZOP")) {
*format_type = TFileFormatType::FORMAT_CSV_LZOP;
*compress_type = TFileCompressType::LZO;
} else if (iequal(compress_type_str, "DEFLATE")) {
*format_type = TFileFormatType::FORMAT_CSV_DEFLATE;
*compress_type = TFileCompressType::DEFLATE;
}
} else if (iequal(format_str, "JSON")) {
if (compress_type.empty()) {
format_type = TFileFormatType::FORMAT_JSON;
if (compress_type_str.empty()) {
*format_type = TFileFormatType::FORMAT_JSON;
}
} else if (iequal(format_str, "PARQUET")) {
format_type = TFileFormatType::FORMAT_PARQUET;
*format_type = TFileFormatType::FORMAT_PARQUET;
} else if (iequal(format_str, "ORC")) {
format_type = TFileFormatType::FORMAT_ORC;
*format_type = TFileFormatType::FORMAT_ORC;
}
return format_type;
return;
}
static bool is_format_support_streaming(TFileFormatType::type format) {
@ -275,7 +283,8 @@ Status StreamLoadAction::_on_header(HttpRequest* http_req, StreamLoadContext* ct
//treat as CSV
format_str = BeConsts::CSV;
}
ctx->format = parse_format(format_str, http_req->header(HTTP_COMPRESS_TYPE));
parse_format(format_str, http_req->header(HTTP_COMPRESS_TYPE), &ctx->format,
&ctx->compress_type);
if (ctx->format == TFileFormatType::FORMAT_UNKNOWN) {
return Status::InternalError("unknown data format, format={}",
http_req->header(HTTP_FORMAT_KEY));
@ -387,6 +396,7 @@ Status StreamLoadAction::_process_put(HttpRequest* http_req, StreamLoadContext*
request.tbl = ctx->table;
request.txnId = ctx->txn_id;
request.formatType = ctx->format;
request.__set_compress_type(ctx->compress_type);
request.__set_header_type(ctx->header_type);
request.__set_loadId(ctx->id.to_thrift());
if (ctx->use_streaming) {

View File

@ -52,31 +52,34 @@ doris::Status doris::FileFactory::create_file_writer(
break;
}
default:
return Status::InternalError("UnSupport File Writer Type: " + std::to_string(type));
return Status::InternalError("unsupported file writer type: {}", std::to_string(type));
}
return Status::OK();
}
doris::Status doris::FileFactory::_new_file_reader(
// ============================
// broker scan node/unique ptr
doris::Status doris::FileFactory::create_file_reader(
doris::TFileType::type type, doris::ExecEnv* env, RuntimeProfile* profile,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties, const TBrokerRangeDesc& range,
int64_t start_offset, FileReader*& file_reader) {
const std::map<std::string, std::string>& properties, const doris::TBrokerRangeDesc& range,
int64_t start_offset, std::unique_ptr<FileReader>& file_reader) {
FileReader* file_reader_ptr;
switch (type) {
case TFileType::FILE_LOCAL: {
file_reader = new LocalFileReader(range.path, start_offset);
file_reader_ptr = new LocalFileReader(range.path, start_offset);
break;
}
case TFileType::FILE_BROKER: {
file_reader = new BufferedReader(
file_reader_ptr = new BufferedReader(
profile,
new BrokerReader(env, broker_addresses, properties, range.path, start_offset,
range.__isset.file_size ? range.file_size : 0));
break;
}
case TFileType::FILE_S3: {
file_reader =
file_reader_ptr =
new BufferedReader(profile, new S3Reader(properties, range.path, start_offset));
break;
}
@ -84,149 +87,49 @@ doris::Status doris::FileFactory::_new_file_reader(
FileReader* hdfs_reader = nullptr;
RETURN_IF_ERROR(HdfsReaderWriter::create_reader(range.hdfs_params, range.path, start_offset,
&hdfs_reader));
file_reader = new BufferedReader(profile, hdfs_reader);
break;
}
default:
return Status::InternalError("UnSupport File Reader Type: " + std::to_string(type));
}
return Status::OK();
}
doris::Status doris::FileFactory::create_file_reader(
doris::TFileType::type type, doris::ExecEnv* env, RuntimeProfile* profile,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties, const doris::TBrokerRangeDesc& range,
int64_t start_offset, std::unique_ptr<FileReader>& file_reader) {
if (type == TFileType::FILE_STREAM) {
return Status::InternalError("UnSupport UniquePtr For FileStream type");
}
FileReader* file_reader_ptr;
RETURN_IF_ERROR(_new_file_reader(type, env, profile, broker_addresses, properties, range,
start_offset, file_reader_ptr));
file_reader.reset(file_reader_ptr);
return Status::OK();
}
doris::Status doris::FileFactory::create_file_reader(
doris::TFileType::type type, doris::ExecEnv* env, RuntimeProfile* profile,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties, const doris::TBrokerRangeDesc& range,
int64_t start_offset, std::shared_ptr<FileReader>& file_reader) {
if (type == TFileType::FILE_STREAM) {
file_reader = env->load_stream_mgr()->get(range.load_id);
if (!file_reader) {
VLOG_NOTICE << "unknown stream load id: " << UniqueId(range.load_id);
return Status::InternalError("unknown stream load id");
}
} else {
FileReader* file_reader_ptr;
RETURN_IF_ERROR(_new_file_reader(type, env, profile, broker_addresses, properties, range,
start_offset, file_reader_ptr));
file_reader.reset(file_reader_ptr);
}
return Status::OK();
}
doris::Status doris::FileFactory::_new_file_reader(doris::ExecEnv* env, RuntimeProfile* profile,
const TFileScanRangeParams& params,
const doris::TFileRangeDesc& range,
FileReader*& file_reader_ptr) {
doris::TFileType::type type = params.file_type;
if (type == TFileType::FILE_STREAM) {
return Status::InternalError("UnSupport UniquePtr For FileStream type");
}
int64_t start_offset = range.start_offset;
switch (params.format_type) {
case TFileFormatType::FORMAT_CSV_PLAIN:
case TFileFormatType::FORMAT_CSV_GZ:
case TFileFormatType::FORMAT_CSV_BZ2:
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
case TFileFormatType::FORMAT_CSV_LZOP:
case TFileFormatType::FORMAT_CSV_DEFLATE:
if (start_offset != 0) {
start_offset -= 1;
}
break;
default:
break;
}
switch (type) {
case TFileType::FILE_LOCAL: {
file_reader_ptr = new LocalFileReader(range.path, start_offset);
break;
}
case TFileType::FILE_S3: {
file_reader_ptr = new BufferedReader(
profile, new S3Reader(params.properties, range.path, start_offset));
break;
}
case TFileType::FILE_HDFS: {
FileReader* hdfs_reader = nullptr;
RETURN_IF_ERROR(HdfsReaderWriter::create_reader(params.hdfs_params, range.path,
start_offset, &hdfs_reader));
file_reader_ptr = new BufferedReader(profile, hdfs_reader);
break;
}
default:
return Status::InternalError("Unsupported File Reader Type: " + std::to_string(type));
return Status::InternalError("unsupported file reader type: " + std::to_string(type));
}
return Status::OK();
}
doris::Status doris::FileFactory::create_file_reader(doris::ExecEnv* env, RuntimeProfile* profile,
const TFileScanRangeParams& params,
const doris::TFileRangeDesc& range,
std::shared_ptr<FileReader>& file_reader) {
FileReader* file_reader_ptr;
RETURN_IF_ERROR(_new_file_reader(env, profile, params, range, file_reader_ptr));
file_reader.reset(file_reader_ptr);
return Status::OK();
}
doris::Status doris::FileFactory::create_file_reader(doris::ExecEnv* env, RuntimeProfile* profile,
const TFileScanRangeParams& params,
const doris::TFileRangeDesc& range,
std::unique_ptr<FileReader>& file_reader) {
FileReader* file_reader_ptr;
RETURN_IF_ERROR(_new_file_reader(env, profile, params, range, file_reader_ptr));
file_reader.reset(file_reader_ptr);
return Status::OK();
}
// ============================
// file scan node/unique ptr
doris::Status doris::FileFactory::create_file_reader(RuntimeProfile* profile,
const TFileScanRangeParams& params,
const TFileRangeDesc& range,
std::unique_ptr<FileReader>& file_reader,
int64_t buffer_size) {
doris::TFileType::type type = params.file_type;
const std::string& path, int64_t start_offset,
int64_t file_size, int64_t buffer_size,
std::unique_ptr<FileReader>& file_reader) {
FileReader* file_reader_ptr;
doris::TFileType::type type = params.file_type;
switch (type) {
case TFileType::FILE_LOCAL: {
file_reader_ptr = new LocalFileReader(range.path, range.start_offset);
file_reader_ptr = new LocalFileReader(path, start_offset);
break;
}
case TFileType::FILE_S3: {
file_reader_ptr = new S3Reader(params.properties, range.path, range.start_offset);
file_reader_ptr = new S3Reader(params.properties, path, start_offset);
break;
}
case TFileType::FILE_HDFS: {
RETURN_IF_ERROR(HdfsReaderWriter::create_reader(params.hdfs_params, range.path,
range.start_offset, &file_reader_ptr));
RETURN_IF_ERROR(HdfsReaderWriter::create_reader(params.hdfs_params, path, start_offset,
&file_reader_ptr));
break;
}
case TFileType::FILE_BROKER: {
file_reader_ptr = new BrokerReader(ExecEnv::GetInstance(), params.broker_addresses,
params.properties, path, start_offset, file_size);
break;
}
default:
return Status::InternalError("Unsupported File Reader Type: " + std::to_string(type));
return Status::InternalError("unsupported file reader type: {}", std::to_string(type));
}
if (buffer_size > 0) {
file_reader.reset(new BufferedReader(profile, file_reader_ptr, buffer_size));
} else {
@ -234,3 +137,13 @@ doris::Status doris::FileFactory::create_file_reader(RuntimeProfile* profile,
}
return Status::OK();
}
// file scan node/stream load pipe
doris::Status doris::FileFactory::create_pipe_reader(const TUniqueId& load_id,
std::shared_ptr<FileReader>& file_reader) {
file_reader = ExecEnv::GetInstance()->load_stream_mgr()->get(load_id);
if (!file_reader) {
return Status::InternalError("unknown stream load id: {}", UniqueId(load_id).to_string());
}
return Status::OK();
}

View File

@ -28,43 +28,34 @@ class RuntimeProfile;
class FileFactory {
public:
// Create FileWriter
static Status create_file_writer(TFileType::type type, ExecEnv* env,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties,
const std::string& path, int64_t start_offset,
std::unique_ptr<FileWriter>& file_writer);
// Because StreamLoadPipe use std::shared_ptr, here we have to support both unique_ptr
// and shared_ptr create_file_reader
static Status create_file_reader(TFileType::type type, ExecEnv* env, RuntimeProfile* profile,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties,
const TBrokerRangeDesc& range, int64_t start_offset,
std::unique_ptr<FileReader>& file_reader);
static Status create_file_reader(TFileType::type type, ExecEnv* env, RuntimeProfile* profile,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties,
const TBrokerRangeDesc& range, int64_t start_offset,
std::shared_ptr<FileReader>& file_reader);
static Status create_file_reader(ExecEnv* env, RuntimeProfile* profile,
const TFileScanRangeParams& params,
const TFileRangeDesc& range,
std::unique_ptr<FileReader>& file_reader);
static Status create_file_reader(ExecEnv* env, RuntimeProfile* profile,
const TFileScanRangeParams& params,
const TFileRangeDesc& range,
std::shared_ptr<FileReader>& file_reader);
/**
* Create FileReader. If buffer_size > 0, use BufferedReader to wrap the underlying FileReader;
* Create FileReader for broker scan node related scanners and readers
*/
static Status create_file_reader(TFileType::type type, ExecEnv* env, RuntimeProfile* profile,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties,
const TBrokerRangeDesc& range, int64_t start_offset,
std::unique_ptr<FileReader>& file_reader);
/**
* Create FileReader for file scan node related scanners and readers
* If buffer_size > 0, use BufferedReader to wrap the underlying FileReader;
* Otherwise, return the underlying FileReader directly.
*/
static Status create_file_reader(RuntimeProfile* profile, const TFileScanRangeParams& params,
const TFileRangeDesc& range,
std::unique_ptr<FileReader>& file_reader, int64_t buffer_size);
const std::string& path, int64_t start_offset,
int64_t file_size, int64_t buffer_size,
std::unique_ptr<FileReader>& file_reader);
// Create FileReader for stream load pipe
static Status create_pipe_reader(const TUniqueId& load_id,
std::shared_ptr<FileReader>& file_reader);
static TFileType::type convert_storage_type(TStorageBackendType::type type) {
switch (type) {
@ -81,19 +72,6 @@ public:
}
__builtin_unreachable();
}
private:
// Note: if the function return Status::OK() means new the file_reader. the caller
// should delete the memory of file_reader or use the smart_ptr to hold the own of file_reader
static Status _new_file_reader(TFileType::type type, ExecEnv* env, RuntimeProfile* profile,
const std::vector<TNetworkAddress>& broker_addresses,
const std::map<std::string, std::string>& properties,
const TBrokerRangeDesc& range, int64_t start_offset,
FileReader*& file_reader);
static Status _new_file_reader(ExecEnv* env, RuntimeProfile* profile,
const TFileScanRangeParams& params, const TFileRangeDesc& range,
FileReader*& file_reader);
};
} // namespace doris

View File

@ -161,6 +161,7 @@ public:
// otherwise we save source data to file first, then process it.
bool use_streaming = false;
TFileFormatType::type format = TFileFormatType::FORMAT_CSV_PLAIN;
TFileCompressType::type compress_type = TFileCompressType::UNKNOWN;
std::shared_ptr<MessageBodySink> body_sink;

View File

@ -261,7 +261,7 @@ set(VEC_FILES
exec/scan/new_jdbc_scan_node.cpp
exec/scan/new_es_scanner.cpp
exec/scan/new_es_scan_node.cpp
exec/format/csv/vcsv_reader.cpp
exec/format/csv/csv_reader.cpp
)
add_library(Vec STATIC

View File

@ -59,9 +59,9 @@ Status FileArrowScanner::_open_next_reader() {
}
const TFileRangeDesc& range = _ranges[_next_range++];
std::unique_ptr<FileReader> file_reader;
RETURN_IF_ERROR(FileFactory::create_file_reader(_state->exec_env(), _profile, _params,
range, file_reader));
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _params, range.path,
range.start_offset, range.file_size, 0,
file_reader));
RETURN_IF_ERROR(file_reader->open());
if (file_reader->size() == 0) {
file_reader->close();

View File

@ -158,7 +158,8 @@ Status FileTextScanner::_open_next_reader() {
Status FileTextScanner::_open_file_reader() {
const TFileRangeDesc& range = _ranges[_next_range];
RETURN_IF_ERROR(FileFactory::create_file_reader(_state->exec_env(), _profile, _params, range,
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _params, range.path,
range.start_offset, range.file_size, 0,
_cur_file_reader));
return _cur_file_reader->open();
}

View File

@ -53,7 +53,7 @@ private:
Status _line_split_to_values(const Slice& line);
Status _split_line(const Slice& line);
// Reader
std::shared_ptr<FileReader> _cur_file_reader;
std::unique_ptr<FileReader> _cur_file_reader;
LineReader* _cur_line_reader;
bool _cur_line_reader_eof;

View File

@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
#include "vcsv_reader.h"
#include "csv_reader.h"
#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/internal_service.pb.h>
@ -35,24 +35,32 @@
namespace doris::vectorized {
CsvReader::CsvReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter,
const TFileScanRangeParams& params, const TFileRangeDesc& range,
const std::vector<SlotDescriptor*>& file_slot_descs, FileReader* file_reader)
const std::vector<SlotDescriptor*>& file_slot_descs)
: _state(state),
_profile(profile),
_counter(counter),
_params(params),
_range(range),
_file_slot_descs(file_slot_descs),
_file_reader(file_reader),
_line_reader(nullptr),
_line_reader_eof(false),
_text_converter(nullptr),
_decompressor(nullptr),
_skip_lines(0) {
_file_format_type = _params.format_type;
_file_compress_type = _params.compress_type;
_size = _range.size;
//means first range
if (_range.start_offset == 0 && _params.__isset.file_attributes &&
_text_converter.reset(new (std::nothrow) TextConverter('\\'));
_split_values.reserve(sizeof(Slice) * _file_slot_descs.size());
}
CsvReader::~CsvReader() {}
Status CsvReader::init_reader() {
// set the skip lines and start offset
int64_t start_offset = _range.start_offset;
if (start_offset == 0 && _params.__isset.file_attributes &&
_params.file_attributes.__isset.header_type &&
_params.file_attributes.header_type.size() > 0) {
std::string header_type = to_lower(_params.file_attributes.header_type);
@ -61,52 +69,43 @@ CsvReader::CsvReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounte
} else if (header_type == BeConsts::CSV_WITH_NAMES_AND_TYPES) {
_skip_lines = 2;
}
}
_text_converter.reset(new (std::nothrow) TextConverter('\\'));
_split_values.reserve(sizeof(Slice) * _file_slot_descs.size());
}
CsvReader::~CsvReader() {
if (_decompressor != nullptr) {
delete _decompressor;
_decompressor = nullptr;
}
if (_file_reader != nullptr) {
delete _file_reader;
_file_reader = nullptr;
}
}
Status CsvReader::init_reader() {
// get column_separator and line_delimiter
if (_params.__isset.file_attributes && _params.file_attributes.__isset.text_params &&
_params.file_attributes.text_params.__isset.column_separator) {
_value_separator = _params.file_attributes.text_params.column_separator;
_value_separator_length = _value_separator.size();
} else {
return Status::InternalError("Can not find column_separator");
}
if (_params.__isset.file_attributes && _params.file_attributes.__isset.text_params &&
_params.file_attributes.text_params.__isset.line_delimiter) {
_line_delimiter = _params.file_attributes.text_params.line_delimiter;
_line_delimiter_length = _line_delimiter.size();
} else {
return Status::InternalError("Can not find line_delimiter");
}
if (_range.start_offset != 0) {
if (_file_format_type != TFileFormatType::FORMAT_CSV_PLAIN) {
} else if (start_offset != 0) {
if (_file_format_type != TFileFormatType::FORMAT_CSV_PLAIN ||
(_file_compress_type != TFileCompressType::UNKNOWN &&
_file_compress_type != TFileCompressType::PLAIN)) {
return Status::InternalError("For now we do not support split compressed file");
}
start_offset -= 1;
_size += 1;
// not first range will always skip one line
_skip_lines = 1;
}
// create and open file reader
FileReader* real_reader = nullptr;
if (_params.file_type == TFileType::FILE_STREAM) {
RETURN_IF_ERROR(FileFactory::create_pipe_reader(_range.load_id, _file_reader_s));
real_reader = _file_reader_s.get();
} else {
RETURN_IF_ERROR(FileFactory::create_file_reader(
_profile, _params, _range.path, start_offset, _range.file_size, 0, _file_reader));
real_reader = _file_reader.get();
}
RETURN_IF_ERROR(real_reader->open());
if (real_reader->size() == 0 && _params.file_type != TFileType::FILE_STREAM &&
_params.file_type != TFileType::FILE_BROKER) {
return Status::EndOfFile("Empty File");
}
// get column_separator and line_delimiter
_value_separator = _params.file_attributes.text_params.column_separator;
_value_separator_length = _value_separator.size();
_line_delimiter = _params.file_attributes.text_params.line_delimiter;
_line_delimiter_length = _line_delimiter.size();
// create decompressor.
// _decompressor may be nullptr if this is not a compressed file
RETURN_IF_ERROR(_create_decompressor(_file_format_type));
RETURN_IF_ERROR(_create_decompressor());
switch (_file_format_type) {
case TFileFormatType::FORMAT_CSV_PLAIN:
@ -115,8 +114,8 @@ Status CsvReader::init_reader() {
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
case TFileFormatType::FORMAT_CSV_LZOP:
case TFileFormatType::FORMAT_CSV_DEFLATE:
_line_reader.reset(new PlainTextLineReader(_profile, _file_reader, _decompressor, _size,
_line_delimiter, _line_delimiter_length));
_line_reader.reset(new PlainTextLineReader(_profile, real_reader, _decompressor.get(),
_size, _line_delimiter, _line_delimiter_length));
break;
default:
@ -173,33 +172,58 @@ Status CsvReader::get_columns(std::unordered_map<std::string, TypeDescriptor>* n
return Status::OK();
}
Status CsvReader::_create_decompressor(TFileFormatType::type type) {
Status CsvReader::_create_decompressor() {
CompressType compress_type;
switch (type) {
case TFileFormatType::FORMAT_CSV_PLAIN:
compress_type = CompressType::UNCOMPRESSED;
break;
case TFileFormatType::FORMAT_CSV_GZ:
compress_type = CompressType::GZIP;
break;
case TFileFormatType::FORMAT_CSV_BZ2:
compress_type = CompressType::BZIP2;
break;
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
compress_type = CompressType::LZ4FRAME;
break;
case TFileFormatType::FORMAT_CSV_LZOP:
compress_type = CompressType::LZOP;
break;
case TFileFormatType::FORMAT_CSV_DEFLATE:
compress_type = CompressType::DEFLATE;
break;
default: {
return Status::InternalError(
"Unknown format type, cannot inference compress type in csv reader, type={}", type);
if (_file_compress_type != TFileCompressType::UNKNOWN) {
switch (_file_compress_type) {
case TFileCompressType::PLAIN:
compress_type = CompressType::UNCOMPRESSED;
break;
case TFileCompressType::GZ:
compress_type = CompressType::GZIP;
break;
case TFileCompressType::LZO:
compress_type = CompressType::LZOP;
break;
case TFileCompressType::BZ2:
compress_type = CompressType::BZIP2;
break;
case TFileCompressType::LZ4FRAME:
compress_type = CompressType::LZ4FRAME;
break;
case TFileCompressType::DEFLATE:
compress_type = CompressType::DEFLATE;
break;
default:
return Status::InternalError("unknown compress type: {}", _file_compress_type);
}
} else {
switch (_file_format_type) {
case TFileFormatType::FORMAT_CSV_PLAIN:
compress_type = CompressType::UNCOMPRESSED;
break;
case TFileFormatType::FORMAT_CSV_GZ:
compress_type = CompressType::GZIP;
break;
case TFileFormatType::FORMAT_CSV_BZ2:
compress_type = CompressType::BZIP2;
break;
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
compress_type = CompressType::LZ4FRAME;
break;
case TFileFormatType::FORMAT_CSV_LZOP:
compress_type = CompressType::LZOP;
break;
case TFileFormatType::FORMAT_CSV_DEFLATE:
compress_type = CompressType::DEFLATE;
break;
default:
return Status::InternalError("unknown format type: {}", _file_format_type);
}
}
}
RETURN_IF_ERROR(Decompressor::create_decompressor(compress_type, &_decompressor));
Decompressor* decompressor;
RETURN_IF_ERROR(Decompressor::create_decompressor(compress_type, &decompressor));
_decompressor.reset(decompressor);
return Status::OK();
}
@ -248,16 +272,17 @@ Status CsvReader::_line_split_to_values(const Slice& line, bool* success) {
_split_line(line);
// if actual column number in csv file is less than _file_slot_descs.size()
// if actual column number in csv file is not equal to _file_slot_descs.size()
// then filter this line.
if (_split_values.size() < _file_slot_descs.size()) {
if (_split_values.size() != _file_slot_descs.size()) {
std::string cmp_str =
_split_values.size() > _file_slot_descs.size() ? "more than" : "less than";
RETURN_IF_ERROR(_state->append_error_msg_to_file(
[&]() -> std::string { return std::string(line.data, line.size); },
[&]() -> std::string {
fmt::memory_buffer error_msg;
fmt::format_to(
error_msg, "{}",
"actual column number in csv file is less than schema column number.");
fmt::format_to(error_msg, "{} {} {}", "actual column number in csv file is ",
cmp_str, " schema column number.");
fmt::format_to(error_msg, "actual number: {}, column separator: [{}], ",
_split_values.size(), _value_separator);
fmt::format_to(error_msg, "line delimiter: [{}], schema column number: {}; ",

View File

@ -33,7 +33,7 @@ class CsvReader : public GenericReader {
public:
CsvReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter,
const TFileScanRangeParams& params, const TFileRangeDesc& range,
const std::vector<SlotDescriptor*>& file_slot_descs, FileReader* file_reader);
const std::vector<SlotDescriptor*>& file_slot_descs);
~CsvReader() override;
Status init_reader();
@ -42,7 +42,7 @@ public:
std::unordered_set<std::string>* missing_cols) override;
private:
Status _create_decompressor(TFileFormatType::type type);
Status _create_decompressor();
Status _fill_dest_columns(const Slice& line, std::vector<MutableColumnPtr>& columns);
Status _line_split_to_values(const Slice& line, bool* success);
void _split_line(const Slice& line);
@ -58,13 +58,18 @@ private:
const TFileRangeDesc& _range;
const std::vector<SlotDescriptor*>& _file_slot_descs;
FileReader* _file_reader;
// _file_reader_s is for stream load pipe reader,
// and _file_reader is for other file reader.
// TODO: refactor this to use only shared_ptr or unique_ptr
std::unique_ptr<FileReader> _file_reader;
std::shared_ptr<FileReader> _file_reader_s;
std::unique_ptr<LineReader> _line_reader;
bool _line_reader_eof;
std::unique_ptr<TextConverter> _text_converter;
Decompressor* _decompressor;
std::unique_ptr<Decompressor> _decompressor;
TFileFormatType::type _file_format_type;
TFileCompressType::type _file_compress_type;
int64_t _size;
// When we fetch range start from 0, header_type="csv_with_names" skip first line
// When we fetch range start from 0, header_type="csv_with_names_and_types" skip first two line

View File

@ -121,8 +121,9 @@ Status ParquetReader::init_reader(
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range) {
SCOPED_RAW_TIMER(&_statistics.parse_meta_time);
if (_file_reader == nullptr) {
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _scan_params, _scan_range,
_file_reader, 0));
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _scan_params, _scan_range.path,
_scan_range.start_offset,
_scan_range.file_size, 0, _file_reader));
}
RETURN_IF_ERROR(_file_reader->open());
if (_file_reader->size() == 0) {

View File

@ -202,9 +202,9 @@ Status NewFileArrowScanner::_open_next_reader() {
}
const TFileRangeDesc& range = _ranges[_next_range++];
std::unique_ptr<FileReader> file_reader;
RETURN_IF_ERROR(FileFactory::create_file_reader(_state->exec_env(), _profile, _params,
range, file_reader));
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _params, range.path,
range.start_offset, range.file_size, 0,
file_reader));
RETURN_IF_ERROR(file_reader->open());
if (file_reader->size() == 0) {
file_reader->close();

View File

@ -139,7 +139,8 @@ Status NewFileTextScanner::_open_next_reader() {
Status NewFileTextScanner::_open_file_reader() {
const TFileRangeDesc& range = _ranges[_next_range];
RETURN_IF_ERROR(FileFactory::create_file_reader(_state->exec_env(), _profile, _params, range,
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _params, range.path,
range.start_offset, range.file_size, 0,
_cur_file_reader));
return _cur_file_reader->open();
}

View File

@ -47,7 +47,7 @@ private:
Status _line_split_to_values(const Slice& line);
Status _split_line(const Slice& line);
// Reader
std::shared_ptr<FileReader> _cur_file_reader;
std::unique_ptr<FileReader> _cur_file_reader;
LineReader* _cur_line_reader;
bool _cur_line_reader_eof;

View File

@ -30,7 +30,7 @@
#include "runtime/descriptors.h"
#include "runtime/raw_value.h"
#include "runtime/runtime_state.h"
#include "vec/exec/format/csv/vcsv_reader.h"
#include "vec/exec/format/csv/csv_reader.h"
#include "vec/exec/format/parquet/vparquet_reader.h"
#include "vec/exec/scan/new_file_scan_node.h"
#include "vec/functions/simple_function_factory.h"
@ -47,7 +47,11 @@ VFileScanner::VFileScanner(RuntimeState* state, NewFileScanNode* parent, int64_t
_cur_reader_eof(false),
_mem_pool(std::make_unique<MemPool>()),
_profile(profile),
_strict_mode(false) {}
_strict_mode(false) {
if (scan_range.params.__isset.strict_mode) {
_strict_mode = scan_range.params.strict_mode;
}
}
Status VFileScanner::prepare(
VExprContext** vconjunct_ctx_ptr,
@ -158,11 +162,11 @@ Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eo
} while (true);
// Update filtered rows and unselected rows for load, reset counter.
{
state->update_num_rows_load_filtered(_counter.num_rows_filtered);
state->update_num_rows_load_unselected(_counter.num_rows_unselected);
_reset_counter();
}
// {
// state->update_num_rows_load_filtered(_counter.num_rows_filtered);
// state->update_num_rows_load_unselected(_counter.num_rows_unselected);
// _reset_counter();
// }
return Status::OK();
}
@ -447,7 +451,6 @@ Status VFileScanner::_convert_to_output_block(Block* block) {
"filter column"));
RETURN_IF_ERROR(vectorized::Block::filter_block(block, dest_size, dest_size));
_counter.num_rows_filtered += rows - block->rows();
return Status::OK();
}
@ -461,22 +464,8 @@ Status VFileScanner::_get_next_reader() {
}
const TFileRangeDesc& range = _ranges[_next_range++];
// 1. create file reader
// TODO: Each format requires its own FileReader to achieve a special access mode,
// so create the FileReader inner the format.
std::unique_ptr<FileReader> file_reader;
if (_params.format_type != TFileFormatType::FORMAT_PARQUET) {
RETURN_IF_ERROR(FileFactory::create_file_reader(_state->exec_env(), _profile, _params,
range, file_reader));
RETURN_IF_ERROR(file_reader->open());
if (file_reader->size() == 0) {
file_reader->close();
continue;
}
}
// 2. create reader for specific format
// TODO: add csv, json, avro
// TODO: add json, avro
Status init_status;
switch (_params.format_type) {
case TFileFormatType::FORMAT_PARQUET: {
@ -488,6 +477,18 @@ Status VFileScanner::_get_next_reader() {
break;
}
case TFileFormatType::FORMAT_ORC: {
// create file reader of orc reader.
std::unique_ptr<FileReader> file_reader;
RETURN_IF_ERROR(FileFactory::create_file_reader(_profile, _params, range.path,
range.start_offset, range.file_size, 0,
file_reader));
RETURN_IF_ERROR(file_reader->open());
if (file_reader->size() == 0) {
file_reader->close();
init_status = Status::EndOfFile("Empty orc file");
break;
}
_cur_reader.reset(new ORCReaderWrap(_state, _file_slot_descs, file_reader.release(),
_num_of_columns_from_file, range.start_offset,
range.size, false));
@ -502,8 +503,8 @@ Status VFileScanner::_get_next_reader() {
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
case TFileFormatType::FORMAT_CSV_LZOP:
case TFileFormatType::FORMAT_CSV_DEFLATE: {
_cur_reader.reset(new CsvReader(_state, _profile, &_counter, _params, range,
_file_slot_descs, file_reader.release()));
_cur_reader.reset(
new CsvReader(_state, _profile, &_counter, _params, range, _file_slot_descs));
init_status = ((CsvReader*)(_cur_reader.get()))->init_reader();
break;
}

View File

@ -30,16 +30,6 @@ namespace doris::vectorized {
class NewFileScanNode;
// The counter will be passed to each scanner.
// Note that this struct is not thread safe.
// So if we support concurrent scan in the future, we need to modify this struct.
struct ScannerCounter {
ScannerCounter() : num_rows_filtered(0), num_rows_unselected(0) {}
int64_t num_rows_filtered; // unqualified rows (unmatched the dest schema, or no partition)
int64_t num_rows_unselected; // rows filtered by predicates
};
class VFileScanner : public VScanner {
public:
VFileScanner(RuntimeState* state, NewFileScanNode* parent, int64_t limit,
@ -115,7 +105,6 @@ protected:
// Profile
RuntimeProfile* _profile;
ScannerCounter _counter;
bool _scanner_eof = false;
int _rows = 0;

View File

@ -70,7 +70,11 @@ Status VScanner::get_block(RuntimeState* state, Block* block, bool* eof) {
}
Status VScanner::_filter_output_block(Block* block) {
return VExprContext::filter_block(_vconjunct_ctx, block, _output_tuple_desc->slots().size());
auto old_rows = block->rows();
Status st =
VExprContext::filter_block(_vconjunct_ctx, block, _output_tuple_desc->slots().size());
_counter.num_rows_unselected += old_rows - block->rows();
return st;
}
Status VScanner::try_append_late_arrival_runtime_filter() {
@ -116,8 +120,14 @@ Status VScanner::close(RuntimeState* state) {
}
void VScanner::_update_counters_before_close() {
if (!_state->enable_profile()) return;
LOG(INFO) << "cmy _update_counters_before_close: _counter.num_rows_filtered: "
<< _counter.num_rows_filtered
<< ", _counter.num_rows_unselected: " << _counter.num_rows_unselected;
if (!_state->enable_profile() && !_is_load) return;
COUNTER_UPDATE(_parent->_rows_read_counter, _num_rows_read);
// Update stats for load
_state->update_num_rows_load_filtered(_counter.num_rows_filtered);
_state->update_num_rows_load_unselected(_counter.num_rows_unselected);
}
} // namespace doris::vectorized

View File

@ -28,6 +28,14 @@ namespace doris::vectorized {
class Block;
class VScanNode;
// Counter for load
struct ScannerCounter {
ScannerCounter() : num_rows_filtered(0), num_rows_unselected(0) {}
int64_t num_rows_filtered; // unqualified rows (unmatched the dest schema, or no partition)
int64_t num_rows_unselected; // rows filtered by predicates
};
class VScanner {
public:
VScanner(RuntimeState* state, VScanNode* parent, int64_t limit);
@ -162,6 +170,8 @@ protected:
bool _is_load = false;
// set to true after decrease the "_num_unfinished_scanners" in scanner context
bool _is_counted_down = false;
ScannerCounter _counter;
};
} // namespace doris::vectorized

View File

@ -47,7 +47,7 @@ Status VJsonScanner<JsonReader>::get_next(vectorized::Block* output_block, bool*
auto columns = _src_block.mutate_columns();
// Get one line
while (columns[0]->size() < batch_size && !_scanner_eof) {
if (_cur_file_reader == nullptr || _cur_reader_eof) {
if (_real_reader == nullptr || _cur_reader_eof) {
RETURN_IF_ERROR(open_next_reader());
// If there isn't any more reader, break this
if (_scanner_eof) {
@ -110,7 +110,7 @@ Status VJsonScanner<JsonReader>::open_vjson_reader() {
num_as_string, fuzzy_parse));
_cur_vjson_reader.reset(new JsonReader(_state, _counter, _profile, strip_outer_array,
num_as_string, fuzzy_parse, &_scanner_eof,
_read_json_by_line ? nullptr : _cur_file_reader.get(),
_read_json_by_line ? nullptr : _real_reader,
_read_json_by_line ? _cur_line_reader : nullptr));
RETURN_IF_ERROR(_cur_vjson_reader->init(jsonpath, json_root));

View File

@ -174,8 +174,11 @@ ERRORS:
SHOW LOAD WARNINGS ON 'url'
````
where url is the url given by ErrorURL.
23. compress_type
Specify the compression type of the file. Only compressed CSV files are supported for now. Supported types: gz, lzo, bz2, lz4, lzop, deflate.
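For example, a gzip-compressed CSV stream load could be submitted like this (hypothetical database, table and file names; the `compress_type` header carries the value described above):
```
curl --location-trusted -u user:passwd \
    -H "format: csv" \
    -H "compress_type: gz" \
    -T data.csv.gz \
    -XPUT http://fe_host:http_port/api/example_db/example_tbl/_stream_load
```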
### Example

View File

@ -135,7 +135,7 @@ curl --location-trusted -u user:passwd [-H ""...] -T data.file -XPUT http://fe_h
21. send_batch_parallelism: Integer. Used to set the parallelism of sending batch data. If the parallelism exceeds `max_send_batch_parallelism_per_job` in the BE configuration, the coordinating BE will use the value of `max_send_batch_parallelism_per_job` instead.
22. hidden_columns: Used to specify the hidden columns contained in the imported data. It takes effect when the header does not contain `columns`; multiple hidden columns are separated by commas.
```
hidden_columns: __DORIS_DELETE_SIGN__,__DORIS_SEQUENCE_COL__
The system will import the data using the columns specified by the user. In the above example, the last column of the imported data is __DORIS_SEQUENCE_COL__.
```
@ -166,11 +166,14 @@ ERRORS:
You can view the detailed load error information with the following statement:
```sql
SHOW LOAD WARNINGS ON 'url
SHOW LOAD WARNINGS ON 'url'
```
where url is the url given by ErrorURL.
23. compress_type
Specifies the compression format of the file. Currently only CSV file compression is supported. Supported formats: gz, lzo, bz2, lz4, lzop, deflate.
### Example

View File

@ -17,6 +17,7 @@
package org.apache.doris.analysis;
import org.apache.doris.analysis.StorageBackend.StorageType;
import org.apache.doris.backup.BlobStorage;
import org.apache.doris.common.io.Text;
import org.apache.doris.common.io.Writable;
@ -53,6 +54,13 @@ public class BrokerDesc extends StorageDesc implements Writable {
this.storageType = StorageBackend.StorageType.BROKER;
}
// for empty broker desc
public BrokerDesc(String name) {
this.name = name;
this.properties = Maps.newHashMap();
this.storageType = StorageType.LOCAL;
}
public BrokerDesc(String name, Map<String, String> properties) {
this.name = name;
this.properties = properties;
@ -77,6 +85,11 @@ public class BrokerDesc extends StorageDesc implements Writable {
tryConvertToS3();
}
public static BrokerDesc createForStreamLoad() {
BrokerDesc brokerDesc = new BrokerDesc("", StorageType.STREAM, null);
return brokerDesc;
}
public String getName() {
return name;
}
@ -94,19 +107,19 @@ public class BrokerDesc extends StorageDesc implements Writable {
}
public TFileType getFileType() {
if (storageType == StorageBackend.StorageType.LOCAL) {
return TFileType.FILE_LOCAL;
switch (storageType) {
case LOCAL:
return TFileType.FILE_LOCAL;
case S3:
return TFileType.FILE_S3;
case HDFS:
return TFileType.FILE_HDFS;
case STREAM:
return TFileType.FILE_STREAM;
case BROKER:
default:
return TFileType.FILE_BROKER;
}
if (storageType == StorageBackend.StorageType.BROKER) {
return TFileType.FILE_BROKER;
}
if (storageType == StorageBackend.StorageType.S3) {
return TFileType.FILE_S3;
}
if (storageType == StorageBackend.StorageType.HDFS) {
return TFileType.FILE_HDFS;
}
return TFileType.FILE_BROKER;
}
public StorageBackend.StorageType storageType() {

View File

@ -28,9 +28,13 @@ import org.apache.doris.common.ErrorCode;
import org.apache.doris.common.ErrorReport;
import org.apache.doris.common.Pair;
import org.apache.doris.common.util.SqlParserUtils;
import org.apache.doris.common.util.Util;
import org.apache.doris.load.loadv2.LoadTask;
import org.apache.doris.mysql.privilege.PrivPredicate;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.task.LoadTaskInfo;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TNetworkAddress;
import com.google.common.base.Function;
@ -94,7 +98,8 @@ public class DataDescription {
private final PartitionNames partitionNames;
private final List<String> filePaths;
private final Separator columnSeparator;
private final String fileFormat;
private String fileFormat;
private TFileCompressType compressType = TFileCompressType.UNKNOWN;
private final boolean isNegative;
// column names in the path
private final List<String> columnsFromPath;
@ -210,9 +215,57 @@ public class DataDescription {
this.properties = properties;
}
// For stream load using external file scan node.
public DataDescription(String tableName, LoadTaskInfo taskInfo) {
this.tableName = tableName;
this.partitionNames = taskInfo.getPartitions();
// Add a dummy path to just make analyze() happy.
// Stream load does not need this field.
this.filePaths = Lists.newArrayList("dummy");
this.fileFieldNames = taskInfo.getColumnExprDescs().getFileColNames();
this.columnSeparator = taskInfo.getColumnSeparator();
this.lineDelimiter = taskInfo.getLineDelimiter();
getFileFormatAndCompressType(taskInfo);
this.columnsFromPath = null;
this.isNegative = taskInfo.getNegative();
this.columnMappingList = taskInfo.getColumnExprDescs().getColumnMappingList();
this.precedingFilterExpr = taskInfo.getPrecedingFilter();
this.whereExpr = taskInfo.getWhereExpr();
this.srcTableName = null;
this.mergeType = taskInfo.getMergeType();
this.deleteCondition = taskInfo.getDeleteCondition();
this.sequenceCol = taskInfo.getSequenceCol();
this.stripOuterArray = taskInfo.isStripOuterArray();
this.jsonPaths = taskInfo.getJsonPaths();
this.jsonRoot = taskInfo.getJsonRoot();
this.fuzzyParse = taskInfo.isFuzzyParse();
this.readJsonByLine = taskInfo.isReadJsonByLine();
this.numAsString = taskInfo.isNumAsString();
this.properties = Maps.newHashMap();
}
private void getFileFormatAndCompressType(LoadTaskInfo taskInfo) {
// get file format
if (!Strings.isNullOrEmpty(taskInfo.getHeaderType())) {
// for "csv_with_name" and "csv_with_name_and_type"
this.fileFormat = taskInfo.getHeaderType();
} else {
TFileFormatType type = taskInfo.getFormatType();
if (Util.isCsvFormat(type)) {
// ignore the "compress type" in format, such as FORMAT_CSV_GZ
// the compress type is saved in "compressType"
this.fileFormat = "csv";
} else {
this.fileFormat = "json";
}
}
// get compress type
this.compressType = taskInfo.getCompressType();
}
public static void validateMappingFunction(String functionName, List<String> args,
Map<String, String> columnNameMap,
Column mappingColumn, boolean isHadoopLoad) throws AnalysisException {
Map<String, String> columnNameMap,
Column mappingColumn, boolean isHadoopLoad) throws AnalysisException {
if (functionName.equalsIgnoreCase("alignment_timestamp")) {
validateAlignmentTimestamp(args, columnNameMap);
} else if (functionName.equalsIgnoreCase("strftime")) {
@ -425,6 +478,10 @@ public class DataDescription {
return fileFormat;
}
public TFileCompressType getCompressType() {
return compressType;
}
public List<String> getColumnsFromPath() {
return columnsFromPath;
}

View File

@ -17,8 +17,11 @@
package org.apache.doris.analysis;
import org.apache.doris.analysis.BinaryPredicate.Operator;
import org.apache.doris.catalog.Column;
import com.google.common.base.Preconditions;
public class ImportColumnDesc {
private String columnName;
private Expr expr;
@ -59,6 +62,12 @@ public class ImportColumnDesc {
return expr == null;
}
public Expr toBinaryPredicate() {
Preconditions.checkState(!isColumn());
BinaryPredicate pred = new BinaryPredicate(Operator.EQ, new SlotRef(null, columnName), expr);
return pred;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
@ -68,4 +77,5 @@ public class ImportColumnDesc {
}
return sb.toString();
}
}

View File

@ -87,6 +87,25 @@ public class LargeIntLiteral extends LiteralExpr {
analysisDone();
}
public LargeIntLiteral(BigDecimal value) throws AnalysisException {
super();
BigInteger bigInt;
try {
bigInt = new BigInteger(value.toPlainString());
// ATTN: the value from 'sql_parser.y' is always positive. For example, '-256' becomes
// 256, and for int8_t, 256 is invalid while -256 is valid. So we check the right border
// against LARGE_INT_MAX_ABS
if (bigInt.compareTo(LARGE_INT_MIN) < 0 || bigInt.compareTo(LARGE_INT_MAX_ABS) > 0) {
throw new AnalysisException("Large int literal is out of range: " + value);
}
} catch (NumberFormatException e) {
throw new AnalysisException("Invalid integer literal: " + value, e);
}
this.value = bigInt;
type = Type.LARGEINT;
analysisDone();
}
protected LargeIntLiteral(LargeIntLiteral other) {
super(other);
value = other.value;

View File

@ -31,6 +31,7 @@ import org.apache.doris.common.UserException;
import org.apache.doris.common.util.BrokerUtil;
import org.apache.doris.common.util.ParseUtil;
import org.apache.doris.common.util.PrintableMap;
import org.apache.doris.common.util.Util;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TParquetCompressionType;
@ -563,7 +564,7 @@ public class OutFileClause {
analyzeBrokerDesc(processedPropKeys);
if (properties.containsKey(PROP_COLUMN_SEPARATOR)) {
if (!isCsvFormat()) {
if (!Util.isCsvFormat(fileFormatType)) {
throw new AnalysisException(PROP_COLUMN_SEPARATOR + " is only for CSV format");
}
columnSeparator = Separator.convertSeparator(properties.get(PROP_COLUMN_SEPARATOR));
@ -571,7 +572,7 @@ public class OutFileClause {
}
if (properties.containsKey(PROP_LINE_DELIMITER)) {
if (!isCsvFormat()) {
if (!Util.isCsvFormat(fileFormatType)) {
throw new AnalysisException(PROP_LINE_DELIMITER + " is only for CSV format");
}
lineDelimiter = Separator.convertSeparator(properties.get(PROP_LINE_DELIMITER));
@ -772,16 +773,6 @@ public class OutFileClause {
processedPropKeys.add(SCHEMA);
}
private boolean isCsvFormat() {
return fileFormatType == TFileFormatType.FORMAT_CSV_BZ2
|| fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE
|| fileFormatType == TFileFormatType.FORMAT_CSV_GZ
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZ4FRAME
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZO
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZOP
|| fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN;
}
private boolean isParquetFormat() {
return fileFormatType == TFileFormatType.FORMAT_PARQUET;
}
@ -817,7 +808,7 @@ public class OutFileClause {
public TResultFileSinkOptions toSinkOptions() {
TResultFileSinkOptions sinkOptions = new TResultFileSinkOptions(filePath, fileFormatType);
if (isCsvFormat()) {
if (Util.isCsvFormat(fileFormatType)) {
sinkOptions.setColumnSeparator(columnSeparator);
sinkOptions.setLineDelimiter(lineDelimiter);
}

View File

@ -114,7 +114,8 @@ public class StorageBackend extends StorageDesc implements ParseNode {
S3("Amazon S3 Simple Storage Service"),
HDFS("Hadoop Distributed File System"),
LOCAL("Local file system"),
OFS("Tencent CHDFS");
OFS("Tencent CHDFS"),
STREAM("Stream load pipe");
private final String description;

View File

@ -24,12 +24,14 @@ import org.apache.doris.common.Config;
import org.apache.doris.common.FeNameFormat;
import org.apache.doris.datasource.InternalCatalog;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.thrift.TFileFormatType;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jetbrains.annotations.NotNull;
import java.io.BufferedReader;
import java.io.DataInput;
@ -520,4 +522,35 @@ public class Util {
}
return new String(hexChars);
}
@NotNull
public static TFileFormatType getFileFormatType(String path) {
String lowerCasePath = path.toLowerCase();
if (lowerCasePath.endsWith(".parquet") || lowerCasePath.endsWith(".parq")) {
return TFileFormatType.FORMAT_PARQUET;
} else if (lowerCasePath.endsWith(".gz")) {
return TFileFormatType.FORMAT_CSV_GZ;
} else if (lowerCasePath.endsWith(".bz2")) {
return TFileFormatType.FORMAT_CSV_BZ2;
} else if (lowerCasePath.endsWith(".lz4")) {
return TFileFormatType.FORMAT_CSV_LZ4FRAME;
} else if (lowerCasePath.endsWith(".lzo")) {
return TFileFormatType.FORMAT_CSV_LZOP;
} else if (lowerCasePath.endsWith(".deflate")) {
return TFileFormatType.FORMAT_CSV_DEFLATE;
} else {
return TFileFormatType.FORMAT_CSV_PLAIN;
}
}
public static boolean isCsvFormat(TFileFormatType fileFormatType) {
return fileFormatType == TFileFormatType.FORMAT_CSV_BZ2
|| fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE
|| fileFormatType == TFileFormatType.FORMAT_CSV_GZ
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZ4FRAME
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZO
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZOP
|| fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN;
}
}

View File

@ -39,6 +39,7 @@ import org.apache.doris.common.Pair;
import org.apache.doris.common.io.Text;
import org.apache.doris.common.io.Writable;
import org.apache.doris.load.loadv2.LoadTask;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TNetworkAddress;
import com.google.common.base.Strings;
@ -66,6 +67,7 @@ public class BrokerFileGroup implements Writable {
private String lineDelimiter;
// fileFormat may be null, which means format will be decided by file's suffix
private String fileFormat;
private TFileCompressType compressType = TFileCompressType.UNKNOWN;
private boolean isNegative;
private List<Long> partitionIds; // can be null, means no partition specified
private List<String> filePaths;
@ -217,6 +219,7 @@ public class BrokerFileGroup implements Writable {
throw new DdlException("File Format Type " + fileFormat + " is invalid.");
}
}
compressType = dataDescription.getCompressType();
isNegative = dataDescription.isNegative();
// FilePath
@ -276,6 +279,10 @@ public class BrokerFileGroup implements Writable {
return fileFormat;
}
public TFileCompressType getCompressType() {
return compressType;
}
public boolean isNegative() {
return isNegative;
}
@ -405,7 +412,7 @@ public class BrokerFileGroup implements Writable {
// null means default: csv
return false;
}
return fileFormat.toLowerCase().equals("parquet") || fileFormat.toLowerCase().equals("orc");
return fileFormat.equalsIgnoreCase("parquet") || fileFormat.equalsIgnoreCase("orc");
}
@Override

View File

@ -146,7 +146,7 @@ public class LoadingTaskPlanner {
// 1. Broker scan node
ScanNode scanNode;
if (Config.enable_new_load_scan_node) {
scanNode = new ExternalFileScanNode(new PlanNodeId(nextNodeId++), scanTupleDesc, "FileScanNode");
scanNode = new ExternalFileScanNode(new PlanNodeId(nextNodeId++), scanTupleDesc);
((ExternalFileScanNode) scanNode).setLoadInfo(loadJobId, txnId, table, brokerDesc, fileGroups,
fileStatusesList, filesAdded, strictMode, loadParallelism, userInfo);
} else {

View File

@ -39,6 +39,7 @@ import org.apache.doris.common.util.SmallFileMgr;
import org.apache.doris.common.util.SmallFileMgr.SmallFile;
import org.apache.doris.common.util.TimeUtils;
import org.apache.doris.persist.AlterRoutineLoadJobOperationLog;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.transaction.TransactionState;
import org.apache.doris.transaction.TransactionStatus;
@ -724,6 +725,11 @@ public class KafkaRoutineLoadJob extends RoutineLoadJob {
return gson.toJson(partitionIdToOffsetLag);
}
@Override
public TFileCompressType getCompressType() {
return TFileCompressType.PLAIN;
}
@Override
public double getMaxFilterRatio() {
// for kafka routine load, the max filter ratio is always 1, because it use max error num instead of this.

View File

@ -38,6 +38,7 @@ import org.apache.doris.common.DdlException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.BrokerUtil;
import org.apache.doris.common.util.Util;
import org.apache.doris.common.util.VectorizedUtil;
import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.load.Load;
@ -460,22 +461,7 @@ public class BrokerScanNode extends LoadScanNode {
}
}
String lowerCasePath = path.toLowerCase();
if (lowerCasePath.endsWith(".parquet") || lowerCasePath.endsWith(".parq")) {
return TFileFormatType.FORMAT_PARQUET;
} else if (lowerCasePath.endsWith(".gz")) {
return TFileFormatType.FORMAT_CSV_GZ;
} else if (lowerCasePath.endsWith(".bz2")) {
return TFileFormatType.FORMAT_CSV_BZ2;
} else if (lowerCasePath.endsWith(".lz4")) {
return TFileFormatType.FORMAT_CSV_LZ4FRAME;
} else if (lowerCasePath.endsWith(".lzo")) {
return TFileFormatType.FORMAT_CSV_LZOP;
} else if (lowerCasePath.endsWith(".deflate")) {
return TFileFormatType.FORMAT_CSV_DEFLATE;
} else {
return TFileFormatType.FORMAT_CSV_PLAIN;
}
return Util.getFileFormatType(path);
}
public String getHostUri() throws UserException {

View File

@ -1940,7 +1940,7 @@ public class SingleNodePlanner {
"TableValuedFunctionScanNode", ((TableValuedFunctionRef) tblRef).getTableFunction());
break;
case HMS_EXTERNAL_TABLE:
scanNode = new ExternalFileScanNode(ctx.getNextNodeId(), tblRef.getDesc(), "HMS_FILE_SCAN_NODE");
scanNode = new ExternalFileScanNode(ctx.getNextNodeId(), tblRef.getDesc());
break;
case ES_EXTERNAL_TABLE:
scanNode = new EsScanNode(ctx.getNextNodeId(), tblRef.getDesc(), "EsScanNode", true);

View File

@ -18,6 +18,8 @@
package org.apache.doris.planner;
import org.apache.doris.analysis.Analyzer;
import org.apache.doris.analysis.BrokerDesc;
import org.apache.doris.analysis.DataDescription;
import org.apache.doris.analysis.DescriptorTable;
import org.apache.doris.analysis.Expr;
import org.apache.doris.analysis.ImportColumnDesc;
@ -40,12 +42,16 @@ import org.apache.doris.common.DdlException;
import org.apache.doris.common.ErrorCode;
import org.apache.doris.common.ErrorReport;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.Util;
import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.load.LoadErrorHub;
import org.apache.doris.load.loadv2.LoadTask;
import org.apache.doris.load.routineload.RoutineLoadJob;
import org.apache.doris.planner.external.ExternalFileScanNode;
import org.apache.doris.service.FrontendOptions;
import org.apache.doris.task.LoadTaskInfo;
import org.apache.doris.thrift.PaloInternalServiceVersion;
import org.apache.doris.thrift.TBrokerFileStatus;
import org.apache.doris.thrift.TExecPlanFragmentParams;
import org.apache.doris.thrift.TLoadErrorHubInfo;
import org.apache.doris.thrift.TNetworkAddress;
@ -85,9 +91,8 @@ public class StreamLoadPlanner {
private Analyzer analyzer;
private DescriptorTable descTable;
private StreamLoadScanNode scanNode;
private ScanNode scanNode;
private TupleDescriptor tupleDesc;
private TupleDescriptor scanTupleDesc;
public StreamLoadPlanner(Database db, OlapTable destTable, LoadTaskInfo taskInfo) {
this.db = db;
@ -167,13 +172,35 @@ public class StreamLoadPlanner {
}
// create scan node
scanNode = new StreamLoadScanNode(loadId, new PlanNodeId(0), scanTupleDesc, destTable, taskInfo);
if (Config.enable_new_load_scan_node) {
ExternalFileScanNode fileScanNode = new ExternalFileScanNode(new PlanNodeId(0), scanTupleDesc);
if (!Util.isCsvFormat(taskInfo.getFormatType())) {
throw new AnalysisException(
"New stream load scan load not support non-csv type now: " + taskInfo.getFormatType());
}
// 1. create file group
DataDescription dataDescription = new DataDescription(destTable.getName(), taskInfo);
dataDescription.analyzeWithoutCheckPriv(db.getFullName());
BrokerFileGroup fileGroup = new BrokerFileGroup(dataDescription);
fileGroup.parse(db, dataDescription);
// 2. create dummy file status
TBrokerFileStatus fileStatus = new TBrokerFileStatus();
fileStatus.setPath("");
fileStatus.setIsDir(false);
fileStatus.setSize(-1); // must be -1 for stream load, meaning read to the end of the stream.
fileScanNode.setLoadInfo(loadId, taskInfo.getTxnId(), destTable, BrokerDesc.createForStreamLoad(),
fileGroup, fileStatus, taskInfo.isStrictMode(), taskInfo.getFileType());
scanNode = fileScanNode;
} else {
scanNode = new StreamLoadScanNode(loadId, new PlanNodeId(0), scanTupleDesc, destTable, taskInfo);
}
scanNode.init(analyzer);
descTable.computeStatAndMemLayout();
scanNode.finalize(analyzer);
if (Config.enable_vectorized_load) {
scanNode.convertToVectoriezd();
}
descTable.computeStatAndMemLayout();
int timeout = taskInfo.getTimeout();
if (taskInfo instanceof RoutineLoadJob) {

View File

@ -143,7 +143,7 @@ public class StreamLoadScanNode extends LoadScanNode {
}
if (params.getSrcSlotIds() == null) {
params.setSrcSlotIds(new java.util.ArrayList<java.lang.Integer>());
params.setSrcSlotIds(Lists.newArrayList());
}
Load.initColumns(dstTable, columnExprDescs, null /* no hadoop function */, exprsByName, analyzer, srcTupleDesc,
slotDescByName, params.getSrcSlotIds(), taskInfo.getFormatType(), taskInfo.getHiddenColumns(),

View File

@ -49,9 +49,11 @@ import org.apache.doris.thrift.TExplainLevel;
import org.apache.doris.thrift.TExpr;
import org.apache.doris.thrift.TFileScanNode;
import org.apache.doris.thrift.TFileScanRangeParams;
import org.apache.doris.thrift.TFileType;
import org.apache.doris.thrift.TPlanNode;
import org.apache.doris.thrift.TPlanNodeType;
import org.apache.doris.thrift.TScanRangeLocations;
import org.apache.doris.thrift.TUniqueId;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
@ -123,23 +125,32 @@ public class ExternalFileScanNode extends ExternalScanNode {
* 1. Query hms table
* 2. Load from file
*/
public ExternalFileScanNode(PlanNodeId id, TupleDescriptor desc, String planNodeName) {
super(id, desc, planNodeName, StatisticalType.FILE_SCAN_NODE);
public ExternalFileScanNode(PlanNodeId id, TupleDescriptor desc) {
super(id, desc, "EXTERNAL_FILE_SCAN_NODE", StatisticalType.FILE_SCAN_NODE);
}
// Only for load job.
// Only for broker load job.
public void setLoadInfo(long loadJobId, long txnId, Table targetTable, BrokerDesc brokerDesc,
List<BrokerFileGroup> fileGroups, List<List<TBrokerFileStatus>> fileStatusesList, int filesAdded,
boolean strictMode, int loadParallelism, UserIdentity userIdentity) {
Preconditions.checkState(fileGroups.size() == fileStatusesList.size());
for (int i = 0; i < fileGroups.size(); ++i) {
FileGroupInfo fileGroupInfo = new FileGroupInfo(loadJobId, txnId, targetTable, brokerDesc,
fileGroups.get(i), fileStatusesList.get(i), filesAdded, strictMode, loadParallelism, userIdentity);
fileGroups.get(i), fileStatusesList.get(i), filesAdded, strictMode, loadParallelism);
fileGroupInfos.add(fileGroupInfo);
}
this.type = Type.LOAD;
}
// Only for stream load/routine load job.
public void setLoadInfo(TUniqueId loadId, long txnId, Table targetTable, BrokerDesc brokerDesc,
BrokerFileGroup fileGroup, TBrokerFileStatus fileStatus, boolean strictMode, TFileType fileType) {
FileGroupInfo fileGroupInfo = new FileGroupInfo(loadId, txnId, targetTable, brokerDesc,
fileGroup, fileStatus, strictMode, fileType);
fileGroupInfos.add(fileGroupInfo);
this.type = Type.LOAD;
}
@Override
public void init(Analyzer analyzer) throws UserException {
super.init(analyzer);

View File

@ -19,7 +19,6 @@ package org.apache.doris.planner.external;
import org.apache.doris.analysis.BrokerDesc;
import org.apache.doris.analysis.StorageBackend;
import org.apache.doris.analysis.UserIdentity;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.FsBroker;
import org.apache.doris.catalog.Table;
@ -28,6 +27,7 @@ import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.BrokerUtil;
import org.apache.doris.common.util.Util;
import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.planner.external.ExternalFileScanNode.ParamCreateContext;
import org.apache.doris.system.Backend;
@ -37,11 +37,14 @@ import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileRangeDesc;
import org.apache.doris.thrift.TFileScanRange;
import org.apache.doris.thrift.TFileScanRangeParams;
import org.apache.doris.thrift.TFileType;
import org.apache.doris.thrift.TNetworkAddress;
import org.apache.doris.thrift.TScanRange;
import org.apache.doris.thrift.TScanRangeLocation;
import org.apache.doris.thrift.TScanRangeLocations;
import org.apache.doris.thrift.TUniqueId;
import com.google.common.collect.Lists;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@ -59,6 +62,14 @@ public class FileGroupInfo {
private static final String HIVE_DEFAULT_COLUMN_SEPARATOR = "\001";
private static final String HIVE_DEFAULT_LINE_DELIMITER = "\n";
public enum JobType {
BULK_LOAD,
STREAM_LOAD
}
private JobType jobType;
private TUniqueId loadId;
private long loadJobId;
private long txnId;
private Table targetTable;
@ -68,13 +79,16 @@ public class FileGroupInfo {
private int filesAdded;
private boolean strictMode;
private int loadParallelism;
private UserIdentity userIdentity;
// set by getFileStatusAndCalcInstance
long bytesPerInstance = 0;
private long bytesPerInstance = 0;
// used for stream load, FILE_LOCAL or FILE_STREAM
private TFileType fileType;
// for broker load
public FileGroupInfo(long loadJobId, long txnId, Table targetTable, BrokerDesc brokerDesc,
BrokerFileGroup fileGroup, List<TBrokerFileStatus> fileStatuses, int filesAdded, boolean strictMode,
int loadParallelism, UserIdentity userIdentity) {
int loadParallelism) {
this.jobType = JobType.BULK_LOAD;
this.loadJobId = loadJobId;
this.txnId = txnId;
this.targetTable = targetTable;
@ -84,7 +98,22 @@ public class FileGroupInfo {
this.filesAdded = filesAdded;
this.strictMode = strictMode;
this.loadParallelism = loadParallelism;
this.userIdentity = userIdentity;
}
// for stream load
public FileGroupInfo(TUniqueId loadId, long txnId, Table targetTable, BrokerDesc brokerDesc,
BrokerFileGroup fileGroup, TBrokerFileStatus fileStatus, boolean strictMode, TFileType fileType) {
this.jobType = JobType.STREAM_LOAD;
this.loadId = loadId;
this.txnId = txnId;
this.targetTable = targetTable;
this.brokerDesc = brokerDesc;
this.fileGroup = fileGroup;
this.fileStatuses = Lists.newArrayList();
this.fileStatuses.add(fileStatus);
this.filesAdded = 1;
this.strictMode = strictMode;
this.fileType = fileType;
}
public Table getTargetTable() {
@ -111,10 +140,6 @@ public class FileGroupInfo {
return loadParallelism;
}
public UserIdentity getUserIdentity() {
return userIdentity;
}
public String getExplainString(String prefix) {
StringBuilder sb = new StringBuilder();
sb.append("file scan\n");
@ -126,19 +151,26 @@ public class FileGroupInfo {
throw new UserException("No source file in this table(" + targetTable.getName() + ").");
}
long totalBytes = 0;
for (TBrokerFileStatus fileStatus : fileStatuses) {
totalBytes += fileStatus.size;
}
int numInstances = (int) (totalBytes / Config.min_bytes_per_broker_scanner);
int totalLoadParallelism = loadParallelism * backendPolicy.numBackends();
numInstances = Math.min(totalLoadParallelism, numInstances);
numInstances = Math.min(numInstances, Config.max_broker_concurrency);
numInstances = Math.max(1, numInstances);
int numInstances = 1;
if (jobType == JobType.BULK_LOAD) {
long totalBytes = 0;
for (TBrokerFileStatus fileStatus : fileStatuses) {
totalBytes += fileStatus.size;
}
numInstances = (int) (totalBytes / Config.min_bytes_per_broker_scanner);
int totalLoadParallelism = loadParallelism * backendPolicy.numBackends();
numInstances = Math.min(totalLoadParallelism, numInstances);
numInstances = Math.min(numInstances, Config.max_broker_concurrency);
numInstances = Math.max(1, numInstances);
bytesPerInstance = totalBytes / numInstances + 1;
if (bytesPerInstance > Config.max_bytes_per_broker_scanner) {
throw new UserException("Scan bytes per file scanner exceed limit: " + Config.max_bytes_per_broker_scanner);
bytesPerInstance = totalBytes / numInstances + 1;
if (bytesPerInstance > Config.max_bytes_per_broker_scanner) {
throw new UserException(
"Scan bytes per file scanner exceed limit: " + Config.max_bytes_per_broker_scanner);
}
} else {
// stream load, no need to split
bytesPerInstance = Long.MAX_VALUE;
}
LOG.info("number instance of file scan node is: {}, bytes per instance: {}", numInstances, bytesPerInstance);
}
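For concreteness, a worked example of the split above (all numbers hypothetical, not Doris defaults): with totalBytes = 8 GB, min_bytes_per_broker_scanner = 64 MB, loadParallelism = 2, 4 backends and max_broker_concurrency = 10, the bulk-load branch computes 8 GB / 64 MB = 128 instances, capped first by 2 * 4 = 8 and then by 10, so numInstances = 8 and bytesPerInstance = 8 GB / 8 + 1, roughly 1 GB per instance. The stream-load branch skips all of this: a single instance with bytesPerInstance = Long.MAX_VALUE, so the one pipe is never split.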
@ -156,7 +188,9 @@ public class FileGroupInfo {
TFileFormatType formatType = formatType(context.fileGroup.getFileFormat(), fileStatus.path);
List<String> columnsFromPath = BrokerUtil.parseColumnsFromPath(fileStatus.path,
context.fileGroup.getColumnNamesFromPath());
if (tmpBytes > bytesPerInstance) {
// Assign scan range locations only for broker load.
// Stream load has only one file and does not need multiple scan ranges.
if (tmpBytes > bytesPerInstance && jobType != JobType.STREAM_LOAD) {
// Currently only plain text files can be split
if ((formatType == TFileFormatType.FORMAT_CSV_PLAIN && fileStatus.isSplitable)
|| formatType == TFileFormatType.FORMAT_JSON) {
@ -224,69 +258,55 @@ public class FileGroupInfo {
TScanRangeLocations locations = new TScanRangeLocations();
locations.setScanRange(scanRange);
TScanRangeLocation location = new TScanRangeLocation();
location.setBackendId(selectedBackend.getId());
location.setServer(new TNetworkAddress(selectedBackend.getHost(), selectedBackend.getBePort()));
locations.addToLocations(location);
if (jobType == JobType.BULK_LOAD) {
TScanRangeLocation location = new TScanRangeLocation();
location.setBackendId(selectedBackend.getId());
location.setServer(new TNetworkAddress(selectedBackend.getHost(), selectedBackend.getBePort()));
locations.addToLocations(location);
} else {
// stream load does not need locations
locations.setLocations(Lists.newArrayList());
}
return locations;
}
private String getHeaderType(String formatType) {
if (formatType != null) {
if (formatType.toLowerCase().equals(FeConstants.csv_with_names) || formatType.toLowerCase()
.equals(FeConstants.csv_with_names_and_types)) {
return formatType;
}
}
return "";
}
private TFileFormatType formatType(String fileFormat, String path) throws UserException {
if (fileFormat != null) {
if (fileFormat.toLowerCase().equals("parquet")) {
if (fileFormat.equalsIgnoreCase("parquet")) {
return TFileFormatType.FORMAT_PARQUET;
} else if (fileFormat.toLowerCase().equals("orc")) {
} else if (fileFormat.equalsIgnoreCase("orc")) {
return TFileFormatType.FORMAT_ORC;
} else if (fileFormat.toLowerCase().equals("json")) {
} else if (fileFormat.equalsIgnoreCase("json")) {
return TFileFormatType.FORMAT_JSON;
// csv/csv_with_names/csv_with_names_and_types are treated as csv format
} else if (fileFormat.toLowerCase().equals(FeConstants.csv) || fileFormat.toLowerCase()
} else if (fileFormat.equalsIgnoreCase(FeConstants.csv) || fileFormat.toLowerCase()
.equals(FeConstants.csv_with_names) || fileFormat.toLowerCase()
.equals(FeConstants.csv_with_names_and_types)
// TODO: Add TEXTFILE to TFileFormatType to support hive text file format.
|| fileFormat.toLowerCase().equals(FeConstants.text)) {
|| fileFormat.equalsIgnoreCase(FeConstants.text)) {
return TFileFormatType.FORMAT_CSV_PLAIN;
} else {
throw new UserException("Not supported file format: " + fileFormat);
}
}
String lowerCasePath = path.toLowerCase();
if (lowerCasePath.endsWith(".parquet") || lowerCasePath.endsWith(".parq")) {
return TFileFormatType.FORMAT_PARQUET;
} else if (lowerCasePath.endsWith(".gz")) {
return TFileFormatType.FORMAT_CSV_GZ;
} else if (lowerCasePath.endsWith(".bz2")) {
return TFileFormatType.FORMAT_CSV_BZ2;
} else if (lowerCasePath.endsWith(".lz4")) {
return TFileFormatType.FORMAT_CSV_LZ4FRAME;
} else if (lowerCasePath.endsWith(".lzo")) {
return TFileFormatType.FORMAT_CSV_LZOP;
} else if (lowerCasePath.endsWith(".deflate")) {
return TFileFormatType.FORMAT_CSV_DEFLATE;
} else {
return TFileFormatType.FORMAT_CSV_PLAIN;
}
return Util.getFileFormatType(path);
}
private TFileRangeDesc createFileRangeDesc(long curFileOffset, TBrokerFileStatus fileStatus, long rangeBytes,
List<String> columnsFromPath) {
TFileRangeDesc rangeDesc = new TFileRangeDesc();
rangeDesc.setPath(fileStatus.path);
rangeDesc.setStartOffset(curFileOffset);
rangeDesc.setSize(rangeBytes);
rangeDesc.setColumnsFromPath(columnsFromPath);
if (jobType == JobType.BULK_LOAD) {
rangeDesc.setPath(fileStatus.path);
rangeDesc.setStartOffset(curFileOffset);
rangeDesc.setSize(rangeBytes);
rangeDesc.setColumnsFromPath(columnsFromPath);
} else {
rangeDesc.setLoadId(loadId);
rangeDesc.setSize(fileStatus.size);
}
return rangeDesc;
}
}

View File

@ -30,6 +30,7 @@ import org.apache.doris.common.FeConstants;
import org.apache.doris.common.MetaNotFoundException;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.BrokerUtil;
import org.apache.doris.common.util.Util;
import org.apache.doris.common.util.VectorizedUtil;
import org.apache.doris.load.BrokerFileGroup;
import org.apache.doris.load.Load;
@ -98,7 +99,8 @@ public class LoadScanProvider implements FileScanProviderIf {
ctx.timezone = analyzer.getTimezone();
TFileScanRangeParams params = new TFileScanRangeParams();
params.format_type = formatType(fileGroupInfo.getFileGroup().getFileFormat(), "");
params.setFormatType(formatType(fileGroupInfo.getFileGroup().getFileFormat(), ""));
params.setCompressType(fileGroupInfo.getFileGroup().getCompressType());
params.setStrictMode(fileGroupInfo.isStrictMode());
params.setProperties(fileGroupInfo.getBrokerDesc().getProperties());
if (fileGroupInfo.getBrokerDesc().getFileType() == TFileType.FILE_HDFS) {
@ -233,23 +235,9 @@ public class LoadScanProvider implements FileScanProviderIf {
} else {
throw new UserException("Not supported file format: " + fileFormat);
}
}
String lowerCasePath = path.toLowerCase();
if (lowerCasePath.endsWith(".parquet") || lowerCasePath.endsWith(".parq")) {
return TFileFormatType.FORMAT_PARQUET;
} else if (lowerCasePath.endsWith(".gz")) {
return TFileFormatType.FORMAT_CSV_GZ;
} else if (lowerCasePath.endsWith(".bz2")) {
return TFileFormatType.FORMAT_CSV_BZ2;
} else if (lowerCasePath.endsWith(".lz4")) {
return TFileFormatType.FORMAT_CSV_LZ4FRAME;
} else if (lowerCasePath.endsWith(".lzo")) {
return TFileFormatType.FORMAT_CSV_LZOP;
} else if (lowerCasePath.endsWith(".deflate")) {
return TFileFormatType.FORMAT_CSV_DEFLATE;
} else {
return TFileFormatType.FORMAT_CSV_PLAIN;
// get the file format from the file suffix
return Util.getFileFormatType(path);
}
}

View File

@ -22,6 +22,7 @@ import org.apache.doris.analysis.ImportColumnDesc;
import org.apache.doris.analysis.PartitionNames;
import org.apache.doris.analysis.Separator;
import org.apache.doris.load.loadv2.LoadTask;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileType;
@ -54,6 +55,8 @@ public interface LoadTaskInfo {
TFileFormatType getFormatType();
TFileCompressType getCompressType();
String getJsonPaths();
String getJsonRoot();
@ -93,5 +96,25 @@ public interface LoadTaskInfo {
class ImportColumnDescs {
public List<ImportColumnDesc> descs = Lists.newArrayList();
public boolean isColumnDescsRewrited = false;
public List<String> getFileColNames() {
List<String> colNames = Lists.newArrayList();
for (ImportColumnDesc desc : descs) {
if (desc.isColumn()) {
colNames.add(desc.getColumnName());
}
}
return colNames;
}
public List<Expr> getColumnMappingList() {
List<Expr> exprs = Lists.newArrayList();
for (ImportColumnDesc desc : descs) {
if (!desc.isColumn()) {
exprs.add(desc.toBinaryPredicate());
}
}
return exprs;
}
}
}
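A small hypothetical illustration of the two new helpers (the ImportColumnDesc constructor shapes are assumptions, not from this patch), for a column list equivalent to "columns: k1, k2, k3=k1+1":
ImportColumnDescs descs = new ImportColumnDescs();
descs.descs.add(new ImportColumnDesc("k1"));               // plain file column
descs.descs.add(new ImportColumnDesc("k2"));               // plain file column
descs.descs.add(new ImportColumnDesc("k3", mappingExpr));  // mappingExpr: Expr for k1 + 1 (placeholder)
descs.getFileColNames();      // -> ["k1", "k2"]
descs.getColumnMappingList(); // -> [k3 = k1 + 1] as binary predicate exprs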

View File

@ -30,6 +30,7 @@ import org.apache.doris.common.UserException;
import org.apache.doris.common.util.SqlParserUtils;
import org.apache.doris.common.util.TimeUtils;
import org.apache.doris.load.loadv2.LoadTask;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileType;
import org.apache.doris.thrift.TStreamLoadPutRequest;
@ -52,6 +53,7 @@ public class StreamLoadTask implements LoadTaskInfo {
private long txnId;
private TFileType fileType;
private TFileFormatType formatType;
private TFileCompressType compressType = TFileCompressType.UNKNOWN;
private boolean stripOuterArray;
private boolean numAsString;
private String jsonPaths;
@ -80,11 +82,13 @@ public class StreamLoadTask implements LoadTaskInfo {
private String headerType = "";
private List<String> hiddenColumns;
public StreamLoadTask(TUniqueId id, long txnId, TFileType fileType, TFileFormatType formatType) {
public StreamLoadTask(TUniqueId id, long txnId, TFileType fileType, TFileFormatType formatType,
TFileCompressType compressType) {
this.id = id;
this.txnId = txnId;
this.fileType = fileType;
this.formatType = formatType;
this.compressType = compressType;
this.jsonPaths = "";
this.jsonRoot = "";
this.stripOuterArray = false;
@ -109,6 +113,10 @@ public class StreamLoadTask implements LoadTaskInfo {
return formatType;
}
public TFileCompressType getCompressType() {
return compressType;
}
public ImportColumnDescs getColumnExprDescs() {
return columnExprDescs;
}
@ -238,7 +246,8 @@ public class StreamLoadTask implements LoadTaskInfo {
public static StreamLoadTask fromTStreamLoadPutRequest(TStreamLoadPutRequest request) throws UserException {
StreamLoadTask streamLoadTask = new StreamLoadTask(request.getLoadId(), request.getTxnId(),
request.getFileType(), request.getFormatType());
request.getFileType(), request.getFormatType(),
request.getCompressType());
streamLoadTask.setOptionalFromTSLPutRequest(request);
return streamLoadTask;
}
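A minimal sketch (thrift-generated setters assumed; required request fields such as load id and txn id omitted) of how the new compress type flows from the put request into the task:
TStreamLoadPutRequest request = new TStreamLoadPutRequest();
request.setFormatType(TFileFormatType.FORMAT_CSV_PLAIN);
request.setCompressType(TFileCompressType.GZ); // new optional field 40
StreamLoadTask task = StreamLoadTask.fromTStreamLoadPutRequest(request);
// task.getCompressType() now returns GZ instead of the UNKNOWN default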

View File

@ -543,6 +543,7 @@ struct TStreamLoadPutRequest {
37: optional bool load_to_single_tablet
38: optional string header_type
39: optional string hidden_columns
40: optional PlanNodes.TFileCompressType compress_type
}
struct TStreamLoadPutResult {

View File

@ -114,6 +114,21 @@ enum TFileFormatType {
FORMAT_PROTO,
}
// In previous versions, the data compression format and the file format were stored together in TFileFormatType,
// which made it inconvenient to combine file formats and compression formats flexibly.
// Therefore, the compression format is added separately here.
// To ensure forward compatibility: if this type is set, it prevails;
// otherwise, the TFileFormatType prevails.
enum TFileCompressType {
UNKNOWN,
PLAIN,
GZ,
LZO,
BZ2,
LZ4FRAME,
DEFLATE
}
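A minimal sketch (thrift-generated Java assumed, not part of this patch) of how the two enums now combine independently, e.g. a gzip-compressed plain csv:
TFileScanRangeParams params = new TFileScanRangeParams();
params.setFormatType(TFileFormatType.FORMAT_CSV_PLAIN); // what the rows look like
params.setCompressType(TFileCompressType.GZ);           // how the bytes are compressed
// Leaving compress_type as UNKNOWN keeps the old behavior, where compression
// is implied by TFileFormatType (e.g. FORMAT_CSV_GZ).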
struct THdfsConf {
1: required string key
2: required string value
@ -245,45 +260,50 @@ struct TFileAttributes {
struct TFileScanRangeParams {
1: optional Types.TFileType file_type;
2: optional TFileFormatType format_type;
3: optional TFileCompressType compress_type;
// If this is for a load job, src points to the source table and dest points to the doris table.
// If this is for query, only dest_tuple_id is set, including both file slot and partition slot.
3: optional Types.TTupleId src_tuple_id;
4: optional Types.TTupleId dest_tuple_id
4: optional Types.TTupleId src_tuple_id;
5: optional Types.TTupleId dest_tuple_id
// num_of_columns_from_file can split the all_file_slot and all_partition_slot
5: optional i32 num_of_columns_from_file;
6: optional i32 num_of_columns_from_file;
// all selected slots, which may be composed of file and partition values.
6: optional list<TFileScanSlotInfo> required_slots;
7: optional list<TFileScanSlotInfo> required_slots;
7: optional THdfsParams hdfs_params;
8: optional THdfsParams hdfs_params;
// properties for file such as s3 information
8: optional map<string, string> properties;
9: optional map<string, string> properties;
// The convert expr map for the load job
// dest slot id -> expr
9: optional map<Types.TSlotId, Exprs.TExpr> expr_of_dest_slot
10: optional map<Types.TSlotId, Exprs.TExpr> default_value_of_src_slot
10: optional map<Types.TSlotId, Exprs.TExpr> expr_of_dest_slot
11: optional map<Types.TSlotId, Exprs.TExpr> default_value_of_src_slot
// This is the mapping of dest slot id and src slot id in load expr
// It excludes the slot id which has the transform expr
11: optional map<Types.TSlotId, Types.TSlotId> dest_sid_to_src_sid_without_trans
12: optional map<Types.TSlotId, Types.TSlotId> dest_sid_to_src_sid_without_trans
// strictMode is a boolean
// if strict mode is true, the incorrect data (the result of cast is null) will not be loaded
12: optional bool strict_mode
13: optional bool strict_mode
13: optional list<Types.TNetworkAddress> broker_addresses
14: optional TFileAttributes file_attributes
15: optional Exprs.TExpr pre_filter_exprs
14: optional list<Types.TNetworkAddress> broker_addresses
15: optional TFileAttributes file_attributes
16: optional Exprs.TExpr pre_filter_exprs
}
struct TFileRangeDesc {
// If load_id is set, this is for stream/routine load.
// If path is set, this is for bulk load.
1: optional Types.TUniqueId load_id
// Path of this range
1: optional string path;
2: optional string path;
// Offset of this file start
2: optional i64 start_offset;
3: optional i64 start_offset;
// Size of this range. If size = -1, it means reading to the end of the file.
3: optional i64 size;
4: optional i64 size;
5: optional i64 file_size;
// columns parsed from file path should be after the columns read from file
4: optional list<string> columns_from_path;
6: optional list<string> columns_from_path;
}
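For clarity, a hypothetical sketch (thrift-generated Java assumed) of the two mutually exclusive ways this struct is filled, mirroring FileGroupInfo.createFileRangeDesc above:
// bulk load: a byte range of a concrete file
TFileRangeDesc bulkRange = new TFileRangeDesc();
bulkRange.setPath("hdfs://nn/warehouse/part-0000.csv"); // hypothetical path
bulkRange.setStartOffset(0);
bulkRange.setSize(64 * 1024 * 1024L);
// stream load: no path, the BE locates the StreamLoadPipe by load id
TFileRangeDesc streamRange = new TFileRangeDesc();
streamRange.setLoadId(new TUniqueId(1L, 2L)); // hypothetical load id
streamRange.setSize(-1);                      // read until the stream ends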
// TFileScanRange represents a set of descriptions of a file and the rules for reading and converting it.

View File

@ -20,11 +20,11 @@
// **Note**: default db will be create if not exist
defaultDb = "regression_test"
jdbcUrl = "jdbc:mysql://127.0.0.1:9030/?"
jdbcUrl = "jdbc:mysql://127.0.0.1:9033/?"
jdbcUser = "root"
jdbcPassword = ""
feHttpAddress = "127.0.0.1:8030"
feHttpAddress = "127.0.0.1:8033"
feHttpUser = "root"
feHttpPassword = ""

File diff suppressed because it is too large

View File

@ -3,3 +3,51 @@
-2 -51 \N 1 \N \N \N \N \N \N \N 2 \N \N
-2 -50 \N 1 \N \N \N \N \N \N \N \N j \N
-- !sql1 --
2019 9 9 9 7.7 a 2019-09-09 1970-01-01T08:33:39 k7 9.0 9.0
-- !all11 --
2500
-- !all12 --
11
-- !all21 --
2500
-- !all22 --
0
-- !all23 --
2500
-- !all24 --
2500
-- !all31 --
11
-- !all32 --
11
-- !all33 --
11
-- !all41 --
2500
-- !all51 --
0
-- !all61 --
0
-- !all71 --
1 2 1025 1028
-- !all81 --
2
-- !all91 --
1

View File

@ -0,0 +1,6 @@
1,2,1,0,1
2,2,1,0,2
3,2,1,0,3
1,2,2,1,4
2,2,1,1,5
3,2,2,0,0

View File

@ -1,64 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
import org.codehaus.groovy.runtime.IOGroovyMethods
import java.nio.charset.StandardCharsets
suite("load_nullable_to_not_nullable") {
def tableName = "load_nullable_to_not_nullable"
def dbName = "test_query_db"
sql "CREATE DATABASE IF NOT EXISTS ${dbName}"
sql "USE $dbName"
sql "DROP TABLE IF EXISTS ${tableName} "
sql """
CREATE TABLE `${tableName}` (
k1 int(32) NOT NULL,
k2 smallint NOT NULL,
k3 int NOT NULL,
k4 bigint NOT NULL,
k5 decimal(9, 3) NOT NULL,
k6 char(5) NOT NULL,
k10 date NOT NULL,
k11 datetime NOT NULL,
k7 varchar(20) NOT NULL,
k8 double max NOT NULL,
k9 float sum NOT NULL )
AGGREGATE KEY(k1,k2,k3,k4,k5,k6,k10,k11,k7)
PARTITION BY RANGE(k2) (
PARTITION partition_a VALUES LESS THAN MAXVALUE
)
DISTRIBUTED BY HASH(k1, k2, k5)
BUCKETS 3
PROPERTIES ( "replication_allocation" = "tag.location.default: 1");
"""
StringBuilder commandBuilder = new StringBuilder()
commandBuilder.append("""curl -v --location-trusted -u ${context.config.feHttpUser}:${context.config.feHttpPassword}""")
commandBuilder.append(""" -H columns:col,k1=year(col),k2=month(col),k3=month(col),k4=day(col),k5=7.7,k6='a',k10=date(col),k11=FROM_UNIXTIME(2019,'%Y-%m-%dT%H:%i:%s'),k7='k7',k8=month(col),k9=day(col) -T ${context.file.parent}/data/test_time.data http://${context.config.feHttpAddress}/api/""" + dbName + "/" + tableName + "/_stream_load")
String command = commandBuilder.toString()
def process = command.execute()
int code = process.waitFor()
String err = IOGroovyMethods.getText(new BufferedReader(new InputStreamReader(process.getErrorStream())));
String out = process.getText()
logger.info("Run command: command=" + command + ",code=" + code + ", out=" + out + ", err=" + err)
assertEquals(code, 0)
sql "sync"
qt_sql " SELECT * FROM ${tableName} "
sql "DROP TABLE ${tableName} "
}

View File

@ -132,4 +132,422 @@ suite("test_stream_load", "p0") {
sql "sync"
rowCount = sql "select count(1) from ${tableName}"
assertEquals(3, rowCount[0][0])
// test load_nullable_to_not_nullable
def tableName2 = "load_nullable_to_not_nullable"
sql """ DROP TABLE IF EXISTS ${tableName2} """
sql """
CREATE TABLE `${tableName2}` (
k1 int(32) NOT NULL,
k2 smallint NOT NULL,
k3 int NOT NULL,
k4 bigint NOT NULL,
k5 decimal(9, 3) NOT NULL,
k6 char(5) NOT NULL,
k10 date NOT NULL,
k11 datetime NOT NULL,
k7 varchar(20) NOT NULL,
k8 double max NOT NULL,
k9 float sum NOT NULL )
AGGREGATE KEY(k1,k2,k3,k4,k5,k6,k10,k11,k7)
PARTITION BY RANGE(k2) (
PARTITION partition_a VALUES LESS THAN MAXVALUE
)
DISTRIBUTED BY HASH(k1, k2, k5)
BUCKETS 3
PROPERTIES ( "replication_allocation" = "tag.location.default: 1");
"""
streamLoad {
table "${tableName2}"
set 'column_separator', '\t'
set 'columns', 'col,k1=year(col),k2=month(col),k3=month(col),k4=day(col),k5=7.7,k6="a",k10=date(col),k11=FROM_UNIXTIME(2019,"%Y-%m-%dT%H:%i:%s"),k7="k7",k8=month(col),k9=day(col)'
file 'test_time.data'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(1, json.NumberTotalRows)
assertEquals(0, json.NumberFilteredRows)
}
}
order_qt_sql1 " SELECT * FROM ${tableName2}"
// test common case
def tableName3 = "test_all"
def tableName4 = "test_less_col"
def tableName5 = "test_bitmap_and_hll"
def tableName6 = "test_unique_key"
def tableName7 = "test_unique_key_with_delete"
sql """ DROP TABLE IF EXISTS ${tableName3} """
sql """ DROP TABLE IF EXISTS ${tableName4} """
sql """ DROP TABLE IF EXISTS ${tableName5} """
sql """ DROP TABLE IF EXISTS ${tableName6} """
sql """ DROP TABLE IF EXISTS ${tableName7} """
sql """
CREATE TABLE ${tableName3} (
`k1` int(11) NULL,
`k2` tinyint(4) NULL,
`k3` smallint(6) NULL,
`k4` bigint(20) NULL,
`k5` largeint(40) NULL,
`k6` float NULL,
`k7` double NULL,
`k8` decimal(9, 0) NULL,
`k9` char(10) NULL,
`k10` varchar(1024) NULL,
`k11` text NULL,
`k12` date NULL,
`k13` datetime NULL
) ENGINE=OLAP
DISTRIBUTED BY HASH(`k1`) BUCKETS 3
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""
sql """
CREATE TABLE ${tableName4} (
`k1` int(11) NULL,
`k2` tinyint(4) NULL,
`k3` smallint(6) NULL,
`k4` bigint(20) NULL,
`k5` largeint(40) NULL
) ENGINE=OLAP
DISTRIBUTED BY HASH(`k1`) BUCKETS 3
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""
sql """
CREATE TABLE ${tableName5} (
`k1` int(11) NULL,
`k2` tinyint(4) NULL,
`v1` bitmap bitmap_union,
`v2` hll hll_union
) ENGINE=OLAP
DISTRIBUTED BY HASH(`k1`) BUCKETS 3
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""
sql """
CREATE TABLE ${tableName6} (
`k1` int(11) NULL,
`k2` tinyint(4) NULL,
`v1` varchar(1024)
) ENGINE=OLAP
UNIQUE KEY(k1, k2)
DISTRIBUTED BY HASH(`k1`) BUCKETS 3
PROPERTIES (
"replication_allocation" = "tag.location.default: 1"
);
"""
sql """
CREATE TABLE ${tableName7} (
`k1` int(11) NULL,
`k2` tinyint(4) NULL,
`v1` varchar(1024)
) ENGINE=OLAP
UNIQUE KEY(k1, k2)
DISTRIBUTED BY HASH(`k1`) BUCKETS 3
PROPERTIES (
"function_column.sequence_type" = "int",
"replication_allocation" = "tag.location.default: 1"
);
"""
// load all columns
streamLoad {
table "${tableName3}"
set 'column_separator', ','
file 'all_types.csv'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(2500, json.NumberTotalRows)
assertEquals(0, json.NumberFilteredRows)
}
}
sql "sync"
order_qt_all11 "SELECT count(*) FROM ${tableName3}" // 2500
order_qt_all12 "SELECT count(*) FROM ${tableName3} where k1 <= 10" // 11
sql """truncate table ${tableName3}"""
sql """sync"""
// load part of columns
streamLoad {
table "${tableName3}"
set 'column_separator', ','
set 'columns', 'k1, k2'
file 'all_types.csv'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("fail", json.Status.toLowerCase())
assertEquals(0, json.NumberLoadedRows)
}
}
// load while skipping 2 columns, with gzip
streamLoad {
table "${tableName3}"
set 'column_separator', ','
set 'columns', 'k1, k2, k3, k4, tmp1, tmp2, k7, k8, k9, k10, k11, k12, k13'
set 'compress_type', 'gz'
file 'all_types.csv.gz'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(2500, json.NumberTotalRows)
assertEquals(2500, json.NumberLoadedRows)
assertEquals(0, json.NumberFilteredRows)
assertEquals(0, json.NumberUnselectedRows)
}
}
sql "sync"
order_qt_all21 "SELECT count(*) FROM ${tableName3}" // 2500
order_qt_all22 "SELECT count(*) FROM ${tableName3} where k1 is null" // 0
order_qt_all23 "SELECT count(*) FROM ${tableName3} where k5 is null" // 2500
order_qt_all24 "SELECT count(*) FROM ${tableName3} where k6 is null" // 2500
sql """truncate table ${tableName3}"""
sql """sync"""
// load with column mapping and where predicate
streamLoad {
table "${tableName3}"
set 'column_separator', ','
set 'columns', 'k1, k2, k3, k4, tmp5, k6, tmpk7, k8, k9, k10, k11, k12, k13, k7=tmpk7+1'
set 'where', 'k1 <= 10'
file 'all_types.csv'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(2500, json.NumberTotalRows)
assertEquals(11, json.NumberLoadedRows)
assertEquals(0, json.NumberFilteredRows)
assertEquals(2489, json.NumberUnselectedRows)
}
}
sql "sync"
order_qt_all31 "SELECT count(*) FROM ${tableName3}" // 11
order_qt_all32 "SELECT count(*) FROM ${tableName3} where k7 >= 7" // 11
order_qt_all33 "SELECT count(*) FROM ${tableName3} where k5 is null" // 11
sql """truncate table ${tableName3}"""
sql """sync"""
// load without strict_mode
streamLoad {
table "${tableName3}"
set 'column_separator', ','
set 'columns', 'tmpk1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, k12, k13, k1=k13'
file 'all_types.csv'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(2500, json.NumberTotalRows)
assertEquals(2500, json.NumberLoadedRows)
assertEquals(0, json.NumberFilteredRows)
assertEquals(0, json.NumberUnselectedRows)
}
}
sql "sync"
order_qt_all41 "SELECT count(*) FROM ${tableName3} where k1 is null" // 2500
sql """truncate table ${tableName3}"""
sql """sync"""
// load with strict_mode true and max_filter_ratio 1
streamLoad {
table "${tableName4}"
set 'column_separator', ','
set 'columns', 'k1, k2, k3, k4, tmpk5, tmpk6, tmpk7, tmpk8, tmpk9, tmpk10, tmpk11, tmpk12, k5'
set 'max_filter_ratio', '1'
set 'strict_mode', 'true'
file 'all_types.csv'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(2500, json.NumberTotalRows)
assertEquals(0, json.NumberLoadedRows)
assertEquals(2500, json.NumberFilteredRows)
assertEquals(0, json.NumberUnselectedRows)
}
}
sql "sync"
order_qt_all51 "SELECT count(*) FROM ${tableName4}" // 0
sql """truncate table ${tableName4}"""
sql """sync"""
// load with strict_mode true and max_filter_ratio 0
streamLoad {
table "${tableName4}"
set 'column_separator', ','
set 'columns', 'k1, k2, k3, k4, tmpk5, tmpk6, tmpk7, tmpk8, tmpk9, tmpk10, tmpk11, tmpk12, k5'
set 'max_filter_ratio', '0'
set 'strict_mode', 'true'
file 'all_types.csv'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("fail", json.Status.toLowerCase())
assertEquals(0, json.NumberLoadedRows)
}
}
sql "sync"
order_qt_all61 "SELECT count(*) FROM ${tableName4}" // 0
sql """truncate table ${tableName4}"""
sql """sync"""
// load bitmap and hll with bzip2
streamLoad {
table "${tableName5}"
set 'column_separator', ','
set 'columns', 'k1, k2, tmp1, tmp2, v1=to_bitmap(tmp1), v2=hll_hash(tmp2)'
set 'compress_type', 'bz2'
file 'bitmap_hll.csv.bz2'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(1025, json.NumberTotalRows)
assertEquals(1025, json.NumberLoadedRows)
assertEquals(0, json.NumberFilteredRows)
assertEquals(0, json.NumberUnselectedRows)
}
}
sql "sync"
order_qt_all71 "SELECT k1, k2, bitmap_union_count(v1), HLL_UNION_AGG(v2) FROM ${tableName5} group by k1, k2" // 1,2,1025,1028
sql """truncate table ${tableName5}"""
sql """sync"""
// load unique key
streamLoad {
table "${tableName6}"
set 'column_separator', ','
set 'compress_type', 'lz4'
file 'unique_key.csv.lz4'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(8001, json.NumberTotalRows)
assertEquals(8001, json.NumberLoadedRows)
assertEquals(0, json.NumberFilteredRows)
assertEquals(0, json.NumberUnselectedRows)
}
}
sql "sync"
order_qt_all81 "SELECT count(*) from ${tableName6}" // 2
sql """truncate table ${tableName6}"""
sql """sync"""
// load unique key with delete and sequence
streamLoad {
table "${tableName7}"
set 'column_separator', ','
set 'columns', 'k1,k2,v1,del,seq'
set 'delete', 'del=1'
set 'merge_type', 'merge'
set 'function_column.sequence_col', 'seq'
file 'unique_key_with_delete.csv'
time 10000 // limit inflight 10s
check { result, exception, startTime, endTime ->
if (exception != null) {
throw exception
}
log.info("Stream load result: ${result}".toString())
def json = parseJson(result)
assertEquals("success", json.Status.toLowerCase())
assertEquals(6, json.NumberTotalRows)
assertEquals(6, json.NumberLoadedRows)
assertEquals(0, json.NumberFilteredRows)
assertEquals(0, json.NumberUnselectedRows)
}
}
sql "sync"
order_qt_all91 "SELECT count(*) from ${tableName7}" // 1
sql """truncate table ${tableName7}"""
sql """sync"""
}