Files
doris/be/src/exec/plain_text_line_reader.cpp
sduzh 6fedf5881b [CodeFormat] Clang-format cpp sources (#4965)
Clang-format all c++ source files.
2020-11-28 18:36:49 +08:00

342 lines
13 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "exec/plain_text_line_reader.h"

#include <cstring>

#include "common/status.h"
#include "exec/decompressor.h"
#include "exec/file_reader.h"
// Initial sizes, in bytes, of the compressed-input and decompressed-output buffers.
// Both buffers are grown on demand by extend_input_buf() / extend_output_buf().
//
// INPUT_CHUNK must be:
//   - larger than 15 bytes for lz4 files to decompress correctly
//   - larger than 300 bytes for lzo headers to decompress correctly
//
// When debugging buffer-boundary handling, temporarily shrink these to small
// values such as 34 (INPUT_CHUNK) and 32 (OUTPUT_CHUNK).
static constexpr size_t INPUT_CHUNK = 2 * 1024 * 1024;
static constexpr size_t OUTPUT_CHUNK = 8 * 1024 * 1024;
namespace doris {
// Builds a line reader on top of `file_reader`.
//
// profile        - runtime profile on which the four counters/timers below are registered.
// file_reader    - source of raw bytes; read_line() pulls data from it.
// decompressor   - when nullptr the file is treated as uncompressed and bytes are read
//                  straight into the output buffer; otherwise bytes go through the input
//                  buffer and are decompressed into the output buffer.
// length         - stored as _min_length; update_eof() uses it to stop an uncompressed
//                  read once at least this many bytes have been returned.
// line_delimiter - the single byte that terminates a line.
//
// Allocates the input buffer (INPUT_CHUNK bytes) and the output buffer (OUTPUT_CHUNK
// bytes) with new[]; both are released by close(). `file_reader` and `decompressor`
// are not released by close() in this file.
PlainTextLineReader::PlainTextLineReader(RuntimeProfile* profile, FileReader* file_reader,
Decompressor* decompressor, size_t length,
uint8_t line_delimiter)
: _profile(profile),
_file_reader(file_reader),
_decompressor(decompressor),
_min_length(length),
_total_read_bytes(0),
_line_delimiter(line_delimiter),
_input_buf(new uint8_t[INPUT_CHUNK]),
_input_buf_size(INPUT_CHUNK),
_input_buf_pos(0),
_input_buf_limit(0),
_output_buf(new uint8_t[OUTPUT_CHUNK]),
_output_buf_size(OUTPUT_CHUNK),
_output_buf_pos(0),
_output_buf_limit(0),
_file_eof(false),
_eof(false),
// starts as true so that hitting end-of-file before any data was read is not
// reported as a truncated compressed stream by read_line()
_stream_end(true),
_more_input_bytes(0),
_more_output_bytes(0),
_bytes_read_counter(nullptr),
_read_timer(nullptr),
_bytes_decompress_counter(nullptr),
_decompress_timer(nullptr) {
// Register the profile counters updated by read_line().
_bytes_read_counter = ADD_COUNTER(_profile, "BytesRead", TUnit::BYTES);
_read_timer = ADD_TIMER(_profile, "FileReadTime");
_bytes_decompress_counter = ADD_COUNTER(_profile, "BytesDecompressed", TUnit::BYTES);
_decompress_timer = ADD_TIMER(_profile, "DecompressTime");
}
// Releases the internal buffers by delegating to close().
PlainTextLineReader::~PlainTextLineReader() {
    this->close();
}
// Frees the input and output buffers.
// Safe to call more than once: each pointer is reset to nullptr after it is freed,
// and delete[] on a null pointer is a no-op.
void PlainTextLineReader::close() {
    delete[] _input_buf;
    _input_buf = nullptr;

    delete[] _output_buf;
    _output_buf = nullptr;
}
// Re-evaluates the end-of-data condition and returns the (sticky) _eof flag.
// _eof is set when either
//   - done() reports that nothing is left to read, or
//   - the file is uncompressed (_decompressor == nullptr) and at least _min_length
//     bytes have already been handed out.
// The flag is only ever set here, never cleared.
// NOTE(review): `_min_length >= 0` is always true if _min_length is an unsigned type;
// its declaration lives in the header, so confirm there.
inline bool PlainTextLineReader::update_eof() {
    if (done() ||
        (_decompressor == nullptr && _min_length >= 0 && _total_read_bytes >= _min_length)) {
        _eof = true;
    }
    return _eof;
}
// Searches the `len` bytes starting at `start` for the first occurrence of
// _line_delimiter.
//
// Returns a pointer to the delimiter byte, or nullptr when the range does not
// contain one (including when `len` is 0).
//
// The delimiter is a single byte, so the standard std::memchr is used rather than
// the GNU-specific memmem(). The returned pointer is non-const to keep the existing
// signature; callers in this file only pass pointers into the mutable _output_buf.
uint8_t* PlainTextLineReader::update_field_pos_and_find_line_delimiter(const uint8_t* start,
                                                                       size_t len) {
    // TODO: meanwhile find and save field pos
    const void* hit = std::memchr(start, _line_delimiter, len);
    return const_cast<uint8_t*>(static_cast<const uint8_t*>(hit));
}
// extend input buf if necessary only when _more_input_bytes > 0
void PlainTextLineReader::extend_input_buf() {
DCHECK(_more_input_bytes > 0);
// left capacity
size_t capacity = _input_buf_size - _input_buf_limit;
// we want at least _more_input_bytes capacity left
do {
if (capacity >= _more_input_bytes) {
// enough
break;
}
capacity = capacity + _input_buf_pos;
if (capacity >= _more_input_bytes) {
// move the read remaining to the beginning of the current input buf,
memmove(_input_buf, _input_buf + _input_buf_pos, input_buf_read_remaining());
_input_buf_limit -= _input_buf_pos;
_input_buf_pos = 0;
break;
}
while (_input_buf_size - input_buf_read_remaining() < _more_input_bytes) {
_input_buf_size = _input_buf_size * 2;
}
uint8_t* new_input_buf = new uint8_t[_input_buf_size];
memmove(new_input_buf, _input_buf + _input_buf_pos, input_buf_read_remaining());
delete[] _input_buf;
_input_buf = new_input_buf;
_input_buf_limit -= _input_buf_pos;
_input_buf_pos = 0;
} while (false);
// LOG(INFO) << "extend input buf."
// << " input_buf_size: " << _input_buf_size
// << " input_buf_pos: " << _input_buf_pos
// << " input_buf_limit: " << _input_buf_limit;
}
// Makes room after _output_buf_limit before more data is appended to the output
// buffer. The amount of free space aimed for is
//     max(1024, current free tail + _more_output_bytes).
//
// Three outcomes, tried in order:
//   1. the free tail already meets the target: nothing changes;
//   2. the free tail plus the already-consumed prefix meets the target: the unread
//      bytes are shifted to the front of the existing buffer;
//   3. otherwise the buffer size is doubled until it fits and the unread bytes are
//      copied into a newly allocated buffer.
// In cases 2 and 3 _output_buf_pos becomes 0 and _output_buf_limit is reduced by the
// old _output_buf_pos, so pointers previously handed out into the buffer no longer
// refer to the same data.
void PlainTextLineReader::extend_output_buf() {
    const size_t tail_room = _output_buf_size - _output_buf_limit;
    const size_t wanted = std::max<size_t>(1024, tail_room + _more_output_bytes);

    if (tail_room >= wanted) {
        // case 1: enough space already
        return;
    }

    const size_t unread = output_buf_read_remaining();
    const size_t reclaimable_room = tail_room + _output_buf_pos;
    if (reclaimable_room >= wanted) {
        // case 2: reuse the consumed prefix; ranges may overlap, hence memmove
        memmove(_output_buf, _output_buf + _output_buf_pos, unread);
    } else {
        // case 3: grow by doubling until the unread data plus the target fits
        size_t grown_size = _output_buf_size;
        while (grown_size - unread < wanted) {
            grown_size *= 2;
        }
        _output_buf_size = grown_size;

        uint8_t* grown_buf = new uint8_t[grown_size];
        memmove(grown_buf, _output_buf + _output_buf_pos, unread);
        delete[] _output_buf;
        _output_buf = grown_buf;
    }

    // the unread data now starts at the beginning of the buffer
    _output_buf_limit -= _output_buf_pos;
    _output_buf_pos = 0;
}
// Reads the next line of the (optionally compressed) file.
//
// ptr  - out: start of the line inside the reader's internal output buffer.
// size - out: length of the line in bytes, not counting the line delimiter.
// eof  - out: true when no line was produced because the data is exhausted.
//
// On Status::OK():
//   - *eof == false: *ptr / *size describe one line. An empty line (delimiter found
//     at the very first byte) is reported with *size == 0 and *eof == false. The last
//     line of the file is returned even when it has no trailing delimiter.
//   - *eof == true: *size is 0. On the early-exit path at the top of the function
//     *ptr is left untouched.
//
// The returned pointer refers to _output_buf, which extend_output_buf() may compact
// or reallocate during a later call, so the line has to be consumed before
// read_line() is called again.
//
// Returns Status::InternalError when the file ends while the decompressor has not
// reported stream end, or when a decompress call consumes no input and asks for no
// extra input/output space. Failures of FileReader::read() and
// Decompressor::decompress() go through RETURN_IF_ERROR (presumably returned to the
// caller unchanged).
Status PlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool* eof) {
if (_eof || update_eof()) {
*size = 0;
*eof = true;
return Status::OK();
}
// 0 or 1; added to positions/byte counts so the delimiter byte itself is skipped
int found_line_delimiter = 0;
// number of bytes, counted from _output_buf_pos, already scanned without finding a
// delimiter; the next search resumes from here instead of rescanning
size_t offset = 0;
while (!done()) {
// find line delimiter in current decompressed data
uint8_t* cur_ptr = _output_buf + _output_buf_pos;
uint8_t* pos = update_field_pos_and_find_line_delimiter(
cur_ptr + offset, output_buf_read_remaining() - offset);
if (pos == nullptr) {
// didn't find line delimiter, read more data from decompressor
// 1. everything currently buffered has been scanned: point 'offset' to _output_buf_limit
offset = output_buf_read_remaining();
// 2. make room in the output buf (this may move or reallocate it, so cur_ptr must
//    not be used again in this iteration), then read from the file reader if needed
extend_output_buf();
if ((_input_buf_limit > _input_buf_pos) && _more_input_bytes == 0) {
// we still have data in input which is not decompressed.
// and no more data is required for input
} else {
size_t read_len = 0;
uint8_t* file_buf;
if (_decompressor == nullptr) {
// uncompressed file, read directly into output buf
file_buf = _output_buf + _output_buf_limit;
read_len = _output_buf_size - _output_buf_limit;
} else {
// MARK
if (_more_input_bytes > 0) {
// we already extend input buf.
// current data in input buf should remain unchanged
file_buf = _input_buf + _input_buf_limit;
read_len = _input_buf_size - _input_buf_limit;
// leave input pos and limit unchanged
} else {
// here we are sure that all data in input buf has been consumed.
// which means input pos and limit should be reset.
file_buf = _input_buf;
read_len = _input_buf_size;
// reset input pos and limit
_input_buf_pos = 0;
_input_buf_limit = 0;
}
}
{
// read_len is in/out: capacity offered on entry, bytes actually read on return
SCOPED_TIMER(_read_timer);
RETURN_IF_ERROR(_file_reader->read(file_buf, &read_len, &_file_eof));
COUNTER_UPDATE(_bytes_read_counter, read_len);
}
// LOG(INFO) << "after read file: _file_eof: " << _file_eof << " read_len: " << read_len;
// NOTE(review): this branch is taken before any limit is advanced by read_len, so
// if a FileReader can report eof together with read_len > 0 those bytes are
// ignored -- confirm against the FileReader implementations.
if (_file_eof || read_len == 0) {
if (!_stream_end) {
std::stringstream ss;
ss << "Compressed file has been truncated, which is not allowed";
return Status::InternalError(ss.str());
} else {
// last loop we meet stream end,
// and now we finished reading file, so we are finished
// break this loop to see if there is data in buffer
break;
}
}
if (_decompressor == nullptr) {
// the bytes were read straight into the output buf, so they are immediately
// available to the delimiter search
_output_buf_limit += read_len;
_stream_end = true;
} else {
// only update input limit.
// input pos is set at MARK step
_input_buf_limit += read_len;
}
if (read_len < _more_input_bytes) {
// we failed to read enough data, continue to read from file
_more_input_bytes = _more_input_bytes - read_len;
continue;
}
}
if (_decompressor != nullptr) {
SCOPED_TIMER(_decompress_timer);
// 3. decompress from the unread part of the input buf into the free tail of the
//    output buf
size_t input_read_bytes = 0;
size_t decompressed_len = 0;
// reset the requests; the decompressor sets them again if it needs more room
_more_input_bytes = 0;
_more_output_bytes = 0;
RETURN_IF_ERROR(_decompressor->decompress(
_input_buf + _input_buf_pos, /* input */
_input_buf_limit - _input_buf_pos, /* input_len */
&input_read_bytes, _output_buf + _output_buf_limit, /* output */
_output_buf_size - _output_buf_limit, /* output_max_len */
&decompressed_len, &_stream_end, &_more_input_bytes, &_more_output_bytes));
// LOG(INFO) << "after decompress:"
// << " stream_end: " << _stream_end
// << " input_read_bytes: " << input_read_bytes
// << " decompressed_len: " << decompressed_len
// << " more_input_bytes: " << _more_input_bytes
// << " more_output_bytes: " << _more_output_bytes;
// update pos and limit
_input_buf_pos += input_read_bytes;
_output_buf_limit += decompressed_len;
COUNTER_UPDATE(_bytes_decompress_counter, decompressed_len);
// TODO(cmy): watch this case
if ((input_read_bytes == 0 /*decompressed_len == 0*/) && _more_input_bytes == 0 &&
_more_output_bytes == 0) {
// decompress made no progress, may be
// A. input data is not enough to decompress data to output
// B. output buf is too small to save decompressed output
// this is very unlikely to happen
// print the log and just go to next loop to read more data or extend output buf.
// (cmy), for now, return failed to avoid potential endless loop
std::stringstream ss;
ss << "decompress made no progress."
<< " input_read_bytes: " << input_read_bytes
<< " decompressed_len: " << decompressed_len;
LOG(WARNING) << ss.str();
return Status::InternalError(ss.str());
}
if (_more_input_bytes > 0) {
// make room now so the next iteration can append to the input buf (see MARK)
extend_input_buf();
}
}
} else {
// we found a complete line
// ready to return
offset = pos - cur_ptr;
found_line_delimiter = 1;
break;
}
} // while (!done())
// Reached either with a delimiter found (offset = line length) or after the file
// ended (offset = all remaining buffered bytes, possibly 0).
*ptr = _output_buf + _output_buf_pos;
*size = offset;
// Skip offset and _line_delimiter size;
_output_buf_pos += offset + found_line_delimiter;
// nothing buffered and no delimiter means there was no line at all
if (offset == 0 && found_line_delimiter == 0) {
*eof = true;
} else {
*eof = false;
}
// update total read bytes
_total_read_bytes += *size + found_line_delimiter;
return Status::OK();
}
} // namespace doris