[fix](hive) do not split compress data file and support lz4/snappy block codec (#23245)

1. do not split compress data file
Some data file in hive is compressed with gzip, deflate, etc.
These kinds of file can not be splitted.

2. Support lz4 block codec
for hive scan node, use lz4 block codec instead of lz4 frame codec

4. Support snappy block codec
For hadoop snappy

5. Optimize the `count(*)` query of csv file
For query like `select count(*) from tbl`, only need to split the line, no need to split the column.

Need to pick to branch-2.0 after this PR: #22304
This commit is contained in:
Mingyu Chen
2023-08-26 12:59:05 +08:00
committed by GitHub
parent 36b7fcf055
commit 40be6a0b05
18 changed files with 465 additions and 57 deletions

View File

@ -42,6 +42,12 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom
case CompressType::LZ4FRAME:
*decompressor = new Lz4FrameDecompressor();
break;
case CompressType::LZ4BLOCK:
*decompressor = new Lz4BlockDecompressor();
break;
case CompressType::SNAPPYBLOCK:
*decompressor = new SnappyBlockDecompressor();
break;
#ifdef DORIS_WITH_LZO
case CompressType::LZOP:
*decompressor = new LzopDecompressor();
@ -59,6 +65,10 @@ Status Decompressor::create_decompressor(CompressType type, Decompressor** decom
return st;
}
uint32_t Decompressor::_read_int32(uint8_t* buf) {
return (buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3];
}
std::string Decompressor::debug_info() {
return "Decompressor";
}
@ -239,7 +249,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t
size_t* decompressed_len, bool* stream_end,
size_t* more_input_bytes, size_t* more_output_bytes) {
uint8_t* src = input;
size_t src_size = input_len;
size_t remaining_input_size = input_len;
size_t ret = 1;
*input_bytes_read = 0;
@ -257,7 +267,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t
}
LZ4F_frameInfo_t info;
ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &src_size);
ret = LZ4F_getFrameInfo(_dctx, &info, (void*)src, &remaining_input_size);
if (LZ4F_isError(ret)) {
return Status::InternalError("LZ4F_getFrameInfo error: {}",
std::string(LZ4F_getErrorName(ret)));
@ -270,17 +280,17 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t
std::string(LZ4F_getErrorName(ret)));
}
*input_bytes_read = src_size;
*input_bytes_read = remaining_input_size;
src += src_size;
src_size = input_len - src_size;
src += remaining_input_size;
remaining_input_size = input_len - remaining_input_size;
LOG(INFO) << "lz4 block size: " << _expect_dec_buf_size;
}
// decompress
size_t output_len = output_max_len;
ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &src_size,
ret = LZ4F_decompress(_dctx, (void*)output, &output_len, (void*)src, &remaining_input_size,
/* LZ4F_decompressOptions_t */ nullptr);
if (LZ4F_isError(ret)) {
return Status::InternalError("Decompression error: {}",
@ -288,7 +298,7 @@ Status Lz4FrameDecompressor::decompress(uint8_t* input, size_t input_len, size_t
}
// update
*input_bytes_read += src_size;
*input_bytes_read += remaining_input_size;
*decompressed_len = output_len;
if (ret == 0) {
*stream_end = true;
@ -324,4 +334,165 @@ size_t Lz4FrameDecompressor::get_block_size(const LZ4F_frameInfo_t* info) {
}
}
/// Lz4BlockDecompressor
Status Lz4BlockDecompressor::init() {
return Status::OK();
}
Status Lz4BlockDecompressor::decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read,
uint8_t* output, size_t output_max_len,
size_t* decompressed_len, bool* stream_end,
size_t* more_input_bytes, size_t* more_output_bytes) {
uint8_t* src = input;
size_t remaining_input_size = input_len;
int64_t uncompressed_total_len = 0;
*input_bytes_read = 0;
// The hadoop lz4 codec is as:
// <4 byte big endian uncompressed size>
// <4 byte big endian compressed size>
// <lz4 compressed block>
// ....
// <4 byte big endian uncompressed size>
// <4 byte big endian compressed size>
// <lz4 compressed block>
//
// See:
// https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/Lz4Codec.cc
while (remaining_input_size > 0) {
// Read uncompressed size
uint32_t uncompressed_block_len = Decompressor::_read_int32(src);
int64_t remaining_output_size = output_max_len - uncompressed_total_len;
if (remaining_output_size < uncompressed_block_len) {
// Need more output buffer
*more_output_bytes = uncompressed_block_len - remaining_output_size;
break;
}
// Read compressed size
size_t tmp_src_size = remaining_input_size - sizeof(uint32_t);
size_t compressed_len = Decompressor::_read_int32(src + sizeof(uint32_t));
if (compressed_len == 0 || compressed_len > tmp_src_size) {
// Need more input data
*more_input_bytes = compressed_len - tmp_src_size;
break;
}
src += 2 * sizeof(uint32_t);
remaining_input_size -= 2 * sizeof(uint32_t);
// Decompress
int uncompressed_len = LZ4_decompress_safe(reinterpret_cast<const char*>(src),
reinterpret_cast<char*>(output), compressed_len,
remaining_output_size);
if (uncompressed_len < 0 || uncompressed_len != uncompressed_block_len) {
return Status::InternalError(
"lz4 block decompress failed. uncompressed_len: {}, expected: {}",
uncompressed_len, uncompressed_block_len);
}
output += uncompressed_len;
src += compressed_len;
remaining_input_size -= compressed_len;
uncompressed_total_len += uncompressed_len;
}
*input_bytes_read += (input_len - remaining_input_size);
*decompressed_len = uncompressed_total_len;
// If no more input and output need, means this is the end of a compressed block
*stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0);
return Status::OK();
}
std::string Lz4BlockDecompressor::debug_info() {
std::stringstream ss;
ss << "Lz4BlockDecompressor.";
return ss.str();
}
/// SnappyBlockDecompressor
Status SnappyBlockDecompressor::init() {
return Status::OK();
}
Status SnappyBlockDecompressor::decompress(uint8_t* input, size_t input_len,
size_t* input_bytes_read, uint8_t* output,
size_t output_max_len, size_t* decompressed_len,
bool* stream_end, size_t* more_input_bytes,
size_t* more_output_bytes) {
uint8_t* src = input;
size_t remaining_input_size = input_len;
int64_t uncompressed_total_len = 0;
*input_bytes_read = 0;
// The hadoop snappy codec is as:
// <4 byte big endian uncompressed size>
// <4 byte big endian compressed size>
// <snappy compressed block>
// ....
// <4 byte big endian uncompressed size>
// <4 byte big endian compressed size>
// <snappy compressed block>
//
// See:
// https://github.com/apache/hadoop/blob/trunk/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-nativetask/src/main/native/src/codec/SnappyCodec.cc
while (remaining_input_size > 0) {
// Read uncompressed size
uint32_t uncompressed_block_len = Decompressor::_read_int32(src);
int64_t remaining_output_size = output_max_len - uncompressed_total_len;
if (remaining_output_size < uncompressed_block_len) {
// Need more output buffer
*more_output_bytes = uncompressed_block_len - remaining_output_size;
break;
}
// Read compressed size
size_t tmp_src_size = remaining_input_size - sizeof(uint32_t);
size_t compressed_len = _read_int32(src + sizeof(uint32_t));
if (compressed_len == 0 || compressed_len > tmp_src_size) {
// Need more input data
*more_input_bytes = compressed_len - tmp_src_size;
break;
}
src += 2 * sizeof(uint32_t);
remaining_input_size -= 2 * sizeof(uint32_t);
// ATTN: the uncompressed len from GetUncompressedLength() is same as
// uncompressed_block_len, so I think it is unnecessary to get it again.
// Get uncompressed len from snappy
// size_t uncompressed_len;
// if (!snappy::GetUncompressedLength(reinterpret_cast<const char*>(src),
// compressed_len, &uncompressed_len)) {
// return Status::InternalError("snappy block decompress failed to get uncompressed len");
// }
// Decompress
if (!snappy::RawUncompress(reinterpret_cast<const char*>(src), compressed_len,
reinterpret_cast<char*>(output))) {
return Status::InternalError("snappy block decompress failed. uncompressed_len: {}",
uncompressed_block_len);
}
output += uncompressed_block_len;
src += compressed_len;
remaining_input_size -= compressed_len;
uncompressed_total_len += uncompressed_block_len;
}
*input_bytes_read += (input_len - remaining_input_size);
*decompressed_len = uncompressed_total_len;
// If no more input and output need, means this is the end of a compressed block
*stream_end = (*more_input_bytes == 0 && *more_output_bytes == 0);
return Status::OK();
}
std::string SnappyBlockDecompressor::debug_info() {
std::stringstream ss;
ss << "SnappyBlockDecompressor.";
return ss.str();
}
} // namespace doris

View File

@ -18,7 +18,10 @@
#pragma once
#include <bzlib.h>
#include <lz4/lz4.h>
#include <lz4/lz4frame.h>
#include <lz4/lz4hc.h>
#include <snappy.h>
#include <stddef.h>
#include <stdint.h>
#include <zlib.h>
@ -34,7 +37,7 @@
namespace doris {
enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP };
enum CompressType { UNCOMPRESSED, GZIP, DEFLATE, BZIP2, LZ4FRAME, LZOP, LZ4BLOCK, SNAPPYBLOCK };
class Decompressor {
public:
@ -68,6 +71,8 @@ public:
protected:
virtual Status init() = 0;
static uint32_t _read_int32(uint8_t* buf);
Decompressor(CompressType ctype) : _ctype(ctype) {}
CompressType _ctype;
@ -140,6 +145,38 @@ private:
const static unsigned DORIS_LZ4F_VERSION;
};
class Lz4BlockDecompressor : public Decompressor {
public:
~Lz4BlockDecompressor() override {}
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
size_t* more_input_bytes, size_t* more_output_bytes) override;
std::string debug_info() override;
private:
friend class Decompressor;
Lz4BlockDecompressor() : Decompressor(CompressType::LZ4FRAME) {}
Status init() override;
};
class SnappyBlockDecompressor : public Decompressor {
public:
~SnappyBlockDecompressor() override {}
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
size_t* more_input_bytes, size_t* more_output_bytes) override;
std::string debug_info() override;
private:
friend class Decompressor;
SnappyBlockDecompressor() : Decompressor(CompressType::SNAPPYBLOCK) {}
Status init() override;
};
#ifdef DORIS_WITH_LZO
class LzopDecompressor : public Decompressor {
public:

View File

@ -632,6 +632,8 @@ void PInternalServiceImpl::fetch_table_schema(google::protobuf::RpcController* c
case TFileFormatType::FORMAT_CSV_GZ:
case TFileFormatType::FORMAT_CSV_BZ2:
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
case TFileFormatType::FORMAT_CSV_LZ4BLOCK:
case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK:
case TFileFormatType::FORMAT_CSV_LZOP:
case TFileFormatType::FORMAT_CSV_DEFLATE: {
// file_slots is no use

View File

@ -46,9 +46,15 @@ void LoadUtil::parse_format(const std::string& format_str, const std::string& co
} else if (iequal(compress_type_str, "LZ4")) {
*format_type = TFileFormatType::FORMAT_CSV_LZ4FRAME;
*compress_type = TFileCompressType::LZ4FRAME;
} else if (iequal(compress_type_str, "LZ4_BLOCK")) {
*format_type = TFileFormatType::FORMAT_CSV_LZ4BLOCK;
*compress_type = TFileCompressType::LZ4BLOCK;
} else if (iequal(compress_type_str, "LZOP")) {
*format_type = TFileFormatType::FORMAT_CSV_LZOP;
*compress_type = TFileCompressType::LZO;
} else if (iequal(compress_type_str, "SNAPPY_BLOCK")) {
*format_type = TFileFormatType::FORMAT_CSV_SNAPPYBLOCK;
*compress_type = TFileCompressType::SNAPPYBLOCK;
} else if (iequal(compress_type_str, "DEFLATE")) {
*format_type = TFileFormatType::FORMAT_CSV_DEFLATE;
*compress_type = TFileCompressType::DEFLATE;
@ -72,6 +78,7 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) {
case TFileFormatType::FORMAT_CSV_DEFLATE:
case TFileFormatType::FORMAT_CSV_GZ:
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
case TFileFormatType::FORMAT_CSV_LZ4BLOCK:
case TFileFormatType::FORMAT_CSV_LZO:
case TFileFormatType::FORMAT_CSV_LZOP:
case TFileFormatType::FORMAT_JSON:
@ -81,4 +88,4 @@ bool LoadUtil::is_format_support_streaming(TFileFormatType::type format) {
}
return false;
}
} // namespace doris
} // namespace doris

View File

@ -343,8 +343,12 @@ Status CsvReader::init_reader(bool is_load) {
[[fallthrough]];
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
[[fallthrough]];
case TFileFormatType::FORMAT_CSV_LZ4BLOCK:
[[fallthrough]];
case TFileFormatType::FORMAT_CSV_LZOP:
[[fallthrough]];
case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK:
[[fallthrough]];
case TFileFormatType::FORMAT_CSV_DEFLATE:
_line_reader =
NewPlainTextLineReader::create_unique(_profile, _file_reader, _decompressor.get(),
@ -400,21 +404,51 @@ Status CsvReader::get_next_block(Block* block, size_t* read_rows, bool* eof) {
const int batch_size = std::max(_state->batch_size(), (int)_MIN_BATCH_SIZE);
size_t rows = 0;
auto columns = block->mutate_columns();
while (rows < batch_size && !_line_reader_eof) {
const uint8_t* ptr = nullptr;
size_t size = 0;
RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx));
if (_skip_lines > 0) {
_skip_lines--;
continue;
}
if (size == 0) {
// Read empty row, just continue
continue;
bool success = false;
if (_push_down_agg_type == TPushAggOp::type::COUNT) {
while (rows < batch_size && !_line_reader_eof) {
const uint8_t* ptr = nullptr;
size_t size = 0;
RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx));
if (_skip_lines > 0) {
_skip_lines--;
continue;
}
if (size == 0) {
// Read empty row, just continue
continue;
}
RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success));
++rows;
}
RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows));
for (auto& col : block->mutate_columns()) {
col->resize(rows);
}
} else {
auto columns = block->mutate_columns();
while (rows < batch_size && !_line_reader_eof) {
const uint8_t* ptr = nullptr;
size_t size = 0;
RETURN_IF_ERROR(_line_reader->read_line(&ptr, &size, &_line_reader_eof, _io_ctx));
if (_skip_lines > 0) {
_skip_lines--;
continue;
}
if (size == 0) {
// Read empty row, just continue
continue;
}
RETURN_IF_ERROR(_validate_line(Slice(ptr, size), &success));
if (!success) {
continue;
}
RETURN_IF_ERROR(_fill_dest_columns(Slice(ptr, size), block, columns, &rows));
}
}
*eof = (rows == 0);
@ -476,9 +510,15 @@ Status CsvReader::_create_decompressor() {
case TFileCompressType::LZ4FRAME:
compress_type = CompressType::LZ4FRAME;
break;
case TFileCompressType::LZ4BLOCK:
compress_type = CompressType::LZ4BLOCK;
break;
case TFileCompressType::DEFLATE:
compress_type = CompressType::DEFLATE;
break;
case TFileCompressType::SNAPPYBLOCK:
compress_type = CompressType::SNAPPYBLOCK;
break;
default:
return Status::InternalError("unknown compress type: {}", _file_compress_type);
}
@ -498,12 +538,18 @@ Status CsvReader::_create_decompressor() {
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
compress_type = CompressType::LZ4FRAME;
break;
case TFileFormatType::FORMAT_CSV_LZ4BLOCK:
compress_type = CompressType::LZ4BLOCK;
break;
case TFileFormatType::FORMAT_CSV_LZOP:
compress_type = CompressType::LZOP;
break;
case TFileFormatType::FORMAT_CSV_DEFLATE:
compress_type = CompressType::DEFLATE;
break;
case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK:
compress_type = CompressType::SNAPPYBLOCK;
break;
default:
return Status::InternalError("unknown format type: {}", _file_format_type);
}
@ -557,7 +603,7 @@ Status CsvReader::_fill_dest_columns(const Slice& line, Block* block,
return Status::OK();
}
Status CsvReader::_line_split_to_values(const Slice& line, bool* success) {
Status CsvReader::_validate_line(const Slice& line, bool* success) {
if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
if (!_is_load) {
return Status::InternalError("Only support csv data in utf8 codec");
@ -575,7 +621,11 @@ Status CsvReader::_line_split_to_values(const Slice& line, bool* success) {
return Status::OK();
}
}
*success = true;
return Status::OK();
}
Status CsvReader::_line_split_to_values(const Slice& line, bool* success) {
_split_line(line);
if (_is_load) {

View File

@ -221,6 +221,12 @@ private:
// TODO(ftw): parse type
Status _parse_col_types(size_t col_nums, std::vector<TypeDescriptor>* col_types);
// check the utf8 encoding of a line.
// return error status to stop processing.
// If return Status::OK but "success" is false, which means this is load request
// and the line is skipped as unqualified row, and the process should continue.
Status _validate_line(const Slice& line, bool* success);
RuntimeState* _state;
RuntimeProfile* _profile;
ScannerCounter* _counter;

View File

@ -201,7 +201,6 @@ NewPlainTextLineReader::NewPlainTextLineReader(RuntimeProfile* profile,
_output_buf_limit(0),
_file_eof(false),
_eof(false),
_stream_end(true),
_more_input_bytes(0),
_more_output_bytes(0),
_current_offset(current_offset),
@ -324,6 +323,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool
_line_reader_ctx->refresh();
int found_line_delimiter = 0;
size_t offset = 0;
bool stream_end = true;
while (!done()) {
// find line delimiter in current decompressed data
uint8_t* cur_ptr = _output_buf + _output_buf_pos;
@ -379,7 +379,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool
COUNTER_UPDATE(_bytes_read_counter, read_len);
}
if (_file_eof || read_len == 0) {
if (!_stream_end) {
if (!stream_end) {
return Status::InternalError(
"Compressed file has been truncated, which is not allowed");
} else {
@ -392,7 +392,7 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool
if (_decompressor == nullptr) {
_output_buf_limit += read_len;
_stream_end = true;
stream_end = true;
} else {
// only update input limit.
// input pos is set at MARK step
@ -418,10 +418,10 @@ Status NewPlainTextLineReader::read_line(const uint8_t** ptr, size_t* size, bool
_input_buf_limit - _input_buf_pos, /* input_len */
&input_read_bytes, _output_buf + _output_buf_limit, /* output */
_output_buf_size - _output_buf_limit, /* output_max_len */
&decompressed_len, &_stream_end, &_more_input_bytes, &_more_output_bytes));
&decompressed_len, &stream_end, &_more_input_bytes, &_more_output_bytes));
// LOG(INFO) << "after decompress:"
// << " stream_end: " << _stream_end
// << " stream_end: " << stream_end
// << " input_read_bytes: " << input_read_bytes
// << " decompressed_len: " << decompressed_len
// << " more_input_bytes: " << _more_input_bytes

View File

@ -235,7 +235,6 @@ private:
bool _file_eof;
bool _eof;
bool _stream_end;
size_t _more_input_bytes;
size_t _more_output_bytes;

View File

@ -261,22 +261,26 @@ Status VFileScanner::_get_block_impl(RuntimeState* state, Block* block, bool* eo
// use read_rows instead of _src_block_ptr->rows(), because the first column of _src_block_ptr
// may not be filled after calling `get_next_block()`, so _src_block_ptr->rows() may return wrong result.
if (read_rows > 0) {
// Convert the src block columns type to string in-place.
RETURN_IF_ERROR(_cast_to_input_block(block));
// FileReader can fill partition and missing columns itself
if (!_cur_reader->fill_all_columns()) {
// Fill rows in src block with partition columns from path. (e.g. Hive partition columns)
RETURN_IF_ERROR(_fill_columns_from_path(read_rows));
// Fill columns not exist in file with null or default value
RETURN_IF_ERROR(_fill_missing_columns(read_rows));
// If the push_down_agg_type is COUNT, no need to do the rest,
// because we only save a number in block.
if (_parent->get_push_down_agg_type() != TPushAggOp::type::COUNT) {
// Convert the src block columns type to string in-place.
RETURN_IF_ERROR(_cast_to_input_block(block));
// FileReader can fill partition and missing columns itself
if (!_cur_reader->fill_all_columns()) {
// Fill rows in src block with partition columns from path. (e.g. Hive partition columns)
RETURN_IF_ERROR(_fill_columns_from_path(read_rows));
// Fill columns not exist in file with null or default value
RETURN_IF_ERROR(_fill_missing_columns(read_rows));
}
// Apply _pre_conjunct_ctxs to filter src block.
RETURN_IF_ERROR(_pre_filter_src_block());
// Convert src block to output block (dest block), string to dest data type and apply filters.
RETURN_IF_ERROR(_convert_to_output_block(block));
// Truncate char columns or varchar columns if size is smaller than file columns
// or not found in the file column schema.
RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block));
}
// Apply _pre_conjunct_ctxs to filter src block.
RETURN_IF_ERROR(_pre_filter_src_block());
// Convert src block to output block (dest block), string to dest data type and apply filters.
RETURN_IF_ERROR(_convert_to_output_block(block));
// Truncate char columns or varchar columns if size is smaller than file columns
// or not found in the file column schema.
RETURN_IF_ERROR(_truncate_char_or_varchar_columns(block));
break;
}
} while (true);
@ -755,8 +759,10 @@ Status VFileScanner::_get_next_reader() {
case TFileFormatType::FORMAT_CSV_GZ:
case TFileFormatType::FORMAT_CSV_BZ2:
case TFileFormatType::FORMAT_CSV_LZ4FRAME:
case TFileFormatType::FORMAT_CSV_LZ4BLOCK:
case TFileFormatType::FORMAT_CSV_LZOP:
case TFileFormatType::FORMAT_CSV_DEFLATE:
case TFileFormatType::FORMAT_CSV_SNAPPYBLOCK:
case TFileFormatType::FORMAT_PROTO: {
_cur_reader = CsvReader::create_unique(_state, _profile, &_counter, *_params, range,
_file_slot_descs, _io_ctx.get());

View File

@ -550,6 +550,8 @@ public class Util {
return TFileFormatType.FORMAT_CSV_LZO;
} else if (lowerCasePath.endsWith(".deflate")) {
return TFileFormatType.FORMAT_CSV_DEFLATE;
} else if (lowerCasePath.endsWith(".snappy")) {
return TFileFormatType.FORMAT_CSV_SNAPPYBLOCK;
} else {
return TFileFormatType.FORMAT_CSV_PLAIN;
}
@ -575,6 +577,8 @@ public class Util {
return TFileCompressType.LZO;
} else if (lowerCasePath.endsWith(".deflate")) {
return TFileCompressType.DEFLATE;
} else if (lowerCasePath.endsWith(".snappy")) {
return TFileCompressType.SNAPPYBLOCK;
} else {
return TFileCompressType.PLAIN;
}
@ -599,6 +603,8 @@ public class Util {
|| fileFormatType == TFileFormatType.FORMAT_CSV_DEFLATE
|| fileFormatType == TFileFormatType.FORMAT_CSV_GZ
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZ4FRAME
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZ4BLOCK
|| fileFormatType == TFileFormatType.FORMAT_CSV_SNAPPYBLOCK
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZO
|| fileFormatType == TFileFormatType.FORMAT_CSV_LZOP
|| fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN;

View File

@ -1000,6 +1000,7 @@ public class HiveMetaStoreCache {
// File Cache for self splitter.
private final List<HiveFileStatus> files = Lists.newArrayList();
// File split cache for old splitter. This is a temp variable.
@Deprecated
private final List<FileSplit> splits = Lists.newArrayList();
private boolean isSplittable;
// The values of partitions.
@ -1021,6 +1022,7 @@ public class HiveMetaStoreCache {
}
}
@Deprecated
public void addSplit(FileSplit split) {
if (isFileVisible(split.getPath())) {
splits.add(split);

View File

@ -190,7 +190,8 @@ public final class HiveUtil {
return true;
}
// use reflection to get isSplittable method on FileInputFormat
// use reflection to get isSplitable method on FileInputFormat
// ATTN: the method name is actually "isSplitable", but the right spell is "isSplittable"
Method method = null;
for (Class<?> clazz = inputFormat.getClass(); clazz != null; clazz = clazz.getSuperclass()) {
try {

View File

@ -25,6 +25,7 @@ import org.apache.doris.analysis.TupleDescriptor;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.Util;
import org.apache.doris.planner.PlanNodeId;
import org.apache.doris.planner.external.FileSplit.FileSplitCreator;
import org.apache.doris.qe.ConnectContext;
@ -32,6 +33,7 @@ import org.apache.doris.spi.Split;
import org.apache.doris.statistics.StatisticalType;
import org.apache.doris.thrift.TExplainLevel;
import org.apache.doris.thrift.TExpr;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TFileRangeDesc;
import org.apache.doris.thrift.TFileScanNode;
import org.apache.doris.thrift.TFileScanRangeParams;
@ -221,19 +223,20 @@ public abstract class FileScanNode extends ExternalScanNode {
if (blockLocations == null) {
blockLocations = new BlockLocation[0];
}
List<Split> result = Lists.newArrayList();
TFileCompressType compressType = Util.inferFileCompressTypeByPath(path.toString());
if (!splittable || compressType != TFileCompressType.PLAIN) {
LOG.debug("Path {} is not splittable.", path);
String[] hosts = blockLocations.length == 0 ? null : blockLocations[0].getHosts();
result.add(splitCreator.create(path, 0, length, length, modificationTime, hosts, partitionValues));
return result;
}
long splitSize = ConnectContext.get().getSessionVariable().getFileSplitSize();
if (splitSize <= 0) {
splitSize = blockSize;
}
// Min split size is DEFAULT_SPLIT_SIZE(128MB).
splitSize = Math.max(splitSize, DEFAULT_SPLIT_SIZE);
List<Split> result = Lists.newArrayList();
if (!splittable) {
LOG.debug("Path {} is not splittable.", path);
String[] hosts = blockLocations.length == 0 ? null : blockLocations[0].getHosts();
result.add(splitCreator.create(path, 0, length, length, modificationTime, hosts, partitionValues));
return result;
}
long bytesRemaining;
for (bytesRemaining = length; (double) bytesRemaining / (double) splitSize > 1.1D;
bytesRemaining -= splitSize) {

View File

@ -51,6 +51,7 @@ import org.apache.doris.qe.ConnectContext;
import org.apache.doris.spi.Split;
import org.apache.doris.statistics.StatisticalType;
import org.apache.doris.thrift.TFileAttributes;
import org.apache.doris.thrift.TFileCompressType;
import org.apache.doris.thrift.TFileFormatType;
import org.apache.doris.thrift.TFileTextScanRangeParams;
import org.apache.doris.thrift.TFileType;
@ -391,4 +392,14 @@ public class HiveScanNode extends FileQueryScanNode {
public boolean pushDownAggNoGroupingCheckCol(FunctionCallExpr aggExpr, Column col) {
return !col.isAllowNull();
}
@Override
protected TFileCompressType getFileCompressType(FileSplit fileSplit) throws UserException {
TFileCompressType compressType = super.getFileCompressType(fileSplit);
// hadoop use lz4 blocked codec
if (compressType == TFileCompressType.LZ4FRAME) {
compressType = TFileCompressType.LZ4BLOCK;
}
return compressType;
}
}

View File

@ -32,11 +32,6 @@ public class HiveSplit extends FileSplit {
this.acidInfo = acidInfo;
}
public HiveSplit(Path path, long start, long length, long fileLength, String[] hosts, AcidInfo acidInfo) {
super(path, start, length, fileLength, hosts, null);
this.acidInfo = acidInfo;
}
@Override
public Object getInfo() {
return acidInfo;

View File

@ -116,6 +116,8 @@ enum TFileFormatType {
FORMAT_PROTO,
FORMAT_JNI,
FORMAT_AVRO,
FORMAT_CSV_LZ4BLOCK,
FORMAT_CSV_SNAPPYBLOCK,
}
// In previous versions, the data compression format and file format were stored together, as TFileFormatType,
@ -132,6 +134,8 @@ enum TFileCompressType {
LZ4FRAME,
DEFLATE,
LZOP,
LZ4BLOCK,
SNAPPYBLOCK
}
struct THdfsConf {

View File

@ -0,0 +1,47 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !q21 --
600005
-- !q22 --
1510010
-- !q23 --
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 2023-08-21
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 lz4
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 snappy
-- !q31 --
600005
-- !q32 --
1510010
-- !q33 --
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 2023-08-21
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 bzip2
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 deflate
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 gzip
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 lz4
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 mix
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 plain
4611870011201662970 0 HD Tube 5* 1 2014-03-22T05:11:29 2014-03-22 598875 4243808759 92f6fe1be9b9773206d6b63e50feb470 196 2314158381335918424 0 3 3 http://public_search yandex.ru.livemaster 0 0 [] [4,15,333,3912,14512,12818] [18,348,1010] [] 1846 952 29 10 1 0.77 0 0 24 73d7 1 1 0 0 3238011 0 0 0 0 1119 641 157 2014-03-22T19:51:48 0 0 0 0 utf-8 330 0 0 0 7774109565808082252 11274076 0 0 0 0 0 E 2014-03-22T11:54:54 55 2 3 4 6 [105,11,9,88,45,14,98,72,3,925,2193,6,25,1] 3137666015 cc184643699dccab8d5d4af796c47449 -1 -1 -1 nD Tp 0 -1 0 0 81 0 0 0 -1 -1 -1 -1 -1 -1 -1 -1 0 0 07d21f 0 [] 0 15284527577228392792 14270691585016129648 0 0 [] [] [] [] [] \N c1889e2b9ad1e219ed04c0e9624b5139 1404 0 snappy

View File

@ -0,0 +1,61 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
suite("test_compress_type", "p2,external,hive,external_remote,external_remote_hive") {
String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
if (enabled != null && enabled.equalsIgnoreCase("true")) {
String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
String catalog_name = "test_compress_type"
sql """drop catalog if exists ${catalog_name};"""
sql """
create catalog if not exists ${catalog_name} properties (
'type'='hms',
'hadoop.username' = 'hadoop',
'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
);
"""
logger.info("catalog " + catalog_name + " created")
sql """switch ${catalog_name};"""
logger.info("switched to catalog " + catalog_name)
sql """ use multi_catalog """
// table test_compress_partitioned has 6 partitions with different compressed file: plain, gzip, bzip2, deflate
sql """set file_split_size=0"""
explain {
sql("select count(*) from test_compress_partitioned")
contains "inputSplitNum=16, totalFileSize=734675596, scanRanges=16"
contains "partition=8/8"
}
qt_q21 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix""""
qt_q22 """select count(*) from test_compress_partitioned"""
order_qt_q23 """select * from test_compress_partitioned where watchid=4611870011201662970"""
sql """set file_split_size=8388608"""
explain {
sql("select count(*) from test_compress_partitioned")
contains "inputSplitNum=82, totalFileSize=734675596, scanRanges=82"
contains "partition=8/8"
}
qt_q31 """select count(*) from test_compress_partitioned where dt="gzip" or dt="mix""""
qt_q32 """select count(*) from test_compress_partitioned"""
order_qt_q33 """select * from test_compress_partitioned where watchid=4611870011201662970"""
sql """set file_split_size=0"""
}
}