1. Do not split compressed data files. Some data files in Hive are compressed with gzip, deflate, etc. These kinds of files cannot be split. 2. Support the lz4 block codec: for the Hive scan node, use the lz4 block codec instead of the lz4 frame codec. 3. Support the snappy block codec, for Hadoop snappy. 4. Optimize the `count(*)` query on CSV files: for a query like `select count(*) from tbl`, only the lines need to be split; there is no need to split the columns. Needs to be picked to branch-2.0 after this PR: #22304
277 lines
8.9 KiB
C++
277 lines
8.9 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <bzlib.h>
|
|
#include <lz4/lz4.h>
|
|
#include <lz4/lz4frame.h>
|
|
#include <lz4/lz4hc.h>
|
|
#include <snappy.h>
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <zlib.h>
|
|
|
|
#include <string>
|
|
|
|
#ifdef DORIS_WITH_LZO
|
|
#include <lzo/lzo1x.h>
|
|
#include <lzo/lzoconf.h>
|
|
#endif
|
|
|
|
#include "common/status.h"
|
|
|
|
namespace doris {
|
|
|
|
// Compression kinds accepted by Decompressor::create_decompressor().
// Enumerators are implicitly numbered from 0 in declaration order, so new
// entries should be appended at the end to keep the existing values stable.
enum CompressType {
    UNCOMPRESSED,
    GZIP,
    DEFLATE,
    BZIP2,
    LZ4FRAME,
    LZOP,
    LZ4BLOCK,
    SNAPPYBLOCK,
};
|
|
|
|
class Decompressor {
|
|
public:
|
|
virtual ~Decompressor() = default;
|
|
|
|
// implement in derived class
|
|
// input(in): buf where decompress begin
|
|
// input_len(in): max length of input buf
|
|
// input_bytes_read(out): bytes which is consumed by decompressor
|
|
// output(out): buf where to save decompressed data
|
|
// output_max_len(in): max length of output buf
|
|
// decompressed_len(out): decompressed data size in output buf
|
|
// stream_end(out): true if reach the and of stream,
|
|
// or normally finished decompressing entire block
|
|
// more_input_bytes(out): decompressor need more bytes to consume
|
|
// more_output_bytes(out): decompressor need more space to save decompressed data
|
|
//
|
|
// input and output buf should be allocated and released outside
|
|
virtual Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read,
|
|
uint8_t* output, size_t output_max_len, size_t* decompressed_len,
|
|
bool* stream_end, size_t* more_input_bytes,
|
|
size_t* more_output_bytes) = 0;
|
|
|
|
public:
|
|
static Status create_decompressor(CompressType type, Decompressor** decompressor);
|
|
|
|
virtual std::string debug_info();
|
|
|
|
CompressType get_type() { return _ctype; }
|
|
|
|
protected:
|
|
virtual Status init() = 0;
|
|
|
|
static uint32_t _read_int32(uint8_t* buf);
|
|
|
|
Decompressor(CompressType ctype) : _ctype(ctype) {}
|
|
|
|
CompressType _ctype;
|
|
};
|
|
|
|
class GzipDecompressor : public Decompressor {
|
|
public:
|
|
~GzipDecompressor() override;
|
|
|
|
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
|
|
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
|
|
size_t* more_input_bytes, size_t* more_output_bytes) override;
|
|
|
|
std::string debug_info() override;
|
|
|
|
private:
|
|
friend class Decompressor;
|
|
GzipDecompressor(bool is_deflate);
|
|
Status init() override;
|
|
|
|
private:
|
|
bool _is_deflate;
|
|
|
|
z_stream _z_strm;
|
|
|
|
// These are magic numbers from zlib.h. Not clear why they are not defined there.
|
|
const static int WINDOW_BITS = 15; // Maximum window size
|
|
const static int DETECT_CODEC = 32; // Determine if this is libz or gzip from header.
|
|
};
|
|
|
|
class Bzip2Decompressor : public Decompressor {
|
|
public:
|
|
~Bzip2Decompressor() override;
|
|
|
|
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
|
|
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
|
|
size_t* more_input_bytes, size_t* more_output_bytes) override;
|
|
|
|
std::string debug_info() override;
|
|
|
|
private:
|
|
friend class Decompressor;
|
|
Bzip2Decompressor() : Decompressor(CompressType::BZIP2) {}
|
|
Status init() override;
|
|
|
|
private:
|
|
bz_stream _bz_strm;
|
|
};
|
|
|
|
class Lz4FrameDecompressor : public Decompressor {
|
|
public:
|
|
~Lz4FrameDecompressor() override;
|
|
|
|
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
|
|
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
|
|
size_t* more_input_bytes, size_t* more_output_bytes) override;
|
|
|
|
std::string debug_info() override;
|
|
|
|
private:
|
|
friend class Decompressor;
|
|
Lz4FrameDecompressor() : Decompressor(CompressType::LZ4FRAME) {}
|
|
Status init() override;
|
|
|
|
size_t get_block_size(const LZ4F_frameInfo_t* info);
|
|
|
|
private:
|
|
LZ4F_dctx* _dctx;
|
|
size_t _expect_dec_buf_size;
|
|
const static unsigned DORIS_LZ4F_VERSION;
|
|
};
|
|
|
|
class Lz4BlockDecompressor : public Decompressor {
|
|
public:
|
|
~Lz4BlockDecompressor() override {}
|
|
|
|
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
|
|
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
|
|
size_t* more_input_bytes, size_t* more_output_bytes) override;
|
|
|
|
std::string debug_info() override;
|
|
|
|
private:
|
|
friend class Decompressor;
|
|
Lz4BlockDecompressor() : Decompressor(CompressType::LZ4FRAME) {}
|
|
Status init() override;
|
|
};
|
|
|
|
class SnappyBlockDecompressor : public Decompressor {
|
|
public:
|
|
~SnappyBlockDecompressor() override {}
|
|
|
|
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
|
|
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
|
|
size_t* more_input_bytes, size_t* more_output_bytes) override;
|
|
|
|
std::string debug_info() override;
|
|
|
|
private:
|
|
friend class Decompressor;
|
|
SnappyBlockDecompressor() : Decompressor(CompressType::SNAPPYBLOCK) {}
|
|
Status init() override;
|
|
};
|
|
|
|
#ifdef DORIS_WITH_LZO
|
|
class LzopDecompressor : public Decompressor {
|
|
public:
|
|
~LzopDecompressor() override = default;
|
|
|
|
Status decompress(uint8_t* input, size_t input_len, size_t* input_bytes_read, uint8_t* output,
|
|
size_t output_max_len, size_t* decompressed_len, bool* stream_end,
|
|
size_t* more_input_bytes, size_t* more_output_bytes) override;
|
|
|
|
std::string debug_info() override;
|
|
|
|
private:
|
|
friend class Decompressor;
|
|
LzopDecompressor()
|
|
: Decompressor(CompressType::LZOP), _header_info(), _is_header_loaded(false) {}
|
|
Status init() override;
|
|
|
|
private:
|
|
enum LzoChecksum { CHECK_NONE, CHECK_CRC32, CHECK_ADLER };
|
|
|
|
private:
|
|
uint8_t* get_uint8(uint8_t* ptr, uint8_t* value) {
|
|
*value = *ptr;
|
|
return ptr + sizeof(uint8_t);
|
|
}
|
|
|
|
uint8_t* get_uint16(uint8_t* ptr, uint16_t* value) {
|
|
*value = *ptr << 8 | *(ptr + 1);
|
|
return ptr + sizeof(uint16_t);
|
|
}
|
|
|
|
uint8_t* get_uint32(uint8_t* ptr, uint32_t* value) {
|
|
*value = (*ptr << 24) | (*(ptr + 1) << 16) | (*(ptr + 2) << 8) | *(ptr + 3);
|
|
return ptr + sizeof(uint32_t);
|
|
}
|
|
|
|
LzoChecksum header_type(int flags) { return (flags & F_H_CRC32) ? CHECK_CRC32 : CHECK_ADLER; }
|
|
|
|
LzoChecksum input_type(int flags) {
|
|
return (flags & F_CRC32_C) ? CHECK_CRC32 : (flags & F_ADLER32_C) ? CHECK_ADLER : CHECK_NONE;
|
|
}
|
|
|
|
LzoChecksum output_type(int flags) {
|
|
return (flags & F_CRC32_D) ? CHECK_CRC32 : (flags & F_ADLER32_D) ? CHECK_ADLER : CHECK_NONE;
|
|
}
|
|
|
|
Status parse_header_info(uint8_t* input, size_t input_len, size_t* input_bytes_read,
|
|
size_t* more_bytes_needed);
|
|
|
|
Status checksum(LzoChecksum type, const std::string& source, uint32_t expected, uint8_t* ptr,
|
|
size_t len);
|
|
|
|
private:
|
|
// lzop header info
|
|
struct HeaderInfo {
|
|
uint16_t version;
|
|
uint16_t lib_version;
|
|
uint16_t version_needed;
|
|
uint8_t method;
|
|
std::string filename;
|
|
uint32_t header_size;
|
|
LzoChecksum header_checksum_type;
|
|
LzoChecksum input_checksum_type;
|
|
LzoChecksum output_checksum_type;
|
|
};
|
|
|
|
struct HeaderInfo _header_info;
|
|
|
|
// true if header is decompressed and loaded
|
|
bool _is_header_loaded;
|
|
|
|
private:
|
|
const static uint8_t LZOP_MAGIC[9];
|
|
const static uint64_t LZOP_VERSION;
|
|
const static uint64_t MIN_LZO_VERSION;
|
|
const static uint32_t MIN_HEADER_SIZE;
|
|
const static uint32_t LZO_MAX_BLOCK_SIZE;
|
|
|
|
const static uint32_t CRC32_INIT_VALUE;
|
|
const static uint32_t ADLER32_INIT_VALUE;
|
|
|
|
const static uint64_t F_H_CRC32;
|
|
const static uint64_t F_MASK;
|
|
const static uint64_t F_OS_MASK;
|
|
const static uint64_t F_CS_MASK;
|
|
const static uint64_t F_RESERVED;
|
|
const static uint64_t F_MULTIPART;
|
|
const static uint64_t F_H_FILTER;
|
|
const static uint64_t F_H_EXTRA_FIELD;
|
|
const static uint64_t F_CRC32_C;
|
|
const static uint64_t F_ADLER32_C;
|
|
const static uint64_t F_CRC32_D;
|
|
const static uint64_t F_ADLER32_D;
|
|
};
|
|
#endif // DORIS_WITH_LZO
|
|
|
|
} // namespace doris
|