393 lines
13 KiB
C++
393 lines
13 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include <cstring>
#include <sstream>
#include <string>

#include <boost/assign/list_of.hpp>

#include "util/decompress.h"
#include "exec/read_write_util.h"
#include "runtime/runtime_state.h"
#include "gen_cpp/Descriptors_types.h"

// Codec libraries
#include <zlib.h>
#include <bzlib.h>
#include <snappy/snappy.h>
|
|
|
|
namespace doris {
|
|
|
|
// Constructs a gzip/deflate decompressor.
//   mem_pool, reuse_buffer: passed straight through to the Codec base constructor.
//   is_deflate: remembered in _is_deflate and consulted later by init() to pick
//               the zlib window-bits setting.
// The zlib stream is only zeroed here; init() must be called before use.
GzipDecompressor::GzipDecompressor(MemPool* mem_pool, bool reuse_buffer, bool is_deflate)
        : Codec(mem_pool, reuse_buffer),
          _is_deflate(is_deflate) {
    // zlib expects zalloc/zfree/opaque (and next_in/avail_in) to be initialized
    // before inflateInit2(); all-zero selects the library defaults. memset is
    // used instead of the legacy POSIX bzero().
    std::memset(&_stream, 0, sizeof(_stream));
}
|
|
|
|
// Releases whatever zlib state is attached to _stream. The return code of
// inflateEnd() is deliberately discarded: a destructor has no way to report it.
GzipDecompressor::~GzipDecompressor() {
    static_cast<void>(inflateEnd(&_stream));
}
|
|
|
|
// Initializes the zlib inflate state held in _stream.
// When _is_deflate is set, a negative window size is passed (zlib's convention
// for raw deflate data); otherwise WINDOW_BITS is OR-ed with DETECT_CODEC
// (presumably zlib's header auto-detection flag for zlib/gzip -- confirm in the
// class header).
// Returns Status::OK on success, or an error Status describing the zlib failure.
Status GzipDecompressor::init() {
    const int window_bits = _is_deflate ? -WINDOW_BITS : (WINDOW_BITS | DETECT_CODEC);

    const int ret = inflateInit2(&_stream, window_bits);
    if (ret != Z_OK) {
        // zlib may leave msg as NULL when initialization fails, so it must
        // not be handed to std::string unchecked; fall back to the numeric code.
        std::stringstream ss;
        ss << "zlib inflateInit failed: ";
        if (_stream.msg != NULL) {
            ss << _stream.msg;
        } else {
            ss << "error code " << ret;
        }
        return Status(ss.str());
    }

    return Status::OK;
}
|
|
|
|
// Builds "<prefix><detail>" for a failed zlib call. zlib leaves z_stream::msg
// as NULL for several error codes, so the numeric return code is used as the
// detail whenever no message text is available.
static std::string zlib_failure_message(const char* prefix, const char* zlib_msg, int code) {
    std::stringstream ss;
    ss << prefix;
    if (zlib_msg != NULL) {
        ss << zlib_msg;
    } else {
        ss << "error code " << code;
    }
    return ss.str();
}

// Inflates one complete zlib/gzip/deflate stream.
//   input_length, input: the compressed bytes.
//   output_length, output:
//     - *output_length != 0 on entry: the caller supplies a buffer of that size
//       at *output. It is never grown; if it is too small an error is returned.
//       *output_length is left unchanged on success.
//     - *output_length == 0 on entry: a buffer is obtained from
//       _temp_memory_pool (or the previous _out_buffer is reused when
//       _reuse_buffer is set), doubled until the stream fits, and
//       *output / *output_length are set to the buffer and the number of bytes
//       produced.
// Returns Status::OK on success; an error Status if zlib reports a failure, the
// input ends before the stream does, the caller's buffer is too small, or the
// required buffer would exceed MAX_BLOCK_SIZE.
Status GzipDecompressor::process_block(int input_length, uint8_t* input,
        int* output_length, uint8_t** output) {
    bool use_temp = false;

    // If length is set then the output has been allocated.
    if (*output_length != 0) {
        _buffer_length = *output_length;
        _out_buffer = *output;
    } else if (!_reuse_buffer || _out_buffer == NULL) {
        // Guess that we will need 2x the input length. The limit is tested
        // before doubling so the multiplication cannot overflow and slip past
        // the guard.
        if (input_length > MAX_BLOCK_SIZE / 2) {
            return Status("Decompressor: block size is too big");
        }

        _buffer_length = input_length * 2;
        _out_buffer = _temp_memory_pool.allocate(_buffer_length);
        use_temp = true;
    }

    int ret = Z_OK;

    while (ret != Z_STREAM_END) {
        // Every attempt starts over from the beginning of the input.
        _stream.next_in = reinterpret_cast<Bytef*>(input);
        _stream.avail_in = input_length;
        _stream.next_out = reinterpret_cast<Bytef*>(_out_buffer);
        _stream.avail_out = _buffer_length;

        // Z_PARTIAL_FLUSH is the named constant for the literal 1 that was
        // passed here before.
        ret = inflate(&_stream, Z_PARTIAL_FLUSH);

        if (ret == Z_STREAM_END) {
            break;
        }

        if (ret != Z_OK) {
            return Status(zlib_failure_message("zlib inflate failed: ", _stream.msg, ret));
        }

        // Z_OK: inflate stopped before the end of the stream. If output space
        // is left over, the input ran out first; a bigger buffer cannot help.
        if (_stream.avail_out != 0) {
            return Status("zlib inflate failed: compressed input is truncated");
        }

        // Not enough output space.
        DCHECK_EQ(*output_length, 0);

        if (*output_length != 0) {
            return Status("Too small a buffer passed to GzipDecompressor");
        }

        _temp_memory_pool.clear();

        // Same overflow-safe limit test as for the initial guess.
        if (_buffer_length > MAX_BLOCK_SIZE / 2) {
            return Status("Decompressor: block size is too big");
        }

        _buffer_length *= 2;
        _out_buffer = _temp_memory_pool.allocate(_buffer_length);

        const int retry_reset = inflateReset(&_stream);
        if (retry_reset != Z_OK) {
            return Status(zlib_failure_message(
                    "zlib inflateReset failed: ", _stream.msg, retry_reset));
        }
    }

    // Leave the stream ready for the next block.
    const int final_reset = inflateReset(&_stream);
    if (final_reset != Z_OK) {
        return Status(zlib_failure_message(
                "zlib inflateReset failed: ", _stream.msg, final_reset));
    }

    *output = _out_buffer;

    // _stream.avail_out is the number of bytes *left* in the out buffer, but
    // we're interested in the number of bytes used.
    if (*output_length == 0) {
        *output_length = _buffer_length - _stream.avail_out;
    }

    if (use_temp) {
        // Hand the freshly allocated buffer over to the long-lived pool.
        _memory_pool->acquire_data(&_temp_memory_pool, _reuse_buffer);
    }

    return Status::OK;
}
|
|
|
|
// Passes mem_pool and reuse_buffer through to the Codec base constructor; the
// constructor body itself does nothing.
BzipDecompressor::BzipDecompressor(MemPool* mem_pool, bool reuse_buffer)
        : Codec(mem_pool, reuse_buffer) {}
|
|
|
|
// Decompresses one complete bzip2 buffer with BZ2_bzBuffToBuffDecompress().
//   input_length, input: the compressed bytes.
//   output_length, output:
//     - *output_length != 0 on entry: the caller supplies a buffer of that size
//       at *output. It is never grown; if it is too small an error is returned.
//       *output_length is left unchanged on success.
//     - *output_length == 0 on entry: a buffer is obtained from
//       _temp_memory_pool (or the previous _out_buffer is reused when
//       _reuse_buffer is set), doubled whenever bzip2 reports
//       BZ_OUTBUFF_FULL, and *output / *output_length are set to the buffer and
//       the number of bytes produced.
// Returns Status::OK on success; an error Status if bzip2 fails, the caller's
// buffer is too small, or the required buffer would exceed MAX_BLOCK_SIZE.
Status BzipDecompressor::process_block(int input_length, uint8_t* input,
        int* output_length, uint8_t** output) {
    bool use_temp = false;

    // If length is set then the output has been allocated.
    if (*output_length != 0) {
        _buffer_length = *output_length;
        _out_buffer = *output;
    } else if (!_reuse_buffer || _out_buffer == NULL) {
        // Guess that we will need 2x the input length. The limit is tested
        // before doubling so the multiplication cannot overflow and slip past
        // the guard.
        if (input_length > MAX_BLOCK_SIZE / 2) {
            return Status("Decompressor: block size is too big");
        }

        _buffer_length = input_length * 2;
        _out_buffer = _temp_memory_pool.allocate(_buffer_length);
        use_temp = true;
    }

    int ret = BZ_OUTBUFF_FULL;
    unsigned int outlen = 0;

    while (ret == BZ_OUTBUFF_FULL) {
        // _out_buffer is cleared at the bottom of the loop when the previous
        // attempt ran out of space; allocate a buffer twice as large.
        if (_out_buffer == NULL) {
            DCHECK_EQ(*output_length, 0);
            _temp_memory_pool.clear();

            // Same overflow-safe limit test as for the initial guess.
            if (_buffer_length > MAX_BLOCK_SIZE / 2) {
                return Status("Decompressor: block size is too big");
            }

            _buffer_length *= 2;
            _out_buffer = _temp_memory_pool.allocate(_buffer_length);
        }

        // In: capacity of the destination. Out: bytes actually written.
        outlen = static_cast<unsigned int>(_buffer_length);

        ret = BZ2_bzBuffToBuffDecompress(
                reinterpret_cast<char*>(_out_buffer), &outlen,
                reinterpret_cast<char*>(input),
                static_cast<unsigned int>(input_length), 0, 0);

        if (ret == BZ_OUTBUFF_FULL) {
            // If the output_length was passed we must have enough room.
            DCHECK_EQ(*output_length, 0);

            if (*output_length != 0) {
                return Status("Too small a buffer passed to BzipDecompressor");
            }

            _out_buffer = NULL;
        }
    }

    if (ret != BZ_OK) {
        std::stringstream ss;
        ss << "bzlib BZ2_bzBuffToBuffDecompressor failed: " << ret;
        return Status(ss.str());
    }

    *output = _out_buffer;

    if (*output_length == 0) {
        *output_length = outlen;
    }

    if (use_temp) {
        // Hand the freshly allocated buffer over to the long-lived pool.
        _memory_pool->acquire_data(&_temp_memory_pool, _reuse_buffer);
    }

    return Status::OK;
}
|
|
|
|
// Passes mem_pool and reuse_buffer through to the Codec base constructor; the
// constructor body itself does nothing.
SnappyDecompressor::SnappyDecompressor(MemPool* mem_pool, bool reuse_buffer) :
        Codec(mem_pool, reuse_buffer) {}
|
|
|
|
// Decompresses a single raw snappy buffer.
//   input_length, input: the compressed bytes.
//   output_length, output:
//     - *output_length != 0 on entry: the caller supplies a buffer of that size
//       at *output; both are left untouched on success. The buffer must be at
//       least as large as the uncompressed size recorded in the snappy header.
//     - *output_length == 0 on entry: the previous _out_buffer is reused when
//       _reuse_buffer is set and it is large enough, otherwise a buffer is
//       allocated from _memory_pool; *output / *output_length are set to the
//       buffer and the uncompressed size.
// Returns Status::OK on success; an error Status if the snappy header cannot
// be read, the caller's buffer is too small, the uncompressed size exceeds
// MAX_BLOCK_SIZE, or decompression fails.
Status SnappyDecompressor::process_block(int input_length, uint8_t* input,
        int* output_length, uint8_t** output) {
    const char* compressed = reinterpret_cast<const char*>(input);

    // Snappy saves the uncompressed length so we never have to retry. It is
    // read in both modes because it is also needed to validate a
    // caller-supplied buffer.
    size_t uncompressed_length = 0;

    if (!snappy::GetUncompressedLength(compressed, input_length, &uncompressed_length)) {
        return Status("Snappy: GetUncompressedLength failed");
    }

    // If length is set then the output has been allocated.
    if (*output_length != 0) {
        // RawUncompress() writes as many bytes as the header declares without
        // knowing the destination capacity, so a corrupt header must be
        // rejected here to avoid writing past the caller's buffer.
        if (*output_length < 0 ||
                uncompressed_length > static_cast<size_t>(*output_length)) {
            return Status("Too small a buffer passed to SnappyDecompressor");
        }

        _buffer_length = *output_length;
        _out_buffer = *output;
    } else {
        const bool need_new_buffer = !_reuse_buffer || _out_buffer == NULL ||
                static_cast<size_t>(_buffer_length) < uncompressed_length;

        if (need_new_buffer) {
            // Check the limit on the full-width value, before it is narrowed
            // into _buffer_length.
            if (uncompressed_length > static_cast<size_t>(MAX_BLOCK_SIZE)) {
                return Status("Decompressor: block size is too big");
            }

            _buffer_length = uncompressed_length;
            _out_buffer = _memory_pool->allocate(_buffer_length);
        }
    }

    if (!snappy::RawUncompress(compressed, static_cast<size_t>(input_length),
                               reinterpret_cast<char*>(_out_buffer))) {
        return Status("Snappy: RawUncompress failed");
    }

    // Only report size and location when this function chose the buffer.
    if (*output_length == 0) {
        *output_length = uncompressed_length;
        *output = _out_buffer;
    }

    return Status::OK;
}
|
|
|
|
// Passes mem_pool and reuse_buffer through to the Codec base constructor; the
// constructor body itself does nothing.
SnappyBlockDecompressor::SnappyBlockDecompressor(MemPool* mem_pool, bool reuse_buffer)
        : Codec(mem_pool, reuse_buffer) {}
|
|
|
|
// Hadoop uses a block compression scheme on top of snappy. As per the hadoop docs
|
|
// the input is split into blocks. Each block "contains the uncompressed length for
|
|
// the block followed by one or more length-prefixed blocks of compressed data."
|
|
// This is essentially blocks of blocks.
|
|
// The outer block consists of:
|
|
// - 4 byte little endian uncompressed_size
|
|
// < inner blocks >
|
|
// ... repeated until input_len is consumed ..
|
|
// The inner blocks have:
|
|
// - 4-byte little endian compressed_size
|
|
// < snappy compressed block >
|
|
// - 4-byte little endian compressed_size
|
|
// < snappy compressed block >
|
|
// ... repeated until uncompressed_size from outer block is consumed ...
|
|
|
|
// Utility function to decompress snappy block compressed data. If size_only is true,
|
|
// this function does not decompress but only computes the output size and writes
|
|
// the result to *output_len.
|
|
// If size_only is false, output must be preallocated to output_len and this needs to
|
|
// be exactly big enough to hold the decompressed output.
|
|
// size_only is a O(1) operations (just reads a single varint for each snappy block).
|
|
static Status snappy_block_decompress(int input_len, uint8_t* input, bool size_only,
|
|
int* output_len, char* output) {
|
|
|
|
int uncompressed_total_len = 0;
|
|
|
|
while (input_len > 0) {
|
|
size_t uncompressed_block_len = ReadWriteUtil::get_int(input);
|
|
input += sizeof(int32_t);
|
|
input_len -= sizeof(int32_t);
|
|
|
|
if (uncompressed_block_len > Codec::MAX_BLOCK_SIZE || uncompressed_block_len == 0) {
|
|
if (uncompressed_total_len == 0) {
|
|
// TODO: is this check really robust?
|
|
std::stringstream ss;
|
|
ss << "Decompressor: block size is too big. Data is likely corrupt. "
|
|
<< "Size: " << uncompressed_block_len;
|
|
return Status(ss.str());
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
if (!size_only) {
|
|
int remaining_output_size = *output_len - uncompressed_total_len;
|
|
DCHECK_GE(remaining_output_size, uncompressed_block_len);
|
|
}
|
|
|
|
while (uncompressed_block_len > 0) {
|
|
// Read the length of the next snappy compressed block.
|
|
size_t compressed_len = ReadWriteUtil::get_int(input);
|
|
input += sizeof(int32_t);
|
|
input_len -= sizeof(int32_t);
|
|
|
|
if (compressed_len == 0 || compressed_len > input_len) {
|
|
if (uncompressed_total_len == 0) {
|
|
return Status(
|
|
"Decompressor: invalid compressed length. Data is likely corrupt.");
|
|
}
|
|
|
|
input_len = 0;
|
|
break;
|
|
}
|
|
|
|
// Read how big the output will be.
|
|
size_t uncompressed_len = 0;
|
|
|
|
if (!snappy::GetUncompressedLength(reinterpret_cast<char*>(input),
|
|
input_len, &uncompressed_len)) {
|
|
if (uncompressed_total_len == 0) {
|
|
return Status("Snappy: GetUncompressedLength failed");
|
|
}
|
|
|
|
input_len = 0;
|
|
break;
|
|
}
|
|
|
|
DCHECK_GT(uncompressed_len, 0);
|
|
|
|
if (!size_only) {
|
|
// Decompress this snappy block
|
|
if (!snappy::RawUncompress(reinterpret_cast<char*>(input),
|
|
compressed_len, output)) {
|
|
return Status("Snappy: RawUncompress failed");
|
|
}
|
|
|
|
output += uncompressed_len;
|
|
}
|
|
|
|
input += compressed_len;
|
|
input_len -= compressed_len;
|
|
uncompressed_block_len -= uncompressed_len;
|
|
uncompressed_total_len += uncompressed_len;
|
|
}
|
|
}
|
|
|
|
if (size_only) {
|
|
*output_len = uncompressed_total_len;
|
|
} else if (*output_len != uncompressed_total_len) {
|
|
return Status("Snappy: Decompressed size is not correct.");
|
|
}
|
|
|
|
return Status::OK;
|
|
}
|
|
|
|
// Decompresses Hadoop-style snappy block compressed data.
//   input_len, input: the block-compressed bytes.
//   output_len, output:
//     - *output_len != 0 on entry: the caller supplies a buffer of exactly
//       that many bytes at *output.
//     - *output_len == 0 on entry: the size is computed with a size-only pass
//       over the input, the previous _out_buffer is reused when _reuse_buffer
//       is set and it is large enough (otherwise a buffer is allocated from
//       _memory_pool), and *output / *output_len are set accordingly.
// Returns Status::OK on success; an error Status if the data is corrupt, the
// output length is negative or exceeds MAX_BLOCK_SIZE, or the decompressed
// size does not match *output_len.
Status SnappyBlockDecompressor::process_block(int input_len, uint8_t* input,
        int* output_len, uint8_t** output) {
    const bool size_unknown = (*output_len == 0);

    if (size_unknown) {
        // If we don't know the size beforehand, compute it.
        RETURN_IF_ERROR(snappy_block_decompress(input_len, input, true, output_len, NULL));
        DCHECK_NE(*output_len, 0);
    }

    // Validate the size before it is used to allocate or to write anything.
    if (*output_len < 0) {
        std::stringstream ss;
        ss << "Decompressor: invalid output length.  Data is likely corrupt. "
           << "Size: " << *output_len;
        return Status(ss.str());
    }

    if (*output_len > MAX_BLOCK_SIZE) {
        // TODO: is this check really robust?
        std::stringstream ss;
        ss << "Decompressor: block size is too big.  Data is likely corrupt. "
           << "Size: " << *output_len;
        return Status(ss.str());
    }

    if (size_unknown) {
        if (!_reuse_buffer || _out_buffer == NULL || _buffer_length < *output_len) {
            // Need to allocate a new buffer
            _buffer_length = *output_len;
            _out_buffer = _memory_pool->allocate(_buffer_length);
        }

        *output = _out_buffer;
    }

    DCHECK(*output != NULL);

    char* out_ptr = reinterpret_cast<char*>(*output);
    RETURN_IF_ERROR(snappy_block_decompress(input_len, input, false, output_len, out_ptr));
    return Status::OK;
}
|
|
|
|
}
|