[Opt](Compression) Opt zstd block decompression by ZSTD_decompressDCtx(). (#27534)
Opt zstd block decompression by `ZSTD_decompressDCtx()` to replace streaming decompression. It will improve performance but consume more memory. Test result: - env: 1 node(16 cores, 64G). - parquet column: 100 million rows of char(255) column. - result: 5.2 -> 4.6.
This commit is contained in:
@ -848,42 +848,28 @@ public:
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// follow ZSTD official example
|
||||
// https://github.com/facebook/zstd/blob/dev/examples/streaming_decompression.c
|
||||
Status decompress(const Slice& input, Slice* output) override {
|
||||
DContext* context;
|
||||
bool compress_failed = false;
|
||||
bool decompress_failed = false;
|
||||
RETURN_IF_ERROR(_acquire_decompression_ctx(&context));
|
||||
Defer defer {[&] {
|
||||
if (compress_failed) {
|
||||
if (decompress_failed) {
|
||||
_delete_decompression_ctx(context);
|
||||
} else {
|
||||
_release_decompression_ctx(context);
|
||||
}
|
||||
}};
|
||||
|
||||
ZSTD_inBuffer in_buf = {input.data, input.size, 0};
|
||||
ZSTD_outBuffer out_buf = {output->data, output->size, 0};
|
||||
|
||||
while (in_buf.pos < in_buf.size) {
|
||||
// do decompress
|
||||
auto ret = ZSTD_decompressStream(context->ctx, &out_buf, &in_buf);
|
||||
|
||||
if (ZSTD_isError(ret)) {
|
||||
compress_failed = true;
|
||||
return Status::InvalidArgument("ZSTD_decompressStream error: {}",
|
||||
ZSTD_getErrorString(ZSTD_getErrorCode(ret)));
|
||||
}
|
||||
|
||||
// ret is ZSTD hint for needed output buffer size
|
||||
if (ret > 0 && out_buf.pos == out_buf.size) {
|
||||
compress_failed = true;
|
||||
return Status::InvalidArgument("ZSTD_decompressStream output buffer full");
|
||||
}
|
||||
size_t ret = ZSTD_decompressDCtx(context->ctx, output->data, output->size, input.data,
|
||||
input.size);
|
||||
if (ZSTD_isError(ret)) {
|
||||
decompress_failed = true;
|
||||
return Status::InvalidArgument("ZSTD_decompressDCtx error: {}",
|
||||
ZSTD_getErrorString(ZSTD_getErrorCode(ret)));
|
||||
}
|
||||
|
||||
// set decompressed size for caller
|
||||
output->size = out_buf.pos;
|
||||
output->size = ret;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -1412,6 +1412,88 @@ TBLPROPERTIES (
|
||||
|
||||
msck repair table parquet_gzip_all_types;
|
||||
|
||||
CREATE TABLE `parquet_zstd_all_types`(
|
||||
`t_null_string` string,
|
||||
`t_null_varchar` varchar(65535),
|
||||
`t_null_char` char(10),
|
||||
`t_null_decimal_precision_2` decimal(2,1),
|
||||
`t_null_decimal_precision_4` decimal(4,2),
|
||||
`t_null_decimal_precision_8` decimal(8,4),
|
||||
`t_null_decimal_precision_17` decimal(17,8),
|
||||
`t_null_decimal_precision_18` decimal(18,8),
|
||||
`t_null_decimal_precision_38` decimal(38,16),
|
||||
`t_empty_string` string,
|
||||
`t_string` string,
|
||||
`t_empty_varchar` varchar(65535),
|
||||
`t_varchar` varchar(65535),
|
||||
`t_varchar_max_length` varchar(65535),
|
||||
`t_char` char(10),
|
||||
`t_int` int,
|
||||
`t_bigint` bigint,
|
||||
`t_float` float,
|
||||
`t_double` double,
|
||||
`t_boolean_true` boolean,
|
||||
`t_boolean_false` boolean,
|
||||
`t_decimal_precision_2` decimal(2,1),
|
||||
`t_decimal_precision_4` decimal(4,2),
|
||||
`t_decimal_precision_8` decimal(8,4),
|
||||
`t_decimal_precision_17` decimal(17,8),
|
||||
`t_decimal_precision_18` decimal(18,8),
|
||||
`t_decimal_precision_38` decimal(38,16),
|
||||
`t_binary` binary,
|
||||
`t_map_string` map<string,string>,
|
||||
`t_map_varchar` map<varchar(65535),varchar(65535)>,
|
||||
`t_map_char` map<char(10),char(10)>,
|
||||
`t_map_int` map<int,int>,
|
||||
`t_map_bigint` map<bigint,bigint>,
|
||||
`t_map_float` map<float,float>,
|
||||
`t_map_double` map<double,double>,
|
||||
`t_map_boolean` map<boolean,boolean>,
|
||||
`t_map_decimal_precision_2` map<decimal(2,1),decimal(2,1)>,
|
||||
`t_map_decimal_precision_4` map<decimal(4,2),decimal(4,2)>,
|
||||
`t_map_decimal_precision_8` map<decimal(8,4),decimal(8,4)>,
|
||||
`t_map_decimal_precision_17` map<decimal(17,8),decimal(17,8)>,
|
||||
`t_map_decimal_precision_18` map<decimal(18,8),decimal(18,8)>,
|
||||
`t_map_decimal_precision_38` map<decimal(38,16),decimal(38,16)>,
|
||||
`t_array_string` array<string>,
|
||||
`t_array_int` array<int>,
|
||||
`t_array_bigint` array<bigint>,
|
||||
`t_array_float` array<float>,
|
||||
`t_array_double` array<double>,
|
||||
`t_array_boolean` array<boolean>,
|
||||
`t_array_varchar` array<varchar(65535)>,
|
||||
`t_array_char` array<char(10)>,
|
||||
`t_array_decimal_precision_2` array<decimal(2,1)>,
|
||||
`t_array_decimal_precision_4` array<decimal(4,2)>,
|
||||
`t_array_decimal_precision_8` array<decimal(8,4)>,
|
||||
`t_array_decimal_precision_17` array<decimal(17,8)>,
|
||||
`t_array_decimal_precision_18` array<decimal(18,8)>,
|
||||
`t_array_decimal_precision_38` array<decimal(38,16)>,
|
||||
`t_struct_bigint` struct<s_bigint:bigint>,
|
||||
`t_complex` map<string,array<struct<s_int:int>>>,
|
||||
`t_struct_nested` struct<struct_field:array<string>>,
|
||||
`t_struct_null` struct<struct_field_null:string,struct_field_null2:string>,
|
||||
`t_struct_non_nulls_after_nulls` struct<struct_non_nulls_after_nulls1:int,struct_non_nulls_after_nulls2:string>,
|
||||
`t_nested_struct_non_nulls_after_nulls` struct<struct_field1:int,struct_field2:string,strict_field3:struct<nested_struct_field1:int,nested_struct_field2:string>>,
|
||||
`t_map_null_value` map<string,string>,
|
||||
`t_array_string_starting_with_nulls` array<string>,
|
||||
`t_array_string_with_nulls_in_between` array<string>,
|
||||
`t_array_string_ending_with_nulls` array<string>,
|
||||
`t_array_string_all_nulls` array<string>
|
||||
) ROW FORMAT SERDE
|
||||
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
|
||||
STORED AS INPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
|
||||
OUTPUTFORMAT
|
||||
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
|
||||
LOCATION
|
||||
'/user/doris/preinstalled_data/parquet_table/parquet_gzip_all_types'
|
||||
TBLPROPERTIES (
|
||||
'transient_lastDdlTime'='1681213018',
|
||||
"parquet.compression"="ZSTD");
|
||||
|
||||
msck repair table parquet_zstd_all_types;
|
||||
|
||||
CREATE TABLE `rcbinary_all_types`(
|
||||
`t_null_string` string,
|
||||
`t_null_varchar` varchar(65535),
|
||||
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
@ -96,6 +96,7 @@ suite("test_hive_basic_type", "external_docker,hive,external_docker_hive,p0,exte
|
||||
order_qt_33 """select * from ${catalog_name}.${ex_db_name}.parquet_all_types limit 1;"""
|
||||
|
||||
order_qt_36 """select * from ${catalog_name}.${ex_db_name}.parquet_gzip_all_types limit 1;"""
|
||||
order_qt_42 """select * from ${catalog_name}.${ex_db_name}.parquet_zstd_all_types limit 1;"""
|
||||
|
||||
// hive tables of json classes do not necessarily support column separation to identify errors
|
||||
//order_qt_8 """select * from ${catalog_name}.${ex_db_name}.json_all_types limit 1;"""
|
||||
|
||||
Reference in New Issue
Block a user