[Test][Fix](parquet-reader) Add parquet decoder unit tests and fix bugs by these tests. (#49922)

This commit is contained in:
Qi Chen
2025-04-10 21:56:53 +08:00
committed by GitHub
parent 7ce420c194
commit 8199febcdb
27 changed files with 3986 additions and 226 deletions

View File

@ -43,11 +43,12 @@ public:
~BoolPlainDecoder() override = default;
// Set the data to be decoded
void set_data(Slice* data) override {
Status set_data(Slice* data) override {
bool_values_.Reset((const uint8_t*)data->data, data->size);
num_unpacked_values_ = 0;
unpacked_value_idx_ = 0;
_offset = 0;
return Status::OK();
}
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,

View File

@ -30,29 +30,29 @@
#include "vec/exec/format/parquet/parquet_common.h"
namespace doris::vectorized {
void BoolRLEDecoder::set_data(Slice* slice) {
Status BoolRLEDecoder::set_data(Slice* slice) {
_data = slice;
_num_bytes = slice->size;
_offset = 0;
_current_value_idx = 0;
if (_num_bytes < 4) {
LOG(FATAL) << "Received invalid length : " + std::to_string(_num_bytes) +
" (corrupt data page?)";
return Status::IOError("Received invalid length : " + std::to_string(_num_bytes) +
" (corrupt data page?)");
}
// Load the first 4 bytes in little-endian, which indicates the length
const uint8_t* data = reinterpret_cast<const uint8_t*>(_data->data);
uint32_t num_bytes = decode_fixed32_le(data);
if (num_bytes > static_cast<uint32_t>(_num_bytes - 4)) {
LOG(FATAL) << ("Received invalid number of bytes : " + std::to_string(num_bytes) +
" (corrupt data page?)");
return Status::IOError("Received invalid number of bytes : " + std::to_string(num_bytes) +
" (corrupt data page?)");
}
_num_bytes = num_bytes;
auto decoder_data = data + 4;
_decoder = RleDecoder<uint8_t>(decoder_data, num_bytes, 1);
return Status::OK();
}
Status BoolRLEDecoder::skip_values(size_t num_values) {
_current_value_idx += num_values;
_decoder.Skip(num_values);
return Status::OK();
}
@ -76,15 +76,16 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt
if (!_decoder.get_values(_values.data(), max_values)) {
return Status::IOError("Can't read enough booleans in rle decoder");
}
size_t current_value_idx = 0;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
bool value; // Can't use uint8_t directly, we should correct it.
for (size_t i = 0; i < run_length; ++i) {
DCHECK(_current_value_idx < max_values)
<< _current_value_idx << " vs. " << max_values;
value = _values[_current_value_idx++];
DCHECK(current_value_idx < max_values)
<< current_value_idx << " vs. " << max_values;
value = _values[current_value_idx++];
column_data[data_index++] = (UInt8)value;
}
break;
@ -94,7 +95,7 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_current_value_idx += run_length;
current_value_idx += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
@ -102,7 +103,6 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt
}
}
}
_current_value_idx = 0;
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -40,7 +40,7 @@ public:
BoolRLEDecoder() = default;
~BoolRLEDecoder() override = default;
void set_data(Slice* slice) override;
Status set_data(Slice* slice) override;
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector, bool is_dict_filter) override;
@ -55,6 +55,5 @@ private:
RleDecoder<uint8_t> _decoder;
std::vector<uint8_t> _values;
size_t _num_bytes;
size_t _current_value_idx = 0;
};
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -59,9 +59,10 @@ public:
void set_type_length(int32_t type_length) { _type_length = type_length; }
// Set the data to be decoded
virtual void set_data(Slice* data) {
virtual Status set_data(Slice* data) {
_data = data;
_offset = 0;
return Status::OK();
}
// Write the decoded values batch to doris's column
@ -95,13 +96,14 @@ public:
~BaseDictDecoder() override = default;
// Set the data to be decoded
void set_data(Slice* data) override {
Status set_data(Slice* data) override {
_data = data;
_offset = 0;
uint8_t bit_width = *data->data;
_index_batch_decoder = std::make_unique<RleBatchDecoder<uint32_t>>(
reinterpret_cast<uint8_t*>(data->data) + 1, static_cast<int>(data->size) - 1,
bit_width);
return Status::OK();
}
protected:

View File

@ -0,0 +1,134 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
namespace doris::vectorized {
Status DeltaLengthByteArrayDecoder::_decode_lengths() {
RETURN_IF_ERROR(_len_decoder.set_bit_reader(_bit_reader));
// get the number of encoded lengths
int num_length = _len_decoder.valid_values_count();
_buffered_length.resize(num_length);
// decode all the lengths. all the lengths are buffered in buffered_length_.
int ret;
RETURN_IF_ERROR(_len_decoder.decode(_buffered_length.data(), num_length, &ret));
DCHECK_EQ(ret, num_length);
_length_idx = 0;
_num_valid_values = num_length;
return Status::OK();
}
Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values,
int* out_num_values) {
// Decode up to `max_values` strings into an internal buffer
// and reference them into `buffer`.
max_values = std::min(max_values, _num_valid_values);
if (max_values == 0) {
*out_num_values = 0;
return Status::OK();
}
int32_t data_size = 0;
const int32_t* length_ptr = _buffered_length.data() + _length_idx;
for (int i = 0; i < max_values; ++i) {
int32_t len = length_ptr[i];
if (PREDICT_FALSE(len < 0)) {
return Status::InvalidArgument("Negative string delta length");
}
buffer[i].size = len;
if (common::add_overflow(data_size, len, data_size)) {
return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
}
}
_length_idx += max_values;
_buffered_data.resize(data_size);
char* data_ptr = _buffered_data.data();
for (int j = 0; j < data_size; j++) {
if (!_bit_reader->GetValue(8, data_ptr + j)) {
return Status::IOError("Get length bytes EOF");
}
}
for (int i = 0; i < max_values; ++i) {
buffer[i].data = data_ptr;
data_ptr += buffer[i].size;
}
// this->num_values_ -= max_values;
_num_valid_values -= max_values;
*out_num_values = max_values;
return Status::OK();
}
Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) {
// Decode up to `max_values` strings into an internal buffer
// and reference them into `buffer`.
max_values = std::min(max_values, _num_valid_values);
if (max_values == 0) {
*out_num_values = max_values;
return Status::OK();
}
int suffix_read;
RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read));
if (PREDICT_FALSE(suffix_read != max_values)) {
return Status::IOError("Read {}, expecting {} from suffix decoder",
std::to_string(suffix_read), std::to_string(max_values));
}
int64_t data_size = 0;
const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset;
for (int i = 0; i < max_values; ++i) {
if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
}
if (PREDICT_FALSE(common::add_overflow(data_size, static_cast<int64_t>(prefix_len_ptr[i]),
data_size) ||
common::add_overflow(data_size, static_cast<int64_t>(buffer[i].size),
data_size))) {
return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
}
}
_buffered_data.resize(data_size);
std::string_view prefix {_last_value};
char* data_ptr = _buffered_data.data();
for (int i = 0; i < max_values; ++i) {
if (PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) {
return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
}
memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
// buffer[i] currently points to the string suffix
memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size);
buffer[i].data = data_ptr;
buffer[i].size += prefix_len_ptr[i];
data_ptr += buffer[i].size;
prefix = std::string_view {buffer[i].data, buffer[i].size};
}
_prefix_len_offset += max_values;
_num_valid_values -= max_values;
_last_value = std::string {prefix};
if (_num_valid_values == 0) {
_last_value_in_previous_page = _last_value;
}
*out_num_values = max_values;
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -47,10 +47,6 @@ public:
~DeltaDecoder() override = default;
Status skip_values(size_t num_values) override {
return _type_converted_decoder->skip_values(num_values);
}
template <bool has_filter>
Status decode_byte_array(const std::vector<Slice>& decoded_vals, MutableColumnPtr& doris_column,
DataTypePtr& data_type, ColumnSelectVector& select_vector) {
@ -125,9 +121,10 @@ public:
}
protected:
void init_values_converter() {
_type_converted_decoder->set_data(_data);
Status init_values_converter() {
RETURN_IF_ERROR(_type_converted_decoder->set_data(_data));
_type_converted_decoder->set_type_length(_type_length);
return Status::OK();
}
// Convert decoded value to doris type value.
std::unique_ptr<Decoder> _type_converted_decoder;
@ -148,6 +145,13 @@ public:
DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder()) {}
~DeltaBitPackDecoder() override = default;
Status skip_values(size_t num_values) override {
_values.resize(num_values);
int num_valid_values;
return _get_internal(_values.data(), num_values, &num_valid_values);
}
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector, bool is_dict_filter) override {
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
@ -159,7 +163,7 @@ public:
_type_length = sizeof(T);
_data->size = _values.size() * _type_length;
// set decoded value with fix plain decoder
init_values_converter();
RETURN_IF_ERROR(init_values_converter());
return _type_converted_decoder->decode_values(doris_column, data_type, select_vector,
is_dict_filter);
}
@ -173,24 +177,20 @@ public:
return static_cast<int>(_total_values_remaining);
}
void set_data(Slice* slice) override {
Status set_data(Slice* slice) override {
_bit_reader.reset(new BitReader((const uint8_t*)slice->data, slice->size));
Status st = _init_header();
if (!st.ok()) {
LOG(FATAL) << "Fail to init delta encoding header for " << st.to_string();
}
RETURN_IF_ERROR(_init_header());
_data = slice;
_offset = 0;
return Status::OK();
}
// Set BitReader which is already initialized by DeltaLengthByteArrayDecoder or
// DeltaByteArrayDecoder
void set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
Status set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
_bit_reader = std::move(bit_reader);
Status st = _init_header();
if (!st.ok()) {
LOG(FATAL) << "Fail to init delta encoding header for " << st.to_string();
}
RETURN_IF_ERROR(_init_header());
return Status::OK();
}
private:
@ -265,25 +265,27 @@ public:
return _get_internal(buffer, num_values, out_num_values);
}
void set_data(Slice* slice) override {
Status set_data(Slice* slice) override {
if (slice->size == 0) {
return;
return Status::OK();
}
_bit_reader = std::make_shared<BitReader>((const uint8_t*)slice->data, slice->size);
_data = slice;
_offset = 0;
_decode_lengths();
RETURN_IF_ERROR(_decode_lengths());
return Status::OK();
}
void set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
Status set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
_bit_reader = std::move(bit_reader);
_decode_lengths();
RETURN_IF_ERROR(_decode_lengths());
return Status::OK();
}
private:
// Decode all the encoded lengths. The decoder_ will be at the start of the encoded data
// after that.
void _decode_lengths();
Status _decode_lengths();
Status _get_internal(Slice* buffer, int max_values, int* out_num_values);
std::vector<Slice> _values;
@ -333,9 +335,9 @@ public:
}
}
void set_data(Slice* slice) override {
Status set_data(Slice* slice) override {
_bit_reader = std::make_shared<BitReader>((const uint8_t*)slice->data, slice->size);
_prefix_len_decoder.set_bit_reader(_bit_reader);
RETURN_IF_ERROR(_prefix_len_decoder.set_bit_reader(_bit_reader));
// get the number of encoded prefix lengths
int num_prefix = _prefix_len_decoder.valid_values_count();
@ -343,20 +345,19 @@ public:
// all the prefix lengths are buffered in _buffered_prefix_length.
_buffered_prefix_length.resize(num_prefix);
int ret;
Status st = _prefix_len_decoder.decode(_buffered_prefix_length.data(), num_prefix, &ret);
if (!st.ok()) {
LOG(FATAL) << "Fail to decode delta prefix, status: " << st;
}
RETURN_IF_ERROR(
_prefix_len_decoder.decode(_buffered_prefix_length.data(), num_prefix, &ret));
DCHECK_EQ(ret, num_prefix);
_prefix_len_offset = 0;
_num_valid_values = num_prefix;
// at this time, the decoder_ will be at the start of the encoded suffix data.
_suffix_decoder.set_bit_reader(_bit_reader);
RETURN_IF_ERROR(_suffix_decoder.set_bit_reader(_bit_reader));
// TODO: read corrupted files written with bug(PARQUET-246). _last_value should be set
// to _last_value_in_previous_page when decoding a new page(except the first page)
_last_value = "";
return Status::OK();
}
Status decode(Slice* buffer, int num_values, int* out_num_values) {
@ -517,119 +518,4 @@ Status DeltaBitPackDecoder<T>::_get_internal(T* buffer, int num_values, int* out
return Status::OK();
}
void DeltaLengthByteArrayDecoder::_decode_lengths() {
_len_decoder.set_bit_reader(_bit_reader);
// get the number of encoded lengths
int num_length = _len_decoder.valid_values_count();
_buffered_length.resize(num_length);
// decode all the lengths. all the lengths are buffered in buffered_length_.
int ret;
Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret);
if (!st.ok()) {
LOG(FATAL) << "Fail to decode delta length, status: " << st;
}
DCHECK_EQ(ret, num_length);
_length_idx = 0;
_num_valid_values = num_length;
}
Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values,
int* out_num_values) {
// Decode up to `max_values` strings into an internal buffer
// and reference them into `buffer`.
max_values = std::min(max_values, _num_valid_values);
if (max_values == 0) {
*out_num_values = 0;
return Status::OK();
}
int32_t data_size = 0;
const int32_t* length_ptr = _buffered_length.data() + _length_idx;
for (int i = 0; i < max_values; ++i) {
int32_t len = length_ptr[i];
if (PREDICT_FALSE(len < 0)) {
return Status::InvalidArgument("Negative string delta length");
}
buffer[i].size = len;
if (common::add_overflow(data_size, len, data_size)) {
return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
}
}
_length_idx += max_values;
_buffered_data.resize(data_size);
char* data_ptr = _buffered_data.data();
for (int j = 0; j < data_size; j++) {
if (!_bit_reader->GetValue(8, data_ptr + j)) {
return Status::IOError("Get length bytes EOF");
}
}
for (int i = 0; i < max_values; ++i) {
buffer[i].data = data_ptr;
data_ptr += buffer[i].size;
}
// this->num_values_ -= max_values;
_num_valid_values -= max_values;
*out_num_values = max_values;
return Status::OK();
}
Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) {
// Decode up to `max_values` strings into an internal buffer
// and reference them into `buffer`.
max_values = std::min(max_values, _num_valid_values);
if (max_values == 0) {
*out_num_values = max_values;
return Status::OK();
}
int suffix_read;
RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read));
if (PREDICT_FALSE(suffix_read != max_values)) {
return Status::IOError("Read {}, expecting {} from suffix decoder",
std::to_string(suffix_read), std::to_string(max_values));
}
int64_t data_size = 0;
const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset;
for (int i = 0; i < max_values; ++i) {
if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
}
if (PREDICT_FALSE(common::add_overflow(data_size, static_cast<int64_t>(prefix_len_ptr[i]),
data_size) ||
common::add_overflow(data_size, static_cast<int64_t>(buffer[i].size),
data_size))) {
return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
}
}
_buffered_data.resize(data_size);
std::string_view prefix {_last_value};
char* data_ptr = _buffered_data.data();
for (int i = 0; i < max_values; ++i) {
if (PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) {
return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
}
memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
// buffer[i] currently points to the string suffix
memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size);
buffer[i].data = data_ptr;
buffer[i].size += prefix_len_ptr[i];
data_ptr += buffer[i].size;
prefix = std::string_view {buffer[i].data, buffer[i].size};
}
_prefix_len_offset += max_values;
_num_valid_values -= max_values;
_last_value = std::string {prefix};
if (_num_valid_values == 0) {
_last_value_in_previous_page = _last_value;
}
*out_num_values = max_values;
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -121,7 +121,8 @@ protected:
Status read_dict_values_to_column(MutableColumnPtr& doris_column) override {
size_t dict_items_size = _dict_items.size();
std::vector<StringRef> dict_values(dict_items_size);
std::vector<StringRef> dict_values;
dict_values.reserve(dict_items_size);
for (size_t i = 0; i < dict_items_size; ++i) {
dict_values.emplace_back(_dict_items[i], _type_length);
}
@ -131,7 +132,8 @@ protected:
MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override {
auto res = ColumnString::create();
std::vector<StringRef> dict_values(dict_column->size());
std::vector<StringRef> dict_values;
dict_values.reserve(dict_column->size());
const auto& data = dict_column->get_data();
for (size_t i = 0; i < dict_column->size(); ++i) {
dict_values.emplace_back(_dict_items[data[i]], _type_length);

View File

@ -0,0 +1,40 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/fix_length_plain_decoder.h"
namespace doris::vectorized {
Status FixLengthPlainDecoder::skip_values(size_t num_values) {
_offset += _type_length * num_values;
if (UNLIKELY(_offset > _data->size)) {
return Status::IOError("Out-of-bounds access in parquet data decoder");
}
return Status::OK();
}
Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector,
bool is_dict_filter) {
if (select_vector.has_filter()) {
return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter);
} else {
return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
}
}
} // namespace doris::vectorized

View File

@ -40,67 +40,45 @@ public:
template <bool has_filter>
Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector, bool is_dict_filter);
ColumnSelectVector& select_vector, bool is_dict_filter) {
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
return Status::IOError("Out-of-bounds access in parquet data decoder");
}
size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory();
size_t data_index = doris_column->size() * primitive_length;
size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) *
(_type_length / primitive_length);
doris_column->resize(doris_column->size() + scale_size);
char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length);
_offset += run_length * _type_length;
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
Status skip_values(size_t num_values) override;
};
Status FixLengthPlainDecoder::skip_values(size_t num_values) {
_offset += _type_length * num_values;
if (UNLIKELY(_offset > _data->size)) {
return Status::IOError("Out-of-bounds access in parquet data decoder");
}
return Status::OK();
}
Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector,
bool is_dict_filter) {
if (select_vector.has_filter()) {
return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter);
} else {
return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
}
}
template <bool has_filter>
Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector,
bool is_dict_filter) {
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
return Status::IOError("Out-of-bounds access in parquet data decoder");
}
size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory();
size_t data_index = doris_column->size() * primitive_length;
size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) *
(_type_length / primitive_length);
doris_column->resize(doris_column->size() + scale_size);
char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length);
_offset += run_length * _type_length;
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -90,7 +90,15 @@ size_t doris::vectorized::LevelDecoder::get_levels(doris::vectorized::level_t* l
_num_levels -= num_decoded;
return num_decoded;
} else if (_encoding == tparquet::Encoding::BIT_PACKED) {
// TODO(gaoxin): BIT_PACKED encoding
n = std::min((size_t)_num_levels, n);
for (size_t i = 0; i < n; ++i) {
if (!_bit_packed_decoder.GetValue(_bit_width, &levels[i])) {
throw doris::Exception(ErrorCode::INTERNAL_ERROR,
"Failed to decode BIT_PACKED levels");
}
}
_num_levels -= n;
return n;
}
return 0;
}

View File

@ -208,7 +208,7 @@ Status ColumnChunkReader::load_page_data() {
_page_decoder = _decoders[static_cast<int>(encoding)].get();
}
// Reset page data for each page
_page_decoder->set_data(&_page_data);
RETURN_IF_ERROR(_page_decoder->set_data(&_page_data));
_state = DATA_LOADED;
return Status::OK();

View File

@ -0,0 +1,239 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/bool_plain_decoder.h"
#include <gtest/gtest.h>
#include "parquet/encoding.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
namespace doris::vectorized {
class BoolPlainDecoderTest : public ::testing::Test {
protected:
void SetUp() override { _decoder = std::make_unique<BoolPlainDecoder>(); }
std::unique_ptr<BoolPlainDecoder> _decoder;
};
// Test basic decoding functionality
TEST_F(BoolPlainDecoderTest, test_basic_decode) {
// Prepare encoded data: [true, false, true, true, false, false, false, true]
std::vector<uint8_t> encoded_data = {0b10001101};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create selection vector without filter
size_t num_values = 8;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 1);
EXPECT_EQ(result_column->get_data()[1], 0);
EXPECT_EQ(result_column->get_data()[2], 1);
EXPECT_EQ(result_column->get_data()[3], 1);
EXPECT_EQ(result_column->get_data()[4], 0);
EXPECT_EQ(result_column->get_data()[5], 0);
EXPECT_EQ(result_column->get_data()[6], 0);
EXPECT_EQ(result_column->get_data()[7], 1);
}
// Test decoding with filter
TEST_F(BoolPlainDecoderTest, test_decode_with_filter) {
// Prepare encoded data: [true, false, true, true, false, false, false, true]
std::vector<uint8_t> encoded_data = {0b10001101};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create filter vector [1, 0, 1, 0, 1, 0, 1, 0]
size_t num_values = 8;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 0, 1, 0};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 4); // 4 values after filtering
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 1);
EXPECT_EQ(result_column->get_data()[1], 1);
EXPECT_EQ(result_column->get_data()[2], 0);
EXPECT_EQ(result_column->get_data()[3], 0);
}
// Test skipping values
TEST_F(BoolPlainDecoderTest, test_skip_value) {
// Prepare encoded data: [true, false, true, true, false, false, false, true]
std::vector<uint8_t> encoded_data = {0b10001101};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Skip first 3 values
ASSERT_TRUE(_decoder->skip_values(3).ok());
// Create column and data type
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create selection vector
size_t num_values = 5; // Total 8 values, skip 3, remaining 5
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Expected values after skipping first 3 values (true, false, true)
std::vector<uint8_t> expected_values = {1, 0, 0, 0, 1};
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data()[i], expected_values[i]) << "Mismatch at value " << i;
}
}
// Test decoding with filter and null
TEST_F(BoolPlainDecoderTest, test_decode_with_filter_and_null) {
// Prepare encoded data: [true, false, true, true, false, false]
std::vector<uint8_t> encoded_data = {0b00001101};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create filter vector [1, 0, 1, 0, 1, 1, 1] and null vector [0, 0, 0, 0, 1, 0, 1]
size_t num_values = 7;
std::vector<uint16_t> run_length_null_map {
4, 1, 1, 1}; // data: [true, false, true, true, null, false, null]
std::vector<uint8_t> filter_data = {1, 0, 1, 0,
1, 1, 1}; // filtered_data: [true, true, null, false, null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<uint8_t>> expected_values = {1, 1, std::nullopt, 0, std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test decoding data generated by arrow
TEST_F(BoolPlainDecoderTest, test_data_generated_by_arrow) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BOOLEAN);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<bool> values = {true, false, true, true, false, false, false, true};
// Create encoder
auto encoder = MakeTypedEncoder<parquet::BooleanType>(parquet::Encoding::PLAIN,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(encoder->Put(values, static_cast<int>(values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create selection vector
size_t num_values = values.size();
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 1);
EXPECT_EQ(result_column->get_data()[1], 0);
EXPECT_EQ(result_column->get_data()[2], 1);
EXPECT_EQ(result_column->get_data()[3], 1);
EXPECT_EQ(result_column->get_data()[4], 0);
EXPECT_EQ(result_column->get_data()[5], 0);
EXPECT_EQ(result_column->get_data()[6], 0);
EXPECT_EQ(result_column->get_data()[7], 1);
}
// Test invalid data case
//TEST_F(BoolPlainDecoderTest, test_invalid_data) {
// // Prepare invalid encoded data
// std::vector<uint8_t> encoded_data = {0b111111111}; // 9 bits
// Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
// ASSERT_FALSE(_decoder->set_data(&data_slice).ok());
//}
} // namespace doris::vectorized

View File

@ -0,0 +1,239 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/bool_rle_decoder.h"
#include <gtest/gtest.h>
#include "parquet/encoding.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
namespace doris::vectorized {
class BoolRLEDecoderTest : public ::testing::Test {
protected:
void SetUp() override { _decoder = std::make_unique<BoolRLEDecoder>(); }
std::unique_ptr<BoolRLEDecoder> _decoder;
};
// Test basic decoding functionality
TEST_F(BoolRLEDecoderTest, test_basic_decode) {
// Prepare encoded data: [true, false, true, true, false, false, false, true]
std::vector<uint8_t> encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create selection vector without filter
size_t num_values = 8;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 1);
EXPECT_EQ(result_column->get_data()[1], 0);
EXPECT_EQ(result_column->get_data()[2], 1);
EXPECT_EQ(result_column->get_data()[3], 1);
EXPECT_EQ(result_column->get_data()[4], 0);
EXPECT_EQ(result_column->get_data()[5], 0);
EXPECT_EQ(result_column->get_data()[6], 0);
EXPECT_EQ(result_column->get_data()[7], 1);
}
// Test decoding with filter
TEST_F(BoolRLEDecoderTest, test_decode_with_filter) {
// Prepare encoded data: [true, false, true, true, false, false, false, true]
std::vector<uint8_t> encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create filter vector [1, 0, 1, 0, 1, 0, 1, 0]
size_t num_values = 8;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 0, 1, 0};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 4); // 4 values after filtering
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 1);
EXPECT_EQ(result_column->get_data()[1], 1);
EXPECT_EQ(result_column->get_data()[2], 0);
EXPECT_EQ(result_column->get_data()[3], 0);
}
// Test decoding with filter and null values
TEST_F(BoolRLEDecoderTest, test_decode_with_filter_and_null) {
// Prepare encoded data: [true, false, true, true, false, false, false, true]
std::vector<uint8_t> encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x25};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create filter vector [1, 0, 1, 0, 1, 0, 1, 0] and null vector [0, 0, 1, 0, 0, 0, 1, 0]
size_t num_values = 8;
std::vector<uint16_t> run_length_null_map = {
2, 1, 3, 1, 1}; // data: [true, false, null, true, false, false, null, true]
std::vector<uint8_t> filter_data = {1, 0, 1, 0,
1, 0, 1, 0}; // filtered_data: [true, null, false, null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 4); // 4 values after filtering
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<uint8_t>> expected_values = {1, std::nullopt, 0, std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test skipping values for bool RLE decoding
TEST_F(BoolRLEDecoderTest, test_skip_value) {
// Prepare encoded data: [true, false, true, true, false, false, false, true]
std::vector<uint8_t> encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Skip first 3 values
ASSERT_TRUE(_decoder->skip_values(3).ok());
// Create column and data type
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create selection vector
size_t num_values = 5; // Total 8 values, skip 3, remaining 5
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Expected values after skipping first 3 values (true, false, true)
std::vector<uint8_t> expected_values = {1, 0, 0, 0, 1};
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data()[i], expected_values[i]) << "Mismatch at value " << i;
}
}
// Test decoding data generated by arrow
TEST_F(BoolRLEDecoderTest, test_data_generated_by_arrow) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BOOLEAN);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<bool> values = {true, false, true, true, false, false, false, true};
// Create encoder
auto encoder = MakeTypedEncoder<parquet::BooleanType>(parquet::Encoding::RLE,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(encoder->Put(values, static_cast<int>(values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// Create selection vector
size_t num_values = values.size();
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 1);
EXPECT_EQ(result_column->get_data()[1], 0);
EXPECT_EQ(result_column->get_data()[2], 1);
EXPECT_EQ(result_column->get_data()[3], 1);
EXPECT_EQ(result_column->get_data()[4], 0);
EXPECT_EQ(result_column->get_data()[5], 0);
EXPECT_EQ(result_column->get_data()[6], 0);
EXPECT_EQ(result_column->get_data()[7], 1);
}
// Test invalid data case
TEST_F(BoolRLEDecoderTest, test_invalid_data) {
// Prepare invalid encoded data
std::vector<uint8_t> encoded_data = {0x08, 0x01}; // Incomplete data
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_FALSE(_decoder->set_data(&data_slice).ok());
}
} // namespace doris::vectorized

View File

@ -0,0 +1,500 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/byte_array_dict_decoder.h"
#include <gtest/gtest.h>
#include "parquet/encoding.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "util/slice.h"
#include "vec/columns/column_dictionary.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_string.h"
namespace doris::vectorized {
class ByteArrayDictDecoderTest : public ::testing::Test {
protected:
void SetUp() override {
// Prepare test data: create a dictionary with byte array strings
const char* values[3] = {"apple", "banana", "cherry"};
size_t dict_size = 3;
size_t dict_data_size = 0;
// Calculate total dictionary data size
for (int i = 0; i < 3; i++) {
dict_data_size += 4 + strlen(values[i]); // 4 bytes for length + string data
}
auto dict_data = std::make_unique<uint8_t[]>(dict_data_size);
size_t offset = 0;
for (int i = 0; i < 3; i++) {
uint32_t len = strlen(values[i]);
encode_fixed32_le(dict_data.get() + offset, len);
offset += 4;
memcpy(dict_data.get() + offset, values[i], len);
offset += len;
}
ASSERT_TRUE(_decoder.set_dict(dict_data, dict_data_size, dict_size).ok());
}
ByteArrayDictDecoder _decoder;
};
// Test basic decoding functionality
TEST_F(ByteArrayDictDecoderTest, test_basic_decode) {
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
// std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
std::vector<uint8_t> rle_data = {0x02, 0x03, 0x00, 0x19};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
size_t num_values = 7;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
// Verify first 4 repeated values (dict index 0 -> value "apple")
for (int i = 0; i < 4; i++) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), "apple");
}
// Verify last 3 literal values
EXPECT_EQ(result_column->get_data_at(4).to_string(), "banana");
EXPECT_EQ(result_column->get_data_at(5).to_string(), "cherry");
EXPECT_EQ(result_column->get_data_at(6).to_string(), "banana");
}
// Test decoding with filter
TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter) {
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1]
size_t num_values = 7;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
// Verify filtered values
EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "apple");
EXPECT_EQ(result_column->get_data_at(2).to_string(), "banana");
EXPECT_EQ(result_column->get_data_at(3).to_string(), "cherry");
EXPECT_EQ(result_column->get_data_at(4).to_string(), "banana");
}
// Test decoding with filter and null
TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_and_null) {
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
size_t num_values = 7;
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<std::string>> expected_values = {"apple", "apple", std::nullopt,
"cherry", std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test empty dictionary case
TEST_F(ByteArrayDictDecoderTest, test_empty_dict) {
ByteArrayDictDecoder empty_decoder;
auto dict_data = std::make_unique<uint8_t[]>(0);
ASSERT_TRUE(empty_decoder.set_dict(dict_data, 0, 0).ok());
}
// Test decoding with ColumnDictI32
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32) {
// Create ColumnDictI32 column
MutableColumnPtr column = ColumnDictI32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
// Verify first 4 repeated values (dict index 0 -> value "apple")
for (int i = 0; i < 4; i++) {
EXPECT_EQ(dict_column->get_data()[i], 0);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(), "apple");
}
// Verify last 3 literal values
EXPECT_EQ(dict_column->get_data()[4], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana");
EXPECT_EQ(dict_column->get_data()[5], 2);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[5]).to_string(), "cherry");
EXPECT_EQ(dict_column->get_data()[6], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[6]).to_string(), "banana");
}
// Test decoding with ColumnDictI32 and filter
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32_with_filter) {
// Create ColumnDictI32 column
MutableColumnPtr column = ColumnDictI32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
// Verify filtered values
EXPECT_EQ(dict_column->get_data()[0], 0);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[0]).to_string(), "apple");
EXPECT_EQ(dict_column->get_data()[1], 0);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[1]).to_string(), "apple");
EXPECT_EQ(dict_column->get_data()[2], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[2]).to_string(), "banana");
EXPECT_EQ(dict_column->get_data()[3], 2);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[3]).to_string(), "cherry");
EXPECT_EQ(dict_column->get_data()[4], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana");
}
// Test decoding with ColumnDictI32 with filter and null
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32_with_filter_and_null) {
// Create ColumnDictI32 column
MutableColumnPtr column = ColumnDictI32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<std::string>> expected_values = {"apple", "apple", std::nullopt,
"cherry", std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(),
expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test decoding with ColumnInt32
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32) {
// Create ColumnInt32 column
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
// Verify first 4 repeated values (dict index 0 -> value "apple")
for (int i = 0; i < 4; i++) {
EXPECT_EQ(dict_column->get_data()[i], 0);
}
// Verify last 3 literal values
EXPECT_EQ(dict_column->get_data()[4], 1);
EXPECT_EQ(dict_column->get_data()[5], 2);
EXPECT_EQ(dict_column->get_data()[6], 1);
}
// Test decoding with ColumnInt32 and filter
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32_with_filter) {
// Create ColumnInt32 column
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
// Verify filtered values
EXPECT_EQ(dict_column->get_data()[0], 0);
EXPECT_EQ(dict_column->get_data()[1], 0);
EXPECT_EQ(dict_column->get_data()[2], 1);
EXPECT_EQ(dict_column->get_data()[3], 2);
EXPECT_EQ(dict_column->get_data()[4], 1);
}
// Test decoding with ColumnInt32 with filter and null
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32_with_filter_and_null) {
// Create ColumnInt32 column
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<int32_t>> expected_values = {0, 0, std::nullopt, 2, std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(dict_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test reading dictionary values to column
TEST_F(ByteArrayDictDecoderTest, test_read_dict_values_to_column) {
// Create a column to store dictionary values
MutableColumnPtr column = ColumnString::create();
// Read dictionary values to column
ASSERT_TRUE(_decoder.read_dict_values_to_column(column).ok());
// Verify results
ASSERT_EQ(column->size(), 3); // 3 dictionary items
auto* result_column = assert_cast<ColumnString*>(column.get());
// Verify dictionary values
EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana");
EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry");
}
// Test convert_dict_column_to_string_column function
TEST_F(ByteArrayDictDecoderTest, test_convert_dict_column_to_string_column) {
// Create a ColumnInt32 with some dictionary codes
MutableColumnPtr dict_column = ColumnInt32::create();
dict_column->insert(0);
dict_column->insert(1);
dict_column->insert(2);
dict_column->insert(1);
// Convert to string column
MutableColumnPtr string_column = _decoder.convert_dict_column_to_string_column(
assert_cast<ColumnInt32*>(dict_column.get()));
// Verify results
ASSERT_EQ(string_column->size(), 4);
auto* result_column = assert_cast<ColumnString*>(string_column.get());
EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana");
EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry");
EXPECT_EQ(result_column->get_data_at(3).to_string(), "banana");
}
// Test skipping values for byte array dictionary decoding
TEST_F(ByteArrayDictDecoderTest, test_skip_value) {
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Skip first 3 values
ASSERT_TRUE(_decoder.skip_values(3).ok());
// Create selection vector
size_t num_values = 4; // Total 7 values, skip 3, remaining 4
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
// Expected values after skipping first 3 values ("apple", "apple", "apple")
std::vector<std::string> expected_values = {"apple", "banana", "cherry", "banana"};
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i])
<< "Mismatch at value " << i;
}
}
} // namespace doris::vectorized

View File

@ -0,0 +1,242 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/byte_array_plain_decoder.h"
#include <gtest/gtest.h>
#include "util/slice.h"
#include "vec/columns/column_string.h"
#include "vec/data_types/data_type_string.h"
namespace doris::vectorized {
class ByteArrayPlainDecoderTest : public ::testing::Test {
protected:
void SetUp() override {}
Slice _data_slice;
std::unique_ptr<uint8_t[]> _data;
};
// Test basic decoding functionality
TEST_F(ByteArrayPlainDecoderTest, test_basic_decode) {
// Prepare test data: create byte array strings
const char* values[3] = {"apple", "banana", "cherry"};
size_t data_size = 0;
// Calculate total data size
for (int i = 0; i < 3; i++) {
data_size += 4 + strlen(values[i]); // 4 bytes for length + string data
}
_data = std::make_unique<uint8_t[]>(data_size);
size_t offset = 0;
for (int i = 0; i < 3; i++) {
uint32_t len = strlen(values[i]);
encode_fixed32_le(_data.get() + offset, len);
offset += 4;
memcpy(_data.get() + offset, values[i], len);
offset += len;
}
_data_slice = Slice(_data.get(), data_size);
ByteArrayPlainDecoder decoder;
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create selection vector without filter
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana");
EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry");
}
// Test decoding with filter
TEST_F(ByteArrayPlainDecoderTest, test_decode_with_filter) {
// Prepare test data: create byte array strings
const char* values[3] = {"apple", "banana", "cherry"};
size_t data_size = 0;
// Calculate total data size
for (int i = 0; i < 3; i++) {
data_size += 4 + strlen(values[i]); // 4 bytes for length + string data
}
_data = std::make_unique<uint8_t[]>(data_size);
size_t offset = 0;
for (int i = 0; i < 3; i++) {
uint32_t len = strlen(values[i]);
encode_fixed32_le(_data.get() + offset, len);
offset += 4;
memcpy(_data.get() + offset, values[i], len);
offset += len;
}
_data_slice = Slice(_data.get(), data_size);
ByteArrayPlainDecoder decoder;
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create filter vector [1,0,1]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "cherry");
}
// Test decoding with filter and null
TEST_F(ByteArrayPlainDecoderTest, test_decode_with_filter_and_null) {
// Prepare test data: create byte array strings
const char* values[2] = {"apple", "cherry"};
size_t data_size = 0;
// Calculate total data size
for (int i = 0; i < 2; i++) {
data_size += 4 + strlen(values[i]); // 4 bytes for length + string data
}
_data = std::make_unique<uint8_t[]>(data_size);
size_t offset = 0;
for (int i = 0; i < 2; i++) {
uint32_t len = strlen(values[i]);
encode_fixed32_le(_data.get() + offset, len);
offset += 4;
memcpy(_data.get() + offset, values[i], len);
offset += len;
}
_data_slice = Slice(_data.get(), data_size);
ByteArrayPlainDecoder decoder;
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create filter vector [1,0,1] and null vector [0,1,0]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map = {1, 1, 1}; // data: ["apple", null, "cherry"]
std::vector<uint8_t> filter_data = {1, 0, 1}; // filtered_data: ["apple", "cherry"]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<std::string>> expected_values = {"apple", "cherry"};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test skipping values
TEST_F(ByteArrayPlainDecoderTest, test_skip_value) {
// Prepare test data: create byte array strings
const char* values[3] = {"apple", "banana", "cherry"};
size_t data_size = 0;
// Calculate total data size
for (int i = 0; i < 3; i++) {
data_size += 4 + strlen(values[i]); // 4 bytes for length + string data
}
_data = std::make_unique<uint8_t[]>(data_size);
size_t offset = 0;
for (int i = 0; i < 3; i++) {
uint32_t len = strlen(values[i]);
encode_fixed32_le(_data.get() + offset, len);
offset += 4;
memcpy(_data.get() + offset, values[i], len);
offset += len;
}
_data_slice = Slice(_data.get(), data_size);
ByteArrayPlainDecoder decoder;
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
// Skip first 2 values
ASSERT_TRUE(decoder.skip_values(2).ok());
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create selection vector
size_t num_values = 1; // Total 3 values, skip 2, remaining 1
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
EXPECT_EQ(result_column->get_data_at(0).to_string(), "cherry");
}
} // namespace doris::vectorized

View File

@ -0,0 +1,395 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/byte_stream_split_decoder.h"
#include <gtest/gtest.h>
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
namespace doris::vectorized {
class ByteStreamSplitDecoderTest : public ::testing::Test {
protected:
void SetUp() override {}
ByteStreamSplitDecoder _decoder;
};
//// Test basic decoding functionality for FLOAT type
TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_float) {
// Prepare test data for FLOAT type
size_t type_length_float = sizeof(float);
size_t num_values_float = 3;
size_t data_size_float = num_values_float * type_length_float;
auto data_float = std::make_unique<uint8_t[]>(data_size_float);
const float values_float[3] = {1.0f, 2.0f, 3.0f};
for (int i = 0; i < num_values_float; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_float[i]);
for (int j = 0; j < type_length_float; j++) {
data_float[j * num_values_float + i] = bytes[j];
}
}
Slice data_slice_float(data_float.get(), data_size_float);
MutableColumnPtr column = ColumnFloat32::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat32>();
// Set data for FLOAT type
ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok());
_decoder.set_type_length(type_length_float);
// Create selection vector without filter, total 3 values
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnFloat32*>(column.get());
EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
EXPECT_FLOAT_EQ(result_column->get_data()[1], 2.0f);
EXPECT_FLOAT_EQ(result_column->get_data()[2], 3.0f);
}
//// Test basic decoding functionality for DOUBLE type
TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_double) {
// Prepare test data for DOUBLE type
size_t type_length_double = sizeof(double);
size_t num_values_double = 3;
size_t data_size_double = num_values_double * type_length_double;
auto data_double = std::make_unique<uint8_t[]>(data_size_double);
const double values_double[3] = {1.0, 2.0, 3.0};
for (int i = 0; i < num_values_double; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_double[i]);
for (int j = 0; j < type_length_double; j++) {
data_double[j * num_values_double + i] = bytes[j];
}
}
Slice data_slice_double(data_double.get(), data_size_double);
MutableColumnPtr column = ColumnFloat64::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat64>();
// Set data for DOUBLE type
ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok());
_decoder.set_type_length(type_length_double);
// Create selection vector without filter, total 3 values
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnFloat64*>(column.get());
EXPECT_DOUBLE_EQ(result_column->get_data()[0], 1.0);
EXPECT_DOUBLE_EQ(result_column->get_data()[1], 2.0);
EXPECT_DOUBLE_EQ(result_column->get_data()[2], 3.0);
}
// Test decoding with filter for FLOAT type
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_float) {
// Prepare test data for FLOAT type
size_t type_length_float = sizeof(float);
size_t num_values_float = 3;
size_t data_size_float = num_values_float * type_length_float;
auto data_float = std::make_unique<uint8_t[]>(data_size_float);
const float values_float[3] = {1.0f, 2.0f, 3.0f};
for (int i = 0; i < num_values_float; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_float[i]);
for (int j = 0; j < type_length_float; j++) {
data_float[j * num_values_float + i] = bytes[j];
}
}
Slice data_slice_float(data_float.get(), data_size_float);
MutableColumnPtr column = ColumnFloat32::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat32>();
// Set data for FLOAT type
ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok());
_decoder.set_type_length(type_length_float);
// Create filter vector [1, 0, 1]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnFloat32*>(column.get());
EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
EXPECT_FLOAT_EQ(result_column->get_data()[1], 3.0f);
}
// Test decoding with filter for DOUBLE type
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_double) {
// Prepare test data for DOUBLE type
size_t type_length_double = sizeof(double);
size_t num_values_double = 3;
size_t data_size_double = num_values_double * type_length_double;
auto data_double = std::make_unique<uint8_t[]>(data_size_double);
const double values_double[3] = {1.0, 2.0, 3.0};
for (int i = 0; i < num_values_double; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_double[i]);
for (int j = 0; j < type_length_double; j++) {
data_double[j * num_values_double + i] = bytes[j];
}
}
Slice data_slice_double(data_double.get(), data_size_double);
MutableColumnPtr column = ColumnFloat64::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat64>();
// Set data for DOUBLE type
ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok());
_decoder.set_type_length(type_length_double);
// Create filter vector [1, 0, 1]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnFloat64*>(column.get());
EXPECT_DOUBLE_EQ(result_column->get_data()[0], 1.0);
EXPECT_DOUBLE_EQ(result_column->get_data()[1], 3.0);
}
// Test decoding with filter and null for FLOAT type
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_float) {
// Prepare test data for FLOAT type
size_t type_length_float = sizeof(float);
size_t num_values_float = 2;
size_t data_size_float = num_values_float * type_length_float;
auto data_float = std::make_unique<uint8_t[]>(data_size_float);
const float values_float[2] = {1.0f, 3.0f};
for (int i = 0; i < num_values_float; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_float[i]);
for (int j = 0; j < type_length_float; j++) {
data_float[j * num_values_float + i] = bytes[j];
}
}
Slice data_slice_float(data_float.get(), data_size_float);
MutableColumnPtr column = ColumnFloat32::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat32>();
// Set data for FLOAT type
ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok());
_decoder.set_type_length(type_length_float);
// Create filter vector [1, 0, 1] and null vector [0, 1, 0]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map = {1, 1, 1}; // data: [1.0f, null, 3.0f]
std::vector<uint8_t> filter_data = {0, 1, 1}; // filtered_data: [null, 3.0f]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnFloat32*>(column.get());
// EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
// EXPECT_FLOAT_EQ(result_column->get_data()[1], 3.0f);
// Expected values after filtering and null handling
std::vector<std::optional<float>> expected_values = {std::nullopt, 3.0f};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_FLOAT_EQ(result_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test decoding with filter and null for DOUBLE type
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_double) {
// Prepare test data for DOUBLE type
size_t type_length_double = sizeof(double);
size_t num_values_double = 2;
size_t data_size_double = num_values_double * type_length_double;
auto data_double = std::make_unique<uint8_t[]>(data_size_double);
const double values_double[2] = {1.0, 3.0};
for (int i = 0; i < num_values_double; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_double[i]);
for (int j = 0; j < type_length_double; j++) {
data_double[j * num_values_double + i] = bytes[j];
}
}
Slice data_slice_double(data_double.get(), data_size_double);
MutableColumnPtr column = ColumnFloat64::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat64>();
// Set data for DOUBLE type
ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok());
_decoder.set_type_length(type_length_double);
// Create filter vector [1, 0, 1] and null vector [0, 1, 0]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map = {1, 1, 1}; // data: [1.0f, null, 3.0f]
std::vector<uint8_t> filter_data = {0, 1, 1}; // filtered_data: [null, 3.0f]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnFloat64*>(column.get());
// EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
// EXPECT_FLOAT_EQ(result_column->get_data()[1], 3.0f);
// Expected values after filtering and null handling
std::vector<std::optional<float>> expected_values = {std::nullopt, 3.0f};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_FLOAT_EQ(result_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test skipping values for FLOAT type
TEST_F(ByteStreamSplitDecoderTest, test_skip_value_float) {
// Prepare test data for FLOAT type
size_t type_length_float = sizeof(float);
size_t num_values_float = 3;
size_t data_size_float = num_values_float * type_length_float;
auto data_float = std::make_unique<uint8_t[]>(data_size_float);
const float values_float[3] = {1.0f, 2.0f, 3.0f};
for (int i = 0; i < num_values_float; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_float[i]);
for (int j = 0; j < type_length_float; j++) {
data_float[j * num_values_float + i] = bytes[j];
}
}
Slice data_slice_float(data_float.get(), data_size_float);
MutableColumnPtr column = ColumnFloat32::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat32>();
// Set data for FLOAT type
ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok());
_decoder.set_type_length(type_length_float);
// Skip first 2 values
ASSERT_TRUE(_decoder.skip_values(2).ok());
// Create selection vector
size_t num_values = 1; // Total 3 values, skip 2, remaining 1
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnFloat32*>(column.get());
EXPECT_FLOAT_EQ(result_column->get_data()[0], 3.0f);
}
// Test skipping values for DOUBLE type
TEST_F(ByteStreamSplitDecoderTest, test_skip_value_double) {
// Prepare test data for DOUBLE type
size_t type_length_double = sizeof(double);
size_t num_values_double = 3;
size_t data_size_double = num_values_double * type_length_double;
auto data_double = std::make_unique<uint8_t[]>(data_size_double);
const double values_double[3] = {1.0, 2.0, 3.0};
for (int i = 0; i < num_values_double; i++) {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_double[i]);
for (int j = 0; j < type_length_double; j++) {
data_double[j * num_values_double + i] = bytes[j];
}
}
Slice data_slice_double(data_double.get(), data_size_double);
MutableColumnPtr column = ColumnFloat64::create();
DataTypePtr data_type = std::make_shared<DataTypeFloat64>();
// Set data for DOUBLE type
ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok());
_decoder.set_type_length(type_length_double);
// Skip first 2 values
ASSERT_TRUE(_decoder.skip_values(2).ok());
// Create selection vector
size_t num_values = 1; // Total 3 values, skip 2, remaining 1
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnFloat64*>(column.get());
EXPECT_DOUBLE_EQ(result_column->get_data()[0], 3.0);
}
} // namespace doris::vectorized

View File

@ -0,0 +1,265 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
#include <gtest/gtest.h>
#include "parquet/encoding.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
namespace doris::vectorized {
class DeltaBitPackDecoderTest : public ::testing::Test {
protected:
void SetUp() override { _decoder = std::make_unique<DeltaBitPackDecoder<int32_t>>(); }
std::unique_ptr<DeltaBitPackDecoder<int32_t>> _decoder;
};
// Test basic decoding functionality
TEST_F(DeltaBitPackDecoderTest, test_basic_decode) {
// Prepare encoded data
std::vector<uint8_t> encoded_data = {
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=5, first_value=10
0x80, 0x01, 0x04, 0x05, 0x14,
// Block: min_delta=1, bit_width=[0, 0, 0, 0]
0x02, 0x00, 0x00, 0x00, 0x00
// MiniBlocks: no data needed for bit_width 0
};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create selection vector without filter
size_t num_values = 5;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnInt32*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 10);
EXPECT_EQ(result_column->get_data()[1], 11);
EXPECT_EQ(result_column->get_data()[2], 12);
EXPECT_EQ(result_column->get_data()[3], 13);
EXPECT_EQ(result_column->get_data()[4], 14);
}
// Test decoding with filter
TEST_F(DeltaBitPackDecoderTest, test_decode_with_filter) {
// Prepare encoded data
std::vector<uint8_t> encoded_data = {
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=5, first_value=10
0x80, 0x01, 0x04, 0x05, 0x14,
// Block: min_delta=1, bit_width=[0, 0, 0, 0]
0x02, 0x00, 0x00, 0x00, 0x00
// MiniBlocks: no data needed for bit_width 0
};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create filter vector [1,0,1,0,1]
size_t num_values = 5;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 3); // 3 values after filtering
auto* result_column = assert_cast<ColumnInt32*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 10);
EXPECT_EQ(result_column->get_data()[1], 12);
EXPECT_EQ(result_column->get_data()[2], 14);
}
// Test decoding with filter and null values
TEST_F(DeltaBitPackDecoderTest, test_decode_with_filter_and_null) {
std::vector<uint8_t> encoded_data = {
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=4, first_value=10
0x80, 0x01, 0x04, 0x04, 0x14,
// Block: min_delta=1, bit_width=[1, 0, 0, 0]
0x02, 0x01, 0x00, 0x00, 0x00,
// MiniBlocks
0x02, 0x00, 0x00, 0x00};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create filter vector [1,0,1,0,1] and null vector [0,0,1,0,0]
size_t num_values = 5;
std::vector<uint16_t> run_length_null_map = {2, 1, 2}; // data: [10 11 null 13 14]
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1}; // filtered_data: [10 null 14]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 3); // 3 values after filtering
auto* result_column = assert_cast<ColumnInt32*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<int32_t>> expected_values = {10, std::nullopt, 14};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test skipping values for delta bit pack decoding
TEST_F(DeltaBitPackDecoderTest, test_skip_value) {
// Prepare encoded data
std::vector<uint8_t> encoded_data = {
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=8, first_value=10
0x80, 0x01, 0x04, 0x08, 0x14,
// Block: min_delta=1, bit_width=[0, 0, 0, 0]
0x02, 0x00, 0x00, 0x00, 0x00
// MiniBlocks: no data needed for bit_width 0
};
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Skip first 3 values
ASSERT_TRUE(_decoder->skip_values(3).ok());
// Create column and data type
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create selection vector
size_t num_values = 5; // Total 8 values, skip 3, remaining 5
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnInt32*>(column.get());
// Expected values after skipping first 3 values (10,11,12)
std::vector<int32_t> expected_values = {13, 14, 15, 16, 17};
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data()[i], expected_values[i]) << "Mismatch at value " << i;
}
}
// Test decoding data generated by arrow
TEST_F(DeltaBitPackDecoderTest, test_data_generated_by_arrow) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::INT32);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<int32_t> values = {10, 11, 13, 14};
// Create encoder
auto encoder = MakeTypedEncoder<parquet::Int32Type>(parquet::Encoding::DELTA_BINARY_PACKED,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(encoder->Put(values.data(), static_cast<int>(values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create selection vector
size_t num_values = values.size();
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnInt32*>(column.get());
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data()[i], values[i]);
}
}
// Test invalid data case
TEST_F(DeltaBitPackDecoderTest, test_invalid_data) {
// Prepare invalid encoded data
std::vector<uint8_t> encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
size_t num_values = 5;
std::vector<uint16_t> run_length_null_map(1, num_values);
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Decoding should fail due to invalid data
ASSERT_FALSE(_decoder->decode_values(column, data_type, select_vector, false).ok());
}
} // namespace doris::vectorized

View File

@ -0,0 +1,588 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <arrow/array.h>
#include <arrow/builder.h>
#include <gtest/gtest.h>
#include "arrow/api.h"
#include "parquet/encoding.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
namespace doris::vectorized {
class DeltaByteArrayDecoderTest : public ::testing::Test {
protected:
void SetUp() override { _decoder = std::make_unique<DeltaByteArrayDecoder>(); }
std::unique_ptr<DeltaByteArrayDecoder> _decoder;
};
// Test basic decoding byte array functionality
TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_byte_array) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Create encoder
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create selection vector
size_t num_values = values.size();
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i]);
}
}
// Test decoding byte array with filter
TEST_F(DeltaByteArrayDecoderTest, test_decode_byte_array_with_filter) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Create encoder
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create filter vector [1, 0, 1, 0]
size_t num_values = values.size();
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
EXPECT_EQ(result_column->get_data_at(0).to_string(), "Hello");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "Foobar");
}
// Test decoding byte array with filter and null values
TEST_F(DeltaByteArrayDecoderTest, test_decode_byte_array_with_filter_and_null) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<std::string> values = {"Hello", "World", "ABCDEF"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Create encoder
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create filter vector [1, 0, 1, 0] and null vector [0, 0, 1, 0]
size_t num_values = 4;
std::vector<uint16_t> run_length_null_map = {2, 1,
1}; // data: ["Hello", "World", null, "ABCDEF"]
std::vector<uint8_t> filter_data = {1, 0, 1, 0}; // filtered_data: ["Hello", null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<std::string>> expected_values = {"Hello", std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test skipping values for byte array decoding
TEST_F(DeltaByteArrayDecoderTest, test_skip_value_for_byte_array) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare test data
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Encode data
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
auto encoded_buffer = encoder->FlushValues();
// Set decoder data
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Skip the first two values
ASSERT_TRUE(_decoder->skip_values(2).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create selection vector
size_t num_values = values.size() - 2; // Skip first two values
std::vector<uint16_t> run_length_null_map(1, num_values);
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
// Verify decoded results (should start from the third value)
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i + 2])
<< "Mismatch at value " << (i + 2);
}
}
// Test basic decoding fixed-length byte array functionality
TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_fixed_len_byte_array) {
// Configure DECIMAL type parameters
const int32_t type_length = 16;
int precision = 10;
int scale = 2;
_decoder->set_type_length(type_length);
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make(
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare test data
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
0x61, 0x40}, // Data 1
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00}, // Data 2 (all zeros)
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF}, // Data 3 (all ones)
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
0xDE, 0xF0} // Data 4 (random)
};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& buffer : test_fixed_len_buffers) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
}
// Encode data
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
auto encoded_buffer = encoder->FlushValues();
// Set decoder data
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
// Create selection vector
size_t num_values = test_fixed_len_buffers.size();
std::vector<uint16_t> run_length_null_map(1, num_values);
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values * type_length);
auto* result_column = assert_cast<ColumnInt8*>(column.get());
// Verify decoded results one by one
for (size_t i = 0; i < num_values; ++i) {
for (size_t j = 0; j < type_length; ++j) {
size_t index = i * type_length + j;
EXPECT_EQ(result_column->get_element(index),
static_cast<int8_t>(test_fixed_len_buffers[i][j]))
<< "Mismatch at buffer " << i << ", byte " << j;
}
}
}
// Test decoding fixed-length byte array with filter
TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter) {
// Configure DECIMAL type parameters
const int32_t type_length = 16;
int precision = 10;
int scale = 2;
_decoder->set_type_length(type_length);
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make(
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare test data
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
0x61, 0x40}, // Data 1
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00}, // Data 2 (all zeros)
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF}, // Data 3 (all ones)
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
0xDE, 0xF0} // Data 4 (random)
};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& buffer : test_fixed_len_buffers) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
}
// Encode data
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
auto encoded_buffer = encoder->FlushValues();
// Set decoder data
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
// Create filter [1, 0, 1, 0]
size_t num_values = test_fixed_len_buffers.size();
std::vector<uint16_t> run_length_null_map(1, num_values);
std::vector<uint8_t> filter_data = {1, 0, 1, 0};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2 * type_length); // 2 values after filtering
auto* result_column = assert_cast<ColumnInt8*>(column.get());
// Verify first value
for (size_t j = 0; j < type_length; ++j) {
EXPECT_EQ(result_column->get_element(j), static_cast<int8_t>(test_fixed_len_buffers[0][j]))
<< "Mismatch at buffer 0, byte " << j;
}
// Verify third value
for (size_t j = 0; j < type_length; ++j) {
size_t index = type_length + j;
EXPECT_EQ(result_column->get_element(index),
static_cast<int8_t>(test_fixed_len_buffers[2][j]))
<< "Mismatch at buffer 2, byte " << j;
}
}
// Test decoding fixed-length byte array with filter and null values
TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter_and_null) {
// Configure DECIMAL type parameters
const int32_t type_length = 16;
int precision = 10;
int scale = 2;
_decoder->set_type_length(type_length);
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make(
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare test data
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
0x61, 0x40}, // Data 1
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00}, // Data 2 (all zeros)
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
0xDE, 0xF0} // Data 4 (random)
};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& buffer : test_fixed_len_buffers) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
}
// Encode data
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
auto encoded_buffer = encoder->FlushValues();
// Set decoder data
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
// Create filter [1, 0, 1, 0] and null vector [0, 0, 1, 0]
size_t num_values = 4;
std::vector<uint16_t> run_length_null_map = {2, 1, 1}; // Data: [Data 1, Data 2, null, Data 4]
std::vector<uint8_t> filter_data = {1, 0, 1, 0}; // Filtered data: [Data 1, null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2 * type_length); // 2 values after filtering (Data 1 and null)
auto* result_column = assert_cast<ColumnInt8*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<std::vector<uint8_t>>> expected_values;
expected_values.push_back(std::vector<uint8_t> {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13,
0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x61,
0x40}); // Data 1
expected_values.push_back(std::nullopt); // Only filtered values (Data 1 and null)
// Verify results
size_t filtered_index = 0;
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
for (size_t j = 0; j < type_length; ++j) {
size_t index = filtered_index * type_length + j;
EXPECT_EQ(result_column->get_element(index),
static_cast<int8_t>(expected_values[i].value()[j]))
<< "Mismatch at filtered value " << i << ", byte " << j;
}
EXPECT_FALSE(null_map[filtered_index])
<< "Expected non-null at filtered position " << filtered_index;
filtered_index++;
} else {
EXPECT_TRUE(null_map[filtered_index])
<< "Expected null at filtered position " << filtered_index;
filtered_index++;
}
}
}
// Test skipping values for fixed-length byte array decoding
TEST_F(DeltaByteArrayDecoderTest, test_skip_value_for_fixed_len_byte_array) {
// Configure DECIMAL type parameters
const int32_t type_length = 16;
int precision = 10;
int scale = 2;
_decoder->set_type_length(type_length);
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make(
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare test data
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
0x61, 0x40}, // Data 1
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00}, // Data 2 (all zeros)
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF}, // Data 3 (all ones)
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
0xDE, 0xF0} // Data 4 (random)
};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& buffer : test_fixed_len_buffers) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
}
// Encode data
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
auto encoded_buffer = encoder->FlushValues();
// Set decoder data
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Skip the first two values
ASSERT_TRUE(_decoder->skip_values(2).ok());
// Create column and data type
MutableColumnPtr column = ColumnInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
// Create selection vector
size_t num_values = test_fixed_len_buffers.size() - 2; // Skip first two values
std::vector<uint16_t> run_length_null_map(1, num_values);
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values * type_length);
auto* result_column = assert_cast<ColumnInt8*>(column.get());
// Verify decoded results (should start from the third value)
for (size_t i = 0; i < num_values; ++i) {
for (size_t j = 0; j < type_length; ++j) {
size_t index = i * type_length + j;
EXPECT_EQ(result_column->get_element(index),
static_cast<int8_t>(test_fixed_len_buffers[i + 2][j]))
<< "Mismatch at buffer " << (i + 2) << ", byte " << j;
}
}
}
// Test decoding with invalid data
TEST_F(DeltaByteArrayDecoderTest, test_invalid_data) {
// Prepare invalid encoded data
std::vector<uint8_t> encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_FALSE(_decoder->set_data(&data_slice).ok());
}
} // namespace doris::vectorized

View File

@ -0,0 +1,276 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <gtest/gtest.h>
#include "parquet/encoding.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
namespace doris::vectorized {
class DeltaLengthByteArrayDecoderTest : public ::testing::Test {
protected:
void SetUp() override { _decoder = std::make_unique<DeltaLengthByteArrayDecoder>(); }
std::unique_ptr<DeltaLengthByteArrayDecoder> _decoder;
};
// Test basic decoding functionality
TEST_F(DeltaLengthByteArrayDecoderTest, test_basic_decode) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Create encoder
auto encoder =
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create selection vector
size_t num_values = values.size();
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i]);
}
}
// Test decoding with filter
TEST_F(DeltaLengthByteArrayDecoderTest, test_decode_with_filter) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Create encoder
auto encoder =
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create filter vector [1, 0, 1, 0]
size_t num_values = values.size();
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
EXPECT_EQ(result_column->get_data_at(0).to_string(), "Hello");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "Foobar");
}
// Test decoding with filter and null values
TEST_F(DeltaLengthByteArrayDecoderTest, test_decode_with_filter_and_null) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<std::string> values = {"Hello", "World", "ABCDEF"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Create encoder
auto encoder =
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create filter vector [1, 0, 1, 0] and null vector [0, 0, 1, 0]
size_t num_values = 4;
std::vector<uint16_t> run_length_null_map = {2, 1,
1}; // data: ["Hello", "World", null, "ABCDEF"]
std::vector<uint8_t> filter_data = {1, 0, 1, 0}; // filtered_data: ["Hello", null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnString*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<std::string>> expected_values = {"Hello", std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test decoding with invalid data
TEST_F(DeltaLengthByteArrayDecoderTest, test_invalid_data) {
// Prepare invalid encoded data
std::vector<uint8_t> encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
ASSERT_FALSE(_decoder->set_data(&data_slice).ok());
}
// Test skipping values for delta length byte array decoding
TEST_F(DeltaLengthByteArrayDecoderTest, test_skip_value) {
// Create ColumnDescriptor
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
parquet::Type::BYTE_ARRAY);
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
// Prepare original data
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF", "Test", "Skip"};
std::vector<parquet::ByteArray> byte_array_values;
for (const auto& value : values) {
byte_array_values.emplace_back(
parquet::ByteArray {static_cast<uint32_t>(value.size()),
reinterpret_cast<const uint8_t*>(value.data())});
}
// Create encoder
auto encoder =
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
/*use_dictionary=*/false, descr.get());
// Put data into encoder
ASSERT_NO_THROW(
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
// Get encoded data
auto encoded_buffer = encoder->FlushValues();
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
// Skip first 3 values
ASSERT_TRUE(_decoder->skip_values(3).ok());
// Create column and data type
MutableColumnPtr column = ColumnString::create();
DataTypePtr data_type = std::make_shared<DataTypeString>();
// Create selection vector
size_t num_values = values.size() - 3; // Total 6 values, skip 3, remaining 3
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnString*>(column.get());
// Expected values after skipping first 3 values ("Hello", "World", "Foobar")
std::vector<std::string> expected_values = {"ABCDEF", "Test", "Skip"};
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i])
<< "Mismatch at value " << i;
}
}
} // namespace doris::vectorized

View File

@ -0,0 +1,538 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/fix_length_dict_decoder.hpp"
#include <gtest/gtest.h>
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
namespace doris::vectorized {
class FixLengthDictDecoderTest : public ::testing::Test {
protected:
void SetUp() override {
// Prepare test data: create a dictionary with fixed-length strings
_type_length = 6; // Each string has length 6
size_t dict_size = 3;
size_t dict_data_size = dict_size * _type_length;
auto dict_data = std::make_unique<uint8_t[]>(dict_data_size);
const char* values[3] = {"apple ", "banana", "cherry"}; // Dictionary values
for (int i = 0; i < 3; i++) {
memcpy(dict_data.get() + i * _type_length, values[i], _type_length);
}
_decoder.set_type_length(_type_length);
ASSERT_TRUE(_decoder.set_dict(dict_data, dict_data_size, dict_size).ok());
}
FixLengthDictDecoder _decoder;
size_t _type_length;
};
// Test basic decoding functionality
TEST_F(FixLengthDictDecoderTest, test_basic_decode) {
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
size_t num_values = 7;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values * _type_length);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Split decoded results into strings based on _type_length
std::vector<std::string> decoded_strings;
const auto& data = result_column->get_data();
for (size_t i = 0; i < num_values; ++i) {
std::string str;
for (size_t j = 0; j < _type_length; ++j) {
str.push_back(static_cast<char>(data[i * _type_length + j]));
}
decoded_strings.push_back(str);
}
// Verify first 4 repeated values (dict index 0 -> value "apple ")
for (int i = 0; i < 4; i++) {
EXPECT_EQ(decoded_strings[i], "apple ");
}
// Verify last 3 literal values
EXPECT_EQ(decoded_strings[4], "banana");
EXPECT_EQ(decoded_strings[5], "cherry");
EXPECT_EQ(decoded_strings[6], "banana");
}
// Test decoding with filter
TEST_F(FixLengthDictDecoderTest, test_decode_with_filter) {
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
;
// Create filter vector [1,0,1,0,1,1,1]
size_t num_values = 7;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5 * _type_length); // 5 values after filtering
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Split decoded results into strings based on _type_length
std::vector<std::string> decoded_strings;
const auto& data = result_column->get_data();
for (size_t i = 0; i < 5; ++i) {
std::string str;
for (size_t j = 0; j < _type_length; ++j) {
str.push_back(static_cast<char>(data[i * _type_length + j]));
}
decoded_strings.push_back(str);
}
// Verify filtered values
EXPECT_EQ(decoded_strings[0], "apple ");
EXPECT_EQ(decoded_strings[1], "apple ");
EXPECT_EQ(decoded_strings[2], "banana");
EXPECT_EQ(decoded_strings[3], "cherry");
EXPECT_EQ(decoded_strings[4], "banana");
}
// Test decoding with filter and null
TEST_F(FixLengthDictDecoderTest, test_decode_with_filter_and_null) {
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
size_t num_values = 7;
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5 * _type_length); // 5 values after filtering
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Split decoded results into strings based on _type_length
std::vector<std::string> decoded_strings;
const auto& data = result_column->get_data();
for (size_t i = 0; i < 5; ++i) {
std::string str;
for (size_t j = 0; j < _type_length; ++j) {
str.push_back(static_cast<char>(data[i * _type_length + j]));
}
decoded_strings.push_back(str);
}
// Expected values after filtering and null handling
std::vector<std::optional<std::string>> expected_values = {"apple ", "apple ", std::nullopt,
"cherry", std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(decoded_strings[i], expected_values[i].value()) << "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test empty dictionary case
TEST_F(FixLengthDictDecoderTest, test_empty_dict) {
FixLengthDictDecoder empty_decoder;
empty_decoder.set_type_length(sizeof(int32_t));
auto dict_data = std::make_unique<uint8_t[]>(0);
ASSERT_TRUE(empty_decoder.set_dict(dict_data, 0, 0).ok());
}
// Test decoding with ColumnDictI32
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32) {
// Create ColumnDictI32 column
MutableColumnPtr column = ColumnDictI32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
// Verify first 4 repeated values (dict index 0 -> value "apple ")
for (int i = 0; i < 4; i++) {
EXPECT_EQ(dict_column->get_data()[i], 0);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(), "apple ");
}
// Verify last 3 literal values
EXPECT_EQ(dict_column->get_data()[4], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana");
EXPECT_EQ(dict_column->get_data()[5], 2);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[5]).to_string(), "cherry");
EXPECT_EQ(dict_column->get_data()[6], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[6]).to_string(), "banana");
}
// Test decoding with ColumnDictI32 and filter
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32_with_filter) {
// Create ColumnDictI32 column
MutableColumnPtr column = ColumnDictI32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
// Verify filtered values
EXPECT_EQ(dict_column->get_data()[0], 0);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[0]).to_string(), "apple ");
EXPECT_EQ(dict_column->get_data()[1], 0);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[1]).to_string(), "apple ");
EXPECT_EQ(dict_column->get_data()[2], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[2]).to_string(), "banana");
EXPECT_EQ(dict_column->get_data()[3], 2);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[3]).to_string(), "cherry");
EXPECT_EQ(dict_column->get_data()[4], 1);
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana");
}
// Test decoding with ColumnDictI32 with filter and null
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32_with_filter_and_null) {
// Create ColumnDictI32 column
MutableColumnPtr column = ColumnDictI32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<std::string>> expected_values = {"apple ", "apple ", std::nullopt,
"cherry", std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(),
expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test decoding with ColumnInt32
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32) {
// Create ColumnInt32 column
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
// Verify first 4 repeated values (dict index 0 -> value "apple ")
for (int i = 0; i < 4; i++) {
EXPECT_EQ(dict_column->get_data()[i], 0);
}
// Verify last 3 literal values
EXPECT_EQ(dict_column->get_data()[4], 1);
EXPECT_EQ(dict_column->get_data()[5], 2);
EXPECT_EQ(dict_column->get_data()[6], 1);
}
// Test decoding with ColumnInt32 and filter
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32_with_filter) {
// Create ColumnInt32 column
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
// Verify filtered values
EXPECT_EQ(dict_column->get_data()[0], 0);
EXPECT_EQ(dict_column->get_data()[1], 0);
EXPECT_EQ(dict_column->get_data()[2], 1);
EXPECT_EQ(dict_column->get_data()[3], 2);
EXPECT_EQ(dict_column->get_data()[4], 1);
}
// Test decoding with ColumnInt32 with filter and null
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32_with_filter_and_null) {
// Create ColumnInt32 column
MutableColumnPtr column = ColumnInt32::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
const size_t num_values = 7;
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
// Verify results
ASSERT_EQ(column->size(), 5); // 5 values after filtering
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<int32_t>> expected_values = {0, 0, std::nullopt, 2, std::nullopt};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(dict_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test reading dictionary values to column
TEST_F(FixLengthDictDecoderTest, test_read_dict_values_to_column) {
// Create a column to store dictionary values
MutableColumnPtr column = ColumnString::create();
// Read dictionary values to column
ASSERT_TRUE(_decoder.read_dict_values_to_column(column).ok());
// Verify results
ASSERT_EQ(column->size(), 3); // 3 dictionary items
auto* result_column = assert_cast<ColumnString*>(column.get());
// Get decoded strings directly
std::vector<std::string> decoded_strings;
for (size_t i = 0; i < 3; ++i) {
decoded_strings.push_back(result_column->get_data_at(i).to_string());
}
// Verify dictionary values
EXPECT_EQ(decoded_strings[0], "apple ");
EXPECT_EQ(decoded_strings[1], "banana");
EXPECT_EQ(decoded_strings[2], "cherry");
}
// Test convert_dict_column_to_string_column function
TEST_F(FixLengthDictDecoderTest, test_convert_dict_column_to_string_column) {
// Create a ColumnInt32 with some dictionary codes
MutableColumnPtr dict_column = ColumnInt32::create();
dict_column->insert(0);
dict_column->insert(1);
dict_column->insert(2);
dict_column->insert(1);
// Convert to string column
MutableColumnPtr string_column = _decoder.convert_dict_column_to_string_column(
assert_cast<ColumnInt32*>(dict_column.get()));
// Verify results
ASSERT_EQ(string_column->size(), 4);
auto* result_column = assert_cast<ColumnString*>(string_column.get());
EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple ");
EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana");
EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry");
EXPECT_EQ(result_column->get_data_at(3).to_string(), "banana");
}
// Test skipping values for fixed length dictionary decoding
TEST_F(FixLengthDictDecoderTest, test_skip_value) {
MutableColumnPtr column = ColumnUInt8::create();
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
// Skip first 3 values
ASSERT_TRUE(_decoder.skip_values(3).ok());
// Create selection vector
size_t num_values = 4; // Total 7 values, skip 3, remaining 4
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values * _type_length);
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
// Split decoded results into strings based on _type_length
std::vector<std::string> decoded_strings;
const auto& data = result_column->get_data();
for (size_t i = 0; i < num_values; ++i) {
std::string str;
for (size_t j = 0; j < _type_length; ++j) {
str.push_back(static_cast<char>(data[i * _type_length + j]));
}
decoded_strings.push_back(str);
}
// Expected values after skipping first 3 values ("apple ", "apple ", "apple ")
std::vector<std::string> expected_values = {"apple ", "banana", "cherry", "banana"};
for (size_t i = 0; i < num_values; ++i) {
EXPECT_EQ(decoded_strings[i], expected_values[i]) << "Mismatch at value " << i;
}
}
} // namespace doris::vectorized

View File

@ -0,0 +1,203 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/fix_length_plain_decoder.h"
#include <gtest/gtest.h>
#include "util/slice.h"
#include "vec/columns/column_vector.h"
#include "vec/data_types/data_type_number.h"
namespace doris::vectorized {
class FixLengthPlainDecoderTest : public ::testing::Test {
protected:
void SetUp() override {}
std::unique_ptr<uint8_t[]> _data;
Slice _data_slice;
size_t _type_length;
};
// Test basic decoding functionality
TEST_F(FixLengthPlainDecoderTest, test_basic_decode) {
// Prepare test data: create fixed-length integer values
int32_t values[3] = {123, 456, 789};
size_t data_size = sizeof(values);
_data = std::make_unique<uint8_t[]>(data_size);
memcpy(_data.get(), values, data_size);
_data_slice = Slice(_data.get(), data_size);
_type_length = sizeof(int32_t);
FixLengthPlainDecoder decoder;
decoder.set_type_length(_type_length);
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
MutableColumnPtr column = ColumnVector<int32_t>::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create selection vector without filter
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 123);
EXPECT_EQ(result_column->get_data()[1], 456);
EXPECT_EQ(result_column->get_data()[2], 789);
}
// Test decoding with filter
TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter) {
// Prepare test data: create fixed-length integer values
int32_t values[3] = {123, 456, 789};
size_t data_size = sizeof(values);
_data = std::make_unique<uint8_t[]>(data_size);
memcpy(_data.get(), values, data_size);
_data_slice = Slice(_data.get(), data_size);
_type_length = sizeof(int32_t);
FixLengthPlainDecoder decoder;
decoder.set_type_length(_type_length);
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
MutableColumnPtr column = ColumnVector<int32_t>::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create filter vector [1,0,1]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data = {1, 0, 1};
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 123);
EXPECT_EQ(result_column->get_data()[1], 789);
}
// Test decoding with filter and null
TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter_and_null) {
// Prepare test data: create fixed-length integer values
int32_t values[2] = {123, 789};
size_t data_size = sizeof(values);
_data = std::make_unique<uint8_t[]>(data_size);
memcpy(_data.get(), values, data_size);
_data_slice = Slice(_data.get(), data_size);
_type_length = sizeof(int32_t);
FixLengthPlainDecoder decoder;
decoder.set_type_length(_type_length);
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
MutableColumnPtr column = ColumnVector<int32_t>::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create filter vector [1,0,1] and null vector [0,1,0]
size_t num_values = 3;
std::vector<uint16_t> run_length_null_map = {1, 1, 1}; // data: [123, null, 789]
std::vector<uint8_t> filter_data = {1, 0, 1}; // filtered_data: [123, 789]
ColumnSelectVector select_vector;
NullMap null_map;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), 2); // 2 values after filtering
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
// Expected values after filtering and null handling
std::vector<std::optional<int32_t>> expected_values = {123, 789};
for (size_t i = 0; i < expected_values.size(); ++i) {
if (expected_values[i].has_value()) {
EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
<< "Mismatch at value " << i;
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
} else {
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
}
}
}
// Test skipping values
TEST_F(FixLengthPlainDecoderTest, test_skip_value) {
// Prepare test data: create fixed-length integer values
int32_t values[3] = {123, 456, 789};
size_t data_size = sizeof(values);
_data = std::make_unique<uint8_t[]>(data_size);
memcpy(_data.get(), values, data_size);
_data_slice = Slice(_data.get(), data_size);
_type_length = sizeof(int32_t);
FixLengthPlainDecoder decoder;
decoder.set_type_length(_type_length);
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
// Skip first 2 values
ASSERT_TRUE(decoder.skip_values(2).ok());
MutableColumnPtr column = ColumnVector<int32_t>::create();
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
// Create selection vector
size_t num_values = 1; // Total 3 values, skip 2, remaining 1
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
std::vector<uint8_t> filter_data(num_values, 1);
ColumnSelectVector select_vector;
select_vector.build(filter_data.data(), filter_data.size(), false);
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
// Perform decoding
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
// Verify results
ASSERT_EQ(column->size(), num_values);
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
EXPECT_EQ(result_column->get_data()[0], 789);
}
} // namespace doris::vectorized

View File

@ -0,0 +1,225 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/level_decoder.h"
#include <gtest/gtest.h>
#include "parquet/encoding.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "util/slice.h"
namespace doris::vectorized {
class LevelDecoderTest : public ::testing::Test {
protected:
void SetUp() override { _decoder = std::make_unique<LevelDecoder>(); }
std::unique_ptr<LevelDecoder> _decoder;
};
// Test basic RLE level decoding for data page v1
TEST_F(LevelDecoderTest, test_rle_decode_v1) {
// Prepare RLE encoded data
// RLE encoded data: 4 zeros followed by 1, 2, 1 [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {
0x04, 0x00, 0x00, 0x00, // RLE length (4 bytes)
8, 0, 3, 0b00011001 // RLE data
};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder
ASSERT_TRUE(_decoder->init(&data_slice, tparquet::Encoding::RLE, 2, 7).ok());
// Decode levels
level_t levels[7];
size_t num_levels = _decoder->get_levels(levels, 7);
// Verify results
ASSERT_EQ(num_levels, 7);
EXPECT_EQ(levels[0], 0);
EXPECT_EQ(levels[1], 0);
EXPECT_EQ(levels[2], 0);
EXPECT_EQ(levels[3], 0);
EXPECT_EQ(levels[4], 1);
EXPECT_EQ(levels[5], 2);
EXPECT_EQ(levels[6], 1);
}
// Test basic BIT-PACKED level decoding for data page v1
TEST_F(LevelDecoderTest, test_bit_packed_decode_v1) {
// Prepare BIT-PACKED encoded data
// [1 2 1]
std::vector<uint8_t> rle_data = {0b00011001};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder
ASSERT_TRUE(_decoder->init(&data_slice, tparquet::Encoding::BIT_PACKED, 2, 3).ok());
// Decode levels
level_t levels[3];
size_t num_levels = _decoder->get_levels(levels, 3);
// Verify results
ASSERT_EQ(num_levels, 3);
EXPECT_EQ(levels[0], 1);
EXPECT_EQ(levels[1], 2);
EXPECT_EQ(levels[2], 1);
}
// Test RLE level decoding for data page v2
TEST_F(LevelDecoderTest, test_rle_decode_v2) {
// Prepare RLE encoded data
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder
ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
// Decode levels
level_t levels[7];
size_t num_levels = _decoder->get_levels(levels, 7);
// Verify results
ASSERT_EQ(num_levels, 7);
EXPECT_EQ(levels[0], 0);
EXPECT_EQ(levels[1], 0);
EXPECT_EQ(levels[2], 0);
EXPECT_EQ(levels[3], 0);
EXPECT_EQ(levels[4], 1);
EXPECT_EQ(levels[5], 2);
EXPECT_EQ(levels[6], 1);
}
// Test invalid RLE data for data page v1
TEST_F(LevelDecoderTest, test_invalid_rle_data_v1) {
// Prepare invalid RLE data
std::vector<uint8_t> rle_data = {0x04, 0x00, 0x00, 0x00, // RLE length (4 bytes)
8, 0, 3};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder should fail
ASSERT_FALSE(_decoder->init(&data_slice, tparquet::Encoding::RLE, 1, 8).ok());
}
// TODO: Currently not working, so commented out.
// Test invalid RLE data for data page v2
//TEST_F(LevelDecoderTest, test_invalid_rle_data_v2) {
// // Prepare invalid RLE data
// std::vector<uint8_t> rle_data = {8, 0, 3};
// Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
//
// // Initialize decoder should fail
// ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
//
// // Decode levels
// level_t levels[7];
// size_t num_levels = _decoder->get_levels(levels, 7);
//
// // Verify results
// ASSERT_EQ(num_levels, 7);
//}
// Test unsupported encoding
TEST_F(LevelDecoderTest, test_unsupported_encoding) {
// Prepare dummy data
std::vector<uint8_t> dummy_data = {0x00};
Slice data_slice(reinterpret_cast<char*>(dummy_data.data()), dummy_data.size());
// Initialize decoder with unsupported encoding should fail
ASSERT_FALSE(_decoder->init(&data_slice, tparquet::Encoding::PLAIN, 1, 8).ok());
}
// Test has_levels() function
TEST_F(LevelDecoderTest, test_has_levels) {
// Initially, there should be no levels
EXPECT_FALSE(_decoder->has_levels());
// Prepare RLE encoded data
std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder with valid data
ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
// Now there should be levels
EXPECT_TRUE(_decoder->has_levels());
}
// Test get_next() function
TEST_F(LevelDecoderTest, test_get_next) {
// Prepare RLE encoded data
std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder
ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
// Verify the first level
EXPECT_EQ(_decoder->get_next(), 0);
// Verify the next level
EXPECT_EQ(_decoder->get_next(), 0);
}
// Test rewind_one() function
TEST_F(LevelDecoderTest, test_rewind_one) {
// Prepare RLE encoded data
std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder
ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
// Get the first level
level_t first_level = _decoder->get_next();
// Get the second level
level_t second_level = _decoder->get_next();
// Rewind one level
_decoder->rewind_one();
// Verify that we get the second level again
EXPECT_EQ(_decoder->get_next(), second_level);
// Rewind one more level
_decoder->rewind_one();
// Verify that we get the first level again
EXPECT_EQ(_decoder->get_next(), first_level);
}
// Test rle_decoder() function
TEST_F(LevelDecoderTest, test_rle_decoder) {
// Prepare RLE encoded data
std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
// Initialize decoder
ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
// Get the RLE decoder
const RleDecoder<level_t>& rle_decoder = _decoder->rle_decoder();
// Verify that the RLE decoder is not null
EXPECT_NE(&rle_decoder, nullptr);
}
} // namespace doris::vectorized