[Test][Fix](parquet-reader) Add parquet decoder unit tests and fix the bugs these tests uncovered. (#49922)
This commit is contained in:
@ -43,11 +43,12 @@ public:
|
||||
~BoolPlainDecoder() override = default;
|
||||
|
||||
// Set the data to be decoded
|
||||
void set_data(Slice* data) override {
|
||||
Status set_data(Slice* data) override {
|
||||
bool_values_.Reset((const uint8_t*)data->data, data->size);
|
||||
num_unpacked_values_ = 0;
|
||||
unpacked_value_idx_ = 0;
|
||||
_offset = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
|
||||
|
||||
@ -30,29 +30,29 @@
|
||||
#include "vec/exec/format/parquet/parquet_common.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
void BoolRLEDecoder::set_data(Slice* slice) {
|
||||
Status BoolRLEDecoder::set_data(Slice* slice) {
|
||||
_data = slice;
|
||||
_num_bytes = slice->size;
|
||||
_offset = 0;
|
||||
_current_value_idx = 0;
|
||||
if (_num_bytes < 4) {
|
||||
LOG(FATAL) << "Received invalid length : " + std::to_string(_num_bytes) +
|
||||
" (corrupt data page?)";
|
||||
return Status::IOError("Received invalid length : " + std::to_string(_num_bytes) +
|
||||
" (corrupt data page?)");
|
||||
}
|
||||
// Load the first 4 bytes in little-endian, which indicates the length
|
||||
const uint8_t* data = reinterpret_cast<const uint8_t*>(_data->data);
|
||||
uint32_t num_bytes = decode_fixed32_le(data);
|
||||
if (num_bytes > static_cast<uint32_t>(_num_bytes - 4)) {
|
||||
LOG(FATAL) << ("Received invalid number of bytes : " + std::to_string(num_bytes) +
|
||||
" (corrupt data page?)");
|
||||
return Status::IOError("Received invalid number of bytes : " + std::to_string(num_bytes) +
|
||||
" (corrupt data page?)");
|
||||
}
|
||||
_num_bytes = num_bytes;
|
||||
auto decoder_data = data + 4;
|
||||
_decoder = RleDecoder<uint8_t>(decoder_data, num_bytes, 1);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status BoolRLEDecoder::skip_values(size_t num_values) {
|
||||
_current_value_idx += num_values;
|
||||
_decoder.Skip(num_values);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -76,15 +76,16 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt
|
||||
if (!_decoder.get_values(_values.data(), max_values)) {
|
||||
return Status::IOError("Can't read enough booleans in rle decoder");
|
||||
}
|
||||
size_t current_value_idx = 0;
|
||||
ColumnSelectVector::DataReadType read_type;
|
||||
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
|
||||
switch (read_type) {
|
||||
case ColumnSelectVector::CONTENT: {
|
||||
bool value; // Can't use uint8_t directly, we should correct it.
|
||||
for (size_t i = 0; i < run_length; ++i) {
|
||||
DCHECK(_current_value_idx < max_values)
|
||||
<< _current_value_idx << " vs. " << max_values;
|
||||
value = _values[_current_value_idx++];
|
||||
DCHECK(current_value_idx < max_values)
|
||||
<< current_value_idx << " vs. " << max_values;
|
||||
value = _values[current_value_idx++];
|
||||
column_data[data_index++] = (UInt8)value;
|
||||
}
|
||||
break;
|
||||
@ -94,7 +95,7 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::FILTERED_CONTENT: {
|
||||
_current_value_idx += run_length;
|
||||
current_value_idx += run_length;
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::FILTERED_NULL: {
|
||||
@ -102,7 +103,6 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt
|
||||
}
|
||||
}
|
||||
}
|
||||
_current_value_idx = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
|
||||
@ -40,7 +40,7 @@ public:
|
||||
BoolRLEDecoder() = default;
|
||||
~BoolRLEDecoder() override = default;
|
||||
|
||||
void set_data(Slice* slice) override;
|
||||
Status set_data(Slice* slice) override;
|
||||
|
||||
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
|
||||
ColumnSelectVector& select_vector, bool is_dict_filter) override;
|
||||
@ -55,6 +55,5 @@ private:
|
||||
RleDecoder<uint8_t> _decoder;
|
||||
std::vector<uint8_t> _values;
|
||||
size_t _num_bytes;
|
||||
size_t _current_value_idx = 0;
|
||||
};
|
||||
} // namespace doris::vectorized
|
||||
} // namespace doris::vectorized
|
||||
|
||||
@ -59,9 +59,10 @@ public:
|
||||
void set_type_length(int32_t type_length) { _type_length = type_length; }
|
||||
|
||||
// Set the data to be decoded
|
||||
virtual void set_data(Slice* data) {
|
||||
virtual Status set_data(Slice* data) {
|
||||
_data = data;
|
||||
_offset = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Write the decoded values batch to doris's column
|
||||
@ -95,13 +96,14 @@ public:
|
||||
~BaseDictDecoder() override = default;
|
||||
|
||||
// Set the data to be decoded
|
||||
void set_data(Slice* data) override {
|
||||
Status set_data(Slice* data) override {
|
||||
_data = data;
|
||||
_offset = 0;
|
||||
uint8_t bit_width = *data->data;
|
||||
_index_batch_decoder = std::make_unique<RleBatchDecoder<uint32_t>>(
|
||||
reinterpret_cast<uint8_t*>(data->data) + 1, static_cast<int>(data->size) - 1,
|
||||
bit_width);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
134
be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp
Normal file
134
be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp
Normal file
@ -0,0 +1,134 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
// Decodes every encoded length up front and caches the result in
// _buffered_length, then resets the read cursor over that cache.
//
// Returns the first error reported by the length decoder, either while it is
// attached to the shared bit reader or while the lengths are decoded;
// Status::OK() otherwise.
Status DeltaLengthByteArrayDecoder::_decode_lengths() {
    // The length decoder reads from the same bit reader as this decoder.
    RETURN_IF_ERROR(_len_decoder.set_bit_reader(_bit_reader));

    // Size the cache to the number of lengths the decoder reports.
    const int num_length = _len_decoder.valid_values_count();
    _buffered_length.resize(num_length);

    // Decode all of them in a single call.
    int ret;
    RETURN_IF_ERROR(_len_decoder.decode(_buffered_length.data(), num_length, &ret));
    DCHECK_EQ(ret, num_length);

    // Nothing has been consumed yet: all decoded lengths are still available.
    _num_valid_values = num_length;
    _length_idx = 0;
    return Status::OK();
}
|
||||
|
||||
// Produces up to `max_values` strings. The bytes are read into the internal
// _buffered_data and each entry of `buffer` is pointed at its slice of it.
//
// buffer          output array; must have room for at least
//                 min(max_values, _num_valid_values) entries.
// max_values      upper bound on the number of strings to produce.
// out_num_values  receives the number of strings actually produced.
//
// Returns InvalidArgument for a negative length or when the summed lengths
// overflow int32_t, and IOError when the bit reader cannot supply a byte.
Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values,
                                                  int* out_num_values) {
    // Never hand out more values than are still pending.
    const int batch = std::min(max_values, _num_valid_values);
    if (batch == 0) {
        *out_num_values = 0;
        return Status::OK();
    }

    // Pass 1: validate the lengths, record them in the output slices and
    // accumulate the total number of payload bytes.
    const int32_t* lengths = _buffered_length.data() + _length_idx;
    int32_t total_bytes = 0;
    for (int idx = 0; idx < batch; ++idx) {
        const int32_t cur_len = lengths[idx];
        if (PREDICT_FALSE(cur_len < 0)) {
            return Status::InvalidArgument("Negative string delta length");
        }
        buffer[idx].size = cur_len;
        if (common::add_overflow(total_bytes, cur_len, total_bytes)) {
            return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
        }
    }
    _length_idx += batch;

    // Pass 2: read the payload into the internal buffer, one byte per
    // GetValue call.
    _buffered_data.resize(total_bytes);
    char* const bytes_begin = _buffered_data.data();
    for (int32_t pos = 0; pos < total_bytes; ++pos) {
        if (!_bit_reader->GetValue(8, bytes_begin + pos)) {
            return Status::IOError("Get length bytes EOF");
        }
    }

    // Pass 3: point each output slice at its consecutive region.
    char* cursor = bytes_begin;
    for (int idx = 0; idx < batch; ++idx) {
        buffer[idx].data = cursor;
        cursor += buffer[idx].size;
    }

    _num_valid_values -= batch;
    *out_num_values = batch;
    return Status::OK();
}
|
||||
|
||||
// Produces up to `max_values` strings by joining a prefix of the previous
// string with a suffix from the suffix decoder. The joined bytes live in the
// internal _buffered_data; each entry of `buffer` is pointed at its string.
//
// buffer          output array; first filled with suffixes by the suffix
//                 decoder, then rewritten to reference the full strings.
// max_values      upper bound on the number of strings to produce.
// out_num_values  receives the number of strings actually produced.
//
// Returns any error from the suffix decoder, IOError when it yields fewer
// suffixes than requested, and InvalidArgument when a prefix length is
// negative, exceeds the previous string, or the total size overflows int64_t.
Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) {
    // Never hand out more values than are still pending.
    const int batch = std::min(max_values, _num_valid_values);
    if (batch == 0) {
        *out_num_values = batch;
        return Status::OK();
    }

    // Fetch the suffixes; buffer[i] references suffix i after this call.
    int suffix_read;
    RETURN_IF_ERROR(_suffix_decoder.decode(buffer, batch, &suffix_read));
    if (PREDICT_FALSE(suffix_read != batch)) {
        return Status::IOError("Read {}, expecting {} from suffix decoder",
                               std::to_string(suffix_read), std::to_string(batch));
    }

    // Validate the prefix lengths and compute the total output size before
    // anything is copied.
    const int32_t* prefix_lens = _buffered_prefix_length.data() + _prefix_len_offset;
    int64_t total_bytes = 0;
    for (int idx = 0; idx < batch; ++idx) {
        const int32_t prefix_len = prefix_lens[idx];
        if (PREDICT_FALSE(prefix_len < 0)) {
            return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
        }
        const bool overflowed =
                common::add_overflow(total_bytes, static_cast<int64_t>(prefix_len),
                                     total_bytes) ||
                common::add_overflow(total_bytes, static_cast<int64_t>(buffer[idx].size),
                                     total_bytes);
        if (PREDICT_FALSE(overflowed)) {
            return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
        }
    }
    _buffered_data.resize(total_bytes);

    // `previous` is the string the next prefix is taken from. It starts at
    // _last_value and then follows each string as it is assembled.
    std::string_view previous {_last_value};

    char* cursor = _buffered_data.data();
    for (int idx = 0; idx < batch; ++idx) {
        const int32_t prefix_len = prefix_lens[idx];
        if (PREDICT_FALSE(static_cast<size_t>(prefix_len) > previous.length())) {
            return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
        }
        Slice& entry = buffer[idx];
        // Prefix first, then the suffix that `entry` still references.
        memcpy(cursor, previous.data(), prefix_len);
        memcpy(cursor + prefix_len, entry.data, entry.size);
        // Re-point the slice at the assembled string.
        entry.data = cursor;
        entry.size += prefix_len;
        cursor += entry.size;
        previous = std::string_view {entry.data, entry.size};
    }

    _prefix_len_offset += batch;
    _num_valid_values -= batch;
    // Keep an owned copy so the next call can build on the last string.
    _last_value = std::string {previous};

    // Once every value has been consumed, also record it as the previous
    // page's last value.
    if (_num_valid_values == 0) {
        _last_value_in_previous_page = _last_value;
    }
    *out_num_values = batch;
    return Status::OK();
}
|
||||
} // namespace doris::vectorized
|
||||
@ -47,10 +47,6 @@ public:
|
||||
|
||||
~DeltaDecoder() override = default;
|
||||
|
||||
Status skip_values(size_t num_values) override {
|
||||
return _type_converted_decoder->skip_values(num_values);
|
||||
}
|
||||
|
||||
template <bool has_filter>
|
||||
Status decode_byte_array(const std::vector<Slice>& decoded_vals, MutableColumnPtr& doris_column,
|
||||
DataTypePtr& data_type, ColumnSelectVector& select_vector) {
|
||||
@ -125,9 +121,10 @@ public:
|
||||
}
|
||||
|
||||
protected:
|
||||
void init_values_converter() {
|
||||
_type_converted_decoder->set_data(_data);
|
||||
Status init_values_converter() {
|
||||
RETURN_IF_ERROR(_type_converted_decoder->set_data(_data));
|
||||
_type_converted_decoder->set_type_length(_type_length);
|
||||
return Status::OK();
|
||||
}
|
||||
// Convert decoded value to doris type value.
|
||||
std::unique_ptr<Decoder> _type_converted_decoder;
|
||||
@ -148,6 +145,13 @@ public:
|
||||
|
||||
DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder()) {}
|
||||
~DeltaBitPackDecoder() override = default;
|
||||
|
||||
Status skip_values(size_t num_values) override {
|
||||
_values.resize(num_values);
|
||||
int num_valid_values;
|
||||
return _get_internal(_values.data(), num_values, &num_valid_values);
|
||||
}
|
||||
|
||||
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
|
||||
ColumnSelectVector& select_vector, bool is_dict_filter) override {
|
||||
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
|
||||
@ -159,7 +163,7 @@ public:
|
||||
_type_length = sizeof(T);
|
||||
_data->size = _values.size() * _type_length;
|
||||
// set decoded value with fix plain decoder
|
||||
init_values_converter();
|
||||
RETURN_IF_ERROR(init_values_converter());
|
||||
return _type_converted_decoder->decode_values(doris_column, data_type, select_vector,
|
||||
is_dict_filter);
|
||||
}
|
||||
@ -173,24 +177,20 @@ public:
|
||||
return static_cast<int>(_total_values_remaining);
|
||||
}
|
||||
|
||||
void set_data(Slice* slice) override {
|
||||
Status set_data(Slice* slice) override {
|
||||
_bit_reader.reset(new BitReader((const uint8_t*)slice->data, slice->size));
|
||||
Status st = _init_header();
|
||||
if (!st.ok()) {
|
||||
LOG(FATAL) << "Fail to init delta encoding header for " << st.to_string();
|
||||
}
|
||||
RETURN_IF_ERROR(_init_header());
|
||||
_data = slice;
|
||||
_offset = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Set BitReader which is already initialized by DeltaLengthByteArrayDecoder or
|
||||
// DeltaByteArrayDecoder
|
||||
void set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
|
||||
Status set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
|
||||
_bit_reader = std::move(bit_reader);
|
||||
Status st = _init_header();
|
||||
if (!st.ok()) {
|
||||
LOG(FATAL) << "Fail to init delta encoding header for " << st.to_string();
|
||||
}
|
||||
RETURN_IF_ERROR(_init_header());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
@ -265,25 +265,27 @@ public:
|
||||
return _get_internal(buffer, num_values, out_num_values);
|
||||
}
|
||||
|
||||
void set_data(Slice* slice) override {
|
||||
Status set_data(Slice* slice) override {
|
||||
if (slice->size == 0) {
|
||||
return;
|
||||
return Status::OK();
|
||||
}
|
||||
_bit_reader = std::make_shared<BitReader>((const uint8_t*)slice->data, slice->size);
|
||||
_data = slice;
|
||||
_offset = 0;
|
||||
_decode_lengths();
|
||||
RETURN_IF_ERROR(_decode_lengths());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
|
||||
Status set_bit_reader(std::shared_ptr<BitReader> bit_reader) {
|
||||
_bit_reader = std::move(bit_reader);
|
||||
_decode_lengths();
|
||||
RETURN_IF_ERROR(_decode_lengths());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
// Decode all the encoded lengths. The decoder_ will be at the start of the encoded data
|
||||
// after that.
|
||||
void _decode_lengths();
|
||||
Status _decode_lengths();
|
||||
Status _get_internal(Slice* buffer, int max_values, int* out_num_values);
|
||||
|
||||
std::vector<Slice> _values;
|
||||
@ -333,9 +335,9 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
void set_data(Slice* slice) override {
|
||||
Status set_data(Slice* slice) override {
|
||||
_bit_reader = std::make_shared<BitReader>((const uint8_t*)slice->data, slice->size);
|
||||
_prefix_len_decoder.set_bit_reader(_bit_reader);
|
||||
RETURN_IF_ERROR(_prefix_len_decoder.set_bit_reader(_bit_reader));
|
||||
|
||||
// get the number of encoded prefix lengths
|
||||
int num_prefix = _prefix_len_decoder.valid_values_count();
|
||||
@ -343,20 +345,19 @@ public:
|
||||
// all the prefix lengths are buffered in _buffered_prefix_length.
|
||||
_buffered_prefix_length.resize(num_prefix);
|
||||
int ret;
|
||||
Status st = _prefix_len_decoder.decode(_buffered_prefix_length.data(), num_prefix, &ret);
|
||||
if (!st.ok()) {
|
||||
LOG(FATAL) << "Fail to decode delta prefix, status: " << st;
|
||||
}
|
||||
RETURN_IF_ERROR(
|
||||
_prefix_len_decoder.decode(_buffered_prefix_length.data(), num_prefix, &ret));
|
||||
DCHECK_EQ(ret, num_prefix);
|
||||
_prefix_len_offset = 0;
|
||||
_num_valid_values = num_prefix;
|
||||
|
||||
// at this time, the decoder_ will be at the start of the encoded suffix data.
|
||||
_suffix_decoder.set_bit_reader(_bit_reader);
|
||||
RETURN_IF_ERROR(_suffix_decoder.set_bit_reader(_bit_reader));
|
||||
|
||||
// TODO: read corrupted files written with bug(PARQUET-246). _last_value should be set
|
||||
// to _last_value_in_previous_page when decoding a new page(except the first page)
|
||||
_last_value = "";
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status decode(Slice* buffer, int num_values, int* out_num_values) {
|
||||
@ -517,119 +518,4 @@ Status DeltaBitPackDecoder<T>::_get_internal(T* buffer, int num_values, int* out
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void DeltaLengthByteArrayDecoder::_decode_lengths() {
|
||||
_len_decoder.set_bit_reader(_bit_reader);
|
||||
// get the number of encoded lengths
|
||||
int num_length = _len_decoder.valid_values_count();
|
||||
_buffered_length.resize(num_length);
|
||||
|
||||
// decode all the lengths. all the lengths are buffered in buffered_length_.
|
||||
int ret;
|
||||
Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret);
|
||||
if (!st.ok()) {
|
||||
LOG(FATAL) << "Fail to decode delta length, status: " << st;
|
||||
}
|
||||
DCHECK_EQ(ret, num_length);
|
||||
_length_idx = 0;
|
||||
_num_valid_values = num_length;
|
||||
}
|
||||
|
||||
Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values,
|
||||
int* out_num_values) {
|
||||
// Decode up to `max_values` strings into an internal buffer
|
||||
// and reference them into `buffer`.
|
||||
max_values = std::min(max_values, _num_valid_values);
|
||||
if (max_values == 0) {
|
||||
*out_num_values = 0;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
int32_t data_size = 0;
|
||||
const int32_t* length_ptr = _buffered_length.data() + _length_idx;
|
||||
for (int i = 0; i < max_values; ++i) {
|
||||
int32_t len = length_ptr[i];
|
||||
if (PREDICT_FALSE(len < 0)) {
|
||||
return Status::InvalidArgument("Negative string delta length");
|
||||
}
|
||||
buffer[i].size = len;
|
||||
if (common::add_overflow(data_size, len, data_size)) {
|
||||
return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
|
||||
}
|
||||
}
|
||||
_length_idx += max_values;
|
||||
|
||||
_buffered_data.resize(data_size);
|
||||
char* data_ptr = _buffered_data.data();
|
||||
for (int j = 0; j < data_size; j++) {
|
||||
if (!_bit_reader->GetValue(8, data_ptr + j)) {
|
||||
return Status::IOError("Get length bytes EOF");
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < max_values; ++i) {
|
||||
buffer[i].data = data_ptr;
|
||||
data_ptr += buffer[i].size;
|
||||
}
|
||||
// this->num_values_ -= max_values;
|
||||
_num_valid_values -= max_values;
|
||||
*out_num_values = max_values;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) {
|
||||
// Decode up to `max_values` strings into an internal buffer
|
||||
// and reference them into `buffer`.
|
||||
max_values = std::min(max_values, _num_valid_values);
|
||||
if (max_values == 0) {
|
||||
*out_num_values = max_values;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
int suffix_read;
|
||||
RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read));
|
||||
if (PREDICT_FALSE(suffix_read != max_values)) {
|
||||
return Status::IOError("Read {}, expecting {} from suffix decoder",
|
||||
std::to_string(suffix_read), std::to_string(max_values));
|
||||
}
|
||||
|
||||
int64_t data_size = 0;
|
||||
const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset;
|
||||
for (int i = 0; i < max_values; ++i) {
|
||||
if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
|
||||
return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
|
||||
}
|
||||
if (PREDICT_FALSE(common::add_overflow(data_size, static_cast<int64_t>(prefix_len_ptr[i]),
|
||||
data_size) ||
|
||||
common::add_overflow(data_size, static_cast<int64_t>(buffer[i].size),
|
||||
data_size))) {
|
||||
return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
|
||||
}
|
||||
}
|
||||
_buffered_data.resize(data_size);
|
||||
|
||||
std::string_view prefix {_last_value};
|
||||
|
||||
char* data_ptr = _buffered_data.data();
|
||||
for (int i = 0; i < max_values; ++i) {
|
||||
if (PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) {
|
||||
return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
|
||||
}
|
||||
memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
|
||||
// buffer[i] currently points to the string suffix
|
||||
memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size);
|
||||
buffer[i].data = data_ptr;
|
||||
buffer[i].size += prefix_len_ptr[i];
|
||||
data_ptr += buffer[i].size;
|
||||
prefix = std::string_view {buffer[i].data, buffer[i].size};
|
||||
}
|
||||
_prefix_len_offset += max_values;
|
||||
_num_valid_values -= max_values;
|
||||
_last_value = std::string {prefix};
|
||||
|
||||
if (_num_valid_values == 0) {
|
||||
_last_value_in_previous_page = _last_value;
|
||||
}
|
||||
*out_num_values = max_values;
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
|
||||
@ -121,7 +121,8 @@ protected:
|
||||
|
||||
Status read_dict_values_to_column(MutableColumnPtr& doris_column) override {
|
||||
size_t dict_items_size = _dict_items.size();
|
||||
std::vector<StringRef> dict_values(dict_items_size);
|
||||
std::vector<StringRef> dict_values;
|
||||
dict_values.reserve(dict_items_size);
|
||||
for (size_t i = 0; i < dict_items_size; ++i) {
|
||||
dict_values.emplace_back(_dict_items[i], _type_length);
|
||||
}
|
||||
@ -131,7 +132,8 @@ protected:
|
||||
|
||||
MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override {
|
||||
auto res = ColumnString::create();
|
||||
std::vector<StringRef> dict_values(dict_column->size());
|
||||
std::vector<StringRef> dict_values;
|
||||
dict_values.reserve(dict_column->size());
|
||||
const auto& data = dict_column->get_data();
|
||||
for (size_t i = 0; i < dict_column->size(); ++i) {
|
||||
dict_values.emplace_back(_dict_items[data[i]], _type_length);
|
||||
|
||||
40
be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
Normal file
40
be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp
Normal file
@ -0,0 +1,40 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/fix_length_plain_decoder.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
// Advances the read offset past `num_values` fixed-width values.
//
// Returns IOError if the new offset lies beyond the end of the data slice.
// The offset is advanced before the check, so it stays advanced on failure.
Status FixLengthPlainDecoder::skip_values(size_t num_values) {
    const auto skipped_bytes = _type_length * num_values;
    _offset += skipped_bytes;

    const bool past_end = _offset > _data->size;
    if (UNLIKELY(past_end)) {
        return Status::IOError("Out-of-bounds access in parquet data decoder");
    }
    return Status::OK();
}
|
||||
|
||||
// Forwards to the _decode_values<has_filter> instantiation that matches
// select_vector.has_filter(); all arguments are passed through unchanged.
Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                            ColumnSelectVector& select_vector,
                                            bool is_dict_filter) {
    return select_vector.has_filter()
                   ? _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter)
                   : _decode_values<false>(doris_column, data_type, select_vector,
                                           is_dict_filter);
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -40,67 +40,45 @@ public:
|
||||
|
||||
template <bool has_filter>
|
||||
Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
|
||||
ColumnSelectVector& select_vector, bool is_dict_filter);
|
||||
ColumnSelectVector& select_vector, bool is_dict_filter) {
|
||||
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
|
||||
if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
|
||||
return Status::IOError("Out-of-bounds access in parquet data decoder");
|
||||
}
|
||||
|
||||
size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory();
|
||||
size_t data_index = doris_column->size() * primitive_length;
|
||||
size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) *
|
||||
(_type_length / primitive_length);
|
||||
doris_column->resize(doris_column->size() + scale_size);
|
||||
char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
|
||||
ColumnSelectVector::DataReadType read_type;
|
||||
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
|
||||
switch (read_type) {
|
||||
case ColumnSelectVector::CONTENT: {
|
||||
memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length);
|
||||
_offset += run_length * _type_length;
|
||||
data_index += run_length * _type_length;
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::NULL_DATA: {
|
||||
data_index += run_length * _type_length;
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::FILTERED_CONTENT: {
|
||||
_offset += _type_length * run_length;
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::FILTERED_NULL: {
|
||||
// do nothing
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status skip_values(size_t num_values) override;
|
||||
};
|
||||
|
||||
Status FixLengthPlainDecoder::skip_values(size_t num_values) {
|
||||
_offset += _type_length * num_values;
|
||||
if (UNLIKELY(_offset > _data->size)) {
|
||||
return Status::IOError("Out-of-bounds access in parquet data decoder");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
|
||||
ColumnSelectVector& select_vector,
|
||||
bool is_dict_filter) {
|
||||
if (select_vector.has_filter()) {
|
||||
return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter);
|
||||
} else {
|
||||
return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
|
||||
}
|
||||
}
|
||||
|
||||
template <bool has_filter>
|
||||
Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
|
||||
ColumnSelectVector& select_vector,
|
||||
bool is_dict_filter) {
|
||||
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
|
||||
if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
|
||||
return Status::IOError("Out-of-bounds access in parquet data decoder");
|
||||
}
|
||||
|
||||
size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory();
|
||||
size_t data_index = doris_column->size() * primitive_length;
|
||||
size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) *
|
||||
(_type_length / primitive_length);
|
||||
doris_column->resize(doris_column->size() + scale_size);
|
||||
char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
|
||||
ColumnSelectVector::DataReadType read_type;
|
||||
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
|
||||
switch (read_type) {
|
||||
case ColumnSelectVector::CONTENT: {
|
||||
memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length);
|
||||
_offset += run_length * _type_length;
|
||||
data_index += run_length * _type_length;
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::NULL_DATA: {
|
||||
data_index += run_length * _type_length;
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::FILTERED_CONTENT: {
|
||||
_offset += _type_length * run_length;
|
||||
break;
|
||||
}
|
||||
case ColumnSelectVector::FILTERED_NULL: {
|
||||
// do nothing
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
} // namespace doris::vectorized
|
||||
|
||||
@ -90,7 +90,15 @@ size_t doris::vectorized::LevelDecoder::get_levels(doris::vectorized::level_t* l
|
||||
_num_levels -= num_decoded;
|
||||
return num_decoded;
|
||||
} else if (_encoding == tparquet::Encoding::BIT_PACKED) {
|
||||
// TODO(gaoxin): BIT_PACKED encoding
|
||||
n = std::min((size_t)_num_levels, n);
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
if (!_bit_packed_decoder.GetValue(_bit_width, &levels[i])) {
|
||||
throw doris::Exception(ErrorCode::INTERNAL_ERROR,
|
||||
"Failed to decode BIT_PACKED levels");
|
||||
}
|
||||
}
|
||||
_num_levels -= n;
|
||||
return n;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -208,7 +208,7 @@ Status ColumnChunkReader::load_page_data() {
|
||||
_page_decoder = _decoders[static_cast<int>(encoding)].get();
|
||||
}
|
||||
// Reset page data for each page
|
||||
_page_decoder->set_data(&_page_data);
|
||||
RETURN_IF_ERROR(_page_decoder->set_data(&_page_data));
|
||||
|
||||
_state = DATA_LOADED;
|
||||
return Status::OK();
|
||||
|
||||
239
be/test/vec/exec/format/parquet/bool_plain_decoder_test.cpp
Normal file
239
be/test/vec/exec/format/parquet/bool_plain_decoder_test.cpp
Normal file
@ -0,0 +1,239 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/bool_plain_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class BoolPlainDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override { _decoder = std::make_unique<BoolPlainDecoder>(); }
|
||||
|
||||
std::unique_ptr<BoolPlainDecoder> _decoder;
|
||||
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
// Decodes one byte of plain-encoded booleans with no nulls and checks every
// decoded value.
TEST_F(BoolPlainDecoderTest, test_basic_decode) {
    // 0b10001101 read from the least significant bit upward is
    // [true, false, true, true, false, false, false, true].
    std::vector<uint8_t> encoded_data = {0b10001101};
    const int expected_values[] = {1, 0, 1, 1, 0, 0, 0, 1};
    const size_t num_values = 8;

    Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
    ASSERT_TRUE(_decoder->set_data(&data_slice).ok());

    // Destination column and its type.
    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Selection vector without filter: a single non-null run over all values.
    std::vector<uint16_t> run_length_null_map(1, num_values);
    std::vector<uint8_t> filter_data(num_values, 1);
    ColumnSelectVector select_vector;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);

    // Decode.
    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // The column must hold exactly the expected values, in order.
    ASSERT_EQ(column->size(), num_values);
    auto* result_column = assert_cast<ColumnUInt8*>(column.get());
    for (size_t row = 0; row < num_values; ++row) {
        EXPECT_EQ(result_column->get_data()[row], expected_values[row]);
    }
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(BoolPlainDecoderTest, test_decode_with_filter) {
    // One packed byte; read from the least significant bit upwards it spells
    // [true, false, true, true, false, false, false, true].
    std::vector<uint8_t> packed_bits = {0b10001101};
    Slice page(reinterpret_cast<char*>(packed_bits.data()), packed_bits.size());
    ASSERT_TRUE(_decoder->set_data(&page).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Keep rows 0, 2, 4 and 6 only; every row is non-null.
    size_t value_count = 8;
    std::vector<uint8_t> keep_even_rows = {1, 0, 1, 0, 1, 0, 1, 0};
    std::vector<uint16_t> null_runs(1, value_count);
    ColumnSelectVector select_vector;
    select_vector.build(keep_even_rows.data(), keep_even_rows.size(), false);
    select_vector.set_run_length_null_map(null_runs, value_count, nullptr);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // Four rows survive the filter: source rows 0, 2, 4, 6.
    const std::vector<int> expected = {1, 1, 0, 0};
    ASSERT_EQ(column->size(), expected.size());
    const auto& decoded = assert_cast<ColumnUInt8*>(column.get())->get_data();
    for (size_t i = 0; i < expected.size(); ++i) {
        EXPECT_EQ(decoded[i], expected[i]);
    }
}
|
||||
|
||||
// Test skipping values
|
||||
TEST_F(BoolPlainDecoderTest, test_skip_value) {
    // One packed byte; read from the least significant bit upwards it spells
    // [true, false, true, true, false, false, false, true].
    std::vector<uint8_t> packed_bits = {0b10001101};
    Slice page(reinterpret_cast<char*>(packed_bits.data()), packed_bits.size());
    ASSERT_TRUE(_decoder->set_data(&page).ok());

    // Discard the leading (true, false, true) before decoding.
    ASSERT_TRUE(_decoder->skip_values(3).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // 8 encoded values minus the 3 skipped leaves 5; read all of them, none null.
    size_t remaining = 5;
    std::vector<uint8_t> keep_all(remaining, 1);
    std::vector<uint16_t> null_runs(1, remaining);
    ColumnSelectVector select_vector;
    select_vector.build(keep_all.data(), keep_all.size(), false);
    select_vector.set_run_length_null_map(null_runs, remaining, nullptr);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    ASSERT_EQ(column->size(), remaining);
    const auto& decoded = assert_cast<ColumnUInt8*>(column.get())->get_data();

    // Tail of the encoded sequence after the three skipped values.
    std::vector<uint8_t> expected_values = {1, 0, 0, 0, 1};
    for (size_t i = 0; i < remaining; ++i) {
        EXPECT_EQ(decoded[i], expected_values[i]) << "Mismatch at value " << i;
    }
}
|
||||
|
||||
// Test decoding with filter and null
|
||||
TEST_F(BoolPlainDecoderTest, test_decode_with_filter_and_null) {
    // One packed byte; from the least significant bit upwards:
    // true, false, true, true, followed by zero bits (false).
    std::vector<uint8_t> encoded_data = {0b00001101};
    Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
    ASSERT_TRUE(_decoder->set_data(&data_slice).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Seven rows. Null runs {4, 1, 1, 1} describe 4 values, 1 null, 1 value, 1 null:
    //   rows:     [true, false, true, true, null, false, null]
    //   filter:   [1,    0,     1,    0,    1,    1,     1   ]
    //   selected: [true, true, null, false, null]
    size_t num_values = 7;
    std::vector<uint16_t> run_length_null_map {4, 1, 1, 1};
    std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};

    ColumnSelectVector select_vector;
    NullMap null_map;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // Expected rows after filtering; std::nullopt marks a row that must be null.
    std::vector<std::optional<uint8_t>> expected_values = {1, 1, std::nullopt, 0, std::nullopt};

    ASSERT_EQ(column->size(), 5); // 5 values after filtering
    // Guard the null_map[i] reads below: fail cleanly instead of reading out of bounds
    // if the null map ended up shorter than the rows being checked.
    ASSERT_GE(null_map.size(), expected_values.size());
    auto* result_column = assert_cast<ColumnUInt8*>(column.get());

    for (size_t i = 0; i < expected_values.size(); ++i) {
        if (expected_values[i].has_value()) {
            EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
                    << "Mismatch at value " << i;
            EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
        } else {
            // The stored value of a null row is not checked, only its null flag.
            EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
        }
    }
}
|
||||
|
||||
// Test decoding data generated by arrow
|
||||
TEST_F(BoolPlainDecoderTest, test_data_generated_by_arrow) {
    // Values handed to Arrow's encoder; the decoder must reproduce them.
    std::vector<bool> values = {true, false, true, true, false, false, false, true};

    // Column descriptor for a REQUIRED BOOLEAN column named "test_column".
    auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
                                                     parquet::Type::BOOLEAN);
    auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);

    // Encode the values with Arrow's PLAIN boolean encoder.
    auto encoder = MakeTypedEncoder<parquet::BooleanType>(parquet::Encoding::PLAIN,
                                                          /*use_dictionary=*/false, descr.get());
    ASSERT_NO_THROW(encoder->Put(values, static_cast<int>(values.size())));
    auto encoded_buffer = encoder->FlushValues();

    // Feed the Arrow-produced bytes to the decoder under test.
    Slice page(encoded_buffer->data(), encoded_buffer->size());
    ASSERT_TRUE(_decoder->set_data(&page).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Select every row; all rows are non-null.
    size_t value_count = values.size();
    std::vector<uint8_t> keep_all(value_count, 1);
    std::vector<uint16_t> null_runs(1, value_count);
    ColumnSelectVector select_vector;
    select_vector.build(keep_all.data(), keep_all.size(), false);
    select_vector.set_run_length_null_map(null_runs, value_count, nullptr);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // Each decoded cell must be 1 where the source value was true and 0 otherwise.
    ASSERT_EQ(column->size(), value_count);
    const auto& decoded = assert_cast<ColumnUInt8*>(column.get())->get_data();
    for (size_t i = 0; i < value_count; ++i) {
        EXPECT_EQ(decoded[i], values[i] ? 1 : 0);
    }
}
|
||||
|
||||
// Test invalid data case
|
||||
//TEST_F(BoolPlainDecoderTest, test_invalid_data) {
|
||||
// // Prepare invalid encoded data
|
||||
// std::vector<uint8_t> encoded_data = {0b111111111}; // 9 bits
|
||||
// Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
// ASSERT_FALSE(_decoder->set_data(&data_slice).ok());
|
||||
//}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
239
be/test/vec/exec/format/parquet/bool_rle_decoder_test.cpp
Normal file
239
be/test/vec/exec/format/parquet/bool_rle_decoder_test.cpp
Normal file
@ -0,0 +1,239 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/bool_rle_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class BoolRLEDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override { _decoder = std::make_unique<BoolRLEDecoder>(); }
|
||||
|
||||
std::unique_ptr<BoolRLEDecoder> _decoder;
|
||||
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
TEST_F(BoolRLEDecoderTest, test_basic_decode) {
    // Page bytes: a 4-byte little-endian payload length (2), then the payload
    // {0x03, 0x8d}, which encodes [true, false, true, true, false, false, false, true].
    std::vector<uint8_t> page_bytes = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d};
    Slice page(reinterpret_cast<char*>(page_bytes.data()), page_bytes.size());
    ASSERT_TRUE(_decoder->set_data(&page).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Select every row (filter of all ones) and describe all rows as non-null.
    size_t value_count = 8;
    std::vector<uint8_t> keep_all(value_count, 1);
    std::vector<uint16_t> null_runs(1, value_count);
    ColumnSelectVector select_vector;
    select_vector.build(keep_all.data(), keep_all.size(), false);
    select_vector.set_run_length_null_map(null_runs, value_count, nullptr);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // The column must hold exactly the eight encoded values, in order.
    ASSERT_EQ(column->size(), value_count);
    const auto& decoded = assert_cast<ColumnUInt8*>(column.get())->get_data();
    const std::vector<int> expected = {1, 0, 1, 1, 0, 0, 0, 1};
    for (size_t i = 0; i < expected.size(); ++i) {
        EXPECT_EQ(decoded[i], expected[i]);
    }
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(BoolRLEDecoderTest, test_decode_with_filter) {
    // Page bytes: a 4-byte little-endian payload length (2), then the payload
    // {0x03, 0x8d}, which encodes [true, false, true, true, false, false, false, true].
    std::vector<uint8_t> page_bytes = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d};
    Slice page(reinterpret_cast<char*>(page_bytes.data()), page_bytes.size());
    ASSERT_TRUE(_decoder->set_data(&page).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Keep rows 0, 2, 4 and 6 only; every row is non-null.
    size_t value_count = 8;
    std::vector<uint8_t> keep_even_rows = {1, 0, 1, 0, 1, 0, 1, 0};
    std::vector<uint16_t> null_runs(1, value_count);
    ColumnSelectVector select_vector;
    select_vector.build(keep_even_rows.data(), keep_even_rows.size(), false);
    select_vector.set_run_length_null_map(null_runs, value_count, nullptr);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // Four rows survive the filter: source rows 0, 2, 4, 6.
    const std::vector<int> expected = {1, 1, 0, 0};
    ASSERT_EQ(column->size(), expected.size());
    const auto& decoded = assert_cast<ColumnUInt8*>(column.get())->get_data();
    for (size_t i = 0; i < expected.size(); ++i) {
        EXPECT_EQ(decoded[i], expected[i]);
    }
}
|
||||
|
||||
// Test decoding with filter and null values
|
||||
TEST_F(BoolRLEDecoderTest, test_decode_with_filter_and_null) {
    // Page bytes: a 4-byte little-endian payload length (2), then the payload {0x03, 0x25}.
    // 0x25 is 0b00100101; from the least significant bit upwards that is
    // [true, false, true, false, false, true] followed by two zero bits.
    // These six values line up with the six non-null rows listed below
    // (presumably null rows do not consume an encoded value - confirm against the decoder).
    std::vector<uint8_t> encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x25};
    Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
    ASSERT_TRUE(_decoder->set_data(&data_slice).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Eight rows. Null runs {2, 1, 3, 1, 1} describe 2 values, 1 null, 3 values,
    // 1 null, 1 value:
    //   rows:     [true, false, null, true, false, false, null, true]
    //   filter:   [1,    0,     1,    0,    1,     0,     1,    0   ]
    //   selected: [true, null, false, null]
    size_t num_values = 8;
    std::vector<uint16_t> run_length_null_map = {2, 1, 3, 1, 1};
    std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 0, 1, 0};

    ColumnSelectVector select_vector;
    NullMap null_map;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // Expected rows after filtering; std::nullopt marks a row that must be null.
    std::vector<std::optional<uint8_t>> expected_values = {1, std::nullopt, 0, std::nullopt};

    ASSERT_EQ(column->size(), 4); // 4 values after filtering
    // Guard the null_map[i] reads below: fail cleanly instead of reading out of bounds
    // if the null map ended up shorter than the rows being checked.
    ASSERT_GE(null_map.size(), expected_values.size());
    auto* result_column = assert_cast<ColumnUInt8*>(column.get());

    for (size_t i = 0; i < expected_values.size(); ++i) {
        if (expected_values[i].has_value()) {
            EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
                    << "Mismatch at value " << i;
            EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
        } else {
            // The stored value of a null row is not checked, only its null flag.
            EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
        }
    }
}
|
||||
|
||||
// Test skipping values for bool RLE decoding
|
||||
TEST_F(BoolRLEDecoderTest, test_skip_value) {
    // Page bytes: a 4-byte little-endian payload length (2), then the payload
    // {0x03, 0x8d}, which encodes [true, false, true, true, false, false, false, true].
    std::vector<uint8_t> page_bytes = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d};
    Slice page(reinterpret_cast<char*>(page_bytes.data()), page_bytes.size());
    ASSERT_TRUE(_decoder->set_data(&page).ok());

    // Discard the leading (true, false, true) before decoding.
    ASSERT_TRUE(_decoder->skip_values(3).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // 8 encoded values minus the 3 skipped leaves 5; read all of them, none null.
    size_t remaining = 5;
    std::vector<uint8_t> keep_all(remaining, 1);
    std::vector<uint16_t> null_runs(1, remaining);
    ColumnSelectVector select_vector;
    select_vector.build(keep_all.data(), keep_all.size(), false);
    select_vector.set_run_length_null_map(null_runs, remaining, nullptr);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    ASSERT_EQ(column->size(), remaining);
    const auto& decoded = assert_cast<ColumnUInt8*>(column.get())->get_data();

    // Tail of the encoded sequence after the three skipped values.
    std::vector<uint8_t> expected_values = {1, 0, 0, 0, 1};
    for (size_t i = 0; i < remaining; ++i) {
        EXPECT_EQ(decoded[i], expected_values[i]) << "Mismatch at value " << i;
    }
}
|
||||
|
||||
// Test decoding data generated by arrow
|
||||
TEST_F(BoolRLEDecoderTest, test_data_generated_by_arrow) {
    // Values handed to Arrow's encoder; the decoder must reproduce them.
    std::vector<bool> values = {true, false, true, true, false, false, false, true};

    // Column descriptor for a REQUIRED BOOLEAN column named "test_column".
    auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
                                                     parquet::Type::BOOLEAN);
    auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);

    // Encode the values with Arrow's RLE boolean encoder.
    auto encoder = MakeTypedEncoder<parquet::BooleanType>(parquet::Encoding::RLE,
                                                          /*use_dictionary=*/false, descr.get());
    ASSERT_NO_THROW(encoder->Put(values, static_cast<int>(values.size())));
    auto encoded_buffer = encoder->FlushValues();

    // Feed the Arrow-produced bytes to the decoder under test.
    Slice page(encoded_buffer->data(), encoded_buffer->size());
    ASSERT_TRUE(_decoder->set_data(&page).ok());

    MutableColumnPtr column = ColumnUInt8::create();
    DataTypePtr data_type = std::make_shared<DataTypeUInt8>();

    // Select every row; all rows are non-null.
    size_t value_count = values.size();
    std::vector<uint8_t> keep_all(value_count, 1);
    std::vector<uint16_t> null_runs(1, value_count);
    ColumnSelectVector select_vector;
    select_vector.build(keep_all.data(), keep_all.size(), false);
    select_vector.set_run_length_null_map(null_runs, value_count, nullptr);

    ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());

    // Each decoded cell must be 1 where the source value was true and 0 otherwise.
    ASSERT_EQ(column->size(), value_count);
    const auto& decoded = assert_cast<ColumnUInt8*>(column.get())->get_data();
    for (size_t i = 0; i < value_count; ++i) {
        EXPECT_EQ(decoded[i], values[i] ? 1 : 0);
    }
}
|
||||
|
||||
// Test invalid data case
|
||||
TEST_F(BoolRLEDecoderTest, test_invalid_data) {
    // Only two bytes: shorter than the 4-byte length prefix that set_data() expects,
    // so the page must be rejected.
    std::vector<uint8_t> truncated_page = {0x08, 0x01};
    Slice page(reinterpret_cast<char*>(truncated_page.data()), truncated_page.size());

    const auto status = _decoder->set_data(&page);
    ASSERT_FALSE(status.ok());
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
500
be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp
Normal file
500
be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp
Normal file
@ -0,0 +1,500 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/byte_array_dict_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_dictionary.h"
|
||||
#include "vec/columns/column_string.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_string.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class ByteArrayDictDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {
|
||||
// Prepare test data: create a dictionary with byte array strings
|
||||
const char* values[3] = {"apple", "banana", "cherry"};
|
||||
size_t dict_size = 3;
|
||||
size_t dict_data_size = 0;
|
||||
|
||||
// Calculate total dictionary data size
|
||||
for (int i = 0; i < 3; i++) {
|
||||
dict_data_size += 4 + strlen(values[i]); // 4 bytes for length + string data
|
||||
}
|
||||
|
||||
auto dict_data = std::make_unique<uint8_t[]>(dict_data_size);
|
||||
size_t offset = 0;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
uint32_t len = strlen(values[i]);
|
||||
encode_fixed32_le(dict_data.get() + offset, len);
|
||||
offset += 4;
|
||||
memcpy(dict_data.get() + offset, values[i], len);
|
||||
offset += len;
|
||||
}
|
||||
|
||||
ASSERT_TRUE(_decoder.set_dict(dict_data, dict_data_size, dict_size).ok());
|
||||
}
|
||||
|
||||
ByteArrayDictDecoder _decoder;
|
||||
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
TEST_F(ByteArrayDictDecoderTest, test_basic_decode) {
    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Dictionary-index page, per the Parquet RLE/bit-packing hybrid layout:
    //   0x02        bit width of each index
    //   0x03        header of a bit-packed run holding one group of 8 indexes
    //   0x00, 0x19  the 8 indexes, 2 bits each, least significant bits first:
    //               [0 0 0 0 1 2 1 0]
    // Only the first 7 indexes are read below; the last one is padding.
    // The same 7 indexes written as an RLE run of four zeros followed by a
    // bit-packed group would be {2, 8, 0, 3, 0b00011001, 0}.
    std::vector<uint8_t> rle_data = {0x02, 0x03, 0x00, 0x19};

    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
    ASSERT_TRUE(_decoder.set_data(&data_slice).ok());

    // Select all 7 rows; every row is non-null.
    size_t num_values = 7;
    std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
    std::vector<uint8_t> filter_data(num_values, 1);
    ColumnSelectVector select_vector;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);

    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    ASSERT_EQ(column->size(), num_values);
    auto* result_column = assert_cast<ColumnString*>(column.get());

    // Indexes 0 0 0 0 resolve to the first dictionary entry.
    for (int i = 0; i < 4; i++) {
        EXPECT_EQ(result_column->get_data_at(i).to_string(), "apple");
    }

    // Indexes 1, 2, 1 resolve to the second, third and second entries.
    EXPECT_EQ(result_column->get_data_at(4).to_string(), "banana");
    EXPECT_EQ(result_column->get_data_at(5).to_string(), "cherry");
    EXPECT_EQ(result_column->get_data_at(6).to_string(), "banana");
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter) {
    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Dictionary indexes [0 0 0 0 1 2 1]: bit width 2, a run of four zeros, then a
    // bit-packed group carrying 1, 2, 1 and padded out to 8 values.
    std::vector<uint8_t> index_page = {2, 8, 0, 3, 0b00011001, 0};
    Slice page(reinterpret_cast<char*>(index_page.data()), index_page.size());
    ASSERT_TRUE(_decoder.set_data(&page).ok());

    // Drop rows 1 and 3; every row is non-null.
    size_t value_count = 7;
    std::vector<uint8_t> row_filter = {1, 0, 1, 0, 1, 1, 1};
    std::vector<uint16_t> null_runs(1, value_count);
    ColumnSelectVector select_vector;
    select_vector.build(row_filter.data(), row_filter.size(), false);
    select_vector.set_run_length_null_map(null_runs, value_count, nullptr);

    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    // Surviving rows 0, 2, 4, 5, 6 resolved through the dictionary.
    const std::vector<std::string> expected = {"apple", "apple", "banana", "cherry", "banana"};
    ASSERT_EQ(column->size(), expected.size());
    auto* strings = assert_cast<ColumnString*>(column.get());
    for (size_t i = 0; i < expected.size(); ++i) {
        EXPECT_EQ(strings->get_data_at(i).to_string(), expected[i]);
    }
}
|
||||
|
||||
// Test decoding with filter and null
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_and_null) {
    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Dictionary indexes [0 0 0 0 2]: bit width 2, a run of four zeros, then a
    // bit-packed group whose first index is 2 and whose remaining slots are padding.
    std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};

    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
    ASSERT_TRUE(_decoder.set_data(&data_slice).ok());

    // Seven rows. Null runs {4, 1, 1, 1} describe 4 values, 1 null, 1 value, 1 null:
    //   rows:     [0 0 0 0 null 2 null]
    //   filter:   [1 0 1 0 1    1 1   ]
    //   selected: [0 0 null 2 null]
    size_t num_values = 7;
    std::vector<uint16_t> run_length_null_map {4, 1, 1, 1};
    std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};

    ColumnSelectVector select_vector;
    NullMap null_map;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);

    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    // Expected rows after filtering; std::nullopt marks a row that must be null.
    std::vector<std::optional<std::string>> expected_values = {"apple", "apple", std::nullopt,
                                                               "cherry", std::nullopt};

    ASSERT_EQ(column->size(), 5); // 5 values after filtering
    // Guard the null_map[i] reads below: fail cleanly instead of reading out of bounds
    // if the null map ended up shorter than the rows being checked.
    ASSERT_GE(null_map.size(), expected_values.size());
    auto* result_column = assert_cast<ColumnString*>(column.get());

    for (size_t i = 0; i < expected_values.size(); ++i) {
        if (expected_values[i].has_value()) {
            EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value())
                    << "Mismatch at value " << i;
            EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
        } else {
            // The stored value of a null row is not checked, only its null flag.
            EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
        }
    }
}
|
||||
|
||||
// Test empty dictionary case
|
||||
TEST_F(ByteArrayDictDecoderTest, test_empty_dict) {
    // Use a separate decoder: the fixture's decoder already received a dictionary in SetUp().
    ByteArrayDictDecoder decoder_without_entries;
    auto no_bytes = std::make_unique<uint8_t[]>(0);

    // A zero-length dictionary with zero entries must be accepted.
    const auto status = decoder_without_entries.set_dict(no_bytes, 0, 0);
    ASSERT_TRUE(status.ok());
}
|
||||
|
||||
// Test decoding with ColumnDictI32
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32) {
    // Decode into a dictionary column instead of a string column.
    MutableColumnPtr column = ColumnDictI32::create();
    DataTypePtr data_type = std::make_shared<DataTypeInt32>();

    // Dictionary indexes [0 0 0 0 1 2 1]: bit width 2, a run of four zeros, then a
    // bit-packed group carrying 1, 2, 1 and padded out to 8 values.
    std::vector<uint8_t> index_page = {2, 8, 0, 3, 0b00011001, 0};
    Slice page(reinterpret_cast<char*>(index_page.data()), index_page.size());
    ASSERT_TRUE(_decoder.set_data(&page).ok());

    // Select all 7 rows; every row is non-null.
    const size_t num_values = 7;
    std::vector<uint8_t> keep_all(num_values, 1);
    std::vector<uint16_t> null_runs = {num_values};
    ColumnSelectVector select_vector;
    select_vector.build(keep_all.data(), keep_all.size(), false);
    select_vector.set_run_length_null_map(null_runs, num_values, nullptr);

    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    ASSERT_EQ(column->size(), num_values);
    auto* dict_column = assert_cast<ColumnDictI32*>(column.get());

    // Every row must carry the encoded index, and that index must resolve to the
    // matching dictionary string.
    const std::vector<int> expected_codes = {0, 0, 0, 0, 1, 2, 1};
    const std::vector<std::string> expected_words = {"apple",  "apple",  "apple", "apple",
                                                     "banana", "cherry", "banana"};
    for (size_t i = 0; i < expected_codes.size(); ++i) {
        EXPECT_EQ(dict_column->get_data()[i], expected_codes[i]);
        EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(),
                  expected_words[i]);
    }
}
|
||||
|
||||
// Test decoding with ColumnDictI32 and filter
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32_with_filter) {
    // Decode into a dictionary column instead of a string column.
    MutableColumnPtr column = ColumnDictI32::create();
    DataTypePtr data_type = std::make_shared<DataTypeInt32>();

    // Dictionary indexes [0 0 0 0 1 2 1]: bit width 2, a run of four zeros, then a
    // bit-packed group carrying 1, 2, 1 and padded out to 8 values.
    std::vector<uint8_t> index_page = {2, 8, 0, 3, 0b00011001, 0};
    Slice page(reinterpret_cast<char*>(index_page.data()), index_page.size());
    ASSERT_TRUE(_decoder.set_data(&page).ok());

    // Drop rows 1 and 3; every row is non-null.
    const size_t num_values = 7;
    std::vector<uint8_t> row_filter = {1, 0, 1, 0, 1, 1, 1};
    std::vector<uint16_t> null_runs = {num_values};
    ColumnSelectVector select_vector;
    select_vector.build(row_filter.data(), row_filter.size(), false);
    select_vector.set_run_length_null_map(null_runs, num_values, nullptr);

    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    // Surviving rows 0, 2, 4, 5, 6: check both the stored index and the string it maps to.
    const std::vector<int> expected_codes = {0, 0, 1, 2, 1};
    const std::vector<std::string> expected_words = {"apple", "apple", "banana", "cherry",
                                                     "banana"};
    ASSERT_EQ(column->size(), expected_codes.size());
    auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
    for (size_t i = 0; i < expected_codes.size(); ++i) {
        EXPECT_EQ(dict_column->get_data()[i], expected_codes[i]);
        EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(),
                  expected_words[i]);
    }
}
|
||||
|
||||
// Test decoding with ColumnDictI32 with filter and null
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32_with_filter_and_null) {
    // Decodes dictionary codes into a ColumnDictI32 with both a row filter and null rows,
    // then checks the resolved strings and the null map for the surviving rows.
    MutableColumnPtr column = ColumnDictI32::create();
    DataTypePtr data_type = std::make_shared<DataTypeInt32>();

    // Encoded input: 4 zeros followed by 2, padded to 8 values -> [0 0 0 0 2]
    std::vector<uint8_t> encoded_codes = {2, 8, 0, 3, 0b00000010, 0};
    Slice page_slice(reinterpret_cast<char*>(encoded_codes.data()), encoded_codes.size());
    ASSERT_TRUE(_decoder.set_data(&page_slice).ok());

    // Row filter [1,0,1,0,1,1,1] combined with null vector [0,0,0,0,1,0,1].
    constexpr size_t kRowCount = 7;
    std::vector<uint8_t> row_filter = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
    std::vector<uint16_t> null_runs {4, 1, 1, 1};            // data: [0 0 0 0 null 2 null]
    NullMap null_map;
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, kRowCount, &null_map);

    // Run the decoder over the filtered selection.
    ASSERT_TRUE(_decoder.decode_values(column, data_type, selector, false).ok());

    ASSERT_EQ(column->size(), 5); // 5 values after filtering
    auto* codes_column = assert_cast<ColumnDictI32*>(column.get());

    // Expected row contents after filtering; nullopt marks a row that must be null.
    std::vector<std::optional<std::string>> expected_rows = {"apple", "apple", std::nullopt,
                                                             "cherry", std::nullopt};
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        const auto& expected = expected_rows[row];
        if (!expected.has_value()) {
            // Only the null flag is checked for null rows.
            EXPECT_TRUE(null_map[row]) << "Expected null at position " << row;
            continue;
        }
        EXPECT_EQ(codes_column->get_value(codes_column->get_data()[row]).to_string(),
                  expected.value())
                << "Mismatch at value " << row;
        EXPECT_FALSE(null_map[row]) << "Expected non-null at position " << row;
    }
}
|
||||
|
||||
// Test decoding with ColumnInt32
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32) {
    // Decodes the dictionary codes into a plain ColumnInt32 (last decode_values argument
    // is true) and checks the codes themselves, without any filtering.
    MutableColumnPtr column = ColumnInt32::create();
    DataTypePtr data_type = std::make_shared<DataTypeInt32>();

    // Encoded input: 4 zeros followed by 1, 2, 1, padded to 8 values -> [0 0 0 0 1 2 1]
    std::vector<uint8_t> encoded_codes = {2, 8, 0, 3, 0b00011001, 0};
    Slice page_slice(reinterpret_cast<char*>(encoded_codes.data()), encoded_codes.size());
    ASSERT_TRUE(_decoder.set_data(&page_slice).ok());

    // Select all 7 rows (4 repeated + 3 literal); one run marks every row as non-null.
    constexpr size_t kRowCount = 7;
    std::vector<uint8_t> row_filter(kRowCount, 1);
    std::vector<uint16_t> null_runs = {kRowCount};
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, kRowCount, nullptr);

    // Run the decoder.
    ASSERT_TRUE(_decoder.decode_values(column, data_type, selector, true).ok());

    ASSERT_EQ(column->size(), kRowCount);
    auto* int_column = assert_cast<ColumnInt32*>(column.get());
    const auto& codes = int_column->get_data();

    // The first 4 rows come from the repeated run and all hold code 0.
    for (int row = 0; row < 4; ++row) {
        EXPECT_EQ(codes[row], 0);
    }

    // The last 3 rows are the literal values 1, 2, 1.
    EXPECT_EQ(codes[4], 1);
    EXPECT_EQ(codes[5], 2);
    EXPECT_EQ(codes[6], 1);
}
|
||||
|
||||
// Test decoding with ColumnInt32 and filter
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32_with_filter) {
    // Decodes the dictionary codes into a plain ColumnInt32 while a row filter drops
    // rows 1 and 3, then checks the surviving codes.
    MutableColumnPtr column = ColumnInt32::create();
    DataTypePtr data_type = std::make_shared<DataTypeInt32>();

    // Encoded input: 4 zeros followed by 1, 2, 1, padded to 8 values -> [0 0 0 0 1 2 1]
    std::vector<uint8_t> encoded_codes = {2, 8, 0, 3, 0b00011001, 0};
    Slice page_slice(reinterpret_cast<char*>(encoded_codes.data()), encoded_codes.size());
    ASSERT_TRUE(_decoder.set_data(&page_slice).ok());

    // Row filter [1,0,1,0,1,1,1]; one run spanning all rows marks every row as non-null.
    constexpr size_t kRowCount = 7;
    std::vector<uint8_t> row_filter = {1, 0, 1, 0, 1, 1, 1};
    std::vector<uint16_t> null_runs = {kRowCount};
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, kRowCount, nullptr);

    // Run the decoder over the filtered selection.
    ASSERT_TRUE(_decoder.decode_values(column, data_type, selector, true).ok());

    ASSERT_EQ(column->size(), 5); // 5 values after filtering
    auto* int_column = assert_cast<ColumnInt32*>(column.get());
    const auto& codes = int_column->get_data();

    // Surviving rows, in order: 0, 0, 1, 2, 1.
    EXPECT_EQ(codes[0], 0);
    EXPECT_EQ(codes[1], 0);
    EXPECT_EQ(codes[2], 1);
    EXPECT_EQ(codes[3], 2);
    EXPECT_EQ(codes[4], 1);
}
|
||||
|
||||
// Test decoding with ColumnInt32 with filter and null
|
||||
TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32_with_filter_and_null) {
    // Decodes the dictionary codes into a plain ColumnInt32 with both a row filter and
    // null rows, then checks the surviving codes and the null map.
    MutableColumnPtr column = ColumnInt32::create();
    DataTypePtr data_type = std::make_shared<DataTypeInt32>();

    // Encoded input: 4 zeros followed by 2, padded to 8 values -> [0 0 0 0 2]
    std::vector<uint8_t> encoded_codes = {2, 8, 0, 3, 0b00000010, 0};
    Slice page_slice(reinterpret_cast<char*>(encoded_codes.data()), encoded_codes.size());
    ASSERT_TRUE(_decoder.set_data(&page_slice).ok());

    // Row filter [1,0,1,0,1,1,1] combined with null vector [0,0,0,0,1,0,1].
    constexpr size_t kRowCount = 7;
    std::vector<uint8_t> row_filter = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
    std::vector<uint16_t> null_runs {4, 1, 1, 1};            // data: [0 0 0 0 null 2 null]
    NullMap null_map;
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, kRowCount, &null_map);

    // Run the decoder over the filtered selection.
    ASSERT_TRUE(_decoder.decode_values(column, data_type, selector, true).ok());

    ASSERT_EQ(column->size(), 5); // 5 values after filtering
    auto* int_column = assert_cast<ColumnInt32*>(column.get());

    // Expected row contents after filtering; nullopt marks a row that must be null.
    std::vector<std::optional<int32_t>> expected_rows = {0, 0, std::nullopt, 2, std::nullopt};
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        const auto& expected = expected_rows[row];
        if (!expected.has_value()) {
            // Only the null flag is checked for null rows.
            EXPECT_TRUE(null_map[row]) << "Expected null at position " << row;
            continue;
        }
        EXPECT_EQ(int_column->get_data()[row], expected.value()) << "Mismatch at value " << row;
        EXPECT_FALSE(null_map[row]) << "Expected non-null at position " << row;
    }
}
|
||||
|
||||
// Test reading dictionary values to column
|
||||
TEST_F(ByteArrayDictDecoderTest, test_read_dict_values_to_column) {
    // Copies the decoder's dictionary into a string column and checks its three entries.
    MutableColumnPtr dict_values = ColumnString::create();

    // Fill the column from the dictionary held by the fixture's decoder.
    ASSERT_TRUE(_decoder.read_dict_values_to_column(dict_values).ok());

    ASSERT_EQ(dict_values->size(), 3); // 3 dictionary items
    auto* strings = assert_cast<ColumnString*>(dict_values.get());

    // Entries must appear in dictionary order.
    EXPECT_EQ(strings->get_data_at(0).to_string(), "apple");
    EXPECT_EQ(strings->get_data_at(1).to_string(), "banana");
    EXPECT_EQ(strings->get_data_at(2).to_string(), "cherry");
}
|
||||
|
||||
// Test convert_dict_column_to_string_column function
|
||||
TEST_F(ByteArrayDictDecoderTest, test_convert_dict_column_to_string_column) {
    // Builds an Int32 column of dictionary codes [0, 1, 2, 1] and checks that the decoder
    // converts it to the matching strings.
    MutableColumnPtr codes = ColumnInt32::create();
    codes->insert(0);
    codes->insert(1);
    codes->insert(2);
    codes->insert(1);

    // Convert the code column into a string column.
    auto* code_column = assert_cast<ColumnInt32*>(codes.get());
    MutableColumnPtr converted = _decoder.convert_dict_column_to_string_column(code_column);

    // One output row per input code.
    ASSERT_EQ(converted->size(), 4);
    auto* strings = assert_cast<ColumnString*>(converted.get());

    EXPECT_EQ(strings->get_data_at(0).to_string(), "apple");
    EXPECT_EQ(strings->get_data_at(1).to_string(), "banana");
    EXPECT_EQ(strings->get_data_at(2).to_string(), "cherry");
    EXPECT_EQ(strings->get_data_at(3).to_string(), "banana");
}
|
||||
|
||||
// Test skipping values for byte array dictionary decoding
|
||||
TEST_F(ByteArrayDictDecoderTest, test_skip_value) {
    // Skips the first 3 encoded values, decodes the remaining 4 into a string column and
    // checks that decoding resumed at the right position.
    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Encoded input: 4 zeros followed by 1, 2, 1, padded to 8 values -> [0 0 0 0 1 2 1]
    std::vector<uint8_t> encoded_codes = {2, 8, 0, 3, 0b00011001, 0};
    Slice page_slice(reinterpret_cast<char*>(encoded_codes.data()), encoded_codes.size());
    ASSERT_TRUE(_decoder.set_data(&page_slice).ok());

    // Discard the first 3 values before decoding anything.
    ASSERT_TRUE(_decoder.skip_values(3).ok());

    // Select every remaining row: 7 values in total, 3 skipped, 4 left, none null.
    size_t remaining_rows = 4;
    std::vector<uint8_t> row_filter(remaining_rows, 1);
    std::vector<uint16_t> null_runs(1, remaining_rows);
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, remaining_rows, nullptr);

    // Run the decoder on what is left.
    ASSERT_TRUE(_decoder.decode_values(column, data_type, selector, false).ok());

    ASSERT_EQ(column->size(), remaining_rows);
    auto* strings = assert_cast<ColumnString*>(column.get());

    // Values left once the first 3 ("apple", "apple", "apple") have been skipped.
    std::vector<std::string> expected_rows = {"apple", "banana", "cherry", "banana"};
    for (size_t row = 0; row < remaining_rows; ++row) {
        EXPECT_EQ(strings->get_data_at(row).to_string(), expected_rows[row])
                << "Mismatch at value " << row;
    }
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -0,0 +1,242 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/byte_array_plain_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_string.h"
|
||||
#include "vec/data_types/data_type_string.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
// Fixture for the ByteArrayPlainDecoder tests. It only holds the encoded input; each test
// builds its own buffer and creates its own decoder.
class ByteArrayPlainDecoderTest : public ::testing::Test {
protected:
    // View over the encoded bytes that a test hands to the decoder.
    Slice _data_slice;

    // Owns the encoded buffer that _data_slice is built from.
    std::unique_ptr<uint8_t[]> _data;
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
TEST_F(ByteArrayPlainDecoderTest, test_basic_decode) {
    // Encodes three strings as length-prefixed byte arrays, decodes all of them and
    // checks that they round-trip.
    const char* const words[] = {"apple", "banana", "cherry"};

    // Each entry takes a 4-byte length prefix plus the string bytes.
    size_t encoded_size = 0;
    for (const char* word : words) {
        encoded_size += 4 + strlen(word);
    }

    // Write [little-endian length][bytes] for every word, back to back.
    _data = std::make_unique<uint8_t[]>(encoded_size);
    uint8_t* cursor = _data.get();
    for (const char* word : words) {
        const uint32_t word_len = static_cast<uint32_t>(strlen(word));
        encode_fixed32_le(cursor, word_len);
        cursor += 4;
        memcpy(cursor, word, word_len);
        cursor += word_len;
    }
    _data_slice = Slice(_data.get(), encoded_size);

    ByteArrayPlainDecoder decoder;
    ASSERT_TRUE(decoder.set_data(&_data_slice).ok());

    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Select all 3 rows; one run spanning every row marks them all as non-null.
    size_t row_count = 3;
    std::vector<uint8_t> row_filter(row_count, 1);
    std::vector<uint16_t> null_runs(1, row_count);
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, row_count, nullptr);

    // Run the decoder.
    ASSERT_TRUE(decoder.decode_values(column, data_type, selector, false).ok());

    ASSERT_EQ(column->size(), row_count);
    auto* strings = assert_cast<ColumnString*>(column.get());

    EXPECT_EQ(strings->get_data_at(0).to_string(), "apple");
    EXPECT_EQ(strings->get_data_at(1).to_string(), "banana");
    EXPECT_EQ(strings->get_data_at(2).to_string(), "cherry");
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(ByteArrayPlainDecoderTest, test_decode_with_filter) {
    // Encodes three strings as length-prefixed byte arrays and decodes them with a row
    // filter that drops the middle one.
    const char* const words[] = {"apple", "banana", "cherry"};

    // Each entry takes a 4-byte length prefix plus the string bytes.
    size_t encoded_size = 0;
    for (const char* word : words) {
        encoded_size += 4 + strlen(word);
    }

    // Write [little-endian length][bytes] for every word, back to back.
    _data = std::make_unique<uint8_t[]>(encoded_size);
    uint8_t* cursor = _data.get();
    for (const char* word : words) {
        const uint32_t word_len = static_cast<uint32_t>(strlen(word));
        encode_fixed32_le(cursor, word_len);
        cursor += 4;
        memcpy(cursor, word, word_len);
        cursor += word_len;
    }
    _data_slice = Slice(_data.get(), encoded_size);

    ByteArrayPlainDecoder decoder;
    ASSERT_TRUE(decoder.set_data(&_data_slice).ok());

    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Row filter [1,0,1]; one run spanning every row marks them all as non-null.
    size_t row_count = 3;
    std::vector<uint8_t> row_filter = {1, 0, 1};
    std::vector<uint16_t> null_runs(1, row_count);
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, row_count, nullptr);

    // Run the decoder over the filtered selection.
    ASSERT_TRUE(decoder.decode_values(column, data_type, selector, false).ok());

    ASSERT_EQ(column->size(), 2); // 2 values after filtering
    auto* strings = assert_cast<ColumnString*>(column.get());

    // "banana" was filtered out.
    EXPECT_EQ(strings->get_data_at(0).to_string(), "apple");
    EXPECT_EQ(strings->get_data_at(1).to_string(), "cherry");
}
|
||||
|
||||
// Test decoding with filter and null
|
||||
TEST_F(ByteArrayPlainDecoderTest, test_decode_with_filter_and_null) {
    // Encodes two strings, declares a null row between them through the run-length null
    // map, and decodes with a filter that drops exactly that null row.
    const char* const words[] = {"apple", "cherry"};

    // Each entry takes a 4-byte length prefix plus the string bytes.
    size_t encoded_size = 0;
    for (const char* word : words) {
        encoded_size += 4 + strlen(word);
    }

    // Write [little-endian length][bytes] for every word, back to back.
    _data = std::make_unique<uint8_t[]>(encoded_size);
    uint8_t* cursor = _data.get();
    for (const char* word : words) {
        const uint32_t word_len = static_cast<uint32_t>(strlen(word));
        encode_fixed32_le(cursor, word_len);
        cursor += 4;
        memcpy(cursor, word, word_len);
        cursor += word_len;
    }
    _data_slice = Slice(_data.get(), encoded_size);

    ByteArrayPlainDecoder decoder;
    ASSERT_TRUE(decoder.set_data(&_data_slice).ok());

    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Row filter [1,0,1] combined with null vector [0,1,0].
    size_t row_count = 3;
    std::vector<uint8_t> row_filter = {1, 0, 1};    // filtered_data: ["apple", "cherry"]
    std::vector<uint16_t> null_runs = {1, 1, 1};    // data: ["apple", null, "cherry"]
    NullMap null_map;
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, row_count, &null_map);

    // Run the decoder over the filtered selection.
    ASSERT_TRUE(decoder.decode_values(column, data_type, selector, false).ok());

    ASSERT_EQ(column->size(), 2); // 2 values after filtering
    auto* strings = assert_cast<ColumnString*>(column.get());

    // Expected row contents after filtering; nullopt would mark a row that must be null.
    std::vector<std::optional<std::string>> expected_rows = {"apple", "cherry"};
    for (size_t row = 0; row < expected_rows.size(); ++row) {
        const auto& expected = expected_rows[row];
        if (!expected.has_value()) {
            EXPECT_TRUE(null_map[row]) << "Expected null at position " << row;
            continue;
        }
        EXPECT_EQ(strings->get_data_at(row).to_string(), expected.value())
                << "Mismatch at value " << row;
        EXPECT_FALSE(null_map[row]) << "Expected non-null at position " << row;
    }
}
|
||||
|
||||
// Test skipping values
|
||||
TEST_F(ByteArrayPlainDecoderTest, test_skip_value) {
    // Encodes three strings, skips the first two and checks that the single remaining
    // value decodes to the third string.
    const char* const words[] = {"apple", "banana", "cherry"};

    // Each entry takes a 4-byte length prefix plus the string bytes.
    size_t encoded_size = 0;
    for (const char* word : words) {
        encoded_size += 4 + strlen(word);
    }

    // Write [little-endian length][bytes] for every word, back to back.
    _data = std::make_unique<uint8_t[]>(encoded_size);
    uint8_t* cursor = _data.get();
    for (const char* word : words) {
        const uint32_t word_len = static_cast<uint32_t>(strlen(word));
        encode_fixed32_le(cursor, word_len);
        cursor += 4;
        memcpy(cursor, word, word_len);
        cursor += word_len;
    }
    _data_slice = Slice(_data.get(), encoded_size);

    ByteArrayPlainDecoder decoder;
    ASSERT_TRUE(decoder.set_data(&_data_slice).ok());

    // Discard the first 2 values before decoding anything.
    ASSERT_TRUE(decoder.skip_values(2).ok());

    MutableColumnPtr column = ColumnString::create();
    DataTypePtr data_type = std::make_shared<DataTypeString>();

    // Select the one remaining row: 3 values in total, 2 skipped, 1 left, not null.
    size_t remaining_rows = 1;
    std::vector<uint8_t> row_filter(remaining_rows, 1);
    std::vector<uint16_t> null_runs(1, remaining_rows);
    ColumnSelectVector selector;
    selector.build(row_filter.data(), row_filter.size(), false);
    selector.set_run_length_null_map(null_runs, remaining_rows, nullptr);

    // Run the decoder on what is left.
    ASSERT_TRUE(decoder.decode_values(column, data_type, selector, false).ok());

    ASSERT_EQ(column->size(), remaining_rows);
    auto* strings = assert_cast<ColumnString*>(column.get());

    EXPECT_EQ(strings->get_data_at(0).to_string(), "cherry");
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -0,0 +1,395 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/byte_stream_split_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
// Fixture for the ByteStreamSplitDecoder tests; it only provides the decoder under test.
class ByteStreamSplitDecoderTest : public ::testing::Test {
protected:
    // Decoder under test. Each test supplies its own data and type length.
    ByteStreamSplitDecoder _decoder;
};
|
||||
|
||||
// Test basic decoding functionality for FLOAT type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_float) {
    // Encodes three floats in byte-stream-split layout, decodes all of them and checks
    // that they round-trip.
    const size_t type_length = sizeof(float);
    const size_t num_encoded = 3;
    const size_t data_size = num_encoded * type_length;
    const float values[3] = {1.0f, 2.0f, 3.0f};

    // Byte j of value i is stored at data[j * num_encoded + i], i.e. all first bytes,
    // then all second bytes, and so on. Indices are size_t to match the size_t bounds
    // (the original compared int indices against size_t).
    auto data = std::make_unique<uint8_t[]>(data_size);
    for (size_t i = 0; i < num_encoded; ++i) {
        const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values[i]);
        for (size_t j = 0; j < type_length; ++j) {
            data[j * num_encoded + i] = bytes[j];
        }
    }
    Slice data_slice(data.get(), data_size);

    MutableColumnPtr column = ColumnFloat32::create();
    DataTypePtr data_type = std::make_shared<DataTypeFloat32>();

    // Hand the encoded bytes to the decoder, then tell it the width of one value.
    ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
    _decoder.set_type_length(type_length);

    // Select all 3 rows; one run spanning every row marks them all as non-null.
    size_t num_values = 3;
    std::vector<uint16_t> run_length_null_map(1, num_values);
    std::vector<uint8_t> filter_data(num_values, 1);
    ColumnSelectVector select_vector;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);

    // Perform decoding
    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    // Verify results
    ASSERT_EQ(column->size(), num_values);
    auto* result_column = assert_cast<ColumnFloat32*>(column.get());
    EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
    EXPECT_FLOAT_EQ(result_column->get_data()[1], 2.0f);
    EXPECT_FLOAT_EQ(result_column->get_data()[2], 3.0f);
}
|
||||
|
||||
// Test basic decoding functionality for DOUBLE type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_double) {
    // Encodes three doubles in byte-stream-split layout, decodes all of them and checks
    // that they round-trip.
    const size_t type_length = sizeof(double);
    const size_t num_encoded = 3;
    const size_t data_size = num_encoded * type_length;
    const double values[3] = {1.0, 2.0, 3.0};

    // Byte j of value i is stored at data[j * num_encoded + i], i.e. all first bytes,
    // then all second bytes, and so on. Indices are size_t to match the size_t bounds
    // (the original compared int indices against size_t).
    auto data = std::make_unique<uint8_t[]>(data_size);
    for (size_t i = 0; i < num_encoded; ++i) {
        const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values[i]);
        for (size_t j = 0; j < type_length; ++j) {
            data[j * num_encoded + i] = bytes[j];
        }
    }
    Slice data_slice(data.get(), data_size);

    MutableColumnPtr column = ColumnFloat64::create();
    DataTypePtr data_type = std::make_shared<DataTypeFloat64>();

    // Hand the encoded bytes to the decoder, then tell it the width of one value.
    ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
    _decoder.set_type_length(type_length);

    // Select all 3 rows; one run spanning every row marks them all as non-null.
    size_t num_values = 3;
    std::vector<uint16_t> run_length_null_map(1, num_values);
    std::vector<uint8_t> filter_data(num_values, 1);
    ColumnSelectVector select_vector;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);

    // Perform decoding
    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    // Verify results
    ASSERT_EQ(column->size(), num_values);
    auto* result_column = assert_cast<ColumnFloat64*>(column.get());
    EXPECT_DOUBLE_EQ(result_column->get_data()[0], 1.0);
    EXPECT_DOUBLE_EQ(result_column->get_data()[1], 2.0);
    EXPECT_DOUBLE_EQ(result_column->get_data()[2], 3.0);
}
|
||||
|
||||
// Test decoding with filter for FLOAT type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_float) {
    // Encodes three floats in byte-stream-split layout and decodes them with a row filter
    // that drops the middle value.
    const size_t type_length = sizeof(float);
    const size_t num_encoded = 3;
    const size_t data_size = num_encoded * type_length;
    const float values[3] = {1.0f, 2.0f, 3.0f};

    // Byte j of value i is stored at data[j * num_encoded + i], i.e. all first bytes,
    // then all second bytes, and so on. Indices are size_t to match the size_t bounds
    // (the original compared int indices against size_t).
    auto data = std::make_unique<uint8_t[]>(data_size);
    for (size_t i = 0; i < num_encoded; ++i) {
        const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values[i]);
        for (size_t j = 0; j < type_length; ++j) {
            data[j * num_encoded + i] = bytes[j];
        }
    }
    Slice data_slice(data.get(), data_size);

    MutableColumnPtr column = ColumnFloat32::create();
    DataTypePtr data_type = std::make_shared<DataTypeFloat32>();

    // Hand the encoded bytes to the decoder, then tell it the width of one value.
    ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
    _decoder.set_type_length(type_length);

    // Row filter [1, 0, 1]; one run spanning every row marks them all as non-null.
    size_t num_values = 3;
    std::vector<uint16_t> run_length_null_map(1, num_values);
    std::vector<uint8_t> filter_data = {1, 0, 1};
    ColumnSelectVector select_vector;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);

    // Perform decoding
    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    // Verify results: 2.0f was filtered out.
    ASSERT_EQ(column->size(), 2); // 2 values after filtering
    auto* result_column = assert_cast<ColumnFloat32*>(column.get());
    EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
    EXPECT_FLOAT_EQ(result_column->get_data()[1], 3.0f);
}
|
||||
|
||||
// Test decoding with filter for DOUBLE type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_double) {
    // Encodes three doubles in byte-stream-split layout and decodes them with a row
    // filter that drops the middle value.
    const size_t type_length = sizeof(double);
    const size_t num_encoded = 3;
    const size_t data_size = num_encoded * type_length;
    const double values[3] = {1.0, 2.0, 3.0};

    // Byte j of value i is stored at data[j * num_encoded + i], i.e. all first bytes,
    // then all second bytes, and so on. Indices are size_t to match the size_t bounds
    // (the original compared int indices against size_t).
    auto data = std::make_unique<uint8_t[]>(data_size);
    for (size_t i = 0; i < num_encoded; ++i) {
        const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values[i]);
        for (size_t j = 0; j < type_length; ++j) {
            data[j * num_encoded + i] = bytes[j];
        }
    }
    Slice data_slice(data.get(), data_size);

    MutableColumnPtr column = ColumnFloat64::create();
    DataTypePtr data_type = std::make_shared<DataTypeFloat64>();

    // Hand the encoded bytes to the decoder, then tell it the width of one value.
    ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
    _decoder.set_type_length(type_length);

    // Row filter [1, 0, 1]; one run spanning every row marks them all as non-null.
    size_t num_values = 3;
    std::vector<uint16_t> run_length_null_map(1, num_values);
    std::vector<uint8_t> filter_data = {1, 0, 1};
    ColumnSelectVector select_vector;
    select_vector.build(filter_data.data(), filter_data.size(), false);
    select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);

    // Perform decoding
    ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());

    // Verify results: 2.0 was filtered out.
    ASSERT_EQ(column->size(), 2); // 2 values after filtering
    auto* result_column = assert_cast<ColumnFloat64*>(column.get());
    EXPECT_DOUBLE_EQ(result_column->get_data()[0], 1.0);
    EXPECT_DOUBLE_EQ(result_column->get_data()[1], 3.0);
}
|
||||
|
||||
// Test decoding with filter and null for FLOAT type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_float) {
|
||||
// Prepare test data for FLOAT type
|
||||
size_t type_length_float = sizeof(float);
|
||||
size_t num_values_float = 2;
|
||||
size_t data_size_float = num_values_float * type_length_float;
|
||||
auto data_float = std::make_unique<uint8_t[]>(data_size_float);
|
||||
const float values_float[2] = {1.0f, 3.0f};
|
||||
for (int i = 0; i < num_values_float; i++) {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_float[i]);
|
||||
for (int j = 0; j < type_length_float; j++) {
|
||||
data_float[j * num_values_float + i] = bytes[j];
|
||||
}
|
||||
}
|
||||
Slice data_slice_float(data_float.get(), data_size_float);
|
||||
|
||||
MutableColumnPtr column = ColumnFloat32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeFloat32>();
|
||||
|
||||
// Set data for FLOAT type
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok());
|
||||
_decoder.set_type_length(type_length_float);
|
||||
|
||||
// Create filter vector [1, 0, 1] and null vector [0, 1, 0]
|
||||
size_t num_values = 3;
|
||||
std::vector<uint16_t> run_length_null_map = {1, 1, 1}; // data: [1.0f, null, 3.0f]
|
||||
std::vector<uint8_t> filter_data = {0, 1, 1}; // filtered_data: [null, 3.0f]
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnFloat32*>(column.get());
|
||||
// EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
|
||||
// EXPECT_FLOAT_EQ(result_column->get_data()[1], 3.0f);
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<float>> expected_values = {std::nullopt, 3.0f};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_FLOAT_EQ(result_column->get_data()[i], expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding with filter and null for DOUBLE type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_double) {
|
||||
// Prepare test data for DOUBLE type
|
||||
size_t type_length_double = sizeof(double);
|
||||
size_t num_values_double = 2;
|
||||
size_t data_size_double = num_values_double * type_length_double;
|
||||
auto data_double = std::make_unique<uint8_t[]>(data_size_double);
|
||||
const double values_double[2] = {1.0, 3.0};
|
||||
for (int i = 0; i < num_values_double; i++) {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_double[i]);
|
||||
for (int j = 0; j < type_length_double; j++) {
|
||||
data_double[j * num_values_double + i] = bytes[j];
|
||||
}
|
||||
}
|
||||
Slice data_slice_double(data_double.get(), data_size_double);
|
||||
|
||||
MutableColumnPtr column = ColumnFloat64::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeFloat64>();
|
||||
|
||||
// Set data for DOUBLE type
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok());
|
||||
_decoder.set_type_length(type_length_double);
|
||||
|
||||
// Create filter vector [1, 0, 1] and null vector [0, 1, 0]
|
||||
size_t num_values = 3;
|
||||
std::vector<uint16_t> run_length_null_map = {1, 1, 1}; // data: [1.0f, null, 3.0f]
|
||||
std::vector<uint8_t> filter_data = {0, 1, 1}; // filtered_data: [null, 3.0f]
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnFloat64*>(column.get());
|
||||
// EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f);
|
||||
// EXPECT_FLOAT_EQ(result_column->get_data()[1], 3.0f);
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<float>> expected_values = {std::nullopt, 3.0f};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_FLOAT_EQ(result_column->get_data()[i], expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test skipping values for FLOAT type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_skip_value_float) {
|
||||
// Prepare test data for FLOAT type
|
||||
size_t type_length_float = sizeof(float);
|
||||
size_t num_values_float = 3;
|
||||
size_t data_size_float = num_values_float * type_length_float;
|
||||
auto data_float = std::make_unique<uint8_t[]>(data_size_float);
|
||||
const float values_float[3] = {1.0f, 2.0f, 3.0f};
|
||||
for (int i = 0; i < num_values_float; i++) {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_float[i]);
|
||||
for (int j = 0; j < type_length_float; j++) {
|
||||
data_float[j * num_values_float + i] = bytes[j];
|
||||
}
|
||||
}
|
||||
Slice data_slice_float(data_float.get(), data_size_float);
|
||||
|
||||
MutableColumnPtr column = ColumnFloat32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeFloat32>();
|
||||
|
||||
// Set data for FLOAT type
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok());
|
||||
_decoder.set_type_length(type_length_float);
|
||||
|
||||
// Skip first 2 values
|
||||
ASSERT_TRUE(_decoder.skip_values(2).ok());
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = 1; // Total 3 values, skip 2, remaining 1
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnFloat32*>(column.get());
|
||||
EXPECT_FLOAT_EQ(result_column->get_data()[0], 3.0f);
|
||||
}
|
||||
|
||||
// Test skipping values for DOUBLE type
|
||||
TEST_F(ByteStreamSplitDecoderTest, test_skip_value_double) {
|
||||
// Prepare test data for DOUBLE type
|
||||
size_t type_length_double = sizeof(double);
|
||||
size_t num_values_double = 3;
|
||||
size_t data_size_double = num_values_double * type_length_double;
|
||||
auto data_double = std::make_unique<uint8_t[]>(data_size_double);
|
||||
const double values_double[3] = {1.0, 2.0, 3.0};
|
||||
for (int i = 0; i < num_values_double; i++) {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&values_double[i]);
|
||||
for (int j = 0; j < type_length_double; j++) {
|
||||
data_double[j * num_values_double + i] = bytes[j];
|
||||
}
|
||||
}
|
||||
Slice data_slice_double(data_double.get(), data_size_double);
|
||||
|
||||
MutableColumnPtr column = ColumnFloat64::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeFloat64>();
|
||||
|
||||
// Set data for DOUBLE type
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok());
|
||||
_decoder.set_type_length(type_length_double);
|
||||
|
||||
// Skip first 2 values
|
||||
ASSERT_TRUE(_decoder.skip_values(2).ok());
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = 1; // Total 3 values, skip 2, remaining 1
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnFloat64*>(column.get());
|
||||
EXPECT_DOUBLE_EQ(result_column->get_data()[0], 3.0);
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
265
be/test/vec/exec/format/parquet/delta_bit_pack_decoder_test.cpp
Normal file
265
be/test/vec/exec/format/parquet/delta_bit_pack_decoder_test.cpp
Normal file
@ -0,0 +1,265 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
#include "vec/data_types/data_type_string.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class DeltaBitPackDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override { _decoder = std::make_unique<DeltaBitPackDecoder<int32_t>>(); }
|
||||
|
||||
std::unique_ptr<DeltaBitPackDecoder<int32_t>> _decoder;
|
||||
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
TEST_F(DeltaBitPackDecoderTest, test_basic_decode) {
|
||||
// Prepare encoded data
|
||||
std::vector<uint8_t> encoded_data = {
|
||||
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=5, first_value=10
|
||||
0x80, 0x01, 0x04, 0x05, 0x14,
|
||||
// Block: min_delta=1, bit_width=[0, 0, 0, 0]
|
||||
0x02, 0x00, 0x00, 0x00, 0x00
|
||||
// MiniBlocks: no data needed for bit_width 0
|
||||
};
|
||||
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create selection vector without filter
|
||||
size_t num_values = 5;
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnInt32*>(column.get());
|
||||
EXPECT_EQ(result_column->get_data()[0], 10);
|
||||
EXPECT_EQ(result_column->get_data()[1], 11);
|
||||
EXPECT_EQ(result_column->get_data()[2], 12);
|
||||
EXPECT_EQ(result_column->get_data()[3], 13);
|
||||
EXPECT_EQ(result_column->get_data()[4], 14);
|
||||
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(DeltaBitPackDecoderTest, test_decode_with_filter) {
|
||||
// Prepare encoded data
|
||||
std::vector<uint8_t> encoded_data = {
|
||||
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=5, first_value=10
|
||||
0x80, 0x01, 0x04, 0x05, 0x14,
|
||||
// Block: min_delta=1, bit_width=[0, 0, 0, 0]
|
||||
0x02, 0x00, 0x00, 0x00, 0x00
|
||||
// MiniBlocks: no data needed for bit_width 0
|
||||
};
|
||||
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create filter vector [1,0,1,0,1]
|
||||
size_t num_values = 5;
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 3); // 3 values after filtering
|
||||
auto* result_column = assert_cast<ColumnInt32*>(column.get());
|
||||
EXPECT_EQ(result_column->get_data()[0], 10);
|
||||
EXPECT_EQ(result_column->get_data()[1], 12);
|
||||
EXPECT_EQ(result_column->get_data()[2], 14);
|
||||
}
|
||||
|
||||
// Test decoding with filter and null values
|
||||
TEST_F(DeltaBitPackDecoderTest, test_decode_with_filter_and_null) {
|
||||
std::vector<uint8_t> encoded_data = {
|
||||
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=4, first_value=10
|
||||
0x80, 0x01, 0x04, 0x04, 0x14,
|
||||
// Block: min_delta=1, bit_width=[1, 0, 0, 0]
|
||||
0x02, 0x01, 0x00, 0x00, 0x00,
|
||||
// MiniBlocks
|
||||
0x02, 0x00, 0x00, 0x00};
|
||||
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create filter vector [1,0,1,0,1] and null vector [0,0,1,0,0]
|
||||
size_t num_values = 5;
|
||||
std::vector<uint16_t> run_length_null_map = {2, 1, 2}; // data: [10 11 null 13 14]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1}; // filtered_data: [10 null 14]
|
||||
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 3); // 3 values after filtering
|
||||
auto* result_column = assert_cast<ColumnInt32*>(column.get());
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<int32_t>> expected_values = {10, std::nullopt, 14};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test skipping values for delta bit pack decoding
|
||||
TEST_F(DeltaBitPackDecoderTest, test_skip_value) {
|
||||
// Prepare encoded data
|
||||
std::vector<uint8_t> encoded_data = {
|
||||
// Header: block_size=128, mini_blocks_per_block=4, total_value_count=8, first_value=10
|
||||
0x80, 0x01, 0x04, 0x08, 0x14,
|
||||
// Block: min_delta=1, bit_width=[0, 0, 0, 0]
|
||||
0x02, 0x00, 0x00, 0x00, 0x00
|
||||
// MiniBlocks: no data needed for bit_width 0
|
||||
};
|
||||
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Skip first 3 values
|
||||
ASSERT_TRUE(_decoder->skip_values(3).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = 5; // Total 8 values, skip 3, remaining 5
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnInt32*>(column.get());
|
||||
|
||||
// Expected values after skipping first 3 values (10,11,12)
|
||||
std::vector<int32_t> expected_values = {13, 14, 15, 16, 17};
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
EXPECT_EQ(result_column->get_data()[i], expected_values[i]) << "Mismatch at value " << i;
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding data generated by arrow
|
||||
TEST_F(DeltaBitPackDecoderTest, test_data_generated_by_arrow) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::INT32);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<int32_t> values = {10, 11, 13, 14};
|
||||
|
||||
// Create encoder
|
||||
auto encoder = MakeTypedEncoder<parquet::Int32Type>(parquet::Encoding::DELTA_BINARY_PACKED,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(encoder->Put(values.data(), static_cast<int>(values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = values.size();
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnInt32*>(column.get());
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
EXPECT_EQ(result_column->get_data()[i], values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Test invalid data case
|
||||
TEST_F(DeltaBitPackDecoderTest, test_invalid_data) {
|
||||
// Prepare invalid encoded data
|
||||
std::vector<uint8_t> encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data
|
||||
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
size_t num_values = 5;
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values);
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Decoding should fail due to invalid data
|
||||
ASSERT_FALSE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -0,0 +1,588 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <arrow/array.h>
|
||||
#include <arrow/builder.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "arrow/api.h"
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
#include "vec/data_types/data_type_string.h"
|
||||
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class DeltaByteArrayDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override { _decoder = std::make_unique<DeltaByteArrayDecoder>(); }
|
||||
|
||||
std::unique_ptr<DeltaByteArrayDecoder> _decoder;
|
||||
};
|
||||
|
||||
// Test basic decoding byte array functionality
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_byte_array) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Create encoder
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = values.size();
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding byte array with filter
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_decode_byte_array_with_filter) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Create encoder
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create filter vector [1, 0, 1, 0]
|
||||
size_t num_values = values.size();
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
EXPECT_EQ(result_column->get_data_at(0).to_string(), "Hello");
|
||||
EXPECT_EQ(result_column->get_data_at(1).to_string(), "Foobar");
|
||||
}
|
||||
|
||||
// Test decoding byte array with filter and null values
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_decode_byte_array_with_filter_and_null) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<std::string> values = {"Hello", "World", "ABCDEF"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Create encoder
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create filter vector [1, 0, 1, 0] and null vector [0, 0, 1, 0]
|
||||
size_t num_values = 4;
|
||||
std::vector<uint16_t> run_length_null_map = {2, 1,
|
||||
1}; // data: ["Hello", "World", null, "ABCDEF"]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0}; // filtered_data: ["Hello", null]
|
||||
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<std::string>> expected_values = {"Hello", std::nullopt};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test skipping values for byte array decoding
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_skip_value_for_byte_array) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare test data
|
||||
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Encode data
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
|
||||
// Set decoder data
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Skip the first two values
|
||||
ASSERT_TRUE(_decoder->skip_values(2).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = values.size() - 2; // Skip first two values
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values);
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
|
||||
// Verify decoded results (should start from the third value)
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i + 2])
|
||||
<< "Mismatch at value " << (i + 2);
|
||||
}
|
||||
}
|
||||
|
||||
// Test basic decoding fixed-length byte array functionality
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_fixed_len_byte_array) {
|
||||
// Configure DECIMAL type parameters
|
||||
const int32_t type_length = 16;
|
||||
int precision = 10;
|
||||
int scale = 2;
|
||||
_decoder->set_type_length(type_length);
|
||||
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make(
|
||||
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
|
||||
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare test data
|
||||
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
|
||||
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
|
||||
0x61, 0x40}, // Data 1
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00}, // Data 2 (all zeros)
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF}, // Data 3 (all ones)
|
||||
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
|
||||
0xDE, 0xF0} // Data 4 (random)
|
||||
};
|
||||
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& buffer : test_fixed_len_buffers) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
|
||||
}
|
||||
|
||||
// Encode data
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
|
||||
// Set decoder data
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = test_fixed_len_buffers.size();
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values);
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values * type_length);
|
||||
auto* result_column = assert_cast<ColumnInt8*>(column.get());
|
||||
|
||||
// Verify decoded results one by one
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
for (size_t j = 0; j < type_length; ++j) {
|
||||
size_t index = i * type_length + j;
|
||||
EXPECT_EQ(result_column->get_element(index),
|
||||
static_cast<int8_t>(test_fixed_len_buffers[i][j]))
|
||||
<< "Mismatch at buffer " << i << ", byte " << j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding fixed-length byte array with filter
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter) {
|
||||
// Configure DECIMAL type parameters
|
||||
const int32_t type_length = 16;
|
||||
int precision = 10;
|
||||
int scale = 2;
|
||||
_decoder->set_type_length(type_length);
|
||||
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make(
|
||||
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
|
||||
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare test data
|
||||
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
|
||||
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
|
||||
0x61, 0x40}, // Data 1
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00}, // Data 2 (all zeros)
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF}, // Data 3 (all ones)
|
||||
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
|
||||
0xDE, 0xF0} // Data 4 (random)
|
||||
};
|
||||
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& buffer : test_fixed_len_buffers) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
|
||||
}
|
||||
|
||||
// Encode data
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
|
||||
// Set decoder data
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
|
||||
|
||||
// Create filter [1, 0, 1, 0]
|
||||
size_t num_values = test_fixed_len_buffers.size();
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values);
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2 * type_length); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnInt8*>(column.get());
|
||||
|
||||
// Verify first value
|
||||
for (size_t j = 0; j < type_length; ++j) {
|
||||
EXPECT_EQ(result_column->get_element(j), static_cast<int8_t>(test_fixed_len_buffers[0][j]))
|
||||
<< "Mismatch at buffer 0, byte " << j;
|
||||
}
|
||||
|
||||
// Verify third value
|
||||
for (size_t j = 0; j < type_length; ++j) {
|
||||
size_t index = type_length + j;
|
||||
EXPECT_EQ(result_column->get_element(index),
|
||||
static_cast<int8_t>(test_fixed_len_buffers[2][j]))
|
||||
<< "Mismatch at buffer 2, byte " << j;
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding fixed-length byte array with filter and null values
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter_and_null) {
|
||||
// Configure DECIMAL type parameters
|
||||
const int32_t type_length = 16;
|
||||
int precision = 10;
|
||||
int scale = 2;
|
||||
_decoder->set_type_length(type_length);
|
||||
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make(
|
||||
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
|
||||
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare test data
|
||||
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
|
||||
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
|
||||
0x61, 0x40}, // Data 1
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00}, // Data 2 (all zeros)
|
||||
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
|
||||
0xDE, 0xF0} // Data 4 (random)
|
||||
};
|
||||
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& buffer : test_fixed_len_buffers) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
|
||||
}
|
||||
|
||||
// Encode data
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
|
||||
// Set decoder data
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
|
||||
|
||||
// Create filter [1, 0, 1, 0] and null vector [0, 0, 1, 0]
|
||||
size_t num_values = 4;
|
||||
std::vector<uint16_t> run_length_null_map = {2, 1, 1}; // Data: [Data 1, Data 2, null, Data 4]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0}; // Filtered data: [Data 1, null]
|
||||
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2 * type_length); // 2 values after filtering (Data 1 and null)
|
||||
auto* result_column = assert_cast<ColumnInt8*>(column.get());
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<std::vector<uint8_t>>> expected_values;
|
||||
expected_values.push_back(std::vector<uint8_t> {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13,
|
||||
0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x61,
|
||||
0x40}); // Data 1
|
||||
expected_values.push_back(std::nullopt); // Only filtered values (Data 1 and null)
|
||||
|
||||
// Verify results
|
||||
size_t filtered_index = 0;
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
for (size_t j = 0; j < type_length; ++j) {
|
||||
size_t index = filtered_index * type_length + j;
|
||||
EXPECT_EQ(result_column->get_element(index),
|
||||
static_cast<int8_t>(expected_values[i].value()[j]))
|
||||
<< "Mismatch at filtered value " << i << ", byte " << j;
|
||||
}
|
||||
EXPECT_FALSE(null_map[filtered_index])
|
||||
<< "Expected non-null at filtered position " << filtered_index;
|
||||
filtered_index++;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[filtered_index])
|
||||
<< "Expected null at filtered position " << filtered_index;
|
||||
filtered_index++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test skipping values for fixed-length byte array decoding
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_skip_value_for_fixed_len_byte_array) {
|
||||
// Configure DECIMAL type parameters
|
||||
const int32_t type_length = 16;
|
||||
int precision = 10;
|
||||
int scale = 2;
|
||||
_decoder->set_type_length(type_length);
|
||||
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make(
|
||||
"test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY,
|
||||
parquet::ConvertedType::DECIMAL, type_length, precision, scale);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare test data
|
||||
std::vector<std::vector<uint8_t>> test_fixed_len_buffers = {
|
||||
{0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc,
|
||||
0x61, 0x40}, // Data 1
|
||||
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
|
||||
0x00, 0x00}, // Data 2 (all zeros)
|
||||
{0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF}, // Data 3 (all ones)
|
||||
{0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC,
|
||||
0xDE, 0xF0} // Data 4 (random)
|
||||
};
|
||||
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& buffer : test_fixed_len_buffers) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(buffer.size()), buffer.data()});
|
||||
}
|
||||
|
||||
// Encode data
|
||||
auto encoder = MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
|
||||
// Set decoder data
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Skip the first two values
|
||||
ASSERT_TRUE(_decoder->skip_values(2).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt8>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = test_fixed_len_buffers.size() - 2; // Skip first two values
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values);
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values * type_length);
|
||||
auto* result_column = assert_cast<ColumnInt8*>(column.get());
|
||||
|
||||
// Verify decoded results (should start from the third value)
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
for (size_t j = 0; j < type_length; ++j) {
|
||||
size_t index = i * type_length + j;
|
||||
EXPECT_EQ(result_column->get_element(index),
|
||||
static_cast<int8_t>(test_fixed_len_buffers[i + 2][j]))
|
||||
<< "Mismatch at buffer " << (i + 2) << ", byte " << j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding with invalid data
|
||||
TEST_F(DeltaByteArrayDecoderTest, test_invalid_data) {
|
||||
// Prepare invalid encoded data
|
||||
std::vector<uint8_t> encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data
|
||||
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
ASSERT_FALSE(_decoder->set_data(&data_slice).ok());
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -0,0 +1,276 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
#include "vec/data_types/data_type_string.h"
|
||||
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class DeltaLengthByteArrayDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override { _decoder = std::make_unique<DeltaLengthByteArrayDecoder>(); }
|
||||
|
||||
std::unique_ptr<DeltaLengthByteArrayDecoder> _decoder;
|
||||
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
TEST_F(DeltaLengthByteArrayDecoderTest, test_basic_decode) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Create encoder
|
||||
auto encoder =
|
||||
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = values.size();
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(DeltaLengthByteArrayDecoderTest, test_decode_with_filter) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Create encoder
|
||||
auto encoder =
|
||||
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create filter vector [1, 0, 1, 0]
|
||||
size_t num_values = values.size();
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
EXPECT_EQ(result_column->get_data_at(0).to_string(), "Hello");
|
||||
EXPECT_EQ(result_column->get_data_at(1).to_string(), "Foobar");
|
||||
}
|
||||
|
||||
// Test decoding with filter and null values
|
||||
TEST_F(DeltaLengthByteArrayDecoderTest, test_decode_with_filter_and_null) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<std::string> values = {"Hello", "World", "ABCDEF"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Create encoder
|
||||
auto encoder =
|
||||
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create filter vector [1, 0, 1, 0] and null vector [0, 0, 1, 0]
|
||||
size_t num_values = 4;
|
||||
std::vector<uint16_t> run_length_null_map = {2, 1,
|
||||
1}; // data: ["Hello", "World", null, "ABCDEF"]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0}; // filtered_data: ["Hello", null]
|
||||
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<std::string>> expected_values = {"Hello", std::nullopt};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding with invalid data
|
||||
TEST_F(DeltaLengthByteArrayDecoderTest, test_invalid_data) {
|
||||
// Prepare invalid encoded data
|
||||
std::vector<uint8_t> encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data
|
||||
Slice data_slice(reinterpret_cast<char*>(encoded_data.data()), encoded_data.size());
|
||||
ASSERT_FALSE(_decoder->set_data(&data_slice).ok());
|
||||
}
|
||||
|
||||
// Test skipping values for delta length byte array decoding
|
||||
TEST_F(DeltaLengthByteArrayDecoderTest, test_skip_value) {
|
||||
// Create ColumnDescriptor
|
||||
auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED,
|
||||
parquet::Type::BYTE_ARRAY);
|
||||
auto descr = std::make_shared<parquet::ColumnDescriptor>(node, 0, 0);
|
||||
|
||||
// Prepare original data
|
||||
std::vector<std::string> values = {"Hello", "World", "Foobar", "ABCDEF", "Test", "Skip"};
|
||||
std::vector<parquet::ByteArray> byte_array_values;
|
||||
for (const auto& value : values) {
|
||||
byte_array_values.emplace_back(
|
||||
parquet::ByteArray {static_cast<uint32_t>(value.size()),
|
||||
reinterpret_cast<const uint8_t*>(value.data())});
|
||||
}
|
||||
|
||||
// Create encoder
|
||||
auto encoder =
|
||||
MakeTypedEncoder<parquet::ByteArrayType>(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY,
|
||||
/*use_dictionary=*/false, descr.get());
|
||||
|
||||
// Put data into encoder
|
||||
ASSERT_NO_THROW(
|
||||
encoder->Put(byte_array_values.data(), static_cast<int>(byte_array_values.size())));
|
||||
|
||||
// Get encoded data
|
||||
auto encoded_buffer = encoder->FlushValues();
|
||||
Slice data_slice(encoded_buffer->data(), encoded_buffer->size());
|
||||
ASSERT_TRUE(_decoder->set_data(&data_slice).ok());
|
||||
|
||||
// Skip first 3 values
|
||||
ASSERT_TRUE(_decoder->skip_values(3).ok());
|
||||
|
||||
// Create column and data type
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeString>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = values.size() - 3; // Total 6 values, skip 3, remaining 3
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
|
||||
// Expected values after skipping first 3 values ("Hello", "World", "Foobar")
|
||||
std::vector<std::string> expected_values = {"ABCDEF", "Test", "Skip"};
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i])
|
||||
<< "Mismatch at value " << i;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
538
be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp
Normal file
538
be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp
Normal file
@ -0,0 +1,538 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/fix_length_dict_decoder.hpp"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class FixLengthDictDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {
|
||||
// Prepare test data: create a dictionary with fixed-length strings
|
||||
_type_length = 6; // Each string has length 6
|
||||
size_t dict_size = 3;
|
||||
size_t dict_data_size = dict_size * _type_length;
|
||||
|
||||
auto dict_data = std::make_unique<uint8_t[]>(dict_data_size);
|
||||
const char* values[3] = {"apple ", "banana", "cherry"}; // Dictionary values
|
||||
for (int i = 0; i < 3; i++) {
|
||||
memcpy(dict_data.get() + i * _type_length, values[i], _type_length);
|
||||
}
|
||||
|
||||
_decoder.set_type_length(_type_length);
|
||||
ASSERT_TRUE(_decoder.set_dict(dict_data, dict_data_size, dict_size).ok());
|
||||
}
|
||||
|
||||
FixLengthDictDecoder _decoder;
|
||||
size_t _type_length;
|
||||
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
TEST_F(FixLengthDictDecoderTest, test_basic_decode) {
|
||||
MutableColumnPtr column = ColumnUInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
|
||||
size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values * _type_length);
|
||||
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
|
||||
|
||||
// Split decoded results into strings based on _type_length
|
||||
std::vector<std::string> decoded_strings;
|
||||
const auto& data = result_column->get_data();
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
std::string str;
|
||||
for (size_t j = 0; j < _type_length; ++j) {
|
||||
str.push_back(static_cast<char>(data[i * _type_length + j]));
|
||||
}
|
||||
decoded_strings.push_back(str);
|
||||
}
|
||||
|
||||
// Verify first 4 repeated values (dict index 0 -> value "apple ")
|
||||
for (int i = 0; i < 4; i++) {
|
||||
EXPECT_EQ(decoded_strings[i], "apple ");
|
||||
}
|
||||
|
||||
// Verify last 3 literal values
|
||||
EXPECT_EQ(decoded_strings[4], "banana");
|
||||
EXPECT_EQ(decoded_strings[5], "cherry");
|
||||
EXPECT_EQ(decoded_strings[6], "banana");
|
||||
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_filter) {
|
||||
MutableColumnPtr column = ColumnUInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
;
|
||||
|
||||
// Create filter vector [1,0,1,0,1,1,1]
|
||||
size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 5 * _type_length); // 5 values after filtering
|
||||
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
|
||||
|
||||
// Split decoded results into strings based on _type_length
|
||||
std::vector<std::string> decoded_strings;
|
||||
const auto& data = result_column->get_data();
|
||||
for (size_t i = 0; i < 5; ++i) {
|
||||
std::string str;
|
||||
for (size_t j = 0; j < _type_length; ++j) {
|
||||
str.push_back(static_cast<char>(data[i * _type_length + j]));
|
||||
}
|
||||
decoded_strings.push_back(str);
|
||||
}
|
||||
|
||||
// Verify filtered values
|
||||
EXPECT_EQ(decoded_strings[0], "apple ");
|
||||
EXPECT_EQ(decoded_strings[1], "apple ");
|
||||
EXPECT_EQ(decoded_strings[2], "banana");
|
||||
EXPECT_EQ(decoded_strings[3], "cherry");
|
||||
EXPECT_EQ(decoded_strings[4], "banana");
|
||||
}
|
||||
|
||||
// Test decoding with filter and null
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_filter_and_null) {
|
||||
MutableColumnPtr column = ColumnUInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
|
||||
size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
|
||||
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 5 * _type_length); // 5 values after filtering
|
||||
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
|
||||
|
||||
// Split decoded results into strings based on _type_length
|
||||
std::vector<std::string> decoded_strings;
|
||||
const auto& data = result_column->get_data();
|
||||
for (size_t i = 0; i < 5; ++i) {
|
||||
std::string str;
|
||||
for (size_t j = 0; j < _type_length; ++j) {
|
||||
str.push_back(static_cast<char>(data[i * _type_length + j]));
|
||||
}
|
||||
decoded_strings.push_back(str);
|
||||
}
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<std::string>> expected_values = {"apple ", "apple ", std::nullopt,
|
||||
"cherry", std::nullopt};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_EQ(decoded_strings[i], expected_values[i].value()) << "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test empty dictionary case
|
||||
TEST_F(FixLengthDictDecoderTest, test_empty_dict) {
|
||||
FixLengthDictDecoder empty_decoder;
|
||||
empty_decoder.set_type_length(sizeof(int32_t));
|
||||
|
||||
auto dict_data = std::make_unique<uint8_t[]>(0);
|
||||
ASSERT_TRUE(empty_decoder.set_dict(dict_data, 0, 0).ok());
|
||||
}
|
||||
|
||||
// Test decoding with ColumnDictI32
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32) {
|
||||
// Create ColumnDictI32 column
|
||||
MutableColumnPtr column = ColumnDictI32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
|
||||
const size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
|
||||
|
||||
// Verify first 4 repeated values (dict index 0 -> value "apple ")
|
||||
for (int i = 0; i < 4; i++) {
|
||||
EXPECT_EQ(dict_column->get_data()[i], 0);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(), "apple ");
|
||||
}
|
||||
|
||||
// Verify last 3 literal values
|
||||
EXPECT_EQ(dict_column->get_data()[4], 1);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana");
|
||||
EXPECT_EQ(dict_column->get_data()[5], 2);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[5]).to_string(), "cherry");
|
||||
EXPECT_EQ(dict_column->get_data()[6], 1);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[6]).to_string(), "banana");
|
||||
}
|
||||
|
||||
// Test decoding with ColumnDictI32 and filter
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32_with_filter) {
|
||||
// Create ColumnDictI32 column
|
||||
MutableColumnPtr column = ColumnDictI32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create filter vector [1,0,1,0,1,1,1]
|
||||
const size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 5); // 5 values after filtering
|
||||
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
|
||||
|
||||
// Verify filtered values
|
||||
EXPECT_EQ(dict_column->get_data()[0], 0);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[0]).to_string(), "apple ");
|
||||
EXPECT_EQ(dict_column->get_data()[1], 0);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[1]).to_string(), "apple ");
|
||||
EXPECT_EQ(dict_column->get_data()[2], 1);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[2]).to_string(), "banana");
|
||||
EXPECT_EQ(dict_column->get_data()[3], 2);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[3]).to_string(), "cherry");
|
||||
EXPECT_EQ(dict_column->get_data()[4], 1);
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana");
|
||||
}
|
||||
|
||||
// Test decoding with ColumnDictI32 with filter and null
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32_with_filter_and_null) {
|
||||
// Create ColumnDictI32 column
|
||||
MutableColumnPtr column = ColumnDictI32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
|
||||
const size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 5); // 5 values after filtering
|
||||
auto* dict_column = assert_cast<ColumnDictI32*>(column.get());
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<std::string>> expected_values = {"apple ", "apple ", std::nullopt,
|
||||
"cherry", std::nullopt};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(),
|
||||
expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test decoding with ColumnInt32
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32) {
|
||||
// Create ColumnInt32 column
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create selection vector without filter, total 7 values (4 repeated + 3 literal)
|
||||
const size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
|
||||
|
||||
// Verify first 4 repeated values (dict index 0 -> value "apple ")
|
||||
for (int i = 0; i < 4; i++) {
|
||||
EXPECT_EQ(dict_column->get_data()[i], 0);
|
||||
}
|
||||
|
||||
// Verify last 3 literal values
|
||||
EXPECT_EQ(dict_column->get_data()[4], 1);
|
||||
EXPECT_EQ(dict_column->get_data()[5], 2);
|
||||
EXPECT_EQ(dict_column->get_data()[6], 1);
|
||||
}
|
||||
|
||||
// Test decoding with ColumnInt32 and filter
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32_with_filter) {
|
||||
// Create ColumnInt32 column
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create filter vector [1,0,1,0,1,1,1]
|
||||
const size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map = {num_values}; // All non-null
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 5); // 5 values after filtering
|
||||
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
|
||||
|
||||
// Verify filtered values
|
||||
EXPECT_EQ(dict_column->get_data()[0], 0);
|
||||
EXPECT_EQ(dict_column->get_data()[1], 0);
|
||||
EXPECT_EQ(dict_column->get_data()[2], 1);
|
||||
EXPECT_EQ(dict_column->get_data()[3], 2);
|
||||
EXPECT_EQ(dict_column->get_data()[4], 1);
|
||||
}
|
||||
|
||||
// Test decoding with ColumnInt32 with filter and null
|
||||
TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32_with_filter_and_null) {
|
||||
// Create ColumnInt32 column
|
||||
MutableColumnPtr column = ColumnInt32::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00000010, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1]
|
||||
const size_t num_values = 7;
|
||||
std::vector<uint16_t> run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null]
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 5); // 5 values after filtering
|
||||
auto* dict_column = assert_cast<ColumnInt32*>(column.get());
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<int32_t>> expected_values = {0, 0, std::nullopt, 2, std::nullopt};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_EQ(dict_column->get_data()[i], expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test reading dictionary values to column
|
||||
TEST_F(FixLengthDictDecoderTest, test_read_dict_values_to_column) {
|
||||
// Create a column to store dictionary values
|
||||
MutableColumnPtr column = ColumnString::create();
|
||||
|
||||
// Read dictionary values to column
|
||||
ASSERT_TRUE(_decoder.read_dict_values_to_column(column).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 3); // 3 dictionary items
|
||||
auto* result_column = assert_cast<ColumnString*>(column.get());
|
||||
|
||||
// Get decoded strings directly
|
||||
std::vector<std::string> decoded_strings;
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
decoded_strings.push_back(result_column->get_data_at(i).to_string());
|
||||
}
|
||||
|
||||
// Verify dictionary values
|
||||
EXPECT_EQ(decoded_strings[0], "apple ");
|
||||
EXPECT_EQ(decoded_strings[1], "banana");
|
||||
EXPECT_EQ(decoded_strings[2], "cherry");
|
||||
}
|
||||
|
||||
// Test convert_dict_column_to_string_column function
|
||||
TEST_F(FixLengthDictDecoderTest, test_convert_dict_column_to_string_column) {
|
||||
// Create a ColumnInt32 with some dictionary codes
|
||||
MutableColumnPtr dict_column = ColumnInt32::create();
|
||||
dict_column->insert(0);
|
||||
dict_column->insert(1);
|
||||
dict_column->insert(2);
|
||||
dict_column->insert(1);
|
||||
|
||||
// Convert to string column
|
||||
MutableColumnPtr string_column = _decoder.convert_dict_column_to_string_column(
|
||||
assert_cast<ColumnInt32*>(dict_column.get()));
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(string_column->size(), 4);
|
||||
auto* result_column = assert_cast<ColumnString*>(string_column.get());
|
||||
|
||||
EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple ");
|
||||
EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana");
|
||||
EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry");
|
||||
EXPECT_EQ(result_column->get_data_at(3).to_string(), "banana");
|
||||
}
|
||||
|
||||
// Test skipping values for fixed length dictionary decoding
|
||||
TEST_F(FixLengthDictDecoderTest, test_skip_value) {
|
||||
MutableColumnPtr column = ColumnUInt8::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeUInt8>();
|
||||
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {2, 8, 0, 3, 0b00011001, 0};
|
||||
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
ASSERT_TRUE(_decoder.set_data(&data_slice).ok());
|
||||
|
||||
// Skip first 3 values
|
||||
ASSERT_TRUE(_decoder.skip_values(3).ok());
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = 4; // Total 7 values, skip 3, remaining 4
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values * _type_length);
|
||||
auto* result_column = assert_cast<ColumnUInt8*>(column.get());
|
||||
|
||||
// Split decoded results into strings based on _type_length
|
||||
std::vector<std::string> decoded_strings;
|
||||
const auto& data = result_column->get_data();
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
std::string str;
|
||||
for (size_t j = 0; j < _type_length; ++j) {
|
||||
str.push_back(static_cast<char>(data[i * _type_length + j]));
|
||||
}
|
||||
decoded_strings.push_back(str);
|
||||
}
|
||||
|
||||
// Expected values after skipping first 3 values ("apple ", "apple ", "apple ")
|
||||
std::vector<std::string> expected_values = {"apple ", "banana", "cherry", "banana"};
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
EXPECT_EQ(decoded_strings[i], expected_values[i]) << "Mismatch at value " << i;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
@ -0,0 +1,203 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/fix_length_plain_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "util/slice.h"
|
||||
#include "vec/columns/column_vector.h"
|
||||
#include "vec/data_types/data_type_number.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class FixLengthPlainDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override {}
|
||||
|
||||
std::unique_ptr<uint8_t[]> _data;
|
||||
Slice _data_slice;
|
||||
size_t _type_length;
|
||||
};
|
||||
|
||||
// Test basic decoding functionality
|
||||
TEST_F(FixLengthPlainDecoderTest, test_basic_decode) {
|
||||
// Prepare test data: create fixed-length integer values
|
||||
int32_t values[3] = {123, 456, 789};
|
||||
size_t data_size = sizeof(values);
|
||||
|
||||
_data = std::make_unique<uint8_t[]>(data_size);
|
||||
memcpy(_data.get(), values, data_size);
|
||||
|
||||
_data_slice = Slice(_data.get(), data_size);
|
||||
_type_length = sizeof(int32_t);
|
||||
|
||||
FixLengthPlainDecoder decoder;
|
||||
decoder.set_type_length(_type_length);
|
||||
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnVector<int32_t>::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create selection vector without filter
|
||||
size_t num_values = 3;
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
|
||||
|
||||
EXPECT_EQ(result_column->get_data()[0], 123);
|
||||
EXPECT_EQ(result_column->get_data()[1], 456);
|
||||
EXPECT_EQ(result_column->get_data()[2], 789);
|
||||
}
|
||||
|
||||
// Test decoding with filter
|
||||
TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter) {
|
||||
// Prepare test data: create fixed-length integer values
|
||||
int32_t values[3] = {123, 456, 789};
|
||||
size_t data_size = sizeof(values);
|
||||
|
||||
_data = std::make_unique<uint8_t[]>(data_size);
|
||||
memcpy(_data.get(), values, data_size);
|
||||
|
||||
_data_slice = Slice(_data.get(), data_size);
|
||||
_type_length = sizeof(int32_t);
|
||||
|
||||
FixLengthPlainDecoder decoder;
|
||||
decoder.set_type_length(_type_length);
|
||||
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnVector<int32_t>::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create filter vector [1,0,1]
|
||||
size_t num_values = 3;
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1};
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
|
||||
|
||||
EXPECT_EQ(result_column->get_data()[0], 123);
|
||||
EXPECT_EQ(result_column->get_data()[1], 789);
|
||||
}
|
||||
|
||||
// Test decoding with filter and null
|
||||
TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter_and_null) {
|
||||
// Prepare test data: create fixed-length integer values
|
||||
int32_t values[2] = {123, 789};
|
||||
size_t data_size = sizeof(values);
|
||||
|
||||
_data = std::make_unique<uint8_t[]>(data_size);
|
||||
memcpy(_data.get(), values, data_size);
|
||||
|
||||
_data_slice = Slice(_data.get(), data_size);
|
||||
_type_length = sizeof(int32_t);
|
||||
|
||||
FixLengthPlainDecoder decoder;
|
||||
decoder.set_type_length(_type_length);
|
||||
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnVector<int32_t>::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create filter vector [1,0,1] and null vector [0,1,0]
|
||||
size_t num_values = 3;
|
||||
std::vector<uint16_t> run_length_null_map = {1, 1, 1}; // data: [123, null, 789]
|
||||
std::vector<uint8_t> filter_data = {1, 0, 1}; // filtered_data: [123, 789]
|
||||
|
||||
ColumnSelectVector select_vector;
|
||||
NullMap null_map;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), 2); // 2 values after filtering
|
||||
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
|
||||
|
||||
// Expected values after filtering and null handling
|
||||
std::vector<std::optional<int32_t>> expected_values = {123, 789};
|
||||
for (size_t i = 0; i < expected_values.size(); ++i) {
|
||||
if (expected_values[i].has_value()) {
|
||||
EXPECT_EQ(result_column->get_data()[i], expected_values[i].value())
|
||||
<< "Mismatch at value " << i;
|
||||
EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i;
|
||||
} else {
|
||||
EXPECT_TRUE(null_map[i]) << "Expected null at position " << i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Test skipping values
|
||||
TEST_F(FixLengthPlainDecoderTest, test_skip_value) {
|
||||
// Prepare test data: create fixed-length integer values
|
||||
int32_t values[3] = {123, 456, 789};
|
||||
size_t data_size = sizeof(values);
|
||||
|
||||
_data = std::make_unique<uint8_t[]>(data_size);
|
||||
memcpy(_data.get(), values, data_size);
|
||||
|
||||
_data_slice = Slice(_data.get(), data_size);
|
||||
_type_length = sizeof(int32_t);
|
||||
|
||||
FixLengthPlainDecoder decoder;
|
||||
decoder.set_type_length(_type_length);
|
||||
ASSERT_TRUE(decoder.set_data(&_data_slice).ok());
|
||||
|
||||
// Skip first 2 values
|
||||
ASSERT_TRUE(decoder.skip_values(2).ok());
|
||||
|
||||
MutableColumnPtr column = ColumnVector<int32_t>::create();
|
||||
DataTypePtr data_type = std::make_shared<DataTypeInt32>();
|
||||
|
||||
// Create selection vector
|
||||
size_t num_values = 1; // Total 3 values, skip 2, remaining 1
|
||||
std::vector<uint16_t> run_length_null_map(1, num_values); // All non-null
|
||||
std::vector<uint8_t> filter_data(num_values, 1);
|
||||
ColumnSelectVector select_vector;
|
||||
select_vector.build(filter_data.data(), filter_data.size(), false);
|
||||
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr);
|
||||
|
||||
// Perform decoding
|
||||
ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok());
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(column->size(), num_values);
|
||||
auto* result_column = assert_cast<ColumnVector<int32_t>*>(column.get());
|
||||
|
||||
EXPECT_EQ(result_column->get_data()[0], 789);
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
225
be/test/vec/exec/format/parquet/level_decoder_test.cpp
Normal file
225
be/test/vec/exec/format/parquet/level_decoder_test.cpp
Normal file
@ -0,0 +1,225 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "vec/exec/format/parquet/level_decoder.h"
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "parquet/encoding.h"
|
||||
#include "parquet/schema.h"
|
||||
#include "parquet/types.h"
|
||||
#include "util/slice.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
class LevelDecoderTest : public ::testing::Test {
|
||||
protected:
|
||||
void SetUp() override { _decoder = std::make_unique<LevelDecoder>(); }
|
||||
|
||||
std::unique_ptr<LevelDecoder> _decoder;
|
||||
};
|
||||
|
||||
// Test basic RLE level decoding for data page v1
|
||||
TEST_F(LevelDecoderTest, test_rle_decode_v1) {
|
||||
// Prepare RLE encoded data
|
||||
// RLE encoded data: 4 zeros followed by 1, 2, 1 [0 0 0 0 1 2 1]
|
||||
std::vector<uint8_t> rle_data = {
|
||||
0x04, 0x00, 0x00, 0x00, // RLE length (4 bytes)
|
||||
8, 0, 3, 0b00011001 // RLE data
|
||||
};
|
||||
Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
|
||||
// Initialize decoder
|
||||
ASSERT_TRUE(_decoder->init(&data_slice, tparquet::Encoding::RLE, 2, 7).ok());
|
||||
|
||||
// Decode levels
|
||||
level_t levels[7];
|
||||
size_t num_levels = _decoder->get_levels(levels, 7);
|
||||
|
||||
// Verify results
|
||||
ASSERT_EQ(num_levels, 7);
|
||||
EXPECT_EQ(levels[0], 0);
|
||||
EXPECT_EQ(levels[1], 0);
|
||||
EXPECT_EQ(levels[2], 0);
|
||||
EXPECT_EQ(levels[3], 0);
|
||||
EXPECT_EQ(levels[4], 1);
|
||||
EXPECT_EQ(levels[5], 2);
|
||||
EXPECT_EQ(levels[6], 1);
|
||||
}
|
||||
|
||||
// Test basic BIT-PACKED level decoding for data page v1
|
||||
TEST_F(LevelDecoderTest, test_bit_packed_decode_v1) {
    // A single BIT-PACKED byte that encodes the levels [1 2 1].
    std::vector<uint8_t> encoded = {0b00011001};
    Slice input(reinterpret_cast<char*>(encoded.data()), encoded.size());

    // The decoder must accept this input.
    const auto init_status = _decoder->init(&input, tparquet::Encoding::BIT_PACKED, 2, 3);
    ASSERT_TRUE(init_status.ok());

    // Ask for all three levels in a single call.
    level_t levels[3];
    size_t num_levels = _decoder->get_levels(levels, 3);
    ASSERT_EQ(num_levels, 3);

    // Compare every decoded level with its expected value.
    const level_t expected[3] = {1, 2, 1};
    for (size_t i = 0; i < 3; ++i) {
        EXPECT_EQ(levels[i], expected[i]);
    }
}
|
||||
|
||||
// Test RLE level decoding for data page v2
|
||||
TEST_F(LevelDecoderTest, test_rle_decode_v2) {
    // Data page v2 input: the RLE payload is passed directly, without a length field.
    // It encodes four zeros followed by 1, 2, 1, padded to 8 values: [0 0 0 0 1 2 1].
    std::vector<uint8_t> encoded = {8, 0, 3, 0b00011001, 0};
    Slice input(reinterpret_cast<char*>(encoded.data()), encoded.size());

    // The decoder must accept this input.
    const auto init_status = _decoder->init_v2(input, 2, 7);
    ASSERT_TRUE(init_status.ok());

    // Ask for all seven levels in a single call.
    level_t levels[7];
    size_t num_levels = _decoder->get_levels(levels, 7);
    ASSERT_EQ(num_levels, 7);

    // Compare every decoded level with its expected value.
    const level_t expected[7] = {0, 0, 0, 0, 1, 2, 1};
    for (size_t i = 0; i < 7; ++i) {
        EXPECT_EQ(levels[i], expected[i]);
    }
}
|
||||
|
||||
// Test invalid RLE data for data page v1
|
||||
TEST_F(LevelDecoderTest, test_invalid_rle_data_v1) {
    // The length field announces 4 payload bytes, but only 3 bytes follow it.
    std::vector<uint8_t> truncated = {
            0x04, 0x00, 0x00, 0x00, // RLE length (4 bytes)
            8,    0,    3 // payload, one byte short of the announced length
    };
    Slice input(reinterpret_cast<char*>(truncated.data()), truncated.size());

    // Initialization is expected to report an error for this input.
    const auto init_status = _decoder->init(&input, tparquet::Encoding::RLE, 1, 8);
    ASSERT_FALSE(init_status.ok());
}
|
||||
|
||||
// TODO: The data page v2 invalid-data test below does not currently work, so it is commented out.
|
||||
// Test invalid RLE data for data page v2
|
||||
//TEST_F(LevelDecoderTest, test_invalid_rle_data_v2) {
|
||||
// // Prepare invalid RLE data
|
||||
// std::vector<uint8_t> rle_data = {8, 0, 3};
|
||||
// Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
|
||||
//
|
||||
// // Initialize decoder should fail
|
||||
// ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
|
||||
//
|
||||
// // Decode levels
|
||||
// level_t levels[7];
|
||||
// size_t num_levels = _decoder->get_levels(levels, 7);
|
||||
//
|
||||
// // Verify results
|
||||
// ASSERT_EQ(num_levels, 7);
|
||||
//}
|
||||
|
||||
// Test unsupported encoding
|
||||
TEST_F(LevelDecoderTest, test_unsupported_encoding) {
    // The payload content is irrelevant here; a single placeholder byte is enough.
    std::vector<uint8_t> placeholder = {0x00};
    Slice input(reinterpret_cast<char*>(placeholder.data()), placeholder.size());

    // Requesting PLAIN as the level encoding is expected to be rejected by init().
    const auto init_status = _decoder->init(&input, tparquet::Encoding::PLAIN, 1, 8);
    ASSERT_FALSE(init_status.ok());
}
|
||||
|
||||
// Test has_levels() function
|
||||
TEST_F(LevelDecoderTest, test_has_levels) {
    // A decoder that has not been initialized yet must report no levels.
    const bool has_levels_before_init = _decoder->has_levels();
    EXPECT_FALSE(has_levels_before_init);

    // Valid RLE payload in the data page v2 form (no length field).
    std::vector<uint8_t> encoded = {8, 0, 3, 0b00011001, 0};
    Slice input(reinterpret_cast<char*>(encoded.data()), encoded.size());

    const auto init_status = _decoder->init_v2(input, 2, 7);
    ASSERT_TRUE(init_status.ok());

    // After a successful initialization, levels must be reported as available.
    const bool has_levels_after_init = _decoder->has_levels();
    EXPECT_TRUE(has_levels_after_init);
}
|
||||
|
||||
// Test get_next() function
|
||||
TEST_F(LevelDecoderTest, test_get_next) {
    // Valid RLE payload in the data page v2 form (no length field).
    std::vector<uint8_t> encoded = {8, 0, 3, 0b00011001, 0};
    Slice input(reinterpret_cast<char*>(encoded.data()), encoded.size());

    const auto init_status = _decoder->init_v2(input, 2, 7);
    ASSERT_TRUE(init_status.ok());

    // Pull the first level and check it is 0.
    const level_t first_level = _decoder->get_next();
    EXPECT_EQ(first_level, 0);

    // Pull the following level and check it is 0 as well.
    const level_t second_level = _decoder->get_next();
    EXPECT_EQ(second_level, 0);
}
|
||||
|
||||
// Test rewind_one() function
|
||||
TEST_F(LevelDecoderTest, test_rewind_one) {
    // Valid RLE payload in the data page v2 form (no length field).
    std::vector<uint8_t> encoded = {8, 0, 3, 0b00011001, 0};
    Slice input(reinterpret_cast<char*>(encoded.data()), encoded.size());

    const auto init_status = _decoder->init_v2(input, 2, 7);
    ASSERT_TRUE(init_status.ok());

    // Consume two levels and remember both.
    const level_t first_level = _decoder->get_next();
    const level_t second_level = _decoder->get_next();

    // After stepping back once, the next read must match the second level.
    _decoder->rewind_one();
    const level_t after_first_rewind = _decoder->get_next();
    EXPECT_EQ(after_first_rewind, second_level);

    // Step back once more and compare the next read with the first level.
    // NOTE(review): this payload presumably starts with a run of equal levels, so
    // first_level == second_level and this check cannot tell which position was
    // re-read -- confirm, and consider data with distinct neighbouring levels.
    _decoder->rewind_one();
    const level_t after_second_rewind = _decoder->get_next();
    EXPECT_EQ(after_second_rewind, first_level);
}
|
||||
|
||||
// Test rle_decoder() function
|
||||
TEST_F(LevelDecoderTest, test_rle_decoder) {
    // Valid RLE payload in the data page v2 form (no length field).
    std::vector<uint8_t> encoded = {8, 0, 3, 0b00011001, 0};
    Slice input(reinterpret_cast<char*>(encoded.data()), encoded.size());

    const auto init_status = _decoder->init_v2(input, 2, 7);
    ASSERT_TRUE(init_status.ok());

    // Fetch the underlying RLE decoder through the accessor.
    const RleDecoder<level_t>& inner_decoder = _decoder->rle_decoder();

    // NOTE(review): the address of a reference can never be null in well-defined
    // C++, so this check only proves that the accessor compiles and can be called.
    // Consider asserting on observable decoder state instead.
    const RleDecoder<level_t>* inner_decoder_addr = &inner_decoder;
    EXPECT_NE(inner_decoder_addr, nullptr);
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
Reference in New Issue
Block a user