diff --git a/be/src/vec/exec/format/parquet/bool_plain_decoder.h b/be/src/vec/exec/format/parquet/bool_plain_decoder.h index 0dc20d6197..6220f89663 100644 --- a/be/src/vec/exec/format/parquet/bool_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/bool_plain_decoder.h @@ -43,11 +43,12 @@ public: ~BoolPlainDecoder() override = default; // Set the data to be decoded - void set_data(Slice* data) override { + Status set_data(Slice* data) override { bool_values_.Reset((const uint8_t*)data->data, data->size); num_unpacked_values_ = 0; unpacked_value_idx_ = 0; _offset = 0; + return Status::OK(); } Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, diff --git a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp index 17ce68e604..de286c828e 100644 --- a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp +++ b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp @@ -30,29 +30,29 @@ #include "vec/exec/format/parquet/parquet_common.h" namespace doris::vectorized { -void BoolRLEDecoder::set_data(Slice* slice) { +Status BoolRLEDecoder::set_data(Slice* slice) { _data = slice; _num_bytes = slice->size; _offset = 0; - _current_value_idx = 0; if (_num_bytes < 4) { - LOG(FATAL) << "Received invalid length : " + std::to_string(_num_bytes) + - " (corrupt data page?)"; + return Status::IOError("Received invalid length : " + std::to_string(_num_bytes) + + " (corrupt data page?)"); } // Load the first 4 bytes in little-endian, which indicates the length const uint8_t* data = reinterpret_cast(_data->data); uint32_t num_bytes = decode_fixed32_le(data); if (num_bytes > static_cast(_num_bytes - 4)) { - LOG(FATAL) << ("Received invalid number of bytes : " + std::to_string(num_bytes) + - " (corrupt data page?)"); + return Status::IOError("Received invalid number of bytes : " + std::to_string(num_bytes) + + " (corrupt data page?)"); } _num_bytes = num_bytes; auto decoder_data = data + 4; _decoder = 
RleDecoder(decoder_data, num_bytes, 1); + return Status::OK(); } Status BoolRLEDecoder::skip_values(size_t num_values) { - _current_value_idx += num_values; + _decoder.Skip(num_values); return Status::OK(); } @@ -76,15 +76,16 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt if (!_decoder.get_values(_values.data(), max_values)) { return Status::IOError("Can't read enough booleans in rle decoder"); } + size_t current_value_idx = 0; ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { bool value; // Can't use uint8_t directly, we should correct it. for (size_t i = 0; i < run_length; ++i) { - DCHECK(_current_value_idx < max_values) - << _current_value_idx << " vs. " << max_values; - value = _values[_current_value_idx++]; + DCHECK(current_value_idx < max_values) + << current_value_idx << " vs. " << max_values; + value = _values[current_value_idx++]; column_data[data_index++] = (UInt8)value; } break; @@ -94,7 +95,7 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt break; } case ColumnSelectVector::FILTERED_CONTENT: { - _current_value_idx += run_length; + current_value_idx += run_length; break; } case ColumnSelectVector::FILTERED_NULL: { @@ -102,7 +103,6 @@ Status BoolRLEDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePt } } } - _current_value_idx = 0; return Status::OK(); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/bool_rle_decoder.h b/be/src/vec/exec/format/parquet/bool_rle_decoder.h index 23118b8c38..a7b69e4e31 100644 --- a/be/src/vec/exec/format/parquet/bool_rle_decoder.h +++ b/be/src/vec/exec/format/parquet/bool_rle_decoder.h @@ -40,7 +40,7 @@ public: BoolRLEDecoder() = default; ~BoolRLEDecoder() override = default; - void set_data(Slice* slice) override; + Status set_data(Slice* slice) override; Status decode_values(MutableColumnPtr& 
doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) override; @@ -55,6 +55,5 @@ private: RleDecoder _decoder; std::vector _values; size_t _num_bytes; - size_t _current_value_idx = 0; }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h index 1654878af8..7122a61163 100644 --- a/be/src/vec/exec/format/parquet/decoder.h +++ b/be/src/vec/exec/format/parquet/decoder.h @@ -59,9 +59,10 @@ public: void set_type_length(int32_t type_length) { _type_length = type_length; } // Set the data to be decoded - virtual void set_data(Slice* data) { + virtual Status set_data(Slice* data) { _data = data; _offset = 0; + return Status::OK(); } // Write the decoded values batch to doris's column @@ -95,13 +96,14 @@ public: ~BaseDictDecoder() override = default; // Set the data to be decoded - void set_data(Slice* data) override { + Status set_data(Slice* data) override { _data = data; _offset = 0; uint8_t bit_width = *data->data; _index_batch_decoder = std::make_unique>( reinterpret_cast(data->data) + 1, static_cast(data->size) - 1, bit_width); + return Status::OK(); } protected: diff --git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp new file mode 100644 index 0000000000..12ae27b2db --- /dev/null +++ b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp @@ -0,0 +1,134 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/parquet/delta_bit_pack_decoder.h" + +namespace doris::vectorized { +Status DeltaLengthByteArrayDecoder::_decode_lengths() { + RETURN_IF_ERROR(_len_decoder.set_bit_reader(_bit_reader)); + // get the number of encoded lengths + int num_length = _len_decoder.valid_values_count(); + _buffered_length.resize(num_length); + + // decode all the lengths. all the lengths are buffered in buffered_length_. + int ret; + RETURN_IF_ERROR(_len_decoder.decode(_buffered_length.data(), num_length, &ret)); + DCHECK_EQ(ret, num_length); + _length_idx = 0; + _num_valid_values = num_length; + return Status::OK(); +} + +Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values, + int* out_num_values) { + // Decode up to `max_values` strings into an internal buffer + // and reference them into `buffer`. 
+ max_values = std::min(max_values, _num_valid_values); + if (max_values == 0) { + *out_num_values = 0; + return Status::OK(); + } + + int32_t data_size = 0; + const int32_t* length_ptr = _buffered_length.data() + _length_idx; + for (int i = 0; i < max_values; ++i) { + int32_t len = length_ptr[i]; + if (PREDICT_FALSE(len < 0)) { + return Status::InvalidArgument("Negative string delta length"); + } + buffer[i].size = len; + if (common::add_overflow(data_size, len, data_size)) { + return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY"); + } + } + _length_idx += max_values; + + _buffered_data.resize(data_size); + char* data_ptr = _buffered_data.data(); + for (int j = 0; j < data_size; j++) { + if (!_bit_reader->GetValue(8, data_ptr + j)) { + return Status::IOError("Get length bytes EOF"); + } + } + + for (int i = 0; i < max_values; ++i) { + buffer[i].data = data_ptr; + data_ptr += buffer[i].size; + } + // this->num_values_ -= max_values; + _num_valid_values -= max_values; + *out_num_values = max_values; + return Status::OK(); +} + +Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) { + // Decode up to `max_values` strings into an internal buffer + // and reference them into `buffer`. 
+ max_values = std::min(max_values, _num_valid_values); + if (max_values == 0) { + *out_num_values = max_values; + return Status::OK(); + } + + int suffix_read; + RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read)); + if (PREDICT_FALSE(suffix_read != max_values)) { + return Status::IOError("Read {}, expecting {} from suffix decoder", + std::to_string(suffix_read), std::to_string(max_values)); + } + + int64_t data_size = 0; + const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset; + for (int i = 0; i < max_values; ++i) { + if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) { + return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY"); + } + if (PREDICT_FALSE(common::add_overflow(data_size, static_cast(prefix_len_ptr[i]), + data_size) || + common::add_overflow(data_size, static_cast(buffer[i].size), + data_size))) { + return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY"); + } + } + _buffered_data.resize(data_size); + + std::string_view prefix {_last_value}; + + char* data_ptr = _buffered_data.data(); + for (int i = 0; i < max_values; ++i) { + if (PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { + return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY"); + } + memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); + // buffer[i] currently points to the string suffix + memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size); + buffer[i].data = data_ptr; + buffer[i].size += prefix_len_ptr[i]; + data_ptr += buffer[i].size; + prefix = std::string_view {buffer[i].data, buffer[i].size}; + } + _prefix_len_offset += max_values; + _num_valid_values -= max_values; + _last_value = std::string {prefix}; + + if (_num_valid_values == 0) { + _last_value_in_previous_page = _last_value; + } + *out_num_values = max_values; + return Status::OK(); +} +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h 
b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h index 9497aa1cb1..e05653f15d 100644 --- a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h +++ b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h @@ -47,10 +47,6 @@ public: ~DeltaDecoder() override = default; - Status skip_values(size_t num_values) override { - return _type_converted_decoder->skip_values(num_values); - } - template Status decode_byte_array(const std::vector& decoded_vals, MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { @@ -125,9 +121,10 @@ public: } protected: - void init_values_converter() { - _type_converted_decoder->set_data(_data); + Status init_values_converter() { + RETURN_IF_ERROR(_type_converted_decoder->set_data(_data)); _type_converted_decoder->set_type_length(_type_length); + return Status::OK(); } // Convert decoded value to doris type value. std::unique_ptr _type_converted_decoder; @@ -148,6 +145,13 @@ public: DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder()) {} ~DeltaBitPackDecoder() override = default; + + Status skip_values(size_t num_values) override { + _values.resize(num_values); + int num_valid_values; + return _get_internal(_values.data(), num_values, &num_valid_values); + } + Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) override { size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); @@ -159,7 +163,7 @@ public: _type_length = sizeof(T); _data->size = _values.size() * _type_length; // set decoded value with fix plain decoder - init_values_converter(); + RETURN_IF_ERROR(init_values_converter()); return _type_converted_decoder->decode_values(doris_column, data_type, select_vector, is_dict_filter); } @@ -173,24 +177,20 @@ public: return static_cast(_total_values_remaining); } - void set_data(Slice* slice) override { + Status set_data(Slice* slice) override { _bit_reader.reset(new 
BitReader((const uint8_t*)slice->data, slice->size)); - Status st = _init_header(); - if (!st.ok()) { - LOG(FATAL) << "Fail to init delta encoding header for " << st.to_string(); - } + RETURN_IF_ERROR(_init_header()); _data = slice; _offset = 0; + return Status::OK(); } // Set BitReader which is already initialized by DeltaLengthByteArrayDecoder or // DeltaByteArrayDecoder - void set_bit_reader(std::shared_ptr bit_reader) { + Status set_bit_reader(std::shared_ptr bit_reader) { _bit_reader = std::move(bit_reader); - Status st = _init_header(); - if (!st.ok()) { - LOG(FATAL) << "Fail to init delta encoding header for " << st.to_string(); - } + RETURN_IF_ERROR(_init_header()); + return Status::OK(); } private: @@ -265,25 +265,27 @@ public: return _get_internal(buffer, num_values, out_num_values); } - void set_data(Slice* slice) override { + Status set_data(Slice* slice) override { if (slice->size == 0) { - return; + return Status::OK(); } _bit_reader = std::make_shared((const uint8_t*)slice->data, slice->size); _data = slice; _offset = 0; - _decode_lengths(); + RETURN_IF_ERROR(_decode_lengths()); + return Status::OK(); } - void set_bit_reader(std::shared_ptr bit_reader) { + Status set_bit_reader(std::shared_ptr bit_reader) { _bit_reader = std::move(bit_reader); - _decode_lengths(); + RETURN_IF_ERROR(_decode_lengths()); + return Status::OK(); } private: // Decode all the encoded lengths. The decoder_ will be at the start of the encoded data // after that. 
- void _decode_lengths(); + Status _decode_lengths(); Status _get_internal(Slice* buffer, int max_values, int* out_num_values); std::vector _values; @@ -333,9 +335,9 @@ public: } } - void set_data(Slice* slice) override { + Status set_data(Slice* slice) override { _bit_reader = std::make_shared((const uint8_t*)slice->data, slice->size); - _prefix_len_decoder.set_bit_reader(_bit_reader); + RETURN_IF_ERROR(_prefix_len_decoder.set_bit_reader(_bit_reader)); // get the number of encoded prefix lengths int num_prefix = _prefix_len_decoder.valid_values_count(); @@ -343,20 +345,19 @@ public: // all the prefix lengths are buffered in _buffered_prefix_length. _buffered_prefix_length.resize(num_prefix); int ret; - Status st = _prefix_len_decoder.decode(_buffered_prefix_length.data(), num_prefix, &ret); - if (!st.ok()) { - LOG(FATAL) << "Fail to decode delta prefix, status: " << st; - } + RETURN_IF_ERROR( + _prefix_len_decoder.decode(_buffered_prefix_length.data(), num_prefix, &ret)); DCHECK_EQ(ret, num_prefix); _prefix_len_offset = 0; _num_valid_values = num_prefix; // at this time, the decoder_ will be at the start of the encoded suffix data. - _suffix_decoder.set_bit_reader(_bit_reader); + RETURN_IF_ERROR(_suffix_decoder.set_bit_reader(_bit_reader)); // TODO: read corrupted files written with bug(PARQUET-246). _last_value should be set // to _last_value_in_previous_page when decoding a new page(except the first page) _last_value = ""; + return Status::OK(); } Status decode(Slice* buffer, int num_values, int* out_num_values) { @@ -517,119 +518,4 @@ Status DeltaBitPackDecoder::_get_internal(T* buffer, int num_values, int* out return Status::OK(); } -void DeltaLengthByteArrayDecoder::_decode_lengths() { - _len_decoder.set_bit_reader(_bit_reader); - // get the number of encoded lengths - int num_length = _len_decoder.valid_values_count(); - _buffered_length.resize(num_length); - - // decode all the lengths. all the lengths are buffered in buffered_length_. 
- int ret; - Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret); - if (!st.ok()) { - LOG(FATAL) << "Fail to decode delta length, status: " << st; - } - DCHECK_EQ(ret, num_length); - _length_idx = 0; - _num_valid_values = num_length; -} - -Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values, - int* out_num_values) { - // Decode up to `max_values` strings into an internal buffer - // and reference them into `buffer`. - max_values = std::min(max_values, _num_valid_values); - if (max_values == 0) { - *out_num_values = 0; - return Status::OK(); - } - - int32_t data_size = 0; - const int32_t* length_ptr = _buffered_length.data() + _length_idx; - for (int i = 0; i < max_values; ++i) { - int32_t len = length_ptr[i]; - if (PREDICT_FALSE(len < 0)) { - return Status::InvalidArgument("Negative string delta length"); - } - buffer[i].size = len; - if (common::add_overflow(data_size, len, data_size)) { - return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY"); - } - } - _length_idx += max_values; - - _buffered_data.resize(data_size); - char* data_ptr = _buffered_data.data(); - for (int j = 0; j < data_size; j++) { - if (!_bit_reader->GetValue(8, data_ptr + j)) { - return Status::IOError("Get length bytes EOF"); - } - } - - for (int i = 0; i < max_values; ++i) { - buffer[i].data = data_ptr; - data_ptr += buffer[i].size; - } - // this->num_values_ -= max_values; - _num_valid_values -= max_values; - *out_num_values = max_values; - return Status::OK(); -} - -Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) { - // Decode up to `max_values` strings into an internal buffer - // and reference them into `buffer`. 
- max_values = std::min(max_values, _num_valid_values); - if (max_values == 0) { - *out_num_values = max_values; - return Status::OK(); - } - - int suffix_read; - RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read)); - if (PREDICT_FALSE(suffix_read != max_values)) { - return Status::IOError("Read {}, expecting {} from suffix decoder", - std::to_string(suffix_read), std::to_string(max_values)); - } - - int64_t data_size = 0; - const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset; - for (int i = 0; i < max_values; ++i) { - if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) { - return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY"); - } - if (PREDICT_FALSE(common::add_overflow(data_size, static_cast(prefix_len_ptr[i]), - data_size) || - common::add_overflow(data_size, static_cast(buffer[i].size), - data_size))) { - return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY"); - } - } - _buffered_data.resize(data_size); - - std::string_view prefix {_last_value}; - - char* data_ptr = _buffered_data.data(); - for (int i = 0; i < max_values; ++i) { - if (PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { - return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY"); - } - memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); - // buffer[i] currently points to the string suffix - memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size); - buffer[i].data = data_ptr; - buffer[i].size += prefix_len_ptr[i]; - data_ptr += buffer[i].size; - prefix = std::string_view {buffer[i].data, buffer[i].size}; - } - _prefix_len_offset += max_values; - _num_valid_values -= max_values; - _last_value = std::string {prefix}; - - if (_num_valid_values == 0) { - _last_value_in_previous_page = _last_value; - } - *out_num_values = max_values; - return Status::OK(); -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp 
b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 2d96172c03..d588e105f8 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -121,7 +121,8 @@ protected: Status read_dict_values_to_column(MutableColumnPtr& doris_column) override { size_t dict_items_size = _dict_items.size(); - std::vector dict_values(dict_items_size); + std::vector dict_values; + dict_values.reserve(dict_items_size); for (size_t i = 0; i < dict_items_size; ++i) { dict_values.emplace_back(_dict_items[i], _type_length); } @@ -131,7 +132,8 @@ protected: MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override { auto res = ColumnString::create(); - std::vector dict_values(dict_column->size()); + std::vector dict_values; + dict_values.reserve(dict_column->size()); const auto& data = dict_column->get_data(); for (size_t i = 0; i < dict_column->size(); ++i) { dict_values.emplace_back(_dict_items[data[i]], _type_length); diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp new file mode 100644 index 0000000000..b07cd03cf0 --- /dev/null +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp @@ -0,0 +1,40 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/parquet/fix_length_plain_decoder.h" + +namespace doris::vectorized { + +Status FixLengthPlainDecoder::skip_values(size_t num_values) { + _offset += _type_length * num_values; + if (UNLIKELY(_offset > _data->size)) { + return Status::IOError("Out-of-bounds access in parquet data decoder"); + } + return Status::OK(); +} + +Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, + ColumnSelectVector& select_vector, + bool is_dict_filter) { + if (select_vector.has_filter()) { + return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + } else { + return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + } +} + +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index 40e4c54a82..f5070a995a 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -40,67 +40,45 @@ public: template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, bool is_dict_filter); + ColumnSelectVector& select_vector, bool is_dict_filter) { + size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); + if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) { + return Status::IOError("Out-of-bounds access in parquet data decoder"); + } + + size_t primitive_length = 
remove_nullable(data_type)->get_size_of_value_in_memory(); + size_t data_index = doris_column->size() * primitive_length; + size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * + (_type_length / primitive_length); + doris_column->resize(doris_column->size() + scale_size); + char* raw_data = const_cast(doris_column->get_raw_data().data); + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length); + _offset += run_length * _type_length; + data_index += run_length * _type_length; + break; + } + case ColumnSelectVector::NULL_DATA: { + data_index += run_length * _type_length; + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + _offset += _type_length * run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } + } + } + return Status::OK(); + } Status skip_values(size_t num_values) override; }; -Status FixLengthPlainDecoder::skip_values(size_t num_values) { - _offset += _type_length * num_values; - if (UNLIKELY(_offset > _data->size)) { - return Status::IOError("Out-of-bounds access in parquet data decoder"); - } - return Status::OK(); -} - -Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { - if (select_vector.has_filter()) { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); - } else { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); - } -} - -template -Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { - size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); - if (UNLIKELY(_offset + 
_type_length * non_null_size > _data->size)) { - return Status::IOError("Out-of-bounds access in parquet data decoder"); - } - - size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory(); - size_t data_index = doris_column->size() * primitive_length; - size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) * - (_type_length / primitive_length); - doris_column->resize(doris_column->size() + scale_size); - char* raw_data = const_cast(doris_column->get_raw_data().data); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - memcpy(raw_data + data_index, _data->data + _offset, run_length * _type_length); - _offset += run_length * _type_length; - data_index += run_length * _type_length; - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length * _type_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/level_decoder.cpp b/be/src/vec/exec/format/parquet/level_decoder.cpp index 1e7c03f87b..b9f981e460 100644 --- a/be/src/vec/exec/format/parquet/level_decoder.cpp +++ b/be/src/vec/exec/format/parquet/level_decoder.cpp @@ -90,7 +90,15 @@ size_t doris::vectorized::LevelDecoder::get_levels(doris::vectorized::level_t* l _num_levels -= num_decoded; return num_decoded; } else if (_encoding == tparquet::Encoding::BIT_PACKED) { - // TODO(gaoxin): BIT_PACKED encoding + n = std::min((size_t)_num_levels, n); + for (size_t i = 0; i < n; ++i) { + if (!_bit_packed_decoder.GetValue(_bit_width, &levels[i])) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "Failed to decode BIT_PACKED levels"); + } + } + _num_levels -= n; + return n; } return 
0; } diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 5c5ee475bd..7be54204b5 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -208,7 +208,7 @@ Status ColumnChunkReader::load_page_data() { _page_decoder = _decoders[static_cast(encoding)].get(); } // Reset page data for each page - _page_decoder->set_data(&_page_data); + RETURN_IF_ERROR(_page_decoder->set_data(&_page_data)); _state = DATA_LOADED; return Status::OK(); diff --git a/be/test/vec/exec/format/parquet/bool_plain_decoder_test.cpp b/be/test/vec/exec/format/parquet/bool_plain_decoder_test.cpp new file mode 100644 index 0000000000..60af9bcecc --- /dev/null +++ b/be/test/vec/exec/format/parquet/bool_plain_decoder_test.cpp @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/parquet/bool_plain_decoder.h" + +#include + +#include "parquet/encoding.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" + +namespace doris::vectorized { + +class BoolPlainDecoderTest : public ::testing::Test { +protected: + void SetUp() override { _decoder = std::make_unique(); } + + std::unique_ptr _decoder; +}; + +// Test basic decoding functionality +TEST_F(BoolPlainDecoderTest, test_basic_decode) { + // Prepare encoded data: [true, false, true, true, false, false, false, true] + std::vector encoded_data = {0b10001101}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector without filter + size_t num_values = 8; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data()[0], 1); + EXPECT_EQ(result_column->get_data()[1], 0); + EXPECT_EQ(result_column->get_data()[2], 1); + EXPECT_EQ(result_column->get_data()[3], 1); + EXPECT_EQ(result_column->get_data()[4], 0); + EXPECT_EQ(result_column->get_data()[5], 0); + EXPECT_EQ(result_column->get_data()[6], 0); + EXPECT_EQ(result_column->get_data()[7], 1); +} + +// Test decoding with filter +TEST_F(BoolPlainDecoderTest, test_decode_with_filter) { + // Prepare encoded data: [true, 
false, true, true, false, false, false, true] + std::vector encoded_data = {0b10001101}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0, 1, 0, 1, 0] + size_t num_values = 8; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 0, 1, 0}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 4); // 4 values after filtering + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data()[0], 1); + EXPECT_EQ(result_column->get_data()[1], 1); + EXPECT_EQ(result_column->get_data()[2], 0); + EXPECT_EQ(result_column->get_data()[3], 0); +} + +// Test skipping values +TEST_F(BoolPlainDecoderTest, test_skip_value) { + // Prepare encoded data: [true, false, true, true, false, false, false, true] + std::vector encoded_data = {0b10001101}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Skip first 3 values + ASSERT_TRUE(_decoder->skip_values(3).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = 5; // Total 8 values, skip 3, remaining 5 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + 
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + // Expected values after skipping first 3 values (true, false, true) + std::vector expected_values = {1, 0, 0, 0, 1}; + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data()[i], expected_values[i]) << "Mismatch at value " << i; + } +} + +// Test decoding with filter and null +TEST_F(BoolPlainDecoderTest, test_decode_with_filter_and_null) { + // Prepare encoded data: [true, false, true, true, false, false] + std::vector encoded_data = {0b00001101}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0, 1, 1, 1] and null vector [0, 0, 0, 0, 1, 0, 1] + size_t num_values = 7; + std::vector run_length_null_map { + 4, 1, 1, 1}; // data: [true, false, true, true, null, false, null] + std::vector filter_data = {1, 0, 1, 0, + 1, 1, 1}; // filtered_data: [true, true, null, false, null] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* result_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {1, 1, std::nullopt, 0, std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if 
(expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test decoding data generated by arrow +TEST_F(BoolPlainDecoderTest, test_data_generated_by_arrow) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BOOLEAN); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {true, false, true, true, false, false, false, true}; + + // Create encoder + auto encoder = MakeTypedEncoder(parquet::Encoding::PLAIN, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW(encoder->Put(values, static_cast(values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = values.size(); + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data()[0], 1); + EXPECT_EQ(result_column->get_data()[1], 0); + EXPECT_EQ(result_column->get_data()[2], 1); + 
EXPECT_EQ(result_column->get_data()[3], 1); + EXPECT_EQ(result_column->get_data()[4], 0); + EXPECT_EQ(result_column->get_data()[5], 0); + EXPECT_EQ(result_column->get_data()[6], 0); + EXPECT_EQ(result_column->get_data()[7], 1); +} + +// Test invalid data case (disabled: BoolPlainDecoder::set_data currently always returns Status::OK, so the ASSERT_FALSE below would fail) +//TEST_F(BoolPlainDecoderTest, test_invalid_data) { +// // Prepare invalid encoded data +// std::vector encoded_data = {0b111111111}; // 9 bits +// Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); +// ASSERT_FALSE(_decoder->set_data(&data_slice).ok()); +//} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/bool_rle_decoder_test.cpp b/be/test/vec/exec/format/parquet/bool_rle_decoder_test.cpp new file mode 100644 index 0000000000..7ab6c96f01 --- /dev/null +++ b/be/test/vec/exec/format/parquet/bool_rle_decoder_test.cpp @@ -0,0 +1,239 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/parquet/bool_rle_decoder.h" + +#include + +#include "parquet/encoding.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" + +namespace doris::vectorized { + +class BoolRLEDecoderTest : public ::testing::Test { +protected: + void SetUp() override { _decoder = std::make_unique(); } + + std::unique_ptr _decoder; +}; + +// Test basic decoding functionality +TEST_F(BoolRLEDecoderTest, test_basic_decode) { + // Prepare encoded data: [true, false, true, true, false, false, false, true] + std::vector encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector without filter + size_t num_values = 8; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data()[0], 1); + EXPECT_EQ(result_column->get_data()[1], 0); + EXPECT_EQ(result_column->get_data()[2], 1); + EXPECT_EQ(result_column->get_data()[3], 1); + EXPECT_EQ(result_column->get_data()[4], 0); + EXPECT_EQ(result_column->get_data()[5], 0); + EXPECT_EQ(result_column->get_data()[6], 0); + EXPECT_EQ(result_column->get_data()[7], 1); +} + +// Test decoding with filter +TEST_F(BoolRLEDecoderTest, test_decode_with_filter) { + // Prepare 
encoded data: [true, false, true, true, false, false, false, true] + std::vector encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0, 1, 0, 1, 0] + size_t num_values = 8; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 0, 1, 0}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 4); // 4 values after filtering + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data()[0], 1); + EXPECT_EQ(result_column->get_data()[1], 1); + EXPECT_EQ(result_column->get_data()[2], 0); + EXPECT_EQ(result_column->get_data()[3], 0); +} + +// Test decoding with filter and null values +TEST_F(BoolRLEDecoderTest, test_decode_with_filter_and_null) { + // Prepare encoded non-null values: [true, false, true, false, false, true] (payload 0x25, LSB first) + std::vector encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x25}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0, 1, 0, 1, 0] and null vector [0, 0, 1, 0, 0, 0, 1, 0] + size_t num_values = 8; + std::vector run_length_null_map = { + 2, 1, 3, 1, 1}; // data: [true, false, null, true, false, false, null, true] + std::vector filter_data = {1, 0, 1, 0, + 1, 0, 1, 0}; // filtered_data: 
[true, null, 
false, null] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 4); // 4 values after filtering + auto* result_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {1, std::nullopt, 0, std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test skipping values for bool RLE decoding +TEST_F(BoolRLEDecoderTest, test_skip_value) { + // Prepare encoded data: [true, false, true, true, false, false, false, true] + std::vector encoded_data = {0x02, 0x00, 0x00, 0x00, 0x03, 0x8d}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Skip first 3 values + ASSERT_TRUE(_decoder->skip_values(3).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = 5; // Total 8 values, skip 3, remaining 5 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, 
data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + // Expected values after skipping first 3 values (true, false, true) + std::vector expected_values = {1, 0, 0, 0, 1}; + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data()[i], expected_values[i]) << "Mismatch at value " << i; + } +} + +// Test decoding data generated by arrow +TEST_F(BoolRLEDecoderTest, test_data_generated_by_arrow) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BOOLEAN); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {true, false, true, true, false, false, false, true}; + + // Create encoder + auto encoder = MakeTypedEncoder(parquet::Encoding::RLE, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW(encoder->Put(values, static_cast(values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = values.size(); + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + 
EXPECT_EQ(result_column->get_data()[0], 1); + EXPECT_EQ(result_column->get_data()[1], 0); + EXPECT_EQ(result_column->get_data()[2], 1); + EXPECT_EQ(result_column->get_data()[3], 1); + EXPECT_EQ(result_column->get_data()[4], 0); + EXPECT_EQ(result_column->get_data()[5], 0); + EXPECT_EQ(result_column->get_data()[6], 0); + EXPECT_EQ(result_column->get_data()[7], 1); +} + +// Test invalid data case +TEST_F(BoolRLEDecoderTest, test_invalid_data) { + // Prepare invalid encoded data + std::vector encoded_data = {0x08, 0x01}; // Incomplete data + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_FALSE(_decoder->set_data(&data_slice).ok()); +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp b/be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp new file mode 100644 index 0000000000..d154ee144c --- /dev/null +++ b/be/test/vec/exec/format/parquet/byte_array_dict_decoder_test.cpp @@ -0,0 +1,500 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/parquet/byte_array_dict_decoder.h" + +#include + +#include "parquet/encoding.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "util/slice.h" +#include "vec/columns/column_dictionary.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +class ByteArrayDictDecoderTest : public ::testing::Test { +protected: + void SetUp() override { + // Prepare test data: create a dictionary with byte array strings + const char* values[3] = {"apple", "banana", "cherry"}; + size_t dict_size = 3; + size_t dict_data_size = 0; + + // Calculate total dictionary data size + for (int i = 0; i < 3; i++) { + dict_data_size += 4 + strlen(values[i]); // 4 bytes for length + string data + } + + auto dict_data = std::make_unique(dict_data_size); + size_t offset = 0; + for (int i = 0; i < 3; i++) { + uint32_t len = strlen(values[i]); + encode_fixed32_le(dict_data.get() + offset, len); + offset += 4; + memcpy(dict_data.get() + offset, values[i], len); + offset += len; + } + + ASSERT_TRUE(_decoder.set_dict(dict_data, dict_data_size, dict_size).ok()); + } + + ByteArrayDictDecoder _decoder; +}; + +// Test basic decoding functionality +TEST_F(ByteArrayDictDecoderTest, test_basic_decode) { + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + // std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + std::vector rle_data = {0x02, 0x03, 0x00, 0x19}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create selection vector without filter, total 7 values (4 repeated + 3 literal) + size_t num_values = 7; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector 
select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + // Verify first 4 repeated values (dict index 0 -> value "apple") + for (int i = 0; i < 4; i++) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), "apple"); + } + + // Verify last 3 literal values + EXPECT_EQ(result_column->get_data_at(4).to_string(), "banana"); + EXPECT_EQ(result_column->get_data_at(5).to_string(), "cherry"); + EXPECT_EQ(result_column->get_data_at(6).to_string(), "banana"); +} + +// Test decoding with filter +TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter) { + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] + size_t num_values = 7; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* result_column = assert_cast(column.get()); + + // Verify filtered values + EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple"); + 
EXPECT_EQ(result_column->get_data_at(1).to_string(), "apple"); + EXPECT_EQ(result_column->get_data_at(2).to_string(), "banana"); + EXPECT_EQ(result_column->get_data_at(3).to_string(), "cherry"); + EXPECT_EQ(result_column->get_data_at(4).to_string(), "banana"); +} + +// Test decoding with filter and null +TEST_F(ByteArrayDictDecoderTest, test_decode_with_filter_and_null) { + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2] + std::vector rle_data = {2, 8, 0, 3, 0b00000010, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1] + size_t num_values = 7; + std::vector run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null] + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* result_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {"apple", "apple", std::nullopt, + "cherry", std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test empty 
dictionary case +TEST_F(ByteArrayDictDecoderTest, test_empty_dict) { + ByteArrayDictDecoder empty_decoder; + auto dict_data = std::make_unique(0); + ASSERT_TRUE(empty_decoder.set_dict(dict_data, 0, 0).ok()); +} + +// Test decoding with ColumnDictI32 +TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32) { + // Create ColumnDictI32 column + MutableColumnPtr column = ColumnDictI32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create selection vector without filter, total 7 values (4 repeated + 3 literal) + const size_t num_values = 7; + std::vector run_length_null_map = {num_values}; // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* dict_column = assert_cast(column.get()); + + // Verify first 4 repeated values (dict index 0 -> value "apple") + for (int i = 0; i < 4; i++) { + EXPECT_EQ(dict_column->get_data()[i], 0); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(), "apple"); + } + + // Verify last 3 literal values + EXPECT_EQ(dict_column->get_data()[4], 1); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana"); + EXPECT_EQ(dict_column->get_data()[5], 2); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[5]).to_string(), "cherry"); + EXPECT_EQ(dict_column->get_data()[6], 1); + 
EXPECT_EQ(dict_column->get_value(dict_column->get_data()[6]).to_string(), "banana"); +} + +// Test decoding with ColumnDictI32 and filter +TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32_with_filter) { + // Create ColumnDictI32 column + MutableColumnPtr column = ColumnDictI32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] + const size_t num_values = 7; + std::vector run_length_null_map = {num_values}; // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Verify filtered values + EXPECT_EQ(dict_column->get_data()[0], 0); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[0]).to_string(), "apple"); + EXPECT_EQ(dict_column->get_data()[1], 0); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[1]).to_string(), "apple"); + EXPECT_EQ(dict_column->get_data()[2], 1); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[2]).to_string(), "banana"); + EXPECT_EQ(dict_column->get_data()[3], 2); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[3]).to_string(), "cherry"); + EXPECT_EQ(dict_column->get_data()[4], 1); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana"); +} + +// Test decoding with ColumnDictI32 with filter and null 
+TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_dict_i32_with_filter_and_null) { + // Create ColumnDictI32 column + MutableColumnPtr column = ColumnDictI32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2] + std::vector rle_data = {2, 8, 0, 3, 0b00000010, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1] + const size_t num_values = 7; + std::vector run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null] + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null] + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {"apple", "apple", std::nullopt, + "cherry", std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(), + expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test decoding with ColumnInt32 +TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32) { + // Create ColumnInt32 column + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded 
data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create selection vector without filter, total 7 values (4 repeated + 3 literal) + const size_t num_values = 7; + std::vector run_length_null_map = {num_values}; // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* dict_column = assert_cast(column.get()); + + // Verify first 4 repeated values (dict index 0 -> value "apple") + for (int i = 0; i < 4; i++) { + EXPECT_EQ(dict_column->get_data()[i], 0); + } + + // Verify last 3 literal values + EXPECT_EQ(dict_column->get_data()[4], 1); + EXPECT_EQ(dict_column->get_data()[5], 2); + EXPECT_EQ(dict_column->get_data()[6], 1); +} + +// Test decoding with ColumnInt32 and filter +TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32_with_filter) { + // Create ColumnInt32 column + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] + const size_t num_values = 7; + std::vector run_length_null_map = {num_values}; // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; + ColumnSelectVector select_vector; + 
select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Verify filtered values + EXPECT_EQ(dict_column->get_data()[0], 0); + EXPECT_EQ(dict_column->get_data()[1], 0); + EXPECT_EQ(dict_column->get_data()[2], 1); + EXPECT_EQ(dict_column->get_data()[3], 2); + EXPECT_EQ(dict_column->get_data()[4], 1); +} + +// Test decoding with ColumnInt32 with filter and null +TEST_F(ByteArrayDictDecoderTest, test_decode_with_column_int_32_with_filter_and_null) { + // Create ColumnInt32 column + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2] + std::vector rle_data = {2, 8, 0, 3, 0b00000010, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1] + const size_t num_values = 7; + std::vector run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null] + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null] + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {0, 0, 
std::nullopt, 2, std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(dict_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test reading dictionary values to column +TEST_F(ByteArrayDictDecoderTest, test_read_dict_values_to_column) { + // Create a column to store dictionary values + MutableColumnPtr column = ColumnString::create(); + + // Read dictionary values to column + ASSERT_TRUE(_decoder.read_dict_values_to_column(column).ok()); + + // Verify results + ASSERT_EQ(column->size(), 3); // 3 dictionary items + auto* result_column = assert_cast(column.get()); + + // Verify dictionary values + EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple"); + EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana"); + EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry"); +} + +// Test convert_dict_column_to_string_column function +TEST_F(ByteArrayDictDecoderTest, test_convert_dict_column_to_string_column) { + // Create a ColumnInt32 with some dictionary codes + MutableColumnPtr dict_column = ColumnInt32::create(); + dict_column->insert(0); + dict_column->insert(1); + dict_column->insert(2); + dict_column->insert(1); + + // Convert to string column + MutableColumnPtr string_column = _decoder.convert_dict_column_to_string_column( + assert_cast(dict_column.get())); + + // Verify results + ASSERT_EQ(string_column->size(), 4); + auto* result_column = assert_cast(string_column.get()); + + EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple"); + EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana"); + EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry"); + EXPECT_EQ(result_column->get_data_at(3).to_string(), "banana"); +} + +// Test skipping values for byte array 
dictionary decoding +TEST_F(ByteArrayDictDecoderTest, test_skip_value) { + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE/bit-packed ids: bit width 2, run of four 0s, then packed 1, 2, 1: [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Skip the first 3 values (three of the four leading 0 ids) + ASSERT_TRUE(_decoder.skip_values(3).ok()); + + // Create selection vector that keeps every remaining value + size_t num_values = 4; // Total 7 values, skip 3, remaining 4 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + // Expected values after skipping first 3 values ("apple", "apple", "apple") + std::vector expected_values = {"apple", "banana", "cherry", "banana"}; + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i]) + << "Mismatch at value " << i; + } +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/byte_array_plain_decoder_test.cpp b/be/test/vec/exec/format/parquet/byte_array_plain_decoder_test.cpp new file mode 100644 index 0000000000..c047935a4b --- /dev/null +++ b/be/test/vec/exec/format/parquet/byte_array_plain_decoder_test.cpp @@ -0,0 +1,242 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/parquet/byte_array_plain_decoder.h" + +#include + +#include "util/slice.h" +#include "vec/columns/column_string.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +class ByteArrayPlainDecoderTest : public ::testing::Test { +protected: + void SetUp() override {} + + Slice _data_slice; + std::unique_ptr _data; +}; + +// Test basic decoding functionality +TEST_F(ByteArrayPlainDecoderTest, test_basic_decode) { + // Prepare test data: PLAIN byte arrays, each a 4-byte LE length followed by the bytes + const char* values[3] = {"apple", "banana", "cherry"}; + size_t data_size = 0; + + // Calculate total data size + for (int i = 0; i < 3; i++) { + data_size += 4 + strlen(values[i]); // 4 bytes for length + string data + } + + _data = std::make_unique(data_size); + size_t offset = 0; + for (int i = 0; i < 3; i++) { + uint32_t len = strlen(values[i]); + encode_fixed32_le(_data.get() + offset, len); + offset += 4; + memcpy(_data.get() + offset, values[i], len); + offset += len; + } + + _data_slice = Slice(_data.get(), data_size); + + ByteArrayPlainDecoder decoder; + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector without filter + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector 
filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple"); + EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana"); + EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry"); +} + +// Test decoding with filter +TEST_F(ByteArrayPlainDecoderTest, test_decode_with_filter) { + // Prepare test data: PLAIN byte arrays, each a 4-byte LE length followed by the bytes + const char* values[3] = {"apple", "banana", "cherry"}; + size_t data_size = 0; + + // Calculate total data size + for (int i = 0; i < 3; i++) { + data_size += 4 + strlen(values[i]); // 4 bytes for length + string data + } + + _data = std::make_unique(data_size); + size_t offset = 0; + for (int i = 0; i < 3; i++) { + uint32_t len = strlen(values[i]); + encode_fixed32_le(_data.get() + offset, len); + offset += 4; + memcpy(_data.get() + offset, values[i], len); + offset += len; + } + + _data_slice = Slice(_data.get(), data_size); + + ByteArrayPlainDecoder decoder; + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1,0,1] + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + 
ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + + EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple"); + EXPECT_EQ(result_column->get_data_at(1).to_string(), "cherry"); +} + +// Test decoding with filter and null +TEST_F(ByteArrayPlainDecoderTest, test_decode_with_filter_and_null) { + // Prepare test data: only the 2 non-null values are encoded, the null row has no bytes + const char* values[2] = {"apple", "cherry"}; + size_t data_size = 0; + + // Calculate total data size + for (int i = 0; i < 2; i++) { + data_size += 4 + strlen(values[i]); // 4 bytes for length + string data + } + + _data = std::make_unique(data_size); + size_t offset = 0; + for (int i = 0; i < 2; i++) { + uint32_t len = strlen(values[i]); + encode_fixed32_le(_data.get() + offset, len); + offset += 4; + memcpy(_data.get() + offset, values[i], len); + offset += len; + } + + _data_slice = Slice(_data.get(), data_size); + + ByteArrayPlainDecoder decoder; + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1,0,1] and null vector [0,1,0] + size_t num_values = 3; + std::vector run_length_null_map = {1, 1, 1}; // data: ["apple", null, "cherry"] + std::vector filter_data = {1, 0, 1}; // filtered_data: ["apple", "cherry"] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + + // Expected values: the only null row (index 1) is filtered out, so none are null + std::vector> expected_values = {"apple", "cherry"}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if 
(expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test skipping values +TEST_F(ByteArrayPlainDecoderTest, test_skip_value) { + // Prepare test data: PLAIN byte arrays, each a 4-byte LE length followed by the bytes + const char* values[3] = {"apple", "banana", "cherry"}; + size_t data_size = 0; + + // Calculate total data size + for (int i = 0; i < 3; i++) { + data_size += 4 + strlen(values[i]); // 4 bytes for length + string data + } + + _data = std::make_unique(data_size); + size_t offset = 0; + for (int i = 0; i < 3; i++) { + uint32_t len = strlen(values[i]); + encode_fixed32_le(_data.get() + offset, len); + offset += 4; + memcpy(_data.get() + offset, values[i], len); + offset += len; + } + + _data_slice = Slice(_data.get(), data_size); + + ByteArrayPlainDecoder decoder; + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + // Skip first 2 values + ASSERT_TRUE(decoder.skip_values(2).ok()); + + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = 1; // Total 3 values, skip 2, remaining 1 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + EXPECT_EQ(result_column->get_data_at(0).to_string(), "cherry"); +} + +} // namespace doris::vectorized diff --git 
a/be/test/vec/exec/format/parquet/byte_stream_split_decoder_test.cpp b/be/test/vec/exec/format/parquet/byte_stream_split_decoder_test.cpp new file mode 100644 index 0000000000..0d2e05882f --- /dev/null +++ b/be/test/vec/exec/format/parquet/byte_stream_split_decoder_test.cpp @@ -0,0 +1,395 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "vec/exec/format/parquet/byte_stream_split_decoder.h" + +#include + +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" + +namespace doris::vectorized { + +class ByteStreamSplitDecoderTest : public ::testing::Test { +protected: + void SetUp() override {} + + ByteStreamSplitDecoder _decoder; +}; + +// Test basic decoding functionality for FLOAT type +TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_float) { + // Prepare FLOAT data in stream-split layout: stream j holds byte j of every value + size_t type_length_float = sizeof(float); + size_t num_values_float = 3; + size_t data_size_float = num_values_float * type_length_float; + auto data_float = std::make_unique(data_size_float); + const float values_float[3] = {1.0f, 2.0f, 3.0f}; + for (size_t i = 0; i < num_values_float; i++) { + const uint8_t* bytes = reinterpret_cast(&values_float[i]); + for (size_t j = 0; j < type_length_float; j++) { + data_float[j * num_values_float + i] = bytes[j]; + } + } + Slice data_slice_float(data_float.get(), data_size_float); + + MutableColumnPtr column = ColumnFloat32::create(); + DataTypePtr data_type = std::make_shared(); + + // Set data for FLOAT type + ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok()); + _decoder.set_type_length(type_length_float); + + // Create selection vector without filter, total 3 values + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f); + EXPECT_FLOAT_EQ(result_column->get_data()[1], 
2.0f); + EXPECT_FLOAT_EQ(result_column->get_data()[2], 3.0f); +} + +// Test basic decoding functionality for DOUBLE type +TEST_F(ByteStreamSplitDecoderTest, test_basic_decode_double) { + // Prepare DOUBLE data in stream-split layout: stream j holds byte j of every value + size_t type_length_double = sizeof(double); + size_t num_values_double = 3; + size_t data_size_double = num_values_double * type_length_double; + auto data_double = std::make_unique(data_size_double); + const double values_double[3] = {1.0, 2.0, 3.0}; + for (size_t i = 0; i < num_values_double; i++) { + const uint8_t* bytes = reinterpret_cast(&values_double[i]); + for (size_t j = 0; j < type_length_double; j++) { + data_double[j * num_values_double + i] = bytes[j]; + } + } + Slice data_slice_double(data_double.get(), data_size_double); + + MutableColumnPtr column = ColumnFloat64::create(); + DataTypePtr data_type = std::make_shared(); + + // Set data for DOUBLE type + ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok()); + _decoder.set_type_length(type_length_double); + + // Create selection vector without filter, total 3 values + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_DOUBLE_EQ(result_column->get_data()[0], 1.0); + EXPECT_DOUBLE_EQ(result_column->get_data()[1], 2.0); + EXPECT_DOUBLE_EQ(result_column->get_data()[2], 3.0); +} + +// Test decoding with filter for FLOAT type +TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_float) { + // Prepare FLOAT data in stream-split layout: stream j holds byte j of every value + size_t type_length_float = sizeof(float); + size_t 
num_values_float = 3; + size_t data_size_float = num_values_float * type_length_float; + auto data_float = std::make_unique(data_size_float); + const float values_float[3] = {1.0f, 2.0f, 3.0f}; + for (size_t i = 0; i < num_values_float; i++) { + const uint8_t* bytes = reinterpret_cast(&values_float[i]); + for (size_t j = 0; j < type_length_float; j++) { + data_float[j * num_values_float + i] = bytes[j]; + } + } + Slice data_slice_float(data_float.get(), data_size_float); + + MutableColumnPtr column = ColumnFloat32::create(); + DataTypePtr data_type = std::make_shared(); + + // Set data for FLOAT type + ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok()); + _decoder.set_type_length(type_length_float); + + // Create filter vector [1, 0, 1] + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + EXPECT_FLOAT_EQ(result_column->get_data()[0], 1.0f); + EXPECT_FLOAT_EQ(result_column->get_data()[1], 3.0f); +} + +// Test decoding with filter for DOUBLE type +TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_double) { + // Prepare DOUBLE data in stream-split layout: stream j holds byte j of every value + size_t type_length_double = sizeof(double); + size_t num_values_double = 3; + size_t data_size_double = num_values_double * type_length_double; + auto data_double = std::make_unique(data_size_double); + const double values_double[3] = {1.0, 2.0, 3.0}; + for (size_t i = 0; i < num_values_double; i++) { + const uint8_t* bytes = reinterpret_cast(&values_double[i]); + for (size_t j = 0; j < type_length_double; j++) { + 
data_double[j * num_values_double + i] = bytes[j]; + } + } + Slice data_slice_double(data_double.get(), data_size_double); + + MutableColumnPtr column = ColumnFloat64::create(); + DataTypePtr data_type = std::make_shared(); + + // Set data for DOUBLE type + ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok()); + _decoder.set_type_length(type_length_double); + + // Create filter vector [1, 0, 1] + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + EXPECT_DOUBLE_EQ(result_column->get_data()[0], 1.0); + EXPECT_DOUBLE_EQ(result_column->get_data()[1], 3.0); +} + +// Test decoding with filter and null for FLOAT type +TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_float) { + // Prepare FLOAT data in stream-split layout: stream j holds byte j of every value + size_t type_length_float = sizeof(float); + size_t num_values_float = 2; + size_t data_size_float = num_values_float * type_length_float; + auto data_float = std::make_unique(data_size_float); + const float values_float[2] = {1.0f, 3.0f}; + for (size_t i = 0; i < num_values_float; i++) { + const uint8_t* bytes = reinterpret_cast(&values_float[i]); + for (size_t j = 0; j < type_length_float; j++) { + data_float[j * num_values_float + i] = bytes[j]; + } + } + Slice data_slice_float(data_float.get(), data_size_float); + + MutableColumnPtr column = ColumnFloat32::create(); + DataTypePtr data_type = std::make_shared(); + + // Set data for FLOAT type + ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok()); + 
_decoder.set_type_length(type_length_float); + + // Create filter vector [0, 1, 1] and null vector [0, 1, 0] + size_t num_values = 3; + std::vector run_length_null_map = {1, 1, 1}; // data: [1.0f, null, 3.0f] + std::vector filter_data = {0, 1, 1}; // filtered_data: [null, 3.0f] + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + // Output row 0 is null, so only its null flag is checked; output row 1 must be 3.0f. + // Both are verified by the loop below. + + // Expected values after filtering and null handling + std::vector> expected_values = {std::nullopt, 3.0f}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_FLOAT_EQ(result_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test decoding with filter and null for DOUBLE type +TEST_F(ByteStreamSplitDecoderTest, test_decode_with_filter_and_null_double) { + // Prepare DOUBLE data in stream-split layout: stream j holds byte j of every value + size_t type_length_double = sizeof(double); + size_t num_values_double = 2; + size_t data_size_double = num_values_double * type_length_double; + auto data_double = std::make_unique(data_size_double); + const double values_double[2] = {1.0, 3.0}; + for (size_t i = 0; i < num_values_double; i++) { + const uint8_t* bytes = reinterpret_cast(&values_double[i]); + for (size_t j = 0; j < type_length_double; j++) { + data_double[j * num_values_double + i] = bytes[j]; + } + } + Slice 
data_slice_double(data_double.get(), data_size_double); + + MutableColumnPtr column = ColumnFloat64::create(); + DataTypePtr data_type = std::make_shared(); + + // Set data for DOUBLE type + ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok()); + _decoder.set_type_length(type_length_double); + + // Create filter vector [0, 1, 1] and null vector [0, 1, 0] + size_t num_values = 3; + std::vector run_length_null_map = {1, 1, 1}; // data: [1.0, null, 3.0] + std::vector filter_data = {0, 1, 1}; // filtered_data: [null, 3.0] + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + // Output row 0 is null, so only its null flag is checked; output row 1 must be 3.0. + // Both are verified by the loop below (at double precision). + + // Expected values after filtering and null handling + std::vector> expected_values = {std::nullopt, 3.0}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_DOUBLE_EQ(result_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test skipping values for FLOAT type +TEST_F(ByteStreamSplitDecoderTest, test_skip_value_float) { + // Prepare FLOAT data in stream-split layout: stream j holds byte j of every value + size_t type_length_float = sizeof(float); + size_t num_values_float = 3; + size_t data_size_float = num_values_float * type_length_float; + auto data_float = std::make_unique(data_size_float); + const float values_float[3] = {1.0f, 2.0f, 3.0f}; + for (size_t i = 0; i < num_values_float; 
i++) { + const uint8_t* bytes = reinterpret_cast(&values_float[i]); + for (size_t j = 0; j < type_length_float; j++) { + data_float[j * num_values_float + i] = bytes[j]; + } + } + Slice data_slice_float(data_float.get(), data_size_float); + + MutableColumnPtr column = ColumnFloat32::create(); + DataTypePtr data_type = std::make_shared(); + + // Set data for FLOAT type + ASSERT_TRUE(_decoder.set_data(&data_slice_float).ok()); + _decoder.set_type_length(type_length_float); + + // Skip first 2 values + ASSERT_TRUE(_decoder.skip_values(2).ok()); + + // Create selection vector + size_t num_values = 1; // Total 3 values, skip 2, remaining 1 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_FLOAT_EQ(result_column->get_data()[0], 3.0f); +} + +// Test skipping values for DOUBLE type +TEST_F(ByteStreamSplitDecoderTest, test_skip_value_double) { + // Prepare DOUBLE data in stream-split layout: stream j holds byte j of every value + size_t type_length_double = sizeof(double); + size_t num_values_double = 3; + size_t data_size_double = num_values_double * type_length_double; + auto data_double = std::make_unique(data_size_double); + const double values_double[3] = {1.0, 2.0, 3.0}; + for (size_t i = 0; i < num_values_double; i++) { + const uint8_t* bytes = reinterpret_cast(&values_double[i]); + for (size_t j = 0; j < type_length_double; j++) { + data_double[j * num_values_double + i] = bytes[j]; + } + } + Slice data_slice_double(data_double.get(), data_size_double); + + MutableColumnPtr column = ColumnFloat64::create(); + DataTypePtr data_type = 
std::make_shared(); + + // Set data for DOUBLE type + ASSERT_TRUE(_decoder.set_data(&data_slice_double).ok()); + _decoder.set_type_length(type_length_double); + + // Skip first 2 values + ASSERT_TRUE(_decoder.skip_values(2).ok()); + + // Create selection vector + size_t num_values = 1; // Total 3 values, skip 2, remaining 1 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_DOUBLE_EQ(result_column->get_data()[0], 3.0); +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/delta_bit_pack_decoder_test.cpp b/be/test/vec/exec/format/parquet/delta_bit_pack_decoder_test.cpp new file mode 100644 index 0000000000..9ab631029a --- /dev/null +++ b/be/test/vec/exec/format/parquet/delta_bit_pack_decoder_test.cpp @@ -0,0 +1,265 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/parquet/delta_bit_pack_decoder.h" + +#include + +#include "parquet/encoding.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { + +class DeltaBitPackDecoderTest : public ::testing::Test { +protected: + void SetUp() override { _decoder = std::make_unique>(); } + + std::unique_ptr> _decoder; +}; + +// Test basic decoding functionality +TEST_F(DeltaBitPackDecoderTest, test_basic_decode) { + // Prepare encoded data + std::vector encoded_data = { + // Header: block_size=128, mini_blocks_per_block=4, total_value_count=5, first_value=10 + 0x80, 0x01, 0x04, 0x05, 0x14, + // Block: min_delta=1, bit_width=[0, 0, 0, 0] + 0x02, 0x00, 0x00, 0x00, 0x00 + // MiniBlocks: no data needed for bit_width 0 + }; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector without filter + size_t num_values = 5; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data()[0], 10); + EXPECT_EQ(result_column->get_data()[1], 11); + EXPECT_EQ(result_column->get_data()[2], 12); 
+ EXPECT_EQ(result_column->get_data()[3], 13); + EXPECT_EQ(result_column->get_data()[4], 14); +} + +// Test decoding with filter +TEST_F(DeltaBitPackDecoderTest, test_decode_with_filter) { + // Prepare encoded data + std::vector encoded_data = { + // Header: block_size=128, mini_blocks_per_block=4, total_value_count=5, first_value=10 + 0x80, 0x01, 0x04, 0x05, 0x14, + // Block: min_delta=1, bit_width=[0, 0, 0, 0] + 0x02, 0x00, 0x00, 0x00, 0x00 + // MiniBlocks: no data needed for bit_width 0 + }; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1,0,1,0,1] + size_t num_values = 5; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1, 0, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 3); // 3 values after filtering + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data()[0], 10); + EXPECT_EQ(result_column->get_data()[1], 12); + EXPECT_EQ(result_column->get_data()[2], 14); +} + +// Test decoding with filter and null values +TEST_F(DeltaBitPackDecoderTest, test_decode_with_filter_and_null) { + std::vector encoded_data = { + // Header: block_size=128, mini_blocks_per_block=4, total_value_count=4, first_value=10 + 0x80, 0x01, 0x04, 0x04, 0x14, + // Block: min_delta=1, bit_width=[1, 0, 0, 0] + 0x02, 0x01, 0x00, 0x00, 0x00, + // MiniBlock 0: one bit per delta; bits 0, 1, 0 give values 10, 11, 13, 14 + 0x02, 0x00, 0x00, 0x00}; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + 
ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1,0,1,0,1] and null vector [0,0,1,0,0] + size_t num_values = 5; + std::vector run_length_null_map = {2, 1, 2}; // data: [10 11 null 13 14] + std::vector filter_data = {1, 0, 1, 0, 1}; // filtered_data: [10 null 14] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 3); // 3 values after filtering + auto* result_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {10, std::nullopt, 14}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test skipping values for delta bit pack decoding +TEST_F(DeltaBitPackDecoderTest, test_skip_value) { + // Prepare encoded data + std::vector encoded_data = { + // Header: block_size=128, mini_blocks_per_block=4, total_value_count=8, first_value=10 + 0x80, 0x01, 0x04, 0x08, 0x14, + // Block: min_delta=1, bit_width=[0, 0, 0, 0] + 0x02, 0x00, 0x00, 0x00, 0x00 + // MiniBlocks: no data needed for bit_width 0 + }; + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Skip first 3 values + ASSERT_TRUE(_decoder->skip_values(3).ok()); + + // Create column and data type + MutableColumnPtr 
column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = 5; // Total 8 values, skip 3, remaining 5 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + // Expected values after skipping first 3 values (10,11,12) + std::vector expected_values = {13, 14, 15, 16, 17}; + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data()[i], expected_values[i]) << "Mismatch at value " << i; + } +} + +// Test decoding data generated by arrow +TEST_F(DeltaBitPackDecoderTest, test_data_generated_by_arrow) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::INT32); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {10, 11, 13, 14}; + + // Create encoder + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BINARY_PACKED, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW(encoder->Put(values.data(), static_cast(values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = values.size(); + std::vector 
run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data()[i], values[i]); + } +} + +// Test invalid data case +TEST_F(DeltaBitPackDecoderTest, test_invalid_data) { + // Prepare invalid encoded data + std::vector encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Header only, no block data + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + size_t num_values = 5; + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Decoding should fail due to invalid data + ASSERT_FALSE(_decoder->decode_values(column, data_type, select_vector, false).ok()); +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/delta_byte_array_decoder_test.cpp b/be/test/vec/exec/format/parquet/delta_byte_array_decoder_test.cpp new file mode 100644 index 0000000000..556571fa5a --- /dev/null +++ b/be/test/vec/exec/format/parquet/delta_byte_array_decoder_test.cpp @@ -0,0 +1,588 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/api.h" +#include "parquet/encoding.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/exec/format/parquet/delta_bit_pack_decoder.h" + +namespace doris::vectorized { + +class DeltaByteArrayDecoderTest : public ::testing::Test { +protected: + void SetUp() override { _decoder = std::make_unique(); } + + std::unique_ptr _decoder; +}; + +// Test basic decoding byte array functionality +TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_byte_array) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {"Hello", "World", "Foobar", "ABCDEF"}; + std::vector byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Create encoder + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, 
descr.get()); + + // Put data into encoder + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = values.size(); + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i]); + } +} + +// Test decoding byte array with filter +TEST_F(DeltaByteArrayDecoderTest, test_decode_byte_array_with_filter) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {"Hello", "World", "Foobar", "ABCDEF"}; + std::vector byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Create encoder + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW( + 
encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0] + size_t num_values = values.size(); + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1, 0}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data_at(0).to_string(), "Hello"); + EXPECT_EQ(result_column->get_data_at(1).to_string(), "Foobar"); +} + +// Test decoding byte array with filter and null values +TEST_F(DeltaByteArrayDecoderTest, test_decode_byte_array_with_filter_and_null) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {"Hello", "World", "ABCDEF"}; + std::vector byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Create encoder + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), 
static_cast(byte_array_values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0] and null vector [0, 0, 1, 0] + size_t num_values = 4; + std::vector run_length_null_map = {2, 1, + 1}; // data: ["Hello", "World", null, "ABCDEF"] + std::vector filter_data = {1, 0, 1, 0}; // filtered_data: ["Hello", null] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {"Hello", std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test skipping values for byte array decoding +TEST_F(DeltaByteArrayDecoderTest, test_skip_value_for_byte_array) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare test data + std::vector values = {"Hello", "World", "Foobar", "ABCDEF"}; + std::vector 
byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Encode data + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + auto encoded_buffer = encoder->FlushValues(); + + // Set decoder data + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Skip the first two values + ASSERT_TRUE(_decoder->skip_values(2).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = values.size() - 2; // Skip first two values + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + // Verify decoded results (should start from the third value) + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i + 2]) + << "Mismatch at value " << (i + 2); + } +} + +// Test basic decoding fixed-length byte array functionality +TEST_F(DeltaByteArrayDecoderTest, test_basic_decode_fixed_len_byte_array) { + // Configure DECIMAL type parameters + const int32_t type_length = 16; + int precision = 10; + int scale = 2; + _decoder->set_type_length(type_length); + + // Create ColumnDescriptor + auto node = 
parquet::schema::PrimitiveNode::Make( + "test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, + parquet::ConvertedType::DECIMAL, type_length, precision, scale); + auto descr = std::make_shared(node, 0, 0); + + // Prepare test data + std::vector> test_fixed_len_buffers = { + {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, + 0x61, 0x40}, // Data 1 + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00}, // Data 2 (all zeros) + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF}, // Data 3 (all ones) + {0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, + 0xDE, 0xF0} // Data 4 (random) + }; + + std::vector byte_array_values; + for (const auto& buffer : test_fixed_len_buffers) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(buffer.size()), buffer.data()}); + } + + // Encode data + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + auto encoded_buffer = encoder->FlushValues(); + + // Set decoder data + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = test_fixed_len_buffers.size(); + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, 
false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values * type_length); + auto* result_column = assert_cast(column.get()); + + // Verify decoded results one by one + for (size_t i = 0; i < num_values; ++i) { + for (size_t j = 0; j < type_length; ++j) { + size_t index = i * type_length + j; + EXPECT_EQ(result_column->get_element(index), + static_cast(test_fixed_len_buffers[i][j])) + << "Mismatch at buffer " << i << ", byte " << j; + } + } +} + +// Test decoding fixed-length byte array with filter +TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter) { + // Configure DECIMAL type parameters + const int32_t type_length = 16; + int precision = 10; + int scale = 2; + _decoder->set_type_length(type_length); + + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make( + "test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, + parquet::ConvertedType::DECIMAL, type_length, precision, scale); + auto descr = std::make_shared(node, 0, 0); + + // Prepare test data + std::vector> test_fixed_len_buffers = { + {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, + 0x61, 0x40}, // Data 1 + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00}, // Data 2 (all zeros) + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF}, // Data 3 (all ones) + {0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, + 0xDE, 0xF0} // Data 4 (random) + }; + + std::vector byte_array_values; + for (const auto& buffer : test_fixed_len_buffers) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(buffer.size()), buffer.data()}); + } + + // Encode data + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), 
static_cast(byte_array_values.size()))); + auto encoded_buffer = encoder->FlushValues(); + + // Set decoder data + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter [1, 0, 1, 0] + size_t num_values = test_fixed_len_buffers.size(); + std::vector run_length_null_map(1, num_values); + std::vector filter_data = {1, 0, 1, 0}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2 * type_length); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + + // Verify first value + for (size_t j = 0; j < type_length; ++j) { + EXPECT_EQ(result_column->get_element(j), static_cast(test_fixed_len_buffers[0][j])) + << "Mismatch at buffer 0, byte " << j; + } + + // Verify third value + for (size_t j = 0; j < type_length; ++j) { + size_t index = type_length + j; + EXPECT_EQ(result_column->get_element(index), + static_cast(test_fixed_len_buffers[2][j])) + << "Mismatch at buffer 2, byte " << j; + } +} + +// Test decoding fixed-length byte array with filter and null values +TEST_F(DeltaByteArrayDecoderTest, test_decode_fixed_len_byte_array_with_filter_and_null) { + // Configure DECIMAL type parameters + const int32_t type_length = 16; + int precision = 10; + int scale = 2; + _decoder->set_type_length(type_length); + + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make( + "test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, + parquet::ConvertedType::DECIMAL, type_length, precision, scale); + auto 
descr = std::make_shared(node, 0, 0); + + // Prepare test data + std::vector> test_fixed_len_buffers = { + {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, + 0x61, 0x40}, // Data 1 + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00}, // Data 2 (all zeros) + {0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, + 0xDE, 0xF0} // Data 4 (random) + }; + + std::vector byte_array_values; + for (const auto& buffer : test_fixed_len_buffers) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(buffer.size()), buffer.data()}); + } + + // Encode data + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + auto encoded_buffer = encoder->FlushValues(); + + // Set decoder data + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter [1, 0, 1, 0] and null vector [0, 0, 1, 0] + size_t num_values = 4; + std::vector run_length_null_map = {2, 1, 1}; // Data: [Data 1, Data 2, null, Data 4] + std::vector filter_data = {1, 0, 1, 0}; // Filtered data: [Data 1, null] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2 * type_length); // 2 values after filtering (Data 1 and null) + auto* result_column = assert_cast(column.get()); + + // Expected values after filtering and 
null handling + std::vector>> expected_values; + expected_values.push_back(std::vector {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, + 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, 0x61, + 0x40}); // Data 1 + expected_values.push_back(std::nullopt); // Only filtered values (Data 1 and null) + + // Verify results + size_t filtered_index = 0; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + for (size_t j = 0; j < type_length; ++j) { + size_t index = filtered_index * type_length + j; + EXPECT_EQ(result_column->get_element(index), + static_cast(expected_values[i].value()[j])) + << "Mismatch at filtered value " << i << ", byte " << j; + } + EXPECT_FALSE(null_map[filtered_index]) + << "Expected non-null at filtered position " << filtered_index; + filtered_index++; + } else { + EXPECT_TRUE(null_map[filtered_index]) + << "Expected null at filtered position " << filtered_index; + filtered_index++; + } + } +} + +// Test skipping values for fixed-length byte array decoding +TEST_F(DeltaByteArrayDecoderTest, test_skip_value_for_fixed_len_byte_array) { + // Configure DECIMAL type parameters + const int32_t type_length = 16; + int precision = 10; + int scale = 2; + _decoder->set_type_length(type_length); + + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make( + "test_column", parquet::Repetition::REQUIRED, parquet::Type::FIXED_LEN_BYTE_ARRAY, + parquet::ConvertedType::DECIMAL, type_length, precision, scale); + auto descr = std::make_shared(node, 0, 0); + + // Prepare test data + std::vector> test_fixed_len_buffers = { + {0x1a, 0x05, 0x06, 0x1b, 0x00, 0x00, 0x00, 0x13, 0x1c, 0x00, 0x00, 0x00, 0x00, 0xbc, + 0x61, 0x40}, // Data 1 + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00}, // Data 2 (all zeros) + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF}, // Data 3 (all ones) + {0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, 
0xDE, 0xF0, 0x12, 0x34, 0x56, 0x78, 0x9A, 0xBC, + 0xDE, 0xF0} // Data 4 (random) + }; + + std::vector byte_array_values; + for (const auto& buffer : test_fixed_len_buffers) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(buffer.size()), buffer.data()}); + } + + // Encode data + auto encoder = MakeTypedEncoder(parquet::Encoding::DELTA_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + auto encoded_buffer = encoder->FlushValues(); + + // Set decoder data + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Skip the first two values + ASSERT_TRUE(_decoder->skip_values(2).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = test_fixed_len_buffers.size() - 2; // Skip first two values + std::vector run_length_null_map(1, num_values); + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values * type_length); + auto* result_column = assert_cast(column.get()); + + // Verify decoded results (should start from the third value) + for (size_t i = 0; i < num_values; ++i) { + for (size_t j = 0; j < type_length; ++j) { + size_t index = i * type_length + j; + EXPECT_EQ(result_column->get_element(index), + static_cast(test_fixed_len_buffers[i + 2][j])) + << "Mismatch at buffer " << (i + 2) << ", byte " << j; + } + } +} + +// Test decoding with invalid data +TEST_F(DeltaByteArrayDecoderTest, test_invalid_data) { 
+ // Prepare invalid encoded data + std::vector encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_FALSE(_decoder->set_data(&data_slice).ok()); +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/delta_length_byte_array_decoder_test.cpp b/be/test/vec/exec/format/parquet/delta_length_byte_array_decoder_test.cpp new file mode 100644 index 0000000000..7823709e4b --- /dev/null +++ b/be/test/vec/exec/format/parquet/delta_length_byte_array_decoder_test.cpp @@ -0,0 +1,276 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "parquet/encoding.h" +#include "parquet/schema.h" +#include "parquet/types.h" +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/exec/format/parquet/delta_bit_pack_decoder.h" + +namespace doris::vectorized { + +class DeltaLengthByteArrayDecoderTest : public ::testing::Test { +protected: + void SetUp() override { _decoder = std::make_unique(); } + + std::unique_ptr _decoder; +}; + +// Test basic decoding functionality +TEST_F(DeltaLengthByteArrayDecoderTest, test_basic_decode) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {"Hello", "World", "Foobar", "ABCDEF"}; + std::vector byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Create encoder + auto encoder = + MakeTypedEncoder(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = values.size(); + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + 
select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), values[i]); + } +} + +// Test decoding with filter +TEST_F(DeltaLengthByteArrayDecoderTest, test_decode_with_filter) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {"Hello", "World", "Foobar", "ABCDEF"}; + std::vector byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Create encoder + auto encoder = + MakeTypedEncoder(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0] + size_t num_values = values.size(); + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1, 0}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform 
decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + EXPECT_EQ(result_column->get_data_at(0).to_string(), "Hello"); + EXPECT_EQ(result_column->get_data_at(1).to_string(), "Foobar"); +} + +// Test decoding with filter and null values +TEST_F(DeltaLengthByteArrayDecoderTest, test_decode_with_filter_and_null) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {"Hello", "World", "ABCDEF"}; + std::vector byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Create encoder + auto encoder = + MakeTypedEncoder(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW( + encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1, 0, 1, 0] and null vector [0, 0, 1, 0] + size_t num_values = 4; + std::vector run_length_null_map = {2, 1, + 1}; // data: ["Hello", "World", null, "ABCDEF"] + std::vector filter_data = {1, 0, 1, 0}; // filtered_data: ["Hello", null] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + 
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {"Hello", std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test decoding with invalid data +TEST_F(DeltaLengthByteArrayDecoderTest, test_invalid_data) { + // Prepare invalid encoded data + std::vector encoded_data = {0x80, 0x01, 0x04, 0x05, 0x14}; // Incomplete data + Slice data_slice(reinterpret_cast(encoded_data.data()), encoded_data.size()); + ASSERT_FALSE(_decoder->set_data(&data_slice).ok()); +} + +// Test skipping values for delta length byte array decoding +TEST_F(DeltaLengthByteArrayDecoderTest, test_skip_value) { + // Create ColumnDescriptor + auto node = parquet::schema::PrimitiveNode::Make("test_column", parquet::Repetition::REQUIRED, + parquet::Type::BYTE_ARRAY); + auto descr = std::make_shared(node, 0, 0); + + // Prepare original data + std::vector values = {"Hello", "World", "Foobar", "ABCDEF", "Test", "Skip"}; + std::vector byte_array_values; + for (const auto& value : values) { + byte_array_values.emplace_back( + parquet::ByteArray {static_cast(value.size()), + reinterpret_cast(value.data())}); + } + + // Create encoder + auto encoder = + MakeTypedEncoder(parquet::Encoding::DELTA_LENGTH_BYTE_ARRAY, + /*use_dictionary=*/false, descr.get()); + + // Put data into encoder + ASSERT_NO_THROW( + 
encoder->Put(byte_array_values.data(), static_cast(byte_array_values.size()))); + + // Get encoded data + auto encoded_buffer = encoder->FlushValues(); + Slice data_slice(encoded_buffer->data(), encoded_buffer->size()); + ASSERT_TRUE(_decoder->set_data(&data_slice).ok()); + + // Skip first 3 values + ASSERT_TRUE(_decoder->skip_values(3).ok()); + + // Create column and data type + MutableColumnPtr column = ColumnString::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector + size_t num_values = values.size() - 3; // Total 6 values, skip 3, remaining 3 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder->decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast(column.get()); + + // Expected values after skipping first 3 values ("Hello", "World", "Foobar") + std::vector expected_values = {"ABCDEF", "Test", "Skip"}; + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(result_column->get_data_at(i).to_string(), expected_values[i]) + << "Mismatch at value " << i; + } +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp b/be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp new file mode 100644 index 0000000000..7dfcb12ce7 --- /dev/null +++ b/be/test/vec/exec/format/parquet/fix_length_dict_decoder_test.cpp @@ -0,0 +1,538 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/parquet/fix_length_dict_decoder.hpp" + +#include + +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" + +namespace doris::vectorized { + +class FixLengthDictDecoderTest : public ::testing::Test { +protected: + void SetUp() override { + // Prepare test data: create a dictionary with fixed-length strings + _type_length = 6; // Each string has length 6 + size_t dict_size = 3; + size_t dict_data_size = dict_size * _type_length; + + auto dict_data = std::make_unique(dict_data_size); + const char* values[3] = {"apple ", "banana", "cherry"}; // Dictionary values + for (int i = 0; i < 3; i++) { + memcpy(dict_data.get() + i * _type_length, values[i], _type_length); + } + + _decoder.set_type_length(_type_length); + ASSERT_TRUE(_decoder.set_dict(dict_data, dict_data_size, dict_size).ok()); + } + + FixLengthDictDecoder _decoder; + size_t _type_length; +}; + +// Test basic decoding functionality +TEST_F(FixLengthDictDecoderTest, test_basic_decode) { + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + 
+ // Create selection vector without filter, total 7 values (4 repeated + 3 literal) + size_t num_values = 7; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values * _type_length); + auto* result_column = assert_cast(column.get()); + + // Split decoded results into strings based on _type_length + std::vector decoded_strings; + const auto& data = result_column->get_data(); + for (size_t i = 0; i < num_values; ++i) { + std::string str; + for (size_t j = 0; j < _type_length; ++j) { + str.push_back(static_cast(data[i * _type_length + j])); + } + decoded_strings.push_back(str); + } + + // Verify first 4 repeated values (dict index 0 -> value "apple ") + for (int i = 0; i < 4; i++) { + EXPECT_EQ(decoded_strings[i], "apple "); + } + + // Verify last 3 literal values + EXPECT_EQ(decoded_strings[4], "banana"); + EXPECT_EQ(decoded_strings[5], "cherry"); + EXPECT_EQ(decoded_strings[6], "banana"); +} + +// Test decoding with filter +TEST_F(FixLengthDictDecoderTest, test_decode_with_filter) { + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + ; + + // Create filter vector [1,0,1,0,1,1,1] + size_t num_values = 7; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; + ColumnSelectVector 
select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5 * _type_length); // 5 values after filtering + auto* result_column = assert_cast(column.get()); + + // Split decoded results into strings based on _type_length + std::vector decoded_strings; + const auto& data = result_column->get_data(); + for (size_t i = 0; i < 5; ++i) { + std::string str; + for (size_t j = 0; j < _type_length; ++j) { + str.push_back(static_cast(data[i * _type_length + j])); + } + decoded_strings.push_back(str); + } + + // Verify filtered values + EXPECT_EQ(decoded_strings[0], "apple "); + EXPECT_EQ(decoded_strings[1], "apple "); + EXPECT_EQ(decoded_strings[2], "banana"); + EXPECT_EQ(decoded_strings[3], "cherry"); + EXPECT_EQ(decoded_strings[4], "banana"); +} + +// Test decoding with filter and null +TEST_F(FixLengthDictDecoderTest, test_decode_with_filter_and_null) { + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2] + std::vector rle_data = {2, 8, 0, 3, 0b00000010, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1] + size_t num_values = 7; + std::vector run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null] + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + 
ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5 * _type_length); // 5 values after filtering + auto* result_column = assert_cast(column.get()); + + // Split decoded results into strings based on _type_length + std::vector decoded_strings; + const auto& data = result_column->get_data(); + for (size_t i = 0; i < 5; ++i) { + std::string str; + for (size_t j = 0; j < _type_length; ++j) { + str.push_back(static_cast(data[i * _type_length + j])); + } + decoded_strings.push_back(str); + } + + // Expected values after filtering and null handling + std::vector> expected_values = {"apple ", "apple ", std::nullopt, + "cherry", std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(decoded_strings[i], expected_values[i].value()) << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test empty dictionary case +TEST_F(FixLengthDictDecoderTest, test_empty_dict) { + FixLengthDictDecoder empty_decoder; + empty_decoder.set_type_length(sizeof(int32_t)); + + auto dict_data = std::make_unique(0); + ASSERT_TRUE(empty_decoder.set_dict(dict_data, 0, 0).ok()); +} + +// Test decoding with ColumnDictI32 +TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32) { + // Create ColumnDictI32 column + MutableColumnPtr column = ColumnDictI32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create selection vector without filter, total 7 values (4 repeated + 3 literal) + const size_t num_values = 7; + std::vector 
run_length_null_map = {num_values}; // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* dict_column = assert_cast(column.get()); + + // Verify first 4 repeated values (dict index 0 -> value "apple ") + for (int i = 0; i < 4; i++) { + EXPECT_EQ(dict_column->get_data()[i], 0); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(), "apple "); + } + + // Verify last 3 literal values + EXPECT_EQ(dict_column->get_data()[4], 1); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana"); + EXPECT_EQ(dict_column->get_data()[5], 2); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[5]).to_string(), "cherry"); + EXPECT_EQ(dict_column->get_data()[6], 1); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[6]).to_string(), "banana"); +} + +// Test decoding with ColumnDictI32 and filter +TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32_with_filter) { + // Create ColumnDictI32 column + MutableColumnPtr column = ColumnDictI32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] + const size_t num_values = 7; + std::vector run_length_null_map = {num_values}; // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), 
false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Verify filtered values + EXPECT_EQ(dict_column->get_data()[0], 0); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[0]).to_string(), "apple "); + EXPECT_EQ(dict_column->get_data()[1], 0); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[1]).to_string(), "apple "); + EXPECT_EQ(dict_column->get_data()[2], 1); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[2]).to_string(), "banana"); + EXPECT_EQ(dict_column->get_data()[3], 2); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[3]).to_string(), "cherry"); + EXPECT_EQ(dict_column->get_data()[4], 1); + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[4]).to_string(), "banana"); +} + +// Test decoding with ColumnDictI32 with filter and null +TEST_F(FixLengthDictDecoderTest, test_decode_with_column_dict_i32_with_filter_and_null) { + // Create ColumnDictI32 column + MutableColumnPtr column = ColumnDictI32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2] + std::vector rle_data = {2, 8, 0, 3, 0b00000010, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1] + const size_t num_values = 7; + std::vector run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null] + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null] + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + 
select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {"apple ", "apple ", std::nullopt, + "cherry", std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(dict_column->get_value(dict_column->get_data()[i]).to_string(), + expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test decoding with ColumnInt32 +TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32) { + // Create ColumnInt32 column + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create selection vector without filter, total 7 values (4 repeated + 3 literal) + const size_t num_values = 7; + std::vector run_length_null_map = {num_values}; // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* dict_column = 
assert_cast(column.get()); + + // Verify first 4 repeated values (dict index 0 -> value "apple ") + for (int i = 0; i < 4; i++) { + EXPECT_EQ(dict_column->get_data()[i], 0); + } + + // Verify last 3 literal values + EXPECT_EQ(dict_column->get_data()[4], 1); + EXPECT_EQ(dict_column->get_data()[5], 2); + EXPECT_EQ(dict_column->get_data()[6], 1); +} + +// Test decoding with ColumnInt32 and filter +TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32_with_filter) { + // Create ColumnInt32 column + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] + const size_t num_values = 7; + std::vector run_length_null_map = {num_values}; // All non-null + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Verify filtered values + EXPECT_EQ(dict_column->get_data()[0], 0); + EXPECT_EQ(dict_column->get_data()[1], 0); + EXPECT_EQ(dict_column->get_data()[2], 1); + EXPECT_EQ(dict_column->get_data()[3], 2); + EXPECT_EQ(dict_column->get_data()[4], 1); +} + +// Test decoding with ColumnInt32 with filter and null +TEST_F(FixLengthDictDecoderTest, test_decode_with_column_int_32_with_filter_and_null) { + // Create ColumnInt32 column + MutableColumnPtr column = ColumnInt32::create(); + DataTypePtr 
data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 2, padded to 8 values, [0 0 0 0 2] + std::vector rle_data = {2, 8, 0, 3, 0b00000010, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Create filter vector [1,0,1,0,1,1,1] and null vector [0,0,0,0,1,0,1] + const size_t num_values = 7; + std::vector run_length_null_map {4, 1, 1, 1}; // data: [0 0 0 0 null 2 null] + std::vector filter_data = {1, 0, 1, 0, 1, 1, 1}; // filtered_data: [0 0 null 2 null] + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, true).ok()); + + // Verify results + ASSERT_EQ(column->size(), 5); // 5 values after filtering + auto* dict_column = assert_cast(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {0, 0, std::nullopt, 2, std::nullopt}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(dict_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test reading dictionary values to column +TEST_F(FixLengthDictDecoderTest, test_read_dict_values_to_column) { + // Create a column to store dictionary values + MutableColumnPtr column = ColumnString::create(); + + // Read dictionary values to column + ASSERT_TRUE(_decoder.read_dict_values_to_column(column).ok()); + + // Verify results + ASSERT_EQ(column->size(), 3); // 3 dictionary items + auto* result_column = assert_cast(column.get()); + + // Get decoded strings directly + std::vector 
decoded_strings; + for (size_t i = 0; i < 3; ++i) { + decoded_strings.push_back(result_column->get_data_at(i).to_string()); + } + + // Verify dictionary values + EXPECT_EQ(decoded_strings[0], "apple "); + EXPECT_EQ(decoded_strings[1], "banana"); + EXPECT_EQ(decoded_strings[2], "cherry"); +} + +// Test convert_dict_column_to_string_column function +TEST_F(FixLengthDictDecoderTest, test_convert_dict_column_to_string_column) { + // Create a ColumnInt32 with some dictionary codes + MutableColumnPtr dict_column = ColumnInt32::create(); + dict_column->insert(0); + dict_column->insert(1); + dict_column->insert(2); + dict_column->insert(1); + + // Convert to string column + MutableColumnPtr string_column = _decoder.convert_dict_column_to_string_column( + assert_cast(dict_column.get())); + + // Verify results + ASSERT_EQ(string_column->size(), 4); + auto* result_column = assert_cast(string_column.get()); + + EXPECT_EQ(result_column->get_data_at(0).to_string(), "apple "); + EXPECT_EQ(result_column->get_data_at(1).to_string(), "banana"); + EXPECT_EQ(result_column->get_data_at(2).to_string(), "cherry"); + EXPECT_EQ(result_column->get_data_at(3).to_string(), "banana"); +} + +// Test skipping values for fixed length dictionary decoding +TEST_F(FixLengthDictDecoderTest, test_skip_value) { + MutableColumnPtr column = ColumnUInt8::create(); + DataTypePtr data_type = std::make_shared(); + + // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1] + std::vector rle_data = {2, 8, 0, 3, 0b00011001, 0}; + + Slice data_slice(reinterpret_cast(rle_data.data()), rle_data.size()); + ASSERT_TRUE(_decoder.set_data(&data_slice).ok()); + + // Skip first 3 values + ASSERT_TRUE(_decoder.skip_values(3).ok()); + + // Create selection vector + size_t num_values = 4; // Total 7 values, skip 3, remaining 4 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + 
select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(_decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values * _type_length); + auto* result_column = assert_cast(column.get()); + + // Split decoded results into strings based on _type_length + std::vector decoded_strings; + const auto& data = result_column->get_data(); + for (size_t i = 0; i < num_values; ++i) { + std::string str; + for (size_t j = 0; j < _type_length; ++j) { + str.push_back(static_cast(data[i * _type_length + j])); + } + decoded_strings.push_back(str); + } + + // Expected values after skipping first 3 values ("apple ", "apple ", "apple ") + std::vector expected_values = {"apple ", "banana", "cherry", "banana"}; + for (size_t i = 0; i < num_values; ++i) { + EXPECT_EQ(decoded_strings[i], expected_values[i]) << "Mismatch at value " << i; + } +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/fix_length_plain_decoder_test.cpp b/be/test/vec/exec/format/parquet/fix_length_plain_decoder_test.cpp new file mode 100644 index 0000000000..c6232e110e --- /dev/null +++ b/be/test/vec/exec/format/parquet/fix_length_plain_decoder_test.cpp @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/parquet/fix_length_plain_decoder.h" + +#include + +#include "util/slice.h" +#include "vec/columns/column_vector.h" +#include "vec/data_types/data_type_number.h" + +namespace doris::vectorized { + +class FixLengthPlainDecoderTest : public ::testing::Test { +protected: + void SetUp() override {} + + std::unique_ptr _data; + Slice _data_slice; + size_t _type_length; +}; + +// Test basic decoding functionality +TEST_F(FixLengthPlainDecoderTest, test_basic_decode) { + // Prepare test data: create fixed-length integer values + int32_t values[3] = {123, 456, 789}; + size_t data_size = sizeof(values); + + _data = std::make_unique(data_size); + memcpy(_data.get(), values, data_size); + + _data_slice = Slice(_data.get(), data_size); + _type_length = sizeof(int32_t); + + FixLengthPlainDecoder decoder; + decoder.set_type_length(_type_length); + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnVector::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection vector without filter + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* 
result_column = assert_cast*>(column.get()); + + EXPECT_EQ(result_column->get_data()[0], 123); + EXPECT_EQ(result_column->get_data()[1], 456); + EXPECT_EQ(result_column->get_data()[2], 789); +} + +// Test decoding with filter +TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter) { + // Prepare test data: create fixed-length integer values + int32_t values[3] = {123, 456, 789}; + size_t data_size = sizeof(values); + + _data = std::make_unique(data_size); + memcpy(_data.get(), values, data_size); + + _data_slice = Slice(_data.get(), data_size); + _type_length = sizeof(int32_t); + + FixLengthPlainDecoder decoder; + decoder.set_type_length(_type_length); + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnVector::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1,0,1] + size_t num_values = 3; + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data = {1, 0, 1}; + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast*>(column.get()); + + EXPECT_EQ(result_column->get_data()[0], 123); + EXPECT_EQ(result_column->get_data()[1], 789); +} + +// Test decoding with filter and null +TEST_F(FixLengthPlainDecoderTest, test_decode_with_filter_and_null) { + // Prepare test data: create fixed-length integer values + int32_t values[2] = {123, 789}; + size_t data_size = sizeof(values); + + _data = std::make_unique(data_size); + memcpy(_data.get(), values, data_size); + + _data_slice = Slice(_data.get(), data_size); + _type_length = sizeof(int32_t); + + FixLengthPlainDecoder decoder; + decoder.set_type_length(_type_length); 
+ ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + MutableColumnPtr column = ColumnVector::create(); + DataTypePtr data_type = std::make_shared(); + + // Create filter vector [1,0,1] and null vector [0,1,0] + size_t num_values = 3; + std::vector run_length_null_map = {1, 1, 1}; // data: [123, null, 789] + std::vector filter_data = {1, 0, 1}; // filtered_data: [123, 789] + + ColumnSelectVector select_vector; + NullMap null_map; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, &null_map); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), 2); // 2 values after filtering + auto* result_column = assert_cast*>(column.get()); + + // Expected values after filtering and null handling + std::vector> expected_values = {123, 789}; + for (size_t i = 0; i < expected_values.size(); ++i) { + if (expected_values[i].has_value()) { + EXPECT_EQ(result_column->get_data()[i], expected_values[i].value()) + << "Mismatch at value " << i; + EXPECT_FALSE(null_map[i]) << "Expected non-null at position " << i; + } else { + EXPECT_TRUE(null_map[i]) << "Expected null at position " << i; + } + } +} + +// Test skipping values +TEST_F(FixLengthPlainDecoderTest, test_skip_value) { + // Prepare test data: create fixed-length integer values + int32_t values[3] = {123, 456, 789}; + size_t data_size = sizeof(values); + + _data = std::make_unique(data_size); + memcpy(_data.get(), values, data_size); + + _data_slice = Slice(_data.get(), data_size); + _type_length = sizeof(int32_t); + + FixLengthPlainDecoder decoder; + decoder.set_type_length(_type_length); + ASSERT_TRUE(decoder.set_data(&_data_slice).ok()); + + // Skip first 2 values + ASSERT_TRUE(decoder.skip_values(2).ok()); + + MutableColumnPtr column = ColumnVector::create(); + DataTypePtr data_type = std::make_shared(); + + // Create selection 
vector + size_t num_values = 1; // Total 3 values, skip 2, remaining 1 + std::vector run_length_null_map(1, num_values); // All non-null + std::vector filter_data(num_values, 1); + ColumnSelectVector select_vector; + select_vector.build(filter_data.data(), filter_data.size(), false); + select_vector.set_run_length_null_map(run_length_null_map, num_values, nullptr); + + // Perform decoding + ASSERT_TRUE(decoder.decode_values(column, data_type, select_vector, false).ok()); + + // Verify results + ASSERT_EQ(column->size(), num_values); + auto* result_column = assert_cast*>(column.get()); + + EXPECT_EQ(result_column->get_data()[0], 789); +} + +} // namespace doris::vectorized diff --git a/be/test/vec/exec/format/parquet/level_decoder_test.cpp b/be/test/vec/exec/format/parquet/level_decoder_test.cpp new file mode 100644 index 0000000000..d9385db734 --- /dev/null +++ b/be/test/vec/exec/format/parquet/level_decoder_test.cpp @@ -0,0 +1,225 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "vec/exec/format/parquet/level_decoder.h"
+
+#include <gtest/gtest.h>
+
+#include "parquet/encoding.h"
+#include "parquet/schema.h"
+#include "parquet/types.h"
+#include "util/slice.h"
+
+namespace doris::vectorized {
+
+class LevelDecoderTest : public ::testing::Test {
+protected:
+    void SetUp() override { _decoder = std::make_unique<LevelDecoder>(); }
+
+    std::unique_ptr<LevelDecoder> _decoder;
+};
+
+// Test basic RLE level decoding for data page v1
+TEST_F(LevelDecoderTest, test_rle_decode_v1) {
+    // Prepare RLE encoded data
+    // RLE encoded data: 4 zeros followed by 1, 2, 1 [0 0 0 0 1 2 1]
+    std::vector<uint8_t> rle_data = {
+            0x04, 0x00, 0x00, 0x00, // RLE length (4 bytes)
+            8, 0, 3, 0b00011001     // RLE data
+    };
+    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+
+    // Initialize decoder
+    ASSERT_TRUE(_decoder->init(&data_slice, tparquet::Encoding::RLE, 2, 7).ok());
+
+    // Decode levels
+    level_t levels[7];
+    size_t num_levels = _decoder->get_levels(levels, 7);
+
+    // Verify results
+    ASSERT_EQ(num_levels, 7);
+    EXPECT_EQ(levels[0], 0);
+    EXPECT_EQ(levels[1], 0);
+    EXPECT_EQ(levels[2], 0);
+    EXPECT_EQ(levels[3], 0);
+    EXPECT_EQ(levels[4], 1);
+    EXPECT_EQ(levels[5], 2);
+    EXPECT_EQ(levels[6], 1);
+}
+
+// Test basic BIT-PACKED level decoding for data page v1
+TEST_F(LevelDecoderTest, test_bit_packed_decode_v1) {
+    // Prepare BIT-PACKED encoded data
+    // [1 2 1]
+    std::vector<uint8_t> bit_packed_data = {0b00011001};
+    Slice data_slice(reinterpret_cast<char*>(bit_packed_data.data()), bit_packed_data.size());
+
+    // Initialize decoder
+    ASSERT_TRUE(_decoder->init(&data_slice, tparquet::Encoding::BIT_PACKED, 2, 3).ok());
+
+    // Decode levels
+    level_t levels[3];
+    size_t num_levels = _decoder->get_levels(levels, 3);
+
+    // Verify results
+    ASSERT_EQ(num_levels, 3);
+    EXPECT_EQ(levels[0], 1);
+    EXPECT_EQ(levels[1], 2);
+    EXPECT_EQ(levels[2], 1);
+}
+
+// Test RLE level decoding for data page v2
+TEST_F(LevelDecoderTest, test_rle_decode_v2) {
+    // Prepare RLE encoded data
+    // RLE encoded data: 4 zeros followed by 1, 2, 1, padded to 8 values, [0 0 0 0 1 2 1]
+    std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
+    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+
+    // Initialize decoder
+    ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
+
+    // Decode levels
+    level_t levels[7];
+    size_t num_levels = _decoder->get_levels(levels, 7);
+
+    // Verify results
+    ASSERT_EQ(num_levels, 7);
+    EXPECT_EQ(levels[0], 0);
+    EXPECT_EQ(levels[1], 0);
+    EXPECT_EQ(levels[2], 0);
+    EXPECT_EQ(levels[3], 0);
+    EXPECT_EQ(levels[4], 1);
+    EXPECT_EQ(levels[5], 2);
+    EXPECT_EQ(levels[6], 1);
+}
+
+// Test invalid RLE data for data page v1
+TEST_F(LevelDecoderTest, test_invalid_rle_data_v1) {
+    // Prepare invalid RLE data: the length prefix says 4 bytes but only 3 follow
+    std::vector<uint8_t> rle_data = {0x04, 0x00, 0x00, 0x00, // RLE length (4 bytes)
+                                     8,    0,    3};
+    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+
+    // Initialize decoder should fail
+    ASSERT_FALSE(_decoder->init(&data_slice, tparquet::Encoding::RLE, 1, 8).ok());
+}
+
+// TODO: Currently not working, so commented out.
+// Test invalid RLE data for data page v2
+//TEST_F(LevelDecoderTest, test_invalid_rle_data_v2) {
+//    // Prepare invalid RLE data
+//    std::vector<uint8_t> rle_data = {8, 0, 3};
+//    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+//
+//    // NOTE(review): truncated data, yet init_v2 is asserted to succeed - confirm intent
+//    ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
+//
+//    // Decode levels
+//    level_t levels[7];
+//    size_t num_levels = _decoder->get_levels(levels, 7);
+//
+//    // Verify results
+//    ASSERT_EQ(num_levels, 7);
+//}
+
+// Test unsupported encoding
+TEST_F(LevelDecoderTest, test_unsupported_encoding) {
+    // Prepare dummy data
+    std::vector<uint8_t> dummy_data = {0x00};
+    Slice data_slice(reinterpret_cast<char*>(dummy_data.data()), dummy_data.size());
+
+    // Initialize decoder with unsupported encoding should fail
+    ASSERT_FALSE(_decoder->init(&data_slice, tparquet::Encoding::PLAIN, 1, 8).ok());
+}
+
+// Test has_levels() function
+TEST_F(LevelDecoderTest, test_has_levels) {
+    // Initially, there should be no levels
+    EXPECT_FALSE(_decoder->has_levels());
+
+    // Prepare RLE encoded data
+    std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
+    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+
+    // Initialize decoder with valid data
+    ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
+
+    // Now there should be levels
+    EXPECT_TRUE(_decoder->has_levels());
+}
+
+// Test get_next() function
+TEST_F(LevelDecoderTest, test_get_next) {
+    // Prepare RLE encoded data
+    std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
+    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+
+    // Initialize decoder
+    ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
+
+    // Verify the first level
+    EXPECT_EQ(_decoder->get_next(), 0);
+
+    // Verify the next level
+    EXPECT_EQ(_decoder->get_next(), 0);
+}
+
+// Test rewind_one() function
+TEST_F(LevelDecoderTest, test_rewind_one) {
+    // Prepare RLE encoded data
+    std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
+    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+
+    // Initialize decoder
+    ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
+
+    // Get the first level
+    level_t first_level = _decoder->get_next();
+
+    // Get the second level
+    level_t second_level = _decoder->get_next();
+
+    // Rewind one level
+    _decoder->rewind_one();
+
+    // Verify that we get the second level again
+    EXPECT_EQ(_decoder->get_next(), second_level);
+
+    // Rewind one more level
+    _decoder->rewind_one();
+
+    // rewind_one() steps back a single level, so the second level is read once more
+    EXPECT_EQ(_decoder->get_next(), second_level);
+}
+
+// Test rle_decoder() function
+TEST_F(LevelDecoderTest, test_rle_decoder) {
+    // Prepare RLE encoded data
+    std::vector<uint8_t> rle_data = {8, 0, 3, 0b00011001, 0};
+    Slice data_slice(reinterpret_cast<char*>(rle_data.data()), rle_data.size());
+
+    // Initialize decoder
+    ASSERT_TRUE(_decoder->init_v2(data_slice, 2, 7).ok());
+
+    // Get the RLE decoder
+    const RleDecoder<level_t>& rle_decoder = _decoder->rle_decoder();
+
+    // Smoke check only: the address of a reference can never be null
+    EXPECT_NE(&rle_decoder, nullptr);
+}
+
+} // namespace doris::vectorized
diff --git a/be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp b/be/test/vec/exec/format/parquet/parquet_corrupt_statistics_test.cpp
similarity index 100%
rename from be/test/vec/exec/parquet/parquet_corrupt_statistics_test.cpp
rename to be/test/vec/exec/format/parquet/parquet_corrupt_statistics_test.cpp
diff --git a/be/test/vec/exec/parquet/parquet_reader_test.cpp b/be/test/vec/exec/format/parquet/parquet_reader_test.cpp
similarity index 100%
rename from be/test/vec/exec/parquet/parquet_reader_test.cpp
rename to be/test/vec/exec/format/parquet/parquet_reader_test.cpp
diff --git a/be/test/vec/exec/parquet/parquet_statistics_test.cpp b/be/test/vec/exec/format/parquet/parquet_statistics_test.cpp
similarity index 100%
rename from be/test/vec/exec/parquet/parquet_statistics_test.cpp
rename to be/test/vec/exec/format/parquet/parquet_statistics_test.cpp
diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
similarity index 100%
rename from be/test/vec/exec/parquet/parquet_thrift_test.cpp
rename to be/test/vec/exec/format/parquet/parquet_thrift_test.cpp
diff --git a/be/test/vec/exec/parquet/parquet_version_test.cpp b/be/test/vec/exec/format/parquet/parquet_version_test.cpp
similarity index 100%
rename from be/test/vec/exec/parquet/parquet_version_test.cpp
rename to be/test/vec/exec/format/parquet/parquet_version_test.cpp