diff --git a/be/src/vec/common/cow.h b/be/src/vec/common/cow.h index d3ab0ebd68..5cd701e6ba 100644 --- a/be/src/vec/common/cow.h +++ b/be/src/vec/common/cow.h @@ -410,4 +410,4 @@ protected: MutablePtr shallow_mutate() const { return MutablePtr(static_cast(Base::shallow_mutate().get())); } -}; +}; \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp index 6f5f36a33a..b6a614831a 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.cpp @@ -125,56 +125,35 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: { - size_t dict_index = 0; + size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - string_values.emplace_back(_dict_items[_indexes[dict_index++]]); - } - doris_column->insert_many_strings_overflow(&string_values[0], run_length, - _max_value_length); - break; - } - case ColumnSelectVector::NULL_DATA: { - doris_column->insert_many_defaults(run_length); - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + string_values.emplace_back(_dict_items[_indexes[dict_index++]]); } + doris_column->insert_many_strings_overflow(&string_values[0], run_length, + _max_value_length); + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + dict_index += run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } } - return Status::OK(); } - case TypeIndex::Decimal32: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal64: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128I: - return _decode_binary_decimal(doris_column, data_type, select_vector); - // TODO: decimal256 - default: - break; - } - return Status::InvalidArgument( - "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", - getTypeName(logical_type)); + return Status::OK(); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h index 2f90ada428..0267cf17f7 100644 --- a/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_array_dict_decoder.h @@ -66,97 +66,10 @@ public: MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override; protected: - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - // For dictionary encoding std::vector _dict_items; std::vector _dict_data; size_t _max_value_length; std::unordered_map _dict_value_to_code; - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); }; - -template -Status ByteArrayDictDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } -} - -template -Status ByteArrayDictDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - StringRef& slice = _dict_items[_indexes[dict_index++]]; - char* buf_start = const_cast(slice.data); - uint32_t length = (uint32_t)slice.size; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType value = 0; - memcpy(reinterpret_cast(&value), buf_start, length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - length) * 8); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp index e91f9f1db9..4dde378dc8 100644 --- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.cpp @@ -56,74 +56,53 @@ template Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) { - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: { - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = decode_fixed32_le( - reinterpret_cast(_data->data) + _offset); - _offset += 4; - if (UNLIKELY(_offset + length) > _data->size) { - return Status::IOError("Can't read enough bytes in plain decoder"); - } - string_values.emplace_back(_data->data + _offset, length); - _offset += length; + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + if (UNLIKELY(_offset + 4 > _data->size)) { + return Status::IOError("Can't read byte array length from plain decoder"); } - doris_column->insert_many_strings(&string_values[0], run_length); - break; - } - case ColumnSelectVector::NULL_DATA: { - doris_column->insert_many_defaults(run_length); - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - for (int i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = decode_fixed32_le( - reinterpret_cast(_data->data) + _offset); - _offset += 4; - if (UNLIKELY(_offset + length) > _data->size) { - return Status::IOError("Can't read enough bytes in plain decoder"); - } - _offset += length; + uint32_t length = + decode_fixed32_le(reinterpret_cast(_data->data) + _offset); + _offset += 4; + if (UNLIKELY(_offset + length) > _data->size) { + return Status::IOError("Can't read enough bytes in plain decoder"); } - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } + string_values.emplace_back(_data->data + _offset, length); + _offset += length; } + doris_column->insert_many_strings(&string_values[0], run_length); + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + for (int i = 0; i < run_length; ++i) { + if (UNLIKELY(_offset + 4 > _data->size)) { + return Status::IOError("Can't read byte array length from plain decoder"); + } + uint32_t length = + decode_fixed32_le(reinterpret_cast(_data->data) + _offset); + _offset += 4; + if (UNLIKELY(_offset + length) > _data->size) { + return Status::IOError("Can't read enough bytes in plain decoder"); + } + _offset += length; + } + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } } - return Status::OK(); } - case TypeIndex::Decimal32: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal64: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128: - return _decode_binary_decimal(doris_column, data_type, select_vector); - case TypeIndex::Decimal128I: - return _decode_binary_decimal(doris_column, data_type, select_vector); - // TODO: decimal256 - default: - break; - } - return Status::InvalidArgument( - "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", - getTypeName(logical_type)); + return Status::OK(); } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h index 5d5d23db60..5fb8a9622c 100644 --- a/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/byte_array_plain_decoder.h @@ -56,97 +56,5 @@ public: ColumnSelectVector& select_vector, bool is_dict_filter); Status skip_values(size_t num_values) override; - -protected: - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); }; - -template -Status ByteArrayPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } else { - return _decode_binary_decimal_internal( - doris_column, data_type, select_vector); - } -} - -template -Status ByteArrayPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - if (UNLIKELY(_offset + 4 > _data->size)) { - return Status::IOError("Can't read byte array length from plain decoder"); - } - uint32_t length = - decode_fixed32_le(reinterpret_cast(_data->data) + _offset); - _offset += 4; - char* buf_start = _data->data + _offset; - _offset += length; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType value = 0; - memcpy(reinterpret_cast(&value), buf_start, length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - length) * 8); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/decoder.cpp b/be/src/vec/exec/format/parquet/decoder.cpp index 0a15817609..952d226af2 100644 --- a/be/src/vec/exec/format/parquet/decoder.cpp +++ b/be/src/vec/exec/format/parquet/decoder.cpp @@ -31,8 +31,6 @@ namespace doris::vectorized { -const cctz::time_zone DecodeParams::utc0 = cctz::utc_time_zone(); - Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type encoding, std::unique_ptr& decoder) { switch (encoding) { @@ -45,17 +43,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type decoder.reset(new ByteArrayPlainDecoder()); break; case tparquet::Type::INT32: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::INT64: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::INT96: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::FLOAT: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::DOUBLE: - [[fallthrough]]; + decoder.reset(new FixLengthPlainDecoder()); + break; case tparquet::Type::FIXED_LEN_BYTE_ARRAY: - decoder.reset(new FixLengthPlainDecoder(type)); + decoder.reset(new FixLengthPlainDecoder()); break; default: return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder", @@ -70,22 +73,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type decoder.reset(new ByteArrayDictDecoder()); break; case tparquet::Type::INT32: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::INT64: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::INT96: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::FLOAT: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::DOUBLE: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; case tparquet::Type::FIXED_LEN_BYTE_ARRAY: - decoder.reset(new FixLengthDictDecoder(type)); + decoder.reset(new FixLengthDictDecoder()); break; default: return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder", @@ -106,10 +109,10 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type // Supports only INT32 and INT64. switch (type) { case tparquet::Type::INT32: - decoder.reset(new DeltaBitPackDecoder(type)); + decoder.reset(new DeltaBitPackDecoder()); break; case tparquet::Type::INT64: - decoder.reset(new DeltaBitPackDecoder(type)); + decoder.reset(new DeltaBitPackDecoder()); break; default: return Status::InternalError("DELTA_BINARY_PACKED only supports INT32 and INT64"); @@ -118,7 +121,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type case tparquet::Encoding::DELTA_BYTE_ARRAY: switch (type) { case tparquet::Type::BYTE_ARRAY: - decoder.reset(new DeltaByteArrayDecoder(type)); + decoder.reset(new DeltaByteArrayDecoder()); break; default: return Status::InternalError("DELTA_BYTE_ARRAY only supports BYTE_ARRAY."); @@ -127,7 +130,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type case tparquet::Encoding::DELTA_LENGTH_BYTE_ARRAY: switch (type) { case tparquet::Type::FIXED_LEN_BYTE_ARRAY: - decoder.reset(new DeltaLengthByteArrayDecoder(type)); + decoder.reset(new DeltaLengthByteArrayDecoder()); break; default: return Status::InternalError( @@ -141,47 +144,4 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type return Status::OK(); } -void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) { - _field_schema = field_schema; - if (_decode_params == nullptr) { - _decode_params.reset(new DecodeParams()); - } - if (ctz != nullptr) { - _decode_params->ctz = ctz; - } - const auto& schema = field_schema->parquet_schema; - if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) { - const auto& timestamp_info = schema.logicalType.TIMESTAMP; - if (!timestamp_info.isAdjustedToUTC) { - // should set timezone to utc+0 - _decode_params->ctz = const_cast(&_decode_params->utc0); - } - const auto& time_unit = timestamp_info.unit; - if (time_unit.__isset.MILLIS) { - _decode_params->second_mask = 1000; - _decode_params->scale_to_nano_factor = 1000000; - } else if (time_unit.__isset.MICROS) { - _decode_params->second_mask = 1000000; - _decode_params->scale_to_nano_factor = 1000; - } else if (time_unit.__isset.NANOS) { - _decode_params->second_mask = 1000000000; - _decode_params->scale_to_nano_factor = 1; - } - } else if (schema.__isset.converted_type) { - const auto& converted_type = schema.converted_type; - if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) { - _decode_params->second_mask = 1000; - _decode_params->scale_to_nano_factor = 1000000; - } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) { - _decode_params->second_mask = 1000000; - _decode_params->scale_to_nano_factor = 1000; - } - } - - if (_decode_params->ctz) { - VecDateTimeValue t; - t.from_unixtime(0, *_decode_params->ctz); - _decode_params->offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1. - } -} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/decoder.h b/be/src/vec/exec/format/parquet/decoder.h index acd9965bad..4e56dea3d1 100644 --- a/be/src/vec/exec/format/parquet/decoder.h +++ b/be/src/vec/exec/format/parquet/decoder.h @@ -54,29 +54,6 @@ class ColumnString; namespace doris::vectorized { -#define FOR_LOGICAL_NUMERIC_TYPES(M) \ - M(TypeIndex::Int8, Int8, Int32) \ - M(TypeIndex::UInt8, UInt8, Int32) \ - M(TypeIndex::Int16, Int16, Int32) \ - M(TypeIndex::UInt16, UInt16, Int32) \ - M(TypeIndex::Int32, Int32, Int32) \ - M(TypeIndex::UInt32, UInt32, Int32) \ - M(TypeIndex::Int64, Int64, Int64) \ - M(TypeIndex::UInt64, UInt64, Int64) \ - M(TypeIndex::Float32, Float32, Float32) \ - M(TypeIndex::Float64, Float64, Float64) - -struct DecodeParams { - // schema.logicalType.TIMESTAMP.isAdjustedToUTC == false - static const cctz::time_zone utc0; - // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone - cctz::time_zone* ctz = nullptr; - int32_t offset_days = 0; - int64_t second_mask = 1; - int64_t scale_to_nano_factor = 1; - DecimalScaleParams decimal_scale; -}; - class Decoder { public: Decoder() = default; @@ -94,11 +71,6 @@ public: _offset = 0; } - void init(FieldSchema* field_schema, cctz::time_zone* ctz); - - template - void init_decimal_converter(DataTypePtr& data_type); - // Write the decoded values batch to doris's column virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) = 0; @@ -126,34 +98,8 @@ protected: int32_t _type_length; Slice* _data = nullptr; uint32_t _offset = 0; - FieldSchema* _field_schema = nullptr; - std::unique_ptr _decode_params = nullptr; }; -template -void Decoder::init_decimal_converter(DataTypePtr& data_type) { - if (_decode_params == nullptr || _field_schema == nullptr || - _decode_params->decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { - return; - } - auto scale = _field_schema->parquet_schema.scale; - auto* decimal_type = reinterpret_cast>*>( - const_cast(remove_nullable(data_type).get())); - auto dest_scale = decimal_type->get_scale(); - if (dest_scale > scale) { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; - _decode_params->decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor(dest_scale - scale); - } else if (dest_scale < scale) { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; - _decode_params->decimal_scale.scale_factor = - DecimalScaleParams::get_scale_factor(scale - dest_scale); - } else { - _decode_params->decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; - _decode_params->decimal_scale.scale_factor = 1; - } -} - class BaseDictDecoder : public Decoder { public: BaseDictDecoder() = default; diff --git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp deleted file mode 100644 index f734f3012c..0000000000 --- a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.cpp +++ /dev/null @@ -1,283 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "delta_bit_pack_decoder.h" - -#include - -#include -#include - -#include "vec/columns/column.h" -#include "vec/common/arithmetic_overflow.h" -#include "vec/common/string_ref.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_nullable.h" - -namespace doris::vectorized { - -template -Status DeltaBitPackDecoder::_init_header() { - if (!_bit_reader->GetVlqInt(&_values_per_block) || - !_bit_reader->GetVlqInt(&_mini_blocks_per_block) || - !_bit_reader->GetVlqInt(&_total_value_count) || - !_bit_reader->GetZigZagVlqInt(&_last_value)) { - return Status::IOError("Init header eof"); - } - if (_values_per_block == 0) { - return Status::InvalidArgument("Cannot have zero value per block"); - } - if (_values_per_block % 128 != 0) { - return Status::InvalidArgument( - "the number of values in a block must be multiple of 128, but it's " + - std::to_string(_values_per_block)); - } - if (_mini_blocks_per_block == 0) { - return Status::InvalidArgument("Cannot have zero miniblock per block"); - } - _values_per_mini_block = _values_per_block / _mini_blocks_per_block; - if (_values_per_mini_block == 0) { - return Status::InvalidArgument("Cannot have zero value per miniblock"); - } - if (_values_per_mini_block % 32 != 0) { - return Status::InvalidArgument( - "The number of values in a miniblock must be multiple of 32, but it's " + - std::to_string(_values_per_mini_block)); - } - _total_values_remaining = _total_value_count; - _delta_bit_widths.resize(_mini_blocks_per_block); - // init as empty property - _block_initialized = false; - _values_remaining_current_mini_block = 0; - return Status::OK(); -} - -template -Status DeltaBitPackDecoder::_init_block() { - DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF"; - if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) { - return Status::IOError("Init block eof"); - } - - // read the bitwidth of each miniblock - uint8_t* bit_width_data = _delta_bit_widths.data(); - for (uint32_t i = 0; i < _mini_blocks_per_block; ++i) { - if (!_bit_reader->GetAligned(1, bit_width_data + i)) { - return Status::IOError("Decode bit-width EOF"); - } - // Note that non-conformant bitwidth entries are allowed by the Parquet spec - // for extraneous miniblocks in the last block (GH-14923), so we check - // the bitwidths when actually using them (see InitMiniBlock()). - } - _mini_block_idx = 0; - _block_initialized = true; - RETURN_IF_ERROR(_init_mini_block(bit_width_data[0])); - return Status::OK(); -} - -template -Status DeltaBitPackDecoder::_init_mini_block(int bit_width) { - if (PREDICT_FALSE(bit_width > kMaxDeltaBitWidth)) { - return Status::InvalidArgument("delta bit width larger than integer bit width"); - } - _delta_bit_width = bit_width; - _values_remaining_current_mini_block = _values_per_mini_block; - return Status::OK(); -} - -template -Status DeltaBitPackDecoder::_get_internal(T* buffer, int num_values, int* out_num_values) { - num_values = static_cast(std::min(num_values, _total_values_remaining)); - if (num_values == 0) { - *out_num_values = 0; - return Status::OK(); - } - int i = 0; - while (i < num_values) { - if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) { - if (PREDICT_FALSE(!_block_initialized)) { - buffer[i++] = _last_value; - DCHECK_EQ(i, 1); // we're at the beginning of the page - if (i == num_values) { - // When block is uninitialized and i reaches num_values we have two - // different possibilities: - // 1. _total_value_count == 1, which means that the page may have only - // one value (encoded in the header), and we should not initialize - // any block. - // 2. _total_value_count != 1, which means we should initialize the - // incoming block for subsequent reads. - if (_total_value_count != 1) { - RETURN_IF_ERROR(_init_block()); - } - break; - } - RETURN_IF_ERROR(_init_block()); - } else { - ++_mini_block_idx; - if (_mini_block_idx < _mini_blocks_per_block) { - RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx])); - } else { - RETURN_IF_ERROR(_init_block()); - } - } - } - - int values_decode = std::min(_values_remaining_current_mini_block, - static_cast(num_values - i)); - for (int j = 0; j < values_decode; ++j) { - if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) { - return Status::IOError("Get batch EOF"); - } - } - for (int j = 0; j < values_decode; ++j) { - // Addition between min_delta, packed int and last_value should be treated as - // unsigned addition. Overflow is as expected. - buffer[i + j] = static_cast(_min_delta) + static_cast(buffer[i + j]) + - static_cast(_last_value); - _last_value = buffer[i + j]; - } - _values_remaining_current_mini_block -= values_decode; - i += values_decode; - } - _total_values_remaining -= num_values; - - if (PREDICT_FALSE(_total_values_remaining == 0)) { - if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) { - return Status::IOError("Skip padding EOF"); - } - _values_remaining_current_mini_block = 0; - } - *out_num_values = num_values; - return Status::OK(); -} - -void DeltaLengthByteArrayDecoder::_decode_lengths() { - _len_decoder.set_bit_reader(_bit_reader); - // get the number of encoded lengths - int num_length = _len_decoder.valid_values_count(); - _buffered_length.resize(num_length); - - // decode all the lengths. all the lengths are buffered in buffered_length_. - int ret; - Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret); - if (!st.ok()) { - LOG(FATAL) << "Fail to decode delta length, status: " << st; - } - DCHECK_EQ(ret, num_length); - _length_idx = 0; - _num_valid_values = num_length; -} - -Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values, - int* out_num_values) { - // Decode up to `max_values` strings into an internal buffer - // and reference them into `buffer`. - max_values = std::min(max_values, _num_valid_values); - if (max_values == 0) { - *out_num_values = 0; - return Status::OK(); - } - - int32_t data_size = 0; - const int32_t* length_ptr = _buffered_length.data() + _length_idx; - for (int i = 0; i < max_values; ++i) { - int32_t len = length_ptr[i]; - if (PREDICT_FALSE(len < 0)) { - return Status::InvalidArgument("Negative string delta length"); - } - buffer[i].size = len; - if (common::add_overflow(data_size, len, data_size)) { - return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY"); - } - } - _length_idx += max_values; - - _buffered_data.resize(data_size); - char* data_ptr = _buffered_data.data(); - for (int j = 0; j < data_size; j++) { - if (!_bit_reader->GetValue(8, data_ptr + j)) { - return Status::IOError("Get length bytes EOF"); - } - } - - for (int i = 0; i < max_values; ++i) { - buffer[i].data = data_ptr; - data_ptr += buffer[i].size; - } - // this->num_values_ -= max_values; - _num_valid_values -= max_values; - *out_num_values = max_values; - return Status::OK(); -} - -Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) { - // Decode up to `max_values` strings into an internal buffer - // and reference them into `buffer`. - max_values = std::min(max_values, _num_valid_values); - if (max_values == 0) { - *out_num_values = max_values; - return Status::OK(); - } - - int suffix_read; - RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read)); - if (PREDICT_FALSE(suffix_read != max_values)) { - return Status::IOError("Read {}, expecting {} from suffix decoder", - std::to_string(suffix_read), std::to_string(max_values)); - } - - int64_t data_size = 0; - const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset; - for (int i = 0; i < max_values; ++i) { - if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) { - return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY"); - } - if (PREDICT_FALSE(common::add_overflow(data_size, static_cast(prefix_len_ptr[i]), - data_size) || - common::add_overflow(data_size, static_cast(buffer[i].size), - data_size))) { - return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY"); - } - } - _buffered_data.resize(data_size); - - std::string_view prefix {_last_value}; - - char* data_ptr = _buffered_data.data(); - for (int i = 0; i < max_values; ++i) { - if (PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { - return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY"); - } - memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); - // buffer[i] currently points to the string suffix - memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size); - buffer[i].data = data_ptr; - buffer[i].size += prefix_len_ptr[i]; - data_ptr += buffer[i].size; - prefix = std::string_view {buffer[i].data, buffer[i].size}; - } - _prefix_len_offset += max_values; - _num_valid_values -= max_values; - _last_value = std::string {prefix}; - - if (_num_valid_values == 0) { - _last_value_in_previous_page = _last_value; - } - *out_num_values = max_values; - return Status::OK(); -} -} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h index 464229cda9..ff615aeb96 100644 --- a/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h +++ b/be/src/vec/exec/format/parquet/delta_bit_pack_decoder.h @@ -51,14 +51,10 @@ public: return _type_converted_decoder->skip_values(num_values); } - template + template Status decode_byte_array(const std::vector& decoded_vals, MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: { + if constexpr (PhysicalType == tparquet::Type::BYTE_ARRAY) { ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { @@ -88,21 +84,14 @@ public: } } _current_value_idx = 0; - return Status::OK(); } - default: - break; - } - return Status::InvalidArgument( - "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}", - getTypeName(logical_type)); + return Status::OK(); } protected: void init_values_converter() { _type_converted_decoder->set_data(_data); _type_converted_decoder->set_type_length(_type_length); - _type_converted_decoder->init(_field_schema, _decode_params->ctz); } // Convert decoded value to doris type value. std::unique_ptr _type_converted_decoder; @@ -117,13 +106,12 @@ protected: * Block * [min delta] [list of bitwidths of the mini blocks] [miniblocks] */ -template +template class DeltaBitPackDecoder final : public DeltaDecoder { public: using UT = std::make_unsigned_t; - DeltaBitPackDecoder(const tparquet::Type::type& physical_type) - : DeltaDecoder(new FixLengthPlainDecoder(physical_type)) {} + DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder()) {} ~DeltaBitPackDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) override { @@ -200,16 +188,13 @@ private: // _values_remaining_current_mini_block may greater than _total_values_remaining. uint32_t _values_remaining_current_mini_block; }; -template class DeltaBitPackDecoder; -template class DeltaBitPackDecoder; - +//template class DeltaBitPackDecoder; +//template class DeltaBitPackDecoder; +template class DeltaLengthByteArrayDecoder final : public DeltaDecoder { public: - explicit DeltaLengthByteArrayDecoder(const tparquet::Type::type& physical_type) - : DeltaDecoder(nullptr), - _len_decoder(physical_type), - _buffered_length(0), - _buffered_data(0) {} + explicit DeltaLengthByteArrayDecoder() + : DeltaDecoder(nullptr), _len_decoder(), _buffered_length(0), _buffered_data(0) {} Status skip_values(size_t num_values) override { _current_value_idx += num_values; @@ -240,7 +225,8 @@ public: return Status::IOError("Expected to decode {} values, but decoded {} values.", num_values - null_count, num_valid_values); } - return decode_byte_array(_values, doris_column, data_type, select_vector); + return decode_byte_array(_values, doris_column, data_type, + select_vector); } Status decode(Slice* buffer, int num_values, int* out_num_values) { @@ -270,7 +256,7 @@ private: std::vector _values; std::shared_ptr _bit_reader; - DeltaBitPackDecoder _len_decoder; + DeltaBitPackDecoder _len_decoder; int _num_valid_values; uint32_t _length_idx; @@ -278,14 +264,11 @@ private: std::vector _buffered_data; }; +template class DeltaByteArrayDecoder : public DeltaDecoder { public: - explicit DeltaByteArrayDecoder(const tparquet::Type::type& physical_type) - : DeltaDecoder(nullptr), - _prefix_len_decoder(physical_type), - _suffix_decoder(physical_type), - _buffered_prefix_length(0), - _buffered_data(0) {} + explicit DeltaByteArrayDecoder() + : DeltaDecoder(nullptr), _buffered_prefix_length(0), _buffered_data(0) {} Status skip_values(size_t num_values) override { _current_value_idx += num_values; @@ -312,7 +295,8 @@ public: int num_valid_values; RETURN_IF_ERROR(_get_internal(_values.data(), num_values - null_count, &num_valid_values)); DCHECK_EQ(num_values - null_count, num_valid_values); - return decode_byte_array(_values, doris_column, data_type, select_vector); + return decode_byte_array(_values, doris_column, data_type, + select_vector); } void set_data(Slice* slice) override { @@ -350,8 +334,8 @@ private: std::vector _values; std::shared_ptr _bit_reader; - DeltaBitPackDecoder _prefix_len_decoder; - DeltaLengthByteArrayDecoder _suffix_decoder; + DeltaBitPackDecoder _prefix_len_decoder; + DeltaLengthByteArrayDecoder _suffix_decoder; std::string _last_value; // string buffer for last value in previous page std::string _last_value_in_previous_page; @@ -361,3 +345,260 @@ private: std::vector _buffered_data; }; } // namespace doris::vectorized + +namespace doris::vectorized { + +template +Status DeltaBitPackDecoder::_init_header() { + if (!_bit_reader->GetVlqInt(&_values_per_block) || + !_bit_reader->GetVlqInt(&_mini_blocks_per_block) || + !_bit_reader->GetVlqInt(&_total_value_count) || + !_bit_reader->GetZigZagVlqInt(&_last_value)) { + return Status::IOError("Init header eof"); + } + if (_values_per_block == 0) { + return Status::InvalidArgument("Cannot have zero value per block"); + } + if (_values_per_block % 128 != 0) { + return Status::InvalidArgument( + "the number of values in a block must be multiple of 128, but it's " + + std::to_string(_values_per_block)); + } + if (_mini_blocks_per_block == 0) { + return Status::InvalidArgument("Cannot have zero miniblock per block"); + } + _values_per_mini_block = _values_per_block / _mini_blocks_per_block; + if (_values_per_mini_block == 0) { + return Status::InvalidArgument("Cannot have zero value per miniblock"); + } + if (_values_per_mini_block % 32 != 0) { + return Status::InvalidArgument( + "The number of values in a miniblock must be multiple of 32, but it's " + + std::to_string(_values_per_mini_block)); + } + _total_values_remaining = _total_value_count; + _delta_bit_widths.resize(_mini_blocks_per_block); + // init as empty property + _block_initialized = false; + _values_remaining_current_mini_block = 0; + return Status::OK(); +} + +template +Status DeltaBitPackDecoder::_init_block() { + DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF"; + if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) { + return Status::IOError("Init block eof"); + } + + // read the bitwidth of each miniblock + uint8_t* bit_width_data = _delta_bit_widths.data(); + for (uint32_t i = 0; i < _mini_blocks_per_block; ++i) { + if (!_bit_reader->GetAligned(1, bit_width_data + i)) { + return Status::IOError("Decode bit-width EOF"); + } + // Note that non-conformant bitwidth entries are allowed by the Parquet spec + // for extraneous miniblocks in the last block (GH-14923), so we check + // the bitwidths when actually using them (see InitMiniBlock()). + } + _mini_block_idx = 0; + _block_initialized = true; + RETURN_IF_ERROR(_init_mini_block(bit_width_data[0])); + return Status::OK(); +} + +template +Status DeltaBitPackDecoder::_init_mini_block(int bit_width) { + if (PREDICT_FALSE(bit_width > kMaxDeltaBitWidth)) { + return Status::InvalidArgument("delta bit width larger than integer bit width"); + } + _delta_bit_width = bit_width; + _values_remaining_current_mini_block = _values_per_mini_block; + return Status::OK(); +} + +template +Status DeltaBitPackDecoder::_get_internal(T* buffer, int num_values, + int* out_num_values) { + num_values = static_cast(std::min(num_values, _total_values_remaining)); + if (num_values == 0) { + *out_num_values = 0; + return Status::OK(); + } + int i = 0; + while (i < num_values) { + if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) { + if (PREDICT_FALSE(!_block_initialized)) { + buffer[i++] = _last_value; + DCHECK_EQ(i, 1); // we're at the beginning of the page + if (i == num_values) { + // When block is uninitialized and i reaches num_values we have two + // different possibilities: + // 1. _total_value_count == 1, which means that the page may have only + // one value (encoded in the header), and we should not initialize + // any block. + // 2. _total_value_count != 1, which means we should initialize the + // incoming block for subsequent reads. + if (_total_value_count != 1) { + RETURN_IF_ERROR(_init_block()); + } + break; + } + RETURN_IF_ERROR(_init_block()); + } else { + ++_mini_block_idx; + if (_mini_block_idx < _mini_blocks_per_block) { + RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx])); + } else { + RETURN_IF_ERROR(_init_block()); + } + } + } + + int values_decode = std::min(_values_remaining_current_mini_block, + static_cast(num_values - i)); + for (int j = 0; j < values_decode; ++j) { + if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) { + return Status::IOError("Get batch EOF"); + } + } + for (int j = 0; j < values_decode; ++j) { + // Addition between min_delta, packed int and last_value should be treated as + // unsigned addition. Overflow is as expected. + buffer[i + j] = static_cast(_min_delta) + static_cast(buffer[i + j]) + + static_cast(_last_value); + _last_value = buffer[i + j]; + } + _values_remaining_current_mini_block -= values_decode; + i += values_decode; + } + _total_values_remaining -= num_values; + + if (PREDICT_FALSE(_total_values_remaining == 0)) { + if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) { + return Status::IOError("Skip padding EOF"); + } + _values_remaining_current_mini_block = 0; + } + *out_num_values = num_values; + return Status::OK(); +} +template +void DeltaLengthByteArrayDecoder::_decode_lengths() { + _len_decoder.set_bit_reader(_bit_reader); + // get the number of encoded lengths + int num_length = _len_decoder.valid_values_count(); + _buffered_length.resize(num_length); + + // decode all the lengths. all the lengths are buffered in buffered_length_. + int ret; + Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret); + if (!st.ok()) { + LOG(FATAL) << "Fail to decode delta length, status: " << st; + } + DCHECK_EQ(ret, num_length); + _length_idx = 0; + _num_valid_values = num_length; +} +template +Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values, + int* out_num_values) { + // Decode up to `max_values` strings into an internal buffer + // and reference them into `buffer`. + max_values = std::min(max_values, _num_valid_values); + if (max_values == 0) { + *out_num_values = 0; + return Status::OK(); + } + + int32_t data_size = 0; + const int32_t* length_ptr = _buffered_length.data() + _length_idx; + for (int i = 0; i < max_values; ++i) { + int32_t len = length_ptr[i]; + if (PREDICT_FALSE(len < 0)) { + return Status::InvalidArgument("Negative string delta length"); + } + buffer[i].size = len; + if (common::add_overflow(data_size, len, data_size)) { + return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY"); + } + } + _length_idx += max_values; + + _buffered_data.resize(data_size); + char* data_ptr = _buffered_data.data(); + for (int j = 0; j < data_size; j++) { + if (!_bit_reader->GetValue(8, data_ptr + j)) { + return Status::IOError("Get length bytes EOF"); + } + } + + for (int i = 0; i < max_values; ++i) { + buffer[i].data = data_ptr; + data_ptr += buffer[i].size; + } + // this->num_values_ -= max_values; + _num_valid_values -= max_values; + *out_num_values = max_values; + return Status::OK(); +} + +template +Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, + int* out_num_values) { + // Decode up to `max_values` strings into an internal buffer + // and reference them into `buffer`. + max_values = std::min(max_values, _num_valid_values); + if (max_values == 0) { + *out_num_values = max_values; + return Status::OK(); + } + + int suffix_read; + RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read)); + if (PREDICT_FALSE(suffix_read != max_values)) { + return Status::IOError("Read {}, expecting {} from suffix decoder", + std::to_string(suffix_read), std::to_string(max_values)); + } + + int64_t data_size = 0; + const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset; + for (int i = 0; i < max_values; ++i) { + if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) { + return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY"); + } + if (PREDICT_FALSE(common::add_overflow(data_size, static_cast(prefix_len_ptr[i]), + data_size) || + common::add_overflow(data_size, static_cast(buffer[i].size), + data_size))) { + return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY"); + } + } + _buffered_data.resize(data_size); + + std::string_view prefix {_last_value}; + + char* data_ptr = _buffered_data.data(); + for (int i = 0; i < max_values; ++i) { + if (PREDICT_FALSE(static_cast(prefix_len_ptr[i]) > prefix.length())) { + return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY"); + } + memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]); + // buffer[i] currently points to the string suffix + memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size); + buffer[i].data = data_ptr; + buffer[i].size += prefix_len_ptr[i]; + data_ptr += buffer[i].size; + prefix = std::string_view {buffer[i].data, buffer[i].size}; + } + _prefix_len_offset += max_values; + _num_valid_values -= max_values; + _last_value = std::string {prefix}; + + if (_num_valid_values == 0) { + _last_value_in_previous_page = _last_value; + } + *out_num_values = max_values; + return Status::OK(); +} +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 35880cfcdd..c29e742f51 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -25,11 +25,10 @@ namespace doris::vectorized { -template +template class FixLengthDictDecoder final : public BaseDictDecoder { public: - FixLengthDictDecoder(tparquet::Type::type physical_type) - : BaseDictDecoder(), _physical_type(physical_type) {}; + FixLengthDictDecoder() : BaseDictDecoder() {}; ~FixLengthDictDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, @@ -73,95 +72,7 @@ public: return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { -#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - if constexpr (!std::is_same_v) { \ - return _decode_numeric(doris_column, select_vector); \ - } - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) -#undef DISPATCH - case TypeIndex::Date: - if constexpr (std::is_same_v) { - return _decode_date(doris_column, - select_vector); - } - break; - case TypeIndex::DateV2: - if constexpr (std::is_same_v) { - return _decode_date, UInt32, has_filter>( - doris_column, select_vector); - } - break; - case TypeIndex::DateTime: - if constexpr (std::is_same_v) { - return _decode_datetime96(doris_column, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_datetime64(doris_column, - select_vector); - } - break; - case TypeIndex::DateTimeV2: - // Spark can set the timestamp precision by the following configuration: - // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS - if constexpr (std::is_same_v) { - return _decode_datetime96, UInt64, has_filter>( - doris_column, select_vector); - } else if constexpr (std::is_same_v) { - return _decode_datetime64, UInt64, has_filter>( - doris_column, select_vector); - } - break; - case TypeIndex::Decimal32: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal64: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128I: - if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if constexpr (std::is_same_v) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - // TODO: decimal256 - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: - break; - default: - break; - } - - return Status::InvalidArgument( - "Can't decode parquet physical type {} to doris logical type {}", - tparquet::to_string(_physical_type), getTypeName(logical_type)); + return _decode_numeric(doris_column, select_vector); } Status set_dict(std::unique_ptr& dict, int32_t length, size_t num_values) override { @@ -172,26 +83,27 @@ public: char* dict_item_address = reinterpret_cast(_dict.get()); _dict_items.resize(num_values); for (size_t i = 0; i < num_values; ++i) { - _dict_items[i] = *(T*)dict_item_address; + _dict_items[i] = *(DataType*)dict_item_address; dict_item_address += _type_length; } return Status::OK(); } protected: - template + template Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); + auto& column_data = reinterpret_cast&>(*doris_column).get_data(); + size_t data_index = column_data.size() / _type_length; + column_data.resize(column_data.size() + _type_length * (select_vector.num_values() - + select_vector.num_filtered())); size_t dict_index = 0; + DataType* data = (DataType*)column_data.data(); ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { case ColumnSelectVector::CONTENT: { for (size_t i = 0; i < run_length; ++i) { - column_data[data_index++] = - static_cast(_dict_items[_indexes[dict_index++]]); + data[data_index++] = _dict_items[_indexes[dict_index++]]; } break; } @@ -211,250 +123,17 @@ protected: } return Status::OK(); } - - template - Status _decode_date(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - date_day_offset_dict& date_dict = date_day_offset_dict::get(); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - int64_t date_value = - _dict_items[_indexes[dict_index++]] + _decode_params->offset_days; - if constexpr (std::is_same_v) { - auto& v = reinterpret_cast(column_data[data_index++]); - v.create_from_date_v2(date_dict[date_value], TIME_DATE); - // we should cast to date if using date v1. - v.cast_to_date(); - } else { - reinterpret_cast(column_data[data_index++]) = - date_dict[date_value]; - } - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - template - Status _decode_datetime64(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - int64_t date_value = _dict_items[_indexes[dict_index++]]; - auto& v = reinterpret_cast(column_data[data_index++]); - v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // nanoseconds will be ignored. - v.set_microsecond((date_value % _decode_params->second_mask) * - _decode_params->scale_to_nano_factor / 1000); - // TODO: the precision of datetime v1 - } - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - template - Status _decode_datetime96(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - ParquetInt96& datetime96 = _dict_items[_indexes[dict_index++]]; - auto& v = reinterpret_cast(column_data[data_index++]); - int64_t micros = datetime96.to_timestamp_micros(); - v.from_unixtime(micros / 1000000, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // spark.sql.parquet.outputTimestampType = INT96(NANOS) will lost precision. - // only keep microseconds. - v.set_microsecond(micros % 1000000); - } - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - template - Status _decode_primitive_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, ValueCopyType, ScaleType) \ - case FixedTypeLength: \ - return _decode_primitive_decimal_internal(doris_column, data_type, \ - select_vector); - -#define APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef APPLY_FOR_DECIMALS -#undef M - } - - template - Status _decode_primitive_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column) - .get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - ValueCopyType value = static_cast(_dict_items[_indexes[dict_index++]]); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - tparquet::Type::type _physical_type; + using ColumnType = ParquetConvert::PhysicalTypeTraits::ColumnType; + using DataType = ParquetConvert::PhysicalTypeTraits::DataType; // For dictionary encoding - std::vector _dict_items; + std::vector _dict_items; }; template <> -class FixLengthDictDecoder final : public BaseDictDecoder { +class FixLengthDictDecoder final : public BaseDictDecoder { public: - FixLengthDictDecoder(tparquet::Type::type physical_type) - : BaseDictDecoder(), _physical_type(physical_type) {}; + FixLengthDictDecoder() : BaseDictDecoder() {}; ~FixLengthDictDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, @@ -487,52 +166,39 @@ public: return _decode_dict_values(doris_column, select_vector, is_dict_filter); } - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { - case TypeIndex::Decimal32: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal64: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128I: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } - break; - // TODO: decimal256 - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_string(doris_column, select_vector); - } - break; - default: - break; - } - - return Status::InvalidArgument( - "Can't decode parquet physical type {} to doris logical type {}", - tparquet::to_string(_physical_type), getTypeName(logical_type)); + return _decode_string(doris_column, select_vector); } - Status skip_values(size_t num_values) override { - _indexes.resize(num_values); - _index_batch_decoder->GetBatch(&_indexes[0], num_values); +protected: + template + Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { + size_t dict_index = 0; + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + string_values.emplace_back(_dict_items[_indexes[dict_index++]], _type_length); + } + doris_column->insert_many_strings(&string_values[0], run_length); + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + dict_index += run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } + } + } return Status::OK(); } @@ -583,160 +249,9 @@ public: res->insert_many_strings(&dict_values[0], dict_values.size()); return res; } - -protected: - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, ValueCopyType, ScaleType) \ - case FixedTypeLength: \ - return _decode_binary_decimal_internal(doris_column, data_type, \ - select_vector); - -#define APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef APPLY_FOR_DECIMALS -#undef M - } - - template - Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) { - size_t dict_index = 0; - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - string_values.emplace_back(_dict_items[_indexes[dict_index++]], _type_length); - } - doris_column->insert_many_strings(&string_values[0], run_length); - break; - } - case ColumnSelectVector::NULL_DATA: { - doris_column->insert_many_defaults(run_length); - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } - - tparquet::Type::type _physical_type; - + std::unordered_map _dict_value_to_code; // For dictionary encoding std::vector _dict_items; - std::unordered_map _dict_value_to_code; - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column) - .get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t dict_index = 0; - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _dict_items[_indexes[dict_index++]]; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType result_value = 0; - ValueCopyType value = 0; - memcpy(reinterpret_cast(&value), buf_start, fixed_type_length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - fixed_type_length) * 8); - result_value = value; - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - result_value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - result_value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)result_value; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - dict_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); - } }; } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp deleted file mode 100644 index 8e6f6ebb67..0000000000 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp +++ /dev/null @@ -1,609 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "vec/exec/format/parquet/fix_length_plain_decoder.h" - -#include -#include -#include - -#include -#include - -// IWYU pragma: no_include -#include "common/compiler_util.h" // IWYU pragma: keep -#include "util/bit_util.h" -#include "util/slice.h" -#include "vec/columns/column.h" -#include "vec/common/string_ref.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_nullable.h" -#include "vec/exec/format/format_common.h" -#include "vec/exec/format/parquet/parquet_common.h" -#include "vec/runtime/vdatetime_value.h" - -namespace doris { -namespace vectorized { -template -class ColumnDecimal; -template -class ColumnVector; -} // namespace vectorized -} // namespace doris - -namespace doris::vectorized { - -Status FixLengthPlainDecoder::skip_values(size_t num_values) { - _offset += _type_length * num_values; - if (UNLIKELY(_offset > _data->size)) { - return Status::IOError("Out-of-bounds access in parquet data decoder"); - } - return Status::OK(); -} - -Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { - if (select_vector.has_filter()) { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); - } else { - return _decode_values(doris_column, data_type, select_vector, is_dict_filter); - } -} - -template -Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector, - bool is_dict_filter) { - size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); - if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) { - return Status::IOError("Out-of-bounds access in parquet data decoder"); - } - TypeIndex logical_type = remove_nullable(data_type)->get_type_id(); - switch (logical_type) { -#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ - case NUMERIC_TYPE: \ - if (_physical_type == tparquet::Type::INT32) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else if (_physical_type == tparquet::Type::INT64) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else if (_physical_type == tparquet::Type::FLOAT) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else if (_physical_type == tparquet::Type::DOUBLE) { \ - return _decode_numeric(doris_column, \ - select_vector); \ - } else { \ - break; \ - } - FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) -#undef DISPATCH - case TypeIndex::Date: - if (_physical_type == tparquet::Type::INT32) { - return _decode_date(doris_column, select_vector); - } - break; - case TypeIndex::DateV2: - if (_physical_type == tparquet::Type::INT32) { - return _decode_date, UInt32, has_filter>(doris_column, - select_vector); - } - break; - case TypeIndex::DateTime: - if (_physical_type == tparquet::Type::INT96) { - return _decode_datetime96(doris_column, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_datetime64(doris_column, - select_vector); - } - break; - case TypeIndex::DateTimeV2: - // Spark can set the timestamp precision by the following configuration: - // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS - if (_physical_type == tparquet::Type::INT96) { - return _decode_datetime96, UInt64, has_filter>( - doris_column, select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_datetime64, UInt64, has_filter>( - doris_column, select_vector); - } - break; - case TypeIndex::Decimal32: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal64: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - case TypeIndex::Decimal128I: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_binary_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT32) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } else if (_physical_type == tparquet::Type::INT64) { - return _decode_primitive_decimal(doris_column, data_type, - select_vector); - } - break; - // TODO: decimal256 - case TypeIndex::String: - [[fallthrough]]; - case TypeIndex::FixedString: - if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { - return _decode_string(doris_column, select_vector); - } - break; - default: - break; - } - - return Status::InvalidArgument("Can't decode parquet physical type {} to doris logical type {}", - tparquet::to_string(_physical_type), getTypeName(logical_type)); -} - -template -Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - std::vector string_values; - string_values.reserve(run_length); - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - string_values.emplace_back(buf_start, _type_length); - _offset += _type_length; - } - doris_column->insert_many_strings(&string_values[0], run_length); - break; - } - case ColumnSelectVector::NULL_DATA: { - doris_column->insert_many_defaults(run_length); - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} -template -Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - column_data[data_index++] = *(PhysicalType*)buf_start; - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - date_day_offset_dict& date_dict = date_day_offset_dict::get(); - - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - int64_t date_value = static_cast(*reinterpret_cast(buf_start)) + - _decode_params->offset_days; - if constexpr (std::is_same_v) { - auto& v = reinterpret_cast(column_data[data_index++]); - v.create_from_date_v2(date_dict[date_value], TIME_DATE); - // we should cast to date if using date v1. - v.cast_to_date(); - } else { - reinterpret_cast(column_data[data_index++]) = date_dict[date_value]; - } - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_datetime64(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - int64_t& date_value = *reinterpret_cast(buf_start); - auto& v = reinterpret_cast(column_data[data_index++]); - v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // nanoseconds will be ignored. - v.set_microsecond((date_value % _decode_params->second_mask) * - _decode_params->scale_to_nano_factor / 1000); - // TODO: the precision of datetime v1 - } - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_datetime96(MutableColumnPtr& doris_column, - ColumnSelectVector& select_vector) { - auto& column_data = static_cast&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - ParquetInt96& datetime96 = *reinterpret_cast(buf_start); - auto& v = reinterpret_cast(column_data[data_index++]); - int64_t micros = datetime96.to_timestamp_micros(); - v.from_unixtime(micros / 1000000, *_decode_params->ctz); - if constexpr (std::is_same_v>) { - // spark.sql.parquet.outputTimestampType = INT96(NANOS) will lost precision. - // only keep microseconds. - v.set_microsecond(micros % 1000000); - } - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, ValueCopyType, ScaleType) \ - case FixedTypeLength: \ - return _decode_binary_decimal_internal(doris_column, data_type, \ - select_vector); - -#define APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef APPLY_FOR_DECIMALS -#undef M -} - -template -Status FixLengthPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - // When Decimal in parquet is stored in byte arrays, binary and fixed, - // the unscaled number must be encoded as two's complement using big-endian byte order. - DecimalPrimitiveType result_value = 0; - ValueCopyType value = 0; - memcpy(reinterpret_cast(&value), buf_start, fixed_type_length); - value = BitUtil::big_endian_to_host(value); - value = value >> ((sizeof(value) - fixed_type_length) * 8); - result_value = value; - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - result_value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - result_value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)result_value; - _offset += fixed_type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} - -template -Status FixLengthPlainDecoder::_decode_primitive_decimal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector) { - init_decimal_converter(data_type); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; -#define M(FixedTypeLength, T, ScaleType) \ - case FixedTypeLength: \ - return _decode_primitive_decimal_internal( \ - doris_column, data_type, select_vector); - -#define APPLY_FOR_DECIMALS(ScaleType) \ - M(1, int64_t, ScaleType) \ - M(2, int64_t, ScaleType) \ - M(3, int64_t, ScaleType) \ - M(4, int64_t, ScaleType) \ - M(5, int64_t, ScaleType) \ - M(6, int64_t, ScaleType) \ - M(7, int64_t, ScaleType) \ - M(8, int64_t, ScaleType) \ - M(9, int128_t, ScaleType) \ - M(10, int128_t, ScaleType) \ - M(11, int128_t, ScaleType) \ - M(12, int128_t, ScaleType) \ - M(13, int128_t, ScaleType) \ - M(14, int128_t, ScaleType) \ - M(15, int128_t, ScaleType) \ - M(16, int128_t, ScaleType) - - if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } else { - switch (_type_length) { - APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE) - default: - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - } - return Status::OK(); -#undef APPLY_FOR_DECIMALS -#undef M -} - -template -Status FixLengthPlainDecoder::_decode_primitive_decimal_internal( - MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) { - auto& column_data = - static_cast>&>(*doris_column).get_data(); - size_t data_index = column_data.size(); - column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - DecimalScaleParams& scale_params = _decode_params->decimal_scale; - - ColumnSelectVector::DataReadType read_type; - while (size_t run_length = select_vector.get_next_run(&read_type)) { - switch (read_type) { - case ColumnSelectVector::CONTENT: { - for (size_t i = 0; i < run_length; ++i) { - char* buf_start = _data->data + _offset; - ValueCopyType value = *reinterpret_cast(buf_start); - if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { - value *= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { - value /= scale_params.scale_factor; - } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { - // do nothing - } else { - LOG(FATAL) << "__builtin_unreachable"; - __builtin_unreachable(); - } - auto& v = reinterpret_cast(column_data[data_index++]); - v = (DecimalPrimitiveType)value; - _offset += _type_length; - } - break; - } - case ColumnSelectVector::NULL_DATA: { - data_index += run_length; - break; - } - case ColumnSelectVector::FILTERED_CONTENT: { - _offset += _type_length * run_length; - break; - } - case ColumnSelectVector::FILTERED_NULL: { - // do nothing - break; - } - } - } - return Status::OK(); -} -} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h index 96d674e258..b21f58601d 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.h @@ -23,7 +23,8 @@ #include "common/status.h" #include "vec/data_types/data_type.h" #include "vec/exec/format/parquet/decoder.h" - +#include "vec/exec/format/parquet/parquet_column_convert.h" +#include "vec/exec/format/parquet/parquet_common.h" namespace doris { namespace vectorized { class ColumnSelectVector; @@ -32,56 +33,135 @@ class ColumnSelectVector; namespace doris::vectorized { +template class FixLengthPlainDecoder final : public Decoder { public: - FixLengthPlainDecoder(tparquet::Type::type physical_type) : _physical_type(physical_type) {}; + FixLengthPlainDecoder() {}; ~FixLengthPlainDecoder() override = default; Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter) override; - template + template Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector, bool is_dict_filter); Status skip_values(size_t num_values) override; protected: - template + template Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - template - Status _decode_date(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - template - Status _decode_datetime64(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - template - Status _decode_datetime96(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - template - Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - - template - Status _decode_primitive_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - template Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector); - - tparquet::Type::type _physical_type; - -private: - template - Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type, - ColumnSelectVector& select_vector); - template - Status _decode_primitive_decimal_internal(MutableColumnPtr& doris_column, - DataTypePtr& data_type, - ColumnSelectVector& select_vector); }; + +template +Status FixLengthPlainDecoder::skip_values(size_t num_values) { + _offset += _type_length * num_values; + if (UNLIKELY(_offset > _data->size)) { + return Status::IOError("Out-of-bounds access in parquet data decoder"); + } + return Status::OK(); +} + +template +Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, + DataTypePtr& data_type, + ColumnSelectVector& select_vector, + bool is_dict_filter) { + if (select_vector.has_filter()) { + return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + } else { + return _decode_values(doris_column, data_type, select_vector, is_dict_filter); + } +} + +template +template +Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, + DataTypePtr& data_type, + ColumnSelectVector& select_vector, + bool is_dict_filter) { + size_t non_null_size = select_vector.num_values() - select_vector.num_nulls(); + if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) { + return Status::IOError("Out-of-bounds access in parquet data decoder"); + } + + if constexpr (PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY) { + return _decode_string(doris_column, select_vector); + } else { + return _decode_numeric(doris_column, select_vector); + } +} + +template +template +Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column, + ColumnSelectVector& select_vector) { + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + std::vector string_values; + string_values.reserve(run_length); + for (size_t i = 0; i < run_length; ++i) { + char* buf_start = _data->data + _offset; + string_values.emplace_back(buf_start, _type_length); + _offset += _type_length; + } + doris_column->insert_many_strings(&string_values[0], run_length); + break; + } + case ColumnSelectVector::NULL_DATA: { + doris_column->insert_many_defaults(run_length); + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + _offset += _type_length * run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } + } + } + return Status::OK(); +} + +template +template +Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& doris_column, + ColumnSelectVector& select_vector) { + auto& column_data = reinterpret_cast&>(*doris_column).get_data(); + size_t data_index = column_data.size(); + column_data.resize(data_index + + _type_length * (select_vector.num_values() - select_vector.num_filtered())); + ColumnSelectVector::DataReadType read_type; + while (size_t run_length = select_vector.get_next_run(&read_type)) { + switch (read_type) { + case ColumnSelectVector::CONTENT: { + memcpy(column_data.data() + data_index, _data->data + _offset, + run_length * _type_length); + _offset += run_length * _type_length; + data_index += run_length * _type_length; + break; + } + case ColumnSelectVector::NULL_DATA: { + data_index += run_length * _type_length; + break; + } + case ColumnSelectVector::FILTERED_CONTENT: { + _offset += _type_length * run_length; + break; + } + case ColumnSelectVector::FILTERED_NULL: { + // do nothing + break; + } + } + } + return Status::OK(); +} } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.cpp b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp new file mode 100644 index 0000000000..34b6da3e57 --- /dev/null +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.cpp @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exec/format/parquet/parquet_column_convert.h" + +#include + +#include "vec/columns/column_nullable.h" +namespace doris::vectorized { +namespace ParquetConvert { +const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone(); + +ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, + ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert) { + ColumnPtr ans_column = doris_column; + DataTypePtr tmp_data_type; + + switch (parquet_physical_type) { + case tparquet::Type::type::BOOLEAN: + tmp_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT32: + tmp_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT64: + tmp_data_type = std::make_shared(); + break; + case tparquet::Type::type::FLOAT: + tmp_data_type = std::make_shared(); + break; + case tparquet::Type::type::DOUBLE: + tmp_data_type = std::make_shared(); + break; + case tparquet::Type::type::BYTE_ARRAY: + case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY: + tmp_data_type = std::make_shared(); + break; + case tparquet::Type::type::INT96: + tmp_data_type = std::make_shared(); + break; + } + + if (tmp_data_type->get_type_id() == remove_nullable(doris_type)->get_type_id()) { + if (tmp_data_type->get_type_id() == TypeIndex::String && + (show_type == PrimitiveType::TYPE_DECIMAL32 || + show_type == PrimitiveType::TYPE_DECIMAL64 || + show_type == PrimitiveType::TYPE_DECIMALV2 || + show_type == PrimitiveType::TYPE_DECIMAL128I)) { + *need_convert = true; + ans_column = tmp_data_type->create_column(); + } else { + *need_convert = false; + } + } else { + ans_column = tmp_data_type->create_column(); + *need_convert = true; + } + + if (*need_convert && doris_type->is_nullable()) { + auto doris_nullable_column = static_cast(doris_column.get()); + ans_column = ColumnNullable::create(ans_column, + doris_nullable_column->get_null_map_column_ptr()); + } + return ans_column; +} + +} // namespace ParquetConvert +} // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/parquet_column_convert.h b/be/src/vec/exec/format/parquet/parquet_column_convert.h new file mode 100644 index 0000000000..6cf3cfb6c5 --- /dev/null +++ b/be/src/vec/exec/format/parquet/parquet_column_convert.h @@ -0,0 +1,665 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include +#include +#include + +#include "common/compiler_util.h" // IWYU pragma: keep +#include "common/status.h" +#include "gen_cpp/descriptors.pb.h" +#include "gutil/endian.h" +#include "gutil/strings/numbers.h" +#include "io/file_factory.h" +#include "olap/olap_common.h" +#include "util/coding.h" +#include "util/slice.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/exec/format/format_common.h" +#include "vec/exec/format/parquet/decoder.h" +#include "vec/exec/format/parquet/parquet_common.h" + +namespace doris::vectorized { + +namespace ParquetConvert { + +template +struct PhysicalTypeTraits {}; + +template <> +struct PhysicalTypeTraits { + using DataType = int32_t; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = uint8; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = int64_t; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = float; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = double; + using ColumnType = ColumnVector; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = String; + using ColumnType = ColumnString; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = String; + using ColumnType = ColumnString; +}; + +template <> +struct PhysicalTypeTraits { + using DataType = ParquetInt96; + using ColumnType = ColumnVector; +}; + +#define FOR_LOGICAL_NUMERIC_TYPES(M) \ + M(TypeIndex::Int8, Int8, Int32) \ + M(TypeIndex::Int16, Int16, Int32) \ + M(TypeIndex::Int32, Int32, Int32) \ + M(TypeIndex::Int64, Int64, Int64) \ + M(TypeIndex::Float32, Float32, Float32) \ + M(TypeIndex::Float64, Float64, Float64) + +#define FOR_LOGICAL_DECIMAL_TYPES(M) \ + M(TypeIndex::Decimal32, Decimal32, Int32) \ + M(TypeIndex::Decimal64, Decimal64, Int64) \ + M(TypeIndex::Decimal128, Decimal128, Int128) \ + M(TypeIndex::Decimal128I, Decimal128, Int128) + +struct ConvertParams { + // schema.logicalType.TIMESTAMP.isAdjustedToUTC == false + static const cctz::time_zone utc0; + // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone + cctz::time_zone* ctz = nullptr; + size_t offset_days = 0; + int64_t second_mask = 1; + int64_t scale_to_nano_factor = 1; + DecimalScaleParams decimal_scale; + FieldSchema* field_schema = nullptr; + size_t start_idx = 0; + + void init(FieldSchema* field_schema_, cctz::time_zone* ctz_, size_t start_idx_ = 0) { + field_schema = field_schema_; + if (ctz_ != nullptr) { + ctz = ctz_; + } + const auto& schema = field_schema->parquet_schema; + if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) { + const auto& timestamp_info = schema.logicalType.TIMESTAMP; + if (!timestamp_info.isAdjustedToUTC) { + // should set timezone to utc+0 + ctz = const_cast(&utc0); + } + const auto& time_unit = timestamp_info.unit; + if (time_unit.__isset.MILLIS) { + second_mask = 1000; + scale_to_nano_factor = 1000000; + } else if (time_unit.__isset.MICROS) { + second_mask = 1000000; + scale_to_nano_factor = 1000; + } else if (time_unit.__isset.NANOS) { + second_mask = 1000000000; + scale_to_nano_factor = 1; + } + } else if (schema.__isset.converted_type) { + const auto& converted_type = schema.converted_type; + if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) { + second_mask = 1000; + scale_to_nano_factor = 1000000; + } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) { + second_mask = 1000000; + scale_to_nano_factor = 1000; + } + } + + if (ctz) { + VecDateTimeValue t; + t.from_unixtime(0, *ctz); + offset_days = t.day() == 31 ? -1 : 0; + } + start_idx = start_idx_; + } + + template + void init_decimal_converter(DataTypePtr& data_type) { + if (field_schema == nullptr || decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) { + return; + } + auto scale = field_schema->parquet_schema.scale; + auto* decimal_type = static_cast>*>( + const_cast(remove_nullable(data_type).get())); + auto dest_scale = decimal_type->get_scale(); + if (dest_scale > scale) { + decimal_scale.scale_type = DecimalScaleParams::SCALE_UP; + decimal_scale.scale_factor = + DecimalScaleParams::get_scale_factor(dest_scale - scale); + } else if (dest_scale < scale) { + decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN; + decimal_scale.scale_factor = + DecimalScaleParams::get_scale_factor(scale - dest_scale); + } else { + decimal_scale.scale_type = DecimalScaleParams::NO_SCALE; + decimal_scale.scale_factor = 1; + } + } +}; + +/* +* parquet_physical_type : The type of data stored in parquet. +* Read data into columns returned by get_column according to the physical type of parquet. +* show_type : The data format that should be displayed. +* doris_column : What type of column does the upper layer need to put the data in. +* +* example : +* In hive, if decimal is stored as FIXED_LENBYTE_ARRAY in parquet, +* then we use `ALTER TABLE TableName CHANGE COLUMN Col_Decimal Col_Decimal String;` +* to convert this column to string type. +* parquet_type : FIXED_LEN_BYTE_ARRAY. +* ans_data_type : ColumnInt8 +* show_type : Decimal. +* doris_column : ColumnString. +*/ +ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, + ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert); + +struct ColumnConvert { + virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); } + + virtual ~ColumnConvert() = default; + + void convert_null(ColumnPtr& src_col, MutableColumnPtr& dst_col) { + src_col = remove_nullable(src_col); + dst_col = remove_nullable(dst_col->get_ptr())->assume_mutable(); + } + +public: + ConvertParams* _convert_params; +}; + +template +struct NumberToNumberConvert : public ColumnConvert { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using ColumnType = typename PhysicalTypeTraits::ColumnType; + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + auto& src_data = static_cast(src_col.get())->get_data(); + + dst_col->resize(_convert_params->start_idx + rows); + auto& data = static_cast&>(*dst_col.get()).get_data(); + for (int i = 0; i < rows; i++) { + dst_type value = static_cast(src_data[i]); + data[_convert_params->start_idx + i] = value; + } + + return Status::OK(); + } +}; + +template +struct NumberToStringConvert : public ColumnConvert { + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + using ColumnType = typename PhysicalTypeTraits::ColumnType; + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + auto& src_data = static_cast(src_col.get())->get_data(); + + char buf[100]; + auto str_col = static_cast(dst_col.get()); + for (int i = 0; i < rows; i++) { + if constexpr (parquet_physical_type == tparquet::Type::FLOAT) { + int len = FastFloatToBuffer(src_data[i], buf, true); + str_col->insert_data(buf, len); + + } else if constexpr (parquet_physical_type == tparquet::Type::DOUBLE) { + int len = FastDoubleToBuffer(src_data[i], buf, true); + str_col->insert_data(buf, len); + } else if constexpr (parquet_physical_type == tparquet::Type::INT32) { + char* end = FastInt32ToBufferLeft(src_data[i], buf); + str_col->insert_data(buf, end - buf); + + } else if constexpr (parquet_physical_type == tparquet::Type::INT64) { + char* end = FastInt64ToBufferLeft(src_data[i], buf); + str_col->insert_data(buf, end - buf); + + } else { + string value = std::to_string(src_data[i]); + str_col->insert_data(value.data(), value.size()); + } + } + return Status::OK(); + } +}; + +struct Int96toTimestamp : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size() / sizeof(ParquetInt96); + auto& src_data = static_cast*>(src_col.get())->get_data(); + auto ParquetInt96_data = (ParquetInt96*)src_data.data(); + dst_col->resize(_convert_params->start_idx + rows); + auto& data = static_cast*>(dst_col.get())->get_data(); + + for (int i = 0; i < rows; i++) { + ParquetInt96 x = ParquetInt96_data[i]; + auto& num = data[_convert_params->start_idx + i]; + auto& value = reinterpret_cast&>(num); + int64_t micros = x.to_timestamp_micros(); + value.from_unixtime(micros / 1000000, *_convert_params->ctz); + value.set_microsecond(micros % 1000000); + } + return Status::OK(); + } +}; + +struct Int64ToTimestamp : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + dst_col->resize(_convert_params->start_idx + rows); + + auto src_data = static_cast*>(src_col.get())->get_data().data(); + auto& data = static_cast*>(dst_col.get())->get_data(); + + for (int i = 0; i < rows; i++) { + int64_t x = src_data[i]; + auto& num = data[_convert_params->start_idx + i]; + auto& value = reinterpret_cast&>(num); + value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz); + value.set_microsecond((x % _convert_params->second_mask) * + (_convert_params->scale_to_nano_factor / 1000)); + } + return Status::OK(); + } +}; + +class Int32ToDate : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + dst_col->resize(_convert_params->start_idx + rows); + + auto& src_data = static_cast*>(src_col.get())->get_data(); + auto& data = static_cast(dst_col.get())->get_data(); + date_day_offset_dict& date_dict = date_day_offset_dict::get(); + + for (int i = 0; i < rows; i++) { + auto& value = reinterpret_cast&>( + data[_convert_params->start_idx + i]); + int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; + value = date_dict[date_value]; + } + + return Status::OK(); + } +}; + +template +class StringToDecimal : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + DecimalScaleParams& scale_params = _convert_params->decimal_scale; + auto buf = static_cast(src_col.get())->get_chars().data(); + auto& offset = static_cast(src_col.get())->get_offsets(); + dst_col->resize(_convert_params->start_idx + rows); + + auto& data = static_cast*>(dst_col.get())->get_data(); + for (int i = 0; i < rows; i++) { + size_t len = offset[i] - offset[i - 1]; + // When Decimal in parquet is stored in byte arrays, binary and fixed, + // the unscaled number must be encoded as two's complement using big-endian byte order. + ValueCopyType value = 0; + memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); + value = BitUtil::big_endian_to_host(value); + value = value >> ((sizeof(value) - len) * 8); + if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { + value *= scale_params.scale_factor; + } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { + value /= scale_params.scale_factor; + } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) { + // do nothing + } else { + LOG(FATAL) << "__builtin_unreachable"; + __builtin_unreachable(); + } + auto& v = reinterpret_cast(data[_convert_params->start_idx + i]); + v = (DecimalType)value; + } + + return Status::OK(); + } +}; +template +class NumberToDecimal : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + auto* src_data = + static_cast*>(src_col.get())->get_data().data(); + dst_col->resize(_convert_params->start_idx + rows); + + DecimalScaleParams& scale_params = _convert_params->decimal_scale; + auto* data = static_cast>*>(dst_col.get()) + ->get_data() + .data(); + + for (int i = 0; i < rows; i++) { + ValueCopyType value = src_data[i]; + if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) { + value *= scale_params.scale_factor; + } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) { + value /= scale_params.scale_factor; + } + data[_convert_params->start_idx + i] = (DecimalPhysicalType)value; + } + return Status::OK(); + } +}; + +template +class StringToDecimalString : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + + auto buf = static_cast(src_col.get())->get_chars().data(); + auto& offset = static_cast(src_col.get())->get_offsets(); + + auto data = static_cast(dst_col.get()); + for (int i = 0; i < rows; i++) { + int len = offset[i] - offset[i - 1]; + // When Decimal in parquet is stored in byte arrays, binary and fixed, + // the unscaled number must be encoded as two's complement using big-endian byte order. + ValueCopyType value = 0; + memcpy(reinterpret_cast(&value), buf + offset[i - 1], len); + value = BitUtil::big_endian_to_host(value); + value = value >> ((sizeof(value) - len) * 8); + std::string ans = reinterpret_cast(value).to_string( + _convert_params->field_schema->parquet_schema.scale); + data->insert_data(ans.data(), ans.size()); + } + return Status::OK(); + } +}; + +class Int32ToDateString : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + size_t rows = src_col->size(); + + auto& src_data = static_cast*>(src_col.get())->get_data(); + date_day_offset_dict& date_dict = date_day_offset_dict::get(); + + auto str_col = static_cast(dst_col.get()); + char buf[50]; + for (int i = 0; i < rows; i++) { + int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days; + DateV2Value value = date_dict[date_value]; + char* end = value.to_string(buf); + str_col->insert_data(buf, end - buf); + } + + return Status::OK(); + } +}; + +class Int96ToTimestampString : public ColumnConvert { +public: + Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override { + convert_null(src_col, dst_col); + + auto& src_data = static_cast*>(src_col.get())->get_data(); + auto dst_data = static_cast(dst_col.get()); + + size_t rows = src_col->size() / sizeof(ParquetInt96); + ParquetInt96* data = (ParquetInt96*)src_data.data(); + + char buf[50]; + for (int i = 0; i < rows; i++) { + uint64_t num = 0; + auto& value = reinterpret_cast&>(num); + int64_t micros = data[i].to_timestamp_micros(); + value.from_unixtime(micros / 1000000, *_convert_params->ctz); + value.set_microsecond(micros % 1000000); + char* end = value.to_string(buf); + dst_data->insert_data(buf, end - buf); + } + return Status::OK(); + } +}; + +inline Status get_converter(tparquet::Type::type parquet_physical_type, PrimitiveType show_type, + std::shared_ptr dst_data_type, + std::unique_ptr* converter, + ConvertParams* convert_params) { + auto dst_type = remove_nullable(dst_data_type)->get_type_id(); + switch (dst_type) { +#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \ + case NUMERIC_TYPE: \ + switch (parquet_physical_type) { \ + case tparquet::Type::BOOLEAN: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::INT32: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::INT64: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::FLOAT: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + case tparquet::Type::DOUBLE: \ + *converter = std::make_unique< \ + NumberToNumberConvert>(); \ + break; \ + default: \ + break; \ + } \ + break; + FOR_LOGICAL_NUMERIC_TYPES(DISPATCH) +#undef DISPATCH + + case TypeIndex::String: { + if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) { + if (show_type == PrimitiveType::TYPE_DECIMAL32) { + *converter = std::make_unique>(); + break; + } else if (show_type == PrimitiveType::TYPE_DECIMAL64) { + *converter = std::make_unique>(); + break; + } else if (show_type == PrimitiveType::TYPE_DECIMALV2) { + *converter = std::make_unique>(); + break; + } else if (show_type == PrimitiveType::TYPE_DECIMAL128I) { + *converter = std::make_unique>(); + break; + } + + } else if (tparquet::Type::INT96 == parquet_physical_type) { + *converter = std::make_unique(); + break; + } else if (tparquet::Type::INT32 == parquet_physical_type) { + if (show_type == PrimitiveType::TYPE_DATEV2) { + *converter = std::make_unique(); + break; + } + } + + if (parquet_physical_type == tparquet::Type::BOOLEAN) { + *converter = std::make_unique>(); + } else if (parquet_physical_type == tparquet::Type::INT32) { + *converter = std::make_unique>(); + + } else if (parquet_physical_type == tparquet::Type::INT64) { + *converter = std::make_unique>(); + + } else if (parquet_physical_type == tparquet::Type::FLOAT) { + *converter = std::make_unique>(); + + } else if (parquet_physical_type == tparquet::Type::DOUBLE) { + *converter = std::make_unique>(); + } + + break; + } + case TypeIndex::DateV2: + if (tparquet::Type::INT32 == parquet_physical_type) { + *converter = std::make_unique(); + } + break; + case TypeIndex::DateTimeV2: + if (tparquet::Type::INT96 == parquet_physical_type) { + *converter = std::make_unique(); + } else if (tparquet::Type::INT64 == parquet_physical_type) { + *converter = std::make_unique(); + } + break; +#define DISPATCH2(TypeIndex_DECIMAL_TYPE, DECIMAL_TYPE, PRIMARY_TYPE) \ + case TypeIndex_DECIMAL_TYPE: { \ + convert_params->init_decimal_converter(dst_data_type); \ + DecimalScaleParams& scale_params = convert_params->decimal_scale; \ + if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) { \ + size_t string_length = convert_params->field_schema->parquet_schema.type_length; \ + if (string_length <= 8) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = \ + std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = \ + std::make_unique>(); \ + } else { \ + *converter = \ + std::make_unique>(); \ + } \ + } else if (string_length <= 16) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = \ + std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = \ + std::make_unique>(); \ + } else { \ + *converter = \ + std::make_unique>(); \ + } \ + } \ + } else if (tparquet::Type::INT32 == parquet_physical_type) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = std::make_unique>(); \ + } else { \ + *converter = std::make_unique>(); \ + } \ + } else if (tparquet::Type::INT64 == parquet_physical_type) { \ + if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) { \ + *converter = std::make_unique>(); \ + } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) { \ + *converter = std::make_unique>(); \ + } else { \ + *converter = std::make_unique>(); \ + } \ + } \ + break; \ + } + + FOR_LOGICAL_DECIMAL_TYPES(DISPATCH2) +#undef DISPATCH2 + default: + break; + } + + if (*converter == nullptr) { + return Status::NotSupported("Can't cast type parquet physical {} to doris logical type {}", + tparquet::to_string(parquet_physical_type), + getTypeName(dst_type)); + } + (*converter)->_convert_params = convert_params; + return Status::OK(); +} + +}; // namespace ParquetConvert + +}; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/parquet_common.h b/be/src/vec/exec/format/parquet/parquet_common.h index 0a4278ae67..6667ab2c10 100644 --- a/be/src/vec/exec/format/parquet/parquet_common.h +++ b/be/src/vec/exec/format/parquet/parquet_common.h @@ -54,6 +54,11 @@ struct ParquetInt96 { inline uint64_t to_timestamp_micros() const { return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND; } + inline __int128 to_int128() const { + __int128 ans = 0; + ans = (((__int128)hi) << 64) + lo; + return ans; + } static const uint32_t JULIAN_EPOCH_OFFSET_DAYS; static const uint64_t MICROS_IN_DAY; @@ -151,4 +156,4 @@ private: size_t _num_filtered; size_t _read_index; }; -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/schema_desc.h b/be/src/vec/exec/format/parquet/schema_desc.h index fb61ad918a..8e8f735056 100644 --- a/be/src/vec/exec/format/parquet/schema_desc.h +++ b/be/src/vec/exec/format/parquet/schema_desc.h @@ -88,9 +88,9 @@ private: TypeDescriptor convert_to_doris_type(const tparquet::SchemaElement& physical_schema); +public: TypeDescriptor get_doris_type(const tparquet::SchemaElement& physical_schema); -public: FieldDescriptor() = default; ~FieldDescriptor() = default; diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp index 704703c5ad..86fbba8b25 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp @@ -54,7 +54,7 @@ ColumnChunkReader::ColumnChunkReader(io::BufferedStreamReader* reader, _max_def_level(field_schema->definition_level), _stream_reader(reader), _metadata(column_chunk->meta_data), - _ctz(ctz), + // _ctz(ctz), _io_ctx(io_ctx) {} Status ColumnChunkReader::init() { @@ -194,7 +194,7 @@ Status ColumnChunkReader::load_page_data() { // Set type length page_decoder->set_type_length(_get_type_length()); // Initialize the time convert context - page_decoder->init(_field_schema, _ctz); + // page_decoder->init(_field_schema, _ctz); _decoders[static_cast(encoding)] = std::move(page_decoder); _page_decoder = _decoders[static_cast(encoding)].get(); } @@ -242,7 +242,7 @@ Status ColumnChunkReader::_decode_dict_page() { // Set type length page_decoder->set_type_length(_get_type_length()); // Initialize the time convert context - page_decoder->init(_field_schema, _ctz); + // page_decoder->init(_field_schema, _ctz); // Set the dictionary data RETURN_IF_ERROR(page_decoder->set_dict(dict_data, uncompressed_size, header.dictionary_page_header.num_values)); @@ -323,4 +323,4 @@ int32_t ColumnChunkReader::_get_type_length() { return -1; } } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h index 24415c9830..daf8512b3b 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h @@ -193,7 +193,7 @@ private: io::BufferedStreamReader* _stream_reader; tparquet::ColumnMetaData _metadata; - cctz::time_zone* _ctz; + // cctz::time_zone* _ctz; io::IOContext* _io_ctx; std::unique_ptr _page_reader = nullptr; diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp index 4143a5e079..d6f7d746bc 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.cpp @@ -25,6 +25,7 @@ #include #include +#include "parquet_column_convert.h" #include "runtime/define_primitive_type.h" #include "schema_desc.h" #include "util/runtime_profile.h" @@ -252,8 +253,9 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu NullMap* map_data_column = nullptr; if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); + auto* nullable_column = + static_cast(const_cast(doris_column.get())); + data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); if (_chunk_reader->max_def_level() > 0) { @@ -360,8 +362,11 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType NullMap* map_data_column = nullptr; if (doris_column->is_nullable()) { SCOPED_RAW_TIMER(&_decode_null_map_time); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); + auto* nullable_column = const_cast( + static_cast(doris_column.get())); + + // auto* nullable_column = reinterpret_cast( + // (*std::move(src_column)).mutate().get()); data_column = nullable_column->get_nested_column_ptr(); map_data_column = &(nullable_column->get_null_map_data()); } else { @@ -476,86 +481,108 @@ Status ScalarColumnReader::_try_load_dict_page(bool* loaded, bool* has_dict) { Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr& type, ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) { - if (_chunk_reader->remaining_num_values() == 0) { - if (!_chunk_reader->has_next_page()) { - *eof = true; - *read_rows = 0; - return Status::OK(); - } - RETURN_IF_ERROR(_chunk_reader->next_page()); - } - if (_nested_column) { - RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - return _read_nested_column(doris_column, type, select_vector, batch_size, read_rows, eof, - is_dict_filter); - } + bool need_convert = false; + auto& parquet_physical_type = _chunk_meta.meta_data.type; + auto& show_type = _field_schema->type.type; - // generate the row ranges that should be read - std::list read_ranges; - _generate_read_ranges(_current_row_index, - _current_row_index + _chunk_reader->remaining_num_values(), read_ranges); - if (read_ranges.size() == 0) { - // skip the whole page - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); - *read_rows = 0; - } else { - bool skip_whole_batch = false; - // Determining whether to skip page or batch will increase the calculation time. - // When the filtering effect is greater than 60%, it is possible to skip the page or batch. - if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { - // lazy read - size_t remaining_num_values = 0; - for (auto& range : read_ranges) { - remaining_num_values += range.last_row - range.first_row; - } - if (batch_size >= remaining_num_values && - select_vector.can_filter_all(remaining_num_values)) { - // We can skip the whole page if the remaining values is filtered by predicate columns - select_vector.skip(remaining_num_values); - _current_row_index += _chunk_reader->remaining_num_values(); - RETURN_IF_ERROR(_chunk_reader->skip_page()); - *read_rows = remaining_num_values; - if (!_chunk_reader->has_next_page()) { - *eof = true; - } + ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type, + doris_column, type, &need_convert); + + do { + if (_chunk_reader->remaining_num_values() == 0) { + if (!_chunk_reader->has_next_page()) { + *eof = true; + *read_rows = 0; return Status::OK(); } - skip_whole_batch = - batch_size <= remaining_num_values && select_vector.can_filter_all(batch_size); - if (skip_whole_batch) { - select_vector.skip(batch_size); - } + RETURN_IF_ERROR(_chunk_reader->next_page()); } - // load page data to decode or skip values - RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); - size_t has_read = 0; - for (auto& range : read_ranges) { - // generate the skipped values - size_t skip_values = range.first_row - _current_row_index; - RETURN_IF_ERROR(_skip_values(skip_values)); - _current_row_index += skip_values; - // generate the read values - size_t read_values = - std::min((size_t)(range.last_row - range.first_row), batch_size - has_read); - if (skip_whole_batch) { - RETURN_IF_ERROR(_skip_values(read_values)); - } else { - RETURN_IF_ERROR(_read_values(read_values, doris_column, type, select_vector, - is_dict_filter)); - } - has_read += read_values; - _current_row_index += read_values; - if (has_read == batch_size) { - break; - } + if (_nested_column) { + RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); + RETURN_IF_ERROR(_read_nested_column(src_column, type, select_vector, batch_size, + read_rows, eof, is_dict_filter)); + break; } - *read_rows = has_read; + + // generate the row ranges that should be read + std::list read_ranges; + _generate_read_ranges(_current_row_index, + _current_row_index + _chunk_reader->remaining_num_values(), + read_ranges); + if (read_ranges.size() == 0) { + // skip the whole page + _current_row_index += _chunk_reader->remaining_num_values(); + RETURN_IF_ERROR(_chunk_reader->skip_page()); + *read_rows = 0; + } else { + bool skip_whole_batch = false; + // Determining whether to skip page or batch will increase the calculation time. + // When the filtering effect is greater than 60%, it is possible to skip the page or batch. + if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) { + // lazy read + size_t remaining_num_values = 0; + for (auto& range : read_ranges) { + remaining_num_values += range.last_row - range.first_row; + } + if (batch_size >= remaining_num_values && + select_vector.can_filter_all(remaining_num_values)) { + // We can skip the whole page if the remaining values is filtered by predicate columns + select_vector.skip(remaining_num_values); + _current_row_index += _chunk_reader->remaining_num_values(); + RETURN_IF_ERROR(_chunk_reader->skip_page()); + *read_rows = remaining_num_values; + if (!_chunk_reader->has_next_page()) { + *eof = true; + } + break; + } + skip_whole_batch = batch_size <= remaining_num_values && + select_vector.can_filter_all(batch_size); + if (skip_whole_batch) { + select_vector.skip(batch_size); + } + } + // load page data to decode or skip values + RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent()); + size_t has_read = 0; + for (auto& range : read_ranges) { + // generate the skipped values + size_t skip_values = range.first_row - _current_row_index; + RETURN_IF_ERROR(_skip_values(skip_values)); + _current_row_index += skip_values; + // generate the read values + size_t read_values = + std::min((size_t)(range.last_row - range.first_row), batch_size - has_read); + if (skip_whole_batch) { + RETURN_IF_ERROR(_skip_values(read_values)); + } else { + RETURN_IF_ERROR(_read_values(read_values, src_column, type, select_vector, + is_dict_filter)); + } + has_read += read_values; + _current_row_index += read_values; + if (has_read == batch_size) { + break; + } + } + *read_rows = has_read; + } + + if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { + *eof = true; + } + } while (false); + + if (need_convert) { + std::unique_ptr converter; + ParquetConvert::ConvertParams convert_params; + convert_params.init(_field_schema, _ctz, doris_column->size()); + RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, type, + &converter, &convert_params)); + auto x = doris_column->assume_mutable(); + RETURN_IF_ERROR(converter->convert(src_column, x)); } - if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) { - *eof = true; - } return Status::OK(); } @@ -732,4 +759,4 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr return Status::OK(); } -}; // namespace doris::vectorized +}; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_column_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_reader.h index f4973b4b4e..f8061d0485 100644 --- a/be/src/vec/exec/format/parquet/vparquet_column_reader.h +++ b/be/src/vec/exec/format/parquet/vparquet_column_reader.h @@ -288,4 +288,4 @@ private: std::vector> _child_readers; }; -}; // namespace doris::vectorized +}; // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp index 14d6e00dbd..2c23ed100e 100644 --- a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp +++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp @@ -175,15 +175,8 @@ Status RowGroupReader::init( bool RowGroupReader::_can_filter_by_dict(int slot_id, const tparquet::ColumnMetaData& column_metadata) { - SlotDescriptor* slot = nullptr; - const std::vector& slots = _tuple_descriptor->slots(); - for (auto each : slots) { - if (each->id() == slot_id) { - slot = each; - break; - } - } - if (!slot->type().is_string_type()) { + if (column_metadata.encodings[0] != tparquet::Encoding::RLE_DICTIONARY || + column_metadata.type != tparquet::Type::BYTE_ARRAY) { return false; } @@ -336,6 +329,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ bool can_filter_all = false; RETURN_IF_ERROR_OR_CATCH_EXCEPTION(VExprContext::execute_conjuncts( _filter_conjuncts, &filters, block, &result_filter, &can_filter_all)); + if (can_filter_all) { for (auto& col : columns_to_filter) { std::move(*block->get_by_position(col).column).assume_mutable()->clear(); @@ -344,6 +338,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ _convert_dict_cols_to_string_cols(block); return Status::OK(); } + if (!_not_single_slot_filter_conjuncts.empty()) { _convert_dict_cols_to_string_cols(block); std::vector merged_filters; @@ -362,7 +357,6 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_ RETURN_IF_CATCH_EXCEPTION( RETURN_IF_ERROR(_filter_block(block, column_to_keep, columns_to_filter))); } - *read_rows = block->rows(); return Status::OK(); } @@ -421,8 +415,10 @@ Status RowGroupReader::_read_column_data(Block* block, const std::vector #include -#include #include #include -#include #include #include #include #include "common/status.h" +#include "exec/schema_scanner.h" +#include "gen_cpp/descriptors.pb.h" +#include "gtest/gtest_pred_impl.h" #include "io/file_factory.h" +#include "io/fs/buffered_reader.h" +#include "io/fs/file_reader.h" +#include "io/fs/file_reader_writer_fwd.h" +#include "olap/olap_common.h" #include "parquet_pred_cmp.h" #include "parquet_thrift_util.h" #include "runtime/define_primitive_type.h" +#include "runtime/descriptors.h" #include "runtime/types.h" #include "util/slice.h" +#include "util/timezone_utils.h" +#include "vec/columns/column.h" #include "vec/common/typeid_cast.h" -#include "vec/exec/format/format_common.h" +#include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/types.h" +#include "vec/exec/format/parquet/parquet_common.h" #include "vec/exec/format/parquet/schema_desc.h" #include "vec/exec/format/parquet/vparquet_file_metadata.h" #include "vec/exec/format/parquet/vparquet_group_reader.h" #include "vec/exec/format/parquet/vparquet_page_index.h" #include "vec/exprs/vbloom_predicate.h" #include "vec/exprs/vexpr.h" -#include "vec/exprs/vexpr_context.h" #include "vec/exprs/vin_predicate.h" #include "vec/exprs/vruntimefilter_wrapper.h" #include "vec/exprs/vslot_ref.h" @@ -520,15 +530,14 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof) return Status::OK(); } - { - SCOPED_RAW_TIMER(&_statistics.column_read_time); - Status batch_st = - _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof); - if (!batch_st.ok()) { - return Status::InternalError("Read parquet file {} failed, reason = {}", - _scan_range.path, batch_st.to_string()); - } + SCOPED_RAW_TIMER(&_statistics.column_read_time); + Status batch_st = + _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof); + if (!batch_st.ok()) { + return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path, + batch_st.to_string()); } + if (_row_group_eof) { auto column_st = _current_group_reader->statistics(); _column_statistics.merge(column_st); @@ -897,4 +906,4 @@ int64_t ParquetReader::_get_column_start_offset(const tparquet::ColumnMetaData& } return column.data_page_offset; } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/scan/scanner_context.cpp b/be/src/vec/exec/scan/scanner_context.cpp index a2ee93815c..5723a58bea 100644 --- a/be/src/vec/exec/scan/scanner_context.cpp +++ b/be/src/vec/exec/scan/scanner_context.cpp @@ -168,6 +168,7 @@ vectorized::BlockUPtr ScannerContext::get_free_block() { block = vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size, true /*ignore invalid slots*/); + COUNTER_UPDATE(_newly_create_free_blocks_num, 1); _serving_blocks_num++; diff --git a/be/src/vec/exec/scan/scanner_scheduler.cpp b/be/src/vec/exec/scan/scanner_scheduler.cpp index 8ebb6405bd..3a7b7759bf 100644 --- a/be/src/vec/exec/scan/scanner_scheduler.cpp +++ b/be/src/vec/exec/scan/scanner_scheduler.cpp @@ -464,4 +464,4 @@ void ScannerScheduler::_task_group_scanner_scan(ScannerScheduler* scheduler, } } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/scan/vfile_scanner.cpp b/be/src/vec/exec/scan/vfile_scanner.cpp index 1da53c114e..35e1d3dff5 100644 --- a/be/src/vec/exec/scan/vfile_scanner.cpp +++ b/be/src/vec/exec/scan/vfile_scanner.cpp @@ -1105,4 +1105,4 @@ Status VFileScanner::close(RuntimeState* state) { return Status::OK(); } -} // namespace doris::vectorized +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/exec/test_data/parquet_scanner/dict-decoder.txt b/be/test/exec/test_data/parquet_scanner/dict-decoder.txt index 35414043ed..6dd9a5dfb4 100644 --- a/be/test/exec/test_data/parquet_scanner/dict-decoder.txt +++ b/be/test/exec/test_data/parquet_scanner/dict-decoder.txt @@ -1,16 +1,16 @@ -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ diff --git a/be/test/exec/test_data/parquet_scanner/type-decoder.txt b/be/test/exec/test_data/parquet_scanner/type-decoder.txt index 6a2805d661..e56b5574b1 100644 --- a/be/test/exec/test_data/parquet_scanner/type-decoder.txt +++ b/be/test/exec/test_data/parquet_scanner/type-decoder.txt @@ -1,14 +1,14 @@ -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ -| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| -| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18| -| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19| -3.140000000| c-row2| vc-row2| 2022-08-03| 2022-08-03| 2022-08-03 07:23:19| -| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17| 4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17| -| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17| -| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17| -| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17| -| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17| -| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17| -| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17| -+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+ ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ +| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17| +| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18.000000| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18| +| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19.000000| -3.140000000| c-row2| vc-row2| 2022-08-03| 2022-08-03| 2022-08-03 07:23:19| +| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17.000000| 4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17| +| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17.000000| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17| +| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17.000000| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17| +| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17.000000| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17| +| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17.000000| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17| +| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17.000000| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17| +| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17.000000| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17| ++---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+ diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp index be78d46815..4daa548e2e 100644 --- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp @@ -59,6 +59,7 @@ #include "vec/core/column_with_type_and_name.h" #include "vec/data_types/data_type.h" #include "vec/data_types/data_type_factory.hpp" +#include "vec/exec/format/parquet/parquet_column_convert.h" #include "vec/exec/format/parquet/parquet_common.h" #include "vec/exec/format/parquet/parquet_thrift_util.h" #include "vec/exec/format/parquet/schema_desc.h" @@ -167,8 +168,8 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) { static int fill_nullable_column(ColumnPtr& doris_column, level_t* definitions, size_t num_values) { CHECK(doris_column->is_nullable()); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); + auto* nullable_column = const_cast( + static_cast(doris_column.get())); NullMap& map_data = nullable_column->get_null_map_data(); int null_cnt = 0; for (int i = 0; i < num_values; ++i) { @@ -189,6 +190,14 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column ? chunk_meta.dictionary_page_offset : chunk_meta.data_page_offset; size_t chunk_size = chunk_meta.total_compressed_size; + + bool need_convert = false; + auto& parquet_physical_type = column_chunk->meta_data.type; + auto& show_type = field_schema->type.type; + + ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type, + doris_column, data_type, &need_convert); + io::BufferedFileStreamReader stream_reader(file_reader, start_offset, chunk_size, 1024); cctz::time_zone ctz; @@ -208,14 +217,14 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column chunk_reader.get_def_levels(definitions, rows); } MutableColumnPtr data_column; - if (doris_column->is_nullable()) { + if (src_column->is_nullable()) { // fill nullable values - fill_nullable_column(doris_column, definitions, rows); - auto* nullable_column = reinterpret_cast( - (*std::move(doris_column)).mutate().get()); + fill_nullable_column(src_column, definitions, rows); + auto* nullable_column = const_cast( + static_cast(src_column.get())); data_column = nullable_column->get_nested_column_ptr(); } else { - data_column = doris_column->assume_mutable(); + data_column = src_column->assume_mutable(); } ColumnSelectVector run_length_map; // decode page data @@ -223,7 +232,7 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column // required column std::vector null_map = {(u_short)rows}; run_length_map.set_run_length_null_map(null_map, rows, nullptr); - return chunk_reader.decode_values(data_column, data_type, run_length_map, false); + RETURN_IF_ERROR(chunk_reader.decode_values(data_column, data_type, run_length_map, false)); } else { // column with null values level_t level_type = definitions[0]; @@ -254,8 +263,18 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column RETURN_IF_ERROR( chunk_reader.decode_values(data_column, data_type, run_length_map, false)); } - return Status::OK(); } + if (need_convert) { + std::unique_ptr converter; + ParquetConvert::ConvertParams convert_params; + convert_params.init(field_schema, &ctz, doris_column->size()); + RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, data_type, + &converter, &convert_params)); + auto x = doris_column->assume_mutable(); + RETURN_IF_ERROR(converter->convert(src_column, x)); + } + + return Status::OK(); } // Only the unit test depend on this, but it is wrong, should not use TTupleDesc to create tuple desc, not @@ -340,11 +359,11 @@ static void create_block(std::unique_ptr& block) { // binary is not supported, use string instead {"binary_col", TYPE_STRING, sizeof(StringRef), true}, // 64-bit-length, see doris::get_slot_size in primitive_type.cpp - {"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true}, + {"timestamp_col", TYPE_DATETIMEV2, sizeof(int128_t), true}, {"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true}, {"char_col", TYPE_CHAR, sizeof(StringRef), true}, {"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true}, - {"date_col", TYPE_DATE, sizeof(int128_t), true}, + {"date_col", TYPE_DATEV2, sizeof(uint32_t), true}, {"date_v2_col", TYPE_DATEV2, sizeof(uint32_t), true}, {"timestamp_v2_col", TYPE_DATETIMEV2, sizeof(int128_t), true, 18, 0}}; SchemaScanner schema_scanner(column_descs); @@ -448,118 +467,6 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) { read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet", "./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12); } - -TEST_F(ParquetThriftReaderTest, group_reader) { - std::vector column_descs = { - {"tinyint_col", TYPE_TINYINT, sizeof(int8_t), true}, - {"smallint_col", TYPE_SMALLINT, sizeof(int16_t), true}, - {"int_col", TYPE_INT, sizeof(int32_t), true}, - {"bigint_col", TYPE_BIGINT, sizeof(int64_t), true}, - {"boolean_col", TYPE_BOOLEAN, sizeof(bool), true}, - {"float_col", TYPE_FLOAT, sizeof(float_t), true}, - {"double_col", TYPE_DOUBLE, sizeof(double_t), true}, - {"string_col", TYPE_STRING, sizeof(StringRef), true}, - {"binary_col", TYPE_STRING, sizeof(StringRef), true}, - {"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true}, - {"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true}, - {"char_col", TYPE_CHAR, sizeof(StringRef), true}, - {"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true}, - {"date_col", TYPE_DATE, sizeof(int128_t), true}}; - SchemaScanner schema_scanner(column_descs); - ObjectPool object_pool; - doris::TupleDescriptor* tuple_desc = create_tuple_desc(&object_pool, column_descs); - auto tuple_slots = tuple_desc->slots(); - - TSlotDescriptor tslot_desc; - { - tslot_desc.id = 14; - tslot_desc.parent = 0; - TTypeDesc type; - { - TTypeNode node; - node.__set_type(TTypeNodeType::ARRAY); - std::vector contains_nulls {true}; - node.__set_contains_nulls(contains_nulls); - TTypeNode inner; - inner.__set_type(TTypeNodeType::SCALAR); - TScalarType scalar_type; - scalar_type.__set_type(TPrimitiveType::STRING); - inner.__set_scalar_type(scalar_type); - inner.__set_contains_nulls(contains_nulls); - type.types.push_back(node); - type.types.push_back(inner); - } - tslot_desc.slotType = type; - tslot_desc.columnPos = 14; - tslot_desc.byteOffset = 0; - tslot_desc.nullIndicatorByte = 0; - tslot_desc.nullIndicatorBit = -1; - tslot_desc.colName = "list_string"; - tslot_desc.slotIdx = 14; - tslot_desc.isMaterialized = true; - } - SlotDescriptor string_slot(tslot_desc); - tuple_slots.emplace_back(&string_slot); - - std::vector read_columns; - RowGroupReader::LazyReadContext lazy_read_ctx; - for (const auto& slot : tuple_slots) { - lazy_read_ctx.all_read_columns.emplace_back(slot->col_name()); - read_columns.emplace_back(slot->col_name()); - } - io::FileSystemSPtr local_fs = io::LocalFileSystem::create(""); - io::FileReaderSPtr file_reader; - auto st = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet", - &file_reader); - EXPECT_TRUE(st.ok()); - - // prepare metadata - FileMetaData* meta_data; - size_t meta_size; - static_cast(parse_thrift_footer(file_reader, &meta_data, &meta_size, nullptr)); - tparquet::FileMetaData t_metadata = meta_data->to_thrift(); - - cctz::time_zone ctz; - TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz); - auto row_group = t_metadata.row_groups[0]; - std::shared_ptr row_group_reader; - RowGroupReader::PositionDeleteContext position_delete_ctx(row_group.num_rows, 0); - row_group_reader.reset(new RowGroupReader(file_reader, read_columns, 0, row_group, &ctz, - nullptr, position_delete_ctx, lazy_read_ctx, - nullptr)); - std::vector row_ranges; - row_ranges.emplace_back(0, row_group.num_rows); - - auto col_offsets = std::unordered_map(); - auto stg = row_group_reader->init(meta_data->schema(), row_ranges, col_offsets, nullptr, - nullptr, nullptr, nullptr, nullptr); - EXPECT_TRUE(stg.ok()); - - vectorized::Block block; - for (const auto& slot_desc : tuple_slots) { - auto data_type = - vectorized::DataTypeFactory::instance().create_data_type(slot_desc->type(), true); - MutableColumnPtr data_column = data_type->create_column(); - block.insert( - ColumnWithTypeAndName(std::move(data_column), data_type, slot_desc->col_name())); - } - bool batch_eof = false; - size_t read_rows = 0; - auto stb = row_group_reader->next_batch(&block, 1024, &read_rows, &batch_eof); - EXPECT_TRUE(stb.ok()); - - io::FileReaderSPtr result; - auto rst = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/group-reader.txt", - &result); - EXPECT_TRUE(rst.ok()); - uint8_t result_buf[result->size() + 1]; - result_buf[result->size()] = '\0'; - size_t bytes_read; - Slice res(result_buf, result->size()); - static_cast(result->read_at(0, res, &bytes_read)); - ASSERT_STREQ(block.dump_data(0, 10).c_str(), reinterpret_cast(result_buf)); - delete meta_data; -} } // namespace vectorized } // namespace doris diff --git a/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out new file mode 100644 index 0000000000..cca084ff0f --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_hive_parquet_alter_column.out @@ -0,0 +1,1676 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !desc -- +col_int INT Yes true \N +col_smallint INT Yes true \N +col_tinyint INT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint SMALLINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int BIGINT Yes true \N +col_smallint BIGINT Yes true \N +col_tinyint BIGINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int FLOAT Yes true \N +col_smallint FLOAT Yes true \N +col_tinyint FLOAT Yes true \N +col_bigint FLOAT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1.0 +-1.0 +-1.0 + +-- !order -- +-400.0 +-400.0 +-400.0 + +-- !order -- +-20.0 +-20.0 +-20.0 + +-- !order -- +-4.0E8 +-4.0E8 +-4.0E8 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int DOUBLE Yes true \N +col_smallint DOUBLE Yes true \N +col_tinyint DOUBLE Yes true \N +col_bigint DOUBLE Yes true \N +col_float DOUBLE Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1.0 -400.0 -20.0 -4.0E8 40.54439926147461 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1.0 +-1.0 +-1.0 + +-- !order -- +-400.0 +-400.0 +-400.0 + +-- !order -- +-20.0 +-20.0 +-20.0 + +-- !order -- +-4.0E8 +-4.0E8 +-4.0E8 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int TEXT Yes true \N +col_smallint TEXT Yes true \N +col_tinyint TEXT Yes true \N +col_bigint TEXT Yes true \N +col_float TEXT Yes true \N +col_double TEXT Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char TEXT Yes true \N +col_varchar TEXT Yes true \N +col_date TEXT Yes true \N +col_timestamp TEXT Yes true \N +col_decimal TEXT Yes true \N + +-- !show -- +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-200 +-200 +-200 + +-- !order -- +-10 +-10 +-10 + +-- !order -- +-20000000 +-20000000 +-20000000 + +-- !order -- +10.500000 +10.500000 +10.500000 + +-- !order -- +20.750000 +20.750000 +20.750000 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06 14:30:00 +2023-10-06 14:30:00 +2023-10-06 14:30:00 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int CHAR(10) Yes true \N +col_smallint CHAR(10) Yes true \N +col_tinyint CHAR(10) Yes true \N +col_bigint CHAR(10) Yes true \N +col_float CHAR(10) Yes true \N +col_double CHAR(10) Yes true \N +col_boolean BOOLEAN Yes true \N +col_string CHAR(10) Yes true \N +col_char CHAR(10) Yes true \N +col_varchar CHAR(10) Yes true \N +col_date CHAR(10) Yes true \N +col_timestamp CHAR(10) Yes true \N +col_decimal CHAR(10) Yes true \N + +-- !show -- +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-200 +-200 +-200 + +-- !order -- +-10 +-10 +-10 + +-- !order -- +-20000000 +-20000000 +-20000000 + +-- !order -- +10.500000 +10.500000 +10.500000 + +-- !order -- +20.750000 +20.750000 +20.750000 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06 14:30:00 +2023-10-06 14:30:00 +2023-10-06 14:30:00 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int VARCHAR(20) Yes true \N +col_smallint VARCHAR(20) Yes true \N +col_tinyint VARCHAR(20) Yes true \N +col_bigint VARCHAR(20) Yes true \N +col_float VARCHAR(20) Yes true \N +col_double VARCHAR(20) Yes true \N +col_boolean BOOLEAN Yes true \N +col_string VARCHAR(20) Yes true \N +col_char VARCHAR(20) Yes true \N +col_varchar VARCHAR(20) Yes true \N +col_date VARCHAR(20) Yes true \N +col_timestamp VARCHAR(20) Yes true \N +col_decimal VARCHAR(20) Yes true \N + +-- !show -- +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 +-1 -200 -10 -20000000 20.577700 30.750000 false First A ADC 2023-10-06 2023-10-09 17:15:00 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-200 +-200 +-200 + +-- !order -- +-10 +-10 +-10 + +-- !order -- +-20000000 +-20000000 +-20000000 + +-- !order -- +10.500000 +10.500000 +10.500000 + +-- !order -- +20.750000 +20.750000 +20.750000 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06 14:30:00 +2023-10-06 14:30:00 +2023-10-06 14:30:00 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int INT Yes true \N +col_smallint SMALLINT Yes true \N +col_tinyint TINYINT Yes true \N +col_bigint BIGINT Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(10, 2) Yes true \N + +-- !show -- +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 +-1 -400 -20 -400000000 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.45 + +-- !order -- +-1 +-1 +-1 + +-- !order -- +-400 +-400 +-400 + +-- !order -- +-20 +-20 +-20 + +-- !order -- +-400000000 +-400000000 +-400000000 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.45 +123.45 +123.45 + +-- !desc -- +col_int DECIMAL(5, 1) Yes true \N +col_smallint DECIMAL(5, 1) Yes true \N +col_tinyint DECIMAL(5, 1) Yes true \N +col_bigint DECIMAL(5, 1) Yes true \N +col_float FLOAT Yes true \N +col_double DOUBLE Yes true \N +col_boolean BOOLEAN Yes true \N +col_string TEXT Yes true \N +col_char CHAR(10) Yes true \N +col_varchar VARCHAR(255) Yes true \N +col_date DATE Yes true \N +col_timestamp DATETIME(6) Yes true \N +col_decimal DECIMAL(5, 1) Yes true \N + +-- !show -- +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 +-1.0 -400.0 -20.0 29496729.6 40.5444 50.75 false First A ADC 2023-10-06 2023-10-09T17:15 1238.4 + +-- !order -- +-1.0 +-1.0 +-1.0 + +-- !order -- +-400.0 +-400.0 +-400.0 + +-- !order -- +-20.0 +-20.0 +-20.0 + +-- !order -- +-153960755.2 +-153960755.2 +-153960755.2 + +-- !order -- +10.5 +10.5 +10.5 + +-- !order -- +20.75 +20.75 +20.75 + +-- !order -- +false +false +false + +-- !order -- +Fifth +Fifth +Fifth + +-- !order -- +A +A +A + +-- !order -- +ADC +ADC +ADC + +-- !order -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !order -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !order -- +123.4 +123.4 +123.4 + +-- !int_int -- +2 +2 +2 + +-- !int_smallint -- +100 +100 +100 + +-- !int_tinyint -- +5 +5 +5 + +-- !int_bigint -- +1000000000 +1000000000 +1000000000 + +-- !int_float -- + +-- !int_double -- + +-- !int_boolean -- + +-- !int_string -- + +-- !int_char -- +B +B +B + +-- !int_varchar -- + +-- !int_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !int_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !int_decimal -- + +-- !smallint_int -- +1 +1 +1 + +-- !smallint_smallint -- +100 +100 +100 + +-- !smallint_tinyint -- +5 +5 +5 + +-- !smallint_bigint -- +1000000000 +1000000000 +1000000000 + +-- !smallint_float -- + +-- !smallint_double -- + +-- !smallint_boolean -- + +-- !smallint_string -- + +-- !smallint_char -- +C +C +C + +-- !smallint_varchar -- + +-- !smallint_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !smallint_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !smallint_decimal -- + +-- !tinyint_int -- +3 +3 +3 + +-- !tinyint_smallint -- +100 +100 +100 + +-- !tinyint_tinyint -- +5 +5 +5 + +-- !tinyint_bigint -- +1000000000 +1000000000 +1000000000 + +-- !tinyint_float -- + +-- !tinyint_double -- + +-- !tinyint_boolean -- + +-- !tinyint_string -- + +-- !tinyint_char -- +A +A +A + +-- !tinyint_varchar -- + +-- !tinyint_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !tinyint_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !tinyint_decimal -- + +-- !bigint_int -- +3 +3 +3 + +-- !bigint_smallint -- +100 +100 +100 + +-- !bigint_tinyint -- +5 +5 +5 + +-- !bigint_bigint -- +1000000000 +1000000000 +1000000000 + +-- !bigint_float -- + +-- !bigint_double -- + +-- !bigint_boolean -- + +-- !bigint_string -- + +-- !bigint_char -- +A +A +A + +-- !bigint_varchar -- + +-- !bigint_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !bigint_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !bigint_decimal -- + +-- !float_int -- + +-- !float_smallint -- + +-- !float_tinyint -- + +-- !float_bigint -- + +-- !float_float -- + +-- !float_double -- + +-- !float_boolean -- + +-- !float_string -- + +-- !float_char -- + +-- !float_varchar -- + +-- !float_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !float_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !float_decimal -- + +-- !double_int -- +2.0 +2.0 +2.0 + +-- !double_smallint -- + +-- !double_tinyint -- + +-- !double_bigint -- + +-- !double_float -- + +-- !double_double -- + +-- !double_boolean -- + +-- !double_string -- + +-- !double_char -- +A +A +A + +-- !double_varchar -- + +-- !double_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !double_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !double_decimal -- + +-- !boolean_int -- +3 +3 +3 + +-- !boolean_smallint -- +100 +100 +100 + +-- !boolean_tinyint -- +5 +5 +5 + +-- !boolean_bigint -- +1000000000 +1000000000 +1000000000 + +-- !boolean_float -- + +-- !boolean_double -- + +-- !boolean_boolean -- + +-- !boolean_string -- + +-- !boolean_char -- +A +A +A + +-- !boolean_varchar -- + +-- !boolean_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !boolean_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !boolean_decimal -- + +-- !string_int -- + +-- !string_smallint -- + +-- !string_tinyint -- + +-- !string_bigint -- + +-- !string_float -- + +-- !string_double -- + +-- !string_boolean -- + +-- !string_string -- + +-- !string_char -- +A +A +A + +-- !string_varchar -- + +-- !string_date -- + +-- !string_timestamp -- + +-- !string_decimal -- + +-- !char_int -- + +-- !char_smallint -- + +-- !char_tinyint -- + +-- !char_bigint -- + +-- !char_float -- + +-- !char_double -- + +-- !char_boolean -- + +-- !char_string -- + +-- !char_char -- +A +A +A + +-- !char_varchar -- + +-- !char_date -- + +-- !char_timestamp -- + +-- !char_decimal -- + +-- !varchar_int -- + +-- !varchar_smallint -- + +-- !varchar_tinyint -- + +-- !varchar_bigint -- + +-- !varchar_float -- + +-- !varchar_double -- + +-- !varchar_boolean -- + +-- !varchar_string -- + +-- !varchar_char -- +B +B +B + +-- !varchar_varchar -- + +-- !varchar_date -- + +-- !varchar_timestamp -- + +-- !varchar_decimal -- + +-- !date_int -- +3 +3 +3 + +-- !date_smallint -- +100 +100 +100 + +-- !date_tinyint -- +5 +5 +5 + +-- !date_bigint -- +1000000000 +1000000000 +1000000000 + +-- !date_float -- + +-- !date_double -- + +-- !date_boolean -- + +-- !date_string -- + +-- !date_char -- +A +A +A + +-- !date_varchar -- + +-- !date_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !date_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !date_decimal -- + +-- !timestamp_int -- +3 +3 +3 + +-- !timestamp_smallint -- +100 +100 +100 + +-- !timestamp_tinyint -- +5 +5 +5 + +-- !timestamp_bigint -- +1000000000 +1000000000 +1000000000 + +-- !timestamp_float -- + +-- !timestamp_double -- + +-- !timestamp_boolean -- + +-- !timestamp_string -- + +-- !timestamp_char -- +B +B +B + +-- !timestamp_varchar -- + +-- !timestamp_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !timestamp_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !timestamp_decimal -- + +-- !decimal_int -- + +-- !decimal_smallint -- + +-- !decimal_tinyint -- + +-- !decimal_bigint -- + +-- !decimal_float -- + +-- !decimal_double -- + +-- !decimal_boolean -- + +-- !decimal_string -- + +-- !decimal_char -- + +-- !decimal_varchar -- + +-- !decimal_date -- +2023-10-06 +2023-10-06 +2023-10-06 + +-- !decimal_timestamp -- +2023-10-06T14:30 +2023-10-06T14:30 +2023-10-06T14:30 + +-- !decimal_decimal -- + diff --git a/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy b/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy new file mode 100644 index 0000000000..379554c7f1 --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_parquet_alter_column.groovy @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hive_parquet_alter_column", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String hms_port = context.config.otherConfigs.get("hms_port") + + String catalog_name = "test_hive_parquet_alter_column" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + sql """switch ${catalog_name};""" + logger.info("switched to catalog " + catalog_name) + String Orderby = """ order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_decimal,col_date,col_timestamp limit 7 """ + + sql """ use multi_catalog """ + + + + types = ["int","smallint","tinyint","bigint","float","double","boolean","string","char","varchar","date","timestamp","decimal"] + + for( String type1 in types) { + qt_desc """ desc parquet_alter_column_to_${type1} ; """ + + qt_show """ select * from parquet_alter_column_to_${type1} ${Orderby} """ + + for( String type2 in types) { + + qt_order """ select col_${type2} from parquet_alter_column_to_${type1} order by col_${type2} limit 3 """ + + } + } + + order_qt_int_int """ select col_int from parquet_alter_column_to_int where col_int>=2 order by col_int limit 3""" + order_qt_int_smallint """ select col_smallint from parquet_alter_column_to_int where col_smallint>=3 order by col_smallint limit 3""" + order_qt_int_tinyint """ select col_tinyint from parquet_alter_column_to_int where col_tinyint>=3 order by col_tinyint limit 3""" + order_qt_int_bigint """ select col_bigint from parquet_alter_column_to_int where col_bigint>=3 order by col_bigint limit 3""" + order_qt_int_float """ select col_float from parquet_alter_column_to_int where col_float=2.6 order by col_float limit 3""" + order_qt_int_double """ select col_double from parquet_alter_column_to_int where col_double=0.8 order by col_double limit 3""" + order_qt_int_boolean """ select col_boolean from parquet_alter_column_to_int where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_int_string """ select col_string from parquet_alter_column_to_int where col_string="B" order by col_string limit 3""" + order_qt_int_char """ select col_char from parquet_alter_column_to_int where col_char="B" order by col_char limit 3""" + order_qt_int_varchar """ select col_varchar from parquet_alter_column_to_int where col_varchar="C" order by col_varchar limit 3""" + order_qt_int_date """ select col_date from parquet_alter_column_to_int where year(col_date)=2023 order by col_date limit 3""" + order_qt_int_timestamp """ select col_timestamp from parquet_alter_column_to_int where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_int_decimal """ select col_decimal from parquet_alter_column_to_int where col_decimal=1.1 order by col_decimal limit 3""" + order_qt_smallint_int """ select col_int from parquet_alter_column_to_smallint where col_int>=1 order by col_int limit 3""" + order_qt_smallint_smallint """ select col_smallint from parquet_alter_column_to_smallint where col_smallint>=3 order by col_smallint limit 3""" + order_qt_smallint_tinyint """ select col_tinyint from parquet_alter_column_to_smallint where col_tinyint>=2 order by col_tinyint limit 3""" + order_qt_smallint_bigint """ select col_bigint from parquet_alter_column_to_smallint where col_bigint>=2 order by col_bigint limit 3""" + order_qt_smallint_float """ select col_float from parquet_alter_column_to_smallint where col_float=3.0 order by col_float limit 3""" + order_qt_smallint_double """ select col_double from parquet_alter_column_to_smallint where col_double=0.5 order by col_double limit 3""" + order_qt_smallint_boolean """ select col_boolean from parquet_alter_column_to_smallint where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_smallint_string """ select col_string from parquet_alter_column_to_smallint where col_string="helloworld" order by col_string limit 3""" + order_qt_smallint_char """ select col_char from parquet_alter_column_to_smallint where col_char="C" order by col_char limit 3""" + order_qt_smallint_varchar """ select col_varchar from parquet_alter_column_to_smallint where col_varchar="A" order by col_varchar limit 3""" + order_qt_smallint_date """ select col_date from parquet_alter_column_to_smallint where year(col_date)=2023 order by col_date limit 3""" + order_qt_smallint_timestamp """ select col_timestamp from parquet_alter_column_to_smallint where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_smallint_decimal """ select col_decimal from parquet_alter_column_to_smallint where col_decimal=2.5 order by col_decimal limit 3""" + order_qt_tinyint_int """ select col_int from parquet_alter_column_to_tinyint where col_int>=3 order by col_int limit 3""" + order_qt_tinyint_smallint """ select col_smallint from parquet_alter_column_to_tinyint where col_smallint>=3 order by col_smallint limit 3""" + order_qt_tinyint_tinyint """ select col_tinyint from parquet_alter_column_to_tinyint where col_tinyint>=3 order by col_tinyint limit 3""" + order_qt_tinyint_bigint """ select col_bigint from parquet_alter_column_to_tinyint where col_bigint>=1 order by col_bigint limit 3""" + order_qt_tinyint_float """ select col_float from parquet_alter_column_to_tinyint where col_float=0.6 order by col_float limit 3""" + order_qt_tinyint_double """ select col_double from parquet_alter_column_to_tinyint where col_double=1.1 order by col_double limit 3""" + order_qt_tinyint_boolean """ select col_boolean from parquet_alter_column_to_tinyint where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_tinyint_string """ select col_string from parquet_alter_column_to_tinyint where col_string="helloworld" order by col_string limit 3""" + order_qt_tinyint_char """ select col_char from parquet_alter_column_to_tinyint where col_char="A" order by col_char limit 3""" + order_qt_tinyint_varchar """ select col_varchar from parquet_alter_column_to_tinyint where col_varchar="C" order by col_varchar limit 3""" + order_qt_tinyint_date """ select col_date from parquet_alter_column_to_tinyint where year(col_date)=2023 order by col_date limit 3""" + order_qt_tinyint_timestamp """ select col_timestamp from parquet_alter_column_to_tinyint where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_tinyint_decimal """ select col_decimal from parquet_alter_column_to_tinyint where col_decimal=1.4 order by col_decimal limit 3""" + order_qt_bigint_int """ select col_int from parquet_alter_column_to_bigint where col_int>=3 order by col_int limit 3""" + order_qt_bigint_smallint """ select col_smallint from parquet_alter_column_to_bigint where col_smallint>=2 order by col_smallint limit 3""" + order_qt_bigint_tinyint """ select col_tinyint from parquet_alter_column_to_bigint where col_tinyint>=2 order by col_tinyint limit 3""" + order_qt_bigint_bigint """ select col_bigint from parquet_alter_column_to_bigint where col_bigint>=1 order by col_bigint limit 3""" + order_qt_bigint_float """ select col_float from parquet_alter_column_to_bigint where col_float=2.5 order by col_float limit 3""" + order_qt_bigint_double """ select col_double from parquet_alter_column_to_bigint where col_double=0.2 order by col_double limit 3""" + order_qt_bigint_boolean """ select col_boolean from parquet_alter_column_to_bigint where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_bigint_string """ select col_string from parquet_alter_column_to_bigint where col_string="A" order by col_string limit 3""" + order_qt_bigint_char """ select col_char from parquet_alter_column_to_bigint where col_char="A" order by col_char limit 3""" + order_qt_bigint_varchar """ select col_varchar from parquet_alter_column_to_bigint where col_varchar="A" order by col_varchar limit 3""" + order_qt_bigint_date """ select col_date from parquet_alter_column_to_bigint where year(col_date)=2023 order by col_date limit 3""" + order_qt_bigint_timestamp """ select col_timestamp from parquet_alter_column_to_bigint where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_bigint_decimal """ select col_decimal from parquet_alter_column_to_bigint where col_decimal=0.8 order by col_decimal limit 3""" + order_qt_float_int """ select col_int from parquet_alter_column_to_float where col_int=1.4 order by col_int limit 3""" + order_qt_float_smallint """ select col_smallint from parquet_alter_column_to_float where col_smallint=0.3 order by col_smallint limit 3""" + order_qt_float_tinyint """ select col_tinyint from parquet_alter_column_to_float where col_tinyint=0.2 order by col_tinyint limit 3""" + order_qt_float_bigint """ select col_bigint from parquet_alter_column_to_float where col_bigint=2.2 order by col_bigint limit 3""" + order_qt_float_float """ select col_float from parquet_alter_column_to_float where col_float=1.2 order by col_float limit 3""" + order_qt_float_double """ select col_double from parquet_alter_column_to_float where col_double=1.5 order by col_double limit 3""" + order_qt_float_boolean """ select col_boolean from parquet_alter_column_to_float where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_float_string """ select col_string from parquet_alter_column_to_float where col_string="A" order by col_string limit 3""" + order_qt_float_char """ select col_char from parquet_alter_column_to_float where col_char="helloworld" order by col_char limit 3""" + order_qt_float_varchar """ select col_varchar from parquet_alter_column_to_float where col_varchar="1" order by col_varchar limit 3""" + order_qt_float_date """ select col_date from parquet_alter_column_to_float where year(col_date)=2023 order by col_date limit 3""" + order_qt_float_timestamp """ select col_timestamp from parquet_alter_column_to_float where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_float_decimal """ select col_decimal from parquet_alter_column_to_float where col_decimal=0.8 order by col_decimal limit 3""" + order_qt_double_int """ select col_int from parquet_alter_column_to_double where col_int=2.0 order by col_int limit 3""" + order_qt_double_smallint """ select col_smallint from parquet_alter_column_to_double where col_smallint=2.0 order by col_smallint limit 3""" + order_qt_double_tinyint """ select col_tinyint from parquet_alter_column_to_double where col_tinyint=1.4 order by col_tinyint limit 3""" + order_qt_double_bigint """ select col_bigint from parquet_alter_column_to_double where col_bigint=1.5 order by col_bigint limit 3""" + order_qt_double_float """ select col_float from parquet_alter_column_to_double where col_float=2.2 order by col_float limit 3""" + order_qt_double_double """ select col_double from parquet_alter_column_to_double where col_double=0.6 order by col_double limit 3""" + order_qt_double_boolean """ select col_boolean from parquet_alter_column_to_double where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_double_string """ select col_string from parquet_alter_column_to_double where col_string="B" order by col_string limit 3""" + order_qt_double_char """ select col_char from parquet_alter_column_to_double where col_char="A" order by col_char limit 3""" + order_qt_double_varchar """ select col_varchar from parquet_alter_column_to_double where col_varchar="C" order by col_varchar limit 3""" + order_qt_double_date """ select col_date from parquet_alter_column_to_double where year(col_date)=2023 order by col_date limit 3""" + order_qt_double_timestamp """ select col_timestamp from parquet_alter_column_to_double where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_double_decimal """ select col_decimal from parquet_alter_column_to_double where col_decimal=0.3 order by col_decimal limit 3""" + order_qt_boolean_int """ select col_int from parquet_alter_column_to_boolean where col_int>=3 order by col_int limit 3""" + order_qt_boolean_smallint """ select col_smallint from parquet_alter_column_to_boolean where col_smallint>=2 order by col_smallint limit 3""" + order_qt_boolean_tinyint """ select col_tinyint from parquet_alter_column_to_boolean where col_tinyint>=1 order by col_tinyint limit 3""" + order_qt_boolean_bigint """ select col_bigint from parquet_alter_column_to_boolean where col_bigint>=3 order by col_bigint limit 3""" + order_qt_boolean_float """ select col_float from parquet_alter_column_to_boolean where col_float=1.1 order by col_float limit 3""" + order_qt_boolean_double """ select col_double from parquet_alter_column_to_boolean where col_double=0.5 order by col_double limit 3""" + order_qt_boolean_boolean """ select col_boolean from parquet_alter_column_to_boolean where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_boolean_string """ select col_string from parquet_alter_column_to_boolean where col_string="1" order by col_string limit 3""" + order_qt_boolean_char """ select col_char from parquet_alter_column_to_boolean where col_char="A" order by col_char limit 3""" + order_qt_boolean_varchar """ select col_varchar from parquet_alter_column_to_boolean where col_varchar="B" order by col_varchar limit 3""" + order_qt_boolean_date """ select col_date from parquet_alter_column_to_boolean where year(col_date)=2023 order by col_date limit 3""" + order_qt_boolean_timestamp """ select col_timestamp from parquet_alter_column_to_boolean where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_boolean_decimal """ select col_decimal from parquet_alter_column_to_boolean where col_decimal=2.8 order by col_decimal limit 3""" + order_qt_string_int """ select col_int from parquet_alter_column_to_string where col_int="C" order by col_int limit 3""" + order_qt_string_smallint """ select col_smallint from parquet_alter_column_to_string where col_smallint="C" order by col_smallint limit 3""" + order_qt_string_tinyint """ select col_tinyint from parquet_alter_column_to_string where col_tinyint="B" order by col_tinyint limit 3""" + order_qt_string_bigint """ select col_bigint from parquet_alter_column_to_string where col_bigint="helloworld" order by col_bigint limit 3""" + order_qt_string_float """ select col_float from parquet_alter_column_to_string where col_float="1" order by col_float limit 3""" + order_qt_string_double """ select col_double from parquet_alter_column_to_string where col_double="C" order by col_double limit 3""" + order_qt_string_boolean """ select col_boolean from parquet_alter_column_to_string where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_string_string """ select col_string from parquet_alter_column_to_string where col_string="B" order by col_string limit 3""" + order_qt_string_char """ select col_char from parquet_alter_column_to_string where col_char="A" order by col_char limit 3""" + order_qt_string_varchar """ select col_varchar from parquet_alter_column_to_string where col_varchar="B" order by col_varchar limit 3""" + order_qt_string_date """ select col_date from parquet_alter_column_to_string where col_date="helloworld" order by col_date limit 3""" + order_qt_string_timestamp """ select col_timestamp from parquet_alter_column_to_string where col_timestamp="B" order by col_timestamp limit 3""" + order_qt_string_decimal """ select col_decimal from parquet_alter_column_to_string where col_decimal="1" order by col_decimal limit 3""" + order_qt_char_int """ select col_int from parquet_alter_column_to_char where col_int="B" order by col_int limit 3""" + order_qt_char_smallint """ select col_smallint from parquet_alter_column_to_char where col_smallint="A" order by col_smallint limit 3""" + order_qt_char_tinyint """ select col_tinyint from parquet_alter_column_to_char where col_tinyint="A" order by col_tinyint limit 3""" + order_qt_char_bigint """ select col_bigint from parquet_alter_column_to_char where col_bigint="B" order by col_bigint limit 3""" + order_qt_char_float """ select col_float from parquet_alter_column_to_char where col_float="C" order by col_float limit 3""" + order_qt_char_double """ select col_double from parquet_alter_column_to_char where col_double="A" order by col_double limit 3""" + order_qt_char_boolean """ select col_boolean from parquet_alter_column_to_char where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_char_string """ select col_string from parquet_alter_column_to_char where col_string="C" order by col_string limit 3""" + order_qt_char_char """ select col_char from parquet_alter_column_to_char where col_char="A" order by col_char limit 3""" + order_qt_char_varchar """ select col_varchar from parquet_alter_column_to_char where col_varchar="B" order by col_varchar limit 3""" + order_qt_char_date """ select col_date from parquet_alter_column_to_char where col_date="B" order by col_date limit 3""" + order_qt_char_timestamp """ select col_timestamp from parquet_alter_column_to_char where col_timestamp="A" order by col_timestamp limit 3""" + order_qt_char_decimal """ select col_decimal from parquet_alter_column_to_char where col_decimal="C" order by col_decimal limit 3""" + order_qt_varchar_int """ select col_int from parquet_alter_column_to_varchar where col_int="B" order by col_int limit 3""" + order_qt_varchar_smallint """ select col_smallint from parquet_alter_column_to_varchar where col_smallint="helloworld" order by col_smallint limit 3""" + order_qt_varchar_tinyint """ select col_tinyint from parquet_alter_column_to_varchar where col_tinyint="A" order by col_tinyint limit 3""" + order_qt_varchar_bigint """ select col_bigint from parquet_alter_column_to_varchar where col_bigint="helloworld" order by col_bigint limit 3""" + order_qt_varchar_float """ select col_float from parquet_alter_column_to_varchar where col_float="1" order by col_float limit 3""" + order_qt_varchar_double """ select col_double from parquet_alter_column_to_varchar where col_double="B" order by col_double limit 3""" + order_qt_varchar_boolean """ select col_boolean from parquet_alter_column_to_varchar where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_varchar_string """ select col_string from parquet_alter_column_to_varchar where col_string="A" order by col_string limit 3""" + order_qt_varchar_char """ select col_char from parquet_alter_column_to_varchar where col_char="B" order by col_char limit 3""" + order_qt_varchar_varchar """ select col_varchar from parquet_alter_column_to_varchar where col_varchar="B" order by col_varchar limit 3""" + order_qt_varchar_date """ select col_date from parquet_alter_column_to_varchar where col_date="C" order by col_date limit 3""" + order_qt_varchar_timestamp """ select col_timestamp from parquet_alter_column_to_varchar where col_timestamp="C" order by col_timestamp limit 3""" + order_qt_varchar_decimal """ select col_decimal from parquet_alter_column_to_varchar where col_decimal="helloworld" order by col_decimal limit 3""" + order_qt_date_int """ select col_int from parquet_alter_column_to_date where col_int>=3 order by col_int limit 3""" + order_qt_date_smallint """ select col_smallint from parquet_alter_column_to_date where col_smallint>=1 order by col_smallint limit 3""" + order_qt_date_tinyint """ select col_tinyint from parquet_alter_column_to_date where col_tinyint>=3 order by col_tinyint limit 3""" + order_qt_date_bigint """ select col_bigint from parquet_alter_column_to_date where col_bigint>=1 order by col_bigint limit 3""" + order_qt_date_float """ select col_float from parquet_alter_column_to_date where col_float=2.8 order by col_float limit 3""" + order_qt_date_double """ select col_double from parquet_alter_column_to_date where col_double=2.5 order by col_double limit 3""" + order_qt_date_boolean """ select col_boolean from parquet_alter_column_to_date where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_date_string """ select col_string from parquet_alter_column_to_date where col_string="helloworld" order by col_string limit 3""" + order_qt_date_char """ select col_char from parquet_alter_column_to_date where col_char="A" order by col_char limit 3""" + order_qt_date_varchar """ select col_varchar from parquet_alter_column_to_date where col_varchar="1" order by col_varchar limit 3""" + order_qt_date_date """ select col_date from parquet_alter_column_to_date where year(col_date)=2023 order by col_date limit 3""" + order_qt_date_timestamp """ select col_timestamp from parquet_alter_column_to_date where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_date_decimal """ select col_decimal from parquet_alter_column_to_date where col_decimal=0.3 order by col_decimal limit 3""" + order_qt_timestamp_int """ select col_int from parquet_alter_column_to_timestamp where col_int>=3 order by col_int limit 3""" + order_qt_timestamp_smallint """ select col_smallint from parquet_alter_column_to_timestamp where col_smallint>=3 order by col_smallint limit 3""" + order_qt_timestamp_tinyint """ select col_tinyint from parquet_alter_column_to_timestamp where col_tinyint>=1 order by col_tinyint limit 3""" + order_qt_timestamp_bigint """ select col_bigint from parquet_alter_column_to_timestamp where col_bigint>=3 order by col_bigint limit 3""" + order_qt_timestamp_float """ select col_float from parquet_alter_column_to_timestamp where col_float=2.4 order by col_float limit 3""" + order_qt_timestamp_double """ select col_double from parquet_alter_column_to_timestamp where col_double=1.3 order by col_double limit 3""" + order_qt_timestamp_boolean """ select col_boolean from parquet_alter_column_to_timestamp where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_timestamp_string """ select col_string from parquet_alter_column_to_timestamp where col_string="C" order by col_string limit 3""" + order_qt_timestamp_char """ select col_char from parquet_alter_column_to_timestamp where col_char="B" order by col_char limit 3""" + order_qt_timestamp_varchar """ select col_varchar from parquet_alter_column_to_timestamp where col_varchar="C" order by col_varchar limit 3""" + order_qt_timestamp_date """ select col_date from parquet_alter_column_to_timestamp where year(col_date)=2023 order by col_date limit 3""" + order_qt_timestamp_timestamp """ select col_timestamp from parquet_alter_column_to_timestamp where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_timestamp_decimal """ select col_decimal from parquet_alter_column_to_timestamp where col_decimal=1.3 order by col_decimal limit 3""" + order_qt_decimal_int """ select col_int from parquet_alter_column_to_decimal where col_int=2.8 order by col_int limit 3""" + order_qt_decimal_smallint """ select col_smallint from parquet_alter_column_to_decimal where col_smallint=0.1 order by col_smallint limit 3""" + order_qt_decimal_tinyint """ select col_tinyint from parquet_alter_column_to_decimal where col_tinyint=2.9 order by col_tinyint limit 3""" + order_qt_decimal_bigint """ select col_bigint from parquet_alter_column_to_decimal where col_bigint=2.3 order by col_bigint limit 3""" + order_qt_decimal_float """ select col_float from parquet_alter_column_to_decimal where col_float=2.5 order by col_float limit 3""" + order_qt_decimal_double """ select col_double from parquet_alter_column_to_decimal where col_double=1.7 order by col_double limit 3""" + order_qt_decimal_boolean """ select col_boolean from parquet_alter_column_to_decimal where year(col_boolean)=2023 order by col_boolean limit 3""" + order_qt_decimal_string """ select col_string from parquet_alter_column_to_decimal where col_string="helloworld" order by col_string limit 3""" + order_qt_decimal_char """ select col_char from parquet_alter_column_to_decimal where col_char="helloworld" order by col_char limit 3""" + order_qt_decimal_varchar """ select col_varchar from parquet_alter_column_to_decimal where col_varchar="helloworld" order by col_varchar limit 3""" + order_qt_decimal_date """ select col_date from parquet_alter_column_to_decimal where year(col_date)=2023 order by col_date limit 3""" + order_qt_decimal_timestamp """ select col_timestamp from parquet_alter_column_to_decimal where year(col_timestamp)=2023 order by col_timestamp limit 3""" + order_qt_decimal_decimal """ select col_decimal from parquet_alter_column_to_decimal where col_decimal=1.5 order by col_decimal limit 3""" + + } +}