[feature](hive)Support hive tables after alter type. (#25138)

1. Restructure the parquet decode logic: the parquet reader first reads the data according to the parquet physical type, and then performs a type conversion.

2. Support Hive tables after ALTER TABLE changes a column's type.
This commit is contained in:
daidai
2023-11-02 00:24:21 +08:00
committed by GitHub
parent 3e10e5af39
commit a4e415ab09
30 changed files with 3392 additions and 2167 deletions

View File

@ -410,4 +410,4 @@ protected:
// Covariant override of Base::shallow_mutate(): performs the base shallow
// mutation and rewraps the result as the Derived column's MutablePtr.
// NOTE(review): the static_cast assumes the object really is a Derived —
// presumably guaranteed by the CRTP pattern; confirm against the base class.
MutablePtr shallow_mutate() const {
return MutablePtr(static_cast<Derived*>(Base::shallow_mutate().get()));
}
};
};

View File

@ -125,56 +125,35 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data
return _decode_dict_values<has_filter>(doris_column, select_vector, is_dict_filter);
}
TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
switch (logical_type) {
case TypeIndex::String:
[[fallthrough]];
case TypeIndex::FixedString: {
size_t dict_index = 0;
size_t dict_index = 0;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
std::vector<StringRef> string_values;
string_values.reserve(run_length);
for (size_t i = 0; i < run_length; ++i) {
string_values.emplace_back(_dict_items[_indexes[dict_index++]]);
}
doris_column->insert_many_strings_overflow(&string_values[0], run_length,
_max_value_length);
break;
}
case ColumnSelectVector::NULL_DATA: {
doris_column->insert_many_defaults(run_length);
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
std::vector<StringRef> string_values;
string_values.reserve(run_length);
for (size_t i = 0; i < run_length; ++i) {
string_values.emplace_back(_dict_items[_indexes[dict_index++]]);
}
doris_column->insert_many_strings_overflow(&string_values[0], run_length,
_max_value_length);
break;
}
case ColumnSelectVector::NULL_DATA: {
doris_column->insert_many_defaults(run_length);
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
return Status::OK();
}
case TypeIndex::Decimal32:
return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type, select_vector);
case TypeIndex::Decimal64:
return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type, select_vector);
case TypeIndex::Decimal128:
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
case TypeIndex::Decimal128I:
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
// TODO: decimal256
default:
break;
}
return Status::InvalidArgument(
"Can't decode parquet physical type BYTE_ARRAY to doris logical type {}",
getTypeName(logical_type));
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -66,97 +66,10 @@ public:
MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override;
protected:
template <typename DecimalPrimitiveType, bool has_filter>
Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector);
// For dictionary encoding
std::vector<StringRef> _dict_items;
std::vector<uint8_t> _dict_data;
size_t _max_value_length;
std::unordered_map<StringRef, int32_t> _dict_value_to_code;
private:
template <typename DecimalPrimitiveType, bool has_filter,
DecimalScaleParams::ScaleType ScaleType>
Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector);
};
/// Dispatch layer for decoding dictionary-encoded BYTE_ARRAY decimals.
/// Resolves the runtime scale adjustment (initialized by
/// init_decimal_converter from the parquet schema and the destination
/// decimal type) into a compile-time template argument, then delegates the
/// actual decoding to _decode_binary_decimal_internal.
template <typename DecimalPrimitiveType, bool has_filter>
Status ByteArrayDictDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
                                                    DataTypePtr& data_type,
                                                    ColumnSelectVector& select_vector) {
    init_decimal_converter<DecimalPrimitiveType>(data_type);
    switch (_decode_params->decimal_scale.scale_type) {
    case DecimalScaleParams::SCALE_UP:
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_UP>(
                doris_column, data_type, select_vector);
    case DecimalScaleParams::SCALE_DOWN:
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_DOWN>(
                doris_column, data_type, select_vector);
    default:
        // NO_SCALE (and any other state) leaves the unscaled value untouched,
        // matching the original trailing else-branch.
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::NO_SCALE>(
                doris_column, data_type, select_vector);
    }
}
// Decodes dictionary-encoded BYTE_ARRAY values into a decimal column.
// Each dictionary entry holds the unscaled decimal as a big-endian
// two's-complement byte string; ScaleType selects (at compile time) the
// scale adjustment applied after decoding.
template <typename DecimalPrimitiveType, bool has_filter, DecimalScaleParams::ScaleType ScaleType>
Status ByteArrayDictDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column,
DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
auto& column_data =
static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
// Resize once up front; filtered rows are never materialized, so the final
// size is values minus filtered.
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
ColumnSelectVector::DataReadType read_type;
// Process the select vector as runs of rows that share one read type.
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
StringRef& slice = _dict_items[_indexes[dict_index++]];
char* buf_start = const_cast<char*>(slice.data);
uint32_t length = (uint32_t)slice.size;
// When Decimal in parquet is stored in byte arrays, binary and fixed,
// the unscaled number must be encoded as two's complement using big-endian byte order.
DecimalPrimitiveType value = 0;
memcpy(reinterpret_cast<char*>(&value), buf_start, length);
value = BitUtil::big_endian_to_host(value);
// Right-shift past the unused low bytes; for signed types this also
// sign-extends values shorter than DecimalPrimitiveType.
value = value >> ((sizeof(value) - length) * 8);
if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
value *= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
value /= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
// do nothing
} else {
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
v = (DecimalPrimitiveType)value;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
// Null rows occupy slots in the column; leave their default bytes.
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
// Filtered rows: skip their dictionary indexes without decoding.
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -56,74 +56,53 @@ template <bool has_filter>
Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector,
bool is_dict_filter) {
TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
switch (logical_type) {
case TypeIndex::String:
[[fallthrough]];
case TypeIndex::FixedString: {
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
std::vector<StringRef> string_values;
string_values.reserve(run_length);
for (size_t i = 0; i < run_length; ++i) {
if (UNLIKELY(_offset + 4 > _data->size)) {
return Status::IOError("Can't read byte array length from plain decoder");
}
uint32_t length = decode_fixed32_le(
reinterpret_cast<const uint8_t*>(_data->data) + _offset);
_offset += 4;
if (UNLIKELY(_offset + length) > _data->size) {
return Status::IOError("Can't read enough bytes in plain decoder");
}
string_values.emplace_back(_data->data + _offset, length);
_offset += length;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
std::vector<StringRef> string_values;
string_values.reserve(run_length);
for (size_t i = 0; i < run_length; ++i) {
if (UNLIKELY(_offset + 4 > _data->size)) {
return Status::IOError("Can't read byte array length from plain decoder");
}
doris_column->insert_many_strings(&string_values[0], run_length);
break;
}
case ColumnSelectVector::NULL_DATA: {
doris_column->insert_many_defaults(run_length);
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
for (int i = 0; i < run_length; ++i) {
if (UNLIKELY(_offset + 4 > _data->size)) {
return Status::IOError("Can't read byte array length from plain decoder");
}
uint32_t length = decode_fixed32_le(
reinterpret_cast<const uint8_t*>(_data->data) + _offset);
_offset += 4;
if (UNLIKELY(_offset + length) > _data->size) {
return Status::IOError("Can't read enough bytes in plain decoder");
}
_offset += length;
uint32_t length =
decode_fixed32_le(reinterpret_cast<const uint8_t*>(_data->data) + _offset);
_offset += 4;
if (UNLIKELY(_offset + length) > _data->size) {
return Status::IOError("Can't read enough bytes in plain decoder");
}
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
string_values.emplace_back(_data->data + _offset, length);
_offset += length;
}
doris_column->insert_many_strings(&string_values[0], run_length);
break;
}
case ColumnSelectVector::NULL_DATA: {
doris_column->insert_many_defaults(run_length);
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
for (int i = 0; i < run_length; ++i) {
if (UNLIKELY(_offset + 4 > _data->size)) {
return Status::IOError("Can't read byte array length from plain decoder");
}
uint32_t length =
decode_fixed32_le(reinterpret_cast<const uint8_t*>(_data->data) + _offset);
_offset += 4;
if (UNLIKELY(_offset + length) > _data->size) {
return Status::IOError("Can't read enough bytes in plain decoder");
}
_offset += length;
}
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
return Status::OK();
}
case TypeIndex::Decimal32:
return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type, select_vector);
case TypeIndex::Decimal64:
return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type, select_vector);
case TypeIndex::Decimal128:
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
case TypeIndex::Decimal128I:
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
// TODO: decimal256
default:
break;
}
return Status::InvalidArgument(
"Can't decode parquet physical type BYTE_ARRAY to doris logical type {}",
getTypeName(logical_type));
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -56,97 +56,5 @@ public:
ColumnSelectVector& select_vector, bool is_dict_filter);
Status skip_values(size_t num_values) override;
protected:
template <typename DecimalPrimitiveType, bool has_filter>
Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector);
private:
template <typename DecimalPrimitiveType, bool has_filter,
DecimalScaleParams::ScaleType ScaleType>
Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector);
};
/// Decodes plain-encoded BYTE_ARRAY decimals by turning the runtime scale
/// adjustment into a compile-time template parameter and forwarding to the
/// templated inner loop. The scale parameters are filled in lazily by
/// init_decimal_converter on the first call.
template <typename DecimalPrimitiveType, bool has_filter>
Status ByteArrayPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
                                                     DataTypePtr& data_type,
                                                     ColumnSelectVector& select_vector) {
    init_decimal_converter<DecimalPrimitiveType>(data_type);
    const auto scale_type = _decode_params->decimal_scale.scale_type;
    if (scale_type == DecimalScaleParams::SCALE_UP) {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_UP>(
                doris_column, data_type, select_vector);
    }
    if (scale_type == DecimalScaleParams::SCALE_DOWN) {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_DOWN>(
                doris_column, data_type, select_vector);
    }
    // Any remaining state decodes without scaling, as in the original.
    return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                           DecimalScaleParams::NO_SCALE>(
            doris_column, data_type, select_vector);
}
// Decodes plain-encoded (length-prefixed) BYTE_ARRAY values into a decimal
// column. Each value is a 4-byte little-endian length followed by the
// unscaled decimal as big-endian two's complement. ScaleType selects the
// compile-time scale adjustment.
template <typename DecimalPrimitiveType, bool has_filter, DecimalScaleParams::ScaleType ScaleType>
Status ByteArrayPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column,
DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
auto& column_data =
static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
// Reserve slots for all non-filtered rows up front.
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
if (UNLIKELY(_offset + 4 > _data->size)) {
return Status::IOError("Can't read byte array length from plain decoder");
}
uint32_t length =
decode_fixed32_le(reinterpret_cast<const uint8_t*>(_data->data) + _offset);
_offset += 4;
// NOTE(review): no check that _offset + length <= _data->size before
// the memcpy below — a corrupt length could read past the buffer.
char* buf_start = _data->data + _offset;
_offset += length;
// When Decimal in parquet is stored in byte arrays, binary and fixed,
// the unscaled number must be encoded as two's complement using big-endian byte order.
DecimalPrimitiveType value = 0;
memcpy(reinterpret_cast<char*>(&value), buf_start, length);
value = BitUtil::big_endian_to_host(value);
// Shift past the unused low bytes; sign-extends for signed types.
value = value >> ((sizeof(value) - length) * 8);
if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
value *= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
value /= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
// do nothing
} else {
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
v = (DecimalPrimitiveType)value;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
// Nulls keep their default-initialized slots.
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
// NOTE(review): skips run_length fixed-size values via _type_length,
// but BYTE_ARRAY values here are length-prefixed and variable-sized —
// looks inconsistent with the CONTENT branch; confirm intended use.
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -31,8 +31,6 @@
namespace doris::vectorized {
const cctz::time_zone DecodeParams::utc0 = cctz::utc_time_zone();
Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type encoding,
std::unique_ptr<Decoder>& decoder) {
switch (encoding) {
@ -45,17 +43,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
decoder.reset(new ByteArrayPlainDecoder());
break;
case tparquet::Type::INT32:
[[fallthrough]];
decoder.reset(new FixLengthPlainDecoder<tparquet::Type::INT32>());
break;
case tparquet::Type::INT64:
[[fallthrough]];
decoder.reset(new FixLengthPlainDecoder<tparquet::Type::INT64>());
break;
case tparquet::Type::INT96:
[[fallthrough]];
decoder.reset(new FixLengthPlainDecoder<tparquet::Type::INT96>());
break;
case tparquet::Type::FLOAT:
[[fallthrough]];
decoder.reset(new FixLengthPlainDecoder<tparquet::Type::FLOAT>());
break;
case tparquet::Type::DOUBLE:
[[fallthrough]];
decoder.reset(new FixLengthPlainDecoder<tparquet::Type::DOUBLE>());
break;
case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
decoder.reset(new FixLengthPlainDecoder(type));
decoder.reset(new FixLengthPlainDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY>());
break;
default:
return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder",
@ -70,22 +73,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
decoder.reset(new ByteArrayDictDecoder());
break;
case tparquet::Type::INT32:
decoder.reset(new FixLengthDictDecoder<Int32>(type));
decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT32>());
break;
case tparquet::Type::INT64:
decoder.reset(new FixLengthDictDecoder<Int64>(type));
decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT64>());
break;
case tparquet::Type::INT96:
decoder.reset(new FixLengthDictDecoder<ParquetInt96>(type));
decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT96>());
break;
case tparquet::Type::FLOAT:
decoder.reset(new FixLengthDictDecoder<Float32>(type));
decoder.reset(new FixLengthDictDecoder<tparquet::Type::FLOAT>());
break;
case tparquet::Type::DOUBLE:
decoder.reset(new FixLengthDictDecoder<Float64>(type));
decoder.reset(new FixLengthDictDecoder<tparquet::Type::DOUBLE>());
break;
case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
decoder.reset(new FixLengthDictDecoder<char*>(type));
decoder.reset(new FixLengthDictDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY>());
break;
default:
return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder",
@ -106,10 +109,10 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
// Supports only INT32 and INT64.
switch (type) {
case tparquet::Type::INT32:
decoder.reset(new DeltaBitPackDecoder<Int32>(type));
decoder.reset(new DeltaBitPackDecoder<int32, tparquet::Type::INT32>());
break;
case tparquet::Type::INT64:
decoder.reset(new DeltaBitPackDecoder<Int64>(type));
decoder.reset(new DeltaBitPackDecoder<int64, tparquet::Type::INT64>());
break;
default:
return Status::InternalError("DELTA_BINARY_PACKED only supports INT32 and INT64");
@ -118,7 +121,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
case tparquet::Encoding::DELTA_BYTE_ARRAY:
switch (type) {
case tparquet::Type::BYTE_ARRAY:
decoder.reset(new DeltaByteArrayDecoder(type));
decoder.reset(new DeltaByteArrayDecoder<tparquet::Type::BYTE_ARRAY>());
break;
default:
return Status::InternalError("DELTA_BYTE_ARRAY only supports BYTE_ARRAY.");
@ -127,7 +130,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
case tparquet::Encoding::DELTA_LENGTH_BYTE_ARRAY:
switch (type) {
case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
decoder.reset(new DeltaLengthByteArrayDecoder(type));
decoder.reset(new DeltaLengthByteArrayDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY>());
break;
default:
return Status::InternalError(
@ -141,47 +144,4 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
return Status::OK();
}
// Binds the decoder to a parquet field schema and resolves time-zone and
// timestamp-unit conversion parameters into _decode_params.
//
// ctz: session time zone; may be null, in which case any previously set
// zone in _decode_params is kept.
void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) {
_field_schema = field_schema;
if (_decode_params == nullptr) {
_decode_params.reset(new DecodeParams());
}
if (ctz != nullptr) {
_decode_params->ctz = ctz;
}
const auto& schema = field_schema->parquet_schema;
if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) {
const auto& timestamp_info = schema.logicalType.TIMESTAMP;
if (!timestamp_info.isAdjustedToUTC) {
// should set timezone to utc+0
_decode_params->ctz = const_cast<cctz::time_zone*>(&_decode_params->utc0);
}
// Translate the declared time unit into a divisor for whole seconds
// (second_mask) and a multiplier up to nanoseconds.
const auto& time_unit = timestamp_info.unit;
if (time_unit.__isset.MILLIS) {
_decode_params->second_mask = 1000;
_decode_params->scale_to_nano_factor = 1000000;
} else if (time_unit.__isset.MICROS) {
_decode_params->second_mask = 1000000;
_decode_params->scale_to_nano_factor = 1000;
} else if (time_unit.__isset.NANOS) {
_decode_params->second_mask = 1000000000;
_decode_params->scale_to_nano_factor = 1;
}
} else if (schema.__isset.converted_type) {
// Legacy converted_type path for files without the logicalType field.
const auto& converted_type = schema.converted_type;
if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) {
_decode_params->second_mask = 1000;
_decode_params->scale_to_nano_factor = 1000000;
} else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) {
_decode_params->second_mask = 1000000;
_decode_params->scale_to_nano_factor = 1000;
}
}
// Probe the zone with epoch 0 to learn whether local midnight falls on the
// previous day (negative UTC offset).
if (_decode_params->ctz) {
VecDateTimeValue t;
t.from_unixtime(0, *_decode_params->ctz);
_decode_params->offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1.
}
}
} // namespace doris::vectorized

View File

@ -54,29 +54,6 @@ class ColumnString;
namespace doris::vectorized {
#define FOR_LOGICAL_NUMERIC_TYPES(M) \
M(TypeIndex::Int8, Int8, Int32) \
M(TypeIndex::UInt8, UInt8, Int32) \
M(TypeIndex::Int16, Int16, Int32) \
M(TypeIndex::UInt16, UInt16, Int32) \
M(TypeIndex::Int32, Int32, Int32) \
M(TypeIndex::UInt32, UInt32, Int32) \
M(TypeIndex::Int64, Int64, Int64) \
M(TypeIndex::UInt64, UInt64, Int64) \
M(TypeIndex::Float32, Float32, Float32) \
M(TypeIndex::Float64, Float64, Float64)
struct DecodeParams {
// schema.logicalType.TIMESTAMP.isAdjustedToUTC == false
static const cctz::time_zone utc0;
// schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone
cctz::time_zone* ctz = nullptr;
int32_t offset_days = 0;
int64_t second_mask = 1;
int64_t scale_to_nano_factor = 1;
DecimalScaleParams decimal_scale;
};
class Decoder {
public:
Decoder() = default;
@ -94,11 +71,6 @@ public:
_offset = 0;
}
void init(FieldSchema* field_schema, cctz::time_zone* ctz);
template <typename DecimalPrimitiveType>
void init_decimal_converter(DataTypePtr& data_type);
// Write the decoded values batch to doris's column
virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector, bool is_dict_filter) = 0;
@ -126,34 +98,8 @@ protected:
int32_t _type_length;
Slice* _data = nullptr;
uint32_t _offset = 0;
FieldSchema* _field_schema = nullptr;
std::unique_ptr<DecodeParams> _decode_params = nullptr;
};
// Lazily computes the scale adjustment between the decimal scale declared in
// the parquet schema and the scale of the destination doris decimal type.
// Idempotent: does nothing once decimal_scale has been initialized, or when
// the decoder has not been init()-ed yet.
template <typename DecimalPrimitiveType>
void Decoder::init_decimal_converter(DataTypePtr& data_type) {
if (_decode_params == nullptr || _field_schema == nullptr ||
_decode_params->decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) {
return;
}
auto scale = _field_schema->parquet_schema.scale;
auto* decimal_type = reinterpret_cast<DataTypeDecimal<Decimal<DecimalPrimitiveType>>*>(
const_cast<IDataType*>(remove_nullable(data_type).get()));
auto dest_scale = decimal_type->get_scale();
// scale_factor is 10^|dest_scale - scale|; direction decides multiply vs divide.
if (dest_scale > scale) {
_decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_UP;
_decode_params->decimal_scale.scale_factor =
DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dest_scale - scale);
} else if (dest_scale < scale) {
_decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN;
_decode_params->decimal_scale.scale_factor =
DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dest_scale);
} else {
_decode_params->decimal_scale.scale_type = DecimalScaleParams::NO_SCALE;
_decode_params->decimal_scale.scale_factor = 1;
}
}
class BaseDictDecoder : public Decoder {
public:
BaseDictDecoder() = default;

View File

@ -1,283 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "delta_bit_pack_decoder.h"
#include <string.h>
#include <algorithm>
#include <string_view>
#include "vec/columns/column.h"
#include "vec/common/arithmetic_overflow.h"
#include "vec/common/string_ref.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_nullable.h"
namespace doris::vectorized {
// Parses the DELTA_BINARY_PACKED page header:
//   <values per block> <miniblocks per block> <total value count> <first value>
// and validates the block geometry required by the parquet spec
// (block size a multiple of 128, miniblock size a multiple of 32).
template <typename T>
Status DeltaBitPackDecoder<T>::_init_header() {
if (!_bit_reader->GetVlqInt(&_values_per_block) ||
!_bit_reader->GetVlqInt(&_mini_blocks_per_block) ||
!_bit_reader->GetVlqInt(&_total_value_count) ||
!_bit_reader->GetZigZagVlqInt(&_last_value)) {
return Status::IOError("Init header eof");
}
if (_values_per_block == 0) {
return Status::InvalidArgument("Cannot have zero value per block");
}
if (_values_per_block % 128 != 0) {
return Status::InvalidArgument(
"the number of values in a block must be multiple of 128, but it's " +
std::to_string(_values_per_block));
}
if (_mini_blocks_per_block == 0) {
return Status::InvalidArgument("Cannot have zero miniblock per block");
}
_values_per_mini_block = _values_per_block / _mini_blocks_per_block;
if (_values_per_mini_block == 0) {
return Status::InvalidArgument("Cannot have zero value per miniblock");
}
if (_values_per_mini_block % 32 != 0) {
return Status::InvalidArgument(
"The number of values in a miniblock must be multiple of 32, but it's " +
std::to_string(_values_per_mini_block));
}
_total_values_remaining = _total_value_count;
_delta_bit_widths.resize(_mini_blocks_per_block);
// init as empty property
_block_initialized = false;
_values_remaining_current_mini_block = 0;
return Status::OK();
}
// Reads the next block header (min delta + one bit-width byte per miniblock)
// and opens the block's first miniblock for decoding.
template <typename T>
Status DeltaBitPackDecoder<T>::_init_block() {
DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF";
if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) {
return Status::IOError("Init block eof");
}
// read the bitwidth of each miniblock
uint8_t* bit_width_data = _delta_bit_widths.data();
for (uint32_t i = 0; i < _mini_blocks_per_block; ++i) {
if (!_bit_reader->GetAligned<uint8_t>(1, bit_width_data + i)) {
return Status::IOError("Decode bit-width EOF");
}
// Note that non-conformant bitwidth entries are allowed by the Parquet spec
// for extraneous miniblocks in the last block (GH-14923), so we check
// the bitwidths when actually using them (see InitMiniBlock()).
}
_mini_block_idx = 0;
_block_initialized = true;
RETURN_IF_ERROR(_init_mini_block(bit_width_data[0]));
return Status::OK();
}
// Prepares decoding state for one miniblock whose values are bit-packed with
// `bit_width` bits each. Widths wider than the widest supported integer are
// rejected (a checked-on-use rule; see _init_block for why invalid widths can
// legally appear in trailing miniblocks).
template <typename T>
Status DeltaBitPackDecoder<T>::_init_mini_block(int bit_width) {
    const bool width_too_large = PREDICT_FALSE(bit_width > kMaxDeltaBitWidth);
    if (width_too_large) {
        return Status::InvalidArgument("delta bit width larger than integer bit width");
    }
    _delta_bit_width = bit_width;
    _values_remaining_current_mini_block = _values_per_mini_block;
    return Status::OK();
}
// Decodes up to num_values delta-packed integers into `buffer`, crossing
// miniblock/block boundaries as needed. The first value of a page comes from
// the header (_last_value) rather than from a packed miniblock. Writes the
// number actually produced to *out_num_values.
template <typename T>
Status DeltaBitPackDecoder<T>::_get_internal(T* buffer, int num_values, int* out_num_values) {
num_values = static_cast<int>(std::min<int64_t>(num_values, _total_values_remaining));
if (num_values == 0) {
*out_num_values = 0;
return Status::OK();
}
int i = 0;
while (i < num_values) {
if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) {
if (PREDICT_FALSE(!_block_initialized)) {
// First value of the page was carried in the header.
buffer[i++] = _last_value;
DCHECK_EQ(i, 1); // we're at the beginning of the page
if (i == num_values) {
// When block is uninitialized and i reaches num_values we have two
// different possibilities:
// 1. _total_value_count == 1, which means that the page may have only
// one value (encoded in the header), and we should not initialize
// any block.
// 2. _total_value_count != 1, which means we should initialize the
// incoming block for subsequent reads.
if (_total_value_count != 1) {
RETURN_IF_ERROR(_init_block());
}
break;
}
RETURN_IF_ERROR(_init_block());
} else {
// Advance to the next miniblock, or the next block when exhausted.
++_mini_block_idx;
if (_mini_block_idx < _mini_blocks_per_block) {
RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx]));
} else {
RETURN_IF_ERROR(_init_block());
}
}
}
int values_decode = std::min(_values_remaining_current_mini_block,
static_cast<uint32_t>(num_values - i));
// Pass 1: unpack the raw bit-packed deltas.
for (int j = 0; j < values_decode; ++j) {
if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) {
return Status::IOError("Get batch EOF");
}
}
// Pass 2: reconstruct values as a running sum of min_delta + delta.
for (int j = 0; j < values_decode; ++j) {
// Addition between min_delta, packed int and last_value should be treated as
// unsigned addition. Overflow is as expected.
buffer[i + j] = static_cast<UT>(_min_delta) + static_cast<UT>(buffer[i + j]) +
static_cast<UT>(_last_value);
_last_value = buffer[i + j];
}
_values_remaining_current_mini_block -= values_decode;
i += values_decode;
}
_total_values_remaining -= num_values;
if (PREDICT_FALSE(_total_values_remaining == 0)) {
// Skip the padding bits of the final (possibly partial) miniblock.
if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) {
return Status::IOError("Skip padding EOF");
}
_values_remaining_current_mini_block = 0;
}
*out_num_values = num_values;
return Status::OK();
}
// Decodes the DELTA_BINARY_PACKED length prefix of a DELTA_LENGTH_BYTE_ARRAY
// page and buffers all string lengths in _buffered_length for later use.
void DeltaLengthByteArrayDecoder::_decode_lengths() {
_len_decoder.set_bit_reader(_bit_reader);
// get the number of encoded lengths
int num_length = _len_decoder.valid_values_count();
_buffered_length.resize(num_length);
// decode all the lengths. all the lengths are buffered in buffered_length_.
int ret;
Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret);
if (!st.ok()) {
// NOTE(review): LOG(FATAL) aborts the process on a malformed page — a
// returned Status would be the gentler failure mode; confirm intent.
LOG(FATAL) << "Fail to decode delta length, status: " << st;
}
DCHECK_EQ(ret, num_length);
_length_idx = 0;
_num_valid_values = num_length;
}
// Materializes up to max_values strings: sums the pre-decoded lengths, reads
// that many raw bytes into _buffered_data, then points each output Slice at
// its span of the buffer. Slices stay valid only until the next call that
// resizes _buffered_data.
Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values,
int* out_num_values) {
// Decode up to `max_values` strings into an internal buffer
// and reference them into `buffer`.
max_values = std::min(max_values, _num_valid_values);
if (max_values == 0) {
*out_num_values = 0;
return Status::OK();
}
int32_t data_size = 0;
const int32_t* length_ptr = _buffered_length.data() + _length_idx;
for (int i = 0; i < max_values; ++i) {
int32_t len = length_ptr[i];
if (PREDICT_FALSE(len < 0)) {
return Status::InvalidArgument("Negative string delta length");
}
buffer[i].size = len;
// Overflow-checked accumulation guards against hostile length data.
if (common::add_overflow(data_size, len, data_size)) {
return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
}
}
_length_idx += max_values;
_buffered_data.resize(data_size);
char* data_ptr = _buffered_data.data();
// Pull the concatenated string bytes one byte at a time from the bit reader.
for (int j = 0; j < data_size; j++) {
if (!_bit_reader->GetValue(8, data_ptr + j)) {
return Status::IOError("Get length bytes EOF");
}
}
// Carve the contiguous buffer into the individual output slices.
for (int i = 0; i < max_values; ++i) {
buffer[i].data = data_ptr;
data_ptr += buffer[i].size;
}
// this->num_values_ -= max_values;
_num_valid_values -= max_values;
*out_num_values = max_values;
return Status::OK();
}
// Reconstructs up to max_values DELTA_BYTE_ARRAY strings. Each output is
// <prefix of the previous string> + <suffix from the suffix decoder>; the
// last reconstructed string is kept in _last_value so prefixes can span
// calls (and pages, via _last_value_in_previous_page).
Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) {
// Decode up to `max_values` strings into an internal buffer
// and reference them into `buffer`.
max_values = std::min(max_values, _num_valid_values);
if (max_values == 0) {
*out_num_values = max_values;
return Status::OK();
}
int suffix_read;
RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read));
if (PREDICT_FALSE(suffix_read != max_values)) {
return Status::IOError("Read {}, expecting {} from suffix decoder",
std::to_string(suffix_read), std::to_string(max_values));
}
// Size pass: total output = sum of prefix lengths + suffix sizes, with
// overflow checks against hostile inputs.
int64_t data_size = 0;
const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset;
for (int i = 0; i < max_values; ++i) {
if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
}
if (PREDICT_FALSE(common::add_overflow(data_size, static_cast<int64_t>(prefix_len_ptr[i]),
data_size) ||
common::add_overflow(data_size, static_cast<int64_t>(buffer[i].size),
data_size))) {
return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
}
}
_buffered_data.resize(data_size);
std::string_view prefix {_last_value};
char* data_ptr = _buffered_data.data();
// Assembly pass: each output string is its predecessor's prefix plus the
// freshly decoded suffix.
for (int i = 0; i < max_values; ++i) {
if (PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) {
return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
}
memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
// buffer[i] currently points to the string suffix
memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size);
buffer[i].data = data_ptr;
buffer[i].size += prefix_len_ptr[i];
data_ptr += buffer[i].size;
prefix = std::string_view {buffer[i].data, buffer[i].size};
}
_prefix_len_offset += max_values;
_num_valid_values -= max_values;
// Persist the last string: the next call's first prefix refers to it.
_last_value = std::string {prefix};
if (_num_valid_values == 0) {
_last_value_in_previous_page = _last_value;
}
*out_num_values = max_values;
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -51,14 +51,10 @@ public:
return _type_converted_decoder->skip_values(num_values);
}
template <bool has_filter>
template <tparquet::Type::type PhysicalType, bool has_filter>
Status decode_byte_array(const std::vector<Slice>& decoded_vals, MutableColumnPtr& doris_column,
DataTypePtr& data_type, ColumnSelectVector& select_vector) {
TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
switch (logical_type) {
case TypeIndex::String:
[[fallthrough]];
case TypeIndex::FixedString: {
if constexpr (PhysicalType == tparquet::Type::BYTE_ARRAY) {
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
@ -88,21 +84,14 @@ public:
}
}
_current_value_idx = 0;
return Status::OK();
}
default:
break;
}
return Status::InvalidArgument(
"Can't decode parquet physical type BYTE_ARRAY to doris logical type {}",
getTypeName(logical_type));
return Status::OK();
}
protected:
void init_values_converter() {
_type_converted_decoder->set_data(_data);
_type_converted_decoder->set_type_length(_type_length);
_type_converted_decoder->init(_field_schema, _decode_params->ctz);
}
// Convert decoded value to doris type value.
std::unique_ptr<Decoder> _type_converted_decoder;
@ -117,13 +106,12 @@ protected:
* Block
* [min delta] [list of bitwidths of the mini blocks] [miniblocks]
*/
template <typename T>
template <typename T, tparquet::Type::type PhysicalType>
class DeltaBitPackDecoder final : public DeltaDecoder {
public:
using UT = std::make_unsigned_t<T>;
DeltaBitPackDecoder(const tparquet::Type::type& physical_type)
: DeltaDecoder(new FixLengthPlainDecoder(physical_type)) {}
DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder<PhysicalType>()) {}
~DeltaBitPackDecoder() override = default;
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector, bool is_dict_filter) override {
@ -200,16 +188,13 @@ private:
// _values_remaining_current_mini_block may greater than _total_values_remaining.
uint32_t _values_remaining_current_mini_block;
};
template class DeltaBitPackDecoder<int32_t>;
template class DeltaBitPackDecoder<int64_t>;
//template class DeltaBitPackDecoder<int32_t>;
//template class DeltaBitPackDecoder<int64_t>;
template <tparquet::Type::type PhysicalType>
class DeltaLengthByteArrayDecoder final : public DeltaDecoder {
public:
explicit DeltaLengthByteArrayDecoder(const tparquet::Type::type& physical_type)
: DeltaDecoder(nullptr),
_len_decoder(physical_type),
_buffered_length(0),
_buffered_data(0) {}
explicit DeltaLengthByteArrayDecoder()
: DeltaDecoder(nullptr), _len_decoder(), _buffered_length(0), _buffered_data(0) {}
Status skip_values(size_t num_values) override {
_current_value_idx += num_values;
@ -240,7 +225,8 @@ public:
return Status::IOError("Expected to decode {} values, but decoded {} values.",
num_values - null_count, num_valid_values);
}
return decode_byte_array<has_filter>(_values, doris_column, data_type, select_vector);
return decode_byte_array<PhysicalType, has_filter>(_values, doris_column, data_type,
select_vector);
}
Status decode(Slice* buffer, int num_values, int* out_num_values) {
@ -270,7 +256,7 @@ private:
std::vector<Slice> _values;
std::shared_ptr<BitReader> _bit_reader;
DeltaBitPackDecoder<int32_t> _len_decoder;
DeltaBitPackDecoder<int32_t, PhysicalType> _len_decoder;
int _num_valid_values;
uint32_t _length_idx;
@ -278,14 +264,11 @@ private:
std::vector<char> _buffered_data;
};
template <tparquet::Type::type PhysicalType>
class DeltaByteArrayDecoder : public DeltaDecoder {
public:
explicit DeltaByteArrayDecoder(const tparquet::Type::type& physical_type)
: DeltaDecoder(nullptr),
_prefix_len_decoder(physical_type),
_suffix_decoder(physical_type),
_buffered_prefix_length(0),
_buffered_data(0) {}
explicit DeltaByteArrayDecoder()
: DeltaDecoder(nullptr), _buffered_prefix_length(0), _buffered_data(0) {}
Status skip_values(size_t num_values) override {
_current_value_idx += num_values;
@ -312,7 +295,8 @@ public:
int num_valid_values;
RETURN_IF_ERROR(_get_internal(_values.data(), num_values - null_count, &num_valid_values));
DCHECK_EQ(num_values - null_count, num_valid_values);
return decode_byte_array<has_filter>(_values, doris_column, data_type, select_vector);
return decode_byte_array<PhysicalType, has_filter>(_values, doris_column, data_type,
select_vector);
}
void set_data(Slice* slice) override {
@ -350,8 +334,8 @@ private:
std::vector<Slice> _values;
std::shared_ptr<BitReader> _bit_reader;
DeltaBitPackDecoder<int32_t> _prefix_len_decoder;
DeltaLengthByteArrayDecoder _suffix_decoder;
DeltaBitPackDecoder<int32_t, PhysicalType> _prefix_len_decoder;
DeltaLengthByteArrayDecoder<PhysicalType> _suffix_decoder;
std::string _last_value;
// string buffer for last value in previous page
std::string _last_value_in_previous_page;
@ -361,3 +345,260 @@ private:
std::vector<char> _buffered_data;
};
} // namespace doris::vectorized
namespace doris::vectorized {
template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_init_header() {
    // Page header layout: <values per block> <miniblocks per block>
    // <total value count> <first value>; the first three are ULEB128 varints,
    // the first value is a zigzag varint.
    const bool header_ok = _bit_reader->GetVlqInt(&_values_per_block) &&
                           _bit_reader->GetVlqInt(&_mini_blocks_per_block) &&
                           _bit_reader->GetVlqInt(&_total_value_count) &&
                           _bit_reader->GetZigZagVlqInt(&_last_value);
    if (!header_ok) {
        return Status::IOError("Init header eof");
    }
    // Validate block geometry as required by the Parquet spec.
    if (_values_per_block == 0) {
        return Status::InvalidArgument("Cannot have zero value per block");
    }
    if (_values_per_block % 128 != 0) {
        return Status::InvalidArgument(
                "the number of values in a block must be multiple of 128, but it's " +
                std::to_string(_values_per_block));
    }
    if (_mini_blocks_per_block == 0) {
        return Status::InvalidArgument("Cannot have zero miniblock per block");
    }
    _values_per_mini_block = _values_per_block / _mini_blocks_per_block;
    if (_values_per_mini_block == 0) {
        return Status::InvalidArgument("Cannot have zero value per miniblock");
    }
    if (_values_per_mini_block % 32 != 0) {
        return Status::InvalidArgument(
                "The number of values in a miniblock must be multiple of 32, but it's " +
                std::to_string(_values_per_mini_block));
    }
    // Everything announced by the header is still pending.
    _total_values_remaining = _total_value_count;
    _delta_bit_widths.resize(_mini_blocks_per_block);
    // Blocks are decoded lazily; nothing is buffered yet.
    _block_initialized = false;
    _values_remaining_current_mini_block = 0;
    return Status::OK();
}
template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_init_block() {
    // A block starts with its zigzag-encoded minimum delta, followed by one
    // bit-width byte per miniblock.
    DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF";
    if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) {
        return Status::IOError("Init block eof");
    }
    uint8_t* widths = _delta_bit_widths.data();
    for (uint32_t mb = 0; mb < _mini_blocks_per_block; ++mb) {
        if (!_bit_reader->GetAligned<uint8_t>(1, widths + mb)) {
            return Status::IOError("Decode bit-width EOF");
        }
        // Note that non-conformant bitwidth entries are allowed by the Parquet
        // spec for extraneous miniblocks in the last block (GH-14923), so the
        // widths are validated only when actually used (see _init_mini_block).
    }
    // Restart miniblock iteration at the first miniblock of this block.
    _mini_block_idx = 0;
    _block_initialized = true;
    RETURN_IF_ERROR(_init_mini_block(widths[0]));
    return Status::OK();
}
// Prepares decoding of one miniblock whose values are packed with `bit_width`
// bits each. Rejects widths wider than the decoder can unpack; extraneous
// padding miniblocks may carry arbitrary width bytes, so the check happens
// here, at use time, rather than when the block header is read.
template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_init_mini_block(int bit_width) {
    if (PREDICT_FALSE(bit_width > kMaxDeltaBitWidth)) {
        return Status::InvalidArgument("delta bit width larger than integer bit width");
    }
    _delta_bit_width = bit_width;
    // A fresh miniblock exposes all of its values.
    _values_remaining_current_mini_block = _values_per_mini_block;
    return Status::OK();
}
// Decodes up to `num_values` DELTA_BINARY_PACKED integers into `buffer`,
// lazily initializing blocks/miniblocks as they are crossed. The very first
// value of a page comes from the header (_last_value); subsequent values are
// reconstructed as last_value + min_delta + packed delta, with deliberate
// unsigned wrap-around. The count actually decoded is returned via
// `out_num_values`.
template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_get_internal(T* buffer, int num_values,
                                                           int* out_num_values) {
    // Never read more than the page header promised.
    num_values = static_cast<int>(std::min<int64_t>(num_values, _total_values_remaining));
    if (num_values == 0) {
        *out_num_values = 0;
        return Status::OK();
    }
    int i = 0;
    while (i < num_values) {
        if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) {
            if (PREDICT_FALSE(!_block_initialized)) {
                // The page's first value is stored in the header itself.
                buffer[i++] = _last_value;
                DCHECK_EQ(i, 1); // we're at the beginning of the page
                if (i == num_values) {
                    // When block is uninitialized and i reaches num_values we have two
                    // different possibilities:
                    // 1. _total_value_count == 1, which means that the page may have only
                    // one value (encoded in the header), and we should not initialize
                    // any block.
                    // 2. _total_value_count != 1, which means we should initialize the
                    // incoming block for subsequent reads.
                    if (_total_value_count != 1) {
                        RETURN_IF_ERROR(_init_block());
                    }
                    break;
                }
                RETURN_IF_ERROR(_init_block());
            } else {
                // Advance to the next miniblock, or to the next block when the
                // current block is exhausted.
                ++_mini_block_idx;
                if (_mini_block_idx < _mini_blocks_per_block) {
                    RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx]));
                } else {
                    RETURN_IF_ERROR(_init_block());
                }
            }
        }
        // Unpack as many raw deltas as fit in both the miniblock and the
        // caller's remaining demand.
        int values_decode = std::min(_values_remaining_current_mini_block,
                                     static_cast<uint32_t>(num_values - i));
        for (int j = 0; j < values_decode; ++j) {
            if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) {
                return Status::IOError("Get batch EOF");
            }
        }
        for (int j = 0; j < values_decode; ++j) {
            // Addition between min_delta, packed int and last_value should be treated as
            // unsigned addition. Overflow is as expected.
            buffer[i + j] = static_cast<UT>(_min_delta) + static_cast<UT>(buffer[i + j]) +
                            static_cast<UT>(_last_value);
            _last_value = buffer[i + j];
        }
        _values_remaining_current_mini_block -= values_decode;
        i += values_decode;
    }
    _total_values_remaining -= num_values;
    if (PREDICT_FALSE(_total_values_remaining == 0)) {
        // Skip the bit-packed padding of the final, partially consumed
        // miniblock so the reader lands on the next page section.
        if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) {
            return Status::IOError("Skip padding EOF");
        }
        _values_remaining_current_mini_block = 0;
    }
    *out_num_values = num_values;
    return Status::OK();
}
template <tparquet::Type::type PhysicalType>
void DeltaLengthByteArrayDecoder<PhysicalType>::_decode_lengths() {
    // Hand our bit reader to the embedded length decoder, then pull every
    // encoded length of the page into _buffered_length in one shot.
    _len_decoder.set_bit_reader(_bit_reader);
    const int encoded_count = _len_decoder.valid_values_count();
    _buffered_length.resize(encoded_count);
    int decoded_count = 0;
    Status st = _len_decoder.decode(_buffered_length.data(), encoded_count, &decoded_count);
    if (!st.ok()) {
        // A short read here means the page is corrupt beyond recovery.
        LOG(FATAL) << "Fail to decode delta length, status: " << st;
    }
    DCHECK_EQ(decoded_count, encoded_count);
    // Consumption restarts at the first buffered length.
    _length_idx = 0;
    _num_valid_values = encoded_count;
}
// Materializes up to `max_values` DELTA_LENGTH_BYTE_ARRAY strings: the
// per-value lengths were buffered earlier by _decode_lengths(); this reads
// the concatenated string bytes into _buffered_data and points `buffer[i]`
// at each value's slice. Count produced is returned via `out_num_values`.
template <tparquet::Type::type PhysicalType>
Status DeltaLengthByteArrayDecoder<PhysicalType>::_get_internal(Slice* buffer, int max_values,
                                                                int* out_num_values) {
    // Decode up to `max_values` strings into an internal buffer
    // and reference them into `buffer`.
    max_values = std::min(max_values, _num_valid_values);
    if (max_values == 0) {
        *out_num_values = 0;
        return Status::OK();
    }
    // Sum the buffered lengths to size one contiguous payload buffer,
    // rejecting negative lengths and int32 overflow of the total.
    int32_t data_size = 0;
    const int32_t* length_ptr = _buffered_length.data() + _length_idx;
    for (int i = 0; i < max_values; ++i) {
        int32_t len = length_ptr[i];
        if (PREDICT_FALSE(len < 0)) {
            return Status::InvalidArgument("Negative string delta length");
        }
        buffer[i].size = len;
        if (common::add_overflow(data_size, len, data_size)) {
            return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
        }
    }
    _length_idx += max_values;
    _buffered_data.resize(data_size);
    char* data_ptr = _buffered_data.data();
    // The string bytes follow the lengths in the page stream; pull them one
    // byte at a time through the shared bit reader.
    for (int j = 0; j < data_size; j++) {
        if (!_bit_reader->GetValue(8, data_ptr + j)) {
            return Status::IOError("Get length bytes EOF");
        }
    }
    // Carve the contiguous buffer into per-value slices.
    for (int i = 0; i < max_values; ++i) {
        buffer[i].data = data_ptr;
        data_ptr += buffer[i].size;
    }
    // this->num_values_ -= max_values;
    _num_valid_values -= max_values;
    *out_num_values = max_values;
    return Status::OK();
}
// Reconstructs up to `max_values` DELTA_BYTE_ARRAY strings: each output value
// is (prefix of the previous value) + (its own decoded suffix). The assembled
// bytes live in _buffered_data; `buffer[i]` is updated to reference them.
// Count produced is returned via `out_num_values`.
template <tparquet::Type::type PhysicalType>
Status DeltaByteArrayDecoder<PhysicalType>::_get_internal(Slice* buffer, int max_values,
                                                          int* out_num_values) {
    // Decode up to `max_values` strings into an internal buffer
    // and reference them into `buffer`.
    max_values = std::min(max_values, _num_valid_values);
    if (max_values == 0) {
        *out_num_values = max_values;
        return Status::OK();
    }
    // Step 1: decode all suffixes first; buffer[i] temporarily points at suffix i.
    int suffix_read;
    RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read));
    if (PREDICT_FALSE(suffix_read != max_values)) {
        return Status::IOError("Read {}, expecting {} from suffix decoder",
                               std::to_string(suffix_read), std::to_string(max_values));
    }
    // Step 2: size the output buffer as sum of (prefix length + suffix length)
    // per value, rejecting negative prefix lengths and int64 overflow.
    int64_t data_size = 0;
    const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset;
    for (int i = 0; i < max_values; ++i) {
        if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
            return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
        }
        if (PREDICT_FALSE(common::add_overflow(data_size, static_cast<int64_t>(prefix_len_ptr[i]),
                                               data_size) ||
                          common::add_overflow(data_size, static_cast<int64_t>(buffer[i].size),
                                               data_size))) {
            return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
        }
    }
    _buffered_data.resize(data_size);
    // Step 3: stitch each value as prefix-of-previous-value + its suffix.
    // `prefix` seeds from _last_value, which carries across batches/pages.
    std::string_view prefix {_last_value};
    char* data_ptr = _buffered_data.data();
    for (int i = 0; i < max_values; ++i) {
        if (PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) {
            return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
        }
        memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
        // buffer[i] currently points to the string suffix
        memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size);
        buffer[i].data = data_ptr;
        buffer[i].size += prefix_len_ptr[i];
        data_ptr += buffer[i].size;
        prefix = std::string_view {buffer[i].data, buffer[i].size};
    }
    _prefix_len_offset += max_values;
    _num_valid_values -= max_values;
    // Remember the final value so the next call can prefix against it.
    _last_value = std::string {prefix};
    if (_num_valid_values == 0) {
        _last_value_in_previous_page = _last_value;
    }
    *out_num_values = max_values;
    return Status::OK();
}
} // namespace doris::vectorized

View File

@ -25,11 +25,10 @@
namespace doris::vectorized {
template <typename T>
template <tparquet::Type::type PhysicalType>
class FixLengthDictDecoder final : public BaseDictDecoder {
public:
FixLengthDictDecoder(tparquet::Type::type physical_type)
: BaseDictDecoder(), _physical_type(physical_type) {};
FixLengthDictDecoder() : BaseDictDecoder() {};
~FixLengthDictDecoder() override = default;
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
@ -73,95 +72,7 @@ public:
return _decode_dict_values<has_filter>(doris_column, select_vector, is_dict_filter);
}
TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
switch (logical_type) {
#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \
case NUMERIC_TYPE: \
if constexpr (!std::is_same_v<T, ParquetInt96>) { \
return _decode_numeric<CPP_NUMERIC_TYPE, T, has_filter>(doris_column, select_vector); \
}
FOR_LOGICAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
case TypeIndex::Date:
if constexpr (std::is_same_v<T, Int32>) {
return _decode_date<VecDateTimeValue, Int64, has_filter>(doris_column,
select_vector);
}
break;
case TypeIndex::DateV2:
if constexpr (std::is_same_v<T, Int32>) {
return _decode_date<DateV2Value<DateV2ValueType>, UInt32, has_filter>(
doris_column, select_vector);
}
break;
case TypeIndex::DateTime:
if constexpr (std::is_same_v<T, ParquetInt96>) {
return _decode_datetime96<VecDateTimeValue, Int64, has_filter>(doris_column,
select_vector);
} else if constexpr (std::is_same_v<T, Int64>) {
return _decode_datetime64<VecDateTimeValue, Int64, has_filter>(doris_column,
select_vector);
}
break;
case TypeIndex::DateTimeV2:
// Spark can set the timestamp precision by the following configuration:
// spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
if constexpr (std::is_same_v<T, ParquetInt96>) {
return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
doris_column, select_vector);
} else if constexpr (std::is_same_v<T, Int64>) {
return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
doris_column, select_vector);
}
break;
case TypeIndex::Decimal32:
if constexpr (std::is_same_v<T, Int32>) {
return _decode_primitive_decimal<Int32, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if constexpr (std::is_same_v<T, Int64>) {
return _decode_primitive_decimal<Int32, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal64:
if constexpr (std::is_same_v<T, Int32>) {
return _decode_primitive_decimal<Int64, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if constexpr (std::is_same_v<T, Int64>) {
return _decode_primitive_decimal<Int64, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal128:
if constexpr (std::is_same_v<T, Int32>) {
return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if constexpr (std::is_same_v<T, Int64>) {
return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal128I:
if constexpr (std::is_same_v<T, Int32>) {
return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if constexpr (std::is_same_v<T, Int64>) {
return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
// TODO: decimal256
case TypeIndex::String:
[[fallthrough]];
case TypeIndex::FixedString:
break;
default:
break;
}
return Status::InvalidArgument(
"Can't decode parquet physical type {} to doris logical type {}",
tparquet::to_string(_physical_type), getTypeName(logical_type));
return _decode_numeric<has_filter>(doris_column, select_vector);
}
Status set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t length, size_t num_values) override {
@ -172,26 +83,27 @@ public:
char* dict_item_address = reinterpret_cast<char*>(_dict.get());
_dict_items.resize(num_values);
for (size_t i = 0; i < num_values; ++i) {
_dict_items[i] = *(T*)dict_item_address;
_dict_items[i] = *(DataType*)dict_item_address;
dict_item_address += _type_length;
}
return Status::OK();
}
protected:
template <typename Numeric, typename PhysicalType, bool has_filter>
template <bool has_filter>
Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<Numeric>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
size_t data_index = column_data.size() / _type_length;
column_data.resize(column_data.size() + _type_length * (select_vector.num_values() -
select_vector.num_filtered()));
size_t dict_index = 0;
DataType* data = (DataType*)column_data.data();
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
column_data[data_index++] =
static_cast<PhysicalType>(_dict_items[_indexes[dict_index++]]);
data[data_index++] = _dict_items[_indexes[dict_index++]];
}
break;
}
@ -211,250 +123,17 @@ protected:
}
return Status::OK();
}
template <typename CppType, typename ColumnType, bool has_filter>
Status _decode_date(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
date_day_offset_dict& date_dict = date_day_offset_dict::get();
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
int64_t date_value =
_dict_items[_indexes[dict_index++]] + _decode_params->offset_days;
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
// we should cast to date if using date v1.
v.cast_to_date();
} else {
reinterpret_cast<CppType&>(column_data[data_index++]) =
date_dict[date_value];
}
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
template <typename CppType, typename ColumnType, bool has_filter>
Status _decode_datetime64(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
int64_t date_value = _dict_items[_indexes[dict_index++]];
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz);
if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
// nanoseconds will be ignored.
v.set_microsecond((date_value % _decode_params->second_mask) *
_decode_params->scale_to_nano_factor / 1000);
// TODO: the precision of datetime v1
}
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
template <typename CppType, typename ColumnType, bool has_filter>
Status _decode_datetime96(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
ParquetInt96& datetime96 = _dict_items[_indexes[dict_index++]];
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
int64_t micros = datetime96.to_timestamp_micros();
v.from_unixtime(micros / 1000000, *_decode_params->ctz);
if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
// spark.sql.parquet.outputTimestampType = INT96(NANOS) will lost precision.
// only keep microseconds.
v.set_microsecond(micros % 1000000);
}
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter>
Status _decode_primitive_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
init_decimal_converter<DecimalPrimitiveType>(data_type);
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
#define M(FixedTypeLength, ValueCopyType, ScaleType) \
case FixedTypeLength: \
return _decode_primitive_decimal_internal<DecimalPrimitiveType, DecimalPhysicalType, \
has_filter, FixedTypeLength, ValueCopyType, \
ScaleType>(doris_column, data_type, \
select_vector);
#define APPLY_FOR_DECIMALS(ScaleType) \
M(1, int64_t, ScaleType) \
M(2, int64_t, ScaleType) \
M(3, int64_t, ScaleType) \
M(4, int64_t, ScaleType) \
M(5, int64_t, ScaleType) \
M(6, int64_t, ScaleType) \
M(7, int64_t, ScaleType) \
M(8, int64_t, ScaleType) \
M(9, int128_t, ScaleType) \
M(10, int128_t, ScaleType) \
M(11, int128_t, ScaleType) \
M(12, int128_t, ScaleType) \
M(13, int128_t, ScaleType) \
M(14, int128_t, ScaleType) \
M(15, int128_t, ScaleType) \
M(16, int128_t, ScaleType)
if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
}
return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
}
template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter,
int fixed_type_length, typename ValueCopyType,
DecimalScaleParams::ScaleType ScaleType>
Status _decode_primitive_decimal_internal(MutableColumnPtr& doris_column,
DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
auto& column_data =
static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column)
.get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
ValueCopyType value = static_cast<T>(_dict_items[_indexes[dict_index++]]);
if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
value *= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
value /= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
// do nothing
} else {
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
v = (DecimalPrimitiveType)value;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
tparquet::Type::type _physical_type;
using ColumnType = ParquetConvert::PhysicalTypeTraits<PhysicalType>::ColumnType;
using DataType = ParquetConvert::PhysicalTypeTraits<PhysicalType>::DataType;
// For dictionary encoding
std::vector<T> _dict_items;
std::vector<DataType> _dict_items;
};
template <>
class FixLengthDictDecoder<char*> final : public BaseDictDecoder {
class FixLengthDictDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY> final : public BaseDictDecoder {
public:
FixLengthDictDecoder(tparquet::Type::type physical_type)
: BaseDictDecoder(), _physical_type(physical_type) {};
FixLengthDictDecoder() : BaseDictDecoder() {};
~FixLengthDictDecoder() override = default;
Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
@ -487,52 +166,39 @@ public:
return _decode_dict_values<has_filter>(doris_column, select_vector, is_dict_filter);
}
TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
switch (logical_type) {
case TypeIndex::Decimal32:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal64:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal128:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal128I:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
select_vector);
}
break;
// TODO: decimal256
case TypeIndex::String:
[[fallthrough]];
case TypeIndex::FixedString:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_string<has_filter>(doris_column, select_vector);
}
break;
default:
break;
}
return Status::InvalidArgument(
"Can't decode parquet physical type {} to doris logical type {}",
tparquet::to_string(_physical_type), getTypeName(logical_type));
return _decode_string<has_filter>(doris_column, select_vector);
}
Status skip_values(size_t num_values) override {
_indexes.resize(num_values);
_index_batch_decoder->GetBatch(&_indexes[0], num_values);
protected:
template <bool has_filter>
Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
size_t dict_index = 0;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
std::vector<StringRef> string_values;
string_values.reserve(run_length);
for (size_t i = 0; i < run_length; ++i) {
string_values.emplace_back(_dict_items[_indexes[dict_index++]], _type_length);
}
doris_column->insert_many_strings(&string_values[0], run_length);
break;
}
case ColumnSelectVector::NULL_DATA: {
doris_column->insert_many_defaults(run_length);
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
@ -583,160 +249,9 @@ public:
res->insert_many_strings(&dict_values[0], dict_values.size());
return res;
}
protected:
// Decodes dictionary-encoded decimals whose raw form is a fixed-length,
// big-endian two's-complement byte array, writing Decimal<DecimalPrimitiveType>
// values into doris_column. The byte width (_type_length) and the scale
// adjustment are dispatched to compile-time specializations via the macros
// below so the inner loop carries no per-value branching.
template <typename DecimalPrimitiveType, bool has_filter>
Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
init_decimal_converter<DecimalPrimitiveType>(data_type);
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
// M emits one switch case per fixed byte width. Widths 1-8 fit in an int64_t
// intermediate; widths 9-16 need int128_t to hold the shifted value.
#define M(FixedTypeLength, ValueCopyType, ScaleType) \
case FixedTypeLength: \
return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter, FixedTypeLength, \
ValueCopyType, ScaleType>(doris_column, data_type, \
select_vector);
#define APPLY_FOR_DECIMALS(ScaleType) \
M(1, int64_t, ScaleType) \
M(2, int64_t, ScaleType) \
M(3, int64_t, ScaleType) \
M(4, int64_t, ScaleType) \
M(5, int64_t, ScaleType) \
M(6, int64_t, ScaleType) \
M(7, int64_t, ScaleType) \
M(8, int64_t, ScaleType) \
M(9, int128_t, ScaleType) \
M(10, int128_t, ScaleType) \
M(11, int128_t, ScaleType) \
M(12, int128_t, ScaleType) \
M(13, int128_t, ScaleType) \
M(14, int128_t, ScaleType) \
M(15, int128_t, ScaleType) \
M(16, int128_t, ScaleType)
if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
}
// Unreachable: every switch case above returns; kept to satisfy the compiler.
return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
}
// Materializes fixed-length dictionary entries into the output string column,
// honoring the select vector's content / null / filtered runs. Filtered
// content only advances the dictionary-index cursor; nulls insert defaults.
template <bool has_filter>
Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
size_t index_pos = 0;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
if (read_type == ColumnSelectVector::CONTENT) {
std::vector<StringRef> refs;
refs.reserve(run_length);
for (size_t i = 0; i < run_length; ++i) {
refs.emplace_back(_dict_items[_indexes[index_pos++]], _type_length);
}
doris_column->insert_many_strings(&refs[0], run_length);
} else if (read_type == ColumnSelectVector::NULL_DATA) {
doris_column->insert_many_defaults(run_length);
} else if (read_type == ColumnSelectVector::FILTERED_CONTENT) {
index_pos += run_length;
} else {
// FILTERED_NULL: nothing to emit and no cursor to advance.
}
}
return Status::OK();
}
tparquet::Type::type _physical_type;
std::unordered_map<StringRef, int32_t> _dict_value_to_code;
// For dictionary encoding
std::vector<char*> _dict_items;
std::unordered_map<StringRef, int32_t> _dict_value_to_code;
private:
// Inner loop for _decode_binary_decimal: fixed_type_length, the intermediate
// copy type, and the scale direction are all compile-time parameters.
// Reads dictionary entries (byte arrays) and writes scaled decimal values.
template <typename DecimalPrimitiveType, bool has_filter, int fixed_type_length,
typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
auto& column_data =
static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column)
.get_data();
size_t data_index = column_data.size();
// Resize once up front; null slots keep the default value from resize().
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
size_t dict_index = 0;
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
char* buf_start = _dict_items[_indexes[dict_index++]];
// When Decimal in parquet is stored in byte arrays, binary and fixed,
// the unscaled number must be encoded as two's complement using big-endian byte order.
DecimalPrimitiveType result_value = 0;
ValueCopyType value = 0;
memcpy(reinterpret_cast<char*>(&value), buf_start, fixed_type_length);
value = BitUtil::big_endian_to_host(value);
// Arithmetic right shift sign-extends the fixed_type_length-byte
// payload to the full width of ValueCopyType.
value = value >> ((sizeof(value) - fixed_type_length) * 8);
result_value = value;
if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
result_value *= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
result_value /= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
// do nothing
} else {
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
v = (DecimalPrimitiveType)result_value;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
// Slots already default-initialized by the resize above.
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
// Skip the dictionary indexes of filtered-out rows.
dict_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
};
} // namespace doris::vectorized

View File

@ -1,609 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/fix_length_plain_decoder.h"
#include <gen_cpp/parquet_types.h>
#include <stdint.h>
#include <string.h>
#include <memory>
#include <vector>
// IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
#include "common/compiler_util.h" // IWYU pragma: keep
#include "util/bit_util.h"
#include "util/slice.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/parquet/parquet_common.h"
#include "vec/runtime/vdatetime_value.h"
namespace doris {
namespace vectorized {
template <typename T>
class ColumnDecimal;
template <typename T>
class ColumnVector;
} // namespace vectorized
} // namespace doris
namespace doris::vectorized {
// Advance the read cursor past num_values fixed-width values; report an
// IO error when the cursor lands beyond the page buffer.
Status FixLengthPlainDecoder::skip_values(size_t num_values) {
    _offset += num_values * _type_length;
    return UNLIKELY(_offset > _data->size)
                   ? Status::IOError("Out-of-bounds access in parquet data decoder")
                   : Status::OK();
}
// Entry point: specialize the decode loop at compile time on whether the
// select vector carries a row filter.
Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                            ColumnSelectVector& select_vector,
                                            bool is_dict_filter) {
    return select_vector.has_filter()
                   ? _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter)
                   : _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
}
// Core dispatch: after a bounds check, route on the Doris logical type and
// then on the parquet physical type. Any (logical, physical) combination
// without a matching branch falls through to the InvalidArgument at the end.
template <bool has_filter>
Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector,
bool is_dict_filter) {
// All non-null values of this batch must fit inside the page buffer.
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
return Status::IOError("Out-of-bounds access in parquet data decoder");
}
TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
switch (logical_type) {
#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \
case NUMERIC_TYPE: \
if (_physical_type == tparquet::Type::INT32) { \
return _decode_numeric<CPP_NUMERIC_TYPE, Int32, has_filter>(doris_column, \
select_vector); \
} else if (_physical_type == tparquet::Type::INT64) { \
return _decode_numeric<CPP_NUMERIC_TYPE, Int64, has_filter>(doris_column, \
select_vector); \
} else if (_physical_type == tparquet::Type::FLOAT) { \
return _decode_numeric<CPP_NUMERIC_TYPE, Float32, has_filter>(doris_column, \
select_vector); \
} else if (_physical_type == tparquet::Type::DOUBLE) { \
return _decode_numeric<CPP_NUMERIC_TYPE, Float64, has_filter>(doris_column, \
select_vector); \
} else { \
break; \
}
FOR_LOGICAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
case TypeIndex::Date:
if (_physical_type == tparquet::Type::INT32) {
return _decode_date<VecDateTimeValue, Int64, has_filter>(doris_column, select_vector);
}
break;
case TypeIndex::DateV2:
if (_physical_type == tparquet::Type::INT32) {
return _decode_date<DateV2Value<DateV2ValueType>, UInt32, has_filter>(doris_column,
select_vector);
}
break;
case TypeIndex::DateTime:
if (_physical_type == tparquet::Type::INT96) {
return _decode_datetime96<VecDateTimeValue, Int64, has_filter>(doris_column,
select_vector);
} else if (_physical_type == tparquet::Type::INT64) {
return _decode_datetime64<VecDateTimeValue, Int64, has_filter>(doris_column,
select_vector);
}
break;
case TypeIndex::DateTimeV2:
// Spark can set the timestamp precision by the following configuration:
// spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
if (_physical_type == tparquet::Type::INT96) {
return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
doris_column, select_vector);
} else if (_physical_type == tparquet::Type::INT64) {
return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
doris_column, select_vector);
}
break;
// Decimals: FIXED_LEN_BYTE_ARRAY carries big-endian two's-complement bytes;
// INT32/INT64 carry the unscaled value directly.
case TypeIndex::Decimal32:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT32) {
return _decode_primitive_decimal<Int32, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT64) {
return _decode_primitive_decimal<Int32, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal64:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT32) {
return _decode_primitive_decimal<Int64, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT64) {
return _decode_primitive_decimal<Int64, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal128:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT32) {
return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT64) {
return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
case TypeIndex::Decimal128I:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT32) {
return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
select_vector);
} else if (_physical_type == tparquet::Type::INT64) {
return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
select_vector);
}
break;
// TODO: decimal256
case TypeIndex::String:
[[fallthrough]];
case TypeIndex::FixedString:
if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
return _decode_string<has_filter>(doris_column, select_vector);
}
break;
default:
break;
}
return Status::InvalidArgument("Can't decode parquet physical type {} to doris logical type {}",
tparquet::to_string(_physical_type), getTypeName(logical_type));
}
// Copies fixed-width byte slices out of the plain-encoded page into the
// string column, one select-vector run at a time.
template <bool has_filter>
Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column,
                                             ColumnSelectVector& select_vector) {
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        if (read_type == ColumnSelectVector::CONTENT) {
            std::vector<StringRef> slices;
            slices.reserve(run_length);
            for (size_t i = 0; i < run_length; ++i) {
                slices.emplace_back(_data->data + _offset, _type_length);
                _offset += _type_length;
            }
            doris_column->insert_many_strings(&slices[0], run_length);
        } else if (read_type == ColumnSelectVector::NULL_DATA) {
            doris_column->insert_many_defaults(run_length);
        } else if (read_type == ColumnSelectVector::FILTERED_CONTENT) {
            // Filtered rows still occupy bytes in the page; skip over them.
            _offset += _type_length * run_length;
        } else {
            // FILTERED_NULL: no bytes in the page, nothing to insert.
        }
    }
    return Status::OK();
}
// Reads plain-encoded values of the parquet physical width (PhysicalType) and
// stores them as Numeric; the per-value assignment performs the implicit
// numeric conversion (e.g. INT32 page -> Int64 column).
template <typename Numeric, typename PhysicalType, bool has_filter>
Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& doris_column,
ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<Numeric>&>(*doris_column).get_data();
size_t data_index = column_data.size();
// One resize up front; null slots keep the default value from resize().
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
char* buf_start = _data->data + _offset;
column_data[data_index++] = *(PhysicalType*)buf_start;
_offset += _type_length;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
// Filtered rows still occupy page bytes; advance past them.
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
// Decodes INT32 date values (days since the Unix epoch) into date columns.
// The day number is adjusted by _decode_params->offset_days (presumably an
// epoch/ctz adjustment set up by the decoder params — confirm against
// decoder.h) and then resolved through the shared day-offset dictionary.
template <typename CppType, typename ColumnType, bool has_filter>
Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
ColumnSelectVector::DataReadType read_type;
date_day_offset_dict& date_dict = date_day_offset_dict::get();
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
char* buf_start = _data->data + _offset;
int64_t date_value = static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start)) +
_decode_params->offset_days;
if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
v.create_from_date_v2(date_dict[date_value], TIME_DATE);
// we should cast to date if using date v1.
v.cast_to_date();
} else {
reinterpret_cast<CppType&>(column_data[data_index++]) = date_dict[date_value];
}
_offset += _type_length;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
// Decodes INT64 timestamps. second_mask splits the raw value into whole
// seconds and a sub-second remainder; scale_to_nano_factor normalizes that
// remainder to nanoseconds, which is then truncated to microseconds for
// datetime v2. Conversion to wall time uses the decoder's time zone (ctz).
template <typename CppType, typename ColumnType, bool has_filter>
Status FixLengthPlainDecoder::_decode_datetime64(MutableColumnPtr& doris_column,
ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
char* buf_start = _data->data + _offset;
int64_t& date_value = *reinterpret_cast<int64_t*>(buf_start);
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz);
if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
// nanoseconds will be ignored.
v.set_microsecond((date_value % _decode_params->second_mask) *
_decode_params->scale_to_nano_factor / 1000);
// TODO: the precision of datetime v1
}
_offset += _type_length;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
// Decodes legacy INT96 timestamps (12-byte values; ParquetInt96 converts
// them to microseconds via to_timestamp_micros). Used by writers such as
// Impala/Spark with spark.sql.parquet.outputTimestampType = INT96.
template <typename CppType, typename ColumnType, bool has_filter>
Status FixLengthPlainDecoder::_decode_datetime96(MutableColumnPtr& doris_column,
ColumnSelectVector& select_vector) {
auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
char* buf_start = _data->data + _offset;
ParquetInt96& datetime96 = *reinterpret_cast<ParquetInt96*>(buf_start);
auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
int64_t micros = datetime96.to_timestamp_micros();
v.from_unixtime(micros / 1000000, *_decode_params->ctz);
if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
// spark.sql.parquet.outputTimestampType = INT96(NANOS) will lost precision.
// only keep microseconds.
v.set_microsecond(micros % 1000000);
}
_offset += _type_length;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
// Decodes FIXED_LEN_BYTE_ARRAY decimals (big-endian two's complement) from a
// plain-encoded page. The byte width (_type_length) and the scale direction
// are lowered to compile-time template arguments via the macros below.
template <typename DecimalPrimitiveType, bool has_filter>
Status FixLengthPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
init_decimal_converter<DecimalPrimitiveType>(data_type);
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
// One switch case per fixed byte width; widths 1-8 fit an int64_t
// intermediate, widths 9-16 need int128_t.
#define M(FixedTypeLength, ValueCopyType, ScaleType) \
case FixedTypeLength: \
return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter, FixedTypeLength, \
ValueCopyType, ScaleType>(doris_column, data_type, \
select_vector);
#define APPLY_FOR_DECIMALS(ScaleType) \
M(1, int64_t, ScaleType) \
M(2, int64_t, ScaleType) \
M(3, int64_t, ScaleType) \
M(4, int64_t, ScaleType) \
M(5, int64_t, ScaleType) \
M(6, int64_t, ScaleType) \
M(7, int64_t, ScaleType) \
M(8, int64_t, ScaleType) \
M(9, int128_t, ScaleType) \
M(10, int128_t, ScaleType) \
M(11, int128_t, ScaleType) \
M(12, int128_t, ScaleType) \
M(13, int128_t, ScaleType) \
M(14, int128_t, ScaleType) \
M(15, int128_t, ScaleType) \
M(16, int128_t, ScaleType)
if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
}
// Unreachable: every switch case above returns.
return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
}
// Inner loop for _decode_binary_decimal. fixed_type_length equals
// _type_length (the caller's switch guarantees it); having it as a template
// parameter lets the memcpy/shift below be fully constant-folded.
template <typename DecimalPrimitiveType, bool has_filter, int fixed_type_length,
typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
Status FixLengthPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column,
DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
auto& column_data =
static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
char* buf_start = _data->data + _offset;
// When Decimal in parquet is stored in byte arrays, binary and fixed,
// the unscaled number must be encoded as two's complement using big-endian byte order.
DecimalPrimitiveType result_value = 0;
ValueCopyType value = 0;
memcpy(reinterpret_cast<char*>(&value), buf_start, fixed_type_length);
value = BitUtil::big_endian_to_host(value);
// Arithmetic right shift sign-extends the payload to full width.
value = value >> ((sizeof(value) - fixed_type_length) * 8);
result_value = value;
if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
result_value *= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
result_value /= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
// do nothing
} else {
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
v = (DecimalPrimitiveType)result_value;
_offset += fixed_type_length;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
// _type_length == fixed_type_length here, so this skips whole values.
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
// Decodes decimals whose parquet physical type is a plain integer
// (INT32/INT64 holding the unscaled value). Same macro-based byte-width and
// scale-direction dispatch as _decode_binary_decimal.
template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter>
Status FixLengthPlainDecoder::_decode_primitive_decimal(MutableColumnPtr& doris_column,
DataTypePtr& data_type,
ColumnSelectVector& select_vector) {
init_decimal_converter<DecimalPrimitiveType>(data_type);
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
#define M(FixedTypeLength, T, ScaleType) \
case FixedTypeLength: \
return _decode_primitive_decimal_internal<DecimalPrimitiveType, DecimalPhysicalType, \
has_filter, FixedTypeLength, T, ScaleType>( \
doris_column, data_type, select_vector);
#define APPLY_FOR_DECIMALS(ScaleType) \
M(1, int64_t, ScaleType) \
M(2, int64_t, ScaleType) \
M(3, int64_t, ScaleType) \
M(4, int64_t, ScaleType) \
M(5, int64_t, ScaleType) \
M(6, int64_t, ScaleType) \
M(7, int64_t, ScaleType) \
M(8, int64_t, ScaleType) \
M(9, int128_t, ScaleType) \
M(10, int128_t, ScaleType) \
M(11, int128_t, ScaleType) \
M(12, int128_t, ScaleType) \
M(13, int128_t, ScaleType) \
M(14, int128_t, ScaleType) \
M(15, int128_t, ScaleType) \
M(16, int128_t, ScaleType)
if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
} else {
switch (_type_length) {
APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
default:
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
}
// Unreachable: every switch case above returns.
return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
}
// Inner loop for _decode_primitive_decimal: reads DecimalPhysicalType values
// directly from the page, applies the compile-time scale adjustment, and
// narrows/widens into DecimalPrimitiveType.
template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter,
int fixed_type_length, typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
Status FixLengthPlainDecoder::_decode_primitive_decimal_internal(
MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) {
auto& column_data =
static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
size_t data_index = column_data.size();
column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
DecimalScaleParams& scale_params = _decode_params->decimal_scale;
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
for (size_t i = 0; i < run_length; ++i) {
char* buf_start = _data->data + _offset;
// Widen to ValueCopyType before scaling to avoid overflow.
ValueCopyType value = *reinterpret_cast<DecimalPhysicalType*>(buf_start);
if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
value *= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
value /= scale_params.scale_factor;
} else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
// do nothing
} else {
LOG(FATAL) << "__builtin_unreachable";
__builtin_unreachable();
}
auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
v = (DecimalPrimitiveType)value;
_offset += _type_length;
}
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -23,7 +23,8 @@
#include "common/status.h"
#include "vec/data_types/data_type.h"
#include "vec/exec/format/parquet/decoder.h"
#include "vec/exec/format/parquet/parquet_column_convert.h"
#include "vec/exec/format/parquet/parquet_common.h"
namespace doris {
namespace vectorized {
class ColumnSelectVector;
@ -32,56 +33,135 @@ class ColumnSelectVector;
namespace doris::vectorized {
// Plain decoder for parquet's fixed-width physical types. The physical type
// is now a template parameter: numeric pages are copied as raw bytes and
// converted to the Doris logical type afterwards (see parquet_column_convert),
// while FIXED_LEN_BYTE_ARRAY pages are surfaced as strings.
//
// NOTE(review): the original declaration carried diff-splice residue — two
// consecutive template headers per method, an old non-template constructor,
// an unused `_physical_type` member, and declarations of methods the
// templated class never defines — which did not compile. This version
// matches the member-template definitions that follow.
template <tparquet::Type::type PhysicalType>
class FixLengthPlainDecoder final : public Decoder {
public:
    FixLengthPlainDecoder() = default;
    ~FixLengthPlainDecoder() override = default;

    // Decode one batch into doris_column, honoring the select vector.
    Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                         ColumnSelectVector& select_vector, bool is_dict_filter) override;

    // Implementation specialized on whether a row filter is present.
    template <bool has_filter>
    Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                          ColumnSelectVector& select_vector, bool is_dict_filter);

    // Advance the cursor past num_values values without decoding them.
    Status skip_values(size_t num_values) override;

protected:
    // Raw byte copy of fixed-width numeric values into the physical column.
    template <bool has_filter>
    Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector);

    // FIXED_LEN_BYTE_ARRAY values inserted as fixed-width strings.
    template <bool has_filter>
    Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector);
};
// Skip num_values fixed-width values; error out if the cursor passes the end
// of the page buffer.
template <tparquet::Type::type PhysicalType>
Status FixLengthPlainDecoder<PhysicalType>::skip_values(size_t num_values) {
    _offset += num_values * _type_length;
    return UNLIKELY(_offset > _data->size)
                   ? Status::IOError("Out-of-bounds access in parquet data decoder")
                   : Status::OK();
}
// Entry point: pick the compile-time specialization that matches whether the
// select vector carries a row filter.
template <tparquet::Type::type PhysicalType>
Status FixLengthPlainDecoder<PhysicalType>::decode_values(MutableColumnPtr& doris_column,
                                                          DataTypePtr& data_type,
                                                          ColumnSelectVector& select_vector,
                                                          bool is_dict_filter) {
    return select_vector.has_filter()
                   ? _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter)
                   : _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
}
// Bounds-check the batch, then route by physical type: FIXED_LEN_BYTE_ARRAY
// becomes strings, every other fixed-width type is copied as raw numeric
// bytes. The branch is resolved at compile time.
template <tparquet::Type::type PhysicalType>
template <bool has_filter>
Status FixLengthPlainDecoder<PhysicalType>::_decode_values(MutableColumnPtr& doris_column,
                                                           DataTypePtr& data_type,
                                                           ColumnSelectVector& select_vector,
                                                           bool is_dict_filter) {
    const size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
    if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
        return Status::IOError("Out-of-bounds access in parquet data decoder");
    }
    if constexpr (PhysicalType != tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
        return _decode_numeric<has_filter>(doris_column, select_vector);
    } else {
        return _decode_string<has_filter>(doris_column, select_vector);
    }
}
// Insert fixed-width byte slices from the plain page into the string column,
// run by run as directed by the select vector.
template <tparquet::Type::type PhysicalType>
template <bool has_filter>
Status FixLengthPlainDecoder<PhysicalType>::_decode_string(MutableColumnPtr& doris_column,
                                                           ColumnSelectVector& select_vector) {
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        if (read_type == ColumnSelectVector::CONTENT) {
            std::vector<StringRef> slices;
            slices.reserve(run_length);
            for (size_t i = 0; i < run_length; ++i) {
                slices.emplace_back(_data->data + _offset, _type_length);
                _offset += _type_length;
            }
            doris_column->insert_many_strings(&slices[0], run_length);
        } else if (read_type == ColumnSelectVector::NULL_DATA) {
            doris_column->insert_many_defaults(run_length);
        } else if (read_type == ColumnSelectVector::FILTERED_CONTENT) {
            // Filtered rows still occupy page bytes; skip them.
            _offset += _type_length * run_length;
        } else {
            // FILTERED_NULL: nothing in the page, nothing in the column.
        }
    }
    return Status::OK();
}
// Bulk-copies fixed-width values as raw bytes: the destination is treated as
// a plain byte buffer (ColumnVector<Int8>) sized in bytes, and whole runs are
// memcpy'd from the page. The bytes are interpreted/converted to the logical
// type later (presumably by the parquet column convert step — confirm against
// parquet_column_convert.h).
template <tparquet::Type::type PhysicalType>
template <bool has_filter>
Status FixLengthPlainDecoder<PhysicalType>::_decode_numeric(MutableColumnPtr& doris_column,
ColumnSelectVector& select_vector) {
auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
size_t data_index = column_data.size();
// Resize by value-count multiplied by the per-value byte width.
column_data.resize(data_index +
_type_length * (select_vector.num_values() - select_vector.num_filtered()));
ColumnSelectVector::DataReadType read_type;
while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
memcpy(column_data.data() + data_index, _data->data + _offset,
run_length * _type_length);
_offset += run_length * _type_length;
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::NULL_DATA: {
// Leave the zero-filled bytes from resize() in place for nulls.
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}
} // namespace doris::vectorized

View File

@ -0,0 +1,82 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "vec/exec/format/parquet/parquet_column_convert.h"
#include <cctz/time_zone.h>
#include "vec/columns/column_nullable.h"
namespace doris::vectorized {
namespace ParquetConvert {
const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone();
ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type,
                     ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert) {
    // Decide which column the parquet decoder should physically read into.
    // If the physical type does not match the doris column type requested by
    // the caller, a temporary column of the physical type is returned and
    // *need_convert is set so the caller runs a conversion pass afterwards.
    ColumnPtr ans_column = doris_column;
    DataTypePtr tmp_data_type;
    switch (parquet_physical_type) {
    case tparquet::Type::type::BOOLEAN:
        tmp_data_type = std::make_shared<DataTypeUInt8>();
        break;
    case tparquet::Type::type::INT32:
        tmp_data_type = std::make_shared<DataTypeInt32>();
        break;
    case tparquet::Type::type::INT64:
        tmp_data_type = std::make_shared<DataTypeInt64>();
        break;
    case tparquet::Type::type::FLOAT:
        tmp_data_type = std::make_shared<DataTypeFloat32>();
        break;
    case tparquet::Type::type::DOUBLE:
        tmp_data_type = std::make_shared<DataTypeFloat64>();
        break;
    case tparquet::Type::type::BYTE_ARRAY:
    case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY:
        // Variable/fixed-length binary is first read as a plain string column.
        tmp_data_type = std::make_shared<DataTypeString>();
        break;
    case tparquet::Type::type::INT96:
        // Legacy INT96 timestamps are read as raw bytes (12 bytes per value).
        tmp_data_type = std::make_shared<DataTypeInt8>();
        break;
    default:
        break;
    }
    if (tmp_data_type == nullptr) {
        // Unknown physical type: the original code fell through and
        // dereferenced a null DataTypePtr below. Read directly into the
        // caller's column instead and let the decoder report the error.
        *need_convert = false;
        return doris_column;
    }
    if (tmp_data_type->get_type_id() == remove_nullable(doris_type)->get_type_id()) {
        // Same storage type. String columns still need a conversion pass when
        // they actually hold binary-encoded decimals (hive column altered to
        // string after being written as decimal).
        if (tmp_data_type->get_type_id() == TypeIndex::String &&
            (show_type == PrimitiveType::TYPE_DECIMAL32 ||
             show_type == PrimitiveType::TYPE_DECIMAL64 ||
             show_type == PrimitiveType::TYPE_DECIMALV2 ||
             show_type == PrimitiveType::TYPE_DECIMAL128I)) {
            *need_convert = true;
            ans_column = tmp_data_type->create_column();
        } else {
            *need_convert = false;
        }
    } else {
        ans_column = tmp_data_type->create_column();
        *need_convert = true;
    }
    if (*need_convert && doris_type->is_nullable()) {
        // Share the caller's null map so null information is preserved across
        // the conversion pass.
        auto doris_nullable_column = static_cast<const ColumnNullable*>(doris_column.get());
        ans_column = ColumnNullable::create(ans_column,
                                            doris_nullable_column->get_null_map_column_ptr());
    }
    return ans_column;
}
} // namespace ParquetConvert
} // namespace doris::vectorized

View File

@ -0,0 +1,665 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/parquet_types.h>
#include <algorithm>
#include <functional>
#include <ostream>
#include <utility>
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/status.h"
#include "gen_cpp/descriptors.pb.h"
#include "gutil/endian.h"
#include "gutil/strings/numbers.h"
#include "io/file_factory.h"
#include "olap/olap_common.h"
#include "util/coding.h"
#include "util/slice.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/parquet/decoder.h"
#include "vec/exec/format/parquet/parquet_common.h"
namespace doris::vectorized {
namespace ParquetConvert {
// Maps each parquet physical type to the representation used while decoding:
// `DataType` is the C++ value type and `ColumnType` is the doris column the
// raw parquet values are read into before any logical-type conversion.
template <tparquet::Type::type ParquetType>
struct PhysicalTypeTraits {};

template <>
struct PhysicalTypeTraits<tparquet::Type::INT32> {
    using DataType = int32_t;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::BOOLEAN> {
    using DataType = uint8;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::INT64> {
    using DataType = int64_t;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::FLOAT> {
    using DataType = float;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::DOUBLE> {
    using DataType = double;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::BYTE_ARRAY> {
    using DataType = String;
    using ColumnType = ColumnString;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::FIXED_LEN_BYTE_ARRAY> {
    using DataType = String;
    using ColumnType = ColumnString;
};

// INT96 (legacy impala/hive timestamps) is read as raw bytes into an Int8
// column; the 12-byte values are reinterpreted later by the converters.
template <>
struct PhysicalTypeTraits<tparquet::Type::INT96> {
    using DataType = ParquetInt96;
    using ColumnType = ColumnVector<Int8>;
};
// X-macro over the numeric doris logical types reachable by a plain numeric
// cast: M(doris TypeIndex, doris cpp type, parquet physical cpp type).
// Used by get_converter() to instantiate NumberToNumberConvert.
#define FOR_LOGICAL_NUMERIC_TYPES(M)        \
    M(TypeIndex::Int8, Int8, Int32)         \
    M(TypeIndex::Int16, Int16, Int32)       \
    M(TypeIndex::Int32, Int32, Int32)       \
    M(TypeIndex::Int64, Int64, Int64)       \
    M(TypeIndex::Float32, Float32, Float32) \
    M(TypeIndex::Float64, Float64, Float64)

// X-macro over decimal logical types: M(TypeIndex, decimal type, primitive type).
// Note Decimal128 and Decimal128I share the same physical representation.
#define FOR_LOGICAL_DECIMAL_TYPES(M)           \
    M(TypeIndex::Decimal32, Decimal32, Int32)  \
    M(TypeIndex::Decimal64, Decimal64, Int64)  \
    M(TypeIndex::Decimal128, Decimal128, Int128) \
    M(TypeIndex::Decimal128I, Decimal128, Int128)
// Shared context for one column's conversion pass: timezone handling,
// timestamp unit scaling and decimal rescaling derived from the parquet schema.
struct ConvertParams {
    // Used when schema.logicalType.TIMESTAMP.isAdjustedToUTC == false.
    static const cctz::time_zone utc0;
    // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone
    cctz::time_zone* ctz = nullptr;
    // Day correction applied to parquet DATE values for the configured zone.
    // Can legitimately be -1, so it must be a signed type: the previous
    // `size_t` only produced correct results through unsigned wrap-around
    // when added to an int64_t day number.
    int64_t offset_days = 0;
    // Sub-second units per second in the source timestamps
    // (1000 = millis, 1000000 = micros, 1000000000 = nanos).
    int64_t second_mask = 1;
    // Factor that rescales one source unit to nanoseconds.
    int64_t scale_to_nano_factor = 1;
    // Decimal rescaling info, filled lazily by init_decimal_converter().
    DecimalScaleParams decimal_scale;
    FieldSchema* field_schema = nullptr;
    // First destination row this conversion batch writes to.
    size_t start_idx = 0;

    // Derive timestamp unit and timezone handling from the parquet schema element.
    void init(FieldSchema* field_schema_, cctz::time_zone* ctz_, size_t start_idx_ = 0) {
        field_schema = field_schema_;
        if (ctz_ != nullptr) {
            ctz = ctz_;
        }
        const auto& schema = field_schema->parquet_schema;
        if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) {
            const auto& timestamp_info = schema.logicalType.TIMESTAMP;
            if (!timestamp_info.isAdjustedToUTC) {
                // should set timezone to utc+0
                ctz = const_cast<cctz::time_zone*>(&utc0);
            }
            const auto& time_unit = timestamp_info.unit;
            if (time_unit.__isset.MILLIS) {
                second_mask = 1000;
                scale_to_nano_factor = 1000000;
            } else if (time_unit.__isset.MICROS) {
                second_mask = 1000000;
                scale_to_nano_factor = 1000;
            } else if (time_unit.__isset.NANOS) {
                second_mask = 1000000000;
                scale_to_nano_factor = 1;
            }
        } else if (schema.__isset.converted_type) {
            // Legacy converted_type annotation (parquet writers that predate
            // logical types).
            const auto& converted_type = schema.converted_type;
            if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) {
                second_mask = 1000;
                scale_to_nano_factor = 1000000;
            } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) {
                second_mask = 1000000;
                scale_to_nano_factor = 1000;
            }
        }
        if (ctz) {
            // Probe the unix epoch in the target zone: zones behind UTC land
            // on 1969-12-31 (day() == 31), in which case DATE values need a
            // -1 day correction.
            VecDateTimeValue t;
            t.from_unixtime(0, *ctz);
            offset_days = t.day() == 31 ? -1 : 0;
        }
        start_idx = start_idx_;
    }

    // Compute the scale adjustment between the parquet-declared scale and the
    // destination decimal column's scale. No-op after the first call.
    template <typename DecimalPrimitiveType>
    void init_decimal_converter(DataTypePtr& data_type) {
        if (field_schema == nullptr || decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) {
            return;
        }
        auto scale = field_schema->parquet_schema.scale;
        auto* decimal_type = static_cast<DataTypeDecimal<Decimal<DecimalPrimitiveType>>*>(
                const_cast<IDataType*>(remove_nullable(data_type).get()));
        auto dest_scale = decimal_type->get_scale();
        if (dest_scale > scale) {
            decimal_scale.scale_type = DecimalScaleParams::SCALE_UP;
            decimal_scale.scale_factor =
                    DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dest_scale - scale);
        } else if (dest_scale < scale) {
            decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN;
            decimal_scale.scale_factor =
                    DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dest_scale);
        } else {
            decimal_scale.scale_type = DecimalScaleParams::NO_SCALE;
            decimal_scale.scale_factor = 1;
        }
    }
};
/*
* parquet_physical_type : The type of data stored in parquet.
* Read data into columns returned by get_column according to the physical type of parquet.
* show_type : The data format that should be displayed.
* doris_column : What type of column does the upper layer need to put the data in.
*
* example :
* In hive, if decimal is stored as FIXED_LEN_BYTE_ARRAY in parquet,
* then we use `ALTER TABLE TableName CHANGE COLUMN Col_Decimal Col_Decimal String;`
* to convert this column to string type.
* parquet_type : FIXED_LEN_BYTE_ARRAY.
* ans_data_type : ColumnInt8
* show_type : Decimal.
* doris_column : ColumnString.
*/
ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type,
ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert);
// Base class for all physical->logical converters. convert() reads the column
// the parquet decoder filled (src_col) and writes converted values into the
// column the caller asked for (dst_col).
struct ColumnConvert {
    virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); }
    virtual ~ColumnConvert() = default;

    // Strip the nullable wrappers from both sides: null-map bookkeeping is
    // handled by the caller, converters only see the nested data columns.
    void convert_null(ColumnPtr& src_col, MutableColumnPtr& dst_col) {
        src_col = remove_nullable(src_col);
        dst_col = remove_nullable(dst_col->get_ptr())->assume_mutable();
    }

public:
    // Shared conversion context (timezone, decimal scale, schema).
    // Set by get_converter(); not owned.
    ConvertParams* _convert_params;
};
template <tparquet::Type::type parquet_physical_type, typename dst_type>
struct NumberToNumberConvert : public ColumnConvert {
    // Numeric cast between the parquet physical column and the doris numeric
    // column; converted values are written starting at start_idx.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        using ColumnType = typename PhysicalTypeTraits<parquet_physical_type>::ColumnType;
        convert_null(src_col, dst_col);
        const size_t row_count = src_col->size();
        const auto& src_values = static_cast<const ColumnType*>(src_col.get())->get_data();
        dst_col->resize(_convert_params->start_idx + row_count);
        auto& dst_values = static_cast<ColumnVector<dst_type>&>(*dst_col.get()).get_data();
        dst_type* out = dst_values.data() + _convert_params->start_idx;
        for (size_t row = 0; row < row_count; ++row) {
            out[row] = static_cast<dst_type>(src_values[row]);
        }
        return Status::OK();
    }
};
template <tparquet::Type::type parquet_physical_type>
struct NumberToStringConvert : public ColumnConvert {
    // Renders each numeric value in its decimal text form and appends it to a
    // ColumnString, using the fast gutil formatters where one exists.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        using ColumnType = typename PhysicalTypeTraits<parquet_physical_type>::ColumnType;
        convert_null(src_col, dst_col);
        const size_t row_count = src_col->size();
        const auto& values = static_cast<const ColumnType*>(src_col.get())->get_data();
        auto* out = static_cast<ColumnString*>(dst_col.get());
        char scratch[100];
        for (size_t row = 0; row < row_count; ++row) {
            if constexpr (parquet_physical_type == tparquet::Type::FLOAT) {
                int written = FastFloatToBuffer(values[row], scratch, true);
                out->insert_data(scratch, written);
            } else if constexpr (parquet_physical_type == tparquet::Type::DOUBLE) {
                int written = FastDoubleToBuffer(values[row], scratch, true);
                out->insert_data(scratch, written);
            } else if constexpr (parquet_physical_type == tparquet::Type::INT32) {
                char* tail = FastInt32ToBufferLeft(values[row], scratch);
                out->insert_data(scratch, tail - scratch);
            } else if constexpr (parquet_physical_type == tparquet::Type::INT64) {
                char* tail = FastInt64ToBufferLeft(values[row], scratch);
                out->insert_data(scratch, tail - scratch);
            } else {
                // Remaining physical type (BOOLEAN): plain std::to_string.
                std::string text = std::to_string(values[row]);
                out->insert_data(text.data(), text.size());
            }
        }
        return Status::OK();
    }
};
struct Int96toTimestamp : public ColumnConvert {
public:
    // Converts legacy 12-byte INT96 timestamps (stored as raw bytes in a
    // ColumnVector<Int8>) into DateTimeV2 values.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        // Source column holds raw bytes; each value spans sizeof(ParquetInt96) bytes.
        size_t rows = src_col->size() / sizeof(ParquetInt96);
        auto& src_data = static_cast<const ColumnVector<Int8>*>(src_col.get())->get_data();
        auto ParquetInt96_data = (ParquetInt96*)src_data.data();
        dst_col->resize(_convert_params->start_idx + rows);
        auto& data = static_cast<ColumnVector<UInt64>*>(dst_col.get())->get_data();
        for (int i = 0; i < rows; i++) {
            ParquetInt96 x = ParquetInt96_data[i];
            auto& num = data[_convert_params->start_idx + i];
            // DateTimeV2 is stored as a packed UInt64; build it in place.
            auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num);
            int64_t micros = x.to_timestamp_micros();
            value.from_unixtime(micros / 1000000, *_convert_params->ctz);
            value.set_microsecond(micros % 1000000);
        }
        return Status::OK();
    }
};
struct Int64ToTimestamp : public ColumnConvert {
public:
    // Converts INT64 epoch timestamps into DateTimeV2 values. The source
    // unit (millis/micros/nanos) is captured by second_mask and
    // scale_to_nano_factor, derived from the schema in ConvertParams::init().
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        size_t rows = src_col->size();
        dst_col->resize(_convert_params->start_idx + rows);
        auto src_data = static_cast<const ColumnVector<int64_t>*>(src_col.get())->get_data().data();
        auto& data = static_cast<ColumnVector<UInt64>*>(dst_col.get())->get_data();
        for (int i = 0; i < rows; i++) {
            int64_t x = src_data[i];
            auto& num = data[_convert_params->start_idx + i];
            // DateTimeV2 is stored as a packed UInt64; fill it in place.
            auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num);
            value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz);
            // Sub-second remainder, rescaled from source units to microseconds.
            value.set_microsecond((x % _convert_params->second_mask) *
                                  (_convert_params->scale_to_nano_factor / 1000));
        }
        return Status::OK();
    }
};
class Int32ToDate : public ColumnConvert {
public:
    // Converts parquet DATE values (days since unix epoch) into DateV2 values
    // via the precomputed day-offset dictionary.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        size_t rows = src_col->size();
        dst_col->resize(_convert_params->start_idx + rows);
        auto& src_data = static_cast<const ColumnVector<int32>*>(src_col.get())->get_data();
        auto& data = static_cast<ColumnDateV2*>(dst_col.get())->get_data();
        date_day_offset_dict& date_dict = date_day_offset_dict::get();
        for (int i = 0; i < rows; i++) {
            auto& value = reinterpret_cast<DateV2Value<DateV2ValueType>&>(
                    data[_convert_params->start_idx + i]);
            // offset_days corrects for the configured timezone's epoch day
            // boundary (see ConvertParams::init).
            int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days;
            value = date_dict[date_value];
        }
        return Status::OK();
    }
};
template <typename DecimalType, typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
class StringToDecimal : public ColumnConvert {
public:
    // Decodes FIXED_LEN_BYTE_ARRAY decimals: each value is a big-endian
    // two's-complement unscaled integer, rescaled to the destination scale.
    // ValueCopyType must be wide enough for the declared type_length.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        size_t rows = src_col->size();
        DecimalScaleParams& scale_params = _convert_params->decimal_scale;
        auto buf = static_cast<const ColumnString*>(src_col.get())->get_chars().data();
        auto& offset = static_cast<const ColumnString*>(src_col.get())->get_offsets();
        dst_col->resize(_convert_params->start_idx + rows);
        auto& data = static_cast<ColumnDecimal<DecimalType>*>(dst_col.get())->get_data();
        for (int i = 0; i < rows; i++) {
            // NOTE(review): relies on offset[-1] reading as 0 for the first
            // row — doris offset arrays provide that sentinel.
            size_t len = offset[i] - offset[i - 1];
            // When Decimal in parquet is stored in byte arrays, binary and fixed,
            // the unscaled number must be encoded as two's complement using big-endian byte order.
            ValueCopyType value = 0;
            memcpy(reinterpret_cast<char*>(&value), buf + offset[i - 1], len);
            value = BitUtil::big_endian_to_host(value);
            // Arithmetic shift sign-extends the len-byte value to full width.
            value = value >> ((sizeof(value) - len) * 8);
            if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                value *= scale_params.scale_factor;
            } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                value /= scale_params.scale_factor;
            } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                // do nothing
            } else {
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
            auto& v = reinterpret_cast<DecimalType&>(data[_convert_params->start_idx + i]);
            v = (DecimalType)value;
        }
        return Status::OK();
    }
};
template <typename NumberType, typename DecimalPhysicalType, typename ValueCopyType,
          DecimalScaleParams::ScaleType ScaleType>
class NumberToDecimal : public ColumnConvert {
public:
    // Treats each integer value as an unscaled decimal and rescales it to the
    // destination column's scale.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        const size_t row_count = src_col->size();
        const auto* in =
                static_cast<const ColumnVector<NumberType>*>(src_col.get())->get_data().data();
        dst_col->resize(_convert_params->start_idx + row_count);
        DecimalScaleParams& scale_params = _convert_params->decimal_scale;
        auto* out = static_cast<ColumnDecimal<Decimal<DecimalPhysicalType>>*>(dst_col.get())
                            ->get_data()
                            .data() +
                    _convert_params->start_idx;
        for (size_t row = 0; row < row_count; ++row) {
            ValueCopyType scaled = in[row];
            if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                scaled *= scale_params.scale_factor;
            } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                scaled /= scale_params.scale_factor;
            }
            out[row] = (DecimalPhysicalType)scaled;
        }
        return Status::OK();
    }
};
template <typename DecimalType, typename ValueCopyType>
class StringToDecimalString : public ColumnConvert {
public:
    // Decodes FIXED_LEN_BYTE_ARRAY decimals (big-endian two's complement) and
    // renders them as decimal strings using the scale declared in the schema.
    // Used when a hive decimal column was altered to string.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        size_t rows = src_col->size();
        auto buf = static_cast<const ColumnString*>(src_col.get())->get_chars().data();
        auto& offset = static_cast<const ColumnString*>(src_col.get())->get_offsets();
        auto data = static_cast<ColumnString*>(dst_col.get());
        for (int i = 0; i < rows; i++) {
            // NOTE(review): relies on offset[-1] reading as 0 for the first row.
            int len = offset[i] - offset[i - 1];
            // When Decimal in parquet is stored in byte arrays, binary and fixed,
            // the unscaled number must be encoded as two's complement using big-endian byte order.
            ValueCopyType value = 0;
            memcpy(reinterpret_cast<char*>(&value), buf + offset[i - 1], len);
            value = BitUtil::big_endian_to_host(value);
            // Arithmetic shift sign-extends the len-byte value to full width.
            value = value >> ((sizeof(value) - len) * 8);
            std::string ans = reinterpret_cast<DecimalType&>(value).to_string(
                    _convert_params->field_schema->parquet_schema.scale);
            data->insert_data(ans.data(), ans.size());
        }
        return Status::OK();
    }
};
class Int32ToDateString : public ColumnConvert {
public:
    // Formats parquet DATE values (days since unix epoch) as date strings.
    // Used when a hive date column was altered to string.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        const size_t row_count = src_col->size();
        const auto& day_numbers =
                static_cast<const ColumnVector<int32>*>(src_col.get())->get_data();
        date_day_offset_dict& date_dict = date_day_offset_dict::get();
        auto* out = static_cast<ColumnString*>(dst_col.get());
        char scratch[50];
        for (size_t row = 0; row < row_count; ++row) {
            // offset_days corrects for the configured timezone's epoch day boundary.
            const int64_t day_index = (int64_t)day_numbers[row] + _convert_params->offset_days;
            DateV2Value<DateV2ValueType> date_value = date_dict[day_index];
            char* tail = date_value.to_string(scratch);
            out->insert_data(scratch, tail - scratch);
        }
        return Status::OK();
    }
};
class Int96ToTimestampString : public ColumnConvert {
public:
    // Formats legacy 12-byte INT96 timestamps as datetime strings.
    // Used when a hive timestamp column was altered to string.
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);
        auto& src_data = static_cast<const ColumnVector<Int8>*>(src_col.get())->get_data();
        auto dst_data = static_cast<ColumnString*>(dst_col.get());
        // Source column holds raw bytes; each value spans sizeof(ParquetInt96) bytes.
        size_t rows = src_col->size() / sizeof(ParquetInt96);
        ParquetInt96* data = (ParquetInt96*)src_data.data();
        char buf[50];
        for (int i = 0; i < rows; i++) {
            // Build a DateTimeV2 value inside a local packed UInt64, then print it.
            uint64_t num = 0;
            auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num);
            int64_t micros = data[i].to_timestamp_micros();
            value.from_unixtime(micros / 1000000, *_convert_params->ctz);
            value.set_microsecond(micros % 1000000);
            char* end = value.to_string(buf);
            dst_data->insert_data(buf, end - buf);
        }
        return Status::OK();
    }
};
// Selects the ColumnConvert implementation for a (parquet physical type,
// destination doris logical type) pair. `show_type` carries the column's
// original display type so that e.g. a string destination can still decode
// binary decimals / INT96 timestamps / INT32 dates written before an
// ALTER TABLE changed the hive column type to string.
// Returns Status::NotSupported when no conversion path exists.
inline Status get_converter(tparquet::Type::type parquet_physical_type, PrimitiveType show_type,
                            std::shared_ptr<const IDataType> dst_data_type,
                            std::unique_ptr<ColumnConvert>* converter,
                            ConvertParams* convert_params) {
    auto dst_type = remove_nullable(dst_data_type)->get_type_id();
    switch (dst_type) {
// Numeric destinations: any numeric physical type converts via a plain cast.
#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE)                          \
    case NUMERIC_TYPE:                                                                   \
        switch (parquet_physical_type) {                                                 \
        case tparquet::Type::BOOLEAN:                                                    \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::BOOLEAN, CPP_NUMERIC_TYPE>>(); \
            break;                                                                       \
        case tparquet::Type::INT32:                                                      \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::INT32, CPP_NUMERIC_TYPE>>();   \
            break;                                                                       \
        case tparquet::Type::INT64:                                                      \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::INT64, CPP_NUMERIC_TYPE>>();   \
            break;                                                                       \
        case tparquet::Type::FLOAT:                                                      \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::FLOAT, CPP_NUMERIC_TYPE>>();   \
            break;                                                                       \
        case tparquet::Type::DOUBLE:                                                     \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::DOUBLE, CPP_NUMERIC_TYPE>>();  \
            break;                                                                       \
        default:                                                                         \
            break;                                                                       \
        }                                                                                \
        break;
        FOR_LOGICAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
    case TypeIndex::String: {
        // First try the show_type-driven paths: the column was originally a
        // decimal/timestamp/date before being altered to string.
        if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) {
            if (show_type == PrimitiveType::TYPE_DECIMAL32) {
                *converter = std::make_unique<StringToDecimalString<Decimal32, Int32>>();
                break;
            } else if (show_type == PrimitiveType::TYPE_DECIMAL64) {
                *converter = std::make_unique<StringToDecimalString<Decimal64, Int64>>();
                break;
            } else if (show_type == PrimitiveType::TYPE_DECIMALV2) {
                *converter = std::make_unique<StringToDecimalString<Decimal128, Int128>>();
                break;
            } else if (show_type == PrimitiveType::TYPE_DECIMAL128I) {
                *converter = std::make_unique<StringToDecimalString<Decimal128, Int128>>();
                break;
            }
        } else if (tparquet::Type::INT96 == parquet_physical_type) {
            *converter = std::make_unique<Int96ToTimestampString>();
            break;
        } else if (tparquet::Type::INT32 == parquet_physical_type) {
            if (show_type == PrimitiveType::TYPE_DATEV2) {
                *converter = std::make_unique<Int32ToDateString>();
                break;
            }
        }
        // Fall back to plain number-to-string formatting. BYTE_ARRAY has no
        // branch here — presumably string columns read as BYTE_ARRAY never
        // reach get_converter because get_column sets need_convert = false.
        if (parquet_physical_type == tparquet::Type::BOOLEAN) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::BOOLEAN>>();
        } else if (parquet_physical_type == tparquet::Type::INT32) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::INT32>>();
        } else if (parquet_physical_type == tparquet::Type::INT64) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::INT64>>();
        } else if (parquet_physical_type == tparquet::Type::FLOAT) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::FLOAT>>();
        } else if (parquet_physical_type == tparquet::Type::DOUBLE) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::DOUBLE>>();
        }
        break;
    }
    case TypeIndex::DateV2:
        if (tparquet::Type::INT32 == parquet_physical_type) {
            *converter = std::make_unique<Int32ToDate>();
        }
        break;
    case TypeIndex::DateTimeV2:
        if (tparquet::Type::INT96 == parquet_physical_type) {
            *converter = std::make_unique<Int96toTimestamp>();
        } else if (tparquet::Type::INT64 == parquet_physical_type) {
            *converter = std::make_unique<Int64ToTimestamp>();
        }
        break;
// Decimal destinations: choose by physical encoding, stored byte width and the
// scale adjustment computed by init_decimal_converter(). FIXED_LEN_BYTE_ARRAY
// wider than 16 bytes is left unset and reported as NotSupported below.
#define DISPATCH2(TypeIndex_DECIMAL_TYPE, DECIMAL_TYPE, PRIMARY_TYPE)                        \
    case TypeIndex_DECIMAL_TYPE: {                                                           \
        convert_params->init_decimal_converter<PRIMARY_TYPE>(dst_data_type);                 \
        DecimalScaleParams& scale_params = convert_params->decimal_scale;                    \
        if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) {                 \
            size_t string_length = convert_params->field_schema->parquet_schema.type_length; \
            if (string_length <= 8) {                                                        \
                if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {               \
                    *converter =                                                             \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int64_t,          \
                                                             DecimalScaleParams::SCALE_UP>>(); \
                } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {      \
                    *converter =                                                             \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int64_t,          \
                                                             DecimalScaleParams::SCALE_DOWN>>(); \
                } else {                                                                     \
                    *converter =                                                             \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int64_t,          \
                                                             DecimalScaleParams::NO_SCALE>>(); \
                }                                                                            \
            } else if (string_length <= 16) {                                                \
                if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {               \
                    *converter =                                                             \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int128_t,         \
                                                             DecimalScaleParams::SCALE_UP>>(); \
                } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {      \
                    *converter =                                                             \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int128_t,         \
                                                             DecimalScaleParams::SCALE_DOWN>>(); \
                } else {                                                                     \
                    *converter =                                                             \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int128_t,         \
                                                             DecimalScaleParams::NO_SCALE>>(); \
                }                                                                            \
            }                                                                                \
        } else if (tparquet::Type::INT32 == parquet_physical_type) {                         \
            if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {                   \
                *converter = std::make_unique<NumberToDecimal<Int32, PRIMARY_TYPE, int64_t,  \
                                                              DecimalScaleParams::SCALE_UP>>(); \
            } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {          \
                *converter = std::make_unique<NumberToDecimal<Int32, PRIMARY_TYPE, int64_t,  \
                                                              DecimalScaleParams::SCALE_DOWN>>(); \
            } else {                                                                         \
                *converter = std::make_unique<NumberToDecimal<Int32, PRIMARY_TYPE, int64_t,  \
                                                              DecimalScaleParams::NO_SCALE>>(); \
            }                                                                                \
        } else if (tparquet::Type::INT64 == parquet_physical_type) {                         \
            if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {                   \
                *converter = std::make_unique<NumberToDecimal<Int64, PRIMARY_TYPE, int64_t,  \
                                                              DecimalScaleParams::SCALE_UP>>(); \
            } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {          \
                *converter = std::make_unique<NumberToDecimal<Int64, PRIMARY_TYPE, int64_t,  \
                                                              DecimalScaleParams::SCALE_DOWN>>(); \
            } else {                                                                         \
                *converter = std::make_unique<NumberToDecimal<Int64, PRIMARY_TYPE, int64_t,  \
                                                              DecimalScaleParams::NO_SCALE>>(); \
            }                                                                                \
        }                                                                                    \
        break;                                                                               \
    }
        FOR_LOGICAL_DECIMAL_TYPES(DISPATCH2)
#undef DISPATCH2
    default:
        break;
    }
    // No branch above produced a converter: the combination is unsupported.
    if (*converter == nullptr) {
        return Status::NotSupported("Can't cast type parquet physical {} to doris logical type {}",
                                    tparquet::to_string(parquet_physical_type),
                                    getTypeName(dst_type));
    }
    (*converter)->_convert_params = convert_params;
    return Status::OK();
}
}; // namespace ParquetConvert
}; // namespace doris::vectorized

View File

@ -54,6 +54,11 @@ struct ParquetInt96 {
inline uint64_t to_timestamp_micros() const {
return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND;
}
inline __int128 to_int128() const {
__int128 ans = 0;
ans = (((__int128)hi) << 64) + lo;
return ans;
}
static const uint32_t JULIAN_EPOCH_OFFSET_DAYS;
static const uint64_t MICROS_IN_DAY;
@ -151,4 +156,4 @@ private:
size_t _num_filtered;
size_t _read_index;
};
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -88,9 +88,9 @@ private:
TypeDescriptor convert_to_doris_type(const tparquet::SchemaElement& physical_schema);
public:
TypeDescriptor get_doris_type(const tparquet::SchemaElement& physical_schema);
public:
FieldDescriptor() = default;
~FieldDescriptor() = default;

View File

@ -54,7 +54,7 @@ ColumnChunkReader::ColumnChunkReader(io::BufferedStreamReader* reader,
_max_def_level(field_schema->definition_level),
_stream_reader(reader),
_metadata(column_chunk->meta_data),
_ctz(ctz),
// _ctz(ctz),
_io_ctx(io_ctx) {}
Status ColumnChunkReader::init() {
@ -194,7 +194,7 @@ Status ColumnChunkReader::load_page_data() {
// Set type length
page_decoder->set_type_length(_get_type_length());
// Initialize the time convert context
page_decoder->init(_field_schema, _ctz);
// page_decoder->init(_field_schema, _ctz);
_decoders[static_cast<int>(encoding)] = std::move(page_decoder);
_page_decoder = _decoders[static_cast<int>(encoding)].get();
}
@ -242,7 +242,7 @@ Status ColumnChunkReader::_decode_dict_page() {
// Set type length
page_decoder->set_type_length(_get_type_length());
// Initialize the time convert context
page_decoder->init(_field_schema, _ctz);
// page_decoder->init(_field_schema, _ctz);
// Set the dictionary data
RETURN_IF_ERROR(page_decoder->set_dict(dict_data, uncompressed_size,
header.dictionary_page_header.num_values));
@ -323,4 +323,4 @@ int32_t ColumnChunkReader::_get_type_length() {
return -1;
}
}
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -193,7 +193,7 @@ private:
io::BufferedStreamReader* _stream_reader;
tparquet::ColumnMetaData _metadata;
cctz::time_zone* _ctz;
// cctz::time_zone* _ctz;
io::IOContext* _io_ctx;
std::unique_ptr<PageReader> _page_reader = nullptr;

View File

@ -25,6 +25,7 @@
#include <algorithm>
#include <utility>
#include "parquet_column_convert.h"
#include "runtime/define_primitive_type.h"
#include "schema_desc.h"
#include "util/runtime_profile.h"
@ -252,8 +253,9 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu
NullMap* map_data_column = nullptr;
if (doris_column->is_nullable()) {
SCOPED_RAW_TIMER(&_decode_null_map_time);
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
(*std::move(doris_column)).mutate().get());
auto* nullable_column =
static_cast<vectorized::ColumnNullable*>(const_cast<IColumn*>(doris_column.get()));
data_column = nullable_column->get_nested_column_ptr();
map_data_column = &(nullable_column->get_null_map_data());
if (_chunk_reader->max_def_level() > 0) {
@ -360,8 +362,11 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
NullMap* map_data_column = nullptr;
if (doris_column->is_nullable()) {
SCOPED_RAW_TIMER(&_decode_null_map_time);
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
(*std::move(doris_column)).mutate().get());
auto* nullable_column = const_cast<vectorized::ColumnNullable*>(
static_cast<const vectorized::ColumnNullable*>(doris_column.get()));
// auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
// (*std::move(src_column)).mutate().get());
data_column = nullable_column->get_nested_column_ptr();
map_data_column = &(nullable_column->get_null_map_data());
} else {
@ -476,86 +481,108 @@ Status ScalarColumnReader::_try_load_dict_page(bool* loaded, bool* has_dict) {
Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr& type,
ColumnSelectVector& select_vector, size_t batch_size,
size_t* read_rows, bool* eof, bool is_dict_filter) {
if (_chunk_reader->remaining_num_values() == 0) {
if (!_chunk_reader->has_next_page()) {
*eof = true;
*read_rows = 0;
return Status::OK();
}
RETURN_IF_ERROR(_chunk_reader->next_page());
}
if (_nested_column) {
RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
return _read_nested_column(doris_column, type, select_vector, batch_size, read_rows, eof,
is_dict_filter);
}
bool need_convert = false;
auto& parquet_physical_type = _chunk_meta.meta_data.type;
auto& show_type = _field_schema->type.type;
// generate the row ranges that should be read
std::list<RowRange> read_ranges;
_generate_read_ranges(_current_row_index,
_current_row_index + _chunk_reader->remaining_num_values(), read_ranges);
if (read_ranges.size() == 0) {
// skip the whole page
_current_row_index += _chunk_reader->remaining_num_values();
RETURN_IF_ERROR(_chunk_reader->skip_page());
*read_rows = 0;
} else {
bool skip_whole_batch = false;
// Determining whether to skip page or batch will increase the calculation time.
// When the filtering effect is greater than 60%, it is possible to skip the page or batch.
if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) {
// lazy read
size_t remaining_num_values = 0;
for (auto& range : read_ranges) {
remaining_num_values += range.last_row - range.first_row;
}
if (batch_size >= remaining_num_values &&
select_vector.can_filter_all(remaining_num_values)) {
// We can skip the whole page if the remaining values is filtered by predicate columns
select_vector.skip(remaining_num_values);
_current_row_index += _chunk_reader->remaining_num_values();
RETURN_IF_ERROR(_chunk_reader->skip_page());
*read_rows = remaining_num_values;
if (!_chunk_reader->has_next_page()) {
*eof = true;
}
ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type,
doris_column, type, &need_convert);
do {
if (_chunk_reader->remaining_num_values() == 0) {
if (!_chunk_reader->has_next_page()) {
*eof = true;
*read_rows = 0;
return Status::OK();
}
skip_whole_batch =
batch_size <= remaining_num_values && select_vector.can_filter_all(batch_size);
if (skip_whole_batch) {
select_vector.skip(batch_size);
}
RETURN_IF_ERROR(_chunk_reader->next_page());
}
// load page data to decode or skip values
RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
size_t has_read = 0;
for (auto& range : read_ranges) {
// generate the skipped values
size_t skip_values = range.first_row - _current_row_index;
RETURN_IF_ERROR(_skip_values(skip_values));
_current_row_index += skip_values;
// generate the read values
size_t read_values =
std::min((size_t)(range.last_row - range.first_row), batch_size - has_read);
if (skip_whole_batch) {
RETURN_IF_ERROR(_skip_values(read_values));
} else {
RETURN_IF_ERROR(_read_values(read_values, doris_column, type, select_vector,
is_dict_filter));
}
has_read += read_values;
_current_row_index += read_values;
if (has_read == batch_size) {
break;
}
if (_nested_column) {
RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
RETURN_IF_ERROR(_read_nested_column(src_column, type, select_vector, batch_size,
read_rows, eof, is_dict_filter));
break;
}
*read_rows = has_read;
// generate the row ranges that should be read
std::list<RowRange> read_ranges;
_generate_read_ranges(_current_row_index,
_current_row_index + _chunk_reader->remaining_num_values(),
read_ranges);
if (read_ranges.size() == 0) {
// skip the whole page
_current_row_index += _chunk_reader->remaining_num_values();
RETURN_IF_ERROR(_chunk_reader->skip_page());
*read_rows = 0;
} else {
bool skip_whole_batch = false;
// Determining whether to skip page or batch will increase the calculation time.
// When the filtering effect is greater than 60%, it is possible to skip the page or batch.
if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) {
// lazy read
size_t remaining_num_values = 0;
for (auto& range : read_ranges) {
remaining_num_values += range.last_row - range.first_row;
}
if (batch_size >= remaining_num_values &&
select_vector.can_filter_all(remaining_num_values)) {
// We can skip the whole page if the remaining values is filtered by predicate columns
select_vector.skip(remaining_num_values);
_current_row_index += _chunk_reader->remaining_num_values();
RETURN_IF_ERROR(_chunk_reader->skip_page());
*read_rows = remaining_num_values;
if (!_chunk_reader->has_next_page()) {
*eof = true;
}
break;
}
skip_whole_batch = batch_size <= remaining_num_values &&
select_vector.can_filter_all(batch_size);
if (skip_whole_batch) {
select_vector.skip(batch_size);
}
}
// load page data to decode or skip values
RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
size_t has_read = 0;
for (auto& range : read_ranges) {
// generate the skipped values
size_t skip_values = range.first_row - _current_row_index;
RETURN_IF_ERROR(_skip_values(skip_values));
_current_row_index += skip_values;
// generate the read values
size_t read_values =
std::min((size_t)(range.last_row - range.first_row), batch_size - has_read);
if (skip_whole_batch) {
RETURN_IF_ERROR(_skip_values(read_values));
} else {
RETURN_IF_ERROR(_read_values(read_values, src_column, type, select_vector,
is_dict_filter));
}
has_read += read_values;
_current_row_index += read_values;
if (has_read == batch_size) {
break;
}
}
*read_rows = has_read;
}
if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) {
*eof = true;
}
} while (false);
if (need_convert) {
std::unique_ptr<ParquetConvert::ColumnConvert> converter;
ParquetConvert::ConvertParams convert_params;
convert_params.init(_field_schema, _ctz, doris_column->size());
RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, type,
&converter, &convert_params));
auto x = doris_column->assume_mutable();
RETURN_IF_ERROR(converter->convert(src_column, x));
}
if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) {
*eof = true;
}
return Status::OK();
}
@ -732,4 +759,4 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
return Status::OK();
}
}; // namespace doris::vectorized
}; // namespace doris::vectorized

View File

@ -288,4 +288,4 @@ private:
std::vector<std::unique_ptr<ParquetColumnReader>> _child_readers;
};
}; // namespace doris::vectorized
}; // namespace doris::vectorized

View File

@ -175,15 +175,8 @@ Status RowGroupReader::init(
bool RowGroupReader::_can_filter_by_dict(int slot_id,
const tparquet::ColumnMetaData& column_metadata) {
SlotDescriptor* slot = nullptr;
const std::vector<SlotDescriptor*>& slots = _tuple_descriptor->slots();
for (auto each : slots) {
if (each->id() == slot_id) {
slot = each;
break;
}
}
if (!slot->type().is_string_type()) {
if (column_metadata.encodings[0] != tparquet::Encoding::RLE_DICTIONARY ||
column_metadata.type != tparquet::Type::BYTE_ARRAY) {
return false;
}
@ -336,6 +329,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_
bool can_filter_all = false;
RETURN_IF_ERROR_OR_CATCH_EXCEPTION(VExprContext::execute_conjuncts(
_filter_conjuncts, &filters, block, &result_filter, &can_filter_all));
if (can_filter_all) {
for (auto& col : columns_to_filter) {
std::move(*block->get_by_position(col).column).assume_mutable()->clear();
@ -344,6 +338,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_
_convert_dict_cols_to_string_cols(block);
return Status::OK();
}
if (!_not_single_slot_filter_conjuncts.empty()) {
_convert_dict_cols_to_string_cols(block);
std::vector<IColumn::Filter*> merged_filters;
@ -362,7 +357,6 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_
RETURN_IF_CATCH_EXCEPTION(
RETURN_IF_ERROR(_filter_block(block, column_to_keep, columns_to_filter)));
}
*read_rows = block->rows();
return Status::OK();
}
@ -421,8 +415,10 @@ Status RowGroupReader::_read_column_data(Block* block, const std::vector<std::st
has_eof = true;
}
}
*read_rows = batch_read_rows;
*batch_eof = has_eof;
return Status::OK();
}
@ -1008,4 +1004,4 @@ ParquetColumnReader::Statistics RowGroupReader::statistics() {
return st;
}
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -19,31 +19,41 @@
#include <gen_cpp/Metrics_types.h>
#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/parquet_types.h>
#include <glog/logging.h>
#include <algorithm>
#include <functional>
#include <ostream>
#include <utility>
#include "common/status.h"
#include "exec/schema_scanner.h"
#include "gen_cpp/descriptors.pb.h"
#include "gtest/gtest_pred_impl.h"
#include "io/file_factory.h"
#include "io/fs/buffered_reader.h"
#include "io/fs/file_reader.h"
#include "io/fs/file_reader_writer_fwd.h"
#include "olap/olap_common.h"
#include "parquet_pred_cmp.h"
#include "parquet_thrift_util.h"
#include "runtime/define_primitive_type.h"
#include "runtime/descriptors.h"
#include "runtime/types.h"
#include "util/slice.h"
#include "util/timezone_utils.h"
#include "vec/columns/column.h"
#include "vec/common/typeid_cast.h"
#include "vec/exec/format/format_common.h"
#include "vec/core/block.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/core/types.h"
#include "vec/exec/format/parquet/parquet_common.h"
#include "vec/exec/format/parquet/schema_desc.h"
#include "vec/exec/format/parquet/vparquet_file_metadata.h"
#include "vec/exec/format/parquet/vparquet_group_reader.h"
#include "vec/exec/format/parquet/vparquet_page_index.h"
#include "vec/exprs/vbloom_predicate.h"
#include "vec/exprs/vexpr.h"
#include "vec/exprs/vexpr_context.h"
#include "vec/exprs/vin_predicate.h"
#include "vec/exprs/vruntimefilter_wrapper.h"
#include "vec/exprs/vslot_ref.h"
@ -520,15 +530,14 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof)
return Status::OK();
}
{
SCOPED_RAW_TIMER(&_statistics.column_read_time);
Status batch_st =
_current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof);
if (!batch_st.ok()) {
return Status::InternalError("Read parquet file {} failed, reason = {}",
_scan_range.path, batch_st.to_string());
}
SCOPED_RAW_TIMER(&_statistics.column_read_time);
Status batch_st =
_current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof);
if (!batch_st.ok()) {
return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path,
batch_st.to_string());
}
if (_row_group_eof) {
auto column_st = _current_group_reader->statistics();
_column_statistics.merge(column_st);
@ -897,4 +906,4 @@ int64_t ParquetReader::_get_column_start_offset(const tparquet::ColumnMetaData&
}
return column.data_page_offset;
}
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -168,6 +168,7 @@ vectorized::BlockUPtr ScannerContext::get_free_block() {
block = vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size,
true /*ignore invalid slots*/);
COUNTER_UPDATE(_newly_create_free_blocks_num, 1);
_serving_blocks_num++;

View File

@ -464,4 +464,4 @@ void ScannerScheduler::_task_group_scanner_scan(ScannerScheduler* scheduler,
}
}
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -1105,4 +1105,4 @@ Status VFileScanner::close(RuntimeState* state) {
return Status::OK();
}
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -1,16 +1,16 @@
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+

View File

@ -1,14 +1,14 @@
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18|
| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19| -3.140000000| c-row2| vc-row2| 2022-08-03| 2022-08-03| 2022-08-03 07:23:19|
| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17| 4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17|
| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17|
| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17|
| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17|
| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17|
| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17|
| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18.000000| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18|
| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19.000000| -3.140000000| c-row2| vc-row2| 2022-08-03| 2022-08-03| 2022-08-03 07:23:19|
| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17.000000| 4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17|
| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17.000000| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17|
| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17.000000| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17|
| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17.000000| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17|
| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17.000000| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17|
| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17.000000| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17|
| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17.000000| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+

View File

@ -59,6 +59,7 @@
#include "vec/core/column_with_type_and_name.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/exec/format/parquet/parquet_column_convert.h"
#include "vec/exec/format/parquet/parquet_common.h"
#include "vec/exec/format/parquet/parquet_thrift_util.h"
#include "vec/exec/format/parquet/schema_desc.h"
@ -167,8 +168,8 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) {
static int fill_nullable_column(ColumnPtr& doris_column, level_t* definitions, size_t num_values) {
CHECK(doris_column->is_nullable());
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
(*std::move(doris_column)).mutate().get());
auto* nullable_column = const_cast<vectorized::ColumnNullable*>(
static_cast<const vectorized::ColumnNullable*>(doris_column.get()));
NullMap& map_data = nullable_column->get_null_map_data();
int null_cnt = 0;
for (int i = 0; i < num_values; ++i) {
@ -189,6 +190,14 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
? chunk_meta.dictionary_page_offset
: chunk_meta.data_page_offset;
size_t chunk_size = chunk_meta.total_compressed_size;
bool need_convert = false;
auto& parquet_physical_type = column_chunk->meta_data.type;
auto& show_type = field_schema->type.type;
ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type,
doris_column, data_type, &need_convert);
io::BufferedFileStreamReader stream_reader(file_reader, start_offset, chunk_size, 1024);
cctz::time_zone ctz;
@ -208,14 +217,14 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
chunk_reader.get_def_levels(definitions, rows);
}
MutableColumnPtr data_column;
if (doris_column->is_nullable()) {
if (src_column->is_nullable()) {
// fill nullable values
fill_nullable_column(doris_column, definitions, rows);
auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
(*std::move(doris_column)).mutate().get());
fill_nullable_column(src_column, definitions, rows);
auto* nullable_column = const_cast<vectorized::ColumnNullable*>(
static_cast<const vectorized::ColumnNullable*>(src_column.get()));
data_column = nullable_column->get_nested_column_ptr();
} else {
data_column = doris_column->assume_mutable();
data_column = src_column->assume_mutable();
}
ColumnSelectVector run_length_map;
// decode page data
@ -223,7 +232,7 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
// required column
std::vector<u_short> null_map = {(u_short)rows};
run_length_map.set_run_length_null_map(null_map, rows, nullptr);
return chunk_reader.decode_values(data_column, data_type, run_length_map, false);
RETURN_IF_ERROR(chunk_reader.decode_values(data_column, data_type, run_length_map, false));
} else {
// column with null values
level_t level_type = definitions[0];
@ -254,8 +263,18 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
RETURN_IF_ERROR(
chunk_reader.decode_values(data_column, data_type, run_length_map, false));
}
return Status::OK();
}
if (need_convert) {
std::unique_ptr<ParquetConvert::ColumnConvert> converter;
ParquetConvert::ConvertParams convert_params;
convert_params.init(field_schema, &ctz, doris_column->size());
RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, data_type,
&converter, &convert_params));
auto x = doris_column->assume_mutable();
RETURN_IF_ERROR(converter->convert(src_column, x));
}
return Status::OK();
}
// Only the unit test depend on this, but it is wrong, should not use TTupleDesc to create tuple desc, not
@ -340,11 +359,11 @@ static void create_block(std::unique_ptr<vectorized::Block>& block) {
// binary is not supported, use string instead
{"binary_col", TYPE_STRING, sizeof(StringRef), true},
// 64-bit-length, see doris::get_slot_size in primitive_type.cpp
{"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true},
{"timestamp_col", TYPE_DATETIMEV2, sizeof(int128_t), true},
{"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true},
{"char_col", TYPE_CHAR, sizeof(StringRef), true},
{"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true},
{"date_col", TYPE_DATE, sizeof(int128_t), true},
{"date_col", TYPE_DATEV2, sizeof(uint32_t), true},
{"date_v2_col", TYPE_DATEV2, sizeof(uint32_t), true},
{"timestamp_v2_col", TYPE_DATETIMEV2, sizeof(int128_t), true, 18, 0}};
SchemaScanner schema_scanner(column_descs);
@ -448,118 +467,6 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) {
read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet",
"./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12);
}
TEST_F(ParquetThriftReaderTest, group_reader) {
std::vector<doris::SchemaScanner::ColumnDesc> column_descs = {
{"tinyint_col", TYPE_TINYINT, sizeof(int8_t), true},
{"smallint_col", TYPE_SMALLINT, sizeof(int16_t), true},
{"int_col", TYPE_INT, sizeof(int32_t), true},
{"bigint_col", TYPE_BIGINT, sizeof(int64_t), true},
{"boolean_col", TYPE_BOOLEAN, sizeof(bool), true},
{"float_col", TYPE_FLOAT, sizeof(float_t), true},
{"double_col", TYPE_DOUBLE, sizeof(double_t), true},
{"string_col", TYPE_STRING, sizeof(StringRef), true},
{"binary_col", TYPE_STRING, sizeof(StringRef), true},
{"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true},
{"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true},
{"char_col", TYPE_CHAR, sizeof(StringRef), true},
{"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true},
{"date_col", TYPE_DATE, sizeof(int128_t), true}};
SchemaScanner schema_scanner(column_descs);
ObjectPool object_pool;
doris::TupleDescriptor* tuple_desc = create_tuple_desc(&object_pool, column_descs);
auto tuple_slots = tuple_desc->slots();
TSlotDescriptor tslot_desc;
{
tslot_desc.id = 14;
tslot_desc.parent = 0;
TTypeDesc type;
{
TTypeNode node;
node.__set_type(TTypeNodeType::ARRAY);
std::vector<bool> contains_nulls {true};
node.__set_contains_nulls(contains_nulls);
TTypeNode inner;
inner.__set_type(TTypeNodeType::SCALAR);
TScalarType scalar_type;
scalar_type.__set_type(TPrimitiveType::STRING);
inner.__set_scalar_type(scalar_type);
inner.__set_contains_nulls(contains_nulls);
type.types.push_back(node);
type.types.push_back(inner);
}
tslot_desc.slotType = type;
tslot_desc.columnPos = 14;
tslot_desc.byteOffset = 0;
tslot_desc.nullIndicatorByte = 0;
tslot_desc.nullIndicatorBit = -1;
tslot_desc.colName = "list_string";
tslot_desc.slotIdx = 14;
tslot_desc.isMaterialized = true;
}
SlotDescriptor string_slot(tslot_desc);
tuple_slots.emplace_back(&string_slot);
std::vector<std::string> read_columns;
RowGroupReader::LazyReadContext lazy_read_ctx;
for (const auto& slot : tuple_slots) {
lazy_read_ctx.all_read_columns.emplace_back(slot->col_name());
read_columns.emplace_back(slot->col_name());
}
io::FileSystemSPtr local_fs = io::LocalFileSystem::create("");
io::FileReaderSPtr file_reader;
auto st = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet",
&file_reader);
EXPECT_TRUE(st.ok());
// prepare metadata
FileMetaData* meta_data;
size_t meta_size;
static_cast<void>(parse_thrift_footer(file_reader, &meta_data, &meta_size, nullptr));
tparquet::FileMetaData t_metadata = meta_data->to_thrift();
cctz::time_zone ctz;
TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz);
auto row_group = t_metadata.row_groups[0];
std::shared_ptr<RowGroupReader> row_group_reader;
RowGroupReader::PositionDeleteContext position_delete_ctx(row_group.num_rows, 0);
row_group_reader.reset(new RowGroupReader(file_reader, read_columns, 0, row_group, &ctz,
nullptr, position_delete_ctx, lazy_read_ctx,
nullptr));
std::vector<RowRange> row_ranges;
row_ranges.emplace_back(0, row_group.num_rows);
auto col_offsets = std::unordered_map<int, tparquet::OffsetIndex>();
auto stg = row_group_reader->init(meta_data->schema(), row_ranges, col_offsets, nullptr,
nullptr, nullptr, nullptr, nullptr);
EXPECT_TRUE(stg.ok());
vectorized::Block block;
for (const auto& slot_desc : tuple_slots) {
auto data_type =
vectorized::DataTypeFactory::instance().create_data_type(slot_desc->type(), true);
MutableColumnPtr data_column = data_type->create_column();
block.insert(
ColumnWithTypeAndName(std::move(data_column), data_type, slot_desc->col_name()));
}
bool batch_eof = false;
size_t read_rows = 0;
auto stb = row_group_reader->next_batch(&block, 1024, &read_rows, &batch_eof);
EXPECT_TRUE(stb.ok());
io::FileReaderSPtr result;
auto rst = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/group-reader.txt",
&result);
EXPECT_TRUE(rst.ok());
uint8_t result_buf[result->size() + 1];
result_buf[result->size()] = '\0';
size_t bytes_read;
Slice res(result_buf, result->size());
static_cast<void>(result->read_at(0, res, &bytes_read));
ASSERT_STREQ(block.dump_data(0, 10).c_str(), reinterpret_cast<char*>(result_buf));
delete meta_data;
}
} // namespace vectorized
} // namespace doris

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,228 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Regression test: reading Hive parquet tables after ALTER TABLE changed column
// types (schema evolution). Each table `parquet_alter_column_to_<T>` contains
// columns (named after their pre-alter types) whose type was altered to <T>;
// the parquet reader must decode the physical type written in the files and
// convert it to the table's current logical type. Runs only against the remote
// external Hive environment (p2), gated by `enableExternalHiveTest`.
suite("test_hive_parquet_alter_column", "p2,external,hive,external_remote,external_remote_hive") {
    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
    if (enabled != null && enabled.equalsIgnoreCase("true")) {
        String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
        String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
        String catalog_name = "test_hive_parquet_alter_column"
        // Recreate the HMS catalog from scratch so stale metadata cannot leak in.
        sql """drop catalog if exists ${catalog_name};"""
        sql """
            create catalog if not exists ${catalog_name} properties (
                'type'='hms',
                'hadoop.username' = 'hadoop',
                'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
            );
        """
        logger.info("catalog " + catalog_name + " created")
        sql """switch ${catalog_name};"""
        logger.info("switched to catalog " + catalog_name)
        // Deterministic ordering over every column so full-table snapshots are stable.
        String Orderby = """ order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_decimal,col_date,col_timestamp limit 7 """
        sql """ use multi_catalog """
        // All target types covered by the alter-column test tables.
        def types = ["int","smallint","tinyint","bigint","float","double","boolean","string","char","varchar","date","timestamp","decimal"]
        // Smoke-check every altered table: current schema, a full ordered scan,
        // and an ordered read of each individual converted column.
        for( String type1 in types) {
            qt_desc  """ desc parquet_alter_column_to_${type1} ; """
            qt_show  """ select * from parquet_alter_column_to_${type1}  ${Orderby} """
            for( String type2 in types) {
                qt_order """ select col_${type2}  from parquet_alter_column_to_${type1} order by col_${type2} limit 3 """
            }
        }
        // Per-table predicate checks: filter on each converted column to exercise
        // predicate evaluation (and any pushdown) against the post-alter type.
        // Query tag convention: order_qt_<targetType>_<columnName>.
        order_qt_int_int """ select col_int from parquet_alter_column_to_int where col_int>=2 order by col_int limit 3"""
        order_qt_int_smallint """ select col_smallint from parquet_alter_column_to_int where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_int_tinyint """ select col_tinyint from parquet_alter_column_to_int where col_tinyint>=3 order by col_tinyint limit 3"""
        order_qt_int_bigint """ select col_bigint from parquet_alter_column_to_int where col_bigint>=3 order by col_bigint limit 3"""
        order_qt_int_float """ select col_float from parquet_alter_column_to_int where col_float=2.6 order by col_float limit 3"""
        order_qt_int_double """ select col_double from parquet_alter_column_to_int where col_double=0.8 order by col_double limit 3"""
        order_qt_int_boolean """ select col_boolean from parquet_alter_column_to_int where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_int_string """ select col_string from parquet_alter_column_to_int where col_string="B" order by col_string limit 3"""
        order_qt_int_char """ select col_char from parquet_alter_column_to_int where col_char="B" order by col_char limit 3"""
        order_qt_int_varchar """ select col_varchar from parquet_alter_column_to_int where col_varchar="C" order by col_varchar limit 3"""
        order_qt_int_date """ select col_date from parquet_alter_column_to_int where year(col_date)=2023 order by col_date limit 3"""
        order_qt_int_timestamp """ select col_timestamp from parquet_alter_column_to_int where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_int_decimal """ select col_decimal from parquet_alter_column_to_int where col_decimal=1.1 order by col_decimal limit 3"""
        order_qt_smallint_int """ select col_int from parquet_alter_column_to_smallint where col_int>=1 order by col_int limit 3"""
        order_qt_smallint_smallint """ select col_smallint from parquet_alter_column_to_smallint where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_smallint_tinyint """ select col_tinyint from parquet_alter_column_to_smallint where col_tinyint>=2 order by col_tinyint limit 3"""
        order_qt_smallint_bigint """ select col_bigint from parquet_alter_column_to_smallint where col_bigint>=2 order by col_bigint limit 3"""
        order_qt_smallint_float """ select col_float from parquet_alter_column_to_smallint where col_float=3.0 order by col_float limit 3"""
        order_qt_smallint_double """ select col_double from parquet_alter_column_to_smallint where col_double=0.5 order by col_double limit 3"""
        order_qt_smallint_boolean """ select col_boolean from parquet_alter_column_to_smallint where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_smallint_string """ select col_string from parquet_alter_column_to_smallint where col_string="helloworld" order by col_string limit 3"""
        order_qt_smallint_char """ select col_char from parquet_alter_column_to_smallint where col_char="C" order by col_char limit 3"""
        order_qt_smallint_varchar """ select col_varchar from parquet_alter_column_to_smallint where col_varchar="A" order by col_varchar limit 3"""
        order_qt_smallint_date """ select col_date from parquet_alter_column_to_smallint where year(col_date)=2023 order by col_date limit 3"""
        order_qt_smallint_timestamp """ select col_timestamp from parquet_alter_column_to_smallint where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_smallint_decimal """ select col_decimal from parquet_alter_column_to_smallint where col_decimal=2.5 order by col_decimal limit 3"""
        order_qt_tinyint_int """ select col_int from parquet_alter_column_to_tinyint where col_int>=3 order by col_int limit 3"""
        order_qt_tinyint_smallint """ select col_smallint from parquet_alter_column_to_tinyint where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_tinyint_tinyint """ select col_tinyint from parquet_alter_column_to_tinyint where col_tinyint>=3 order by col_tinyint limit 3"""
        order_qt_tinyint_bigint """ select col_bigint from parquet_alter_column_to_tinyint where col_bigint>=1 order by col_bigint limit 3"""
        order_qt_tinyint_float """ select col_float from parquet_alter_column_to_tinyint where col_float=0.6 order by col_float limit 3"""
        order_qt_tinyint_double """ select col_double from parquet_alter_column_to_tinyint where col_double=1.1 order by col_double limit 3"""
        order_qt_tinyint_boolean """ select col_boolean from parquet_alter_column_to_tinyint where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_tinyint_string """ select col_string from parquet_alter_column_to_tinyint where col_string="helloworld" order by col_string limit 3"""
        order_qt_tinyint_char """ select col_char from parquet_alter_column_to_tinyint where col_char="A" order by col_char limit 3"""
        order_qt_tinyint_varchar """ select col_varchar from parquet_alter_column_to_tinyint where col_varchar="C" order by col_varchar limit 3"""
        order_qt_tinyint_date """ select col_date from parquet_alter_column_to_tinyint where year(col_date)=2023 order by col_date limit 3"""
        order_qt_tinyint_timestamp """ select col_timestamp from parquet_alter_column_to_tinyint where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_tinyint_decimal """ select col_decimal from parquet_alter_column_to_tinyint where col_decimal=1.4 order by col_decimal limit 3"""
        order_qt_bigint_int """ select col_int from parquet_alter_column_to_bigint where col_int>=3 order by col_int limit 3"""
        order_qt_bigint_smallint """ select col_smallint from parquet_alter_column_to_bigint where col_smallint>=2 order by col_smallint limit 3"""
        order_qt_bigint_tinyint """ select col_tinyint from parquet_alter_column_to_bigint where col_tinyint>=2 order by col_tinyint limit 3"""
        order_qt_bigint_bigint """ select col_bigint from parquet_alter_column_to_bigint where col_bigint>=1 order by col_bigint limit 3"""
        order_qt_bigint_float """ select col_float from parquet_alter_column_to_bigint where col_float=2.5 order by col_float limit 3"""
        order_qt_bigint_double """ select col_double from parquet_alter_column_to_bigint where col_double=0.2 order by col_double limit 3"""
        order_qt_bigint_boolean """ select col_boolean from parquet_alter_column_to_bigint where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_bigint_string """ select col_string from parquet_alter_column_to_bigint where col_string="A" order by col_string limit 3"""
        order_qt_bigint_char """ select col_char from parquet_alter_column_to_bigint where col_char="A" order by col_char limit 3"""
        order_qt_bigint_varchar """ select col_varchar from parquet_alter_column_to_bigint where col_varchar="A" order by col_varchar limit 3"""
        order_qt_bigint_date """ select col_date from parquet_alter_column_to_bigint where year(col_date)=2023 order by col_date limit 3"""
        order_qt_bigint_timestamp """ select col_timestamp from parquet_alter_column_to_bigint where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_bigint_decimal """ select col_decimal from parquet_alter_column_to_bigint where col_decimal=0.8 order by col_decimal limit 3"""
        order_qt_float_int """ select col_int from parquet_alter_column_to_float where col_int=1.4 order by col_int limit 3"""
        order_qt_float_smallint """ select col_smallint from parquet_alter_column_to_float where col_smallint=0.3 order by col_smallint limit 3"""
        order_qt_float_tinyint """ select col_tinyint from parquet_alter_column_to_float where col_tinyint=0.2 order by col_tinyint limit 3"""
        order_qt_float_bigint """ select col_bigint from parquet_alter_column_to_float where col_bigint=2.2 order by col_bigint limit 3"""
        order_qt_float_float """ select col_float from parquet_alter_column_to_float where col_float=1.2 order by col_float limit 3"""
        order_qt_float_double """ select col_double from parquet_alter_column_to_float where col_double=1.5 order by col_double limit 3"""
        order_qt_float_boolean """ select col_boolean from parquet_alter_column_to_float where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_float_string """ select col_string from parquet_alter_column_to_float where col_string="A" order by col_string limit 3"""
        order_qt_float_char """ select col_char from parquet_alter_column_to_float where col_char="helloworld" order by col_char limit 3"""
        order_qt_float_varchar """ select col_varchar from parquet_alter_column_to_float where col_varchar="1" order by col_varchar limit 3"""
        order_qt_float_date """ select col_date from parquet_alter_column_to_float where year(col_date)=2023 order by col_date limit 3"""
        order_qt_float_timestamp """ select col_timestamp from parquet_alter_column_to_float where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_float_decimal """ select col_decimal from parquet_alter_column_to_float where col_decimal=0.8 order by col_decimal limit 3"""
        order_qt_double_int """ select col_int from parquet_alter_column_to_double where col_int=2.0 order by col_int limit 3"""
        order_qt_double_smallint """ select col_smallint from parquet_alter_column_to_double where col_smallint=2.0 order by col_smallint limit 3"""
        order_qt_double_tinyint """ select col_tinyint from parquet_alter_column_to_double where col_tinyint=1.4 order by col_tinyint limit 3"""
        order_qt_double_bigint """ select col_bigint from parquet_alter_column_to_double where col_bigint=1.5 order by col_bigint limit 3"""
        order_qt_double_float """ select col_float from parquet_alter_column_to_double where col_float=2.2 order by col_float limit 3"""
        order_qt_double_double """ select col_double from parquet_alter_column_to_double where col_double=0.6 order by col_double limit 3"""
        order_qt_double_boolean """ select col_boolean from parquet_alter_column_to_double where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_double_string """ select col_string from parquet_alter_column_to_double where col_string="B" order by col_string limit 3"""
        order_qt_double_char """ select col_char from parquet_alter_column_to_double where col_char="A" order by col_char limit 3"""
        order_qt_double_varchar """ select col_varchar from parquet_alter_column_to_double where col_varchar="C" order by col_varchar limit 3"""
        order_qt_double_date """ select col_date from parquet_alter_column_to_double where year(col_date)=2023 order by col_date limit 3"""
        order_qt_double_timestamp """ select col_timestamp from parquet_alter_column_to_double where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_double_decimal """ select col_decimal from parquet_alter_column_to_double where col_decimal=0.3 order by col_decimal limit 3"""
        order_qt_boolean_int """ select col_int from parquet_alter_column_to_boolean where col_int>=3 order by col_int limit 3"""
        order_qt_boolean_smallint """ select col_smallint from parquet_alter_column_to_boolean where col_smallint>=2 order by col_smallint limit 3"""
        order_qt_boolean_tinyint """ select col_tinyint from parquet_alter_column_to_boolean where col_tinyint>=1 order by col_tinyint limit 3"""
        order_qt_boolean_bigint """ select col_bigint from parquet_alter_column_to_boolean where col_bigint>=3 order by col_bigint limit 3"""
        order_qt_boolean_float """ select col_float from parquet_alter_column_to_boolean where col_float=1.1 order by col_float limit 3"""
        order_qt_boolean_double """ select col_double from parquet_alter_column_to_boolean where col_double=0.5 order by col_double limit 3"""
        order_qt_boolean_boolean """ select col_boolean from parquet_alter_column_to_boolean where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_boolean_string """ select col_string from parquet_alter_column_to_boolean where col_string="1" order by col_string limit 3"""
        order_qt_boolean_char """ select col_char from parquet_alter_column_to_boolean where col_char="A" order by col_char limit 3"""
        order_qt_boolean_varchar """ select col_varchar from parquet_alter_column_to_boolean where col_varchar="B" order by col_varchar limit 3"""
        order_qt_boolean_date """ select col_date from parquet_alter_column_to_boolean where year(col_date)=2023 order by col_date limit 3"""
        order_qt_boolean_timestamp """ select col_timestamp from parquet_alter_column_to_boolean where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_boolean_decimal """ select col_decimal from parquet_alter_column_to_boolean where col_decimal=2.8 order by col_decimal limit 3"""
        order_qt_string_int """ select col_int from parquet_alter_column_to_string where col_int="C" order by col_int limit 3"""
        order_qt_string_smallint """ select col_smallint from parquet_alter_column_to_string where col_smallint="C" order by col_smallint limit 3"""
        order_qt_string_tinyint """ select col_tinyint from parquet_alter_column_to_string where col_tinyint="B" order by col_tinyint limit 3"""
        order_qt_string_bigint """ select col_bigint from parquet_alter_column_to_string where col_bigint="helloworld" order by col_bigint limit 3"""
        order_qt_string_float """ select col_float from parquet_alter_column_to_string where col_float="1" order by col_float limit 3"""
        order_qt_string_double """ select col_double from parquet_alter_column_to_string where col_double="C" order by col_double limit 3"""
        order_qt_string_boolean """ select col_boolean from parquet_alter_column_to_string where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_string_string """ select col_string from parquet_alter_column_to_string where col_string="B" order by col_string limit 3"""
        order_qt_string_char """ select col_char from parquet_alter_column_to_string where col_char="A" order by col_char limit 3"""
        order_qt_string_varchar """ select col_varchar from parquet_alter_column_to_string where col_varchar="B" order by col_varchar limit 3"""
        order_qt_string_date """ select col_date from parquet_alter_column_to_string where col_date="helloworld" order by col_date limit 3"""
        order_qt_string_timestamp """ select col_timestamp from parquet_alter_column_to_string where col_timestamp="B" order by col_timestamp limit 3"""
        order_qt_string_decimal """ select col_decimal from parquet_alter_column_to_string where col_decimal="1" order by col_decimal limit 3"""
        order_qt_char_int """ select col_int from parquet_alter_column_to_char where col_int="B" order by col_int limit 3"""
        order_qt_char_smallint """ select col_smallint from parquet_alter_column_to_char where col_smallint="A" order by col_smallint limit 3"""
        order_qt_char_tinyint """ select col_tinyint from parquet_alter_column_to_char where col_tinyint="A" order by col_tinyint limit 3"""
        order_qt_char_bigint """ select col_bigint from parquet_alter_column_to_char where col_bigint="B" order by col_bigint limit 3"""
        order_qt_char_float """ select col_float from parquet_alter_column_to_char where col_float="C" order by col_float limit 3"""
        order_qt_char_double """ select col_double from parquet_alter_column_to_char where col_double="A" order by col_double limit 3"""
        order_qt_char_boolean """ select col_boolean from parquet_alter_column_to_char where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_char_string """ select col_string from parquet_alter_column_to_char where col_string="C" order by col_string limit 3"""
        order_qt_char_char """ select col_char from parquet_alter_column_to_char where col_char="A" order by col_char limit 3"""
        order_qt_char_varchar """ select col_varchar from parquet_alter_column_to_char where col_varchar="B" order by col_varchar limit 3"""
        order_qt_char_date """ select col_date from parquet_alter_column_to_char where col_date="B" order by col_date limit 3"""
        order_qt_char_timestamp """ select col_timestamp from parquet_alter_column_to_char where col_timestamp="A" order by col_timestamp limit 3"""
        order_qt_char_decimal """ select col_decimal from parquet_alter_column_to_char where col_decimal="C" order by col_decimal limit 3"""
        order_qt_varchar_int """ select col_int from parquet_alter_column_to_varchar where col_int="B" order by col_int limit 3"""
        order_qt_varchar_smallint """ select col_smallint from parquet_alter_column_to_varchar where col_smallint="helloworld" order by col_smallint limit 3"""
        order_qt_varchar_tinyint """ select col_tinyint from parquet_alter_column_to_varchar where col_tinyint="A" order by col_tinyint limit 3"""
        order_qt_varchar_bigint """ select col_bigint from parquet_alter_column_to_varchar where col_bigint="helloworld" order by col_bigint limit 3"""
        order_qt_varchar_float """ select col_float from parquet_alter_column_to_varchar where col_float="1" order by col_float limit 3"""
        order_qt_varchar_double """ select col_double from parquet_alter_column_to_varchar where col_double="B" order by col_double limit 3"""
        order_qt_varchar_boolean """ select col_boolean from parquet_alter_column_to_varchar where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_varchar_string """ select col_string from parquet_alter_column_to_varchar where col_string="A" order by col_string limit 3"""
        order_qt_varchar_char """ select col_char from parquet_alter_column_to_varchar where col_char="B" order by col_char limit 3"""
        order_qt_varchar_varchar """ select col_varchar from parquet_alter_column_to_varchar where col_varchar="B" order by col_varchar limit 3"""
        order_qt_varchar_date """ select col_date from parquet_alter_column_to_varchar where col_date="C" order by col_date limit 3"""
        order_qt_varchar_timestamp """ select col_timestamp from parquet_alter_column_to_varchar where col_timestamp="C" order by col_timestamp limit 3"""
        order_qt_varchar_decimal """ select col_decimal from parquet_alter_column_to_varchar where col_decimal="helloworld" order by col_decimal limit 3"""
        order_qt_date_int """ select col_int from parquet_alter_column_to_date where col_int>=3 order by col_int limit 3"""
        order_qt_date_smallint """ select col_smallint from parquet_alter_column_to_date where col_smallint>=1 order by col_smallint limit 3"""
        order_qt_date_tinyint """ select col_tinyint from parquet_alter_column_to_date where col_tinyint>=3 order by col_tinyint limit 3"""
        order_qt_date_bigint """ select col_bigint from parquet_alter_column_to_date where col_bigint>=1 order by col_bigint limit 3"""
        order_qt_date_float """ select col_float from parquet_alter_column_to_date where col_float=2.8 order by col_float limit 3"""
        order_qt_date_double """ select col_double from parquet_alter_column_to_date where col_double=2.5 order by col_double limit 3"""
        order_qt_date_boolean """ select col_boolean from parquet_alter_column_to_date where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_date_string """ select col_string from parquet_alter_column_to_date where col_string="helloworld" order by col_string limit 3"""
        order_qt_date_char """ select col_char from parquet_alter_column_to_date where col_char="A" order by col_char limit 3"""
        order_qt_date_varchar """ select col_varchar from parquet_alter_column_to_date where col_varchar="1" order by col_varchar limit 3"""
        order_qt_date_date """ select col_date from parquet_alter_column_to_date where year(col_date)=2023 order by col_date limit 3"""
        order_qt_date_timestamp """ select col_timestamp from parquet_alter_column_to_date where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_date_decimal """ select col_decimal from parquet_alter_column_to_date where col_decimal=0.3 order by col_decimal limit 3"""
        order_qt_timestamp_int """ select col_int from parquet_alter_column_to_timestamp where col_int>=3 order by col_int limit 3"""
        order_qt_timestamp_smallint """ select col_smallint from parquet_alter_column_to_timestamp where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_timestamp_tinyint """ select col_tinyint from parquet_alter_column_to_timestamp where col_tinyint>=1 order by col_tinyint limit 3"""
        order_qt_timestamp_bigint """ select col_bigint from parquet_alter_column_to_timestamp where col_bigint>=3 order by col_bigint limit 3"""
        order_qt_timestamp_float """ select col_float from parquet_alter_column_to_timestamp where col_float=2.4 order by col_float limit 3"""
        order_qt_timestamp_double """ select col_double from parquet_alter_column_to_timestamp where col_double=1.3 order by col_double limit 3"""
        order_qt_timestamp_boolean """ select col_boolean from parquet_alter_column_to_timestamp where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_timestamp_string """ select col_string from parquet_alter_column_to_timestamp where col_string="C" order by col_string limit 3"""
        order_qt_timestamp_char """ select col_char from parquet_alter_column_to_timestamp where col_char="B" order by col_char limit 3"""
        order_qt_timestamp_varchar """ select col_varchar from parquet_alter_column_to_timestamp where col_varchar="C" order by col_varchar limit 3"""
        order_qt_timestamp_date """ select col_date from parquet_alter_column_to_timestamp where year(col_date)=2023 order by col_date limit 3"""
        order_qt_timestamp_timestamp """ select col_timestamp from parquet_alter_column_to_timestamp where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_timestamp_decimal """ select col_decimal from parquet_alter_column_to_timestamp where col_decimal=1.3 order by col_decimal limit 3"""
        order_qt_decimal_int """ select col_int from parquet_alter_column_to_decimal where col_int=2.8 order by col_int limit 3"""
        order_qt_decimal_smallint """ select col_smallint from parquet_alter_column_to_decimal where col_smallint=0.1 order by col_smallint limit 3"""
        order_qt_decimal_tinyint """ select col_tinyint from parquet_alter_column_to_decimal where col_tinyint=2.9 order by col_tinyint limit 3"""
        order_qt_decimal_bigint """ select col_bigint from parquet_alter_column_to_decimal where col_bigint=2.3 order by col_bigint limit 3"""
        order_qt_decimal_float """ select col_float from parquet_alter_column_to_decimal where col_float=2.5 order by col_float limit 3"""
        order_qt_decimal_double """ select col_double from parquet_alter_column_to_decimal where col_double=1.7 order by col_double limit 3"""
        order_qt_decimal_boolean """ select col_boolean from parquet_alter_column_to_decimal where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_decimal_string """ select col_string from parquet_alter_column_to_decimal where col_string="helloworld" order by col_string limit 3"""
        order_qt_decimal_char """ select col_char from parquet_alter_column_to_decimal where col_char="helloworld" order by col_char limit 3"""
        order_qt_decimal_varchar """ select col_varchar from parquet_alter_column_to_decimal where col_varchar="helloworld" order by col_varchar limit 3"""
        order_qt_decimal_date """ select col_date from parquet_alter_column_to_decimal where year(col_date)=2023 order by col_date limit 3"""
        order_qt_decimal_timestamp """ select col_timestamp from parquet_alter_column_to_decimal where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_decimal_decimal """ select col_decimal from parquet_alter_column_to_decimal where col_decimal=1.5 order by col_decimal limit 3"""
    }
}