[feature](hive)Support hive tables after alter type. (#25138)
1. Reconstruct the parquet decode logic: the parquet reader first reads data according to the parquet physical type, and then performs a type conversion to the target Doris logical type. 2. Support Hive ALTER TABLE: tables remain readable after a column's type has been changed.
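The sketch below is illustrative only; the helper names are hypothetical, not the PR's API. It models the flow the message describes: first read raw values using the parquet physical type, then convert them to the table's logical type, so a Hive column altered from INT to BIGINT stays readable from old files.

// Illustrative sketch only: names are hypothetical, not the PR's API.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Step 1: read values exactly as stored in the file (PLAIN-encoded INT32 here).
std::vector<int32_t> read_physical_int32(const uint8_t* data, size_t n) {
    std::vector<int32_t> raw(n);
    std::memcpy(raw.data(), data, n * sizeof(int32_t));
    return raw;
}

// Step 2: convert to the logical type the table now declares (BIGINT).
std::vector<int64_t> convert_to_int64(const std::vector<int32_t>& raw) {
    return {raw.begin(), raw.end()}; // implicit widening per value
}

int main() {
    const int32_t file_values[] = {1, 2, 3};
    auto raw = read_physical_int32(reinterpret_cast<const uint8_t*>(file_values), 3);
    for (int64_t v : convert_to_int64(raw)) std::printf("%lld ", (long long)v);
    std::printf("\n");
}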
@@ -410,4 +410,4 @@ protected:
    MutablePtr shallow_mutate() const {
        return MutablePtr(static_cast<Derived*>(Base::shallow_mutate().get()));
    }
};
};
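Aside (illustrative, not from the patch): shallow_mutate() above is the copy-on-write hook, and the CRTP static_cast returns the derived column type. A stripped-down model of the pattern, with hypothetical names:

#include <cstdint>
#include <cstdio>
#include <memory>

template <typename Derived>
struct BaseColumn {
    std::shared_ptr<BaseColumn> shallow_mutate() const {
        // In the real code this clones only when the column is shared.
        return std::make_shared<Derived>(static_cast<const Derived&>(*this));
    }
};

struct Int32Column : BaseColumn<Int32Column> {
    int32_t value = 42;
};

int main() {
    Int32Column col;
    auto copy = std::static_pointer_cast<Int32Column>(col.shallow_mutate());
    std::printf("%d\n", copy->value); // 42
}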
@@ -125,56 +125,35 @@ Status ByteArrayDictDecoder::_decode_values(MutableColumnPtr& doris_column, Data
        return _decode_dict_values<has_filter>(doris_column, select_vector, is_dict_filter);
    }

    TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
    switch (logical_type) {
    case TypeIndex::String:
        [[fallthrough]];
    case TypeIndex::FixedString: {
        size_t dict_index = 0;

        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                std::vector<StringRef> string_values;
                string_values.reserve(run_length);
                for (size_t i = 0; i < run_length; ++i) {
                    string_values.emplace_back(_dict_items[_indexes[dict_index++]]);
                }
                doris_column->insert_many_strings_overflow(&string_values[0], run_length,
                                                           _max_value_length);
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                doris_column->insert_many_defaults(run_length);
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }
    case TypeIndex::Decimal32:
        return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type, select_vector);
    case TypeIndex::Decimal64:
        return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type, select_vector);
    case TypeIndex::Decimal128:
        return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
    case TypeIndex::Decimal128I:
        return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
    // TODO: decimal256
    default:
        break;
    }
    return Status::InvalidArgument(
            "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}",
            getTypeName(logical_type));
    return Status::OK();
}
} // namespace doris::vectorized

@@ -66,97 +66,10 @@ public:
    MutableColumnPtr convert_dict_column_to_string_column(const ColumnInt32* dict_column) override;

protected:
    template <typename DecimalPrimitiveType, bool has_filter>
    Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                  ColumnSelectVector& select_vector);

    // For dictionary encoding
    std::vector<StringRef> _dict_items;
    std::vector<uint8_t> _dict_data;
    size_t _max_value_length;
    std::unordered_map<StringRef, int32_t> _dict_value_to_code;

private:
    template <typename DecimalPrimitiveType, bool has_filter,
              DecimalScaleParams::ScaleType ScaleType>
    Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                           ColumnSelectVector& select_vector);
};

template <typename DecimalPrimitiveType, bool has_filter>
Status ByteArrayDictDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
                                                    DataTypePtr& data_type,
                                                    ColumnSelectVector& select_vector) {
    init_decimal_converter<DecimalPrimitiveType>(data_type);
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;
    if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_UP>(
                doris_column, data_type, select_vector);
    } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_DOWN>(
                doris_column, data_type, select_vector);
    } else {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::NO_SCALE>(
                doris_column, data_type, select_vector);
    }
}

template <typename DecimalPrimitiveType, bool has_filter, DecimalScaleParams::ScaleType ScaleType>
Status ByteArrayDictDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column,
                                                             DataTypePtr& data_type,
                                                             ColumnSelectVector& select_vector) {
    auto& column_data =
            static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    size_t dict_index = 0;
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                StringRef& slice = _dict_items[_indexes[dict_index++]];
                char* buf_start = const_cast<char*>(slice.data);
                uint32_t length = (uint32_t)slice.size;
                // When Decimal in parquet is stored in byte arrays, binary and fixed,
                // the unscaled number must be encoded as two's complement using big-endian byte order.
                DecimalPrimitiveType value = 0;
                memcpy(reinterpret_cast<char*>(&value), buf_start, length);
                value = BitUtil::big_endian_to_host(value);
                value = value >> ((sizeof(value) - length) * 8);
                if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                    value *= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                    value /= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                    // do nothing
                } else {
                    LOG(FATAL) << "__builtin_unreachable";
                    __builtin_unreachable();
                }
                auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
                v = (DecimalPrimitiveType)value;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            dict_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}
} // namespace doris::vectorized

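For reference, the big-endian two's-complement handling above can be exercised in isolation. A self-contained sketch follows (hand-written input bytes; BitUtil::big_endian_to_host is replaced by manual byte assembly, so this is an assumption-laden model, not the patch's code):

#include <cstdint>
#include <cstdio>

int32_t decode_be_twos_complement(const unsigned char* buf, uint32_t length) {
    int32_t value = 0;
    for (uint32_t i = 0; i < length; ++i) {
        value = (value << 8) | buf[i]; // big-endian: most significant byte first
    }
    // Sign-extend from `length` bytes to 4 bytes (mirrors the shift trick above).
    int shift = (sizeof(value) - length) * 8;
    return (value << shift) >> shift;
}

int main() {
    const unsigned char pos[] = {0x30, 0x39}; // unscaled 12345 -> 123.45 at scale 2
    const unsigned char neg[] = {0xCF, 0xC7}; // unscaled -12345
    std::printf("%d %d\n", decode_be_twos_complement(pos, 2),
                decode_be_twos_complement(neg, 2)); // 12345 -12345
}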
@@ -56,74 +56,53 @@ template <bool has_filter>
Status ByteArrayPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                             ColumnSelectVector& select_vector,
                                             bool is_dict_filter) {
    TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
    switch (logical_type) {
    case TypeIndex::String:
        [[fallthrough]];
    case TypeIndex::FixedString: {
        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                std::vector<StringRef> string_values;
                string_values.reserve(run_length);
                for (size_t i = 0; i < run_length; ++i) {
                    if (UNLIKELY(_offset + 4 > _data->size)) {
                        return Status::IOError("Can't read byte array length from plain decoder");
                    }
                    uint32_t length = decode_fixed32_le(
                            reinterpret_cast<const uint8_t*>(_data->data) + _offset);
                    _offset += 4;
                    if (UNLIKELY(_offset + length > _data->size)) {
                        return Status::IOError("Can't read enough bytes in plain decoder");
                    }
                    string_values.emplace_back(_data->data + _offset, length);
                    _offset += length;
                }
                doris_column->insert_many_strings(&string_values[0], run_length);
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                doris_column->insert_many_defaults(run_length);
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                for (int i = 0; i < run_length; ++i) {
                    if (UNLIKELY(_offset + 4 > _data->size)) {
                        return Status::IOError("Can't read byte array length from plain decoder");
                    }
                    uint32_t length = decode_fixed32_le(
                            reinterpret_cast<const uint8_t*>(_data->data) + _offset);
                    _offset += 4;
                    if (UNLIKELY(_offset + length > _data->size)) {
                        return Status::IOError("Can't read enough bytes in plain decoder");
                    }
                    _offset += length;
                }
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }
    case TypeIndex::Decimal32:
        return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type, select_vector);
    case TypeIndex::Decimal64:
        return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type, select_vector);
    case TypeIndex::Decimal128:
        return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
    case TypeIndex::Decimal128I:
        return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type, select_vector);
    // TODO: decimal256
    default:
        break;
    }
    return Status::InvalidArgument(
            "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}",
            getTypeName(logical_type));
    return Status::OK();
}
} // namespace doris::vectorized

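The loop above walks the PLAIN BYTE_ARRAY layout: each value is a 4-byte little-endian length followed by that many bytes. A minimal standalone sketch (hand-written buffer; decode_fixed32_le is reimplemented here as an assumption about the LE prefix):

#include <cstdint>
#include <cstdio>
#include <cstring>

uint32_t decode_fixed32_le(const uint8_t* p) {
    uint32_t v;
    std::memcpy(&v, p, 4); // parquet stores the length little-endian
    return v;
}

int main() {
    // Two values, "hive" and "doris", each preceded by a 4-byte LE length.
    const uint8_t data[] = {4, 0, 0, 0, 'h', 'i', 'v', 'e',
                            5, 0, 0, 0, 'd', 'o', 'r', 'i', 's'};
    size_t offset = 0;
    while (offset < sizeof(data)) {
        uint32_t len = decode_fixed32_le(data + offset);
        offset += 4;
        std::printf("%.*s\n", (int)len, (const char*)data + offset);
        offset += len;
    }
}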
@@ -56,97 +56,5 @@ public:
                         ColumnSelectVector& select_vector, bool is_dict_filter);

    Status skip_values(size_t num_values) override;

protected:
    template <typename DecimalPrimitiveType, bool has_filter>
    Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                  ColumnSelectVector& select_vector);

private:
    template <typename DecimalPrimitiveType, bool has_filter,
              DecimalScaleParams::ScaleType ScaleType>
    Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                           ColumnSelectVector& select_vector);
};

template <typename DecimalPrimitiveType, bool has_filter>
Status ByteArrayPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
                                                     DataTypePtr& data_type,
                                                     ColumnSelectVector& select_vector) {
    init_decimal_converter<DecimalPrimitiveType>(data_type);
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;
    if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_UP>(
                doris_column, data_type, select_vector);
    } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::SCALE_DOWN>(
                doris_column, data_type, select_vector);
    } else {
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter,
                                               DecimalScaleParams::NO_SCALE>(
                doris_column, data_type, select_vector);
    }
}

template <typename DecimalPrimitiveType, bool has_filter, DecimalScaleParams::ScaleType ScaleType>
Status ByteArrayPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column,
                                                              DataTypePtr& data_type,
                                                              ColumnSelectVector& select_vector) {
    auto& column_data =
            static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                if (UNLIKELY(_offset + 4 > _data->size)) {
                    return Status::IOError("Can't read byte array length from plain decoder");
                }
                uint32_t length =
                        decode_fixed32_le(reinterpret_cast<const uint8_t*>(_data->data) + _offset);
                _offset += 4;
                char* buf_start = _data->data + _offset;
                _offset += length;
                // When Decimal in parquet is stored in byte arrays, binary and fixed,
                // the unscaled number must be encoded as two's complement using big-endian byte order.
                DecimalPrimitiveType value = 0;
                memcpy(reinterpret_cast<char*>(&value), buf_start, length);
                value = BitUtil::big_endian_to_host(value);
                value = value >> ((sizeof(value) - length) * 8);
                if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                    value *= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                    value /= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                    // do nothing
                } else {
                    LOG(FATAL) << "__builtin_unreachable";
                    __builtin_unreachable();
                }
                auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
                v = (DecimalPrimitiveType)value;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}
} // namespace doris::vectorized

@@ -31,8 +31,6 @@

namespace doris::vectorized {

const cctz::time_zone DecodeParams::utc0 = cctz::utc_time_zone();

Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type encoding,
                            std::unique_ptr<Decoder>& decoder) {
    switch (encoding) {
@@ -45,17 +43,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
        decoder.reset(new ByteArrayPlainDecoder());
        break;
    case tparquet::Type::INT32:
        [[fallthrough]];
        decoder.reset(new FixLengthPlainDecoder<tparquet::Type::INT32>());
        break;
    case tparquet::Type::INT64:
        [[fallthrough]];
        decoder.reset(new FixLengthPlainDecoder<tparquet::Type::INT64>());
        break;
    case tparquet::Type::INT96:
        [[fallthrough]];
        decoder.reset(new FixLengthPlainDecoder<tparquet::Type::INT96>());
        break;
    case tparquet::Type::FLOAT:
        [[fallthrough]];
        decoder.reset(new FixLengthPlainDecoder<tparquet::Type::FLOAT>());
        break;
    case tparquet::Type::DOUBLE:
        [[fallthrough]];
        decoder.reset(new FixLengthPlainDecoder<tparquet::Type::DOUBLE>());
        break;
    case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
        decoder.reset(new FixLengthPlainDecoder(type));
        decoder.reset(new FixLengthPlainDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY>());
        break;
    default:
        return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder",
@@ -70,22 +73,22 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
        decoder.reset(new ByteArrayDictDecoder());
        break;
    case tparquet::Type::INT32:
        decoder.reset(new FixLengthDictDecoder<Int32>(type));
        decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT32>());
        break;
    case tparquet::Type::INT64:
        decoder.reset(new FixLengthDictDecoder<Int64>(type));
        decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT64>());
        break;
    case tparquet::Type::INT96:
        decoder.reset(new FixLengthDictDecoder<ParquetInt96>(type));
        decoder.reset(new FixLengthDictDecoder<tparquet::Type::INT96>());
        break;
    case tparquet::Type::FLOAT:
        decoder.reset(new FixLengthDictDecoder<Float32>(type));
        decoder.reset(new FixLengthDictDecoder<tparquet::Type::FLOAT>());
        break;
    case tparquet::Type::DOUBLE:
        decoder.reset(new FixLengthDictDecoder<Float64>(type));
        decoder.reset(new FixLengthDictDecoder<tparquet::Type::DOUBLE>());
        break;
    case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
        decoder.reset(new FixLengthDictDecoder<char*>(type));
        decoder.reset(new FixLengthDictDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY>());
        break;
    default:
        return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder",
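The pattern in the two switches above replaces a runtime physical-type constructor argument with a compile-time template parameter. A generic sketch of why that helps (illustrative names, not the PR's classes):

#include <cstddef>
#include <cstdio>

enum class PhysType { INT32, INT64 };

template <PhysType P>
struct FixLengthDecoder {
    // The physical type is known at compile time, so per-type logic
    // resolves statically and dead branches are discarded.
    static constexpr size_t value_size() {
        if constexpr (P == PhysType::INT32) {
            return 4;
        } else {
            return 8; // PhysType::INT64
        }
    }
};

int main() {
    static_assert(FixLengthDecoder<PhysType::INT32>::value_size() == 4);
    static_assert(FixLengthDecoder<PhysType::INT64>::value_size() == 8);
    std::puts("sizes resolved at compile time");
}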
@@ -106,10 +109,10 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
        // Supports only INT32 and INT64.
        switch (type) {
        case tparquet::Type::INT32:
            decoder.reset(new DeltaBitPackDecoder<Int32>(type));
            decoder.reset(new DeltaBitPackDecoder<int32_t, tparquet::Type::INT32>());
            break;
        case tparquet::Type::INT64:
            decoder.reset(new DeltaBitPackDecoder<Int64>(type));
            decoder.reset(new DeltaBitPackDecoder<int64_t, tparquet::Type::INT64>());
            break;
        default:
            return Status::InternalError("DELTA_BINARY_PACKED only supports INT32 and INT64");
@@ -118,7 +121,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
    case tparquet::Encoding::DELTA_BYTE_ARRAY:
        switch (type) {
        case tparquet::Type::BYTE_ARRAY:
            decoder.reset(new DeltaByteArrayDecoder(type));
            decoder.reset(new DeltaByteArrayDecoder<tparquet::Type::BYTE_ARRAY>());
            break;
        default:
            return Status::InternalError("DELTA_BYTE_ARRAY only supports BYTE_ARRAY.");
@@ -127,7 +130,7 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
    case tparquet::Encoding::DELTA_LENGTH_BYTE_ARRAY:
        switch (type) {
        case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
            decoder.reset(new DeltaLengthByteArrayDecoder(type));
            decoder.reset(new DeltaLengthByteArrayDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY>());
            break;
        default:
            return Status::InternalError(
@@ -141,47 +144,4 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
    return Status::OK();
}

void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) {
    _field_schema = field_schema;
    if (_decode_params == nullptr) {
        _decode_params.reset(new DecodeParams());
    }
    if (ctz != nullptr) {
        _decode_params->ctz = ctz;
    }
    const auto& schema = field_schema->parquet_schema;
    if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) {
        const auto& timestamp_info = schema.logicalType.TIMESTAMP;
        if (!timestamp_info.isAdjustedToUTC) {
            // should set timezone to utc+0
            _decode_params->ctz = const_cast<cctz::time_zone*>(&_decode_params->utc0);
        }
        const auto& time_unit = timestamp_info.unit;
        if (time_unit.__isset.MILLIS) {
            _decode_params->second_mask = 1000;
            _decode_params->scale_to_nano_factor = 1000000;
        } else if (time_unit.__isset.MICROS) {
            _decode_params->second_mask = 1000000;
            _decode_params->scale_to_nano_factor = 1000;
        } else if (time_unit.__isset.NANOS) {
            _decode_params->second_mask = 1000000000;
            _decode_params->scale_to_nano_factor = 1;
        }
    } else if (schema.__isset.converted_type) {
        const auto& converted_type = schema.converted_type;
        if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) {
            _decode_params->second_mask = 1000;
            _decode_params->scale_to_nano_factor = 1000000;
        } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) {
            _decode_params->second_mask = 1000000;
            _decode_params->scale_to_nano_factor = 1000;
        }
    }

    if (_decode_params->ctz) {
        VecDateTimeValue t;
        t.from_unixtime(0, *_decode_params->ctz);
        _decode_params->offset_days = t.day() == 31 ? -1 : 0; // If 1969-12-31, then returns -1.
    }
}
} // namespace doris::vectorized

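The second_mask and scale_to_nano_factor fields set above split a stored timestamp integer into whole seconds and a nanosecond remainder. A worked example with my own values, mirroring the MICROS branch (not code from the patch):

#include <cstdint>
#include <cstdio>

int main() {
    // Parquet TIMESTAMP(MICROS): second_mask = 1000000, scale_to_nano_factor = 1000.
    const int64_t second_mask = 1000000;
    const int64_t scale_to_nano_factor = 1000;

    int64_t raw = 1696000000123456;                               // microseconds since epoch
    int64_t seconds = raw / second_mask;                          // 1696000000
    int64_t nanos = (raw % second_mask) * scale_to_nano_factor;   // 123456000
    std::printf("%lld s + %lld ns\n", (long long)seconds, (long long)nanos);
}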
@@ -54,29 +54,6 @@ class ColumnString;

namespace doris::vectorized {

#define FOR_LOGICAL_NUMERIC_TYPES(M) \
    M(TypeIndex::Int8, Int8, Int32) \
    M(TypeIndex::UInt8, UInt8, Int32) \
    M(TypeIndex::Int16, Int16, Int32) \
    M(TypeIndex::UInt16, UInt16, Int32) \
    M(TypeIndex::Int32, Int32, Int32) \
    M(TypeIndex::UInt32, UInt32, Int32) \
    M(TypeIndex::Int64, Int64, Int64) \
    M(TypeIndex::UInt64, UInt64, Int64) \
    M(TypeIndex::Float32, Float32, Float32) \
    M(TypeIndex::Float64, Float64, Float64)

struct DecodeParams {
    // schema.logicalType.TIMESTAMP.isAdjustedToUTC == false
    static const cctz::time_zone utc0;
    // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone
    cctz::time_zone* ctz = nullptr;
    int32_t offset_days = 0;
    int64_t second_mask = 1;
    int64_t scale_to_nano_factor = 1;
    DecimalScaleParams decimal_scale;
};

class Decoder {
public:
    Decoder() = default;
@@ -94,11 +71,6 @@ public:
        _offset = 0;
    }

    void init(FieldSchema* field_schema, cctz::time_zone* ctz);

    template <typename DecimalPrimitiveType>
    void init_decimal_converter(DataTypePtr& data_type);

    // Write the decoded values batch to doris's column
    virtual Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                 ColumnSelectVector& select_vector, bool is_dict_filter) = 0;
@@ -126,34 +98,8 @@ protected:
    int32_t _type_length;
    Slice* _data = nullptr;
    uint32_t _offset = 0;
    FieldSchema* _field_schema = nullptr;
    std::unique_ptr<DecodeParams> _decode_params = nullptr;
};

template <typename DecimalPrimitiveType>
void Decoder::init_decimal_converter(DataTypePtr& data_type) {
    if (_decode_params == nullptr || _field_schema == nullptr ||
        _decode_params->decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) {
        return;
    }
    auto scale = _field_schema->parquet_schema.scale;
    auto* decimal_type = reinterpret_cast<DataTypeDecimal<Decimal<DecimalPrimitiveType>>*>(
            const_cast<IDataType*>(remove_nullable(data_type).get()));
    auto dest_scale = decimal_type->get_scale();
    if (dest_scale > scale) {
        _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_UP;
        _decode_params->decimal_scale.scale_factor =
                DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dest_scale - scale);
    } else if (dest_scale < scale) {
        _decode_params->decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN;
        _decode_params->decimal_scale.scale_factor =
                DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dest_scale);
    } else {
        _decode_params->decimal_scale.scale_type = DecimalScaleParams::NO_SCALE;
        _decode_params->decimal_scale.scale_factor = 1;
    }
}

class BaseDictDecoder : public Decoder {
public:
    BaseDictDecoder() = default;

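A worked example of the scale adjustment above (standalone sketch; the get_scale_factor helper is mimicked here, not called): a parquet column with scale 2 read into a Doris decimal with scale 4 must SCALE_UP by 10^2.

#include <cstdint>
#include <cstdio>

int64_t pow10(int n) {
    int64_t f = 1;
    while (n-- > 0) f *= 10;
    return f;
}

int main() {
    int parquet_scale = 2; // unscaled 12345 means 123.45
    int dest_scale = 4;    // the Doris column is DECIMAL(p, 4)
    int64_t scale_factor = pow10(dest_scale - parquet_scale); // 100
    int64_t unscaled = 12345;
    std::printf("%lld\n", (long long)(unscaled * scale_factor)); // 1234500 == 123.4500
}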
@@ -1,283 +0,0 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "delta_bit_pack_decoder.h"

#include <string.h>

#include <algorithm>
#include <string_view>

#include "vec/columns/column.h"
#include "vec/common/arithmetic_overflow.h"
#include "vec/common/string_ref.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_nullable.h"

namespace doris::vectorized {

template <typename T>
Status DeltaBitPackDecoder<T>::_init_header() {
    if (!_bit_reader->GetVlqInt(&_values_per_block) ||
        !_bit_reader->GetVlqInt(&_mini_blocks_per_block) ||
        !_bit_reader->GetVlqInt(&_total_value_count) ||
        !_bit_reader->GetZigZagVlqInt(&_last_value)) {
        return Status::IOError("Init header eof");
    }
    if (_values_per_block == 0) {
        return Status::InvalidArgument("Cannot have zero value per block");
    }
    if (_values_per_block % 128 != 0) {
        return Status::InvalidArgument(
                "the number of values in a block must be multiple of 128, but it's " +
                std::to_string(_values_per_block));
    }
    if (_mini_blocks_per_block == 0) {
        return Status::InvalidArgument("Cannot have zero miniblock per block");
    }
    _values_per_mini_block = _values_per_block / _mini_blocks_per_block;
    if (_values_per_mini_block == 0) {
        return Status::InvalidArgument("Cannot have zero value per miniblock");
    }
    if (_values_per_mini_block % 32 != 0) {
        return Status::InvalidArgument(
                "The number of values in a miniblock must be multiple of 32, but it's " +
                std::to_string(_values_per_mini_block));
    }
    _total_values_remaining = _total_value_count;
    _delta_bit_widths.resize(_mini_blocks_per_block);
    // init as empty property
    _block_initialized = false;
    _values_remaining_current_mini_block = 0;
    return Status::OK();
}

template <typename T>
Status DeltaBitPackDecoder<T>::_init_block() {
    DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF";
    if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) {
        return Status::IOError("Init block eof");
    }

    // read the bitwidth of each miniblock
    uint8_t* bit_width_data = _delta_bit_widths.data();
    for (uint32_t i = 0; i < _mini_blocks_per_block; ++i) {
        if (!_bit_reader->GetAligned<uint8_t>(1, bit_width_data + i)) {
            return Status::IOError("Decode bit-width EOF");
        }
        // Note that non-conformant bitwidth entries are allowed by the Parquet spec
        // for extraneous miniblocks in the last block (GH-14923), so we check
        // the bitwidths when actually using them (see InitMiniBlock()).
    }
    _mini_block_idx = 0;
    _block_initialized = true;
    RETURN_IF_ERROR(_init_mini_block(bit_width_data[0]));
    return Status::OK();
}

template <typename T>
Status DeltaBitPackDecoder<T>::_init_mini_block(int bit_width) {
    if (PREDICT_FALSE(bit_width > kMaxDeltaBitWidth)) {
        return Status::InvalidArgument("delta bit width larger than integer bit width");
    }
    _delta_bit_width = bit_width;
    _values_remaining_current_mini_block = _values_per_mini_block;
    return Status::OK();
}

template <typename T>
Status DeltaBitPackDecoder<T>::_get_internal(T* buffer, int num_values, int* out_num_values) {
    num_values = static_cast<int>(std::min<int64_t>(num_values, _total_values_remaining));
    if (num_values == 0) {
        *out_num_values = 0;
        return Status::OK();
    }
    int i = 0;
    while (i < num_values) {
        if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) {
            if (PREDICT_FALSE(!_block_initialized)) {
                buffer[i++] = _last_value;
                DCHECK_EQ(i, 1); // we're at the beginning of the page
                if (i == num_values) {
                    // When block is uninitialized and i reaches num_values we have two
                    // different possibilities:
                    // 1. _total_value_count == 1, which means that the page may have only
                    //    one value (encoded in the header), and we should not initialize
                    //    any block.
                    // 2. _total_value_count != 1, which means we should initialize the
                    //    incoming block for subsequent reads.
                    if (_total_value_count != 1) {
                        RETURN_IF_ERROR(_init_block());
                    }
                    break;
                }
                RETURN_IF_ERROR(_init_block());
            } else {
                ++_mini_block_idx;
                if (_mini_block_idx < _mini_blocks_per_block) {
                    RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx]));
                } else {
                    RETURN_IF_ERROR(_init_block());
                }
            }
        }

        int values_decode = std::min(_values_remaining_current_mini_block,
                                     static_cast<uint32_t>(num_values - i));
        for (int j = 0; j < values_decode; ++j) {
            if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) {
                return Status::IOError("Get batch EOF");
            }
        }
        for (int j = 0; j < values_decode; ++j) {
            // Addition between min_delta, packed int and last_value should be treated as
            // unsigned addition. Overflow is as expected.
            buffer[i + j] = static_cast<UT>(_min_delta) + static_cast<UT>(buffer[i + j]) +
                            static_cast<UT>(_last_value);
            _last_value = buffer[i + j];
        }
        _values_remaining_current_mini_block -= values_decode;
        i += values_decode;
    }
    _total_values_remaining -= num_values;

    if (PREDICT_FALSE(_total_values_remaining == 0)) {
        if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) {
            return Status::IOError("Skip padding EOF");
        }
        _values_remaining_current_mini_block = 0;
    }
    *out_num_values = num_values;
    return Status::OK();
}

void DeltaLengthByteArrayDecoder::_decode_lengths() {
    _len_decoder.set_bit_reader(_bit_reader);
    // get the number of encoded lengths
    int num_length = _len_decoder.valid_values_count();
    _buffered_length.resize(num_length);

    // decode all the lengths. all the lengths are buffered in buffered_length_.
    int ret;
    Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret);
    if (!st.ok()) {
        LOG(FATAL) << "Fail to decode delta length, status: " << st;
    }
    DCHECK_EQ(ret, num_length);
    _length_idx = 0;
    _num_valid_values = num_length;
}

Status DeltaLengthByteArrayDecoder::_get_internal(Slice* buffer, int max_values,
                                                  int* out_num_values) {
    // Decode up to `max_values` strings into an internal buffer
    // and reference them into `buffer`.
    max_values = std::min(max_values, _num_valid_values);
    if (max_values == 0) {
        *out_num_values = 0;
        return Status::OK();
    }

    int32_t data_size = 0;
    const int32_t* length_ptr = _buffered_length.data() + _length_idx;
    for (int i = 0; i < max_values; ++i) {
        int32_t len = length_ptr[i];
        if (PREDICT_FALSE(len < 0)) {
            return Status::InvalidArgument("Negative string delta length");
        }
        buffer[i].size = len;
        if (common::add_overflow(data_size, len, data_size)) {
            return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
        }
    }
    _length_idx += max_values;

    _buffered_data.resize(data_size);
    char* data_ptr = _buffered_data.data();
    for (int j = 0; j < data_size; j++) {
        if (!_bit_reader->GetValue(8, data_ptr + j)) {
            return Status::IOError("Get length bytes EOF");
        }
    }

    for (int i = 0; i < max_values; ++i) {
        buffer[i].data = data_ptr;
        data_ptr += buffer[i].size;
    }
    // this->num_values_ -= max_values;
    _num_valid_values -= max_values;
    *out_num_values = max_values;
    return Status::OK();
}

Status DeltaByteArrayDecoder::_get_internal(Slice* buffer, int max_values, int* out_num_values) {
    // Decode up to `max_values` strings into an internal buffer
    // and reference them into `buffer`.
    max_values = std::min(max_values, _num_valid_values);
    if (max_values == 0) {
        *out_num_values = max_values;
        return Status::OK();
    }

    int suffix_read;
    RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read));
    if (PREDICT_FALSE(suffix_read != max_values)) {
        return Status::IOError("Read {}, expecting {} from suffix decoder",
                               std::to_string(suffix_read), std::to_string(max_values));
    }

    int64_t data_size = 0;
    const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset;
    for (int i = 0; i < max_values; ++i) {
        if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
            return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
        }
        if (PREDICT_FALSE(common::add_overflow(data_size, static_cast<int64_t>(prefix_len_ptr[i]),
                                               data_size) ||
                          common::add_overflow(data_size, static_cast<int64_t>(buffer[i].size),
                                               data_size))) {
            return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
        }
    }
    _buffered_data.resize(data_size);

    std::string_view prefix {_last_value};

    char* data_ptr = _buffered_data.data();
    for (int i = 0; i < max_values; ++i) {
        if (PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) {
            return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
        }
        memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
        // buffer[i] currently points to the string suffix
        memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size);
        buffer[i].data = data_ptr;
        buffer[i].size += prefix_len_ptr[i];
        data_ptr += buffer[i].size;
        prefix = std::string_view {buffer[i].data, buffer[i].size};
    }
    _prefix_len_offset += max_values;
    _num_valid_values -= max_values;
    _last_value = std::string {prefix};

    if (_num_valid_values == 0) {
        _last_value_in_previous_page = _last_value;
    }
    *out_num_values = max_values;
    return Status::OK();
}
} // namespace doris::vectorized

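A minimal sketch of the DELTA_BINARY_PACKED reconstruction performed in _get_internal above (hand-rolled inputs, no parquet reader involved): each output equals the previous value plus the block's min_delta plus the bit-packed delta, with the additions done on unsigned types so wraparound is well defined.

#include <cstdint>
#include <cstdio>

int main() {
    int32_t last_value = 7;        // first value, taken from the page header
    int32_t min_delta = -2;        // from the block header
    uint32_t packed[] = {0, 3, 1}; // miniblock payload (already unpacked)

    std::printf("%d", last_value);
    for (uint32_t p : packed) {
        last_value = static_cast<int32_t>(static_cast<uint32_t>(min_delta) + p +
                                          static_cast<uint32_t>(last_value));
        std::printf(" %d", last_value); // prints 5, then 6, then 5
    }
    std::printf("\n"); // full output: 7 5 6 5
}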
@@ -51,14 +51,10 @@ public:
        return _type_converted_decoder->skip_values(num_values);
    }

    template <bool has_filter>
    template <tparquet::Type::type PhysicalType, bool has_filter>
    Status decode_byte_array(const std::vector<Slice>& decoded_vals, MutableColumnPtr& doris_column,
                             DataTypePtr& data_type, ColumnSelectVector& select_vector) {
        TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
        switch (logical_type) {
        case TypeIndex::String:
            [[fallthrough]];
        case TypeIndex::FixedString: {
            if constexpr (PhysicalType == tparquet::Type::BYTE_ARRAY) {
                ColumnSelectVector::DataReadType read_type;
                while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
                    switch (read_type) {
@@ -88,21 +84,14 @@ public:
                    }
                }
                _current_value_idx = 0;
                return Status::OK();
            }
        }
        default:
            break;
        }
        return Status::InvalidArgument(
                "Can't decode parquet physical type BYTE_ARRAY to doris logical type {}",
                getTypeName(logical_type));
        return Status::OK();
    }

protected:
    void init_values_converter() {
        _type_converted_decoder->set_data(_data);
        _type_converted_decoder->set_type_length(_type_length);
        _type_converted_decoder->init(_field_schema, _decode_params->ctz);
    }
    // Convert decoded value to doris type value.
    std::unique_ptr<Decoder> _type_converted_decoder;
@@ -117,13 +106,12 @@ protected:
 * Block
 * [min delta] [list of bitwidths of the mini blocks] [miniblocks]
 */
template <typename T>
template <typename T, tparquet::Type::type PhysicalType>
class DeltaBitPackDecoder final : public DeltaDecoder {
public:
    using UT = std::make_unsigned_t<T>;

    DeltaBitPackDecoder(const tparquet::Type::type& physical_type)
            : DeltaDecoder(new FixLengthPlainDecoder(physical_type)) {}
    DeltaBitPackDecoder() : DeltaDecoder(new FixLengthPlainDecoder<PhysicalType>()) {}
    ~DeltaBitPackDecoder() override = default;
    Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                         ColumnSelectVector& select_vector, bool is_dict_filter) override {
@@ -200,16 +188,13 @@ private:
    // _values_remaining_current_mini_block may be greater than _total_values_remaining.
    uint32_t _values_remaining_current_mini_block;
};
template class DeltaBitPackDecoder<int32_t>;
template class DeltaBitPackDecoder<int64_t>;

//template class DeltaBitPackDecoder<int32_t>;
//template class DeltaBitPackDecoder<int64_t>;
template <tparquet::Type::type PhysicalType>
class DeltaLengthByteArrayDecoder final : public DeltaDecoder {
public:
    explicit DeltaLengthByteArrayDecoder(const tparquet::Type::type& physical_type)
            : DeltaDecoder(nullptr),
              _len_decoder(physical_type),
              _buffered_length(0),
              _buffered_data(0) {}
    explicit DeltaLengthByteArrayDecoder()
            : DeltaDecoder(nullptr), _len_decoder(), _buffered_length(0), _buffered_data(0) {}

    Status skip_values(size_t num_values) override {
        _current_value_idx += num_values;
@@ -240,7 +225,8 @@ public:
            return Status::IOError("Expected to decode {} values, but decoded {} values.",
                                   num_values - null_count, num_valid_values);
        }
        return decode_byte_array<has_filter>(_values, doris_column, data_type, select_vector);
        return decode_byte_array<PhysicalType, has_filter>(_values, doris_column, data_type,
                                                           select_vector);
    }

    Status decode(Slice* buffer, int num_values, int* out_num_values) {
@@ -270,7 +256,7 @@ private:

    std::vector<Slice> _values;
    std::shared_ptr<BitReader> _bit_reader;
    DeltaBitPackDecoder<int32_t> _len_decoder;
    DeltaBitPackDecoder<int32_t, PhysicalType> _len_decoder;

    int _num_valid_values;
    uint32_t _length_idx;
@@ -278,14 +264,11 @@ private:
    std::vector<char> _buffered_data;
};

template <tparquet::Type::type PhysicalType>
class DeltaByteArrayDecoder : public DeltaDecoder {
public:
    explicit DeltaByteArrayDecoder(const tparquet::Type::type& physical_type)
            : DeltaDecoder(nullptr),
              _prefix_len_decoder(physical_type),
              _suffix_decoder(physical_type),
              _buffered_prefix_length(0),
              _buffered_data(0) {}
    explicit DeltaByteArrayDecoder()
            : DeltaDecoder(nullptr), _buffered_prefix_length(0), _buffered_data(0) {}

    Status skip_values(size_t num_values) override {
        _current_value_idx += num_values;
@@ -312,7 +295,8 @@ public:
        int num_valid_values;
        RETURN_IF_ERROR(_get_internal(_values.data(), num_values - null_count, &num_valid_values));
        DCHECK_EQ(num_values - null_count, num_valid_values);
        return decode_byte_array<has_filter>(_values, doris_column, data_type, select_vector);
        return decode_byte_array<PhysicalType, has_filter>(_values, doris_column, data_type,
                                                           select_vector);
    }

    void set_data(Slice* slice) override {
@@ -350,8 +334,8 @@ private:

    std::vector<Slice> _values;
    std::shared_ptr<BitReader> _bit_reader;
    DeltaBitPackDecoder<int32_t> _prefix_len_decoder;
    DeltaLengthByteArrayDecoder _suffix_decoder;
    DeltaBitPackDecoder<int32_t, PhysicalType> _prefix_len_decoder;
    DeltaLengthByteArrayDecoder<PhysicalType> _suffix_decoder;
    std::string _last_value;
    // string buffer for last value in previous page
    std::string _last_value_in_previous_page;
@@ -361,3 +345,260 @@ private:
    std::vector<char> _buffered_data;
};
} // namespace doris::vectorized

namespace doris::vectorized {

template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_init_header() {
    if (!_bit_reader->GetVlqInt(&_values_per_block) ||
        !_bit_reader->GetVlqInt(&_mini_blocks_per_block) ||
        !_bit_reader->GetVlqInt(&_total_value_count) ||
        !_bit_reader->GetZigZagVlqInt(&_last_value)) {
        return Status::IOError("Init header eof");
    }
    if (_values_per_block == 0) {
        return Status::InvalidArgument("Cannot have zero value per block");
    }
    if (_values_per_block % 128 != 0) {
        return Status::InvalidArgument(
                "the number of values in a block must be multiple of 128, but it's " +
                std::to_string(_values_per_block));
    }
    if (_mini_blocks_per_block == 0) {
        return Status::InvalidArgument("Cannot have zero miniblock per block");
    }
    _values_per_mini_block = _values_per_block / _mini_blocks_per_block;
    if (_values_per_mini_block == 0) {
        return Status::InvalidArgument("Cannot have zero value per miniblock");
    }
    if (_values_per_mini_block % 32 != 0) {
        return Status::InvalidArgument(
                "The number of values in a miniblock must be multiple of 32, but it's " +
                std::to_string(_values_per_mini_block));
    }
    _total_values_remaining = _total_value_count;
    _delta_bit_widths.resize(_mini_blocks_per_block);
    // init as empty property
    _block_initialized = false;
    _values_remaining_current_mini_block = 0;
    return Status::OK();
}

template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_init_block() {
    DCHECK_GT(_total_values_remaining, 0) << "InitBlock called at EOF";
    if (!_bit_reader->GetZigZagVlqInt(&_min_delta)) {
        return Status::IOError("Init block eof");
    }

    // read the bitwidth of each miniblock
    uint8_t* bit_width_data = _delta_bit_widths.data();
    for (uint32_t i = 0; i < _mini_blocks_per_block; ++i) {
        if (!_bit_reader->GetAligned<uint8_t>(1, bit_width_data + i)) {
            return Status::IOError("Decode bit-width EOF");
        }
        // Note that non-conformant bitwidth entries are allowed by the Parquet spec
        // for extraneous miniblocks in the last block (GH-14923), so we check
        // the bitwidths when actually using them (see InitMiniBlock()).
    }
    _mini_block_idx = 0;
    _block_initialized = true;
    RETURN_IF_ERROR(_init_mini_block(bit_width_data[0]));
    return Status::OK();
}

template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_init_mini_block(int bit_width) {
    if (PREDICT_FALSE(bit_width > kMaxDeltaBitWidth)) {
        return Status::InvalidArgument("delta bit width larger than integer bit width");
    }
    _delta_bit_width = bit_width;
    _values_remaining_current_mini_block = _values_per_mini_block;
    return Status::OK();
}

template <typename T, tparquet::Type::type PhysicalType>
Status DeltaBitPackDecoder<T, PhysicalType>::_get_internal(T* buffer, int num_values,
                                                           int* out_num_values) {
    num_values = static_cast<int>(std::min<int64_t>(num_values, _total_values_remaining));
    if (num_values == 0) {
        *out_num_values = 0;
        return Status::OK();
    }
    int i = 0;
    while (i < num_values) {
        if (PREDICT_FALSE(_values_remaining_current_mini_block == 0)) {
            if (PREDICT_FALSE(!_block_initialized)) {
                buffer[i++] = _last_value;
                DCHECK_EQ(i, 1); // we're at the beginning of the page
                if (i == num_values) {
                    // When block is uninitialized and i reaches num_values we have two
                    // different possibilities:
                    // 1. _total_value_count == 1, which means that the page may have only
                    //    one value (encoded in the header), and we should not initialize
                    //    any block.
                    // 2. _total_value_count != 1, which means we should initialize the
                    //    incoming block for subsequent reads.
                    if (_total_value_count != 1) {
                        RETURN_IF_ERROR(_init_block());
                    }
                    break;
                }
                RETURN_IF_ERROR(_init_block());
            } else {
                ++_mini_block_idx;
                if (_mini_block_idx < _mini_blocks_per_block) {
                    RETURN_IF_ERROR(_init_mini_block(_delta_bit_widths.data()[_mini_block_idx]));
                } else {
                    RETURN_IF_ERROR(_init_block());
                }
            }
        }

        int values_decode = std::min(_values_remaining_current_mini_block,
                                     static_cast<uint32_t>(num_values - i));
        for (int j = 0; j < values_decode; ++j) {
            if (!_bit_reader->GetValue(_delta_bit_width, buffer + i + j)) {
                return Status::IOError("Get batch EOF");
            }
        }
        for (int j = 0; j < values_decode; ++j) {
            // Addition between min_delta, packed int and last_value should be treated as
            // unsigned addition. Overflow is as expected.
            buffer[i + j] = static_cast<UT>(_min_delta) + static_cast<UT>(buffer[i + j]) +
                            static_cast<UT>(_last_value);
            _last_value = buffer[i + j];
        }
        _values_remaining_current_mini_block -= values_decode;
        i += values_decode;
    }
    _total_values_remaining -= num_values;

    if (PREDICT_FALSE(_total_values_remaining == 0)) {
        if (!_bit_reader->Advance(_delta_bit_width * _values_remaining_current_mini_block)) {
            return Status::IOError("Skip padding EOF");
        }
        _values_remaining_current_mini_block = 0;
    }
    *out_num_values = num_values;
    return Status::OK();
}

template <tparquet::Type::type PhysicalType>
void DeltaLengthByteArrayDecoder<PhysicalType>::_decode_lengths() {
    _len_decoder.set_bit_reader(_bit_reader);
    // get the number of encoded lengths
    int num_length = _len_decoder.valid_values_count();
    _buffered_length.resize(num_length);

    // decode all the lengths. all the lengths are buffered in buffered_length_.
    int ret;
    Status st = _len_decoder.decode(_buffered_length.data(), num_length, &ret);
    if (!st.ok()) {
        LOG(FATAL) << "Fail to decode delta length, status: " << st;
    }
    DCHECK_EQ(ret, num_length);
    _length_idx = 0;
    _num_valid_values = num_length;
}

template <tparquet::Type::type PhysicalType>
Status DeltaLengthByteArrayDecoder<PhysicalType>::_get_internal(Slice* buffer, int max_values,
                                                                int* out_num_values) {
    // Decode up to `max_values` strings into an internal buffer
    // and reference them into `buffer`.
    max_values = std::min(max_values, _num_valid_values);
    if (max_values == 0) {
        *out_num_values = 0;
        return Status::OK();
    }

    int32_t data_size = 0;
    const int32_t* length_ptr = _buffered_length.data() + _length_idx;
    for (int i = 0; i < max_values; ++i) {
        int32_t len = length_ptr[i];
        if (PREDICT_FALSE(len < 0)) {
            return Status::InvalidArgument("Negative string delta length");
        }
        buffer[i].size = len;
        if (common::add_overflow(data_size, len, data_size)) {
            return Status::InvalidArgument("Excess expansion in DELTA_(LENGTH_)BYTE_ARRAY");
        }
    }
    _length_idx += max_values;

    _buffered_data.resize(data_size);
    char* data_ptr = _buffered_data.data();
    for (int j = 0; j < data_size; j++) {
        if (!_bit_reader->GetValue(8, data_ptr + j)) {
            return Status::IOError("Get length bytes EOF");
        }
    }

    for (int i = 0; i < max_values; ++i) {
        buffer[i].data = data_ptr;
        data_ptr += buffer[i].size;
    }
    // this->num_values_ -= max_values;
    _num_valid_values -= max_values;
    *out_num_values = max_values;
    return Status::OK();
}

template <tparquet::Type::type PhysicalType>
Status DeltaByteArrayDecoder<PhysicalType>::_get_internal(Slice* buffer, int max_values,
                                                          int* out_num_values) {
    // Decode up to `max_values` strings into an internal buffer
    // and reference them into `buffer`.
    max_values = std::min(max_values, _num_valid_values);
    if (max_values == 0) {
        *out_num_values = max_values;
        return Status::OK();
    }

    int suffix_read;
    RETURN_IF_ERROR(_suffix_decoder.decode(buffer, max_values, &suffix_read));
    if (PREDICT_FALSE(suffix_read != max_values)) {
        return Status::IOError("Read {}, expecting {} from suffix decoder",
                               std::to_string(suffix_read), std::to_string(max_values));
    }

    int64_t data_size = 0;
    const int32_t* prefix_len_ptr = _buffered_prefix_length.data() + _prefix_len_offset;
    for (int i = 0; i < max_values; ++i) {
        if (PREDICT_FALSE(prefix_len_ptr[i] < 0)) {
            return Status::InvalidArgument("negative prefix length in DELTA_BYTE_ARRAY");
        }
        if (PREDICT_FALSE(common::add_overflow(data_size, static_cast<int64_t>(prefix_len_ptr[i]),
                                               data_size) ||
                          common::add_overflow(data_size, static_cast<int64_t>(buffer[i].size),
                                               data_size))) {
            return Status::InvalidArgument("excess expansion in DELTA_BYTE_ARRAY");
        }
    }
    _buffered_data.resize(data_size);

    std::string_view prefix {_last_value};

    char* data_ptr = _buffered_data.data();
    for (int i = 0; i < max_values; ++i) {
        if (PREDICT_FALSE(static_cast<size_t>(prefix_len_ptr[i]) > prefix.length())) {
            return Status::InvalidArgument("prefix length too large in DELTA_BYTE_ARRAY");
        }
        memcpy(data_ptr, prefix.data(), prefix_len_ptr[i]);
        // buffer[i] currently points to the string suffix
        memcpy(data_ptr + prefix_len_ptr[i], buffer[i].data, buffer[i].size);
        buffer[i].data = data_ptr;
        buffer[i].size += prefix_len_ptr[i];
        data_ptr += buffer[i].size;
        prefix = std::string_view {buffer[i].data, buffer[i].size};
    }
    _prefix_len_offset += max_values;
    _num_valid_values -= max_values;
    _last_value = std::string {prefix};

    if (_num_valid_values == 0) {
        _last_value_in_previous_page = _last_value;
    }
    *out_num_values = max_values;
    return Status::OK();
}
} // namespace doris::vectorized

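A standalone sketch of the DELTA_BYTE_ARRAY reconstruction in _get_internal above (inputs are hand-written, not read from a page): each string is the previous string's first prefix_len bytes plus its own suffix.

#include <cstdio>
#include <string>
#include <vector>

int main() {
    std::vector<int> prefix_len = {0, 5, 5};
    std::vector<std::string> suffix = {"apple", "_pie", "s"};

    std::string prev;
    for (size_t i = 0; i < suffix.size(); ++i) {
        std::string cur = prev.substr(0, prefix_len[i]) + suffix[i];
        std::printf("%s\n", cur.c_str()); // apple, apple_pie, apples
        prev = cur;
    }
}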
@@ -25,11 +25,10 @@

namespace doris::vectorized {

template <typename T>
template <tparquet::Type::type PhysicalType>
class FixLengthDictDecoder final : public BaseDictDecoder {
public:
    FixLengthDictDecoder(tparquet::Type::type physical_type)
            : BaseDictDecoder(), _physical_type(physical_type) {};
    FixLengthDictDecoder() : BaseDictDecoder() {};
    ~FixLengthDictDecoder() override = default;

    Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
@ -73,95 +72,7 @@ public:
|
||||
return _decode_dict_values<has_filter>(doris_column, select_vector, is_dict_filter);
|
||||
}
|
||||
|
||||
TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
|
||||
switch (logical_type) {
|
||||
#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \
|
||||
case NUMERIC_TYPE: \
|
||||
if constexpr (!std::is_same_v<T, ParquetInt96>) { \
|
||||
return _decode_numeric<CPP_NUMERIC_TYPE, T, has_filter>(doris_column, select_vector); \
|
||||
}
|
||||
FOR_LOGICAL_NUMERIC_TYPES(DISPATCH)
|
||||
#undef DISPATCH
|
||||
case TypeIndex::Date:
|
||||
if constexpr (std::is_same_v<T, Int32>) {
|
||||
return _decode_date<VecDateTimeValue, Int64, has_filter>(doris_column,
|
||||
select_vector);
|
||||
}
|
||||
break;
|
||||
case TypeIndex::DateV2:
|
||||
if constexpr (std::is_same_v<T, Int32>) {
|
||||
return _decode_date<DateV2Value<DateV2ValueType>, UInt32, has_filter>(
|
||||
doris_column, select_vector);
|
||||
}
|
||||
break;
|
||||
case TypeIndex::DateTime:
|
||||
if constexpr (std::is_same_v<T, ParquetInt96>) {
|
||||
return _decode_datetime96<VecDateTimeValue, Int64, has_filter>(doris_column,
|
||||
select_vector);
|
||||
} else if constexpr (std::is_same_v<T, Int64>) {
|
||||
return _decode_datetime64<VecDateTimeValue, Int64, has_filter>(doris_column,
|
||||
select_vector);
|
||||
}
|
||||
break;
|
||||
case TypeIndex::DateTimeV2:
|
||||
// Spark can set the timestamp precision by the following configuration:
|
||||
// spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
|
||||
if constexpr (std::is_same_v<T, ParquetInt96>) {
|
||||
return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
|
||||
doris_column, select_vector);
|
||||
} else if constexpr (std::is_same_v<T, Int64>) {
|
||||
return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
|
||||
doris_column, select_vector);
|
||||
}
|
||||
break;
|
||||
case TypeIndex::Decimal32:
|
||||
if constexpr (std::is_same_v<T, Int32>) {
|
||||
return _decode_primitive_decimal<Int32, Int32, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
} else if constexpr (std::is_same_v<T, Int64>) {
|
||||
return _decode_primitive_decimal<Int32, Int64, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
}
|
||||
break;
|
||||
case TypeIndex::Decimal64:
|
||||
if constexpr (std::is_same_v<T, Int32>) {
|
||||
return _decode_primitive_decimal<Int64, Int32, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
} else if constexpr (std::is_same_v<T, Int64>) {
|
||||
return _decode_primitive_decimal<Int64, Int64, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
}
|
||||
break;
|
||||
case TypeIndex::Decimal128:
|
||||
if constexpr (std::is_same_v<T, Int32>) {
|
||||
return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
} else if constexpr (std::is_same_v<T, Int64>) {
|
||||
return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
}
|
||||
break;
|
||||
case TypeIndex::Decimal128I:
|
||||
if constexpr (std::is_same_v<T, Int32>) {
|
||||
return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
} else if constexpr (std::is_same_v<T, Int64>) {
|
||||
return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
|
||||
select_vector);
|
||||
}
|
||||
break;
|
||||
// TODO: decimal256
|
||||
case TypeIndex::String:
|
||||
[[fallthrough]];
|
||||
case TypeIndex::FixedString:
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return Status::InvalidArgument(
|
||||
"Can't decode parquet physical type {} to doris logical type {}",
|
||||
tparquet::to_string(_physical_type), getTypeName(logical_type));
|
||||
return _decode_numeric<has_filter>(doris_column, select_vector);
|
||||
}
|
||||
|
||||
    Status set_dict(std::unique_ptr<uint8_t[]>& dict, int32_t length, size_t num_values) override {
@ -172,26 +83,27 @@ public:
        char* dict_item_address = reinterpret_cast<char*>(_dict.get());
        _dict_items.resize(num_values);
        for (size_t i = 0; i < num_values; ++i) {
            _dict_items[i] = *(T*)dict_item_address;
            _dict_items[i] = *(DataType*)dict_item_address;
            dict_item_address += _type_length;
        }
        return Status::OK();
    }

protected:
    template <typename Numeric, typename PhysicalType, bool has_filter>
    template <bool has_filter>
    Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
        auto& column_data = static_cast<ColumnVector<Numeric>&>(*doris_column).get_data();
        size_t data_index = column_data.size();
        column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
        auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
        size_t data_index = column_data.size() / _type_length;
        column_data.resize(column_data.size() + _type_length * (select_vector.num_values() -
                                                                select_vector.num_filtered()));
        size_t dict_index = 0;
        DataType* data = (DataType*)column_data.data();
        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                for (size_t i = 0; i < run_length; ++i) {
                    column_data[data_index++] =
                            static_cast<PhysicalType>(_dict_items[_indexes[dict_index++]]);
                    data[data_index++] = _dict_items[_indexes[dict_index++]];
                }
                break;
            }
@ -211,250 +123,17 @@ protected:
        }
        return Status::OK();
    }
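
After the rework, the dictionary path no longer converts to the logical type while decoding: it
expands dictionary codes into a column of raw physical-type values and leaves the cast to the
later conversion pass. A simplified sketch of that expansion, without the select-vector run
machinery (DataT stands in for PhysicalTypeTraits<PhysicalType>::DataType):

#include <cstdint>
#include <vector>

// Expand dictionary codes into a raw byte buffer of fixed-width physical values.
// The byte buffer mirrors the ColumnVector<Int8> used above; suitable alignment
// is assumed to come from the column allocator.
template <typename DataT>
void expand_dict(const std::vector<DataT>& dict_items,
                 const std::vector<uint32_t>& codes, std::vector<int8_t>& raw_bytes) {
    size_t old_size = raw_bytes.size();
    raw_bytes.resize(old_size + codes.size() * sizeof(DataT));
    DataT* out = reinterpret_cast<DataT*>(raw_bytes.data() + old_size);
    for (uint32_t code : codes) {
        *out++ = dict_items[code]; // no logical-type conversion here
    }
}
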
    template <typename CppType, typename ColumnType, bool has_filter>
    Status _decode_date(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
        auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
        size_t data_index = column_data.size();
        column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
        size_t dict_index = 0;
        date_day_offset_dict& date_dict = date_day_offset_dict::get();
        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                for (size_t i = 0; i < run_length; ++i) {
                    int64_t date_value =
                            _dict_items[_indexes[dict_index++]] + _decode_params->offset_days;
                    if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
                        auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
                        v.create_from_date_v2(date_dict[date_value], TIME_DATE);
                        // we should cast to date if using date v1.
                        v.cast_to_date();
                    } else {
                        reinterpret_cast<CppType&>(column_data[data_index++]) =
                                date_dict[date_value];
                    }
                }
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                data_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }

    template <typename CppType, typename ColumnType, bool has_filter>
    Status _decode_datetime64(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
        auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
        size_t data_index = column_data.size();
        column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
        size_t dict_index = 0;
        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                for (size_t i = 0; i < run_length; ++i) {
                    int64_t date_value = _dict_items[_indexes[dict_index++]];
                    auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
                    v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz);
                    if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
                        // nanoseconds will be ignored.
                        v.set_microsecond((date_value % _decode_params->second_mask) *
                                          _decode_params->scale_to_nano_factor / 1000);
                        // TODO: the precision of datetime v1
                    }
                }
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                data_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }

    template <typename CppType, typename ColumnType, bool has_filter>
    Status _decode_datetime96(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
        auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
        size_t data_index = column_data.size();
        column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
        size_t dict_index = 0;
        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                for (size_t i = 0; i < run_length; ++i) {
                    ParquetInt96& datetime96 = _dict_items[_indexes[dict_index++]];
                    auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
                    int64_t micros = datetime96.to_timestamp_micros();
                    v.from_unixtime(micros / 1000000, *_decode_params->ctz);
                    if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
                        // spark.sql.parquet.outputTimestampType = INT96(NANOS) will lose precision;
                        // only microseconds are kept.
                        v.set_microsecond(micros % 1000000);
                    }
                }
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                data_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }

    template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter>
    Status _decode_primitive_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                     ColumnSelectVector& select_vector) {
        init_decimal_converter<DecimalPrimitiveType>(data_type);
        DecimalScaleParams& scale_params = _decode_params->decimal_scale;
#define M(FixedTypeLength, ValueCopyType, ScaleType) \
    case FixedTypeLength: \
        return _decode_primitive_decimal_internal<DecimalPrimitiveType, DecimalPhysicalType, \
                                                  has_filter, FixedTypeLength, ValueCopyType, \
                                                  ScaleType>(doris_column, data_type, \
                                                             select_vector);

#define APPLY_FOR_DECIMALS(ScaleType) \
    M(1, int64_t, ScaleType) \
    M(2, int64_t, ScaleType) \
    M(3, int64_t, ScaleType) \
    M(4, int64_t, ScaleType) \
    M(5, int64_t, ScaleType) \
    M(6, int64_t, ScaleType) \
    M(7, int64_t, ScaleType) \
    M(8, int64_t, ScaleType) \
    M(9, int128_t, ScaleType) \
    M(10, int128_t, ScaleType) \
    M(11, int128_t, ScaleType) \
    M(12, int128_t, ScaleType) \
    M(13, int128_t, ScaleType) \
    M(14, int128_t, ScaleType) \
    M(15, int128_t, ScaleType) \
    M(16, int128_t, ScaleType)

        if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
            switch (_type_length) {
                APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
            default:
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
        } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
            switch (_type_length) {
                APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
            default:
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
        } else {
            switch (_type_length) {
                APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
            default:
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
        }
        return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
    }

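
The SCALE_UP/SCALE_DOWN branches compensate for the difference between the scale written in the
parquet file and the scale of the target decimal column; scale_factor is 10 raised to that
difference. A worked example of the SCALE_UP path, with illustrative numbers:

#include <cstdint>

// The file stores the unscaled integer 12345 with scale 2 (i.e. 123.45); the
// target column has scale 4, so the value is multiplied by 10^(4 - 2) = 100 to
// obtain the unscaled 1234500. SCALE_DOWN divides instead.
int64_t scale_up(int64_t unscaled, int64_t scale_factor) {
    return unscaled * scale_factor; // scale_up(12345, 100) == 1234500
}
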
    template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter,
              int fixed_type_length, typename ValueCopyType,
              DecimalScaleParams::ScaleType ScaleType>
    Status _decode_primitive_decimal_internal(MutableColumnPtr& doris_column,
                                              DataTypePtr& data_type,
                                              ColumnSelectVector& select_vector) {
        auto& column_data =
                static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column)
                        .get_data();
        size_t data_index = column_data.size();
        column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
        size_t dict_index = 0;
        DecimalScaleParams& scale_params = _decode_params->decimal_scale;

        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                for (size_t i = 0; i < run_length; ++i) {
                    ValueCopyType value = static_cast<T>(_dict_items[_indexes[dict_index++]]);
                    if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                        value *= scale_params.scale_factor;
                    } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                        value /= scale_params.scale_factor;
                    } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                        // do nothing
                    } else {
                        LOG(FATAL) << "__builtin_unreachable";
                        __builtin_unreachable();
                    }
                    auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
                    v = (DecimalPrimitiveType)value;
                }
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                data_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }

    tparquet::Type::type _physical_type;
    using ColumnType = ParquetConvert::PhysicalTypeTraits<PhysicalType>::ColumnType;
    using DataType = ParquetConvert::PhysicalTypeTraits<PhysicalType>::DataType;

    // For dictionary encoding
    std::vector<T> _dict_items;
    std::vector<DataType> _dict_items;
};

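
ColumnType and DataType come from ParquetConvert::PhysicalTypeTraits, which maps each parquet
physical type to the in-memory type the decoder materializes. The traits live in
parquet_column_convert.h; the specializations below are an assumed sketch of their shape, shown
only to make the dispatch above concrete:

// Assumed shape of ParquetConvert::PhysicalTypeTraits (illustrative, not the
// actual definition in this commit).
template <tparquet::Type::type>
struct PhysicalTypeTraits;

template <>
struct PhysicalTypeTraits<tparquet::Type::INT32> {
    using DataType = Int32;                 // raw value read from the page
    using ColumnType = ColumnVector<Int32>; // column that stores it
};

template <>
struct PhysicalTypeTraits<tparquet::Type::DOUBLE> {
    using DataType = Float64;
    using ColumnType = ColumnVector<Float64>;
};
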
template <>
class FixLengthDictDecoder<char*> final : public BaseDictDecoder {
class FixLengthDictDecoder<tparquet::Type::FIXED_LEN_BYTE_ARRAY> final : public BaseDictDecoder {
public:
    FixLengthDictDecoder(tparquet::Type::type physical_type)
            : BaseDictDecoder(), _physical_type(physical_type) {};
    FixLengthDictDecoder() : BaseDictDecoder() {};
    ~FixLengthDictDecoder() override = default;

    Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
@ -487,52 +166,39 @@ public:
            return _decode_dict_values<has_filter>(doris_column, select_vector, is_dict_filter);
        }

        TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
        switch (logical_type) {
        case TypeIndex::Decimal32:
            if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
                return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type,
                                                                 select_vector);
            }
            break;
        case TypeIndex::Decimal64:
            if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
                return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type,
                                                                 select_vector);
            }
            break;
        case TypeIndex::Decimal128:
            if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
                return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
                                                                  select_vector);
            }
            break;
        case TypeIndex::Decimal128I:
            if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
                return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
                                                                  select_vector);
            }
            break;
        // TODO: decimal256
        case TypeIndex::String:
            [[fallthrough]];
        case TypeIndex::FixedString:
            if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
                return _decode_string<has_filter>(doris_column, select_vector);
            }
            break;
        default:
            break;
        }

        return Status::InvalidArgument(
                "Can't decode parquet physical type {} to doris logical type {}",
                tparquet::to_string(_physical_type), getTypeName(logical_type));
        return _decode_string<has_filter>(doris_column, select_vector);
    }

    Status skip_values(size_t num_values) override {
        _indexes.resize(num_values);
        _index_batch_decoder->GetBatch(&_indexes[0], num_values);
protected:
    template <bool has_filter>
    Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
        size_t dict_index = 0;
        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                std::vector<StringRef> string_values;
                string_values.reserve(run_length);
                for (size_t i = 0; i < run_length; ++i) {
                    string_values.emplace_back(_dict_items[_indexes[dict_index++]], _type_length);
                }
                doris_column->insert_many_strings(&string_values[0], run_length);
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                doris_column->insert_many_defaults(run_length);
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }

@ -583,160 +249,9 @@ public:
        res->insert_many_strings(&dict_values[0], dict_values.size());
        return res;
    }

protected:
    template <typename DecimalPrimitiveType, bool has_filter>
    Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                  ColumnSelectVector& select_vector) {
        init_decimal_converter<DecimalPrimitiveType>(data_type);
        DecimalScaleParams& scale_params = _decode_params->decimal_scale;
#define M(FixedTypeLength, ValueCopyType, ScaleType) \
    case FixedTypeLength: \
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter, FixedTypeLength, \
                                               ValueCopyType, ScaleType>(doris_column, data_type, \
                                                                         select_vector);

#define APPLY_FOR_DECIMALS(ScaleType) \
    M(1, int64_t, ScaleType) \
    M(2, int64_t, ScaleType) \
    M(3, int64_t, ScaleType) \
    M(4, int64_t, ScaleType) \
    M(5, int64_t, ScaleType) \
    M(6, int64_t, ScaleType) \
    M(7, int64_t, ScaleType) \
    M(8, int64_t, ScaleType) \
    M(9, int128_t, ScaleType) \
    M(10, int128_t, ScaleType) \
    M(11, int128_t, ScaleType) \
    M(12, int128_t, ScaleType) \
    M(13, int128_t, ScaleType) \
    M(14, int128_t, ScaleType) \
    M(15, int128_t, ScaleType) \
    M(16, int128_t, ScaleType)

        if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
            switch (_type_length) {
                APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
            default:
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
        } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
            switch (_type_length) {
                APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
            default:
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
        } else {
            switch (_type_length) {
                APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
            default:
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
        }
        return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
    }

    template <bool has_filter>
    Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector) {
        size_t dict_index = 0;
        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                std::vector<StringRef> string_values;
                string_values.reserve(run_length);
                for (size_t i = 0; i < run_length; ++i) {
                    string_values.emplace_back(_dict_items[_indexes[dict_index++]], _type_length);
                }
                doris_column->insert_many_strings(&string_values[0], run_length);
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                doris_column->insert_many_defaults(run_length);
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }

    tparquet::Type::type _physical_type;

    std::unordered_map<StringRef, int32_t> _dict_value_to_code;
    // For dictionary encoding
    std::vector<char*> _dict_items;
    std::unordered_map<StringRef, int32_t> _dict_value_to_code;

private:
    template <typename DecimalPrimitiveType, bool has_filter, int fixed_type_length,
              typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
    Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                           ColumnSelectVector& select_vector) {
        auto& column_data =
                static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column)
                        .get_data();
        size_t data_index = column_data.size();
        column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
        size_t dict_index = 0;
        DecimalScaleParams& scale_params = _decode_params->decimal_scale;

        ColumnSelectVector::DataReadType read_type;
        while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
            switch (read_type) {
            case ColumnSelectVector::CONTENT: {
                for (size_t i = 0; i < run_length; ++i) {
                    char* buf_start = _dict_items[_indexes[dict_index++]];
                    // When a parquet Decimal is stored in byte arrays (binary or fixed),
                    // the unscaled number must be encoded as two's complement using big-endian byte order.
                    DecimalPrimitiveType result_value = 0;
                    ValueCopyType value = 0;
                    memcpy(reinterpret_cast<char*>(&value), buf_start, fixed_type_length);
                    value = BitUtil::big_endian_to_host(value);
                    value = value >> ((sizeof(value) - fixed_type_length) * 8);
                    result_value = value;
                    if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                        result_value *= scale_params.scale_factor;
                    } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                        result_value /= scale_params.scale_factor;
                    } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                        // do nothing
                    } else {
                        LOG(FATAL) << "__builtin_unreachable";
                        __builtin_unreachable();
                    }
                    auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
                    v = (DecimalPrimitiveType)result_value;
                }
                break;
            }
            case ColumnSelectVector::NULL_DATA: {
                data_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_CONTENT: {
                dict_index += run_length;
                break;
            }
            case ColumnSelectVector::FILTERED_NULL: {
                // do nothing
                break;
            }
            }
        }
        return Status::OK();
    }
};

} // namespace doris::vectorized

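
The shift after big_endian_to_host is what restores the sign: memcpy places the big-endian
payload in the low bytes, the byte swap moves it to the high end in host order, and an
arithmetic right shift by the unused bytes both relocates and sign-extends it. A standalone
sketch of the same steps on a little-endian host, with __builtin_bswap64 standing in for
BitUtil::big_endian_to_host:

#include <cstdint>
#include <cstring>

// Decode a big-endian two's-complement value of fixed_type_length bytes.
int64_t decode_be_int(const unsigned char* buf, int fixed_type_length) {
    int64_t value = 0;
    std::memcpy(&value, buf, fixed_type_length); // payload lands in the low bytes
    value = __builtin_bswap64(value);            // payload now in the high bytes
    // Arithmetic shift moves the payload down and sign-extends it (defined for
    // signed types since C++20, and the de facto behavior before).
    return value >> ((sizeof(value) - fixed_type_length) * 8);
}

// Example: buf = {0xFF, 0x85} encodes -123; decode_be_int(buf, 2) == -123.
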
@ -1,609 +0,0 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/exec/format/parquet/fix_length_plain_decoder.h"

#include <gen_cpp/parquet_types.h>
#include <stdint.h>
#include <string.h>

#include <memory>
#include <vector>

// IWYU pragma: no_include <opentelemetry/common/threadlocal.h>
#include "common/compiler_util.h" // IWYU pragma: keep
#include "util/bit_util.h"
#include "util/slice.h"
#include "vec/columns/column.h"
#include "vec/common/string_ref.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/parquet/parquet_common.h"
#include "vec/runtime/vdatetime_value.h"

namespace doris {
namespace vectorized {
template <typename T>
class ColumnDecimal;
template <typename T>
class ColumnVector;
} // namespace vectorized
} // namespace doris

namespace doris::vectorized {

Status FixLengthPlainDecoder::skip_values(size_t num_values) {
    _offset += _type_length * num_values;
    if (UNLIKELY(_offset > _data->size)) {
        return Status::IOError("Out-of-bounds access in parquet data decoder");
    }
    return Status::OK();
}

Status FixLengthPlainDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                            ColumnSelectVector& select_vector,
                                            bool is_dict_filter) {
    if (select_vector.has_filter()) {
        return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter);
    } else {
        return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
    }
}

template <bool has_filter>
Status FixLengthPlainDecoder::_decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                             ColumnSelectVector& select_vector,
                                             bool is_dict_filter) {
    size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
    if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
        return Status::IOError("Out-of-bounds access in parquet data decoder");
    }
    TypeIndex logical_type = remove_nullable(data_type)->get_type_id();
    switch (logical_type) {
#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE) \
    case NUMERIC_TYPE: \
        if (_physical_type == tparquet::Type::INT32) { \
            return _decode_numeric<CPP_NUMERIC_TYPE, Int32, has_filter>(doris_column, \
                                                                        select_vector); \
        } else if (_physical_type == tparquet::Type::INT64) { \
            return _decode_numeric<CPP_NUMERIC_TYPE, Int64, has_filter>(doris_column, \
                                                                        select_vector); \
        } else if (_physical_type == tparquet::Type::FLOAT) { \
            return _decode_numeric<CPP_NUMERIC_TYPE, Float32, has_filter>(doris_column, \
                                                                          select_vector); \
        } else if (_physical_type == tparquet::Type::DOUBLE) { \
            return _decode_numeric<CPP_NUMERIC_TYPE, Float64, has_filter>(doris_column, \
                                                                          select_vector); \
        } else { \
            break; \
        }
        FOR_LOGICAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
    case TypeIndex::Date:
        if (_physical_type == tparquet::Type::INT32) {
            return _decode_date<VecDateTimeValue, Int64, has_filter>(doris_column, select_vector);
        }
        break;
    case TypeIndex::DateV2:
        if (_physical_type == tparquet::Type::INT32) {
            return _decode_date<DateV2Value<DateV2ValueType>, UInt32, has_filter>(doris_column,
                                                                                  select_vector);
        }
        break;
    case TypeIndex::DateTime:
        if (_physical_type == tparquet::Type::INT96) {
            return _decode_datetime96<VecDateTimeValue, Int64, has_filter>(doris_column,
                                                                           select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
            return _decode_datetime64<VecDateTimeValue, Int64, has_filter>(doris_column,
                                                                           select_vector);
        }
        break;
    case TypeIndex::DateTimeV2:
        // Spark can set the timestamp precision by the following configuration:
        // spark.sql.parquet.outputTimestampType = INT96(NANOS), TIMESTAMP_MICROS, TIMESTAMP_MILLIS
        if (_physical_type == tparquet::Type::INT96) {
            return _decode_datetime96<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
            return _decode_datetime64<DateV2Value<DateTimeV2ValueType>, UInt64, has_filter>(
                    doris_column, select_vector);
        }
        break;
    case TypeIndex::Decimal32:
        if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
            return _decode_binary_decimal<Int32, has_filter>(doris_column, data_type,
                                                             select_vector);
        } else if (_physical_type == tparquet::Type::INT32) {
            return _decode_primitive_decimal<Int32, Int32, has_filter>(doris_column, data_type,
                                                                       select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
            return _decode_primitive_decimal<Int32, Int64, has_filter>(doris_column, data_type,
                                                                       select_vector);
        }
        break;
    case TypeIndex::Decimal64:
        if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
            return _decode_binary_decimal<Int64, has_filter>(doris_column, data_type,
                                                             select_vector);
        } else if (_physical_type == tparquet::Type::INT32) {
            return _decode_primitive_decimal<Int64, Int32, has_filter>(doris_column, data_type,
                                                                       select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
            return _decode_primitive_decimal<Int64, Int64, has_filter>(doris_column, data_type,
                                                                       select_vector);
        }
        break;
    case TypeIndex::Decimal128:
        if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
            return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
                                                              select_vector);
        } else if (_physical_type == tparquet::Type::INT32) {
            return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
                                                                        select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
            return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
                                                                        select_vector);
        }
        break;
    case TypeIndex::Decimal128I:
        if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
            return _decode_binary_decimal<Int128, has_filter>(doris_column, data_type,
                                                              select_vector);
        } else if (_physical_type == tparquet::Type::INT32) {
            return _decode_primitive_decimal<Int128, Int32, has_filter>(doris_column, data_type,
                                                                        select_vector);
        } else if (_physical_type == tparquet::Type::INT64) {
            return _decode_primitive_decimal<Int128, Int64, has_filter>(doris_column, data_type,
                                                                        select_vector);
        }
        break;
    // TODO: decimal256
    case TypeIndex::String:
        [[fallthrough]];
    case TypeIndex::FixedString:
        if (_physical_type == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
            return _decode_string<has_filter>(doris_column, select_vector);
        }
        break;
    default:
        break;
    }

    return Status::InvalidArgument("Can't decode parquet physical type {} to doris logical type {}",
                                   tparquet::to_string(_physical_type), getTypeName(logical_type));
}

template <bool has_filter>
Status FixLengthPlainDecoder::_decode_string(MutableColumnPtr& doris_column,
                                             ColumnSelectVector& select_vector) {
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            std::vector<StringRef> string_values;
            string_values.reserve(run_length);
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                string_values.emplace_back(buf_start, _type_length);
                _offset += _type_length;
            }
            doris_column->insert_many_strings(&string_values[0], run_length);
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            doris_column->insert_many_defaults(run_length);
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}
template <typename Numeric, typename PhysicalType, bool has_filter>
Status FixLengthPlainDecoder::_decode_numeric(MutableColumnPtr& doris_column,
                                              ColumnSelectVector& select_vector) {
    auto& column_data = static_cast<ColumnVector<Numeric>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                column_data[data_index++] = *(PhysicalType*)buf_start;
                _offset += _type_length;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}

template <typename CppType, typename ColumnType, bool has_filter>
Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column,
                                           ColumnSelectVector& select_vector) {
    auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    ColumnSelectVector::DataReadType read_type;
    date_day_offset_dict& date_dict = date_day_offset_dict::get();

    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                int64_t date_value = static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start)) +
                                     _decode_params->offset_days;
                if constexpr (std::is_same_v<CppType, VecDateTimeValue>) {
                    auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
                    v.create_from_date_v2(date_dict[date_value], TIME_DATE);
                    // we should cast to date if using date v1.
                    v.cast_to_date();
                } else {
                    reinterpret_cast<CppType&>(column_data[data_index++]) = date_dict[date_value];
                }
                _offset += _type_length;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}

template <typename CppType, typename ColumnType, bool has_filter>
Status FixLengthPlainDecoder::_decode_datetime64(MutableColumnPtr& doris_column,
                                                 ColumnSelectVector& select_vector) {
    auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                int64_t& date_value = *reinterpret_cast<int64_t*>(buf_start);
                auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
                v.from_unixtime(date_value / _decode_params->second_mask, *_decode_params->ctz);
                if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
                    // nanoseconds will be ignored.
                    v.set_microsecond((date_value % _decode_params->second_mask) *
                                      _decode_params->scale_to_nano_factor / 1000);
                    // TODO: the precision of datetime v1
                }
                _offset += _type_length;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}

template <typename CppType, typename ColumnType, bool has_filter>
Status FixLengthPlainDecoder::_decode_datetime96(MutableColumnPtr& doris_column,
                                                 ColumnSelectVector& select_vector) {
    auto& column_data = static_cast<ColumnVector<ColumnType>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                ParquetInt96& datetime96 = *reinterpret_cast<ParquetInt96*>(buf_start);
                auto& v = reinterpret_cast<CppType&>(column_data[data_index++]);
                int64_t micros = datetime96.to_timestamp_micros();
                v.from_unixtime(micros / 1000000, *_decode_params->ctz);
                if constexpr (std::is_same_v<CppType, DateV2Value<DateTimeV2ValueType>>) {
                    // spark.sql.parquet.outputTimestampType = INT96(NANOS) will lose precision;
                    // only microseconds are kept.
                    v.set_microsecond(micros % 1000000);
                }
                _offset += _type_length;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}

template <typename DecimalPrimitiveType, bool has_filter>
Status FixLengthPlainDecoder::_decode_binary_decimal(MutableColumnPtr& doris_column,
                                                     DataTypePtr& data_type,
                                                     ColumnSelectVector& select_vector) {
    init_decimal_converter<DecimalPrimitiveType>(data_type);
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;
#define M(FixedTypeLength, ValueCopyType, ScaleType) \
    case FixedTypeLength: \
        return _decode_binary_decimal_internal<DecimalPrimitiveType, has_filter, FixedTypeLength, \
                                               ValueCopyType, ScaleType>(doris_column, data_type, \
                                                                         select_vector);

#define APPLY_FOR_DECIMALS(ScaleType) \
    M(1, int64_t, ScaleType) \
    M(2, int64_t, ScaleType) \
    M(3, int64_t, ScaleType) \
    M(4, int64_t, ScaleType) \
    M(5, int64_t, ScaleType) \
    M(6, int64_t, ScaleType) \
    M(7, int64_t, ScaleType) \
    M(8, int64_t, ScaleType) \
    M(9, int128_t, ScaleType) \
    M(10, int128_t, ScaleType) \
    M(11, int128_t, ScaleType) \
    M(12, int128_t, ScaleType) \
    M(13, int128_t, ScaleType) \
    M(14, int128_t, ScaleType) \
    M(15, int128_t, ScaleType) \
    M(16, int128_t, ScaleType)

    if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
        switch (_type_length) {
            APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
        default:
            LOG(FATAL) << "__builtin_unreachable";
            __builtin_unreachable();
        }
    } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
        switch (_type_length) {
            APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
        default:
            LOG(FATAL) << "__builtin_unreachable";
            __builtin_unreachable();
        }
    } else {
        switch (_type_length) {
            APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
        default:
            LOG(FATAL) << "__builtin_unreachable";
            __builtin_unreachable();
        }
    }
    return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
}

template <typename DecimalPrimitiveType, bool has_filter, int fixed_type_length,
          typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
Status FixLengthPlainDecoder::_decode_binary_decimal_internal(MutableColumnPtr& doris_column,
                                                              DataTypePtr& data_type,
                                                              ColumnSelectVector& select_vector) {
    auto& column_data =
            static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;

    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                // When a parquet Decimal is stored in byte arrays (binary or fixed),
                // the unscaled number must be encoded as two's complement using big-endian byte order.
                DecimalPrimitiveType result_value = 0;
                ValueCopyType value = 0;
                memcpy(reinterpret_cast<char*>(&value), buf_start, fixed_type_length);
                value = BitUtil::big_endian_to_host(value);
                value = value >> ((sizeof(value) - fixed_type_length) * 8);
                result_value = value;
                if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                    result_value *= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                    result_value /= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                    // do nothing
                } else {
                    LOG(FATAL) << "__builtin_unreachable";
                    __builtin_unreachable();
                }
                auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
                v = (DecimalPrimitiveType)result_value;
                _offset += fixed_type_length;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}

template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter>
Status FixLengthPlainDecoder::_decode_primitive_decimal(MutableColumnPtr& doris_column,
                                                        DataTypePtr& data_type,
                                                        ColumnSelectVector& select_vector) {
    init_decimal_converter<DecimalPrimitiveType>(data_type);
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;
#define M(FixedTypeLength, T, ScaleType) \
    case FixedTypeLength: \
        return _decode_primitive_decimal_internal<DecimalPrimitiveType, DecimalPhysicalType, \
                                                  has_filter, FixedTypeLength, T, ScaleType>( \
                doris_column, data_type, select_vector);

#define APPLY_FOR_DECIMALS(ScaleType) \
    M(1, int64_t, ScaleType) \
    M(2, int64_t, ScaleType) \
    M(3, int64_t, ScaleType) \
    M(4, int64_t, ScaleType) \
    M(5, int64_t, ScaleType) \
    M(6, int64_t, ScaleType) \
    M(7, int64_t, ScaleType) \
    M(8, int64_t, ScaleType) \
    M(9, int128_t, ScaleType) \
    M(10, int128_t, ScaleType) \
    M(11, int128_t, ScaleType) \
    M(12, int128_t, ScaleType) \
    M(13, int128_t, ScaleType) \
    M(14, int128_t, ScaleType) \
    M(15, int128_t, ScaleType) \
    M(16, int128_t, ScaleType)

    if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {
        switch (_type_length) {
            APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_UP)
        default:
            LOG(FATAL) << "__builtin_unreachable";
            __builtin_unreachable();
        }
    } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {
        switch (_type_length) {
            APPLY_FOR_DECIMALS(DecimalScaleParams::SCALE_DOWN)
        default:
            LOG(FATAL) << "__builtin_unreachable";
            __builtin_unreachable();
        }
    } else {
        switch (_type_length) {
            APPLY_FOR_DECIMALS(DecimalScaleParams::NO_SCALE)
        default:
            LOG(FATAL) << "__builtin_unreachable";
            __builtin_unreachable();
        }
    }
    return Status::OK();
#undef APPLY_FOR_DECIMALS
#undef M
}

template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter,
          int fixed_type_length, typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
Status FixLengthPlainDecoder::_decode_primitive_decimal_internal(
        MutableColumnPtr& doris_column, DataTypePtr& data_type, ColumnSelectVector& select_vector) {
    auto& column_data =
            static_cast<ColumnDecimal<Decimal<DecimalPrimitiveType>>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered());
    DecimalScaleParams& scale_params = _decode_params->decimal_scale;

    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                ValueCopyType value = *reinterpret_cast<DecimalPhysicalType*>(buf_start);
                if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                    value *= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                    value /= scale_params.scale_factor;
                } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                    // do nothing
                } else {
                    LOG(FATAL) << "__builtin_unreachable";
                    __builtin_unreachable();
                }
                auto& v = reinterpret_cast<DecimalPrimitiveType&>(column_data[data_index++]);
                v = (DecimalPrimitiveType)value;
                _offset += _type_length;
            }
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}
} // namespace doris::vectorized

@ -23,7 +23,8 @@
#include "common/status.h"
#include "vec/data_types/data_type.h"
#include "vec/exec/format/parquet/decoder.h"

#include "vec/exec/format/parquet/parquet_column_convert.h"
#include "vec/exec/format/parquet/parquet_common.h"
namespace doris {
namespace vectorized {
class ColumnSelectVector;
@ -32,56 +33,135 @@ class ColumnSelectVector;

namespace doris::vectorized {

template <tparquet::Type::type PhysicalType>
class FixLengthPlainDecoder final : public Decoder {
public:
    FixLengthPlainDecoder(tparquet::Type::type physical_type) : _physical_type(physical_type) {};
    FixLengthPlainDecoder() {};
    ~FixLengthPlainDecoder() override = default;

    Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                         ColumnSelectVector& select_vector, bool is_dict_filter) override;

    template <bool hasFilter>
    template <bool has_filter>
    Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                          ColumnSelectVector& select_vector, bool is_dict_filter);

    Status skip_values(size_t num_values) override;

protected:
    template <typename Numeric, typename PhysicalType, bool has_filter>
    template <bool has_filter>
    Status _decode_numeric(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector);

    template <typename CppType, typename ColumnType, bool has_filter>
    Status _decode_date(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector);

    template <typename CppType, typename ColumnType, bool has_filter>
    Status _decode_datetime64(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector);

    template <typename CppType, typename ColumnType, bool has_filter>
    Status _decode_datetime96(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector);

    template <typename DecimalPrimitiveType, bool has_filter>
    Status _decode_binary_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                  ColumnSelectVector& select_vector);

    template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter>
    Status _decode_primitive_decimal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                     ColumnSelectVector& select_vector);

    template <bool has_filter>
    Status _decode_string(MutableColumnPtr& doris_column, ColumnSelectVector& select_vector);

    tparquet::Type::type _physical_type;

private:
    template <typename DecimalPrimitiveType, bool has_filter, int fixed_type_length,
              typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
    Status _decode_binary_decimal_internal(MutableColumnPtr& doris_column, DataTypePtr& data_type,
                                           ColumnSelectVector& select_vector);
    template <typename DecimalPrimitiveType, typename DecimalPhysicalType, bool has_filter,
              int fixed_type_length, typename ValueCopyType,
              DecimalScaleParams::ScaleType ScaleType>
    Status _decode_primitive_decimal_internal(MutableColumnPtr& doris_column,
                                              DataTypePtr& data_type,
                                              ColumnSelectVector& select_vector);
};

template <tparquet::Type::type PhysicalType>
Status FixLengthPlainDecoder<PhysicalType>::skip_values(size_t num_values) {
    _offset += _type_length * num_values;
    if (UNLIKELY(_offset > _data->size)) {
        return Status::IOError("Out-of-bounds access in parquet data decoder");
    }
    return Status::OK();
}

template <tparquet::Type::type PhysicalType>
Status FixLengthPlainDecoder<PhysicalType>::decode_values(MutableColumnPtr& doris_column,
                                                          DataTypePtr& data_type,
                                                          ColumnSelectVector& select_vector,
                                                          bool is_dict_filter) {
    if (select_vector.has_filter()) {
        return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter);
    } else {
        return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
    }
}

template <tparquet::Type::type PhysicalType>
template <bool has_filter>
Status FixLengthPlainDecoder<PhysicalType>::_decode_values(MutableColumnPtr& doris_column,
                                                           DataTypePtr& data_type,
                                                           ColumnSelectVector& select_vector,
                                                           bool is_dict_filter) {
    size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
    if (UNLIKELY(_offset + _type_length * non_null_size > _data->size)) {
        return Status::IOError("Out-of-bounds access in parquet data decoder");
    }

    if constexpr (PhysicalType == tparquet::Type::FIXED_LEN_BYTE_ARRAY) {
        return _decode_string<has_filter>(doris_column, select_vector);
    } else {
        return _decode_numeric<has_filter>(doris_column, select_vector);
    }
}

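
Because the physical type is now a template parameter, the string-versus-numeric choice above is
made at compile time: each instantiation keeps exactly one branch, and the per-logical-type
switch disappears from the decoder. A reduced, self-contained sketch of the pattern (Phys and
the branch bodies are stand-ins, not Doris types):

#include <cstdio>

enum class Phys { INT32, FIXED_LEN_BYTE_ARRAY };

// Only one branch survives in each instantiation of decode_all.
template <Phys P>
void decode_all() {
    if constexpr (P == Phys::FIXED_LEN_BYTE_ARRAY) {
        std::puts("string path: fixed-width byte slices");
    } else {
        std::puts("numeric path: raw copy of physical values");
    }
}

// decode_all<Phys::INT32>() takes the numeric path.
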
template <tparquet::Type::type PhysicalType>
template <bool has_filter>
Status FixLengthPlainDecoder<PhysicalType>::_decode_string(MutableColumnPtr& doris_column,
                                                           ColumnSelectVector& select_vector) {
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            std::vector<StringRef> string_values;
            string_values.reserve(run_length);
            for (size_t i = 0; i < run_length; ++i) {
                char* buf_start = _data->data + _offset;
                string_values.emplace_back(buf_start, _type_length);
                _offset += _type_length;
            }
            doris_column->insert_many_strings(&string_values[0], run_length);
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            doris_column->insert_many_defaults(run_length);
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}

template <tparquet::Type::type PhysicalType>
template <bool has_filter>
Status FixLengthPlainDecoder<PhysicalType>::_decode_numeric(MutableColumnPtr& doris_column,
                                                            ColumnSelectVector& select_vector) {
    auto& column_data = reinterpret_cast<ColumnVector<Int8>&>(*doris_column).get_data();
    size_t data_index = column_data.size();
    column_data.resize(data_index +
                       _type_length * (select_vector.num_values() - select_vector.num_filtered()));
    ColumnSelectVector::DataReadType read_type;
    while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
        switch (read_type) {
        case ColumnSelectVector::CONTENT: {
            memcpy(column_data.data() + data_index, _data->data + _offset,
                   run_length * _type_length);
            _offset += run_length * _type_length;
            data_index += run_length * _type_length;
            break;
        }
        case ColumnSelectVector::NULL_DATA: {
            data_index += run_length * _type_length;
            break;
        }
        case ColumnSelectVector::FILTERED_CONTENT: {
            _offset += _type_length * run_length;
            break;
        }
        case ColumnSelectVector::FILTERED_NULL: {
            // do nothing
            break;
        }
        }
    }
    return Status::OK();
}
} // namespace doris::vectorized

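
Since the decoder now always materializes the parquet physical representation, the CONTENT
branch collapses to a single memcpy into a byte column; the physical-to-logical cast runs
afterwards in the convert pass. A sketch of the two-step flow for INT32 data read into a BIGINT
column, as after a hive ALTER TYPE (helper names are placeholders, not reader code):

#include <cstdint>
#include <cstring>
#include <vector>

// Step 1: bulk-copy physical INT32 values into a raw byte column.
void copy_physical(const char* page_data, size_t n, std::vector<int8_t>& raw) {
    size_t old_size = raw.size();
    raw.resize(old_size + n * sizeof(int32_t));
    std::memcpy(raw.data() + old_size, page_data, n * sizeof(int32_t));
}

// Step 2, later in the convert pass: widen to the logical type (here int64_t).
void convert_to_int64(const std::vector<int8_t>& raw, std::vector<int64_t>& out) {
    const int32_t* src = reinterpret_cast<const int32_t*>(raw.data());
    size_t n = raw.size() / sizeof(int32_t);
    for (size_t i = 0; i < n; ++i) {
        out.push_back(static_cast<int64_t>(src[i]));
    }
}
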
be/src/vec/exec/format/parquet/parquet_column_convert.cpp (new file, 82 lines)
@ -0,0 +1,82 @@

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "vec/exec/format/parquet/parquet_column_convert.h"

#include <cctz/time_zone.h>

#include "vec/columns/column_nullable.h"
namespace doris::vectorized {
namespace ParquetConvert {
const cctz::time_zone ConvertParams::utc0 = cctz::utc_time_zone();

ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type,
                     ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert) {
    ColumnPtr ans_column = doris_column;
    DataTypePtr tmp_data_type;

    switch (parquet_physical_type) {
    case tparquet::Type::type::BOOLEAN:
        tmp_data_type = std::make_shared<DataTypeUInt8>();
        break;
    case tparquet::Type::type::INT32:
        tmp_data_type = std::make_shared<DataTypeInt32>();
        break;
    case tparquet::Type::type::INT64:
        tmp_data_type = std::make_shared<DataTypeInt64>();
        break;
    case tparquet::Type::type::FLOAT:
        tmp_data_type = std::make_shared<DataTypeFloat32>();
        break;
    case tparquet::Type::type::DOUBLE:
        tmp_data_type = std::make_shared<DataTypeFloat64>();
        break;
    case tparquet::Type::type::BYTE_ARRAY:
    case tparquet::Type::type::FIXED_LEN_BYTE_ARRAY:
        tmp_data_type = std::make_shared<DataTypeString>();
        break;
    case tparquet::Type::type::INT96:
        tmp_data_type = std::make_shared<DataTypeInt8>();
        break;
    }

    if (tmp_data_type->get_type_id() == remove_nullable(doris_type)->get_type_id()) {
        if (tmp_data_type->get_type_id() == TypeIndex::String &&
            (show_type == PrimitiveType::TYPE_DECIMAL32 ||
             show_type == PrimitiveType::TYPE_DECIMAL64 ||
             show_type == PrimitiveType::TYPE_DECIMALV2 ||
             show_type == PrimitiveType::TYPE_DECIMAL128I)) {
            *need_convert = true;
            ans_column = tmp_data_type->create_column();
        } else {
            *need_convert = false;
        }
    } else {
        ans_column = tmp_data_type->create_column();
        *need_convert = true;
    }

    if (*need_convert && doris_type->is_nullable()) {
        auto doris_nullable_column = static_cast<const ColumnNullable*>(doris_column.get());
        ans_column = ColumnNullable::create(ans_column,
                                            doris_nullable_column->get_null_map_column_ptr());
    }
    return ans_column;
}

} // namespace ParquetConvert
} // namespace doris::vectorized

be/src/vec/exec/format/parquet/parquet_column_convert.h (new file, 665 lines)
@ -0,0 +1,665 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/parquet_types.h>

#include <algorithm>
#include <functional>
#include <ostream>
#include <utility>

#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/status.h"
#include "gen_cpp/descriptors.pb.h"
#include "gutil/endian.h"
#include "gutil/strings/numbers.h"
#include "io/file_factory.h"
#include "olap/olap_common.h"
#include "util/coding.h"
#include "util/slice.h"
#include "vec/columns/column_string.h"
#include "vec/columns/column_vector.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
#include "vec/exec/format/format_common.h"
#include "vec/exec/format/parquet/decoder.h"
#include "vec/exec/format/parquet/parquet_common.h"

namespace doris::vectorized {

namespace ParquetConvert {

template <tparquet::Type::type ParquetType>
struct PhysicalTypeTraits {};

template <>
struct PhysicalTypeTraits<tparquet::Type::INT32> {
    using DataType = int32_t;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::BOOLEAN> {
    using DataType = uint8;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::INT64> {
    using DataType = int64_t;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::FLOAT> {
    using DataType = float;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::DOUBLE> {
    using DataType = double;
    using ColumnType = ColumnVector<DataType>;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::BYTE_ARRAY> {
    using DataType = String;
    using ColumnType = ColumnString;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::FIXED_LEN_BYTE_ARRAY> {
    using DataType = String;
    using ColumnType = ColumnString;
};

template <>
struct PhysicalTypeTraits<tparquet::Type::INT96> {
    using DataType = ParquetInt96;
    using ColumnType = ColumnVector<Int8>;
};

#define FOR_LOGICAL_NUMERIC_TYPES(M)        \
    M(TypeIndex::Int8, Int8, Int32)         \
    M(TypeIndex::Int16, Int16, Int32)       \
    M(TypeIndex::Int32, Int32, Int32)       \
    M(TypeIndex::Int64, Int64, Int64)       \
    M(TypeIndex::Float32, Float32, Float32) \
    M(TypeIndex::Float64, Float64, Float64)

#define FOR_LOGICAL_DECIMAL_TYPES(M)             \
    M(TypeIndex::Decimal32, Decimal32, Int32)    \
    M(TypeIndex::Decimal64, Decimal64, Int64)    \
    M(TypeIndex::Decimal128, Decimal128, Int128) \
    M(TypeIndex::Decimal128I, Decimal128, Int128)

struct ConvertParams {
    // schema.logicalType.TIMESTAMP.isAdjustedToUTC == false
    static const cctz::time_zone utc0;
    // schema.logicalType.TIMESTAMP.isAdjustedToUTC == true, we should set the time zone
    cctz::time_zone* ctz = nullptr;
    size_t offset_days = 0;
    int64_t second_mask = 1;
    int64_t scale_to_nano_factor = 1;
    DecimalScaleParams decimal_scale;
    FieldSchema* field_schema = nullptr;
    size_t start_idx = 0;

    void init(FieldSchema* field_schema_, cctz::time_zone* ctz_, size_t start_idx_ = 0) {
        field_schema = field_schema_;
        if (ctz_ != nullptr) {
            ctz = ctz_;
        }
        const auto& schema = field_schema->parquet_schema;
        if (schema.__isset.logicalType && schema.logicalType.__isset.TIMESTAMP) {
            const auto& timestamp_info = schema.logicalType.TIMESTAMP;
            if (!timestamp_info.isAdjustedToUTC) {
                // should set timezone to utc+0
                ctz = const_cast<cctz::time_zone*>(&utc0);
            }
            const auto& time_unit = timestamp_info.unit;
            if (time_unit.__isset.MILLIS) {
                second_mask = 1000;
                scale_to_nano_factor = 1000000;
            } else if (time_unit.__isset.MICROS) {
                second_mask = 1000000;
                scale_to_nano_factor = 1000;
            } else if (time_unit.__isset.NANOS) {
                second_mask = 1000000000;
                scale_to_nano_factor = 1;
            }
        } else if (schema.__isset.converted_type) {
            const auto& converted_type = schema.converted_type;
            if (converted_type == tparquet::ConvertedType::TIMESTAMP_MILLIS) {
                second_mask = 1000;
                scale_to_nano_factor = 1000000;
            } else if (converted_type == tparquet::ConvertedType::TIMESTAMP_MICROS) {
                second_mask = 1000000;
                scale_to_nano_factor = 1000;
            }
        }

        if (ctz) {
            VecDateTimeValue t;
            t.from_unixtime(0, *ctz);
            offset_days = t.day() == 31 ? -1 : 0;
        }
        start_idx = start_idx_;
    }
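
    // A quick worked example of the fields init() fills in (illustrative only):
    // for a column declared TIMESTAMP(MICROS), second_mask stays 1000000 and
    // scale_to_nano_factor stays 1000, so a raw value x = 1659338597000000
    // splits into seconds = x / second_mask = 1659338597 plus a sub-second part
    // of x % second_mask microseconds -- exactly how Int64ToTimestamp::convert()
    // below consumes these two fields.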

    template <typename DecimalPrimitiveType>
    void init_decimal_converter(DataTypePtr& data_type) {
        if (field_schema == nullptr || decimal_scale.scale_type != DecimalScaleParams::NOT_INIT) {
            return;
        }
        auto scale = field_schema->parquet_schema.scale;
        auto* decimal_type = static_cast<DataTypeDecimal<Decimal<DecimalPrimitiveType>>*>(
                const_cast<IDataType*>(remove_nullable(data_type).get()));
        auto dest_scale = decimal_type->get_scale();
        if (dest_scale > scale) {
            decimal_scale.scale_type = DecimalScaleParams::SCALE_UP;
            decimal_scale.scale_factor =
                    DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(dest_scale - scale);
        } else if (dest_scale < scale) {
            decimal_scale.scale_type = DecimalScaleParams::SCALE_DOWN;
            decimal_scale.scale_factor =
                    DecimalScaleParams::get_scale_factor<DecimalPrimitiveType>(scale - dest_scale);
        } else {
            decimal_scale.scale_type = DecimalScaleParams::NO_SCALE;
            decimal_scale.scale_factor = 1;
        }
    }
};

/*
 * parquet_physical_type : the type the data is stored as in parquet.
 *     Data is read, according to the parquet physical type, into the column returned by get_column.
 * show_type : the data format that should be displayed to the user.
 * doris_column : the column type in which the upper layer wants the data.
 *
 * Example:
 * In hive, a decimal may be stored as FIXED_LEN_BYTE_ARRAY in parquet, and then
 * `ALTER TABLE TableName CHANGE COLUMN Col_Decimal Col_Decimal String;`
 * converts this column to string type. In that case:
 * parquet_physical_type : FIXED_LEN_BYTE_ARRAY.
 * ans_data_type : ColumnString
 * show_type : Decimal.
 * doris_column : ColumnString.
 */
ColumnPtr get_column(tparquet::Type::type parquet_physical_type, PrimitiveType show_type,
                     ColumnPtr& doris_column, DataTypePtr& doris_type, bool* need_convert);
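
// A sketch of the intended call sequence (illustrative only; it mirrors
// ScalarColumnReader::read_column_data below and assumes field_schema, ctz,
// dst_column and dst_type are in scope):
//
//   bool need_convert = false;
//   ColumnPtr src_column = ParquetConvert::get_column(
//           parquet_physical_type, show_type, dst_column, dst_type, &need_convert);
//   // ... decode page data into src_column according to the physical type ...
//   if (need_convert) {
//       std::unique_ptr<ParquetConvert::ColumnConvert> converter;
//       ParquetConvert::ConvertParams convert_params;
//       convert_params.init(field_schema, ctz, dst_column->size());
//       RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type,
//                                                     dst_type, &converter, &convert_params));
//       auto dst = dst_column->assume_mutable();
//       RETURN_IF_ERROR(converter->convert(src_column, dst));
//   }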

struct ColumnConvert {
    virtual Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) { return Status::OK(); }

    virtual ~ColumnConvert() = default;

    void convert_null(ColumnPtr& src_col, MutableColumnPtr& dst_col) {
        src_col = remove_nullable(src_col);
        dst_col = remove_nullable(dst_col->get_ptr())->assume_mutable();
    }

public:
    ConvertParams* _convert_params;
};

template <tparquet::Type::type parquet_physical_type, typename dst_type>
struct NumberToNumberConvert : public ColumnConvert {
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        using ColumnType = typename PhysicalTypeTraits<parquet_physical_type>::ColumnType;
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();
        auto& src_data = static_cast<const ColumnType*>(src_col.get())->get_data();

        dst_col->resize(_convert_params->start_idx + rows);
        auto& data = static_cast<ColumnVector<dst_type>&>(*dst_col.get()).get_data();
        for (int i = 0; i < rows; i++) {
            dst_type value = static_cast<dst_type>(src_data[i]);
            data[_convert_params->start_idx + i] = value;
        }

        return Status::OK();
    }
};

template <tparquet::Type::type parquet_physical_type>
struct NumberToStringConvert : public ColumnConvert {
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        using ColumnType = typename PhysicalTypeTraits<parquet_physical_type>::ColumnType;
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();
        auto& src_data = static_cast<const ColumnType*>(src_col.get())->get_data();

        char buf[100];
        auto str_col = static_cast<ColumnString*>(dst_col.get());
        for (int i = 0; i < rows; i++) {
            if constexpr (parquet_physical_type == tparquet::Type::FLOAT) {
                int len = FastFloatToBuffer(src_data[i], buf, true);
                str_col->insert_data(buf, len);
            } else if constexpr (parquet_physical_type == tparquet::Type::DOUBLE) {
                int len = FastDoubleToBuffer(src_data[i], buf, true);
                str_col->insert_data(buf, len);
            } else if constexpr (parquet_physical_type == tparquet::Type::INT32) {
                char* end = FastInt32ToBufferLeft(src_data[i], buf);
                str_col->insert_data(buf, end - buf);
            } else if constexpr (parquet_physical_type == tparquet::Type::INT64) {
                char* end = FastInt64ToBufferLeft(src_data[i], buf);
                str_col->insert_data(buf, end - buf);
            } else {
                string value = std::to_string(src_data[i]);
                str_col->insert_data(value.data(), value.size());
            }
        }
        return Status::OK();
    }
};

struct Int96toTimestamp : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        size_t rows = src_col->size() / sizeof(ParquetInt96);
        auto& src_data = static_cast<const ColumnVector<Int8>*>(src_col.get())->get_data();
        auto ParquetInt96_data = (ParquetInt96*)src_data.data();
        dst_col->resize(_convert_params->start_idx + rows);
        auto& data = static_cast<ColumnVector<UInt64>*>(dst_col.get())->get_data();

        for (int i = 0; i < rows; i++) {
            ParquetInt96 x = ParquetInt96_data[i];
            auto& num = data[_convert_params->start_idx + i];
            auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num);
            int64_t micros = x.to_timestamp_micros();
            value.from_unixtime(micros / 1000000, *_convert_params->ctz);
            value.set_microsecond(micros % 1000000);
        }
        return Status::OK();
    }
};
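
// For reference (illustrative): an INT96 value packs 8 bytes of
// nanoseconds-within-day (lo) and 4 bytes of Julian day number (hi), and
// to_timestamp_micros() in parquet_common.h turns that into
//   (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND
// microseconds since the unix epoch, which the loop above then splits into
// whole seconds for from_unixtime() and a microsecond remainder for
// set_microsecond().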

struct Int64ToTimestamp : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();
        dst_col->resize(_convert_params->start_idx + rows);

        auto src_data = static_cast<const ColumnVector<int64_t>*>(src_col.get())->get_data().data();
        auto& data = static_cast<ColumnVector<UInt64>*>(dst_col.get())->get_data();

        for (int i = 0; i < rows; i++) {
            int64_t x = src_data[i];
            auto& num = data[_convert_params->start_idx + i];
            auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num);
            value.from_unixtime(x / _convert_params->second_mask, *_convert_params->ctz);
            value.set_microsecond((x % _convert_params->second_mask) *
                                  (_convert_params->scale_to_nano_factor / 1000));
        }
        return Status::OK();
    }
};

class Int32ToDate : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();
        dst_col->resize(_convert_params->start_idx + rows);

        auto& src_data = static_cast<const ColumnVector<int32>*>(src_col.get())->get_data();
        auto& data = static_cast<ColumnDateV2*>(dst_col.get())->get_data();
        date_day_offset_dict& date_dict = date_day_offset_dict::get();

        for (int i = 0; i < rows; i++) {
            auto& value = reinterpret_cast<DateV2Value<DateV2ValueType>&>(
                    data[_convert_params->start_idx + i]);
            int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days;
            value = date_dict[date_value];
        }

        return Status::OK();
    }
};

template <typename DecimalType, typename ValueCopyType, DecimalScaleParams::ScaleType ScaleType>
class StringToDecimal : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();
        DecimalScaleParams& scale_params = _convert_params->decimal_scale;
        auto buf = static_cast<const ColumnString*>(src_col.get())->get_chars().data();
        auto& offset = static_cast<const ColumnString*>(src_col.get())->get_offsets();
        dst_col->resize(_convert_params->start_idx + rows);

        auto& data = static_cast<ColumnDecimal<DecimalType>*>(dst_col.get())->get_data();
        for (int i = 0; i < rows; i++) {
            size_t len = offset[i] - offset[i - 1];
            // When a decimal in parquet is stored in a byte array (binary or fixed),
            // the unscaled number must be encoded as two's complement using big-endian byte order.
            ValueCopyType value = 0;
            memcpy(reinterpret_cast<char*>(&value), buf + offset[i - 1], len);
            value = BitUtil::big_endian_to_host(value);
            value = value >> ((sizeof(value) - len) * 8);
            if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                value *= scale_params.scale_factor;
            } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                value /= scale_params.scale_factor;
            } else if constexpr (ScaleType == DecimalScaleParams::NO_SCALE) {
                // do nothing
            } else {
                LOG(FATAL) << "__builtin_unreachable";
                __builtin_unreachable();
            }
            auto& v = reinterpret_cast<DecimalType&>(data[_convert_params->start_idx + i]);
            v = (DecimalType)value;
        }

        return Status::OK();
    }
};
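
// Worked example of the big-endian decode above (illustrative): a decimal with
// scale 2 storing 3.00 has unscaled value 300, encoded as the two-byte
// big-endian two's-complement array {0x01, 0x2C}. memcpy() places those bytes
// at the low addresses of `value`, big_endian_to_host() yields 0x012C << 48
// (for an 8-byte ValueCopyType), and the arithmetic right shift by
// (sizeof(value) - len) * 8 brings it back down to 300, sign-extending
// negative values along the way.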
template <typename NumberType, typename DecimalPhysicalType, typename ValueCopyType,
          DecimalScaleParams::ScaleType ScaleType>
class NumberToDecimal : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();
        auto* src_data =
                static_cast<const ColumnVector<NumberType>*>(src_col.get())->get_data().data();
        dst_col->resize(_convert_params->start_idx + rows);

        DecimalScaleParams& scale_params = _convert_params->decimal_scale;
        auto* data = static_cast<ColumnDecimal<Decimal<DecimalPhysicalType>>*>(dst_col.get())
                             ->get_data()
                             .data();

        for (int i = 0; i < rows; i++) {
            ValueCopyType value = src_data[i];
            if constexpr (ScaleType == DecimalScaleParams::SCALE_UP) {
                value *= scale_params.scale_factor;
            } else if constexpr (ScaleType == DecimalScaleParams::SCALE_DOWN) {
                value /= scale_params.scale_factor;
            }
            data[_convert_params->start_idx + i] = (DecimalPhysicalType)value;
        }
        return Status::OK();
    }
};

template <typename DecimalType, typename ValueCopyType>
class StringToDecimalString : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();

        auto buf = static_cast<const ColumnString*>(src_col.get())->get_chars().data();
        auto& offset = static_cast<const ColumnString*>(src_col.get())->get_offsets();

        auto data = static_cast<ColumnString*>(dst_col.get());
        for (int i = 0; i < rows; i++) {
            int len = offset[i] - offset[i - 1];
            // When a decimal in parquet is stored in a byte array (binary or fixed),
            // the unscaled number must be encoded as two's complement using big-endian byte order.
            ValueCopyType value = 0;
            memcpy(reinterpret_cast<char*>(&value), buf + offset[i - 1], len);
            value = BitUtil::big_endian_to_host(value);
            value = value >> ((sizeof(value) - len) * 8);
            std::string ans = reinterpret_cast<DecimalType&>(value).to_string(
                    _convert_params->field_schema->parquet_schema.scale);
            data->insert_data(ans.data(), ans.size());
        }
        return Status::OK();
    }
};

class Int32ToDateString : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        size_t rows = src_col->size();

        auto& src_data = static_cast<const ColumnVector<int32>*>(src_col.get())->get_data();
        date_day_offset_dict& date_dict = date_day_offset_dict::get();

        auto str_col = static_cast<ColumnString*>(dst_col.get());
        char buf[50];
        for (int i = 0; i < rows; i++) {
            int64_t date_value = (int64_t)src_data[i] + _convert_params->offset_days;
            DateV2Value<DateV2ValueType> value = date_dict[date_value];
            char* end = value.to_string(buf);
            str_col->insert_data(buf, end - buf);
        }

        return Status::OK();
    }
};

class Int96ToTimestampString : public ColumnConvert {
public:
    Status convert(ColumnPtr& src_col, MutableColumnPtr& dst_col) override {
        convert_null(src_col, dst_col);

        auto& src_data = static_cast<const ColumnVector<Int8>*>(src_col.get())->get_data();
        auto dst_data = static_cast<ColumnString*>(dst_col.get());

        size_t rows = src_col->size() / sizeof(ParquetInt96);
        ParquetInt96* data = (ParquetInt96*)src_data.data();

        char buf[50];
        for (int i = 0; i < rows; i++) {
            uint64_t num = 0;
            auto& value = reinterpret_cast<DateV2Value<DateTimeV2ValueType>&>(num);
            int64_t micros = data[i].to_timestamp_micros();
            value.from_unixtime(micros / 1000000, *_convert_params->ctz);
            value.set_microsecond(micros % 1000000);
            char* end = value.to_string(buf);
            dst_data->insert_data(buf, end - buf);
        }
        return Status::OK();
    }
};

inline Status get_converter(tparquet::Type::type parquet_physical_type, PrimitiveType show_type,
                            std::shared_ptr<const IDataType> dst_data_type,
                            std::unique_ptr<ColumnConvert>* converter,
                            ConvertParams* convert_params) {
    auto dst_type = remove_nullable(dst_data_type)->get_type_id();
    switch (dst_type) {
#define DISPATCH(NUMERIC_TYPE, CPP_NUMERIC_TYPE, PHYSICAL_TYPE)                          \
    case NUMERIC_TYPE:                                                                   \
        switch (parquet_physical_type) {                                                 \
        case tparquet::Type::BOOLEAN:                                                    \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::BOOLEAN, CPP_NUMERIC_TYPE>>(); \
            break;                                                                       \
        case tparquet::Type::INT32:                                                      \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::INT32, CPP_NUMERIC_TYPE>>();   \
            break;                                                                       \
        case tparquet::Type::INT64:                                                      \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::INT64, CPP_NUMERIC_TYPE>>();   \
            break;                                                                       \
        case tparquet::Type::FLOAT:                                                      \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::FLOAT, CPP_NUMERIC_TYPE>>();   \
            break;                                                                       \
        case tparquet::Type::DOUBLE:                                                     \
            *converter = std::make_unique<                                               \
                    NumberToNumberConvert<tparquet::Type::DOUBLE, CPP_NUMERIC_TYPE>>();  \
            break;                                                                       \
        default:                                                                         \
            break;                                                                       \
        }                                                                                \
        break;
        FOR_LOGICAL_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH

    case TypeIndex::String: {
        if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) {
            if (show_type == PrimitiveType::TYPE_DECIMAL32) {
                *converter = std::make_unique<StringToDecimalString<Decimal32, Int32>>();
                break;
            } else if (show_type == PrimitiveType::TYPE_DECIMAL64) {
                *converter = std::make_unique<StringToDecimalString<Decimal64, Int64>>();
                break;
            } else if (show_type == PrimitiveType::TYPE_DECIMALV2) {
                *converter = std::make_unique<StringToDecimalString<Decimal128, Int128>>();
                break;
            } else if (show_type == PrimitiveType::TYPE_DECIMAL128I) {
                *converter = std::make_unique<StringToDecimalString<Decimal128, Int128>>();
                break;
            }
        } else if (tparquet::Type::INT96 == parquet_physical_type) {
            *converter = std::make_unique<Int96ToTimestampString>();
            break;
        } else if (tparquet::Type::INT32 == parquet_physical_type) {
            if (show_type == PrimitiveType::TYPE_DATEV2) {
                *converter = std::make_unique<Int32ToDateString>();
                break;
            }
        }

        if (parquet_physical_type == tparquet::Type::BOOLEAN) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::BOOLEAN>>();
        } else if (parquet_physical_type == tparquet::Type::INT32) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::INT32>>();
        } else if (parquet_physical_type == tparquet::Type::INT64) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::INT64>>();
        } else if (parquet_physical_type == tparquet::Type::FLOAT) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::FLOAT>>();
        } else if (parquet_physical_type == tparquet::Type::DOUBLE) {
            *converter = std::make_unique<NumberToStringConvert<tparquet::Type::DOUBLE>>();
        }

        break;
    }
    case TypeIndex::DateV2:
        if (tparquet::Type::INT32 == parquet_physical_type) {
            *converter = std::make_unique<Int32ToDate>();
        }
        break;
    case TypeIndex::DateTimeV2:
        if (tparquet::Type::INT96 == parquet_physical_type) {
            *converter = std::make_unique<Int96toTimestamp>();
        } else if (tparquet::Type::INT64 == parquet_physical_type) {
            *converter = std::make_unique<Int64ToTimestamp>();
        }
        break;
#define DISPATCH2(TypeIndex_DECIMAL_TYPE, DECIMAL_TYPE, PRIMARY_TYPE)                     \
    case TypeIndex_DECIMAL_TYPE: {                                                        \
        convert_params->init_decimal_converter<PRIMARY_TYPE>(dst_data_type);              \
        DecimalScaleParams& scale_params = convert_params->decimal_scale;                 \
        if (tparquet::Type::FIXED_LEN_BYTE_ARRAY == parquet_physical_type) {              \
            size_t string_length = convert_params->field_schema->parquet_schema.type_length; \
            if (string_length <= 8) {                                                     \
                if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {            \
                    *converter =                                                          \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int64_t,       \
                                                             DecimalScaleParams::SCALE_UP>>(); \
                } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {   \
                    *converter =                                                          \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int64_t,       \
                                                             DecimalScaleParams::SCALE_DOWN>>(); \
                } else {                                                                  \
                    *converter =                                                          \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int64_t,       \
                                                             DecimalScaleParams::NO_SCALE>>(); \
                }                                                                         \
            } else if (string_length <= 16) {                                             \
                if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {            \
                    *converter =                                                          \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int128_t,      \
                                                             DecimalScaleParams::SCALE_UP>>(); \
                } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {   \
                    *converter =                                                          \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int128_t,      \
                                                             DecimalScaleParams::SCALE_DOWN>>(); \
                } else {                                                                  \
                    *converter =                                                          \
                            std::make_unique<StringToDecimal<DECIMAL_TYPE, int128_t,      \
                                                             DecimalScaleParams::NO_SCALE>>(); \
                }                                                                         \
            }                                                                             \
        } else if (tparquet::Type::INT32 == parquet_physical_type) {                      \
            if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {                \
                *converter = std::make_unique<NumberToDecimal<Int32, PRIMARY_TYPE, int64_t, \
                                                              DecimalScaleParams::SCALE_UP>>(); \
            } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {       \
                *converter = std::make_unique<NumberToDecimal<Int32, PRIMARY_TYPE, int64_t, \
                                                              DecimalScaleParams::SCALE_DOWN>>(); \
            } else {                                                                      \
                *converter = std::make_unique<NumberToDecimal<Int32, PRIMARY_TYPE, int64_t, \
                                                              DecimalScaleParams::NO_SCALE>>(); \
            }                                                                             \
        } else if (tparquet::Type::INT64 == parquet_physical_type) {                      \
            if (scale_params.scale_type == DecimalScaleParams::SCALE_UP) {                \
                *converter = std::make_unique<NumberToDecimal<Int64, PRIMARY_TYPE, int64_t, \
                                                              DecimalScaleParams::SCALE_UP>>(); \
            } else if (scale_params.scale_type == DecimalScaleParams::SCALE_DOWN) {       \
                *converter = std::make_unique<NumberToDecimal<Int64, PRIMARY_TYPE, int64_t, \
                                                              DecimalScaleParams::SCALE_DOWN>>(); \
            } else {                                                                      \
                *converter = std::make_unique<NumberToDecimal<Int64, PRIMARY_TYPE, int64_t, \
                                                              DecimalScaleParams::NO_SCALE>>(); \
            }                                                                             \
        }                                                                                 \
        break;                                                                            \
    }

        FOR_LOGICAL_DECIMAL_TYPES(DISPATCH2)
#undef DISPATCH2
    default:
        break;
    }

    if (*converter == nullptr) {
        return Status::NotSupported("Can't cast parquet physical type {} to doris logical type {}",
                                    tparquet::to_string(parquet_physical_type),
                                    getTypeName(dst_type));
    }
    (*converter)->_convert_params = convert_params;
    return Status::OK();
}
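
// For example (illustrative): a column whose doris type is Decimal32, read from
// an INT32 parquet column with matching scales, makes the DISPATCH2 expansion
// above pick
//   NumberToDecimal<Int32, Int32, int64_t, DecimalScaleParams::NO_SCALE>
// i.e. a plain per-row cast with no scale adjustment.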

}; // namespace ParquetConvert
}; // namespace doris::vectorized
@ -54,6 +54,11 @@ struct ParquetInt96 {
    inline uint64_t to_timestamp_micros() const {
        return (hi - JULIAN_EPOCH_OFFSET_DAYS) * MICROS_IN_DAY + lo / NANOS_PER_MICROSECOND;
    }
    inline __int128 to_int128() const {
        __int128 ans = 0;
        ans = (((__int128)hi) << 64) + lo;
        return ans;
    }

    static const uint32_t JULIAN_EPOCH_OFFSET_DAYS;
    static const uint64_t MICROS_IN_DAY;
@ -151,4 +156,4 @@ private:
    size_t _num_filtered;
    size_t _read_index;
};
} // namespace doris::vectorized
} // namespace doris::vectorized
@ -88,9 +88,9 @@ private:

    TypeDescriptor convert_to_doris_type(const tparquet::SchemaElement& physical_schema);

public:
    TypeDescriptor get_doris_type(const tparquet::SchemaElement& physical_schema);

public:
    FieldDescriptor() = default;
    ~FieldDescriptor() = default;

@ -54,7 +54,7 @@ ColumnChunkReader::ColumnChunkReader(io::BufferedStreamReader* reader,
          _max_def_level(field_schema->definition_level),
          _stream_reader(reader),
          _metadata(column_chunk->meta_data),
          _ctz(ctz),
          // _ctz(ctz),
          _io_ctx(io_ctx) {}

Status ColumnChunkReader::init() {
@ -194,7 +194,7 @@ Status ColumnChunkReader::load_page_data() {
        // Set type length
        page_decoder->set_type_length(_get_type_length());
        // Initialize the time convert context
        page_decoder->init(_field_schema, _ctz);
        // page_decoder->init(_field_schema, _ctz);
        _decoders[static_cast<int>(encoding)] = std::move(page_decoder);
        _page_decoder = _decoders[static_cast<int>(encoding)].get();
    }
@ -242,7 +242,7 @@ Status ColumnChunkReader::_decode_dict_page() {
    // Set type length
    page_decoder->set_type_length(_get_type_length());
    // Initialize the time convert context
    page_decoder->init(_field_schema, _ctz);
    // page_decoder->init(_field_schema, _ctz);
    // Set the dictionary data
    RETURN_IF_ERROR(page_decoder->set_dict(dict_data, uncompressed_size,
                                           header.dictionary_page_header.num_values));
@ -323,4 +323,4 @@ int32_t ColumnChunkReader::_get_type_length() {
    return -1;
}
}
} // namespace doris::vectorized
} // namespace doris::vectorized
@ -193,7 +193,7 @@ private:

    io::BufferedStreamReader* _stream_reader;
    tparquet::ColumnMetaData _metadata;
    cctz::time_zone* _ctz;
    // cctz::time_zone* _ctz;
    io::IOContext* _io_ctx;

    std::unique_ptr<PageReader> _page_reader = nullptr;

@ -25,6 +25,7 @@
#include <algorithm>
#include <utility>

#include "parquet_column_convert.h"
#include "runtime/define_primitive_type.h"
#include "schema_desc.h"
#include "util/runtime_profile.h"
@ -252,8 +253,9 @@ Status ScalarColumnReader::_read_values(size_t num_values, ColumnPtr& doris_colu
    NullMap* map_data_column = nullptr;
    if (doris_column->is_nullable()) {
        SCOPED_RAW_TIMER(&_decode_null_map_time);
        auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(doris_column)).mutate().get());
        auto* nullable_column =
                static_cast<vectorized::ColumnNullable*>(const_cast<IColumn*>(doris_column.get()));

        data_column = nullable_column->get_nested_column_ptr();
        map_data_column = &(nullable_column->get_null_map_data());
        if (_chunk_reader->max_def_level() > 0) {
@ -360,8 +362,11 @@ Status ScalarColumnReader::_read_nested_column(ColumnPtr& doris_column, DataType
    NullMap* map_data_column = nullptr;
    if (doris_column->is_nullable()) {
        SCOPED_RAW_TIMER(&_decode_null_map_time);
        auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(doris_column)).mutate().get());
        auto* nullable_column = const_cast<vectorized::ColumnNullable*>(
                static_cast<const vectorized::ColumnNullable*>(doris_column.get()));

        // auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
        //         (*std::move(src_column)).mutate().get());
        data_column = nullable_column->get_nested_column_ptr();
        map_data_column = &(nullable_column->get_null_map_data());
    } else {
@ -476,86 +481,108 @@ Status ScalarColumnReader::_try_load_dict_page(bool* loaded, bool* has_dict) {
Status ScalarColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr& type,
                                            ColumnSelectVector& select_vector, size_t batch_size,
                                            size_t* read_rows, bool* eof, bool is_dict_filter) {
    if (_chunk_reader->remaining_num_values() == 0) {
        if (!_chunk_reader->has_next_page()) {
            *eof = true;
            *read_rows = 0;
            return Status::OK();
        }
        RETURN_IF_ERROR(_chunk_reader->next_page());
    }
    if (_nested_column) {
        RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
        return _read_nested_column(doris_column, type, select_vector, batch_size, read_rows, eof,
                                   is_dict_filter);
    }
    bool need_convert = false;
    auto& parquet_physical_type = _chunk_meta.meta_data.type;
    auto& show_type = _field_schema->type.type;

    // generate the row ranges that should be read
    std::list<RowRange> read_ranges;
    _generate_read_ranges(_current_row_index,
                          _current_row_index + _chunk_reader->remaining_num_values(), read_ranges);
    if (read_ranges.size() == 0) {
        // skip the whole page
        _current_row_index += _chunk_reader->remaining_num_values();
        RETURN_IF_ERROR(_chunk_reader->skip_page());
        *read_rows = 0;
    } else {
        bool skip_whole_batch = false;
        // Determining whether to skip a page or batch costs extra calculation time,
        // so we only attempt it when the filtering ratio is greater than 60%.
        if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) {
            // lazy read
            size_t remaining_num_values = 0;
            for (auto& range : read_ranges) {
                remaining_num_values += range.last_row - range.first_row;
            }
            if (batch_size >= remaining_num_values &&
                select_vector.can_filter_all(remaining_num_values)) {
                // We can skip the whole page if the remaining values are filtered out by predicate columns
                select_vector.skip(remaining_num_values);
                _current_row_index += _chunk_reader->remaining_num_values();
                RETURN_IF_ERROR(_chunk_reader->skip_page());
                *read_rows = remaining_num_values;
                if (!_chunk_reader->has_next_page()) {
                    *eof = true;
                }
    ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type,
                                                      doris_column, type, &need_convert);

    do {
        if (_chunk_reader->remaining_num_values() == 0) {
            if (!_chunk_reader->has_next_page()) {
                *eof = true;
                *read_rows = 0;
                return Status::OK();
            }
            skip_whole_batch =
                    batch_size <= remaining_num_values && select_vector.can_filter_all(batch_size);
            if (skip_whole_batch) {
                select_vector.skip(batch_size);
            }
            RETURN_IF_ERROR(_chunk_reader->next_page());
        }
        // load page data to decode or skip values
        RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
        size_t has_read = 0;
        for (auto& range : read_ranges) {
            // generate the skipped values
            size_t skip_values = range.first_row - _current_row_index;
            RETURN_IF_ERROR(_skip_values(skip_values));
            _current_row_index += skip_values;
            // generate the read values
            size_t read_values =
                    std::min((size_t)(range.last_row - range.first_row), batch_size - has_read);
            if (skip_whole_batch) {
                RETURN_IF_ERROR(_skip_values(read_values));
            } else {
                RETURN_IF_ERROR(_read_values(read_values, doris_column, type, select_vector,
                                             is_dict_filter));
            }
            has_read += read_values;
            _current_row_index += read_values;
            if (has_read == batch_size) {
                break;
            }
        if (_nested_column) {
            RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
            RETURN_IF_ERROR(_read_nested_column(src_column, type, select_vector, batch_size,
                                                read_rows, eof, is_dict_filter));
            break;
        }
        *read_rows = has_read;

        // generate the row ranges that should be read
        std::list<RowRange> read_ranges;
        _generate_read_ranges(_current_row_index,
                              _current_row_index + _chunk_reader->remaining_num_values(),
                              read_ranges);
        if (read_ranges.size() == 0) {
            // skip the whole page
            _current_row_index += _chunk_reader->remaining_num_values();
            RETURN_IF_ERROR(_chunk_reader->skip_page());
            *read_rows = 0;
        } else {
            bool skip_whole_batch = false;
            // Determining whether to skip a page or batch costs extra calculation time,
            // so we only attempt it when the filtering ratio is greater than 60%.
            if (select_vector.has_filter() && select_vector.filter_ratio() > 0.6) {
                // lazy read
                size_t remaining_num_values = 0;
                for (auto& range : read_ranges) {
                    remaining_num_values += range.last_row - range.first_row;
                }
                if (batch_size >= remaining_num_values &&
                    select_vector.can_filter_all(remaining_num_values)) {
                    // We can skip the whole page if the remaining values are filtered out by predicate columns
                    select_vector.skip(remaining_num_values);
                    _current_row_index += _chunk_reader->remaining_num_values();
                    RETURN_IF_ERROR(_chunk_reader->skip_page());
                    *read_rows = remaining_num_values;
                    if (!_chunk_reader->has_next_page()) {
                        *eof = true;
                    }
                    break;
                }
                skip_whole_batch = batch_size <= remaining_num_values &&
                                   select_vector.can_filter_all(batch_size);
                if (skip_whole_batch) {
                    select_vector.skip(batch_size);
                }
            }
            // load page data to decode or skip values
            RETURN_IF_ERROR(_chunk_reader->load_page_data_idempotent());
            size_t has_read = 0;
            for (auto& range : read_ranges) {
                // generate the skipped values
                size_t skip_values = range.first_row - _current_row_index;
                RETURN_IF_ERROR(_skip_values(skip_values));
                _current_row_index += skip_values;
                // generate the read values
                size_t read_values =
                        std::min((size_t)(range.last_row - range.first_row), batch_size - has_read);
                if (skip_whole_batch) {
                    RETURN_IF_ERROR(_skip_values(read_values));
                } else {
                    RETURN_IF_ERROR(_read_values(read_values, src_column, type, select_vector,
                                                 is_dict_filter));
                }
                has_read += read_values;
                _current_row_index += read_values;
                if (has_read == batch_size) {
                    break;
                }
            }
            *read_rows = has_read;
        }

        if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) {
            *eof = true;
        }
    } while (false);

    if (need_convert) {
        std::unique_ptr<ParquetConvert::ColumnConvert> converter;
        ParquetConvert::ConvertParams convert_params;
        convert_params.init(_field_schema, _ctz, doris_column->size());
        RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, type,
                                                      &converter, &convert_params));
        auto x = doris_column->assume_mutable();
        RETURN_IF_ERROR(converter->convert(src_column, x));
    }

    if (_chunk_reader->remaining_num_values() == 0 && !_chunk_reader->has_next_page()) {
        *eof = true;
    }
    return Status::OK();
}
@ -732,4 +759,4 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr
    return Status::OK();
}

}; // namespace doris::vectorized
}; // namespace doris::vectorized
@ -288,4 +288,4 @@ private:
    std::vector<std::unique_ptr<ParquetColumnReader>> _child_readers;
};

}; // namespace doris::vectorized
}; // namespace doris::vectorized
@ -175,15 +175,8 @@ Status RowGroupReader::init(

bool RowGroupReader::_can_filter_by_dict(int slot_id,
                                         const tparquet::ColumnMetaData& column_metadata) {
    SlotDescriptor* slot = nullptr;
    const std::vector<SlotDescriptor*>& slots = _tuple_descriptor->slots();
    for (auto each : slots) {
        if (each->id() == slot_id) {
            slot = each;
            break;
        }
    }
    if (!slot->type().is_string_type()) {
    if (column_metadata.encodings[0] != tparquet::Encoding::RLE_DICTIONARY ||
        column_metadata.type != tparquet::Type::BYTE_ARRAY) {
        return false;
    }

@ -336,6 +329,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_
    bool can_filter_all = false;
    RETURN_IF_ERROR_OR_CATCH_EXCEPTION(VExprContext::execute_conjuncts(
            _filter_conjuncts, &filters, block, &result_filter, &can_filter_all));

    if (can_filter_all) {
        for (auto& col : columns_to_filter) {
            std::move(*block->get_by_position(col).column).assume_mutable()->clear();
@ -344,6 +338,7 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_
        _convert_dict_cols_to_string_cols(block);
        return Status::OK();
    }

    if (!_not_single_slot_filter_conjuncts.empty()) {
        _convert_dict_cols_to_string_cols(block);
        std::vector<IColumn::Filter*> merged_filters;
@ -362,7 +357,6 @@ Status RowGroupReader::next_batch(Block* block, size_t batch_size, size_t* read_
        RETURN_IF_CATCH_EXCEPTION(
                RETURN_IF_ERROR(_filter_block(block, column_to_keep, columns_to_filter)));
    }

    *read_rows = block->rows();
    return Status::OK();
}
@ -421,8 +415,10 @@ Status RowGroupReader::_read_column_data(Block* block, const std::vector<std::st
            has_eof = true;
        }
    }

    *read_rows = batch_read_rows;
    *batch_eof = has_eof;

    return Status::OK();
}

@ -1008,4 +1004,4 @@ ParquetColumnReader::Statistics RowGroupReader::statistics() {
    return st;
}

} // namespace doris::vectorized
} // namespace doris::vectorized
@ -19,31 +19,41 @@

#include <gen_cpp/Metrics_types.h>
#include <gen_cpp/PlanNodes_types.h>
#include <gen_cpp/Types_types.h>
#include <gen_cpp/parquet_types.h>
#include <glog/logging.h>

#include <algorithm>
#include <functional>
#include <ostream>
#include <utility>

#include "common/status.h"
#include "exec/schema_scanner.h"
#include "gen_cpp/descriptors.pb.h"
#include "gtest/gtest_pred_impl.h"
#include "io/file_factory.h"
#include "io/fs/buffered_reader.h"
#include "io/fs/file_reader.h"
#include "io/fs/file_reader_writer_fwd.h"
#include "olap/olap_common.h"
#include "parquet_pred_cmp.h"
#include "parquet_thrift_util.h"
#include "runtime/define_primitive_type.h"
#include "runtime/descriptors.h"
#include "runtime/types.h"
#include "util/slice.h"
#include "util/timezone_utils.h"
#include "vec/columns/column.h"
#include "vec/common/typeid_cast.h"
#include "vec/exec/format/format_common.h"
#include "vec/core/block.h"
#include "vec/core/column_with_type_and_name.h"
#include "vec/core/types.h"
#include "vec/exec/format/parquet/parquet_common.h"
#include "vec/exec/format/parquet/schema_desc.h"
#include "vec/exec/format/parquet/vparquet_file_metadata.h"
#include "vec/exec/format/parquet/vparquet_group_reader.h"
#include "vec/exec/format/parquet/vparquet_page_index.h"
#include "vec/exprs/vbloom_predicate.h"
#include "vec/exprs/vexpr.h"
#include "vec/exprs/vexpr_context.h"
#include "vec/exprs/vin_predicate.h"
#include "vec/exprs/vruntimefilter_wrapper.h"
#include "vec/exprs/vslot_ref.h"
@ -520,15 +530,14 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof)
        return Status::OK();
    }

    {
        SCOPED_RAW_TIMER(&_statistics.column_read_time);
        Status batch_st =
                _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof);
        if (!batch_st.ok()) {
            return Status::InternalError("Read parquet file {} failed, reason = {}",
                                         _scan_range.path, batch_st.to_string());
        }
    SCOPED_RAW_TIMER(&_statistics.column_read_time);
    Status batch_st =
            _current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof);
    if (!batch_st.ok()) {
        return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path,
                                     batch_st.to_string());
    }

    if (_row_group_eof) {
        auto column_st = _current_group_reader->statistics();
        _column_statistics.merge(column_st);
@ -897,4 +906,4 @@ int64_t ParquetReader::_get_column_start_offset(const tparquet::ColumnMetaData&
    }
    return column.data_page_offset;
}
} // namespace doris::vectorized
} // namespace doris::vectorized
@ -168,6 +168,7 @@ vectorized::BlockUPtr ScannerContext::get_free_block() {

        block = vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size,
                                                 true /*ignore invalid slots*/);

        COUNTER_UPDATE(_newly_create_free_blocks_num, 1);

    _serving_blocks_num++;

@ -464,4 +464,4 @@ void ScannerScheduler::_task_group_scanner_scan(ScannerScheduler* scheduler,
    }
}

} // namespace doris::vectorized
} // namespace doris::vectorized
@ -1105,4 +1105,4 @@ Status VFileScanner::close(RuntimeState* state) {
    return Status::OK();
}

} // namespace doris::vectorized
} // namespace doris::vectorized
@ -1,16 +1,16 @@
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+

@ -1,14 +1,14 @@
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
|
||||
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTime))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(Date))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
|
||||
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
|
||||
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
|
||||
| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18|
|
||||
| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19| -3.140000000| c-row2| vc-row2| 2022-08-03| 2022-08-03| 2022-08-03 07:23:19|
|
||||
| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17| 4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17|
|
||||
| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17|
|
||||
| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17|
|
||||
| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17|
|
||||
| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17|
|
||||
| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17|
|
||||
| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17|
|
||||
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+---------------------------------+-------------------------------------+--------------------------+-----------------------------+------------------------+-----------------------------+--------------------------------------+
|
||||
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
|
||||
|tinyint_col(Nullable(Int8))|smallint_col(Nullable(Int16))|int_col(Nullable(Int32))|bigint_col(Nullable(Int64))|boolean_col(Nullable(UInt8))|float_col(Nullable(Float32))|double_col(Nullable(Float64))|string_col(Nullable(String))|binary_col(Nullable(String))|timestamp_col(Nullable(DateTimeV2))|decimal_col(Nullable(Decimal(27, 9)))|char_col(Nullable(String))|varchar_col(Nullable(String))|date_col(Nullable(DateV2))|date_v2_col(Nullable(DateV2))|timestamp_v2_col(Nullable(DateTimeV2))|
|
||||
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
|
||||
| -1| -1| -1| -1| 0| -1.14| -1.14| s-row0| b-row0| 2022-08-01 07:23:17.000000| -1.140000000| c-row0| vc-row0| 2022-08-01| 2022-08-01| 2022-08-01 07:23:17|
|
||||
| 2| 2| 2| 2| 1| 2.14| 2.14| NULL| b-row1| 2022-08-02 07:23:18.000000| 2.140000000| c-row1| vc-row1| 2022-08-02| 2022-08-02| 2022-08-02 07:23:18|
|
||||
| -3| -3| -3| -3| 0| -3.14| -3.14| s-row2| b-row2| 2022-08-03 07:23:19.000000| -3.140000000| c-row2| vc-row2| 2022-08-03| 2022-08-03| 2022-08-03 07:23:19|
|
||||
| 4| 4| 4| 4| 1| 4.14| 4.14| NULL| b-row3| 2022-08-04 07:24:17.000000| 4.140000000| c-row3| vc-row3| 2022-08-04| 2022-08-04| 2022-08-04 07:24:17|
|
||||
| -5| -5| -5| -5| 0| -5.14| -5.14| s-row4| b-row4| 2022-08-05 07:25:17.000000| -5.140000000| c-row4| vc-row4| 2022-08-05| 2022-08-05| 2022-08-05 07:25:17|
|
||||
| 6| 6| 6| 6| 0| 6.14| 6.14| s-row5| b-row5| 2022-08-06 07:26:17.000000| 6.140000000| c-row5| vc-row5| 2022-08-06| 2022-08-06| 2022-08-06 07:26:17|
|
||||
| -7| -7| -7| -7| 1| -7.14| -7.14| s-row6| b-row6| 2022-08-07 07:27:17.000000| -7.140000000| c-row6| vc-row6| 2022-08-07| 2022-08-07| 2022-08-07 07:27:17|
|
||||
| 8| 8| 8| 8| 0| 8.14| 8.14| NULL| b-row7| 2022-08-08 07:28:17.000000| 8.140000000| c-row7| vc-row7| 2022-08-08| 2022-08-08| 2022-08-08 07:28:17|
|
||||
| -9| -9| -9| -9| 0| -9.14| -9.14| s-row8| b-row8| 2022-08-09 07:29:17.000000| -9.140000000| c-row8| vc-row8| 2022-08-09| 2022-08-09| 2022-08-09 07:29:17|
|
||||
| 10| 10| 10| 10| 0| 10.14| 10.14| s-row9| b-row9| 2022-08-10 07:21:17.000000| 10.140000000| c-row9| vc-row9| 2022-08-10| 2022-08-10| 2022-08-10 07:21:17|
|
||||
+---------------------------+-----------------------------+------------------------+---------------------------+----------------------------+----------------------------+-----------------------------+----------------------------+----------------------------+-----------------------------------+-------------------------------------+--------------------------+-----------------------------+--------------------------+-----------------------------+--------------------------------------+
|
||||
|
||||
@ -59,6 +59,7 @@
#include "vec/core/column_with_type_and_name.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_factory.hpp"
#include "vec/exec/format/parquet/parquet_column_convert.h"
#include "vec/exec/format/parquet/parquet_common.h"
#include "vec/exec/format/parquet/parquet_thrift_util.h"
#include "vec/exec/format/parquet/schema_desc.h"
@ -167,8 +168,8 @@ TEST_F(ParquetThriftReaderTest, complex_nested_file) {

static int fill_nullable_column(ColumnPtr& doris_column, level_t* definitions, size_t num_values) {
    CHECK(doris_column->is_nullable());
    auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
            (*std::move(doris_column)).mutate().get());
    auto* nullable_column = const_cast<vectorized::ColumnNullable*>(
            static_cast<const vectorized::ColumnNullable*>(doris_column.get()));
    NullMap& map_data = nullable_column->get_null_map_data();
    int null_cnt = 0;
    for (int i = 0; i < num_values; ++i) {
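For context on what fill_nullable_column computes from the definition levels: for a flat (non-nested) nullable column, a definition level of 0 marks a NULL row and 1 marks a present value. Below is a minimal standalone sketch of that mapping, assuming a 16-bit level_t; the names are illustrative, not Doris APIs.

#include <cstdint>
#include <cstdio>
#include <vector>

using level_t = int16_t; // assumption: definition levels fit in 16 bits

// Fill a null map from parquet definition levels and return the null count.
// For a flat nullable column: def level 0 => NULL, 1 => value present.
static int fill_null_map(const std::vector<level_t>& definitions,
                         std::vector<uint8_t>& null_map) {
    int null_cnt = 0;
    null_map.resize(definitions.size());
    for (size_t i = 0; i < definitions.size(); ++i) {
        null_map[i] = (definitions[i] == 0) ? 1 : 0; // 1 marks a NULL row
        null_cnt += null_map[i];
    }
    return null_cnt;
}

int main() {
    std::vector<level_t> defs = {1, 0, 1, 0, 0};
    std::vector<uint8_t> null_map;
    std::printf("nulls = %d\n", fill_null_map(defs, null_map)); // prints: nulls = 3
    return 0;
}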
@ -189,6 +190,14 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
                                  ? chunk_meta.dictionary_page_offset
                                  : chunk_meta.data_page_offset;
    size_t chunk_size = chunk_meta.total_compressed_size;

    bool need_convert = false;
    auto& parquet_physical_type = column_chunk->meta_data.type;
    auto& show_type = field_schema->type.type;

    ColumnPtr src_column = ParquetConvert::get_column(parquet_physical_type, show_type,
                                                      doris_column, data_type, &need_convert);

    io::BufferedFileStreamReader stream_reader(file_reader, start_offset, chunk_size, 1024);

    cctz::time_zone ctz;
@ -208,14 +217,14 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
        chunk_reader.get_def_levels(definitions, rows);
    }
    MutableColumnPtr data_column;
    if (doris_column->is_nullable()) {
    if (src_column->is_nullable()) {
        // fill nullable values
        fill_nullable_column(doris_column, definitions, rows);
        auto* nullable_column = reinterpret_cast<vectorized::ColumnNullable*>(
                (*std::move(doris_column)).mutate().get());
        fill_nullable_column(src_column, definitions, rows);
        auto* nullable_column = const_cast<vectorized::ColumnNullable*>(
                static_cast<const vectorized::ColumnNullable*>(src_column.get()));
        data_column = nullable_column->get_nested_column_ptr();
    } else {
        data_column = doris_column->assume_mutable();
        data_column = src_column->assume_mutable();
    }
    ColumnSelectVector run_length_map;
    // decode page data
@ -223,7 +232,7 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
        // required column
        std::vector<u_short> null_map = {(u_short)rows};
        run_length_map.set_run_length_null_map(null_map, rows, nullptr);
        return chunk_reader.decode_values(data_column, data_type, run_length_map, false);
        RETURN_IF_ERROR(chunk_reader.decode_values(data_column, data_type, run_length_map, false));
    } else {
        // column with null values
        level_t level_type = definitions[0];
@ -254,8 +263,18 @@ static Status get_column_values(io::FileReaderSPtr file_reader, tparquet::Column
        RETURN_IF_ERROR(
                chunk_reader.decode_values(data_column, data_type, run_length_map, false));
    }
    return Status::OK();
}
    if (need_convert) {
        std::unique_ptr<ParquetConvert::ColumnConvert> converter;
        ParquetConvert::ConvertParams convert_params;
        convert_params.init(field_schema, &ctz, doris_column->size());
        RETURN_IF_ERROR(ParquetConvert::get_converter(parquet_physical_type, show_type, data_type,
                                                      &converter, &convert_params));
        auto x = doris_column->assume_mutable();
        RETURN_IF_ERROR(converter->convert(src_column, x));
    }

    return Status::OK();
}

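The need_convert branch above is the heart of the reworked decode path: values are first decoded into a column matching the parquet physical type (src_column), and only afterwards converted into the column of the table's current logical type. A minimal, self-contained sketch of that two-phase idea follows; PhysicalInt32Column, convert_int32_to_int64, and the widening rule are illustrative assumptions, not the actual ParquetConvert API.

#include <cstdint>
#include <vector>

// Phase 1 result: raw values decoded in the parquet *physical* type
// (here INT32), regardless of what the hive table now declares.
struct PhysicalInt32Column {
    std::vector<int32_t> values;
};

// Phase 2: convert to the table's current *logical* type. After an
// alter such as INT -> BIGINT, old files still store INT32, so the
// reader widens losslessly instead of failing on the type mismatch.
static std::vector<int64_t> convert_int32_to_int64(const PhysicalInt32Column& src) {
    std::vector<int64_t> dst;
    dst.reserve(src.values.size());
    for (int32_t v : src.values) {
        dst.push_back(static_cast<int64_t>(v)); // lossless widening
    }
    return dst;
}

int main() {
    PhysicalInt32Column src{{-1, 2, -3}};                   // phase 1 (decode stubbed)
    std::vector<int64_t> out = convert_int32_to_int64(src); // phase 2 (conversion)
    return out.size() == 3 ? 0 : 1;
}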
// Only the unit test depends on this, but it is wrong, should not use TTupleDesc to create tuple desc, not
@ -340,11 +359,11 @@ static void create_block(std::unique_ptr<vectorized::Block>& block) {
            // binary is not supported, use string instead
            {"binary_col", TYPE_STRING, sizeof(StringRef), true},
            // 64-bit-length, see doris::get_slot_size in primitive_type.cpp
            {"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true},
            {"timestamp_col", TYPE_DATETIMEV2, sizeof(int128_t), true},
            {"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true},
            {"char_col", TYPE_CHAR, sizeof(StringRef), true},
            {"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true},
            {"date_col", TYPE_DATE, sizeof(int128_t), true},
            {"date_col", TYPE_DATEV2, sizeof(uint32_t), true},
            {"date_v2_col", TYPE_DATEV2, sizeof(uint32_t), true},
            {"timestamp_v2_col", TYPE_DATETIMEV2, sizeof(int128_t), true, 18, 0}};
    SchemaScanner schema_scanner(column_descs);
@ -448,118 +467,6 @@ TEST_F(ParquetThriftReaderTest, dict_decoder) {
    read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/dict-decoder.parquet",
                                "./be/test/exec/test_data/parquet_scanner/dict-decoder.txt", 12);
}

TEST_F(ParquetThriftReaderTest, group_reader) {
    std::vector<doris::SchemaScanner::ColumnDesc> column_descs = {
            {"tinyint_col", TYPE_TINYINT, sizeof(int8_t), true},
            {"smallint_col", TYPE_SMALLINT, sizeof(int16_t), true},
            {"int_col", TYPE_INT, sizeof(int32_t), true},
            {"bigint_col", TYPE_BIGINT, sizeof(int64_t), true},
            {"boolean_col", TYPE_BOOLEAN, sizeof(bool), true},
            {"float_col", TYPE_FLOAT, sizeof(float_t), true},
            {"double_col", TYPE_DOUBLE, sizeof(double_t), true},
            {"string_col", TYPE_STRING, sizeof(StringRef), true},
            {"binary_col", TYPE_STRING, sizeof(StringRef), true},
            {"timestamp_col", TYPE_DATETIME, sizeof(int128_t), true},
            {"decimal_col", TYPE_DECIMALV2, sizeof(DecimalV2Value), true},
            {"char_col", TYPE_CHAR, sizeof(StringRef), true},
            {"varchar_col", TYPE_VARCHAR, sizeof(StringRef), true},
            {"date_col", TYPE_DATE, sizeof(int128_t), true}};
    SchemaScanner schema_scanner(column_descs);
    ObjectPool object_pool;
    doris::TupleDescriptor* tuple_desc = create_tuple_desc(&object_pool, column_descs);
    auto tuple_slots = tuple_desc->slots();

    TSlotDescriptor tslot_desc;
    {
        tslot_desc.id = 14;
        tslot_desc.parent = 0;
        TTypeDesc type;
        {
            TTypeNode node;
            node.__set_type(TTypeNodeType::ARRAY);
            std::vector<bool> contains_nulls {true};
            node.__set_contains_nulls(contains_nulls);
            TTypeNode inner;
            inner.__set_type(TTypeNodeType::SCALAR);
            TScalarType scalar_type;
            scalar_type.__set_type(TPrimitiveType::STRING);
            inner.__set_scalar_type(scalar_type);
            inner.__set_contains_nulls(contains_nulls);
            type.types.push_back(node);
            type.types.push_back(inner);
        }
        tslot_desc.slotType = type;
        tslot_desc.columnPos = 14;
        tslot_desc.byteOffset = 0;
        tslot_desc.nullIndicatorByte = 0;
        tslot_desc.nullIndicatorBit = -1;
        tslot_desc.colName = "list_string";
        tslot_desc.slotIdx = 14;
        tslot_desc.isMaterialized = true;
    }
    SlotDescriptor string_slot(tslot_desc);
    tuple_slots.emplace_back(&string_slot);

    std::vector<std::string> read_columns;
    RowGroupReader::LazyReadContext lazy_read_ctx;
    for (const auto& slot : tuple_slots) {
        lazy_read_ctx.all_read_columns.emplace_back(slot->col_name());
        read_columns.emplace_back(slot->col_name());
    }
    io::FileSystemSPtr local_fs = io::LocalFileSystem::create("");
    io::FileReaderSPtr file_reader;
    auto st = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet",
                                  &file_reader);
    EXPECT_TRUE(st.ok());

    // prepare metadata
    FileMetaData* meta_data;
    size_t meta_size;
    static_cast<void>(parse_thrift_footer(file_reader, &meta_data, &meta_size, nullptr));
    tparquet::FileMetaData t_metadata = meta_data->to_thrift();

    cctz::time_zone ctz;
    TimezoneUtils::find_cctz_time_zone(TimezoneUtils::default_time_zone, ctz);
    auto row_group = t_metadata.row_groups[0];
    std::shared_ptr<RowGroupReader> row_group_reader;
    RowGroupReader::PositionDeleteContext position_delete_ctx(row_group.num_rows, 0);
    row_group_reader.reset(new RowGroupReader(file_reader, read_columns, 0, row_group, &ctz,
                                              nullptr, position_delete_ctx, lazy_read_ctx,
                                              nullptr));
    std::vector<RowRange> row_ranges;
    row_ranges.emplace_back(0, row_group.num_rows);

    auto col_offsets = std::unordered_map<int, tparquet::OffsetIndex>();
    auto stg = row_group_reader->init(meta_data->schema(), row_ranges, col_offsets, nullptr,
                                      nullptr, nullptr, nullptr, nullptr);
    EXPECT_TRUE(stg.ok());

    vectorized::Block block;
    for (const auto& slot_desc : tuple_slots) {
        auto data_type =
                vectorized::DataTypeFactory::instance().create_data_type(slot_desc->type(), true);
        MutableColumnPtr data_column = data_type->create_column();
        block.insert(
                ColumnWithTypeAndName(std::move(data_column), data_type, slot_desc->col_name()));
    }
    bool batch_eof = false;
    size_t read_rows = 0;
    auto stb = row_group_reader->next_batch(&block, 1024, &read_rows, &batch_eof);
    EXPECT_TRUE(stb.ok());

    io::FileReaderSPtr result;
    auto rst = local_fs->open_file("./be/test/exec/test_data/parquet_scanner/group-reader.txt",
                                   &result);
    EXPECT_TRUE(rst.ok());
    uint8_t result_buf[result->size() + 1];
    result_buf[result->size()] = '\0';
    size_t bytes_read;
    Slice res(result_buf, result->size());
    static_cast<void>(result->read_at(0, res, &bytes_read));
    ASSERT_STREQ(block.dump_data(0, 10).c_str(), reinterpret_cast<char*>(result_buf));
    delete meta_data;
}
} // namespace vectorized

} // namespace doris

File diff suppressed because it is too large
@ -0,0 +1,228 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

suite("test_hive_parquet_alter_column", "p2,external,hive,external_remote,external_remote_hive") {
    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
    if (enabled != null && enabled.equalsIgnoreCase("true")) {
        String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
        String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
        String hms_port = context.config.otherConfigs.get("hms_port")

        String catalog_name = "test_hive_parquet_alter_column"
        sql """drop catalog if exists ${catalog_name};"""
        sql """
            create catalog if not exists ${catalog_name} properties (
                'type'='hms',
                'hadoop.username' = 'hadoop',
                'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
            );
        """
        logger.info("catalog " + catalog_name + " created")
        sql """switch ${catalog_name};"""
        logger.info("switched to catalog " + catalog_name)
        String Orderby = """ order by col_int,col_smallint,col_tinyint,col_bigint,col_float,col_double,col_boolean,col_string,col_char,col_varchar,col_decimal,col_date,col_timestamp limit 7 """

        sql """ use multi_catalog """

        types = ["int","smallint","tinyint","bigint","float","double","boolean","string","char","varchar","date","timestamp","decimal"]

        for( String type1 in types) {
            qt_desc """ desc parquet_alter_column_to_${type1} ; """

            qt_show """ select * from parquet_alter_column_to_${type1} ${Orderby} """

            for( String type2 in types) {

                qt_order """ select col_${type2} from parquet_alter_column_to_${type1} order by col_${type2} limit 3 """

            }
        }

        order_qt_int_int """ select col_int from parquet_alter_column_to_int where col_int>=2 order by col_int limit 3"""
        order_qt_int_smallint """ select col_smallint from parquet_alter_column_to_int where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_int_tinyint """ select col_tinyint from parquet_alter_column_to_int where col_tinyint>=3 order by col_tinyint limit 3"""
        order_qt_int_bigint """ select col_bigint from parquet_alter_column_to_int where col_bigint>=3 order by col_bigint limit 3"""
        order_qt_int_float """ select col_float from parquet_alter_column_to_int where col_float=2.6 order by col_float limit 3"""
        order_qt_int_double """ select col_double from parquet_alter_column_to_int where col_double=0.8 order by col_double limit 3"""
        order_qt_int_boolean """ select col_boolean from parquet_alter_column_to_int where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_int_string """ select col_string from parquet_alter_column_to_int where col_string="B" order by col_string limit 3"""
        order_qt_int_char """ select col_char from parquet_alter_column_to_int where col_char="B" order by col_char limit 3"""
        order_qt_int_varchar """ select col_varchar from parquet_alter_column_to_int where col_varchar="C" order by col_varchar limit 3"""
        order_qt_int_date """ select col_date from parquet_alter_column_to_int where year(col_date)=2023 order by col_date limit 3"""
        order_qt_int_timestamp """ select col_timestamp from parquet_alter_column_to_int where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_int_decimal """ select col_decimal from parquet_alter_column_to_int where col_decimal=1.1 order by col_decimal limit 3"""
        order_qt_smallint_int """ select col_int from parquet_alter_column_to_smallint where col_int>=1 order by col_int limit 3"""
        order_qt_smallint_smallint """ select col_smallint from parquet_alter_column_to_smallint where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_smallint_tinyint """ select col_tinyint from parquet_alter_column_to_smallint where col_tinyint>=2 order by col_tinyint limit 3"""
        order_qt_smallint_bigint """ select col_bigint from parquet_alter_column_to_smallint where col_bigint>=2 order by col_bigint limit 3"""
        order_qt_smallint_float """ select col_float from parquet_alter_column_to_smallint where col_float=3.0 order by col_float limit 3"""
        order_qt_smallint_double """ select col_double from parquet_alter_column_to_smallint where col_double=0.5 order by col_double limit 3"""
        order_qt_smallint_boolean """ select col_boolean from parquet_alter_column_to_smallint where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_smallint_string """ select col_string from parquet_alter_column_to_smallint where col_string="helloworld" order by col_string limit 3"""
        order_qt_smallint_char """ select col_char from parquet_alter_column_to_smallint where col_char="C" order by col_char limit 3"""
        order_qt_smallint_varchar """ select col_varchar from parquet_alter_column_to_smallint where col_varchar="A" order by col_varchar limit 3"""
        order_qt_smallint_date """ select col_date from parquet_alter_column_to_smallint where year(col_date)=2023 order by col_date limit 3"""
        order_qt_smallint_timestamp """ select col_timestamp from parquet_alter_column_to_smallint where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_smallint_decimal """ select col_decimal from parquet_alter_column_to_smallint where col_decimal=2.5 order by col_decimal limit 3"""
        order_qt_tinyint_int """ select col_int from parquet_alter_column_to_tinyint where col_int>=3 order by col_int limit 3"""
        order_qt_tinyint_smallint """ select col_smallint from parquet_alter_column_to_tinyint where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_tinyint_tinyint """ select col_tinyint from parquet_alter_column_to_tinyint where col_tinyint>=3 order by col_tinyint limit 3"""
        order_qt_tinyint_bigint """ select col_bigint from parquet_alter_column_to_tinyint where col_bigint>=1 order by col_bigint limit 3"""
        order_qt_tinyint_float """ select col_float from parquet_alter_column_to_tinyint where col_float=0.6 order by col_float limit 3"""
        order_qt_tinyint_double """ select col_double from parquet_alter_column_to_tinyint where col_double=1.1 order by col_double limit 3"""
        order_qt_tinyint_boolean """ select col_boolean from parquet_alter_column_to_tinyint where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_tinyint_string """ select col_string from parquet_alter_column_to_tinyint where col_string="helloworld" order by col_string limit 3"""
        order_qt_tinyint_char """ select col_char from parquet_alter_column_to_tinyint where col_char="A" order by col_char limit 3"""
        order_qt_tinyint_varchar """ select col_varchar from parquet_alter_column_to_tinyint where col_varchar="C" order by col_varchar limit 3"""
        order_qt_tinyint_date """ select col_date from parquet_alter_column_to_tinyint where year(col_date)=2023 order by col_date limit 3"""
        order_qt_tinyint_timestamp """ select col_timestamp from parquet_alter_column_to_tinyint where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_tinyint_decimal """ select col_decimal from parquet_alter_column_to_tinyint where col_decimal=1.4 order by col_decimal limit 3"""
        order_qt_bigint_int """ select col_int from parquet_alter_column_to_bigint where col_int>=3 order by col_int limit 3"""
        order_qt_bigint_smallint """ select col_smallint from parquet_alter_column_to_bigint where col_smallint>=2 order by col_smallint limit 3"""
        order_qt_bigint_tinyint """ select col_tinyint from parquet_alter_column_to_bigint where col_tinyint>=2 order by col_tinyint limit 3"""
        order_qt_bigint_bigint """ select col_bigint from parquet_alter_column_to_bigint where col_bigint>=1 order by col_bigint limit 3"""
        order_qt_bigint_float """ select col_float from parquet_alter_column_to_bigint where col_float=2.5 order by col_float limit 3"""
        order_qt_bigint_double """ select col_double from parquet_alter_column_to_bigint where col_double=0.2 order by col_double limit 3"""
        order_qt_bigint_boolean """ select col_boolean from parquet_alter_column_to_bigint where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_bigint_string """ select col_string from parquet_alter_column_to_bigint where col_string="A" order by col_string limit 3"""
        order_qt_bigint_char """ select col_char from parquet_alter_column_to_bigint where col_char="A" order by col_char limit 3"""
        order_qt_bigint_varchar """ select col_varchar from parquet_alter_column_to_bigint where col_varchar="A" order by col_varchar limit 3"""
        order_qt_bigint_date """ select col_date from parquet_alter_column_to_bigint where year(col_date)=2023 order by col_date limit 3"""
        order_qt_bigint_timestamp """ select col_timestamp from parquet_alter_column_to_bigint where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_bigint_decimal """ select col_decimal from parquet_alter_column_to_bigint where col_decimal=0.8 order by col_decimal limit 3"""
        order_qt_float_int """ select col_int from parquet_alter_column_to_float where col_int=1.4 order by col_int limit 3"""
        order_qt_float_smallint """ select col_smallint from parquet_alter_column_to_float where col_smallint=0.3 order by col_smallint limit 3"""
        order_qt_float_tinyint """ select col_tinyint from parquet_alter_column_to_float where col_tinyint=0.2 order by col_tinyint limit 3"""
        order_qt_float_bigint """ select col_bigint from parquet_alter_column_to_float where col_bigint=2.2 order by col_bigint limit 3"""
        order_qt_float_float """ select col_float from parquet_alter_column_to_float where col_float=1.2 order by col_float limit 3"""
        order_qt_float_double """ select col_double from parquet_alter_column_to_float where col_double=1.5 order by col_double limit 3"""
        order_qt_float_boolean """ select col_boolean from parquet_alter_column_to_float where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_float_string """ select col_string from parquet_alter_column_to_float where col_string="A" order by col_string limit 3"""
        order_qt_float_char """ select col_char from parquet_alter_column_to_float where col_char="helloworld" order by col_char limit 3"""
        order_qt_float_varchar """ select col_varchar from parquet_alter_column_to_float where col_varchar="1" order by col_varchar limit 3"""
        order_qt_float_date """ select col_date from parquet_alter_column_to_float where year(col_date)=2023 order by col_date limit 3"""
        order_qt_float_timestamp """ select col_timestamp from parquet_alter_column_to_float where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_float_decimal """ select col_decimal from parquet_alter_column_to_float where col_decimal=0.8 order by col_decimal limit 3"""
        order_qt_double_int """ select col_int from parquet_alter_column_to_double where col_int=2.0 order by col_int limit 3"""
        order_qt_double_smallint """ select col_smallint from parquet_alter_column_to_double where col_smallint=2.0 order by col_smallint limit 3"""
        order_qt_double_tinyint """ select col_tinyint from parquet_alter_column_to_double where col_tinyint=1.4 order by col_tinyint limit 3"""
        order_qt_double_bigint """ select col_bigint from parquet_alter_column_to_double where col_bigint=1.5 order by col_bigint limit 3"""
        order_qt_double_float """ select col_float from parquet_alter_column_to_double where col_float=2.2 order by col_float limit 3"""
        order_qt_double_double """ select col_double from parquet_alter_column_to_double where col_double=0.6 order by col_double limit 3"""
        order_qt_double_boolean """ select col_boolean from parquet_alter_column_to_double where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_double_string """ select col_string from parquet_alter_column_to_double where col_string="B" order by col_string limit 3"""
        order_qt_double_char """ select col_char from parquet_alter_column_to_double where col_char="A" order by col_char limit 3"""
        order_qt_double_varchar """ select col_varchar from parquet_alter_column_to_double where col_varchar="C" order by col_varchar limit 3"""
        order_qt_double_date """ select col_date from parquet_alter_column_to_double where year(col_date)=2023 order by col_date limit 3"""
        order_qt_double_timestamp """ select col_timestamp from parquet_alter_column_to_double where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_double_decimal """ select col_decimal from parquet_alter_column_to_double where col_decimal=0.3 order by col_decimal limit 3"""
        order_qt_boolean_int """ select col_int from parquet_alter_column_to_boolean where col_int>=3 order by col_int limit 3"""
        order_qt_boolean_smallint """ select col_smallint from parquet_alter_column_to_boolean where col_smallint>=2 order by col_smallint limit 3"""
        order_qt_boolean_tinyint """ select col_tinyint from parquet_alter_column_to_boolean where col_tinyint>=1 order by col_tinyint limit 3"""
        order_qt_boolean_bigint """ select col_bigint from parquet_alter_column_to_boolean where col_bigint>=3 order by col_bigint limit 3"""
        order_qt_boolean_float """ select col_float from parquet_alter_column_to_boolean where col_float=1.1 order by col_float limit 3"""
        order_qt_boolean_double """ select col_double from parquet_alter_column_to_boolean where col_double=0.5 order by col_double limit 3"""
        order_qt_boolean_boolean """ select col_boolean from parquet_alter_column_to_boolean where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_boolean_string """ select col_string from parquet_alter_column_to_boolean where col_string="1" order by col_string limit 3"""
        order_qt_boolean_char """ select col_char from parquet_alter_column_to_boolean where col_char="A" order by col_char limit 3"""
        order_qt_boolean_varchar """ select col_varchar from parquet_alter_column_to_boolean where col_varchar="B" order by col_varchar limit 3"""
        order_qt_boolean_date """ select col_date from parquet_alter_column_to_boolean where year(col_date)=2023 order by col_date limit 3"""
        order_qt_boolean_timestamp """ select col_timestamp from parquet_alter_column_to_boolean where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_boolean_decimal """ select col_decimal from parquet_alter_column_to_boolean where col_decimal=2.8 order by col_decimal limit 3"""
        order_qt_string_int """ select col_int from parquet_alter_column_to_string where col_int="C" order by col_int limit 3"""
        order_qt_string_smallint """ select col_smallint from parquet_alter_column_to_string where col_smallint="C" order by col_smallint limit 3"""
        order_qt_string_tinyint """ select col_tinyint from parquet_alter_column_to_string where col_tinyint="B" order by col_tinyint limit 3"""
        order_qt_string_bigint """ select col_bigint from parquet_alter_column_to_string where col_bigint="helloworld" order by col_bigint limit 3"""
        order_qt_string_float """ select col_float from parquet_alter_column_to_string where col_float="1" order by col_float limit 3"""
        order_qt_string_double """ select col_double from parquet_alter_column_to_string where col_double="C" order by col_double limit 3"""
        order_qt_string_boolean """ select col_boolean from parquet_alter_column_to_string where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_string_string """ select col_string from parquet_alter_column_to_string where col_string="B" order by col_string limit 3"""
        order_qt_string_char """ select col_char from parquet_alter_column_to_string where col_char="A" order by col_char limit 3"""
        order_qt_string_varchar """ select col_varchar from parquet_alter_column_to_string where col_varchar="B" order by col_varchar limit 3"""
        order_qt_string_date """ select col_date from parquet_alter_column_to_string where col_date="helloworld" order by col_date limit 3"""
        order_qt_string_timestamp """ select col_timestamp from parquet_alter_column_to_string where col_timestamp="B" order by col_timestamp limit 3"""
        order_qt_string_decimal """ select col_decimal from parquet_alter_column_to_string where col_decimal="1" order by col_decimal limit 3"""
        order_qt_char_int """ select col_int from parquet_alter_column_to_char where col_int="B" order by col_int limit 3"""
        order_qt_char_smallint """ select col_smallint from parquet_alter_column_to_char where col_smallint="A" order by col_smallint limit 3"""
        order_qt_char_tinyint """ select col_tinyint from parquet_alter_column_to_char where col_tinyint="A" order by col_tinyint limit 3"""
        order_qt_char_bigint """ select col_bigint from parquet_alter_column_to_char where col_bigint="B" order by col_bigint limit 3"""
        order_qt_char_float """ select col_float from parquet_alter_column_to_char where col_float="C" order by col_float limit 3"""
        order_qt_char_double """ select col_double from parquet_alter_column_to_char where col_double="A" order by col_double limit 3"""
        order_qt_char_boolean """ select col_boolean from parquet_alter_column_to_char where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_char_string """ select col_string from parquet_alter_column_to_char where col_string="C" order by col_string limit 3"""
        order_qt_char_char """ select col_char from parquet_alter_column_to_char where col_char="A" order by col_char limit 3"""
        order_qt_char_varchar """ select col_varchar from parquet_alter_column_to_char where col_varchar="B" order by col_varchar limit 3"""
        order_qt_char_date """ select col_date from parquet_alter_column_to_char where col_date="B" order by col_date limit 3"""
        order_qt_char_timestamp """ select col_timestamp from parquet_alter_column_to_char where col_timestamp="A" order by col_timestamp limit 3"""
        order_qt_char_decimal """ select col_decimal from parquet_alter_column_to_char where col_decimal="C" order by col_decimal limit 3"""
        order_qt_varchar_int """ select col_int from parquet_alter_column_to_varchar where col_int="B" order by col_int limit 3"""
        order_qt_varchar_smallint """ select col_smallint from parquet_alter_column_to_varchar where col_smallint="helloworld" order by col_smallint limit 3"""
        order_qt_varchar_tinyint """ select col_tinyint from parquet_alter_column_to_varchar where col_tinyint="A" order by col_tinyint limit 3"""
        order_qt_varchar_bigint """ select col_bigint from parquet_alter_column_to_varchar where col_bigint="helloworld" order by col_bigint limit 3"""
        order_qt_varchar_float """ select col_float from parquet_alter_column_to_varchar where col_float="1" order by col_float limit 3"""
        order_qt_varchar_double """ select col_double from parquet_alter_column_to_varchar where col_double="B" order by col_double limit 3"""
        order_qt_varchar_boolean """ select col_boolean from parquet_alter_column_to_varchar where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_varchar_string """ select col_string from parquet_alter_column_to_varchar where col_string="A" order by col_string limit 3"""
        order_qt_varchar_char """ select col_char from parquet_alter_column_to_varchar where col_char="B" order by col_char limit 3"""
        order_qt_varchar_varchar """ select col_varchar from parquet_alter_column_to_varchar where col_varchar="B" order by col_varchar limit 3"""
        order_qt_varchar_date """ select col_date from parquet_alter_column_to_varchar where col_date="C" order by col_date limit 3"""
        order_qt_varchar_timestamp """ select col_timestamp from parquet_alter_column_to_varchar where col_timestamp="C" order by col_timestamp limit 3"""
        order_qt_varchar_decimal """ select col_decimal from parquet_alter_column_to_varchar where col_decimal="helloworld" order by col_decimal limit 3"""
        order_qt_date_int """ select col_int from parquet_alter_column_to_date where col_int>=3 order by col_int limit 3"""
        order_qt_date_smallint """ select col_smallint from parquet_alter_column_to_date where col_smallint>=1 order by col_smallint limit 3"""
        order_qt_date_tinyint """ select col_tinyint from parquet_alter_column_to_date where col_tinyint>=3 order by col_tinyint limit 3"""
        order_qt_date_bigint """ select col_bigint from parquet_alter_column_to_date where col_bigint>=1 order by col_bigint limit 3"""
        order_qt_date_float """ select col_float from parquet_alter_column_to_date where col_float=2.8 order by col_float limit 3"""
        order_qt_date_double """ select col_double from parquet_alter_column_to_date where col_double=2.5 order by col_double limit 3"""
        order_qt_date_boolean """ select col_boolean from parquet_alter_column_to_date where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_date_string """ select col_string from parquet_alter_column_to_date where col_string="helloworld" order by col_string limit 3"""
        order_qt_date_char """ select col_char from parquet_alter_column_to_date where col_char="A" order by col_char limit 3"""
        order_qt_date_varchar """ select col_varchar from parquet_alter_column_to_date where col_varchar="1" order by col_varchar limit 3"""
        order_qt_date_date """ select col_date from parquet_alter_column_to_date where year(col_date)=2023 order by col_date limit 3"""
        order_qt_date_timestamp """ select col_timestamp from parquet_alter_column_to_date where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_date_decimal """ select col_decimal from parquet_alter_column_to_date where col_decimal=0.3 order by col_decimal limit 3"""
        order_qt_timestamp_int """ select col_int from parquet_alter_column_to_timestamp where col_int>=3 order by col_int limit 3"""
        order_qt_timestamp_smallint """ select col_smallint from parquet_alter_column_to_timestamp where col_smallint>=3 order by col_smallint limit 3"""
        order_qt_timestamp_tinyint """ select col_tinyint from parquet_alter_column_to_timestamp where col_tinyint>=1 order by col_tinyint limit 3"""
        order_qt_timestamp_bigint """ select col_bigint from parquet_alter_column_to_timestamp where col_bigint>=3 order by col_bigint limit 3"""
        order_qt_timestamp_float """ select col_float from parquet_alter_column_to_timestamp where col_float=2.4 order by col_float limit 3"""
        order_qt_timestamp_double """ select col_double from parquet_alter_column_to_timestamp where col_double=1.3 order by col_double limit 3"""
        order_qt_timestamp_boolean """ select col_boolean from parquet_alter_column_to_timestamp where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_timestamp_string """ select col_string from parquet_alter_column_to_timestamp where col_string="C" order by col_string limit 3"""
        order_qt_timestamp_char """ select col_char from parquet_alter_column_to_timestamp where col_char="B" order by col_char limit 3"""
        order_qt_timestamp_varchar """ select col_varchar from parquet_alter_column_to_timestamp where col_varchar="C" order by col_varchar limit 3"""
        order_qt_timestamp_date """ select col_date from parquet_alter_column_to_timestamp where year(col_date)=2023 order by col_date limit 3"""
        order_qt_timestamp_timestamp """ select col_timestamp from parquet_alter_column_to_timestamp where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_timestamp_decimal """ select col_decimal from parquet_alter_column_to_timestamp where col_decimal=1.3 order by col_decimal limit 3"""
        order_qt_decimal_int """ select col_int from parquet_alter_column_to_decimal where col_int=2.8 order by col_int limit 3"""
        order_qt_decimal_smallint """ select col_smallint from parquet_alter_column_to_decimal where col_smallint=0.1 order by col_smallint limit 3"""
        order_qt_decimal_tinyint """ select col_tinyint from parquet_alter_column_to_decimal where col_tinyint=2.9 order by col_tinyint limit 3"""
        order_qt_decimal_bigint """ select col_bigint from parquet_alter_column_to_decimal where col_bigint=2.3 order by col_bigint limit 3"""
        order_qt_decimal_float """ select col_float from parquet_alter_column_to_decimal where col_float=2.5 order by col_float limit 3"""
        order_qt_decimal_double """ select col_double from parquet_alter_column_to_decimal where col_double=1.7 order by col_double limit 3"""
        order_qt_decimal_boolean """ select col_boolean from parquet_alter_column_to_decimal where year(col_boolean)=2023 order by col_boolean limit 3"""
        order_qt_decimal_string """ select col_string from parquet_alter_column_to_decimal where col_string="helloworld" order by col_string limit 3"""
        order_qt_decimal_char """ select col_char from parquet_alter_column_to_decimal where col_char="helloworld" order by col_char limit 3"""
        order_qt_decimal_varchar """ select col_varchar from parquet_alter_column_to_decimal where col_varchar="helloworld" order by col_varchar limit 3"""
        order_qt_decimal_date """ select col_date from parquet_alter_column_to_decimal where year(col_date)=2023 order by col_date limit 3"""
        order_qt_decimal_timestamp """ select col_timestamp from parquet_alter_column_to_decimal where year(col_timestamp)=2023 order by col_timestamp limit 3"""
        order_qt_decimal_decimal """ select col_decimal from parquet_alter_column_to_decimal where col_decimal=1.5 order by col_decimal limit 3"""

    }
}