[fix](parquet-reader) fix coredump when loading datetime data into Doris from parquet (#15794)

`date_time_v2` checks the scale when a datetimev2 value is constructed:
```
LOG(FATAL) << fmt::format("Scale {} is out of bounds", scale);
```

This [PR](https://github.com/apache/doris/pull/15510) fixed this issue, but the parquet reader does not use the constructor to create `TypeDescriptor`, which leaves `scale = -1` when reading datetimev2 data.
This commit is contained in:
Ashin Gau
2023-01-13 11:51:11 +08:00
committed by GitHub
parent 5e59954531
commit 34bb9cd5d3
3 changed files with 40 additions and 32 deletions

View File

@ -69,8 +69,12 @@ HdfsFileSystem::HdfsFileSystem(const THdfsParams& hdfs_params, const std::string
}
HdfsFileSystem::~HdfsFileSystem() {
if (_fs_handle && _fs_handle->from_cache) {
_fs_handle->dec_ref();
if (_fs_handle != nullptr) {
if (_fs_handle->from_cache) {
_fs_handle->dec_ref();
} else {
delete _fs_handle;
}
}
}

View File

@ -74,6 +74,7 @@ struct TypeDescriptor {
precision = 27;
scale = 9;
} else if (type == TYPE_DATETIMEV2) {
precision = 18;
scale = 6;
}
}

View File

@ -167,24 +167,27 @@ TypeDescriptor FieldDescriptor::get_doris_type(const tparquet::SchemaElement& ph
if (type.type == INVALID_TYPE) {
switch (physical_schema.type) {
case tparquet::Type::BOOLEAN:
type.type = TYPE_BOOLEAN;
type = TypeDescriptor(TYPE_BOOLEAN);
break;
case tparquet::Type::INT32:
type.type = TYPE_INT;
type = TypeDescriptor(TYPE_INT);
break;
case tparquet::Type::INT64:
type = TypeDescriptor(TYPE_BIGINT);
break;
case tparquet::Type::INT96:
type.type = TYPE_BIGINT;
// in most cases, it's a nano timestamp
type = TypeDescriptor(TYPE_DATETIMEV2);
break;
case tparquet::Type::FLOAT:
type.type = TYPE_FLOAT;
type = TypeDescriptor(TYPE_FLOAT);
break;
case tparquet::Type::DOUBLE:
type.type = TYPE_DOUBLE;
type = TypeDescriptor(TYPE_DOUBLE);
break;
case tparquet::Type::BYTE_ARRAY:
case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
type.type = TYPE_STRING;
type = TypeDescriptor(TYPE_STRING);
break;
default:
break;
@ -196,33 +199,31 @@ TypeDescriptor FieldDescriptor::get_doris_type(const tparquet::SchemaElement& ph
TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::LogicalType logicalType) {
TypeDescriptor type;
if (logicalType.__isset.STRING) {
type.type = TYPE_STRING;
type = TypeDescriptor(TYPE_STRING);
} else if (logicalType.__isset.DECIMAL) {
type.type = TYPE_DECIMALV2;
type.precision = 27;
type.scale = 9;
type = TypeDescriptor(TYPE_DECIMALV2);
} else if (logicalType.__isset.DATE) {
type.type = TYPE_DATEV2;
type = TypeDescriptor(TYPE_DATEV2);
} else if (logicalType.__isset.INTEGER) {
if (logicalType.INTEGER.isSigned) {
if (logicalType.INTEGER.bitWidth <= 32) {
type.type = TYPE_INT;
type = TypeDescriptor(TYPE_INT);
} else {
type.type = TYPE_BIGINT;
type = TypeDescriptor(TYPE_BIGINT);
}
} else {
if (logicalType.INTEGER.bitWidth <= 16) {
type.type = TYPE_INT;
type = TypeDescriptor(TYPE_INT);
} else {
type.type = TYPE_BIGINT;
type = TypeDescriptor(TYPE_BIGINT);
}
}
} else if (logicalType.__isset.TIME) {
type.type = TYPE_TIMEV2;
type = TypeDescriptor(TYPE_TIMEV2);
} else if (logicalType.__isset.TIMESTAMP) {
type.type = TYPE_DATETIMEV2;
type = TypeDescriptor(TYPE_DATETIMEV2);
} else {
type.type = INVALID_TYPE;
type = TypeDescriptor(INVALID_TYPE);
}
return type;
}
@ -231,39 +232,41 @@ TypeDescriptor FieldDescriptor::convert_to_doris_type(tparquet::ConvertedType::t
TypeDescriptor type;
switch (convertedType) {
case tparquet::ConvertedType::type::UTF8:
type.type = TYPE_STRING;
type = TypeDescriptor(TYPE_STRING);
break;
case tparquet::ConvertedType::type::DECIMAL:
type.type = TYPE_DECIMALV2;
type.precision = 27;
type.scale = 9;
type = TypeDescriptor(TYPE_DECIMALV2);
break;
case tparquet::ConvertedType::type::DATE:
type.type = TYPE_DATEV2;
type = TypeDescriptor(TYPE_DATEV2);
break;
case tparquet::ConvertedType::type::TIME_MILLIS:
case tparquet::ConvertedType::type::TIME_MICROS:
type.type = TYPE_TIMEV2;
type = TypeDescriptor(TYPE_TIMEV2);
break;
case tparquet::ConvertedType::type::TIMESTAMP_MILLIS:
case tparquet::ConvertedType::type::TIMESTAMP_MICROS:
type.type = TYPE_DATETIMEV2;
type = TypeDescriptor(TYPE_DATETIMEV2);
break;
case tparquet::ConvertedType::type::INT_8:
type = TypeDescriptor(TYPE_TINYINT);
break;
case tparquet::ConvertedType::type::UINT_8:
case tparquet::ConvertedType::type::UINT_16:
case tparquet::ConvertedType::type::INT_8:
case tparquet::ConvertedType::type::INT_16:
type = TypeDescriptor(TYPE_SMALLINT);
break;
case tparquet::ConvertedType::type::UINT_16:
case tparquet::ConvertedType::type::INT_32:
type.type = TYPE_INT;
type = TypeDescriptor(TYPE_INT);
break;
case tparquet::ConvertedType::type::UINT_32:
case tparquet::ConvertedType::type::UINT_64:
case tparquet::ConvertedType::type::INT_64:
type.type = TYPE_BIGINT;
type = TypeDescriptor(TYPE_BIGINT);
break;
default:
LOG(WARNING) << "Not supported parquet ConvertedType: " << convertedType;
type = INVALID_TYPE;
type = TypeDescriptor(INVALID_TYPE);
break;
}
return type;