[fix](OrcReader) Fix the issue that orc_reader cannot read the DECIMAL(0,0) type from ORC files #41795 (#42220)

Cherry-picked from #41795

Co-authored-by: Tiewei Fang <43782773+BePPPower@users.noreply.github.com>
Author: Rayner Chen
Date: 2024-10-22 10:10:25 +08:00
Committed by: GitHub
parent 1f8d685f26
commit 47ff6f1300
6 changed files with 39 additions and 6 deletions
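
For context, the core of the change is the decimal type mapping: when an ORC decimal carries no precision, as written by Hive 0.11/0.12, the reader now substitutes default precision/scale values instead of producing an unusable type. A minimal standalone sketch of that decision, assuming BeConsts::MAX_DECIMAL128_PRECISION resolves to 38 (the names below are illustrative, not the Doris API):

    #include <iostream>

    struct DecimalType {
        int precision;
        int scale;
    };

    // Illustrative defaults mirroring the patch; MAX_DECIMAL128_PRECISION is
    // assumed to be 38, and the default scale of 10 comes from the diff below.
    constexpr int kDecimalPrecisionForHive11 = 38;
    constexpr int kDecimalScaleForHive11 = 10;

    // DECIMAL(0,0) means the writer (Hive 0.11/0.12) recorded no precision,
    // so fall back to the defaults; otherwise keep what the file declares.
    DecimalType resolve_orc_decimal(int orc_precision, int orc_scale) {
        if (orc_precision == 0) {
            return {kDecimalPrecisionForHive11, kDecimalScaleForHive11};
        }
        return {orc_precision, orc_scale};
    }

    int main() {
        DecimalType t = resolve_orc_decimal(0, 0);
        std::cout << "DECIMAL(" << t.precision << "," << t.scale << ")\n"; // DECIMAL(38,10)
        return 0;
    }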

@@ -95,6 +95,11 @@ namespace doris::vectorized {
// TODO: we need to determine it by test.
static constexpr uint32_t MAX_DICT_CODE_PREDICATE_TO_REWRITE = std::numeric_limits<uint32_t>::max();
static constexpr char EMPTY_STRING_FOR_OVERFLOW[ColumnString::MAX_STRINGS_OVERFLOW_SIZE] = "";
// Because Hive 0.11 & 0.12 do not support precision and scale for decimals,
// the decimal type in ORC files produced by Hive 0.11 & 0.12 is DECIMAL(0,0).
// We should set a default precision and scale for these ORC files.
static constexpr int decimal_precision_for_hive11 = BeConsts::MAX_DECIMAL128_PRECISION;
static constexpr int decimal_scale_for_hive11 = 10;
#define FOR_FLAT_ORC_COLUMNS(M) \
M(TypeIndex::Int8, Int8, orc::LongVectorBatch) \
@@ -1050,6 +1055,10 @@ TypeDescriptor OrcReader::convert_to_doris_type(const orc::Type* orc_type) {
case orc::TypeKind::TIMESTAMP:
return TypeDescriptor(PrimitiveType::TYPE_DATETIMEV2);
case orc::TypeKind::DECIMAL:
if (orc_type->getPrecision() == 0) {
return TypeDescriptor::create_decimalv3_type(decimal_precision_for_hive11,
decimal_scale_for_hive11);
}
return TypeDescriptor::create_decimalv3_type(orc_type->getPrecision(),
orc_type->getScale());
case orc::TypeKind::DATE:

@@ -587,7 +587,6 @@ private:
std::unique_ptr<orc::Reader> _reader;
std::unique_ptr<orc::RowReader> _row_reader;
std::unique_ptr<ORCFilterImpl> _orc_filter;
orc::ReaderOptions _reader_options;
orc::RowReaderOptions _row_reader_options;
std::shared_ptr<io::FileSystem> _file_system;

@@ -9,6 +9,13 @@
2014-02-11
8200-02-11
-- !test_2 --
12345678.6547450000
12345678.6547450000
12345678.6547450000
12345678.6547450000
12345678.6547450000
-- !test_3 --
2 foo 0.8 1 1969-12-31T16:00
5 eat 0.8 6 1969-12-31T16:00:20
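
The ten fractional digits in the test_2 values above follow directly from the scale-10 default: decimal1 in orc-file-11-format.orc is written as DECIMAL(0,0), so it is now read with scale 10 and rendered with exactly ten digits after the point. A quick standalone sketch of that rendering (assumed formatting, not Doris code):

    #include <cstdio>

    int main() {
        // 12345678.654745 stored as an unscaled integer at scale 10.
        const long long kScaleFactor = 10000000000LL; // 10^10
        const long long unscaled = 123456786547450000LL;
        long long integral = unscaled / kScaleFactor;  // 12345678
        long long frac = unscaled % kScaleFactor;      // 6547450000
        // Prints "12345678.6547450000", matching the expected output above.
        std::printf("%lld.%010lld\n", integral, frac);
        return 0;
    }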

@@ -21,3 +21,15 @@ row 000009
Alyssa \N [3, 9, 15, 20]
Ben red []
-- !test_4 --
2 foo 0.8 1.2000000000 1969-12-31T16:00
5 eat 0.8 5.5000000000 1969-12-31T16:00:20
13 bar 80.0 2.2000000000 1969-12-31T16:00:05
29 cat 8.0 3.3000000000 1969-12-31T16:00:10
70 dog 1.8 4.4000000000 1969-12-31T16:00:15
100 zebra 8.0 0E-10 1969-12-31T16:04:10
100 zebra 8.0 0E-10 1969-12-31T16:04:10
100 zebra 8.0 0E-10 1969-12-31T16:04:10
100 zebra 8.0 0E-10 1969-12-31T16:04:10
100 zebra 8.0 0E-10 1969-12-31T16:04:10

@@ -41,11 +41,11 @@ suite("test_hdfs_orc_group1_orc_files","external,hive,tvf,external_docker") {
// Doris cannot read this ORC file because of a NOT_IMPLEMENT error.
// uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group1/orc-file-11-format.orc"
// order_qt_test_2 """ select * from HDFS(
// "uri" = "${uri}",
// "hadoop.username" = "${hdfsUserName}",
// "format" = "orc"); """
uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group1/orc-file-11-format.orc"
order_qt_test_2 """ select decimal1 from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "orc") limit 5; """
uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group1/orc_split_elim.orc"

@@ -49,6 +49,12 @@ suite("test_hdfs_orc_group2_orc_files","external,hive,tvf,external_docker") {
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "orc"); """
uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group2/orc_split_elim.orc"
qt_test_4 """ select * from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "orc") order by userid limit 10; """
} finally {
}
}