From 47ff6f1300e504ddf09ca9e2d8bf1df85bd9a867 Mon Sep 17 00:00:00 2001 From: Rayner Chen Date: Tue, 22 Oct 2024 10:10:25 +0800 Subject: [PATCH] [fix](OrcReader) fix the issue that orc_reader cannot read DECIMAL(0,0) type of orc file #41795 (#42220) cherry pick from #41795 Co-authored-by: Tiewei Fang <43782773+BePPPower@users.noreply.github.com> --- be/src/vec/exec/format/orc/vorc_reader.cpp | 9 +++++++++ be/src/vec/exec/format/orc/vorc_reader.h | 1 - .../tvf/orc_tvf/test_hdfs_orc_group1_orc_files.out | 7 +++++++ .../tvf/orc_tvf/test_hdfs_orc_group2_orc_files.out | 12 ++++++++++++ .../orc_tvf/test_hdfs_orc_group1_orc_files.groovy | 10 +++++----- .../orc_tvf/test_hdfs_orc_group2_orc_files.groovy | 6 ++++++ 6 files changed, 39 insertions(+), 6 deletions(-) diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 413e158143..de1dc499e6 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -95,6 +95,11 @@ namespace doris::vectorized { // TODO: we need to determine it by test. static constexpr uint32_t MAX_DICT_CODE_PREDICATE_TO_REWRITE = std::numeric_limits::max(); static constexpr char EMPTY_STRING_FOR_OVERFLOW[ColumnString::MAX_STRINGS_OVERFLOW_SIZE] = ""; +// Because HIVE 0.11 & 0.12 do not support precision and scale for decimal, +// the decimal type of orc files produced by HIVE 0.11 & 0.12 is DECIMAL(0,0). +// We should set a default precision and scale for these orc files.
+static constexpr int decimal_precision_for_hive11 = BeConsts::MAX_DECIMAL128_PRECISION; +static constexpr int decimal_scale_for_hive11 = 10; #define FOR_FLAT_ORC_COLUMNS(M) \ M(TypeIndex::Int8, Int8, orc::LongVectorBatch) \ @@ -1050,6 +1055,10 @@ TypeDescriptor OrcReader::convert_to_doris_type(const orc::Type* orc_type) { case orc::TypeKind::TIMESTAMP: return TypeDescriptor(PrimitiveType::TYPE_DATETIMEV2); case orc::TypeKind::DECIMAL: + if (orc_type->getPrecision() == 0) { + return TypeDescriptor::create_decimalv3_type(decimal_precision_for_hive11, + decimal_scale_for_hive11); + } return TypeDescriptor::create_decimalv3_type(orc_type->getPrecision(), orc_type->getScale()); case orc::TypeKind::DATE: diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index c0b372dfce..4aad5637ef 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -587,7 +587,6 @@ private: std::unique_ptr _reader; std::unique_ptr _row_reader; std::unique_ptr _orc_filter; - orc::ReaderOptions _reader_options; orc::RowReaderOptions _row_reader_options; std::shared_ptr _file_system; diff --git a/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.out b/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.out index 12864d9c8a..1e27e51148 100644 --- a/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.out +++ b/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.out @@ -9,6 +9,13 @@ 2014-02-11 8200-02-11 +-- !test_2 -- +12345678.6547450000 +12345678.6547450000 +12345678.6547450000 +12345678.6547450000 +12345678.6547450000 + -- !test_3 -- 2 foo 0.8 1 1969-12-31T16:00 5 eat 0.8 6 1969-12-31T16:00:20 diff --git a/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.out b/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.out index 
3046384b92..9b5840ac0c 100644 --- a/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.out +++ b/regression-test/data/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.out @@ -21,3 +21,15 @@ row 000009 Alyssa \N [3, 9, 15, 20] Ben red [] +-- !test_4 -- +2 foo 0.8 1.2000000000 1969-12-31T16:00 +5 eat 0.8 5.5000000000 1969-12-31T16:00:20 +13 bar 80.0 2.2000000000 1969-12-31T16:00:05 +29 cat 8.0 3.3000000000 1969-12-31T16:00:10 +70 dog 1.8 4.4000000000 1969-12-31T16:00:15 +100 zebra 8.0 0E-10 1969-12-31T16:04:10 +100 zebra 8.0 0E-10 1969-12-31T16:04:10 +100 zebra 8.0 0E-10 1969-12-31T16:04:10 +100 zebra 8.0 0E-10 1969-12-31T16:04:10 +100 zebra 8.0 0E-10 1969-12-31T16:04:10 + diff --git a/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.groovy b/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.groovy index 44176a47fd..e42b745bfa 100644 --- a/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.groovy +++ b/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group1_orc_files.groovy @@ -41,11 +41,11 @@ suite("test_hdfs_orc_group1_orc_files","external,hive,tvf,external_docker") { // Doris cannot read this ORC file because of a NOT_IMPLEMENT error. 
- // uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group1/orc-file-11-format.orc" - // order_qt_test_2 """ select * from HDFS( - // "uri" = "${uri}", - // "hadoop.username" = "${hdfsUserName}", - // "format" = "orc"); """ + uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group1/orc-file-11-format.orc" + order_qt_test_2 """ select decimal1 from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "orc") limit 5; """ uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group1/orc_split_elim.orc" diff --git a/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.groovy b/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.groovy index 4495494a3f..daf1d2a138 100644 --- a/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.groovy +++ b/regression-test/suites/external_table_p0/tvf/orc_tvf/test_hdfs_orc_group2_orc_files.groovy @@ -49,6 +49,12 @@ suite("test_hdfs_orc_group2_orc_files","external,hive,tvf,external_docker") { "uri" = "${uri}", "hadoop.username" = "${hdfsUserName}", "format" = "orc"); """ + + uri = "${defaultFS}" + "/user/doris/tvf_data/test_hdfs_orc/group2/orc_split_elim.orc" + qt_test_4 """ select * from HDFS( + "uri" = "${uri}", + "hadoop.username" = "${hdfsUserName}", + "format" = "orc") order by userid limit 10; """ } finally { } }