From 97aab138aa61402e9b9fa5f587ee8554e3be561c Mon Sep 17 00:00:00 2001 From: slothever <18522955+wsjz@users.noreply.github.com> Date: Sat, 1 Apr 2023 21:00:01 +0800 Subject: [PATCH] [fix](parquet-reader) reset value idx in bool rle decoder and support iceberg datetime(3) (#18245) 1. Reset the current value index in the bool RLE decoder when new page data is set and after each decode call. 2. Iceberg tables now support datetimev2(3). Previously, Hive timestamp columns were always converted to datetimev2(0) by default. --- .../exec/format/parquet/bool_rle_decoder.cpp | 6 +- .../catalog/HiveMetaStoreClientHelper.java | 9 ++- .../catalog/external/HMSExternalTable.java | 3 +- .../external/IcebergExternalTable.java | 4 +- .../hive/test_external_catalog_glue_table.out | 59 +++++++++++-------- .../test_external_catalog_glue_table.groovy | 1 + 6 files changed, 50 insertions(+), 32 deletions(-) diff --git a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp index c954f98b25..46d403e6f7 100644 --- a/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp +++ b/be/src/vec/exec/format/parquet/bool_rle_decoder.cpp @@ -24,7 +24,7 @@ void BoolRLEDecoder::set_data(Slice* slice) { _data = slice; _num_bytes = slice->size; _offset = 0; - + _current_value_idx = 0; if (_num_bytes < 4) { LOG(FATAL) << "Received invalid length : " + std::to_string(_num_bytes) + " (corrupt data page?)"; @@ -51,12 +51,11 @@ Status BoolRLEDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr auto& column_data = static_cast&>(*doris_column).get_data(); size_t data_index = column_data.size(); column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); - size_t max_values = column_data.size(); + size_t max_values = select_vector.num_values() - select_vector.num_nulls(); _values.resize(max_values); if (!_decoder.get_values(_values.data(), max_values)) { return Status::IOError("Can't read enough booleans in rle decoder"); } - // _num_bytes -= max_values; ColumnSelectVector::DataReadType read_type; while 
(size_t run_length = select_vector.get_next_run(&read_type)) { switch (read_type) { @@ -83,6 +82,7 @@ Status BoolRLEDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr } } } + _current_value_idx = 0; return Status::OK(); } } // namespace doris::vectorized diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java index e4f2ad993b..038f108439 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/HiveMetaStoreClientHelper.java @@ -698,6 +698,13 @@ public class HiveMetaStoreClientHelper { * Convert hive type to doris type. */ public static Type hiveTypeToDorisType(String hiveType) { + return hiveTypeToDorisType(hiveType, 0); + } + + /** + * Convert hive type to doris type with timescale. + */ + public static Type hiveTypeToDorisType(String hiveType, int timeScale) { String lowerCaseType = hiveType.toLowerCase(); switch (lowerCaseType) { case "boolean": @@ -713,7 +720,7 @@ public class HiveMetaStoreClientHelper { case "date": return ScalarType.createDateV2Type(); case "timestamp": - return ScalarType.createDatetimeV2Type(0); + return ScalarType.createDatetimeV2Type(timeScale); case "float": return Type.FLOAT; case "double": diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 1a558e1bd9..7398d87867 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -318,7 +318,8 @@ public class HMSExternalTable extends ExternalTable { List tmpSchema = Lists.newArrayListWithCapacity(hmsSchema.size()); for (FieldSchema field : hmsSchema) { tmpSchema.add(new Column(field.getName(), - 
HiveMetaStoreClientHelper.hiveTypeToDorisType(field.getType()), true, null, + HiveMetaStoreClientHelper.hiveTypeToDorisType(field.getType(), + IcebergExternalTable.ICEBERG_DATETIME_SCALE_MS), true, null, true, null, field.getComment(), true, null, schema.caseInsensitiveFindField(field.getName()).fieldId(), null)); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java index 521a813182..a8998c6365 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java @@ -36,6 +36,8 @@ import java.util.List; public class IcebergExternalTable extends ExternalTable { + public static final int ICEBERG_DATETIME_SCALE_MS = 3; + public IcebergExternalTable(long id, String name, String dbName, IcebergExternalCatalog catalog) { super(id, name, catalog, dbName, TableType.ICEBERG_EXTERNAL_TABLE); } @@ -88,7 +90,7 @@ public class IcebergExternalTable extends ExternalTable { case DATE: return ScalarType.createDateV2Type(); case TIMESTAMP: - return ScalarType.createDatetimeV2Type(0); + return ScalarType.createDatetimeV2Type(ICEBERG_DATETIME_SCALE_MS); case TIME: return Type.UNSUPPORTED; default: diff --git a/regression-test/data/external_table_emr_p2/hive/test_external_catalog_glue_table.out b/regression-test/data/external_table_emr_p2/hive/test_external_catalog_glue_table.out index a2860e3bdd..9fbbbabaf0 100644 --- a/regression-test/data/external_table_emr_p2/hive/test_external_catalog_glue_table.out +++ b/regression-test/data/external_table_emr_p2/hive/test_external_catalog_glue_table.out @@ -35,26 +35,26 @@ 1876.4831949153224 -- !q06 -- -2023-03-07 20:34:59 -2023-03-07 20:34:59 -2023-03-07 20:34:59 -2023-03-07 20:34:59 -2023-03-07 20:34:59 -2023-03-07 20:34:59 -2023-03-07 20:34:59 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 
20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 -2023-03-07 20:35 +2023-03-07 20:34:59.601 +2023-03-07 20:34:59.693 +2023-03-07 20:34:59.708 +2023-03-07 20:34:59.782 +2023-03-07 20:34:59.836 +2023-03-07 20:34:59.934 +2023-03-07 20:34:59.950 +2023-03-07 20:35:00.042 +2023-03-07 20:35:00.053 +2023-03-07 20:35:00.114 +2023-03-07 20:35:00.134 +2023-03-07 20:35:00.201 +2023-03-07 20:35:00.272 +2023-03-07 20:35:00.316 +2023-03-07 20:35:00.337 +2023-03-07 20:35:00.409 +2023-03-07 20:35:00.420 +2023-03-07 20:35:00.428 +2023-03-07 20:35:00.500 +2023-03-07 20:35:00.535 -- !q07 -- 6f77a7baae184d @@ -82,7 +82,7 @@ f14889 66.8626 true 66.9046 true 67.0202 true -67.7351 false +67.7351 true -- !q11 -- 54078 8184 @@ -124,8 +124,15 @@ b5e6bf2b5 5000 -- !q16 -- -2023-03-07 20:35:59 -2023-03-07 20:35:59 -2023-03-07 20:35:59 -2023-03-07 20:35:59 -2023-03-07 20:35:59 +2023-03-07 20:35:59.064 +2023-03-07 20:35:59.087 +2023-03-07 20:35:59.110 +2023-03-07 20:35:59.129 +2023-03-07 20:35:59.224 + +-- !q17 -- +14040216 \N 2147483647 2023-03-07 20:38:02.140 81.607142423775869 b1d54a8ac60a4c8aa 66.6566 a54742979109 9a8247ed7c74 false +7847742 17740 2147483647 2023-03-07 20:36:02.376 1740.7904511543441 ff588a918be 66.8626 41c532d698024 18d9fa638cd449d893 true +9045125 27361 2147483647 2023-03-07 20:35:51.997 1245.2170379359104 b31a143e67 66.9046 52ab9d8a748f4c9 5d70ec319e true +10410585 \N 1938534851 2023-03-07 20:35:17.731 955.1760424982325 643e7c71b83d444e9261 67.0202 6a15d14103dc4 55b15adbec34 true +10055090 \N 2147483647 2023-03-07 20:38:59.078 1387.1527042831178 47 67.7351 c4c5 960637955914682b6 true diff --git a/regression-test/suites/external_table_emr_p2/hive/test_external_catalog_glue_table.groovy b/regression-test/suites/external_table_emr_p2/hive/test_external_catalog_glue_table.groovy index 85056ad772..26b1291ae2 100644 --- 
a/regression-test/suites/external_table_emr_p2/hive/test_external_catalog_glue_table.groovy +++ b/regression-test/suites/external_table_emr_p2/hive/test_external_catalog_glue_table.groovy @@ -47,6 +47,7 @@ suite("test_external_catalog_glue_table", "p2") { qt_q14 """ select glue_string from iceberg_glue_types where glue_string>'040abff1da4748e4b' order by glue_int limit 5 """ qt_q15 """ select count(1) from iceberg_glue_types """ qt_q16 """ select glue_timstamp from iceberg_glue_types where glue_timstamp > '2023-03-07 20:35:59' order by glue_timstamp limit 5 """ + qt_q17 """ select * from iceberg_glue_types order by glue_decimal limit 5 """ } sql """ use `iceberg_catalog`; """ q01()