From 124516c1ea429c6743093e41d607fb3de0a29e3c Mon Sep 17 00:00:00 2001 From: Qi Chen Date: Mon, 3 Jul 2023 09:32:55 +0800 Subject: [PATCH] [Fix](orc-reader) Fix `Wrong data type for column` error when column order in hive table is not same in orc file schema. (#21306) `Wrong data type for column` error when column order in hive table is not same in orc file schema. The root cause is in order to handle the following case: The table in orc format of Hive 1.x may encounter system column names such as `_col0`, `_col1`, `_col2`... in the underlying orc file schema, which need to use the column names in the hive table for mapping. ### Solution Currently fix this issue by handling the following case by specifying hive version to 1.x.x in the hive catalog configuration. ```sql CREATE CATALOG hive PROPERTIES ( 'hive.version' = '1.x.x' ); ``` --- .../scripts/create_preinstalled_table.hql | 20 ++++++ .../orc/test_different_column_orders.orc | Bin 0 -> 671 bytes .../test_different_column_orders.parquet | Bin 0 -> 925 bytes docs/en/docs/lakehouse/faq.md | 11 +++- docs/zh-CN/docs/lakehouse/faq.md | 8 +++ .../catalog/external/HMSExternalTable.java | 4 ++ .../doris/datasource/HMSExternalCatalog.java | 4 ++ .../datasource/hive/HiveVersionUtil.java | 22 +++++++ .../doris/planner/external/HiveScanNode.java | 5 +- .../hive/test_different_column_orders.out | 29 +++++++++ .../hive/test_different_column_orders.groovy | 60 ++++++++++++++++++ 11 files changed, 161 insertions(+), 2 deletions(-) create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/orc/test_different_column_orders.orc create mode 100644 docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/parquet/test_different_column_orders.parquet create mode 100644 regression-test/data/external_catalog_p0/hive/test_different_column_orders.out create mode 100644 regression-test/suites/external_catalog_p0/hive/test_different_column_orders.groovy diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql index 69c58d0ac8..9f34c121df 100644 --- a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_table.hql @@ -660,4 +660,24 @@ update orc_full_acid_par set value = 'BB' where id = 2; alter table orc_full_acid_par PARTITION(part_col=20230101) compact 'major'; alter table orc_full_acid_par PARTITION(part_col=20230102) compact 'major'; +CREATE TABLE `test_different_column_orders_orc`( + `name` string, + `id` int, + `city` string, + `age` int, + `sex` string) +STORED AS ORC +LOCATION + '/user/doris/preinstalled_data/test_different_column_orders/orc'; + +CREATE TABLE `test_different_column_orders_parquet`( + `name` string, + `id` int, + `city` string, + `age` int, + `sex` string) +STORED AS PARQUET +LOCATION + '/user/doris/preinstalled_data/test_different_column_orders/parquet'; + show tables; diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/orc/test_different_column_orders.orc b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/test_different_column_orders/orc/test_different_column_orders.orc new file mode 100644 index 0000000000000000000000000000000000000000..3b8313cc4a8f3dec60d61f97297626daa10e1561 GIT binary patch literal 671 zcmeYdau#G@;9?VE;a~}1&}LxZ66RuNU|qS*JS=g6KfJx6>v+ZosxY(VBeO-f5h5D)kFjP4}H+ z_IYFL_MTn0bt=40n5vu2{>#7f*CV-2r&NwNii!N0wDmCe0|ti2T91<+CU7P^c=R|y zFzIoE(1QsAoCzHX&xIZ)1U`D0ROaI;lEioN{L}!3F0aCm%%K~LUvQ{+Y%ySDwb+r; zc*^Af`vd(Sw*L`- z{H)-8Yq4I?KgCt8Q}#PYE`6<;qMXyMp(3K>8nbYL;nvVAQ^F1&yerog_tsX~qxQ!A zO_muBCz$U#3K$CYZJN#=y{wue(t6?PLmazyUNQO_@^W3h;hc|iYlQd%`2Vh*zhQ5w z`t>PH>!)U(I%To*L)D_C*KIocx0$7DKe%4>=4`^psP#?y9NQQL7$q7S1eBN$fq?-g%gMmNkY1XYmtK{TUkZ`{=>oCY85kI#8bw(|RYX;ML|H^xAl}YR%t>Wq zU|>i~1+hdKM1@3!K$f$B!_kaEg9Rim!3L3s+snwzA<86~>YLHXG+8}n%V6u^50Y#Gp3zm39He?<%lI0*@f=$`OD7J@DZ5}ho zax77aY``rRumLP=9Fh{CkOAwT#wa$8QSBBBNIz<9Na}zhQj`hgdlpfU|HWRgig7SV z%18*57iE^D7U`8_=BDaZ<>#eJ@H!S}Ch7-+5=BO0CWC}mL1Iy1X=;gXZjnM+YEf}! zex8D%o}q!Bfr3U-X=YA}f`PGNqM4bgiGhJ-s&SHqk%5tcfuT{Np@oUDNwTShQBsPz YNwT?dqNZew3 getCatalogProperties() { return catalog.getProperties(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java index 4d6c8b86fe..9d4fc479a9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/HMSExternalCatalog.java @@ -124,6 +124,10 @@ public class HMSExternalCatalog extends ExternalCatalog { return catalogProperty.getOrDefault(HMSProperties.HIVE_METASTORE_URIS, ""); } + public String getHiveVersion() { + return catalogProperty.getOrDefault(HMSProperties.HIVE_VERSION, ""); + } + protected List listDatabaseNames() { return client.getAllDatabases(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java index 749520b62c..b93d5653ad 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveVersionUtil.java @@ -72,4 +72,26 @@ public class HiveVersionUtil { return DEFAULT_HIVE_VERSION; } } + + public static boolean isHive1(String version) { + if (Strings.isNullOrEmpty(version)) { + return false; + } + String[] parts = version.split("\\."); + if (parts.length < 2) { + LOG.warn("invalid hive version: " + version); + return false; + } + try { + int major = Integer.parseInt(parts[0]); + if (major == 1) { + return true; + } else { + return false; + } + } catch (NumberFormatException e) { + LOG.warn("invalid hive version: " + version); + return false; + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java index 738a2e3933..710fed10dd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java @@ -37,6 +37,7 @@ import org.apache.doris.datasource.hive.HiveMetaStoreCache; import org.apache.doris.datasource.hive.HiveMetaStoreCache.FileCacheValue; import org.apache.doris.datasource.hive.HivePartition; import org.apache.doris.datasource.hive.HiveTransaction; +import org.apache.doris.datasource.hive.HiveVersionUtil; import org.apache.doris.planner.ListPartitionPrunerV2; import org.apache.doris.planner.PlanNodeId; import org.apache.doris.planner.external.HiveSplit.HiveSplitCreator; @@ -91,7 +92,9 @@ public class HiveScanNode extends FileQueryScanNode { @Override protected void doInitialize() throws UserException { super.doInitialize(); - genSlotToSchemaIdMap(); + if (HiveVersionUtil.isHive1(hmsTable.getHiveVersion())) { + genSlotToSchemaIdMap(); + } String inputFormat = hmsTable.getRemoteTable().getSd().getInputFormat(); if (inputFormat.contains("TextInputFormat")) { for (SlotDescriptor slot : desc.getSlots()) { diff --git a/regression-test/data/external_catalog_p0/hive/test_different_column_orders.out b/regression-test/data/external_catalog_p0/hive/test_different_column_orders.out new file mode 100644 index 0000000000..6e3fc17262 --- /dev/null +++ b/regression-test/data/external_catalog_p0/hive/test_different_column_orders.out @@ -0,0 +1,29 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !q01 -- +Tom 1 shanghai 48 male +Jerry 2 guangzhou 35 male +Frank 3 hangzhou 25 male +Ada 4 beijing 22 female + +-- !q02 -- +4 + +-- !q03 -- +guangzhou 1 +hangzhou 1 +shanghai 1 + +-- !q01 -- +Tom 1 shanghai 48 male +Jerry 2 guangzhou 35 male +Frank 3 hangzhou 25 male +Ada 4 beijing 22 female + +-- !q02 -- +4 + +-- !q03 -- +guangzhou 1 +hangzhou 1 +shanghai 1 + diff --git a/regression-test/suites/external_catalog_p0/hive/test_different_column_orders.groovy b/regression-test/suites/external_catalog_p0/hive/test_different_column_orders.groovy new file mode 100644 index 0000000000..3fcb5a3c3f --- /dev/null +++ b/regression-test/suites/external_catalog_p0/hive/test_different_column_orders.groovy @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_different_column_orders", "p0") { + def q_parquet = { + qt_q01 """ + select * from test_different_column_orders_parquet order by id; + """ + qt_q02 """ + select count(id) from test_different_column_orders_parquet; + """ + qt_q03 """ + select city, count(*) from test_different_column_orders_parquet where sex = 'male' group by city order by city; + """ + } + def q_orc = { + qt_q01 """ + select * from test_different_column_orders_orc order by id; + """ + qt_q02 """ + select count(id) from test_different_column_orders_orc; + """ + qt_q03 """ + select city, count(*) from test_different_column_orders_orc where sex = 'male' group by city order by city; + """ + } + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + try { + String hms_port = context.config.otherConfigs.get("hms_port") + String catalog_name = "test_different_column_orders" + sql """drop catalog if exists ${catalog_name}""" + sql """create catalog if not exists ${catalog_name} properties ( + "type"="hms", + 'hive.metastore.uris' = 'thrift://127.0.0.1:${hms_port}' + );""" + sql """use `${catalog_name}`.`default`""" + + q_parquet() + q_orc() + + sql """drop catalog if exists ${catalog_name}""" + } finally { + } + } +}