[feature](hive)Support reading renamed Parquet Hive and Orc Hive tables. (#38432) (#38809)

bp #38432 

## Proposed changes
Add the `hive_parquet_use_column_names` and `hive_orc_use_column_names`
session variables to support reading a `Hive` table after one of its columns has been renamed.

These two session variables are referenced from
`parquet_use_column_names` and `orc_use_column_names` of `Trino` hive
connector.

By default, these two session variables are true. When they are set to
false, reading orc/parquet will access the columns according to the
ordinal position in the Hive table definition.

For example:
```mysql
in Hive :
hive> create table tmp (a int , b string) stored as parquet;
hive> insert into table tmp values(1,"2");
hive> alter table tmp  change column  a new_a int;
hive> insert into table tmp values(2,"4");

in Doris :
mysql> set hive_parquet_use_column_names=true;
Query OK, 0 rows affected (0.00 sec)

mysql> select  * from tmp;
+-------+------+
| new_a | b    |
+-------+------+
|  NULL | 2    |
|     2 | 4    |
+-------+------+
2 rows in set (0.02 sec)

mysql> set hive_parquet_use_column_names=false;
Query OK, 0 rows affected (0.00 sec)

mysql> select  * from tmp;
+-------+------+
| new_a | b    |
+-------+------+
|     1 | 2    |
|     2 | 4    |
+-------+------+
2 rows in set (0.02 sec)
```

In Hive 3, you can use `set
parquet.column.index.access/orc.force.positional.evolution = true/false`
to control how such tables are read, similar to these two session
variables. However, for Parquet tables where a field nested inside a
struct column has been renamed, Hive and Doris behave differently.
This commit is contained in:
daidai
2024-08-05 09:06:49 +08:00
committed by GitHub
parent 53773ae6b7
commit 5d02c48715
20 changed files with 803 additions and 35 deletions

View File

@ -421,6 +421,10 @@ public abstract class FileQueryScanNode extends FileScanNode {
transactionalHiveDesc.setDeleteDeltas(deleteDeltaDescs);
tableFormatFileDesc.setTransactionalHiveParams(transactionalHiveDesc);
rangeDesc.setTableFormatParams(tableFormatFileDesc);
} else if (fileSplit instanceof HiveSplit) {
TTableFormatFileDesc tableFormatFileDesc = new TTableFormatFileDesc();
tableFormatFileDesc.setTableFormatType(TableFormatType.HIVE.value());
rangeDesc.setTableFormatParams(tableFormatFileDesc);
}
setScanParams(rangeDesc, fileSplit);

View File

@ -556,6 +556,10 @@ public class SessionVariable implements Serializable, Writable {
public static final String ENABLE_PUSHDOWN_MINMAX_ON_UNIQUE = "enable_pushdown_minmax_on_unique";
public static final String HIVE_PARQUET_USE_COLUMN_NAMES = "hive_parquet_use_column_names";
public static final String HIVE_ORC_USE_COLUMN_NAMES = "hive_orc_use_column_names";
public static final String KEEP_CARRIAGE_RETURN = "keep_carriage_return";
public static final String ENABLE_PUSHDOWN_STRING_MINMAX = "enable_pushdown_string_minmax";
@ -1770,11 +1774,25 @@ public class SessionVariable implements Serializable, Writable {
public int createTablePartitionMaxNum = 10000;
@VariableMgr.VarAttr(name = HIVE_PARQUET_USE_COLUMN_NAMES,
description = {"默认情况下按名称访问 Parquet 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。",
"Access Parquet columns by name by default. Set this property to `false` to access columns "
+ "by their ordinal position in the Hive table definition."})
public boolean hiveParquetUseColumnNames = true;
// When true (default), ORC columns are matched by name; when false, by ordinal
// position in the Hive table definition (mirrors Trino's `orc_use_column_names`).
@VariableMgr.VarAttr(name = HIVE_ORC_USE_COLUMN_NAMES,
description = {"默认情况下按名称访问 Orc 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。",
"Access Orc columns by name by default. Set this property to `false` to access columns "
+ "by their ordinal position in the Hive table definition."})
public boolean hiveOrcUseColumnNames = true;
@VariableMgr.VarAttr(name = KEEP_CARRIAGE_RETURN,
description = {"在同时处理\r和\r\n作为CSV的行分隔符时,是否保留\r",
"When processing both \\n and \\r\\n as CSV line separators, should \\r be retained?"})
public boolean keepCarriageReturn = false;
@VariableMgr.VarAttr(name = FORCE_JNI_SCANNER,
description = {"强制使用jni方式读取外表", "Force the use of jni mode to read external table"})
private boolean forceJniScanner = false;
@ -3435,6 +3453,8 @@ public class SessionVariable implements Serializable, Writable {
tResult.setReadCsvEmptyLineAsNull(readCsvEmptyLineAsNull);
tResult.setSerdeDialect(getSerdeDialect());
tResult.setHiveOrcUseColumnNames(hiveOrcUseColumnNames);
tResult.setHiveParquetUseColumnNames(hiveParquetUseColumnNames);
tResult.setKeepCarriageReturn(keepCarriageReturn);
return tResult;
}