[feature](hive)Support reading renamed Parquet Hive and Orc Hive tables. (#38432) (#38809)

bp #38432 

## Proposed changes
Add the `hive_parquet_use_column_names` and `hive_orc_use_column_names`
session variables to support reading a `Hive` table after one of its columns has been renamed.

These two session variables are referenced from
`parquet_use_column_names` and `orc_use_column_names` of `Trino` hive
connector.

By default, these two session variables are true. When they are set to
false, reading orc/parquet will access the columns according to the
ordinal position in the Hive table definition.

For example:
```mysql
in Hive :
hive> create table tmp (a int , b string) stored as parquet;
hive> insert into table tmp values(1,"2");
hive> alter table tmp  change column  a new_a int;
hive> insert into table tmp values(2,"4");

in Doris :
mysql> set hive_parquet_use_column_names=true;
Query OK, 0 rows affected (0.00 sec)

mysql> select  * from tmp;
+-------+------+
| new_a | b    |
+-------+------+
|  NULL | 2    |
|     2 | 4    |
+-------+------+
2 rows in set (0.02 sec)

mysql> set hive_parquet_use_column_names=false;
Query OK, 0 rows affected (0.00 sec)

mysql> select  * from tmp;
+-------+------+
| new_a | b    |
+-------+------+
|     1 | 2    |
|     2 | 4    |
+-------+------+
2 rows in set (0.02 sec)
```

In Hive 3, you can use `set
parquet.column.index.access/orc.force.positional.evolution = true/false`
to control how such tables are read, similar to these two session
variables. However, for Parquet tables where a field nested inside a
struct column has been renamed, Hive and Doris behave differently.
This commit is contained in:
daidai
2024-08-05 09:06:49 +08:00
committed by GitHub
parent 53773ae6b7
commit 5d02c48715
20 changed files with 803 additions and 35 deletions

View File

@ -421,6 +421,10 @@ public abstract class FileQueryScanNode extends FileScanNode {
transactionalHiveDesc.setDeleteDeltas(deleteDeltaDescs);
tableFormatFileDesc.setTransactionalHiveParams(transactionalHiveDesc);
rangeDesc.setTableFormatParams(tableFormatFileDesc);
} else if (fileSplit instanceof HiveSplit) {
TTableFormatFileDesc tableFormatFileDesc = new TTableFormatFileDesc();
tableFormatFileDesc.setTableFormatType(TableFormatType.HIVE.value());
rangeDesc.setTableFormatParams(tableFormatFileDesc);
}
setScanParams(rangeDesc, fileSplit);

View File

@ -556,6 +556,10 @@ public class SessionVariable implements Serializable, Writable {
public static final String ENABLE_PUSHDOWN_MINMAX_ON_UNIQUE = "enable_pushdown_minmax_on_unique";
public static final String HIVE_PARQUET_USE_COLUMN_NAMES = "hive_parquet_use_column_names";
public static final String HIVE_ORC_USE_COLUMN_NAMES = "hive_orc_use_column_names";
public static final String KEEP_CARRIAGE_RETURN = "keep_carriage_return";
public static final String ENABLE_PUSHDOWN_STRING_MINMAX = "enable_pushdown_string_minmax";
@ -1770,11 +1774,25 @@ public class SessionVariable implements Serializable, Writable {
public int createTablePartitionMaxNum = 10000;
@VariableMgr.VarAttr(name = HIVE_PARQUET_USE_COLUMN_NAMES,
description = {"默认情况下按名称访问 Parquet 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。",
"Access Parquet columns by name by default. Set this property to `false` to access columns "
+ "by their ordinal position in the Hive table definition."})
public boolean hiveParquetUseColumnNames = true;
// When true (default), ORC columns are matched by name; when false, by ordinal
// position in the Hive table definition (mirrors Trino's `orc_use_column_names`).
@VariableMgr.VarAttr(name = HIVE_ORC_USE_COLUMN_NAMES,
description = {"默认情况下按名称访问 Orc 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。",
"Access Orc columns by name by default. Set this property to `false` to access columns "
+ "by their ordinal position in the Hive table definition."})
public boolean hiveOrcUseColumnNames = true;
@VariableMgr.VarAttr(name = KEEP_CARRIAGE_RETURN,
description = {"在同时处理\r和\r\n作为CSV的行分隔符时,是否保留\r",
"When processing both \\n and \\r\\n as CSV line separators, should \\r be retained?"})
public boolean keepCarriageReturn = false;
@VariableMgr.VarAttr(name = FORCE_JNI_SCANNER,
description = {"强制使用jni方式读取外表", "Force the use of jni mode to read external table"})
private boolean forceJniScanner = false;
@ -3435,6 +3453,8 @@ public class SessionVariable implements Serializable, Writable {
tResult.setReadCsvEmptyLineAsNull(readCsvEmptyLineAsNull);
tResult.setSerdeDialect(getSerdeDialect());
tResult.setHiveOrcUseColumnNames(hiveOrcUseColumnNames);
tResult.setHiveParquetUseColumnNames(hiveParquetUseColumnNames);
tResult.setKeepCarriageReturn(keepCarriageReturn);
return tResult;
}