[feature](hive) Support reading Hive Parquet and ORC tables with renamed columns. (#38432) (#38809)

Backport of #38432.

## Proposed changes
Add the `hive_parquet_use_column_names` and `hive_orc_use_column_names`
session variables to support reading a Hive table after its columns have been renamed.

These two session variables are modeled on `parquet_use_column_names` and
`orc_use_column_names` in the Trino Hive connector.

Both default to true. When set to false, the ORC/Parquet readers access
columns by their ordinal position in the Hive table definition instead of
by name.

For example:
```mysql
In Hive:
hive> create table tmp (a int, b string) stored as parquet;
hive> insert into table tmp values (1, "2");
hive> alter table tmp change column a new_a int;
hive> insert into table tmp values (2, "4");

In Doris:
mysql> set hive_parquet_use_column_names=true;
Query OK, 0 rows affected (0.00 sec)

mysql> select * from tmp;
+-------+------+
| new_a | b    |
+-------+------+
|  NULL | 2    |
|     2 | 4    |
+-------+------+
2 rows in set (0.02 sec)

mysql> set hive_parquet_use_column_names=false;
Query OK, 0 rows affected (0.00 sec)

mysql> select * from tmp;
+-------+------+
| new_a | b    |
+-------+------+
|     1 | 2    |
|     2 | 4    |
+-------+------+
2 rows in set (0.02 sec)
```

In Hive 3, you can use `set parquet.column.index.access=true/false` and
`set orc.force.positional.evolution=true/false` to control how a table is
read, similar to these two session variables. However, for Parquet tables
where a field inside a struct column has been renamed, Hive and Doris
behave differently.
Commit 5d02c48715 (parent 53773ae6b7) by daidai, committed via GitHub on 2024-08-05 09:06:49 +08:00. 20 changed files with 803 additions and 35 deletions.


@@ -279,13 +279,15 @@ Status OrcReader::init_reader(
const VExprContextSPtrs& conjuncts, bool is_acid, const TupleDescriptor* tuple_descriptor,
const RowDescriptor* row_descriptor,
const VExprContextSPtrs* not_single_slot_filter_conjuncts,
-        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
+        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts,
+        const bool hive_use_column_names) {
_column_names = column_names;
_colname_to_value_range = colname_to_value_range;
_lazy_read_ctx.conjuncts = conjuncts;
_is_acid = is_acid;
_tuple_descriptor = tuple_descriptor;
_row_descriptor = row_descriptor;
+ _is_hive1_orc_or_use_idx = !hive_use_column_names;
if (not_single_slot_filter_conjuncts != nullptr && !not_single_slot_filter_conjuncts->empty()) {
_not_single_slot_filter_conjuncts.insert(_not_single_slot_filter_conjuncts.end(),
not_single_slot_filter_conjuncts->begin(),
@@ -337,10 +339,11 @@ Status OrcReader::_init_read_columns() {
// In old version slot_name_to_schema_pos may not be set in _scan_params
// TODO, should be removed in 2.2 or later
- _is_hive1_orc = is_hive1_orc && _scan_params.__isset.slot_name_to_schema_pos;
+ _is_hive1_orc_or_use_idx = (is_hive1_orc || _is_hive1_orc_or_use_idx) &&
+         _scan_params.__isset.slot_name_to_schema_pos;
for (size_t i = 0; i < _column_names->size(); ++i) {
auto& col_name = (*_column_names)[i];
-     if (_is_hive1_orc) {
+     if (_is_hive1_orc_or_use_idx) {
auto iter = _scan_params.slot_name_to_schema_pos.find(col_name);
if (iter != _scan_params.slot_name_to_schema_pos.end()) {
int pos = iter->second;
@@ -375,9 +378,10 @@ Status OrcReader::_init_read_columns() {
_read_cols_lower_case.emplace_back(col_name);
// For hive engine, store the orc column name to schema column name map.
// This is for Hive 1.x orc file with internal column name _col0, _col1...
-     if (_is_hive1_orc) {
+     if (_is_hive1_orc_or_use_idx) {
_removed_acid_file_col_name_to_schema_col[orc_cols[pos]] = col_name;
}
+     _col_name_to_file_col_name[col_name] = read_col;
}
}
@@ -708,7 +712,7 @@ bool OrcReader::_init_search_argument(
if (iter == colname_to_value_range->end()) {
continue;
}
- auto type_it = type_map.find(col_name);
+ auto type_it = type_map.find(_col_name_to_file_col_name[col_name]);
if (type_it == type_map.end()) {
continue;
}
@@ -913,7 +917,7 @@ Status OrcReader::_init_select_types(const orc::Type& type, int idx) {
std::string name;
// For hive engine, translate the column name in orc file to schema column name.
// This is for Hive 1.x which use internal column name _col0, _col1...
- if (_is_hive1_orc) {
+ if (_is_hive1_orc_or_use_idx) {
name = _removed_acid_file_col_name_to_schema_col[type.getFieldName(i)];
} else {
name = get_field_name_lower_case(&type, i);
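
For intuition, the change reuses the Hive 1.x machinery: whenever `_is_hive1_orc_or_use_idx` is set, a table column is resolved to whatever file column sits at the same ordinal position, so a rename in the Hive table definition becomes irrelevant. A minimal self-contained sketch of that idea (illustrative only, with simplified names and types; not the Doris implementation):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Map each table column to the file column at the same ordinal position
// (the role played by slot_name_to_schema_pos), ignoring file-side names.
std::unordered_map<std::string, std::string> resolve_by_position(
        const std::vector<std::string>& table_cols,
        const std::vector<std::string>& file_cols,
        const std::unordered_map<std::string, int>& slot_name_to_schema_pos) {
    std::unordered_map<std::string, std::string> table_to_file;
    for (const auto& col : table_cols) {
        auto it = slot_name_to_schema_pos.find(col);
        if (it == slot_name_to_schema_pos.end() ||
            it->second >= static_cast<int>(file_cols.size())) {
            continue; // column added after the file was written: treat as missing
        }
        table_to_file[col] = file_cols[it->second];
    }
    return table_to_file;
}

int main() {
    // The table renamed column "a" to "new_a" after the file was written.
    auto m = resolve_by_position({"new_a", "b"}, {"a", "b"}, {{"new_a", 0}, {"b", 1}});
    for (const auto& [table_col, file_col] : m) {
        std::cout << table_col << " -> " << file_col << "\n"; // new_a -> a, b -> b
    }
}
```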


@@ -139,14 +139,15 @@ public:
const std::string& ctz, io::IOContext* io_ctx, bool enable_lazy_mat = true);
~OrcReader() override;
+ // If you want to read the file by index instead of column name, set hive_use_column_names to false.
Status init_reader(
const std::vector<std::string>* column_names,
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range,
const VExprContextSPtrs& conjuncts, bool is_acid,
const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
const VExprContextSPtrs* not_single_slot_filter_conjuncts,
-         const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts);
+         const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts,
+         const bool hive_use_column_names = true);
Status set_fill_columns(
const std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>&
@@ -570,9 +571,11 @@ private:
// This is used for Hive 1.x which use internal column name in Orc file.
// _col0, _col1...
std::unordered_map<std::string, std::string> _removed_acid_file_col_name_to_schema_col;
- // Flag for hive engine. True if the external table engine is Hive1.x with orc col name
- // as _col1, col2, ...
- bool _is_hive1_orc = false;
+ // Flag for hive engine. True if either:
+ // 1. the external table is a Hive 1.x table whose orc files use internal column names (_col0, _col1, ...), or
+ // 2. hive_use_column_names=false, i.e. columns are read by index instead of by name.
+ bool _is_hive1_orc_or_use_idx = false;
+ std::unordered_map<std::string, std::string> _col_name_to_file_col_name;
std::unordered_map<std::string, const orc::Type*> _type_map;
std::vector<const orc::Type*> _col_orc_type;
@@ -621,6 +624,8 @@ private:
// resolve schema change
std::unordered_map<std::string, std::unique_ptr<converter::ColumnTypeConverter>> _converters;
  // for iceberg table, when table column name != file column name
+ // TODO(CXY): remove _table_col_to_file_col, because we have _col_name_to_file_col_name;
+ // the two have the same effect.
std::unordered_map<std::string, std::string> _table_col_to_file_col;
// support iceberg position delete.
std::vector<int64_t>* _position_delete_ordered_rowids = nullptr;


@@ -22,6 +22,7 @@
#include <gen_cpp/parquet_types.h>
#include <glog/logging.h>
#include <algorithm>
+ #include <functional>
#include <utility>
@@ -300,12 +301,14 @@ Status ParquetReader::init_reader(
const std::unordered_map<std::string, int>* colname_to_slot_id,
const VExprContextSPtrs* not_single_slot_filter_conjuncts,
const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts,
-         bool filter_groups) {
+         bool filter_groups, const bool hive_use_column_names) {
_tuple_descriptor = tuple_descriptor;
_row_descriptor = row_descriptor;
_colname_to_slot_id = colname_to_slot_id;
_not_single_slot_filter_conjuncts = not_single_slot_filter_conjuncts;
_slot_id_to_filter_conjuncts = slot_id_to_filter_conjuncts;
+ _colname_to_value_range = colname_to_value_range;
+ _hive_use_column_names = hive_use_column_names;
if (_file_metadata == nullptr) {
return Status::InternalError("failed to init parquet reader, please open reader first");
}
@@ -320,28 +323,59 @@ Status ParquetReader::init_reader(
// e.g. table added a column after this parquet file was written.
_column_names = &all_column_names;
auto schema_desc = _file_metadata->schema();
- std::set<std::string> required_columns(all_column_names.begin(), all_column_names.end());
- // Currently only used in iceberg, the columns are dropped but added back
- std::set<std::string> dropped_columns(missing_column_names.begin(), missing_column_names.end());
- // Make the order of read columns the same as physical order in parquet file
- for (int i = 0; i < schema_desc.size(); ++i) {
-     auto name = schema_desc.get_column(i)->name;
-     // If the column in parquet file is included in all_column_names and not in missing_column_names,
-     // add it to _map_column, which means the reader should read the data of this column.
-     // Here to check against missing_column_names is for the 'Add a column back to the table
-     // with the same column name' case. (drop column a then add column a).
-     // Shouldn't read this column data in this case.
-     if (required_columns.find(name) != required_columns.end() &&
-         dropped_columns.find(name) == dropped_columns.end()) {
-         required_columns.erase(name);
-         _read_columns.emplace_back(name);
+ if (_hive_use_column_names) {
+     std::set<std::string> required_columns(all_column_names.begin(), all_column_names.end());
+     // Currently only used in iceberg, the columns are dropped but added back
+     std::set<std::string> dropped_columns(missing_column_names.begin(),
+                                           missing_column_names.end());
+     // Make the order of read columns the same as physical order in parquet file
+     for (int i = 0; i < schema_desc.size(); ++i) {
+         auto name = schema_desc.get_column(i)->name;
+         // If the column in parquet file is included in all_column_names and not in missing_column_names,
+         // add it to _map_column, which means the reader should read the data of this column.
+         // Here to check against missing_column_names is for the 'Add a column back to the table
+         // with the same column name' case. (drop column a then add column a).
+         // Shouldn't read this column data in this case.
+         if (required_columns.find(name) != required_columns.end() &&
+             dropped_columns.find(name) == dropped_columns.end()) {
+             required_columns.erase(name);
+             _read_columns.emplace_back(name);
+         }
+     }
+     for (const std::string& name : required_columns) {
+         _missing_cols.emplace_back(name);
+     }
+ } else {
+     std::unordered_map<std::string, ColumnValueRangeType> new_colname_to_value_range;
+     const auto& table_column_idxs = _scan_params.column_idxs;
+     std::map<int, int> table_col_id_to_idx;
+     for (int i = 0; i < table_column_idxs.size(); i++) {
+         table_col_id_to_idx.insert({table_column_idxs[i], i});
+     }
+     for (auto [id, idx] : table_col_id_to_idx) {
+         if (id >= schema_desc.size()) {
+             _missing_cols.emplace_back(all_column_names[idx]);
+         } else {
+             auto& table_col = all_column_names[idx];
+             auto file_col = schema_desc.get_column(id)->name;
+             _read_columns.emplace_back(file_col);
+             if (table_col != file_col) {
+                 _table_col_to_file_col[table_col] = file_col;
+                 auto iter = _colname_to_value_range->find(table_col);
+                 if (iter == _colname_to_value_range->end()) {
+                     continue;
+                 }
+                 new_colname_to_value_range[file_col] = iter->second;
+                 _colname_to_value_range->erase(iter->first);
+             }
+         }
+     }
+     for (auto it : new_colname_to_value_range) {
+         _colname_to_value_range->emplace(it.first, std::move(it.second));
+     }
+ }
-     for (const std::string& name : required_columns) {
-         _missing_cols.emplace_back(name);
-     }
- _colname_to_value_range = colname_to_value_range;
// build column predicates for column lazy read
_lazy_read_ctx.conjuncts = conjuncts;
RETURN_IF_ERROR(_init_row_groups(filter_groups));
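
The subtle part of the by-position branch above is that pushed-down predicates arrive keyed by table column names, while row-group filtering sees file column names, so each value range has to be rekeyed (the `new_colname_to_value_range` step). A hedged sketch of just that step, with a stand-in for `ColumnValueRangeType`:

```cpp
#include <string>
#include <unordered_map>
#include <utility>

using ValueRange = std::pair<int, int>; // simplified stand-in for ColumnValueRangeType

// Move each predicate from its table-column key to the matching file-column key.
void rekey_predicates(
        std::unordered_map<std::string, ValueRange>& colname_to_value_range,
        const std::unordered_map<std::string, std::string>& table_col_to_file_col) {
    std::unordered_map<std::string, ValueRange> rekeyed;
    for (const auto& [table_col, file_col] : table_col_to_file_col) {
        auto it = colname_to_value_range.find(table_col);
        if (it == colname_to_value_range.end()) {
            continue; // no predicate on this column
        }
        rekeyed.emplace(file_col, std::move(it->second));
        colname_to_value_range.erase(it);
    }
    // Two-phase move avoids inserting into the map while still erasing from it.
    for (auto& [file_col, range] : rekeyed) {
        colname_to_value_range.emplace(file_col, std::move(range));
    }
}
```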
@@ -525,6 +559,16 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof)
return Status::OK();
}
+ if (!_hive_use_column_names) {
+     for (auto i = 0; i < block->get_names().size(); i++) {
+         auto& col = block->get_by_position(i);
+         if (_table_col_to_file_col.contains(col.name)) {
+             col.name = _table_col_to_file_col[col.name];
+         }
+     }
+     block->initialize_index_by_name();
+ }
SCOPED_RAW_TIMER(&_statistics.column_read_time);
Status batch_st =
_current_group_reader->next_batch(block, _batch_size, read_rows, &_row_group_eof);
@@ -535,6 +579,13 @@ Status ParquetReader::get_next_block(Block* block, size_t* read_rows, bool* eof)
*eof = true;
return Status::OK();
}
+ if (!_hive_use_column_names) {
+     for (auto i = 0; i < block->columns(); i++) {
+         block->get_by_position(i).name = (*_column_names)[i];
+     }
+     block->initialize_index_by_name();
+ }
if (!batch_st.ok()) {
return Status::InternalError("Read parquet file {} failed, reason = {}", _scan_range.path,
batch_st.to_string());
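
In other words, when reading by position the block's columns are renamed to the file-side names before the batch read (so the row-group reader fills the right columns) and restored to table-side names afterwards. A toy sketch of that round trip, assuming a simplified `Block` with named columns:

```cpp
#include <string>
#include <unordered_map>
#include <vector>

struct Column { std::string name; /* column data omitted */ };
struct Block { std::vector<Column> columns; };

// Before the batch read: switch to file-side names where they differ.
void rename_to_file_names(Block& block,
                          const std::unordered_map<std::string, std::string>& table_to_file) {
    for (auto& col : block.columns) {
        if (auto it = table_to_file.find(col.name); it != table_to_file.end()) {
            col.name = it->second;
        }
    }
}

// After the batch read: restore table-side names in ordinal order.
void rename_back(Block& block, const std::vector<std::string>& table_col_names) {
    for (size_t i = 0; i < block.columns.size() && i < table_col_names.size(); ++i) {
        block.columns[i].name = table_col_names[i];
    }
}
```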


@@ -116,7 +116,7 @@ public:
const std::unordered_map<std::string, int>* colname_to_slot_id,
const VExprContextSPtrs* not_single_slot_filter_conjuncts,
const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts,
-         bool filter_groups = true);
+         bool filter_groups = true, const bool hive_use_column_names = true);
Status get_next_block(Block* block, size_t* read_rows, bool* eof) override;
@@ -283,5 +283,6 @@ private:
const std::unordered_map<std::string, int>* _colname_to_slot_id = nullptr;
const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr;
const std::unordered_map<int, VExprContextSPtrs>* _slot_id_to_filter_conjuncts = nullptr;
+ bool _hive_use_column_names = false;
};
} // namespace doris::vectorized


@@ -862,12 +862,21 @@ Status VFileScanner::_get_next_reader() {
RETURN_IF_ERROR(paimon_reader->init_row_filters(range));
_cur_reader = std::move(paimon_reader);
} else {
+ bool hive_parquet_use_column_names = true;
+ if (range.__isset.table_format_params &&
+     range.table_format_params.table_format_type == "hive" && _state != nullptr)
+         [[likely]] {
+     hive_parquet_use_column_names =
+             _state->query_options().hive_parquet_use_column_names;
+ }
std::vector<std::string> place_holder;
init_status = parquet_reader->init_reader(
_file_col_names, place_holder, _colname_to_value_range,
_push_down_conjuncts, _real_tuple_desc, _default_val_row_desc.get(),
_col_name_to_slot_id, &_not_single_slot_filter_conjuncts,
-         &_slot_id_to_filter_conjuncts);
+         &_slot_id_to_filter_conjuncts, true, hive_parquet_use_column_names);
_cur_reader = std::move(parquet_reader);
}
need_to_get_parsed_schema = true;
@@ -923,10 +932,18 @@ Status VFileScanner::_get_next_reader() {
RETURN_IF_ERROR(paimon_reader->init_row_filters(range));
_cur_reader = std::move(paimon_reader);
} else {
+ bool hive_orc_use_column_names = true;
+ if (range.__isset.table_format_params &&
+     range.table_format_params.table_format_type == "hive" && _state != nullptr)
+         [[likely]] {
+     hive_orc_use_column_names = _state->query_options().hive_orc_use_column_names;
+ }
init_status = orc_reader->init_reader(
&_file_col_names, _colname_to_value_range, _push_down_conjuncts, false,
_real_tuple_desc, _default_val_row_desc.get(),
-         &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts);
+         &_not_single_slot_filter_conjuncts, &_slot_id_to_filter_conjuncts,
+         hive_orc_use_column_names);
_cur_reader = std::move(orc_reader);
}
need_to_get_parsed_schema = true;
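
Note that the scanner honors the session flags only for plain Hive splits; other table formats (iceberg, paimon, etc.) keep name-based resolution. The gating above boils down to roughly this (illustrative helper, not the actual Doris code):

```cpp
#include <string>

// Decide the effective flag for one scan range: default to reading by name,
// and consult the session variable only when the split is a Hive-format split.
bool effective_use_column_names(bool has_table_format, const std::string& table_format_type,
                                bool session_use_column_names) {
    bool use_column_names = true;
    if (has_table_format && table_format_type == "hive") {
        use_column_names = session_use_column_names;
    }
    return use_column_names;
}
```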


@@ -0,0 +1,29 @@
use default;
create table simulation_hive1_orc(
`a` boolean,
`b` int,
`c` string
)stored as orc
LOCATION '/user/doris/preinstalled_data/orc_table/simulation_hive1_orc';
msck repair table simulation_hive1_orc;
create table test_hive_rename_column_parquet(
`new_a` boolean,
`new_b` int,
`c` string,
`new_d` int,
`f` string
)stored as parquet
LOCATION '/user/doris/preinstalled_data/parquet_table/test_hive_rename_column_parquet';
msck repair table test_hive_rename_column_parquet;
create table test_hive_rename_column_orc(
`new_a` boolean,
`new_b` int,
`c` string,
`new_d` int,
`f` string
)stored as orc
LOCATION '/user/doris/preinstalled_data/orc_table/test_hive_rename_column_orc';
msck repair table test_hive_rename_column_orc;


@@ -421,6 +421,10 @@ public abstract class FileQueryScanNode extends FileScanNode {
transactionalHiveDesc.setDeleteDeltas(deleteDeltaDescs);
tableFormatFileDesc.setTransactionalHiveParams(transactionalHiveDesc);
rangeDesc.setTableFormatParams(tableFormatFileDesc);
+ } else if (fileSplit instanceof HiveSplit) {
+     TTableFormatFileDesc tableFormatFileDesc = new TTableFormatFileDesc();
+     tableFormatFileDesc.setTableFormatType(TableFormatType.HIVE.value());
+     rangeDesc.setTableFormatParams(tableFormatFileDesc);
}
setScanParams(rangeDesc, fileSplit);


@@ -556,6 +556,10 @@ public class SessionVariable implements Serializable, Writable {
public static final String ENABLE_PUSHDOWN_MINMAX_ON_UNIQUE = "enable_pushdown_minmax_on_unique";
+ public static final String HIVE_PARQUET_USE_COLUMN_NAMES = "hive_parquet_use_column_names";
+ public static final String HIVE_ORC_USE_COLUMN_NAMES = "hive_orc_use_column_names";
public static final String KEEP_CARRIAGE_RETURN = "keep_carriage_return";
public static final String ENABLE_PUSHDOWN_STRING_MINMAX = "enable_pushdown_string_minmax";
@@ -1770,11 +1774,25 @@ public class SessionVariable implements Serializable, Writable {
public int createTablePartitionMaxNum = 10000;
+ @VariableMgr.VarAttr(name = HIVE_PARQUET_USE_COLUMN_NAMES,
+         description = {"默认情况下按名称访问 Parquet 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。",
+                 "Access Parquet columns by name by default. Set this property to `false` to access columns "
+                         + "by their ordinal position in the Hive table definition."})
+ public boolean hiveParquetUseColumnNames = true;
+
+ @VariableMgr.VarAttr(name = HIVE_ORC_USE_COLUMN_NAMES,
+         description = {"默认情况下按名称访问 Orc 列。将此属性设置为“false”可按 Hive 表定义中的序号位置访问列。",
+                 "Access ORC columns by name by default. Set this property to `false` to access columns "
+                         + "by their ordinal position in the Hive table definition."})
+ public boolean hiveOrcUseColumnNames = true;
@VariableMgr.VarAttr(name = KEEP_CARRIAGE_RETURN,
description = {"在同时处理\r和\r\n作为CSV的行分隔符时,是否保留\r",
"When processing both \\n and \\r\\n as CSV line separators, should \\r be retained?"})
public boolean keepCarriageReturn = false;
@VariableMgr.VarAttr(name = FORCE_JNI_SCANNER,
description = {"强制使用jni方式读取外表", "Force the use of jni mode to read external table"})
private boolean forceJniScanner = false;
@@ -3435,6 +3453,8 @@ public class SessionVariable implements Serializable, Writable {
tResult.setReadCsvEmptyLineAsNull(readCsvEmptyLineAsNull);
tResult.setSerdeDialect(getSerdeDialect());
+ tResult.setHiveOrcUseColumnNames(hiveOrcUseColumnNames);
+ tResult.setHiveParquetUseColumnNames(hiveParquetUseColumnNames);
tResult.setKeepCarriageReturn(keepCarriageReturn);
return tResult;
}


@@ -311,6 +311,12 @@ struct TQueryOptions {
119: optional bool keep_carriage_return = false; // \n,\r\n split line in CSV.
122: optional i32 runtime_bloom_filter_min_size = 1048576;
+ // Access Parquet/ORC columns by name by default. Set this property to `false` to access columns
+ // by their ordinal position in the Hive table definition.
+ 123: optional bool hive_parquet_use_column_names = true;
+ 124: optional bool hive_orc_use_column_names = true;
// For cloud, to control if the content would be written into file cache
1000: optional bool disable_file_cache = false
}


@@ -0,0 +1,435 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !hive1_orc_1_true --
true 10 hello world
false 20 keep
-- !hive1_orc_2_true --
10 hello world true
20 keep false
-- !hive1_orc_3_true --
hello world true
keep false
-- !hive1_orc_4_true --
2
-- !hive1_orc_5_true --
2
-- !hive1_orc_6_true --
10
20
-- !hive1_orc_7_true --
10 1
20 1
-- !hive1_orc_8_true --
true 10 hello world
-- !hive1_orc_9_true --
false 20 keep
-- !hive1_orc_10_true --
false 20 keep
-- !hive1_orc_11_true --
false 20 keep
-- !hive1_orc_12_true --
hello world
keep
-- !hive1_orc_1_false --
true 10 hello world
false 20 keep
-- !hive1_orc_2_false --
10 hello world true
20 keep false
-- !hive1_orc_3_false --
hello world true
keep false
-- !hive1_orc_4_false --
2
-- !hive1_orc_5_false --
2
-- !hive1_orc_6_false --
10
20
-- !hive1_orc_7_false --
10 1
20 1
-- !hive1_orc_8_false --
true 10 hello world
-- !hive1_orc_9_false --
false 20 keep
-- !hive1_orc_10_false --
false 20 keep
-- !hive1_orc_11_false --
false 20 keep
-- !hive1_orc_12_false --
hello world
keep
-- !rename_orc_1_true --
\N \N hello world \N \N
\N \N keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
true 50 xxx \N cols
false 60 yyy \N yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_orc_2_true --
\N
\N
30
40
50
60
70
80
-- !rename_orc_3_true --
\N 2
30 1
40 1
50 1
60 1
70 1
80 1
-- !rename_orc_4_true --
true 30 abcd \N \N
true 50 xxx \N cols
true 70 hahaha 8888 abcd
-- !rename_orc_5_true --
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_orc_6_true --
\N \N hello world \N \N
\N \N keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
true 50 xxx \N cols
false 60 yyy \N yyyyyy
-- !rename_orc_7_true --
true 30 abcd \N \N
-- !rename_orc_8_true --
true
true
true
-- !rename_orc_9_true --
-- !rename_orc_10_true --
-- !rename_orc_11_true --
\N \N
\N \N
30 true
40 false
50 true
60 false
70 true
80 false
-- !rename_orc_12_true --
\N \N hello world \N \N
\N \N keep \N \N
\N \N abcd 30 true
\N \N new adcd 40 false
cols \N xxx 50 true
yyyyyy \N yyy 60 false
abcd 8888 hahaha 70 true
efg 9999 cmake 80 false
-- !rename_orc_13_true --
false 40 new adcd \N \N
true 50 xxx \N cols
false 60 yyy \N yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_orc_1_false --
true 10 hello world \N \N
false 20 keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
true 50 xxx 60 cols
false 60 yyy 100 yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_orc_2_false --
10
20
30
40
50
60
70
80
-- !rename_orc_3_false --
10 1
20 1
30 1
40 1
50 1
60 1
70 1
80 1
-- !rename_orc_4_false --
true 10 hello world \N \N
true 30 abcd \N \N
true 50 xxx 60 cols
true 70 hahaha 8888 abcd
-- !rename_orc_5_false --
true 50 xxx 60 cols
false 60 yyy 100 yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_orc_6_false --
true 10 hello world \N \N
false 20 keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
-- !rename_orc_7_false --
true 30 abcd \N \N
-- !rename_orc_8_false --
true
true
true
true
-- !rename_orc_9_false --
-- !rename_orc_10_false --
-- !rename_orc_11_false --
10 true
20 false
30 true
40 false
50 true
60 false
70 true
80 false
-- !rename_orc_12_false --
\N \N hello world 10 true
\N \N keep 20 false
\N \N abcd 30 true
\N \N new adcd 40 false
cols 60 xxx 50 true
yyyyyy 100 yyy 60 false
abcd 8888 hahaha 70 true
efg 9999 cmake 80 false
-- !rename_orc_13_false --
true 10 hello world \N \N
false 20 keep \N \N
false 40 new adcd \N \N
true 50 xxx 60 cols
false 60 yyy 100 yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_parquet_1_true --
\N \N hello world \N \N
\N \N keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
true 50 xxx \N cols
false 60 yyy \N yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_parquet_2_true --
\N
\N
30
40
50
60
70
80
-- !rename_parquet_3_true --
\N 2
30 1
40 1
50 1
60 1
70 1
80 1
-- !rename_parquet_4_true --
true 30 abcd \N \N
true 50 xxx \N cols
true 70 hahaha 8888 abcd
-- !rename_parquet_5_true --
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_parquet_6_true --
\N \N hello world \N \N
\N \N keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
true 50 xxx \N cols
false 60 yyy \N yyyyyy
-- !rename_parquet_7_true --
true 30 abcd \N \N
-- !rename_parquet_8_true --
true
true
true
-- !rename_parquet_9_true --
-- !rename_parquet_10_true --
-- !rename_parquet_11_true --
\N \N
\N \N
30 true
40 false
50 true
60 false
70 true
80 false
-- !rename_parquet_12_true --
\N \N hello world \N \N
\N \N keep \N \N
\N \N abcd 30 true
\N \N new adcd 40 false
cols \N xxx 50 true
yyyyyy \N yyy 60 false
abcd 8888 hahaha 70 true
efg 9999 cmake 80 false
-- !rename_parquet_13_true --
false 40 new adcd \N \N
true 50 xxx \N cols
false 60 yyy \N yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_parquet_1_false --
true 10 hello world \N \N
false 20 keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
true 50 xxx 60 cols
false 60 yyy 100 yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_parquet_2_false --
10
20
30
40
50
60
70
80
-- !rename_parquet_3_false --
10 1
20 1
30 1
40 1
50 1
60 1
70 1
80 1
-- !rename_parquet_4_false --
true 10 hello world \N \N
true 30 abcd \N \N
true 50 xxx 60 cols
true 70 hahaha 8888 abcd
-- !rename_parquet_5_false --
true 50 xxx 60 cols
false 60 yyy 100 yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg
-- !rename_parquet_6_false --
true 10 hello world \N \N
false 20 keep \N \N
true 30 abcd \N \N
false 40 new adcd \N \N
-- !rename_parquet_7_false --
true 30 abcd \N \N
-- !rename_parquet_8_false --
true
true
true
true
-- !rename_parquet_9_false --
-- !rename_parquet_10_false --
-- !rename_parquet_11_false --
10 true
20 false
30 true
40 false
50 true
60 false
70 true
80 false
-- !rename_parquet_12_false --
\N \N hello world 10 true
\N \N keep 20 false
\N \N abcd 30 true
\N \N new adcd 40 false
cols 60 xxx 50 true
yyyyyy 100 yyy 60 false
abcd 8888 hahaha 70 true
efg 9999 cmake 80 false
-- !rename_parquet_13_false --
true 10 hello world \N \N
false 20 keep \N \N
false 40 new adcd \N \N
true 50 xxx 60 cols
false 60 yyy 100 yyyyyy
true 70 hahaha 8888 abcd
false 80 cmake 9999 efg


@@ -0,0 +1,196 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
suite("test_hive_rename_column_orc_parquet", "p0,external,hive,external_docker,external_docker_hive") {
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled != null && enabled.equalsIgnoreCase("true")) {
String hivePrefix = "hive3";
setHivePrefix(hivePrefix)
String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
String hmsPort = context.config.otherConfigs.get(hivePrefix + "HmsPort")
String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
String catalog_name = "test_hive_schema_change2"
sql """drop catalog if exists ${catalog_name};"""
sql """
create catalog if not exists ${catalog_name} properties (
'type'='hms',
'hadoop.username' = 'hadoop',
'fs.defaultFS' = 'hdfs://${externalEnvIp}:${hdfs_port}',
'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hmsPort}'
);
"""
sql """ switch ${catalog_name} """
sql """ use `default` """
sql """ set hive_orc_use_column_names=true; """
qt_hive1_orc_1_true """ select * from simulation_hive1_orc order by b; """
qt_hive1_orc_2_true """ select b,c,a from simulation_hive1_orc order by b; """
qt_hive1_orc_3_true """ select c,a from simulation_hive1_orc order by b; """
qt_hive1_orc_4_true """ select count(*) from simulation_hive1_orc; """
qt_hive1_orc_5_true """ select count(a) from simulation_hive1_orc; """
qt_hive1_orc_6_true """ select b from simulation_hive1_orc order by b; """
qt_hive1_orc_7_true """ select b,count(*) from simulation_hive1_orc group by b order by b; """
qt_hive1_orc_8_true """ select * from simulation_hive1_orc where a +b = 11 ; """
qt_hive1_orc_9_true """ select * from simulation_hive1_orc where a +b != 11 ; """
qt_hive1_orc_10_true """ select * from simulation_hive1_orc where a +b != 11 and c = "keep"; """
qt_hive1_orc_11_true """ select * from simulation_hive1_orc where a +b != 11 and c != "keepxxx"; """
qt_hive1_orc_12_true """ select c from simulation_hive1_orc order by c; """
sql """ set hive_orc_use_column_names=false; """
qt_hive1_orc_1_false """ select * from simulation_hive1_orc order by b; """
qt_hive1_orc_2_false """ select b,c,a from simulation_hive1_orc order by b; """
qt_hive1_orc_3_false """ select c,a from simulation_hive1_orc order by b; """
qt_hive1_orc_4_false """ select count(*) from simulation_hive1_orc; """
qt_hive1_orc_5_false """ select count(a) from simulation_hive1_orc; """
qt_hive1_orc_6_false """ select b from simulation_hive1_orc order by b; """
qt_hive1_orc_7_false """ select b,count(*) from simulation_hive1_orc group by b order by b; """
qt_hive1_orc_8_false """ select * from simulation_hive1_orc where a +b = 11 ; """
qt_hive1_orc_9_false """ select * from simulation_hive1_orc where a +b != 11 ; """
qt_hive1_orc_10_false """ select * from simulation_hive1_orc where a +b != 11 and c = "keep"; """
qt_hive1_orc_11_false """ select * from simulation_hive1_orc where a +b != 11 and c != "keepxxx"; """
qt_hive1_orc_12_false """ select c from simulation_hive1_orc order by c; """
sql """ set hive_orc_use_column_names=true; """
qt_rename_orc_1_true """ select * from test_hive_rename_column_orc order by new_b,c """;
qt_rename_orc_2_true """ select new_b from test_hive_rename_column_orc order by new_b,c """;
qt_rename_orc_3_true """ select new_b,count(*) from test_hive_rename_column_orc group by new_b order by new_b """;
qt_rename_orc_4_true """ select * from test_hive_rename_column_orc where new_a = 1 order by new_b,c """;
qt_rename_orc_5_true """ select * from test_hive_rename_column_orc where new_d is not null order by new_b,c """
qt_rename_orc_6_true """ select * from test_hive_rename_column_orc where new_d is null order by new_b,c; """
qt_rename_orc_7_true """ select * from test_hive_rename_column_orc where new_b + new_a = 31 order by new_b,c; """
qt_rename_orc_8_true """ select new_a from test_hive_rename_column_orc where new_a = 1 order by new_b,c; """
qt_rename_orc_9_true """ select new_b from test_hive_rename_column_orc where new_b = 1 order by new_b; """
qt_rename_orc_10_true """ select new_b,new_d from test_hive_rename_column_orc where new_d +30*new_b=100 order by new_b,c; """
qt_rename_orc_11_true """ select new_b,new_a from test_hive_rename_column_orc order by new_b,c,new_a; """
qt_rename_orc_12_true """ select f,new_d,c,new_b,new_a from test_hive_rename_column_orc order by new_b,c; """
qt_rename_orc_13_true """ select * from test_hive_rename_column_orc where new_b + new_a != 31 order by new_b,c; """
sql """ set hive_orc_use_column_names=false; """
qt_rename_orc_1_false """ select * from test_hive_rename_column_orc order by new_b,c """;
qt_rename_orc_2_false """ select new_b from test_hive_rename_column_orc order by new_b,c """;
qt_rename_orc_3_false """ select new_b,count(*) from test_hive_rename_column_orc group by new_b order by new_b """;
qt_rename_orc_4_false """ select * from test_hive_rename_column_orc where new_a = 1 order by new_b,c """;
qt_rename_orc_5_false """ select * from test_hive_rename_column_orc where new_d is not null order by new_b """
qt_rename_orc_6_false """ select * from test_hive_rename_column_orc where new_d is null order by new_b,c; """
qt_rename_orc_7_false """ select * from test_hive_rename_column_orc where new_b + new_a = 31 order by new_b,c; """
qt_rename_orc_8_false """ select new_a from test_hive_rename_column_orc where new_a = 1 order by new_b,c; """
qt_rename_orc_9_false """ select new_b from test_hive_rename_column_orc where new_b = 1 order by new_b; """
qt_rename_orc_10_false """ select new_b,new_d from test_hive_rename_column_orc where new_d +30*new_b=100 order by new_b,c; """
qt_rename_orc_11_false """ select new_b,new_a from test_hive_rename_column_orc order by new_b,c,new_a; """
qt_rename_orc_12_false """ select f,new_d,c,new_b,new_a from test_hive_rename_column_orc order by new_b,c; """
qt_rename_orc_13_false """ select * from test_hive_rename_column_orc where new_b + new_a != 31 order by new_b,c; """
sql """ set hive_parquet_use_column_names=true; """
qt_rename_parquet_1_true """ select * from test_hive_rename_column_parquet order by new_b,c """;
qt_rename_parquet_2_true """ select new_b from test_hive_rename_column_parquet order by new_b,c """;
qt_rename_parquet_3_true """ select new_b,count(*) from test_hive_rename_column_parquet group by new_b order by new_b """;
qt_rename_parquet_4_true """ select * from test_hive_rename_column_parquet where new_a = 1 order by new_b,c """;
qt_rename_parquet_5_true """ select * from test_hive_rename_column_parquet where new_d is not null order by new_b,c """
qt_rename_parquet_6_true """ select * from test_hive_rename_column_parquet where new_d is null order by new_b,c; """
qt_rename_parquet_7_true """ select * from test_hive_rename_column_parquet where new_b + new_a = 31 order by new_b,c; """
qt_rename_parquet_8_true """ select new_a from test_hive_rename_column_parquet where new_a = 1 order by new_b,c; """
qt_rename_parquet_9_true """ select new_b from test_hive_rename_column_parquet where new_b = 1 order by new_b; """
qt_rename_parquet_10_true """ select new_b,new_d from test_hive_rename_column_parquet where new_d +30*new_b=100 order by new_b,c; """
qt_rename_parquet_11_true """ select new_b,new_a from test_hive_rename_column_parquet order by new_b,c,new_a; """
qt_rename_parquet_12_true """ select f,new_d,c,new_b,new_a from test_hive_rename_column_parquet order by new_b,c; """
qt_rename_parquet_13_true """ select * from test_hive_rename_column_parquet where new_b + new_a != 31 order by new_b,c; """
sql """ set hive_parquet_use_column_names=false; """
qt_rename_parquet_1_false """ select * from test_hive_rename_column_parquet order by new_b,c """;
qt_rename_parquet_2_false """ select new_b from test_hive_rename_column_parquet order by new_b,c """;
qt_rename_parquet_3_false """ select new_b,count(*) from test_hive_rename_column_parquet group by new_b order by new_b """;
qt_rename_parquet_4_false """ select * from test_hive_rename_column_parquet where new_a = 1 order by new_b,c """;
qt_rename_parquet_5_false """ select * from test_hive_rename_column_parquet where new_d is not null order by new_b,c """
qt_rename_parquet_6_false """ select * from test_hive_rename_column_parquet where new_d is null order by new_b,c; """
qt_rename_parquet_7_false """ select * from test_hive_rename_column_parquet where new_b + new_a = 31 order by new_b,c; """
qt_rename_parquet_8_false """ select new_a from test_hive_rename_column_parquet where new_a = 1 order by new_b,c; """
qt_rename_parquet_9_false """ select new_b from test_hive_rename_column_parquet where new_b = 1 order by new_b; """
qt_rename_parquet_10_false """ select new_b,new_d from test_hive_rename_column_parquet where new_d +30*new_b=100 order by new_b,c; """
qt_rename_parquet_11_false """ select new_b,new_a from test_hive_rename_column_parquet order by new_b,c,new_a; """
qt_rename_parquet_12_false """ select f,new_d,c,new_b,new_a from test_hive_rename_column_parquet order by new_b,c; """
qt_rename_parquet_13_false """ select * from test_hive_rename_column_parquet where new_b + new_a != 31 order by new_b,c; """
}
}
/*
CREATE TABLE simulation_hive1_orc(
`_col0` boolean,
`_col1` INT,
`_col2` STRING
)stored as orc;
insert into simulation_hive1_orc values(true,10,"hello world"),(false,20,"keep");
select * from simulation_hive1_orc;
alter table simulation_hive1_orc change column `_col0` a boolean;
alter table simulation_hive1_orc change column `_col1` b int;
alter table simulation_hive1_orc change column `_col2` c string;
select * from simulation_hive1_orc;
show create table simulation_hive1_orc;
CREATE TABLE test_hive_rename_column_orc(
a boolean,
b INT,
c STRING
)stored as orc;
insert into test_hive_rename_column_orc values (true,10,"hello world"),(false,20,"keep");
alter table test_hive_rename_column_orc change column a new_a boolean;
alter table test_hive_rename_column_orc change column b new_b int;
insert into test_hive_rename_column_orc values (true,30,"abcd"),(false,40,"new adcd");
select * from test_hive_rename_column_orc;
alter table test_hive_rename_column_orc add columns(d int,f string);
insert into test_hive_rename_column_orc values (true,50,"xxx",60,"cols"),(false,60,"yyy",100,"yyyyyy");
alter table test_hive_rename_column_orc change column d new_d int;
insert into test_hive_rename_column_orc values (true,70,"hahaha",8888,"abcd"),(false,80,"cmake",9999,"efg");
select * from test_hive_rename_column_orc;
show create table test_hive_rename_column_orc;
CREATE TABLE test_hive_rename_column_parquet(
a boolean,
b INT,
c STRING
)stored as parquet;
insert into test_hive_rename_column_parquet values (true,10,"hello world"),(false,20,"keep");
alter table test_hive_rename_column_parquet change column a new_a boolean;
alter table test_hive_rename_column_parquet change column b new_b int;
insert into test_hive_rename_column_parquet values (true,30,"abcd"),(false,40,"new adcd");
select * from test_hive_rename_column_parquet;
alter table test_hive_rename_column_parquet add columns(d int,f string);
insert into test_hive_rename_column_parquet values (true,50,"xxx",60,"cols"),(false,60,"yyy",100,"yyyyyy");
alter table test_hive_rename_column_parquet change column d new_d int;
insert into test_hive_rename_column_parquet values (true,70,"hahaha",8888,"abcd"),(false,80,"cmake",9999,"efg");
select * from test_hive_rename_column_parquet;
show create table test_hive_rename_column_parquet;
*/