[fix](orc)fix orc reader missing column. (#35735)

## Proposed changes
bp #35583 
Issue Number: close #xxx

<!--Describe your changes.-->
This commit is contained in:
daidai
2024-05-31 22:51:44 +08:00
committed by GitHub
parent 1d89dd7607
commit bc062a2595
4 changed files with 310 additions and 9 deletions

View File

@ -821,6 +821,15 @@ Status OrcReader::set_fill_columns(
if (iter == predicate_columns.end()) {
_lazy_read_ctx.missing_columns.emplace(kv.first, kv.second);
} else {
// Register the conjuncts that reference this missing column (e.g. `missing_col == xx`, `missing_col IS NULL`, `missing_col IS NOT NULL`) as filter conjuncts so they are still evaluated.
if (_slot_id_to_filter_conjuncts->find(iter->second.second) !=
_slot_id_to_filter_conjuncts->end()) {
for (auto& ctx : _slot_id_to_filter_conjuncts->find(iter->second.second)->second) {
_filter_conjuncts.emplace_back(ctx);
}
}
// predicate_missing_columns maps each missing column to a VLiteral, which is used to fill in the default value for that column.
_lazy_read_ctx.predicate_missing_columns.emplace(kv.first, kv.second);
_lazy_read_ctx.all_predicate_col_ids.emplace_back(iter->second.first);
}
@ -1732,10 +1741,6 @@ Status OrcReader::get_next_block_impl(Block* block, size_t* read_rows, bool* eof
for (auto& conjunct : _non_dict_filter_conjuncts) {
filter_conjuncts.emplace_back(conjunct);
}
//include missing_columns != missing_columns ; missing_column is null; missing_column != file_columns etc...
for (auto& [missing_col, conjunct] : _lazy_read_ctx.predicate_missing_columns) {
filter_conjuncts.emplace_back(conjunct);
}
std::vector<IColumn::Filter*> filters;
if (_delete_rows_filter_ptr) {
filters.push_back(_delete_rows_filter_ptr.get());
@ -1757,6 +1762,7 @@ Status OrcReader::get_next_block_impl(Block* block, size_t* read_rows, bool* eof
RETURN_IF_CATCH_EXCEPTION(
Block::filter_block_internal(block, columns_to_filter, result_filter));
}
// _not_single_slot_filter_conjuncts handles predicates that span more than one slot, e.g. `missing_col1 == missing_col2` or `missing_col == existing_col`.
if (!_not_single_slot_filter_conjuncts.empty()) {
RETURN_IF_ERROR(_convert_dict_cols_to_string_cols(block, &batch_vec));
RETURN_IF_CATCH_EXCEPTION(
@ -1894,10 +1900,6 @@ Status OrcReader::filter(orc::ColumnVectorBatch& data, uint16_t* sel, uint16_t s
for (auto& conjunct : _non_dict_filter_conjuncts) {
filter_conjuncts.emplace_back(conjunct);
}
//include missing_columns != missing_columns ; missing_column is null; missing_column != file_columns etc...
for (auto& [missing_col, conjunct] : _lazy_read_ctx.predicate_missing_columns) {
filter_conjuncts.emplace_back(conjunct);
}
std::vector<IColumn::Filter*> filters;
if (_delete_rows_filter_ptr) {
filters.push_back(_delete_rows_filter_ptr.get());

View File

@ -603,6 +603,26 @@ CREATE TABLE `unsupported_type_table`(
set hive.stats.column.autogather=false;
-- Fixture for the ORC add-column regression test (test_hive_orc_add_column).
-- The schema is widened twice between inserts, so rows are written under three
-- different table schemas. Presumably the data files written before an ALTER
-- do not contain the columns added later (TODO confirm against Hive behavior)
-- and the reader has to treat them as missing columns.
CREATE TABLE `test_hive_orc_add_column`(
id int,
col1 int
)
stored as orc;
-- Phase 1: rows written while the table only has (id, col1).
insert into `test_hive_orc_add_column` values(1,2);
insert into `test_hive_orc_add_column` values(3,4),(4,6);
-- Phase 2: add col2, then write rows with (id, col1, col2), some with an explicit NULL col2.
alter table `test_hive_orc_add_column` ADD COLUMNS (col2 int);
insert into `test_hive_orc_add_column` values(7,8,9);
insert into `test_hive_orc_add_column` values(10,11,null);
insert into `test_hive_orc_add_column` values(12,13,null);
insert into `test_hive_orc_add_column` values(14,15,16);
-- Phase 3: add col3 and col4, then write rows with all five columns, mixing values and explicit NULLs.
alter table `test_hive_orc_add_column` ADD COLUMNS (col3 int,col4 string);
insert into `test_hive_orc_add_column` values(17,18,19,20,"hello world");
insert into `test_hive_orc_add_column` values(21,22,23,24,"cywcywcyw");
insert into `test_hive_orc_add_column` values(25,26,null,null,null);
insert into `test_hive_orc_add_column` values(27,28,29,null,null);
insert into `test_hive_orc_add_column` values(30,31,32,33,null);
CREATE TABLE `schema_evo_test_text`(
id int,
name string
@ -2500,4 +2520,3 @@ PARTITIONED BY (
`varchar_col` varchar(50))
stored as orc
TBLPROPERTIES("orc.compress"="ZLIB");

View File

@ -0,0 +1,185 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !orc_add_col1 --
1 2 \N \N \N
3 4 \N \N \N
4 6 \N \N \N
7 8 9 \N \N
10 11 \N \N \N
12 13 \N \N \N
14 15 16 \N \N
17 18 19 20 hello world
21 22 23 24 cywcywcyw
25 26 \N \N \N
27 28 29 \N \N
30 31 32 33 \N
-- !orc_add_col2 --
-- !orc_add_col3 --
-- !orc_add_col4 --
1 2 \N \N \N
3 4 \N \N \N
4 6 \N \N \N
10 11 \N \N \N
12 13 \N \N \N
25 26 \N \N \N
-- !orc_add_col5 --
\N
\N
\N
\N
\N
\N
-- !orc_add_col6 --
1 2 \N \N \N
3 4 \N \N \N
4 6 \N \N \N
7 8 9 \N \N
10 11 \N \N \N
12 13 \N \N \N
14 15 16 \N \N
25 26 \N \N \N
27 28 29 \N \N
-- !orc_add_col7 --
\N
\N
\N
\N
\N
\N
\N
\N
\N
-- !orc_add_col8 --
1 2 \N \N \N
3 4 \N \N \N
4 6 \N \N \N
7 8 9 \N \N
10 11 \N \N \N
12 13 \N \N \N
14 15 16 \N \N
25 26 \N \N \N
27 28 29 \N \N
30 31 32 33 \N
-- !orc_add_col9 --
\N
\N
\N
\N
\N
\N
\N
\N
\N
\N
-- !orc_add_col10 --
1 2 \N \N \N
3 4 \N \N \N
4 6 \N \N \N
7 8 9 \N \N
10 11 \N \N \N
12 13 \N \N \N
14 15 16 \N \N
17 18 19 20 hello world
21 22 23 24 cywcywcyw
25 26 \N \N \N
27 28 29 \N \N
30 31 32 33 \N
-- !orc_add_col11 --
2
4
6
8
11
13
15
18
22
26
28
31
-- !orc_add_col12 --
7 8 9 \N \N
14 15 16 \N \N
17 18 19 20 hello world
21 22 23 24 cywcywcyw
27 28 29 \N \N
30 31 32 33 \N
-- !orc_add_col13 --
9
16
19
23
29
32
-- !orc_add_col14 --
17 18 19 20 hello world
21 22 23 24 cywcywcyw
30 31 32 33 \N
-- !orc_add_col15 --
20
24
33
-- !orc_add_col16 --
17 18 19 20 hello world
21 22 23 24 cywcywcyw
-- !orc_add_col17 --
cywcywcyw
hello world
-- !orc_add_col18 --
7 8 9 \N \N
-- !orc_add_col19 --
-- !orc_add_col20 --
7 8 9 \N \N
14 15 16 \N \N
17 18 19 20 hello world
21 22 23 24 cywcywcyw
27 28 29 \N \N
30 31 32 33 \N
-- !orc_add_col21 --
7 8 9 \N \N
14 15 16 \N \N
17 18 19 20 hello world
21 22 23 24 cywcywcyw
27 28 29 \N \N
30 31 32 33 \N
-- !orc_add_col22 --
-- !orc_add_col23 --
30 31 32 33 \N
-- !orc_add_col24 --
-- !orc_add_col25 --
17 18 19 20 hello world
21 22 23 24 cywcywcyw
30 31 32 33 \N
-- !orc_add_col26 --
-- !orc_add_col27 --
21 22 23 24 cywcywcyw
-- !orc_add_col28 --
17 18 19 20 hello world
21 22 23 24 cywcywcyw

View File

@ -0,0 +1,95 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// Regression suite for reading a Hive ORC table whose schema was widened with
// ALTER TABLE ... ADD COLUMNS between inserts. It checks full scans, IS NULL /
// IS NOT NULL predicates, equality predicates and multi-column arithmetic
// predicates on the added columns (col2, col3, col4) against the recorded
// results in the matching .out file.
suite("test_hive_orc_add_column", "all_types,p0,external,hive,external_docker,external_docker_hive") {
    // The suite needs the external Hive docker environment; skip it otherwise.
    String enabled = context.config.otherConfigs.get("enableHiveTest")
    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
        logger.info("disable Hive test.")
        return;
    }

    // Resolved outside the try block so the cleanup in `finally` can see catalog_name.
    String hms_port = context.config.otherConfigs.get("hive3HmsPort")
    String catalog_name = "hive3_test_orc_add_column"
    String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")

    try {
        sql """drop catalog if exists ${catalog_name}"""
        sql """create catalog if not exists ${catalog_name} properties (
            "type"="hms",
            'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
        );"""
        sql """use `${catalog_name}`.`default`"""

        // Full scan: rows written before a column was added must show NULL for it.
        qt_orc_add_col1 """select * from test_hive_orc_add_column order by id ;"""

        // IS NULL predicates on each column, with and without the other columns projected.
        qt_orc_add_col2 """select * from test_hive_orc_add_column where col1 is null order by id ;"""
        qt_orc_add_col3 """select col1 from test_hive_orc_add_column where col1 is null;"""
        qt_orc_add_col4 """select * from test_hive_orc_add_column where col2 is null order by id ;"""
        qt_orc_add_col5 """select col2 from test_hive_orc_add_column where col2 is null;"""
        qt_orc_add_col6 """select * from test_hive_orc_add_column where col3 is null order by id ;"""
        qt_orc_add_col7 """select col3 from test_hive_orc_add_column where col3 is null;"""
        qt_orc_add_col8 """select * from test_hive_orc_add_column where col4 is null order by id ;"""
        qt_orc_add_col9 """select col4 from test_hive_orc_add_column where col4 is null;"""

        // IS NOT NULL predicates on each column.
        qt_orc_add_col10 """select * from test_hive_orc_add_column where col1 is not null order by id ;"""
        qt_orc_add_col11 """select col1 from test_hive_orc_add_column where col1 is not null order by col1;"""
        qt_orc_add_col12 """select * from test_hive_orc_add_column where col2 is not null order by id ;"""
        qt_orc_add_col13 """select col2 from test_hive_orc_add_column where col2 is not null order by col2;"""
        qt_orc_add_col14 """select * from test_hive_orc_add_column where col3 is not null order by id ;"""
        qt_orc_add_col15 """select col3 from test_hive_orc_add_column where col3 is not null order by col3;"""
        qt_orc_add_col16 """select * from test_hive_orc_add_column where col4 is not null order by id ;"""
        qt_orc_add_col17 """select col4 from test_hive_orc_add_column where col4 is not null order by col4;"""

        // Equality and arithmetic predicates involving col2.
        qt_orc_add_col18 """select * from test_hive_orc_add_column where col2 = 9 order by id ;"""
        qt_orc_add_col19 """select * from test_hive_orc_add_column where col2 = 190 order by id ;"""
        qt_orc_add_col20 """select * from test_hive_orc_add_column where col2 - col1 = 1 order by id ;"""
        qt_orc_add_col21 """select * from test_hive_orc_add_column where col2 - id = 2 order by id ;"""
        qt_orc_add_col22 """select * from test_hive_orc_add_column where col2 - id = 3 order by id ;"""

        // Equality and arithmetic predicates involving col3.
        qt_orc_add_col23 """select * from test_hive_orc_add_column where col3 = 33 order by id ;"""
        qt_orc_add_col24 """select * from test_hive_orc_add_column where col3 = 330 order by id ;"""
        qt_orc_add_col25 """select * from test_hive_orc_add_column where col3 - col1 = 2 order by id ;"""
        qt_orc_add_col26 """select * from test_hive_orc_add_column where col3 - id != 3 order by id ;"""

        // Predicates that combine several columns, including ones added by different ALTERs.
        qt_orc_add_col27 """select * from test_hive_orc_add_column where col1 + col2 + col3 = 23*3 order by id ;"""
        qt_orc_add_col28 """select * from test_hive_orc_add_column where col1 + col2 + col3 != 32*3 order by id ; """
    } finally {
        // Always drop the catalog, even when a query above fails, so a failed
        // run does not leave it behind.
        sql """drop catalog if exists ${catalog_name}"""
    }
}
// CREATE TABLE `test_hive_orc_add_column`(
// id int,
// col1 int
// )
// stored as orc;
// insert into `test_hive_orc_add_column` values(1,2);
// insert into `test_hive_orc_add_column` values(3,4),(4,6);
// alter table `test_hive_orc_add_column` ADD COLUMNS(col2 int);
// insert into `test_hive_orc_add_column` values(7,8,9);
// insert into `test_hive_orc_add_column` values(10,11,null);
// insert into `test_hive_orc_add_column` values(12,13,null);
// insert into `test_hive_orc_add_column` values(14,15,16);
// alter table `test_hive_orc_add_column` ADD COLUMNS(col3 int,col4 string);
// insert into `test_hive_orc_add_column` values(17,18,19,20,"hello world");
// insert into `test_hive_orc_add_column` values(21,22,23,24,"cywcywcyw");
// insert into `test_hive_orc_add_column` values(25,26,null,null,null);
// insert into `test_hive_orc_add_column` values(27,28,29,null,null);
// insert into `test_hive_orc_add_column` values(30,31,32,33,null);