[fix](join) fix wrong result of right join (#18365)

When processing data in hash table for right join and full outer join, if the output data rows of one hash bucket excceeds batch size, the logic when continue processing this bucket is wrong, it should differentiate between different join types.
This commit is contained in:
TengJianPing
2023-04-06 10:55:58 +08:00
committed by GitHub
parent a01d824256
commit 4ca0c0face
3 changed files with 196 additions and 9 deletions

View File

@ -1058,8 +1058,24 @@ Status ProcessHashTableProbe<JoinOpType>::process_data_in_hashtable(HashTableTyp
};
if (visited_iter.ok()) {
for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) {
insert_from_hash_table(visited_iter->block_offset, visited_iter->row_num);
if constexpr (std::is_same_v<Mapped, RowRefListWithFlag>) {
for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) {
insert_from_hash_table(visited_iter->block_offset, visited_iter->row_num);
}
} else {
for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) {
if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) {
if (visited_iter->visited) {
insert_from_hash_table(visited_iter->block_offset,
visited_iter->row_num);
}
} else {
if (!visited_iter->visited) {
insert_from_hash_table(visited_iter->block_offset,
visited_iter->row_num);
}
}
}
}
if (!visited_iter.ok()) {
++iter;
@ -1070,16 +1086,16 @@ Status ProcessHashTableProbe<JoinOpType>::process_data_in_hashtable(HashTableTyp
auto& mapped = iter->get_second();
if constexpr (std::is_same_v<Mapped, RowRefListWithFlag>) {
if (mapped.visited) {
visited_iter = mapped.begin();
for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) {
if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) {
if constexpr (JoinOpType == TJoinOp::RIGHT_SEMI_JOIN) {
visited_iter = mapped.begin();
for (; visited_iter.ok() && block_size < _batch_size; ++visited_iter) {
insert_from_hash_table(visited_iter->block_offset,
visited_iter->row_num);
}
}
if (visited_iter.ok()) {
// block_size >= _batch_size, quit for loop
break;
if (visited_iter.ok()) {
// block_size >= _batch_size, quit for loop
break;
}
}
} else {
if constexpr (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN) {

View File

@ -0,0 +1,13 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql_right_outer_join_with_other_conjuncts --
27796
-- !sql_full_join_with_other_conjuncts --
27796
-- !sql_right_semi_join_with_other_conjuncts --
8153
-- !sql_right_anti_join_with_other_conjuncts --
19643

View File

@ -0,0 +1,158 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
suite("test_right_join") {
sql """set enable_nereids_planner=false"""
sql """set parallel_fragment_exec_instance_num=1"""
sql "use regression_test_tpcds_sf1_p1"
qt_sql_right_outer_join_with_other_conjuncts """
WITH customer_total_return AS (
SELECT
ca_state ctr_state,
sum(cr_return_amt_inc_tax) ctr_total_return
FROM
catalog_returns,
date_dim,
customer_address
WHERE
(cr_returned_date_sk = d_date_sk)
AND (d_year = 2000)
AND (cr_returning_addr_sk = ca_address_sk)
GROUP BY
cr_returning_customer_sk,
ca_state
),
ctr2 as (
SELECT
(avg(ctr_total_return)) ctr_total_return,
ctr_state
FROM
customer_total_return
group by
ctr_state
)
SELECT
count(ctr1.ctr_state)
FROM
ctr2 right
join customer_total_return ctr1 on ctr1.ctr_state = ctr2.ctr_state
and ctr1.ctr_total_return > ctr2.ctr_total_return;
"""
qt_sql_full_join_with_other_conjuncts """
WITH customer_total_return AS (
SELECT
ca_state ctr_state,
sum(cr_return_amt_inc_tax) ctr_total_return
FROM
catalog_returns,
date_dim,
customer_address
WHERE
(cr_returned_date_sk = d_date_sk)
AND (d_year = 2000)
AND (cr_returning_addr_sk = ca_address_sk)
GROUP BY
cr_returning_customer_sk,
ca_state
),
ctr2 as (
SELECT
(avg(ctr_total_return)) ctr_total_return,
ctr_state
FROM
customer_total_return
group by
ctr_state
)
SELECT
count(ctr1.ctr_state)
FROM
ctr2 full
join customer_total_return ctr1 on ctr1.ctr_state = ctr2.ctr_state
and ctr1.ctr_total_return > ctr2.ctr_total_return;
"""
qt_sql_right_semi_join_with_other_conjuncts """
WITH customer_total_return AS (
SELECT
ca_state ctr_state,
sum(cr_return_amt_inc_tax) ctr_total_return
FROM
catalog_returns,
date_dim,
customer_address
WHERE
(cr_returned_date_sk = d_date_sk)
AND (d_year = 2000)
AND (cr_returning_addr_sk = ca_address_sk)
GROUP BY
cr_returning_customer_sk,
ca_state
),
ctr2 as (
SELECT
(avg(ctr_total_return)) ctr_total_return,
ctr_state
FROM
customer_total_return
group by
ctr_state
)
SELECT
count(ctr1.ctr_state)
FROM
ctr2 right semi
join customer_total_return ctr1 on ctr1.ctr_state = ctr2.ctr_state
and ctr1.ctr_total_return > ctr2.ctr_total_return;
"""
qt_sql_right_anti_join_with_other_conjuncts """
WITH customer_total_return AS (
SELECT
ca_state ctr_state,
sum(cr_return_amt_inc_tax) ctr_total_return
FROM
catalog_returns,
date_dim,
customer_address
WHERE
(cr_returned_date_sk = d_date_sk)
AND (d_year = 2000)
AND (cr_returning_addr_sk = ca_address_sk)
GROUP BY
cr_returning_customer_sk,
ca_state
),
ctr2 as (
SELECT
(avg(ctr_total_return)) ctr_total_return,
ctr_state
FROM
customer_total_return
group by
ctr_state
)
SELECT
count(ctr1.ctr_state)
FROM
ctr2 right anti
join customer_total_return ctr1 on ctr1.ctr_state = ctr2.ctr_state
and ctr1.ctr_total_return > ctr2.ctr_total_return;
"""
}