From 3e160aeb66516e1b4063af6b1768f9ea7313cfd6 Mon Sep 17 00:00:00 2001 From: yangzhg <780531911@qq.com> Date: Wed, 12 Feb 2020 21:50:12 +0800 Subject: [PATCH] [GroupingSet] fix a bug when using grouping set without all column in a grouping set item (#2877) fix a bug when using grouping sets without all column in a grouping set item will produce wrong value. fix grouping function check will not work in group by clause --- be/src/exec/repeat_node.cpp | 4 +-- be/src/exec/repeat_node.h | 2 ++ .../cn/internal/grouping_sets_design.md | 4 +-- .../en/internal/grouping_sets_design_EN.md | 3 +- .../apache/doris/analysis/GroupByClause.java | 18 ++++------ .../apache/doris/analysis/GroupingInfo.java | 36 +++++++++---------- .../org/apache/doris/analysis/SelectStmt.java | 10 +++++- .../org/apache/doris/planner/RepeatNode.java | 9 ++++- .../doris/analysis/GroupByClauseTest.java | 12 ++----- gensrc/thrift/PlanNodes.thrift | 2 ++ 10 files changed, 53 insertions(+), 47 deletions(-) diff --git a/be/src/exec/repeat_node.cpp b/be/src/exec/repeat_node.cpp index 9de70ee079..274f07b0fe 100644 --- a/be/src/exec/repeat_node.cpp +++ b/be/src/exec/repeat_node.cpp @@ -30,6 +30,7 @@ RepeatNode::RepeatNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : ExecNode(pool, tnode, descs), _slot_id_set_list(tnode.repeat_node.slot_id_set_list), + _all_slot_ids(tnode.repeat_node.all_slot_ids), _repeat_id_list(tnode.repeat_node.repeat_id_list), _grouping_list(tnode.repeat_node.grouping_list), _output_tuple_id(tnode.repeat_node.output_tuple_id), @@ -114,8 +115,7 @@ Status RepeatNode::get_repeated_batch( DCHECK_EQ(src_slot_desc->type().type, dst_slot_desc->type().type); DCHECK_EQ(src_slot_desc->col_name(), dst_slot_desc->col_name()); // set null base on repeated list - // the first element in _slot_id_set_list contain all slots, so find in the _slot_id_set_list[0] - if (_slot_id_set_list[0].find(src_slot_desc->id()) != _slot_id_set_list[0].end()) { + if (_all_slot_ids.find(src_slot_desc->id()) != _all_slot_ids.end()) { std::set& repeat_ids = _slot_id_set_list[repeat_id_idx]; if (repeat_ids.find(src_slot_desc->id()) == repeat_ids.end()) { dst_tuples[j]->set_null(dst_slot_desc->null_indicator_offset()); diff --git a/be/src/exec/repeat_node.h b/be/src/exec/repeat_node.h index 98627a91d2..9f734f713c 100644 --- a/be/src/exec/repeat_node.h +++ b/be/src/exec/repeat_node.h @@ -45,6 +45,8 @@ private: // Slot id set used to indicate those slots need to set to null. std::vector> _slot_id_set_list; + // all slot id + std::set _all_slot_ids; // An integer bitmap list, it indicates the bit position of the exprs not null. std::vector _repeat_id_list; std::vector> _grouping_list; diff --git a/docs/documentation/cn/internal/grouping_sets_design.md b/docs/documentation/cn/internal/grouping_sets_design.md index 8e1b3c702f..9298b70777 100644 --- a/docs/documentation/cn/internal/grouping_sets_design.md +++ b/docs/documentation/cn/internal/grouping_sets_design.md @@ -127,7 +127,7 @@ GROUPING SETS ( ### 1.4 GROUPING 和 GROUPING_ID 函数 当我们没有统计某一列时,它的值显示为 NULL,这也可能是列本身就有 NULL 值,这就需要一种方法区分是没有统计还是值本来就是 NULL。为此引入 GROUPING 和 GROUPING_ID 函数。 -GROUPING(column:Column) 函数用于区分分组后的单个列是普通行和聚合行。如果是聚合行,则返回1,反之,则是0. GROUPING() 只能有一个参数列。 +GROUPING(column:Column) 函数用于区分分组后的单个列是普通列和聚合列。如果是聚合列,则返回1,反之,则是0. GROUPING() 只能有一个参数列。 GROUPING_ID(column1, column2) 则根据指定的column 顺序,否则根据聚合的时候给的集合的元素顺序,计算出一个列列表的 bitmap 值,一个列如果是聚合列为0,否则为1. GROUPING_ID()函数返回位向量的十进制值。 比如 [0 1 0] ->2 从下列第三个查询可以看到这种对应关系 @@ -255,7 +255,7 @@ Presto 支持组合,但不支持嵌套。 从语法上支持 GROUPING SETS, ROLLUP 和 CUBE。实现上述所述的1.1, 1.2, 1.3 1.4. -对于 1.5 GROUPING 函数 和 1.6 GROUPING SETS 的组合与嵌套 先不实现。 +对于1.6 GROUPING SETS 的组合与嵌套 先不实现。 具体语法列出如下: diff --git a/docs/documentation/en/internal/grouping_sets_design_EN.md b/docs/documentation/en/internal/grouping_sets_design_EN.md index 2c947f79f0..e1c82b41e8 100644 --- a/docs/documentation/en/internal/grouping_sets_design_EN.md +++ b/docs/documentation/en/internal/grouping_sets_design_EN.md @@ -253,7 +253,8 @@ Presto supports composition, but not nesting. ## 2. Object -Support `GROUPING SETS`, `ROLLUP` and `CUBE ` syntax,impliments 1.1, 1.2, 1.3 1.4, 1.5 +Support `GROUPING SETS`, `ROLLUP` and `CUBE ` syntax,impliments 1.1, 1.2, 1.3 1.4, 1.5, not support the combination + and nesting of GROUPING SETS at current version. ### 2.1 GROUPING SETS Syntax diff --git a/fe/src/main/java/org/apache/doris/analysis/GroupByClause.java b/fe/src/main/java/org/apache/doris/analysis/GroupByClause.java index 16e33e1e71..fcde85260e 100644 --- a/fe/src/main/java/org/apache/doris/analysis/GroupByClause.java +++ b/fe/src/main/java/org/apache/doris/analysis/GroupByClause.java @@ -48,9 +48,8 @@ public class GroupByClause implements ParseNode { private final static Logger LOG = LogManager.getLogger(GroupByClause.class); // max num of distinct sets in grouping sets clause - private final static int MAX_GROUPING_SETS_NUM = 16; + private final static int MAX_GROUPING_SETS_NUM = 64; // max num of distinct expressions - private final static int MAX_GROUPING_SETS_EXPRESSION_NUM = 64; private boolean analyzed_ = false; private boolean exprGenerated = false; private GroupingType groupingType; @@ -193,21 +192,16 @@ public class GroupByClause implements ParseNode { } } - if (groupingExprs != null && groupingExprs.size() > MAX_GROUPING_SETS_NUM) { - throw new AnalysisException( - "Too many sets in GROUP BY clause, it must be not more than " - + MAX_GROUPING_SETS_NUM); + if (isGroupByExtension() && groupingExprs != null && groupingExprs.size() > MAX_GROUPING_SETS_NUM) { + throw new AnalysisException("Too many sets in GROUP BY clause, the max grouping sets item is " + + MAX_GROUPING_SETS_NUM); } analyzed_ = true; } + // check if group by clause is contain grouping set/rollup/cube public boolean isGroupByExtension() { - if (groupingType == GroupingType.GROUP_BY || - groupingType == GroupingType.GROUPING_SETS && (groupingSetList == null || groupingSetList.size() < 2)) { - return false; - } else { - return true; - } + return groupingType != GroupingType.GROUP_BY; } @Override diff --git a/fe/src/main/java/org/apache/doris/analysis/GroupingInfo.java b/fe/src/main/java/org/apache/doris/analysis/GroupingInfo.java index 49ceb2dd32..391d2e613d 100644 --- a/fe/src/main/java/org/apache/doris/analysis/GroupingInfo.java +++ b/fe/src/main/java/org/apache/doris/analysis/GroupingInfo.java @@ -37,12 +37,13 @@ public class GroupingInfo { private Set groupingSlots; private List groupingIdList; private GroupByClause.GroupingType groupingType; + private BitSet bitSetAll; public GroupingInfo(Analyzer analyzer, GroupByClause.GroupingType groupingType) throws AnalysisException { this.groupingType = groupingType; groupingSlots = new LinkedHashSet<>(); virtualTuple = analyzer.getDescTbl().createTupleDescriptor("VIRTUAL_TUPLE"); - String colName = "GROUPING__ID"; + String colName = "GROUPING_ID"; groupingIDSlot = new VirtualSlotRef(colName, Type.BIGINT, virtualTuple, new ArrayList<>()); groupingIDSlot.analyze(analyzer); groupingSlots.add(groupingIDSlot); @@ -85,12 +86,14 @@ public class GroupingInfo { // generate the bitmap that repeated node will repeat rows according to public void buildRepeat(ArrayList groupingExprs, List> groupingSetList) { groupingIdList = new ArrayList<>(); - BitSet bitSetAll = new BitSet(); + bitSetAll = new BitSet(); bitSetAll.set(0, groupingExprs.size(), true); - groupingIdList.add(bitSetAll); switch (groupingType) { case CUBE: - for (int i = 0; i < (1 << groupingExprs.size()) - 1; i++) { + // it will generate the full permutation bitmap of cube item + // e.g. cube (k1,k2,k3) the result is ["{}", "{1}", "{0}", "{0, 1}", "{2}", "{1, 2}", "{0, 1, 2}", + // "{0, 2}"] + for (int i = 0; i < (1 << groupingExprs.size()); i++) { BitSet bitSet = new BitSet(); for (int j = 0; j < groupingExprs.size(); j++) { if ((i & (1 << j)) > 0) { @@ -102,7 +105,7 @@ public class GroupingInfo { break; case ROLLUP: - for (int i = 0; i < groupingExprs.size(); i++) { + for (int i = 0; i <= groupingExprs.size(); i++) { BitSet bitSet = new BitSet(); bitSet.set(0, i); groupingIdList.add(bitSet); @@ -110,17 +113,13 @@ public class GroupingInfo { break; case GROUPING_SETS: - BitSet bitSetBase = new BitSet(); - bitSetBase.set(0, groupingExprs.size()); for (ArrayList list : groupingSetList) { BitSet bitSet = new BitSet(); for (int i = 0; i < groupingExprs.size(); i++) { bitSet.set(i, list.contains(groupingExprs.get(i))); } - if (!bitSet.equals(bitSetBase)) { - if (!groupingIdList.contains(bitSet)) { - groupingIdList.add(bitSet); - } + if (!groupingIdList.contains(bitSet)) { + groupingIdList.add(bitSet); } } break; @@ -134,19 +133,18 @@ public class GroupingInfo { // generate grouping function's value public List> genGroupingList(ArrayList groupingExprs) { List> groupingList = new ArrayList<>(); - BitSet base = groupingIdList.get(0); for (SlotRef slot : groupingSlots) { List glist = new ArrayList<>(); for (BitSet bitSet : groupingIdList) { long l = 0L; // for all column, using for group by - if ("GROUPING__ID".equals(slot.getColumnName())) { + if ("GROUPING_ID".equals(slot.getColumnName())) { BitSet newBitSet = new BitSet(); - for (int i = 0; i < base.length(); ++i) { - newBitSet.set(i, bitSet.get(base.length() - i - 1)); + for (int i = 0; i < bitSetAll.length(); ++i) { + newBitSet.set(i, bitSet.get(bitSetAll.length() - i - 1)); } - newBitSet.flip(0, base.length()); - newBitSet.and(base); + newBitSet.flip(0, bitSetAll.length()); + newBitSet.and(bitSetAll); for (int i = 0; i < newBitSet.length(); ++i) { l += newBitSet.get(i) ? (1L << i) : 0L; } @@ -179,7 +177,7 @@ public class GroupingInfo { public void substituteGroupingFn(Expr expr, Analyzer analyzer) throws AnalysisException { if (expr instanceof GroupingFunctionCallExpr) { - // TODO(yangzhengguo) support expression in grouping functuons + // TODO(yangzhengguo) support expression in grouping functions for (Expr child: expr.getChildren()) { if (!(child instanceof SlotRef)) { throw new AnalysisException("grouping functions only support column in current version."); @@ -200,7 +198,7 @@ public class GroupingInfo { VirtualSlotRef vSlot = addGroupingSlots(((GroupingFunctionCallExpr) expr).getRealSlot(), analyzer); ((GroupingFunctionCallExpr) expr).resetChild(vSlot); expr.analyze(analyzer); - } else if (expr.getChildren().size() > 0) { + } else if (expr.getChildren() != null && expr.getChildren().size() > 0) { for (Expr child : expr.getChildren()) { substituteGroupingFn(child, analyzer); } diff --git a/fe/src/main/java/org/apache/doris/analysis/SelectStmt.java b/fe/src/main/java/org/apache/doris/analysis/SelectStmt.java index fec4a34d14..a4a92f91e9 100644 --- a/fe/src/main/java/org/apache/doris/analysis/SelectStmt.java +++ b/fe/src/main/java/org/apache/doris/analysis/SelectStmt.java @@ -351,6 +351,14 @@ public class SelectStmt extends QueryStmt { if (groupByClause != null && groupByClause.isGroupByExtension()) { groupingInfo = new GroupingInfo(analyzer, groupByClause.getGroupingType()); groupingInfo.substituteGroupingFn(resultExprs, analyzer); + } else { + for (Expr expr : resultExprs) { + if (checkGroupingFn(expr)) { + throw new AnalysisException( + "cannot use GROUPING functions without [grouping sets|rollup|cube] " + + "clause or grouping sets only have one element."); + } + } } if (valueList != null) { @@ -1404,7 +1412,7 @@ public class SelectStmt extends QueryStmt { private boolean checkGroupingFn(Expr expr) { if (expr instanceof GroupingFunctionCallExpr) { return true; - } else if (expr.getChildren().size() > 0) { + } else if (expr.getChildren() != null && expr.getChildren().size() > 0) { for (Expr child : expr.getChildren()) { if (checkGroupingFn(child)) { return true; diff --git a/fe/src/main/java/org/apache/doris/planner/RepeatNode.java b/fe/src/main/java/org/apache/doris/planner/RepeatNode.java index c06150607e..a9c02a9272 100644 --- a/fe/src/main/java/org/apache/doris/planner/RepeatNode.java +++ b/fe/src/main/java/org/apache/doris/planner/RepeatNode.java @@ -53,6 +53,7 @@ import java.util.stream.Collectors; public class RepeatNode extends PlanNode { private List> repeatSlotIdList; + private Set allSlotId; private TupleDescriptor outputTupleDesc; private List> groupingList; private GroupingInfo groupingInfo; @@ -88,6 +89,7 @@ public class RepeatNode extends PlanNode { } slotIdList.add(intSet); } + return slotIdList; } @@ -131,6 +133,7 @@ public class RepeatNode extends PlanNode { List> groupingIdList = new ArrayList<>(); List exprList = groupByClause.getGroupingExprs(); Preconditions.checkState(exprList.size() >= 2); + allSlotId = new HashSet<>(); for (BitSet bitSet : Collections.unmodifiableList(groupingInfo.getGroupingIdList())) { Set slotIdSet = new HashSet<>(); for (SlotDescriptor slotDesc : slotDescSet) { @@ -150,7 +153,11 @@ public class RepeatNode extends PlanNode { } groupingIdList.add(slotIdSet); } + this.repeatSlotIdList = buildIdSetList(groupingIdList); + for (Set s : this.repeatSlotIdList) { + allSlotId.addAll(s); + } this.groupingList = groupingInfo.genGroupingList(groupByClause.getGroupingExprs()); tupleIds.add(outputTupleDesc.getId()); for (TupleId id : tupleIds) { @@ -165,7 +172,7 @@ public class RepeatNode extends PlanNode { protected void toThrift(TPlanNode msg) { msg.node_type = TPlanNodeType.REPEAT_NODE; msg.repeat_node = new TRepeatNode(outputTupleDesc.getId().asInt(), repeatSlotIdList, groupingList.get(0), - groupingList); + groupingList, allSlotId); } @Override diff --git a/fe/src/test/java/org/apache/doris/analysis/GroupByClauseTest.java b/fe/src/test/java/org/apache/doris/analysis/GroupByClauseTest.java index efcd1b8736..1bfa604af5 100644 --- a/fe/src/test/java/org/apache/doris/analysis/GroupByClauseTest.java +++ b/fe/src/test/java/org/apache/doris/analysis/GroupByClauseTest.java @@ -97,10 +97,8 @@ public class GroupByClauseTest { + ".`k3`, `testdb`.`t`.`k2`), (`testdb`.`t`.`k1`, `testdb`.`t`.`k3`), (`testdb`.`t`.`k4`), (`testdb`" + ".`t`.`k1`, `testdb`.`t`.`k2`, `testdb`.`t`.`k3`, `testdb`.`t`.`k4`))", groupByClause.toSql()); List bitSetList = groupingInfo.getGroupingIdList(); - bitSetList.remove(0); - { - String[] answer = {"{0, 1}", "{0, 2}", "{3}"}; + String[] answer = {"{0, 1, 2, 3}", "{0, 1}", "{0, 2}", "{3}"}; Set answerSet = new HashSet(Arrays.asList(answer)); Set resultSet = new HashSet<>(); for (BitSet aBitSetList : bitSetList) { @@ -137,10 +135,8 @@ public class GroupByClauseTest { Assert.assertEquals("ROLLUP (`testdb`.`t`.`k2`, `testdb`.`t`.`k3`, " + "`testdb`.`t`.`k4`, `testdb`.`t`.`k3`)", groupByClause.toSql()); List bitSetList = groupingInfo.getGroupingIdList(); - bitSetList.remove(0); - { - String[] answer = {"{}", "{0}", "{0, 1}"}; + String[] answer = {"{}", "{0}", "{0, 1}", "{0, 1, 2}"}; Set answerSet = new HashSet(Arrays.asList(answer)); Set resultSet = new HashSet<>(); for (BitSet aBitSetList : bitSetList) { @@ -176,10 +172,8 @@ public class GroupByClauseTest { Assert.assertEquals(4, groupByClause.getGroupingExprs().size()); List bitSetList = groupingInfo.getGroupingIdList(); - bitSetList.remove(0); - { - String[] answer = {"{}", "{1}", "{0}", "{0, 1}", "{2}", "{1, 2}", "{0, 2}"}; + String[] answer = {"{}", "{1}", "{0}", "{0, 1}", "{2}", "{1, 2}", "{0, 1, 2}", "{0, 2}"}; Set answerSet = new HashSet(Arrays.asList(answer)); Set resultSet = new HashSet<>(); for (BitSet aBitSetList : bitSetList) { diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 6775a0e231..0fcc44bb65 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -396,6 +396,8 @@ struct TRepeatNode { 3: required list repeat_id_list // A list of integer list, it indicates the position of the grouping virtual slot. 4: required list> grouping_list + // A list of all slot + 5: required set all_slot_ids } struct TPreAggregationNode {