diff --git a/planner/core/logical_plans.go b/planner/core/logical_plans.go index b1554122b2..7f89a82693 100644 --- a/planner/core/logical_plans.go +++ b/planner/core/logical_plans.go @@ -965,19 +965,6 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet { // extract equivalence cols. equivUniqueIDs := extractEquivalenceCols(p.Conditions, p.SCtx(), fds) - // after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence - // cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a - // lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all - // left rows, once there is a null-refusing predicate on the inner side on upper layer, this - // can be equivalence again. (the outer rows left are all coming from equal matching) - // - // why not just makeNotNull of them, because even a non-equiv-related inner col can also - // refuse supplied null values. - if fds.Rule333Equiv.InnerCols.Len() != 0 && notnullColsUniqueIDs.Intersects(fds.Rule333Equiv.InnerCols) { - // restore/re-strength FDs from rule 333 - fds.MakeRestoreRule333() - } - // apply operator's characteristic's FD setting. fds.MakeNotNull(notnullColsUniqueIDs) fds.AddConstants(constUniqueIDs) diff --git a/planner/funcdep/doc.go b/planner/funcdep/doc.go new file mode 100644 index 0000000000..599439c025 --- /dev/null +++ b/planner/funcdep/doc.go @@ -0,0 +1,180 @@ +// Copyright 2022 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package funcdep + +// Theory to Practice +// +// For more rigorous examination of functional dependencies and their +// interaction with various SQL operators, see the following Master's Thesis: +// +// Norman Paulley, Glenn. (2000). +// Exploiting Functional Dependence in Query Optimization. +// https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf + +// TODO: Add the RFC design. + +// NOTE 1. +// When handling a Lax FD, we don't care about null values in the dependency, which means +// that a not-null attribute covering the determinant is enough to make a Lax FD a strict one. + +// The definition of "lax" used in the paper differs from the definition used by this +// library. For a lax dependency A~~>B, the paper allows this set of rows: +// +// a b +// ------- +// 1 1 +// 1 NULL +// +// This alternate definition is briefly covered in section 2.5.3.2 of the paper (see definition +// 2.19). The reason for this change is to allow a lax dependency to be upgraded to a strict +// dependency more readily, needing only the determinant columns to be not-null rather than +// both determinant and dependant columns. +// +// This is on the condition that, for the same definite determinant value of a Lax FD, there won't +// be two different definite dependant values. That's true, because there is no way to derive +// this kind of FD. +// +// Even in our implementation of outer join, the only way to produce duplicate definite +// determinant is the join predicate. But for now, we only maintain the equivalence and +// some strict FD of it. +// +// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1 +// a b | c d e +// ------+---------------- +// 1 1 | 1 NULL 1 +// 1 2 | NULL NULL NULL +// 2 1 | NULL NULL NULL +// +// Actually it's possible: the lax FD {a} -> {c} can be derived, but it is not that useful. We only +// maintain the {c} ~> {a} for existence after outer join.
Besides, there are two Cond-FDs that should +// be preserved, waiting to become visible again once a null-reject predicate is applied to their +// null-constraint columns. (see below) +// +// NOTE 2. +// When handling outer join, it won't produce a lax FD with duplicate definite determinant values and +// different dependency values. +// +// In implementation, we come across some lax FDs that depend on the null-reject of some other cols. For +// example. +// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1 +// a b | c d e +// ------+---------------- +// 1 1 | 1 NULL 1 +// 1 2 | NULL NULL NULL +// 2 1 | NULL NULL NULL +// +// here the constant FD {} -> {b} won't exist after the outer join is done. Notice the null-constraint +// {c,d,e} -| {c,d,e}: this FD should be preserved and will be visible again when some null-reject +// predicate takes effect on the null-constraint cols. +// +// It's the same for the strict equivalence {t.a} = {t1.c}. Notice there is no lax equivalence here, because +// the left side can't be guaranteed to be definite or null, like a=2 here. Let's collect all of these +// on-condition FDs, each paired with a null-constraint column set, and name them Cond-FD. +// +// Lax equivalences are theoretically possible, but one won't be constructed from an outer join unless +// t already has a constant FD on column `a` here before the outer join runs. So the lax equivalence +// has some pre-conditions as you see, and it couldn't cover the case shown above. Let us handle it the way a +// Cond-FD does. +// +// The FDs constructed from the join predicate should be considered Cond-FDs. Here they are the equivalence +// {a} == {c} and the constant FD {b} = 1 (if the join condition is e=1, it belongs here too). We can say that for +// every matched row, these FDs are valid, while for the other rows, the inner side is supplied with null +// rows. So these FDs are stored as ncEdges whose nc condition is all inner table cols.
+// +// To solve the problem above, we introduced an invisible FD with null-constraint columns, named Cond-FD. +// For multiple nested left joins, we take the following case as an example. +// a,b c,d,e +// -----------+----------- +// 1 2 | 1 1 1 +// 2 2 | +// -----------+----------- +// +// left join on (a=c) res: +// a b c d e +// ------------------------- +// 1 2 1 1 1 +// 2 2 +- null null null -+ +// | | +// +-------------------+ +// \ +// \ +// the Cond-FD is < a=c with {c,d,e} >, the latter being the null constraint cols +// +// e,f +// ----------------------- +// 1 2 +// 2 2 +// 3 3 +// ----------------------- +// +// left join on (e=a) res: +// e f a b c d e +// ----------------------------------- +// 1 2 1 2 1 1 1 +// 2 2 2 2 +- null null null --+---------------> Cond-FD still exists. +// 3 3 +-null null | null null null |---+ +// | +-------------------+ | +// +-----------------------------------+-----------> New Cond-FD occurs. +// +// +// the old Cond-FD with null constraint columns set {c,d,e} is preserved because the newly appended cols are all null too. +// the new Cond-FD with null constraint columns set {a,b,c,d,e} is also meaningful: even if the null-reject column +// is one of {c,d,e}, which may filter one of the matched rows out of the result, the equivalence {a}={e} still exists. +// +// Provided that the result of the first left join is like: +// left join on (a=c) res: +// a b c d e +// --------------------------- +// 1 2 1 1 1 +// null 2 null null null +// +// THEN: left join on (e=a) res: +// e f a b c d e +// --------------------------------- +// 1 2 1 2 1 1 1 +// 2 2 null null null null null +// 3 3 null null null null null +// +// Even so, both the old Cond-FD and the new Cond-FD still exist. It seems the null-constraint column set of +// the old Cond-FD {c,d,e} can be expanded to {a,b,c,d,e} visually, but we can't derive that from the join predicate +// (e=a).
The null-reject of column `a` couldn't bring the visibility to the old Cond-FD theoretically, it just happened +// to refuse that row with a null value in column a. +// +// Think about adding one more row in first left join result. +// +// left join on (a=c) res: +// a b c e e +// --------------------------- +// 1 2 1 1 1 +// null 2 null null null +// 3 3 null null null +// +// THEN: left join on (e=a) res: +// e f a b c d e +// --------------------------------- +// 1 2 1 2 1 1 1 +// 2 2 null null null null null +// 3 3 3 3 null null null +// +// Conclusion: +// As you see that's right we couldn't derive the inference of the join predicate (e=a) to expand old Cond-FD's nc +// {c,d,e} as {a,b,c,d,e}. So the rule for Cond-FD is quite simple, just keep the old ncEdge from right, appending +// the new ncEdges in current left join. +// +// If the first left join result is in the outer side of the second left join, just keep the ncEdge from left as well, +// appending the new ncEdges in current left join. +// +// For a inner join, both side of the join result won't be appended with null-supplied rows, so we can simply collect +// the ncEdges from both join side together. diff --git a/planner/funcdep/fd_graph.go b/planner/funcdep/fd_graph.go index 9aad5f2d13..4dee30d437 100644 --- a/planner/funcdep/fd_graph.go +++ b/planner/funcdep/fd_graph.go @@ -32,12 +32,23 @@ type fdEdge struct { // And if there's a functional dependency `const` -> `column` exists. We would let the from side be empty. strict bool equiv bool + + // FD with non-nil conditionNC is hidden in FDSet, it will be visible again when at least one null-reject column in conditionNC. + // conditionNC should be satisfied before some FD make vision again, it's quite like lax FD to be strengthened as strict + // one. But the constraints should take effect on specified columns from conditionNC rather than just determinant columns. 
+ conditionNC *FastIntSet } // FDSet is the main portal of functional dependency, it stores the relationship between (extended table / physical table)'s // columns. For more theory about this design, ref the head comments in the funcdep/doc.go. type FDSet struct { fdEdges []*fdEdge + // after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence + // cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a + // lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all + // left rows, once there is a null-refusing predicate on the inner side on upper layer, this + // can be equivalence again. (the outer rows left are all coming from equal matching) + ncEdges []*fdEdge // NotNullCols is used to record the columns with not-null attributes applied. // eg: {1} ~~> {2,3}, when {2,3} not null is applied, it actually does nothing. // but we should record {2,3} as not-null down for the convenience of transferring @@ -50,18 +61,9 @@ type FDSet struct { // GroupByCols is used to record columns / expressions that under the group by phrase. GroupByCols FastIntSet HasAggBuilt bool - // after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence - // cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a - // lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all - // left rows, once there is a null-refusing predicate on the inner side on upper layer, this - // can be equivalence again. (the outer rows left are all coming from equal matching) - // - // why not just makeNotNull of them, because even a non-equiv-related inner col can also - // refuse supplied null values. - Rule333Equiv struct { - Edges []*fdEdge - InnerCols FastIntSet - } + + // todo: when multi join and across select block, this may need to be maintained more precisely. + } // ClosureOfStrict is exported for outer usage. 
@@ -215,6 +217,20 @@ func (s *FDSet) AddLaxFunctionalDependency(from, to FastIntSet) { s.addFunctionalDependency(from, to, false, false) } +// AddNCFunctionalDependency is to add conditional functional dependency to the fdGraph. +func (s *FDSet) AddNCFunctionalDependency(from, to, nc FastIntSet, strict, equiv bool) { + // Since nc edge is invisible by now, just collecting them together simply, once the + // null-reject on nc cols is satisfied, let's pick them out and insert into the fdEdge + // normally. + s.ncEdges = append(s.ncEdges, &fdEdge{ + from: from, + to: to, + strict: strict, + equiv: equiv, + conditionNC: &nc, + }) +} + // addFunctionalDependency will add strict/lax functional dependency to the fdGraph. // eg: // CREATE TABLE t (a int key, b int, c int, d int, e int, UNIQUE (b,c)) @@ -425,6 +441,7 @@ func (s *FDSet) AddConstants(cons FastIntSet) { shouldRemoved = true } } + // pre-condition NOTE 1 in doc.go, it won't occur duplicate definite determinant of Lax FD. // for strict or lax FDs, both can reduce the dependencies side columns with constant closure. if fd.removeColumnsToSide(cols) { shouldRemoved = true @@ -507,6 +524,29 @@ func (s *FDSet) EquivalenceCols() (eqs []*FastIntSet) { func (s *FDSet) MakeNotNull(notNullCols FastIntSet) { notNullCols.UnionWith(s.NotNullCols) notNullColsSet := s.closureOfEquivalence(notNullCols) + // make nc FD visible. + for i := 0; i < len(s.ncEdges); i++ { + fd := s.ncEdges[i] + if fd.conditionNC.Intersects(notNullColsSet) { + // condition satisfied. + s.ncEdges = append(s.ncEdges[:i], s.ncEdges[i+1:]...) + i-- + if fd.isConstant() { + s.AddConstants(fd.to) + } else if fd.equiv { + s.AddEquivalence(fd.from, fd.to) + newNotNullColsSet := s.closureOfEquivalence(notNullColsSet) + if !newNotNullColsSet.Difference(notNullColsSet).IsEmpty() { + notNullColsSet = newNotNullColsSet + // expand not-null set. 
+ i = -1 + } + } else { + s.addFunctionalDependency(fd.from, fd.to, fd.strict, fd.equiv) + } + } + } + // make origin FD strengthened. for i := 0; i < len(s.fdEdges); i++ { fd := s.fdEdges[i] if fd.strict { @@ -545,6 +585,8 @@ func (s *FDSet) MakeCartesianProduct(rhs *FDSet) { s.fdEdges = append(s.fdEdges, fd) } } + // just simple merge the ncEdge from both side together. + s.ncEdges = append(s.ncEdges, rhs.ncEdges...) // todo: add strict FD: (left key + right key) -> all cols. // maintain a key? } @@ -711,18 +753,13 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F s.addFunctionalDependency(edge.from, edge.to, false, edge.equiv) } } + s.ncEdges = append(s.ncEdges, innerFDs.ncEdges...) leftCombinedFDFrom := NewFastIntSet() leftCombinedFDTo := NewFastIntSet() for _, edge := range filterFDs.fdEdges { // Rule #3.2, constant FD are removed from right side of left join. if edge.isConstant() { - s.Rule333Equiv.Edges = append(s.Rule333Equiv.Edges, &fdEdge{ - from: edge.from, - to: edge.to, - strict: edge.strict, - equiv: edge.equiv, - }) - s.Rule333Equiv.InnerCols = innerCols + s.AddNCFunctionalDependency(edge.from, edge.to, innerCols, edge.strict, edge.equiv) continue } // Rule #3.3, we only keep the lax FD from right side pointing the left side. @@ -757,13 +794,7 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F s.addFunctionalDependency(NewFastIntSet(i), NewFastIntSet(j), false, false) } } - s.Rule333Equiv.Edges = append(s.Rule333Equiv.Edges, &fdEdge{ - from: laxFDFrom, - to: laxFDTo, - strict: true, - equiv: true, - }) - s.Rule333Equiv.InnerCols = innerCols + s.AddNCFunctionalDependency(equivColsLeft, equivColsRight, innerCols, true, true) } // Rule #3.1, filters won't produce any strict/lax FDs. } @@ -796,7 +827,6 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F } // merge the not-null-cols/registered-map from both side together. 
- s.NotNullCols.UnionWith(innerFDs.NotNullCols) s.NotNullCols.UnionWith(filterFDs.NotNullCols) // inner cols can be nullable since then. s.NotNullCols.DifferenceWith(innerCols) @@ -816,19 +846,6 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F s.HasAggBuilt = s.HasAggBuilt || innerFDs.HasAggBuilt } -// MakeRestoreRule333 reset the status of how we deal with this rule. -func (s *FDSet) MakeRestoreRule333() { - for _, eg := range s.Rule333Equiv.Edges { - if eg.isConstant() { - s.AddConstants(eg.to) - } else { - s.AddEquivalence(eg.from, eg.to) - } - } - s.Rule333Equiv.Edges = nil - s.Rule333Equiv.InnerCols.Clear() -} - // ArgOpts contains some arg used for FD maintenance. type ArgOpts struct { SkipFDRule331 bool @@ -882,6 +899,10 @@ func (s *FDSet) AddFrom(fds *FDSet) { s.AddLaxFunctionalDependency(fd.from, fd.to) } } + for i := range fds.ncEdges { + fd := fds.ncEdges[i] + s.ncEdges = append(s.ncEdges, fd) + } s.NotNullCols.UnionWith(fds.NotNullCols) if s.HashCodeToUniqueID == nil { s.HashCodeToUniqueID = fds.HashCodeToUniqueID @@ -898,7 +919,6 @@ func (s *FDSet) AddFrom(fds *FDSet) { s.GroupByCols.Insert(i) } s.HasAggBuilt = fds.HasAggBuilt - s.Rule333Equiv = fds.Rule333Equiv } // MaxOneRow will regard every column in the fdSet as a constant. Since constant is stronger that strict FD, it will @@ -1033,9 +1053,12 @@ func (s *FDSet) ProjectCols(cols FastIntSet) { continue } } - if fd.removeColumnsToSide(fd.from) { - // fd.to side is empty, remove this FD. - continue + // from and to side of equiv are same, don't do trivial elimination. + if !fd.isEquivalence() { + if fd.removeColumnsToSide(fd.from) { + // fd.to side is empty, remove this FD. + continue + } } } @@ -1088,6 +1111,32 @@ func (s *FDSet) ProjectCols(cols FastIntSet) { s.AddLaxFunctionalDependency(fd.from, fd.to) } } + // ncEdge should also be projected. 
+ for i := 0; i < len(s.ncEdges); i++ { + nc := s.ncEdges[i] + if !nc.conditionNC.Intersects(cols) { + // edge is projected out, the nc edge's condition won't be satisfied anymore. + continue + } + if nc.isConstant() { + nc.to.IntersectionWith(cols) + if nc.to.IsEmpty() { + // edge is projected out. + s.ncEdges = append(s.ncEdges[:i], s.ncEdges[i+1:]...) + i-- + } + continue + } + if nc.equiv { + nc.from.IntersectionWith(cols) + nc.to.IntersectionWith(cols) + if nc.from.IsEmpty() { + // edge is projected out. + s.ncEdges = append(s.ncEdges[:i], s.ncEdges[i+1:]...) + i-- + } + } + } } // makeEquivMap try to find the equivalence column of every deleted column in the project list. diff --git a/planner/funcdep/only_full_group_by_test.go b/planner/funcdep/only_full_group_by_test.go index 5e017f8026..aecad5c518 100644 --- a/planner/funcdep/only_full_group_by_test.go +++ b/planner/funcdep/only_full_group_by_test.go @@ -169,6 +169,21 @@ func TestOnlyFullGroupByOldCases(t *testing.T) { // classic cases tk.MustQuery("select customer1.a, count(*) from customer1 left join customer2 on customer1.a=customer2.b where customer2.pk in (7,9) group by customer2.b;") tk.MustQuery("select customer1.a, count(*) from customer1 left join customer2 on customer1.a=1 where customer2.pk in (7,9) group by customer2.b;") + // c2.pk reject the null from both inner side of the left join. 
+ tk.MustQuery("select c1.a, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + tk.MustQuery("select c3.b, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + tk.MustQuery("select c1.a, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=1 where c2.pk in (7,9) group by c2.b;") + // inner join nested with outer join. + tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=1 where c2.pk in (7,9) group by c2.b;") + tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 join (customer1 c1 left join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + // outer join nested with inner join. + // TODO: inner side's strict FD and equiv FD can be saved. + //tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 inner join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + //tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 inner join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + //tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 inner join customer2 c2 on c1.a=1 and c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;") + tk.MustExec("drop view if exists customer") // this left join can extend left pk to all cols. 
tk.MustExec("CREATE algorithm=merge definer='root'@'localhost' VIEW customer as SELECT pk,a,b FROM customer1 LEFT JOIN customer2 USING (pk);")