planner: introduce Cond-FD to maintain null-constraint FD (#34147)

close pingcap/tidb#34148
This commit is contained in:
Arenatlx
2022-05-09 19:06:33 +08:00
committed by GitHub
parent 20ecaef36d
commit 2f86eac3cf
4 changed files with 288 additions and 57 deletions

View File

@ -965,19 +965,6 @@ func (p *LogicalSelection) ExtractFD() *fd.FDSet {
// extract equivalence cols.
equivUniqueIDs := extractEquivalenceCols(p.Conditions, p.SCtx(), fds)
// after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence
// cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a
// lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all
// left rows, once there is a null-refusing predicate on the inner side on upper layer, this
// can be equivalence again. (the outer rows left are all coming from equal matching)
//
// why not just makeNotNull of them, because even a non-equiv-related inner col can also
// refuse supplied null values.
if fds.Rule333Equiv.InnerCols.Len() != 0 && notnullColsUniqueIDs.Intersects(fds.Rule333Equiv.InnerCols) {
// restore/re-strength FDs from rule 333
fds.MakeRestoreRule333()
}
// apply operator's characteristic's FD setting.
fds.MakeNotNull(notnullColsUniqueIDs)
fds.AddConstants(constUniqueIDs)

180
planner/funcdep/doc.go Normal file
View File

@ -0,0 +1,180 @@
// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package funcdep
// Theory to Practice
//
// For more rigorous examination of functional dependencies and their
// interaction with various SQL operators, see the following Master's Thesis:
//
// Norman Paulley, Glenn. (2000).
// Exploiting Functional Dependence in Query Optimization.
// https://cs.uwaterloo.ca/research/tr/2000/11/CS-2000-11.thesis.pdf
// TODO: Add the RFC design.
// NOTE 1.
// When handling a lax FD, we don't care about null values on the dependant side, which means
// that making the determinant columns not-null is enough to upgrade a lax FD to a strict one.
// The definition of "lax" used in the paper differs from the definition used by this
// library. For a lax dependency A~~>B, the paper allows this set of rows:
//
// a b
// -------
// 1 1
// 1 NULL
//
// This alternate definition is briefly covered in section 2.5.3.2 of the paper (see definition
// 2.19). The reason for this change is to allow a lax dependency to be upgraded to a strict
// dependency more readily, needing only the determinant columns to be not-null rather than
// both determinant and dependant columns.
//
// This relies on the condition that, for the same definite determinant value of a lax FD, there
// won't be two different definite dependant values. That holds because there is no way to derive
// this kind of FD.
//
// Even in our implementation of outer join, the only way to produce duplicate definite
// determinant is the join predicate. But for now, we only maintain the equivalence and
// some strict FD of it.
//
// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
// a b | c d e
// ------+----------------
// 1 1 | 1 NULL 1
// 1 2 | NULL NULL NULL
// 2 1 | NULL NULL NULL
//
// Actually it is possible: the lax FD {a} -> {c} can be derived, but it is not that useful. We only
// maintain {c} ~> {a} after the outer join. Besides, there are two Cond-FDs that should be preserved,
// waiting to become visible again once a null-reject predicate is applied on their null-constraint
// columns. (see below)
//
// NOTE 2.
// When handling an outer join, it won't produce a lax FD with duplicate definite determinant values and
// different dependant values.
//
// In the implementation, we come across some lax FDs that depend on the null-rejection of some other cols.
// For example:
// t(a,b) left join t1(c,d,e) on t.a = t1.c and b=1
// a b | c d e
// ------+----------------
// 1 1 | 1 NULL 1
// 1 2 | NULL NULL NULL
// 2 1 | NULL NULL NULL
//
// Here the constant FD {} -> {b} won't exist after the outer join is done. Notice the null-constraint
// {c,d,e} -| {c,d,e}: this FD should be preserved and will become visible again when some null-reject
// predicate takes effect on the null-constraint cols.
//
// It's the same for the strict equivalence {t.a} = {t1.c}. Notice there is no lax equivalence here, because
// the left side can't be guaranteed to be definite or null, like a=2 here. Let's collect all of these
// on-condition FDs, each paired with a null-constraint column set, and name them Cond-FD.
//
// Lax equivalences are theoretically possible, but one won't be constructed from an outer join unless
// t already has a constant FD on column `a` before the outer join runs. So the lax equivalence
// has some pre-conditions, as you can see, and it can't cover the case shown above. Let us handle it the
// way a Cond-FD does.
//
// The FDs constructed from the join predicate should be considered as Cond-FDs. Here they are the equivalence
// {a} == {c} and the constant FD {b} = 1 (if the join condition were e=1, it would be here too). We can say that for
// every matched row, these FDs are valid, while for the other rows, the inner side is supplied with null
// rows. So these FDs are stored as ncEdges whose nc condition is the set of all inner table cols.
//
// We introduce an invisible FD with null-constraint columns, named Cond-FD, to solve the problem above.
// For multiple nested left joins, take the following case as an example.
// a,b c,d,e
// -----------+-----------
// 1 2 | 1 1 1
// 2 2 |
// -----------+-----------
//
// left join on (a=c) res:
// a b c d e
// -------------------------
// 1 2 1 1 1
// 2 2 +- null null null -+
// | |
// +-------------------+
// \
// \
// the Cond-FD is < a=c with {c,d,e} >, where the latter is the null-constraint column set
//
// e,f
// -----------------------
// 1 2
// 2 2
// 3 3
// -----------------------
//
// left join on (e=a) res:
// e f a b c d e
// -----------------------------------
// 1 2 1 2 1 1 1
// 2 2 2 2 +- null null null --+---------------> Cond-FD are <a=c with {c,d,e}> still exists.
// 3 3 +-null null | null null null |---+
// | +-------------------+ |
// +-----------------------------------+-----------> New Cond-FD are <e=a with {a,b,c,d,e}> occurs.
//
//
// The old Cond-FD with null-constraint column set {c,d,e} is preserved, because the newly appended cols are all null too.
// The new Cond-FD with null-constraint column set {a,b,c,d,e} is also meaningful: even if the null-reject column
// is one of {c,d,e}, which may filter some of the matched rows out of the result, the equivalence {a}={e} still holds.
//
// Suppose that the result of the first left join is like:
// left join on (a=c) res:
// a b c d e
// ---------------------------
// 1 2 1 1 1
// null 2 null null null
//
// THEN: left join on (e=a) res:
// e f a b c d e
// ---------------------------------
// 1 2 1 2 1 1 1
// 2 2 null null null null null
// 3 3 null null null null null
//
// Even so, both the old Cond-FD and the new Cond-FD still exist. It seems that the null-constraint column set of the
// old Cond-FD {c,d,e} could be expanded to {a,b,c,d,e} visually, but we can't derive that from the join predicate
// (e=a). A null-reject on column `a` can't theoretically bring visibility to the old Cond-FD; it just happens
// to filter out that row with a null value in column a.
//
// Think about adding one more row in first left join result.
//
// left join on (a=c) res:
// a b c e e
// ---------------------------
// 1 2 1 1 1
// null 2 null null null
// 3 3 null null null
//
// THEN: left join on (e=a) res:
// e f a b c d e
// ---------------------------------
// 1 2 1 2 1 1 1
// 2 2 null null null null null
// 3 3 3 3 null null null
//
// Conclusion:
// As you can see, it is right that we can't use the join predicate (e=a) to expand the old Cond-FD's nc
// {c,d,e} to {a,b,c,d,e}. So the rule for Cond-FD is quite simple: just keep the old ncEdges from the right side and
// append the new ncEdges of the current left join.
//
// If the first left join result is on the outer side of the second left join, just keep the ncEdges from the left as well,
// appending the new ncEdges of the current left join.
//
// For an inner join, neither side of the join result is padded with null-supplied rows, so we can simply collect
// the ncEdges from both join sides together.

View File

@ -32,12 +32,23 @@ type fdEdge struct {
// And if there's a functional dependency `const` -> `column` exists. We would let the from side be empty.
strict bool
equiv bool
// An FD with a non-nil conditionNC is hidden in the FDSet; it becomes visible again once at least one column in conditionNC is null-rejected.
// conditionNC must be satisfied before the FD becomes visible again, much like a lax FD being strengthened into a strict
// one, except that the constraint applies to the columns specified in conditionNC rather than just the determinant columns.
conditionNC *FastIntSet
}
// FDSet is the main portal of functional dependency, it stores the relationship between (extended table / physical table)'s
// columns. For more theory about this design, ref the head comments in the funcdep/doc.go.
type FDSet struct {
fdEdges []*fdEdge
// after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence
// cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a
// lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all
// left rows, once there is a null-refusing predicate on the inner side on upper layer, this
// can be equivalence again. (the outer rows left are all coming from equal matching)
ncEdges []*fdEdge
// NotNullCols is used to record the columns with not-null attributes applied.
// eg: {1} ~~> {2,3}, when {2,3} not null is applied, it actually does nothing.
// but we should record {2,3} as not-null down for the convenience of transferring
@ -50,18 +61,9 @@ type FDSet struct {
// GroupByCols is used to record columns / expressions that under the group by phrase.
GroupByCols FastIntSet
HasAggBuilt bool
// after left join, according to rule 3.3.3, it may create a lax FD from inner equivalence
// cols pointing to outer equivalence cols. eg: t left join t1 on t.a = t1.b, leading a
// lax FD from t1.b ~> t.a, this lax attribute is coming from supplied null value to all
// left rows, once there is a null-refusing predicate on the inner side on upper layer, this
// can be equivalence again. (the outer rows left are all coming from equal matching)
//
// why not just makeNotNull of them, because even a non-equiv-related inner col can also
// refuse supplied null values.
Rule333Equiv struct {
Edges []*fdEdge
InnerCols FastIntSet
}
// todo: when multi join and across select block, this may need to be maintained more precisely.
}
// ClosureOfStrict is exported for outer usage.
@ -215,6 +217,20 @@ func (s *FDSet) AddLaxFunctionalDependency(from, to FastIntSet) {
s.addFunctionalDependency(from, to, false, false)
}
// AddNCFunctionalDependency registers a conditional (null-constraint) functional
// dependency from -> to on the FDSet.
//
// The edge is not inserted into the regular fdEdges; it is only appended to
// s.ncEdges together with its null-constraint column set nc. strict and equiv
// are stored on the edge unchanged.
func (s *FDSet) AddNCFunctionalDependency(from, to, nc FastIntSet, strict, equiv bool) {
	// An nc edge is invisible for now, so it is simply collected here. Once a
	// null-reject on the nc cols is satisfied, it gets picked out and inserted
	// as a normal fdEdge.
	hidden := &fdEdge{
		from:        from,
		to:          to,
		strict:      strict,
		equiv:       equiv,
		conditionNC: &nc,
	}
	s.ncEdges = append(s.ncEdges, hidden)
}
// addFunctionalDependency will add strict/lax functional dependency to the fdGraph.
// eg:
// CREATE TABLE t (a int key, b int, c int, d int, e int, UNIQUE (b,c))
@ -425,6 +441,7 @@ func (s *FDSet) AddConstants(cons FastIntSet) {
shouldRemoved = true
}
}
// pre-condition NOTE 1 in doc.go, it won't occur duplicate definite determinant of Lax FD.
// for strict or lax FDs, both can reduce the dependencies side columns with constant closure.
if fd.removeColumnsToSide(cols) {
shouldRemoved = true
@ -507,6 +524,29 @@ func (s *FDSet) EquivalenceCols() (eqs []*FastIntSet) {
func (s *FDSet) MakeNotNull(notNullCols FastIntSet) {
notNullCols.UnionWith(s.NotNullCols)
notNullColsSet := s.closureOfEquivalence(notNullCols)
// make nc FD visible.
for i := 0; i < len(s.ncEdges); i++ {
fd := s.ncEdges[i]
if fd.conditionNC.Intersects(notNullColsSet) {
// condition satisfied.
s.ncEdges = append(s.ncEdges[:i], s.ncEdges[i+1:]...)
i--
if fd.isConstant() {
s.AddConstants(fd.to)
} else if fd.equiv {
s.AddEquivalence(fd.from, fd.to)
newNotNullColsSet := s.closureOfEquivalence(notNullColsSet)
if !newNotNullColsSet.Difference(notNullColsSet).IsEmpty() {
notNullColsSet = newNotNullColsSet
// expand not-null set.
i = -1
}
} else {
s.addFunctionalDependency(fd.from, fd.to, fd.strict, fd.equiv)
}
}
}
// make origin FD strengthened.
for i := 0; i < len(s.fdEdges); i++ {
fd := s.fdEdges[i]
if fd.strict {
@ -545,6 +585,8 @@ func (s *FDSet) MakeCartesianProduct(rhs *FDSet) {
s.fdEdges = append(s.fdEdges, fd)
}
}
// just simple merge the ncEdge from both side together.
s.ncEdges = append(s.ncEdges, rhs.ncEdges...)
// todo: add strict FD: (left key + right key) -> all cols.
// maintain a key?
}
@ -711,18 +753,13 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F
s.addFunctionalDependency(edge.from, edge.to, false, edge.equiv)
}
}
s.ncEdges = append(s.ncEdges, innerFDs.ncEdges...)
leftCombinedFDFrom := NewFastIntSet()
leftCombinedFDTo := NewFastIntSet()
for _, edge := range filterFDs.fdEdges {
// Rule #3.2, constant FD are removed from right side of left join.
if edge.isConstant() {
s.Rule333Equiv.Edges = append(s.Rule333Equiv.Edges, &fdEdge{
from: edge.from,
to: edge.to,
strict: edge.strict,
equiv: edge.equiv,
})
s.Rule333Equiv.InnerCols = innerCols
s.AddNCFunctionalDependency(edge.from, edge.to, innerCols, edge.strict, edge.equiv)
continue
}
// Rule #3.3, we only keep the lax FD from right side pointing the left side.
@ -757,13 +794,7 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F
s.addFunctionalDependency(NewFastIntSet(i), NewFastIntSet(j), false, false)
}
}
s.Rule333Equiv.Edges = append(s.Rule333Equiv.Edges, &fdEdge{
from: laxFDFrom,
to: laxFDTo,
strict: true,
equiv: true,
})
s.Rule333Equiv.InnerCols = innerCols
s.AddNCFunctionalDependency(equivColsLeft, equivColsRight, innerCols, true, true)
}
// Rule #3.1, filters won't produce any strict/lax FDs.
}
@ -796,7 +827,6 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F
}
// merge the not-null-cols/registered-map from both side together.
s.NotNullCols.UnionWith(innerFDs.NotNullCols)
s.NotNullCols.UnionWith(filterFDs.NotNullCols)
// inner cols can be nullable since then.
s.NotNullCols.DifferenceWith(innerCols)
@ -816,19 +846,6 @@ func (s *FDSet) MakeOuterJoin(innerFDs, filterFDs *FDSet, outerCols, innerCols F
s.HasAggBuilt = s.HasAggBuilt || innerFDs.HasAggBuilt
}
// MakeRestoreRule333 replays every edge saved in s.Rule333Equiv back into the
// FDSet and then resets the saved state: constant edges go through
// AddConstants, all other saved edges go through AddEquivalence.
func (s *FDSet) MakeRestoreRule333() {
	saved := s.Rule333Equiv.Edges
	for i := range saved {
		edge := saved[i]
		if !edge.isConstant() {
			s.AddEquivalence(edge.from, edge.to)
			continue
		}
		s.AddConstants(edge.to)
	}
	// Everything has been replayed; drop the bookkeeping.
	s.Rule333Equiv.Edges = nil
	s.Rule333Equiv.InnerCols.Clear()
}
// ArgOpts contains some arg used for FD maintenance.
type ArgOpts struct {
SkipFDRule331 bool
@ -882,6 +899,10 @@ func (s *FDSet) AddFrom(fds *FDSet) {
s.AddLaxFunctionalDependency(fd.from, fd.to)
}
}
for i := range fds.ncEdges {
fd := fds.ncEdges[i]
s.ncEdges = append(s.ncEdges, fd)
}
s.NotNullCols.UnionWith(fds.NotNullCols)
if s.HashCodeToUniqueID == nil {
s.HashCodeToUniqueID = fds.HashCodeToUniqueID
@ -898,7 +919,6 @@ func (s *FDSet) AddFrom(fds *FDSet) {
s.GroupByCols.Insert(i)
}
s.HasAggBuilt = fds.HasAggBuilt
s.Rule333Equiv = fds.Rule333Equiv
}
// MaxOneRow will regard every column in the fdSet as a constant. Since constant is stronger that strict FD, it will
@ -1033,9 +1053,12 @@ func (s *FDSet) ProjectCols(cols FastIntSet) {
continue
}
}
if fd.removeColumnsToSide(fd.from) {
// fd.to side is empty, remove this FD.
continue
// from and to side of equiv are same, don't do trivial elimination.
if !fd.isEquivalence() {
if fd.removeColumnsToSide(fd.from) {
// fd.to side is empty, remove this FD.
continue
}
}
}
@ -1088,6 +1111,32 @@ func (s *FDSet) ProjectCols(cols FastIntSet) {
s.AddLaxFunctionalDependency(fd.from, fd.to)
}
}
// ncEdge should also be projected.
for i := 0; i < len(s.ncEdges); i++ {
nc := s.ncEdges[i]
if !nc.conditionNC.Intersects(cols) {
// edge is projected out, the nc edge's condition won't be satisfied anymore.
continue
}
if nc.isConstant() {
nc.to.IntersectionWith(cols)
if nc.to.IsEmpty() {
// edge is projected out.
s.ncEdges = append(s.ncEdges[:i], s.ncEdges[i+1:]...)
i--
}
continue
}
if nc.equiv {
nc.from.IntersectionWith(cols)
nc.to.IntersectionWith(cols)
if nc.from.IsEmpty() {
// edge is projected out.
s.ncEdges = append(s.ncEdges[:i], s.ncEdges[i+1:]...)
i--
}
}
}
}
// makeEquivMap try to find the equivalence column of every deleted column in the project list.

View File

@ -169,6 +169,21 @@ func TestOnlyFullGroupByOldCases(t *testing.T) {
// classic cases
tk.MustQuery("select customer1.a, count(*) from customer1 left join customer2 on customer1.a=customer2.b where customer2.pk in (7,9) group by customer2.b;")
tk.MustQuery("select customer1.a, count(*) from customer1 left join customer2 on customer1.a=1 where customer2.pk in (7,9) group by customer2.b;")
// c2.pk reject the null from both inner side of the left join.
tk.MustQuery("select c1.a, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
tk.MustQuery("select c3.b, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
tk.MustQuery("select c1.a, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=1 where c2.pk in (7,9) group by c2.b;")
// inner join nested with outer join.
tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=1 where c2.pk in (7,9) group by c2.b;")
tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 join (customer1 c1 left join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 join (customer1 c1 left join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
// outer join nested with inner join.
// TODO: inner side's strict FD and equiv FD can be saved.
//tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 inner join customer2 c2 on c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
//tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 inner join customer2 c2 on c1.a=1) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
//tk.MustQuery("select c1.a, c3.b, count(*) from customer2 c3 left join (customer1 c1 inner join customer2 c2 on c1.a=1 and c1.a=c2.b) on c3.b=c1.a where c2.pk in (7,9) group by c2.b;")
tk.MustExec("drop view if exists customer")
// this left join can extend left pk to all cols.
tk.MustExec("CREATE algorithm=merge definer='root'@'localhost' VIEW customer as SELECT pk,a,b FROM customer1 LEFT JOIN customer2 USING (pk);")