// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package core

import (
	"math"

	"github.com/pingcap/parser/ast"
	"github.com/pingcap/tidb/expression"
	"github.com/pingcap/tidb/planner/property"
	"github.com/pingcap/tidb/statistics"
	"github.com/pingcap/tidb/util/logutil"
	"go.uber.org/zap"
)

// StatsCount returns the estimated row count recorded in the plan's derived
// statistics profile.
func (p *basePhysicalPlan) StatsCount() float64 {
	return p.stats.RowCount
}

// DeriveStats implement LogicalPlan DeriveStats interface.
// TableDual emits exactly p.RowCount rows, so both the row count and every
// column's cardinality (NDV) are set to p.RowCount.
func (p *LogicalTableDual) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) {
	profile := &property.StatsInfo{
		RowCount:    float64(p.RowCount),
		Cardinality: make([]float64, p.Schema().Len()),
	}
	for i := range profile.Cardinality {
		profile.Cardinality[i] = float64(p.RowCount)
	}
	p.stats = profile
	return p.stats, nil
}

// recursiveDeriveStats derives statistics bottom-up: it first derives the
// stats of every child, then asks the plan itself (via p.self, the outermost
// wrapper) to combine them. The result is memoized in p.stats, so repeated
// calls return the cached profile.
func (p *baseLogicalPlan) recursiveDeriveStats() (*property.StatsInfo, error) {
	if p.stats != nil {
		return p.stats, nil
	}
	childStats := make([]*property.StatsInfo, len(p.children))
	for i, child := range p.children {
		childProfile, err := child.recursiveDeriveStats()
		if err != nil {
			return nil, err
		}
		childStats[i] = childProfile
	}
	return p.self.DeriveStats(childStats)
}

// DeriveStats implement LogicalPlan DeriveStats interface.
func (p *baseLogicalPlan) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) {
	// A single-child plan inherits its child's stats unchanged; plans with
	// more than one child must override this method. A leaf without children
	// defaults to a single row with NDV 1 in every column.
	if len(childStats) == 1 {
		p.stats = childStats[0]
		return p.stats, nil
	}
	if len(childStats) > 1 {
		err := ErrInternal.GenWithStack("LogicalPlans with more than one child should implement their own DeriveStats().")
		return nil, err
	}
	profile := &property.StatsInfo{
		RowCount:    float64(1),
		Cardinality: make([]float64, p.self.Schema().Len()),
	}
	for i := range profile.Cardinality {
		profile.Cardinality[i] = float64(1)
	}
	p.stats = profile
	return profile, nil
}

// getColumnNDV computes estimated NDV of specified column using the original
// histogram of `DataSource` which is retrieved from storage(not the derived one).
func (ds *DataSource) getColumnNDV(colID int64) (ndv float64) {
	hist, ok := ds.statisticTable.Columns[colID]
	if ok && hist.Count > 0 {
		// The histogram may cover only part of the table's rows; scale its
		// NDV up proportionally to the full table row count.
		factor := float64(ds.statisticTable.Count) / float64(hist.Count)
		ndv = float64(hist.NDV) * factor
	} else {
		// No usable histogram: fall back to a fixed fraction of the row count.
		ndv = float64(ds.statisticTable.Count) * distinctFactor
	}
	return ndv
}

// deriveStatsByFilter fills ds.tableStats with the raw table statistics and
// sets ds.stats to those stats scaled by the estimated selectivity of the
// pushed-down conditions `conds`.
func (ds *DataSource) deriveStatsByFilter(conds expression.CNFExprs) {
	tableStats := &property.StatsInfo{
		RowCount:     float64(ds.statisticTable.Count),
		Cardinality:  make([]float64, len(ds.Columns)),
		HistColl:     ds.statisticTable.GenerateHistCollFromColumnInfo(ds.Columns, ds.schema.Columns),
		StatsVersion: ds.statisticTable.Version,
	}
	if ds.statisticTable.Pseudo {
		tableStats.StatsVersion = statistics.PseudoVersion
	}
	for i, col := range ds.Columns {
		tableStats.Cardinality[i] = ds.getColumnNDV(col.ID)
	}
	ds.tableStats = tableStats
	ds.TblColHists = ds.statisticTable.ID2UniqueID(ds.TblCols)
	selectivity, nodes, err := tableStats.HistColl.Selectivity(ds.ctx, conds)
	if err != nil {
		// Selectivity estimation is best-effort: log and fall back to the
		// default selection factor rather than failing the whole derivation.
		logutil.BgLogger().Debug("an error happened, use the default selectivity", zap.Error(err))
		selectivity = selectionFactor
	}
	ds.stats = tableStats.Scale(selectivity)
	if ds.ctx.GetSessionVars().OptimizerSelectivityLevel >= 1 {
		ds.stats.HistColl = ds.stats.HistColl.NewHistCollBySelectivity(ds.ctx.GetSessionVars().StmtCtx, nodes)
	}
}

// DeriveStats implement LogicalPlan DeriveStats interface.
func (ds *DataSource) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) {
	// PushDownNot here can convert query 'not (a != 1)' to 'a = 1'.
	for i, expr := range ds.pushedDownConds {
		ds.pushedDownConds[i] = expression.PushDownNot(nil, expr, false)
	}
	ds.deriveStatsByFilter(ds.pushedDownConds)
	// Derive range/stats info for every candidate access path. If one path is
	// provably best (point range or empty range), prune all the others.
	for _, path := range ds.possibleAccessPaths {
		if path.isTablePath {
			noIntervalRanges, err := ds.deriveTablePathStats(path, ds.pushedDownConds)
			if err != nil {
				return nil, err
			}
			// If we have point or empty range, just remove other possible paths.
			if noIntervalRanges || len(path.ranges) == 0 {
				ds.possibleAccessPaths[0] = path
				ds.possibleAccessPaths = ds.possibleAccessPaths[:1]
				break
			}
			continue
		}
		noIntervalRanges, err := ds.deriveIndexPathStats(path, ds.pushedDownConds)
		if err != nil {
			return nil, err
		}
		// If we have empty range, or point range on unique index, just remove other possible paths.
		if (noIntervalRanges && path.index.Unique) || len(path.ranges) == 0 {
			ds.possibleAccessPaths[0] = path
			ds.possibleAccessPaths = ds.possibleAccessPaths[:1]
			break
		}
	}
	// Consider the IndexMergePath. Now, we just generate `IndexMergePath` in DNF case.
	if len(ds.pushedDownConds) > 0 && len(ds.possibleAccessPaths) > 1 && ds.ctx.GetSessionVars().EnableIndexMerge {
		// Index merge is only considered when no single path could use any
		// access condition on its own (all non-first paths are full scans).
		needConsiderIndexMerge := true
		for i := 1; i < len(ds.possibleAccessPaths); i++ {
			if len(ds.possibleAccessPaths[i].accessConds) != 0 {
				needConsiderIndexMerge = false
				break
			}
		}
		if needConsiderIndexMerge {
			ds.generateIndexMergeOrPaths()
		}
	}
	return ds.stats, nil
}

// getIndexMergeOrPath generates all possible IndexMergeOrPaths.
func (ds *DataSource) generateIndexMergeOrPaths() { usedIndexCount := len(ds.possibleAccessPaths) for i, cond := range ds.pushedDownConds { sf, ok := cond.(*expression.ScalarFunction) if !ok || sf.FuncName.L != ast.LogicOr { continue } var partialPaths = make([]*accessPath, 0, usedIndexCount) dnfItems := expression.FlattenDNFConditions(sf) for _, item := range dnfItems { cnfItems := expression.SplitCNFItems(item) itemPaths := ds.accessPathsForConds(cnfItems, usedIndexCount) if len(itemPaths) == 0 { partialPaths = nil break } partialPath := ds.buildIndexMergePartialPath(itemPaths) if partialPath == nil { partialPaths = nil break } partialPaths = append(partialPaths, partialPath) } if len(partialPaths) > 1 { possiblePath := ds.buildIndexMergeOrPath(partialPaths, i) if possiblePath != nil { ds.possibleAccessPaths = append(ds.possibleAccessPaths, possiblePath) } } } } // accessPathsForConds generates all possible index paths for conditions. func (ds *DataSource) accessPathsForConds(conditions []expression.Expression, usedIndexCount int) []*accessPath { var results = make([]*accessPath, 0, usedIndexCount) for i := 0; i < usedIndexCount; i++ { path := &accessPath{} if ds.possibleAccessPaths[i].isTablePath { path.isTablePath = true noIntervalRanges, err := ds.deriveTablePathStats(path, conditions) if err != nil { logutil.BgLogger().Debug("can not derive statistics of a path", zap.Error(err)) continue } // If we have point or empty range, just remove other possible paths. if noIntervalRanges || len(path.ranges) == 0 { results[0] = path results = results[:1] break } } else { path.index = ds.possibleAccessPaths[i].index noIntervalRanges, err := ds.deriveIndexPathStats(path, conditions) if err != nil { logutil.BgLogger().Debug("can not derive statistics of a path", zap.Error(err)) continue } // If we have empty range, or point range on unique index, just remove other possible paths. 
if (noIntervalRanges && path.index.Unique) || len(path.ranges) == 0 { results[0] = path results = results[:1] break } } // If accessConds is empty or tableFilter is not empty, we ignore the access path. // Now these conditions are too strict. // For example, a sql `select * from t where a > 1 or (b < 2 and c > 3)` and table `t` with indexes // on a and b separately. we can generate a `IndexMergePath` with table filter `a > 1 or (b < 2 and c > 3)`. // TODO: solve the above case if len(path.tableFilters) > 0 || len(path.accessConds) == 0 { continue } results = append(results, path) } return results } // buildIndexMergePartialPath chooses the best index path from all possible paths. // Now we just choose the index with most columns. // We should improve this strategy, because it is not always better to choose index // with most columns, e.g, filter is c > 1 and the input indexes are c and c_d_e, // the former one is enough, and it is less expensive in execution compared with the latter one. // TODO: improve strategy of the partial path selection func (ds *DataSource) buildIndexMergePartialPath(indexAccessPaths []*accessPath) *accessPath { if len(indexAccessPaths) == 1 { return indexAccessPaths[0] } maxColsIndex := 0 maxCols := len(indexAccessPaths[0].idxCols) for i := 1; i < len(indexAccessPaths); i++ { current := len(indexAccessPaths[i].idxCols) if current > maxCols { maxColsIndex = i maxCols = current } } return indexAccessPaths[maxColsIndex] } // buildIndexMergeOrPath generates one possible IndexMergePath. func (ds *DataSource) buildIndexMergeOrPath(partialPaths []*accessPath, current int) *accessPath { indexMergePath := &accessPath{partialIndexPaths: partialPaths} indexMergePath.tableFilters = append(indexMergePath.tableFilters, ds.pushedDownConds[:current]...) indexMergePath.tableFilters = append(indexMergePath.tableFilters, ds.pushedDownConds[current+1:]...) return indexMergePath } // DeriveStats implement LogicalPlan DeriveStats interface. 
func (p *LogicalSelection) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { p.stats = childStats[0].Scale(selectionFactor) return p.stats, nil } // DeriveStats implement LogicalPlan DeriveStats interface. func (p *LogicalUnionAll) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { p.stats = &property.StatsInfo{ Cardinality: make([]float64, p.Schema().Len()), } for _, childProfile := range childStats { p.stats.RowCount += childProfile.RowCount for i := range p.stats.Cardinality { p.stats.Cardinality[i] += childProfile.Cardinality[i] } } return p.stats, nil } func deriveLimitStats(childProfile *property.StatsInfo, limitCount float64) *property.StatsInfo { stats := &property.StatsInfo{ RowCount: math.Min(limitCount, childProfile.RowCount), Cardinality: make([]float64, len(childProfile.Cardinality)), } for i := range stats.Cardinality { stats.Cardinality[i] = math.Min(childProfile.Cardinality[i], stats.RowCount) } return stats } // DeriveStats implement LogicalPlan DeriveStats interface. func (p *LogicalLimit) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { p.stats = deriveLimitStats(childStats[0], float64(p.Count)) return p.stats, nil } // DeriveStats implement LogicalPlan DeriveStats interface. func (lt *LogicalTopN) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { lt.stats = deriveLimitStats(childStats[0], float64(lt.Count)) return lt.stats, nil } // getCardinality will return the Cardinality of a couple of columns. We simply return the max one, because we cannot know // the Cardinality for multi-dimension attributes properly. This is a simple and naive scheme of Cardinality estimation. 
func getCardinality(cols []*expression.Column, schema *expression.Schema, profile *property.StatsInfo) float64 { cardinality := 1.0 indices := schema.ColumnsIndices(cols) if indices == nil { logutil.BgLogger().Error("column not found in schema", zap.Any("columns", cols), zap.String("schema", schema.String())) return cardinality } for _, idx := range indices { // It is a very elementary estimation. cardinality = math.Max(cardinality, profile.Cardinality[idx]) } return cardinality } // DeriveStats implement LogicalPlan DeriveStats interface. func (p *LogicalProjection) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { childProfile := childStats[0] p.stats = &property.StatsInfo{ RowCount: childProfile.RowCount, Cardinality: make([]float64, len(p.Exprs)), } for i, expr := range p.Exprs { cols := expression.ExtractColumns(expr) p.stats.Cardinality[i] = getCardinality(cols, p.children[0].Schema(), childProfile) } return p.stats, nil } // DeriveStats implement LogicalPlan DeriveStats interface. func (la *LogicalAggregation) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { childProfile := childStats[0] gbyCols := make([]*expression.Column, 0, len(la.GroupByItems)) for _, gbyExpr := range la.GroupByItems { cols := expression.ExtractColumns(gbyExpr) gbyCols = append(gbyCols, cols...) } cardinality := getCardinality(gbyCols, la.children[0].Schema(), childProfile) la.stats = &property.StatsInfo{ RowCount: cardinality, Cardinality: make([]float64, la.schema.Len()), } // We cannot estimate the Cardinality for every output, so we use a conservative strategy. for i := range la.stats.Cardinality { la.stats.Cardinality[i] = cardinality } la.inputCount = childProfile.RowCount return la.stats, nil } // DeriveStats implement LogicalPlan DeriveStats interface. // If the type of join is SemiJoin, the selectivity of it will be same as selection's. // If the type of join is LeftOuterSemiJoin, it will not add or remove any row. 
The last column is a boolean value, whose Cardinality should be two. // If the type of join is inner/outer join, the output of join(s, t) should be N(s) * N(t) / (V(s.key) * V(t.key)) * Min(s.key, t.key). // N(s) stands for the number of rows in relation s. V(s.key) means the Cardinality of join key in s. // This is a quite simple strategy: We assume every bucket of relation which will participate join has the same number of rows, and apply cross join for // every matched bucket. func (p *LogicalJoin) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { leftProfile, rightProfile := childStats[0], childStats[1] if p.JoinType == SemiJoin || p.JoinType == AntiSemiJoin { p.stats = &property.StatsInfo{ RowCount: leftProfile.RowCount * selectionFactor, Cardinality: make([]float64, len(leftProfile.Cardinality)), } for i := range p.stats.Cardinality { p.stats.Cardinality[i] = leftProfile.Cardinality[i] * selectionFactor } return p.stats, nil } if p.JoinType == LeftOuterSemiJoin || p.JoinType == AntiLeftOuterSemiJoin { p.stats = &property.StatsInfo{ RowCount: leftProfile.RowCount, Cardinality: make([]float64, p.schema.Len()), } copy(p.stats.Cardinality, leftProfile.Cardinality) p.stats.Cardinality[len(p.stats.Cardinality)-1] = 2.0 return p.stats, nil } helper := &fullJoinRowCountHelper{ cartesian: 0 == len(p.EqualConditions), leftProfile: leftProfile, rightProfile: rightProfile, leftJoinKeys: p.LeftJoinKeys, rightJoinKeys: p.RightJoinKeys, leftSchema: p.children[0].Schema(), rightSchema: p.children[1].Schema(), } count := helper.estimate() if p.JoinType == LeftOuterJoin { count = math.Max(count, leftProfile.RowCount) } else if p.JoinType == RightOuterJoin { count = math.Max(count, rightProfile.RowCount) } cardinality := make([]float64, 0, p.schema.Len()) cardinality = append(cardinality, leftProfile.Cardinality...) cardinality = append(cardinality, rightProfile.Cardinality...) 
for i := range cardinality { cardinality[i] = math.Min(cardinality[i], count) } p.stats = &property.StatsInfo{ RowCount: count, Cardinality: cardinality, } return p.stats, nil } type fullJoinRowCountHelper struct { cartesian bool leftProfile *property.StatsInfo rightProfile *property.StatsInfo leftJoinKeys []*expression.Column rightJoinKeys []*expression.Column leftSchema *expression.Schema rightSchema *expression.Schema } func (h *fullJoinRowCountHelper) estimate() float64 { if h.cartesian { return h.leftProfile.RowCount * h.rightProfile.RowCount } leftKeyCardinality := getCardinality(h.leftJoinKeys, h.leftSchema, h.leftProfile) rightKeyCardinality := getCardinality(h.rightJoinKeys, h.rightSchema, h.rightProfile) count := h.leftProfile.RowCount * h.rightProfile.RowCount / math.Max(leftKeyCardinality, rightKeyCardinality) return count } // DeriveStats implement LogicalPlan DeriveStats interface. func (la *LogicalApply) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { leftProfile := childStats[0] la.stats = &property.StatsInfo{ RowCount: leftProfile.RowCount, Cardinality: make([]float64, la.schema.Len()), } copy(la.stats.Cardinality, leftProfile.Cardinality) if la.JoinType == LeftOuterSemiJoin || la.JoinType == AntiLeftOuterSemiJoin { la.stats.Cardinality[len(la.stats.Cardinality)-1] = 2.0 } else { for i := la.children[0].Schema().Len(); i < la.schema.Len(); i++ { la.stats.Cardinality[i] = leftProfile.RowCount } } return la.stats, nil } // Exists and MaxOneRow produce at most one row, so we set the RowCount of stats one. func getSingletonStats(len int) *property.StatsInfo { ret := &property.StatsInfo{ RowCount: 1.0, Cardinality: make([]float64, len), } for i := 0; i < len; i++ { ret.Cardinality[i] = 1 } return ret } // DeriveStats implement LogicalPlan DeriveStats interface. 
func (p *LogicalMaxOneRow) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { p.stats = getSingletonStats(p.Schema().Len()) return p.stats, nil } // DeriveStats implement LogicalPlan DeriveStats interface. func (p *LogicalWindow) DeriveStats(childStats []*property.StatsInfo) (*property.StatsInfo, error) { childProfile := childStats[0] p.stats = &property.StatsInfo{ RowCount: childProfile.RowCount, Cardinality: make([]float64, p.schema.Len()), } childLen := p.schema.Len() - len(p.WindowFuncDescs) for i := 0; i < childLen; i++ { colIdx := p.children[0].Schema().ColumnIndex(p.schema.Columns[i]) p.stats.Cardinality[i] = childProfile.Cardinality[colIdx] } for i := childLen; i < p.schema.Len(); i++ { p.stats.Cardinality[i] = childProfile.RowCount } return p.stats, nil }