Files
tidb/executor/analyze.go

258 lines
6.8 KiB
Go

// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
package executor
import (
"math/rand"
"strconv"
"time"
"github.com/juju/errors"
"github.com/pingcap/tidb/ast"
"github.com/pingcap/tidb/context"
"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/sessionctx/varsutil"
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/util/types"
)
// Compile-time assertion that *AnalyzeExec satisfies the Executor interface.
var _ Executor = (*AnalyzeExec)(nil)
// AnalyzeExec represents Analyze executor.
// Its Next method runs all tasks, saves the histograms they produce and
// always returns a nil row.
type AnalyzeExec struct {
// ctx is used to read the build-stats concurrency setting, to save the
// histograms and to look up the domain.
ctx context.Context
// tasks lists the analyze jobs to run; each one carries its own source executor.
tasks []analyzeTask
}
const (
// maxSampleCount caps the number of values a SampleCollector keeps in its sample set.
maxSampleCount = 10000
// maxSketchSize is the size passed to statistics.NewFMSketch for each collector.
maxSketchSize = 1000
// defaultBucketCount is passed to statistics.BuildPK, BuildColumn and BuildIndex.
// NOTE(review): presumably the number of histogram buckets — confirm against the statistics package.
defaultBucketCount = 256
)
// Schema implements the Executor Schema interface.
// The returned schema is created by calling expression.NewSchema with no arguments.
func (e *AnalyzeExec) Schema() *expression.Schema {
	schema := expression.NewSchema()
	return schema
}
// Open implements the Executor Open interface.
// It opens the source executor of every task in order and stops at the first
// failure, returning that error traced.
func (e *AnalyzeExec) Open() error {
	for i := range e.tasks {
		if openErr := e.tasks[i].src.Open(); openErr != nil {
			return errors.Trace(openErr)
		}
	}
	return nil
}
// Close implements the Executor Close interface.
// It closes the source executor of every task, even when an earlier one
// fails, so that no source is left open. The first error encountered (if any)
// is returned traced; later errors are dropped.
func (e *AnalyzeExec) Close() error {
	var firstErr error
	for _, task := range e.tasks {
		// Keep going after a failure: the remaining sources still need closing.
		if err := task.src.Close(); err != nil && firstErr == nil {
			firstErr = errors.Trace(err)
		}
	}
	return firstErr
}
// Next implements the Executor Next interface.
// It runs every analyze task on a pool of worker goroutines, saves the
// resulting histograms to storage, and then either sleeps for two DDL leases
// or updates the statistics handle directly when the lease is not positive.
// The returned row is always nil; a non-nil error reports the first failure.
func (e *AnalyzeExec) Next() (*Row, error) {
	concurrency, err := getBuildStatsConcurrency(e.ctx)
	if err != nil {
		return nil, errors.Trace(err)
	}
	// Both channels are buffered to len(e.tasks): queuing the tasks never
	// blocks, and workers can always deliver their results even if this
	// function returns early on an error.
	taskCh := make(chan analyzeTask, len(e.tasks))
	resultCh := make(chan analyzeResult, len(e.tasks))
	// NOTE(review): if concurrency is <= 0 no worker is started and the receive
	// loop below blocks forever when there are tasks — confirm the variable is
	// validated upstream.
	for i := 0; i < concurrency; i++ {
		go analyzeWorker(taskCh, resultCh)
	}
	for _, task := range e.tasks {
		taskCh <- task
	}
	close(taskCh)
	results := make([]analyzeResult, 0, len(e.tasks))
	for i := 0; i < len(e.tasks); i++ {
		result := <-resultCh
		if result.err != nil {
			// Report the task's own error. The outer err is nil here, so
			// tracing it would silently turn the failure into success.
			return nil, errors.Trace(result.err)
		}
		results = append(results, result)
	}
	// Persist histograms only after every task has succeeded.
	for _, result := range results {
		for _, hg := range result.hist {
			err = hg.SaveToStorage(e.ctx, result.tableID, result.count, result.isIndex)
			if err != nil {
				return nil, errors.Trace(err)
			}
		}
	}
	dom := sessionctx.GetDomain(e.ctx)
	lease := dom.DDL().GetLease()
	if lease > 0 {
		// Sleep for two leases to make sure the other TiDB nodes have caught up
		// with this update.
		time.Sleep(lease * 2)
	} else {
		err := dom.StatsHandle().Update(GetInfoSchema(e.ctx))
		if err != nil {
			return nil, errors.Trace(err)
		}
	}
	return nil, nil
}
// getBuildStatsConcurrency reads the TiDBBuildStatsConcurrency system variable
// from the session variables of ctx and parses it as a base-10 integer.
// Errors from either the lookup or the parse are returned traced.
func getBuildStatsConcurrency(ctx context.Context) (int, error) {
	raw, lookupErr := varsutil.GetSessionSystemVar(ctx.GetSessionVars(), variable.TiDBBuildStatsConcurrency)
	if lookupErr != nil {
		return 0, errors.Trace(lookupErr)
	}
	parsed, parseErr := strconv.ParseInt(raw, 10, 64)
	return int(parsed), errors.Trace(parseErr)
}
// taskType identifies which analyze routine an analyzeTask is dispatched to.
type taskType int
const (
// pkTask is handled by analyzePK; its source must be an *XSelectTableExec.
pkTask taskType = iota
// colTask is handled by analyzeColumns; its source must be an *XSelectTableExec.
colTask
// idxTask is handled by analyzeIndex; its source must be an *XSelectIndexExec.
idxTask
)
// analyzeTask is one unit of work handed to analyzeWorker.
type analyzeTask struct {
// taskType selects the analyze routine that analyzeWorker runs for this task.
taskType taskType
// src is the executor the task reads from; AnalyzeExec opens and closes it,
// and analyzeWorker type-asserts it to the concrete executor the task type needs.
src Executor
}
// analyzeResult carries the outcome of one analyzeTask back to AnalyzeExec.Next.
type analyzeResult struct {
// tableID is the ID of the analyzed table, taken from the source executor's tableInfo.
tableID int64
// hist holds the histograms built by the task; Next saves each of them to storage.
hist []*statistics.Histogram
// count is passed to SaveToStorage along with every histogram. analyzeColumns
// sets it to the first column's Count + NullCount.
count int64
// isIndex is 1 for results from analyzeIndex and 0 for results from
// analyzePK and analyzeColumns.
isIndex int
// err is the error, if any, encountered while running the task.
err error
}
// analyzeWorker receives tasks from taskCh until the channel is closed, runs
// each one with the routine matching its type and sends the outcome to
// resultCh. A task whose type is none of pkTask, colTask or idxTask produces
// no result.
func analyzeWorker(taskCh <-chan analyzeTask, resultCh chan<- analyzeResult) {
	for {
		job, open := <-taskCh
		if !open {
			return
		}
		if job.taskType == pkTask {
			resultCh <- analyzePK(job.src.(*XSelectTableExec))
		} else if job.taskType == colTask {
			resultCh <- analyzeColumns(job.src.(*XSelectTableExec))
		} else if job.taskType == idxTask {
			resultCh <- analyzeIndex(job.src.(*XSelectIndexExec))
		}
	}
}
// analyzePK calls statistics.BuildPK over the rows produced by exec, using the
// ID of exec's first column, and wraps the returned count, histogram and error
// into an analyzeResult with isIndex set to 0.
func analyzePK(exec *XSelectTableExec) analyzeResult {
	rows := &recordSet{executor: exec}
	columnID := exec.Columns[0].ID
	rowCount, histogram, buildErr := statistics.BuildPK(exec.ctx, defaultBucketCount, columnID, rows)
	return analyzeResult{
		tableID: exec.tableInfo.ID,
		hist:    []*statistics.Histogram{histogram},
		count:   rowCount,
		isIndex: 0,
		err:     buildErr,
	}
}
// analyzeColumns samples the rows produced by exec with one collector per
// column, then builds a histogram for each column from its collector via
// statistics.BuildColumn. If collecting fails, only the error is returned.
// Otherwise every histogram is appended to the result (including those from
// failed builds) and the first build error is recorded in the result.
func analyzeColumns(exec *XSelectTableExec) analyzeResult {
	collectors, err := CollectSamplesAndEstimateNDVs(&recordSet{executor: exec}, len(exec.Columns))
	if err != nil {
		return analyzeResult{err: err}
	}
	first := collectors[0]
	result := analyzeResult{
		tableID: exec.tableInfo.ID,
		// The total is taken from the first column: non-NULL plus NULL values.
		count:   first.Count + first.NullCount,
		isIndex: 0,
	}
	for i := range exec.Columns {
		collector := collectors[i]
		hg, buildErr := statistics.BuildColumn(
			exec.ctx,
			defaultBucketCount,
			exec.Columns[i].ID,
			collector.Sketch.NDV(),
			collector.Count,
			collector.NullCount,
			collector.samples,
		)
		result.hist = append(result.hist, hg)
		if result.err == nil && buildErr != nil {
			result.err = buildErr
		}
	}
	return result
}
// analyzeIndex calls statistics.BuildIndex over the rows produced by exec,
// using the ID of exec's index, and wraps the returned count, histogram and
// error into an analyzeResult with isIndex set to 1.
func analyzeIndex(exec *XSelectIndexExec) analyzeResult {
	rows := &recordSet{executor: exec}
	indexID := exec.index.ID
	rowCount, histogram, buildErr := statistics.BuildIndex(exec.ctx, defaultBucketCount, indexID, rows)
	return analyzeResult{
		tableID: exec.tableInfo.ID,
		hist:    []*statistics.Histogram{histogram},
		count:   rowCount,
		isIndex: 1,
		err:     buildErr,
	}
}
// SampleCollector collects samples of an attribute and tracks its count,
// null count and a sketch used to estimate its number of distinct values.
type SampleCollector struct {
// samples holds the sampled non-NULL values, at most maxSampleCount of them.
samples []types.Datum
// NullCount is the number of NULL values seen.
NullCount int64
// Count is the number of non-NULL values seen.
Count int64
// Sketch receives every non-NULL value; its NDV() is passed to
// statistics.BuildColumn by analyzeColumns.
Sketch *statistics.FMSketch
}
// collect records one value. A NULL only increments NullCount. Any other value
// increments Count, is offered to the sample set and is inserted into the
// sketch; the sketch's error, if any, is returned traced.
func (c *SampleCollector) collect(d types.Datum) error {
	if d.IsNull() {
		c.NullCount++
		return nil
	}
	c.Count++
	switch {
	case len(c.samples) < maxSampleCount:
		// The sample set is not full yet: keep every value.
		c.samples = append(c.samples, d)
	case rand.Int63n(c.Count) < maxSampleCount:
		// The sample set is full: with probability maxSampleCount/Count,
		// overwrite a uniformly chosen slot.
		c.samples[rand.Intn(maxSampleCount)] = d
	}
	return errors.Trace(c.Sketch.InsertValue(d))
}
// CollectSamplesAndEstimateNDVs reads the record set to exhaustion, sampling
// each column with the Reservoir Sampling algorithm and feeding every non-NULL
// value to an FM Sketch for NDV estimation. It returns one sample collector
// per column, each holding the column's count, null count and sketch.
// See https://en.wikipedia.org/wiki/Reservoir_sampling
// Exported for test.
func CollectSamplesAndEstimateNDVs(e ast.RecordSet, numCols int) ([]*SampleCollector, error) {
	collectors := make([]*SampleCollector, numCols)
	for i := 0; i < numCols; i++ {
		collectors[i] = &SampleCollector{Sketch: statistics.NewFMSketch(maxSketchSize)}
	}
	for {
		row, err := e.Next()
		switch {
		case err != nil:
			return nil, errors.Trace(err)
		case row == nil:
			// A nil row marks the end of the record set.
			return collectors, nil
		}
		for col := range row.Data {
			if collectErr := collectors[col].collect(row.Data[col]); collectErr != nil {
				return nil, errors.Trace(collectErr)
			}
		}
	}
}
// rowsToColumnSamples transposes rows into per-column slices: element [i][j]
// of the result is column i of row j. The number of columns is taken from the
// first row. It returns nil when rows is empty.
func rowsToColumnSamples(rows []*ast.Row) [][]types.Datum {
	numRows := len(rows)
	if numRows == 0 {
		return nil
	}
	numCols := len(rows[0].Data)
	columnSamples := make([][]types.Datum, numCols)
	for c := 0; c < numCols; c++ {
		columnSamples[c] = make([]types.Datum, numRows)
	}
	for r := 0; r < numRows; r++ {
		for c, datum := range rows[r].Data {
			columnSamples[c][r] = datum
		}
	}
	return columnSamples
}