143 lines
4.9 KiB
Go
143 lines
4.9 KiB
Go
// Copyright 2023 PingCAP, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package statistics
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
|
|
"github.com/pingcap/errors"
|
|
"github.com/pingcap/tidb/pkg/kv"
|
|
"github.com/pingcap/tidb/pkg/meta/model"
|
|
"github.com/pingcap/tidb/pkg/parser/ast"
|
|
"github.com/pingcap/tidb/pkg/sessionctx"
|
|
"github.com/pingcap/tidb/pkg/util/logutil"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// BuildExtendedStats build extended stats for column groups if needed based on the column samples.
|
|
func BuildExtendedStats(sctx sessionctx.Context,
|
|
tableID int64, cols []*model.ColumnInfo, collectors []*SampleCollector) (*ExtendedStatsColl, error) {
|
|
const sql = "SELECT name, type, column_ids FROM mysql.stats_extended WHERE table_id = %? and status in (%?, %?)"
|
|
|
|
sqlExec := sctx.GetRestrictedSQLExecutor()
|
|
rows, _, err := sqlExec.ExecRestrictedSQL(kv.WithInternalSourceType(context.Background(), kv.InternalTxnStats), nil, sql, tableID, ExtendedStatsAnalyzed, ExtendedStatsInited)
|
|
if err != nil {
|
|
return nil, errors.Trace(err)
|
|
}
|
|
if len(rows) == 0 {
|
|
return nil, nil
|
|
}
|
|
statsColl := NewExtendedStatsColl()
|
|
for _, row := range rows {
|
|
name := row.GetString(0)
|
|
item := &ExtendedStatsItem{Tp: uint8(row.GetInt64(1))}
|
|
colIDs := row.GetString(2)
|
|
err := json.Unmarshal([]byte(colIDs), &item.ColIDs)
|
|
if err != nil {
|
|
logutil.BgLogger().Error("invalid column_ids in mysql.stats_extended, skip collecting extended stats for this row", zap.String("column_ids", colIDs), zap.Error(err))
|
|
continue
|
|
}
|
|
item = fillExtendedStatsItemVals(sctx, item, cols, collectors)
|
|
if item != nil {
|
|
statsColl.Stats[name] = item
|
|
}
|
|
}
|
|
if len(statsColl.Stats) == 0 {
|
|
return nil, nil
|
|
}
|
|
return statsColl, nil
|
|
}
|
|
|
|
func fillExtendedStatsItemVals(sctx sessionctx.Context, item *ExtendedStatsItem, cols []*model.ColumnInfo, collectors []*SampleCollector) *ExtendedStatsItem {
|
|
switch item.Tp {
|
|
case ast.StatsTypeCardinality, ast.StatsTypeDependency:
|
|
return nil
|
|
case ast.StatsTypeCorrelation:
|
|
return fillExtStatsCorrVals(sctx, item, cols, collectors)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func fillExtStatsCorrVals(sctx sessionctx.Context, item *ExtendedStatsItem, cols []*model.ColumnInfo, collectors []*SampleCollector) *ExtendedStatsItem {
|
|
colOffsets := make([]int, 0, 2)
|
|
for _, id := range item.ColIDs {
|
|
for i, col := range cols {
|
|
if col.ID == id {
|
|
colOffsets = append(colOffsets, i)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
if len(colOffsets) != 2 {
|
|
return nil
|
|
}
|
|
// samplesX and samplesY are in order of handle, i.e, their SampleItem.Ordinals are in order.
|
|
samplesX := collectors[colOffsets[0]].Samples
|
|
// We would modify Ordinal of samplesY, so we make a deep copy.
|
|
samplesY := CopySampleItems(collectors[colOffsets[1]].Samples)
|
|
sampleNum := min(len(samplesX), len(samplesY))
|
|
if sampleNum == 1 {
|
|
item.ScalarVals = 1
|
|
return item
|
|
}
|
|
if sampleNum <= 0 {
|
|
item.ScalarVals = 0
|
|
return item
|
|
}
|
|
|
|
sc := sctx.GetSessionVars().StmtCtx
|
|
|
|
var err error
|
|
err = sortSampleItems(sc, samplesX)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
samplesYInXOrder := make([]*SampleItem, 0, sampleNum)
|
|
for i, itemX := range samplesX {
|
|
if itemX.Ordinal >= len(samplesY) {
|
|
continue
|
|
}
|
|
itemY := samplesY[itemX.Ordinal]
|
|
itemY.Ordinal = i
|
|
samplesYInXOrder = append(samplesYInXOrder, itemY)
|
|
}
|
|
samplesYInYOrder := make([]*SampleItem, len(samplesYInXOrder))
|
|
copy(samplesYInYOrder, samplesYInXOrder)
|
|
err = sortSampleItems(sc, samplesYInYOrder)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
var corrXYSum float64
|
|
for i := 1; i < len(samplesYInYOrder); i++ {
|
|
corrXYSum += float64(i) * float64(samplesYInYOrder[i].Ordinal)
|
|
}
|
|
// X means the ordinal of the item in original sequence, Y means the oridnal of the item in the
|
|
// sorted sequence, we know that X and Y value sets are both:
|
|
// 0, 1, ..., sampleNum-1
|
|
// we can simply compute sum(X) = sum(Y) =
|
|
// (sampleNum-1)*sampleNum / 2
|
|
// and sum(X^2) = sum(Y^2) =
|
|
// (sampleNum-1)*sampleNum*(2*sampleNum-1) / 6
|
|
// We use "Pearson correlation coefficient" to compute the order correlation of columns,
|
|
// the formula is based on https://en.wikipedia.org/wiki/Pearson_correlation_coefficient.
|
|
// Note that (itemsCount*corrX2Sum - corrXSum*corrXSum) would never be zero when sampleNum is larger than 1.
|
|
itemsCount := float64(sampleNum)
|
|
corrXSum := (itemsCount - 1) * itemsCount / 2.0
|
|
corrX2Sum := (itemsCount - 1) * itemsCount * (2*itemsCount - 1) / 6.0
|
|
item.ScalarVals = (itemsCount*corrXYSum - corrXSum*corrXSum) / (itemsCount*corrX2Sum - corrXSum*corrXSum)
|
|
return item
|
|
}
|