Files
tidb/pkg/planner/indexadvisor/algorithm.go
2025-05-19 16:17:43 +00:00

574 lines
17 KiB
Go

// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package indexadvisor
import (
"fmt"
"slices"
"strings"
"time"
"github.com/google/uuid"
s "github.com/pingcap/tidb/pkg/util/set"
)
/*
This algorithm resembles the index selection algorithm published in 1997 by Chaudhuri
and Narasayya. Details can be found in the original paper:
Surajit Chaudhuri, Vivek R. Narasayya: An Efficient Cost-Driven Index Selection
Tool for Microsoft Query Server. VLDB 1997: 146-155
This implementation is the Golang version of
https://github.com/hyrise/index_selection_evaluation/blob/refactoring/selection/algorithms/auto_admin_algorithm.py.
*/
// adviseIndexes implements the auto-admin algorithm.
func adviseIndexes(querySet s.Set[Query], indexableColSet s.Set[Column],
optimizer Optimizer, option *Option) (result, allCandidates s.Set[Index], err error) {
aa := &autoAdmin{
optimizer: optimizer,
option: option,
startAt: time.Now(),
allCandidates: s.NewSet[Index](),
}
bestIndexes, allCandidates, err := aa.calculateBestIndexes(querySet, indexableColSet)
if err != nil {
return nil, nil, err
}
return bestIndexes, allCandidates, nil
}
type autoAdmin struct {
optimizer Optimizer
option *Option
startAt time.Time
allCandidates s.Set[Index]
}
func (aa *autoAdmin) recordCandidates(currentCandidates s.Set[Index]) {
if aa.allCandidates.Size() > 10000 {
return
}
aa.allCandidates = s.UnionSet(aa.allCandidates, currentCandidates)
}
func (aa *autoAdmin) calculateBestIndexes(querySet s.Set[Query],
indexableColSet s.Set[Column]) (currentBestIndexes, allCandidates s.Set[Index], err error) {
if aa.option.MaxNumIndexes == 0 {
return nil, nil, nil
}
potentialIndexes := s.NewSet[Index]() // each indexable column as a single-column index
for _, col := range indexableColSet.ToList() {
potentialIndexes.Add(NewIndex(col.SchemaName, col.TableName, aa.tempIndexName(col), col.ColumnName))
}
if err := aa.timeout(); err != nil {
return nil, nil, err
}
currentBestIndexes = s.NewSet[Index]()
for currentMaxIndexWidth := 1; currentMaxIndexWidth <= aa.option.MaxIndexWidth; currentMaxIndexWidth++ {
aa.recordCandidates(potentialIndexes)
candidates, err := aa.selectIndexCandidates(querySet, potentialIndexes)
if err != nil {
return nil, nil, err
}
//maxIndexes := aa.option.MaxNumIndexes * (aa.option.MaxIndexWidth - currentMaxIndexWidth + 1)
maxIndexes := aa.option.MaxNumIndexes
currentBestIndexes, err = aa.enumerateCombinations(querySet, candidates, maxIndexes)
if err != nil {
return nil, nil, err
}
if currentMaxIndexWidth < aa.option.MaxIndexWidth {
// Update potential indexes for the next iteration
potentialIndexes = currentBestIndexes
potentialIndexes.Add(aa.createMultiColumnIndexes(indexableColSet, currentBestIndexes).ToList()...)
}
if err := aa.timeout(); err != nil {
return nil, nil, err
}
}
currentBestIndexes, err = aa.heuristicMergeIndexes(currentBestIndexes, querySet)
if err != nil {
return nil, nil, err
}
currentBestIndexes, err = aa.heuristicCoveredIndexes(currentBestIndexes, querySet)
if err != nil {
return nil, nil, err
}
currentBestIndexes, err = aa.filterIndexes(querySet, currentBestIndexes)
if err != nil {
return nil, nil, err
}
currentBestIndexes, err = aa.cutDown(currentBestIndexes, querySet, aa.optimizer, aa.option.MaxNumIndexes)
if err != nil {
return nil, nil, err
}
if err := aa.timeout(); err != nil {
return nil, nil, err
}
// try to add more indexes if the number of indexes is less than maxIndexes
for limit := 0; limit < 3 && currentBestIndexes.Size() < aa.option.MaxNumIndexes; limit++ {
potentialIndexes = s.DiffSet(potentialIndexes, currentBestIndexes)
currentCost, err := evaluateIndexSetCost(querySet, aa.optimizer, currentBestIndexes)
if err != nil {
return nil, nil, err
}
currentBestIndexes, _, err = aa.enumerateGreedy(querySet, currentBestIndexes,
currentCost, potentialIndexes, aa.option.MaxNumIndexes)
if err != nil {
return nil, nil, err
}
currentBestIndexes, err = aa.filterIndexes(querySet, currentBestIndexes)
if err != nil {
return nil, nil, err
}
if err := aa.timeout(); err != nil {
return nil, nil, err
}
}
allCandidates = aa.allCandidates
aa.allCandidates = nil
return currentBestIndexes, allCandidates, nil
}
// cutDown removes indexes from candidateIndexes until the number of indexes is less than or equal to maxIndexes.
func (aa *autoAdmin) cutDown(candidateIndexes s.Set[Index],
querySet s.Set[Query], op Optimizer, maxIndexes int) (s.Set[Index], error) {
if candidateIndexes.Size() <= maxIndexes {
return candidateIndexes, nil
}
// find the target index to remove, which is the one that has the least impact on the cost.
var bestCost IndexSetCost
var targetIndex Index
for i, idx := range candidateIndexes.ToList() {
candidateIndexes.Remove(idx)
cost, err := evaluateIndexSetCost(querySet, op, candidateIndexes)
if err != nil {
return nil, err
}
candidateIndexes.Add(idx)
if i == 0 || cost.Less(bestCost) {
bestCost = cost
targetIndex = idx
}
}
candidateIndexes.Remove(targetIndex)
return aa.cutDown(candidateIndexes, querySet, op, maxIndexes)
}
func (aa *autoAdmin) heuristicCoveredIndexes(
candidateIndexes s.Set[Index], querySet s.Set[Query]) (s.Set[Index], error) {
// build an index (b, a) for `select a from t where b=1` to convert IndexLookup to IndexScan
currentCost, err := evaluateIndexSetCost(querySet, aa.optimizer, candidateIndexes)
if err != nil {
return nil, err
}
for _, q := range querySet.ToList() {
// parse select columns
selectCols, err := CollectSelectColumnsFromQuery(q)
if err != nil {
return nil, err
}
if selectCols == nil || selectCols.Size() == 0 || selectCols.Size() > aa.option.MaxIndexWidth {
continue
}
schemaName, tableName := selectCols.ToList()[0].SchemaName, selectCols.ToList()[0].TableName
// generate cover-index candidates
coverIndexSet := s.NewSet[Index]()
for _, idx := range candidateIndexes.ToList() {
if idx.SchemaName != schemaName || idx.TableName != tableName {
continue // not for the same table
}
if len(idx.Columns)+selectCols.Size() > aa.option.MaxIndexWidth {
continue // exceed the max-index-width limitation
}
// try this cover-index: idx-cols + select-cols
var newCols []Column
for _, col := range selectCols.ToList() {
duplicated := false
for _, idxCol := range idx.Columns {
if col.Key() == idxCol.Key() {
duplicated = true
break
}
}
if !duplicated {
newCols = append(newCols, col)
}
}
var cols []Column
cols = append(cols, idx.Columns...)
cols = append(cols, newCols...)
coverIndexSet.Add(Index{
SchemaName: schemaName,
TableName: tableName,
IndexName: aa.tempIndexName(cols...),
Columns: cols,
})
}
// select the best cover-index
var bestCoverIndex Index
var bestCoverIndexCost IndexSetCost
for i, coverIndex := range coverIndexSet.ToList() {
if candidateIndexes.Contains(coverIndex) {
continue // the new generated cover-index is duplicated
}
candidateIndexes.Add(coverIndex)
aa.recordCandidates(candidateIndexes)
cost, err := evaluateIndexSetCost(querySet, aa.optimizer, candidateIndexes)
if err != nil {
return nil, err
}
candidateIndexes.Remove(coverIndex)
if i == 0 || cost.Less(bestCoverIndexCost) {
bestCoverIndexCost = cost
bestCoverIndex = coverIndex
}
}
// check whether this cover-index can bring any benefits
if bestCoverIndexCost.Less(currentCost) {
candidateIndexes.Add(bestCoverIndex)
currentCost = bestCoverIndexCost
}
}
return candidateIndexes, nil
}
func (aa *autoAdmin) heuristicMergeIndexes(
candidateIndexes s.Set[Index], querySet s.Set[Query]) (s.Set[Index], error) {
// try to build index s.Set {(c1), (c2)} for predicate like `where c1=1 or c2=2` so that index-merge can be applied.
currentCost, err := evaluateIndexSetCost(querySet, aa.optimizer, candidateIndexes)
if err != nil {
return nil, err
}
for _, q := range querySet.ToList() {
// get all DNF columns from the query
dnfCols, err := CollectDNFColumnsFromQuery(q)
if err != nil {
return nil, err
}
if dnfCols == nil || dnfCols.Size() == 0 {
continue
}
orderByCols, err := CollectOrderByColumnsFromQuery(q)
if err != nil {
return nil, err
}
// create indexes for these DNF columns
newIndexes := s.NewSet[Index]()
for _, col := range dnfCols.ToList() {
idx := NewIndex(col.SchemaName, col.TableName, aa.tempIndexName(col), col.ColumnName)
contained := false
for _, existingIndex := range candidateIndexes.ToList() {
if existingIndex.PrefixContain(idx) {
contained = true
continue
}
}
if !contained {
newIndexes.Add(idx)
}
// index with DNF column + order-by column
if len(orderByCols) == 0 {
continue
}
cols := []Column{col}
cols = append(cols, orderByCols...)
if len(cols) > aa.option.MaxIndexWidth {
cols = cols[:aa.option.MaxIndexWidth]
}
idx = NewIndexWithColumns(aa.tempIndexName(cols...), cols...)
contained = false
for _, existingIndex := range candidateIndexes.ToList() {
if existingIndex.PrefixContain(idx) {
contained = true
continue
}
}
if !contained {
newIndexes.Add(idx)
}
}
if newIndexes.Size() == 0 {
continue
}
// check whether these new indexes for IndexMerge can bring some benefits.
newCandidateIndexes := s.UnionSet(candidateIndexes, newIndexes)
aa.recordCandidates(newCandidateIndexes)
newCost, err := evaluateIndexSetCost(querySet, aa.optimizer, newCandidateIndexes)
if err != nil {
return nil, err
}
if newCost.Less(currentCost) {
currentCost = newCost
candidateIndexes, err = aa.filterIndexes(querySet, newCandidateIndexes)
if err != nil {
return nil, err
}
}
}
return candidateIndexes, nil
}
func (aa *autoAdmin) createMultiColumnIndexes(indexableColsSet s.Set[Column], indexes s.Set[Index]) s.Set[Index] {
multiColumnCandidates := s.NewSet[Index]()
for _, index := range indexes.ToList() {
columns, err := aa.optimizer.TableColumns(index.SchemaName, index.TableName)
if err != nil {
continue
}
tableColsSet := s.ListToSet[Column](columns...)
indexColsSet := s.ListToSet[Column](index.Columns...)
for _, column := range s.DiffSet(s.AndSet(tableColsSet, indexableColsSet), indexColsSet).ToList() {
cols := slices.Clone(index.Columns)
cols = append(cols, column)
multiColumnCandidates.Add(Index{
SchemaName: index.SchemaName,
TableName: index.TableName,
IndexName: aa.tempIndexName(cols...),
Columns: cols,
})
}
}
return multiColumnCandidates
}
// filterIndexes filters some obviously unreasonable indexes.
// Rule 1: if index X is a prefix of index Y, then remove X.
// Rule 2: if index X has no any benefit, then remove X.
// Rule 3: if candidate index X is a prefix of some existing index in the workload, then remove X.
// Rule 4(TBD): remove unnecessary suffix columns, e.g. X(a, b, c) to X(a, b)
//
// if no query can gain benefit from the suffix column c.
func (aa *autoAdmin) filterIndexes(querySet s.Set[Query], indexSet s.Set[Index]) (s.Set[Index], error) {
indexList := indexSet.ToList()
filteredIndexes := s.NewSet[Index]()
originalCost, err := evaluateIndexSetCost(querySet, aa.optimizer, indexSet)
if err != nil {
return nil, err
}
for i, x := range indexList {
filtered := false
// rule 1
for j, y := range indexList {
if i == j {
continue
}
if y.PrefixContain(x) {
filtered = true
continue
}
}
if filtered {
continue
}
// rule 2
indexSet.Remove(x)
newCost, err := evaluateIndexSetCost(querySet, aa.optimizer, indexSet)
if err != nil {
return nil, err
}
indexSet.Add(x)
if !originalCost.Less(newCost) {
continue
}
// rule 3
prefixContain, err := aa.optimizer.PrefixContainIndex(x)
if err != nil {
return nil, err
}
if prefixContain {
continue
}
filteredIndexes.Add(x)
}
return filteredIndexes, nil
}
// selectIndexCandidates selects the best indexes for each single-query.
func (aa *autoAdmin) selectIndexCandidates(querySet s.Set[Query], potentialIndexes s.Set[Index]) (s.Set[Index], error) {
candidates := s.NewSet[Index]()
for _, query := range querySet.ToList() {
tmpQuerySet := s.ListToSet(query) // each query as a workload
indexes, err := aa.potentialIndexesForQuery(query, potentialIndexes)
if err != nil {
return nil, err
}
bestPerQuery := 3 // keep 3 best indexes for each single-query
bestQueryIndexes := s.NewSet[Index]()
for range bestPerQuery {
best, err := aa.enumerateCombinations(tmpQuerySet, indexes, 1)
if err != nil {
return nil, err
}
if best.Size() == 0 {
break
}
bestQueryIndexes.Add(best.ToList()...)
for _, index := range best.ToList() {
indexes.Remove(index)
}
if bestQueryIndexes.Size() > bestPerQuery {
break
}
}
candidates.Add(bestQueryIndexes.ToList()...)
}
return candidates, nil
}
// potentialIndexesForQuery returns best recommended indexes of this workload from these candidates.
func (aa *autoAdmin) enumerateCombinations(
querySet s.Set[Query],
candidateIndexes s.Set[Index],
maxNumberIndexes int) (s.Set[Index], error) {
maxIndexesNative := 2
if candidateIndexes.Size() > 50 {
maxIndexesNative = 1
}
numberIndexesNaive := min(maxIndexesNative, candidateIndexes.Size(), maxNumberIndexes)
currentIndexes, cost, err := aa.enumerateNaive(querySet, candidateIndexes, numberIndexesNaive)
if err != nil {
return nil, err
}
numberIndexes := min(maxNumberIndexes, candidateIndexes.Size())
indexes, _, err := aa.enumerateGreedy(querySet, currentIndexes, cost, candidateIndexes, numberIndexes)
return indexes, err
}
// enumerateGreedy finds the best combination of indexes with a greedy algorithm.
func (aa *autoAdmin) enumerateGreedy(querySet s.Set[Query], currentIndexes s.Set[Index],
currentCost IndexSetCost, candidateIndexes s.Set[Index], numberIndexes int) (s.Set[Index], IndexSetCost, error) {
if currentIndexes.Size() >= numberIndexes || candidateIndexes.Size() == 0 {
return currentIndexes, currentCost, nil
}
// iterate all unused indexes and add one into the current s.Set
indexCombinations := make([]s.Set[Index], 0, 128)
for _, index := range candidateIndexes.ToList() {
newCombination := s.UnionSet(currentIndexes, s.ListToSet(index))
if newCombination.Size() != currentIndexes.Size()+1 {
continue // duplicated index
}
indexCombinations = append(indexCombinations, newCombination)
}
if len(indexCombinations) == 0 {
return currentIndexes, currentCost, nil
}
// find the best s.Set
bestSet, bestCost, err := chooseBestIndexSet(querySet, aa.optimizer, indexCombinations)
if err != nil {
return nil, bestCost, err
}
if bestSet.Size() == 0 {
return currentIndexes, currentCost, nil
}
bestNewIndex := s.DiffSet(bestSet, currentIndexes).ToList()[0]
if bestCost.Less(currentCost) {
currentIndexes.Add(bestNewIndex)
candidateIndexes.Remove(bestNewIndex)
currentCost = bestCost
return aa.enumerateGreedy(querySet, currentIndexes, currentCost, candidateIndexes, numberIndexes)
}
return currentIndexes, currentCost, nil
}
// enumerateNaive enumerates all possible combinations of indexes with
// at most numberIndexesNaive indexes and returns the best one.
func (aa *autoAdmin) enumerateNaive(querySet s.Set[Query],
candidateIndexes s.Set[Index], numberIndexesNaive int) (s.Set[Index], IndexSetCost, error) {
// get all index combinations
indexCombinations := make([]s.Set[Index], 0, 128)
for numberOfIndexes := 0; numberOfIndexes <= numberIndexesNaive; numberOfIndexes++ {
indexCombinations = append(indexCombinations, s.CombSet(candidateIndexes, numberOfIndexes)...)
}
lowestCostIndexes, lowestCost, err := chooseBestIndexSet(querySet, aa.optimizer, indexCombinations)
if err != nil {
return nil, lowestCost, err
}
return lowestCostIndexes, lowestCost, nil
}
func (aa *autoAdmin) potentialIndexesForQuery(query Query, potentialIndexes s.Set[Index]) (s.Set[Index], error) {
indexes := s.NewSet[Index]()
for _, index := range potentialIndexes.ToList() {
// The leading index column must be referenced by the query.
indexableColSetOfQuery, err := CollectIndexableColumnsFromQuery(query, aa.optimizer)
if err != nil {
return nil, err
}
if indexableColSetOfQuery.Contains(index.Columns[0]) {
indexes.Add(index)
}
}
return indexes, nil
}
// tempIndexName returns a temp index name for the given columns.
func (*autoAdmin) tempIndexName(cols ...Column) string {
names := make([]string, 0, len(cols))
for _, col := range cols {
names = append(names, col.ColumnName)
}
idxName := fmt.Sprintf("idx_%v", strings.Join(names, "_"))
if len(idxName) <= 64 {
return idxName
}
return fmt.Sprintf("idx_%v", uuid.New().String())
}
func (aa *autoAdmin) timeout() error {
if time.Since(aa.startAt) > aa.option.Timeout {
return fmt.Errorf("index advisor timeout after %v", aa.option.Timeout)
}
return nil
}