tidb/pkg/executor/batch_point_get.go

// Copyright 2018 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package executor

import (
	"context"
	"errors"
	"fmt"
	"slices"
	"sync/atomic"
	"time"

	"github.com/pingcap/failpoint"
	"github.com/pingcap/tidb/pkg/executor/internal/exec"
	"github.com/pingcap/tidb/pkg/kv"
	"github.com/pingcap/tidb/pkg/meta/model"
	"github.com/pingcap/tidb/pkg/parser/ast"
	"github.com/pingcap/tidb/pkg/parser/mysql"
	"github.com/pingcap/tidb/pkg/planner/core/operator/physicalop"
	"github.com/pingcap/tidb/pkg/sessionctx"
	"github.com/pingcap/tidb/pkg/sessionctx/variable"
	driver "github.com/pingcap/tidb/pkg/store/driver/txn"
	"github.com/pingcap/tidb/pkg/table"
	"github.com/pingcap/tidb/pkg/tablecodec"
	"github.com/pingcap/tidb/pkg/types"
	"github.com/pingcap/tidb/pkg/util/chunk"
	"github.com/pingcap/tidb/pkg/util/codec"
	"github.com/pingcap/tidb/pkg/util/hack"
	"github.com/pingcap/tidb/pkg/util/intest"
	"github.com/pingcap/tidb/pkg/util/logutil/consistency"
	"github.com/pingcap/tidb/pkg/util/rowcodec"
	tikv "github.com/tikv/client-go/v2/kv"
	"github.com/tikv/client-go/v2/tikvrpc"
)

// BatchPointGetExec executes a bunch of point select queries.
type BatchPointGetExec struct {
	exec.BaseExecutor
	indexUsageReporter *exec.IndexUsageReporter
	tblInfo            *model.TableInfo
	idxInfo            *model.IndexInfo
	handles            []kv.Handle
	// planPhysIDs are the table/partition IDs for the handle or index read
	// (the index can be a secondary unique key that needs a lookup through the handle).
	planPhysIDs []int64
	// If != 0, it is a single partition under Static Prune mode.
	singlePartID   int64
	partitionNames []ast.CIStr
	idxVals        [][]types.Datum
	txn            kv.Transaction
	lock           bool
	waitTime       int64
	inited         uint32
	values         [][]byte
	index          int
	rowDecoder     *rowcodec.ChunkDecoder
	keepOrder      bool
	desc           bool
	batchGetter    kv.BatchGetter
	columns        []*model.ColumnInfo
	// virtualColumnIndex records the indices of all virtual columns, sorted in
	// definition order, so that the virtual columns can be computed in the right order.
	virtualColumnIndex []int
	// virtualColumnRetFieldTypes records the RetFieldTypes of virtual columns.
	virtualColumnRetFieldTypes []*types.FieldType
	snapshot                   kv.Snapshot
	stats                      *runtimeStatsWithSnapshot
}

// buildVirtualColumnInfo saves virtual column indices and sorts them in definition order
func (e *BatchPointGetExec) buildVirtualColumnInfo() {
	e.virtualColumnIndex = buildVirtualColumnIndex(e.Schema(), e.columns)
	if len(e.virtualColumnIndex) > 0 {
		e.virtualColumnRetFieldTypes = make([]*types.FieldType, len(e.virtualColumnIndex))
		for i, idx := range e.virtualColumnIndex {
			e.virtualColumnRetFieldTypes[i] = e.Schema().Columns[idx].RetType
		}
	}
}

// Open implements the Executor interface.
func (e *BatchPointGetExec) Open(context.Context) error {
	sessVars := e.Ctx().GetSessionVars()
	txnCtx := sessVars.TxnCtx
	txn, err := e.Ctx().Txn(false)
	if err != nil {
		return err
	}
	e.txn = txn
	setOptionForTopSQL(e.Ctx().GetSessionVars().StmtCtx, e.snapshot)
	var batchGetter kv.BatchGetter = e.snapshot
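	// Pick the getter used to fetch values in batch: a locking read must see the
	// pessimistic lock cache, a table under a READ/READ ONLY table lock may be
	// served from the point-get cache, and everything else reads through the
	// transaction's memory buffer layered on top of the snapshot.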
	if txn.Valid() {
		lock := e.tblInfo.Lock
		if e.lock {
			batchGetter = driver.NewBufferBatchGetter(txn.GetMemBuffer(), &PessimisticLockCacheGetter{txnCtx: txnCtx}, e.snapshot)
		} else if lock != nil && (lock.Tp == ast.TableLockRead || lock.Tp == ast.TableLockReadOnly) && e.Ctx().GetSessionVars().EnablePointGetCache {
			batchGetter = newCacheBatchGetter(e.Ctx(), e.tblInfo.ID, e.snapshot)
		} else {
			batchGetter = driver.NewBufferBatchGetter(txn.GetMemBuffer(), nil, e.snapshot)
		}
	}
	e.batchGetter = batchGetter
	return nil
}

// CacheTable always uses the session memBuffer as its snapshot.
// cacheTableSnapshot embeds kv.Snapshot and overrides the BatchGet and Get methods.
type cacheTableSnapshot struct {
	kv.Snapshot
	memBuffer kv.MemBuffer
}

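// BatchGet overrides the embedded kv.Snapshot's BatchGet and serves the keys from the
// session memory buffer only; keys that are missing or hold empty values are skipped.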
func (s cacheTableSnapshot) BatchGet(ctx context.Context, keys []kv.Key, options ...kv.BatchGetOption) (map[string]kv.ValueEntry, error) {
	if len(options) > 0 {
		var opt tikv.BatchGetOptions
		opt.Apply(options)
		if opt.ReturnCommitTS() {
			return nil, errors.New("WithReturnCommitTS option is not supported for cacheTableSnapshot.BatchGet")
		}
	}
	values := make(map[string]kv.ValueEntry)
	if s.memBuffer == nil {
		return values, nil
	}
	getOptions := kv.BatchGetToGetOptions(options)
	for _, key := range keys {
		val, err := s.memBuffer.Get(ctx, key, getOptions...)
		if kv.ErrNotExist.Equal(err) {
			continue
		}
		if err != nil {
			return nil, err
		}
		if val.IsValueEmpty() {
			continue
		}
		values[string(key)] = val
	}
	return values, nil
}

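// Get overrides the embedded kv.Snapshot's Get and serves the key from the session memory buffer.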
func (s cacheTableSnapshot) Get(ctx context.Context, key kv.Key, options ...kv.GetOption) (kv.ValueEntry, error) {
	if len(options) > 0 {
		var opt tikv.GetOptions
		opt.Apply(options)
		if opt.ReturnCommitTS() {
			return kv.ValueEntry{}, errors.New("WithReturnCommitTS option is not supported for cacheTableSnapshot.Get")
		}
	}
	return s.memBuffer.Get(ctx, key, options...)
}

// MockNewCacheTableSnapShot only serves for test.
func MockNewCacheTableSnapShot(snapshot kv.Snapshot, memBuffer kv.MemBuffer) *cacheTableSnapshot {
	return &cacheTableSnapshot{snapshot, memBuffer}
}

// Close implements the Executor interface.
func (e *BatchPointGetExec) Close() error {
	if e.RuntimeStats() != nil {
		defer func() {
			sc := e.Ctx().GetSessionVars().StmtCtx
			sc.RuntimeStatsColl.RegisterStats(e.ID(), e.stats)
			timeDetail := e.stats.SnapshotRuntimeStats.GetTimeDetail()
			if timeDetail != nil {
				e.Ctx().GetSessionVars().SQLCPUUsages.MergeTikvCPUTime(timeDetail.ProcessTime)
			}
		}()
	}
	if e.RuntimeStats() != nil && e.snapshot != nil {
		e.snapshot.SetOption(kv.CollectRuntimeStats, nil)
	}
	if e.indexUsageReporter != nil && e.stats != nil {
		kvReqTotal := e.stats.GetCmdRPCCount(tikvrpc.CmdBatchGet)
		// We cannot distinguish how many rows come from each partition, so the index
		// usage percentage is calculated against the row count of the whole table.
		rows := e.RuntimeStats().GetActRows()
		if e.idxInfo != nil {
			e.indexUsageReporter.ReportPointGetIndexUsage(e.tblInfo.ID, e.tblInfo.ID, e.idxInfo.ID, kvReqTotal, rows)
		} else {
			e.indexUsageReporter.ReportPointGetIndexUsageForHandle(e.tblInfo, e.tblInfo.ID, kvReqTotal, rows)
		}
	}
	e.inited = 0
	e.index = 0
	return nil
}

// Next implements the Executor interface.
func (e *BatchPointGetExec) Next(ctx context.Context, req *chunk.Chunk) error {
	req.Reset()
	if atomic.CompareAndSwapUint32(&e.inited, 0, 1) {
		if err := e.initialize(ctx); err != nil {
			return err
		}
		if e.lock {
			e.UpdateDeltaForTableID(e.tblInfo.ID)
		}
	}
	if e.index >= len(e.values) {
		return nil
	}
	schema := e.Schema()
	sctx := e.BaseExecutor.Ctx()
	start := e.index
	for !req.IsFull() && e.index < len(e.values) {
		handle, val := e.handles[e.index], e.values[e.index]
		err := DecodeRowValToChunk(sctx, schema, e.tblInfo, handle, val, req, e.rowDecoder)
		if err != nil {
			return err
		}
		e.index++
	}
	err := fillRowChecksum(sctx, start, e.index, schema, e.tblInfo, e.values, e.handles, req, nil)
	if err != nil {
		return err
	}
	err = table.FillVirtualColumnValue(e.virtualColumnRetFieldTypes, e.virtualColumnIndex, schema.Columns, e.columns, sctx.GetExprCtx(), req)
	if err != nil {
		return err
	}
	return nil
}

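// initialize resolves the keys to read and fetches all row values in batch.
// For a (non common-handle) unique index read it first batch-gets the index keys
// to decode the handles, then encodes row keys from the handles and batch-gets
// the row values, locking the keys as required when e.lock is set.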
func (e *BatchPointGetExec) initialize(ctx context.Context) error {
	var handleVals map[string]kv.ValueEntry
	var indexKeys []kv.Key
	var err error
	batchGetter := e.batchGetter
	maxExecutionTime := e.Ctx().GetSessionVars().GetMaxExecutionTime()
	if maxExecutionTime > 0 {
		// If MaxExecutionTime is set, we need to set the context deadline for the batch get.
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, time.Duration(maxExecutionTime)*time.Millisecond)
		defer cancel()
	}
	rc := e.Ctx().GetSessionVars().IsPessimisticReadConsistency()
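	// A unique secondary index read resolves the handles from the index first;
	// common-handle and int-handle point gets already carry their handles and
	// go straight to the row-key fetch below.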
	if e.idxInfo != nil && !isCommonHandleRead(e.tblInfo, e.idxInfo) {
		// `SELECT a, b FROM t WHERE (a, b) IN ((1, 2), (1, 2), (2, 1), (1, 2))` should not return duplicated rows
		dedup := make(map[hack.MutableString]struct{})
		toFetchIndexKeys := make([]kv.Key, 0, len(e.idxVals))
		for i, idxVals := range e.idxVals {
			physID := e.tblInfo.ID
			if e.singlePartID != 0 {
				physID = e.singlePartID
			} else if len(e.planPhysIDs) > i {
				physID = e.planPhysIDs[i]
			}
			idxKey, err1 := physicalop.EncodeUniqueIndexKey(e.Ctx(), e.tblInfo, e.idxInfo, idxVals, physID)
			if err1 != nil && !kv.ErrNotExist.Equal(err1) {
				return err1
			}
			if idxKey == nil {
				continue
			}
			s := hack.String(idxKey)
			if _, found := dedup[s]; found {
				continue
			}
			dedup[s] = struct{}{}
			toFetchIndexKeys = append(toFetchIndexKeys, idxKey)
		}
		if e.keepOrder {
			// TODO: if there are multiple partitions, the IDs need to be in the
			// same order as the index keys, and the table ID part should be
			// skipped when comparing.
			intest.Assert(e.singlePartID != 0 || len(e.planPhysIDs) <= 1 || e.idxInfo.Global)
			slices.SortFunc(toFetchIndexKeys, func(i, j kv.Key) int {
				if e.desc {
					return j.Cmp(i)
				}
				return i.Cmp(j)
			})
		}
		// Lock all keys in repeatable read isolation.
		// For read-consistency (RC) isolation, only lock existing keys;
		// indexKeys will be generated after getting the handles.
		if !rc {
			indexKeys = toFetchIndexKeys
		} else {
			indexKeys = make([]kv.Key, 0, len(toFetchIndexKeys))
		}
		// SELECT * FROM t WHERE x IN (null): in this case there is no key to fetch.
		if len(toFetchIndexKeys) == 0 {
			return nil
		}
		// Fetch all handles.
		handleVals, err = batchGetter.BatchGet(ctx, toFetchIndexKeys)
		if err != nil {
			return err
		}
		e.handles = make([]kv.Handle, 0, len(toFetchIndexKeys))
		if e.tblInfo.Partition != nil {
			e.planPhysIDs = e.planPhysIDs[:0]
		}
		for _, key := range toFetchIndexKeys {
			handleVal := handleVals[string(key)]
			if handleVal.IsValueEmpty() {
				continue
			}
			handle, err1 := tablecodec.DecodeHandleInIndexValue(handleVal.Value)
			if err1 != nil {
				return err1
			}
			if e.tblInfo.Partition != nil {
				var pid int64
				if e.idxInfo.Global {
					_, pid, err = codec.DecodeInt(tablecodec.SplitIndexValue(handleVal.Value).PartitionID)
					if err != nil {
						return err
					}
					if e.singlePartID != 0 && e.singlePartID != pid {
						continue
					}
					if !matchPartitionNames(pid, e.partitionNames, e.tblInfo.GetPartitionInfo()) {
						continue
					}
					e.planPhysIDs = append(e.planPhysIDs, pid)
				} else {
					pid = tablecodec.DecodeTableID(key)
					e.planPhysIDs = append(e.planPhysIDs, pid)
				}
				if e.lock {
					e.UpdateDeltaForTableID(pid)
				}
			}
			e.handles = append(e.handles, handle)
			if rc {
				indexKeys = append(indexKeys, key)
			}
		}
		// The injection is used to simulate the following scenario:
		// 1. Session A creates a point get query but pauses before the second `GET` of kv from the backend
		// 2. Session B creates an UPDATE query to update the record that will be obtained in step 1
		// 3. The point get then retrieves data from the backend after step 2 has finished
		// 4. Check the result
		failpoint.InjectContext(ctx, "batchPointGetRepeatableReadTest-step1", func() {
			if ch, ok := ctx.Value("batchPointGetRepeatableReadTest").(chan struct{}); ok {
				// Make `UPDATE` continue
				close(ch)
			}
			// Wait for `UPDATE` to finish
			failpoint.InjectContext(ctx, "batchPointGetRepeatableReadTest-step2", nil)
		})
	} else if e.keepOrder {
		less := func(i, j kv.Handle) int {
			if e.desc {
				return j.Compare(i)
			}
			return i.Compare(j)
		}
		if e.tblInfo.PKIsHandle && mysql.HasUnsignedFlag(e.tblInfo.GetPkColInfo().GetFlag()) {
			uintComparator := func(i, h kv.Handle) int {
				if !i.IsInt() || !h.IsInt() {
					panic(fmt.Sprintf("both handles need to be IntHandle, but got %T and %T ", i, h))
				}
				ihVal := uint64(i.IntValue())
				hVal := uint64(h.IntValue())
				if ihVal > hVal {
					return 1
				}
				if ihVal < hVal {
					return -1
				}
				return 0
			}
			less = func(i, j kv.Handle) int {
				if e.desc {
					return uintComparator(j, i)
				}
				return uintComparator(i, j)
			}
		}
		slices.SortFunc(e.handles, less)
		// TODO: for a partitioned table, sorting the handles would also require
		// rearranging the physIDs in the same order!
		intest.Assert(e.singlePartID != 0 || len(e.planPhysIDs) <= 1)
	}
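	// Encode a row key for each remaining handle; a non-positive physical
	// table/partition ID means the handle does not match any partition and is skipped.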
	keys := make([]kv.Key, 0, len(e.handles))
	newHandles := make([]kv.Handle, 0, len(e.handles))
	for i, handle := range e.handles {
		tID := e.tblInfo.ID
		if e.singlePartID != 0 {
			tID = e.singlePartID
		} else if len(e.planPhysIDs) > 0 {
			// Direct handle read
			tID = e.planPhysIDs[i]
		}
		if tID <= 0 {
			// not matching any partition
			continue
		}
		key := tablecodec.EncodeRowKeyWithHandle(tID, handle)
		keys = append(keys, key)
		newHandles = append(newHandles, handle)
	}
	e.handles = newHandles
	var values map[string]kv.ValueEntry
	// Lock the keys (both existing and non-existing ones) before fetching all values, for repeatable read isolation.
	if e.lock && !rc {
		lockKeys := make([]kv.Key, len(keys)+len(indexKeys))
		copy(lockKeys, keys)
		copy(lockKeys[len(keys):], indexKeys)
		err = LockKeys(ctx, e.Ctx(), e.waitTime, lockKeys...)
		if err != nil {
			return err
		}
	}
	// Fetch all values.
	values, err = batchGetter.BatchGet(ctx, keys)
	if err != nil {
		return err
	}
	handles := make([]kv.Handle, 0, len(values))
	var existKeys []kv.Key
	if e.lock && rc {
		existKeys = make([]kv.Key, 0, 2*len(values))
	}
	e.values = make([][]byte, 0, len(values))
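	// Keep only the rows whose values exist. A missing row for a handle resolved
	// from a secondary index indicates data-index inconsistency and is reported,
	// unless the statement runs with weak consistency.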
	for i, key := range keys {
		val := values[string(key)]
		if val.IsValueEmpty() {
			if e.idxInfo != nil && (!e.tblInfo.IsCommonHandle || !e.idxInfo.Primary) &&
				!e.Ctx().GetSessionVars().StmtCtx.WeakConsistency {
				return (&consistency.Reporter{
					HandleEncode: func(_ kv.Handle) kv.Key {
						return key
					},
					IndexEncode: func(_ *consistency.RecordData) kv.Key {
						return indexKeys[i]
					},
					Tbl:             e.tblInfo,
					Idx:             e.idxInfo,
					EnableRedactLog: e.Ctx().GetSessionVars().EnableRedactLog,
					Storage:         e.Ctx().GetStore(),
				}).ReportLookupInconsistent(ctx,
					1, 0,
					e.handles[i:i+1],
					e.handles,
					[]consistency.RecordData{{}},
				)
			}
			continue
		}
		e.values = append(e.values, val.Value)
		handles = append(handles, e.handles[i])
		if e.lock && rc {
			existKeys = append(existKeys, key)
			// When e.handles is set directly in the builder, the index is the primary key and the plan is a
			// common handle read. With clustered index enabled, indexKeys is empty in this situation, and
			// locking the primary key of a clustered index table would be redundant anyway.
			if len(indexKeys) != 0 {
				existKeys = append(existKeys, indexKeys[i])
			}
		}
	}
	// Lock only the existing keys, for read-committed (RC) isolation.
	if e.lock && rc {
		err = LockKeys(ctx, e.Ctx(), e.waitTime, existKeys...)
		if err != nil {
			return err
		}
	}
	e.handles = handles
	return nil
}

// LockKeys locks the keys for a pessimistic transaction.
func LockKeys(ctx context.Context, sctx sessionctx.Context, lockWaitTime int64, keys ...kv.Key) error {
	sessVars := sctx.GetSessionVars()
	if err := checkMaxExecutionTimeExceeded(sctx); err != nil {
		return err
	}
	txnCtx := sessVars.TxnCtx
	lctx, err := newLockCtx(sctx, lockWaitTime, len(keys))
	if err != nil {
		return err
	}
	if txnCtx.IsPessimistic {
		lctx.InitReturnValues(len(keys))
	}
	err = doLockKeys(ctx, sctx, lctx, keys...)
	if err != nil {
		return err
	}
	if txnCtx.IsPessimistic {
		// When doLockKeys returns without error, no other goroutine accesses the map,
		// so it's safe to read it without a mutex.
		for _, key := range keys {
			if v, ok := lctx.GetValueNotLocked(key); ok {
				txnCtx.SetPessimisticLockCache(key, v)
			}
		}
	}
	return nil
}

// PessimisticLockCacheGetter implements the kv.Getter interface.
// It is used as the middle cache layer when constructing the BufferBatchGetter.
type PessimisticLockCacheGetter struct {
	txnCtx *variable.TransactionContext
}

// Get implements the kv.Getter interface.
func (getter *PessimisticLockCacheGetter) Get(_ context.Context, key kv.Key, options ...kv.GetOption) (kv.ValueEntry, error) {
	if len(options) > 0 {
		var opt tikv.GetOptions
		opt.Apply(options)
		if opt.ReturnCommitTS() {
			return kv.ValueEntry{}, errors.New("WithReturnCommitTS option is not supported for PessimisticLockCacheGetter.Get")
		}
	}
	val, ok := getter.txnCtx.GetKeyInPessimisticLockCache(key)
	if ok {
		return kv.NewValueEntry(val, 0), nil
	}
	return kv.ValueEntry{}, kv.ErrNotExist
}

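// cacheBatchGetter batch-gets values through the store's memory cache for a table
// held under a READ/READ ONLY table lock (see Open), backed by the given snapshot.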
type cacheBatchGetter struct {
	ctx      sessionctx.Context
	tid      int64
	snapshot kv.Snapshot
}

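// BatchGet fetches each key via the memory cache's UnionGet and skips keys that do not exist.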
func (b *cacheBatchGetter) BatchGet(ctx context.Context, keys []kv.Key, options ...kv.BatchGetOption) (map[string]kv.ValueEntry, error) {
	if len(options) > 0 {
		var opt tikv.BatchGetOptions
		opt.Apply(options)
		if opt.ReturnCommitTS() {
			return nil, errors.New("WithReturnCommitTS option is not supported for cacheBatchGetter.BatchGet")
		}
	}
	cacheDB := b.ctx.GetStore().GetMemCache()
	vals := make(map[string]kv.ValueEntry)
	for _, key := range keys {
		val, err := cacheDB.UnionGet(ctx, b.tid, b.snapshot, key)
		if err != nil {
			if !kv.ErrNotExist.Equal(err) {
				return nil, err
			}
			continue
		}
		vals[string(key)] = kv.NewValueEntry(val, 0)
	}
	return vals, nil
}

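// newCacheBatchGetter creates a cacheBatchGetter for the given table ID and snapshot.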
func newCacheBatchGetter(ctx sessionctx.Context, tid int64, snapshot kv.Snapshot) *cacheBatchGetter {
	return &cacheBatchGetter{ctx, tid, snapshot}
}