// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package join

import (
	"context"
	"hash"
	"math"
	"math/bits"
	"math/rand"
	"runtime/trace"
	"sync"
	"sync/atomic"
	"time"
	"unsafe"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/tidb/pkg/executor/internal/exec"
	"github.com/pingcap/tidb/pkg/executor/join/joinversion"
	"github.com/pingcap/tidb/pkg/expression"
	"github.com/pingcap/tidb/pkg/parser/mysql"
	"github.com/pingcap/tidb/pkg/planner/core/base"
	"github.com/pingcap/tidb/pkg/sessionctx/vardef"
	"github.com/pingcap/tidb/pkg/types"
	"github.com/pingcap/tidb/pkg/util"
	"github.com/pingcap/tidb/pkg/util/channel"
	"github.com/pingcap/tidb/pkg/util/chunk"
	"github.com/pingcap/tidb/pkg/util/disk"
	"github.com/pingcap/tidb/pkg/util/intest"
	"github.com/pingcap/tidb/pkg/util/memory"
)

const minimalHashTableLen = 32

var (
	_ exec.Executor = &HashJoinV2Exec{}
	// EnableHashJoinV2 enables hash join v2; used for tests
	EnableHashJoinV2 = "set tidb_hash_join_version = " + joinversion.HashJoinVersionOptimized
	// DisableHashJoinV2 disables hash join v2; used for tests
	DisableHashJoinV2 = "set tidb_hash_join_version = " + joinversion.HashJoinVersionLegacy
	// HashJoinV2Strings is used for tests
	HashJoinV2Strings = []string{DisableHashJoinV2, EnableHashJoinV2}
	// fakeSel is used when a chunk does not have a sel field
	fakeSel []int
	// fakeSelLength is the length of fakeSel. The default max_chunk_size is 1024;
	// we set fakeSel's size to 4*max_chunk_size, which should be enough for most cases
	fakeSelLength = 4096
)

func init() {
	fakeSel = make([]int, fakeSelLength)
	for i := range fakeSel {
		fakeSel[i] = i
	}
}
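
// For illustration (not part of the executor logic): with the identity mapping
// above, a chunk that lacks a sel array can use fakeSel[:numRows] as its
// selection vector, e.g. a 3-row chunk sees fakeSel[:3] == []int{0, 1, 2}.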

type hashTableContext struct {
	// rowTables is used during the split-partition stage; each build worker has
	// its own rowTable
	rowTables     [][]*rowTable
	hashTable     *hashTableV2
	tagHelper     *tagPtrHelper
	memoryTracker *memory.Tracker
}

func (htc *hashTableContext) reset() {
	htc.rowTables = nil
	htc.hashTable = nil
	htc.tagHelper = nil
	htc.memoryTracker.Detach()
}

func (htc *hashTableContext) getAllMemoryUsageInHashTable() int64 {
	partNum := len(htc.hashTable.tables)
	totalMemoryUsage := int64(0)
	for i := range partNum {
		mem := htc.hashTable.getPartitionMemoryUsage(i)
		totalMemoryUsage += mem
	}
	return totalMemoryUsage
}

func (htc *hashTableContext) clearHashTable() {
	partNum := len(htc.hashTable.tables)
	for i := range partNum {
		htc.hashTable.clearPartitionSegments(i)
	}
}

func (htc *hashTableContext) getPartitionMemoryUsage(partID int) int64 {
	totalMemoryUsage := int64(0)
	for _, tables := range htc.rowTables {
		if tables != nil && tables[partID] != nil {
			totalMemoryUsage += tables[partID].getTotalMemoryUsage()
		}
	}

	return totalMemoryUsage
}

func (htc *hashTableContext) getSegmentsInRowTable(workerID, partitionID int) []*rowTableSegment {
	if htc.rowTables[workerID] != nil && htc.rowTables[workerID][partitionID] != nil {
		return htc.rowTables[workerID][partitionID].getSegments()
	}

	return nil
}

func (htc *hashTableContext) getAllSegmentsMemoryUsageInRowTable() int64 {
	totalMemoryUsage := int64(0)
	for _, tables := range htc.rowTables {
		for _, table := range tables {
			if table != nil {
				totalMemoryUsage += table.getTotalMemoryUsage()
			}
		}
	}
	return totalMemoryUsage
}

func (htc *hashTableContext) clearAllSegmentsInRowTable() {
	for _, tables := range htc.rowTables {
		for _, table := range tables {
			if table != nil {
				table.clearSegments()
			}
		}
	}
}

func (htc *hashTableContext) clearSegmentsInRowTable(workerID, partitionID int) {
	if htc.rowTables[workerID] != nil && htc.rowTables[workerID][partitionID] != nil {
		htc.rowTables[workerID][partitionID].clearSegments()
	}
}

func (htc *hashTableContext) build(task *buildTask) {
	htc.hashTable.tables[task.partitionIdx].build(task.segStartIdx, task.segEndIdx, htc.tagHelper)
}

func (htc *hashTableContext) lookup(partitionIndex int, hashValue uint64) taggedPtr {
	return htc.hashTable.tables[partitionIndex].lookup(hashValue, htc.tagHelper)
}

func (htc *hashTableContext) appendRowSegment(workerID, partitionID int, seg *rowTableSegment) {
	if len(seg.hashValues) == 0 {
		return
	}

	if htc.rowTables[workerID][partitionID] == nil {
		htc.rowTables[workerID][partitionID] = newRowTable()
	}

	seg.initTaggedBits()
	htc.rowTables[workerID][partitionID].segments = append(htc.rowTables[workerID][partitionID].segments, seg)
}

func (*hashTableContext) calculateHashTableMemoryUsage(rowTables []*rowTable) (int64, []int64) {
	totalMemoryUsage := int64(0)
	partitionsMemoryUsage := make([]int64, 0)
	for _, table := range rowTables {
		hashTableLength := getHashTableLengthByRowTable(table)
		memoryUsage := getHashTableMemoryUsage(hashTableLength)
		partitionsMemoryUsage = append(partitionsMemoryUsage, memoryUsage)
		totalMemoryUsage += memoryUsage
	}
	return totalMemoryUsage, partitionsMemoryUsage
}

// To avoid unnecessary hash table allocation, we pre-calculate the memory usage
// in advance so we know which hash tables need to be created.
func (htc *hashTableContext) tryToSpill(rowTables []*rowTable, spillHelper *hashJoinSpillHelper) ([]*rowTable, error) {
	totalMemoryUsage, hashTableMemoryUsage := htc.calculateHashTableMemoryUsage(rowTables)

	// Pre-consume the memory usage
	htc.memoryTracker.Consume(totalMemoryUsage)

	if spillHelper != nil && spillHelper.isSpillNeeded() {
		spillHelper.spillTriggeredBeforeBuildingHashTableForTest = true
		err := spillHelper.spillRowTable(hashTableMemoryUsage)
		if err != nil {
			return nil, err
		}

		spilledPartition := spillHelper.getSpilledPartitions()
		for _, partID := range spilledPartition {
			// Clear spilled row tables
			rowTables[partID].clearSegments()
		}

		// Though some partitions have been spilled or are empty, their hash tables are still created,
		// because probe rows in these partitions may access their hash tables.
		// We need to account for this memory usage.
		totalDefaultMemUsage := getHashTableMemoryUsage(minimalHashTableLen) * int64(len(spilledPartition))

		// The spilled hash tables' memory usage has already been released by the spill
		// operation, so it's unnecessary to release it again.
		htc.memoryTracker.Consume(totalDefaultMemUsage)
	}

	return rowTables, nil
}

func (htc *hashTableContext) mergeRowTablesToHashTable(partitionNumber uint, spillHelper *hashJoinSpillHelper) (int, error) {
	rowTables := make([]*rowTable, partitionNumber)
	for i := range partitionNumber {
		rowTables[i] = newRowTable()
	}

	totalSegmentCnt := 0
	for _, rowTablesPerWorker := range htc.rowTables {
		for partIdx, rt := range rowTablesPerWorker {
			if rt == nil {
				continue
			}
			rowTables[partIdx].merge(rt)
			totalSegmentCnt += len(rt.segments)
		}
	}

	var err error

	// spillHelper may be nil in unit tests
	if spillHelper != nil {
		rowTables, err = htc.tryToSpill(rowTables, spillHelper)
		if err != nil {
			return 0, err
		}

		spillHelper.setCanSpillFlag(false)
	}

	taggedBits := uint8(maxTaggedBits)
	for i := range partitionNumber {
		for _, seg := range rowTables[i].segments {
			taggedBits = min(taggedBits, seg.taggedBits)
		}
		htc.hashTable.tables[i] = newSubTable(rowTables[i])
	}

	htc.tagHelper = &tagPtrHelper{}
	htc.tagHelper.init(taggedBits)

	htc.clearAllSegmentsInRowTable()
	return totalSegmentCnt, nil
}
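
// A sketch of the merge above: with 2 build workers and 4 partitions,
// htc.rowTables is a 2x4 matrix of worker-local row tables. The merge collapses
// it column-wise, so that rowTables[p] ends up holding every worker's segments
// for partition p before the per-partition sub hash tables are created.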

// HashJoinCtxV2 is the hash join ctx used in hash join v2
type HashJoinCtxV2 struct {
	hashJoinCtxBase
	partitionNumber     uint
	partitionMaskOffset int
	ProbeKeyTypes       []*types.FieldType
	BuildKeyTypes       []*types.FieldType
	stats               *hashJoinRuntimeStatsV2

	RightAsBuildSide               bool
	BuildFilter                    expression.CNFExprs
	ProbeFilter                    expression.CNFExprs
	OtherCondition                 expression.CNFExprs
	hashTableContext               *hashTableContext
	hashTableMeta                  *joinTableMeta
	needScanRowTableAfterProbeDone bool

	LUsed, RUsed                                 []int
	LUsedInOtherCondition, RUsedInOtherCondition []int

	maxSpillRound int
	spillHelper   *hashJoinSpillHelper
	spillAction   *hashJoinSpillAction
}

func (hCtx *HashJoinCtxV2) resetHashTableContextForRestore() {
	memoryUsage := hCtx.hashTableContext.getAllSegmentsMemoryUsageInRowTable()
	if intest.InTest && memoryUsage != 0 {
		panic("All rowTables in hashTableContext should be cleared")
	}

	memoryUsage = hCtx.hashTableContext.getAllMemoryUsageInHashTable()
	hCtx.hashTableContext.clearHashTable()
	hCtx.hashTableContext.memoryTracker.Consume(-memoryUsage)
}

// partitionNumber is always a power of 2
func genHashJoinPartitionNumber(partitionHint uint) uint {
	partitionNumber := uint(1)
	for partitionNumber < partitionHint && partitionNumber < 16 {
		partitionNumber <<= 1
	}
	return partitionNumber
}
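
// For example, genHashJoinPartitionNumber(5) returns 8 (the smallest power of 2
// that is >= 5), and any hint >= 16 is capped at 16 partitions.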

func getPartitionMaskOffset(partitionNumber uint) int {
	msbPos := bits.TrailingZeros64(uint64(partitionNumber))
	// the top msbPos bits of the hash value will be used to partition data
	return 64 - msbPos
}
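
// For example, with partitionNumber == 8, bits.TrailingZeros64(8) == 3, so the
// returned offset is 61: shifting a 64-bit hash value right by 61 bits (see
// generatePartitionIndex below) yields a partition index in [0, 8) taken from
// the top 3 bits.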

// SetupPartitionInfo sets up partitionNumber and partitionMaskOffset based on concurrency
func (hCtx *HashJoinCtxV2) SetupPartitionInfo() {
	hCtx.partitionNumber = genHashJoinPartitionNumber(hCtx.Concurrency)
	hCtx.partitionMaskOffset = getPartitionMaskOffset(hCtx.partitionNumber)
}

// initHashTableContext creates the hashTableContext for the current HashJoinCtxV2
func (hCtx *HashJoinCtxV2) initHashTableContext() {
	hCtx.hashTableContext = &hashTableContext{}
	hCtx.hashTableContext.rowTables = make([][]*rowTable, hCtx.Concurrency)
	for index := range hCtx.hashTableContext.rowTables {
		hCtx.hashTableContext.rowTables[index] = make([]*rowTable, hCtx.partitionNumber)
	}
	hCtx.hashTableContext.hashTable = &hashTableV2{
		tables:          make([]*subTable, hCtx.partitionNumber),
		partitionNumber: uint64(hCtx.partitionNumber),
	}
	hCtx.hashTableContext.memoryTracker = memory.NewTracker(memory.LabelForHashTableInHashJoinV2, -1)
}

// ProbeSideTupleFetcherV2 reads tuples from ProbeSideExec and sends them to ProbeWorkers.
type ProbeSideTupleFetcherV2 struct {
	probeSideTupleFetcherBase
	*HashJoinCtxV2
	canSkipProbeIfHashTableIsEmpty bool
}

// ProbeWorkerV2 is the probe worker used in hash join v2
type ProbeWorkerV2 struct {
	probeWorkerBase
	HashJoinCtx *HashJoinCtxV2
	// We build an individual joinProbe for each join worker when using chunk-based
	// execution, to avoid concurrent access to joiner.chk and joiner.selected.
	JoinProbe ProbeV2

	restoredChkBuf *chunk.Chunk
}

func (w *ProbeWorkerV2) updateProbeStatistic(start time.Time, probeTime int64) {
	t := time.Since(start)
	atomic.AddInt64(&w.HashJoinCtx.stats.probe, probeTime)
	atomic.AddInt64(&w.HashJoinCtx.stats.workerFetchAndProbe, int64(t))
	setMaxValue(&w.HashJoinCtx.stats.maxProbeForCurrentRound, probeTime)
	setMaxValue(&w.HashJoinCtx.stats.maxWorkerFetchAndProbeForCurrentRound, int64(t))
}

func (w *ProbeWorkerV2) restoreAndProbe(inDisk *chunk.DataInDiskByChunks, start time.Time) {
	probeTime := int64(0)
	if w.HashJoinCtx.stats != nil {
		defer func() {
			w.updateProbeStatistic(start, probeTime)
		}()
	}

	ok, joinResult := w.getNewJoinResult()
	if !ok {
		return
	}

	chunkNum := inDisk.NumChunks()

	for i := range chunkNum {
		select {
		case <-w.HashJoinCtx.closeCh:
			return
		default:
		}
		failpoint.Inject("ConsumeRandomPanic", nil)

		err := inDisk.FillChunk(i, w.restoredChkBuf)
		if err != nil {
			joinResult.err = err
			break
		}

		err = triggerIntest(2)
		if err != nil {
			joinResult.err = err
			break
		}

		start := time.Now()
		waitTime := int64(0)
		ok, waitTime, joinResult = w.processOneRestoredProbeChunk(joinResult)
		probeTime += int64(time.Since(start)) - waitTime
		if !ok {
			break
		}
	}

	err := w.JoinProbe.SpillRemainingProbeChunks()
	if err != nil {
		joinResult.err = err
	}

	if joinResult.err != nil || (joinResult.chk != nil && joinResult.chk.NumRows() > 0) {
		w.HashJoinCtx.joinResultCh <- joinResult
	} else if joinResult.chk != nil && joinResult.chk.NumRows() == 0 {
		w.joinChkResourceCh <- joinResult.chk
	}
}

// BuildWorkerV2 is the build worker used in hash join v2
type BuildWorkerV2 struct {
	buildWorkerBase
	HashJoinCtx    *HashJoinCtxV2
	BuildTypes     []*types.FieldType
	HasNullableKey bool
	WorkerID       uint
	builder        *rowTableBuilder
	restoredChkBuf *chunk.Chunk
}

func (b *BuildWorkerV2) getSegmentsInRowTable(partID int) []*rowTableSegment {
	return b.HashJoinCtx.hashTableContext.getSegmentsInRowTable(int(b.WorkerID), partID)
}

func (b *BuildWorkerV2) clearSegmentsInRowTable(partID int) {
	b.HashJoinCtx.hashTableContext.clearSegmentsInRowTable(int(b.WorkerID), partID)
}

func (b *BuildWorkerV2) updatePartitionData(cost int64) {
	atomic.AddInt64(&b.HashJoinCtx.stats.partitionData, cost)
	setMaxValue(&b.HashJoinCtx.stats.maxPartitionDataForCurrentRound, cost)
}

func (b *BuildWorkerV2) processOneRestoredChunk(cost *int64) error {
	start := time.Now()
	err := b.builder.processOneRestoredChunk(b.restoredChkBuf, b.HashJoinCtx, int(b.WorkerID), int(b.HashJoinCtx.partitionNumber))
	if err != nil {
		return err
	}
	*cost += int64(time.Since(start))
	return nil
}

func (b *BuildWorkerV2) splitPartitionAndAppendToRowTableForRestoreImpl(i int, inDisk *chunk.DataInDiskByChunks, fetcherAndWorkerSyncer *sync.WaitGroup, hasErr bool, cost *int64) (err error) {
	defer func() {
		fetcherAndWorkerSyncer.Done()

		if r := recover(); r != nil {
			// We shouldn't throw the panic out of this function, or
			// we can't continue to consume the `syncCh` channel and call
			// the `Done` function of `fetcherAndWorkerSyncer`.
			// So it's necessary to handle it here.
			err = util.GetRecoverError(r)
		}
	}()

	if hasErr {
		return nil
	}

	err = inDisk.FillChunk(i, b.restoredChkBuf)
	if err != nil {
		return err
	}

	err = triggerIntest(3)
	if err != nil {
		return err
	}

	err = b.processOneRestoredChunk(cost)
	if err != nil {
		return err
	}
	return nil
}

func (b *BuildWorkerV2) splitPartitionAndAppendToRowTableForRestore(inDisk *chunk.DataInDiskByChunks, syncCh chan *chunk.Chunk, fetcherAndWorkerSyncer *sync.WaitGroup, errCh chan error, doneCh chan struct{}) {
	cost := int64(0)
	defer func() {
		if b.HashJoinCtx.stats != nil {
			b.updatePartitionData(cost)
		}
	}()

	// When an error happens, hasErr will be set to true.
	// However, we should not directly exit the function, as we must
	// call `fetcherAndWorkerSyncer.Done()` in `splitPartitionAndAppendToRowTableForRestoreImpl`.
	// fetcherAndWorkerSyncer is a counter for synchronization; it should be `Done` `chunkNum` times.
	// When `hasErr` is set, `splitPartitionAndAppendToRowTableForRestoreImpl` can exit early.
	hasErr := false

	chunkNum := inDisk.NumChunks()
	for i := range chunkNum {
		_, ok := <-syncCh
		if !ok {
			break
		}

		err := b.splitPartitionAndAppendToRowTableForRestoreImpl(i, inDisk, fetcherAndWorkerSyncer, hasErr, &cost)
		if err != nil {
			hasErr = true
			handleErr(err, errCh, doneCh)
		}
	}
}

func (b *BuildWorkerV2) splitPartitionAndAppendToRowTable(typeCtx types.Context, fetcherAndWorkerSyncer *sync.WaitGroup, srcChkCh chan *chunk.Chunk, errCh chan error, doneCh chan struct{}) {
	cost := int64(0)
	defer func() {
		if b.HashJoinCtx.stats != nil {
			b.updatePartitionData(cost)
		}
	}()

	// When an error happens, hasErr will be set to true.
	// However, we should not directly exit the function, as we must
	// call `fetcherAndWorkerSyncer.Done()` in `splitPartitionAndAppendToRowTableImpl`.
	// fetcherAndWorkerSyncer is a counter for synchronization; it should be `Done` once per chunk.
	// When `hasErr` is set, `splitPartitionAndAppendToRowTableImpl` can exit early.
	hasErr := false

	for chk := range srcChkCh {
		err := b.splitPartitionAndAppendToRowTableImpl(typeCtx, chk, fetcherAndWorkerSyncer, hasErr, &cost)
		if err != nil {
			hasErr = true
			handleErr(err, errCh, doneCh)
		}
	}
}

func (b *BuildWorkerV2) processOneChunk(typeCtx types.Context, chk *chunk.Chunk, cost *int64) error {
	start := time.Now()
	err := b.builder.processOneChunk(chk, typeCtx, b.HashJoinCtx, int(b.WorkerID))
	failpoint.Inject("splitPartitionPanic", nil)
	*cost += int64(time.Since(start))
	return err
}

func (b *BuildWorkerV2) splitPartitionAndAppendToRowTableImpl(typeCtx types.Context, chk *chunk.Chunk, fetcherAndWorkerSyncer *sync.WaitGroup, hasErr bool, cost *int64) error {
	defer func() {
		fetcherAndWorkerSyncer.Done()
	}()

	if hasErr {
		return nil
	}

	err := triggerIntest(5)
	if err != nil {
		return err
	}

	err = b.processOneChunk(typeCtx, chk, cost)
	if err != nil {
		return err
	}
	return nil
}

// buildHashTable builds the hash table from the build tasks received on `taskCh`.
func (b *BuildWorkerV2) buildHashTable(taskCh chan *buildTask) error {
	cost := int64(0)
	defer func() {
		if b.HashJoinCtx.stats != nil {
			atomic.AddInt64(&b.HashJoinCtx.stats.buildHashTable, cost)
			setMaxValue(&b.HashJoinCtx.stats.maxBuildHashTableForCurrentRound, cost)
		}
	}()
	for task := range taskCh {
		start := time.Now()
		b.HashJoinCtx.hashTableContext.build(task)
		failpoint.Inject("buildHashTablePanic", nil)
		cost += int64(time.Since(start))
		err := triggerIntest(5)
		if err != nil {
			return err
		}
	}
	return nil
}

// NewJoinBuildWorkerV2 creates a BuildWorkerV2
func NewJoinBuildWorkerV2(ctx *HashJoinCtxV2, workID uint, buildSideExec exec.Executor, buildKeyColIdx []int, buildTypes []*types.FieldType) *BuildWorkerV2 {
	hasNullableKey := false
	for _, idx := range buildKeyColIdx {
		if !mysql.HasNotNullFlag(buildTypes[idx].GetFlag()) {
			hasNullableKey = true
			break
		}
	}
	worker := &BuildWorkerV2{
		HashJoinCtx:    ctx,
		BuildTypes:     buildTypes,
		WorkerID:       workID,
		HasNullableKey: hasNullableKey,
	}
	worker.BuildSideExec = buildSideExec
	worker.BuildKeyColIdx = buildKeyColIdx
	return worker
}

// HashJoinV2Exec implements the hash join algorithm.
type HashJoinV2Exec struct {
	exec.BaseExecutor
	*HashJoinCtxV2

	ProbeSideTupleFetcher *ProbeSideTupleFetcherV2
	ProbeWorkers          []*ProbeWorkerV2
	BuildWorkers          []*BuildWorkerV2

	workerWg util.WaitGroupWrapper
	waiterWg util.WaitGroupWrapper

	restoredBuildInDisk []*chunk.DataInDiskByChunks
	restoredProbeInDisk []*chunk.DataInDiskByChunks

	prepared  bool
	inRestore bool

	IsGA bool

	isMemoryClearedForTest bool

	FileNamePrefixForTest string
}

func (e *HashJoinV2Exec) isAllMemoryClearedForTest() bool {
	return e.isMemoryClearedForTest
}

func (e *HashJoinV2Exec) initMaxSpillRound() {
	if e.partitionNumber > 1024 {
		e.maxSpillRound = 1
		return
	}

	// Calculate the minimum number of rounds required for the total number of partitions to reach 1024
	e.maxSpillRound = int(math.Log(1024) / math.Log(float64(e.partitionNumber)))
}
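
// For example, with partitionNumber == 4, maxSpillRound is
// int(math.Log(1024)/math.Log(4)) == 5 in exact arithmetic, since 4^5 == 1024
// total partitions after 5 rounds of re-partitioning.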

// Close implements the Executor Close interface.
func (e *HashJoinV2Exec) Close() error {
	if e.closeCh != nil {
		close(e.closeCh)
	}
	e.finished.Store(true)
	if e.prepared {
		if e.buildFinished != nil {
			channel.Clear(e.buildFinished)
		}
		if e.joinResultCh != nil {
			channel.Clear(e.joinResultCh)
		}
		if e.ProbeSideTupleFetcher.probeChkResourceCh != nil {
			close(e.ProbeSideTupleFetcher.probeChkResourceCh)
			channel.Clear(e.ProbeSideTupleFetcher.probeChkResourceCh)
		}
		for i := range e.ProbeSideTupleFetcher.probeResultChs {
			channel.Clear(e.ProbeSideTupleFetcher.probeResultChs[i])
		}
		for i := range e.ProbeWorkers {
			close(e.ProbeWorkers[i].joinChkResourceCh)
			channel.Clear(e.ProbeWorkers[i].joinChkResourceCh)
		}
		e.ProbeSideTupleFetcher.probeChkResourceCh = nil
		e.waiterWg.Wait()
		e.hashTableContext.reset()
	}
	for _, w := range e.ProbeWorkers {
		w.joinChkResourceCh = nil
	}

	if e.stats != nil {
		defer e.Ctx().GetSessionVars().StmtCtx.RuntimeStatsColl.RegisterStats(e.ID(), e.stats)
	}
	e.releaseDisk()
	if e.spillHelper != nil {
		e.spillHelper.close()
	}
	err := e.BaseExecutor.Close()
	return err
}

// Open implements the Executor Open interface.
func (e *HashJoinV2Exec) Open(ctx context.Context) error {
	if err := e.BaseExecutor.Open(ctx); err != nil {
		e.closeCh = nil
		e.prepared = false
		return err
	}
	return e.OpenSelf()
}

// OpenSelf opens hash join itself and initializes the hash join context.
func (e *HashJoinV2Exec) OpenSelf() error {
	e.prepared = false
	e.inRestore = false
	needScanRowTableAfterProbeDone := e.ProbeWorkers[0].JoinProbe.NeedScanRowTable()
	e.HashJoinCtxV2.needScanRowTableAfterProbeDone = needScanRowTableAfterProbeDone
	if e.RightAsBuildSide {
		e.hashTableMeta = newTableMeta(e.BuildWorkers[0].BuildKeyColIdx, e.BuildWorkers[0].BuildTypes,
			e.BuildKeyTypes, e.ProbeKeyTypes, e.RUsedInOtherCondition, e.RUsed, needScanRowTableAfterProbeDone)
	} else {
		e.hashTableMeta = newTableMeta(e.BuildWorkers[0].BuildKeyColIdx, e.BuildWorkers[0].BuildTypes,
			e.BuildKeyTypes, e.ProbeKeyTypes, e.LUsedInOtherCondition, e.LUsed, needScanRowTableAfterProbeDone)
	}
	e.HashJoinCtxV2.ChunkAllocPool = e.AllocPool
	if e.memTracker != nil {
		e.memTracker.Reset()
	} else {
		e.memTracker = memory.NewTracker(e.ID(), -1)
	}
	e.memTracker.AttachTo(e.Ctx().GetSessionVars().StmtCtx.MemTracker)

	if e.diskTracker != nil {
		e.diskTracker.Reset()
	} else {
		e.diskTracker = disk.NewTracker(e.ID(), -1)
	}
	e.diskTracker.AttachTo(e.Ctx().GetSessionVars().StmtCtx.DiskTracker)
	e.spillHelper = newHashJoinSpillHelper(e, int(e.partitionNumber), e.ProbeSideTupleFetcher.ProbeSideExec.RetFieldTypes(), e.FileNamePrefixForTest)
	e.maxSpillRound = 1

	if vardef.EnableTmpStorageOnOOM.Load() && e.partitionNumber > 1 {
		e.initMaxSpillRound()
		e.spillAction = newHashJoinSpillAction(e.spillHelper)
		e.Ctx().GetSessionVars().MemTracker.FallbackOldAndSetNewAction(e.spillAction)
	}

	e.workerWg = util.WaitGroupWrapper{}
	e.waiterWg = util.WaitGroupWrapper{}
	e.closeCh = make(chan struct{})
	e.finished.Store(false)

	if e.RuntimeStats() != nil && e.stats == nil {
		e.stats = &hashJoinRuntimeStatsV2{}
		e.stats.concurrent = int(e.Concurrency)
	}

	if e.stats != nil {
		e.stats.reset()
		e.stats.spill.partitionNum = int(e.partitionNumber)
		e.stats.isHashJoinGA = e.IsGA
	}
	return nil
}

func (fetcher *ProbeSideTupleFetcherV2) shouldLimitProbeFetchSize() bool {
	if fetcher.JoinType == base.LeftOuterJoin && fetcher.RightAsBuildSide {
		return true
	}
	if fetcher.JoinType == base.RightOuterJoin && !fetcher.RightAsBuildSide {
		return true
	}
	return false
}
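
// A reading of the limit above: in both cases the probe side is the outer side,
// so every probe row produces at least one output row, and the fetcher can
// honor the parent's required-rows hint instead of reading the probe side at
// full speed.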

func (e *HashJoinV2Exec) canSkipProbeIfHashTableIsEmpty() bool {
	switch e.JoinType {
	case base.InnerJoin:
		return true
	case base.LeftOuterJoin:
		return !e.RightAsBuildSide
	case base.RightOuterJoin:
		return e.RightAsBuildSide
	case base.SemiJoin:
		return e.RightAsBuildSide
	default:
		return false
	}
}
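
// For example, an inner join with an empty build side can never produce a
// match, so reading the probe side can be skipped entirely; a left outer join
// with the right side as build must still read the probe side to emit probe
// rows padded with NULLs, hence it returns false in that case.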

func (e *HashJoinV2Exec) initializeForProbe() {
	e.ProbeSideTupleFetcher.HashJoinCtxV2 = e.HashJoinCtxV2
	// e.joinResultCh is for transmitting the join result chunks to the main thread.
	e.joinResultCh = make(chan *hashjoinWorkerResult, e.Concurrency+1)
	e.ProbeSideTupleFetcher.initializeForProbeBase(e.Concurrency, e.joinResultCh)
	e.ProbeSideTupleFetcher.canSkipProbeIfHashTableIsEmpty = e.canSkipProbeIfHashTableIsEmpty()
	// set buildSuccess to false by default; it will be set to true if the build finishes successfully
	e.ProbeSideTupleFetcher.buildSuccess = false

	for i := range e.Concurrency {
		e.ProbeWorkers[i].initializeForProbe(e.ProbeSideTupleFetcher.probeChkResourceCh, e.ProbeSideTupleFetcher.probeResultChs[i], e)
		e.ProbeWorkers[i].JoinProbe.ResetProbeCollision()
	}
}

func (e *HashJoinV2Exec) startProbeFetcher(ctx context.Context) {
	if !e.inRestore {
		fetchProbeSideChunksFunc := func() {
			defer trace.StartRegion(ctx, "HashJoinProbeSideFetcher").End()
			e.ProbeSideTupleFetcher.fetchProbeSideChunks(
				ctx,
				e.MaxChunkSize(),
				func() bool { return e.ProbeSideTupleFetcher.hashTableContext.hashTable.isHashTableEmpty() },
				func() bool { return e.spillHelper.isSpillTriggered() },
				e.ProbeSideTupleFetcher.canSkipProbeIfHashTableIsEmpty,
				e.ProbeSideTupleFetcher.needScanRowTableAfterProbeDone,
				e.ProbeSideTupleFetcher.shouldLimitProbeFetchSize(),
				&e.ProbeSideTupleFetcher.hashJoinCtxBase)
		}
		e.workerWg.RunWithRecover(fetchProbeSideChunksFunc, e.ProbeSideTupleFetcher.handleProbeSideFetcherPanic)
	}
}

func (e *HashJoinV2Exec) startProbeJoinWorkers(ctx context.Context) {
	var start time.Time
	if e.HashJoinCtxV2.stats != nil {
		start = time.Now()
	}

	if e.inRestore {
		// Wait for the restore build
		err := <-e.buildFinished
		if err != nil {
			return
		}
		// In restore, there is no standalone probe fetcher goroutine, so set buildSuccess here
		e.ProbeSideTupleFetcher.buildSuccess = true
	}

	for i := range e.Concurrency {
		workerID := i
		e.workerWg.RunWithRecover(func() {
			defer trace.StartRegion(ctx, "HashJoinWorker").End()
			if e.inRestore {
				e.ProbeWorkers[workerID].restoreAndProbe(e.restoredProbeInDisk[workerID], start)
			} else {
				e.ProbeWorkers[workerID].runJoinWorker(start)
			}
		}, e.ProbeWorkers[workerID].handleProbeWorkerPanic)
	}
}

func (e *HashJoinV2Exec) fetchAndProbeHashTable(ctx context.Context) {
	start := time.Now()
	e.startProbeFetcher(ctx)

	// Join workers read data directly from disk when we are in restore status,
	// and from the fetcher otherwise.
	e.startProbeJoinWorkers(ctx)

	e.waiterWg.RunWithRecover(
		func() {
			e.waitJoinWorkers(start)
		}, nil)
}

func (w *ProbeWorkerV2) handleProbeWorkerPanic(r any) {
	if r != nil {
		w.HashJoinCtx.joinResultCh <- &hashjoinWorkerResult{err: util.GetRecoverError(r)}
	}
}

func (e *HashJoinV2Exec) handleJoinWorkerPanic(r any) {
	if r != nil {
		e.joinResultCh <- &hashjoinWorkerResult{err: util.GetRecoverError(r)}
	}
}

func (e *HashJoinV2Exec) waitJoinWorkers(start time.Time) {
	e.workerWg.Wait()
	if e.stats != nil {
		e.HashJoinCtxV2.stats.fetchAndProbe += int64(time.Since(start))
		for _, prober := range e.ProbeWorkers {
			e.stats.probeCollision += int64(prober.JoinProbe.GetProbeCollision())
		}
	}

	if e.ProbeSideTupleFetcher.buildSuccess {
		// only scan the row table if the build is successful
		if e.ProbeWorkers[0] != nil && e.ProbeWorkers[0].JoinProbe.NeedScanRowTable() {
			for i := range e.Concurrency {
				var workerID = i
				e.workerWg.RunWithRecover(func() {
					e.ProbeWorkers[workerID].scanRowTableAfterProbeDone()
				}, e.handleJoinWorkerPanic)
			}
			e.workerWg.Wait()
		}
	}
}

func (w *ProbeWorkerV2) scanRowTableAfterProbeDone() {
	w.JoinProbe.InitForScanRowTable()
	ok, joinResult := w.getNewJoinResult()
	if !ok {
		return
	}
	for !w.JoinProbe.IsScanRowTableDone() {
		joinResult = w.JoinProbe.ScanRowTable(joinResult, &w.HashJoinCtx.SessCtx.GetSessionVars().SQLKiller)
		if joinResult.err != nil {
			w.HashJoinCtx.joinResultCh <- joinResult
			return
		}

		err := triggerIntest(4)
		if err != nil {
			w.HashJoinCtx.joinResultCh <- &hashjoinWorkerResult{err: err}
			return
		}

		if joinResult.chk.IsFull() {
			w.HashJoinCtx.joinResultCh <- joinResult
			ok, joinResult = w.getNewJoinResult()
			if !ok {
				return
			}
		}
	}

	if joinResult.err != nil || (joinResult.chk != nil && joinResult.chk.NumRows() > 0) {
		w.HashJoinCtx.joinResultCh <- joinResult
	} else if joinResult.chk != nil && joinResult.chk.NumRows() == 0 {
		w.joinChkResourceCh <- joinResult.chk
	}
}

func (w *ProbeWorkerV2) processOneRestoredProbeChunk(joinResult *hashjoinWorkerResult) (ok bool, waitTime int64, _ *hashjoinWorkerResult) {
	joinResult.err = w.JoinProbe.SetRestoredChunkForProbe(w.restoredChkBuf)
	if joinResult.err != nil {
		return false, 0, joinResult
	}
	return w.probeAndSendResult(joinResult)
}

func (w *ProbeWorkerV2) processOneProbeChunk(probeChunk *chunk.Chunk, joinResult *hashjoinWorkerResult) (ok bool, waitTime int64, _ *hashjoinWorkerResult) {
	joinResult.err = w.JoinProbe.SetChunkForProbe(probeChunk)
	if joinResult.err != nil {
		return false, 0, joinResult
	}
	return w.probeAndSendResult(joinResult)
}

func (w *ProbeWorkerV2) probeAndSendResult(joinResult *hashjoinWorkerResult) (bool, int64, *hashjoinWorkerResult) {
	if w.HashJoinCtx.spillHelper.areAllPartitionsSpilled() {
		if intest.InTest && w.HashJoinCtx.spillHelper.hashJoinExec.inRestore {
			w.HashJoinCtx.spillHelper.skipProbeInRestoreForTest.Store(true)
		}
		return true, 0, joinResult
	}

	var ok bool
	waitTime := int64(0)
	for !w.JoinProbe.IsCurrentChunkProbeDone() {
		ok, joinResult = w.JoinProbe.Probe(joinResult, &w.HashJoinCtx.SessCtx.GetSessionVars().SQLKiller)
		if !ok || joinResult.err != nil {
			return ok, waitTime, joinResult
		}

		failpoint.Inject("processOneProbeChunkPanic", nil)
		if joinResult.chk.IsFull() {
			waitStart := time.Now()
			w.HashJoinCtx.joinResultCh <- joinResult
			ok, joinResult = w.getNewJoinResult()
			waitTime += int64(time.Since(waitStart))
			if !ok {
				return false, waitTime, joinResult
			}
		}
	}
	return true, waitTime, joinResult
}

func (w *ProbeWorkerV2) runJoinWorker(start time.Time) {
	probeTime := int64(0)
	if w.HashJoinCtx.stats != nil {
		defer func() {
			w.updateProbeStatistic(start, probeTime)
		}()
	}

	var probeSideResult *chunk.Chunk
	ok, joinResult := w.getNewJoinResult()
	if !ok {
		return
	}

	// Read and filter probeSideResult, and join the probeSideResult with the build side rows.
	emptyProbeSideResult := &probeChkResource{
		dest: w.probeResultCh,
	}
	for ok := true; ok; {
		select {
		case <-w.HashJoinCtx.closeCh:
			return
		case probeSideResult, ok = <-w.probeResultCh:
		}
		failpoint.Inject("ConsumeRandomPanic", nil)
		if !ok {
			break
		}

		err := triggerIntest(2)
		if err != nil {
			joinResult.err = err
			break
		}

		start := time.Now()
		waitTime := int64(0)
		ok, waitTime, joinResult = w.processOneProbeChunk(probeSideResult, joinResult)
		probeTime += int64(time.Since(start)) - waitTime
		if !ok {
			break
		}
		probeSideResult.Reset()
		emptyProbeSideResult.chk = probeSideResult

		// Give the chunk back to the probe fetcher
		w.probeChkResourceCh <- emptyProbeSideResult
	}

	err := w.JoinProbe.SpillRemainingProbeChunks()
	if err != nil {
		joinResult.err = err
	}

	if joinResult.err != nil || (joinResult.chk != nil && joinResult.chk.NumRows() > 0) {
		w.HashJoinCtx.joinResultCh <- joinResult
	} else if joinResult.chk != nil && joinResult.chk.NumRows() == 0 {
		w.joinChkResourceCh <- joinResult.chk
	}
}

func (w *ProbeWorkerV2) getNewJoinResult() (bool, *hashjoinWorkerResult) {
	joinResult := &hashjoinWorkerResult{
		src: w.joinChkResourceCh,
	}
	ok := true
	select {
	case <-w.HashJoinCtx.closeCh:
		ok = false
	case joinResult.chk, ok = <-w.joinChkResourceCh:
	}
	return ok, joinResult
}

func (e *HashJoinV2Exec) reset() {
	e.resetProbeStatus()
	e.releaseDisk()
	// set buildSuccess to false by default; it will be set to true if the build finishes successfully
	e.ProbeSideTupleFetcher.buildSuccess = false
	e.resetHashTableContextForRestore()
	e.spillHelper.setCanSpillFlag(true)
	if e.HashJoinCtxV2.stats != nil {
		e.HashJoinCtxV2.stats.resetCurrentRound()
	}
}

func (e *HashJoinV2Exec) collectSpillStats() {
	if e.stats == nil || !e.spillHelper.isSpillTriggered() {
		return
	}

	round := e.spillHelper.round
	if len(e.stats.spill.totalSpillBytesPerRound) < round+1 {
		e.stats.spill.totalSpillBytesPerRound = append(e.stats.spill.totalSpillBytesPerRound, 0)
		e.stats.spill.spillBuildRowTableBytesPerRound = append(e.stats.spill.spillBuildRowTableBytesPerRound, 0)
		e.stats.spill.spillBuildHashTableBytesPerRound = append(e.stats.spill.spillBuildHashTableBytesPerRound, 0)
		e.stats.spill.spilledPartitionNumPerRound = append(e.stats.spill.spilledPartitionNumPerRound, 0)
	}

	buildRowTableSpillBytes := e.spillHelper.getBuildSpillBytes()
	buildHashTableSpillBytes := getHashTableMemoryUsage(getHashTableLengthByRowLen(e.spillHelper.spilledValidRowNum.Load()))
	probeSpillBytes := e.spillHelper.getProbeSpillBytes()
	spilledPartitionNum := e.spillHelper.getSpilledPartitionsNum()

	e.stats.spill.spillBuildRowTableBytesPerRound[round] += buildRowTableSpillBytes
	e.stats.spill.spillBuildHashTableBytesPerRound[round] += buildHashTableSpillBytes
	e.stats.spill.totalSpillBytesPerRound[round] += buildRowTableSpillBytes + probeSpillBytes
	e.stats.spill.spilledPartitionNumPerRound[round] += spilledPartitionNum
}

func (e *HashJoinV2Exec) startBuildAndProbe(ctx context.Context) {
	defer func() {
		if r := recover(); r != nil {
			e.joinResultCh <- &hashjoinWorkerResult{err: util.GetRecoverError(r)}
		}
		close(e.joinResultCh)
	}()

	lastRound := 0
	for {
		if e.finished.Load() {
			return
		}

		e.buildFinished = make(chan error, 1)

		e.fetchAndBuildHashTable(ctx)
		e.fetchAndProbeHashTable(ctx)

		e.waiterWg.Wait()
		e.collectSpillStats()
		e.reset()

		e.spillHelper.spillRoundForTest = max(e.spillHelper.spillRoundForTest, lastRound)
		err := e.spillHelper.prepareForRestoring(lastRound)
		if err != nil {
			e.joinResultCh <- &hashjoinWorkerResult{err: err}
			return
		}

		restoredPartition := e.spillHelper.stack.pop()
		if restoredPartition == nil {
			// No more data to restore
			return
		}
		e.spillHelper.round = restoredPartition.round

		if e.memTracker.BytesConsumed() != 0 {
			e.isMemoryClearedForTest = false
		}

		lastRound = restoredPartition.round
		e.restoredBuildInDisk = restoredPartition.buildSideChunks
		e.restoredProbeInDisk = restoredPartition.probeSideChunks

		if e.stats != nil && e.stats.spill.round < lastRound {
			e.stats.spill.round = lastRound
		}

		e.inRestore = true
	}
}

func (e *HashJoinV2Exec) resetProbeStatus() {
	for _, probe := range e.ProbeWorkers {
		probe.JoinProbe.ResetProbe()
	}
}

func (e *HashJoinV2Exec) releaseDisk() {
	if e.restoredBuildInDisk != nil {
		for _, inDisk := range e.restoredBuildInDisk {
			inDisk.Close()
		}
		e.restoredBuildInDisk = nil
	}

	if e.restoredProbeInDisk != nil {
		for _, inDisk := range e.restoredProbeInDisk {
			inDisk.Close()
		}
		e.restoredProbeInDisk = nil
	}
}

// Next implements the Executor Next interface.
// Hash join constructs the result following these steps:
// step 1. fetch data from the build side child and build a hash table;
// step 2. fetch data from the probe child in a background goroutine and probe the hash table in multiple join workers.
func (e *HashJoinV2Exec) Next(ctx context.Context, req *chunk.Chunk) (err error) {
	if !e.prepared {
		e.initHashTableContext()
		e.initializeForProbe()
		e.spillHelper.setCanSpillFlag(true)
		e.buildFinished = make(chan error, 1)
		e.hashTableContext.memoryTracker.AttachTo(e.memTracker)
		go e.startBuildAndProbe(ctx)
		e.prepared = true
	}
	if e.ProbeSideTupleFetcher.shouldLimitProbeFetchSize() {
		atomic.StoreInt64(&e.ProbeSideTupleFetcher.requiredRows, int64(req.RequiredRows()))
	}
	req.Reset()

	result, ok := <-e.joinResultCh
	if !ok {
		return nil
	}
	if result.err != nil {
		e.finished.Store(true)
		return result.err
	}
	req.SwapColumns(result.chk)
	result.src <- result.chk
	return nil
}

func (e *HashJoinV2Exec) handleFetchAndBuildHashTablePanic(r any) {
	if r != nil {
		e.buildFinished <- util.GetRecoverError(r)
	}
	close(e.buildFinished)
}

// checkBalance checks whether the segment count of each partition is balanced.
func (e *HashJoinV2Exec) checkBalance(totalSegmentCnt int) bool {
	isBalanced := e.Concurrency == e.partitionNumber
	if !isBalanced {
		return false
	}
	avgSegCnt := totalSegmentCnt / int(e.partitionNumber)
	balanceThreshold := int(float64(avgSegCnt) * 0.8)
	subTables := e.HashJoinCtxV2.hashTableContext.hashTable.tables

	for _, subTable := range subTables {
		if math.Abs(float64(len(subTable.rowData.segments)-avgSegCnt)) > float64(balanceThreshold) {
			isBalanced = false
			break
		}
	}
	return isBalanced
}
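
// A worked example: with 4 partitions and totalSegmentCnt == 100, avgSegCnt is
// 25 and balanceThreshold is 20, so a partition holding 50 segments deviates by
// 25 (> 20) and the distribution is considered unbalanced; createTasks then
// falls back to segment-granular tasks instead of one task per partition.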

func (e *HashJoinV2Exec) createTasks(buildTaskCh chan<- *buildTask, totalSegmentCnt int, doneCh chan struct{}) {
	isBalanced := e.checkBalance(totalSegmentCnt)
	segStep := max(1, totalSegmentCnt/int(e.Concurrency))
	subTables := e.HashJoinCtxV2.hashTableContext.hashTable.tables
	createBuildTask := func(partIdx int, segStartIdx int, segEndIdx int) *buildTask {
		return &buildTask{partitionIdx: partIdx, segStartIdx: segStartIdx, segEndIdx: segEndIdx}
	}
	failpoint.Inject("createTasksPanic", nil)

	if isBalanced {
		for partIdx, subTable := range subTables {
			_ = triggerIntest(5)
			segmentsLen := len(subTable.rowData.segments)
			select {
			case <-doneCh:
				return
			case buildTaskCh <- createBuildTask(partIdx, 0, segmentsLen):
			}
		}
		return
	}

	partitionStartIndex := make([]int, len(subTables))
	partitionSegmentLength := make([]int, len(subTables))
	for i := range subTables {
		partitionStartIndex[i] = 0
		partitionSegmentLength[i] = len(subTables[i].rowData.segments)
	}

	for {
		hasNewTask := false
		for partIdx := range subTables {
			// Create tasks by round-robining over all the partitions, so that build
			// threads are likely to build different partitions at the same time
			if partitionStartIndex[partIdx] < partitionSegmentLength[partIdx] {
				startIndex := partitionStartIndex[partIdx]
				endIndex := min(startIndex+segStep, partitionSegmentLength[partIdx])
				select {
				case <-doneCh:
					return
				case buildTaskCh <- createBuildTask(partIdx, startIndex, endIndex):
				}
				partitionStartIndex[partIdx] = endIndex
				hasNewTask = true
			}
		}
		if !hasNewTask {
			break
		}
	}
}

func (e *HashJoinV2Exec) fetchAndBuildHashTable(ctx context.Context) {
	e.workerWg.RunWithRecover(func() {
		defer trace.StartRegion(ctx, "HashJoinHashTableBuilder").End()
		e.fetchAndBuildHashTableImpl(ctx)
	}, e.handleFetchAndBuildHashTablePanic)
}

func (e *HashJoinV2Exec) fetchAndBuildHashTableImpl(ctx context.Context) {
	if e.stats != nil {
		start := time.Now()
		defer func() {
			e.stats.fetchAndBuildHashTable += int64(time.Since(start))
		}()
	}

	waitJobDone := func(wg *sync.WaitGroup, errCh chan error) bool {
		wg.Wait()
		close(errCh)
		if err := <-errCh; err != nil {
			e.buildFinished <- err
			return false
		}
		return true
	}

	// fetcherAndWorkerSyncer lets the fetcher know when workers have finished
	// their work. It's useful when spill is triggered.
	fetcherAndWorkerSyncer := &sync.WaitGroup{}
	wg := new(sync.WaitGroup)
	errCh := make(chan error, 1+e.Concurrency)

	// doneCh is used by the consumer (splitAndAppendToRowTable) to inform the producer
	// (fetchBuildSideRows) that the consumer has met an error and stopped consuming data
	doneCh := make(chan struct{}, e.Concurrency)
	// init builders; TODO: maybe the builder can be reused during the whole life cycle of the executor
	hashJoinCtx := e.HashJoinCtxV2
	for _, worker := range e.BuildWorkers {
		worker.builder = createRowTableBuilder(worker.BuildKeyColIdx, hashJoinCtx.BuildKeyTypes, hashJoinCtx.partitionNumber, worker.HasNullableKey, hashJoinCtx.BuildFilter != nil, hashJoinCtx.needScanRowTableAfterProbeDone, hashJoinCtx.hashTableMeta.nullMapLength)
	}
	srcChkCh := e.fetchBuildSideRows(ctx, fetcherAndWorkerSyncer, wg, errCh, doneCh)
	e.splitAndAppendToRowTable(srcChkCh, fetcherAndWorkerSyncer, wg, errCh, doneCh)
	success := waitJobDone(wg, errCh)
	if !success {
		return
	}

	if e.spillHelper.spillTriggered {
		e.spillHelper.spillTriggedInBuildingStageForTest = true
	}

	totalSegmentCnt, err := e.hashTableContext.mergeRowTablesToHashTable(e.partitionNumber, e.spillHelper)
	if err != nil {
		e.buildFinished <- err
		return
	}

	wg = new(sync.WaitGroup)
	errCh = make(chan error, 1+e.Concurrency)
	// doneCh is used by the consumer (buildHashTable) to inform the producer
	// (createBuildTasks) that the consumer has met an error and stopped consuming data
	doneCh = make(chan struct{}, e.Concurrency)

	buildTaskCh := e.createBuildTasks(totalSegmentCnt, wg, errCh, doneCh)
	e.buildHashTable(buildTaskCh, wg, errCh, doneCh)
	waitJobDone(wg, errCh)
}

func (e *HashJoinV2Exec) fetchBuildSideRows(ctx context.Context, fetcherAndWorkerSyncer *sync.WaitGroup, wg *sync.WaitGroup, errCh chan error, doneCh chan struct{}) chan *chunk.Chunk {
	srcChkCh := make(chan *chunk.Chunk, 1)

	wg.Add(1)
	e.workerWg.RunWithRecover(
		func() {
			defer trace.StartRegion(ctx, "HashJoinBuildSideFetcher").End()
			if e.inRestore {
				chunkNum := e.getRestoredBuildChunkNum()
				e.controlWorkersForRestore(chunkNum, srcChkCh, fetcherAndWorkerSyncer, errCh, doneCh)
			} else {
				fetcher := e.BuildWorkers[0]
				fetcher.fetchBuildSideRows(ctx, &fetcher.HashJoinCtx.hashJoinCtxBase, fetcherAndWorkerSyncer, e.spillHelper, srcChkCh, errCh, doneCh)
			}
		},
		func(r any) {
			if r != nil {
				errCh <- util.GetRecoverError(r)
			}
			wg.Done()
		},
	)
	return srcChkCh
}

func (e *HashJoinV2Exec) getRestoredBuildChunkNum() int {
	chunkNum := 0
	for _, inDisk := range e.restoredBuildInDisk {
		chunkNum += inDisk.NumChunks()
	}
	return chunkNum
}

func (e *HashJoinV2Exec) controlWorkersForRestore(chunkNum int, syncCh chan *chunk.Chunk, fetcherAndWorkerSyncer *sync.WaitGroup, errCh chan<- error, doneCh <-chan struct{}) {
	defer func() {
		close(syncCh)

		hasError := false
		if r := recover(); r != nil {
			errCh <- util.GetRecoverError(r)
			hasError = true
		}

		fetcherAndWorkerSyncer.Wait()

		// Spill remaining rows
		if !hasError && e.spillHelper.isSpillTriggered() {
			err := e.spillHelper.spillRemainingRows()
			if err != nil {
				errCh <- err
			}
		}
	}()

	for range chunkNum {
		if e.finished.Load() {
			return
		}

		err := checkAndSpillRowTableIfNeeded(fetcherAndWorkerSyncer, e.spillHelper)
		if err != nil {
			errCh <- err
			return
		}

		err = triggerIntest(2)
		if err != nil {
			errCh <- err
			return
		}

		fetcherAndWorkerSyncer.Add(1)
		select {
		case <-doneCh:
			fetcherAndWorkerSyncer.Done()
			return
		case <-e.hashJoinCtxBase.closeCh:
			fetcherAndWorkerSyncer.Done()
			return
		case syncCh <- nil:
		}
	}
}

func handleErr(err error, errCh chan error, doneCh chan struct{}) {
	errCh <- err
	doneCh <- struct{}{}
}

func (e *HashJoinV2Exec) splitAndAppendToRowTable(srcChkCh chan *chunk.Chunk, fetcherAndWorkerSyncer *sync.WaitGroup, wg *sync.WaitGroup, errCh chan error, doneCh chan struct{}) {
	wg.Add(int(e.Concurrency))
	for i := range e.Concurrency {
		workIndex := i
		e.workerWg.RunWithRecover(
			func() {
				if e.inRestore {
					e.BuildWorkers[workIndex].splitPartitionAndAppendToRowTableForRestore(e.restoredBuildInDisk[workIndex], srcChkCh, fetcherAndWorkerSyncer, errCh, doneCh)
				} else {
					e.BuildWorkers[workIndex].splitPartitionAndAppendToRowTable(e.SessCtx.GetSessionVars().StmtCtx.TypeCtx(), fetcherAndWorkerSyncer, srcChkCh, errCh, doneCh)
				}
			},
			func(r any) {
				if r != nil {
					errCh <- util.GetRecoverError(r)
					doneCh <- struct{}{}
				}
				wg.Done()
			},
		)
	}
}

func (e *HashJoinV2Exec) createBuildTasks(totalSegmentCnt int, wg *sync.WaitGroup, errCh chan error, doneCh chan struct{}) chan *buildTask {
	buildTaskCh := make(chan *buildTask, e.Concurrency)
	wg.Add(1)
	e.workerWg.RunWithRecover(
		func() { e.createTasks(buildTaskCh, totalSegmentCnt, doneCh) },
		func(r any) {
			if r != nil {
				errCh <- util.GetRecoverError(r)
			}
			close(buildTaskCh)
			wg.Done()
		},
	)
	return buildTaskCh
}

func (e *HashJoinV2Exec) buildHashTable(buildTaskCh chan *buildTask, wg *sync.WaitGroup, errCh chan error, doneCh chan struct{}) {
	for i := range e.Concurrency {
		wg.Add(1)
		workID := i
		e.workerWg.RunWithRecover(
			func() {
				err := e.BuildWorkers[workID].buildHashTable(buildTaskCh)
				if err != nil {
					errCh <- err
					doneCh <- struct{}{}
				}
			},
			func(r any) {
				if r != nil {
					errCh <- util.GetRecoverError(r)
					doneCh <- struct{}{}
				}
				wg.Done()
			},
		)
	}
}

type buildTask struct {
	partitionIdx int
	segStartIdx  int
	segEndIdx    int
}

func generatePartitionIndex(hashValue uint64, partitionMaskOffset int) uint64 {
	return hashValue >> uint64(partitionMaskOffset)
}
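
// generatePartitionIndex pairs with getPartitionMaskOffset defined earlier:
// e.g. with partitionMaskOffset == 61 (partitionNumber == 8), hash value
// 0xE000000000000000 maps to partition 0xE000000000000000 >> 61 == 7.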

func getProbeSpillChunkFieldTypes(probeFieldTypes []*types.FieldType) []*types.FieldType {
	ret := make([]*types.FieldType, 0, len(probeFieldTypes)+2)
	hashValueField := types.NewFieldType(mysql.TypeLonglong)
	hashValueField.AddFlag(mysql.UnsignedFlag)
	ret = append(ret, hashValueField)                    // hash value
	ret = append(ret, types.NewFieldType(mysql.TypeBit)) // serialized key
	ret = append(ret, probeFieldTypes...)                // row data
	return ret
}
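
// For example, if the probe schema is (a BIGINT, b VARCHAR), each spilled probe
// row is laid out as [hash value (unsigned BIGINT), serialized key (BIT), a, b].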

func rehash(oldHashValue uint64, rehashBuf []byte, hash hash.Hash64) uint64 {
	*(*uint64)(unsafe.Pointer(&rehashBuf[0])) = oldHashValue

	hash.Reset()
	hash.Write(rehashBuf)
	return hash.Sum64()
}
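
// The unsafe store above writes the 8 bytes of oldHashValue into rehashBuf[0:8]
// in native byte order (so rehashBuf is assumed to be at least 8 bytes long)
// before feeding the buffer to the hasher; it is roughly equivalent to
// binary.NativeEndian.PutUint64(rehashBuf, oldHashValue) without the bounds
// check.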

func issue59377Intest(err *error) {
	failpoint.Inject("Issue59377", func() {
		*err = errors.New("Random failpoint error is triggered")
	})
}

func triggerIntest(errProbability int) error {
	failpoint.Inject("slowWorkers", func(val failpoint.Value) {
		if val.(bool) {
			num := rand.Intn(100000)
			if num < 2 {
				time.Sleep(time.Duration(num) * time.Millisecond)
			}
		}
	})

	var err error
	failpoint.Inject("panicOrError", func(val failpoint.Value) {
		if val.(bool) {
			num := rand.Intn(100000)
			if num < errProbability/2 {
				panic("Random failpoint panic")
			} else if num < errProbability {
				err = errors.New("Random failpoint error is triggered")
			}
		}
	})

	return err
}
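
// For example, with the "panicOrError" failpoint enabled, triggerIntest(2)
// draws a number in [0, 100000): 0 triggers a panic and 1 returns an error,
// i.e. roughly a 0.001% chance of each per call.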