// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package executor

import (
	"context"
	"math"
	"sync"
	"time"
	"unsafe"

	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/util"
	"github.com/pingcap/tidb/util/chunk"
	"github.com/pingcap/tidb/util/mvmap"
	log "github.com/sirupsen/logrus"
	"github.com/spaolacci/murmur3"
)

var (
	_ Executor = &RadixHashJoinExec{}
)

// RadixHashJoinExec implements the radix partition-based hash join algorithm.
// It partitions the input relations into pairs of small partitions, where one
// partition of each pair typically fits into one of the CPU caches. The
// overall goal of this method is to minimize the number of cache misses when
// building and probing the hash tables.
type RadixHashJoinExec struct {
	*HashJoinExec

	// radixBits is the bitmask used for radix partitioning: its lowest
	// radixBitsNum bits are set. The inner relation is split into
	// 2^radixBitsNum sub-relations before the hash tables are built. If the
	// complete inner relation can already be held in the L2 cache, radixBits
	// will be 1 and the partition phase can be skipped.
	// Note: we actually check `size of inner sub-relation < 3/4 * L2 cache
	// size` so that one inner sub-relation, its hash table, one outer
	// sub-relation and the join result of the two sub-relations can all be
	// kept in the L2 cache. `3/4` is a magic number; it may be adjusted after
	// benchmarking.
	radixBits       uint32
	innerParts      []partition
	numNonEmptyPart int
	// innerRowPrts records, for every row in innerResult, its position in the
	// corresponding partition.
	innerRowPrts [][]partRowPtr
	// hashTables stores the hash tables built from the inner relation. If
	// there is no partition phase, a single global hash table is stored in
	// hashTables[0].
	hashTables []*mvmap.MVMap
}

// partition stores a sub-relation of the inner or outer relation produced by
// the partition phase. Every partition can be fully kept in the L2 cache,
// which reduces the cache miss ratio when building and probing the hash table.
type partition = *chunk.Chunk

// partRowPtr stores the position of a row inside `innerParts` or `outerParts`
// after partitioning.
type partRowPtr struct {
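	// partitionIdx is the index of the partition the row is assigned to.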
	partitionIdx uint32
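	// rowIdx is the row's offset inside that partition.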
	rowIdx uint32
}

// partPtr4NullKey is the partition pointer assigned to rows whose join key
// contains NULL; such rows are never copied into a partition.
var partPtr4NullKey = partRowPtr{math.MaxUint32, math.MaxUint32}

// Next implements the Executor Next interface.
// Radix hash join constructs the result in the following steps:
// step 1. fetch data from the inner child.
// step 2. partition the inner relation into sub-relations in parallel and
// build an individual hash table for every partition.
// step 3. fetch data from the outer child in a background goroutine and
// partition it into sub-relations.
// step 4. probe the corresponding sub-hash-table for every outer sub-relation
// in multiple join workers.
func (e *RadixHashJoinExec) Next(ctx context.Context, req *chunk.RecordBatch) (err error) {
	if e.runtimeStats != nil {
		start := time.Now()
		defer func() { e.runtimeStats.Record(time.Since(start), req.NumRows()) }()
	}
	if !e.prepared {
		e.innerFinished = make(chan error, 1)
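		// Fetch, partition and index the inner relation in a background
		// goroutine; a recovered panic is passed to
		// e.handleFetchInnerAndBuildHashTablePanic.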
		go util.WithRecovery(func() { e.partitionInnerAndBuildHashTables(ctx) }, e.handleFetchInnerAndBuildHashTablePanic)
		// TODO: parallel fetch outer rows, partition them and do parallel join
		e.prepared = true
	}
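	// Block until the inner phase (fetch, partition, build) has finished; a
	// non-nil value reports the error that stopped it.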
	return <-e.innerFinished
}

// partitionInnerRows re-orders e.innerResult into sub-relations.
func (e *RadixHashJoinExec) partitionInnerRows() error {
	e.evalRadixBit()
	if err := e.preAlloc4InnerParts(); err != nil {
		return err
	}

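	// The deferred Wait makes sure this function only returns after every
	// partition worker has finished, so the hash table build phase never sees
	// a half-filled partition.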
	wg := sync.WaitGroup{}
	defer wg.Wait()
	wg.Add(int(e.concurrency))
	for i := 0; i < int(e.concurrency); i++ {
		workerID := i
		go util.WithRecovery(func() {
			defer wg.Done()
			e.doInnerPartition(workerID)
		}, e.handlePartitionPanic)
	}
	return nil
}

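// handlePartitionPanic turns a panic raised in a partition worker into an
// error on joinResultCh.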
func (e *RadixHashJoinExec) handlePartitionPanic(r interface{}) {
	if r != nil {
		e.joinResultCh <- &hashjoinWorkerResult{err: errors.Errorf("%v", r)}
	}
}

// doInnerPartition runs concurrently: it partitions and copies the inner
// relation into several pre-allocated partitions. The inner Chunk indexes
// handled by each partitioner are workerID + x*e.concurrency.
func (e *RadixHashJoinExec) doInnerPartition(workerID int) {
	chkIdx, chkNum := workerID, e.innerResult.NumChunks()
	for ; chkIdx < chkNum; chkIdx += int(e.concurrency) {
		if e.finished.Load().(bool) {
			return
		}
		chk := e.innerResult.GetChunk(chkIdx)
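		// Copy each row into the slot reserved for it by preAlloc4InnerParts;
		// rows with a NULL join key never got a slot and are skipped.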
		for srcRowIdx, partPtr := range e.innerRowPrts[chkIdx] {
			if partPtr == partPtr4NullKey {
				continue
			}
			partIdx, destRowIdx := partPtr.partitionIdx, partPtr.rowIdx
			part := e.innerParts[partIdx]
			part.Insert(int(destRowIdx), chk.GetRow(srcRowIdx))
		}
	}
}

// preAlloc4InnerParts computes a partRowPtr for every inner row and
// pre-allocates the memory space needed to re-order the inner relation.
// TODO: evaluate the skewness of the partition sizes; if the skewness exceeds
// a threshold, skip the partition phase.
func (e *RadixHashJoinExec) preAlloc4InnerParts() (err error) {
	var hasNull bool
	keyBuf := make([]byte, 0, 64)
	for chkIdx, chkNum := 0, e.innerResult.NumChunks(); chkIdx < chkNum; chkIdx++ {
		chk := e.innerResult.GetChunk(chkIdx)
		partPtrs := make([]partRowPtr, chk.NumRows())
		for rowIdx := 0; rowIdx < chk.NumRows(); rowIdx++ {
			row := chk.GetRow(rowIdx)
			hasNull, keyBuf, err = e.getJoinKeyFromChkRow(false, row, keyBuf)
			if err != nil {
				return err
			}
			if hasNull {
				partPtrs[rowIdx] = partPtr4NullKey
				continue
			}
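			// Hash the join key, pick the partition from the low bits of the
			// hash, and reserve a slot in it; the row data itself is copied
			// later by doInnerPartition.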
			joinHash := murmur3.Sum32(keyBuf)
			partIdx := e.radixBits & joinHash
			partPtrs[rowIdx].partitionIdx = partIdx
			partPtrs[rowIdx].rowIdx = e.getPartition(partIdx).PreAlloc(row)
		}
		e.innerRowPrts = append(e.innerRowPrts, partPtrs)
	}
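	// Report how many partitions stayed empty; a high empty ratio suggests
	// the join keys are unevenly distributed across partitions.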
	if e.numNonEmptyPart < len(e.innerParts) {
		numTotalPart := len(e.innerParts)
		numEmptyPart := numTotalPart - e.numNonEmptyPart
		log.Debugf("[EMPTY_PART_IN_RADIX_HASH_JOIN] txn_start_ts:%v, num_empty_parts:%v, "+
			"num_total_parts:%v, empty_ratio:%v", e.ctx.GetSessionVars().TxnCtx.StartTS,
			numEmptyPart, numTotalPart, float64(numEmptyPart)/float64(numTotalPart))
	}
	return
}

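// getPartition returns the partition for idx, lazily allocating its backing
// Chunk on first use and counting how many partitions are non-empty.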
func (e *RadixHashJoinExec) getPartition(idx uint32) partition {
	if e.innerParts[idx] == nil {
		e.numNonEmptyPart++
		e.innerParts[idx] = chunk.New(e.innerExec.retTypes(), e.initCap, e.maxChunkSize)
	}
	return e.innerParts[idx]
}

// evalRadixBit evaluates the number of radix bits.
// To ensure that one partition of the inner relation, its hash table, one
// partition of the outer relation and the join result of these two partitions
// fit into the L2 cache when the input data follows a uniform distribution, we
// let every sub-partition of the inner relation use at most three quarters of
// the L2 cache size.
func (e *RadixHashJoinExec) evalRadixBit() {
	sv := e.ctx.GetSessionVars()
	innerResultSize := float64(e.innerResult.GetMemTracker().BytesConsumed())
	l2CacheSize := float64(sv.L2CacheSize) * 3 / 4
	radixBitsNum := math.Ceil(math.Log2(innerResultSize / l2CacheSize))
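	// For example, with a 256 KiB L2 cache and a 4 MiB inner relation this is
	// ceil(log2(4 MiB / 192 KiB)) = 5, i.e. 32 partitions and mask 0x1f.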
	if radixBitsNum <= 0 {
		radixBitsNum = 1
	}
	// Take the rightmost radixBitsNum bits as the bitmask.
	e.radixBits = ^(math.MaxUint32 << uint(radixBitsNum))
	e.innerParts = make([]partition, 1<<uint(radixBitsNum))
}

// partitionInnerAndBuildHashTables fetches all the inner rows into memory,
// partitions them into sub-relations, and builds an individual hash table for
// every sub-relation.
func (e *RadixHashJoinExec) partitionInnerAndBuildHashTables(ctx context.Context) {
	if err := e.fetchInnerRows(ctx); err != nil {
		e.innerFinished <- err
		return
	}

	if err := e.partitionInnerRows(); err != nil {
		e.innerFinished <- err
		return
	}
	if err := e.buildHashTable4Partitions(); err != nil {
		e.innerFinished <- err
	}
}

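// wait4BuildHashTable closes finishedCh once every build worker is done, so
// that buildHashTable4Partitions returns nil when no worker reported an error.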
func (e *RadixHashJoinExec) wait4BuildHashTable(wg *sync.WaitGroup, finishedCh chan error) {
	wg.Wait()
	close(finishedCh)
}

func (e *RadixHashJoinExec) buildHashTable4Partitions() error {
	e.hashTables = make([]*mvmap.MVMap, len(e.innerParts))
	buildFinishedCh := make(chan error, e.concurrency)
	wg := &sync.WaitGroup{}
	wg.Add(int(e.concurrency))
	go e.wait4BuildHashTable(wg, buildFinishedCh)
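	// Worker i builds the hash tables for the non-empty partitions
	// i, i+concurrency, i+2*concurrency, ...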
	for i := 0; i < int(e.concurrency); i++ {
		workerID := i
		go util.WithRecovery(func() {
			defer wg.Done()
			e.doBuild(workerID, buildFinishedCh)
		}, nil)
	}
	return <-buildFinishedCh
}

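// doBuild builds a hash table for every non-empty partition assigned to this
// worker, mapping the join key of each row to its row index in the partition.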
func (e *RadixHashJoinExec) doBuild(workerID int, finishedCh chan error) {
	var err error
	keyBuf, valBuf := make([]byte, 0, 64), make([]byte, 4)
	for i := workerID; i < len(e.innerParts); i += int(e.concurrency) {
		if e.innerParts[i] == nil {
			continue
		}
		e.hashTables[i] = mvmap.NewMVMap()
		keyBuf = keyBuf[:0]
		for rowIdx, numRows := 0, e.innerParts[i].NumRows(); rowIdx < numRows; rowIdx++ {
			// The join key is guaranteed to be NOT NULL within a partition
			// (see `partPtr4NullKey`), so we do not check it here.
			_, keyBuf, err = e.getJoinKeyFromChkRow(false, e.innerParts[i].GetRow(rowIdx), keyBuf)
			if err != nil {
				e.finished.Store(true)
				finishedCh <- err
				return
			}
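			// Encode rowIdx into valBuf in native byte order without
			// allocating a new value buffer for every row.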
			*(*uint32)(unsafe.Pointer(&valBuf[0])) = uint32(rowIdx)
			e.hashTables[i].Put(keyBuf, valBuf)
		}
	}
}