tidb/executor/benchmark_test.go
// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package executor
import (
"context"
"fmt"
"math/rand"
"sort"
"strings"
"sync"
"testing"
"github.com/pingcap/log"
"github.com/pingcap/parser/ast"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/tidb/expression"
"github.com/pingcap/tidb/expression/aggregation"
"github.com/pingcap/tidb/planner/core"
"github.com/pingcap/tidb/planner/property"
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/memory"
"github.com/pingcap/tidb/util/mock"
"github.com/pingcap/tidb/util/stringutil"
"go.uber.org/zap/zapcore"
)
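// This file contains microbenchmarks for the aggregation, window, hash join
// and index join executors. Every benchmark feeds the executor under test
// from a mockDataSource rather than a real table reader, so only the
// executor's own Open/Next/Close path is measured. A typical invocation
// (assuming the standard Go toolchain and that this package lives under
// ./executor) is:
//
//	go test -run=^$ -bench=BenchmarkAggRows -benchmem ./executor
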
var (
_ Executor = &mockDataSource{}
)
type mockDataSourceParameters struct {
schema *expression.Schema
genDataFunc func(row int, typ *types.FieldType) interface{}
ndvs []int // number of distinct values in columns[i]; zero means no limit (fully random values)
orders []bool // columns[i] should be ordered if orders[i] is true
rows int // number of rows the DataSource should output
ctx sessionctx.Context
}
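// mockDataSource is an in-memory Executor used as the child of the executors
// under test. Its rows are generated once by buildMockDataSource and kept in
// genData; prepareChunks copies genData into chunks so the same data can be
// consumed again on every benchmark iteration.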
type mockDataSource struct {
baseExecutor
p mockDataSourceParameters
genData []*chunk.Chunk
chunks []*chunk.Chunk
chunkPtr int
}
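// genColDatums produces the values of column col for all rows. It prefers a
// user-supplied genDataFunc; otherwise it draws random datums, restricted to
// NDV distinct values when ndvs[col] is non-zero, and finally sorts the
// result if orders[col] is set.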
func (mds *mockDataSource) genColDatums(col int) (results []interface{}) {
typ := mds.retFieldTypes[col]
order := false
if col < len(mds.p.orders) {
order = mds.p.orders[col]
}
rows := mds.p.rows
NDV := 0
if col < len(mds.p.ndvs) {
NDV = mds.p.ndvs[col]
}
results = make([]interface{}, 0, rows)
if mds.p.genDataFunc != nil {
for i := 0; i < rows; i++ {
results = append(results, mds.p.genDataFunc(i, typ))
}
} else if NDV == 0 {
for i := 0; i < rows; i++ {
results = append(results, mds.randDatum(typ))
}
} else {
datumSet := make(map[string]bool, NDV)
datums := make([]interface{}, 0, NDV)
for len(datums) < NDV {
d := mds.randDatum(typ)
str := fmt.Sprintf("%v", d)
if datumSet[str] {
continue
}
datumSet[str] = true
datums = append(datums, d)
}
for i := 0; i < rows; i++ {
results = append(results, datums[rand.Intn(NDV)])
}
}
if order {
sort.Slice(results, func(i, j int) bool {
switch typ.Tp {
case mysql.TypeLong, mysql.TypeLonglong:
return results[i].(int64) < results[j].(int64)
case mysql.TypeDouble:
return results[i].(float64) < results[j].(float64)
case mysql.TypeVarString:
return results[i].(string) < results[j].(string)
default:
panic("not implement")
}
})
}
return
}
func (mds *mockDataSource) randDatum(typ *types.FieldType) interface{} {
switch typ.Tp {
case mysql.TypeLong, mysql.TypeLonglong:
return int64(rand.Int())
case mysql.TypeDouble:
return rand.Float64()
case mysql.TypeVarString:
return rawData
default:
panic("not implement")
}
}
func (mds *mockDataSource) prepareChunks() {
mds.chunks = make([]*chunk.Chunk, len(mds.genData))
for i := range mds.chunks {
mds.chunks[i] = mds.genData[i].CopyConstruct()
}
mds.chunkPtr = 0
}
func (mds *mockDataSource) Next(ctx context.Context, req *chunk.Chunk) error {
if mds.chunkPtr >= len(mds.chunks) {
req.Reset()
return nil
}
dataChk := mds.chunks[mds.chunkPtr]
dataChk.SwapColumns(req)
mds.chunkPtr++
return nil
}
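// buildMockDataSource materializes all rows described by opt into chunks of
// at most maxChunkSize rows, so no data generation work happens while a
// benchmark timer is running.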
func buildMockDataSource(opt mockDataSourceParameters) *mockDataSource {
baseExec := newBaseExecutor(opt.ctx, opt.schema, nil)
m := &mockDataSource{baseExec, opt, nil, nil, 0}
types := retTypes(m)
colData := make([][]interface{}, len(types))
for i := 0; i < len(types); i++ {
colData[i] = m.genColDatums(i)
}
m.genData = make([]*chunk.Chunk, (m.p.rows+m.maxChunkSize-1)/m.maxChunkSize)
for i := range m.genData {
m.genData[i] = chunk.NewChunkWithCapacity(retTypes(m), m.maxChunkSize)
}
for i := 0; i < m.p.rows; i++ {
idx := i / m.maxChunkSize
retTypes := retTypes(m)
for colIdx := 0; colIdx < len(types); colIdx++ {
switch retTypes[colIdx].Tp {
case mysql.TypeLong, mysql.TypeLonglong:
m.genData[idx].AppendInt64(colIdx, colData[colIdx][i].(int64))
case mysql.TypeDouble:
m.genData[idx].AppendFloat64(colIdx, colData[colIdx][i].(float64))
case mysql.TypeVarString:
m.genData[idx].AppendString(colIdx, colData[colIdx][i].(string))
default:
panic("not implement")
}
}
}
return m
}
func buildMockDataSourceWithIndex(opt mockDataSourceParameters, index []int) *mockDataSource {
opt.orders = make([]bool, len(opt.schema.Columns))
for _, idx := range index {
opt.orders[idx] = true
}
return buildMockDataSource(opt)
}
type aggTestCase struct {
// The test table's schema is fixed (aggCol Double, groupBy LongLong).
execType string // "hash" or "stream"
aggFunc string // sum, avg, count, etc.
groupByNDV int // the number of distinct group-by keys
hasDistinct bool
rows int
concurrency int
ctx sessionctx.Context
}
func (a aggTestCase) columns() []*expression.Column {
return []*expression.Column{
{Index: 0, RetType: types.NewFieldType(mysql.TypeDouble)},
{Index: 1, RetType: types.NewFieldType(mysql.TypeLonglong)},
}
}
func (a aggTestCase) String() string {
return fmt.Sprintf("(execType:%v, aggFunc:%v, ndv:%v, hasDistinct:%v, rows:%v, concurrency:%v)",
a.execType, a.aggFunc, a.groupByNDV, a.hasDistinct, a.rows, a.concurrency)
}
func defaultAggTestCase(exec string) *aggTestCase {
ctx := mock.NewContext()
ctx.GetSessionVars().InitChunkSize = variable.DefInitChunkSize
ctx.GetSessionVars().MaxChunkSize = variable.DefMaxChunkSize
return &aggTestCase{exec, ast.AggFuncSum, 1000, false, 10000000, 4, ctx}
}
func buildHashAggExecutor(ctx sessionctx.Context, src Executor, schema *expression.Schema,
aggFuncs []*aggregation.AggFuncDesc, groupItems []expression.Expression) Executor {
plan := new(core.PhysicalHashAgg)
plan.AggFuncs = aggFuncs
plan.GroupByItems = groupItems
plan.SetSchema(schema)
plan.Init(ctx, nil, 0)
plan.SetChildren(nil)
b := newExecutorBuilder(ctx, nil)
exec := b.build(plan)
hashAgg := exec.(*HashAggExec)
hashAgg.children[0] = src
return exec
}
func buildStreamAggExecutor(ctx sessionctx.Context, src Executor, schema *expression.Schema,
aggFuncs []*aggregation.AggFuncDesc, groupItems []expression.Expression) Executor {
plan := new(core.PhysicalStreamAgg)
plan.AggFuncs = aggFuncs
plan.GroupByItems = groupItems
plan.SetSchema(schema)
plan.Init(ctx, nil, 0)
plan.SetChildren(nil)
b := newExecutorBuilder(ctx, nil)
exec := b.build(plan)
streamAgg := exec.(*StreamAggExec)
streamAgg.children[0] = src
return exec
}
func buildAggExecutor(b *testing.B, testCase *aggTestCase, child Executor) Executor {
ctx := testCase.ctx
if err := ctx.GetSessionVars().SetSystemVar(variable.TiDBHashAggFinalConcurrency, fmt.Sprintf("%v", testCase.concurrency)); err != nil {
b.Fatal(err)
}
if err := ctx.GetSessionVars().SetSystemVar(variable.TiDBHashAggPartialConcurrency, fmt.Sprintf("%v", testCase.concurrency)); err != nil {
b.Fatal(err)
}
childCols := testCase.columns()
schema := expression.NewSchema(childCols...)
groupBy := []expression.Expression{childCols[1]}
aggFunc, err := aggregation.NewAggFuncDesc(testCase.ctx, testCase.aggFunc, []expression.Expression{childCols[0]}, testCase.hasDistinct)
if err != nil {
b.Fatal(err)
}
aggFuncs := []*aggregation.AggFuncDesc{aggFunc}
var aggExec Executor
switch testCase.execType {
case "hash":
aggExec = buildHashAggExecutor(testCase.ctx, child, schema, aggFuncs, groupBy)
case "stream":
aggExec = buildStreamAggExecutor(testCase.ctx, child, schema, aggFuncs, groupBy)
default:
b.Fatal("not implement")
}
return aggExec
}
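// benchmarkAggExecWithCase drives a full Open/Next/Close cycle of the
// aggregation executor. Building the executor and resetting the mock data
// source happen between StopTimer and StartTimer, so only the executor's own
// work is included in the reported time.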
func benchmarkAggExecWithCase(b *testing.B, casTest *aggTestCase) {
cols := casTest.columns()
orders := []bool{false, casTest.execType == "stream"}
dataSource := buildMockDataSource(mockDataSourceParameters{
schema: expression.NewSchema(cols...),
ndvs: []int{0, casTest.groupByNDV},
orders: orders,
rows: casTest.rows,
ctx: casTest.ctx,
})
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer() // prepare a new agg-executor
aggExec := buildAggExecutor(b, casTest, dataSource)
tmpCtx := context.Background()
chk := newFirstChunk(aggExec)
dataSource.prepareChunks()
b.StartTimer()
if err := aggExec.Open(tmpCtx); err != nil {
b.Fatal(err)
}
for {
if err := aggExec.Next(tmpCtx, chk); err != nil {
b.Fatal(err)
}
if chk.NumRows() == 0 {
break
}
}
if err := aggExec.Close(); err != nil {
b.Fatal(err)
}
b.StopTimer()
}
}
func BenchmarkAggRows(b *testing.B) {
rows := []int{100000, 1000000, 10000000}
concurrencies := []int{1, 4, 8, 15, 20, 30, 40}
for _, row := range rows {
for _, con := range concurrencies {
for _, exec := range []string{"hash", "stream"} {
if exec == "stream" && con > 1 {
continue
}
cas := defaultAggTestCase(exec)
cas.rows = row
cas.concurrency = con
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkAggExecWithCase(b, cas)
})
}
}
}
}
func BenchmarkAggGroupByNDV(b *testing.B) {
NDVs := []int{10, 100, 1000, 10000, 100000, 1000000, 10000000}
for _, NDV := range NDVs {
for _, exec := range []string{"hash", "stream"} {
cas := defaultAggTestCase(exec)
cas.groupByNDV = NDV
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkAggExecWithCase(b, cas)
})
}
}
}
func BenchmarkAggConcurrency(b *testing.B) {
concs := []int{1, 4, 8, 15, 20, 30, 40}
for _, con := range concs {
for _, exec := range []string{"hash", "stream"} {
if exec == "stream" && con > 1 {
continue
}
cas := defaultAggTestCase(exec)
cas.concurrency = con
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkAggExecWithCase(b, cas)
})
}
}
}
func BenchmarkAggDistinct(b *testing.B) {
rows := []int{100000, 1000000, 10000000}
distincts := []bool{false, true}
for _, row := range rows {
for _, exec := range []string{"hash", "stream"} {
for _, distinct := range distincts {
cas := defaultAggTestCase(exec)
cas.rows = row
cas.hasDistinct = distinct
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkAggExecWithCase(b, cas)
})
}
}
}
}
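// buildWindowExecutor constructs a WindowExec evaluating windowFunc over the
// given partition-by columns, with no ORDER BY and no explicit frame, and
// wires src in as its only child.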
func buildWindowExecutor(ctx sessionctx.Context, windowFunc string, src Executor, schema *expression.Schema, partitionBy []*expression.Column) Executor {
plan := new(core.PhysicalWindow)
var args []expression.Expression
switch windowFunc {
case ast.WindowFuncNtile:
args = append(args, &expression.Constant{Value: types.NewUintDatum(2)})
case ast.WindowFuncNthValue:
args = append(args, partitionBy[0], &expression.Constant{Value: types.NewUintDatum(2)})
default:
args = append(args, partitionBy[0])
}
desc, _ := aggregation.NewWindowFuncDesc(ctx, windowFunc, args)
plan.WindowFuncDescs = []*aggregation.WindowFuncDesc{desc}
for _, col := range partitionBy {
plan.PartitionBy = append(plan.PartitionBy, property.Item{Col: col})
}
plan.OrderBy = nil
plan.SetSchema(schema)
plan.Init(ctx, nil, 0)
plan.SetChildren(nil)
b := newExecutorBuilder(ctx, nil)
exec := b.build(plan)
window := exec.(*WindowExec)
window.children[0] = src
return exec
}
type windowTestCase struct {
// The test table's schema is fixed (col Double, partitionBy LongLong, rawData VarString(5120), col LongLong).
windowFunc string
ndv int // the number of distinct values in the partition-by column
rows int
ctx sessionctx.Context
}
var rawData = strings.Repeat("x", 5*1024)
func (a windowTestCase) columns() []*expression.Column {
rawDataTp := new(types.FieldType)
types.DefaultTypeForValue(rawData, rawDataTp)
return []*expression.Column{
{Index: 0, RetType: types.NewFieldType(mysql.TypeDouble)},
{Index: 1, RetType: types.NewFieldType(mysql.TypeLonglong)},
{Index: 2, RetType: rawDataTp},
{Index: 3, RetType: types.NewFieldType(mysql.TypeLonglong)},
}
}
func (a windowTestCase) String() string {
return fmt.Sprintf("(func:%v, ndv:%v, rows:%v)",
a.windowFunc, a.ndv, a.rows)
}
func defaultWindowTestCase() *windowTestCase {
ctx := mock.NewContext()
ctx.GetSessionVars().InitChunkSize = variable.DefInitChunkSize
ctx.GetSessionVars().MaxChunkSize = variable.DefMaxChunkSize
return &windowTestCase{ast.WindowFuncRowNumber, 1000, 10000000, ctx}
}
func benchmarkWindowExecWithCase(b *testing.B, casTest *windowTestCase) {
cols := casTest.columns()
dataSource := buildMockDataSource(mockDataSourceParameters{
schema: expression.NewSchema(cols...),
ndvs: []int{0, casTest.ndv, 0, 0},
orders: []bool{false, true, false, false},
rows: casTest.rows,
ctx: casTest.ctx,
})
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer() // prepare a new window-executor
childCols := casTest.columns()
schema := expression.NewSchema(childCols...)
windowExec := buildWindowExecutor(casTest.ctx, casTest.windowFunc, dataSource, schema, childCols[1:2])
tmpCtx := context.Background()
chk := newFirstChunk(windowExec)
dataSource.prepareChunks()
b.StartTimer()
if err := windowExec.Open(tmpCtx); err != nil {
b.Fatal(err)
}
for {
if err := windowExec.Next(tmpCtx, chk); err != nil {
b.Fatal(err)
}
if chk.NumRows() == 0 {
break
}
}
if err := windowExec.Close(); err != nil {
b.Fatal(err)
}
b.StopTimer()
}
}
func BenchmarkWindowRows(b *testing.B) {
b.ReportAllocs()
rows := []int{1000, 100000}
ndvs := []int{10, 1000}
for _, row := range rows {
for _, ndv := range ndvs {
cas := defaultWindowTestCase()
cas.rows = row
cas.ndv = ndv
cas.windowFunc = ast.WindowFuncRowNumber // cheapest
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkWindowExecWithCase(b, cas)
})
}
}
}
func BenchmarkWindowFunctions(b *testing.B) {
b.ReportAllocs()
windowFuncs := []string{
ast.WindowFuncRowNumber,
ast.WindowFuncRank,
ast.WindowFuncDenseRank,
ast.WindowFuncCumeDist,
ast.WindowFuncPercentRank,
ast.WindowFuncNtile,
ast.WindowFuncLead,
ast.WindowFuncLag,
ast.WindowFuncFirstValue,
ast.WindowFuncLastValue,
ast.WindowFuncNthValue,
}
for _, windowFunc := range windowFuncs {
cas := defaultWindowTestCase()
cas.rows = 100000
cas.ndv = 1000
cas.windowFunc = windowFunc
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkWindowExecWithCase(b, cas)
})
}
}
type hashJoinTestCase struct {
rows int
cols []*types.FieldType
concurrency int
ctx sessionctx.Context
keyIdx []int
disk bool
joinType core.JoinType
useOuterToBuild bool
}
func (tc hashJoinTestCase) columns() []*expression.Column {
ret := make([]*expression.Column, 0)
for i, t := range tc.cols {
column := &expression.Column{Index: i, RetType: t}
ret = append(ret, column)
}
return ret
}
func (tc hashJoinTestCase) String() string {
return fmt.Sprintf("(rows:%v, cols:%v, concurency:%v, joinKeyIdx: %v, disk:%v)",
tc.rows, tc.cols, tc.concurrency, tc.keyIdx, tc.disk)
}
func defaultHashJoinTestCase(cols []*types.FieldType, joinType core.JoinType, useOuterToBuild bool) *hashJoinTestCase {
ctx := mock.NewContext()
ctx.GetSessionVars().InitChunkSize = variable.DefInitChunkSize
ctx.GetSessionVars().MaxChunkSize = variable.DefMaxChunkSize
ctx.GetSessionVars().StmtCtx.MemTracker = memory.NewTracker(nil, -1)
ctx.GetSessionVars().IndexLookupJoinConcurrency = 4
tc := &hashJoinTestCase{rows: 100000, concurrency: 4, ctx: ctx, keyIdx: []int{0, 1}}
tc.cols = cols
tc.useOuterToBuild = useOuterToBuild
tc.joinType = joinType
return tc
}
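// prepare4HashJoin wires up a HashJoinExec with innerExec as the build side
// and outerExec as the probe side (swapped when useOuterToBuild is set). When
// testCase.disk is true, the statement memory tracker is given a quota of one
// byte so that the row container is expected to spill to disk; the benchmarks
// assert this via rowContainer.alreadySpilled().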
func prepare4HashJoin(testCase *hashJoinTestCase, innerExec, outerExec Executor) *HashJoinExec {
if testCase.useOuterToBuild {
innerExec, outerExec = outerExec, innerExec
}
cols0 := testCase.columns()
cols1 := testCase.columns()
joinSchema := expression.NewSchema(cols0...)
joinSchema.Append(cols1...)
joinKeys := make([]*expression.Column, 0, len(testCase.keyIdx))
for _, keyIdx := range testCase.keyIdx {
joinKeys = append(joinKeys, cols0[keyIdx])
}
e := &HashJoinExec{
baseExecutor: newBaseExecutor(testCase.ctx, joinSchema, stringutil.StringerStr("HashJoin"), innerExec, outerExec),
concurrency: uint(testCase.concurrency),
joinType: testCase.joinType, // 0 for InnerJoin, 1 for LeftOuterJoin, 2 for RightOuterJoin
isOuterJoin: false,
buildKeys: joinKeys,
probeKeys: joinKeys,
buildSideExec: innerExec,
probeSideExec: outerExec,
buildSideEstCount: float64(testCase.rows),
useOuterToBuild: testCase.useOuterToBuild,
}
defaultValues := make([]types.Datum, e.buildSideExec.Schema().Len())
lhsTypes, rhsTypes := retTypes(innerExec), retTypes(outerExec)
e.joiners = make([]joiner, e.concurrency)
for i := uint(0); i < e.concurrency; i++ {
e.joiners[i] = newJoiner(testCase.ctx, e.joinType, true, defaultValues,
nil, lhsTypes, rhsTypes)
}
memLimit := int64(-1)
if testCase.disk {
memLimit = 1
}
t := memory.NewTracker(stringutil.StringerStr("root of prepare4HashJoin"), memLimit)
t.SetActionOnExceed(nil)
e.ctx.GetSessionVars().StmtCtx.MemTracker = t
return e
}
func benchmarkHashJoinExecWithCase(b *testing.B, casTest *hashJoinTestCase) {
opt := mockDataSourceParameters{
schema: expression.NewSchema(casTest.columns()...),
rows: casTest.rows,
ctx: casTest.ctx,
genDataFunc: func(row int, typ *types.FieldType) interface{} {
switch typ.Tp {
case mysql.TypeLong, mysql.TypeLonglong:
return int64(row)
case mysql.TypeVarString:
return rawData
case mysql.TypeDouble:
return float64(row)
default:
panic("not implement")
}
},
}
dataSource1 := buildMockDataSource(opt)
dataSource2 := buildMockDataSource(opt)
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
exec := prepare4HashJoin(casTest, dataSource1, dataSource2)
tmpCtx := context.Background()
chk := newFirstChunk(exec)
dataSource1.prepareChunks()
dataSource2.prepareChunks()
b.StartTimer()
if err := exec.Open(tmpCtx); err != nil {
b.Fatal(err)
}
for {
if err := exec.Next(tmpCtx, chk); err != nil {
b.Fatal(err)
}
if chk.NumRows() == 0 {
break
}
}
if err := exec.Close(); err != nil {
b.Fatal(err)
}
b.StopTimer()
if exec.rowContainer.alreadySpilled() != casTest.disk {
b.Fatal("wrong usage with disk")
}
}
}
func BenchmarkHashJoinExec(b *testing.B) {
lvl := log.GetLevel()
log.SetLevel(zapcore.ErrorLevel)
defer log.SetLevel(lvl)
cols := []*types.FieldType{
types.NewFieldType(mysql.TypeLonglong),
types.NewFieldType(mysql.TypeVarString),
}
b.ReportAllocs()
cas := defaultHashJoinTestCase(cols, 0, false)
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
cas.disk = true
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
cas.disk = true
cas.rows = 1000
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
// Replace the wide string column with a double column.
cols = []*types.FieldType{
types.NewFieldType(mysql.TypeLonglong),
types.NewFieldType(mysql.TypeDouble),
}
cas = defaultHashJoinTestCase(cols, 0, false)
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
}
func BenchmarkOuterHashJoinExec(b *testing.B) {
lvl := log.GetLevel()
log.SetLevel(zapcore.ErrorLevel)
defer log.SetLevel(lvl)
cols := []*types.FieldType{
types.NewFieldType(mysql.TypeLonglong),
types.NewFieldType(mysql.TypeVarString),
}
b.ReportAllocs()
cas := defaultHashJoinTestCase(cols, 2, true)
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
cas.disk = true
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
cas.disk = true
cas.rows = 1000
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
// Replace the wide string column with a double column.
cols = []*types.FieldType{
types.NewFieldType(mysql.TypeLonglong),
types.NewFieldType(mysql.TypeDouble),
}
cas = defaultHashJoinTestCase(cols, 2, true)
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
cas.keyIdx = []int{0}
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkHashJoinExecWithCase(b, cas)
})
}
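// benchmarkBuildHashTableForList isolates the build phase of the hash join:
// the pre-generated build-side chunks are pushed into a channel outside the
// timed region, so the timer covers buildHashTableForList (and the executor
// Close) rather than data generation.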
func benchmarkBuildHashTableForList(b *testing.B, casTest *hashJoinTestCase) {
opt := mockDataSourceParameters{
schema: expression.NewSchema(casTest.columns()...),
rows: casTest.rows,
ctx: casTest.ctx,
genDataFunc: func(row int, typ *types.FieldType) interface{} {
switch typ.Tp {
case mysql.TypeLong, mysql.TypeLonglong:
return int64(row)
case mysql.TypeVarString:
return rawData
default:
panic("not implement")
}
},
}
dataSource1 := buildMockDataSource(opt)
dataSource2 := buildMockDataSource(opt)
dataSource1.prepareChunks()
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
exec := prepare4HashJoin(casTest, dataSource1, dataSource2)
tmpCtx := context.Background()
if err := exec.Open(tmpCtx); err != nil {
b.Fatal(err)
}
exec.prepared = true
innerResultCh := make(chan *chunk.Chunk, len(dataSource1.chunks))
for _, chk := range dataSource1.chunks {
innerResultCh <- chk
}
close(innerResultCh)
b.StartTimer()
if err := exec.buildHashTableForList(innerResultCh); err != nil {
b.Fatal(err)
}
if err := exec.Close(); err != nil {
b.Fatal(err)
}
b.StopTimer()
if exec.rowContainer.alreadySpilled() != casTest.disk {
b.Fatal("wrong usage with disk")
}
}
}
func BenchmarkBuildHashTableForList(b *testing.B) {
lvl := log.GetLevel()
log.SetLevel(zapcore.ErrorLevel)
defer log.SetLevel(lvl)
cols := []*types.FieldType{
types.NewFieldType(mysql.TypeLonglong),
types.NewFieldType(mysql.TypeVarString),
}
b.ReportAllocs()
cas := defaultHashJoinTestCase(cols, 0, false)
rows := []int{10, 100000}
keyIdxs := [][]int{{0, 1}, {0}}
disks := []bool{false, true}
for _, row := range rows {
for _, keyIdx := range keyIdxs {
for _, disk := range disks {
cas.rows = row
cas.keyIdx = keyIdx
cas.disk = disk
b.Run(fmt.Sprintf("%v", cas), func(b *testing.B) {
benchmarkBuildHashTableForList(b, cas)
})
}
}
}
}
type indexJoinTestCase struct {
outerRows int
innerRows int
concurrency int
ctx sessionctx.Context
outerJoinKeyIdx []int
innerJoinKeyIdx []int
innerIdx []int
needOuterSort bool
}
func (tc indexJoinTestCase) columns() []*expression.Column {
return []*expression.Column{
{Index: 0, RetType: types.NewFieldType(mysql.TypeLonglong)},
{Index: 1, RetType: types.NewFieldType(mysql.TypeDouble)},
{Index: 2, RetType: types.NewFieldType(mysql.TypeVarString)},
}
}
func defaultIndexJoinTestCase() *indexJoinTestCase {
ctx := mock.NewContext()
ctx.GetSessionVars().InitChunkSize = variable.DefInitChunkSize
ctx.GetSessionVars().MaxChunkSize = variable.DefMaxChunkSize
ctx.GetSessionVars().SnapshotTS = 1
ctx.GetSessionVars().StmtCtx.MemTracker = memory.NewTracker(nil, -1)
tc := &indexJoinTestCase{
outerRows: 100000,
innerRows: variable.DefMaxChunkSize * 100,
concurrency: 4,
ctx: ctx,
outerJoinKeyIdx: []int{0, 1},
innerJoinKeyIdx: []int{0, 1},
innerIdx: []int{0, 1},
}
return tc
}
func (tc indexJoinTestCase) String() string {
return fmt.Sprintf("(outerRows:%v, innerRows:%v, concurency:%v, outerJoinKeyIdx: %v, innerJoinKeyIdx: %v, NeedOuterSort:%v)",
tc.outerRows, tc.innerRows, tc.concurrency, tc.outerJoinKeyIdx, tc.innerJoinKeyIdx, tc.needOuterSort)
}
func (tc indexJoinTestCase) getMockDataSourceOptByRows(rows int) mockDataSourceParameters {
return mockDataSourceParameters{
schema: expression.NewSchema(tc.columns()...),
rows: rows,
ctx: tc.ctx,
genDataFunc: func(row int, typ *types.FieldType) interface{} {
switch typ.Tp {
case mysql.TypeLong, mysql.TypeLonglong:
return int64(row)
case mysql.TypeDouble:
return float64(row)
case mysql.TypeVarString:
return rawData
default:
panic("not implement")
}
},
}
}
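// prepare4IndexInnerHashJoin builds an IndexLookUpJoin whose inner side is
// served by a mockPhysicalIndexReader wrapping innerDS, so no real index
// lookups are issued; outerDS feeds the outer side directly.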
func prepare4IndexInnerHashJoin(tc *indexJoinTestCase, outerDS *mockDataSource, innerDS *mockDataSource) Executor {
outerCols, innerCols := tc.columns(), tc.columns()
joinSchema := expression.NewSchema(outerCols...)
joinSchema.Append(innerCols...)
leftTypes, rightTypes := retTypes(outerDS), retTypes(innerDS)
defaultValues := make([]types.Datum, len(innerCols))
colLens := make([]int, len(innerCols))
for i := range colLens {
colLens[i] = types.UnspecifiedLength
}
keyOff2IdxOff := make([]int, len(tc.outerJoinKeyIdx))
for i := range keyOff2IdxOff {
keyOff2IdxOff[i] = i
}
e := &IndexLookUpJoin{
baseExecutor: newBaseExecutor(tc.ctx, joinSchema, stringutil.StringerStr("IndexInnerHashJoin"), outerDS),
outerCtx: outerCtx{
rowTypes: leftTypes,
keyCols: tc.outerJoinKeyIdx,
},
innerCtx: innerCtx{
readerBuilder: &dataReaderBuilder{Plan: &mockPhysicalIndexReader{e: innerDS}, executorBuilder: newExecutorBuilder(tc.ctx, nil)},
rowTypes: rightTypes,
colLens: colLens,
keyCols: tc.innerJoinKeyIdx,
},
workerWg: new(sync.WaitGroup),
joiner: newJoiner(tc.ctx, 0, false, defaultValues, nil, leftTypes, rightTypes),
isOuterJoin: false,
keyOff2IdxOff: keyOff2IdxOff,
lastColHelper: nil,
}
e.joinResult = newFirstChunk(e)
return e
}
func prepare4IndexOuterHashJoin(tc *indexJoinTestCase, outerDS *mockDataSource, innerDS *mockDataSource) Executor {
e := prepare4IndexInnerHashJoin(tc, outerDS, innerDS).(*IndexLookUpJoin)
idxHash := &IndexNestedLoopHashJoin{IndexLookUpJoin: *e}
concurrency := tc.concurrency
idxHash.joiners = make([]joiner, concurrency)
for i := 0; i < concurrency; i++ {
idxHash.joiners[i] = e.joiner.Clone()
}
return idxHash
}
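// prepare4IndexMergeJoin builds an IndexLookUpMergeJoin over the same mock
// readers, setting up the join-key compare functions and one joiner per
// IndexLookupJoinConcurrency worker.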
func prepare4IndexMergeJoin(tc *indexJoinTestCase, outerDS *mockDataSource, innerDS *mockDataSource) Executor {
outerCols, innerCols := tc.columns(), tc.columns()
joinSchema := expression.NewSchema(outerCols...)
joinSchema.Append(innerCols...)
outerJoinKeys := make([]*expression.Column, 0, len(tc.outerJoinKeyIdx))
innerJoinKeys := make([]*expression.Column, 0, len(tc.innerJoinKeyIdx))
for _, keyIdx := range tc.outerJoinKeyIdx {
outerJoinKeys = append(outerJoinKeys, outerCols[keyIdx])
}
for _, keyIdx := range tc.innerJoinKeyIdx {
innerJoinKeys = append(innerJoinKeys, innerCols[keyIdx])
}
leftTypes, rightTypes := retTypes(outerDS), retTypes(innerDS)
defaultValues := make([]types.Datum, len(innerCols))
colLens := make([]int, len(innerCols))
for i := range colLens {
colLens[i] = types.UnspecifiedLength
}
keyOff2IdxOff := make([]int, len(outerJoinKeys))
for i := range keyOff2IdxOff {
keyOff2IdxOff[i] = i
}
compareFuncs := make([]expression.CompareFunc, 0, len(outerJoinKeys))
outerCompareFuncs := make([]expression.CompareFunc, 0, len(outerJoinKeys))
for i := range outerJoinKeys {
compareFuncs = append(compareFuncs, expression.GetCmpFunction(outerJoinKeys[i], innerJoinKeys[i]))
outerCompareFuncs = append(outerCompareFuncs, expression.GetCmpFunction(outerJoinKeys[i], outerJoinKeys[i]))
}
e := &IndexLookUpMergeJoin{
baseExecutor: newBaseExecutor(tc.ctx, joinSchema, stringutil.StringerStr("IndexMergeJoin"), outerDS),
outerMergeCtx: outerMergeCtx{
rowTypes: leftTypes,
keyCols: tc.outerJoinKeyIdx,
joinKeys: outerJoinKeys,
needOuterSort: tc.needOuterSort,
compareFuncs: outerCompareFuncs,
},
innerMergeCtx: innerMergeCtx{
readerBuilder: &dataReaderBuilder{Plan: &mockPhysicalIndexReader{e: innerDS}, executorBuilder: newExecutorBuilder(tc.ctx, nil)},
rowTypes: rightTypes,
joinKeys: innerJoinKeys,
colLens: colLens,
keyCols: tc.innerJoinKeyIdx,
compareFuncs: compareFuncs,
},
workerWg: new(sync.WaitGroup),
isOuterJoin: false,
keyOff2IdxOff: keyOff2IdxOff,
lastColHelper: nil,
}
joiners := make([]joiner, e.ctx.GetSessionVars().IndexLookupJoinConcurrency)
for i := 0; i < e.ctx.GetSessionVars().IndexLookupJoinConcurrency; i++ {
joiners[i] = newJoiner(tc.ctx, 0, false, defaultValues, nil, leftTypes, rightTypes)
}
e.joiners = joiners
return e
}
type indexJoinType int8
const (
indexInnerHashJoin indexJoinType = iota
indexOuterHashJoin
indexMergeJoin
)
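// benchmarkIndexJoinExecWithCase runs one of the three index join variants
// against the prepared mock data sources, timing only Open/Next/Close.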
func benchmarkIndexJoinExecWithCase(
b *testing.B,
tc *indexJoinTestCase,
outerDS *mockDataSource,
innerDS *mockDataSource,
execType indexJoinType,
) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
var exec Executor
switch execType {
case indexInnerHashJoin:
exec = prepare4IndexInnerHashJoin(tc, outerDS, innerDS)
case indexOuterHashJoin:
exec = prepare4IndexOuterHashJoin(tc, outerDS, innerDS)
case indexMergeJoin:
exec = prepare4IndexMergeJoin(tc, outerDS, innerDS)
}
tmpCtx := context.Background()
chk := newFirstChunk(exec)
outerDS.prepareChunks()
innerDS.prepareChunks()
b.StartTimer()
if err := exec.Open(tmpCtx); err != nil {
b.Fatal(err)
}
for {
if err := exec.Next(tmpCtx, chk); err != nil {
b.Fatal(err)
}
if chk.NumRows() == 0 {
break
}
}
if err := exec.Close(); err != nil {
b.Fatal(err)
}
b.StopTimer()
}
}
func BenchmarkIndexJoinExec(b *testing.B) {
lvl := log.GetLevel()
log.SetLevel(zapcore.ErrorLevel)
defer log.SetLevel(lvl)
b.ReportAllocs()
tc := defaultIndexJoinTestCase()
outerOpt := tc.getMockDataSourceOptByRows(tc.outerRows)
innerOpt := tc.getMockDataSourceOptByRows(tc.innerRows)
outerDS := buildMockDataSourceWithIndex(outerOpt, tc.innerIdx)
innerDS := buildMockDataSourceWithIndex(innerOpt, tc.innerIdx)
tc.needOuterSort = true
b.Run(fmt.Sprintf("index merge join need outer sort %v", tc), func(b *testing.B) {
benchmarkIndexJoinExecWithCase(b, tc, outerDS, innerDS, indexMergeJoin)
})
tc.needOuterSort = false
b.Run(fmt.Sprintf("index merge join %v", tc), func(b *testing.B) {
benchmarkIndexJoinExecWithCase(b, tc, outerDS, innerDS, indexMergeJoin)
})
b.Run(fmt.Sprintf("index inner hash join %v", tc), func(b *testing.B) {
benchmarkIndexJoinExecWithCase(b, tc, outerDS, innerDS, indexInnerHashJoin)
})
b.Run(fmt.Sprintf("index outer hash join %v", tc), func(b *testing.B) {
benchmarkIndexJoinExecWithCase(b, tc, outerDS, innerDS, indexOuterHashJoin)
})
}