// Files
// tidb/statistics/statistics_test.go
// 2017-09-11 14:06:21 +08:00
//
// 468 lines
// 13 KiB
// Go

// Copyright 2016 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
package statistics
import (
"bytes"
"math"
"testing"
"github.com/juju/errors"
. "github.com/pingcap/check"
"github.com/pingcap/tidb/ast"
"github.com/pingcap/tidb/context"
"github.com/pingcap/tidb/model"
"github.com/pingcap/tidb/mysql"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/util/codec"
"github.com/pingcap/tidb/util/mock"
"github.com/pingcap/tidb/util/types"
)
// TestT is the "go test" entry point; it hands control to the check
// framework's TestingT, which drives the suites registered via Suite.
func TestT(t *testing.T) {
	TestingT(t)
}
// Register testStatisticsSuite with the check framework.
var _ = Suite(&testStatisticsSuite{})
// testStatisticsSuite carries the fixtures shared by the statistics tests.
// Every field is populated by SetUpSuite.
type testStatisticsSuite struct {
	// count is the number of rows held by rc and pk.
	count int64
	// samples is the sorted sample set passed to BuildColumn.
	samples []types.Datum
	// rc is a sorted record set containing repeated values; pk is a record
	// set holding the values 0..count-1 in ascending order.
	rc, pk ast.RecordSet
}
// dataTable pairs a row count with a slice of sample datums.
// NOTE(review): nothing in the visible part of this file references it;
// confirm whether another test file in the package still needs it.
type dataTable struct {
	count   int64
	samples []types.Datum
}
// recordSet is an in-memory, single-column record set backed by a slice of
// datums. The tests use it wherever an ast.RecordSet is required.
type recordSet struct {
	// data holds one datum per row.
	data []types.Datum
	// count is the number of rows Next yields before reporting exhaustion;
	// cursor is the index of the row the next call to Next returns.
	count, cursor int64
}
// Fields always returns a nil field list and a nil error; this record set
// carries no column metadata.
func (r *recordSet) Fields() ([]*ast.ResultField, error) {
	var noFields []*ast.ResultField
	return noFields, nil
}
// Next returns the row at the current cursor position as a one-column
// ast.Row and advances the cursor. Once the cursor has reached count it
// returns (nil, nil) to signal that the set is exhausted.
func (r *recordSet) Next() (*ast.Row, error) {
	if r.cursor == r.count {
		return nil, nil
	}
	// Remember the position to read, then advance before building the row.
	idx := r.cursor
	r.cursor++
	row := &ast.Row{Data: []types.Datum{r.data[idx]}}
	return row, nil
}
// Close rewinds the cursor to the first row so the record set can be
// iterated again. It never fails.
func (r *recordSet) Close() error {
	r.cursor = 0
	return nil
}
// SetUpSuite builds the fixtures shared by the tests in this suite:
//   - s.samples: 10000 datums produced by the skewed recipe below, sorted;
//   - s.rc: s.count datums produced by the same recipe, sorted;
//   - s.pk: the values 0..s.count-1 in ascending order.
func (s *testStatisticsSuite) SetUpSuite(c *C) {
	const start = 1000
	s.count = 100000

	// fill writes the skewed data set into dst: a single 0, then the value 2
	// in slots 1..start-1, then for every later slot its own index, bumped by
	// 1 on every third slot and by 2 on every fifth slot counted from start.
	fill := func(dst []types.Datum) {
		dst[0].SetInt64(0)
		for i := 1; i < start; i++ {
			dst[i].SetInt64(2)
		}
		for i := start; i < len(dst); i++ {
			v := int64(i)
			offset := i - start
			if offset%3 == 0 {
				v++
			}
			if offset%5 == 0 {
				v += 2
			}
			dst[i].SetInt64(v)
		}
	}

	sc := new(variable.StatementContext)

	samples := make([]types.Datum, 10000)
	fill(samples)
	c.Check(types.SortDatums(sc, samples), IsNil)
	s.samples = samples

	rc := &recordSet{data: make([]types.Datum, s.count), count: s.count}
	fill(rc.data)
	c.Check(types.SortDatums(sc, rc.data), IsNil)
	s.rc = rc

	pk := &recordSet{data: make([]types.Datum, s.count), count: s.count}
	for i := range pk.data {
		pk.data[i].SetInt64(int64(i))
	}
	s.pk = pk
}
// encodeKey encodes key with codec.EncodeKey and wraps the resulting bytes
// in a bytes datum, the form in which the index histogram tests pass their
// lookup values.
//
// It panics if encoding fails: the helper has no error return, and silently
// continuing would hand the caller a datum built from an invalid key.
func encodeKey(key types.Datum) types.Datum {
	// Deliberately not named "bytes", which would shadow the imported
	// bytes package for the rest of the function.
	encoded, err := codec.EncodeKey(nil, key)
	if err != nil {
		panic(err)
	}
	return types.NewBytesDatum(encoded)
}
// buildPK drains records, feeding the first column of every row into a
// sorted histogram builder created with numBuckets and id. It returns the
// builder's row count and histogram, or a traced error if reading a row or
// adding a value fails.
func buildPK(ctx context.Context, numBuckets, id int64, records ast.RecordSet) (int64, *Histogram, error) {
	builder := NewSortedBuilder(ctx.GetSessionVars().StmtCtx, numBuckets, id)
	for {
		row, err := records.Next()
		switch {
		case err != nil:
			return 0, nil, errors.Trace(err)
		case row == nil:
			// A nil row marks the end of the record set.
			return builder.Count, builder.hist, nil
		}
		if iterErr := builder.Iterate(row.Data[0]); iterErr != nil {
			return 0, nil, errors.Trace(iterErr)
		}
	}
}
// TestBuild checks the row-count estimates produced by histograms built in
// three ways: from the sample set (BuildColumn), from the sorted record set
// s.rc (BuildIndex) and from the primary-key record set s.pk (buildPK).
//
// Construction errors are asserted rather than checked, because the returned
// histogram is dereferenced immediately afterwards. Both shared record sets
// are rewound before use so the test does not depend on running before any
// other test that iterates them.
func (s *testStatisticsSuite) TestBuild(c *C) {
	bucketCount := int64(256)
	_, ndv, err := buildFMSketch(s.rc.(*recordSet).data, 1000)
	c.Assert(err, IsNil)
	ctx := mock.NewContext()
	sc := ctx.GetSessionVars().StmtCtx

	// Histogram built from the sample set.
	col, err := BuildColumn(ctx, bucketCount, 2, ndv, s.count, 0, s.samples)
	c.Assert(err, IsNil)
	c.Check(len(col.Buckets), Equals, 232)
	count, err := col.equalRowCount(sc, types.NewIntDatum(1000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 0)
	count, err = col.lessRowCount(sc, types.NewIntDatum(1000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 10000)
	count, err = col.lessRowCount(sc, types.NewIntDatum(2000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 19964)
	count, err = col.greaterRowCount(sc, types.NewIntDatum(2000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 80034)
	count, err = col.lessRowCount(sc, types.NewIntDatum(200000000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 100000)
	count, err = col.greaterRowCount(sc, types.NewIntDatum(200000000))
	c.Check(err, IsNil)
	c.Check(count, Equals, 0.0)
	count, err = col.equalRowCount(sc, types.NewIntDatum(200000000))
	c.Check(err, IsNil)
	c.Check(count, Equals, 0.0)
	count, err = col.betweenRowCount(sc, types.NewIntDatum(3000), types.NewIntDatum(3500))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 5075)
	count, err = col.lessRowCount(sc, types.NewIntDatum(1))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 9)

	// Histogram built from the full sorted record set; lookups use encoded keys.
	// Rewind first: s.rc is shared with other tests that iterate it.
	c.Assert(s.rc.Close(), IsNil)
	tblCount, col, err := BuildIndex(ctx, bucketCount, 1, ast.RecordSet(s.rc))
	c.Assert(err, IsNil)
	c.Check(int(tblCount), Equals, 100000)
	count, err = col.equalRowCount(sc, encodeKey(types.NewIntDatum(10000)))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 1)
	count, err = col.lessRowCount(sc, encodeKey(types.NewIntDatum(20000)))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 19983)
	count, err = col.betweenRowCount(sc, encodeKey(types.NewIntDatum(30000)), encodeKey(types.NewIntDatum(35000)))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 4618)
	count, err = col.lessRowCount(sc, encodeKey(types.NewIntDatum(0)))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 0)

	// Histogram built from the primary-key record set.
	s.pk.(*recordSet).cursor = 0
	tblCount, col, err = buildPK(ctx, bucketCount, 4, ast.RecordSet(s.pk))
	c.Assert(err, IsNil)
	c.Check(int(tblCount), Equals, 100000)
	count, err = col.equalRowCount(sc, types.NewIntDatum(10000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 1)
	count, err = col.lessRowCount(sc, types.NewIntDatum(20000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 20223)
	count, err = col.betweenRowCount(sc, types.NewIntDatum(30000), types.NewIntDatum(35000))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 5120)
	count, err = col.greaterAndEqRowCount(sc, types.NewIntDatum(1001))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 99232)
	count, err = col.lessAndEqRowCount(sc, types.NewIntDatum(99999))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 100000)
	// A zero-value datum is used as the bound here.
	count, err = col.lessAndEqRowCount(sc, types.Datum{})
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 0)
	count, err = col.greaterRowCount(sc, types.NewIntDatum(1001))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 99231)
	count, err = col.lessRowCount(sc, types.NewIntDatum(99999))
	c.Check(err, IsNil)
	c.Check(int(count), Equals, 99999)
}
// TestHistogramProtoConversion builds an index histogram from s.rc, converts
// it to its protobuf form and back, and verifies that the NDV and every
// bucket's count, repeats and bounds survive the round trip.
func (s *testStatisticsSuite) TestHistogramProtoConversion(c *C) {
	ctx := mock.NewContext()
	// Rewind the shared record set; surface a failure instead of dropping it.
	c.Assert(s.rc.Close(), IsNil)
	tblCount, col, err := BuildIndex(ctx, 256, 1, ast.RecordSet(s.rc))
	// Assert rather than Check: col is used right below.
	c.Assert(err, IsNil)
	c.Check(int(tblCount), Equals, 100000)

	p := HistogramToProto(col)
	h := HistogramFromProto(p)
	c.Assert(col.NDV, Equals, h.NDV)
	c.Assert(len(col.Buckets), Equals, len(h.Buckets))
	for i, bkt := range col.Buckets {
		c.Assert(bkt.Count, Equals, h.Buckets[i].Count)
		c.Assert(bkt.Repeats, Equals, h.Buckets[i].Repeats)
		c.Assert(bytes.Equal(bkt.LowerBound.GetBytes(), h.Buckets[i].LowerBound.GetBytes()), IsTrue)
		c.Assert(bytes.Equal(bkt.UpperBound.GetBytes(), h.Buckets[i].UpperBound.GetBytes()), IsTrue)
	}
}
// mockHistogram returns a histogram with NDV num and num buckets. Bucket i
// (0-based) has lower and upper bound lower+i, cumulative count i+1 and a
// repeat count of 1. With num <= 0 the histogram has no buckets.
func mockHistogram(lower, num int64) *Histogram {
	hist := &Histogram{NDV: num}
	for offset := int64(0); offset < num; offset++ {
		bound := types.NewIntDatum(lower + offset)
		hist.Buckets = append(hist.Buckets, Bucket{
			LowerBound: bound,
			UpperBound: bound,
			Count:      offset + 1,
			Repeats:    1,
		})
	}
	return hist
}
// TestMergeHistogram merges pairs of mock histograms and verifies the merged
// NDV, bucket count, total row count and the outermost bounds.
func (s *testStatisticsSuite) TestMergeHistogram(c *C) {
	cases := []struct {
		leftLower  int64
		leftNum    int64
		rightLower int64
		rightNum   int64
		bucketNum  int
		ndv        int64
	}{
		// Empty left side.
		{leftLower: 0, leftNum: 0, rightLower: 0, rightNum: 1, bucketNum: 1, ndv: 1},
		// Adjacent, non-overlapping value ranges.
		{leftLower: 0, leftNum: 200, rightLower: 200, rightNum: 200, bucketNum: 200, ndv: 400},
		// Ranges sharing exactly one value (199).
		{leftLower: 0, leftNum: 200, rightLower: 199, rightNum: 200, bucketNum: 200, ndv: 399},
	}

	sc := mock.NewContext().GetSessionVars().StmtCtx
	bucketCount := 256
	for _, tc := range cases {
		left := mockHistogram(tc.leftLower, tc.leftNum)
		right := mockHistogram(tc.rightLower, tc.rightNum)

		merged, err := MergeHistograms(sc, left, right, bucketCount)
		c.Assert(err, IsNil)
		c.Assert(merged.NDV, Equals, tc.ndv)
		c.Assert(len(merged.Buckets), Equals, tc.bucketNum)

		// The bucket count was asserted above, so indexing is safe here.
		lastIdx := len(merged.Buckets) - 1
		c.Assert(merged.Buckets[lastIdx].Count, Equals, tc.leftNum+tc.rightNum)

		wantLower := types.NewIntDatum(tc.leftLower)
		cmp, err := merged.Buckets[0].LowerBound.CompareDatum(sc, wantLower)
		c.Assert(err, IsNil)
		c.Assert(cmp, Equals, 0)

		wantUpper := types.NewIntDatum(tc.rightLower + tc.rightNum - 1)
		cmp, err = merged.Buckets[lastIdx].UpperBound.CompareDatum(sc, wantUpper)
		c.Assert(err, IsNil)
		c.Assert(cmp, Equals, 0)
	}
}
// TestPseudoTable verifies the estimates returned by a pseudo table (one
// created by PseudoTable rather than built from data) for less-than,
// equality and between predicates on a bigint column.
func (s *testStatisticsSuite) TestPseudoTable(c *C) {
	colInfo := &model.ColumnInfo{
		ID:        1,
		FieldType: *types.NewFieldType(mysql.TypeLonglong),
	}
	ti := &model.TableInfo{Columns: []*model.ColumnInfo{colInfo}}

	tbl := PseudoTable(ti.ID)
	c.Assert(tbl.Count, Greater, int64(0))

	sc := new(variable.StatementContext)

	lessCnt, err := tbl.ColumnLessRowCount(sc, types.NewIntDatum(100), colInfo)
	c.Assert(err, IsNil)
	c.Assert(int(lessCnt), Equals, 3333)

	equalCnt, err := tbl.ColumnEqualRowCount(sc, types.NewIntDatum(1000), colInfo)
	c.Assert(err, IsNil)
	c.Assert(int(equalCnt), Equals, 10)

	betweenCnt, err := tbl.ColumnBetweenRowCount(sc, types.NewIntDatum(1000), types.NewIntDatum(5000), colInfo)
	c.Assert(err, IsNil)
	c.Assert(int(betweenCnt), Equals, 250)
}
// TestColumnRange verifies Table.GetRowCountByColumnRanges for column 0 in
// two phases: first while tbl.Columns is still empty, then after the
// histogram built from the sample set has been registered under ID 0.
//
// Construction errors are asserted rather than checked, because hg is
// dereferenced immediately afterwards.
func (s *testStatisticsSuite) TestColumnRange(c *C) {
	bucketCount := int64(256)
	_, ndv, err := buildFMSketch(s.rc.(*recordSet).data, 1000)
	c.Assert(err, IsNil)
	ctx := mock.NewContext()
	sc := ctx.GetSessionVars().StmtCtx
	hg, err := BuildColumn(ctx, bucketCount, 5, ndv, s.count, 0, s.samples)
	c.Assert(err, IsNil)

	col := &Column{Histogram: *hg}
	tbl := &Table{
		Count:   int64(col.totalRowCount()),
		Columns: make(map[int64]*Column),
	}
	ran := []*types.ColumnRange{{
		Low:  types.Datum{},
		High: types.MaxValueDatum(),
	}}

	// estimate returns the truncated row-count estimate for the current
	// contents of ran, aborting the test if the estimator reports an error.
	estimate := func() int {
		count, estErr := tbl.GetRowCountByColumnRanges(sc, 0, ran)
		c.Assert(estErr, IsNil)
		return int(count)
	}

	// Phase 1: no statistics registered for column 0.
	// NOTE(review): the expected values presumably come from the table's
	// built-in fallback estimates; confirm against GetRowCountByColumnRanges.
	c.Assert(estimate(), Equals, 100000)

	ran[0].Low = types.MinNotNullDatum()
	c.Assert(estimate(), Equals, 99900)

	ran[0].Low = types.NewIntDatum(1000)
	ran[0].LowExcl = true
	ran[0].High = types.NewIntDatum(2000)
	ran[0].HighExcl = true
	c.Assert(estimate(), Equals, 2500)

	ran[0].LowExcl = false
	ran[0].HighExcl = false
	c.Assert(estimate(), Equals, 2500)

	// Point range: Low == High == 2000.
	ran[0].Low = ran[0].High
	c.Assert(estimate(), Equals, 100)

	// Phase 2: the histogram is registered for column 0.
	tbl.Columns[0] = col
	ran[0].Low = types.Datum{}
	ran[0].High = types.MaxValueDatum()
	c.Assert(estimate(), Equals, 100000)

	ran[0].Low = types.NewIntDatum(1000)
	ran[0].LowExcl = true
	ran[0].High = types.NewIntDatum(2000)
	ran[0].HighExcl = true
	c.Assert(estimate(), Equals, 9964)

	ran[0].LowExcl = false
	ran[0].HighExcl = false
	c.Assert(estimate(), Equals, 9965)

	// Point range: Low == High == 2000.
	ran[0].Low = ran[0].High
	c.Assert(estimate(), Equals, 1)
}
// TestIntColumnRanges verifies Table.GetRowCountByIntColumnRanges for column
// 0 in two phases: first while tbl.Columns is still empty, then after the
// histogram built from the primary-key record set has been registered under
// ID 0.
func (s *testStatisticsSuite) TestIntColumnRanges(c *C) {
	bucketCount := int64(256)
	ctx := mock.NewContext()
	sc := ctx.GetSessionVars().StmtCtx

	// Rewind the shared primary-key record set before draining it.
	s.pk.(*recordSet).cursor = 0
	rowCount, hg, err := buildPK(ctx, bucketCount, 0, s.pk)
	c.Check(err, IsNil)
	c.Check(rowCount, Equals, int64(100000))

	col := &Column{Histogram: *hg}
	tbl := &Table{
		Count:   int64(col.totalRowCount()),
		Columns: make(map[int64]*Column),
	}
	ran := []types.IntColumnRange{{
		LowVal:  math.MinInt64,
		HighVal: math.MaxInt64,
	}}

	// estimate returns the truncated row-count estimate for the current
	// contents of ran, aborting the test if the estimator reports an error.
	estimate := func() int {
		count, estErr := tbl.GetRowCountByIntColumnRanges(sc, 0, ran)
		c.Assert(estErr, IsNil)
		return int(count)
	}
	// setRange overwrites the single range under test.
	setRange := func(low, high int64) {
		ran[0].LowVal = low
		ran[0].HighVal = high
	}

	// Phase 1: no statistics registered for column 0.
	c.Assert(estimate(), Equals, 100000)
	setRange(1000, 2000)
	c.Assert(estimate(), Equals, 1000)
	setRange(1001, 1999)
	c.Assert(estimate(), Equals, 998)
	setRange(1000, 1000)
	c.Assert(estimate(), Equals, 100)

	// Phase 2: the histogram is registered for column 0.
	tbl.Columns[0] = col
	setRange(math.MinInt64, math.MaxInt64)
	c.Assert(estimate(), Equals, 100000)
	setRange(1000, 2000)
	c.Assert(estimate(), Equals, 1000)
	setRange(1001, 1999)
	c.Assert(estimate(), Equals, 998)
	setRange(1000, 1000)
	c.Assert(estimate(), Equals, 1)
}