// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
	"math/rand"
	"time"

	. "github.com/pingcap/check"
	"github.com/pingcap/parser/mysql"
	"github.com/pingcap/tidb/sessionctx/stmtctx"
	"github.com/pingcap/tidb/types"
	"github.com/pingcap/tidb/util/collate"
	"github.com/pingcap/tidb/util/mock"
	"github.com/pingcap/tidb/util/sqlexec"
)

var _ = Suite(&testSampleSuite{})

type testSampleSuite struct {
	count int
	rs    sqlexec.RecordSet
}
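
// SetUpSuite prepares a shared record set of 10000 datums: the first 1000
// stay NULL, and the rest hold increasing int64 values that are bumped at
// every third and fifth index, so the column contains both duplicates and
// distinct values.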
func (s *testSampleSuite) SetUpSuite(c *C) {
	s.count = 10000
	rs := &recordSet{
		data:      make([]types.Datum, s.count),
		count:     s.count,
		cursor:    0,
		firstIsID: true,
	}
	rs.setFields(mysql.TypeLonglong, mysql.TypeLonglong)
	start := 1000 // the first 1000 values are null
	for i := start; i < rs.count; i++ {
		rs.data[i].SetInt64(int64(i))
	}
	for i := start; i < rs.count; i += 3 {
		rs.data[i].SetInt64(rs.data[i].GetInt64() + 1)
	}
	for i := start; i < rs.count; i += 5 {
		rs.data[i].SetInt64(rs.data[i].GetInt64() + 2)
	}
	s.rs = rs
}
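
// TestCollectColumnStats runs SampleBuilder.CollectColumnStats over the shared
// record set with a sorted primary-key builder, then checks the collected null
// count, the FM sketch NDV, the CM sketch total count, and the primary-key
// histogram.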
func (s *testSampleSuite) TestCollectColumnStats(c *C) {
	sc := mock.NewContext().GetSessionVars().StmtCtx
	builder := SampleBuilder{
		Sc:              sc,
		RecordSet:       s.rs,
		ColLen:          1,
		PkBuilder:       NewSortedBuilder(sc, 256, 1, types.NewFieldType(mysql.TypeLonglong), Version2),
		MaxSampleSize:   10000,
		MaxBucketSize:   256,
		MaxFMSketchSize: 1000,
		CMSketchWidth:   2048,
		CMSketchDepth:   8,
		Collators:       make([]collate.Collator, 1),
		ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)},
	}
	c.Assert(s.rs.Close(), IsNil)
	collectors, pkBuilder, err := builder.CollectColumnStats()
	c.Assert(err, IsNil)
	c.Assert(collectors[0].NullCount+collectors[0].Count, Equals, int64(s.count))
	c.Assert(collectors[0].FMSketch.NDV(), Equals, int64(6232))
	c.Assert(collectors[0].CMSketch.TotalCount(), Equals, uint64(collectors[0].Count))
	c.Assert(pkBuilder.Count, Equals, int64(s.count))
	c.Assert(pkBuilder.Hist().NDV, Equals, int64(s.count))
}
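
// TestMergeSampleCollector collects stats for two columns, merges the second
// collector into the first, and verifies the merged NDV, sample size, null
// count, row count, and CM sketch total.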
func (s *testSampleSuite) TestMergeSampleCollector(c *C) {
	builder := SampleBuilder{
		Sc:              mock.NewContext().GetSessionVars().StmtCtx,
		RecordSet:       s.rs,
		ColLen:          2,
		MaxSampleSize:   1000,
		MaxBucketSize:   256,
		MaxFMSketchSize: 1000,
		CMSketchWidth:   2048,
		CMSketchDepth:   8,
		Collators:       make([]collate.Collator, 2),
		ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong), types.NewFieldType(mysql.TypeLonglong)},
	}
	c.Assert(s.rs.Close(), IsNil)
	sc := &stmtctx.StatementContext{TimeZone: time.Local}
	collectors, pkBuilder, err := builder.CollectColumnStats()
	c.Assert(err, IsNil)
	c.Assert(pkBuilder, IsNil)
	c.Assert(len(collectors), Equals, 2)
	collectors[0].IsMerger = true
	collectors[0].MergeSampleCollector(sc, collectors[1])
	c.Assert(collectors[0].FMSketch.NDV(), Equals, int64(9280))
	c.Assert(len(collectors[0].Samples), Equals, 1000)
	c.Assert(collectors[0].NullCount, Equals, int64(1000))
	c.Assert(collectors[0].Count, Equals, int64(19000))
	c.Assert(collectors[0].CMSketch.TotalCount(), Equals, uint64(collectors[0].Count))
}
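
// TestCollectorProtoConversion round-trips each SampleCollector through its
// protobuf representation and checks that the counts, sketches, total size,
// and number of samples survive the conversion.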
func (s *testSampleSuite) TestCollectorProtoConversion(c *C) {
	builder := SampleBuilder{
		Sc:              mock.NewContext().GetSessionVars().StmtCtx,
		RecordSet:       s.rs,
		ColLen:          2,
		MaxSampleSize:   10000,
		MaxBucketSize:   256,
		MaxFMSketchSize: 1000,
		CMSketchWidth:   2048,
		CMSketchDepth:   8,
		Collators:       make([]collate.Collator, 2),
		ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong), types.NewFieldType(mysql.TypeLonglong)},
	}
	c.Assert(s.rs.Close(), IsNil)
	collectors, pkBuilder, err := builder.CollectColumnStats()
	c.Assert(err, IsNil)
	c.Assert(pkBuilder, IsNil)
	for _, collector := range collectors {
		p := SampleCollectorToProto(collector)
		s := SampleCollectorFromProto(p)
		c.Assert(collector.Count, Equals, s.Count)
		c.Assert(collector.NullCount, Equals, s.NullCount)
		c.Assert(collector.CMSketch.TotalCount(), Equals, s.CMSketch.TotalCount())
		c.Assert(collector.FMSketch.NDV(), Equals, s.FMSketch.NDV())
		c.Assert(collector.TotalSize, Equals, s.TotalSize)
		c.Assert(len(collector.Samples), Equals, len(s.Samples))
	}
}
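
// recordSetForWeightSamplingTest builds a single-column record set holding the
// consecutive integers [0, size).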
func (s *testSampleSuite) recordSetForWeightSamplingTest(size int) *recordSet {
	r := &recordSet{
		data:  make([]types.Datum, 0, size),
		count: size,
	}
	for i := 0; i < size; i++ {
		r.data = append(r.data, types.NewIntDatum(int64(i)))
	}
	r.setFields(mysql.TypeLonglong)
	return r
}
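
// recordSetForDistributedSamplingTest splits the integers [0, size) evenly
// across `batch` record sets of size/batch rows each.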
func (s *testSampleSuite) recordSetForDistributedSamplingTest(size, batch int) []*recordSet {
	sets := make([]*recordSet, 0, batch)
	batchSize := size / batch
	for i := 0; i < batch; i++ {
		r := &recordSet{
			data:  make([]types.Datum, 0, batchSize),
			count: batchSize,
		}
		for j := 0; j < size/batch; j++ {
			r.data = append(r.data, types.NewIntDatum(int64(j+batchSize*i)))
		}
		r.setFields(mysql.TypeLonglong)
		sets = append(sets, r)
	}
	return sets
}
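
// TestWeightedSampling draws a weighted sample of 20 rows from 100 rows, 1000
// times over, and checks that each value's observed frequency stays within a
// factor of (1+delta) of the expectation sampleNum*loopCnt/rowNum, i.e. inside
// the Chernoff-style bound asserted below.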
func (s *testSampleSuite) TestWeightedSampling(c *C) {
	sampleNum := int64(20)
	rowNum := 100
	loopCnt := 1000
	rs := s.recordSetForWeightSamplingTest(rowNum)
	sc := mock.NewContext().GetSessionVars().StmtCtx
	// The commented-out outer loop is used for stability testing.
	// This test can run 800 times in a row without any failure.
	// for x := 0; x < 800; x++ {
	itemCnt := make([]int, rowNum)
	for loopI := 0; loopI < loopCnt; loopI++ {
		builder := &RowSampleBuilder{
			Sc:              sc,
			RecordSet:       rs,
			ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)},
			Collators:       make([]collate.Collator, 1),
			ColGroups:       nil,
			MaxSampleSize:   int(sampleNum),
			MaxFMSketchSize: 1000,
			Rng:             rand.New(rand.NewSource(time.Now().UnixNano())),
		}
		collector, err := builder.Collect()
		c.Assert(err, IsNil)
		for i := 0; i < collector.MaxSampleSize; i++ {
			a := collector.Samples[i].Columns[0].GetInt64()
			itemCnt[a]++
		}
		c.Assert(rs.Close(), IsNil)
	}
	expFrequency := float64(sampleNum) * float64(loopCnt) / float64(rowNum)
	delta := 0.5
	for _, cnt := range itemCnt {
		if float64(cnt) < expFrequency/(1+delta) || float64(cnt) > expFrequency*(1+delta) {
			c.Assert(false, IsTrue, Commentf("the frequency %v exceeds the Chernoff bound", cnt))
		}
	}
	// }
}
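
// TestDistributedWeightedSampling draws weighted samples from five batches
// independently, merges them into a root collector, and applies the same
// Chernoff-style frequency check to the merged sample.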
func (s *testSampleSuite) TestDistributedWeightedSampling(c *C) {
	sampleNum := int64(10)
	rowNum := 100
	loopCnt := 1500
	batch := 5
	sets := s.recordSetForDistributedSamplingTest(rowNum, batch)
	sc := mock.NewContext().GetSessionVars().StmtCtx
	// The commented-out outer loop is used for stability testing.
	// This test can run 800 times in a row without any failure.
	// for x := 0; x < 800; x++ {
	itemCnt := make([]int, rowNum)
	for loopI := 1; loopI < loopCnt; loopI++ {
		rootRowCollector := &RowSampleCollector{
			NullCount:     make([]int64, 1),
			FMSketches:    make([]*FMSketch, 0, 1),
			TotalSizes:    make([]int64, 1),
			Samples:       make(WeightedRowSampleHeap, 0, sampleNum),
			MaxSampleSize: int(sampleNum),
		}
		rootRowCollector.FMSketches = append(rootRowCollector.FMSketches, NewFMSketch(1000))
		for i := 0; i < batch; i++ {
			builder := &RowSampleBuilder{
				Sc:              sc,
				RecordSet:       sets[i],
				ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)},
				Collators:       make([]collate.Collator, 1),
				ColGroups:       nil,
				MaxSampleSize:   int(sampleNum),
				MaxFMSketchSize: 1000,
				Rng:             rand.New(rand.NewSource(time.Now().UnixNano())),
			}
			collector, err := builder.Collect()
			c.Assert(err, IsNil)
			rootRowCollector.MergeCollector(collector)
			c.Assert(sets[i].Close(), IsNil)
		}
		for _, sample := range rootRowCollector.Samples {
			itemCnt[sample.Columns[0].GetInt64()]++
		}
	}
	expFrequency := float64(sampleNum) * float64(loopCnt) / float64(rowNum)
	delta := 0.5
	for _, cnt := range itemCnt {
		if float64(cnt) < expFrequency/(1+delta) || float64(cnt) > expFrequency*(1+delta) {
			c.Assert(false, IsTrue, Commentf("the frequency %v exceeds the Chernoff bound", cnt))
		}
	}
	// }
}
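
// TestBuildStatsOnRowSample feeds a hand-built sample (the integers 1..1000
// plus extra duplicates of 2, 4, 7, and 11) into BuildHistAndTopN and checks
// the resulting TopN entries and histogram buckets.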
func (s *testSampleSuite) TestBuildStatsOnRowSample(c *C) {
	ctx := mock.NewContext()
	sketch := NewFMSketch(1000)
	data := make([]*SampleItem, 0, 8)
	for i := 1; i <= 1000; i++ {
		d := types.NewIntDatum(int64(i))
		err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
		c.Assert(err, IsNil)
		data = append(data, &SampleItem{Value: d})
	}
	for i := 1; i < 10; i++ {
		d := types.NewIntDatum(int64(2))
		err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
		c.Assert(err, IsNil)
		data = append(data, &SampleItem{Value: d})
	}
	for i := 1; i < 7; i++ {
		d := types.NewIntDatum(int64(4))
		err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
		c.Assert(err, IsNil)
		data = append(data, &SampleItem{Value: d})
	}
	for i := 1; i < 5; i++ {
		d := types.NewIntDatum(int64(7))
		err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
		c.Assert(err, IsNil)
		data = append(data, &SampleItem{Value: d})
	}
	for i := 1; i < 3; i++ {
		d := types.NewIntDatum(int64(11))
		err := sketch.InsertValue(ctx.GetSessionVars().StmtCtx, d)
		c.Assert(err, IsNil)
		data = append(data, &SampleItem{Value: d})
	}
	collector := &SampleCollector{
		Samples:   data,
		NullCount: 0,
		Count:     int64(len(data)),
		FMSketch:  sketch,
		TotalSize: int64(len(data)) * 8,
	}
	tp := types.NewFieldType(mysql.TypeLonglong)
	hist, topN, err := BuildHistAndTopN(ctx, 5, 4, 1, collector, tp, true)
	c.Assert(err, IsNil, Commentf("%+v", err))
	topNStr, err := topN.DecodedString(ctx, []byte{tp.Tp})
	c.Assert(err, IsNil)
	c.Assert(topNStr, Equals, "TopN{length: 4, [(2, 10), (4, 7), (7, 5), (11, 3)]}")
	c.Assert(hist.ToString(0), Equals, "column:1 ndv:1000 totColSize:8168\n"+
		"num: 200 lower_bound: 1 upper_bound: 204 repeats: 1 ndv: 0\n"+
		"num: 200 lower_bound: 205 upper_bound: 404 repeats: 1 ndv: 0\n"+
		"num: 200 lower_bound: 405 upper_bound: 604 repeats: 1 ndv: 0\n"+
		"num: 200 lower_bound: 605 upper_bound: 804 repeats: 1 ndv: 0\n"+
		"num: 196 lower_bound: 805 upper_bound: 1000 repeats: 1 ndv: 0",
	)
}