Files
tidb/statistics/sample_test.go

257 lines
8.4 KiB
Go

// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package statistics
import (
"math/rand"
"time"
. "github.com/pingcap/check"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/tidb/sessionctx/stmtctx"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/collate"
"github.com/pingcap/tidb/util/mock"
"github.com/pingcap/tidb/util/sqlexec"
)
// Register testSampleSuite with the dot-imported check framework.
var _ = Suite(&testSampleSuite{})
// testSampleSuite holds the fixture shared by the sampling tests in this file.
type testSampleSuite struct {
// count is the number of rows in rs; it is assigned in SetUpSuite.
count int
// rs is the record set built in SetUpSuite and handed to the sample builders.
rs sqlexec.RecordSet
}
// SetUpSuite builds the record set shared by the tests of this suite.
//
// The set holds s.count datums. The first nullRows of them are never assigned
// and therefore stay NULL. Every later datum at index i receives the value i,
// plus 1 when its offset from nullRows is a multiple of 3, plus 2 when that
// offset is a multiple of 5, so the data contains some duplicated values.
func (s *testSampleSuite) SetUpSuite(c *C) {
	const nullRows = 1000 // leading datums that are left untouched (NULL)

	s.count = 10000
	fixture := &recordSet{
		data:      make([]types.Datum, s.count),
		count:     s.count,
		cursor:    0,
		firstIsID: true,
	}
	fixture.setFields(mysql.TypeLonglong, mysql.TypeLonglong)

	for idx := nullRows; idx < fixture.count; idx++ {
		val := int64(idx)
		offset := idx - nullRows
		if offset%3 == 0 {
			val++
		}
		if offset%5 == 0 {
			val += 2
		}
		fixture.data[idx].SetInt64(val)
	}
	s.rs = fixture
}
// TestCollectColumnStats runs SampleBuilder.CollectColumnStats over the suite
// record set with one sampled column plus a primary-key builder, then checks
// the row counts, the FM-sketch NDV, the CM-sketch total and the statistics
// accumulated by the primary-key builder.
func (s *testSampleSuite) TestCollectColumnStats(c *C) {
	// Close the shared record set before it is consumed.
	// NOTE(review): presumably this rewinds it after earlier tests - confirm
	// against recordSet.Close.
	c.Assert(s.rs.Close(), IsNil)

	stmtCtx := mock.NewContext().GetSessionVars().StmtCtx
	sampleBuilder := SampleBuilder{
		Sc:              stmtCtx,
		RecordSet:       s.rs,
		ColLen:          1,
		PkBuilder:       NewSortedBuilder(stmtCtx, 256, 1, types.NewFieldType(mysql.TypeLonglong), Version2),
		MaxSampleSize:   10000,
		MaxBucketSize:   256,
		MaxFMSketchSize: 1000,
		CMSketchWidth:   2048,
		CMSketchDepth:   8,
		Collators:       make([]collate.Collator, 1),
		ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)},
	}

	collectors, pkBuilder, err := sampleBuilder.CollectColumnStats()
	c.Assert(err, IsNil)

	// Every row must be accounted for, either as NULL or as a counted value.
	total := int64(s.count)
	c.Assert(collectors[0].NullCount+collectors[0].Count, Equals, total)
	c.Assert(collectors[0].FMSketch.NDV(), Equals, int64(6232))
	c.Assert(collectors[0].CMSketch.TotalCount(), Equals, uint64(collectors[0].Count))

	// The primary-key builder sees one distinct value per row.
	c.Assert(pkBuilder.Count, Equals, total)
	c.Assert(pkBuilder.Hist().NDV, Equals, total)
}
// TestMergeSampleCollector collects two columns from the suite record set,
// merges the second column's collector into the first, and checks the merged
// NDV, sample count, null count, row count and CM-sketch total.
func (s *testSampleSuite) TestMergeSampleCollector(c *C) {
	// Statement context used only for the merge step below.
	mergeCtx := &stmtctx.StatementContext{TimeZone: time.Local}

	longlongCols := []*types.FieldType{
		types.NewFieldType(mysql.TypeLonglong),
		types.NewFieldType(mysql.TypeLonglong),
	}
	sampleBuilder := SampleBuilder{
		Sc:              mock.NewContext().GetSessionVars().StmtCtx,
		RecordSet:       s.rs,
		ColLen:          2,
		MaxSampleSize:   1000,
		MaxBucketSize:   256,
		MaxFMSketchSize: 1000,
		CMSketchWidth:   2048,
		CMSketchDepth:   8,
		Collators:       make([]collate.Collator, 2),
		ColsFieldType:   longlongCols,
	}

	// Close the shared record set before it is consumed.
	// NOTE(review): presumably this rewinds it after earlier tests - confirm.
	c.Assert(s.rs.Close(), IsNil)

	collectors, pkBuilder, err := sampleBuilder.CollectColumnStats()
	c.Assert(err, IsNil)
	// No PkBuilder was configured, so none is expected back.
	c.Assert(pkBuilder, IsNil)
	c.Assert(len(collectors), Equals, 2)

	merged, other := collectors[0], collectors[1]
	merged.IsMerger = true
	merged.MergeSampleCollector(mergeCtx, other)

	c.Assert(merged.FMSketch.NDV(), Equals, int64(9280))
	c.Assert(len(merged.Samples), Equals, 1000)
	c.Assert(merged.NullCount, Equals, int64(1000))
	c.Assert(merged.Count, Equals, int64(19000))
	c.Assert(merged.CMSketch.TotalCount(), Equals, uint64(merged.Count))
}
// TestCollectorProtoConversion round-trips every collected SampleCollector
// through SampleCollectorToProto / SampleCollectorFromProto and checks that
// the counts, sketch summaries, total size and number of samples survive.
func (s *testSampleSuite) TestCollectorProtoConversion(c *C) {
	// Close the shared record set before it is consumed.
	// NOTE(review): presumably this rewinds it after earlier tests - confirm.
	c.Assert(s.rs.Close(), IsNil)

	sampleBuilder := SampleBuilder{
		Sc:              mock.NewContext().GetSessionVars().StmtCtx,
		RecordSet:       s.rs,
		ColLen:          2,
		MaxSampleSize:   10000,
		MaxBucketSize:   256,
		MaxFMSketchSize: 1000,
		CMSketchWidth:   2048,
		CMSketchDepth:   8,
		Collators:       make([]collate.Collator, 2),
		ColsFieldType: []*types.FieldType{
			types.NewFieldType(mysql.TypeLonglong),
			types.NewFieldType(mysql.TypeLonglong),
		},
	}

	collectors, pkBuilder, err := sampleBuilder.CollectColumnStats()
	c.Assert(err, IsNil)
	// No PkBuilder was configured, so none is expected back.
	c.Assert(pkBuilder, IsNil)

	for idx := range collectors {
		original := collectors[idx]
		restored := SampleCollectorFromProto(SampleCollectorToProto(original))

		c.Assert(original.Count, Equals, restored.Count)
		c.Assert(original.NullCount, Equals, restored.NullCount)
		c.Assert(original.CMSketch.TotalCount(), Equals, restored.CMSketch.TotalCount())
		c.Assert(original.FMSketch.NDV(), Equals, restored.FMSketch.NDV())
		c.Assert(original.TotalSize, Equals, restored.TotalSize)
		c.Assert(len(original.Samples), Equals, len(restored.Samples))
	}
}
// recordSetForWeightSamplingTest returns a single-column (TypeLonglong) record
// set with size rows, where row i holds the integer i.
func (s *testSampleSuite) recordSetForWeightSamplingTest(size int) *recordSet {
	rows := make([]types.Datum, size)
	for idx := range rows {
		rows[idx] = types.NewIntDatum(int64(idx))
	}

	set := &recordSet{
		data:  rows,
		count: size,
	}
	set.setFields(mysql.TypeLonglong)
	return set
}
// recordSetForDistributedSamplingTest splits the integers [0, size) into batch
// consecutive single-column (TypeLonglong) record sets of size/batch rows each
// and returns them in order. Rows left over when size is not a multiple of
// batch are not included in any set.
func (s *testSampleSuite) recordSetForDistributedSamplingTest(size, batch int) []*recordSet {
	result := make([]*recordSet, 0, batch)
	perBatch := size / batch
	for b := 0; b < batch; b++ {
		// First value stored in this batch.
		base := perBatch * b
		rows := make([]types.Datum, perBatch)
		for off := range rows {
			rows[off] = types.NewIntDatum(int64(off + base))
		}

		set := &recordSet{
			data:  rows,
			count: perBatch,
		}
		set.setFields(mysql.TypeLonglong)
		result = append(result, set)
	}
	return result
}
// TestWeightedSampling repeatedly draws sampleNum rows out of rowNum with
// RowSampleBuilder and checks that each row is picked roughly equally often:
// its hit count must stay within a factor of (1+delta) of the expected
// frequency sampleNum*loopCnt/rowNum.
func (s *testSampleSuite) TestWeightedSampling(c *C) {
	const (
		rowNum  = 100
		loopCnt = 1000
	)
	sampleNum := int64(20)
	source := s.recordSetForWeightSamplingTest(rowNum)
	stmtCtx := mock.NewContext().GetSessionVars().StmtCtx

	// For a stability check, wrap everything below in
	//   for x := 0; x < 800; x++ { ... }
	// This test can run 800 times in a row without any failure.
	hits := make([]int, rowNum)
	for round := 0; round < loopCnt; round++ {
		// A freshly seeded generator is used for every round.
		rng := rand.New(rand.NewSource(time.Now().UnixNano()))
		sampler := &RowSampleBuilder{
			Sc:              stmtCtx,
			RecordSet:       source,
			ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)},
			Collators:       make([]collate.Collator, 1),
			ColGroups:       nil,
			MaxSampleSize:   int(sampleNum),
			MaxFMSketchSize: 1000,
			Rng:             rng,
		}
		collector, err := sampler.Collect()
		c.Assert(err, IsNil)

		// Tally which rows ended up in the sample; the sampled value doubles
		// as the row's index.
		for k := 0; k < collector.MaxSampleSize; k++ {
			picked := collector.Samples[k].Columns[0].GetInt64()
			hits[picked]++
		}
		// NOTE(review): presumably rewinds the record set for the next round - confirm.
		c.Assert(source.Close(), IsNil)
	}

	expFrequency := float64(sampleNum) * float64(loopCnt) / float64(rowNum)
	delta := 0.5
	lowest := expFrequency / (1 + delta)
	highest := expFrequency * (1 + delta)
	for _, cnt := range hits {
		observed := float64(cnt)
		if observed < lowest || observed > highest {
			c.Assert(false, IsTrue, Commentf("The frequency %v is exceed the Chernoff Bound", cnt))
		}
	}
}
// TestDistributedWeightedSampling checks weighted sampling when the rows are
// spread over several record sets: each set is sampled independently with
// RowSampleBuilder, the per-set collectors are merged into a root collector,
// and the rows surviving in the root sample are tallied. Over loopCnt rounds
// every row must be picked within a factor of (1+delta) of the expected
// frequency sampleNum*loopCnt/rowNum.
func (s *testSampleSuite) TestDistributedWeightedSampling(c *C) {
	sampleNum := int64(10)
	rowNum := 100
	loopCnt := 1500
	batch := 5
	sets := s.recordSetForDistributedSamplingTest(rowNum, batch)
	sc := mock.NewContext().GetSessionVars().StmtCtx
	// The loop which is commented out is used for stability test.
	// This test can run 800 times in a row without any failure.
	// for x := 0; x < 800; x++ {
	itemCnt := make([]int, rowNum)
	// Run exactly loopCnt rounds so the tallies match expFrequency below,
	// which is computed from loopCnt. Starting the counter at 1 would run
	// only loopCnt-1 rounds.
	for loopI := 0; loopI < loopCnt; loopI++ {
		rootRowCollector := &RowSampleCollector{
			NullCount:     make([]int64, 1),
			FMSketches:    make([]*FMSketch, 0, 1),
			TotalSizes:    make([]int64, 1),
			Samples:       make(WeightedRowSampleHeap, 0, sampleNum),
			MaxSampleSize: int(sampleNum),
		}
		rootRowCollector.FMSketches = append(rootRowCollector.FMSketches, NewFMSketch(1000))
		for i := 0; i < batch; i++ {
			builder := &RowSampleBuilder{
				Sc:              sc,
				RecordSet:       sets[i],
				ColsFieldType:   []*types.FieldType{types.NewFieldType(mysql.TypeLonglong)},
				Collators:       make([]collate.Collator, 1),
				ColGroups:       nil,
				MaxSampleSize:   int(sampleNum),
				MaxFMSketchSize: 1000,
				// A freshly seeded generator is used for every batch.
				Rng: rand.New(rand.NewSource(time.Now().UnixNano())),
			}
			collector, err := builder.Collect()
			c.Assert(err, IsNil)
			rootRowCollector.MergeCollector(collector)
			// NOTE(review): presumably rewinds the set for the next round - confirm.
			c.Assert(sets[i].Close(), IsNil)
		}
		// The sampled value doubles as the row's index in itemCnt.
		for _, sample := range rootRowCollector.Samples {
			itemCnt[sample.Columns[0].GetInt64()]++
		}
	}
	expFrequency := float64(sampleNum) * float64(loopCnt) / float64(rowNum)
	delta := 0.5
	for _, cnt := range itemCnt {
		if float64(cnt) < expFrequency/(1+delta) || float64(cnt) > expFrequency*(1+delta) {
			c.Assert(false, IsTrue, Commentf("the frequency %v is exceed the Chernoff Bound", cnt))
		}
	}
	// }
}