Files
tidb/statistics/cmsketch_test.go
2022-04-26 19:28:51 +08:00

309 lines
8.6 KiB
Go

// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package statistics
import (
"fmt"
"math"
"math/rand"
"testing"
"time"
"github.com/pingcap/errors"
"github.com/pingcap/tidb/parser/mysql"
"github.com/pingcap/tidb/sessionctx/stmtctx"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/codec"
"github.com/stretchr/testify/require"
)
// insert encodes val with codec.EncodeValue and feeds the resulting bytes to
// InsertBytes. If encoding fails, the error is returned wrapped by
// errors.Trace and nothing is inserted.
func (c *CMSketch) insert(val *types.Datum) error {
	encoded, encodeErr := codec.EncodeValue(nil, nil, *val)
	if encodeErr == nil {
		c.InsertBytes(encoded)
		return nil
	}
	return errors.Trace(encodeErr)
}
// prepareCMSAndTopN encodes every datum in vals with codec.EncodeValue and
// passes the encoded samples, together with d, w, n and total, to
// NewCMSketchAndTopN. It returns the sketch and TopN produced by that call;
// the two remaining results of NewCMSketchAndTopN are discarded. An encoding
// failure aborts early and is returned wrapped by errors.Trace.
func prepareCMSAndTopN(d, w int32, vals []*types.Datum, n uint32, total uint64) (*CMSketch, *TopN, error) {
	samples := make([][]byte, 0, len(vals))
	for i := range vals {
		encoded, err := codec.EncodeValue(nil, nil, *vals[i])
		if err != nil {
			return nil, nil, errors.Trace(err)
		}
		samples = append(samples, encoded)
	}
	sketch, top, _, _ := NewCMSketchAndTopN(d, w, samples, n, total)
	return sketch, top, nil
}
// buildCMSketchAndMapWithOffset creates a CM sketch of depth d and width w and
// inserts `total` values drawn from a Zipf generator (seeded with `seed`,
// parameters s, 1 and imax), each shifted by `offset`. Alongside the sketch it
// returns a map from every inserted value to the number of times it was
// inserted. The first insertion error stops the build and is returned.
func buildCMSketchAndMapWithOffset(d, w int32, seed int64, total, imax uint64, s float64, offset int64) (*CMSketch, map[int64]uint32, error) {
	sketch := NewCMSketch(d, w)
	counts := make(map[int64]uint32)
	gen := rand.NewZipf(rand.New(rand.NewSource(seed)), s, 1, imax)
	for remaining := total; remaining > 0; remaining-- {
		datum := types.NewIntDatum(int64(gen.Uint64()) + offset)
		if err := sketch.insert(&datum); err != nil {
			return nil, nil, errors.Trace(err)
		}
		counts[datum.GetInt64()]++
	}
	return sketch, counts, nil
}
// buildCMSketchAndMap is buildCMSketchAndMapWithOffset with an offset of zero,
// i.e. the generated values are inserted unshifted.
func buildCMSketchAndMap(d, w int32, seed int64, total, imax uint64, s float64) (*CMSketch, map[int64]uint32, error) {
	const noOffset int64 = 0
	return buildCMSketchAndMapWithOffset(d, w, seed, total, imax, s, noOffset)
}
// buildCMSketchTopNAndMap draws `total` values from a Zipf generator (seeded
// with `seed`, parameters s, 1 and imax) and counts every one of them in the
// returned map. Only the first `sample` generated values are kept as samples;
// those samples are handed to prepareCMSAndTopN (with n as the TopN size and
// `total` as the row count) to build the returned sketch and TopN.
func buildCMSketchTopNAndMap(d, w, n, sample int32, seed int64, total, imax uint64, s float64) (*CMSketch, *TopN, map[int64]uint32, error) {
	counts := make(map[int64]uint32)
	gen := rand.NewZipf(rand.New(rand.NewSource(seed)), s, 1, imax)
	sampleLimit := uint64(sample)
	sampled := make([]*types.Datum, 0)
	for i := uint64(0); i < total; i++ {
		datum := types.NewIntDatum(int64(gen.Uint64()))
		counts[datum.GetInt64()]++
		if i >= sampleLimit {
			// Past the sampling window: the value is counted but not sampled.
			continue
		}
		sampled = append(sampled, &datum)
	}
	sketch, top, err := prepareCMSAndTopN(d, w, sampled, uint32(n), total)
	return sketch, top, counts, err
}
// averageAbsoluteError returns the mean absolute difference between the true
// counts in mp and the estimates that queryValue reports for the same keys
// using cms and topN. The mean is computed with integer division over the
// number of distinct keys in mp.
//
// An empty mp yields (0, nil): there is nothing to compare, and dividing by
// len(mp) would otherwise panic with an integer divide by zero.
//
// Any error from queryValue stops the computation and is returned wrapped by
// errors.Trace.
func averageAbsoluteError(cms *CMSketch, topN *TopN, mp map[int64]uint32) (uint64, error) {
	if len(mp) == 0 {
		return 0, nil
	}
	sc := &stmtctx.StatementContext{TimeZone: time.Local}
	var total uint64
	for num, count := range mp {
		estimate, err := queryValue(sc, cms, topN, types.NewIntDatum(num))
		if err != nil {
			return 0, errors.Trace(err)
		}
		// Both operands are unsigned, so subtract the smaller from the larger
		// to avoid wrap-around.
		actual := uint64(count)
		if actual > estimate {
			total += actual - estimate
		} else {
			total += estimate - actual
		}
	}
	return total / uint64(len(mp)), nil
}
// TestCMSketch builds two sketches per Zipf factor (seeds 0 and 1), checks
// that each one's average absolute error stays within the case's bound, then
// merges the second into the first and checks that the merged sketch's average
// error stays below twice that bound.
func TestCMSketch(t *testing.T) {
	const (
		depth, width = int32(5), int32(2048)
		total, imax  = uint64(100000), uint64(1000000)
	)
	cases := []struct {
		zipfFactor float64
		avgError   uint64
	}{
		{zipfFactor: 1.1, avgError: 3},
		{zipfFactor: 2, avgError: 24},
		{zipfFactor: 3, avgError: 63},
	}
	for _, tc := range cases {
		// Left sketch, seed 0.
		left, leftCounts, err := buildCMSketchAndMap(depth, width, 0, total, imax, tc.zipfFactor)
		require.NoError(t, err)
		leftAvg, err := averageAbsoluteError(left, nil, leftCounts)
		require.NoError(t, err)
		require.LessOrEqual(t, leftAvg, tc.avgError)

		// Right sketch, seed 1.
		right, rightCounts, err := buildCMSketchAndMap(depth, width, 1, total, imax, tc.zipfFactor)
		require.NoError(t, err)
		rightAvg, err := averageAbsoluteError(right, nil, rightCounts)
		require.NoError(t, err)
		require.LessOrEqual(t, rightAvg, tc.avgError)

		// Merge right into left, and fold the true counts together to match.
		require.NoError(t, left.MergeCMSketch(right))
		for val, count := range rightCounts {
			leftCounts[val] += count
		}
		mergedAvg, err := averageAbsoluteError(left, nil, leftCounts)
		require.NoError(t, err)
		require.Less(t, mergedAvg, tc.avgError*2)
	}
}
// TestCMSketchCoding fills a 5x2048 sketch with math.MaxUint32 in every
// counter, encodes it with EncodeCMSketchWithoutTopN, checks the encoded
// length, and verifies that decoding yields a sketch equal to the original.
func TestCMSketchCoding(t *testing.T) {
	original := NewCMSketch(5, 2048)
	original.count = 2048 * math.MaxUint32
	for row := range original.table {
		for col := range original.table[row] {
			original.table[row][col] = math.MaxUint32
		}
	}

	encoded, err := EncodeCMSketchWithoutTopN(original)
	require.NoError(t, err)
	require.Len(t, encoded, 61457)

	decoded, _, err := DecodeCMSketchAndTopN(encoded, nil)
	require.NoError(t, err)
	require.True(t, original.Equal(decoded))
}
// TestCMSketchTopN builds a sketch plus TopN from the first 1000 of 1000000
// Zipf-distributed values for several Zipf factors, and checks that the TopN
// holds at most 40 entries and that the average absolute error stays within
// the bound of each case.
func TestCMSketchTopN(t *testing.T) {
	const (
		depth, width = int32(5), int32(2048)
		total, imax  = uint64(1000000), uint64(1000000)
	)
	cases := []struct {
		zipfFactor float64
		avgError   uint64
	}{
		// Without clearly dominant values, TopN may produce worse results than
		// the plain algorithm. The first two cases yield almost the same average.
		{zipfFactor: 1.0000001, avgError: 30},
		{zipfFactor: 1.1, avgError: 30},
		{zipfFactor: 2, avgError: 89},
		// When most of the data lies in a narrow range, the guess may be better.
		// The error mainly comes from huge numbers.
		{zipfFactor: 5, avgError: 208},
	}
	for _, tc := range cases {
		sketch, top, counts, err := buildCMSketchTopNAndMap(depth, width, 20, 1000, 0, total, imax, tc.zipfFactor)
		require.NoError(t, err)
		require.LessOrEqual(t, len(top.TopN), 40)

		avgErr, err := averageAbsoluteError(sketch, top, counts)
		require.NoError(t, err)
		require.LessOrEqual(t, avgErr, tc.avgError)
	}
}
// TestMergeCMSketch4IncrementalAnalyze builds one sketch from unshifted values
// (seed 0) and one from values shifted by imax (seed 1), checks each against
// the case's error bound, merges the second into the first with
// MergeCMSketch4IncrementalAnalyze, and checks that the merged sketch still
// meets the same bound and keeps its width of 2048 and depth of 5.
func TestMergeCMSketch4IncrementalAnalyze(t *testing.T) {
	const (
		depth, width = int32(5), int32(2048)
		total, imax  = uint64(100000), uint64(1000000)
	)
	cases := []struct {
		zipfFactor float64
		avgError   uint64
	}{
		{zipfFactor: 1.0000001, avgError: 48},
		{zipfFactor: 1.1, avgError: 48},
		{zipfFactor: 2, avgError: 128},
		{zipfFactor: 5, avgError: 256},
	}
	for _, tc := range cases {
		// Base sketch, seed 0, values unshifted.
		left, leftCounts, err := buildCMSketchAndMap(depth, width, 0, total, imax, tc.zipfFactor)
		require.NoError(t, err)
		leftAvg, err := averageAbsoluteError(left, nil, leftCounts)
		require.NoError(t, err)
		require.LessOrEqual(t, leftAvg, tc.avgError)

		// Incremental sketch, seed 1, every value shifted by imax.
		right, rightCounts, err := buildCMSketchAndMapWithOffset(depth, width, 1, total, imax, tc.zipfFactor, int64(imax))
		require.NoError(t, err)
		rightAvg, err := averageAbsoluteError(right, nil, rightCounts)
		require.NoError(t, err)
		require.LessOrEqual(t, rightAvg, tc.avgError)

		// Fold the true counts together, then merge the sketches.
		for key, cnt := range rightCounts {
			leftCounts[key] += cnt
		}
		require.NoError(t, left.MergeCMSketch4IncrementalAnalyze(right, 0))
		mergedAvg, err := averageAbsoluteError(left, nil, leftCounts)
		require.NoError(t, err)
		require.LessOrEqual(t, mergedAvg, tc.avgError)

		gotWidth, gotDepth := left.GetWidthAndDepth()
		require.Equal(t, width, gotWidth)
		require.Equal(t, depth, gotDepth)
	}
}
// TestCMSketchTopNUniqueData uses 1000000 distinct integers (each occurring
// once), samples the first 1000 of them, and builds a sketch and TopN from the
// samples. It expects the sketch's defaultValue to be 1, the average absolute
// error to be 0, and no TopN to be produced.
func TestCMSketchTopNUniqueData(t *testing.T) {
	const (
		depth, width = int32(5), int32(2048)
		total        = uint64(1000000)
		sampleSize   = uint64(1000)
	)
	counts := make(map[int64]uint32)
	samples := make([]*types.Datum, 0)
	for i := uint64(0); i < total; i++ {
		datum := types.NewIntDatum(int64(i))
		counts[datum.GetInt64()]++
		if i >= sampleSize {
			// Counted in the ground truth but not part of the sample.
			continue
		}
		samples = append(samples, &datum)
	}

	sketch, top, err := prepareCMSAndTopN(depth, width, samples, uint32(20), total)
	require.NoError(t, err)
	avgErr, err := averageAbsoluteError(sketch, top, counts)
	require.NoError(t, err)
	require.Equal(t, uint64(1), sketch.defaultValue)
	require.Equal(t, uint64(0), avgErr)
	require.Nil(t, top)
}
// TestCMSketchCodingTopN fills a 5x2048 sketch with math.MaxUint32 in every
// counter, prepares 20 chunk rows each holding a 20000-byte key and a
// math.MaxUint64 count, and checks that encoding the sketch without TopN and
// decoding it together with those rows yields a sketch equal to the original.
// It also checks that decoding an empty byte slice with the rows returns no
// error.
func TestCMSketchCodingTopN(t *testing.T) {
	const topNCount = 20

	original := NewCMSketch(5, 2048)
	original.count = 2048 * (math.MaxUint32)
	for row := range original.table {
		for col := range original.table[row] {
			original.table[row][col] = math.MaxUint32
		}
	}

	// NOTE: metas is populated below but is not read again within this test.
	metas := make([]TopNMeta, topNCount)
	unsignedLong := types.NewFieldType(mysql.TypeLonglong)
	unsignedLong.AddFlag(mysql.UnsignedFlag)
	fieldTypes := []*types.FieldType{types.NewFieldType(mysql.TypeBlob), unsignedLong}
	chk := chunk.New(fieldTypes, topNCount, topNCount)
	var rows []chunk.Row
	for idx := 0; idx < topNCount; idx++ {
		key := []byte(fmt.Sprintf("%20000d", idx))
		metas[idx] = TopNMeta{key, math.MaxUint64}
		chk.AppendBytes(0, key)
		chk.AppendUint64(1, math.MaxUint64)
		rows = append(rows, chk.GetRow(idx))
	}

	encoded, err := EncodeCMSketchWithoutTopN(original)
	require.NoError(t, err)
	require.Len(t, encoded, 61457)

	decoded, _, err := DecodeCMSketchAndTopN(encoded, rows)
	require.NoError(t, err)
	require.True(t, original.Equal(decoded))

	// Decoding empty input must not panic.
	_, _, err = DecodeCMSketchAndTopN([]byte{}, rows)
	require.NoError(t, err)
}