tidb/pkg/executor/importer/sampler.go


// Copyright 2025 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package importer

import (
	"context"
	goerrors "errors"
	"math/rand"
	"time"

	"github.com/docker/go-units"
	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/pkg/lightning/backend/kv"
	"github.com/pingcap/tidb/pkg/lightning/checkpoints"
	"github.com/pingcap/tidb/pkg/lightning/mydump"
	verify "github.com/pingcap/tidb/pkg/lightning/verification"
	"github.com/pingcap/tidb/pkg/table/tables"
	"go.uber.org/zap"
)

const (
	// maxSampleFileCount is the maximum number of files to sample.
	maxSampleFileCount = 3
	// totalSampleRowCount is the total number of rows to sample.
	// we want to sample about 30 rows in total: if we have 3 files, we sample
	// 10 rows per file; if we have fewer files, we sample more rows per file.
	totalSampleRowCount = maxSampleFileCount * 10
)

var (
	// maxSampleFileSize is the maximum file size to sample.
	// if we sample maxSampleFileCount files and each row is >= 1MiB, we read
	// at most 10 rows per file, but the index KV size ratio of such long rows
	// is quite small even with a large number of indices: say each index KV
	// is 3KiB and we have 100 indices, that is about 300KiB of index KV per
	// row, so the ratio is about 0.3.
	// so even if we sample fewer rows due to very long rows, the result is
	// still good enough for our resource-parameter calculation.
	// if the total file count < maxSampleFileCount, the total file size is
	// small, and the accuracy of the ratio is not that important.
	maxSampleFileSize int64 = 10 * units.MiB
	stopIterErr             = goerrors.New("stop iteration")
)
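
// sampleIndexSizeRatio estimates the ratio of index KV size to data KV size
// by encoding a small sample of rows from up to maxSampleFileCount randomly
// chosen data files. it returns 0 when there are no data files or no data
// KVs were produced.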
func (e *LoadDataController) sampleIndexSizeRatio(
	ctx context.Context,
	ksCodec []byte,
) (float64, error) {
	if len(e.dataFiles) == 0 {
		return 0, nil
	}
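	// pick up to maxSampleFileCount data files uniformly at random via a
	// permutation of the file indices.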
	perm := rand.Perm(len(e.dataFiles))
	files := make([]*mydump.SourceFileMeta, min(len(e.dataFiles), maxSampleFileCount))
	for i := range files {
		files[i] = e.dataFiles[perm[i]]
	}
	rowsPerFile := totalSampleRowCount / len(files)
	var (
		totalDataKVSize, totalIndexKVSize uint64
		firstErr                          error
	)
	for _, file := range files {
		dataKVSize, indexKVSize, err := e.sampleIndexRatioForOneFile(ctx, file, ksCodec, rowsPerFile)
		if firstErr == nil {
			firstErr = err
		}
		totalDataKVSize += dataKVSize
		totalIndexKVSize += indexKVSize
	}
	if totalDataKVSize == 0 {
		return 0, firstErr
	}
	return float64(totalIndexKVSize) / float64(totalDataKVSize), firstErr
}
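
// sampleIndexRatioForOneFile encodes rows from the head of a single data
// file, stopping after maxRowCount encoded batches or maxSampleFileSize
// bytes, and returns the accumulated data KV and index KV sizes from the
// group checksum.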
func (e *LoadDataController) sampleIndexRatioForOneFile(
	ctx context.Context,
	file *mydump.SourceFileMeta,
	ksCodec []byte,
	maxRowCount int,
) (dataKVSize, indexKVSize uint64, err error) {
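	// only sample the head of the file: EndOffset caps how many bytes the
	// parser will read.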
	chunk := &checkpoints.ChunkCheckpoint{
		Key:       checkpoints.ChunkCheckpointKey{Path: file.Path},
		FileMeta:  *file,
		Chunk:     mydump.Chunk{EndOffset: maxSampleFileSize},
		Timestamp: time.Now().Unix(),
	}
	idAlloc := kv.NewPanickingAllocators(e.Table.Meta().SepAutoInc())
	tbl, err := tables.TableFromMeta(idAlloc, e.Table.Meta())
	if err != nil {
		return 0, 0, errors.Annotatef(err, "failed to tables.TableFromMeta %s", e.Table.Meta().Name)
	}
	encoder, err := e.getKVEncoder(e.logger, chunk, tbl)
	if err != nil {
		return 0, 0, err
	}
	defer func() {
		if err2 := encoder.Close(); err2 != nil {
			e.logger.Warn("close encoder failed", zap.Error(err2))
		}
	}()
	parser, err := e.getParser(ctx, chunk)
	if err != nil {
		return 0, 0, err
	}
	defer func() {
		if err2 := parser.Close(); err2 != nil {
			e.logger.Warn("close parser failed", zap.Error(err2))
		}
	}()
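	// sendFn does not persist the encoded KVs anywhere; it only counts the
	// batches and returns stopIterErr once we have sampled enough.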
	var count int
	sendFn := func(context.Context, *encodedKVGroupBatch) error {
		count++
		if count >= maxRowCount {
			return stopIterErr
		}
		return nil
	}
	chunkEnc := &chunkEncoder{
		chunkName:     chunk.GetKey(),
		readFn:        parserEncodeReader(parser, chunk.Chunk.EndOffset, chunk.GetKey()),
		sendFn:        sendFn,
		encoder:       encoder,
		keyspace:      ksCodec,
		groupChecksum: verify.NewKVGroupChecksumWithKeyspace(ksCodec),
	}
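	// encodeLoop returns stopIterErr once sendFn has seen enough batches;
	// that is the expected way to stop sampling, not a real failure.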
	err = chunkEnc.encodeLoop(ctx)
	if goerrors.Is(err, stopIterErr) {
		err = nil
	}
	dataKVSize, indexKVSize = chunkEnc.groupChecksum.DataAndIndexSumSize()
	return dataKVSize, indexKVSize, err
}
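
// A hypothetical caller, for illustration only (totalDataSize and logger are
// assumptions, not part of this file): the sampled ratio can feed the
// resource-parameter calculation before an import, e.g.
//
//	ratio, err := e.sampleIndexSizeRatio(ctx, ksCodec)
//	if err != nil {
//		logger.Warn("sample index KV size ratio failed", zap.Error(err))
//	}
//	// estimate the total index KV size from the total source data size.
//	estimatedIndexKVSize := uint64(float64(totalDataSize) * ratio)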