tidb/pkg/ttl/ttlworker/task_manager_integration_test.go

// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package ttlworker_test

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/pingcap/tidb/pkg/infoschema"
	"github.com/pingcap/tidb/pkg/kv"
	"github.com/pingcap/tidb/pkg/parser/model"
	"github.com/pingcap/tidb/pkg/store/mockstore"
	"github.com/pingcap/tidb/pkg/testkit"
	"github.com/pingcap/tidb/pkg/ttl/cache"
	"github.com/pingcap/tidb/pkg/ttl/metrics"
	"github.com/pingcap/tidb/pkg/ttl/ttlworker"
	"github.com/pingcap/tidb/pkg/util/logutil"
	dto "github.com/prometheus/client_model/go"
	"github.com/stretchr/testify/require"
	"github.com/tikv/client-go/v2/testutils"
	"go.uber.org/atomic"
	"go.uber.org/zap"
)
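
// TestParallelLockNewTask checks that when several task managers try to lock
// the same scan task concurrently, exactly one of them succeeds.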
func TestParallelLockNewTask(t *testing.T) {
	store := testkit.CreateMockStore(t)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("set global tidb_ttl_running_tasks = 1000")
	ctx := kv.WithInternalSourceType(context.Background(), kv.InternalTxnTTL)

	tk.MustExec("create table test.t (id int, created_at datetime) TTL= created_at + interval 1 hour")
	testTable, err := tk.Session().GetDomainInfoSchema().(infoschema.InfoSchema).TableByName(context.Background(), model.NewCIStr("test"), model.NewCIStr("t"))
	require.NoError(t, err)

	sessionFactory := sessionFactory(t, store)
	se := sessionFactory()
	now := se.Now()

	isc := cache.NewInfoSchemaCache(time.Minute)
	require.NoError(t, isc.Update(se))
	m := ttlworker.NewTaskManager(context.Background(), nil, isc, "test-id", store)

	// insert and lock a new task
	sql, args, err := cache.InsertIntoTTLTask(tk.Session(), "test-job", testTable.Meta().ID, 1, nil, nil, now, now)
	require.NoError(t, err)
	_, err = tk.Session().ExecuteInternal(ctx, sql, args...)
	require.NoError(t, err)
	_, err = m.LockScanTask(se, &cache.TTLTask{
		ScanID:  1,
		JobID:   "test-job",
		TableID: testTable.Meta().ID,
	}, now)
	require.NoError(t, err)
	tk.MustExec("DELETE FROM mysql.tidb_ttl_task")

	// lock one table in parallel, only one of them should lock successfully
	testTimes := 100
	concurrency := 5
	for i := 0; i < testTimes; i++ {
		sql, args, err := cache.InsertIntoTTLTask(tk.Session(), "test-job", testTable.Meta().ID, 1, nil, nil, now, now)
		require.NoError(t, err)
		_, err = tk.Session().ExecuteInternal(ctx, sql, args...)
		require.NoError(t, err)

		successCounter := atomic.NewUint64(0)
		now = now.Add(time.Hour * 48)

		wg := sync.WaitGroup{}
		for j := 0; j < concurrency; j++ {
			scanManagerID := fmt.Sprintf("test-ttl-manager-%d", j)
			wg.Add(1)
			go func() {
				se := sessionFactory()

				isc := cache.NewInfoSchemaCache(time.Minute)
				require.NoError(t, isc.Update(se))
				m := ttlworker.NewTaskManager(context.Background(), nil, isc, scanManagerID, store)

				_, err := m.LockScanTask(se, &cache.TTLTask{
					ScanID:  1,
					JobID:   "test-job",
					TableID: testTable.Meta().ID,
				}, now)
				if err == nil {
					successCounter.Add(1)
				} else {
					logutil.BgLogger().Info("lock new task with error", zap.Error(err))
				}
				wg.Done()
			}()
		}
		wg.Wait()

		require.Equal(t, uint64(1), successCounter.Load())
		tk.MustExec("DELETE FROM mysql.tidb_ttl_task")
	}
}
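
// TestParallelSchedule checks that 16 tasks can be scheduled across 4 task
// managers in a single reschedule round, 4 tasks per manager, and that each
// manager can finish the tasks it owns.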
func TestParallelSchedule(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	waitAndStopTTLManager(t, dom)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("set global tidb_ttl_running_tasks = 1000")
	sessionFactory := sessionFactory(t, store)
	tk.MustExec("create table test.t(id int, created_at datetime) ttl=created_at + interval 1 day")
	table, err := dom.InfoSchema().TableByName(context.Background(), model.NewCIStr("test"), model.NewCIStr("t"))
	require.NoError(t, err)
	// 16 tasks and 16 scan workers (in 4 task managers) should be able to be scheduled in a single "reschedule"
	for i := 0; i < 16; i++ {
		sql := fmt.Sprintf("insert into mysql.tidb_ttl_task(job_id,table_id,scan_id,expire_time,created_time) values ('test-job', %d, %d, NOW(), NOW())", table.Meta().ID, i)
		tk.MustExec(sql)
	}
	isc := cache.NewInfoSchemaCache(time.Second)
	require.NoError(t, isc.Update(sessionFactory()))

	scheduleWg := sync.WaitGroup{}
	finishTasks := make([]func(), 0, 4)
	for i := 0; i < 4; i++ {
		workers := []ttlworker.Worker{}
		for j := 0; j < 4; j++ {
			scanWorker := ttlworker.NewMockScanWorker(t)
			scanWorker.Start()
			workers = append(workers, scanWorker)
		}

		managerID := fmt.Sprintf("task-manager-%d", i)
		m := ttlworker.NewTaskManager(context.Background(), nil, isc, managerID, store)
		m.SetScanWorkers4Test(workers)
		scheduleWg.Add(1)
		go func() {
			se := sessionFactory()
			m.RescheduleTasks(se, se.Now())
			scheduleWg.Done()
		}()
		finishTasks = append(finishTasks, func() {
			se := sessionFactory()
			for _, task := range m.GetRunningTasks() {
				require.Nil(t, task.Context().Err(), fmt.Sprintf("%s %d", managerID, task.ScanID))
				task.SetResult(nil)
				m.CheckFinishedTask(se, se.Now())
				require.NotNil(t, task.Context().Err(), fmt.Sprintf("%s %d", managerID, task.ScanID))
			}
		})
	}
	scheduleWg.Wait()

	// all tasks should have been scheduled
	tk.MustQuery("select count(1) from mysql.tidb_ttl_task where status = 'running'").Check(testkit.Rows("16"))
	for i := 0; i < 4; i++ {
		sql := fmt.Sprintf("select count(1) from mysql.tidb_ttl_task where status = 'running' AND owner_id = 'task-manager-%d'", i)
		tk.MustQuery(sql).Check(testkit.Rows("4"))
		finishTasks[i]()
		sql = fmt.Sprintf("select count(1) from mysql.tidb_ttl_task where status = 'finished' AND owner_id = 'task-manager-%d'", i)
		tk.MustQuery(sql).Check(testkit.Rows("4"))
	}
}
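
// TestTaskScheduleExpireHeartBeat checks that a running task whose heartbeat
// has expired can be taken over by another task manager, and that a finished
// task is not taken over again.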
func TestTaskScheduleExpireHeartBeat(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	waitAndStopTTLManager(t, dom)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("set global tidb_ttl_running_tasks = 1000")
	sessionFactory := sessionFactory(t, store)

	// create table and scan task
	tk.MustExec("create table test.t(id int, created_at datetime) ttl=created_at + interval 1 day")
	table, err := dom.InfoSchema().TableByName(context.Background(), model.NewCIStr("test"), model.NewCIStr("t"))
	require.NoError(t, err)
	sql := fmt.Sprintf("insert into mysql.tidb_ttl_task(job_id,table_id,scan_id,expire_time,created_time) values ('test-job', %d, %d, NOW(), NOW())", table.Meta().ID, 1)
	tk.MustExec(sql)

	// update the infoschema cache
	isc := cache.NewInfoSchemaCache(time.Second)
	require.NoError(t, isc.Update(sessionFactory()))

	// schedule in a task manager
	scanWorker := ttlworker.NewMockScanWorker(t)
	scanWorker.Start()
	m := ttlworker.NewTaskManager(context.Background(), nil, isc, "task-manager-1", store)
	m.SetScanWorkers4Test([]ttlworker.Worker{scanWorker})
	se := sessionFactory()
	now := se.Now()
	m.RescheduleTasks(se, now)
	tk.MustQuery("select status,owner_id from mysql.tidb_ttl_task").Check(testkit.Rows("running task-manager-1"))

	// another task manager should fetch this task after its heartbeat expires
	scanWorker2 := ttlworker.NewMockScanWorker(t)
	scanWorker2.Start()
	m2 := ttlworker.NewTaskManager(context.Background(), nil, isc, "task-manager-2", store)
	m2.SetScanWorkers4Test([]ttlworker.Worker{scanWorker2})
	m2.RescheduleTasks(sessionFactory(), now.Add(time.Hour))
	tk.MustQuery("select status,owner_id from mysql.tidb_ttl_task").Check(testkit.Rows("running task-manager-2"))

	// another task manager shouldn't fetch this task once it has finished
	task := m2.GetRunningTasks()[0]
	task.SetResult(nil)
	m2.CheckFinishedTask(sessionFactory(), now)
	scanWorker3 := ttlworker.NewMockScanWorker(t)
	scanWorker3.Start()
	m3 := ttlworker.NewTaskManager(context.Background(), nil, isc, "task-manager-3", store)
	m3.SetScanWorkers4Test([]ttlworker.Worker{scanWorker3})
	m3.RescheduleTasks(sessionFactory(), now.Add(time.Hour))
	tk.MustQuery("select status,owner_id from mysql.tidb_ttl_task").Check(testkit.Rows("finished task-manager-2"))
}
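
// TestTaskMetrics checks that ReportMetrics reflects a scheduled running task
// in the DeletingTaskCnt gauge.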
func TestTaskMetrics(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	waitAndStopTTLManager(t, dom)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("set global tidb_ttl_running_tasks = 1000")
	sessionFactory := sessionFactory(t, store)

	// create table and scan task
	tk.MustExec("create table test.t(id int, created_at datetime) ttl=created_at + interval 1 day")
	table, err := dom.InfoSchema().TableByName(context.Background(), model.NewCIStr("test"), model.NewCIStr("t"))
	require.NoError(t, err)
	sql := fmt.Sprintf("insert into mysql.tidb_ttl_task(job_id,table_id,scan_id,expire_time,created_time) values ('test-job', %d, %d, NOW(), NOW())", table.Meta().ID, 1)
	tk.MustExec(sql)

	// update the infoschema cache
	isc := cache.NewInfoSchemaCache(time.Second)
	require.NoError(t, isc.Update(sessionFactory()))

	// schedule in a task manager
	scanWorker := ttlworker.NewMockScanWorker(t)
	scanWorker.Start()
	m := ttlworker.NewTaskManager(context.Background(), nil, isc, "task-manager-1", store)
	m.SetScanWorkers4Test([]ttlworker.Worker{scanWorker})
	se := sessionFactory()
	now := se.Now()
	m.RescheduleTasks(sessionFactory(), now)
	tk.MustQuery("select status,owner_id from mysql.tidb_ttl_task").Check(testkit.Rows("running task-manager-1"))

	m.ReportMetrics()
	out := &dto.Metric{}
	require.NoError(t, metrics.DeletingTaskCnt.Write(out))
	require.Equal(t, float64(1), out.GetGauge().GetValue())
}
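
// TestRescheduleWithError checks that rescheduling does not block when a task
// references a table id that does not exist; the task stays in the waiting
// state.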
func TestRescheduleWithError(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	waitAndStopTTLManager(t, dom)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("set global tidb_ttl_running_tasks = 1000")
	sessionFactory := sessionFactory(t, store)

	// insert a wrong scan task with random table id
	sql := fmt.Sprintf("insert into mysql.tidb_ttl_task(job_id,table_id,scan_id,expire_time,created_time) values ('test-job', %d, %d, NOW(), NOW())", 613, 1)
	tk.MustExec(sql)

	se := sessionFactory()
	now := se.Now()

	isc := cache.NewInfoSchemaCache(time.Second)
	require.NoError(t, isc.Update(se))

	// schedule in a task manager
	scanWorker := ttlworker.NewMockScanWorker(t)
	scanWorker.Start()
	m := ttlworker.NewTaskManager(context.Background(), nil, isc, "task-manager-1", store)
	m.SetScanWorkers4Test([]ttlworker.Worker{scanWorker})
	notify := make(chan struct{})
	go func() {
		m.RescheduleTasks(sessionFactory(), now)
		notify <- struct{}{}
	}()
	timeout, cancel := context.WithTimeout(context.Background(), time.Second*5)
	defer cancel()

	select {
	case <-timeout.Done():
		require.Fail(t, "reschedule didn't finish in time")
	case <-notify:
	}
	tk.MustQuery("select status from mysql.tidb_ttl_task").Check(testkit.Rows("waiting"))
}
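
// TestTTLRunningTasksLimitation checks that the tidb_ttl_running_tasks
// variable caps the number of concurrently running tasks across all task
// managers.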
func TestTTLRunningTasksLimitation(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	waitAndStopTTLManager(t, dom)
	tk := testkit.NewTestKit(t, store)
	sessionFactory := sessionFactory(t, store)
	tk.MustExec("set global tidb_ttl_running_tasks = 32")
	tk.MustExec("create table test.t(id int, created_at datetime) ttl=created_at + interval 1 day")
	table, err := dom.InfoSchema().TableByName(context.Background(), model.NewCIStr("test"), model.NewCIStr("t"))
	require.NoError(t, err)
	// 128 tasks and 128 scan workers (in 16 task managers) should only schedule 32 tasks
	for i := 0; i < 128; i++ {
		sql := fmt.Sprintf("insert into mysql.tidb_ttl_task(job_id,table_id,scan_id,expire_time,created_time) values ('test-job', %d, %d, NOW(), NOW())", table.Meta().ID, i)
		tk.MustExec(sql)
	}
	isc := cache.NewInfoSchemaCache(time.Second)
	require.NoError(t, isc.Update(sessionFactory()))

	scheduleWg := sync.WaitGroup{}
	for i := 0; i < 16; i++ {
		workers := []ttlworker.Worker{}
		for j := 0; j < 8; j++ {
			scanWorker := ttlworker.NewMockScanWorker(t)
			scanWorker.Start()
			workers = append(workers, scanWorker)
		}

		ctx := logutil.WithKeyValue(context.Background(), "ttl-worker-test", fmt.Sprintf("task-manager-%d", i))
		m := ttlworker.NewTaskManager(ctx, nil, isc, fmt.Sprintf("task-manager-%d", i), store)
		m.SetScanWorkers4Test(workers)
		scheduleWg.Add(1)
		go func() {
			se := sessionFactory()
			m.RescheduleTasks(se, se.Now())
			scheduleWg.Done()
		}()
	}
	scheduleWg.Wait()

	// only 32 tasks should have been scheduled because of the limitation
	tk.MustQuery("select count(1) from mysql.tidb_ttl_task where status = 'running'").Check(testkit.Rows("32"))
}
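
// TestMeetTTLRunningTasks checks the running-task limit in both modes: the
// auto value (-1, bounded by the number of TiKV stores) and an explicit
// positive limit.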
func TestMeetTTLRunningTasks(t *testing.T) {
	// initialize a cluster with 3 TiKV
	store, dom := testkit.CreateMockStoreAndDomain(t, mockstore.WithStoreType(mockstore.MockTiKV),
		mockstore.WithClusterInspector(func(c testutils.Cluster) {
			mockstore.BootstrapWithMultiStores(c, 3)
		}))
	waitAndStopTTLManager(t, dom)
	tk := testkit.NewTestKit(t, store)

	// -1, the default value, means the count of TiKV stores
	require.True(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(2, cache.TaskStatusWaiting))
	require.False(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(3, cache.TaskStatusWaiting))
	require.True(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(2, cache.TaskStatusRunning))

	// a positive number means the limitation
	tk.MustExec("set global tidb_ttl_running_tasks = 32")
	require.False(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(32, cache.TaskStatusWaiting))
	require.True(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(31, cache.TaskStatusWaiting))
	require.True(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(32, cache.TaskStatusRunning))

	// set it back to the auto value
	tk.MustExec("set global tidb_ttl_running_tasks = -1")
	require.True(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(2, cache.TaskStatusWaiting))
	require.False(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(3, cache.TaskStatusWaiting))
	require.True(t, dom.TTLJobManager().TaskManager().MeetTTLRunningTasks(3, cache.TaskStatusRunning))
}