// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package storage

import (
	"context"
	"fmt"
	"strings"

	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/pkg/disttask/framework/proto"
	"github.com/pingcap/tidb/pkg/sessionctx"
	"github.com/pingcap/tidb/pkg/util/cpu"
	"github.com/pingcap/tidb/pkg/util/sqlescape"
	"github.com/pingcap/tidb/pkg/util/sqlexec"
)

// InitMeta inserts the manager information into mysql.dist_framework_meta.
func (mgr *TaskManager) InitMeta(ctx context.Context, tidbID string, role string) error {
	return mgr.WithNewSession(func(se sessionctx.Context) error {
		return mgr.InitMetaSession(ctx, se, tidbID, role)
	})
}

// InitMetaSession inserts the manager information into
// mysql.dist_framework_meta. If the record already exists, it updates
// cpu_count and role instead.
func (*TaskManager) InitMetaSession(ctx context.Context, se sessionctx.Context, execID string, role string) error {
	cpuCount := cpu.GetCPUCount()
	_, err := sqlexec.ExecSQL(ctx, se, `
		insert into mysql.dist_framework_meta(host, role, cpu_count, keyspace_id)
		values (%?, %?, %?, -1)
		on duplicate key
		update cpu_count = %?, role = %?`,
		execID, role, cpuCount, cpuCount, role)
	return err
}
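
// Illustration (not part of the original file): for execID "tidb-0:4000" and
// role "background" on an 8-core node, the statement above renders roughly as
//
//	insert into mysql.dist_framework_meta(host, role, cpu_count, keyspace_id)
//	values ('tidb-0:4000', 'background', 8, -1)
//	on duplicate key update cpu_count = 8, role = 'background'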

// RecoverMeta inserts the manager information into
// mysql.dist_framework_meta. If the record already exists, it only updates
// cpu_count. The role is deliberately left alone: it is only changed through
// `set global tidb_service_scope`, and updating it here as well could cause
// a data race.
func (mgr *TaskManager) RecoverMeta(ctx context.Context, execID string, role string) error {
	cpuCount := cpu.GetCPUCount()
	_, err := mgr.ExecuteSQLWithNewSession(ctx, `
		insert into mysql.dist_framework_meta(host, role, cpu_count, keyspace_id)
		values (%?, %?, %?, -1)
		on duplicate key
		update cpu_count = %?`,
		execID, role, cpuCount, cpuCount)
	return err
}
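
// exampleRegisterNode is an illustrative sketch, not part of the original
// file: it shows how a node might register itself with InitMeta and later
// refresh its record with RecoverMeta. The exec ID format ("host:port") and
// the "background" role value are assumptions for the example.
func exampleRegisterNode(ctx context.Context, mgr *TaskManager) error {
	// First registration; re-running it updates cpu_count and role.
	if err := mgr.InitMeta(ctx, "tidb-0:4000", "background"); err != nil {
		return err
	}
	// After a restart, refresh cpu_count but keep the stored role to avoid
	// racing with `set global tidb_service_scope`.
	return mgr.RecoverMeta(ctx, "tidb-0:4000", "background")
}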

// DeleteDeadNodes deletes the dead nodes from mysql.dist_framework_meta.
func (mgr *TaskManager) DeleteDeadNodes(ctx context.Context, nodes []string) error {
	if len(nodes) == 0 {
		return nil
	}
	return mgr.WithNewTxn(ctx, func(se sessionctx.Context) error {
		deleteSQL := new(strings.Builder)
		if err := sqlescape.FormatSQL(deleteSQL, "delete from mysql.dist_framework_meta where host in("); err != nil {
			return err
		}
		deleteElems := make([]string, 0, len(nodes))
		for _, node := range nodes {
			deleteElems = append(deleteElems, fmt.Sprintf(`"%s"`, node))
		}

		deleteSQL.WriteString(strings.Join(deleteElems, ", "))
		deleteSQL.WriteString(")")
		_, err := sqlexec.ExecSQL(ctx, se, deleteSQL.String())
		return err
	})
}
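
// exampleCleanupDeadNodes is an illustrative sketch, not part of the original
// file. For the hosts below, DeleteDeadNodes builds and executes
//
//	delete from mysql.dist_framework_meta where host in("tidb-1:4000", "tidb-2:4000")
func exampleCleanupDeadNodes(ctx context.Context, mgr *TaskManager) error {
	return mgr.DeleteDeadNodes(ctx, []string{"tidb-1:4000", "tidb-2:4000"})
}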

// GetManagedNodes implements the scheduler.TaskManager interface.
func (mgr *TaskManager) GetManagedNodes(ctx context.Context) ([]proto.ManagedNode, error) {
	var nodes []proto.ManagedNode
	err := mgr.WithNewSession(func(se sessionctx.Context) error {
		var err2 error
		nodes, err2 = mgr.getManagedNodesWithSession(ctx, se)
		return err2
	})
	return nodes, err
}

func (mgr *TaskManager) getManagedNodesWithSession(ctx context.Context, se sessionctx.Context) ([]proto.ManagedNode, error) {
	nodes, err := mgr.getAllNodesWithSession(ctx, se)
	if err != nil {
		return nil, err
	}
	nodeMap := make(map[string][]proto.ManagedNode, 2)
	for _, node := range nodes {
		nodeMap[node.Role] = append(nodeMap[node.Role], node)
	}
	// Nodes with the "background" role are preferred; only when none exists
	// do we fall back to the nodes with the default (empty) role.
	if len(nodeMap["background"]) == 0 {
		return nodeMap[""], nil
	}
	return nodeMap["background"], nil
}
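
// exampleManagedNodes is an illustrative sketch, not part of the original
// file: if any node carries the "background" role, only those nodes are
// returned by GetManagedNodes; otherwise the nodes with the default (empty)
// role are.
func exampleManagedNodes(ctx context.Context, mgr *TaskManager) error {
	nodes, err := mgr.GetManagedNodes(ctx)
	if err != nil {
		return err
	}
	for _, n := range nodes {
		fmt.Printf("managed node %s (role=%q, cpu=%d)\n", n.ID, n.Role, n.CPUCount)
	}
	return nil
}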

// GetAllNodes gets nodes in dist_framework_meta.
func (mgr *TaskManager) GetAllNodes(ctx context.Context) ([]proto.ManagedNode, error) {
	var nodes []proto.ManagedNode
	err := mgr.WithNewSession(func(se sessionctx.Context) error {
		var err2 error
		nodes, err2 = mgr.getAllNodesWithSession(ctx, se)
		return err2
	})
	return nodes, err
}

func (*TaskManager) getAllNodesWithSession(ctx context.Context, se sessionctx.Context) ([]proto.ManagedNode, error) {
	rs, err := sqlexec.ExecSQL(ctx, se, `
		select host, role, cpu_count
		from mysql.dist_framework_meta
		order by host`)
	if err != nil {
		return nil, err
	}
	nodes := make([]proto.ManagedNode, 0, len(rs))
	for _, r := range rs {
		nodes = append(nodes, proto.ManagedNode{
			ID:       r.GetString(0),
			Role:     r.GetString(1),
			CPUCount: int(r.GetInt64(2)),
		})
	}
	return nodes, nil
}

// GetUsedSlotsOnNodes implements the scheduler.TaskManager interface.
func (mgr *TaskManager) GetUsedSlotsOnNodes(ctx context.Context) (map[string]int, error) {
	// All subtasks of one step of a task share the same concurrency, so we
	// take max(concurrency) per (exec_id, task_key) to make the group by work.
	rs, err := mgr.ExecuteSQLWithNewSession(ctx, `
		select
			exec_id, sum(concurrency)
		from (
			select exec_id, task_key, max(concurrency) concurrency
			from mysql.tidb_background_subtask
			where state in (%?, %?)
			group by exec_id, task_key
		) a
		group by exec_id`,
		proto.SubtaskStatePending, proto.SubtaskStateRunning,
	)
	if err != nil {
		return nil, err
	}

	slots := make(map[string]int, len(rs))
	for _, r := range rs {
		// sum() yields a decimal; convert it back to an int slot count.
		val, _ := r.GetMyDecimal(1).ToInt()
		slots[r.GetString(0)] = int(val)
	}
	return slots, nil
}
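
// Illustration (not part of the original file): with pending/running subtasks
//
//	exec_id      task_key  concurrency
//	tidb-0:4000  t1        4
//	tidb-0:4000  t1        4
//	tidb-0:4000  t2        8
//	tidb-1:4000  t1        4
//
// the query above returns {"tidb-0:4000": 12, "tidb-1:4000": 4}: the inner
// select collapses duplicate rows per task, the outer select sums per node.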

// GetCPUCountOfManagedNode gets the cpu count of a managed node.
func (mgr *TaskManager) GetCPUCountOfManagedNode(ctx context.Context) (int, error) {
	var cnt int
	err := mgr.WithNewSession(func(se sessionctx.Context) error {
		var err2 error
		cnt, err2 = mgr.getCPUCountOfManagedNode(ctx, se)
		return err2
	})
	return cnt, err
}

// getCPUCountOfManagedNode gets the cpu count of a managed node. It returns
// an error when there is no managed node, or when no node has a valid cpu
// count.
func (mgr *TaskManager) getCPUCountOfManagedNode(ctx context.Context, se sessionctx.Context) (int, error) {
	nodes, err := mgr.getManagedNodesWithSession(ctx, se)
	if err != nil {
		return 0, err
	}
	if len(nodes) == 0 {
		return 0, errors.New("no managed nodes")
	}
	var cpuCount int
	for _, n := range nodes {
		if n.CPUCount > 0 {
			cpuCount = n.CPUCount
			break
		}
	}
	if cpuCount == 0 {
		return 0, errors.New("no managed node has enough resource for dist task")
	}
	return cpuCount, nil
}
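
// exampleFreeSlots is an illustrative sketch, not part of the original file:
// it combines GetCPUCountOfManagedNode and GetUsedSlotsOnNodes to estimate
// the free slots per managed node, roughly what a scheduler would do when
// admitting new subtasks.
func exampleFreeSlots(ctx context.Context, mgr *TaskManager) (map[string]int, error) {
	capacity, err := mgr.GetCPUCountOfManagedNode(ctx)
	if err != nil {
		return nil, err
	}
	used, err := mgr.GetUsedSlotsOnNodes(ctx)
	if err != nil {
		return nil, err
	}
	nodes, err := mgr.GetManagedNodes(ctx)
	if err != nil {
		return nil, err
	}
	free := make(map[string]int, len(nodes))
	for _, n := range nodes {
		free[n.ID] = capacity - used[n.ID]
	}
	return free, nil
}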