1812 lines
56 KiB
Go
1812 lines
56 KiB
Go
// Copyright 2015 PingCAP, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package domain
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math/rand"
|
|
"strconv"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
"unsafe"
|
|
|
|
"github.com/ngaut/pools"
|
|
"github.com/pingcap/errors"
|
|
"github.com/pingcap/failpoint"
|
|
"github.com/pingcap/tidb/bindinfo"
|
|
"github.com/pingcap/tidb/config"
|
|
"github.com/pingcap/tidb/ddl"
|
|
ddlutil "github.com/pingcap/tidb/ddl/util"
|
|
"github.com/pingcap/tidb/domain/globalconfigsync"
|
|
"github.com/pingcap/tidb/domain/infosync"
|
|
"github.com/pingcap/tidb/errno"
|
|
"github.com/pingcap/tidb/infoschema"
|
|
"github.com/pingcap/tidb/infoschema/perfschema"
|
|
"github.com/pingcap/tidb/kv"
|
|
"github.com/pingcap/tidb/meta"
|
|
"github.com/pingcap/tidb/metrics"
|
|
"github.com/pingcap/tidb/owner"
|
|
"github.com/pingcap/tidb/parser/ast"
|
|
"github.com/pingcap/tidb/parser/model"
|
|
"github.com/pingcap/tidb/parser/mysql"
|
|
"github.com/pingcap/tidb/parser/terror"
|
|
"github.com/pingcap/tidb/privilege/privileges"
|
|
"github.com/pingcap/tidb/sessionctx"
|
|
"github.com/pingcap/tidb/sessionctx/variable"
|
|
"github.com/pingcap/tidb/statistics/handle"
|
|
"github.com/pingcap/tidb/telemetry"
|
|
"github.com/pingcap/tidb/types"
|
|
"github.com/pingcap/tidb/util"
|
|
"github.com/pingcap/tidb/util/dbterror"
|
|
"github.com/pingcap/tidb/util/domainutil"
|
|
"github.com/pingcap/tidb/util/expensivequery"
|
|
"github.com/pingcap/tidb/util/logutil"
|
|
"github.com/pingcap/tidb/util/sqlexec"
|
|
"github.com/tikv/client-go/v2/txnkv/transaction"
|
|
"go.etcd.io/etcd/clientv3"
|
|
"go.etcd.io/etcd/clientv3/concurrency"
|
|
atomicutil "go.uber.org/atomic"
|
|
"go.uber.org/zap"
|
|
"google.golang.org/grpc"
|
|
"google.golang.org/grpc/keepalive"
|
|
)
|
|
|
|
// Domain represents a storage space. Different domains can use the same database name.
|
|
// Multiple domains can be used in parallel without synchronization.
|
|
type Domain struct {
|
|
store kv.Storage
|
|
infoCache *infoschema.InfoCache
|
|
privHandle *privileges.Handle
|
|
bindHandle *bindinfo.BindHandle
|
|
statsHandle unsafe.Pointer
|
|
statsLease time.Duration
|
|
ddl ddl.DDL
|
|
info *infosync.InfoSyncer
|
|
globalCfgSyncer *globalconfigsync.GlobalConfigSyncer
|
|
m sync.Mutex
|
|
SchemaValidator SchemaValidator
|
|
sysSessionPool *sessionPool
|
|
exit chan struct{}
|
|
etcdClient *clientv3.Client
|
|
sysVarCache sysVarCache // replaces GlobalVariableCache
|
|
slowQuery *topNSlowQueries
|
|
expensiveQueryHandle *expensivequery.Handle
|
|
wg sync.WaitGroup
|
|
statsUpdating atomicutil.Int32
|
|
cancel context.CancelFunc
|
|
indexUsageSyncLease time.Duration
|
|
planReplayer *planReplayer
|
|
expiredTimeStamp4PC types.Time
|
|
|
|
serverID uint64
|
|
serverIDSession *concurrency.Session
|
|
isLostConnectionToPD atomicutil.Int32 // !0: true, 0: false.
|
|
renewLeaseCh chan func() // It is used to call the renewLease function of the cache table.
|
|
onClose func()
|
|
sysExecutorFactory func(*Domain) (pools.Resource, error)
|
|
}
|
|
|
|
// loadInfoSchema loads infoschema at startTS.
|
|
// It returns:
|
|
// 1. the needed infoschema
|
|
// 2. cache hit indicator
|
|
// 3. currentSchemaVersion(before loading)
|
|
// 4. the changed table IDs if it is not full load
|
|
// 5. an error if any
|
|
func (do *Domain) loadInfoSchema(startTS uint64) (infoschema.InfoSchema, bool, int64, *transaction.RelatedSchemaChange, error) {
|
|
snapshot := do.store.GetSnapshot(kv.NewVersion(startTS))
|
|
m := meta.NewSnapshotMeta(snapshot)
|
|
neededSchemaVersion, err := m.GetSchemaVersion()
|
|
if err != nil {
|
|
return nil, false, 0, nil, err
|
|
}
|
|
|
|
if is := do.infoCache.GetByVersion(neededSchemaVersion); is != nil {
|
|
return is, true, 0, nil, nil
|
|
}
|
|
|
|
currentSchemaVersion := int64(0)
|
|
if oldInfoSchema := do.infoCache.GetLatest(); oldInfoSchema != nil {
|
|
currentSchemaVersion = oldInfoSchema.SchemaMetaVersion()
|
|
}
|
|
|
|
// TODO: tryLoadSchemaDiffs has potential risks of failure. And it becomes worse in history reading cases.
|
|
// It is only kept because there is no alternative diff/partial loading solution.
|
|
// And it is only used to diff upgrading the current latest infoschema, if:
|
|
// 1. Not first time bootstrap loading, which needs a full load.
|
|
// 2. It is newer than the current one, so it will be "the current one" after this function call.
|
|
// 3. There are less 100 diffs.
|
|
startTime := time.Now()
|
|
if currentSchemaVersion != 0 && neededSchemaVersion > currentSchemaVersion && neededSchemaVersion-currentSchemaVersion < 100 {
|
|
is, relatedChanges, err := do.tryLoadSchemaDiffs(m, currentSchemaVersion, neededSchemaVersion)
|
|
if err == nil {
|
|
do.infoCache.Insert(is, startTS)
|
|
logutil.BgLogger().Info("diff load InfoSchema success",
|
|
zap.Int64("currentSchemaVersion", currentSchemaVersion),
|
|
zap.Int64("neededSchemaVersion", neededSchemaVersion),
|
|
zap.Duration("start time", time.Since(startTime)),
|
|
zap.Int64s("phyTblIDs", relatedChanges.PhyTblIDS),
|
|
zap.Uint64s("actionTypes", relatedChanges.ActionTypes))
|
|
return is, false, currentSchemaVersion, relatedChanges, nil
|
|
}
|
|
// We can fall back to full load, don't need to return the error.
|
|
logutil.BgLogger().Error("failed to load schema diff", zap.Error(err))
|
|
}
|
|
|
|
schemas, err := do.fetchAllSchemasWithTables(m)
|
|
if err != nil {
|
|
return nil, false, currentSchemaVersion, nil, err
|
|
}
|
|
|
|
bundles, err := infosync.GetAllRuleBundles(context.TODO())
|
|
if err != nil {
|
|
return nil, false, currentSchemaVersion, nil, err
|
|
}
|
|
|
|
policies, err := do.fetchPolicies(m)
|
|
if err != nil {
|
|
return nil, false, currentSchemaVersion, nil, err
|
|
}
|
|
|
|
newISBuilder, err := infoschema.NewBuilder(do.Store(), do.renewLeaseCh, do.sysFacHack).InitWithDBInfos(schemas, bundles, policies, neededSchemaVersion)
|
|
if err != nil {
|
|
return nil, false, currentSchemaVersion, nil, err
|
|
}
|
|
logutil.BgLogger().Info("full load InfoSchema success",
|
|
zap.Int64("currentSchemaVersion", currentSchemaVersion),
|
|
zap.Int64("neededSchemaVersion", neededSchemaVersion),
|
|
zap.Duration("start time", time.Since(startTime)))
|
|
|
|
is := newISBuilder.Build()
|
|
do.infoCache.Insert(is, startTS)
|
|
return is, false, currentSchemaVersion, nil, nil
|
|
}
|
|
|
|
func (do *Domain) sysFacHack() (pools.Resource, error) {
|
|
// TODO: Here we create new sessions with sysFac in DDL,
|
|
// which will use `do` as Domain instead of call `domap.Get`.
|
|
// That's because `domap.Get` requires a lock, but before
|
|
// we initialize Domain finish, we can't require that again.
|
|
// After we remove the lazy logic of creating Domain, we
|
|
// can simplify code here.
|
|
return do.sysExecutorFactory(do)
|
|
}
|
|
|
|
func (do *Domain) fetchPolicies(m *meta.Meta) ([]*model.PolicyInfo, error) {
|
|
allPolicies, err := m.ListPolicies()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return allPolicies, nil
|
|
}
|
|
|
|
func (do *Domain) fetchAllSchemasWithTables(m *meta.Meta) ([]*model.DBInfo, error) {
|
|
allSchemas, err := m.ListDatabases()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
splittedSchemas := do.splitForConcurrentFetch(allSchemas)
|
|
doneCh := make(chan error, len(splittedSchemas))
|
|
for _, schemas := range splittedSchemas {
|
|
go do.fetchSchemasWithTables(schemas, m, doneCh)
|
|
}
|
|
for range splittedSchemas {
|
|
err = <-doneCh
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return allSchemas, nil
|
|
}
|
|
|
|
// fetchSchemaConcurrency is the number of goroutines used to load schemas.
// More goroutines increase memory usage while calling json.Unmarshal(), which
// could cause OOM, so the concurrency is kept low.
const fetchSchemaConcurrency = 1
|
|
|
|
func (do *Domain) splitForConcurrentFetch(schemas []*model.DBInfo) [][]*model.DBInfo {
|
|
groupSize := (len(schemas) + fetchSchemaConcurrency - 1) / fetchSchemaConcurrency
|
|
splitted := make([][]*model.DBInfo, 0, fetchSchemaConcurrency)
|
|
schemaCnt := len(schemas)
|
|
for i := 0; i < schemaCnt; i += groupSize {
|
|
end := i + groupSize
|
|
if end > schemaCnt {
|
|
end = schemaCnt
|
|
}
|
|
splitted = append(splitted, schemas[i:end])
|
|
}
|
|
return splitted
|
|
}
|
|
|
|
func (do *Domain) fetchSchemasWithTables(schemas []*model.DBInfo, m *meta.Meta, done chan error) {
|
|
for _, di := range schemas {
|
|
if di.State != model.StatePublic {
|
|
// schema is not public, can't be used outside.
|
|
continue
|
|
}
|
|
tables, err := m.ListTables(di.ID)
|
|
if err != nil {
|
|
done <- err
|
|
return
|
|
}
|
|
// If TreatOldVersionUTF8AsUTF8MB4 was enable, need to convert the old version schema UTF8 charset to UTF8MB4.
|
|
if config.GetGlobalConfig().TreatOldVersionUTF8AsUTF8MB4 {
|
|
for _, tbInfo := range tables {
|
|
infoschema.ConvertOldVersionUTF8ToUTF8MB4IfNeed(tbInfo)
|
|
}
|
|
}
|
|
di.Tables = make([]*model.TableInfo, 0, len(tables))
|
|
for _, tbl := range tables {
|
|
if tbl.State != model.StatePublic {
|
|
// schema is not public, can't be used outside.
|
|
continue
|
|
}
|
|
infoschema.ConvertCharsetCollateToLowerCaseIfNeed(tbl)
|
|
// Check whether the table is in repair mode.
|
|
if domainutil.RepairInfo.InRepairMode() && domainutil.RepairInfo.CheckAndFetchRepairedTable(di, tbl) {
|
|
continue
|
|
}
|
|
di.Tables = append(di.Tables, tbl)
|
|
}
|
|
}
|
|
done <- nil
|
|
}
|
|
|
|
// tryLoadSchemaDiffs tries to only load latest schema changes.
|
|
// Return true if the schema is loaded successfully.
|
|
// Return false if the schema can not be loaded by schema diff, then we need to do full load.
|
|
// The second returned value is the delta updated table and partition IDs.
|
|
func (do *Domain) tryLoadSchemaDiffs(m *meta.Meta, usedVersion, newVersion int64) (infoschema.InfoSchema, *transaction.RelatedSchemaChange, error) {
|
|
var diffs []*model.SchemaDiff
|
|
for usedVersion < newVersion {
|
|
usedVersion++
|
|
diff, err := m.GetSchemaDiff(usedVersion)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if diff == nil {
|
|
// If diff is missing for any version between used and new version, we fall back to full reload.
|
|
return nil, nil, fmt.Errorf("failed to get schemadiff")
|
|
}
|
|
diffs = append(diffs, diff)
|
|
}
|
|
builder := infoschema.NewBuilder(do.Store(), do.renewLeaseCh, do.sysFacHack).InitWithOldInfoSchema(do.infoCache.GetLatest())
|
|
phyTblIDs := make([]int64, 0, len(diffs))
|
|
actions := make([]uint64, 0, len(diffs))
|
|
for _, diff := range diffs {
|
|
IDs, err := builder.ApplyDiff(m, diff)
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
if canSkipSchemaCheckerDDL(diff.Type) {
|
|
continue
|
|
}
|
|
phyTblIDs = append(phyTblIDs, IDs...)
|
|
for i := 0; i < len(IDs); i++ {
|
|
actions = append(actions, uint64(1<<diff.Type))
|
|
}
|
|
}
|
|
|
|
is := builder.Build()
|
|
relatedChange := transaction.RelatedSchemaChange{}
|
|
relatedChange.PhyTblIDS = phyTblIDs
|
|
relatedChange.ActionTypes = actions
|
|
return is, &relatedChange, nil
|
|
}
|
|
|
|
func canSkipSchemaCheckerDDL(tp model.ActionType) bool {
|
|
switch tp {
|
|
case model.ActionUpdateTiFlashReplicaStatus, model.ActionSetTiFlashReplica:
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// InfoSchema gets the latest information schema from domain.
|
|
func (do *Domain) InfoSchema() infoschema.InfoSchema {
|
|
return do.infoCache.GetLatest()
|
|
}
|
|
|
|
// GetSnapshotInfoSchema gets a snapshot information schema.
|
|
func (do *Domain) GetSnapshotInfoSchema(snapshotTS uint64) (infoschema.InfoSchema, error) {
|
|
// if the snapshotTS is new enough, we can get infoschema directly through sanpshotTS.
|
|
if is := do.infoCache.GetBySnapshotTS(snapshotTS); is != nil {
|
|
return is, nil
|
|
}
|
|
is, _, _, _, err := do.loadInfoSchema(snapshotTS)
|
|
return is, err
|
|
}
|
|
|
|
// GetSnapshotMeta gets a new snapshot meta at startTS.
|
|
func (do *Domain) GetSnapshotMeta(startTS uint64) (*meta.Meta, error) {
|
|
snapshot := do.store.GetSnapshot(kv.NewVersion(startTS))
|
|
return meta.NewSnapshotMeta(snapshot), nil
|
|
}
|
|
|
|
// ExpiredTimeStamp4PC gets expiredTimeStamp4PC from domain.
|
|
func (do *Domain) ExpiredTimeStamp4PC() types.Time {
|
|
do.m.Lock()
|
|
defer do.m.Unlock()
|
|
|
|
return do.expiredTimeStamp4PC
|
|
}
|
|
|
|
// SetExpiredTimeStamp4PC sets the expiredTimeStamp4PC from domain.
|
|
func (do *Domain) SetExpiredTimeStamp4PC(time types.Time) {
|
|
do.m.Lock()
|
|
defer do.m.Unlock()
|
|
|
|
do.expiredTimeStamp4PC = time
|
|
}
|
|
|
|
// DDL gets DDL from domain.
|
|
func (do *Domain) DDL() ddl.DDL {
|
|
return do.ddl
|
|
}
|
|
|
|
// InfoSyncer gets infoSyncer from domain.
|
|
func (do *Domain) InfoSyncer() *infosync.InfoSyncer {
|
|
return do.info
|
|
}
|
|
|
|
// NotifyGlobalConfigChange notify global config syncer to store the global config into PD(etcd).
|
|
func (do *Domain) NotifyGlobalConfigChange(name, value string) {
|
|
if do.globalCfgSyncer == nil {
|
|
return
|
|
}
|
|
do.globalCfgSyncer.Notify(name, value)
|
|
}
|
|
|
|
// GetGlobalConfigSyncer exports for testing.
|
|
func (do *Domain) GetGlobalConfigSyncer() *globalconfigsync.GlobalConfigSyncer {
|
|
return do.globalCfgSyncer
|
|
}
|
|
|
|
// Store gets KV store from domain.
|
|
func (do *Domain) Store() kv.Storage {
|
|
return do.store
|
|
}
|
|
|
|
// GetScope gets the status variables scope.
|
|
func (do *Domain) GetScope(status string) variable.ScopeFlag {
|
|
// Now domain status variables scope are all default scope.
|
|
return variable.DefaultStatusVarScopeFlag
|
|
}
|
|
|
|
// Reload reloads InfoSchema.
|
|
// It's public in order to do the test.
|
|
func (do *Domain) Reload() error {
|
|
failpoint.Inject("ErrorMockReloadFailed", func(val failpoint.Value) {
|
|
if val.(bool) {
|
|
failpoint.Return(errors.New("mock reload failed"))
|
|
}
|
|
})
|
|
|
|
// Lock here for only once at the same time.
|
|
do.m.Lock()
|
|
defer do.m.Unlock()
|
|
|
|
startTime := time.Now()
|
|
ver, err := do.store.CurrentVersion(kv.GlobalTxnScope)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
is, hitCache, oldSchemaVersion, changes, err := do.loadInfoSchema(ver.Ver)
|
|
metrics.LoadSchemaDuration.Observe(time.Since(startTime).Seconds())
|
|
if err != nil {
|
|
metrics.LoadSchemaCounter.WithLabelValues("failed").Inc()
|
|
return err
|
|
}
|
|
metrics.LoadSchemaCounter.WithLabelValues("succ").Inc()
|
|
|
|
// only update if it is not from cache
|
|
if !hitCache {
|
|
// loaded newer schema
|
|
if oldSchemaVersion < is.SchemaMetaVersion() {
|
|
// Update self schema version to etcd.
|
|
err = do.ddl.SchemaSyncer().UpdateSelfVersion(context.Background(), is.SchemaMetaVersion())
|
|
if err != nil {
|
|
logutil.BgLogger().Info("update self version failed",
|
|
zap.Int64("oldSchemaVersion", oldSchemaVersion),
|
|
zap.Int64("neededSchemaVersion", is.SchemaMetaVersion()), zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// it is full load
|
|
if changes == nil {
|
|
logutil.BgLogger().Info("full load and reset schema validator")
|
|
do.SchemaValidator.Reset()
|
|
}
|
|
}
|
|
|
|
// lease renew, so it must be executed despite it is cache or not
|
|
do.SchemaValidator.Update(ver.Ver, oldSchemaVersion, is.SchemaMetaVersion(), changes)
|
|
lease := do.DDL().GetLease()
|
|
sub := time.Since(startTime)
|
|
// Reload interval is lease / 2, if load schema time elapses more than this interval,
|
|
// some query maybe responded by ErrInfoSchemaExpired error.
|
|
if sub > (lease/2) && lease > 0 {
|
|
logutil.BgLogger().Warn("loading schema takes a long time", zap.Duration("take time", sub))
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// LogSlowQuery keeps topN recent slow queries in domain.
|
|
func (do *Domain) LogSlowQuery(query *SlowQueryInfo) {
|
|
do.slowQuery.mu.RLock()
|
|
defer do.slowQuery.mu.RUnlock()
|
|
if do.slowQuery.mu.closed {
|
|
return
|
|
}
|
|
|
|
select {
|
|
case do.slowQuery.ch <- query:
|
|
default:
|
|
}
|
|
}
|
|
|
|
// ShowSlowQuery returns the slow queries.
|
|
func (do *Domain) ShowSlowQuery(showSlow *ast.ShowSlow) []*SlowQueryInfo {
|
|
msg := &showSlowMessage{
|
|
request: showSlow,
|
|
}
|
|
msg.Add(1)
|
|
do.slowQuery.msgCh <- msg
|
|
msg.Wait()
|
|
return msg.result
|
|
}
|
|
|
|
func (do *Domain) topNSlowQueryLoop() {
|
|
defer util.Recover(metrics.LabelDomain, "topNSlowQueryLoop", nil, false)
|
|
ticker := time.NewTicker(time.Minute * 10)
|
|
defer func() {
|
|
ticker.Stop()
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("topNSlowQueryLoop exited.")
|
|
}()
|
|
for {
|
|
select {
|
|
case now := <-ticker.C:
|
|
do.slowQuery.RemoveExpired(now)
|
|
case info, ok := <-do.slowQuery.ch:
|
|
if !ok {
|
|
return
|
|
}
|
|
do.slowQuery.Append(info)
|
|
case msg := <-do.slowQuery.msgCh:
|
|
req := msg.request
|
|
switch req.Tp {
|
|
case ast.ShowSlowTop:
|
|
msg.result = do.slowQuery.QueryTop(int(req.Count), req.Kind)
|
|
case ast.ShowSlowRecent:
|
|
msg.result = do.slowQuery.QueryRecent(int(req.Count))
|
|
default:
|
|
msg.result = do.slowQuery.QueryAll()
|
|
}
|
|
msg.Done()
|
|
}
|
|
}
|
|
}
|
|
|
|
func (do *Domain) infoSyncerKeeper() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("infoSyncerKeeper exited.")
|
|
util.Recover(metrics.LabelDomain, "infoSyncerKeeper", nil, false)
|
|
}()
|
|
ticker := time.NewTicker(infosync.ReportInterval)
|
|
defer ticker.Stop()
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
do.info.ReportMinStartTS(do.Store())
|
|
case <-do.info.Done():
|
|
logutil.BgLogger().Info("server info syncer need to restart")
|
|
if err := do.info.Restart(context.Background()); err != nil {
|
|
logutil.BgLogger().Error("server info syncer restart failed", zap.Error(err))
|
|
} else {
|
|
logutil.BgLogger().Info("server info syncer restarted")
|
|
}
|
|
case <-do.exit:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (do *Domain) globalConfigSyncerKeeper() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("globalConfigSyncerKeeper exited.")
|
|
util.Recover(metrics.LabelDomain, "globalConfigSyncerKeeper", nil, false)
|
|
}()
|
|
for {
|
|
select {
|
|
case entry := <-do.globalCfgSyncer.NotifyCh:
|
|
err := do.globalCfgSyncer.StoreGlobalConfig(context.Background(), entry)
|
|
if err != nil {
|
|
logutil.BgLogger().Error("global config syncer store failed", zap.Error(err))
|
|
}
|
|
// TODO(crazycs520): Add owner to maintain global config is consistency with global variable.
|
|
case <-do.exit:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (do *Domain) topologySyncerKeeper() {
|
|
defer util.Recover(metrics.LabelDomain, "topologySyncerKeeper", nil, false)
|
|
ticker := time.NewTicker(infosync.TopologyTimeToRefresh)
|
|
defer func() {
|
|
ticker.Stop()
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("topologySyncerKeeper exited.")
|
|
}()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
err := do.info.StoreTopologyInfo(context.Background())
|
|
if err != nil {
|
|
logutil.BgLogger().Error("refresh topology in loop failed", zap.Error(err))
|
|
}
|
|
case <-do.info.TopologyDone():
|
|
logutil.BgLogger().Info("server topology syncer need to restart")
|
|
if err := do.info.RestartTopology(context.Background()); err != nil {
|
|
logutil.BgLogger().Error("server topology syncer restart failed", zap.Error(err))
|
|
} else {
|
|
logutil.BgLogger().Info("server topology syncer restarted")
|
|
}
|
|
case <-do.exit:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (do *Domain) loadSchemaInLoop(ctx context.Context, lease time.Duration) {
|
|
defer util.Recover(metrics.LabelDomain, "loadSchemaInLoop", nil, true)
|
|
// Lease renewal can run at any frequency.
|
|
// Use lease/2 here as recommend by paper.
|
|
ticker := time.NewTicker(lease / 2)
|
|
defer func() {
|
|
ticker.Stop()
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("loadSchemaInLoop exited.")
|
|
}()
|
|
syncer := do.ddl.SchemaSyncer()
|
|
|
|
for {
|
|
select {
|
|
case <-ticker.C:
|
|
err := do.Reload()
|
|
if err != nil {
|
|
logutil.BgLogger().Error("reload schema in loop failed", zap.Error(err))
|
|
}
|
|
case _, ok := <-syncer.GlobalVersionCh():
|
|
err := do.Reload()
|
|
if err != nil {
|
|
logutil.BgLogger().Error("reload schema in loop failed", zap.Error(err))
|
|
}
|
|
if !ok {
|
|
logutil.BgLogger().Warn("reload schema in loop, schema syncer need rewatch")
|
|
// Make sure the rewatch doesn't affect load schema, so we watch the global schema version asynchronously.
|
|
syncer.WatchGlobalSchemaVer(context.Background())
|
|
}
|
|
case <-syncer.Done():
|
|
// The schema syncer stops, we need stop the schema validator to synchronize the schema version.
|
|
logutil.BgLogger().Info("reload schema in loop, schema syncer need restart")
|
|
// The etcd is responsible for schema synchronization, we should ensure there is at most two different schema version
|
|
// in the TiDB cluster, to make the data/schema be consistent. If we lost connection/session to etcd, the cluster
|
|
// will treats this TiDB as a down instance, and etcd will remove the key of `/tidb/ddl/all_schema_versions/tidb-id`.
|
|
// Say the schema version now is 1, the owner is changing the schema version to 2, it will not wait for this down TiDB syncing the schema,
|
|
// then continue to change the TiDB schema to version 3. Unfortunately, this down TiDB schema version will still be version 1.
|
|
// And version 1 is not consistent to version 3. So we need to stop the schema validator to prohibit the DML executing.
|
|
do.SchemaValidator.Stop()
|
|
err := do.mustRestartSyncer(ctx)
|
|
if err != nil {
|
|
logutil.BgLogger().Error("reload schema in loop, schema syncer restart failed", zap.Error(err))
|
|
break
|
|
}
|
|
// The schema maybe changed, must reload schema then the schema validator can restart.
|
|
exitLoop := do.mustReload()
|
|
// domain is cosed.
|
|
if exitLoop {
|
|
logutil.BgLogger().Error("domain is closed, exit loadSchemaInLoop")
|
|
return
|
|
}
|
|
do.SchemaValidator.Restart()
|
|
logutil.BgLogger().Info("schema syncer restarted")
|
|
case <-do.exit:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// mustRestartSyncer tries to restart the SchemaSyncer.
|
|
// It returns until it's successful or the domain is stoped.
|
|
func (do *Domain) mustRestartSyncer(ctx context.Context) error {
|
|
syncer := do.ddl.SchemaSyncer()
|
|
|
|
for {
|
|
err := syncer.Restart(ctx)
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
// If the domain has stopped, we return an error immediately.
|
|
if do.isClose() {
|
|
return err
|
|
}
|
|
logutil.BgLogger().Error("restart the schema syncer failed", zap.Error(err))
|
|
time.Sleep(time.Second)
|
|
}
|
|
}
|
|
|
|
// mustReload tries to Reload the schema, it returns until it's successful or the domain is closed.
|
|
// it returns false when it is successful, returns true when the domain is closed.
|
|
func (do *Domain) mustReload() (exitLoop bool) {
|
|
for {
|
|
err := do.Reload()
|
|
if err == nil {
|
|
logutil.BgLogger().Info("mustReload succeed")
|
|
return false
|
|
}
|
|
|
|
// If the domain is closed, we returns immediately.
|
|
logutil.BgLogger().Info("reload the schema failed", zap.Error(err))
|
|
if do.isClose() {
|
|
return true
|
|
}
|
|
time.Sleep(200 * time.Millisecond)
|
|
}
|
|
}
|
|
|
|
func (do *Domain) isClose() bool {
|
|
select {
|
|
case <-do.exit:
|
|
logutil.BgLogger().Info("domain is closed")
|
|
return true
|
|
default:
|
|
}
|
|
return false
|
|
}
|
|
|
|
// Close closes the Domain and release its resource.
|
|
func (do *Domain) Close() {
|
|
if do == nil {
|
|
return
|
|
}
|
|
startTime := time.Now()
|
|
if do.ddl != nil {
|
|
terror.Log(do.ddl.Stop())
|
|
}
|
|
if do.info != nil {
|
|
do.info.RemoveServerInfo()
|
|
do.info.RemoveMinStartTS()
|
|
}
|
|
close(do.exit)
|
|
if do.etcdClient != nil {
|
|
terror.Log(errors.Trace(do.etcdClient.Close()))
|
|
}
|
|
|
|
do.slowQuery.Close()
|
|
if do.cancel != nil {
|
|
do.cancel()
|
|
}
|
|
do.wg.Wait()
|
|
do.sysSessionPool.Close()
|
|
variable.UnregisterStatistics(do.bindHandle)
|
|
if do.onClose != nil {
|
|
do.onClose()
|
|
}
|
|
logutil.BgLogger().Info("domain closed", zap.Duration("take time", time.Since(startTime)))
|
|
}
|
|
|
|
// resourceIdleTimeout is the idle timeout after which resources in the
// ResourcePool are recycled.
const resourceIdleTimeout = 3 * time.Minute
|
|
|
|
// NewDomain creates a new domain. Should not create multiple domains for the same store.
|
|
func NewDomain(store kv.Storage, ddlLease time.Duration, statsLease time.Duration, idxUsageSyncLease time.Duration, planReplayerGCLease time.Duration, factory pools.Factory, onClose func()) *Domain {
|
|
capacity := 200 // capacity of the sysSessionPool size
|
|
do := &Domain{
|
|
store: store,
|
|
exit: make(chan struct{}),
|
|
sysSessionPool: newSessionPool(capacity, factory),
|
|
statsLease: statsLease,
|
|
infoCache: infoschema.NewCache(16),
|
|
slowQuery: newTopNSlowQueries(30, time.Hour*24*7, 500),
|
|
indexUsageSyncLease: idxUsageSyncLease,
|
|
planReplayer: &planReplayer{planReplayerGCLease: planReplayerGCLease},
|
|
onClose: onClose,
|
|
renewLeaseCh: make(chan func(), 10),
|
|
expiredTimeStamp4PC: types.NewTime(types.ZeroCoreTime, mysql.TypeTimestamp, types.DefaultFsp),
|
|
}
|
|
|
|
do.SchemaValidator = NewSchemaValidator(ddlLease, do)
|
|
do.expensiveQueryHandle = expensivequery.NewExpensiveQueryHandle(do.exit)
|
|
return do
|
|
}
|
|
|
|
// serverIDForStandalone is the serverID used for standalone deployment.
const serverIDForStandalone = 1
|
|
|
|
// Init initializes a domain.
|
|
func (do *Domain) Init(ddlLease time.Duration, sysExecutorFactory func(*Domain) (pools.Resource, error)) error {
|
|
do.sysExecutorFactory = sysExecutorFactory
|
|
perfschema.Init()
|
|
if ebd, ok := do.store.(kv.EtcdBackend); ok {
|
|
var addrs []string
|
|
var err error
|
|
if addrs, err = ebd.EtcdAddrs(); err != nil {
|
|
return err
|
|
}
|
|
if addrs != nil {
|
|
cfg := config.GetGlobalConfig()
|
|
// silence etcd warn log, when domain closed, it won't randomly print warn log
|
|
// see details at the issue https://github.com/pingcap/tidb/issues/15479
|
|
etcdLogCfg := zap.NewProductionConfig()
|
|
etcdLogCfg.Level = zap.NewAtomicLevelAt(zap.ErrorLevel)
|
|
cli, err := clientv3.New(clientv3.Config{
|
|
LogConfig: &etcdLogCfg,
|
|
Endpoints: addrs,
|
|
AutoSyncInterval: 30 * time.Second,
|
|
DialTimeout: 5 * time.Second,
|
|
DialOptions: []grpc.DialOption{
|
|
grpc.WithBackoffMaxDelay(time.Second * 3),
|
|
grpc.WithKeepaliveParams(keepalive.ClientParameters{
|
|
Time: time.Duration(cfg.TiKVClient.GrpcKeepAliveTime) * time.Second,
|
|
Timeout: time.Duration(cfg.TiKVClient.GrpcKeepAliveTimeout) * time.Second,
|
|
}),
|
|
},
|
|
TLS: ebd.TLSConfig(),
|
|
})
|
|
if err != nil {
|
|
return errors.Trace(err)
|
|
}
|
|
do.etcdClient = cli
|
|
}
|
|
}
|
|
|
|
// TODO: Here we create new sessions with sysFac in DDL,
|
|
// which will use `do` as Domain instead of call `domap.Get`.
|
|
// That's because `domap.Get` requires a lock, but before
|
|
// we initialize Domain finish, we can't require that again.
|
|
// After we remove the lazy logic of creating Domain, we
|
|
// can simplify code here.
|
|
sysFac := func() (pools.Resource, error) {
|
|
return sysExecutorFactory(do)
|
|
}
|
|
sysCtxPool := pools.NewResourcePool(sysFac, 2, 2, resourceIdleTimeout)
|
|
ctx, cancelFunc := context.WithCancel(context.Background())
|
|
do.cancel = cancelFunc
|
|
var callback ddl.Callback
|
|
newCallbackFunc, err := ddl.GetCustomizedHook("default_hook")
|
|
if err != nil {
|
|
return errors.Trace(err)
|
|
}
|
|
callback = newCallbackFunc(do)
|
|
d := do.ddl
|
|
do.ddl = ddl.NewDDL(
|
|
ctx,
|
|
ddl.WithEtcdClient(do.etcdClient),
|
|
ddl.WithStore(do.store),
|
|
ddl.WithInfoCache(do.infoCache),
|
|
ddl.WithHook(callback),
|
|
ddl.WithLease(ddlLease),
|
|
)
|
|
failpoint.Inject("MockReplaceDDL", func(val failpoint.Value) {
|
|
if val.(bool) {
|
|
do.ddl = d
|
|
}
|
|
})
|
|
// step 1: prepare the info/schema syncer which domain reload needed.
|
|
skipRegisterToDashboard := config.GetGlobalConfig().SkipRegisterToDashboard
|
|
do.info, err = infosync.GlobalInfoSyncerInit(ctx, do.ddl.GetID(), do.ServerID, do.etcdClient, skipRegisterToDashboard)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
do.globalCfgSyncer = globalconfigsync.NewGlobalConfigSyncer(do.etcdClient)
|
|
err = do.ddl.SchemaSyncer().Init(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// step 2: domain reload the infoSchema.
|
|
err = do.Reload()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// step 3: start the ddl after the domain reload, avoiding some internal sql running before infoSchema construction.
|
|
err = do.ddl.Start(sysCtxPool)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if config.GetGlobalConfig().Experimental.EnableGlobalKill {
|
|
if do.etcdClient != nil {
|
|
err := do.acquireServerID(ctx)
|
|
if err != nil {
|
|
logutil.BgLogger().Error("acquire serverID failed", zap.Error(err))
|
|
do.isLostConnectionToPD.Store(1) // will retry in `do.serverIDKeeper`
|
|
} else {
|
|
do.isLostConnectionToPD.Store(0)
|
|
}
|
|
|
|
do.wg.Add(1)
|
|
go do.serverIDKeeper()
|
|
} else {
|
|
// set serverID for standalone deployment to enable 'KILL'.
|
|
atomic.StoreUint64(&do.serverID, serverIDForStandalone)
|
|
}
|
|
}
|
|
|
|
// Only when the store is local that the lease value is 0.
|
|
// If the store is local, it doesn't need loadSchemaInLoop.
|
|
if ddlLease > 0 {
|
|
do.wg.Add(1)
|
|
// Local store needs to get the change information for every DDL state in each session.
|
|
go do.loadSchemaInLoop(ctx, ddlLease)
|
|
}
|
|
do.wg.Add(4)
|
|
go do.topNSlowQueryLoop()
|
|
go do.infoSyncerKeeper()
|
|
go do.renewLease()
|
|
go do.globalConfigSyncerKeeper()
|
|
if !skipRegisterToDashboard {
|
|
do.wg.Add(1)
|
|
go do.topologySyncerKeeper()
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
type sessionPool struct {
|
|
resources chan pools.Resource
|
|
factory pools.Factory
|
|
mu struct {
|
|
sync.RWMutex
|
|
closed bool
|
|
}
|
|
}
|
|
|
|
func newSessionPool(capacity int, factory pools.Factory) *sessionPool {
|
|
return &sessionPool{
|
|
resources: make(chan pools.Resource, capacity),
|
|
factory: factory,
|
|
}
|
|
}
|
|
|
|
func (p *sessionPool) Get() (resource pools.Resource, err error) {
|
|
var ok bool
|
|
select {
|
|
case resource, ok = <-p.resources:
|
|
if !ok {
|
|
err = errors.New("session pool closed")
|
|
}
|
|
default:
|
|
resource, err = p.factory()
|
|
}
|
|
return
|
|
}
|
|
|
|
func (p *sessionPool) Put(resource pools.Resource) {
|
|
p.mu.RLock()
|
|
defer p.mu.RUnlock()
|
|
if p.mu.closed {
|
|
resource.Close()
|
|
return
|
|
}
|
|
|
|
select {
|
|
case p.resources <- resource:
|
|
default:
|
|
resource.Close()
|
|
}
|
|
}
|
|
func (p *sessionPool) Close() {
|
|
p.mu.Lock()
|
|
if p.mu.closed {
|
|
p.mu.Unlock()
|
|
return
|
|
}
|
|
p.mu.closed = true
|
|
close(p.resources)
|
|
p.mu.Unlock()
|
|
|
|
for r := range p.resources {
|
|
r.Close()
|
|
}
|
|
}
|
|
|
|
// SysSessionPool returns the system session pool.
|
|
func (do *Domain) SysSessionPool() *sessionPool {
|
|
return do.sysSessionPool
|
|
}
|
|
|
|
// GetEtcdClient returns the etcd client.
|
|
func (do *Domain) GetEtcdClient() *clientv3.Client {
|
|
return do.etcdClient
|
|
}
|
|
|
|
// LoadPrivilegeLoop create a goroutine loads privilege tables in a loop, it
|
|
// should be called only once in BootstrapSession.
|
|
func (do *Domain) LoadPrivilegeLoop(ctx sessionctx.Context) error {
|
|
ctx.GetSessionVars().InRestrictedSQL = true
|
|
_, err := ctx.(sqlexec.SQLExecutor).ExecuteInternal(context.Background(), "set @@autocommit = 1")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
do.privHandle = privileges.NewHandle()
|
|
err = do.privHandle.Update(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var watchCh clientv3.WatchChan
|
|
duration := 5 * time.Minute
|
|
if do.etcdClient != nil {
|
|
watchCh = do.etcdClient.Watch(context.Background(), privilegeKey)
|
|
duration = 10 * time.Minute
|
|
}
|
|
|
|
do.wg.Add(1)
|
|
go func() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("loadPrivilegeInLoop exited.")
|
|
util.Recover(metrics.LabelDomain, "loadPrivilegeInLoop", nil, false)
|
|
}()
|
|
var count int
|
|
for {
|
|
ok := true
|
|
select {
|
|
case <-do.exit:
|
|
return
|
|
case _, ok = <-watchCh:
|
|
case <-time.After(duration):
|
|
}
|
|
if !ok {
|
|
logutil.BgLogger().Error("load privilege loop watch channel closed")
|
|
watchCh = do.etcdClient.Watch(context.Background(), privilegeKey)
|
|
count++
|
|
if count > 10 {
|
|
time.Sleep(time.Duration(count) * time.Second)
|
|
}
|
|
continue
|
|
}
|
|
|
|
count = 0
|
|
err := do.privHandle.Update(ctx)
|
|
metrics.LoadPrivilegeCounter.WithLabelValues(metrics.RetLabel(err)).Inc()
|
|
if err != nil {
|
|
logutil.BgLogger().Error("load privilege failed", zap.Error(err))
|
|
}
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
// LoadSysVarCacheLoop create a goroutine loads sysvar cache in a loop,
|
|
// it should be called only once in BootstrapSession.
|
|
func (do *Domain) LoadSysVarCacheLoop(ctx sessionctx.Context) error {
|
|
ctx.GetSessionVars().InRestrictedSQL = true
|
|
err := do.rebuildSysVarCache(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
var watchCh clientv3.WatchChan
|
|
duration := 30 * time.Second
|
|
if do.etcdClient != nil {
|
|
watchCh = do.etcdClient.Watch(context.Background(), sysVarCacheKey)
|
|
}
|
|
do.wg.Add(1)
|
|
go func() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("LoadSysVarCacheLoop exited.")
|
|
util.Recover(metrics.LabelDomain, "LoadSysVarCacheLoop", nil, false)
|
|
}()
|
|
var count int
|
|
for {
|
|
ok := true
|
|
select {
|
|
case <-do.exit:
|
|
return
|
|
case _, ok = <-watchCh:
|
|
case <-time.After(duration):
|
|
}
|
|
|
|
failpoint.Inject("skipLoadSysVarCacheLoop", func(val failpoint.Value) {
|
|
// In some pkg integration test, there are many testSuite, and each testSuite has separate storage and
|
|
// `LoadSysVarCacheLoop` background goroutine. Then each testSuite `RebuildSysVarCache` from it's
|
|
// own storage.
|
|
// Each testSuit will also call `checkEnableServerGlobalVar` to update some local variables.
|
|
// That's the problem, each testSuit use different storage to update some same local variables.
|
|
// So just skip `RebuildSysVarCache` in some integration testing.
|
|
if val.(bool) {
|
|
failpoint.Continue()
|
|
}
|
|
})
|
|
|
|
if !ok {
|
|
logutil.BgLogger().Error("LoadSysVarCacheLoop loop watch channel closed")
|
|
watchCh = do.etcdClient.Watch(context.Background(), sysVarCacheKey)
|
|
count++
|
|
if count > 10 {
|
|
time.Sleep(time.Duration(count) * time.Second)
|
|
}
|
|
continue
|
|
}
|
|
count = 0
|
|
logutil.BgLogger().Debug("Rebuilding sysvar cache from etcd watch event.")
|
|
err := do.rebuildSysVarCache(ctx)
|
|
metrics.LoadSysVarCacheCounter.WithLabelValues(metrics.RetLabel(err)).Inc()
|
|
if err != nil {
|
|
logutil.BgLogger().Error("LoadSysVarCacheLoop failed", zap.Error(err))
|
|
}
|
|
}
|
|
}()
|
|
return nil
|
|
}
|
|
|
|
// PrivilegeHandle returns the MySQLPrivilege.
|
|
func (do *Domain) PrivilegeHandle() *privileges.Handle {
|
|
return do.privHandle
|
|
}
|
|
|
|
// BindHandle returns domain's bindHandle.
|
|
func (do *Domain) BindHandle() *bindinfo.BindHandle {
|
|
return do.bindHandle
|
|
}
|
|
|
|
// LoadBindInfoLoop create a goroutine loads BindInfo in a loop, it should
|
|
// be called only once in BootstrapSession.
|
|
func (do *Domain) LoadBindInfoLoop(ctxForHandle sessionctx.Context, ctxForEvolve sessionctx.Context) error {
|
|
ctxForHandle.GetSessionVars().InRestrictedSQL = true
|
|
ctxForEvolve.GetSessionVars().InRestrictedSQL = true
|
|
do.bindHandle = bindinfo.NewBindHandle(ctxForHandle)
|
|
err := do.bindHandle.Update(true)
|
|
if err != nil || bindinfo.Lease == 0 {
|
|
return err
|
|
}
|
|
|
|
owner := do.newOwnerManager(bindinfo.Prompt, bindinfo.OwnerKey)
|
|
do.globalBindHandleWorkerLoop(owner)
|
|
do.handleEvolvePlanTasksLoop(ctxForEvolve, owner)
|
|
return nil
|
|
}
|
|
|
|
func (do *Domain) globalBindHandleWorkerLoop(owner owner.Manager) {
|
|
do.wg.Add(1)
|
|
go func() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("globalBindHandleWorkerLoop exited.")
|
|
util.Recover(metrics.LabelDomain, "globalBindHandleWorkerLoop", nil, false)
|
|
}()
|
|
bindWorkerTicker := time.NewTicker(bindinfo.Lease)
|
|
gcBindTicker := time.NewTicker(100 * bindinfo.Lease)
|
|
defer func() {
|
|
bindWorkerTicker.Stop()
|
|
gcBindTicker.Stop()
|
|
}()
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
return
|
|
case <-bindWorkerTicker.C:
|
|
err := do.bindHandle.Update(false)
|
|
if err != nil {
|
|
logutil.BgLogger().Error("update bindinfo failed", zap.Error(err))
|
|
}
|
|
do.bindHandle.DropInvalidBindRecord()
|
|
// Get Global
|
|
optVal, err := do.GetGlobalVar(variable.TiDBCapturePlanBaseline)
|
|
if err == nil && variable.TiDBOptOn(optVal) {
|
|
do.bindHandle.CaptureBaselines()
|
|
}
|
|
do.bindHandle.SaveEvolveTasksToStore()
|
|
case <-gcBindTicker.C:
|
|
if !owner.IsOwner() {
|
|
continue
|
|
}
|
|
err := do.bindHandle.GCBindRecord()
|
|
if err != nil {
|
|
logutil.BgLogger().Error("GC bind record failed", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
func (do *Domain) handleEvolvePlanTasksLoop(ctx sessionctx.Context, owner owner.Manager) {
|
|
do.wg.Add(1)
|
|
go func() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("handleEvolvePlanTasksLoop exited.")
|
|
util.Recover(metrics.LabelDomain, "handleEvolvePlanTasksLoop", nil, false)
|
|
}()
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
owner.Cancel()
|
|
return
|
|
case <-time.After(bindinfo.Lease):
|
|
}
|
|
if owner.IsOwner() {
|
|
err := do.bindHandle.HandleEvolvePlanTask(ctx, false)
|
|
if err != nil {
|
|
logutil.BgLogger().Info("evolve plan failed", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// TelemetryReportLoop create a goroutine that reports usage data in a loop, it should be called only once
|
|
// in BootstrapSession.
|
|
func (do *Domain) TelemetryReportLoop(ctx sessionctx.Context) {
|
|
ctx.GetSessionVars().InRestrictedSQL = true
|
|
err := telemetry.InitialRun(ctx, do.GetEtcdClient())
|
|
if err != nil {
|
|
logutil.BgLogger().Warn("Initial telemetry run failed", zap.Error(err))
|
|
}
|
|
|
|
do.wg.Add(1)
|
|
go func() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("TelemetryReportLoop exited.")
|
|
util.Recover(metrics.LabelDomain, "TelemetryReportLoop", nil, false)
|
|
}()
|
|
owner := do.newOwnerManager(telemetry.Prompt, telemetry.OwnerKey)
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
owner.Cancel()
|
|
return
|
|
case <-time.After(telemetry.ReportInterval):
|
|
if !owner.IsOwner() {
|
|
continue
|
|
}
|
|
err := telemetry.ReportUsageData(ctx, do.GetEtcdClient())
|
|
if err != nil {
|
|
// Only status update errors will be printed out
|
|
logutil.BgLogger().Warn("TelemetryReportLoop status update failed", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// TelemetryRotateSubWindowLoop create a goroutine that rotates the telemetry window regularly.
|
|
func (do *Domain) TelemetryRotateSubWindowLoop(ctx sessionctx.Context) {
|
|
ctx.GetSessionVars().InRestrictedSQL = true
|
|
do.wg.Add(1)
|
|
go func() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("TelemetryRotateSubWindowLoop exited.")
|
|
util.Recover(metrics.LabelDomain, "TelemetryRotateSubWindowLoop", nil, false)
|
|
}()
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
return
|
|
case <-time.After(telemetry.SubWindowSize):
|
|
telemetry.RotateSubWindow()
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// PlanReplayerLoop creates a goroutine that handles `exit` and `gc`.
|
|
func (do *Domain) PlanReplayerLoop() {
|
|
do.wg.Add(1)
|
|
go func() {
|
|
gcTicker := time.NewTicker(do.planReplayer.planReplayerGCLease)
|
|
defer func() {
|
|
gcTicker.Stop()
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("PlanReplayerLoop exited.")
|
|
util.Recover(metrics.LabelDomain, "PlanReplayerLoop", nil, false)
|
|
}()
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
return
|
|
case <-gcTicker.C:
|
|
do.planReplayer.planReplayerGC(time.Hour)
|
|
}
|
|
}
|
|
}()
|
|
}
|
|
|
|
// StatsHandle returns the statistic handle.
|
|
func (do *Domain) StatsHandle() *handle.Handle {
|
|
return (*handle.Handle)(atomic.LoadPointer(&do.statsHandle))
|
|
}
|
|
|
|
// CreateStatsHandle is used only for test.
|
|
func (do *Domain) CreateStatsHandle(ctx sessionctx.Context) error {
|
|
h, err := handle.NewHandle(ctx, do.statsLease, do.sysSessionPool)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
atomic.StorePointer(&do.statsHandle, unsafe.Pointer(h))
|
|
return nil
|
|
}
|
|
|
|
// StatsUpdating checks if the stats worker is updating.
|
|
func (do *Domain) StatsUpdating() bool {
|
|
return do.statsUpdating.Load() > 0
|
|
}
|
|
|
|
// SetStatsUpdating sets the value of stats updating.
|
|
func (do *Domain) SetStatsUpdating(val bool) {
|
|
if val {
|
|
do.statsUpdating.Store(1)
|
|
} else {
|
|
do.statsUpdating.Store(0)
|
|
}
|
|
}
|
|
|
|
// RunAutoAnalyze controls whether this TiDB server starts the auto analyze
// worker and can therefore run auto analyze jobs.
var RunAutoAnalyze = true
|
|
|
|
// UpdateTableStatsLoop creates a goroutine loads stats info and updates stats info in a loop.
|
|
// It will also start a goroutine to analyze tables automatically.
|
|
// It should be called only once in BootstrapSession.
|
|
func (do *Domain) UpdateTableStatsLoop(ctx sessionctx.Context) error {
|
|
ctx.GetSessionVars().InRestrictedSQL = true
|
|
statsHandle, err := handle.NewHandle(ctx, do.statsLease, do.sysSessionPool)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
atomic.StorePointer(&do.statsHandle, unsafe.Pointer(statsHandle))
|
|
do.ddl.RegisterStatsHandle(statsHandle)
|
|
// Negative stats lease indicates that it is in test, it does not need update.
|
|
if do.statsLease >= 0 {
|
|
do.wg.Add(1)
|
|
go do.loadStatsWorker()
|
|
}
|
|
owner := do.newOwnerManager(handle.StatsPrompt, handle.StatsOwnerKey)
|
|
if do.indexUsageSyncLease > 0 {
|
|
do.wg.Add(1)
|
|
go do.syncIndexUsageWorker(owner)
|
|
}
|
|
if do.statsLease <= 0 {
|
|
return nil
|
|
}
|
|
do.wg.Add(1)
|
|
do.SetStatsUpdating(true)
|
|
go do.updateStatsWorker(ctx, owner)
|
|
if RunAutoAnalyze {
|
|
do.wg.Add(1)
|
|
go do.autoAnalyzeWorker(owner)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// StartLoadStatsSubWorkers starts sub workers with new sessions to load stats concurrently
|
|
func (do *Domain) StartLoadStatsSubWorkers(ctxList []sessionctx.Context) {
|
|
statsHandle := do.StatsHandle()
|
|
for i, ctx := range ctxList {
|
|
statsHandle.StatsLoad.SubCtxs[i] = ctx
|
|
do.wg.Add(1)
|
|
go statsHandle.SubLoadWorker(ctx, do.exit, &do.wg)
|
|
}
|
|
}
|
|
|
|
func (do *Domain) newOwnerManager(prompt, ownerKey string) owner.Manager {
|
|
id := do.ddl.OwnerManager().ID()
|
|
var statsOwner owner.Manager
|
|
if do.etcdClient == nil {
|
|
statsOwner = owner.NewMockManager(context.Background(), id)
|
|
} else {
|
|
statsOwner = owner.NewOwnerManager(context.Background(), do.etcdClient, prompt, id, ownerKey)
|
|
}
|
|
// TODO: Need to do something when err is not nil.
|
|
err := statsOwner.CampaignOwner()
|
|
if err != nil {
|
|
logutil.BgLogger().Warn("campaign owner failed", zap.Error(err))
|
|
}
|
|
return statsOwner
|
|
}
|
|
|
|
func (do *Domain) loadStatsWorker() {
|
|
defer util.Recover(metrics.LabelDomain, "loadStatsWorker", nil, false)
|
|
lease := do.statsLease
|
|
if lease == 0 {
|
|
lease = 3 * time.Second
|
|
}
|
|
loadTicker := time.NewTicker(lease)
|
|
defer func() {
|
|
loadTicker.Stop()
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("loadStatsWorker exited.")
|
|
}()
|
|
statsHandle := do.StatsHandle()
|
|
t := time.Now()
|
|
err := statsHandle.InitStats(do.InfoSchema())
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("init stats info failed", zap.Error(err))
|
|
} else {
|
|
logutil.BgLogger().Info("init stats info time", zap.Duration("take time", time.Since(t)))
|
|
}
|
|
for {
|
|
select {
|
|
case <-loadTicker.C:
|
|
err = statsHandle.RefreshVars()
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("refresh variables failed", zap.Error(err))
|
|
}
|
|
err = statsHandle.Update(do.InfoSchema())
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("update stats info failed", zap.Error(err))
|
|
}
|
|
err = statsHandle.LoadNeededHistograms()
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("load histograms failed", zap.Error(err))
|
|
}
|
|
case <-do.exit:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
func (do *Domain) syncIndexUsageWorker(owner owner.Manager) {
|
|
defer util.Recover(metrics.LabelDomain, "syncIndexUsageWorker", nil, false)
|
|
idxUsageSyncTicker := time.NewTicker(do.indexUsageSyncLease)
|
|
gcStatsTicker := time.NewTicker(100 * do.indexUsageSyncLease)
|
|
handle := do.StatsHandle()
|
|
defer func() {
|
|
idxUsageSyncTicker.Stop()
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("syncIndexUsageWorker exited.")
|
|
}()
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
// TODO: need flush index usage
|
|
return
|
|
case <-idxUsageSyncTicker.C:
|
|
if err := handle.DumpIndexUsageToKV(); err != nil {
|
|
logutil.BgLogger().Debug("dump index usage failed", zap.Error(err))
|
|
}
|
|
case <-gcStatsTicker.C:
|
|
if !owner.IsOwner() {
|
|
continue
|
|
}
|
|
if err := handle.GCIndexUsage(); err != nil {
|
|
logutil.BgLogger().Error("[stats] gc index usage failed", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (do *Domain) updateStatsWorker(ctx sessionctx.Context, owner owner.Manager) {
|
|
defer util.Recover(metrics.LabelDomain, "updateStatsWorker", nil, false)
|
|
lease := do.statsLease
|
|
deltaUpdateTicker := time.NewTicker(20 * lease)
|
|
gcStatsTicker := time.NewTicker(100 * lease)
|
|
dumpFeedbackTicker := time.NewTicker(200 * lease)
|
|
loadFeedbackTicker := time.NewTicker(5 * lease)
|
|
dumpColStatsUsageTicker := time.NewTicker(100 * lease)
|
|
statsHandle := do.StatsHandle()
|
|
defer func() {
|
|
dumpColStatsUsageTicker.Stop()
|
|
loadFeedbackTicker.Stop()
|
|
dumpFeedbackTicker.Stop()
|
|
gcStatsTicker.Stop()
|
|
deltaUpdateTicker.Stop()
|
|
do.SetStatsUpdating(false)
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("updateStatsWorker exited.")
|
|
}()
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
statsHandle.FlushStats()
|
|
owner.Cancel()
|
|
return
|
|
// This channel is sent only by ddl owner.
|
|
case t := <-statsHandle.DDLEventCh():
|
|
err := statsHandle.HandleDDLEvent(t)
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("handle ddl event failed", zap.Error(err))
|
|
}
|
|
case <-deltaUpdateTicker.C:
|
|
err := statsHandle.DumpStatsDeltaToKV(handle.DumpDelta)
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("dump stats delta failed", zap.Error(err))
|
|
}
|
|
statsHandle.UpdateErrorRate(do.InfoSchema())
|
|
case <-loadFeedbackTicker.C:
|
|
statsHandle.UpdateStatsByLocalFeedback(do.InfoSchema())
|
|
if !owner.IsOwner() {
|
|
continue
|
|
}
|
|
err := statsHandle.HandleUpdateStats(do.InfoSchema())
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("update stats using feedback failed", zap.Error(err))
|
|
}
|
|
case <-dumpFeedbackTicker.C:
|
|
err := statsHandle.DumpStatsFeedbackToKV()
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("dump stats feedback failed", zap.Error(err))
|
|
}
|
|
case <-gcStatsTicker.C:
|
|
if !owner.IsOwner() {
|
|
continue
|
|
}
|
|
err := statsHandle.GCStats(do.InfoSchema(), do.DDL().GetLease())
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("GC stats failed", zap.Error(err))
|
|
}
|
|
case <-dumpColStatsUsageTicker.C:
|
|
err := statsHandle.DumpColStatsUsageToKV()
|
|
if err != nil {
|
|
logutil.BgLogger().Debug("dump column stats usage failed", zap.Error(err))
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (do *Domain) autoAnalyzeWorker(owner owner.Manager) {
|
|
defer util.Recover(metrics.LabelDomain, "autoAnalyzeWorker", nil, false)
|
|
statsHandle := do.StatsHandle()
|
|
analyzeTicker := time.NewTicker(do.statsLease)
|
|
defer func() {
|
|
analyzeTicker.Stop()
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("autoAnalyzeWorker exited.")
|
|
}()
|
|
for {
|
|
select {
|
|
case <-analyzeTicker.C:
|
|
if owner.IsOwner() {
|
|
statsHandle.HandleAutoAnalyze(do.InfoSchema())
|
|
}
|
|
case <-do.exit:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// ExpensiveQueryHandle returns the expensive query handle.
|
|
func (do *Domain) ExpensiveQueryHandle() *expensivequery.Handle {
|
|
return do.expensiveQueryHandle
|
|
}
|
|
|
|
const (
	// privilegeKey is the etcd key that is written and watched to signal
	// privilege changes.
	privilegeKey = "/tidb/privilege"
	// sysVarCacheKey is the etcd key that is written and watched to signal
	// sysvar cache changes.
	sysVarCacheKey = "/tidb/sysvars"
)
|
|
|
|
// NotifyUpdatePrivilege updates privilege key in etcd, TiDB client that watches
|
|
// the key will get notification.
|
|
func (do *Domain) NotifyUpdatePrivilege() error {
|
|
// No matter skip-grant-table is configured or not, sending an etcd message is required.
|
|
// Because we need to tell other TiDB instances to update privilege data, say, we're changing the
|
|
// password using a special TiDB instance and want the new password to take effect.
|
|
if do.etcdClient != nil {
|
|
row := do.etcdClient.KV
|
|
_, err := row.Put(context.Background(), privilegeKey, "")
|
|
if err != nil {
|
|
logutil.BgLogger().Warn("notify update privilege failed", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// If skip-grant-table is configured, do not flush privileges.
|
|
// Because LoadPrivilegeLoop does not run and the privilege Handle is nil,
|
|
// the call to do.PrivilegeHandle().Update would panic.
|
|
if config.GetGlobalConfig().Security.SkipGrantTable {
|
|
return nil
|
|
}
|
|
|
|
// update locally
|
|
sysSessionPool := do.SysSessionPool()
|
|
ctx, err := sysSessionPool.Get()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer sysSessionPool.Put(ctx)
|
|
return do.PrivilegeHandle().Update(ctx.(sessionctx.Context))
|
|
}
|
|
|
|
// NotifyUpdateSysVarCache updates the sysvar cache key in etcd, which other TiDB
|
|
// clients are subscribed to for updates. For the caller, the cache is also built
|
|
// synchronously so that the effect is immediate.
|
|
func (do *Domain) NotifyUpdateSysVarCache() {
|
|
if do.etcdClient != nil {
|
|
row := do.etcdClient.KV
|
|
_, err := row.Put(context.Background(), sysVarCacheKey, "")
|
|
if err != nil {
|
|
logutil.BgLogger().Warn("notify update sysvar cache failed", zap.Error(err))
|
|
}
|
|
}
|
|
// update locally
|
|
if err := do.rebuildSysVarCache(nil); err != nil {
|
|
logutil.BgLogger().Error("rebuilding sysvar cache failed", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// ServerID gets serverID.
|
|
func (do *Domain) ServerID() uint64 {
|
|
return atomic.LoadUint64(&do.serverID)
|
|
}
|
|
|
|
// IsLostConnectionToPD indicates lost connection to PD or not.
|
|
func (do *Domain) IsLostConnectionToPD() bool {
|
|
return do.isLostConnectionToPD.Load() != 0
|
|
}
|
|
|
|
const (
	// serverIDEtcdPath is the etcd key prefix under which server IDs are registered.
	serverIDEtcdPath = "/tidb/server_id"
	// refreshServerIDRetryCnt is the retry count used when refreshing the server ID key.
	refreshServerIDRetryCnt = 3
	// acquireServerIDRetryInterval is the pause before proposing another random server ID.
	acquireServerIDRetryInterval = 300 * time.Millisecond
	// acquireServerIDTimeout bounds one etcd transaction that claims a server ID.
	acquireServerIDTimeout = 10 * time.Second
	// retrieveServerIDSessionTimeout bounds the etcd lease grant for the server ID session.
	retrieveServerIDSessionTimeout = 10 * time.Second
)
|
|
|
|
var (
	// serverIDTTL should be LONG ENOUGH to avoid barbarically killing an on-going long-run SQL.
	serverIDTTL = 12 * time.Hour
	// serverIDTimeToKeepAlive is the interval at which the serverID TTL is kept alive.
	serverIDTimeToKeepAlive = 5 * time.Minute
	// serverIDTimeToCheckPDConnectionRestored is the interval at which we check whether a broken
	// connection to PD has been restored.
	serverIDTimeToCheckPDConnectionRestored = 10 * time.Second
	// lostConnectionToPDTimeout is the limit after which, if TiDB still cannot connect to PD,
	// the connection to PD is considered lost utterly and the server ID acquired before should be released.
	// Must be SHORTER than `serverIDTTL`.
	lostConnectionToPDTimeout = 6 * time.Hour
)
|
|
|
|
// Settings for the global kill test, read by initByLDFlagsForGlobalKill.
// The names suggest they are overridden through linker flags at build time.
var (
	ldflagIsGlobalKillTest                        = "0"  // 1:Yes, otherwise:No.
	ldflagServerIDTTL                             = "10" // in seconds.
	ldflagServerIDTimeToKeepAlive                 = "1"  // in seconds.
	ldflagServerIDTimeToCheckPDConnectionRestored = "1"  // in seconds.
	ldflagLostConnectionToPDTimeout               = "5"  // in seconds.
)
|
|
|
|
func initByLDFlagsForGlobalKill() {
|
|
if ldflagIsGlobalKillTest == "1" {
|
|
var (
|
|
i int
|
|
err error
|
|
)
|
|
|
|
if i, err = strconv.Atoi(ldflagServerIDTTL); err != nil {
|
|
panic("invalid ldflagServerIDTTL")
|
|
}
|
|
serverIDTTL = time.Duration(i) * time.Second
|
|
|
|
if i, err = strconv.Atoi(ldflagServerIDTimeToKeepAlive); err != nil {
|
|
panic("invalid ldflagServerIDTimeToKeepAlive")
|
|
}
|
|
serverIDTimeToKeepAlive = time.Duration(i) * time.Second
|
|
|
|
if i, err = strconv.Atoi(ldflagServerIDTimeToCheckPDConnectionRestored); err != nil {
|
|
panic("invalid ldflagServerIDTimeToCheckPDConnectionRestored")
|
|
}
|
|
serverIDTimeToCheckPDConnectionRestored = time.Duration(i) * time.Second
|
|
|
|
if i, err = strconv.Atoi(ldflagLostConnectionToPDTimeout); err != nil {
|
|
panic("invalid ldflagLostConnectionToPDTimeout")
|
|
}
|
|
lostConnectionToPDTimeout = time.Duration(i) * time.Second
|
|
|
|
logutil.BgLogger().Info("global_kill_test is enabled", zap.Duration("serverIDTTL", serverIDTTL),
|
|
zap.Duration("serverIDTimeToKeepAlive", serverIDTimeToKeepAlive),
|
|
zap.Duration("serverIDTimeToCheckPDConnectionRestored", serverIDTimeToCheckPDConnectionRestored),
|
|
zap.Duration("lostConnectionToPDTimeout", lostConnectionToPDTimeout))
|
|
}
|
|
}
|
|
|
|
func (do *Domain) retrieveServerIDSession(ctx context.Context) (*concurrency.Session, error) {
|
|
if do.serverIDSession != nil {
|
|
return do.serverIDSession, nil
|
|
}
|
|
|
|
// `etcdClient.Grant` needs a shortterm timeout, to avoid blocking if connection to PD lost,
|
|
// while `etcdClient.KeepAlive` should be longterm.
|
|
// So we separately invoke `etcdClient.Grant` and `concurrency.NewSession` with leaseID.
|
|
childCtx, cancel := context.WithTimeout(ctx, retrieveServerIDSessionTimeout)
|
|
resp, err := do.etcdClient.Grant(childCtx, int64(serverIDTTL.Seconds()))
|
|
cancel()
|
|
if err != nil {
|
|
logutil.BgLogger().Error("retrieveServerIDSession.Grant fail", zap.Error(err))
|
|
return nil, err
|
|
}
|
|
leaseID := resp.ID
|
|
|
|
session, err := concurrency.NewSession(do.etcdClient,
|
|
concurrency.WithLease(leaseID), concurrency.WithContext(context.Background()))
|
|
if err != nil {
|
|
logutil.BgLogger().Error("retrieveServerIDSession.NewSession fail", zap.Error(err))
|
|
return nil, err
|
|
}
|
|
do.serverIDSession = session
|
|
return session, nil
|
|
}
|
|
|
|
func (do *Domain) acquireServerID(ctx context.Context) error {
|
|
atomic.StoreUint64(&do.serverID, 0)
|
|
|
|
session, err := do.retrieveServerIDSession(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
for {
|
|
randServerID := rand.Int63n(int64(util.MaxServerID)) + 1 // get a random serverID: [1, MaxServerID] #nosec G404
|
|
key := fmt.Sprintf("%s/%v", serverIDEtcdPath, randServerID)
|
|
cmp := clientv3.Compare(clientv3.CreateRevision(key), "=", 0)
|
|
value := "0"
|
|
|
|
childCtx, cancel := context.WithTimeout(ctx, acquireServerIDTimeout)
|
|
txn := do.etcdClient.Txn(childCtx)
|
|
t := txn.If(cmp)
|
|
resp, err := t.Then(clientv3.OpPut(key, value, clientv3.WithLease(session.Lease()))).Commit()
|
|
cancel()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !resp.Succeeded {
|
|
logutil.BgLogger().Info("proposed random serverID exists, will randomize again", zap.Int64("randServerID", randServerID))
|
|
time.Sleep(acquireServerIDRetryInterval)
|
|
continue
|
|
}
|
|
|
|
atomic.StoreUint64(&do.serverID, uint64(randServerID))
|
|
logutil.BgLogger().Info("acquireServerID", zap.Uint64("serverID", do.ServerID()),
|
|
zap.String("lease id", strconv.FormatInt(int64(session.Lease()), 16)))
|
|
return nil
|
|
}
|
|
}
|
|
|
|
func (do *Domain) refreshServerIDTTL(ctx context.Context) error {
|
|
session, err := do.retrieveServerIDSession(ctx)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
key := fmt.Sprintf("%s/%v", serverIDEtcdPath, do.ServerID())
|
|
value := "0"
|
|
err = ddlutil.PutKVToEtcd(ctx, do.etcdClient, refreshServerIDRetryCnt, key, value, clientv3.WithLease(session.Lease()))
|
|
if err != nil {
|
|
logutil.BgLogger().Error("refreshServerIDTTL fail", zap.Uint64("serverID", do.ServerID()), zap.Error(err))
|
|
} else {
|
|
logutil.BgLogger().Info("refreshServerIDTTL succeed", zap.Uint64("serverID", do.ServerID()),
|
|
zap.String("lease id", strconv.FormatInt(int64(session.Lease()), 16)))
|
|
}
|
|
return err
|
|
}
|
|
|
|
func (do *Domain) serverIDKeeper() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("serverIDKeeper exited.")
|
|
}()
|
|
defer util.Recover(metrics.LabelDomain, "serverIDKeeper", func() {
|
|
logutil.BgLogger().Info("recover serverIDKeeper.")
|
|
// should be called before `do.wg.Done()`, to ensure that Domain.Close() waits for the new `serverIDKeeper()` routine.
|
|
do.wg.Add(1)
|
|
go do.serverIDKeeper()
|
|
}, false)
|
|
|
|
tickerKeepAlive := time.NewTicker(serverIDTimeToKeepAlive)
|
|
tickerCheckRestored := time.NewTicker(serverIDTimeToCheckPDConnectionRestored)
|
|
defer func() {
|
|
tickerKeepAlive.Stop()
|
|
tickerCheckRestored.Stop()
|
|
}()
|
|
|
|
blocker := make(chan struct{}) // just used for blocking the sessionDone() when session is nil.
|
|
sessionDone := func() <-chan struct{} {
|
|
if do.serverIDSession == nil {
|
|
return blocker
|
|
}
|
|
return do.serverIDSession.Done()
|
|
}
|
|
|
|
var lastSucceedTimestamp time.Time
|
|
|
|
onConnectionToPDRestored := func() {
|
|
logutil.BgLogger().Info("restored connection to PD")
|
|
do.isLostConnectionToPD.Store(0)
|
|
lastSucceedTimestamp = time.Now()
|
|
|
|
if err := do.info.StoreServerInfo(context.Background()); err != nil {
|
|
logutil.BgLogger().Error("StoreServerInfo failed", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
onConnectionToPDLost := func() {
|
|
logutil.BgLogger().Warn("lost connection to PD")
|
|
do.isLostConnectionToPD.Store(1)
|
|
|
|
// Kill all connections when lost connection to PD,
|
|
// to avoid the possibility that another TiDB instance acquires the same serverID and generates a same connection ID,
|
|
// which will lead to a wrong connection killed.
|
|
do.InfoSyncer().GetSessionManager().KillAllConnections()
|
|
}
|
|
|
|
for {
|
|
select {
|
|
case <-tickerKeepAlive.C:
|
|
if !do.IsLostConnectionToPD() {
|
|
if err := do.refreshServerIDTTL(context.Background()); err == nil {
|
|
lastSucceedTimestamp = time.Now()
|
|
} else {
|
|
if lostConnectionToPDTimeout > 0 && time.Since(lastSucceedTimestamp) > lostConnectionToPDTimeout {
|
|
onConnectionToPDLost()
|
|
}
|
|
}
|
|
}
|
|
case <-tickerCheckRestored.C:
|
|
if do.IsLostConnectionToPD() {
|
|
if err := do.acquireServerID(context.Background()); err == nil {
|
|
onConnectionToPDRestored()
|
|
}
|
|
}
|
|
case <-sessionDone():
|
|
// inform that TTL of `serverID` is expired. See https://godoc.org/github.com/coreos/etcd/clientv3/concurrency#Session.Done
|
|
// Should be in `IsLostConnectionToPD` state, as `lostConnectionToPDTimeout` is shorter than `serverIDTTL`.
|
|
// So just set `do.serverIDSession = nil` to restart `serverID` session in `retrieveServerIDSession()`.
|
|
logutil.BgLogger().Info("serverIDSession need restart")
|
|
do.serverIDSession = nil
|
|
case <-do.exit:
|
|
return
|
|
}
|
|
}
|
|
}
|
|
|
|
// MockInfoCacheAndLoadInfoSchema only used in unit test
|
|
func (do *Domain) MockInfoCacheAndLoadInfoSchema(is infoschema.InfoSchema) {
|
|
do.infoCache = infoschema.NewCache(16)
|
|
do.infoCache.Insert(is, 0)
|
|
}
|
|
|
|
func (do *Domain) renewLease() {
|
|
defer func() {
|
|
do.wg.Done()
|
|
logutil.BgLogger().Info("renew lease goroutine exited.")
|
|
}()
|
|
for {
|
|
select {
|
|
case <-do.exit:
|
|
close(do.renewLeaseCh)
|
|
return
|
|
case op := <-do.renewLeaseCh:
|
|
op()
|
|
}
|
|
}
|
|
}
|
|
|
|
func init() {
|
|
initByLDFlagsForGlobalKill()
|
|
}
|
|
|
|
var (
|
|
// ErrInfoSchemaExpired returns the error that information schema is out of date.
|
|
ErrInfoSchemaExpired = dbterror.ClassDomain.NewStd(errno.ErrInfoSchemaExpired)
|
|
// ErrInfoSchemaChanged returns the error that information schema is changed.
|
|
ErrInfoSchemaChanged = dbterror.ClassDomain.NewStdErr(errno.ErrInfoSchemaChanged,
|
|
mysql.Message(errno.MySQLErrName[errno.ErrInfoSchemaChanged].Raw+". "+kv.TxnRetryableMark, nil))
|
|
)
|