tidb/util/cgroup/cgroup.go

// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cgroup

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"math"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"

	"github.com/cockroachdb/errors"
	"github.com/pingcap/log"
	"go.uber.org/zap"
)

// CPUQuotaStatus presents the status of how CPU quota is used
type CPUQuotaStatus int

const (
	// CPUQuotaUndefined is returned when CPU quota is undefined
	CPUQuotaUndefined CPUQuotaStatus = iota
	// CPUQuotaUsed is returned when a valid CPU quota can be used
	CPUQuotaUsed
	// CPUQuotaMinUsed is return when CPU quota is smaller than the min value
	CPUQuotaMinUsed
)

const (
	_maxProcsKey = "GOMAXPROCS"

	// They are cgroup filename for different data
	cgroupV1MemStat      = "memory.stat"
	cgroupV2MemStat      = "memory.stat"
	cgroupV2MemLimit     = "memory.max"
	cgroupV1MemUsage     = "memory.usage_in_bytes"
	cgroupV2MemUsage     = "memory.current"
	cgroupV1CPUQuota     = "cpu.cfs_quota_us"
	cgroupV1CPUPeriod    = "cpu.cfs_period_us"
	cgroupV1CPUSysUsage  = "cpuacct.usage_sys"
	cgroupV1CPUUserUsage = "cpuacct.usage_user"
	cgroupV2CPUMax       = "cpu.max"
	cgroupV2CPUStat      = "cpu.stat"

	// {memory|cpu}.stat file keys
	//
	// key for # of bytes of file-backed memory on inactive LRU list in cgroupv1
	cgroupV1MemInactiveFileUsageStatKey = "total_inactive_file"
	// key for # of bytes of file-backed memory on inactive LRU list in cgroupv2
	cgroupV2MemInactiveFileUsageStatKey = "inactive_file"
	cgroupV1MemLimitStatKey             = "hierarchical_memory_limit"
)
const (
	procPathCGroup    = "/proc/self/cgroup"
	procPathMountInfo = "/proc/self/mountinfo"
)

// CPUUsage returns CPU usage and quotas for an entire cgroup.
type CPUUsage struct {
	// System time and user time taken by this cgroup or process. In nanoseconds.
	Stime, Utime uint64
	// CPU period and quota for this process, in microseconds. This cgroup has
	// access to up to (quota/period) proportion of CPU resources on the system.
	// For instance, if there are 4 CPUs, quota = 150000, period = 100000,
	// this cgroup can use around ~1.5 CPUs, or 37.5% of total scheduler time.
	// If quota is -1, it's unlimited.
	Period, Quota int64
	// NumCPUs is the number of CPUs in the system. Always returned even if
	// not called from a cgroup.
	NumCPU int
}

// SetGOMAXPROCS is to set GOMAXPROCS to the number of CPUs.
func SetGOMAXPROCS() (func(), error) {
	const minGOMAXPROCS int = 1

	undoNoop := func() {
		log.Info("maxprocs: No GOMAXPROCS change to reset")
	}

	if max, exists := os.LookupEnv(_maxProcsKey); exists {
		log.Info(fmt.Sprintf("maxprocs: Honoring GOMAXPROCS=%q as set in environment", max))
		return undoNoop, nil
	}

	maxProcs, status, err := CPUQuotaToGOMAXPROCS(minGOMAXPROCS)
	if err != nil {
		return undoNoop, err
	}

	if status == CPUQuotaUndefined {
		log.Info(fmt.Sprintf("maxprocs: Leaving GOMAXPROCS=%v: CPU quota undefined", runtime.GOMAXPROCS(0)))
		return undoNoop, nil
	}

	prev := runtime.GOMAXPROCS(0)
	undo := func() {
		log.Info(fmt.Sprintf("maxprocs: Resetting GOMAXPROCS to %v", prev))
		runtime.GOMAXPROCS(prev)
	}
	if prev == maxProcs {
		return undoNoop, nil
	}
	switch status {
	case CPUQuotaMinUsed:
		log.Info(fmt.Sprintf("maxprocs: Updating GOMAXPROCS=%v: using minimum allowed GOMAXPROCS", maxProcs))
	case CPUQuotaUsed:
		log.Info(fmt.Sprintf("maxprocs: Updating GOMAXPROCS=%v: determined from CPU quota", maxProcs))
	}
	runtime.GOMAXPROCS(maxProcs)
	return undo, nil
}

func readFile(filepath string) (res []byte, err error) {
	var f *os.File
	//nolint:gosec
	f, err = os.Open(filepath)
	if err != nil {
		return nil, err
	}
	defer func() {
		err = errors.CombineErrors(err, f.Close())
	}()
	res, err = io.ReadAll(f)
	return res, err
}

// The controller is defined via either type `memory` for cgroup v1 or via empty type for cgroup v2,
// where the type is the second field in /proc/[pid]/cgroup file
func detectControlPath(cgroupFilePath string, controller string) (string, error) {
	//nolint:gosec
	cgroup, err := os.Open(cgroupFilePath)
	if err != nil {
		return "", errors.Wrapf(err, "failed to read %s cgroup from cgroups file: %s", controller, cgroupFilePath)
	}
	defer func() {
		err := cgroup.Close()
		if err != nil {
			log.Error("close cgroupFilePath", zap.Error(err))
		}
	}()

	scanner := bufio.NewScanner(cgroup)
	var unifiedPathIfFound string
	for scanner.Scan() {
		fields := bytes.Split(scanner.Bytes(), []byte{':'})
		if len(fields) != 3 {
			// The lines should always have three fields, there's something fishy here.
			continue
		}

		f0, f1 := string(fields[0]), string(fields[1])
		// First case if v2, second - v1. We give v2 the priority here.
		// There is also a `hybrid` mode when both  versions are enabled,
		// but no known container solutions support it.
		if f0 == "0" && f1 == "" {
			unifiedPathIfFound = string(fields[2])
		} else if f1 == controller {
			return string(fields[2]), nil
		}
	}

	return unifiedPathIfFound, nil
}

// See http://man7.org/linux/man-pages/man5/proc.5.html for `mountinfo` format.
func getCgroupDetails(mountInfoPath string, cRoot string, controller string) (string, int, error) {
	//nolint:gosec
	info, err := os.Open(mountInfoPath)
	if err != nil {
		return "", 0, errors.Wrapf(err, "failed to read mounts info from file: %s", mountInfoPath)
	}
	defer func() {
		err := info.Close()
		if err != nil {
			log.Error("close mountInfoPath", zap.Error(err))
		}
	}()

	scanner := bufio.NewScanner(info)
	for scanner.Scan() {
		fields := bytes.Fields(scanner.Bytes())
		if len(fields) < 10 {
			continue
		}

		ver, ok := detectCgroupVersion(fields, controller)
		if ok {
			mountPoint := string(fields[4])
			if ver == 2 {
				return mountPoint, ver, nil
			}
			// It is possible that the controller mount and the cgroup path are not the same (both are relative to the NS root).
			// So start with the mount and construct the relative path of the cgroup.
			// To test:
			//  1、start a docker to run unit test or tidb-server
			//   > docker run -it --cpus=8 --memory=8g --name test --rm ubuntu:18.04 bash
			//
			//  2、change the limit when the container is running
			//	docker update --cpus=8 <containers>
			nsRelativePath := string(fields[3])
			if !strings.Contains(nsRelativePath, "..") {
				// We don't expect to see err here ever but in case that it happens
				// the best action is to ignore the line and hope that the rest of the lines
				// will allow us to extract a valid path.
				if relPath, err := filepath.Rel(nsRelativePath, cRoot); err == nil {
					return filepath.Join(mountPoint, relPath), ver, nil
				}
			}
		}
	}

	return "", 0, fmt.Errorf("failed to detect cgroup root mount and version")
}

func cgroupFileToUint64(filepath, desc string) (res uint64, err error) {
	contents, err := readFile(filepath)
	if err != nil {
		return 0, errors.Wrapf(err, "error when reading %s from cgroup v1 at %s", desc, filepath)
	}
	res, err = strconv.ParseUint(string(bytes.TrimSpace(contents)), 10, 64)
	if err != nil {
		return 0, errors.Wrapf(err, "error when parsing %s from cgroup v1 at %s", desc, filepath)
	}
	return res, err
}

func cgroupFileToInt64(filepath, desc string) (res int64, err error) {
	contents, err := readFile(filepath)
	if err != nil {
		return 0, errors.Wrapf(err, "error when reading %s from cgroup v1 at %s", desc, filepath)
	}
	res, err = strconv.ParseInt(string(bytes.TrimSpace(contents)), 10, 64)
	if err != nil {
		return 0, errors.Wrapf(err, "error when parsing %s from cgroup v1 at %s", desc, filepath)
	}
	return res, nil
}

// Return version of cgroup mount for memory controller if found
func detectCgroupVersion(fields [][]byte, controller string) (_ int, found bool) {
	if len(fields) < 10 {
		return 0, false
	}

	// Due to strange format there can be optional fields in the middle of the set, starting
	// from the field #7. The end of the fields is marked with "-" field
	var pos = 6
	for pos < len(fields) {
		if bytes.Equal(fields[pos], []byte{'-'}) {
			break
		}

		pos++
	}

	// No optional fields separator found or there is less than 3 fields after it which is wrong
	if (len(fields) - pos - 1) < 3 {
		return 0, false
	}

	pos++

	// Check for controller specifically in cgroup v1 (it is listed in super
	// options field), as the value can't be found if it is not enforced.
	if bytes.Equal(fields[pos], []byte("cgroup")) && bytes.Contains(fields[pos+2], []byte(controller)) {
		return 1, true
	} else if bytes.Equal(fields[pos], []byte("cgroup2")) {
		return 2, true
	}

	return 0, false
}

func detectCPUQuotaInV1(cRoot string) (period, quota int64, err error) {
	quotaFilePath := filepath.Join(cRoot, cgroupV1CPUQuota)
	periodFilePath := filepath.Join(cRoot, cgroupV1CPUPeriod)
	quota, err = cgroupFileToInt64(quotaFilePath, "cpu quota")
	if err != nil {
		return 0, 0, err
	}
	period, err = cgroupFileToInt64(periodFilePath, "cpu period")
	if err != nil {
		return 0, 0, err
	}

	return period, quota, err
}

func detectCPUUsageInV1(cRoot string) (stime, utime uint64, err error) {
	sysFilePath := filepath.Join(cRoot, cgroupV1CPUSysUsage)
	userFilePath := filepath.Join(cRoot, cgroupV1CPUUserUsage)
	stime, err = cgroupFileToUint64(sysFilePath, "cpu system time")
	if err != nil {
		return 0, 0, err
	}
	utime, err = cgroupFileToUint64(userFilePath, "cpu user time")
	if err != nil {
		return 0, 0, err
	}

	return stime, utime, err
}

func detectCPUQuotaInV2(cRoot string) (period, quota int64, err error) {
	maxFilePath := filepath.Join(cRoot, cgroupV2CPUMax)
	contents, err := readFile(maxFilePath)
	if err != nil {
		return 0, 0, errors.Wrapf(err, "error when read cpu quota from cgroup v2 at %s", maxFilePath)
	}
	fields := strings.Fields(string(contents))
	if len(fields) > 2 || len(fields) == 0 {
		return 0, 0, errors.Errorf("unexpected format when reading cpu quota from cgroup v2 at %s: %s", maxFilePath, contents)
	}
	if fields[0] == "max" {
		// Negative quota denotes no limit.
		quota = -1
	} else {
		quota, err = strconv.ParseInt(fields[0], 10, 64)
		if err != nil {
			return 0, 0, errors.Wrapf(err, "error when reading cpu quota from cgroup v2 at %s", maxFilePath)
		}
	}
	if len(fields) == 2 {
		period, err = strconv.ParseInt(fields[1], 10, 64)
		if err != nil {
			return 0, 0, errors.Wrapf(err, "error when reading cpu period from cgroup v2 at %s", maxFilePath)
		}
	}
	return period, quota, nil
}

func detectCPUUsageInV2(cRoot string) (stime, utime uint64, err error) {
	statFilePath := filepath.Join(cRoot, cgroupV2CPUStat)
	var stat *os.File
	//nolint:gosec
	stat, err = os.Open(statFilePath)
	if err != nil {
		return 0, 0, errors.Wrapf(err, "can't read cpu usage from cgroup v2 at %s", statFilePath)
	}
	defer func() {
		err = errors.CombineErrors(err, stat.Close())
	}()

	scanner := bufio.NewScanner(stat)
	for scanner.Scan() {
		fields := bytes.Fields(scanner.Bytes())
		if len(fields) != 2 || (string(fields[0]) != "user_usec" && string(fields[0]) != "system_usec") {
			continue
		}
		keyField := string(fields[0])

		trimmed := string(bytes.TrimSpace(fields[1]))
		usageVar := &stime
		if keyField == "user_usec" {
			usageVar = &utime
		}
		*usageVar, err = strconv.ParseUint(trimmed, 10, 64)
		if err != nil {
			return 0, 0, errors.Wrapf(err, "can't read cpu usage %s from cgroup v1 at %s", keyField, statFilePath)
		}
	}

	return stime, utime, err
}

func readInt64Value(root, filename string, cgVersion int) (value uint64, err error) {
	filePath := filepath.Join(root, filename)
	//nolint:gosec
	file, err := os.Open(filePath)
	if err != nil {
		return 0, errors.Wrapf(err, "can't read %s from cgroup v%d", filename, cgVersion)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	present := scanner.Scan()
	if !present {
		return 0, errors.Wrapf(err, "no value found in %s from cgroup v%d", filename, cgVersion)
	}
	data := scanner.Bytes()
	trimmed := string(bytes.TrimSpace(data))
	// cgroupv2 has certain control files that default to "max", so handle here.
	if trimmed == "max" {
		return math.MaxInt64, nil
	}
	value, err = strconv.ParseUint(trimmed, 10, 64)
	if err != nil {
		return 0, errors.Wrapf(err, "failed to parse value in %s from cgroup v%d", filename, cgVersion)
	}
	return value, nil
}