kubernetes/pkg/kubelet/cm/cgroup_v2_manager_linux.go

/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package cm

import (
	"fmt"
	"os"
	"path"
	"path/filepath"
	"strconv"
	"strings"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	cmutil "k8s.io/kubernetes/pkg/kubelet/cm/util"
)

// cgroupv2MemLimitFile is the name of the cgroup interface file that the
// memory config helpers in this file read from and write to.
const cgroupv2MemLimitFile string = "memory.max"

// cgroupV2impl implements the CgroupManager interface
// for cgroup v2.
// It's a stateless object which can be used to
// update, create or delete any number of cgroups.
// It relies on runc/libcontainer cgroup managers.
type cgroupV2impl struct {
	// cgroupCommon is embedded, so its fields and methods are promoted
	// onto cgroupV2impl.
	cgroupCommon
}

// NewCgroupV2Manager builds a CgroupManager for cgroup v2. The subsystems and
// the cgroup driver name are handed to newCgroupCommon, whose result becomes
// the embedded common state of the returned manager.
func NewCgroupV2Manager(cs *CgroupSubsystems, cgroupDriver string) CgroupManager {
	common := newCgroupCommon(cs, cgroupDriver)
	manager := &cgroupV2impl{cgroupCommon: common}
	return manager
}

// Version reports the cgroup version this manager implements, which is
// always 2.
func (c *cgroupV2impl) Version() int {
	const unifiedHierarchyVersion = 2
	return unifiedHierarchyVersion
}

// Validate checks that every controller returned by
// getSupportedUnifiedControllers is listed in the cgroup.controllers file of
// the named cgroup. It returns an error when that file cannot be read or when
// at least one of those controllers is absent, and nil otherwise.
func (c *cgroupV2impl) Validate(name CgroupName) error {
	// Resolve the wanted set first: the call order matters because the
	// supported-controller lookup may panic on its first invocation.
	wanted := getSupportedUnifiedControllers()
	enabled, err := readUnifiedControllers(c.buildCgroupUnifiedPath(name))
	if err != nil {
		return fmt.Errorf("could not read controllers for cgroup %q: %w", name, err)
	}

	missing := wanted.Difference(enabled)
	if missing.Len() == 0 {
		return nil
	}
	return fmt.Errorf("cgroup %q has some missing controllers: %v", name, strings.Join(sets.List(missing), ", "))
}

// Exists reports whether the named cgroup passes Validate, i.e. whether its
// controllers file is readable and lists every needed controller.
func (c *cgroupV2impl) Exists(name CgroupName) bool {
	if err := c.Validate(name); err != nil {
		return false
	}
	return true
}

// MemoryUsage returns the value of the memory.current file of the specified
// cgroup, converted to int64, together with any error from reading it.
func (c *cgroupV2impl) MemoryUsage(name CgroupName) (int64, error) {
	const usageFile = "memory.current"
	usage, err := fscommon.GetCgroupParamUint(c.buildCgroupUnifiedPath(name), usageFile)
	return int64(usage), err
}

// GetCgroupConfig returns the resource config values currently applied to the
// cgroup for the given resource type. Only CPU and memory are handled; any
// other resource, or a resource for which no cgroup path can be built, yields
// an error.
func (c *cgroupV2impl) GetCgroupConfig(name CgroupName, resource v1.ResourceName) (*ResourceConfig, error) {
	resourcePath, ok := c.buildCgroupPaths(name)[string(resource)]
	if !ok {
		return nil, fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
	}

	switch resource {
	case v1.ResourceCPU:
		return c.getCgroupCPUConfig(resourcePath)
	case v1.ResourceMemory:
		return c.getCgroupMemoryConfig(resourcePath)
	default:
		return nil, fmt.Errorf("unsupported resource %v for cgroup %v", resource, name)
	}
}

// SetCgroupConfig applies resourceConfig to the cgroup for the given resource
// type. CPU and memory are dispatched to their dedicated setters. An error is
// returned when no cgroup path can be built for the resource; a resource that
// has a path but is neither CPU nor memory is ignored and nil is returned.
func (c *cgroupV2impl) SetCgroupConfig(name CgroupName, resource v1.ResourceName, resourceConfig *ResourceConfig) error {
	resourcePath, ok := c.buildCgroupPaths(name)[string(resource)]
	if !ok {
		return fmt.Errorf("failed to build %v cgroup fs path for cgroup %v", resource, name)
	}

	if resource == v1.ResourceCPU {
		return c.setCgroupCPUConfig(resourcePath, resourceConfig)
	}
	if resource == v1.ResourceMemory {
		return c.setCgroupMemoryConfig(resourcePath, resourceConfig)
	}
	return nil
}

// getCgroupCPUConfig reads the CPU settings of the cgroup at cgroupPath.
//
// The cpu.max file is expected to hold two whitespace-separated fields: the
// limit and the period. A limit equal to Cgroup2MaxCpuLimit is reported as a
// quota of -1; any other limit is parsed as a signed integer. The cpu.weight
// file is read as well and converted with cpuWeightToCPUShares. Any read or
// parse failure is returned as an error with a nil config.
func (c *cgroupV2impl) getCgroupCPUConfig(cgroupPath string) (*ResourceConfig, error) {
	rawCPUMax, err := fscommon.GetCgroupParamString(cgroupPath, "cpu.max")
	if err != nil {
		return nil, fmt.Errorf("failed to read cpu.max file for cgroup %v: %w", cgroupPath, err)
	}

	var quotaField, periodField string
	if parsed, scanErr := fmt.Sscanf(rawCPUMax, "%s %s", &quotaField, &periodField); scanErr != nil || parsed != 2 {
		return nil, fmt.Errorf("failed to correctly parse content of cpu.max file ('%s') for cgroup %v: %w",
			rawCPUMax, cgroupPath, scanErr)
	}

	// -1 stands for "no limit" and is kept when the file holds the max marker.
	quota := int64(-1)
	if quotaField != Cgroup2MaxCpuLimit {
		if quota, err = strconv.ParseInt(quotaField, 10, 64); err != nil {
			return nil, fmt.Errorf("failed to convert CPU limit as integer for cgroup %v: %w", cgroupPath, err)
		}
	}

	period, err := strconv.ParseUint(periodField, 10, 64)
	if err != nil {
		return nil, fmt.Errorf("failed to convert CPU period as integer for cgroup %v: %w", cgroupPath, err)
	}

	weight, err := fscommon.GetCgroupParamUint(cgroupPath, "cpu.weight")
	if err != nil {
		return nil, fmt.Errorf("failed to read CPU weight for cgroup %v: %w", cgroupPath, err)
	}
	shares := cpuWeightToCPUShares(weight)

	config := &ResourceConfig{
		CPUShares: &shares,
		CPUQuota:  &quota,
		CPUPeriod: &period,
	}
	return config, nil
}

// setCgroupCPUConfig writes the CPU settings from resourceConfig to the cgroup
// at cgroupPath.
//
// When CPUQuota is set, CPUPeriod must be set too; "<limit> <period>" is then
// written to cpu.max, where the limit is Cgroup2MaxCpuLimit for a negative
// quota. When CPUShares is set, it is converted with cpuSharesToCPUWeight and
// written to cpu.weight. Fields left nil are not touched.
func (c *cgroupV2impl) setCgroupCPUConfig(cgroupPath string, resourceConfig *ResourceConfig) error {
	if quota := resourceConfig.CPUQuota; quota != nil {
		period := resourceConfig.CPUPeriod
		if period == nil {
			return fmt.Errorf("CpuPeriod must be specified in order to set CpuLimit")
		}

		var limit string
		if *quota > -1 {
			limit = strconv.FormatInt(*quota, 10)
		} else {
			limit = Cgroup2MaxCpuLimit
		}
		cpuMax := limit + " " + strconv.FormatUint(*period, 10)

		cpuMaxFile := filepath.Join(cgroupPath, "cpu.max")
		if err := os.WriteFile(cpuMaxFile, []byte(cpuMax), 0700); err != nil {
			return fmt.Errorf("failed to write %v to %v: %w", cpuMax, cgroupPath, err)
		}
	}

	if shares := resourceConfig.CPUShares; shares != nil {
		weight := strconv.FormatUint(cpuSharesToCPUWeight(*shares), 10)

		cpuWeightFile := filepath.Join(cgroupPath, "cpu.weight")
		if err := os.WriteFile(cpuWeightFile, []byte(weight), 0700); err != nil {
			return fmt.Errorf("failed to write %v to %v: %w", weight, cgroupPath, err)
		}
	}

	return nil
}

// setCgroupMemoryConfig hands resourceConfig to writeCgroupMemoryLimit, aimed
// at the memory.max file inside cgroupPath.
func (c *cgroupV2impl) setCgroupMemoryConfig(cgroupPath string, resourceConfig *ResourceConfig) error {
	memLimitFile := filepath.Join(cgroupPath, cgroupv2MemLimitFile)
	return writeCgroupMemoryLimit(memLimitFile, resourceConfig)
}

// getCgroupMemoryConfig delegates to readCgroupMemoryConfig, passing the
// cgroup directory and the memory.max file name.
func (c *cgroupV2impl) getCgroupMemoryConfig(cgroupPath string) (*ResourceConfig, error) {
	config, err := readCgroupMemoryConfig(cgroupPath, cgroupv2MemLimitFile)
	return config, err
}

// getSupportedUnifiedControllers returns the set of controllers supported when
// running on cgroup v2: the intersection of the controllers used by the
// Kubelet and the controllers listed at the root cgroup.
//
// The root controllers are read inside availableRootControllersOnce.Do and
// stored in availableRootControllers (both declared elsewhere in the package).
// If that read fails the function panics; the panic value wraps the underlying
// error so the root cause is visible in the crash output.
func getSupportedUnifiedControllers() sets.Set[string] {
	// This is the set of controllers used by the Kubelet
	supportedControllers := sets.New("cpu", "cpuset", "memory", "hugetlb", "pids")
	// Memoize the set of controllers that are present in the root cgroup
	availableRootControllersOnce.Do(func() {
		var err error
		availableRootControllers, err = readUnifiedControllers(cmutil.CgroupRoot)
		if err != nil {
			// Keep the original error in the chain instead of discarding it.
			panic(fmt.Errorf("cannot read cgroup controllers at %s: %w", cmutil.CgroupRoot, err))
		}
	})
	// Return the set of controllers that are supported both by the Kubelet and by the kernel
	return supportedControllers.Intersection(availableRootControllers)
}

// readUnifiedControllers returns the whitespace-separated entries of the
// cgroup.controllers file found in the given cgroup directory, as a set.
// A failure to read the file is returned unchanged with a nil set.
func readUnifiedControllers(path string) (sets.Set[string], error) {
	raw, err := os.ReadFile(filepath.Join(path, "cgroup.controllers"))
	if err != nil {
		return nil, err
	}
	return sets.New(strings.Fields(string(raw))...), nil
}

// buildCgroupUnifiedPath joins cmutil.CgroupRoot with the name produced by
// c.Name for the given cgroup and returns the resulting path.
func (c *cgroupV2impl) buildCgroupUnifiedPath(name CgroupName) string {
	return path.Join(cmutil.CgroupRoot, c.Name(name))
}

// cpuSharesToCPUWeight converts a cgroup v1 cpu.shares value to a cgroup v2
// cpu.weight value by mapping the shares range [2, 262144] linearly onto the
// weight range [1, 10000].
// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
//
// Shares outside [2, 262144] are clamped to the nearest bound before the
// conversion. Without the clamp, a value below 2 would wrap around in the
// unsigned subtraction and a value above 262144 would produce a weight larger
// than 10000.
func cpuSharesToCPUWeight(cpuShares uint64) uint64 {
	const (
		lowestShares  uint64 = 2
		highestShares uint64 = 262144
	)
	switch {
	case cpuShares < lowestShares:
		cpuShares = lowestShares
	case cpuShares > highestShares:
		cpuShares = highestShares
	}
	// 9999 and 262142 are the widths of the weight and shares ranges.
	return (((cpuShares - lowestShares) * 9999) / 262142) + 1
}

// cpuWeightToCPUShares converts a cgroup v2 cpu.weight value to a cgroup v1
// cpu.shares value by mapping the weight range [1, 10000] linearly onto the
// shares range [2, 262144].
// https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2254-cgroup-v2#phase-1-convert-from-cgroups-v1-settings-to-v2
//
// Weights outside [1, 10000] are clamped to the nearest bound before the
// conversion. Without the clamp, a weight of 0 would wrap around in the
// unsigned subtraction and yield a meaningless, huge shares value.
// NOTE(review): the kernel is understood to report cpu.weight as 0 for
// cpu.idle cgroups — confirm against the cgroup v2 documentation.
func cpuWeightToCPUShares(cpuWeight uint64) uint64 {
	const (
		lowestWeight  uint64 = 1
		highestWeight uint64 = 10000
	)
	switch {
	case cpuWeight < lowestWeight:
		cpuWeight = lowestWeight
	case cpuWeight > highestWeight:
		cpuWeight = highestWeight
	}
	// 262142 and 9999 are the widths of the shares and weight ranges.
	return (((cpuWeight - lowestWeight) * 262142) / 9999) + 2
}