kubernetes/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go

package fs

import (
	"bufio"
	"errors"
	"fmt"
	"math"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	"golang.org/x/sys/unix"

	"github.com/opencontainers/runc/libcontainer/cgroups"
	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
	"github.com/opencontainers/runc/libcontainer/configs"
)

const (
	cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
	cgroupMemoryLimit     = "memory.limit_in_bytes"
	cgroupMemoryUsage     = "memory.usage_in_bytes"
	cgroupMemoryMaxUsage  = "memory.max_usage_in_bytes"
)

type MemoryGroup struct{}

func (s *MemoryGroup) Name() string {
	return "memory"
}

func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error {
	return apply(path, pid)
}

func setMemory(path string, val int64) error {
	if val == 0 {
		return nil
	}

	err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
	if !errors.Is(err, unix.EBUSY) {
		return err
	}

	// EBUSY means the kernel can't set new limit as it's too low
	// (lower than the current usage). Return more specific error.
	usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
	if err != nil {
		return err
	}
	max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
	if err != nil {
		return err
	}

	return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}

func setSwap(path string, val int64) error {
	if val == 0 {
		return nil
	}

	return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}

func setMemoryAndSwap(path string, r *configs.Resources) error {
	// If the memory update is set to -1 and the swap is not explicitly
	// set, we should also set swap to -1, it means unlimited memory.
	if r.Memory == -1 && r.MemorySwap == 0 {
		// Only set swap if it's enabled in kernel
		if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
			r.MemorySwap = -1
		}
	}

	// When memory and swap memory are both set, we need to handle the cases
	// for updating container.
	if r.Memory != 0 && r.MemorySwap != 0 {
		curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
		if err != nil {
			return err
		}

		// When update memory limit, we should adapt the write sequence
		// for memory and swap memory, so it won't fail because the new
		// value and the old value don't fit kernel's validation.
		if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
			if err := setSwap(path, r.MemorySwap); err != nil {
				return err
			}
			if err := setMemory(path, r.Memory); err != nil {
				return err
			}
			return nil
		}
	}

	if err := setMemory(path, r.Memory); err != nil {
		return err
	}
	if err := setSwap(path, r.MemorySwap); err != nil {
		return err
	}

	return nil
}

func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
	if err := setMemoryAndSwap(path, r); err != nil {
		return err
	}

	// ignore KernelMemory and KernelMemoryTCP

	if r.MemoryReservation != 0 {
		if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
			return err
		}
	}

	if r.OomKillDisable {
		if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
			return err
		}
	}
	if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
		return nil
	} else if *r.MemorySwappiness <= 100 {
		if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
			return err
		}
	} else {
		return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness)
	}

	return nil
}

func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
	const file = "memory.stat"
	statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}
	defer statsFile.Close()

	sc := bufio.NewScanner(statsFile)
	for sc.Scan() {
		t, v, err := fscommon.ParseKeyValue(sc.Text())
		if err != nil {
			return &parseError{Path: path, File: file, Err: err}
		}
		stats.MemoryStats.Stats[t] = v
	}
	stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]

	memoryUsage, err := getMemoryData(path, "")
	if err != nil {
		return err
	}
	stats.MemoryStats.Usage = memoryUsage
	swapUsage, err := getMemoryData(path, "memsw")
	if err != nil {
		return err
	}
	stats.MemoryStats.SwapUsage = swapUsage
	stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{
		Usage:   swapUsage.Usage - memoryUsage.Usage,
		Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt,
	}
	kernelUsage, err := getMemoryData(path, "kmem")
	if err != nil {
		return err
	}
	stats.MemoryStats.KernelUsage = kernelUsage
	kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
	if err != nil {
		return err
	}
	stats.MemoryStats.KernelTCPUsage = kernelTCPUsage

	value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
	if err != nil {
		return err
	}
	if value == 1 {
		stats.MemoryStats.UseHierarchy = true
	}

	pagesByNUMA, err := getPageUsageByNUMA(path)
	if err != nil {
		return err
	}
	stats.MemoryStats.PageUsageByNUMA = pagesByNUMA

	return nil
}

func getMemoryData(path, name string) (cgroups.MemoryData, error) {
	memoryData := cgroups.MemoryData{}

	moduleName := "memory"
	if name != "" {
		moduleName = "memory." + name
	}
	var (
		usage    = moduleName + ".usage_in_bytes"
		maxUsage = moduleName + ".max_usage_in_bytes"
		failcnt  = moduleName + ".failcnt"
		limit    = moduleName + ".limit_in_bytes"
	)

	value, err := fscommon.GetCgroupParamUint(path, usage)
	if err != nil {
		if name != "" && os.IsNotExist(err) {
			// Ignore ENOENT as swap and kmem controllers
			// are optional in the kernel.
			return cgroups.MemoryData{}, nil
		}
		return cgroups.MemoryData{}, err
	}
	memoryData.Usage = value
	value, err = fscommon.GetCgroupParamUint(path, maxUsage)
	if err != nil {
		return cgroups.MemoryData{}, err
	}
	memoryData.MaxUsage = value
	value, err = fscommon.GetCgroupParamUint(path, failcnt)
	if err != nil {
		return cgroups.MemoryData{}, err
	}
	memoryData.Failcnt = value
	value, err = fscommon.GetCgroupParamUint(path, limit)
	if err != nil {
		if name == "kmem" && os.IsNotExist(err) {
			// Ignore ENOENT as kmem.limit_in_bytes has
			// been removed in newer kernels.
			return memoryData, nil
		}

		return cgroups.MemoryData{}, err
	}
	memoryData.Limit = value

	return memoryData, nil
}

func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
	const (
		maxColumns = math.MaxUint8 + 1
		file       = "memory.numa_stat"
	)
	stats := cgroups.PageUsageByNUMA{}

	fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
	if os.IsNotExist(err) {
		return stats, nil
	} else if err != nil {
		return stats, err
	}
	defer fd.Close()

	// File format is documented in linux/Documentation/cgroup-v1/memory.txt
	// and it looks like this:
	//
	// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
	// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
	// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
	// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
	// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...

	scanner := bufio.NewScanner(fd)
	for scanner.Scan() {
		var field *cgroups.PageStats

		line := scanner.Text()
		columns := strings.SplitN(line, " ", maxColumns)
		for i, column := range columns {
			byNode := strings.SplitN(column, "=", 2)
			// Some custom kernels have non-standard fields, like
			//   numa_locality 0 0 0 0 0 0 0 0 0 0
			//   numa_exectime 0
			if len(byNode) < 2 {
				if i == 0 {
					// Ignore/skip those.
					break
				} else {
					// The first column was already validated,
					// so be strict to the rest.
					return stats, malformedLine(path, file, line)
				}
			}
			key, val := byNode[0], byNode[1]
			if i == 0 { // First column: key is name, val is total.
				field = getNUMAField(&stats, key)
				if field == nil { // unknown field (new kernel?)
					break
				}
				field.Total, err = strconv.ParseUint(val, 0, 64)
				if err != nil {
					return stats, &parseError{Path: path, File: file, Err: err}
				}
				field.Nodes = map[uint8]uint64{}
			} else { // Subsequent columns: key is N<id>, val is usage.
				if len(key) < 2 || key[0] != 'N' {
					// This is definitely an error.
					return stats, malformedLine(path, file, line)
				}

				n, err := strconv.ParseUint(key[1:], 10, 8)
				if err != nil {
					return stats, &parseError{Path: path, File: file, Err: err}
				}

				usage, err := strconv.ParseUint(val, 10, 64)
				if err != nil {
					return stats, &parseError{Path: path, File: file, Err: err}
				}

				field.Nodes[uint8(n)] = usage
			}

		}
	}
	if err := scanner.Err(); err != nil {
		return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err}
	}

	return stats, nil
}

func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
	switch name {
	case "total":
		return &stats.Total
	case "file":
		return &stats.File
	case "anon":
		return &stats.Anon
	case "unevictable":
		return &stats.Unevictable
	case "hierarchical_total":
		return &stats.Hierarchical.Total
	case "hierarchical_file":
		return &stats.Hierarchical.File
	case "hierarchical_anon":
		return &stats.Hierarchical.Anon
	case "hierarchical_unevictable":
		return &stats.Hierarchical.Unevictable
	}
	return nil
}