kubernetes/vendor/github.com/google/cadvisor/resctrl/collector.go

//go:build linux
// +build linux

// Copyright 2021 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Collector of resctrl for a container.
package resctrl

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"k8s.io/klog/v2"

	info "github.com/google/cadvisor/info/v1"
)

const noInterval = 0

type collector struct {
	id                string
	interval          time.Duration
	getContainerPids  func() ([]string, error)
	resctrlPath       string
	running           bool
	destroyed         bool
	numberOfNUMANodes int
	vendorID          string
	mu                sync.Mutex
	inHostNamespace   bool
}

func newCollector(id string, getContainerPids func() ([]string, error), interval time.Duration, numberOfNUMANodes int, vendorID string, inHostNamespace bool) *collector {
	return &collector{id: id, interval: interval, getContainerPids: getContainerPids, numberOfNUMANodes: numberOfNUMANodes,
		vendorID: vendorID, mu: sync.Mutex{}, inHostNamespace: inHostNamespace}
}

func (c *collector) setup() error {
	var err error
	c.resctrlPath, err = prepareMonitoringGroup(c.id, c.getContainerPids, c.inHostNamespace)

	if c.interval != noInterval {
		if err != nil {
			klog.Errorf("Failed to setup container %q resctrl collector: %s \n Trying again in next intervals.", c.id, err)
		} else {
			c.running = true
		}
		go func() {
			for {
				time.Sleep(c.interval)
				c.mu.Lock()
				if c.destroyed {
					break
				}
				klog.V(5).Infof("Trying to check %q containers control group.", c.id)
				if c.running {
					err = c.checkMonitoringGroup()
					if err != nil {
						c.running = false
						klog.Errorf("Failed to check %q resctrl collector control group: %s \n Trying again in next intervals.", c.id, err)
					}
				} else {
					c.resctrlPath, err = prepareMonitoringGroup(c.id, c.getContainerPids, c.inHostNamespace)
					if err != nil {
						c.running = false
						klog.Errorf("Failed to setup container %q resctrl collector: %s \n Trying again in next intervals.", c.id, err)
					}
				}
				c.mu.Unlock()
			}
		}()
	} else {
		// There is no interval set, if setup fail, stop.
		if err != nil {
			return fmt.Errorf("failed to setup container %q resctrl collector: %w", c.id, err)
		}
		c.running = true
	}

	return nil
}

func (c *collector) checkMonitoringGroup() error {
	newPath, err := prepareMonitoringGroup(c.id, c.getContainerPids, c.inHostNamespace)
	if err != nil {
		return fmt.Errorf("couldn't obtain mon_group path: %v", err)
	}

	// Check if container moved between control groups.
	if newPath != c.resctrlPath {
		err = c.clear()
		if err != nil {
			return fmt.Errorf("couldn't clear previous monitoring group: %w", err)
		}
		c.resctrlPath = newPath
	}

	return nil
}

func (c *collector) UpdateStats(stats *info.ContainerStats) error {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.running {
		stats.Resctrl = info.ResctrlStats{}

		resctrlStats, err := getIntelRDTStatsFrom(c.resctrlPath, c.vendorID)
		if err != nil {
			return err
		}

		stats.Resctrl.MemoryBandwidth = make([]info.MemoryBandwidthStats, 0, c.numberOfNUMANodes)
		stats.Resctrl.Cache = make([]info.CacheStats, 0, c.numberOfNUMANodes)

		for _, numaNodeStats := range *resctrlStats.MBMStats {
			stats.Resctrl.MemoryBandwidth = append(stats.Resctrl.MemoryBandwidth,
				info.MemoryBandwidthStats{
					TotalBytes: numaNodeStats.MBMTotalBytes,
					LocalBytes: numaNodeStats.MBMLocalBytes,
				})
		}

		for _, numaNodeStats := range *resctrlStats.CMTStats {
			stats.Resctrl.Cache = append(stats.Resctrl.Cache,
				info.CacheStats{LLCOccupancy: numaNodeStats.LLCOccupancy})
		}
	}

	return nil
}

func (c *collector) Destroy() {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.running = false
	err := c.clear()
	if err != nil {
		klog.Errorf("trying to destroy %q resctrl collector but: %v", c.id, err)
	}
	c.destroyed = true
}

func (c *collector) clear() error {
	// Not allowed to remove root or undefined resctrl directory.
	if c.id != rootContainer && c.resctrlPath != "" {
		// Remove only own prepared mon group.
		if strings.HasPrefix(filepath.Base(c.resctrlPath), monGroupPrefix) {
			err := os.RemoveAll(c.resctrlPath)
			if err != nil {
				return fmt.Errorf("couldn't clear mon_group: %v", err)
			}
		}
	}
	return nil
}