psi.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0
/*
 * Pressure stall information for CPU, memory and IO
 *
 * Copyright (c) 2018 Facebook, Inc.
 * Author: Johannes Weiner <[email protected]>
 *
 * Polling support by Suren Baghdasaryan <[email protected]>
 * Copyright (c) 2018 Google, Inc.
 *
 * When CPU, memory and IO are contended, tasks experience delays that
 * reduce throughput and introduce latencies into the workload. Memory
 * and IO contention, in addition, can cause a full loss of forward
 * progress in which the CPU goes idle.
 *
 * This code aggregates individual task delays into resource pressure
 * metrics that indicate problems with both workload health and
 * resource utilization.
 *
 *			Model
 *
 * The time in which a task can execute on a CPU is our baseline for
 * productivity. Pressure expresses the amount of time in which this
 * potential cannot be realized due to resource contention.
 *
 * This concept of productivity has two components: the workload and
 * the CPU. To measure the impact of pressure on both, we define two
 * contention states for a resource: SOME and FULL.
 *
 * In the SOME state of a given resource, one or more tasks are
 * delayed on that resource. This affects the workload's ability to
 * perform work, but the CPU may still be executing other tasks.
 *
 * In the FULL state of a given resource, all non-idle tasks are
 * delayed on that resource such that nobody is advancing and the CPU
 * goes idle. This leaves both workload and CPU unproductive.
 *
 *	SOME = nr_delayed_tasks != 0
 *	FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
 *
 * What it means for a task to be productive is defined differently
 * for each resource. For IO, productive means a running task. For
 * memory, productive means a running task that isn't a reclaimer. For
 * CPU, productive means an on-CPU task.
 *
 * Naturally, the FULL state doesn't exist for the CPU resource at the
 * system level, but exist at the cgroup level. At the cgroup level,
 * FULL means all non-idle tasks in the cgroup are delayed on the CPU
 * resource which is being used by others outside of the cgroup or
 * throttled by the cgroup cpu.max configuration.
 *
 * The percentage of wall clock time spent in those compound stall
 * states gives pressure numbers between 0 and 100 for each resource,
 * where the SOME percentage indicates workload slowdowns and the FULL
 * percentage indicates reduced CPU utilization:
 *
 *	%SOME = time(SOME) / period
 *	%FULL = time(FULL) / period
 *
 *			Multiple CPUs
 *
 * The more tasks and available CPUs there are, the more work can be
 * performed concurrently. This means that the potential that can go
 * unrealized due to resource contention *also* scales with non-idle
 * tasks and CPUs.
 *
 * Consider a scenario where 257 number crunching tasks are trying to
 * run concurrently on 256 CPUs. If we simply aggregated the task
 * states, we would have to conclude a CPU SOME pressure number of
 * 100%, since *somebody* is waiting on a runqueue at all
 * times. However, that is clearly not the amount of contention the
 * workload is experiencing: only one out of 256 possible execution
 * threads will be contended at any given time, or about 0.4%.
 *
 * Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
 * given time *one* of the tasks is delayed due to a lack of memory.
 * Again, looking purely at the task state would yield a memory FULL
 * pressure number of 0%, since *somebody* is always making forward
 * progress. But again this wouldn't capture the amount of execution
 * potential lost, which is 1 out of 4 CPUs, or 25%.
 *
 * To calculate wasted potential (pressure) with multiple processors,
 * we have to base our calculation on the number of non-idle tasks in
 * conjunction with the number of available CPUs, which is the number
 * of potential execution threads. SOME becomes then the proportion of
 * delayed tasks to possible threads, and FULL is the share of possible
 * threads that are unproductive due to delays:
 *
 *	threads = min(nr_nonidle_tasks, nr_cpus)
 *	   SOME = min(nr_delayed_tasks / threads, 1)
 *	   FULL = (threads - min(nr_productive_tasks, threads)) / threads
 *
 * For the 257 number crunchers on 256 CPUs, this yields:
 *
 *	threads = min(257, 256)
 *	   SOME = min(1 / 256, 1)             = 0.4%
 *	   FULL = (256 - min(256, 256)) / 256 = 0%
 *
 * For the 1 out of 4 memory-delayed tasks, this yields:
 *
 *	threads = min(4, 4)
 *	   SOME = min(1 / 4, 1)               = 25%
 *	   FULL = (4 - min(3, 4)) / 4         = 25%
 *
 * [ Substitute nr_cpus with 1, and you can see that it's a natural
 *   extension of the single-CPU model. ]
 *
 *			Implementation
 *
 * To assess the precise time spent in each such state, we would have
 * to freeze the system on task changes and start/stop the state
 * clocks accordingly. Obviously that doesn't scale in practice.
 *
 * Because the scheduler aims to distribute the compute load evenly
 * among the available CPUs, we can track task state locally to each
 * CPU and, at much lower frequency, extrapolate the global state for
 * the cumulative stall times and the running averages.
 *
 * For each runqueue, we track:
 *
 *	   tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
 *	   tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
 *	tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
 *
 * and then periodically aggregate:
 *
 *	tNONIDLE = sum(tNONIDLE[i])
 *
 *	   tSOME = sum(tSOME[i] * tNONIDLE[i]) / tNONIDLE
 *	   tFULL = sum(tFULL[i] * tNONIDLE[i]) / tNONIDLE
 *
 *	   %SOME = tSOME / period
 *	   %FULL = tFULL / period
 *
 * This gives us an approximation of pressure that is practical
 * cost-wise, yet way more sensitive and accurate than periodic
 * sampling of the aggregate task states would be.
 */

static int psi_bug __read_mostly;

DEFINE_STATIC_KEY_FALSE(psi_disabled);
static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);

#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
#else
static bool psi_enable = true;
#endif
static int __init setup_psi(char *str)
{ … }
__setup(…);

/* Running averages - we need to be higher-res than loadavg */
#define PSI_FREQ …
#define EXP_10s …
#define EXP_60s …
#define EXP_300s …

/* PSI trigger definitions */
#define WINDOW_MAX_US …
#define UPDATES_PER_WINDOW …

/* Sampling frequency in nanoseconds */
static u64 psi_period __read_mostly;

/* System-level pressure and stall tracking */
static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu);
struct psi_group psi_system = …;

static void psi_avgs_work(struct work_struct *work);

static void poll_timer_fn(struct timer_list *t);

static void group_init(struct psi_group *group)
{ … }

void __init psi_init(void)
{ … }

static u32 test_states(unsigned int *tasks, u32 state_mask)
{ … }

static void get_recent_times(struct psi_group *group, int cpu,
			     enum psi_aggregators aggregator, u32 *times,
			     u32 *pchanged_states)
{ … }

static void calc_avgs(unsigned long avg[3], int missed_periods,
		      u64 time, u64 period)
{ … }

static void collect_percpu_times(struct psi_group *group,
				 enum psi_aggregators aggregator,
				 u32 *pchanged_states)
{ … }

/* Trigger tracking window manipulations */
static void window_reset(struct psi_window *win, u64 now, u64 value,
			 u64 prev_growth)
{ … }

/*
 * PSI growth tracking window update and growth calculation routine.
 *
 * This approximates a sliding tracking window by interpolating
 * partially elapsed windows using historical growth data from the
 * previous intervals. This minimizes memory requirements (by not storing
 * all the intermediate values in the previous window) and simplifies
 * the calculations. It works well because PSI signal changes only in
 * positive direction and over relatively small window sizes the growth
 * is close to linear.
 */
static u64 window_update(struct psi_window *win, u64 now, u64 value)
{ … }

static void update_triggers(struct psi_group *group, u64 now,
						   enum psi_aggregators aggregator)
{ … }

static u64 update_averages(struct psi_group *group, u64 now)
{ … }

static void psi_avgs_work(struct work_struct *work)
{ … }

static void init_rtpoll_triggers(struct psi_group *group, u64 now)
{ … }

/* Schedule rtpolling if it's not already scheduled or forced. */
static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay,
				   bool force)
{ … }

static void psi_rtpoll_work(struct psi_group *group)
{ … }

static int psi_rtpoll_worker(void *data)
{ … }

static void poll_timer_fn(struct timer_list *t)
{ … }

static void record_times(struct psi_group_cpu *groupc, u64 now)
{ … }

static void psi_group_change(struct psi_group *group, int cpu,
			     unsigned int clear, unsigned int set, u64 now,
			     bool wake_clock)
{ … }

static inline struct psi_group *task_psi_group(struct task_struct *task)
{ … }

static void psi_flags_change(struct task_struct *task, int clear, int set)
{ … }

void psi_task_change(struct task_struct *task, int clear, int set)
{ … }

void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		     bool sleep)
{ … }

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{ … }
#endif

/**
 * psi_memstall_enter - mark the beginning of a memory stall section
 * @flags: flags to handle nested sections
 *
 * Marks the calling task as being stalled due to a lack of memory,
 * such as waiting for a refault or performing reclaim.
 */
void psi_memstall_enter(unsigned long *flags)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * psi_memstall_leave - mark the end of an memory stall section
 * @flags: flags to handle nested memdelay sections
 *
 * Marks the calling task as no longer stalled due to lack of memory.
 */
void psi_memstall_leave(unsigned long *flags)
{ … }
EXPORT_SYMBOL_GPL(…);

#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{ … }

void psi_cgroup_free(struct cgroup *cgroup)
{ … }

/**
 * cgroup_move_task - move task to a different cgroup
 * @task: the task
 * @to: the target css_set
 *
 * Move task to a new cgroup and safely migrate its associated stall
 * state between the different groups.
 *
 * This function acquires the task's rq lock to lock out concurrent
 * changes to the task's scheduling state and - in case the task is
 * running - concurrent changes to its stall state.
 */
void cgroup_move_task(struct task_struct *task, struct css_set *to)
{ … }

void psi_cgroup_restart(struct psi_group *group)
{ … }
#endif /* CONFIG_CGROUPS */

int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{ … }

struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf,
				       enum psi_res res, struct file *file,
				       struct kernfs_open_file *of)
{ … }

void psi_trigger_destroy(struct psi_trigger *t)
{ … }

__poll_t psi_trigger_poll(void **trigger_ptr,
				struct file *file, poll_table *wait)
{ … }

#ifdef CONFIG_PROC_FS
static int psi_io_show(struct seq_file *m, void *v)
{ … }

static int psi_memory_show(struct seq_file *m, void *v)
{ … }

static int psi_cpu_show(struct seq_file *m, void *v)
{ … }

static int psi_io_open(struct inode *inode, struct file *file)
{ … }

static int psi_memory_open(struct inode *inode, struct file *file)
{ … }

static int psi_cpu_open(struct inode *inode, struct file *file)
{ … }

static ssize_t psi_write(struct file *file, const char __user *user_buf,
			 size_t nbytes, enum psi_res res)
{ … }

static ssize_t psi_io_write(struct file *file, const char __user *user_buf,
			    size_t nbytes, loff_t *ppos)
{ … }

static ssize_t psi_memory_write(struct file *file, const char __user *user_buf,
				size_t nbytes, loff_t *ppos)
{ … }

static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf,
			     size_t nbytes, loff_t *ppos)
{ … }

static __poll_t psi_fop_poll(struct file *file, poll_table *wait)
{ … }

static int psi_fop_release(struct inode *inode, struct file *file)
{ … }

static const struct proc_ops psi_io_proc_ops = …;

static const struct proc_ops psi_memory_proc_ops = …;

static const struct proc_ops psi_cpu_proc_ops = …;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int psi_irq_show(struct seq_file *m, void *v)
{ … }

static int psi_irq_open(struct inode *inode, struct file *file)
{ … }

static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
			     size_t nbytes, loff_t *ppos)
{ … }

static const struct proc_ops psi_irq_proc_ops = …;
#endif

static int __init psi_proc_init(void)
{ … }
module_init(…) …;

#endif /* CONFIG_PROC_FS */
linux/kernel/sched/psi.c