linux/arch/x86/kernel/cpu/aperfmperf.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * x86 APERF/MPERF KHz calculation for
 * /sys/.../cpufreq/scaling_cur_freq
 *
 * Copyright (C) 2017 Intel Corp.
 * Author: Len Brown <[email protected]>
 */
#include <linux/cpufreq.h>
#include <linux/delay.h>
#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/sched/isolation.h>
#include <linux/sched/topology.h>
#include <linux/smp.h>
#include <linux/syscore_ops.h>

#include <asm/cpu.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>

#include "cpu.h"

struct aperfmperf {
	seqcount_t	seq;		/* protects the snapshot below */
	unsigned long	last_update;	/* jiffies at the last tick sample */
	u64		acnt;		/* APERF delta accumulated at tick */
	u64		mcnt;		/* MPERF delta accumulated at tick */
	u64		aperf;		/* last raw APERF MSR value */
	u64		mperf;		/* last raw MPERF MSR value */
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct aperfmperf, cpu_samples) = {
	.seq = SEQCNT_ZERO(cpu_samples.seq)
};

static void init_counter_refs(void)
{}

#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
 * APERF/MPERF frequency ratio computation.
 *
 * The scheduler wants to do frequency invariant accounting and needs a <1
 * ratio to account for the 'current' frequency, corresponding to
 * freq_curr / freq_max.
 *
 * Since the frequency freq_curr on x86 is controlled by a micro-controller and
 * our P-state setting is little more than a request/hint, we need to observe
 * the effective frequency 'BusyMHz', i.e. the average frequency over a time
 * interval after discarding idle time. This is given by:
 *
 *   BusyMHz = delta_APERF / delta_MPERF * freq_base
 *
 * where freq_base is the max non-turbo P-state.
 *
 * The freq_max term has to be set to a somewhat arbitrary value, because we
 * can't know which turbo states will be available at a given point in time:
 * it all depends on the thermal headroom of the entire package. We set it to
 * the turbo level with 4 cores active.
 *
 * Benchmarks show that's a good compromise between the 1C turbo ratio
 * (freq_curr/freq_max would rarely reach 1) and something close to freq_base,
 * which would ignore the entire turbo range (a conspicuous part, making
 * freq_curr/freq_max always maxed out).
 *
 * An exception to the heuristic above is the Atom uarch, where we choose the
 * highest turbo level for freq_max since Atoms are generally oriented towards
 * power efficiency.
 *
 * Setting freq_max to anything less than the 1C turbo ratio makes the ratio
 * freq_curr / freq_max eventually grow above 1, in which case we clip it to 1.
 */
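
/*
 * Illustrative sketch (added for this write-up, not part of the original
 * file): the fixed-point arithmetic described above.  Given the APERF/MPERF
 * deltas of one sampling period and max_freq_ratio == freq_max / freq_base
 * expressed in SCHED_CAPACITY_SCALE units, it returns freq_curr / freq_max
 * in the same units, clipped to SCHED_CAPACITY_SCALE.  The helper name is
 * made up and overflow checking is omitted for brevity.
 */
static __always_unused u64 example_freq_curr_over_freq_max(u64 acnt, u64 mcnt,
							    u64 max_freq_ratio)
{
	u64 freq_scale;

	/* No MPERF delta means no usable sample; assume we ran at freq_max. */
	if (!mcnt || !max_freq_ratio)
		return SCHED_CAPACITY_SCALE;

	/*
	 * freq_curr / freq_max
	 *	= (delta_APERF / delta_MPERF * freq_base) / freq_max
	 *	= delta_APERF / (delta_MPERF * freq_max / freq_base)
	 */
	freq_scale = div64_u64(acnt << (2 * SCHED_CAPACITY_SHIFT),
			       mcnt * max_freq_ratio);

	return freq_scale > SCHED_CAPACITY_SCALE ? SCHED_CAPACITY_SCALE : freq_scale;
}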

DEFINE_STATIC_KEY_FALSE(arch_scale_freq_key);

static u64 arch_turbo_freq_ratio = SCHED_CAPACITY_SCALE;
static u64 arch_max_freq_ratio = SCHED_CAPACITY_SCALE;

void arch_set_max_freq_ratio(bool turbo_disabled)
{}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);

static bool __init turbo_disabled(void)
{}

static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{}

#define X86_MATCH(vfm)					\
	X86_MATCH_VFM_FEATURE(vfm, X86_FEATURE_APERFMPERF, NULL)

static const struct x86_cpu_id has_knl_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_XEON_PHI_KNL),
	X86_MATCH(INTEL_XEON_PHI_KNM),
	{}
};

static const struct x86_cpu_id has_skx_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_SKYLAKE_X),
	{}
};

static const struct x86_cpu_id has_glm_turbo_ratio_limits[] __initconst = {
	X86_MATCH(INTEL_ATOM_GOLDMONT),
	X86_MATCH(INTEL_ATOM_GOLDMONT_D),
	X86_MATCH(INTEL_ATOM_GOLDMONT_PLUS),
	{}
};

static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
					  int num_delta_fratio)
{}

static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
{}

static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{}

static bool __init intel_set_max_freq_ratio(void)
{}

#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
	.resume = init_counter_refs,
};

static void register_freq_invariance_syscore_ops(void)
{}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif

static void freq_invariance_enable(void)
{}

void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled)
{}

static void __init bp_init_freq_invariance(void)
{}

static void disable_freq_invariance_workfn(struct work_struct *work)
{}

static DECLARE_WORK(disable_freq_invariance_work,
		    disable_freq_invariance_workfn);

DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
EXPORT_PER_CPU_SYMBOL_GPL(arch_freq_scale);

static DEFINE_STATIC_KEY_FALSE(arch_hybrid_cap_scale_key);

struct arch_hybrid_cpu_scale {
	unsigned long	capacity;	/* cap / max_cap in SCHED_CAPACITY_SCALE units */
	unsigned long	freq_ratio;	/* cap_freq / base_freq in SCHED_CAPACITY_SCALE units */
};

static struct arch_hybrid_cpu_scale __percpu *arch_cpu_scale;

/**
 * arch_enable_hybrid_capacity_scale() - Enable hybrid CPU capacity scaling
 *
 * Allocate memory for per-CPU data used by hybrid CPU capacity scaling,
 * initialize it and set the static key controlling its code paths.
 *
 * Must be called before arch_set_cpu_capacity().
 */
bool arch_enable_hybrid_capacity_scale(void)
{}
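
/*
 * Illustrative sketch (an assumption, not the in-tree implementation): one
 * way the steps described in the kerneldoc above could look.  The field
 * names of struct arch_hybrid_cpu_scale come from the declaration earlier
 * in this file; the body itself is only for illustration and is compiled
 * out.
 */
#if 0
bool arch_enable_hybrid_capacity_scale(void)
{
	int cpu;

	/* Already enabled: nothing to do. */
	if (static_branch_unlikely(&arch_hybrid_cap_scale_key))
		return true;

	/* Allocate the per-CPU data used by arch_set_cpu_capacity(). */
	arch_cpu_scale = alloc_percpu(struct arch_hybrid_cpu_scale);
	if (!arch_cpu_scale)
		return false;

	/* Start every CPU at full capacity and the current max freq ratio. */
	for_each_possible_cpu(cpu) {
		per_cpu_ptr(arch_cpu_scale, cpu)->capacity = SCHED_CAPACITY_SCALE;
		per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio = arch_max_freq_ratio;
	}

	/* Flip the static key controlling the hybrid scaling code paths. */
	static_branch_enable(&arch_hybrid_cap_scale_key);

	return true;
}
#endif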

/**
 * arch_set_cpu_capacity() - Set scale-invariance parameters for a CPU
 * @cpu: Target CPU.
 * @cap: Capacity of @cpu at its maximum frequency, relative to @max_cap.
 * @max_cap: System-wide maximum CPU capacity.
 * @cap_freq: Frequency of @cpu corresponding to @cap.
 * @base_freq: Frequency of @cpu at which MPERF counts.
 *
 * The units in which @cap and @max_cap are expressed do not matter, so long
 * as they are consistent, because the former is effectively divided by the
 * latter.  Analogously for @cap_freq and @base_freq.
 *
 * After calling this function for all CPUs, call arch_rebuild_sched_domains()
 * to let the scheduler know that capacity-aware scheduling can be used going
 * forward.
 */
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{}
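
/*
 * Illustrative sketch (an assumption, not the in-tree implementation): per
 * the kerneldoc above only the ratios cap / max_cap and cap_freq / base_freq
 * matter, so a minimal version could store them in SCHED_CAPACITY_SCALE
 * fixed point, as below.  The block is compiled out.
 */
#if 0
void arch_set_cpu_capacity(int cpu, unsigned long cap, unsigned long max_cap,
			   unsigned long cap_freq, unsigned long base_freq)
{
	if (WARN_ON_ONCE(!static_branch_likely(&arch_hybrid_cap_scale_key)))
		return;

	/* cap / max_cap in SCHED_CAPACITY_SCALE units. */
	WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->capacity,
		   div64_u64((u64)cap << SCHED_CAPACITY_SHIFT, max_cap));
	/* cap_freq / base_freq in SCHED_CAPACITY_SCALE units. */
	WRITE_ONCE(per_cpu_ptr(arch_cpu_scale, cpu)->freq_ratio,
		   div64_u64((u64)cap_freq << SCHED_CAPACITY_SHIFT, base_freq));
}
#endif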

unsigned long arch_scale_cpu_capacity(int cpu)
{}
EXPORT_SYMBOL_GPL(arch_scale_cpu_capacity);

static void scale_freq_tick(u64 acnt, u64 mcnt)
{}
#else
static inline void bp_init_freq_invariance(void) { }
static inline void scale_freq_tick(u64 acnt, u64 mcnt) { }
#endif /* CONFIG_X86_64 && CONFIG_SMP */

void arch_scale_freq_tick(void)
{}

/*
 * Discard samples older than the defined maximum sample age of 20ms. There
 * is no point in sending IPIs in such a case. If the scheduler tick was
 * not running then the CPU is either idle or isolated.
 */
#define MAX_SAMPLE_AGE	((unsigned long)HZ / 50)
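
/*
 * Illustrative sketch (added here, not from the original file): the age
 * check described above boils down to a jiffies comparison against
 * MAX_SAMPLE_AGE, e.g.:
 */
static __always_unused bool example_sample_is_stale(unsigned long last_update)
{
	/* True if the last tick-time sample is older than MAX_SAMPLE_AGE. */
	return time_after(jiffies, last_update + MAX_SAMPLE_AGE);
}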

unsigned int arch_freq_get_on_cpu(int cpu)
{}

static int __init bp_init_aperfmperf(void)
{}
early_initcall(bp_init_aperfmperf);

void ap_init_aperfmperf(void)
{}