linux/drivers/powercap/intel_rapl_common.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Common code for Intel Running Average Power Limit (RAPL) support.
 * Copyright (c) 2019, Intel Corporation.
 */
#define pr_fmt(fmt)

#include <linux/bitmap.h>
#include <linux/cleanup.h>
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/intel_rapl.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/log2.h>
#include <linux/module.h>
#include <linux/nospec.h>
#include <linux/perf_event.h>
#include <linux/platform_device.h>
#include <linux/powercap.h>
#include <linux/processor.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/sysfs.h>
#include <linux/types.h>

#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/iosf_mbi.h>

/* bitmasks for RAPL MSRs, used by primitive access functions */
#define ENERGY_STATUS_MASK

#define POWER_LIMIT1_MASK
#define POWER_LIMIT1_ENABLE
#define POWER_LIMIT1_CLAMP

#define POWER_LIMIT2_MASK
#define POWER_LIMIT2_ENABLE
#define POWER_LIMIT2_CLAMP
#define POWER_HIGH_LOCK
#define POWER_LOW_LOCK

#define POWER_LIMIT4_MASK

#define TIME_WINDOW1_MASK
#define TIME_WINDOW2_MASK

#define POWER_UNIT_OFFSET
#define POWER_UNIT_MASK

#define ENERGY_UNIT_OFFSET
#define ENERGY_UNIT_MASK

#define TIME_UNIT_OFFSET
#define TIME_UNIT_MASK

#define POWER_INFO_MAX_MASK
#define POWER_INFO_MIN_MASK
#define POWER_INFO_MAX_TIME_WIN_MASK
#define POWER_INFO_THERMAL_SPEC_MASK

#define PERF_STATUS_THROTTLE_TIME_MASK
#define PP_POLICY_MASK

/*
 * SPR has different layout for Psys Domain PowerLimit registers.
 * There are 17 bits of PL1 and PL2 instead of 15 bits.
 * The Enable bits and TimeWindow bits are also shifted as a result.
 */
#define PSYS_POWER_LIMIT1_MASK
#define PSYS_POWER_LIMIT1_ENABLE

#define PSYS_POWER_LIMIT2_MASK
#define PSYS_POWER_LIMIT2_ENABLE

#define PSYS_TIME_WINDOW1_MASK
#define PSYS_TIME_WINDOW2_MASK

/* bitmasks for RAPL TPMI, used by primitive access functions */
#define TPMI_POWER_LIMIT_MASK
#define TPMI_POWER_LIMIT_ENABLE
#define TPMI_TIME_WINDOW_MASK
#define TPMI_INFO_SPEC_MASK
#define TPMI_INFO_MIN_MASK
#define TPMI_INFO_MAX_MASK
#define TPMI_INFO_MAX_TIME_WIN_MASK

/* Non HW constants */
#define RAPL_PRIMITIVE_DERIVED
#define RAPL_PRIMITIVE_DUMMY

#define TIME_WINDOW_MAX_MSEC
#define TIME_WINDOW_MIN_MSEC
#define ENERGY_UNIT_SCALE
enum unit_type {};

/* per domain data, some are optional */
#define NR_RAW_PRIMITIVES

#define DOMAIN_STATE_INACTIVE
#define DOMAIN_STATE_POWER_LIMIT_SET

static const char *pl_names[NR_POWER_LIMITS] =;

enum pl_prims {};

static bool is_pl_valid(struct rapl_domain *rd, int pl)
{}

static int get_pl_lock_prim(struct rapl_domain *rd, int pl)
{}

static int get_pl_prim(struct rapl_domain *rd, int pl, enum pl_prims prim)
{}

#define power_zone_to_rapl_domain(_zone)

struct rapl_defaults {};
static struct rapl_defaults *defaults_msr;
static const struct rapl_defaults defaults_tpmi;

static struct rapl_defaults *get_defaults(struct rapl_package *rp)
{}

/* Sideband MBI registers */
#define IOSF_CPU_POWER_BUDGET_CTL_BYT
#define IOSF_CPU_POWER_BUDGET_CTL_TNG

#define PACKAGE_PLN_INT_SAVED
#define MAX_PRIM_NAME

/* per domain data. used to describe individual knobs such that access function
 * can be consolidated into one instead of many inline functions.
 */
struct rapl_primitive_info {};

#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f)

static void rapl_init_domains(struct rapl_package *rp);
static int rapl_read_data_raw(struct rapl_domain *rd,
			      enum rapl_primitives prim,
			      bool xlate, u64 *data);
static int rapl_write_data_raw(struct rapl_domain *rd,
			       enum rapl_primitives prim,
			       unsigned long long value);
static int rapl_read_pl_data(struct rapl_domain *rd, int pl,
			      enum pl_prims pl_prim,
			      bool xlate, u64 *data);
static int rapl_write_pl_data(struct rapl_domain *rd, int pl,
			       enum pl_prims pl_prim,
			       unsigned long long value);
static u64 rapl_unit_xlate(struct rapl_domain *rd,
			   enum unit_type type, u64 value, int to_raw);
static void package_power_limit_irq_save(struct rapl_package *rp);

static LIST_HEAD(rapl_packages);	/* guarded by CPU hotplug lock */

static const char *const rapl_domain_names[] =;

static int get_energy_counter(struct powercap_zone *power_zone,
			      u64 *energy_raw)
{}

static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy)
{}

static int release_zone(struct powercap_zone *power_zone)
{}

static int find_nr_power_limit(struct rapl_domain *rd)
{}

static int set_domain_enable(struct powercap_zone *power_zone, bool mode)
{}

static int get_domain_enable(struct powercap_zone *power_zone, bool *mode)
{}

/* per RAPL domain ops, in the order of rapl_domain_type */
static const struct powercap_zone_ops zone_ops[] =;

/*
 * Constraint index used by powercap can be different than power limit (PL)
 * index in that some PLs may be missing due to non-existent MSRs. So we
 * need to convert here by finding the valid PLs only (name populated).
 */
static int contraint_to_pl(struct rapl_domain *rd, int cid)
{}

static int set_power_limit(struct powercap_zone *power_zone, int cid,
			   u64 power_limit)
{}

static int get_current_power_limit(struct powercap_zone *power_zone, int cid,
				   u64 *data)
{}

static int set_time_window(struct powercap_zone *power_zone, int cid,
			   u64 window)
{}

static int get_time_window(struct powercap_zone *power_zone, int cid,
			   u64 *data)
{}

static const char *get_constraint_name(struct powercap_zone *power_zone,
				       int cid)
{}

static int get_max_power(struct powercap_zone *power_zone, int cid, u64 *data)
{}

static const struct powercap_zone_constraint_ops constraint_ops =;

/* Return the id used for read_raw/write_raw callback */
static int get_rid(struct rapl_package *rp)
{}

/* called after domain detection and package level data are set */
static void rapl_init_domains(struct rapl_package *rp)
{}

static u64 rapl_unit_xlate(struct rapl_domain *rd, enum unit_type type,
			   u64 value, int to_raw)
{}

/* RAPL primitives for MSR and MMIO I/F */
static struct rapl_primitive_info rpi_msr[NR_RAPL_PRIMITIVES] =;

/* RAPL primitives for TPMI I/F */
static struct rapl_primitive_info rpi_tpmi[NR_RAPL_PRIMITIVES] =;

static struct rapl_primitive_info *get_rpi(struct rapl_package *rp, int prim)
{}

static int rapl_config(struct rapl_package *rp)
{}

static enum rapl_primitives
prim_fixups(struct rapl_domain *rd, enum rapl_primitives prim)
{}

/* Read primitive data based on its related struct rapl_primitive_info.
 * if xlate flag is set, return translated data based on data units, i.e.
 * time, energy, and power.
 * RAPL MSRs are non-architectural and are not laid out consistently across
 * domains. Here we use primitive info to allow writing consolidated access
 * functions.
 * For a given primitive, it is processed by MSR mask and shift. Unit conversion
 * is pre-assigned based on RAPL unit MSRs read at init time.
 * 63-------------------------- 31--------------------------- 0
 * |                           xxxxx (mask)                   |
 * |                                |<- shift ----------------|
 * 63-------------------------- 31--------------------------- 0
 */
static int rapl_read_data_raw(struct rapl_domain *rd,
			      enum rapl_primitives prim, bool xlate, u64 *data)
{}

/* Similar use of primitive info in the read counterpart */
static int rapl_write_data_raw(struct rapl_domain *rd,
			       enum rapl_primitives prim,
			       unsigned long long value)
{}

static int rapl_read_pl_data(struct rapl_domain *rd, int pl,
			      enum pl_prims pl_prim, bool xlate, u64 *data)
{}

static int rapl_write_pl_data(struct rapl_domain *rd, int pl,
			       enum pl_prims pl_prim,
			       unsigned long long value)
{}
/*
 * Raw RAPL data stored in MSRs are in certain scales. We need to
 * convert them into standard units based on the units reported in
 * the RAPL unit MSRs. This is specific to CPUs as the method to
 * calculate units differ on different CPUs.
 * We convert the units to below format based on CPUs.
 * i.e.
 * energy unit: picoJoules  : Represented in picoJoules by default
 * power unit : microWatts  : Represented in milliWatts by default
 * time unit  : microseconds: Represented in seconds by default
 */
static int rapl_check_unit_core(struct rapl_domain *rd)
{}

static int rapl_check_unit_atom(struct rapl_domain *rd)
{}

static void power_limit_irq_save_cpu(void *info)
{}

/* REVISIT:
 * When package power limit is set artificially low by RAPL, LVT
 * thermal interrupt for package power limit should be ignored
 * since we are not really exceeding the real limit. The intention
 * is to avoid excessive interrupts while we are trying to save power.
 * A useful feature might be routing the package_power_limit interrupt
 * to userspace via eventfd. Once we have a use case, this is simple
 * to do by adding an atomic notifier.
 */

static void package_power_limit_irq_save(struct rapl_package *rp)
{}

/*
 * Restore per package power limit interrupt enable state. Called from cpu
 * hotplug code on package removal.
 */
static void package_power_limit_irq_restore(struct rapl_package *rp)
{}

static void set_floor_freq_default(struct rapl_domain *rd, bool mode)
{}

static void set_floor_freq_atom(struct rapl_domain *rd, bool enable)
{}

static u64 rapl_compute_time_window_core(struct rapl_domain *rd, u64 value,
					 bool to_raw)
{}

static u64 rapl_compute_time_window_atom(struct rapl_domain *rd, u64 value,
					 bool to_raw)
{}

/* TPMI Unit register has different layout */
#define TPMI_POWER_UNIT_OFFSET
#define TPMI_POWER_UNIT_MASK
#define TPMI_ENERGY_UNIT_OFFSET
#define TPMI_ENERGY_UNIT_MASK
#define TPMI_TIME_UNIT_OFFSET
#define TPMI_TIME_UNIT_MASK

static int rapl_check_unit_tpmi(struct rapl_domain *rd)
{}

static const struct rapl_defaults defaults_tpmi =;

static const struct rapl_defaults rapl_defaults_core =;

static const struct rapl_defaults rapl_defaults_hsw_server =;

static const struct rapl_defaults rapl_defaults_spr_server =;

static const struct rapl_defaults rapl_defaults_byt =;

static const struct rapl_defaults rapl_defaults_tng =;

static const struct rapl_defaults rapl_defaults_ann =;

static const struct rapl_defaults rapl_defaults_cht =;

static const struct rapl_defaults rapl_defaults_amd =;

static const struct x86_cpu_id rapl_ids[] __initconst =;
MODULE_DEVICE_TABLE(x86cpu, rapl_ids);

/* Read once for all raw primitive data for domains */
static void rapl_update_domain_data(struct rapl_package *rp)
{}

static int rapl_package_register_powercap(struct rapl_package *rp)
{}

static int rapl_check_domain(int domain, struct rapl_package *rp)
{}

/*
 * Get per domain energy/power/time unit.
 * RAPL Interfaces without per domain unit register will use the package
 * scope unit register to set per domain units.
 */
static int rapl_get_domain_unit(struct rapl_domain *rd)
{}

/*
 * Check if power limits are available. Two cases when they are not available:
 * 1. Locked by BIOS, in this case we still provide read-only access so that
 *    users can see what limit is set by the BIOS.
 * 2. Some CPUs make some domains monitoring only which means PLx MSRs may not
 *    exist at all. In this case, we do not show the constraints in powercap.
 *
 * Called after domains are detected and initialized.
 */
static void rapl_detect_powerlimit(struct rapl_domain *rd)
{}

/* Detect active and valid domains for the given CPU, caller must
 * ensure the CPU belongs to the targeted package and CPU hotplug is disabled.
 */
static int rapl_detect_domains(struct rapl_package *rp)
{}

#ifdef CONFIG_PERF_EVENTS

/*
 * Support for RAPL PMU
 *
 * Register a PMU if any of the registered RAPL Packages have the requirement
 * of exposing its energy counters via Perf PMU.
 *
 * PMU Name:
 *	power
 *
 * Events:
 *	Name		Event id	RAPL Domain
 *	energy_cores	0x01		RAPL_DOMAIN_PP0
 *	energy_pkg	0x02		RAPL_DOMAIN_PACKAGE
 *	energy_ram	0x03		RAPL_DOMAIN_DRAM
 *	energy_gpu	0x04		RAPL_DOMAIN_PP1
 *	energy_psys	0x05		RAPL_DOMAIN_PLATFORM
 *
 * Unit:
 *	Joules
 *
 * Scale:
 *	2.3283064365386962890625e-10
 *	The same RAPL domain in different RAPL Packages may have different
 *	energy units. Use 2.3283064365386962890625e-10 (2^-32) Joules as
 *	the fixed unit for all energy counters, and convert each hardware
 *	counter increase to N times of PMU event counter increases.
 *
 * This is fully compatible with the current MSR RAPL PMU. This means that
 * userspace programs like turbostat can use the same code to handle RAPL Perf
 * PMU, no matter which RAPL Interface driver (MSR/TPMI, etc.) is running
 * underneath on the platform.
 *
 * Note that RAPL Packages can be probed/removed dynamically, and the events
 * supported by each TPMI RAPL device can be different. Thus the RAPL PMU
 * support is done on demand, which means
 * 1. PMU is registered only if it is needed by a RAPL Package. PMU events for
 *    unsupported counters are not exposed.
 * 2. PMU is unregistered and registered when a new RAPL Package is probed and
 *    supports new counters that are not supported by current PMU.
 * 3. PMU is unregistered when all registered RAPL Packages don't need PMU.
 */

struct rapl_pmu {};

static struct rapl_pmu rapl_pmu;

/* PMU helpers */

static int get_pmu_cpu(struct rapl_package *rp)
{}

static bool is_rp_pmu_cpu(struct rapl_package *rp, int cpu)
{}

static struct rapl_package_pmu_data *event_to_pmu_data(struct perf_event *event)
{}

/* PMU event callbacks */

static u64 event_read_counter(struct perf_event *event)
{}

static void __rapl_pmu_event_start(struct perf_event *event)
{}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{}

static u64 rapl_event_update(struct perf_event *event)
{}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{}

/* RAPL PMU event ids, same as shown in sysfs */
enum perf_rapl_events {};
#define RAPL_EVENT_MASK

static const int event_to_domain[PERF_RAPL_MAX] =;

static int rapl_pmu_event_init(struct perf_event *event)
{}

static void rapl_pmu_event_read(struct perf_event *event)
{}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{}

/* PMU sysfs attributes */

/*
 * There are no default events, but we need to create "events" group (with
 * empty attrs) before updating it with detected events.
 */
static struct attribute *attrs_empty[] =;

static struct attribute_group pmu_events_group =;

static ssize_t cpumask_show(struct device *dev,
			    struct device_attribute *attr, char *buf)
{}

static DEVICE_ATTR_RO(cpumask);

static struct attribute *pmu_cpumask_attrs[] =;

static struct attribute_group pmu_cpumask_group =;

PMU_FORMAT_ATTR();
static struct attribute *pmu_format_attr[] =;

static struct attribute_group pmu_format_group =;

static const struct attribute_group *pmu_attr_groups[] =;

#define RAPL_EVENT_ATTR_STR(_name, v, str)

RAPL_EVENT_ATTR_STR(energy-cores,	rapl_cores,	"event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg,		rapl_pkg,	"event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram,		rapl_ram,	"event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu,		rapl_gpu,	"event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,	rapl_psys,	"event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit,	rapl_unit_cores,	"Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit,	rapl_unit_pkg,		"Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit,	rapl_unit_ram,		"Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit,	rapl_unit_gpu,		"Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,	rapl_unit_psys,		"Joules");

RAPL_EVENT_ATTR_STR(energy-cores.scale,	rapl_scale_cores,	"2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,	rapl_scale_pkg,		"2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,	rapl_scale_ram,		"2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,	rapl_scale_gpu,		"2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,	rapl_scale_psys,	"2.3283064365386962890625e-10");

#define RAPL_EVENT_GROUP(_name, domain)

RAPL_EVENT_GROUP();
RAPL_EVENT_GROUP();
RAPL_EVENT_GROUP();
RAPL_EVENT_GROUP();
RAPL_EVENT_GROUP();

static const struct attribute_group *pmu_attr_update[] =;

static int rapl_pmu_update(struct rapl_package *rp)
{}

int rapl_package_add_pmu(struct rapl_package *rp)
{}
EXPORT_SYMBOL_GPL();

void rapl_package_remove_pmu(struct rapl_package *rp)
{}
EXPORT_SYMBOL_GPL();
#endif

/* called from CPU hotplug notifier, hotplug lock held */
void rapl_remove_package_cpuslocked(struct rapl_package *rp)
{}
EXPORT_SYMBOL_GPL();

void rapl_remove_package(struct rapl_package *rp)
{}
EXPORT_SYMBOL_GPL();

/*
 * RAPL Package energy counter scope:
 * 1. AMD/HYGON platforms use per-PKG package energy counter
 * 2. For Intel platforms
 *	2.1 CLX-AP platform has per-DIE package energy counter
 *	2.2 Other platforms that use MSR RAPL are single die systems so the
 *          package energy counter can be considered as per-PKG/per-DIE,
 *          here it is considered as per-DIE.
 *	2.3 New platforms that use TPMI RAPL don't care about the
 *	    scope because they are not MSR/CPU based.
 */
#define rapl_msrs_are_pkg_scope()

/* caller to ensure CPU hotplug lock is held */
struct rapl_package *rapl_find_package_domain_cpuslocked(int id, struct rapl_if_priv *priv,
							 bool id_is_cpu)
{}
EXPORT_SYMBOL_GPL();

struct rapl_package *rapl_find_package_domain(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{}
EXPORT_SYMBOL_GPL();

/* called from CPU hotplug notifier, hotplug lock held */
struct rapl_package *rapl_add_package_cpuslocked(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{}
EXPORT_SYMBOL_GPL();

struct rapl_package *rapl_add_package(int id, struct rapl_if_priv *priv, bool id_is_cpu)
{}
EXPORT_SYMBOL_GPL();

static void power_limit_state_save(void)
{}

static void power_limit_state_restore(void)
{}

static int rapl_pm_callback(struct notifier_block *nb,
			    unsigned long mode, void *_unused)
{}

static struct notifier_block rapl_pm_notifier =;

static struct platform_device *rapl_msr_platdev;

static int __init rapl_init(void)
{}

static void __exit rapl_exit(void)
{}

fs_initcall(rapl_init);
module_exit(rapl_exit);

MODULE_DESCRIPTION();
MODULE_AUTHOR();
MODULE_LICENSE();