linux/drivers/idle/intel_idle.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_idle.c - native hardware idle loop for modern Intel processors
 *
 * Copyright (c) 2013 - 2020, Intel Corporation.
 * Len Brown <[email protected]>
 * Rafael J. Wysocki <[email protected]>
 */

/*
 * intel_idle is a cpuidle driver that loads on all Intel CPUs with MWAIT
 * in lieu of the legacy ACPI processor_idle driver.  The intent is to
 * make Linux more efficient on these processors, as intel_idle knows
 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
 */

/*
 * Design Assumptions
 *
 * All CPUs have the same idle states as the boot CPU
 *
 * Chipset BM_STS (bus master status) bit is a NOP
 *	for preventing entry into deep C-states
 *
 * CPU will flush caches as needed when entering a C-state via MWAIT
 *	(in contrast to entering ACPI C3, in which case the WBINVD
 *	instruction needs to be executed to flush the caches)
 */

/*
 * Known limitations
 *
 * ACPI has a .suspend hack to turn off deep C-states during suspend
 * to avoid complications with the lapic timer workaround.
 * Have not seen issues with suspend, but may need same workaround here.
 *
 */

/* un-comment DEBUG to enable pr_debug() statements */
/* #define DEBUG */

#define pr_fmt(fmt)

#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/tick.h>
#include <trace/events/power.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/moduleparam.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/fpu/api.h>

#define INTEL_IDLE_VERSION

static struct cpuidle_driver intel_idle_driver =;
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate =;
static unsigned int disabled_states_mask __read_mostly;
static unsigned int preferred_states_mask __read_mostly;
static bool force_irq_on __read_mostly;
static bool ibrs_off __read_mostly;

static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;

static unsigned long auto_demotion_disable_flags;

static enum {} c1e_promotion =;

struct idle_cpu {};

static const struct idle_cpu *icpu __initdata;
static struct cpuidle_state *cpuidle_state_table __initdata;

static unsigned int mwait_substates __initdata;

/*
 * Enable interrupts before entering the C-state. On some platforms and for
 * some C-states, this may measurably decrease interrupt latency.
 */
#define CPUIDLE_FLAG_IRQ_ENABLE

/*
 * Enable this state by default even if the ACPI _CST does not list it.
 */
#define CPUIDLE_FLAG_ALWAYS_ENABLE

/*
 * Disable IBRS across idle (when KERNEL_IBRS is enabled). Mutually exclusive
 * with CPUIDLE_FLAG_IRQ_ENABLE above.
 */
#define CPUIDLE_FLAG_IBRS

/*
 * Initialize large xstate for the C6-state entrance.
 */
#define CPUIDLE_FLAG_INIT_XSTATE

/*
 * Ignore the sub-state when matching mwait hints between the ACPI _CST and
 * custom tables.
 */
#define CPUIDLE_FLAG_PARTIAL_HINT_MATCH

/*
 * MWAIT takes an 8-bit "hint" in EAX "suggesting"
 * the C-state (top nibble) and sub-state (bottom nibble)
 * 0x00 means "MWAIT(C1)", 0x10 means "MWAIT(C2)" etc.
 *
 * We store the hint at the top of our "flags" for each state.
 */
#define flg2MWAIT(flags)
#define MWAIT2flg(eax)

static __always_inline int __intel_idle(struct cpuidle_device *dev,
					struct cpuidle_driver *drv,
					int index, bool irqoff)
{}

/**
 * intel_idle - Ask the processor to enter the given idle state.
 * @dev: cpuidle device of the target CPU.
 * @drv: cpuidle driver (assumed to point to intel_idle_driver).
 * @index: Target idle state index.
 *
 * Use the MWAIT instruction to notify the processor that the CPU represented by
 * @dev is idle and it can try to enter the idle state corresponding to @index.
 *
 * If the local APIC timer is not known to be reliable in the target idle state,
 * enable one-shot tick broadcasting for the target CPU before executing MWAIT.
 *
 * Must be called under local_irq_disable().
 */
static __cpuidle int intel_idle(struct cpuidle_device *dev,
				struct cpuidle_driver *drv, int index)
{}

static __cpuidle int intel_idle_irq(struct cpuidle_device *dev,
				    struct cpuidle_driver *drv, int index)
{}

static __cpuidle int intel_idle_ibrs(struct cpuidle_device *dev,
				     struct cpuidle_driver *drv, int index)
{}

static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
				       struct cpuidle_driver *drv, int index)
{}

/**
 * intel_idle_s2idle - Ask the processor to enter the given idle state.
 * @dev: cpuidle device of the target CPU.
 * @drv: cpuidle driver (assumed to point to intel_idle_driver).
 * @index: Target idle state index.
 *
 * Use the MWAIT instruction to notify the processor that the CPU represented by
 * @dev is idle and it can try to enter the idle state corresponding to @index.
 *
 * Invoked as a suspend-to-idle callback routine with frozen user space, frozen
 * scheduler tick and suspended scheduler clock on the target CPU.
 */
static __cpuidle int intel_idle_s2idle(struct cpuidle_device *dev,
				       struct cpuidle_driver *drv, int index)
{}

/*
 * States are indexed by the cstate number,
 * which is also the index into the MWAIT hint array.
 * Thus C0 is a dummy.
 */
static struct cpuidle_state nehalem_cstates[] __initdata =;

static struct cpuidle_state snb_cstates[] __initdata =;

static struct cpuidle_state byt_cstates[] __initdata =;

static struct cpuidle_state cht_cstates[] __initdata =;

static struct cpuidle_state ivb_cstates[] __initdata =;

static struct cpuidle_state ivt_cstates[] __initdata =;

static struct cpuidle_state ivt_cstates_4s[] __initdata =;

static struct cpuidle_state ivt_cstates_8s[] __initdata =;

static struct cpuidle_state hsw_cstates[] __initdata =;
static struct cpuidle_state bdw_cstates[] __initdata =;

static struct cpuidle_state skl_cstates[] __initdata =;

static struct cpuidle_state skx_cstates[] __initdata =;

static struct cpuidle_state icx_cstates[] __initdata =;

/*
 * On AlderLake C1 has to be disabled if C1E is enabled, and vice versa.
 * C1E is enabled only if "C1E promotion" bit is set in MSR_IA32_POWER_CTL.
 * But in this case there is effectively no C1, because C1 requests are
 * promoted to C1E. If the "C1E promotion" bit is cleared, then both C1
 * and C1E requests end up with C1, so there is effectively no C1E.
 *
 * By default we enable C1E and disable C1 by marking it with
 * 'CPUIDLE_FLAG_UNUSABLE'.
 */
static struct cpuidle_state adl_cstates[] __initdata =;

static struct cpuidle_state adl_l_cstates[] __initdata =;

static struct cpuidle_state mtl_l_cstates[] __initdata =;

static struct cpuidle_state gmt_cstates[] __initdata =;

static struct cpuidle_state spr_cstates[] __initdata =;

static struct cpuidle_state gnr_cstates[] __initdata =;

static struct cpuidle_state atom_cstates[] __initdata =;
static struct cpuidle_state tangier_cstates[] __initdata =;
static struct cpuidle_state avn_cstates[] __initdata =;
static struct cpuidle_state knl_cstates[] __initdata =;

static struct cpuidle_state bxt_cstates[] __initdata =;

static struct cpuidle_state dnv_cstates[] __initdata =;

/*
 * Note, depending on HW and FW revision, SnowRidge SoC may or may not support
 * C6, and this is indicated in the CPUID mwait leaf.
 */
static struct cpuidle_state snr_cstates[] __initdata =;

static struct cpuidle_state grr_cstates[] __initdata =;

static struct cpuidle_state srf_cstates[] __initdata =;

static const struct idle_cpu idle_cpu_nehalem __initconst =;

static const struct idle_cpu idle_cpu_nhx __initconst =;

static const struct idle_cpu idle_cpu_atom __initconst =;

static const struct idle_cpu idle_cpu_tangier __initconst =;

static const struct idle_cpu idle_cpu_lincroft __initconst =;

static const struct idle_cpu idle_cpu_snb __initconst =;

static const struct idle_cpu idle_cpu_snx __initconst =;

static const struct idle_cpu idle_cpu_byt __initconst =;

static const struct idle_cpu idle_cpu_cht __initconst =;

static const struct idle_cpu idle_cpu_ivb __initconst =;

static const struct idle_cpu idle_cpu_ivt __initconst =;

static const struct idle_cpu idle_cpu_hsw __initconst =;

static const struct idle_cpu idle_cpu_hsx __initconst =;

static const struct idle_cpu idle_cpu_bdw __initconst =;

static const struct idle_cpu idle_cpu_bdx __initconst =;

static const struct idle_cpu idle_cpu_skl __initconst =;

static const struct idle_cpu idle_cpu_skx __initconst =;

static const struct idle_cpu idle_cpu_icx __initconst =;

static const struct idle_cpu idle_cpu_adl __initconst =;

static const struct idle_cpu idle_cpu_adl_l __initconst =;

static const struct idle_cpu idle_cpu_mtl_l __initconst =;

static const struct idle_cpu idle_cpu_gmt __initconst =;

static const struct idle_cpu idle_cpu_spr __initconst =;

static const struct idle_cpu idle_cpu_gnr __initconst =;

static const struct idle_cpu idle_cpu_avn __initconst =;

static const struct idle_cpu idle_cpu_knl __initconst =;

static const struct idle_cpu idle_cpu_bxt __initconst =;

static const struct idle_cpu idle_cpu_dnv __initconst =;

static const struct idle_cpu idle_cpu_tmt __initconst =;

static const struct idle_cpu idle_cpu_snr __initconst =;

static const struct idle_cpu idle_cpu_grr __initconst =;

static const struct idle_cpu idle_cpu_srf __initconst =;

static const struct x86_cpu_id intel_idle_ids[] __initconst =;

static const struct x86_cpu_id intel_mwait_ids[] __initconst =;

static bool __init intel_idle_max_cstate_reached(int cstate)
{}

static bool __init intel_idle_state_needs_timer_stop(struct cpuidle_state *state)
{}

#ifdef CONFIG_ACPI_PROCESSOR_CSTATE
#include <acpi/processor.h>

static bool no_acpi __read_mostly;
module_param(no_acpi, bool, 0444);
MODULE_PARM_DESC();

static bool force_use_acpi __read_mostly; /* No effect if no_acpi is set. */
module_param_named(use_acpi, force_use_acpi, bool, 0444);
MODULE_PARM_DESC();

static struct acpi_processor_power acpi_state_table __initdata;

/**
 * intel_idle_cst_usable - Check if the _CST information can be used.
 *
 * Check if all of the C-states listed by _CST in the max_cstate range are
 * ACPI_CSTATE_FFH, which means that they should be entered via MWAIT.
 */
static bool __init intel_idle_cst_usable(void)
{}

static bool __init intel_idle_acpi_cst_extract(void)
{}

static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv)
{}

static bool __init intel_idle_off_by_default(unsigned int flags, u32 mwait_hint)
{}
#else /* !CONFIG_ACPI_PROCESSOR_CSTATE */
#define force_use_acpi

/*
 * Stub for builds without CONFIG_ACPI_PROCESSOR_CSTATE: unconditionally
 * reports false.
 */
static inline bool intel_idle_acpi_cst_extract(void)
{
	return false;
}
/*
 * Stub for builds without CONFIG_ACPI_PROCESSOR_CSTATE: does nothing and
 * never touches @drv.
 */
static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv)
{
}
/*
 * Stub for builds without CONFIG_ACPI_PROCESSOR_CSTATE: ignores both @flags
 * and @mwait_hint and unconditionally reports false.
 */
static inline bool intel_idle_off_by_default(unsigned int flags, u32 mwait_hint)
{
	return false;
}
#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */

/**
 * ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
 *
 * Tune IVT multi-socket targets.
 * Assumption: num_sockets == (max_package_num + 1).
 */
static void __init ivt_idle_state_table_update(void)
{}

/**
 * irtl_2_usec - IRTL to microseconds conversion.
 * @irtl: IRTL MSR value.
 *
 * Translate the IRTL (Interrupt Response Time Limit) MSR value to microseconds.
 */
static unsigned long long __init irtl_2_usec(unsigned long long irtl)
{}

/**
 * bxt_idle_state_table_update - Fix up the Broxton idle states table.
 *
 * On BXT, trust the IRTL (Interrupt Response Time Limit) MSR to show the
 * definitive maximum latency and use the same value for target_residency.
 */
static void __init bxt_idle_state_table_update(void)
{}

/**
 * sklh_idle_state_table_update - Fix up the Sky Lake idle states table.
 *
 * On SKL-H (model 0x5e) skip C8 and C9 if C10 is enabled and SGX disabled.
 */
static void __init sklh_idle_state_table_update(void)
{}

/**
 * skx_idle_state_table_update - Adjust the Sky Lake/Cascade Lake
 * idle states table.
 */
static void __init skx_idle_state_table_update(void)
{}

/**
 * adl_idle_state_table_update - Adjust AlderLake idle states table.
 */
static void __init adl_idle_state_table_update(void)
{}

/**
 * spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
 */
static void __init spr_idle_state_table_update(void)
{}

static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
{}

static void state_update_enter_method(struct cpuidle_state *state, int cstate)
{}

static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
{}

/**
 * intel_idle_cpuidle_driver_init - Create the list of available idle states.
 * @drv: cpuidle driver structure to initialize.
 */
static void __init intel_idle_cpuidle_driver_init(struct cpuidle_driver *drv)
{}

static void auto_demotion_disable(void)
{}

static void c1e_promotion_enable(void)
{}

static void c1e_promotion_disable(void)
{}

/**
 * intel_idle_cpu_init - Register the target CPU with the cpuidle core.
 * @cpu: CPU to initialize.
 *
 * Register a cpuidle device object for @cpu and update its MSRs in accordance
 * with the processor model flags.
 */
static int intel_idle_cpu_init(unsigned int cpu)
{}

static int intel_idle_cpu_online(unsigned int cpu)
{}

/**
 * intel_idle_cpuidle_devices_uninit - Unregister all cpuidle devices.
 */
static void __init intel_idle_cpuidle_devices_uninit(void)
{}

static int __init intel_idle_init(void)
{}
device_initcall(intel_idle_init);

/*
 * We are not really modular, but we used to support that.  Meaning we also
 * support "intel_idle.max_cstate=..." at boot and also a read-only export of
 * it at /sys/module/intel_idle/parameters/max_cstate -- so using module_param
 * is the easiest way (currently) to continue doing that.
 */
module_param(max_cstate, int, 0444);
/*
 * The positions of the bits that are set in this number are the indices of the
 * idle states to be disabled by default (as reflected by the names of the
 * corresponding idle state directories in sysfs, "state0", "state1" ...
 * "state<i>" ..., where <i> is the index of the given state).
 */
module_param_named(states_off, disabled_states_mask, uint, 0444);
MODULE_PARM_DESC();
/*
 * Some platforms come with mutually exclusive C-states, so that if one is
 * enabled, the other C-states must not be used. Example: C1 and C1E on
 * Sapphire Rapids platform. This parameter allows for selecting the
 * preferred C-states among the groups of mutually exclusive C-states - the
 * selected C-states will be registered, the other C-states from the mutually
 * exclusive group won't be registered. If the platform has no mutually
 * exclusive C-states, this parameter has no effect.
 */
module_param_named(preferred_cstates, preferred_states_mask, uint, 0444);
MODULE_PARM_DESC();
/*
 * Debugging option that forces the driver to enter all C-states with
 * interrupts enabled. Does not apply to C-states with
 * 'CPUIDLE_FLAG_INIT_XSTATE' and 'CPUIDLE_FLAG_IBRS' flags.
 */
module_param(force_irq_on, bool, 0444);
/*
 * Force the disabling of IBRS when X86_FEATURE_KERNEL_IBRS is on and
 * CPUIDLE_FLAG_IRQ_ENABLE isn't set.
 */
module_param(ibrs_off, bool, 0444);
MODULE_PARM_DESC();