// SPDX-License-Identifier: GPL-2.0 /* * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) * * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <[email protected]> * * Interactivity improvements by Mike Galbraith * (C) 2007 Mike Galbraith <[email protected]> * * Various enhancements by Dmitry Adamushko. * (C) 2007 Dmitry Adamushko <[email protected]> * * Group scheduling enhancements by Srivatsa Vaddagiri * Copyright IBM Corporation, 2007 * Author: Srivatsa Vaddagiri <[email protected]> * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner <[email protected]> * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include <linux/energy_model.h> #include <linux/mmap_lock.h> #include <linux/hugetlb_inline.h> #include <linux/jiffies.h> #include <linux/mm_api.h> #include <linux/highmem.h> #include <linux/spinlock_api.h> #include <linux/cpumask_api.h> #include <linux/lockdep_api.h> #include <linux/softirq.h> #include <linux/refcount_api.h> #include <linux/topology.h> #include <linux/sched/clock.h> #include <linux/sched/cond_resched.h> #include <linux/sched/cputime.h> #include <linux/sched/isolation.h> #include <linux/sched/nohz.h> #include <linux/cpuidle.h> #include <linux/interrupt.h> #include <linux/memory-tiers.h> #include <linux/mempolicy.h> #include <linux/mutex_api.h> #include <linux/profile.h> #include <linux/psi.h> #include <linux/ratelimit.h> #include <linux/task_work.h> #include <linux/rbtree_augmented.h> #include <asm/switch_to.h> #include "sched.h" #include "stats.h" #include "autogroup.h" /* * The initial- and re-scaling of tunables is configurable * * Options are: * * SCHED_TUNABLESCALING_NONE - unscaled, always *1 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) */ unsigned int sysctl_sched_tunable_scaling = …; /* * Minimal preemption granularity for CPU-bound tasks: * * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) */ unsigned int sysctl_sched_base_slice = …; static unsigned int normalized_sysctl_sched_base_slice = …; const_debug unsigned int sysctl_sched_migration_cost = …; static int __init setup_sched_thermal_decay_shift(char *str) { … } __setup(…); #ifdef CONFIG_SMP /* * For asym packing, by default the lower numbered CPU has higher priority. */ int __weak arch_asym_cpu_priority(int cpu) { … } /* * The margin used when comparing utilization with CPU capacity. * * (default: ~20%) */ #define fits_capacity(cap, max) … /* * The margin used when comparing CPU capacities. * is 'cap1' noticeably greater than 'cap2' * * (default: ~5%) */ #define capacity_greater(cap1, cap2) … #endif #ifdef CONFIG_CFS_BANDWIDTH /* * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool * each time a cfs_rq requests quota. * * Note: in the case that the slice exceeds the runtime remaining (either due * to consumption or the quota being specified to be smaller than the slice) * we will always only issue the remaining available time. * * (default: 5 msec, units: microseconds) */ static unsigned int sysctl_sched_cfs_bandwidth_slice = …; #endif #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ static unsigned int sysctl_numa_balancing_promote_rate_limit = …; #endif #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = …; static int __init sched_fair_sysctl_init(void) { … } late_initcall(sched_fair_sysctl_init); #endif static inline void update_load_add(struct load_weight *lw, unsigned long inc) { … } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { … } static inline void update_load_set(struct load_weight *lw, unsigned long w) { … } /* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible * to users decreases. But the relationship is not linear, * so pick a second-best guess by going with the log2 of the * number of CPUs. * * This idea comes from the SD scheduler of Con Kolivas: */ static unsigned int get_update_sysctl_factor(void) { … } static void update_sysctl(void) { … } void __init sched_init_granularity(void) { … } #define WMULT_CONST … #define WMULT_SHIFT … static void __update_inv_weight(struct load_weight *lw) { … } /* * delta_exec * weight / lw.weight * OR * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT * * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case * we're guaranteed shift stays positive because inv_weight is guaranteed to * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22. * * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus * weight/lw.weight <= 1, and therefore our shift will also be positive. */ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) { … } /* * delta /= w */ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se) { … } const struct sched_class fair_sched_class; /************************************************************** * CFS operations on generic schedulable entities: */ #ifdef CONFIG_FAIR_GROUP_SCHED /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) … static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { … } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { … } static inline void assert_list_leaf_cfs_rq(struct rq *rq) { … } /* Iterate through all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) … /* Do the two (enqueued) entities belong to the same group ? */ static inline struct cfs_rq * is_same_group(struct sched_entity *se, struct sched_entity *pse) { … } static inline struct sched_entity *parent_entity(const struct sched_entity *se) { … } static void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { … } static int tg_is_idle(struct task_group *tg) { … } static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) { … } static int se_is_idle(struct sched_entity *se) { … } #else /* !CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity … static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { return true; } static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) { } static inline void assert_list_leaf_cfs_rq(struct rq *rq) { } #define for_each_leaf_cfs_rq_safe … static inline struct sched_entity *parent_entity(struct sched_entity *se) { return NULL; } static inline void find_matching_se(struct sched_entity **se, struct sched_entity **pse) { } static inline int tg_is_idle(struct task_group *tg) { return 0; } static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) { return 0; } static int se_is_idle(struct sched_entity *se) { return task_has_idle_policy(task_of(se)); } #endif /* CONFIG_FAIR_GROUP_SCHED */ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: */ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) { … } static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) { … } static inline bool entity_before(const struct sched_entity *a, const struct sched_entity *b) { … } static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } #define __node_2_se(node) … /* * Compute virtual time from the per-task service numbers: * * Fair schedulers conserve lag: * * \Sum lag_i = 0 * * Where lag_i is given by: * * lag_i = S - s_i = w_i * (V - v_i) * * Where S is the ideal service time and V is it's virtual time counterpart. * Therefore: * * \Sum lag_i = 0 * \Sum w_i * (V - v_i) = 0 * \Sum w_i * V - w_i * v_i = 0 * * From which we can solve an expression for V in v_i (which we have in * se->vruntime): * * \Sum v_i * w_i \Sum v_i * w_i * V = -------------- = -------------- * \Sum w_i W * * Specifically, this is the weighted average of all entity virtual runtimes. * * [[ NOTE: this is only equal to the ideal scheduler under the condition * that join/leave operations happen at lag_i = 0, otherwise the * virtual time has non-contiguous motion equivalent to: * * V +-= lag_i / W * * Also see the comment in place_entity() that deals with this. ]] * * However, since v_i is u64, and the multiplication could easily overflow * transform it into a relative form that uses smaller quantities: * * Substitute: v_i == (v_i - v0) + v0 * * \Sum ((v_i - v0) + v0) * w_i \Sum (v_i - v0) * w_i * V = ---------------------------- = --------------------- + v0 * W W * * Which we track using: * * v0 := cfs_rq->min_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * * Since min_vruntime is a monotonic increasing variable that closely tracks * the per-task service, these deltas: (v_i - v), will be in the order of the * maximal (virtual) lag induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * * As measured, the max (key * weight) value was ~44 bits for a kernel build. */ static void avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static void avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static inline void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) { … } /* * Specifically: avg_runtime() + 0 must result in entity_eligible() := true * For this to be so, the result of this function must have a left bias. */ u64 avg_vruntime(struct cfs_rq *cfs_rq) { … } /* * lag_i = S - s_i = w_i * (V - v_i) * * However, since V is approximated by the weighted average of all entities it * is possible -- by addition/removal/reweight to the tree -- to move V around * and end up with a larger lag than we started with. * * Limit this to either double the slice length with a minimum of TICK_NSEC * since that is the timing granularity. * * EEVDF gives the following limit for a steady state system: * * -r_max < lag < max(r_max, q) * * XXX could add max_slice to the augmented data to track this. */ static s64 entity_lag(u64 avruntime, struct sched_entity *se) { … } static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } /* * Entity is eligible once it received less service than it ought to have, * eg. lag >= 0. * * lag_i = S - s_i = w_i*(V - v_i) * * lag_i >= 0 -> V >= v_i * * \Sum (v_i - v)*w_i * V = ------------------ + v * \Sum w_i * * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) * * Note: using 'avg_vruntime() > se->vruntime' is inaccurate due * to the loss in precision caused by the division. */ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) { … } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) { … } static void update_min_vruntime(struct cfs_rq *cfs_rq) { … } static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) { … } static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) { … } #define vruntime_gt(field, lse, rse) … static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node *node) { … } static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) { … } /* * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) */ static inline bool min_vruntime_update(struct sched_entity *se, bool exit) { … } RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, run_node, min_vruntime, min_vruntime_update); /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) { … } struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { … } /* * Earliest Eligible Virtual Deadline First * * In order to provide latency guarantees for different request sizes * EEVDF selects the best runnable task from two criteria: * * 1) the task must be eligible (must be owed service) * * 2) from those tasks that meet 1), we select the one * with the earliest virtual deadline. * * We can do this in O(log n) time due to an augmented RB-tree. The * tree keeps the entries sorted on deadline, but also functions as a * heap based on the vruntime by keeping: * * se->min_vruntime = min(se->vruntime, se->{left,right}->min_vruntime) * * Which allows tree pruning through eligibility. */ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) { … } #ifdef CONFIG_SCHED_DEBUG struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { … } /************************************************************** * Scheduling class statistics methods: */ #ifdef CONFIG_SMP int sched_update_scaling(void) { … } #endif #endif static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); /* * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i * this is probably good enough. */ static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } #include "pelt.h" #ifdef CONFIG_SMP static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); static unsigned long capacity_of(int cpu); /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { … } /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: * * util_avg = cfs_rq->avg.util_avg / (cfs_rq->avg.load_avg + 1) * * se_weight(se) * * However, in many cases, the above util_avg does not give a desired * value. Moreover, the sum of the util_avgs may be divergent, such * as when the series is a harmonic series. * * To solve this problem, we also cap the util_avg of successive tasks to * only 1/2 of the left utilization budget: * * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n * * where n denotes the nth task and cpu_scale the CPU capacity. * * For example, for a CPU with 1024 of capacity, a simplest series from * the beginning would be like: * * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... * * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap) * if util_avg > util_avg_cap. */ void post_init_entity_util_avg(struct task_struct *p) { … } #else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct task_struct *p) { } static void update_tg_load_avg(struct cfs_rq *cfs_rq) { } #endif /* CONFIG_SMP */ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) { … } static inline void update_curr_task(struct task_struct *p, s64 delta_exec) { … } static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) { … } static inline bool do_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *pse, struct sched_entity *se) { … } /* * Used by other classes to account runtime. */ s64 update_curr_common(struct rq *rq) { … } /* * Update the current task's runtime statistics. */ static void update_curr(struct cfs_rq *cfs_rq) { … } static void update_curr_fair(struct rq *rq) { … } static inline void update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static inline void update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static inline void update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } /* * Task is being enqueued - update stats: */ static inline void update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { … } static inline void update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { … } /* * We are picking a new current task - update its stats: */ static inline void update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } /************************************************** * Scheduling class queueing methods: */ static inline bool is_core_idle(int cpu) { … } #ifdef CONFIG_NUMA #define NUMA_IMBALANCE_MIN … static inline long adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) { … } #endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA_BALANCING /* * Approximate time to scan a full NUMA task in ms. The task scan period is * calculated based on the tasks virtual memory size and * numa_balancing_scan_size. */ unsigned int sysctl_numa_balancing_scan_period_min = …; unsigned int sysctl_numa_balancing_scan_period_max = …; /* Portion of address space to scan in MB */ unsigned int sysctl_numa_balancing_scan_size = …; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = …; /* The page with hint page fault latency < threshold in ms is considered hot */ unsigned int sysctl_numa_balancing_hot_threshold = …; struct numa_group { … }; /* * For functions that can be called in multiple contexts that permit reading * ->numa_group (see struct task_struct for locking rules). */ static struct numa_group *deref_task_numa_group(struct task_struct *p) { … } static struct numa_group *deref_curr_numa_group(struct task_struct *p) { … } static inline unsigned long group_faults_priv(struct numa_group *ng); static inline unsigned long group_faults_shared(struct numa_group *ng); static unsigned int task_nr_scan_windows(struct task_struct *p) { … } /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */ #define MAX_SCAN_WINDOW … static unsigned int task_scan_min(struct task_struct *p) { … } static unsigned int task_scan_start(struct task_struct *p) { … } static unsigned int task_scan_max(struct task_struct *p) { … } static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { … } static void account_numa_dequeue(struct rq *rq, struct task_struct *p) { … } /* Shared or private faults. */ #define NR_NUMA_HINT_FAULT_TYPES … /* Memory and CPU locality */ #define NR_NUMA_HINT_FAULT_STATS … /* Averaged statistics, and temporary buffers. */ #define NR_NUMA_HINT_FAULT_BUCKETS … pid_t task_numa_group_id(struct task_struct *p) { … } /* * The averaged statistics, shared & private, memory & CPU, * occupy the first half of the array. The second half of the * array is for current counters, which are averaged into the * first set by task_numa_placement. */ static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv) { … } static inline unsigned long task_faults(struct task_struct *p, int nid) { … } static inline unsigned long group_faults(struct task_struct *p, int nid) { … } static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) { … } static inline unsigned long group_faults_priv(struct numa_group *ng) { … } static inline unsigned long group_faults_shared(struct numa_group *ng) { … } /* * A node triggering more than 1/3 as many NUMA faults as the maximum is * considered part of a numa group's pseudo-interleaving set. Migrations * between these nodes are slowed down, to allow things to settle down. */ #define ACTIVE_NODE_FRACTION … static bool numa_is_active_node(int nid, struct numa_group *ng) { … } /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int lim_dist, bool task) { … } /* * These return the fraction of accesses done by a particular task, or * task group, on a particular numa node. The group weight is given a * larger multiplier, in order to group tasks together that are almost * evenly spread out between numa nodes. */ static inline unsigned long task_weight(struct task_struct *p, int nid, int dist) { … } static inline unsigned long group_weight(struct task_struct *p, int nid, int dist) { … } /* * If memory tiering mode is enabled, cpupid of slow memory page is * used to record scan time instead of CPU and PID. When tiering mode * is disabled at run time, the scan time (in cpupid) will be * interpreted as CPU and PID. So CPU needs to be checked to avoid to * access out of array bound. */ static inline bool cpupid_valid(int cpupid) { … } /* * For memory tiering mode, if there are enough free pages (more than * enough watermark defined here) in fast memory node, to take full * advantage of fast memory capacity, all recently accessed slow * memory pages will be migrated to fast memory node without * considering hot threshold. */ static bool pgdat_free_space_enough(struct pglist_data *pgdat) { … } /* * For memory tiering mode, when page tables are scanned, the scan * time will be recorded in struct page in addition to make page * PROT_NONE for slow memory page. So when the page is accessed, in * hint page fault handler, the hint page fault latency is calculated * via, * * hint page fault latency = hint page fault time - scan time * * The smaller the hint page fault latency, the higher the possibility * for the page to be hot. */ static int numa_hint_fault_latency(struct folio *folio) { … } /* * For memory tiering mode, too high promotion/demotion throughput may * hurt application latency. So we provide a mechanism to rate limit * the number of pages that are tried to be promoted. */ static bool numa_promotion_rate_limit(struct pglist_data *pgdat, unsigned long rate_limit, int nr) { … } #define NUMA_MIGRATION_ADJUST_STEPS … static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, unsigned long rate_limit, unsigned int ref_th) { … } bool should_numa_migrate_memory(struct task_struct *p, struct folio *folio, int src_nid, int dst_cpu) { … } /* * 'numa_type' describes the node at the moment of load balancing. */ enum numa_type { … }; /* Cached statistics for all CPUs within a node */ struct numa_stats { … }; struct task_numa_env { … }; static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static inline enum numa_type numa_classify(unsigned int imbalance_pct, struct numa_stats *ns) { … } #ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { … } #else static inline int numa_idle_core(int idle_core, int cpu) { return idle_core; } #endif /* * Gather all necessary information to make NUMA balancing placement * decisions that are compatible with standard load balancer. This * borrows code and logic from update_sg_lb_stats but sharing a * common implementation is impractical. */ static void update_numa_stats(struct task_numa_env *env, struct numa_stats *ns, int nid, bool find_idle) { … } static void task_numa_assign(struct task_numa_env *env, struct task_struct *p, long imp) { … } static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { … } /* * Maximum NUMA importance can be 1998 (2*999); * SMALLIMP @ 30 would be close to 1998/64. * Used to deter task migration. */ #define SMALLIMP … /* * This checks if the overall compute and NUMA accesses of the system would * be improved if the source tasks was migrated to the target dst_cpu taking * into account that it might be best if task running on the dst_cpu should * be exchanged with the source task */ static bool task_numa_compare(struct task_numa_env *env, long taskimp, long groupimp, bool maymove) { … } static void task_numa_find_cpu(struct task_numa_env *env, long taskimp, long groupimp) { … } static int task_numa_migrate(struct task_struct *p) { … } /* Attempt to migrate a task to a CPU on the preferred node. */ static void numa_migrate_preferred(struct task_struct *p) { … } /* * Find out how many nodes the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. */ static void numa_group_count_active_nodes(struct numa_group *numa_group) { … } /* * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS * increments. The more local the fault statistics are, the higher the scan * period will be for the next scan window. If local/(local+remote) ratio is * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) * the scan period will decrease. Aim for 70% local accesses. */ #define NUMA_PERIOD_SLOTS … #define NUMA_PERIOD_THRESHOLD … /* * Increase the scan period (slow down scanning) if the majority of * our memory is already on our local node, or if the majority of * the page accesses are shared with other processes. * Otherwise, decrease the scan period. */ static void update_task_scan_period(struct task_struct *p, unsigned long shared, unsigned long private) { … } /* * Get the fraction of time the task has been running since the last * NUMA placement cycle. The scheduler keeps similar statistics, but * decays those on a 32ms period, which is orders of magnitude off * from the dozens-of-seconds NUMA balancing period. Use the scheduler * stats only if the task is so new there are no NUMA statistics yet. */ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) { … } /* * Determine the preferred nid for a task in a numa_group. This needs to * be done in a way that produces consistent results with group_weight, * otherwise workloads might not converge. */ static int preferred_group_nid(struct task_struct *p, int nid) { … } static void task_numa_placement(struct task_struct *p) { … } static inline int get_numa_group(struct numa_group *grp) { … } static inline void put_numa_group(struct numa_group *grp) { … } static void task_numa_group(struct task_struct *p, int cpupid, int flags, int *priv) { … } /* * Get rid of NUMA statistics associated with a task (either current or dead). * If @final is set, the task is dead and has reached refcount zero, so we can * safely free all relevant data structures. Otherwise, there might be * concurrent reads from places like load balancing and procfs, and we should * reset the data back to default state without freeing ->numa_faults. */ void task_numa_free(struct task_struct *p, bool final) { … } /* * Got a PROT_NONE fault for a page on @node. */ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) { … } static void reset_ptenuma_scan(struct task_struct *p) { … } static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) { … } #define VMA_PID_RESET_PERIOD … /* * The expensive part of numa migration is done from task_work context. * Triggered from task_tick_numa(). */ static void task_numa_work(struct callback_head *work) { … } void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) { … } /* * Drive the periodic memory faults.. */ static void task_tick_numa(struct rq *rq, struct task_struct *curr) { … } static void update_scan_period(struct task_struct *p, int new_cpu) { … } #else static void task_tick_numa(struct rq *rq, struct task_struct *curr) { } static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) { } static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p) { } static inline void update_scan_period(struct task_struct *p, int new_cpu) { } #endif /* CONFIG_NUMA_BALANCING */ static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } /* * Signed add and clamp on underflow. * * Explicitly do a load-store to ensure the intermediate value never hits * memory. This allows lockless observations without ever seeing the negative * values. */ #define add_positive(_ptr, _val) … /* * Unsigned subtract and clamp on underflow. * * Explicitly do a load-store to ensure the intermediate value never hits * memory. This allows lockless observations without ever seeing the negative * values. */ #define sub_positive(_ptr, _val) … /* * Remove and clamp on negative, from a local variable. * * A variant of sub_positive(), which does not use explicit load-store * and is thus optimized for local variable updates. */ #define lsub_positive(_ptr, _val) … #ifdef CONFIG_SMP static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } #else static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } #endif static void reweight_eevdf(struct sched_entity *se, u64 avruntime, unsigned long weight) { … } static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { … } static void reweight_task_fair(struct rq *rq, struct task_struct *p, const struct load_weight *lw) { … } static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* * All this does is approximate the hierarchical proportion which includes that * global sum we all love to hate. * * That is, the weight of a group entity, is the proportional share of the * group weight based on the group runqueue weights. That is: * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (1) * \Sum grq->load.weight * * Now, because computing that sum is prohibitively expensive to compute (been * there, done that) we approximate it with this average stuff. The average * moves slower and therefore the approximation is cheaper and more stable. * * So instead of the above, we substitute: * * grq->load.weight -> grq->avg.load_avg (2) * * which yields the following: * * tg->weight * grq->avg.load_avg * ge->load.weight = ------------------------------ (3) * tg->load_avg * * Where: tg->load_avg ~= \Sum grq->avg.load_avg * * That is shares_avg, and it is right (given the approximation (2)). * * The problem with it is that because the average is slow -- it was designed * to be exactly that of course -- this leads to transients in boundary * conditions. In specific, the case where the group was idle and we start the * one task. It takes time for our CPU's grq->avg.load_avg to build up, * yielding bad latency etc.. * * Now, in that special case (1) reduces to: * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- = tg->weight (4) * grp->load.weight * * That is, the sum collapses because all other CPUs are idle; the UP scenario. * * So what we do is modify our approximation (3) to approach (4) in the (near) * UP case, like: * * ge->load.weight = * * tg->weight * grq->load.weight * --------------------------------------------------- (5) * tg->load_avg - grq->avg.load_avg + grq->load.weight * * But because grq->load.weight can drop to 0, resulting in a divide by zero, * we need to use grq->avg.load_avg as its lower bound, which then gives: * * * tg->weight * grq->load.weight * ge->load.weight = ----------------------------- (6) * tg_load_avg' * * Where: * * tg_load_avg' = tg->load_avg - grq->avg.load_avg + * max(grq->load.weight, grq->avg.load_avg) * * And that is shares_weight and is icky. In the (near) UP case it approaches * (4) while in the normal case it approaches (3). It consistently * overestimates the ge->load.weight and therefore: * * \Sum ge->load.weight >= tg->weight * * hence icky! */ static long calc_group_shares(struct cfs_rq *cfs_rq) { … } #endif /* CONFIG_SMP */ /* * Recomputes the group entity based on the current state of its group * runqueue. */ static void update_cfs_group(struct sched_entity *se) { … } #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_cfs_group(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) { … } #ifdef CONFIG_SMP static inline bool load_avg_is_decayed(struct sched_avg *sa) { … } static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { … } #ifdef CONFIG_FAIR_GROUP_SCHED /* * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list * immediately before a parent cfs_rq, and cfs_rqs are removed from the list * bottom-up, we only have to test whether the cfs_rq before us on the list * is our child. * If cfs_rq is not on the list, test whether a child needs its to be added to * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). */ static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) { … } static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { … } /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed * * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load. * However, because tg->load_avg is a global value there are performance * considerations. * * In order to avoid having to look at the other cfs_rq's, we use a * differential update where we store the last value we propagated. This in * turn allows skipping updates if the differential is 'small'. * * Updating tg's load_avg is necessary before update_cfs_share(). */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { … } static inline void clear_tg_load_avg(struct cfs_rq *cfs_rq) { … } /* CPU offline callback: */ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) { … } /* * Called within set_task_rq() right before setting a task's CPU. The * caller only guarantees p->pi_lock is held; no other assumptions, * including the state of rq->lock, should be made. */ void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { … } /* * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to * propagate its contribution. The key to this propagation is the invariant * that for each group: * * ge->avg == grq->avg (1) * * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial * and simply copies the running/runnable sum over (but still wrong, because * the group entity and group rq do not have their PELT windows aligned). * * However, update_tg_cfs_load() is more complex. So we have: * * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2) * * And since, like util, the runnable part should be directly transferable, * the following would _appear_ to be the straight forward approach: * * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) * * And per (1) we have: * * ge->avg.runnable_avg == grq->avg.runnable_avg * * Which gives: * * ge->load.weight * grq->avg.load_avg * ge->avg.load_avg = ----------------------------------- (4) * grq->load.weight * * Except that is wrong! * * Because while for entities historical weight is not important and we * really only care about our future and therefore can consider a pure * runnable sum, runqueues can NOT do this. * * We specifically want runqueues to have a load_avg that includes * historical weights. Those represent the blocked load, the load we expect * to (shortly) return to us. This only works by keeping the weights as * integral part of the sum. We therefore cannot decompose as per (3). * * Another reason this doesn't work is that runnable isn't a 0-sum entity. * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the * rq itself is runnable anywhere between 2/3 and 1 depending on how the * runnable section of these tasks overlap (or not). If they were to perfectly * align the rq as a whole would be runnable 2/3 of the time. If however we * always have at least 1 runnable task, the rq as a whole is always runnable. * * So we'll have to approximate.. :/ * * Given the constraint: * * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX * * We can construct a rule that adds runnable to a rq by assuming minimal * overlap. * * On removal, we'll assume each task is equally runnable; which yields: * * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight * * XXX: only do this for the part of runnable > running ? * */ static inline void update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { … } static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { … } static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { … } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) { … } /* Update task and its cfs_rq load average */ static inline int propagate_entity_load_avg(struct sched_entity *se) { … } /* * Check if we need to update the load and the utilization of a blocked * group_entity: */ static inline bool skip_blocked_update(struct sched_entity *se) { … } #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {} static inline void clear_tg_offline_cfs_rqs(struct rq *rq) {} static inline int propagate_entity_load_avg(struct sched_entity *se) { return 0; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_NO_HZ_COMMON static inline void migrate_se_pelt_lag(struct sched_entity *se) { … } #else static void migrate_se_pelt_lag(struct sched_entity *se) {} #endif /** * update_cfs_rq_load_avg - update the cfs_rq's load/util averages * @now: current time, as per cfs_rq_clock_pelt() * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * * Return: true if the load decayed or we removed load. * * Since both these conditions indicate a changed cfs_rq->avg.load we should * call update_tg_load_avg() when this function returns true. */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { … } /** * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } /** * detach_entity_load_avg - detach this entity from its cfs_rq load avg * @cfs_rq: cfs_rq to detach from * @se: sched_entity to detach * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } /* * Optional action to be done while updating the load average */ #define UPDATE_TG … #define SKIP_AGE_LOAD … #define DO_ATTACH … #define DO_DETACH … /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { … } /* * Synchronize entity load avg of dequeued entity without locking * the previous rq. */ static void sync_entity_load_avg(struct sched_entity *se) { … } /* * Task first catches up with cfs_rq, and then subtract * itself from the cfs_rq (task must be off the queue now). */ static void remove_entity_load_avg(struct sched_entity *se) { … } static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) { … } static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) { … } static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf); static inline unsigned long task_util(struct task_struct *p) { … } static inline unsigned long task_runnable(struct task_struct *p) { … } static inline unsigned long _task_util_est(struct task_struct *p) { … } static inline unsigned long task_util_est(struct task_struct *p) { … } static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { … } static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) { … } #define UTIL_EST_MARGIN … static inline void util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) { … } static inline unsigned long get_actual_cpu_capacity(int cpu) { … } static inline int util_fits_cpu(unsigned long util, unsigned long uclamp_min, unsigned long uclamp_max, int cpu) { … } static inline int task_fits_cpu(struct task_struct *p, int cpu) { … } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { … } #else /* CONFIG_SMP */ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { return !cfs_rq->nr_running; } #define UPDATE_TG … #define SKIP_AGE_LOAD … #define DO_ATTACH … #define DO_DETACH … static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { cfs_rq_util_change(cfs_rq, 0); } static inline void remove_entity_load_avg(struct sched_entity *se) {} static inline void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline int sched_balance_newidle(struct rq *rq, struct rq_flags *rf) { return 0; } static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { … } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); static void requeue_delayed_entity(struct sched_entity *se); static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { … } static void __clear_buddies_next(struct sched_entity *se) { … } static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static inline void finish_delayed_dequeue_entity(struct sched_entity *se) { … } static bool dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { … } static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { … } static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups * 2) pick the "next" process, since someone really wants that to run * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { … } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { … } static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { … } /************************************************** * CFS bandwidth control machinery */ #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_JUMP_LABEL static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { … } void cfs_bandwidth_usage_inc(void) { … } void cfs_bandwidth_usage_dec(void) { … } #else /* CONFIG_JUMP_LABEL */ static bool cfs_bandwidth_used(void) { return true; } void cfs_bandwidth_usage_inc(void) {} void cfs_bandwidth_usage_dec(void) {} #endif /* CONFIG_JUMP_LABEL */ /* * default period for cfs group bandwidth. * default: 0.1s, units: nanoseconds */ static inline u64 default_cfs_period(void) { … } static inline u64 sched_cfs_bandwidth_slice(void) { … } /* * Replenish runtime according to assigned quota. We use sched_clock_cpu * directly instead of rq->clock to avoid adding additional synchronization * around rq->lock. * * requires cfs_b->lock */ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) { … } static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) { … } /* returns 0 on failure to allocate runtime */ static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b, struct cfs_rq *cfs_rq, u64 target_runtime) { … } /* returns 0 on failure to allocate runtime */ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) { … } static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { … } static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) { … } static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { … } /* check whether cfs_rq, or any parent, is throttled */ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { … } /* * Ensure that neither of the group entities corresponding to src_cpu or * dest_cpu are members of a throttled hierarchy when performing group * load-balance operations. */ static inline int throttled_lb_pair(struct task_group *tg, int src_cpu, int dest_cpu) { … } static int tg_unthrottle_up(struct task_group *tg, void *data) { … } static int tg_throttle_down(struct task_group *tg, void *data) { … } static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { … } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { … } #ifdef CONFIG_SMP static void __cfsb_csd_unthrottle(void *arg) { … } static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { … } #else static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { unthrottle_cfs_rq(cfs_rq); } #endif static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) { … } static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { … } /* * Responsible for refilling a task_group's bandwidth and unthrottling its * cfs_rqs as appropriate. If there has been no activity within the last * period the timer is deactivated until scheduling resumes; cfs_b->idle is * used to track this state. */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { … } /* a cfs_rq won't donate quota below this amount */ static const u64 min_cfs_rq_runtime = …; /* minimum remaining period time to redistribute slack quota */ static const u64 min_bandwidth_expiration = …; /* how long we wait to gather additional slack before distributing */ static const u64 cfs_bandwidth_slack_period = …; /* * Are we near the end of the current quota period? * * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the * hrtimer base being cleared by hrtimer_start. In the case of * migrate_hrtimers, base is never cleared, so we are fine. */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) { … } static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) { … } /* we know any runtime found here is valid as update_curr() precedes return */ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { … } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) { … } /* * This is done with a timer (instead of inline with bandwidth return) since * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs. */ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { … } /* * When a group wakes up we want to make sure that its quota is not already * expired/exceeded, otherwise it may be allowed to steal additional ticks of * runtime as update_curr() throttling can not trigger until it's on-rq. */ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) { … } static void sync_throttle(struct task_group *tg, int cpu) { … } /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { … } static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { … } extern const u64 max_cfs_quota_period; static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { … } void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) { … } static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { … } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { … } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { … } /* * Both these CPU hotplug callbacks race against unregister_fair_sched_group() * * The race is harmless, since modifying bandwidth settings of unhooked group * bits doesn't do much. */ /* cpu online callback */ static void __maybe_unused update_runtime_enabled(struct rq *rq) { … } /* cpu offline callback */ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) { … } bool cfs_task_bw_constrained(struct task_struct *p) { … } #ifdef CONFIG_NO_HZ_FULL /* called from pick_next_task_fair() */ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) { int cpu = cpu_of(rq); if (!cfs_bandwidth_used()) return; if (!tick_nohz_full_cpu(cpu)) return; if (rq->nr_running != 1) return; /* * We know there is only one task runnable and we've just picked it. The * normal enqueue path will have cleared TICK_DEP_BIT_SCHED if we will * be otherwise able to stop the tick. Just need to check if we are using * bandwidth control. */ if (cfs_task_bw_constrained(p)) tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); } #endif #else /* CONFIG_CFS_BANDWIDTH */ static inline bool cfs_bandwidth_used(void) { return false; } static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { return 0; } static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) { return 0; } static inline int throttled_lb_pair(struct task_group *tg, int src_cpu, int dest_cpu) { return 0; } #ifdef CONFIG_FAIR_GROUP_SCHED void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b, struct cfs_bandwidth *parent) {} static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} #endif static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) { return NULL; } static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {} static inline void update_runtime_enabled(struct rq *rq) {} static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} #ifdef CONFIG_CGROUP_SCHED bool cfs_task_bw_constrained(struct task_struct *p) { return false; } #endif #endif /* CONFIG_CFS_BANDWIDTH */ #if !defined(CONFIG_CFS_BANDWIDTH) || !defined(CONFIG_NO_HZ_FULL) static inline void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) { … } #endif /************************************************** * CFS operations on tasks: */ #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { … } /* * called from enqueue/dequeue and updates the hrtick when the * current task is from our class and nr_running is low enough * to matter. */ static void hrtick_update(struct rq *rq) { … } #else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { } static inline void hrtick_update(struct rq *rq) { } #endif #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { … } /* * overutilized value make sense only if EAS is enabled */ static inline bool is_rd_overutilized(struct root_domain *rd) { … } static inline void set_rd_overutilized(struct root_domain *rd, bool flag) { … } static inline void check_update_overutilized_status(struct rq *rq) { … } #else static inline void check_update_overutilized_status(struct rq *rq) { } #endif /* Runqueue only has SCHED_IDLE tasks enqueued */ static int sched_idle_rq(struct rq *rq) { … } #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { … } #endif static void requeue_delayed_entity(struct sched_entity *se) { … } /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { … } static void set_next_buddy(struct sched_entity *se); /* * Basically dequeue_task_fair(), except it can deal with dequeue_entity() * failing half-way through and resume the dequeue later. * * Returns: * -1 - dequeue delayed * 0 - dequeue throttled * 1 - dequeue complete */ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) { … } /* * The dequeue_task method is called before nr_running is * decreased. We remove the task from the rbtree and * update the fair scheduling stats: */ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { … } #ifdef CONFIG_SMP /* Working cpumask for: sched_balance_rq(), sched_balance_newidle(). */ static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask); #ifdef CONFIG_NO_HZ_COMMON static struct { … } nohz ____cacheline_aligned; #endif /* CONFIG_NO_HZ_COMMON */ static unsigned long cpu_load(struct rq *rq) { … } /* * cpu_load_without - compute CPU load without any contributions from *p * @cpu: the CPU which load is requested * @p: the task which load should be discounted * * The load of a CPU is defined by the load of tasks currently enqueued on that * CPU as well as tasks which are currently sleeping after an execution on that * CPU. * * This method returns the load of the specified CPU by discounting the load of * the specified task, whenever the task is currently contributing to the CPU * load. */ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) { … } static unsigned long cpu_runnable(struct rq *rq) { … } static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p) { … } static unsigned long capacity_of(int cpu) { … } static void record_wakee(struct task_struct *p) { … } /* * Detect M:N waker/wakee relationships via a switching-frequency heuristic. * * A waker of many should wake a different task than the one last awakened * at a frequency roughly N times higher than one of its wakees. * * In order to determine whether we should let the load spread vs consolidating * to shared cache, we look for a minimum 'flip' frequency of llc_size in one * partner, and a factor of lls_size higher frequency in the other. * * With both conditions met, we can be relatively sure that the relationship is * non-monogamous, with partner count exceeding socket size. * * Waker/wakee being client/server, worker/dispatcher, interrupt source or * whatever is irrelevant, spread criteria is apparent partner count exceeds * socket size. */ static int wake_wide(struct task_struct *p) { … } /* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. For the purpose of speed we only consider the waking and previous * CPU. * * wake_affine_idle() - only considers 'now', it check if the waking CPU is * cache-affine and is (or will be) idle. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work * for the overloaded case. */ static int wake_affine_idle(int this_cpu, int prev_cpu, int sync) { … } static int wake_affine_weight(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { … } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { … } static struct sched_group * sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); /* * sched_balance_find_dst_group_cpu - find the idlest CPU among the CPUs in the group. */ static int sched_balance_find_dst_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { … } static inline int sched_balance_find_dst_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag) { … } static inline int __select_idle_cpu(int cpu, struct task_struct *p) { … } #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(…); static inline void set_idle_cores(int cpu, int val) { … } static inline bool test_idle_cores(int cpu) { … } /* * Scans the local SMT mask to see if the entire core is idle, and records this * information in sd_llc_shared->has_idle_cores. * * Since SMT siblings share all cache levels, inspecting this limited remote * state should be fairly cheap. */ void __update_idle_core(struct rq *rq) { … } /* * Scan the entire LLC domain for idle cores; this dynamically switches off if * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { … } /* * Scan the local SMT mask for idle CPUs. */ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { … } #else /* CONFIG_SCHED_SMT */ static inline void set_idle_cores(int cpu, int val) { } static inline bool test_idle_cores(int cpu) { return false; } static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { return __select_idle_cpu(core, p); } static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) { return -1; } #endif /* CONFIG_SCHED_SMT */ /* * Scan the LLC domain for idle CPUs; this is dynamically regulated by * comparing the average scan cost (tracked in sd->avg_scan_cost) against the * average idle time for this rq (as found in rq->avg_idle). */ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target) { … } /* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. If no CPU is big enough, but there are idle ones, try to * maximize capacity. */ static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { … } static inline bool asym_fits_cpu(unsigned long util, unsigned long util_min, unsigned long util_max, int cpu) { … } /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { … } /** * cpu_util() - Estimates the amount of CPU capacity used by CFS tasks. * @cpu: the CPU to get the utilization for * @p: task for which the CPU utilization should be predicted or NULL * @dst_cpu: CPU @p migrates to, -1 if @p moves from @cpu or @p == NULL * @boost: 1 to enable boosting, otherwise 0 * * The unit of the return value must be the same as the one of CPU capacity * so that CPU utilization can be compared with CPU capacity. * * CPU utilization is the sum of running time of runnable tasks plus the * recent utilization of currently non-runnable tasks on that CPU. * It represents the amount of CPU capacity currently used by CFS tasks in * the range [0..max CPU capacity] with max CPU capacity being the CPU * capacity at f_max. * * The estimated CPU utilization is defined as the maximum between CPU * utilization and sum of the estimated utilization of the currently * runnable tasks on that CPU. It preserves a utilization "snapshot" of * previously-executed tasks, which helps better deduce how busy a CPU will * be when a long-sleeping task wakes up. The contribution to CPU utilization * of such a task would be significantly decayed at this point of time. * * Boosted CPU utilization is defined as max(CPU runnable, CPU utilization). * CPU contention for CFS tasks can be detected by CPU runnable > CPU * utilization. Boosting is implemented in cpu_util() so that internal * users (e.g. EAS) can use it next to external users (e.g. schedutil), * latter via cpu_util_cfs_boost(). * * CPU utilization can be higher than the current CPU capacity * (f_curr/f_max * max CPU capacity) or even the max CPU capacity because * of rounding errors as well as task migrations or wakeups of new tasks. * CPU utilization has to be capped to fit into the [0..max CPU capacity] * range. Otherwise a group of CPUs (CPU0 util = 121% + CPU1 util = 80%) * could be seen as over-utilized even though CPU1 has 20% of spare CPU * capacity. CPU utilization is allowed to overshoot current CPU capacity * though since this is useful for predicting the CPU capacity required * after task migrations (scheduler-driven DVFS). * * Return: (Boosted) (estimated) utilization for the specified CPU. */ static unsigned long cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) { … } unsigned long cpu_util_cfs(int cpu) { … } unsigned long cpu_util_cfs_boost(int cpu) { … } /* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested * @p: the task which utilization should be discounted * * The utilization of a CPU is defined by the utilization of tasks currently * enqueued on that CPU as well as tasks which are currently sleeping after an * execution on that CPU. * * This method returns the utilization of the specified CPU by discounting the * utilization of the specified task, whenever the task is currently * contributing to the CPU utilization. */ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { … } /* * This function computes an effective utilization for the given CPU, to be * used for frequency selection given the linear relation: f = u * f_max. * * The scheduler tracks the following metrics: * * cpu_util_{cfs,rt,dl,irq}() * cpu_bw_dl() * * Where the cfs,rt and dl util numbers are tracked with the same metric and * synchronized windows and are thus directly comparable. * * The cfs,rt,dl utilization are the running times measured with rq->clock_task * which excludes things like IRQ and steal-time. These latter are then accrued * in the IRQ utilization. * * The DL bandwidth number OTOH is not a measured metric but a value computed * based on the task model parameters and gives the minimal utilization * required to meet deadlines. */ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, unsigned long *min, unsigned long *max) { … } unsigned long sched_cpu_util(int cpu) { … } /* * energy_env - Utilization landscape for energy estimation. * @task_busy_time: Utilization contribution by the task for which we test the * placement. Given by eenv_task_busy_time(). * @pd_busy_time: Utilization of the whole perf domain without the task * contribution. Given by eenv_pd_busy_time(). * @cpu_cap: Maximum CPU capacity for the perf domain. * @pd_cap: Entire perf domain capacity. (pd->nr_cpus * cpu_cap). */ struct energy_env { … }; /* * Compute the task busy time for compute_energy(). This time cannot be * injected directly into effective_cpu_util() because of the IRQ scaling. * The latter only makes sense with the most recent CPUs where the task has * run. */ static inline void eenv_task_busy_time(struct energy_env *eenv, struct task_struct *p, int prev_cpu) { … } /* * Compute the perf_domain (PD) busy time for compute_energy(). Based on the * utilization for each @pd_cpus, it however doesn't take into account * clamping since the ratio (utilization / cpu_capacity) is already enough to * scale the EM reported power consumption at the (eventually clamped) * cpu_capacity. * * The contribution of the task @p for which we want to estimate the * energy cost is removed (by cpu_util()) and must be calculated * separately (see eenv_task_busy_time). This ensures: * * - A stable PD utilization, no matter which CPU of that PD we want to place * the task on. * * - A fair comparison between CPUs as the task contribution (task_util()) * will always be the same no matter which CPU utilization we rely on * (util_avg or util_est). * * Set @eenv busy time for the PD that spans @pd_cpus. This busy time can't * exceed @eenv->pd_cap. */ static inline void eenv_pd_busy_time(struct energy_env *eenv, struct cpumask *pd_cpus, struct task_struct *p) { … } /* * Compute the maximum utilization for compute_energy() when the task @p * is placed on the cpu @dst_cpu. * * Returns the maximum utilization among @eenv->cpus. This utilization can't * exceed @eenv->cpu_cap. */ static inline unsigned long eenv_pd_max_util(struct energy_env *eenv, struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) { … } /* * compute_energy(): Use the Energy Model to estimate the energy that @pd would * consume for a given utilization landscape @eenv. When @dst_cpu < 0, the task * contribution is ignored. */ static inline unsigned long compute_energy(struct energy_env *eenv, struct perf_domain *pd, struct cpumask *pd_cpus, struct task_struct *p, int dst_cpu) { … } /* * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the * waking task. find_energy_efficient_cpu() looks for the CPU with maximum * spare capacity in each performance domain and uses it as a potential * candidate to execute the task. Then, it uses the Energy Model to figure * out which of the CPU candidates is the most energy-efficient. * * The rationale for this heuristic is as follows. In a performance domain, * all the most energy efficient CPU candidates (according to the Energy * Model) are those for which we'll request a low frequency. When there are * several CPUs for which the frequency request will be the same, we don't * have enough data to break the tie between them, because the Energy Model * only includes active power costs. With this model, if we assume that * frequency requests follow utilization (e.g. using schedutil), the CPU with * the maximum spare capacity in a performance domain is guaranteed to be among * the best candidates of the performance domain. * * In practice, it could be preferable from an energy standpoint to pack * small tasks on a CPU in order to let other CPUs go in deeper idle states, * but that could also hurt our chances to go cluster idle, and we have no * ways to tell with the current Energy Model if this is actually a good * idea or not. So, find_energy_efficient_cpu() basically favors * cluster-packing, and spreading inside a cluster. That should at least be * a good thing for latency, and this is consistent with the idea that most * of the energy savings of EAS come from the asymmetry of the system, and * not so much from breaking the tie between identical CPUs. That's also the * reason why EAS is enabled in the topology code only for systems where * SD_ASYM_CPUCAPACITY is set. * * NOTE: Forkees are not accepted in the energy-aware wake-up path because * they don't have any useful utilization data yet and it's not possible to * forecast their impact on energy consumption. Consequently, they will be * placed by sched_balance_find_dst_cpu() on the least loaded CPU, which might turn out * to be energy-inefficient in some use-cases. The alternative would be to * bias new tasks towards specific types of CPUs first, or to try to infer * their util_avg from the parent task, but those heuristics could hurt * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. */ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { … } /* * select_task_rq_fair: Select target runqueue for the waking task in domains * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * * Balances load by selecting the idlest CPU in the idlest group, or under * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. * * Returns the target CPU number. */ static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { … } /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { … } static void task_dead_fair(struct task_struct *p) { … } /* * Set the max capacity the task is allowed to run at for misfit detection. */ static void set_task_max_allowed_capacity(struct task_struct *p) { … } static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx) { … } static int balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { … } #else static inline void set_task_max_allowed_capacity(struct task_struct *p) {} #endif /* CONFIG_SMP */ static void set_next_buddy(struct sched_entity *se) { … } /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { … } static struct task_struct *pick_task_fair(struct rq *rq) { … } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { … } static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) { … } static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) { … } static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) { … } void fair_server_init(struct rq *rq) { … } /* * Account for a descheduled task: */ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) { … } /* * sched_yield() is very simple */ static void yield_task_fair(struct rq *rq) { … } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) { … } #ifdef CONFIG_SMP /************************************************** * Fair scheduling class load-balancing methods. * * BASICS * * The purpose of load-balancing is to achieve the same basic fairness the * per-CPU scheduler provides, namely provide a proportional amount of compute * time to each task. This is expressed in the following equation: * * W_i,n/P_i == W_j,n/P_j for all i,j (1) * * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight * W_i,0 is defined as: * * W_i,0 = \Sum_j w_i,j (2) * * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight * is derived from the nice value as per sched_prio_to_weight[]. * * The weight average is an exponential decay average of the instantaneous * weight: * * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) * * C_i is the compute capacity of CPU i, typically it is the * fraction of 'recent' time available for SCHED_OTHER task execution. But it * can also include other factors [XXX]. * * To achieve this balance we define a measure of imbalance which follows * directly from (1): * * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4) * * We them move tasks around to minimize the imbalance. In the continuous * function space it is obvious this converges, in the discrete case we get * a few fun cases generally called infeasible weight scenarios. * * [XXX expand on: * - infeasible weights; * - local vs global optima in the discrete case. ] * * * SCHED DOMAINS * * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) * for all i,j solution, we create a tree of CPUs that follows the hardware * topology where each level pairs two lower groups (or better). This results * in O(log n) layers. Furthermore we reduce the number of CPUs going up the * tree to only the first of the previous level and we decrease the frequency * of load-balance at each level inversely proportional to the number of CPUs in * the groups. * * This yields: * * log_2 n 1 n * \Sum { --- * --- * 2^i } = O(n) (5) * i = 0 2^i 2^i * `- size of each group * | | `- number of CPUs doing load-balance * | `- freq * `- sum over all levels * * Coupled with a limit on how many tasks we can migrate every balance pass, * this makes (5) the runtime complexity of the balancer. * * An important property here is that each CPU is still (indirectly) connected * to every other CPU in at most O(log n) steps: * * The adjacency matrix of the resulting graph is given by: * * log_2 n * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) * k = 0 * * And you'll find that: * * A^(log_2 n)_i,j != 0 for all i,j (7) * * Showing there's indeed a path between every CPU in at most O(log n) steps. * The task movement gives a factor of O(m), giving a convergence complexity * of: * * O(nm log n), n := nr_cpus, m := nr_tasks (8) * * * WORK CONSERVING * * In order to avoid CPUs going idle while there's still work to do, new idle * balancing is more aggressive and has the newly idle CPU iterate up the domain * tree itself instead of relying on other CPUs to bring it work. * * This adds some complexity to both (5) and (8) but it reduces the total idle * time. * * [XXX more?] * * * CGROUPS * * Cgroups make a horror show out of (2), instead of a simple sum we get: * * s_k,i * W_i,0 = \Sum_j \Prod_k w_k * ----- (9) * S_k * * Where * * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) * * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. * * The big problem is S_k, its a global sum needed to compute a local (W_i) * property. * * [XXX write more on how we solve this.. _after_ merging pjt's patches that * rewrite all of this once again.] */ static unsigned long __read_mostly max_load_balance_interval = …; enum fbq_type { … }; /* * 'group_type' describes the group of CPUs at the moment of load balancing. * * The enum is ordered by pulling priority, with the group with lowest priority * first so the group_type can simply be compared when selecting the busiest * group. See update_sd_pick_busiest(). */ enum group_type { … }; enum migration_type { … }; #define LBF_ALL_PINNED … #define LBF_NEED_BREAK … #define LBF_DST_PINNED … #define LBF_SOME_PINNED … #define LBF_ACTIVE_LB … struct lb_env { … }; /* * Is this task likely cache-hot: */ static int task_hot(struct task_struct *p, struct lb_env *env) { … } #ifdef CONFIG_NUMA_BALANCING /* * Returns 1, if task migration degrades locality * Returns 0, if task migration improves locality i.e migration preferred. * Returns -1, if task migration is not affected by locality. */ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { … } #else static inline int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) { return -1; } #endif /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static int can_migrate_task(struct task_struct *p, struct lb_env *env) { … } /* * detach_task() -- detach the task for the migration specified in env */ static void detach_task(struct task_struct *p, struct lb_env *env) { … } /* * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * part of active balancing operations within "domain". * * Returns a task if successful and NULL otherwise. */ static struct task_struct *detach_one_task(struct lb_env *env) { … } /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". * * Returns number of detached tasks if successful and 0 otherwise. */ static int detach_tasks(struct lb_env *env) { … } /* * attach_task() -- attach the task detached by detach_task() to its new rq. */ static void attach_task(struct rq *rq, struct task_struct *p) { … } /* * attach_one_task() -- attaches the task returned from detach_one_task() to * its new rq. */ static void attach_one_task(struct rq *rq, struct task_struct *p) { … } /* * attach_tasks() -- attaches all tasks detached by detach_tasks() to their * new rq. */ static void attach_tasks(struct lb_env *env) { … } #ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { … } static inline bool others_have_blocked(struct rq *rq) { … } static inline void update_blocked_load_tick(struct rq *rq) { … } static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { … } #else static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } static inline bool others_have_blocked(struct rq *rq) { return false; } static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif static bool __update_blocked_others(struct rq *rq, bool *done) { … } #ifdef CONFIG_FAIR_GROUP_SCHED static bool __update_blocked_fair(struct rq *rq, bool *done) { … } /* * Compute the hierarchical load factor for cfs_rq and all its ascendants. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) { … } static unsigned long task_h_load(struct task_struct *p) { … } #else static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; bool decayed; decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); if (cfs_rq_has_blocked(cfs_rq)) *done = false; return decayed; } static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg; } #endif static void sched_balance_update_blocked_averages(int cpu) { … } /********** Helpers for sched_balance_find_src_group ************************/ /* * sg_lb_stats - stats of a sched_group required for load-balancing: */ struct sg_lb_stats { … }; /* * sd_lb_stats - stats of a sched_domain required for load-balancing: */ struct sd_lb_stats { … }; static inline void init_sd_lb_stats(struct sd_lb_stats *sds) { … } static unsigned long scale_rt_capacity(int cpu) { … } static void update_cpu_capacity(struct sched_domain *sd, int cpu) { … } void update_group_capacity(struct sched_domain *sd, int cpu) { … } /* * Check whether the capacity of the rq has been noticeably reduced by side * activity. The imbalance_pct is used for the threshold. * Return true is the capacity is reduced */ static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { … } /* Check if the rq has a misfit task */ static inline bool check_misfit_status(struct rq *rq) { … } /* * Group imbalance indicates (and tries to solve) the problem where balancing * groups is inadequate due to ->cpus_ptr constraints. * * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. * Something like: * * { 0 1 2 3 } { 4 5 6 7 } * * * * * * * If we were to balance group-wise we'd place two tasks in the first group and * two tasks in the second group. Clearly this is undesired as it will overload * cpu 3 and leave one of the CPUs in the second group unused. * * The current solution to this issue is detecting the skew in the first group * by noticing the lower domain failed to reach balance and had difficulty * moving tasks due to affinity constraints. * * When this is so detected; this group becomes a candidate for busiest; see * update_sd_pick_busiest(). And calculate_imbalance() and * sched_balance_find_src_group() avoid some of the usual balance conditions to allow it * to create an effective group imbalance. * * This is a somewhat tricky proposition since the next run might not find the * group imbalance and decide the groups need to be balanced again. A most * subtle and fragile situation. */ static inline int sg_imbalanced(struct sched_group *group) { … } /* * group_has_capacity returns true if the group has spare capacity that could * be used by some tasks. * We consider that a group has spare capacity if the number of task is * smaller than the number of CPUs or if the utilization is lower than the * available capacity for CFS tasks. * For the latter, we use a threshold to stabilize the state, to take into * account the variance of the tasks' load and to return true if the available * capacity in meaningful for the load balancer. * As an example, an available capacity of 1% can appear but it doesn't make * any benefit for the load balance. */ static inline bool group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { … } /* * group_is_overloaded returns true if the group has more tasks than it can * handle. * group_is_overloaded is not equals to !group_has_capacity because a group * with the exact right number of tasks, has no more spare capacity but is not * overloaded so both group_has_capacity and group_is_overloaded return * false. */ static inline bool group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) { … } static inline enum group_type group_classify(unsigned int imbalance_pct, struct sched_group *group, struct sg_lb_stats *sgs) { … } /** * sched_use_asym_prio - Check whether asym_packing priority must be used * @sd: The scheduling domain of the load balancing * @cpu: A CPU * * Always use CPU priority when balancing load between SMT siblings. When * balancing load between cores, it is not sufficient that @cpu is idle. Only * use CPU priority if the whole core is idle. * * Returns: True if the priority of @cpu must be followed. False otherwise. */ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) { … } static inline bool sched_asym(struct sched_domain *sd, int dst_cpu, int src_cpu) { … } /** * sched_group_asym - Check if the destination CPU can do asym_packing balance * @env: The load balancing environment * @sgs: Load-balancing statistics of the candidate busiest group * @group: The candidate busiest group * * @env::dst_cpu can do asym_packing if it has higher priority than the * preferred CPU of @group. * * Return: true if @env::dst_cpu can do with asym_packing load balance. False * otherwise. */ static inline bool sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { … } /* One group has more than one SMT CPU while the other group does not */ static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, struct sched_group *sg2) { … } static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group *group) { … } static inline long sibling_imbalance(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *busiest, struct sg_lb_stats *local) { … } static inline bool sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) { … } /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. * @sds: Load-balancing data with statistics of the local group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @sg_overloaded: sched_group is overloaded * @sg_overutilized: sched_group is overutilized */ static inline void update_sg_lb_stats(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *group, struct sg_lb_stats *sgs, bool *sg_overloaded, bool *sg_overutilized) { … } /** * update_sd_pick_busiest - return 1 on busiest group * @env: The load balancing environment. * @sds: sched_domain statistics * @sg: sched_group candidate to be checked for being the busiest * @sgs: sched_group statistics * * Determine if @sg is a busier group than the previously selected * busiest group. * * Return: %true if @sg is a busier group than the previously selected * busiest group. %false otherwise. */ static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds, struct sched_group *sg, struct sg_lb_stats *sgs) { … } #ifdef CONFIG_NUMA_BALANCING static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { … } static inline enum fbq_type fbq_classify_rq(struct rq *rq) { … } #else static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs) { return all; } static inline enum fbq_type fbq_classify_rq(struct rq *rq) { return regular; } #endif /* CONFIG_NUMA_BALANCING */ struct sg_lb_stats; /* * task_running_on_cpu - return 1 if @p is running on @cpu. */ static unsigned int task_running_on_cpu(int cpu, struct task_struct *p) { … } /** * idle_cpu_without - would a given CPU be idle without p ? * @cpu: the processor on which idleness is tested. * @p: task which should be ignored. * * Return: 1 if the CPU would be idle. 0 otherwise. */ static int idle_cpu_without(int cpu, struct task_struct *p) { … } /* * update_sg_wakeup_stats - Update sched_group's statistics for wakeup. * @sd: The sched_domain level to look for idlest group. * @group: sched_group whose statistics are to be updated. * @sgs: variable to hold the statistics for this group. * @p: The task for which we look for the idlest group/CPU. */ static inline void update_sg_wakeup_stats(struct sched_domain *sd, struct sched_group *group, struct sg_lb_stats *sgs, struct task_struct *p) { … } static bool update_pick_idlest(struct sched_group *idlest, struct sg_lb_stats *idlest_sgs, struct sched_group *group, struct sg_lb_stats *sgs) { … } /* * sched_balance_find_dst_group() finds and returns the least busy CPU group within the * domain. * * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) { … } static void update_idle_cpu_scan(struct lb_env *env, unsigned long sum_util) { … } /** * update_sd_lb_stats - Update sched_domain's statistics for load balancing. * @env: The load balancing environment. * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { … } /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. * @env: load balance environment * @sds: statistics of the sched_domain whose imbalance is to be calculated. */ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { … } /******* sched_balance_find_src_group() helpers end here *********************/ /* * Decision matrix according to the local and busiest group type: * * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded * has_spare nr_idle balanced N/A N/A balanced balanced * fully_busy nr_idle nr_idle N/A N/A balanced balanced * misfit_task force N/A N/A N/A N/A N/A * asym_packing force force N/A N/A force force * imbalanced force force N/A N/A force force * overloaded force force N/A N/A force avg_load * * N/A : Not Applicable because already filtered while updating * statistics. * balanced : The system is balanced for these 2 groups. * force : Calculate the imbalance as load migration is probably needed. * avg_load : Only if imbalance is significant enough. * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite * different in groups. */ /** * sched_balance_find_src_group - Returns the busiest group within the sched_domain * if there is an imbalance. * @env: The load balancing environment. * * Also calculates the amount of runnable load which should be moved * to restore balance. * * Return: - The busiest group if imbalance exists. */ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) { … } /* * sched_balance_find_src_rq - find the busiest runqueue among the CPUs in the group. */ static struct rq *sched_balance_find_src_rq(struct lb_env *env, struct sched_group *group) { … } /* * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but * so long as it is large enough. */ #define MAX_PINNED_INTERVAL … static inline bool asym_active_balance(struct lb_env *env) { … } static inline bool imbalanced_active_balance(struct lb_env *env) { … } static int need_active_balance(struct lb_env *env) { … } static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) { … } /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int sched_balance_rq(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *continue_balancing) { … } static inline unsigned long get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) { … } static inline void update_next_balance(struct sched_domain *sd, unsigned long *next_balance) { … } /* * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at * least 1 task to be running on each physical CPU where possible, and * avoids physical / logical imbalances. */ static int active_load_balance_cpu_stop(void *data) { … } /* * This flag serializes load-balancing passes over large domains * (above the NODE topology level) - only one load-balancing instance * may run at a time, to reduce overhead on very large systems with * lots of CPUs and large NUMA distances. * * - Note that load-balancing passes triggered while another one * is executing are skipped and not re-tried. * * - Also note that this does not serialize rebalance_domains() * execution, as non-SD_SERIALIZE domains will still be * load-balanced in parallel. */ static atomic_t sched_balance_running = …; /* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ void update_max_interval(void) { … } static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) { … } /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in init_sched_domains. */ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) { … } static inline int on_null_domain(struct rq *rq) { … } #ifdef CONFIG_NO_HZ_COMMON /* * NOHZ idle load balancing (ILB) details: * * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. * * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set * anywhere yet. */ static inline int find_new_ilb(void) { … } /* * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU * SMP function call (IPI). * * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { … } /* * Current decision point for kicking the idle load balancer in the presence * of idle CPUs in the system. */ static void nohz_balancer_kick(struct rq *rq) { … } static void set_cpu_sd_state_busy(int cpu) { … } void nohz_balance_exit_idle(struct rq *rq) { … } static void set_cpu_sd_state_idle(int cpu) { … } /* * This routine will record that the CPU is going idle with tick stopped. * This info will be used in performing idle load balancing in the future. */ void nohz_balance_enter_idle(int cpu) { … } static bool update_nohz_stats(struct rq *rq) { … } /* * Internal function that runs load balance for all idle CPUs. The load balance * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { … } /* * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the CPUs for whom scheduler ticks are stopped. */ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { … } /* * Check if we need to directly run the ILB for updating blocked load before * entering idle state. Here we run ILB directly without issuing IPIs. * * Note that when this function is called, the tick may not yet be stopped on * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is * called from this function on (this) CPU that's not yet in the mask. That's * OK because the goal of nohz_run_idle_balance() is to run ILB only for * updating the blocked load of already idle CPUs without waking up one of * those idle CPUs and outside the preempt disable / IRQ off phase of the local * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) { … } static void nohz_newidle_balance(struct rq *this_rq) { … } #else /* !CONFIG_NO_HZ_COMMON */ static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { return false; } static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * sched_balance_newidle is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. * * Returns: * < 0 - we released the lock and there are !fair tasks present * 0 - failed, no new tasks * > 0 - success, new (fair) tasks present */ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) { … } /* * This softirq handler is triggered via SCHED_SOFTIRQ from two places: * * - directly from the local scheduler_tick() for periodic load balancing * * - indirectly from a remote scheduler_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ static __latent_entropy void sched_balance_softirq(void) { … } /* * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. */ void sched_balance_trigger(struct rq *rq) { … } static void rq_online_fair(struct rq *rq) { … } static void rq_offline_fair(struct rq *rq) { … } #endif /* CONFIG_SMP */ #ifdef CONFIG_SCHED_CORE static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { … } #define MIN_NR_TASKS_DURING_FORCEIDLE … static inline void task_tick_core(struct rq *rq, struct task_struct *curr) { … } /* * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. */ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, bool forceidle) { … } void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) { … } bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, bool in_fi) { … } static int task_is_throttled_fair(struct task_struct *p, int cpu) { … } #else static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} #endif /* * scheduler tick hitting a task of our scheduling class. * * NOTE: This function can be called remotely by the tick offload that * goes along full dynticks. Therefore no local assumption can be made * and everything must be accessed through the @rq and @curr passed in * parameters. */ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { … } /* * called on fork with the child task as argument from the parent's context * - child not yet on the tasklist * - preemption disabled */ static void task_fork_fair(struct task_struct *p) { … } /* * Priority of the task has changed. Check to see if we preempt * the current task. */ static void prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) { … } #ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it * visible to the root */ static void propagate_entity_cfs_rq(struct sched_entity *se) { … } #else static void propagate_entity_cfs_rq(struct sched_entity *se) { } #endif static void detach_entity_cfs_rq(struct sched_entity *se) { … } static void attach_entity_cfs_rq(struct sched_entity *se) { … } static void detach_task_cfs_rq(struct task_struct *p) { … } static void attach_task_cfs_rq(struct task_struct *p) { … } static void switched_from_fair(struct rq *rq, struct task_struct *p) { … } static void switched_to_fair(struct rq *rq, struct task_struct *p) { … } static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { … } /* * Account for a task changing its policy or group. * * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. */ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) { … } void init_cfs_rq(struct cfs_rq *cfs_rq) { … } #ifdef CONFIG_FAIR_GROUP_SCHED static void task_change_group_fair(struct task_struct *p) { … } void free_fair_sched_group(struct task_group *tg) { … } int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { … } void online_fair_sched_group(struct task_group *tg) { … } void unregister_fair_sched_group(struct task_group *tg) { … } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent) { … } static DEFINE_MUTEX(shares_mutex); static int __sched_group_set_shares(struct task_group *tg, unsigned long shares) { … } int sched_group_set_shares(struct task_group *tg, unsigned long shares) { … } int sched_group_set_idle(struct task_group *tg, long idle) { … } #endif /* CONFIG_FAIR_GROUP_SCHED */ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) { … } /* * All the scheduling class methods: */ DEFINE_SCHED_CLASS(fair) = …; #ifdef CONFIG_SCHED_DEBUG void print_cfs_stats(struct seq_file *m, int cpu) { … } #ifdef CONFIG_NUMA_BALANCING void show_numa_stats(struct task_struct *p, struct seq_file *m) { … } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ __init void init_sched_fair_class(void) { … }