/*
 * kernel/cpuset.c
 *
 * Processor and Memory placement constraints for sets of tasks.
 *
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2007 Silicon Graphics, Inc.
 * Copyright (C) 2006 Google, Inc
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * 2006 Rework by Paul Menage to use generic cgroups
 * 2008 Rework of the scheduler domains and CPU hotplug handling
 *      by Max Krasnyansky
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#include "cgroup-internal.h"

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/export.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);

/*
 * There could be abnormal cpuset configurations for cpu or memory
 * node binding; add this key to provide a quick low-cost judgment
 * of the situation.
 */
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);

/* See "Frequency meter" comments, below. */
struct fmeter { … };

/*
 * Invalid partition error code
 */
enum prs_errcode { … };

static const char * const perr_strings[] = …;

struct cpuset { … };

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct { … };

/*
 * Exclusive CPUs distributed out to sub-partitions of top_cpuset
 */
static cpumask_var_t subpartitions_cpus;

/*
 * Exclusive CPUs in isolated partitions
 */
static cpumask_var_t isolated_cpus;

/* List of remote partition root children */
static struct list_head remote_children;

/*
 * Partition root states:
 *
 *   0 - member (not a partition root)
 *   1 - partition root
 *   2 - partition root without load balancing (isolated)
 *  -1 - invalid partition root
 *  -2 - invalid isolated partition root
 *
 * There are 2 types of partitions - local or remote. Local partitions are
 * those whose parents are partition roots themselves. Setting
 * cpuset.cpus.exclusive is optional when setting up local partitions.
 * Remote partitions are those whose parents are not partition roots. Passing
 * down exclusive CPUs by setting cpuset.cpus.exclusive along the ancestor
 * nodes is mandatory when creating a remote partition.
 *
 * For simplicity, a local partition can be created under a local or remote
 * partition but a remote partition cannot have any partition root in its
 * ancestor chain except the cgroup root.
 */
#define PRS_MEMBER		…
#define PRS_ROOT		…
#define PRS_ISOLATED		…
#define PRS_INVALID_ROOT	…
#define PRS_INVALID_ISOLATED	…

static inline bool is_prs_invalid(int prs_state)
{ … }
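
/*
 * Illustrative example only (cgroup v2 semantics as summarized above, not
 * taken from the elided code): a local partition under an existing
 * partition root could typically be set up from userspace with
 *
 *	# echo 2-3  > parent/child/cpuset.cpus
 *	# echo root > parent/child/cpuset.cpus.partition
 *
 * while a remote partition additionally needs exclusive CPUs handed down
 * from the cgroup root via cpuset.cpus.exclusive on every ancestor, e.g.
 *
 *	# echo 2-3      > a/cpuset.cpus.exclusive
 *	# echo 2-3      > a/b/cpuset.cpus.exclusive
 *	# echo isolated > a/b/cpuset.cpus.partition
 *
 * See Documentation/admin-guide/cgroup-v2.rst for the authoritative rules.
 */
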
/*
 * Temporary cpumasks for working with partitions that are passed among
 * functions to avoid memory allocation in inner functions.
 */
struct tmpmasks { … };

static inline struct cpuset *css_cs(struct cgroup_subsys_state *css)
{ … }

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{ … }

static inline struct cpuset *parent_cs(struct cpuset *cs)
{ … }

void inc_dl_tasks_cs(struct task_struct *p)
{ … }

void dec_dl_tasks_cs(struct task_struct *p)
{ … }

/* bits in struct cpuset flags field */
typedef enum { … } cpuset_flagbits_t;

/* convenient tests for these bits */
static inline bool is_cpuset_online(struct cpuset *cs)
{ … }

static inline int is_cpu_exclusive(const struct cpuset *cs)
{ … }

static inline int is_mem_exclusive(const struct cpuset *cs)
{ … }

static inline int is_mem_hardwall(const struct cpuset *cs)
{ … }

static inline int is_sched_load_balance(const struct cpuset *cs)
{ … }

static inline int is_memory_migrate(const struct cpuset *cs)
{ … }

static inline int is_spread_page(const struct cpuset *cs)
{ … }

static inline int is_spread_slab(const struct cpuset *cs)
{ … }

static inline int is_partition_valid(const struct cpuset *cs)
{ … }

static inline int is_partition_invalid(const struct cpuset *cs)
{ … }

/*
 * Callers should hold callback_lock to modify partition_root_state.
 */
static inline void make_partition_invalid(struct cpuset *cs)
{ … }

/*
 * Send notification event whenever partition_root_state changes.
 */
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{ … }

static struct cpuset top_cpuset = …;

/**
 * cpuset_for_each_child - traverse online children of a cpuset
 * @child_cs: loop cursor pointing to the current child
 * @pos_css: used for iteration
 * @parent_cs: target cpuset to walk children of
 *
 * Walk @child_cs through the online children of @parent_cs. Must be used
 * with RCU read locked.
 */
#define cpuset_for_each_child(child_cs, pos_css, parent_cs)	…

/**
 * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
 * @des_cs: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @root_cs: target cpuset to walk descendants of
 *
 * Walk @des_cs through the online descendants of @root_cs. Must be used
 * with RCU read locked. The caller may modify @pos_css by calling
 * css_rightmost_descendant() to skip a subtree. @root_cs is included in the
 * iteration and is the first node to be visited.
 */
#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs)	…

/*
 * There are two global locks guarding cpuset structures - cpuset_mutex and
 * callback_lock. We also require taking task_lock() when dereferencing a
 * task's cpuset pointer. See "The task_lock() exception", at the end of this
 * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
 * can use cpuset_lock()/cpuset_unlock() to prevent changes to cpuset
 * structures. Note that cpuset_mutex needs to be a mutex as it is used in
 * paths that rely on priority inheritance (e.g. scheduler - on RT) for
 * correctness.
 *
 * A task must hold both locks to modify cpusets. If a task holds
 * cpuset_mutex, it blocks others, ensuring that it is the only task able to
 * also acquire callback_lock and be able to modify cpusets. It can perform
 * various checks on the cpuset structure first, knowing nothing will change.
 * It can also allocate memory while just holding cpuset_mutex. While it is
 * performing these checks, various callback routines can briefly acquire
 * callback_lock to query cpusets. Once it is ready to make the changes, it
 * takes callback_lock, blocking everyone else.
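 *
 * An update therefore typically follows this pattern (an illustrative
 * sketch only; the details vary per operation):
 *
 *	mutex_lock(&cpuset_mutex);
 *	... validate the request, allocate any memory needed ...
 *	spin_lock_irq(&callback_lock);
 *	... publish the new masks and flags ...
 *	spin_unlock_irq(&callback_lock);
 *	mutex_unlock(&cpuset_mutex);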
 *
 * Calls to the kernel memory allocator cannot be made while holding
 * callback_lock, as that would risk double tripping on callback_lock
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_lock, then it has read-only
 * access to cpusets.
 *
 * Now, the task_struct fields mems_allowed and mempolicy may be changed
 * by another task; we use alloc_lock in the task_struct to protect them.
 *
 * The cpuset_common_seq_show() handlers only hold callback_lock across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(cpuset_mutex);

void cpuset_lock(void)
{ … }

void cpuset_unlock(void)
{ … }

static DEFINE_SPINLOCK(callback_lock);

static struct workqueue_struct *cpuset_migrate_mm_wq;

static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);

static inline void check_insane_mems_config(nodemask_t *nodes)
{ … }

/*
 * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
 * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
 * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
 * With v2 behavior, "cpus" and "mems" are always what the users have
 * requested and won't be changed by hotplug events. Only the effective
 * cpus or mems will be affected.
 */
static inline bool is_in_v2_mode(void)
{ … }

/**
 * partition_is_populated - check if partition has tasks
 * @cs: partition root to be checked
 * @excluded_child: a child cpuset to be excluded in task checking
 * Return: true if there are tasks, false otherwise
 *
 * It is assumed that @cs is a valid partition root. @excluded_child should
 * be non-NULL when this cpuset is going to become a partition itself.
 */
static inline bool partition_is_populated(struct cpuset *cs,
					  struct cpuset *excluded_child)
{ … }

/*
 * Return in pmask the portion of a task's cpuset's cpus_allowed that
 * are online and are capable of running the task. If none are found,
 * walk up the cpuset hierarchy until we find one that does have some
 * appropriate cpus.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_cpus(struct task_struct *tsk,
				  struct cpumask *pmask)
{ … }

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory. If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems. The top cpuset always has some mems online.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
 * Call with callback_lock or cpuset_mutex held.
 */
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{ … }

/*
 * Update a task's spread flags if the cpuset's page/slab spread flag is set.
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
static void cpuset_update_task_spread_flags(struct cpuset *cs,
					    struct task_struct *tsk)
{ … }

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set. Call holding cpuset_mutex.
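 *
 * For example, a cpuset allowing CPUs 2-3 and memory node 0 is a subset of
 * one allowing CPUs 0-7 and nodes 0-1, provided it does not have
 * cpu_exclusive or mem_exclusive set while the larger cpuset has them clear.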
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{ … }

/**
 * alloc_cpumasks - allocate three cpumasks for cpuset
 * @cs: the cpuset that has cpumasks to be allocated.
 * @tmp: the tmpmasks structure pointer
 * Return: 0 if successful, -ENOMEM otherwise.
 *
 * Only one of the two input arguments should be non-NULL.
 */
static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{ … }

/**
 * free_cpumasks - free cpumasks in a tmpmasks structure
 * @cs: the cpuset whose cpumasks are to be freed.
 * @tmp: the tmpmasks structure pointer
 */
static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp)
{ … }

/**
 * alloc_trial_cpuset - allocate a trial cpuset
 * @cs: the cpuset that the trial cpuset duplicates
 */
static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
{ … }

/**
 * free_cpuset - free the cpuset
 * @cs: the cpuset to be freed
 */
static inline void free_cpuset(struct cpuset *cs)
{ … }

/* Return user-specified exclusive CPUs */
static inline struct cpumask *user_xcpus(struct cpuset *cs)
{ … }

static inline bool xcpus_empty(struct cpuset *cs)
{ … }

static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
{ … }

/*
 * cpusets_are_exclusive() - check if two cpusets are exclusive
 *
 * Return true if exclusive, false if not
 */
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{ … }

/*
 * validate_change_legacy() - Validate conditions specific to legacy (v1)
 *			      behavior.
 */
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
{ … }

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid? Presumes
 * cpuset_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset. Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of a bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{ … }

#ifdef CONFIG_SMP
/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping effective cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{ … }

static void update_domain_attr(struct sched_domain_attr *dattr,
			       struct cpuset *c)
{ … }

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
				    struct cpuset *root_cs)
{ … }

/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{ … }

/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to the
 * kernel/sched/core.c partition_sched_domains() routine, which will
 * rebuild the scheduler's load balancing domains (sched domains) as
 * specified by that partial partition.
 *
 * See "What is sched_load_balance" in Documentation/admin-guide/cgroup-v1/cpusets.rst
 * for a background explanation of this.
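 *
 * As a quick illustration (assuming no other load-balanced cpuset overlaps
 * them): three load-balanced cpusets whose effective CPUs are 0-3, 2-5 and
 * 8-11 yield two sched domains, 0-5 (the first two overlap and are merged)
 * and 8-11.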
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cpuset_mutex held.
 *
 * The three key local variables below are:
 *    cp - cpuset pointer, used (together with pos_css) to perform a
 *	   top-down scan of all cpusets. For our purposes, rebuilding
 *	   the scheduler's sched domains, we can ignore
 *	   !is_sched_load_balance cpusets.
 *   csa - (for CpuSet Array) Array of pointers to all the cpusets
 *	   that need to be load balanced, for convenient iterative
 *	   access by the subsequent code that finds the best partition,
 *	   i.e. the set of domains (subsets) of CPUs such that the
 *	   cpus_allowed of every cpuset marked is_sched_load_balance
 *	   is a subset of one of these domains, while there are as
 *	   many such domains as possible, each as small as possible.
 *  doms - Conversion of 'csa' to an array of cpumasks, for passing to
 *	   the kernel/sched/core.c routine partition_sched_domains() in a
 *	   convenient format that can be easily compared to the prior
 *	   value to determine what partition elements (sched domains)
 *	   were changed (added or removed).
 *
 * Finding the best partition (set of domains):
 *	The triple nested loops below over i, j, k scan over the
 *	load balanced cpusets (using the array of cpuset pointers in
 *	csa[]) looking for pairs of cpusets that have overlapping
 *	cpus_allowed but don't yet share the same 'pn' partition
 *	number, and assigning them the same partition number. It keeps
 *	looping on the 'restart' label until it can no longer find
 *	any such pairs.
 *
 *	The union of the cpus_allowed masks from the set of all
 *	cpusets having the same 'pn' value then forms one element of
 *	the partition (one sched domain) to be passed to
 *	partition_sched_domains().
 */
static int generate_sched_domains(cpumask_var_t **domains,
				  struct sched_domain_attr **attributes)
{ … }

static void dl_update_tasks_root_domain(struct cpuset *cs)
{ … }

static void dl_rebuild_rd_accounting(void)
{ … }

static void
partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
				    struct sched_domain_attr *dattr_new)
{ … }

/*
 * Rebuild scheduler domains.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * Call with cpuset_mutex held. Takes cpus_read_lock().
 */
static void rebuild_sched_domains_locked(void)
{ … }
#else /* !CONFIG_SMP */
static void rebuild_sched_domains_locked(void)
{
}
#endif /* CONFIG_SMP */

static void rebuild_sched_domains_cpuslocked(void)
{ … }

void rebuild_sched_domains(void)
{ … }

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @new_cpus: the temp variable for the new effective_cpus mask
 *
 * Iterate through each task of @cs updating its cpus_allowed to the
 * effective cpuset's. As this function is called with cpuset_mutex held,
 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
 * is used instead of effective_cpus to make sure all offline CPUs are also
 * included as hotplug code won't update cpumasks for tasks in top_cpuset.
 */
static void update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{ … }

/**
 * compute_effective_cpumask - Compute the effective cpumask of the cpuset
 * @new_cpus: the temp variable for the new effective_cpus mask
 * @cs: the cpuset that needs to recompute the new effective_cpus mask
 * @parent: the parent cpuset
 *
 * The result is valid only if the given cpuset isn't a partition root.
 */
static void compute_effective_cpumask(struct cpumask *new_cpus,
				      struct cpuset *cs, struct cpuset *parent)
{ … }

/*
 * Commands for update_parent_effective_cpumask
 */
enum partition_cmd { … };

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on);
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
				    struct tmpmasks *tmp);

/*
 * Update partition exclusive flag
 *
 * Return: 0 if successful, an error code otherwise
 */
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
{ … }

/*
 * Update partition load balance flag and/or rebuild sched domain
 *
 * Changing load balance flag will automatically call
 * rebuild_sched_domains_locked().
 * This function is for cgroup v2 only.
 */
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
{ … }

/*
 * tasks_nocpu_error - Return true if tasks will have no effective_cpus
 */
static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
			      struct cpumask *xcpus)
{ … }

static void reset_partition_data(struct cpuset *cs)
{ … }

/*
 * partition_xcpus_newstate - Exclusive CPUs state change
 * @old_prs: old partition_root_state
 * @new_prs: new partition_root_state
 * @xcpus: exclusive CPUs with state change
 */
static void partition_xcpus_newstate(int old_prs, int new_prs,
				     struct cpumask *xcpus)
{ … }

/*
 * partition_xcpus_add - Add new exclusive CPUs to partition
 * @new_prs: new partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be added
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL
 */
static bool partition_xcpus_add(int new_prs, struct cpuset *parent,
				struct cpumask *xcpus)
{ … }

/*
 * partition_xcpus_del - Remove exclusive CPUs from partition
 * @old_prs: old partition_root_state
 * @parent: parent cpuset
 * @xcpus: exclusive CPUs to be removed
 * Return: true if isolated_cpus modified, false otherwise
 *
 * Remote partition if parent == NULL
 */
static bool partition_xcpus_del(int old_prs, struct cpuset *parent,
				struct cpumask *xcpus)
{ … }

static void update_unbound_workqueue_cpumask(bool isolcpus_updated)
{ … }

/**
 * cpuset_cpu_is_isolated - Check if the given CPU is isolated
 * @cpu: the CPU number to be checked
 * Return: true if CPU is used in an isolated partition, false otherwise
 */
bool cpuset_cpu_is_isolated(int cpu)
{ … }
EXPORT_SYMBOL_GPL(…);

/*
 * compute_effective_exclusive_cpumask - compute effective exclusive CPUs
 * @cs: cpuset
 * @xcpus: effective exclusive CPUs value to be set
 * Return: true if xcpus is not empty, false otherwise.
 *
 * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
 * it must be a subset of parent's effective_xcpus.
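 *
 * For instance (illustrative numbers, assuming the effective value is the
 * intersection with the parent's mask): an exclusive_cpus of 2-5 under a
 * parent whose effective_xcpus is 0-3 yields an effective exclusive set
 * of 2-3.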
 */
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
						struct cpumask *xcpus)
{ … }

static inline bool is_remote_partition(struct cpuset *cs)
{ … }

static inline bool is_local_partition(struct cpuset *cs)
{ … }

/*
 * remote_partition_enable - Enable current cpuset as a remote partition root
 * @cs: the cpuset to update
 * @new_prs: new partition_root_state
 * @tmp: temporary masks
 * Return: 1 if successful, 0 if error
 *
 * Enable the current cpuset to become a remote partition root taking CPUs
 * directly from the top cpuset. cpuset_mutex must be held by the caller.
 */
static int remote_partition_enable(struct cpuset *cs, int new_prs,
				   struct tmpmasks *tmp)
{ … }

/*
 * remote_partition_disable - Remove current cpuset from remote partition list
 * @cs: the cpuset to update
 * @tmp: temporary masks
 *
 * The effective_cpus is also updated.
 *
 * cpuset_mutex must be held by the caller.
 */
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{ … }

/*
 * remote_cpus_update - cpus_exclusive change of remote partition
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @tmp: temporary masks
 *
 * top_cpuset and subpartitions_cpus will be updated or the partition can be
 * invalidated.
 */
static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
			       struct tmpmasks *tmp)
{ … }

/*
 * remote_partition_check - check if a child remote partition needs update
 * @cs: the cpuset to be updated
 * @newmask: the new effective_xcpus mask
 * @delmask: temporary mask for deletion (not in tmp)
 * @tmp: temporary masks
 *
 * This should be called before the given cs has updated its cpus_allowed
 * and/or effective_xcpus.
 */
static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
				   struct cpumask *delmask, struct tmpmasks *tmp)
{ … }

/*
 * prstate_housekeeping_conflict - check for partition & housekeeping conflicts
 * @prstate: partition root state to be checked
 * @new_cpus: cpu mask
 * Return: true if there is conflict, false otherwise
 *
 * CPUs outside of housekeeping_cpumask(HK_TYPE_DOMAIN) can only be used in
 * an isolated partition.
 */
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{ … }

/**
 * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset
 * @cs: The cpuset that requests change in partition root state
 * @cmd: Partition root state change command
 * @newmask: Optional new cpumask for partcmd_update
 * @tmp: Temporary addmask and delmask
 * Return: 0 or a partition root state error code
 *
 * For partcmd_enable*, the cpuset is being transformed from a non-partition
 * root to a partition root. The effective_xcpus (cpus_allowed if
 * effective_xcpus not set) mask of the given cpuset will be taken away from
 * parent's effective_cpus. The function will return 0 if all the CPUs listed
 * in effective_xcpus can be granted or an error code will be returned.
 *
 * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in effective_xcpus will be
 * given back to parent's effective_cpus. 0 will always be returned.
 *
 * For partcmd_update, if the optional newmask is specified, the cpu list is
 * to be changed from effective_xcpus to newmask. Otherwise, effective_xcpus is
 * assumed to remain the same. The cpuset should either be a valid or invalid
 * partition root. The partition root state may change from valid to invalid
 * or vice versa. An error code will be returned if transitioning from
 * invalid to valid violates the exclusivity rule.
 *
 * For partcmd_invalidate, the current partition will be made invalid.
 *
 * The partcmd_enable* and partcmd_disable commands are used by
 * update_prstate(). An error code may be returned and the caller will check
 * for error.
 *
 * The partcmd_update command is used by update_cpumasks_hier() with newmask
 * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
 * by update_cpumask() with NULL newmask. In both cases, the callers won't
 * check for error and so partition_root_state and prs_error will be updated
 * directly.
 */
static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
					   struct cpumask *newmask,
					   struct tmpmasks *tmp)
{ … }

/**
 * compute_partition_effective_cpumask - compute effective_cpus for partition
 * @cs: partition root cpuset
 * @new_ecpus: previously computed effective_cpus to be updated
 *
 * Compute the effective_cpus of a partition root by scanning effective_xcpus
 * of child partition roots and excluding their effective_xcpus.
 *
 * This has the side effect of invalidating valid child partition roots,
 * if necessary. Since it is called from either cpuset_hotplug_update_tasks()
 * or update_cpumasks_hier() where parent and children are modified
 * successively, we don't need to call update_parent_effective_cpumask()
 * and the child's effective_cpus will be updated in later iterations.
 *
 * Note that rcu_read_lock() is assumed to be held.
 */
static void compute_partition_effective_cpumask(struct cpuset *cs,
						struct cpumask *new_ecpus)
{ … }

/*
 * update_cpumasks_hier() flags
 */
#define HIER_CHECKALL		…
#define HIER_NO_SD_REBUILD	…

/*
 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @tmp: temp variables for calculating effective_cpus & partition setup
 * @flags: HIER_* flags; HIER_CHECKALL means don't skip any descendant cpusets
 *
 * When the configured cpumask is changed, the effective cpumasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_cpus will be the same as cpus_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
				 int flags)
{ … }

/**
 * update_sibling_cpumasks - Update siblings' cpumasks
 * @parent: Parent cpuset
 * @cs: Current cpuset
 * @tmp: Temp variables
 */
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
				    struct tmpmasks *tmp)
{ … }

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
			  const char *buf)
{ … }

/**
 * update_exclusive_cpumask - update the exclusive_cpus mask of a cpuset
 * @cs: the cpuset to consider
 * @trialcs: trial cpuset
 * @buf: buffer of cpu numbers written to this cpuset
 *
 * The tasks' cpumask will be updated if cs is a valid partition root.
 */
static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
				    const char *buf)
{ … }

/*
 * Migrate memory region from one set of nodes to another. This is
 * performed asynchronously as it can be called from process migration path
 * holding locks involved in process management. All mm migrations are
 * performed in the queued order and can be waited for by flushing
 * cpuset_migrate_mm_wq.
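 *
 * In other words, a caller that needs all queued migrations to have
 * completed can simply do
 *
 *	flush_workqueue(cpuset_migrate_mm_wq);
 *
 * which is essentially what cpuset_post_attach() below does.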
 */
struct cpuset_migrate_mm_work { … };

static void cpuset_migrate_mm_workfn(struct work_struct *work)
{ … }

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
			      const nodemask_t *to)
{ … }

static void cpuset_post_attach(void)
{ … }

/*
 * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy
 * @tsk: the task to change
 * @newmems: new nodes that the task will be set
 *
 * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
 * and rebind the task's mempolicy, if any. If the task is allocating in
 * parallel, it might temporarily see an empty intersection, which results in
 * a seqlock check and retry before OOM or allocation failure.
 */
static void cpuset_change_task_nodemask(struct task_struct *tsk,
					nodemask_t *newmems)
{ … }

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 *
 * Iterate through each task of @cs updating its mems_allowed to the
 * effective cpuset's. As this function is called with cpuset_mutex held,
 * cpuset membership stays stable.
 */
static void update_tasks_nodemask(struct cpuset *cs)
{ … }

/*
 * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
 * @cs: the cpuset to consider
 * @new_mems: a temp variable for calculating new effective_mems
 *
 * When the configured nodemask is changed, the effective nodemasks of this
 * cpuset and all its descendants need to be updated.
 *
 * On legacy hierarchy, effective_mems will be the same as mems_allowed.
 *
 * Called with cpuset_mutex held
 */
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{ … }

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset. Needs to validate the request, update the
 * cpuset's mems_allowed, and for each task in the cpuset,
 * update mems_allowed and rebind the task's mempolicy and any vma
 * mempolicies, and if the cpuset is marked 'memory_migrate',
 * migrate the task's pages to the new memory.
 *
 * Call with cpuset_mutex held. May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_lock, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
			   const char *buf)
{ … }

bool current_cpuset_is_being_rebound(void)
{ … }

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{ … }

/**
 * update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags needs to be changed
 *
 * Iterate through each task of @cs updating its spread flags. As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
static void update_tasks_flags(struct cpuset *cs)
{ … }

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cpuset_mutex held.
 */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{ … }

/**
 * update_prstate - update partition_root_state
 * @cs: the cpuset to update
 * @new_prs: new partition root state
 * Return: 0 if successful, != 0 if error
 *
 * Call with cpuset_mutex held.
 */
static int update_prstate(struct cpuset *cs, int new_prs)
{ … }
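
/*
 * For reference (an illustrative summary, not derived from the elided
 * bodies): on the v2 hierarchy, writes of "member", "root" and "isolated"
 * to cpuset.cpus.partition are parsed by sched_partition_write() below and
 * reach update_prstate() as PRS_MEMBER, PRS_ROOT and PRS_ISOLATED
 * respectively.
 */
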
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter. There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR). The time unit
 * is 1 second. Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds (0.933^10 is roughly 0.5), meaning
 * that if the events quit happening, then the rate returned from
 * fmeter_getrate() will be cut in half every 10 seconds, until it
 * converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter. If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32-bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds. At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000. At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event. At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF		…
#define FM_MAXTICKS	…
#define FM_MAXCNT	…
#define FM_SCALE	…

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{ … }

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{ … }

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{ … }

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{ … }

static struct cpuset *cpuset_attach_old_cs;

/*
 * Check to see if a cpuset can accept a new task.
 * For v1, cpus_allowed and mems_allowed can't be empty.
 * For v2, effective_cpus can't be empty.
 * Note that in v1, effective_cpus = cpus_allowed.
 */
static int cpuset_can_attach_check(struct cpuset *cs)
{ … }

static void reset_migrate_dl_data(struct cpuset *cs)
{ … }

/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{ … }

static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{ … }

/*
 * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
 * but we can't allocate it dynamically there. Define it globally and
 * allocate it in cpuset_init().
 */
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_to;

static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{ … }

static void cpuset_attach(struct cgroup_taskset *tset)
{ … }

/* The various types of files and directories in a cpuset file system */
typedef enum { … } cpuset_filetype_t;

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
			    u64 val)
{ … }

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
			    s64 val)
{ … }

/*
 * Common handling for a write to a "cpus" or "mems" file.
 */
static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
				    char *buf, size_t nbytes, loff_t off)
{ … }

/*
 * These ASCII lists should be read in a single call, by using a user
 * buffer large enough to hold the entire map. If read in smaller
 * chunks, there is no guarantee of atomicity. Since the display format
 * used, list of ranges of sequential numbers, is variable length,
 * and since these maps can change value dynamically, one could read
 * gibberish by doing partial reads while a list was changing.
 */
static int cpuset_common_seq_show(struct seq_file *sf, void *v)
{ … }

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{ … }

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{ … }

static int sched_partition_show(struct seq_file *seq, void *v)
{ … }

static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
				     size_t nbytes, loff_t off)
{ … }

/*
 * for the common functions, 'private' gives the type of file
 */
static struct cftype legacy_files[] = …;

/*
 * This is currently a minimal set for the default hierarchy. It can be
 * expanded later on by migrating more features and control files from v1.
 */
static struct cftype dfl_files[] = …;

/**
 * cpuset_css_alloc - Allocate a cpuset css
 * @parent_css: Parent css of the control group that the new cpuset will be
 *              part of
 * Return: cpuset css on success, -ENOMEM on failure.
 *
 * Allocate and initialize a new cpuset css, for non-NULL @parent_css, return
 * top cpuset css otherwise.
 */
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{ … }

static int cpuset_css_online(struct cgroup_subsys_state *css)
{ … }

/*
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
 * will call rebuild_sched_domains_locked(). That is not needed
 * in the default hierarchy where only changes in partition
 * will cause repartitioning.
 *
 * If the cpuset has the 'sched.partition' flag enabled, simulate
 * turning 'sched.partition' off.
 */
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{ … }

static void cpuset_css_free(struct cgroup_subsys_state *css)
{ … }

static void cpuset_bind(struct cgroup_subsys_state *root_css)
{ … }

/*
 * In case the child is cloned into a cpuset different from its parent,
 * additional checks are done to see if the move is allowed.
 */
static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
{ … }

static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
{ … }

/*
 * Make sure the new task conforms to the current state of its parent,
 * which could have been changed by cpuset just after it inherits the
 * state from the parent and before it sits on the cgroup's task list.
 */
static void cpuset_fork(struct task_struct *task)
{ … }

struct cgroup_subsys cpuset_cgrp_subsys = …;

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset
 **/
int __init cpuset_init(void)
{ … }

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets. If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{ … }

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{ … }

static void
hotplug_update_tasks_legacy(struct cpuset *cs,
			    struct cpumask *new_cpus, nodemask_t *new_mems,
			    bool cpus_updated, bool mems_updated)
{ … }

static void
hotplug_update_tasks(struct cpuset *cs,
		     struct cpumask *new_cpus, nodemask_t *new_mems,
		     bool cpus_updated, bool mems_updated)
{ … }

static bool force_rebuild;

void cpuset_force_rebuild(void)
{ … }

/**
 * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
 * @cs: cpuset in interest
 * @tmp: the tmpmasks structure pointer
 *
 * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
 * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
 * all its tasks are moved to the nearest ancestor with both resources.
 */
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{ … }

/**
 * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly. The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no effect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining. If any CPUs or memory
 * nodes have been taken down, cpuset_hotplug_update_tasks() is invoked on
 * all descendants.
 *
 * Note that CPU offlining during suspend is ignored. We don't modify
 * cpusets across suspend/resume cycles at all.
 *
 * CPU / memory hotplug is handled synchronously.
 */
static void cpuset_handle_hotplug(void)
{ … }

void cpuset_update_active_cpus(void)
{ … }

/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
 * Call this routine anytime after node_states[N_MEMORY] changes.
 * See cpuset_update_active_cpus() for CPU hotplug handling.
 */
static int cpuset_track_online_nodes(struct notifier_block *self,
				     unsigned long action, void *arg)
{ … }

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 */
void __init cpuset_init_smp(void)
{ … }

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to struct cpumask variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_var_t cpus_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of cpu_online_mask, even if this means going outside the
 * task's cpuset, except when the task is in the top cpuset.
 **/
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{ … }

/**
 * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe.
 * @tsk: pointer to task_struct with which the scheduler is struggling
 *
 * Description: In the case that the scheduler cannot find an allowed cpu in
 * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy
 * mode however, this value is the same as task_cs(tsk)->effective_cpus,
 * which will not contain a sane cpumask during cases such as cpu hotplugging.
 * This is the absolute last resort for the scheduler and it is only used if
 * _every_ other avenue has been traveled.
 *
 * Returns true if the affinity of @tsk was changed, false otherwise.
 **/
bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{ … }

void __init cpuset_init_current_mems_allowed(void)
{ … }

/**
 * cpuset_mems_allowed - return mems_allowed mask from a task's cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk. Guaranteed to return some non-empty
 * subset of node_states[N_MEMORY], even if this means going outside the
 * task's cpuset.
 **/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{ … }

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{ … }

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset. Call holding
 * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{ … }

/*
 * cpuset_node_allowed - Can we allocate on a memory node?
 * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate. If @node is set in
 * current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
 * node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
 * yes. If current has access to memory reserves as an oom victim, yes.
 * Otherwise, no.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current task's cpuset
 * unless the task has been OOM killed.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_lock. The
 * __alloc_pages() routine only calls here with the __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current task's mems_allowed came up empty on the first pass over
 * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.
 * That logic and the checks below have the combined effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	tsk_is_oom_victim   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current task's mems_allowed ok.
 */
bool cpuset_node_allowed(int node, gfp_t gfp_mask)
{ … }

/**
 * cpuset_spread_node() - On which node to begin search for a page
 * @rotor: round robin rotor
 *
 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
 * tasks in a cpuset with is_spread_page or is_spread_slab set),
 * and if the memory allocation used cpuset_mem_spread_node()
 * to determine on which node to start looking, as it will for
 * certain page cache or slab cache pages such as used for file
 * system buffers and inode caches, then instead of starting on the
 * local node to look for a free page, rather spread the starting
 * node around the task's mems_allowed nodes.
 *
 * We don't have to worry about the returned node being offline
 * because "it can't happen", and even if it did, it would be ok.
 *
 * The routines calling guarantee_online_mems() are careful to
 * only set nodes in task->mems_allowed that are online. So it
 * should not be possible for the following code to return an
 * offline node. But if it did, that would be ok, as this routine
 * is not returning the node where the allocation must be, only
 * the node where the search should start. The zonelist passed to
 * __alloc_pages() will include all nodes. If the slab allocator
 * is passed an offline node, it will fall back to the local node.
 * See kmem_cache_alloc_node().
 */
static int cpuset_spread_node(int *rotor)
{ … }

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 */
int cpuset_mem_spread_node(void)
{ … }

/**
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
 */
int cpuset_slab_spread_node(void)
{ … }
EXPORT_SYMBOL_GPL(…);

/**
 * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's?
 * @tsk1: pointer to task_struct of some task.
 * @tsk2: pointer to task_struct of some other task.
 *
 * Description: Return true if @tsk1's mems_allowed intersects the
 * mems_allowed of @tsk2. Used by the OOM killer to determine if
 * one of the task's memory usage might impact the memory available
 * to the other.
 **/
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
				   const struct task_struct *tsk2)
{ … }

/**
 * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
 *
 * Description: Prints current's name, cpuset name, and cached copy of its
 * mems_allowed to the kernel log.
 */
void cpuset_print_current_mems_allowed(void)
{ … }

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */
int cpuset_memory_pressure_enabled __read_mostly;

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".
 * The value displayed is an integer representing the recent rate of
 * entry into the synchronous (direct) page reclaim by any task
 * attached to the cpuset.
 */
void __cpuset_memory_pressure_bump(void)
{ … }

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *    anyway.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{ … }
#endif /* CONFIG_PROC_PID_CPUSET */

/* Display task mems_allowed in /proc/<pid>/status file. */
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{ … }
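
/*
 * Example of the /proc/<pid>/status lines produced by the helper above
 * (illustrative values only):
 *
 *	Mems_allowed:	00000000,00000003
 *	Mems_allowed_list:	0-1
 */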