// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/syscalls.c
 *
 *  Core kernel scheduler syscalls related code
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *  Copyright (C) 1998-2024  Ingo Molnar, Red Hat
 */
#include <linux/sched.h>
#include <linux/cpuset.h>
#include <linux/sched/debug.h>

#include <uapi/linux/sched/types.h>

#include "sched.h"
#include "autogroup.h"

static inline int __normal_prio(int policy, int rt_prio, int nice)
{
	…
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	…
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	…
}

void set_user_nice(struct task_struct *p, long nice)
{
	…
}
EXPORT_SYMBOL(…);

/*
 * is_nice_reduction - check if a nice value is an actual reduction
 * @p: task
 * @nice: nice value
 *
 * Similar to can_nice() but does not perform a capability check.
 */
static bool is_nice_reduction(const struct task_struct *p, const int nice)
{
	…
}

/*
 * can_nice - check if a task can reduce its nice value
 * @p: task
 * @nice: nice value
 */
int can_nice(const struct task_struct *p, const int nice)
{
	…
}

#ifdef __ARCH_WANT_SYS_NICE

/*
 * sys_nice - change the priority of the current process.
 * @increment: priority increment
 *
 * sys_setpriority is a more generic, but much slower function that
 * does similar things.
 */
SYSCALL_DEFINE1(nice, int, increment)
{
	…
}

#endif

/**
 * task_prio - return the priority value of a given task.
 * @p: the task in question.
 *
 * Return: The priority value as seen by users in /proc.
 *
 * sched policy         return value   kernel prio    user prio/nice
 *
 * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
 * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
 * deadline                     -101             -1           0
 */
int task_prio(const struct task_struct *p)
{
	…
}

/**
 * idle_cpu - is a given CPU idle currently?
 * @cpu: the processor in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
int idle_cpu(int cpu)
{
	…
}

/**
 * available_idle_cpu - is a given CPU idle for enqueuing work.
 * @cpu: the CPU in question.
 *
 * Return: 1 if the CPU is currently idle. 0 otherwise.
 */
int available_idle_cpu(int cpu)
{
	…
}

/**
 * idle_task - return the idle task for a given CPU.
 * @cpu: the processor in question.
 *
 * Return: The idle task for the CPU @cpu.
 */
struct task_struct *idle_task(int cpu)
{
	…
}

#ifdef CONFIG_SCHED_CORE
int sched_core_idle_cpu(int cpu)
{
	…
}
#endif

#ifdef CONFIG_SMP
/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * Where the cfs, rt and dl util numbers are tracked with the same metric and
 * synchronized windows and are thus directly comparable.
 *
 * The cfs, rt and dl utilization are the running times measured with
 * rq->clock_task, which excludes things like IRQ and steal-time. These
 * latter are then accrued in the IRQ utilization.
 *
 * The DL bandwidth number OTOH is not a measured metric but a value computed
 * based on the task model parameters and gives the minimal utilization
 * required to meet deadlines.
 */
unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
				 unsigned long *min,
				 unsigned long *max)
{
	…
}

unsigned long sched_cpu_util(int cpu)
{
	…
}
#endif /* CONFIG_SMP */
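
/*
 * Illustrative sketch (not part of this file): one way a frequency-selection
 * policy could apply the linear relation f = u * f_max described above.
 * example_next_freq() is invented for the example; the ~25% headroom factor
 * mirrors what the schedutil cpufreq governor applies so a CPU ramps up
 * before it saturates. Utilization is scaled against SCHED_CAPACITY_SCALE
 * (1024), and overflow handling is ignored for brevity.
 *
 *	static unsigned long example_next_freq(unsigned long util,
 *					       unsigned long max_freq)
 *	{
 *		// f = 1.25 * f_max * (u / u_max)
 *		return (max_freq + (max_freq >> 2)) * util /
 *			SCHED_CAPACITY_SCALE;
 *	}
 */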

/**
 * find_process_by_pid - find a process with a matching PID value.
 * @pid: the pid in question.
 *
 * Return: The task of @pid, if found. %NULL otherwise.
 */
static struct task_struct *find_process_by_pid(pid_t pid)
{
	…
}

static struct task_struct *find_get_task(pid_t pid)
{
	…
}

DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
	     find_get_task(pid), pid_t pid)

/*
 * sched_setparam() passes in -1 for its policy, to let the functions
 * it calls know not to change it.
 */
#define SETPARAM_POLICY	…

static void __setscheduler_params(struct task_struct *p,
				  const struct sched_attr *attr)
{
	…
}

/*
 * Check that the target process has a UID that matches the current process's:
 */
static bool check_same_owner(struct task_struct *p)
{
	…
}

#ifdef CONFIG_UCLAMP_TASK

static int uclamp_validate(struct task_struct *p,
			   const struct sched_attr *attr)
{
	…
}

static bool uclamp_reset(const struct sched_attr *attr,
			 enum uclamp_id clamp_id,
			 struct uclamp_se *uc_se)
{
	…
}

static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr)
{
	…
}

#else /* !CONFIG_UCLAMP_TASK: */

static inline int uclamp_validate(struct task_struct *p,
				  const struct sched_attr *attr)
{
	return -EOPNOTSUPP;
}
static void __setscheduler_uclamp(struct task_struct *p,
				  const struct sched_attr *attr) { }

#endif /* !CONFIG_UCLAMP_TASK */

/*
 * Allow unprivileged RT tasks to decrease priority.
 * Only issue a capable() test if needed and only once to avoid an audit
 * event on permitted non-privileged operations:
 */
static int user_check_sched_setscheduler(struct task_struct *p,
					 const struct sched_attr *attr,
					 int policy, int reset_on_fork)
{
	…
}

int __sched_setscheduler(struct task_struct *p,
			 const struct sched_attr *attr,
			 bool user, bool pi)
{
	…
}

static int _sched_setscheduler(struct task_struct *p, int policy,
			       const struct sched_param *param, bool check)
{
	…
}

/**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Use sched_set_fifo(), read its comment.
 *
 * Return: 0 on success. An error code otherwise.
 *
 * NOTE that the task may already be dead.
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       const struct sched_param *param)
{
	…
}

int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
	…
}

int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
	…
}
EXPORT_SYMBOL_GPL(…);

/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority
 *				of a thread from kernel-space.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler(), only don't bother checking if the
 * current context has permission. For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 *
 * Return: 0 on success. An error code otherwise.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       const struct sched_param *param)
{
	…
}
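
/*
 * Illustrative sketch (not part of this file): the kernel-space pattern the
 * comment above alludes to -- creating a helper kthread and raising it to
 * FIFO priority with sched_set_fifo() rather than hand-picking a priority.
 * The thread function and name below are invented for the example.
 *
 *	#include <linux/kthread.h>
 *	#include <linux/sched.h>
 *
 *	static int my_worker_fn(void *unused)
 *	{
 *		set_current_state(TASK_INTERRUPTIBLE);
 *		while (!kthread_should_stop()) {
 *			schedule();
 *			set_current_state(TASK_INTERRUPTIBLE);
 *		}
 *		__set_current_state(TASK_RUNNING);
 *		return 0;
 *	}
 *
 *	struct task_struct *t = kthread_run(my_worker_fn, NULL, "my-worker");
 *
 *	if (!IS_ERR(t))
 *		sched_set_fifo(t);	// kernel picks MAX_RT_PRIO / 2
 */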

/*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
 * incapable of resource management, which is the one thing an OS really should
 * be doing.
 *
 * This is of course the reason it is limited to privileged users only.
 *
 * Worse still; it is fundamentally impossible to compose static priority
 * workloads. You cannot take two correctly working static prio workloads
 * and smash them together and still expect them to work.
 *
 * For this reason 'all' FIFO tasks the kernel creates are basically at:
 *
 *   MAX_RT_PRIO / 2
 *
 * The administrator _MUST_ configure the system, the kernel simply doesn't
 * know enough information to make a sensible choice.
 */
void sched_set_fifo(struct task_struct *p)
{
	…
}
EXPORT_SYMBOL_GPL(…);

/*
 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
 */
void sched_set_fifo_low(struct task_struct *p)
{
	…
}
EXPORT_SYMBOL_GPL(…);

void sched_set_normal(struct task_struct *p, int nice)
{
	…
}
EXPORT_SYMBOL_GPL(…);

static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	…
}

/*
 * Mimics kernel/events/core.c perf_copy_attr().
 */
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	…
}

static void get_params(struct task_struct *p, struct sched_attr *attr)
{
	…
}

/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
		struct sched_param __user *, param)
{
	…
}

/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	…
}

/**
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension.
 */
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
			       unsigned int, flags)
{
	…
}

/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative error
 * code.
 */
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
	…
}

/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
 * code.
 */
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
	…
}

/*
 * Copy the kernel-size attribute structure (which might be larger
 * than what user-space knows about) to user-space.
 *
 * Note that all cases are valid: the user-space buffer can be larger or
 * smaller than the kernel-space buffer. The usual case is that both
 * have the same size.
 */
static int
sched_attr_copy_to_user(struct sched_attr __user *uattr,
			struct sched_attr *kattr,
			unsigned int usize)
{
	…
}

/**
 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @usize: sizeof(attr) for fwd/bwd comp.
 * @flags: for future extension.
 */
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
		unsigned int, usize, unsigned int, flags)
{
	…
}
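
/*
 * Example (illustrative only, not part of this file): a minimal user-space
 * sketch of the classic sched_setscheduler(2) interface documented above,
 * switching the calling thread to SCHED_FIFO. The priority value 50 is an
 * arbitrary choice for the example, and the call normally requires
 * CAP_SYS_NICE (or a suitable RLIMIT_RTPRIO); see the sched_set_fifo()
 * comment above on why hand-picked priorities compose poorly.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct sched_param sp = { .sched_priority = 50 };
 *
 *	if (sched_setscheduler(0, SCHED_FIFO, &sp))	// 0 == calling thread
 *		perror("sched_setscheduler");
 */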
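
/*
 * Example (illustrative only, not part of this file): a user-space sketch of
 * the extended sched_setattr(2) interface above, requesting SCHED_DEADLINE
 * with a 10ms runtime in a 100ms period. glibc historically provides no
 * wrapper, so the sketch goes through syscall(2); the struct layout mirrors
 * the original fields of <uapi/linux/sched/types.h> included at the top of
 * this file, and the size field keeps it forward/backward compatible.
 *
 *	#define _GNU_SOURCE
 *	#include <stdint.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/sched.h>	// SCHED_DEADLINE
 *
 *	struct sched_attr {
 *		uint32_t size;
 *		uint32_t sched_policy;
 *		uint64_t sched_flags;
 *		int32_t  sched_nice;
 *		uint32_t sched_priority;
 *		uint64_t sched_runtime;		// all three in nanoseconds
 *		uint64_t sched_deadline;
 *		uint64_t sched_period;
 *	};
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	=  10 * 1000 * 1000,
 *		.sched_deadline	= 100 * 1000 * 1000,
 *		.sched_period	= 100 * 1000 * 1000,
 *	};
 *
 *	if (syscall(SYS_sched_setattr, 0, &attr, 0))	// 0 == calling thread
 *		perror("sched_setattr");
 */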

#ifdef CONFIG_SMP
int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
	…
}
#endif /* CONFIG_SMP */

int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
{
	…
}

long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
{
	…
}

static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
			     struct cpumask *new_mask)
{
	…
}

/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	…
}

long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	…
}

/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	…
}

static void do_sched_yield(void)
{
	…
}

/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */
SYSCALL_DEFINE0(…)
{
	…
}

/**
 * yield - yield the current processor to other threads.
 *
 * Do not ever use this function, there's a 99% chance you're doing it wrong.
 *
 * The scheduler is at all times free to pick the calling task as the most
 * eligible task to run; if removing the yield() call from your code breaks
 * it, it's already broken.
 *
 * Typical broken usage is:
 *
 *	while (!event)
 *		yield();
 *
 * where one assumes that yield() will let 'the other' process run and make
 * event true. If the current task is a SCHED_FIFO task that will never
 * happen. Never use yield() as a progress guarantee!
 *
 * If you want to use yield() to wait for something, use wait_event().
 * If you want to use yield() to be 'nice' for others, use cond_resched().
 * If you still want to use yield(), do not!
 */
void __sched yield(void)
{
	…
}
EXPORT_SYMBOL(…);

/**
 * yield_to - yield the current processor to another thread in
 * your thread group, or accelerate that thread toward the
 * processor it's on.
 * @p: target task
 * @preempt: whether task preemption is allowed or not
 *
 * It's the caller's job to ensure that the target task struct
 * can't go away on us before we can do any checks.
 *
 * Return:
 *	true (>0) if we indeed boosted the target task.
 *	false (0) if we failed to boost the target.
 *	-ESRCH if there's no task to yield to.
 */
int __sched yield_to(struct task_struct *p, bool preempt)
{
	…
}
EXPORT_SYMBOL_GPL(…);

/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	…
}

/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	…
}
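
/*
 * Example (illustrative only, not part of this file): a minimal user-space
 * sketch of the affinity syscalls above via their glibc wrappers, pinning
 * the calling thread to CPU 0 and reading the mask back.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(0, &set);
 *	if (sched_setaffinity(0, sizeof(set), &set))	// 0 == calling thread
 *		perror("sched_setaffinity");
 *
 *	if (!sched_getaffinity(0, sizeof(set), &set))
 *		printf("pinned to CPU 0: %d\n", CPU_ISSET(0, &set));
 */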

static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
	…
}

/**
 * sys_sched_rr_get_interval - return the default time-slice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the time-slice value.
 *
 * This syscall writes the default time-slice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the time-slice is in @interval. Otherwise,
 * an error code.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct __kernel_timespec __user *, interval)
{
	…
}

#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	…
}
#endif
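
/*
 * Example (illustrative only, not part of this file): a user-space sketch
 * reading the SCHED_RR time-slice of the calling process through the glibc
 * wrapper for the syscall above. For a SCHED_FIFO task the kernel reports 0,
 * which the comment above defines as "infinity".
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct timespec ts;
 *
 *	if (!sched_rr_get_interval(0, &ts))	// 0 == calling process
 *		printf("RR slice: %ld.%09ld s\n",
 *		       (long)ts.tv_sec, ts.tv_nsec);
 */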