linux/kernel/sched/rt.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;
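/*
 * Worked out, assuming MAX_BW is ((1ULL << (64 - BW_SHIFT)) - 1) nanoseconds
 * as defined in the scheduler headers: with BW_SHIFT == 20 that is
 * 2^44 - 1 ns ~= 1.76e13 ns ~= 17592 s ~= 4.9 hours.
 */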

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

struct rt_bandwidth def_rt_bandwidth;

/*
 * Period over which we measure RT task CPU usage, in microseconds.
 * default: 1s
 */
int sysctl_sched_rt_period = 1000000;

/*
 * Part of the period during which we allow RT tasks to run, in microseconds.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;
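/*
 * For reference, a sketch of how these knobs are consumed. The conversion to
 * nanoseconds is done by global_rt_period()/global_rt_runtime() elsewhere in
 * the scheduler code; roughly (an approximation, not the authoritative
 * definitions):
 *
 *	static inline u64 global_rt_period(void)
 *	{
 *		return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
 *	}
 *
 *	static inline u64 global_rt_runtime(void)
 *	{
 *		if (sysctl_sched_rt_runtime < 0)
 *			return RUNTIME_INF;
 *
 *		return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 *	}
 *
 * With the defaults above, RT tasks may consume at most 950000/1000000 = 95%
 * of each period; writing -1 to /proc/sys/kernel/sched_rt_runtime_us disables
 * the throttling entirely.
 */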

#ifdef CONFIG_SYSCTL
static int sysctl_sched_rr_timeslice =;
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);
static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);
static struct ctl_table sched_rt_sysctls[] =;

static int __init sched_rt_sysctl_init(void)
{}
late_initcall(sched_rt_sysctl_init);
#endif

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{}

static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{}

void init_rt_rq(struct rt_rq *rt_rq)
{}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{}

#define rt_entity_is_task(rt_se)	(!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{}

void unregister_rt_sched_group(struct task_group *tg)
{}

void free_rt_sched_group(struct task_group *tg)
{}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se)	(1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void unregister_rt_sched_group(struct task_group *tg) { }

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{}

static inline int rt_overloaded(struct rq *rq)
{}

static inline void rt_set_overload(struct rq *rq)
{}

static inline void rt_clear_overload(struct rq *rq)
{}

static inline int has_pushable_tasks(struct rq *rq)
{}

static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);

static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);

static inline void rt_queue_push_tasks(struct rq *rq)
{}

static inline void rt_queue_pull_task(struct rq *rq)
{}

static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 * settings.
 *
 * This check is only important for heterogeneous systems where the uclamp_min
 * value is higher than the capacity of a @cpu. For non-heterogeneous systems
 * this function will always return true.
 *
 * The function will return true if the capacity of the @cpu is >= the
 * uclamp_min and false otherwise.
 *
 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 * > uclamp_max.
 */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{}
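/*
 * A minimal sketch of the check described above (the in-tree body is elided
 * here), assuming uclamp_eff_value() and arch_scale_cpu_capacity():
 *
 *	unsigned int min_cap = uclamp_eff_value(p, UCLAMP_MIN);
 *	unsigned int max_cap = uclamp_eff_value(p, UCLAMP_MAX);
 *	unsigned int cpu_cap = arch_scale_cpu_capacity(cpu);
 *
 *	return cpu_cap >= min(min_cap, max_cap);
 *
 * i.e. uclamp_min is clamped to uclamp_max before being compared against the
 * CPU capacity, as the comment states. On systems without asymmetric CPU
 * capacities the check can return true immediately.
 */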
#else
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	return true;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{}

#define for_each_rt_rq(rt_rq, iter, rq)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{}

#else /* !CONFIG_RT_GROUP_SCHED */

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(def_rt_bandwidth.rt_period);
}

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running)
		return;

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &def_rt_bandwidth;
}

#endif /* CONFIG_RT_GROUP_SCHED */

bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{}

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{}
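/*
 * A hedged sketch of the borrowing rule (locking and the RUNTIME_INF special
 * cases are omitted here): every other rt_rq in the root domain that has
 * spare budget gives up at most an equal share of that spare, until this
 * rt_rq's runtime reaches the full period:
 *
 *	weight = cpumask_weight(rd->span);
 *	for_each_cpu(i, rd->span) {
 *		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 *		s64 diff = iter->rt_runtime - iter->rt_time;
 *
 *		if (iter == rt_rq || diff <= 0)
 *			continue;
 *
 *		diff = div_u64((u64)diff, weight);
 *		if (rt_rq->rt_runtime + diff > rt_period)
 *			diff = rt_period - rt_rq->rt_runtime;
 *		iter->rt_runtime -= diff;
 *		rt_rq->rt_runtime += diff;
 *	}
 */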

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{}

static void __enable_runtime(struct rq *rq)
{}

static void balance_runtime(struct rt_rq *rt_rq)
{}
#else /* !CONFIG_SMP */
static inline void balance_runtime(struct rt_rq *rt_rq) {}
#endif /* CONFIG_SMP */

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{}

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{}

static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{}
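/*
 * In outline (the body is elided above): charge the wall-clock time the
 * current RT task has run since it was last accounted to its
 * sum_exec_runtime, and, if RT bandwidth enforcement is enabled
 * (sched_rt_runtime() != RUNTIME_INF), also add it to rt_rq->rt_time for
 * every rt_rq in the entity's hierarchy; when sched_rt_runtime_exceeded()
 * then throttles an rt_rq, resched_curr(rq) is called so that a lower
 * scheduling class can take over.
 */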

static void
dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{}

static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	start_rt_bandwidth(&def_rt_bandwidth);
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{}

static inline
unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{}

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{}

/*
 * Change rt_se->run_list location unless SAVE && !MOVE
 *
 * assumes ENQUEUE/DEQUEUE flags match
 */
static inline bool move_entity(unsigned int flags)
{}
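/*
 * A minimal sketch of the rule stated above, assuming the DEQUEUE_SAVE and
 * DEQUEUE_MOVE flag bits (which match their ENQUEUE_* counterparts):
 *
 *	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
 *		return false;
 *
 *	return true;
 *
 * i.e. only the SAVE-without-MOVE combination leaves rt_se->run_list alone.
 */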

static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{}

static inline struct sched_statistics *
__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
{}

static inline void
update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{}

static inline void
update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{}

static inline void
update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{}

static inline void
update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{}

static inline void
update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{}
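/*
 * One way to obtain the top-down order the comment asks for, sketched here
 * under the assumption that sched_rt_entity provides a ->back pointer
 * (the dequeue_top_rt_rq() bookkeeping is omitted): walk bottom-up recording
 * a back-chain, then walk that chain in reverse and dequeue whatever is
 * still on an rt_rq:
 *
 *	struct sched_rt_entity *back = NULL;
 *
 *	for_each_sched_rt_entity(rt_se) {
 *		rt_se->back = back;
 *		back = rt_se;
 *	}
 *
 *	for (rt_se = back; rt_se; rt_se = rt_se->back) {
 *		if (on_rt_rq(rt_se))
 *			__dequeue_rt_entity(rt_se, flags);
 *	}
 */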

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{}

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{}

/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{}
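/*
 * Sketch of the cheap requeue described above, assuming the entity is
 * already queued on rt_rq (the in-tree body is elided):
 *
 *	struct rt_prio_array *array = &rt_rq->active;
 *	struct list_head *queue = array->queue + rt_se_prio(rt_se);
 *
 *	if (head)
 *		list_move(&rt_se->run_list, queue);
 *	else
 *		list_move_tail(&rt_se->run_list, queue);
 *
 * list_move()/list_move_tail() relink the entry in place, so none of the
 * enqueue/dequeue accounting needs to be redone.
 */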

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{}

static void yield_task_rt(struct rq *rq)
{}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);

static int
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{}

static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{}
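/*
 * The core of the decision, sketched (SMP refinements elided): a strictly
 * higher-priority waker preempts immediately, remembering that a lower
 * ->prio value means higher priority:
 *
 *	if (p->prio < rq->curr->prio) {
 *		resched_curr(rq);
 *		return;
 *	}
 *
 * On SMP, an equal-priority waker may additionally go through
 * check_preempt_equal_prio() so that one of the two tasks can be pushed to
 * another CPU instead of waiting.
 */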

static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{}

static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{}

static struct task_struct *pick_task_rt(struct rq *rq)
{}

static struct task_struct *pick_next_task_rt(struct rq *rq)
{}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES	3

static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
{}

/*
 * Return the highest-priority task on the rq's pushable list that is
 * suitable to run on @cpu, or NULL if there is none.
 */
static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{}
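/*
 * A sketch of the scan (body elided above): the pushable list is a plist
 * ordered by priority, so the first entry that passes pick_rt_task() is the
 * highest-priority candidate for @cpu:
 *
 *	struct task_struct *p;
 *
 *	if (!has_pushable_tasks(rq))
 *		return NULL;
 *
 *	plist_for_each_entry(p, &rq->rt.pushable_tasks, pushable_tasks) {
 *		if (pick_rt_task(rq, p, cpu))
 *			return p;
 *	}
 *
 *	return NULL;
 */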

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

static int find_lowest_rq(struct task_struct *task)
{}

/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{}

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{}

/*
 * If the current CPU has more than one RT task, see if the non-running
 * task can migrate over to a CPU that is running a task of lesser priority.
 */
static int push_rt_task(struct rq *rq, bool pull)
{}

static void push_rt_tasks(struct rq *rq)
{}

#ifdef HAVE_RT_PUSH_IPI

/*
 * When a high priority task schedules out from a CPU and a lower priority
 * task is scheduled in, a check is made to see if there are any RT tasks
 * on other CPUs that are waiting to run because a higher priority RT task
 * is currently running on its CPU. In this case, the CPU with multiple RT
 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
 * up that may be able to run one of its non-running queued RT tasks.
 *
 * All CPUs with overloaded RT tasks need to be notified as there is currently
 * no way to know which of these CPUs have the highest priority task waiting
 * to run. Instead of trying to take a spinlock on each of these CPUs, which
 * has been shown to cause large latencies on machines with many CPUs, an IPI
 * is sent to the CPUs to have them push off the overloaded RT tasks that are
 * waiting to run.
 *
 * Just sending an IPI to each of the CPUs is also an issue, as on machines
 * with a large CPU count this can cause an IPI storm on a CPU, especially
 * if it's the only CPU with multiple RT tasks queued and a large number
 * of CPUs are scheduling a lower priority task at the same time.
 *
 * Each root domain has its own IRQ work function that can iterate over
 * all CPUs with RT overloaded tasks. Since every CPU with overloaded RT
 * tasks must be checked whenever one or more CPUs lower their priority,
 * a single IRQ work iterator walks them and tries to push off the RT tasks
 * that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
 * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
 * This prevents high contention on the lock as the process handles all
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
 * rt_loop_next variable. This will make sure that the IRQ work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan. Incrementing
 * the rt_loop_next will cause the iterator to perform another scan.
 *
 */
static int rto_next_cpu(struct root_domain *rd)
{}

static inline bool rto_start_trylock(atomic_t *v)
{}

static inline void rto_start_unlock(atomic_t *v)
{}
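/*
 * The "increment and check whether we were first" scheme described in the
 * comment block above maps naturally onto atomics; a sketch (not necessarily
 * the exact in-tree bodies):
 *
 *	static inline bool rto_start_trylock(atomic_t *v)
 *	{
 *		return !atomic_fetch_inc(v);
 *	}
 *
 *	static inline void rto_start_unlock(atomic_t *v)
 *	{
 *		atomic_set_release(v, 0);
 *	}
 *
 * Only the CPU that takes the counter from 0 to 1 wins and goes on to take
 * rto_lock; all other CPUs just leave the counter raised.
 */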

static void tell_cpu_to_push(struct rq *rq)
{}

/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{}
#endif /* HAVE_RT_PUSH_IPI */

static void pull_rt_task(struct rq *this_rq)
{}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{}

/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{}

/*
 * When switching away from the RT queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{}

void __init init_sched_rt_class(void)
{}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{}

/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{}

#ifdef CONFIG_POSIX_TIMERS
static void watchdog(struct rq *rq, struct task_struct *p)
{}
#else
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif

/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along with full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{}

#ifdef CONFIG_SCHED_CORE
static int task_is_throttled_rt(struct task_struct *p, int cpu)
{}
#endif

DEFINE_SCHED_CLASS(rt) =;

#ifdef CONFIG_RT_GROUP_SCHED
/*
 * Ensure that the real-time constraints are schedulable.
 */
static DEFINE_MUTEX(rt_constraints_mutex);

static inline int tg_has_rt_tasks(struct task_group *tg)
{}

struct rt_schedulable_data {};

static int tg_rt_schedulable(struct task_group *tg, void *data)
{}

static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
{}

static int tg_set_rt_bandwidth(struct task_group *tg,
		u64 rt_period, u64 rt_runtime)
{}

int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{}

long sched_group_rt_runtime(struct task_group *tg)
{}

int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{}

long sched_group_rt_period(struct task_group *tg)
{}

#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{}
#endif /* CONFIG_SYSCTL */

int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{}

#else /* !CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SYSCTL
static int sched_rt_global_constraints(void)
{
	unsigned long flags;
	int i;

	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
	for_each_possible_cpu(i) {
		struct rt_rq *rt_rq = &cpu_rq(i)->rt;

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = global_rt_runtime();
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
	}
	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);

	return 0;
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SYSCTL
static int sched_rt_global_validate(void)
{}

static void sched_rt_do_global(void)
{}

static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{}

static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos)
{}
#endif /* CONFIG_SYSCTL */

#ifdef CONFIG_SCHED_DEBUG
void print_rt_stats(struct seq_file *m, int cpu)
{}
#endif /* CONFIG_SCHED_DEBUG */