// SPDX-License-Identifier: GPL-2.0+ /* * Read-Copy Update mechanism for mutual exclusion (tree-based version) * * Copyright IBM Corporation, 2008 * * Authors: Dipankar Sarma <[email protected]> * Manfred Spraul <[email protected]> * Paul E. McKenney <[email protected]> * * Based on the original work by Paul McKenney <[email protected]> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * * For detailed explanation of Read-Copy Update mechanism see - * Documentation/RCU */ #define pr_fmt(fmt) … #include <linux/types.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/rcupdate_wait.h> #include <linux/interrupt.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/nmi.h> #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/export.h> #include <linux/completion.h> #include <linux/kmemleak.h> #include <linux/moduleparam.h> #include <linux/panic.h> #include <linux/panic_notifier.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/time.h> #include <linux/kernel_stat.h> #include <linux/wait.h> #include <linux/kthread.h> #include <uapi/linux/sched/types.h> #include <linux/prefetch.h> #include <linux/delay.h> #include <linux/random.h> #include <linux/trace_events.h> #include <linux/suspend.h> #include <linux/ftrace.h> #include <linux/tick.h> #include <linux/sysrq.h> #include <linux/kprobes.h> #include <linux/gfp.h> #include <linux/oom.h> #include <linux/smpboot.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/sched/isolation.h> #include <linux/sched/clock.h> #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/kasan.h> #include <linux/context_tracking.h> #include "../time/tick-internal.h" #include "tree.h" #include "rcu.h" #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif #define MODULE_PARAM_PREFIX … /* Data structures. 
*/ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = …; static struct rcu_state rcu_state = …; /* Dump rcu_node combining tree at boot to verify correct setup. */ static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ static bool use_softirq = !IS_ENABLED(…); #ifndef CONFIG_PREEMPT_RT module_param(use_softirq, bool, 0444); #endif /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); /* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */ static int rcu_fanout_leaf = …; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = …; /* Number of rcu_nodes at specified level. */ int num_rcu_lvl[] = …; int rcu_num_nodes __read_mostly = …; /* Total # rcu_nodes in use. */ /* * The rcu_scheduler_active variable is initialized to the value * RCU_SCHEDULER_INACTIVE and transitions to RCU_SCHEDULER_INIT just before * the first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE, * RCU can assume that there is but one task, allowing RCU to (for example) * optimize synchronize_rcu() to a simple barrier(). When this variable * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required * to detect real grace periods. This variable is also used to suppress * boot-time false positives from lockdep-RCU error checking. Finally, it * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU * is fully initialized, including all of its kthreads having been spawned. */ int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(…); /* * The rcu_scheduler_fully_active variable transitions from zero to one * during the early_initcall() processing, which is after the scheduler * is capable of creating new tasks.
So RCU processing (for example, * creating tasks for RCU priority boosting) must be delayed until after * rcu_scheduler_fully_active transitions from zero to one. We also * currently delay invocation of any RCU callbacks until after this point. * * It might later prove better for people registering RCU callbacks during * early boot to take responsibility for these callbacks, but one step at * a time. */ static int rcu_scheduler_fully_active __read_mostly; static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags); static struct task_struct *rcu_boost_task(struct rcu_node *rnp); static void invoke_rcu_core(void); static void rcu_report_exp_rdp(struct rcu_data *rdp); static void sync_sched_exp_online_cleanup(int cpu); static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp); static bool rcu_rdp_is_offloaded(struct rcu_data *rdp); static bool rcu_rdp_cpu_online(struct rcu_data *rdp); static bool rcu_init_invoked(void); static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); /* * rcuc/rcub/rcuop kthread realtime priority. The "rcuop" * real-time priority(enabling/disabling) is controlled by * the extra CONFIG_RCU_NOCB_CPU_CB_BOOST configuration. */ static int kthread_prio = …; module_param(kthread_prio, int, 0444); /* Delay in jiffies for grace-period initialization delays, debug only. */ static int gp_preinit_delay; module_param(gp_preinit_delay, int, 0444); static int gp_init_delay; module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); static int nohz_full_patience_delay; module_param(nohz_full_patience_delay, int, 0444); static int nohz_full_patience_delay_jiffies; // Add delay to rcu_read_unlock() for strict grace periods. 
static int rcu_unlock_delay; #ifdef CONFIG_RCU_STRICT_GRACE_PERIOD module_param(rcu_unlock_delay, int, 0444); #endif /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached * per-CPU. Object size is equal to one page. This value * can be changed at boot time. */ static int rcu_min_cached_objs = …; module_param(rcu_min_cached_objs, int, 0444); // A page shrinker can ask for pages to be freed to make them // available for other parts of the system. This usually happens // under low memory conditions, and in that case we should also // defer page-cache filling for a short time period. // // The default value is 5 seconds, which is long enough to reduce // interference with the shrinker while it asks other systems to // drain their caches. static int rcu_delay_page_cache_fill_msec = …; module_param(rcu_delay_page_cache_fill_msec, int, 0444); /* Retrieve RCU kthreads priority for rcutorture */ int rcu_get_gp_kthreads_prio(void) { … } EXPORT_SYMBOL_GPL(…); /* * Number of grace periods between delays, normalized by the duration of * the delay. The longer the delay, the more the grace periods between * each delay. The reason for this normalization is that it means that, * for non-zero delays, the overall slowdown of grace periods is constant * regardless of the duration of the delay. This arrangement balances * the need for long delays to increase some race probabilities with the * need for fast grace periods to increase other race probabilities. */ #define PER_RCU_NODE_PERIOD … /* * Return true if an RCU grace period is in progress. The READ_ONCE()s * permit this function to be invoked without holding the root rcu_node * structure's ->lock, but of course results can be subject to change. */ static int rcu_gp_in_progress(void) { … } /* * Return the number of callbacks queued on the specified CPU. * Handles both the nocbs and normal cases. 
*/ static long rcu_get_n_cbs_cpu(int cpu) { … } /** * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing * * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU. * This is a special-purpose function to be used in the softirq * infrastructure and perhaps the occasional long-running softirq * handler. * * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is * equivalent to momentarily completely enabling preemption. For * example, given this code:: * * local_bh_disable(); * do_something(); * rcu_softirq_qs(); // A * do_something_else(); * local_bh_enable(); // B * * A call to synchronize_rcu() that began concurrently with the * call to do_something() would be guaranteed to wait only until * execution reached statement A. Without that rcu_softirq_qs(), * that same synchronize_rcu() would instead be guaranteed to wait * until execution reached statement B. */ void rcu_softirq_qs(void) { … } /* * Reset the current CPU's RCU_WATCHING counter to indicate that the * newly onlined CPU is no longer in an extended quiescent state. * This will either leave the counter unchanged, or increment it * to the next non-quiescent value. * * The non-atomic test/increment sequence works because the upper bits * of the ->state variable are manipulated only by the corresponding CPU, * or when the corresponding CPU is offline. */ static void rcu_watching_online(void) { … } /* * Return true if the snapshot returned from ct_rcu_watching() * indicates that RCU is in an extended quiescent state. */ static bool rcu_watching_snap_in_eqs(int snap) { … } /** * rcu_watching_snap_stopped_since() - Has RCU stopped watching a given CPU * since the specified @snap? * * @rdp: The rcu_data corresponding to the CPU for which to check EQS. * @snap: rcu_watching snapshot taken when the CPU wasn't in an EQS. * * Returns true if the CPU corresponding to @rdp has spent some time in an * extended quiescent state since @snap. 
Note that this doesn't check if it * /still/ is in an EQS, just that it went through one since @snap. * * This is meant to be used in a loop waiting for a CPU to go through an EQS. */ static bool rcu_watching_snap_stopped_since(struct rcu_data *rdp, int snap) { … } /* * Return true if the referenced integer is zero while the specified * CPU remains within a single extended quiescent state. */ bool rcu_watching_zero_in_eqs(int cpu, int *vp) { … } /* * Let the RCU core know that this CPU has gone through the scheduler, * which is a quiescent state. This is called when the need for a * quiescent state is urgent, so we burn an atomic operation and full * memory barriers to let the RCU core know about it, regardless of what * this CPU might (or might not) do in the near future. * * We inform the RCU core by emulating a zero-duration dyntick-idle period. * * The caller must have disabled interrupts and must not be idle. */ notrace void rcu_momentary_eqs(void) { … } EXPORT_SYMBOL_GPL(…); /** * rcu_is_cpu_rrupt_from_idle - see if 'interrupted' from idle * * If the current CPU is idle and running at a first-level (not nested) * interrupt, or directly, from idle, return true. * * The caller must have at least disabled IRQs. */ static int rcu_is_cpu_rrupt_from_idle(void) { … } #define DEFAULT_RCU_BLIMIT … // Maximum callbacks per rcu_do_batch ... #define DEFAULT_MAX_RCU_BLIMIT … static long blimit = …; #define DEFAULT_RCU_QHIMARK … static long qhimark = …; #define DEFAULT_RCU_QLOMARK … static long qlowmark = …; #define DEFAULT_RCU_QOVLD_MULT … #define DEFAULT_RCU_QOVLD … static long qovld = …; // If this many pending, hammer QS. static long qovld_calc = …; // No pre-initialization lock acquisitions! 
module_param(blimit, long, 0444); module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); static ulong jiffies_till_first_fqs = …; static ulong jiffies_till_next_fqs = …; static bool rcu_kick_kthreads; static int rcu_divisor = …; module_param(rcu_divisor, int, 0644); /* Force an exit from rcu_do_batch() after 3 milliseconds. */ static long rcu_resched_ns = …; module_param(rcu_resched_ns, long, 0644); /* * How long the grace period must be before we start recruiting * quiescent-state help from rcu_note_context_switch(). */ static ulong jiffies_till_sched_qs = …; module_param(jiffies_till_sched_qs, ulong, 0444); static ulong jiffies_to_sched_qs; /* See adjust_jiffies_till_sched_qs(). */ module_param(jiffies_to_sched_qs, ulong, 0444); /* Display only! */ /* * Make sure that we give the grace-period kthread time to detect any * idle CPUs before taking active measures to force quiescent states. * However, don't go below 100 milliseconds, adjusted upwards for really * large systems. */ static void adjust_jiffies_till_sched_qs(void) { … } static int param_set_first_fqs_jiffies(const char *val, const struct kernel_param *kp) { … } static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param *kp) { … } static const struct kernel_param_ops first_fqs_jiffies_ops = …; static const struct kernel_param_ops next_fqs_jiffies_ops = …; module_param_cb(…); module_param_cb(…); module_param(rcu_kick_kthreads, bool, 0644); static void force_qs_rnp(int (*f)(struct rcu_data *rdp)); static int rcu_pending(int user); /* * Return the number of RCU GPs completed thus far for debug & stats. */ unsigned long rcu_get_gp_seq(void) { … } EXPORT_SYMBOL_GPL(…); /* * Return the number of RCU expedited batches completed thus far for * debug & stats. Odd numbers mean that a batch is in progress, even * numbers mean idle. The value returned will thus be roughly double * the cumulative batches since boot. 
*/ unsigned long rcu_exp_batches_completed(void) { … } EXPORT_SYMBOL_GPL(…); /* * Return the root node of the rcu_state structure. */ static struct rcu_node *rcu_get_root(void) { … } /* * Send along grace-period-related data for rcutorture diagnostics. */ void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { … } EXPORT_SYMBOL_GPL(…); #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) /* * An empty function that will trigger a reschedule on * IRQ tail once IRQs get re-enabled on userspace/guest resume. */ static void late_wakeup_func(struct irq_work *work) { } static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = IRQ_WORK_INIT(late_wakeup_func); /* * If either: * * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry. * * In these cases the late RCU wake ups aren't supported in the resched loops and our * last resort is to fire a local irq_work that will trigger a reschedule once IRQs * get re-enabled again. */ noinstr void rcu_irq_work_resched(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) return; if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) return; instrumentation_begin(); if (do_nocb_deferred_wakeup(rdp) && need_resched()) { irq_work_queue(this_cpu_ptr(&late_wakeup_work)); } instrumentation_end(); } #endif /* #if defined(CONFIG_NO_HZ_FULL) && (!defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK)) */ #ifdef CONFIG_PROVE_RCU /** * rcu_irq_exit_check_preempt - Validate that scheduling is possible */ void rcu_irq_exit_check_preempt(void) { … } #endif /* #ifdef CONFIG_PROVE_RCU */ #ifdef CONFIG_NO_HZ_FULL /** * __rcu_irq_enter_check_tick - Enable scheduler tick on CPU if RCU needs it. 
* * The scheduler tick is not normally enabled when CPUs enter the kernel * from nohz_full userspace execution. After all, nohz_full userspace * execution is an RCU quiescent state and the time executing in the kernel * is quite short. Except of course when it isn't. And it is not hard to * cause a large system to spend tens of seconds or even minutes looping * in the kernel, which can cause a number of problems, include RCU CPU * stall warnings. * * Therefore, if a nohz_full CPU fails to report a quiescent state * in a timely manner, the RCU grace-period kthread sets that CPU's * ->rcu_urgent_qs flag with the expectation that the next interrupt or * exception will invoke this function, which will turn on the scheduler * tick, which will enable RCU to detect that CPU's quiescent states, * for example, due to cond_resched() calls in CONFIG_PREEMPT=n kernels. * The tick will be disabled once a quiescent state is reported for * this CPU. * * Of course, in carefully tuned systems, there might never be an * interrupt or exception. In that case, the RCU grace-period kthread * will eventually cause one to happen. However, in less carefully * controlled environments, this function allows RCU to get what it * needs without creating otherwise useless interruptions. */ void __rcu_irq_enter_check_tick(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); // If we're here from NMI there's nothing to do. if (in_nmi()) return; RCU_LOCKDEP_WARN(!rcu_is_watching_curr_cpu(), "Illegal rcu_irq_enter_check_tick() from extended quiescent state"); if (!tick_nohz_full_cpu(rdp->cpu) || !READ_ONCE(rdp->rcu_urgent_qs) || READ_ONCE(rdp->rcu_forced_tick)) { // RCU doesn't need nohz_full help from this CPU, or it is // already getting that help. return; } // We get here only when not in an extended quiescent state and // from interrupts (as opposed to NMIs). 
Therefore, (1) RCU is // already watching and (2) The fact that we are in an interrupt // handler and that the rcu_node lock is an irq-disabled lock // prevents self-deadlock. So we can safely recheck under the lock. // Note that the nohz_full state currently cannot change. raw_spin_lock_rcu_node(rdp->mynode); if (READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { // A nohz_full CPU is in the kernel and RCU needs a // quiescent state. Turn on the tick! WRITE_ONCE(rdp->rcu_forced_tick, true); tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); } raw_spin_unlock_rcu_node(rdp->mynode); } NOKPROBE_SYMBOL(__rcu_irq_enter_check_tick); #endif /* CONFIG_NO_HZ_FULL */ /* * Check to see if any future non-offloaded RCU-related work will need * to be done by the current CPU, even if none need be done immediately, * returning 1 if so. This function is part of the RCU implementation; * it is -not- an exported member of the RCU API. This is used by * the idle-entry code to figure out whether it is safe to disable the * scheduler-clock interrupt. * * Just check whether or not this CPU has non-offloaded RCU callbacks * queued. */ int rcu_needs_cpu(void) { … } /* * If any sort of urgency was applied to the current CPU (for example, * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order * to get to a quiescent state, disable it. */ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { … } /** * rcu_is_watching - RCU read-side critical sections permitted on current CPU? * * Return @true if RCU is watching the running CPU and @false otherwise. * An @true return means that this CPU can safely enter RCU read-side * critical sections. * * Although calls to rcu_is_watching() from most parts of the kernel * will return @true, there are important exceptions. For example, if the * current CPU is deep within its idle loop, in kernel entry/exit code, * or offline, rcu_is_watching() will return @false. 
* * Make notrace because it can be called by the internal functions of * ftrace, and making this notrace removes unnecessary recursion calls. */ notrace bool rcu_is_watching(void) { … } EXPORT_SYMBOL_GPL(…); /* * If a holdout task is actually running, request an urgent quiescent * state from its CPU. This is unsynchronized, so migrations can cause * the request to go to the wrong CPU. Which is OK, all that will happen * is that the CPU's next context switch will be a bit slower and next * time around this task will generate another request. */ void rcu_request_urgent_qs_task(struct task_struct *t) { … } /* * When trying to report a quiescent state on behalf of some other CPU, * it is our responsibility to check for and handle potential overflow * of the rcu_node ->gp_seq counter with respect to the rcu_data counters. * After all, the CPU might be in deep idle state, and thus executing no * code whatsoever. */ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp) { … } /* * Snapshot the specified CPU's RCU_WATCHING counter so that we can later * credit it with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ static int rcu_watching_snap_save(struct rcu_data *rdp) { … } /* * Returns positive if the specified CPU has passed through a quiescent state * by virtue of being in or having passed through a dynticks idle state since * the last call to rcu_watching_snap_save() for this same CPU, or by * virtue of having been offline. * * Returns negative if the specified CPU needs a force resched. * * Returns zero otherwise. */ static int rcu_watching_snap_recheck(struct rcu_data *rdp) { … } /* Trace-event wrapper function for trace_rcu_future_grace_period.
*/ static void trace_rcu_this_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long gp_seq_req, const char *s) { … } /* * rcu_start_this_gp - Request the start of a particular grace period * @rnp_start: The leaf node of the CPU from which to start. * @rdp: The rcu_data corresponding to the CPU from which to start. * @gp_seq_req: The gp_seq of the grace period to start. * * Start the specified grace period, as needed to handle newly arrived * callbacks. The required future grace periods are recorded in each * rcu_node structure's ->gp_seq_needed field. Returns true if there * is reason to awaken the grace-period kthread. * * The caller must hold the specified rcu_node structure's ->lock, which * is why the caller is responsible for waking the grace-period kthread. * * Returns true if the GP thread needs to be awakened else false. */ static bool rcu_start_this_gp(struct rcu_node *rnp_start, struct rcu_data *rdp, unsigned long gp_seq_req) { … } /* * Clean up any old requests for the just-ended grace period. Also return * whether any additional grace periods have been requested. */ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) { … } static void swake_up_one_online_ipi(void *arg) { … } static void swake_up_one_online(struct swait_queue_head *wqh) { … } /* * Awaken the grace-period kthread. Don't do a self-awaken (unless in an * interrupt or softirq handler, in which case we just might immediately * sleep upon return, resulting in a grace-period hang), and don't bother * awakening when there is nothing for the grace-period kthread to do * (as in several CPUs raced to awaken, we lost), and finally don't try * to awaken a kthread that has not yet been created. If all those checks * are passed, track some debug information and awaken. * * So why do the self-wakeup when in an interrupt or softirq handler * in the grace-period kthread's context? 
Because the kthread might have * been interrupted just as it was going to sleep, and just after the final * pre-sleep check of the awaken condition. In this case, a wakeup really * is required, and is therefore supplied. */ static void rcu_gp_kthread_wake(void) { … } /* * If there is room, assign a ->gp_seq number to any callbacks on this * CPU that have not already been assigned. Also accelerate any callbacks * that were previously assigned a ->gp_seq number that has since proven * to be too conservative, which can happen if callbacks get assigned a * ->gp_seq number while RCU is idle, but with reference to a non-root * rcu_node structure. This function is idempotent, so it does not hurt * to call it repeatedly. Returns a flag saying whether we should awaken * the RCU grace-period kthread. * * The caller must hold rnp->lock with interrupts disabled. */ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { … } /* * Similar to rcu_accelerate_cbs(), but does not require that the leaf * rcu_node structure's ->lock be held. It consults the cached value * of ->gp_seq_needed in the rcu_data structure, and if that indicates * that a new grace-period request be made, invokes rcu_accelerate_cbs() * while holding the leaf rcu_node structure's ->lock. */ static void rcu_accelerate_cbs_unlocked(struct rcu_node *rnp, struct rcu_data *rdp) { … } /* * Move any callbacks whose grace period has completed to the * RCU_DONE_TAIL sublist, then compact the remaining sublists and * assign ->gp_seq numbers to any callbacks in the RCU_NEXT_TAIL * sublist. This function is idempotent, so it does not hurt to * invoke it repeatedly. As long as it is not invoked -too- often... * Returns true if the RCU grace-period kthread needs to be awakened. * * The caller must hold rnp->lock with interrupts disabled.
*/ static bool rcu_advance_cbs(struct rcu_node *rnp, struct rcu_data *rdp) { … } /* * Move and classify callbacks, but only if doing so won't require * that the RCU grace-period kthread be awakened. */ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, struct rcu_data *rdp) { … } /* * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a * quiescent state. This is intended to be invoked when the CPU notices * a new grace period. */ static void rcu_strict_gp_check_qs(void) { … } /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node * structure corresponding to the current CPU, and must have irqs disabled. * Returns true if the grace-period kthread needs to be awakened. */ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { … } static void note_gp_changes(struct rcu_data *rdp) { … } static atomic_t *rcu_gp_slow_suppress; /* Register a counter to suppress debugging grace-period delays. */ void rcu_gp_slow_register(atomic_t *rgssp) { … } EXPORT_SYMBOL_GPL(…); /* Unregister a counter, with NULL for not caring which. */ void rcu_gp_slow_unregister(atomic_t *rgssp) { … } EXPORT_SYMBOL_GPL(…); static bool rcu_gp_slow_is_suppressed(void) { … } static void rcu_gp_slow(int delay) { … } static unsigned long sleep_duration; /* Allow rcutorture to stall the grace-period kthread. */ void rcu_gp_set_torture_wait(int duration) { … } EXPORT_SYMBOL_GPL(…); /* Actually implement the aforementioned wait. */ static void rcu_gp_torture_wait(void) { … } /* * Handler for on_each_cpu() to invoke the target CPU's RCU core * processing. */ static void rcu_strict_gp_boundary(void *unused) { … } // Make the polled API aware of the beginning of a grace period. static void rcu_poll_gp_seq_start(unsigned long *snap) { … } // Make the polled API aware of the end of a grace period. 
static void rcu_poll_gp_seq_end(unsigned long *snap) { … } // Make the polled API aware of the beginning of a grace period, but // where caller does not hold the root rcu_node structure's lock. static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap) { … } // Make the polled API aware of the end of a grace period, but where // caller does not hold the root rcu_node structure's lock. static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) { … } /* * There is a single llist, which is used for handling * synchronize_rcu() users' enqueued rcu_synchronize nodes. * Within this llist, there are two tail pointers: * * wait tail: Tracks the set of nodes, which need to * wait for the current GP to complete. * done tail: Tracks the set of nodes, for which grace * period has elapsed. These nodes processing * will be done as part of the cleanup work * execution by a kworker. * * At every grace period init, a new wait node is added * to the llist. This wait node is used as wait tail * for this new grace period. Given that there is a fixed * number of wait nodes, all wait nodes may be in use * (which can happen when kworker callback processing * is delayed) when an additional grace period is requested. * This means that the system is slow in processing callbacks. * * TODO: If a slow processing is detected, a first node * in the llist should be used as a wait-tail for this * grace period, therefore users which should wait due * to a slow process are handled by _this_ grace period * and not next. * * Below is an illustration of how the done and wait * tail pointers move from one set of rcu_synchronize nodes * to the other, as grace periods start and finish and * nodes are processed by kworker. * * * a. Initial llist callbacks list: * * +----------+ +--------+ +-------+ * | | | | | | * | head |---------> | cb2 |--------->| cb1 | * | | | | | | * +----------+ +--------+ +-------+ * * * * b.
New GP1 Start: * * WAIT TAIL * | * | * v * +----------+ +--------+ +--------+ +-------+ * | | | | | | | | * | head ------> wait |------> cb2 |------> | cb1 | * | | | head1 | | | | | * +----------+ +--------+ +--------+ +-------+ * * * * c. GP completion: * * WAIT_TAIL == DONE_TAIL * * DONE TAIL * | * | * v * +----------+ +--------+ +--------+ +-------+ * | | | | | | | | * | head ------> wait |------> cb2 |------> | cb1 | * | | | head1 | | | | | * +----------+ +--------+ +--------+ +-------+ * * * * d. New callbacks and GP2 start: * * WAIT TAIL DONE TAIL * | | * | | * v v * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ * | | | | | | | | | | | | | | * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | * | | | head2| | | | | |head1| | | | | * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ * * * * e. GP2 completion: * * WAIT_TAIL == DONE_TAIL * DONE TAIL * | * | * v * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ * | | | | | | | | | | | | | | * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | * | | | head2| | | | | |head1| | | | | * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ * * * While the llist state transitions from d to e, a kworker * can start executing rcu_sr_normal_gp_cleanup_work() and * can observe either the old done tail (@c) or the new * done tail (@e). So, done tail updates and reads need * to use the rel-acq semantics. If the concurrent kworker * observes the old done tail, the newly queued work * execution will process the updated done tail. If the * concurrent kworker observes the new done tail, then * the newly queued work will skip processing the done * tail, as workqueue semantics guarantees that the new * work is executed only after the previous one completes. * * f. 
kworker callbacks processing complete: * * * DONE TAIL * | * | * v * +----------+ +--------+ * | | | | * | head ------> wait | * | | | head2 | * +----------+ +--------+ * */ static bool rcu_sr_is_wait_head(struct llist_node *node) { … } static struct llist_node *rcu_sr_get_wait_head(void) { … } static void rcu_sr_put_wait_head(struct llist_node *node) { … } /* Disabled by default. */ static int rcu_normal_wake_from_gp; module_param(rcu_normal_wake_from_gp, int, 0644); static struct workqueue_struct *sync_wq; static void rcu_sr_normal_complete(struct llist_node *node) { … } static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) { … } /* * Helper function for rcu_gp_cleanup(). */ static void rcu_sr_normal_gp_cleanup(void) { … } /* * Helper function for rcu_gp_init(). */ static bool rcu_sr_normal_gp_init(void) { … } static void rcu_sr_normal_add_req(struct rcu_synchronize *rs) { … } /* * Initialize a new grace period. Return false if no grace period required. */ static noinline_for_stack bool rcu_gp_init(void) { … } /* * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state * time. */ static bool rcu_gp_fqs_check_wake(int *gfp) { … } /* * Do one round of quiescent-state forcing. */ static void rcu_gp_fqs(bool first_time) { … } /* * Loop doing repeated quiescent-state forcing until the grace period ends. */ static noinline_for_stack void rcu_gp_fqs_loop(void) { … } /* * Clean up after the old grace period. */ static noinline void rcu_gp_cleanup(void) { … } /* * Body of kthread that handles grace periods. */ static int __noreturn rcu_gp_kthread(void *unused) { … } /* * Report a full set of quiescent states to the rcu_state data structure. * Invoke rcu_gp_kthread_wake() to awaken the grace-period kthread if * another grace period is required. Whether we wake the grace-period * kthread or it awakens itself for the next round of quiescent-state * forcing, that kthread will clean up after the just-completed grace * period. 
Note that the caller must hold rnp->lock, which is released * before return. */ static void rcu_report_qs_rsp(unsigned long flags) __releases(rcu_get_root()->lock) { … } /* * Similar to rcu_report_qs_rdp(), for which it is a helper function. * Allows quiescent states for a group of CPUs to be reported at one go * to the specified rcu_node structure, though all the CPUs in the group * must be represented by the same rcu_node structure (which need not be a * leaf rcu_node structure, though it often will be). The gps parameter * is the grace-period snapshot, which means that the quiescent states * are valid only if rnp->gp_seq is equal to gps. That structure's lock * must be held upon entry, and it is released before return. * * As a special case, if mask is zero, the bit-already-cleared check is * disabled. This allows propagating quiescent state due to resumed tasks * during grace-period initialization. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp, unsigned long gps, unsigned long flags) __releases(rnp->lock) { … } /* * Record a quiescent state for all tasks that were previously queued * on the specified rcu_node structure and that were blocking the current * RCU grace period. The caller must hold the corresponding rnp->lock with * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { … } /* * Record a quiescent state for the specified CPU to that CPU's rcu_data * structure. This must be called from the specified CPU. */ static void rcu_report_qs_rdp(struct rcu_data *rdp) { … } /* * Check to see if there is a new grace period of which this CPU * is not yet aware, and if so, set up local rcu_data state for it. * Otherwise, see if this CPU has just passed through its first * quiescent state for this grace period, and record that fact if so. 
*/ static void rcu_check_quiescent_state(struct rcu_data *rdp) { … } /* Return true if callback-invocation time limit exceeded. */ static bool rcu_do_batch_check_time(long count, long tlimit, bool jlimit_check, unsigned long jlimit) { … } /* * Invoke any RCU callbacks that have made it to the end of their grace * period. Throttle as specified by rdp->blimit. */ static void rcu_do_batch(struct rcu_data *rdp) { … } /* * This function is invoked from each scheduling-clock interrupt, * and checks to see if this CPU is in a non-context-switch quiescent * state, for example, user mode or idle loop. It also schedules RCU * core processing. If the current grace period has gone on too long, * it will ask the scheduler to manufacture a context switch for the sole * purpose of providing the needed quiescent state. */ void rcu_sched_clock_irq(int user) { … } /* * Scan the leaf rcu_node structures. For each structure on which all * CPUs have reported a quiescent state and on which there are tasks * blocking the current grace period, initiate RCU priority boosting. * Otherwise, invoke the specified function to check dyntick state for * each CPU that has not yet reported a quiescent state. */ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) { … } /* * Force quiescent states on reluctant CPUs, and also detect which * CPUs are in dyntick-idle mode. */ void rcu_force_quiescent_state(void) { … } EXPORT_SYMBOL_GPL(…); // Workqueue handler for an RCU reader for kernels enforcing strict RCU // grace periods. static void strict_work_handler(struct work_struct *work) { … } /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { … } static void rcu_core_si(void) { … } static void rcu_wake_cond(struct task_struct *t, int status) { … } static void invoke_rcu_core_kthread(void) { … } /* * Wake up this CPU's rcuc kthread to do RCU core processing.
*/ static void invoke_rcu_core(void) { … } static void rcu_cpu_kthread_park(unsigned int cpu) { … } static int rcu_cpu_kthread_should_run(unsigned int cpu) { … } /* * Per-CPU kernel thread that invokes RCU callbacks. This replaces * the RCU softirq used in configurations of RCU that do not support RCU * priority boosting. */ static void rcu_cpu_kthread(unsigned int cpu) { … } static struct smp_hotplug_thread rcu_cpu_thread_spec = …; /* * Spawn per-CPU RCU core processing kthreads. */ static int __init rcu_spawn_core_kthreads(void) { … } static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { … } /* * Handle any core-RCU processing required by a call_rcu() invocation. */ static void call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func, unsigned long flags) { … } /* * RCU callback function to leak a callback. */ static void rcu_leak_callback(struct rcu_head *rhp) { … } /* * Check and if necessary update the leaf rcu_node structure's * ->cbovldmask bit corresponding to the current CPU based on that CPU's * number of queued RCU callbacks. The caller must hold the leaf rcu_node * structure's ->lock. */ static void check_cb_ovld_locked(struct rcu_data *rdp, struct rcu_node *rnp) { … } /* * Check and if necessary update the leaf rcu_node structure's * ->cbovldmask bit corresponding to the current CPU based on that CPU's * number of queued RCU callbacks. No locks need be held, but the * caller must have disabled interrupts. * * Note that this function ignores the possibility that there are a lot * of callbacks all of which have already seen the end of their respective * grace periods. This omission is due to the need for no-CBs CPUs to * be holding ->nocb_lock to do this check, which is too heavy for a * common-case operation. 
*/ static void check_cb_ovld(struct rcu_data *rdp) { … } static void __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in) { … } #ifdef CONFIG_RCU_LAZY static bool enable_rcu_lazy __read_mostly = !IS_ENABLED(…); module_param(enable_rcu_lazy, bool, 0444); /** * call_rcu_hurry() - Queue RCU callback for invocation after grace period, and * flush all lazy callbacks (including the new one) to the main ->cblist while * doing so. * * @head: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period * * The callback function will be invoked some time after a full grace * period elapses, in other words after all pre-existing RCU read-side * critical sections have completed. * * Use this API instead of call_rcu() if you don't want the callback to be * invoked after very long periods of time, which can happen on systems without * memory pressure and on systems which are lightly loaded or mostly idle. * This function will cause callbacks to be invoked sooner than later at the * expense of extra power. Other than that, this function is identical to, and * reuses call_rcu()'s logic. Refer to call_rcu() for more details about memory * ordering and other functionality. */ void call_rcu_hurry(struct rcu_head *head, rcu_callback_t func) { … } EXPORT_SYMBOL_GPL(…); #else #define enable_rcu_lazy … #endif /** * call_rcu() - Queue an RCU callback for invocation after a grace period. * By default the callbacks are 'lazy' and are kept hidden from the main * ->cblist to prevent starting of grace periods too soon. * If you desire grace periods to start very soon, use call_rcu_hurry(). * * @head: structure to be used for queueing the RCU updates. * @func: actual callback function to be invoked after the grace period * * The callback function will be invoked some time after a full grace * period elapses, in other words after all pre-existing RCU read-side * critical sections have completed. 
However, the callback function * might well execute concurrently with RCU read-side critical sections * that started after call_rcu() was invoked. * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, * or softirqs have been disabled also serve as RCU read-side critical * sections. This includes hardware interrupt handlers, softirq handlers, * and NMI handlers. * * Note that all CPUs must agree that the grace period extended beyond * all pre-existing RCU read-side critical sections. On systems with more * than one CPU, this means that when "func()" is invoked, each CPU is * guaranteed to have executed a full memory barrier since the end of its * last RCU read-side critical section whose beginning preceded the call * to call_rcu(). It also means that each CPU executing an RCU read-side * critical section that continues beyond the start of "func()" must have * executed a memory barrier after the call_rcu() but before the beginning * of that RCU read-side critical section. Note that these guarantees * include CPUs that are offline, idle, or executing in user mode, as * well as CPUs that are executing in the kernel. * * Furthermore, if CPU A invoked call_rcu() and CPU B invoked the * resulting RCU callback function "func()", then both CPU A and CPU B are * guaranteed to execute a full memory barrier during the time interval * between the call to call_rcu() and the invocation of "func()" -- even * if CPU A and CPU B are the same CPU (but again only if the system has * more than one CPU). * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { … } EXPORT_SYMBOL_GPL(…); /* Maximum number of jiffies to wait before draining a batch.
*/ #define KFREE_DRAIN_JIFFIES … #define KFREE_N_BATCHES … #define FREE_N_CHANNELS … /** * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers * @list: List node. All blocks are linked between each other * @gp_snap: Snapshot of RCU state for objects placed to this bulk * @nr_records: Number of active pointers in the array * @records: Array of the kvfree_rcu() pointers */ struct kvfree_rcu_bulk_data { … }; /* * This macro defines how many entries the "records" array * will contain. It is based on the fact that the size of * kvfree_rcu_bulk_data structure becomes exactly one page. */ #define KVFREE_BULK_MAX_ENTR … /** * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period * @head_free: List of kfree_rcu() objects waiting for a grace period * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees. * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period * @krcp: Pointer to @kfree_rcu_cpu structure */ struct kfree_rcu_cpu_work { … }; /** * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period * @head_gp_snap: Snapshot of RCU state for objects placed to "@head" * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @initialized: The @rcu_work fields have been initialized * @head_count: Number of objects in rcu_head singular list * @bulk_count: Number of objects in bulk-list * @bkvcache: * A simple cache list that contains objects for reuse purpose. * In order to save some per-cpu space the list is singular. * Even though it is lockless an access has to be protected by the * per-cpu lock. 
* @page_cache_work: A work to refill the cache when it is empty * @backoff_page_cache_fill: Delay cache refills * @work_in_progress: Indicates that page_cache_work is running * @hrtimer: A hrtimer for scheduling a page_cache_work * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. The reason that it is not included in * the rcu_data structure is to permit this code to be extracted from * the RCU files. Such extraction could allow further optimization of * the interactions with the slab allocators. */ struct kfree_rcu_cpu { … }; static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = …; static __always_inline void debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead) { … } static inline struct kfree_rcu_cpu * krc_this_cpu_lock(unsigned long *flags) { … } static inline void krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { … } static inline struct kvfree_rcu_bulk_data * get_cached_bnode(struct kfree_rcu_cpu *krcp) { … } static inline bool put_cached_bnode(struct kfree_rcu_cpu *krcp, struct kvfree_rcu_bulk_data *bnode) { … } static int drain_page_cache(struct kfree_rcu_cpu *krcp) { … } static void kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp, struct kvfree_rcu_bulk_data *bnode, int idx) { … } static void kvfree_rcu_list(struct rcu_head *head) { … } /* * This function is invoked in workqueue context after a grace period. * It frees all the objects queued on ->bulk_head_free or ->head_free. */ static void kfree_rcu_work(struct work_struct *work) { … } static bool need_offload_krc(struct kfree_rcu_cpu *krcp) { … } static bool need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp) { … } static int krc_count(struct kfree_rcu_cpu *krcp) { … } static void schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp) { … } static void kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp) { … } /* * Return: %true if a work is queued, %false otherwise. 
*/ static bool kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp) { … } /* * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. */ static void kfree_rcu_monitor(struct work_struct *work) { … } static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { … } static void fill_page_cache_func(struct work_struct *work) { … } static void run_page_cache_worker(struct kfree_rcu_cpu *krcp) { … } // Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock() // state specified by flags. If can_alloc is true, the caller must // be schedulable and not be holding any locks or mutexes that might be // acquired by the memory allocator or anything that it might invoke. // Returns true if ptr was successfully recorded, else the caller must // use a fallback. static inline bool add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, unsigned long *flags, void *ptr, bool can_alloc) { … } /* * Queue a request for lazy invocation of the appropriate free routine * after a grace period. Please note that three paths are maintained, * two for the common case using arrays of pointers and a third one that * is used only when the main paths cannot be used, for example, due to * memory pressure. * * Each kvfree_call_rcu() request is added to a batch. The batch will be drained * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will * be free'd in workqueue context. This allows us to: batch requests together to * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load. */ void kvfree_call_rcu(struct rcu_head *head, void *ptr) { … } EXPORT_SYMBOL_GPL(…); /** * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. * * Note that a single-argument kvfree_rcu() call has a slow path that * triggers synchronize_rcu() followed by freeing a pointer. It is done * before the return from the function.
Therefore for any single-argument * call that will result in a kfree() to a cache that is to be destroyed * during module exit, it is developer's responsibility to ensure that all * such calls have returned before the call to kmem_cache_destroy(). */ void kvfree_rcu_barrier(void) { … } EXPORT_SYMBOL_GPL(…); static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { … } static unsigned long kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { … } void __init kfree_rcu_scheduler_running(void) { … } /* * During early boot, any blocking grace-period wait automatically * implies a grace period. * * Later on, this could in theory be the case for kernels built with * CONFIG_SMP=y && CONFIG_PREEMPTION=y running on a single CPU, but this * is not a common case. Furthermore, this optimization would cause * the rcu_gp_oldstate structure to expand by 50%, so this potential * grace-period optimization is ignored once the scheduler is running. */ static int rcu_blocking_is_gp(void) { … } /* * Helper function for the synchronize_rcu() API. */ static void synchronize_rcu_normal(void) { … } /** * synchronize_rcu - wait until a grace period has elapsed. * * Control will return to the caller some time after a full grace * period has elapsed, in other words after all currently executing RCU * read-side critical sections have completed. Note, however, that * upon return from synchronize_rcu(), the caller might well be executing * concurrently with new RCU read-side critical sections that began while * synchronize_rcu() was waiting. * * RCU read-side critical sections are delimited by rcu_read_lock() * and rcu_read_unlock(), and may be nested. In addition, but only in * v5.0 and later, regions of code across which interrupts, preemption, * or softirqs have been disabled also serve as RCU read-side critical * sections. This includes hardware interrupt handlers, softirq handlers, * and NMI handlers. 
* * Note that this guarantee implies further memory-ordering guarantees. * On systems with more than one CPU, when synchronize_rcu() returns, * each CPU is guaranteed to have executed a full memory barrier since * the end of its last RCU read-side critical section whose beginning * preceded the call to synchronize_rcu(). In addition, each CPU having * an RCU read-side critical section that extends beyond the return from * synchronize_rcu() is guaranteed to have executed a full memory barrier * after the beginning of synchronize_rcu() and before the beginning of * that RCU read-side critical section. Note that these guarantees include * CPUs that are offline, idle, or executing in user mode, as well as CPUs * that are executing in the kernel. * * Furthermore, if CPU A invoked synchronize_rcu(), which returned * to its caller on CPU B, then both CPU A and CPU B are guaranteed * to have executed a full memory barrier during the execution of * synchronize_rcu() -- even if CPU A and CPU B are the same CPU (but * again only if the system has more than one CPU). * * Implementation of these memory-ordering guarantees is described here: * Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst. */ void synchronize_rcu(void) { … } EXPORT_SYMBOL_GPL(…); /** * get_completed_synchronize_rcu_full - Return a full pre-completed polled state cookie * @rgosp: Place to put state cookie * * Stores into @rgosp a value that will always be treated by functions * like poll_state_synchronize_rcu_full() as a cookie whose grace period * has already completed. */ void get_completed_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { … } EXPORT_SYMBOL_GPL(…); /** * get_state_synchronize_rcu - Snapshot current RCU state * * Returns a cookie that is used by a later call to cond_synchronize_rcu() * or poll_state_synchronize_rcu() to determine whether or not a full * grace period has elapsed in the meantime. 
*/ unsigned long get_state_synchronize_rcu(void) { … } EXPORT_SYMBOL_GPL(…); /** * get_state_synchronize_rcu_full - Snapshot RCU state, both normal and expedited * @rgosp: location to place combined normal/expedited grace-period state * * Places the normal and expedited grace-period states in @rgosp. This * state value can be passed to a later call to cond_synchronize_rcu_full() * or poll_state_synchronize_rcu_full() to determine whether or not a * grace period (whether normal or expedited) has elapsed in the meantime. * The rcu_gp_oldstate structure takes up twice the memory of an unsigned * long, but is guaranteed to see all grace periods. In contrast, the * combined state occupies less memory, but can sometimes fail to take * grace periods into account. * * This does not guarantee that the needed grace period will actually * start. */ void get_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { … } EXPORT_SYMBOL_GPL(…); /* * Helper function for start_poll_synchronize_rcu() and * start_poll_synchronize_rcu_full(). */ static void start_poll_synchronize_rcu_common(void) { … } /** * start_poll_synchronize_rcu - Snapshot and start RCU grace period * * Returns a cookie that is used by a later call to cond_synchronize_rcu() * or poll_state_synchronize_rcu() to determine whether or not a full * grace period has elapsed in the meantime. If the needed grace period * is not already slated to start, notifies RCU core of the need for that * grace period. * * Interrupts must be enabled for the case where it is necessary to awaken * the grace-period kthread. */ unsigned long start_poll_synchronize_rcu(void) { … } EXPORT_SYMBOL_GPL(…); /** * start_poll_synchronize_rcu_full - Take a full snapshot and start RCU grace period * @rgosp: location to place combined normal/expedited grace-period state * * Places the normal and expedited grace-period states in *@rgosp.
This * state value can be passed to a later call to cond_synchronize_rcu_full() * or poll_state_synchronize_rcu_full() to determine whether or not a * grace period (whether normal or expedited) has elapsed in the meantime. * If the needed grace period is not already slated to start, notifies * RCU core of the need for that grace period. * * Interrupts must be enabled for the case where it is necessary to awaken * the grace-period kthread. */ void start_poll_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { … } EXPORT_SYMBOL_GPL(…); /** * poll_state_synchronize_rcu - Has the specified RCU grace period completed? * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu() * * If a full RCU grace period has elapsed since the earlier call from * which @oldstate was obtained, return @true, otherwise return @false. * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @oldstate * to either cond_synchronize_rcu() or cond_synchronize_rcu_expedited() * on the one hand or by directly invoking either synchronize_rcu() or * synchronize_rcu_expedited() on the other. * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for * more than a billion grace periods (and way more on a 64-bit system!). * Those needing to keep old state values for very long time periods * (many hours even on 32-bit systems) should check them occasionally and * either refresh them or set a flag indicating that the grace period has * completed. Alternatively, they can use get_completed_synchronize_rcu() * to get a guaranteed-completed grace-period state. 
* * In addition, because oldstate compresses the grace-period state for * both normal and expedited grace periods into a single unsigned long, * it can miss a grace period when synchronize_rcu() runs concurrently * with synchronize_rcu_expedited(). If this is unacceptable, please * instead use the _full() variant of these polling APIs. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call * to the function that provided @oldstate, and that returned at the end * of this function. */ bool poll_state_synchronize_rcu(unsigned long oldstate) { … } EXPORT_SYMBOL_GPL(…); /** * poll_state_synchronize_rcu_full - Has the specified RCU grace period completed? * @rgosp: value from get_state_synchronize_rcu_full() or start_poll_synchronize_rcu_full() * * If a full RCU grace period has elapsed since the earlier call from * which *rgosp was obtained, return @true, otherwise return @false. * If @false is returned, it is the caller's responsibility to invoke this * function later on until it does return @true. Alternatively, the caller * can explicitly wait for a grace period, for example, by passing @rgosp * to cond_synchronize_rcu_full() or by directly invoking synchronize_rcu(). * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited * for more than a billion grace periods (and way more on a 64-bit * system!). Those needing to keep rcu_gp_oldstate values for very * long time periods (many hours even on 32-bit systems) should check * them occasionally and either refresh them or set a flag indicating * that the grace period has completed. Alternatively, they can use * get_completed_synchronize_rcu_full() to get a guaranteed-completed * grace-period state.
* * This function provides the same memory-ordering guarantees that would * be provided by a synchronize_rcu() that was invoked at the call to * the function that provided @rgosp, and that returned at the end of this * function. And this guarantee requires that the root rcu_node structure's * ->gp_seq field be checked instead of that of the rcu_state structure. * The problem is that the just-ending grace-period's callbacks can be * invoked between the time that the root rcu_node structure's ->gp_seq * field is updated and the time that the rcu_state structure's ->gp_seq * field is updated. Therefore, if a single synchronize_rcu() is to * cause a subsequent poll_state_synchronize_rcu_full() to return @true, * then the root rcu_node structure is the one that needs to be polled. */ bool poll_state_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { … } EXPORT_SYMBOL_GPL(…); /** * cond_synchronize_rcu - Conditionally wait for an RCU grace period * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited() * * If a full RCU grace period has elapsed since the earlier call to * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return. * Otherwise, invoke synchronize_rcu() to wait for a full grace period. * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), * so waiting for a couple of additional grace periods should be just fine. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call * to the function that provided @oldstate and that returned at the end * of this function. 
*/ void cond_synchronize_rcu(unsigned long oldstate) { … } EXPORT_SYMBOL_GPL(…); /** * cond_synchronize_rcu_full - Conditionally wait for an RCU grace period * @rgosp: value from get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), or start_poll_synchronize_rcu_expedited_full() * * If a full RCU grace period has elapsed since the call to * get_state_synchronize_rcu_full(), start_poll_synchronize_rcu_full(), * or start_poll_synchronize_rcu_expedited_full() from which @rgosp was * obtained, just return. Otherwise, invoke synchronize_rcu() to wait * for a full grace period. * * Yes, this function does not take counter wrap into account. * But counter wrap is harmless. If the counter wraps, we have waited for * more than 2 billion grace periods (and way more on a 64-bit system!), * so waiting for a couple of additional grace periods should be just fine. * * This function provides the same memory-ordering guarantees that * would be provided by a synchronize_rcu() that was invoked at the call * to the function that provided @rgosp and that returned at the end of * this function. */ void cond_synchronize_rcu_full(struct rcu_gp_oldstate *rgosp) { … } EXPORT_SYMBOL_GPL(…); /* * Check to see if there is any immediate RCU-related work to be done by * the current CPU, returning 1 if so and zero otherwise. The checks are * in order of increasing expense: checks that can be carried out against * CPU-local state are performed first. However, we must check for CPU * stalls first, else we might not get a chance. */ static int rcu_pending(int user) { … } /* * Helper function for rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ static void rcu_barrier_trace(const char *s, int cpu, unsigned long done) { … } /* * RCU callback function for rcu_barrier(). If we are last, wake * up the task executing rcu_barrier(). * * Note that the value of rcu_state.barrier_sequence must be captured * before the atomic_dec_and_test(). 
Otherwise, if this CPU is not last, * other CPUs might count the value down to zero before this CPU gets * around to invoking rcu_barrier_trace(), which might result in bogus * data from the next instance of rcu_barrier(). */ static void rcu_barrier_callback(struct rcu_head *rhp) { … } /* * If needed, entrain an rcu_barrier() callback on rdp->cblist. */ static void rcu_barrier_entrain(struct rcu_data *rdp) { … } /* * Called with preemption disabled, and from cross-cpu IRQ context. */ static void rcu_barrier_handler(void *cpu_in) { … } /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. * * Note that this primitive does not necessarily wait for an RCU grace period * to complete. For example, if there are no RCU callbacks queued anywhere * in the system, then rcu_barrier() is within its rights to return * immediately, without waiting for anything, much less an RCU grace period. */ void rcu_barrier(void) { … } EXPORT_SYMBOL_GPL(…); static unsigned long rcu_barrier_last_throttle; /** * rcu_barrier_throttled - Do rcu_barrier(), but limit to one per second * * This can be thought of as guard rails around rcu_barrier() that * permits unrestricted userspace use, at least assuming the hardware's * try_cmpxchg() is robust. There will be at most one call per second to * rcu_barrier() system-wide from use of this function, which means that * callers might needlessly wait a second or three. * * This is intended for use by test suites to avoid OOM by flushing RCU * callbacks from the previous test before starting the next. See the * rcutree.do_rcu_barrier module parameter for more information. * * Why not simply make rcu_barrier() more scalable? That might be * the eventual endpoint, but let's keep it simple for the time being. * Note that the module parameter infrastructure serializes calls to a * given .set() function, but should concurrent .set() invocation ever be * possible, we are ready! 
*/ static void rcu_barrier_throttled(void) { … } /* * Invoke rcu_barrier_throttled() when a rcutree.do_rcu_barrier * request arrives. We insist on a true value to allow for possible * future expansion. */ static int param_set_do_rcu_barrier(const char *val, const struct kernel_param *kp) { … } /* * Output the number of outstanding rcutree.do_rcu_barrier requests. */ static int param_get_do_rcu_barrier(char *buffer, const struct kernel_param *kp) { … } static const struct kernel_param_ops do_rcu_barrier_ops = …; static atomic_t do_rcu_barrier; module_param_cb(…); /* * Compute the mask of online CPUs for the specified rcu_node structure. * This will not be stable unless the rcu_node structure's ->lock is * held, but the bit corresponding to the current CPU will be stable * in most contexts. */ static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { … } /* * Is the CPU corresponding to the specified rcu_data structure online * from RCU's perspective? This perspective is given by that structure's * ->qsmaskinitnext field rather than by the global cpu_online_mask. */ static bool rcu_rdp_cpu_online(struct rcu_data *rdp) { … } bool rcu_cpu_online(int cpu) { … } #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* * Is the current CPU online as far as RCU is concerned? * * Disable preemption to avoid false positives that could otherwise * happen due to the current CPU number being sampled, this task being * preempted, its old CPU being taken offline, resuming on some other CPU, * then determining that its old CPU is now offline. * * Disable checking if in an NMI handler because we cannot safely * report errors from NMI handlers anyway. In addition, it is OK to use * RCU on an offline processor during initial boot, hence the check for * rcu_scheduler_fully_active. */ bool rcu_lockdep_current_cpu_online(void) { … } EXPORT_SYMBOL_GPL(…); #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ // Has rcu_init() been invoked? 
This is used (for example) to determine // whether spinlocks may be acquired safely. static bool rcu_init_invoked(void) { … } /* * All CPUs for the specified rcu_node structure have gone offline, * and all tasks that were preempted within an RCU read-side critical * section while running on one of those CPUs have since exited their RCU * read-side critical section. Some other CPU is reporting this fact with * the specified rcu_node structure's ->lock held and interrupts disabled. * This function therefore goes up the tree of rcu_node structures, * clearing the corresponding bits in the ->qsmaskinit fields. Note that * the leaf rcu_node structure's ->qsmaskinit field has already been * updated. * * This function does check that the specified rcu_node structure has * all CPUs offline and no blocked tasks, so it is OK to invoke it * prematurely. That said, invoking it after the fact will cost you * a needless lock acquisition. So once it has done its work, don't * invoke it again. */ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) { … } /* * Propagate ->qsmaskinit bits up the rcu_node tree to account for the * first CPU in a given leaf rcu_node structure coming online. The caller * must hold the corresponding leaf rcu_node ->lock with interrupts * disabled. */ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) { … } /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ static void __init rcu_boot_init_percpu_data(int cpu) { … } struct kthread_worker *rcu_exp_gp_kworker; static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp) { … } static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp) { … } static void __init rcu_start_exp_gp_kworker(void) { … } static void rcu_spawn_rnp_kthreads(struct rcu_node *rnp) { … } /* * Invoked early in the CPU-online process, when pretty much all services * are available. The incoming CPU is not present. * * Initializes a CPU's per-CPU RCU data.
Note that only one online or * offline event can be happening at a given time. Note also that we can * accept some slop in the rsp->gp_seq access due to the fact that this * CPU cannot possibly have any non-offloaded RCU callbacks in flight yet. * And any offloaded callbacks are being numbered elsewhere. */ int rcutree_prepare_cpu(unsigned int cpu) { … } /* * Update kthreads affinity during CPU-hotplug changes. * * Set the per-rcu_node kthread's affinity to cover all CPUs that are * served by the rcu_node in question. The CPU hotplug lock is still * held, so the value of rnp->qsmaskinit will be stable. * * We don't include outgoingcpu in the affinity set, use -1 if there is * no outgoing CPU. If there are no CPUs left in the affinity set, * this function allows the kthread to execute on any CPU. * * Any future concurrent calls are serialized via ->kthread_mutex. */ static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu) { … } /* * Has the specified (known valid) CPU ever been fully online? */ bool rcu_cpu_beenfullyonline(int cpu) { … } /* * Near the end of the CPU-online process. Pretty much all services * enabled, and the CPU is now very much alive. */ int rcutree_online_cpu(unsigned int cpu) { … } /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that * incoming CPUs are not allowed to use RCU read-side critical sections * until this function is called. Failing to observe this restriction * will result in lockdep splats. * * Note that this function is special in that it is invoked directly * from the incoming CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. * This incoming CPU must not have enabled interrupts yet. * * This mirrors the effects of rcutree_report_cpu_dead(). 
*/ void rcutree_report_cpu_starting(unsigned int cpu) { … } /* * The outgoing CPU has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. * * Note that this function is special in that it is invoked directly * from the outgoing CPU rather than from the cpuhp_step mechanism. * This is because this function must be invoked at a precise location. * * This mirrors the effect of rcutree_report_cpu_starting(). */ void rcutree_report_cpu_dead(void) { … } #ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline * operation. Migrate the outgoing CPU's callbacks to the current CPU. */ void rcutree_migrate_callbacks(int cpu) { … } /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup. * There can only be one CPU hotplug operation at a time, so no need for * explicit locking. */ int rcutree_dead_cpu(unsigned int cpu) { … } /* * Near the end of the offline process. Trace the fact that this CPU * is going offline. */ int rcutree_dying_cpu(unsigned int cpu) { … } /* * Near the beginning of the offline process. The CPU is still very much alive * with pretty much all services enabled. */ int rcutree_offline_cpu(unsigned int cpu) { … } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ /* * On non-huge systems, use expedited RCU grace periods to make suspend * and hibernation run faster. */ static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { … } /* * Spawn the kthreads that handle RCU's grace periods. */ static int __init rcu_spawn_gp_kthread(void) { … } early_initcall(rcu_spawn_gp_kthread); /* * This function is invoked towards the end of the scheduler's * initialization process.
Before this is called, the idle task might * contain synchronous grace-period primitives (during which time, this idle * task is booting the system, and such primitives are no-ops). After this * function is called, any synchronous grace-period primitives are run as * expedited, with the requesting task driving the grace period forward. * A later core_initcall() rcu_set_runtime_mode() will switch to full * runtime RCU functionality. */ void rcu_scheduler_starting(void) { … } /* * Helper function for rcu_init() that initializes the rcu_state structure. */ static void __init rcu_init_one(void) { … } /* * Force priority from the kernel command-line into range. */ static void __init sanitize_kthread_prio(void) { … } /* * Compute the rcu_node tree geometry from kernel parameters. This cannot * replace the definitions in tree.h because those are needed to size * the ->node array in the rcu_state structure. */ void rcu_init_geometry(void) { … } /* * Dump out the structure of the rcu_node combining tree associated * with the rcu_state structure. */ static void __init rcu_dump_rcu_node_tree(void) { … } struct workqueue_struct *rcu_gp_wq; static void __init kfree_rcu_batch_init(void) { … } void __init rcu_init(void) { … } #include "tree_stall.h" #include "tree_exp.h" #include "tree_nocb.h" #include "tree_plugin.h"