From 6cf539a87a61a4fbc43f625267dbcbcf283872ed Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 9 Oct 2019 17:57:43 +0200 Subject: rcu: Fix data-race due to atomic_t copy-by-value This fixes a data-race where `atomic_t dynticks` is copied by value. The copy is performed non-atomically, resulting in a data-race if `dynticks` is updated concurrently. This data-race was found with KCSAN: ================================================================== BUG: KCSAN: data-race in dyntick_save_progress_counter / rcu_irq_enter write to 0xffff989dbdbe98e0 of 4 bytes by task 10 on cpu 3: atomic_add_return include/asm-generic/atomic-instrumented.h:78 [inline] rcu_dynticks_snap kernel/rcu/tree.c:310 [inline] dyntick_save_progress_counter+0x43/0x1b0 kernel/rcu/tree.c:984 force_qs_rnp+0x183/0x200 kernel/rcu/tree.c:2286 rcu_gp_fqs kernel/rcu/tree.c:1601 [inline] rcu_gp_fqs_loop+0x71/0x880 kernel/rcu/tree.c:1653 rcu_gp_kthread+0x22c/0x3b0 kernel/rcu/tree.c:1799 kthread+0x1b5/0x200 kernel/kthread.c:255 read to 0xffff989dbdbe98e0 of 4 bytes by task 154 on cpu 7: rcu_nmi_enter_common kernel/rcu/tree.c:828 [inline] rcu_irq_enter+0xda/0x240 kernel/rcu/tree.c:870 irq_enter+0x5/0x50 kernel/softirq.c:347 Reported by Kernel Concurrency Sanitizer on: CPU: 7 PID: 154 Comm: kworker/7:1H Not tainted 5.3.0+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 Workqueue: kblockd blk_mq_run_work_fn ================================================================== Signed-off-by: Marco Elver Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Joel Fernandes Cc: Ingo Molnar Cc: Dmitry Vyukov Cc: rcu@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..6145e08a1407 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -577,7 +577,7 @@ static void rcu_eqs_enter(bool user) } lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); do_nocb_deferred_wakeup(rdp); @@ -650,14 +650,15 @@ static __always_inline void rcu_nmi_exit_common(bool irq) * leave it in non-RCU-idle state. */ if (rdp->dynticks_nmi_nesting != 1) { - trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, rdp->dynticks); + trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, rdp->dynticks_nmi_nesting - 2, + atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */ rdp->dynticks_nmi_nesting - 2); return; } /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, rdp->dynticks); + trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ if (irq) @@ -744,7 +745,7 @@ static void rcu_eqs_exit(bool user) rcu_dynticks_task_exit(); rcu_dynticks_eqs_exit(); rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks); + trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); WARN_ON_ONCE(rdp->dynticks_nmi_nesting); @@ -833,7 +834,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, - rdp->dynticks_nmi_nesting + incby, rdp->dynticks); + rdp->dynticks_nmi_nesting + incby, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, /* Prevent store tearing. */ rdp->dynticks_nmi_nesting + incby); barrier(); -- cgit v1.2.3 From 90326f0521a88004194f88f1b597b54347482b5c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Oct 2019 21:18:14 +0200 Subject: rcu: Use CONFIG_PREEMPTION where appropriate The config option `CONFIG_PREEMPT' is used for the preemption model "Low-Latency Desktop". The config option `CONFIG_PREEMPTION' is enabled when kernel preemption is enabled which is true for the preemption model `CONFIG_PREEMPT' and `CONFIG_PREEMPT_RT'. Use `CONFIG_PREEMPTION' if it applies to both preemption models and not just to `CONFIG_PREEMPT'. Cc: "Paul E. McKenney" Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Davidlohr Bueso Cc: rcu@vger.kernel.org Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 4 ++-- kernel/rcu/Kconfig | 4 ++-- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/srcutiny.c | 2 +- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_exp.h | 2 +- kernel/rcu/tree_plugin.h | 4 ++-- 7 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 70a41cd8f58d..eb32fff81c30 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -154,7 +154,7 @@ static inline void exit_tasks_rcu_finish(void) { } * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() - * machinery were to be shut off, as some advocate for PREEMPT kernels. + * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ @@ -598,7 +598,7 @@ do { \ * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU - * read-side critical section that would block in a !PREEMPT kernel. + * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0303934e6ef0..1cc940fef17c 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -201,8 +201,8 @@ config RCU_NOCB_CPU specified at boot time by the rcu_nocbs parameter. For each such CPU, a kthread ("rcuox/N") will be created to invoke callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPT kernels) and "s" for RCU-sched - (!PREEMPT kernels). Nothing prevents this kthread from running + the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched + (!PREEMPTION kernels). Nothing prevents this kthread from running on the specified CPUs, but (1) the kthreads may be preempted between each callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dee043feb71f..121a0507a7ce 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1730,7 +1730,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) // Give the scheduler a chance, even on nohz_full CPUs. static void rcu_torture_fwd_prog_cond_resched(unsigned long iter) { - if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { + if (IS_ENABLED(CONFIG_PREEMPTION) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { // Real call_rcu() floods hit userspace, so emulate that. if (need_resched() || (iter & 0xfff)) schedule(); diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 44d6606b8325..6208c1dae5c9 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -103,7 +103,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Workqueue handler to drive one grace period and invoke any callbacks - * that become ready as a result. Single-CPU and !PREEMPT operation + * that become ready as a result. Single-CPU and !PREEMPTION operation * means that we get away with murder on synchronization. ;-) */ void srcu_drive_gp(struct work_struct *wp) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..c9dbb05e4c13 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2698,9 +2698,9 @@ EXPORT_SYMBOL_GPL(kfree_call_rcu); /* * During early boot, any blocking grace-period wait automatically - * implies a grace period. Later on, this is never the case for PREEMPT. + * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPT, any + * Howevr, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index d632cd019597..98d078cafa5a 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -670,7 +670,7 @@ static void rcu_exp_handler(void *unused) } } -/* PREEMPT=y, so no PREEMPT=n expedited grace period to clean up after. */ +/* PREEMPTION=y, so no PREEMPTION=n expedited grace period to clean up after. */ static void sync_sched_exp_online_cleanup(int cpu) { } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ed54d36465e2..8cdce111ea73 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -789,7 +789,7 @@ static void __init rcu_bootup_announce(void) } /* - * Note a quiescent state for PREEMPT=n. Because we do not need to know + * Note a quiescent state for PREEMPTION=n. Because we do not need to know * how many quiescent states passed, just if there was at least one since * the start of the grace period, this just sets a flag. The caller must * have disabled preemption. @@ -839,7 +839,7 @@ void rcu_all_qs(void) EXPORT_SYMBOL_GPL(rcu_all_qs); /* - * Note a PREEMPT=n context switch. The caller must have disabled interrupts. + * Note a PREEMPTION=n context switch. The caller must have disabled interrupts. */ void rcu_note_context_switch(bool preempt) { -- cgit v1.2.3 From c30fe541896440667bd9e9068aedd1d440fbbcd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 11 Oct 2019 21:40:09 -0700 Subject: rcu: Mark non-global functions and variables as static Each of rcu_state, rcu_rnp_online_cpus(), rcu_dynticks_curr_cpu_in_eqs(), and rcu_dynticks_snap() are used only in the kernel/rcu/tree.o translation unit, and may thus be marked static. This commit therefore makes this change. Reported-by: Ben Dooks Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree.h | 2 -- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..dd8cfc34f4da 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -84,7 +84,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; -struct rcu_state rcu_state = { +static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, .gp_state = RCU_GP_IDLE, .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT, @@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(rcu_get_gp_kthreads_prio); * held, but the bit corresponding to the current CPU will be stable * in most contexts. */ -unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +static unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) { return READ_ONCE(rnp->qsmaskinitnext); } @@ -294,7 +294,7 @@ static void rcu_dynticks_eqs_online(void) * * No ordering, as we are sampling CPU-local information. */ -bool rcu_dynticks_curr_cpu_in_eqs(void) +static bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); @@ -305,7 +305,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -int rcu_dynticks_snap(struct rcu_data *rdp) +static int rcu_dynticks_snap(struct rcu_data *rdp) { int snap = atomic_add_return(0, &rdp->dynticks); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..e4dc5debfc84 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -403,8 +403,6 @@ static const char *tp_rcu_varname __used __tracepoint_string = rcu_name; #define RCU_NAME rcu_name #endif /* #else #ifdef CONFIG_TRACING */ -int rcu_dynticks_snap(struct rcu_data *rdp); - /* Forward declarations for tree_plugin.h */ static void rcu_bootup_announce(void); static void rcu_qs(void); -- cgit v1.2.3 From a35d16905efc6ad5523d864a5c6efcb1e657e386 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 5 Aug 2019 18:22:27 -0400 Subject: rcu: Add basic support for kfree_rcu() batching Recently a discussion about stability and performance of a system involving a high rate of kfree_rcu() calls surfaced on the list [1] which led to another discussion how to prepare for this situation. This patch adds basic batching support for kfree_rcu(). It is "basic" because we do none of the slab management, dynamic allocation, code moving or any of the other things, some of which previous attempts did [2]. These fancier improvements can be follow-up patches and there are different ideas being discussed in those regards. This is an effort to start simple, and build up from there. In the future, an extension to use kfree_bulk and possibly per-slab batching could be done to further improve performance due to cache-locality and slab-specific bulk free optimizations. By using an array of pointers, the worker thread processing the work would need to read lesser data since it does not need to deal with large rcu_head(s) any longer. Torture tests follow in the next patch and show improvements of around 5x reduction in number of grace periods on a 16 CPU system. More details and test data are in that patch. There is an implication with rcu_barrier() with this patch. Since the kfree_rcu() calls can be batched, and may not be handed yet to the RCU machinery in fact, the monitor may not have even run yet to do the queue_rcu_work(), there seems no easy way of implementing rcu_barrier() to wait for those kfree_rcu()s that are already made. So this means a kfree_rcu() followed by an rcu_barrier() does not imply that memory will be freed once rcu_barrier() returns. Another implication is higher active memory usage (although not run-away..) until the kfree_rcu() flooding ends, in comparison to without batching. More details about this are in the second patch which adds an rcuperf test. Finally, in the near future we will get rid of kfree_rcu() special casing within RCU such as in rcu_do_batch and switch everything to just batching. Currently we don't do that since timer subsystem is not yet up and we cannot schedule the kfree_rcu() monitor as the timer subsystem's lock are not initialized. That would also mean getting rid of kfree_call_rcu_nobatch() entirely. [1] http://lore.kernel.org/lkml/20190723035725-mutt-send-email-mst@kernel.org [2] https://lkml.org/lkml/2017/12/19/824 Cc: kernel-team@android.com Cc: kernel-team@lge.com Co-developed-by: Byungchul Park Signed-off-by: Byungchul Park Signed-off-by: Joel Fernandes (Google) [ paulmck: Applied 0day and Paul Walmsley feedback on ->monitor_todo. ] [ paulmck: Make it work during early boot. ] [ paulmck: Add a crude early boot self-test. ] [ paulmck: Style adjustments and experimental docbook structure header. ] Link: https://lore.kernel.org/lkml/alpine.DEB.2.21.9999.1908161931110.32497@viisi.sifive.com/T/#me9956f66cb611b95d26ae92700e1d901f46e8c59 Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 6 ++ include/linux/rcutree.h | 2 + kernel/rcu/tree.c | 194 ++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcu/update.c | 10 +++ 4 files changed, 206 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 37b6f0c2b79d..1bd166aab6f3 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -39,6 +39,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) call_rcu(head, func); } +static inline void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) +{ + call_rcu(head, func); +} + void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -85,6 +90,7 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } +static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. */ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index c5147de885ec..6a65d3a16dbd 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,10 +34,12 @@ static inline void rcu_virt_note_context_switch(int cpu) void synchronize_rcu_expedited(void); void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); void rcu_momentary_dyntick_idle(void); +void kfree_rcu_scheduler_running(void); unsigned long get_state_synchronize_rcu(void); void cond_synchronize_rcu(unsigned long oldstate); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1694a6b57ad8..0af016fdbf19 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2683,19 +2683,187 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) } EXPORT_SYMBOL_GPL(call_rcu); + +/* Maximum number of jiffies to wait before draining a batch. */ +#define KFREE_DRAIN_JIFFIES (HZ / 50) + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head: List of kfree_rcu() objects not yet waiting for a grace period + * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @lock: Synchronize access to this structure + * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES + * @monitor_todo: Tracks whether a @monitor_work delayed work is pending + * @initialized: The @lock and @rcu_work fields have been initialized + * + * This is a per-CPU structure. The reason that it is not included in + * the rcu_data structure is to permit this code to be extracted from + * the RCU files. Such extraction could allow further optimization of + * the interactions with the slab allocators. + */ +struct kfree_rcu_cpu { + struct rcu_work rcu_work; + struct rcu_head *head; + struct rcu_head *head_free; + spinlock_t lock; + struct delayed_work monitor_work; + int monitor_todo; + bool initialized; +}; + +static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc); + /* - * Queue an RCU callback for lazy invocation after a grace period. - * This will likely be later named something like "call_rcu_lazy()", - * but this change will require some way of tagging the lazy RCU - * callbacks in the list of pending callbacks. Until then, this - * function may only be called from __kfree_rcu(). + * This function is invoked in workqueue context after a grace period. + * It frees all the objects queued on ->head_free. */ -void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +static void kfree_rcu_work(struct work_struct *work) +{ + unsigned long flags; + struct rcu_head *head, *next; + struct kfree_rcu_cpu *krcp; + + krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + spin_lock_irqsave(&krcp->lock, flags); + head = krcp->head_free; + krcp->head_free = NULL; + spin_unlock_irqrestore(&krcp->lock, flags); + + // List "head" is now private, so traverse locklessly. + for (; head; head = next) { + next = head->next; + // Potentially optimize with kfree_bulk in future. + __rcu_reclaim(rcu_state.name, head); + cond_resched_tasks_rcu_qs(); + } +} + +/* + * Schedule the kfree batch RCU work to run in workqueue context after a GP. + * + * This function is invoked by kfree_rcu_monitor() when the KFREE_DRAIN_JIFFIES + * timeout has been reached. + */ +static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) +{ + lockdep_assert_held(&krcp->lock); + + // If a previous RCU batch is in progress, we cannot immediately + // queue another one, so return false to tell caller to retry. + if (krcp->head_free) + return false; + + krcp->head_free = krcp->head; + krcp->head = NULL; + INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krcp->rcu_work); + return true; +} + +static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, + unsigned long flags) +{ + // Attempt to start a new batch. + if (queue_kfree_rcu_work(krcp)) { + // Success! Our job is done here. + spin_unlock_irqrestore(&krcp->lock, flags); + return; + } + + // Previous RCU batch still in progress, try again later. + if (!xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This function is invoked after the KFREE_DRAIN_JIFFIES timeout. + * It invokes kfree_rcu_drain_unlock() to attempt to start another batch. + */ +static void kfree_rcu_monitor(struct work_struct *work) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu, + monitor_work.work); + + spin_lock_irqsave(&krcp->lock, flags); + if (xchg(&krcp->monitor_todo, false)) + kfree_rcu_drain_unlock(krcp, flags); + else + spin_unlock_irqrestore(&krcp->lock, flags); +} + +/* + * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. + * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). + */ +void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { __call_rcu(head, func, 1); } +EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); + +/* + * Queue a request for lazy invocation of kfree() after a grace period. + * + * Each kfree_call_rcu() request is added to a batch. The batch will be drained + * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch + * will be kfree'd in workqueue context. This allows us to: + * + * 1. Batch requests together to reduce the number of grace periods during + * heavy kfree_rcu() load. + * + * 2. It makes it possible to use kfree_bulk() on a large number of + * kfree_rcu() requests thus reducing cache misses and the per-object + * overhead of kfree(). + */ +void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + unsigned long flags; + struct kfree_rcu_cpu *krcp; + + head->func = func; + + local_irq_save(flags); // For safely calling this_cpu_ptr(). + krcp = this_cpu_ptr(&krc); + if (krcp->initialized) + spin_lock(&krcp->lock); + + // Queue the object but don't yet schedule the batch. + head->func = func; + head->next = krcp->head; + krcp->head = head; + + // Set timer to drain after KFREE_DRAIN_JIFFIES. + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !xchg(&krcp->monitor_todo, true)) + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + + if (krcp->initialized) + spin_unlock(&krcp->lock); + local_irq_restore(flags); +} EXPORT_SYMBOL_GPL(kfree_call_rcu); +void __init kfree_rcu_scheduler_running(void) +{ + int cpu; + unsigned long flags; + + for_each_online_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_irqsave(&krcp->lock, flags); + if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + spin_unlock_irqrestore(&krcp->lock, flags); + continue; + } + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + spin_unlock_irqrestore(&krcp->lock, flags); + } +} + /* * During early boot, any blocking grace-period wait automatically * implies a grace period. Later on, this is never the case for PREEMPT. @@ -3557,12 +3725,26 @@ static void __init rcu_dump_rcu_node_tree(void) struct workqueue_struct *rcu_gp_wq; struct workqueue_struct *rcu_par_gp_wq; +static void __init kfree_rcu_batch_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); + + spin_lock_init(&krcp->lock); + INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + krcp->initialized = true; + } +} + void __init rcu_init(void) { int cpu; rcu_early_boot_tests(); + kfree_rcu_batch_init(); rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 1861103662db..196487762b96 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -40,6 +40,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS @@ -218,6 +219,7 @@ static int __init rcu_set_runtime_mode(void) { rcu_test_sync_prims(); rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + kfree_rcu_scheduler_running(); rcu_test_sync_prims(); return 0; } @@ -853,14 +855,22 @@ static void test_callback(struct rcu_head *r) DEFINE_STATIC_SRCU(early_srcu); +struct early_boot_kfree_rcu { + struct rcu_head rh; +}; + static void early_boot_test_call_rcu(void) { static struct rcu_head head; static struct rcu_head shead; + struct early_boot_kfree_rcu *rhp; call_rcu(&head, test_callback); if (IS_ENABLED(CONFIG_SRCU)) call_srcu(&early_srcu, &shead, test_callback); + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (!WARN_ON_ONCE(!rhp)) + kfree_rcu(rhp, rh); } void rcu_early_boot_tests(void) -- cgit v1.2.3 From 569d767087ef56a59bc2d1d5f25ee447eeae442a Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 22 Sep 2019 10:49:57 -0700 Subject: rcu: Make kfree_rcu() use a non-atomic ->monitor_todo Because the ->monitor_todo field is always protected by krcp->lock, this commit downgrades from xchg() to non-atomic unmarked assignment statements. Signed-off-by: Joel Fernandes [ paulmck: Update to include early-boot kick code. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0af016fdbf19..6106b9e0b5fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2708,7 +2708,7 @@ struct kfree_rcu_cpu { struct rcu_head *head_free; spinlock_t lock; struct delayed_work monitor_work; - int monitor_todo; + bool monitor_todo; bool initialized; }; @@ -2765,6 +2765,7 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags) { // Attempt to start a new batch. + krcp->monitor_todo = false; if (queue_kfree_rcu_work(krcp)) { // Success! Our job is done here. spin_unlock_irqrestore(&krcp->lock, flags); @@ -2772,8 +2773,8 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp, } // Previous RCU batch still in progress, try again later. - if (!xchg(&krcp->monitor_todo, true)) - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + krcp->monitor_todo = true; + schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } @@ -2788,7 +2789,7 @@ static void kfree_rcu_monitor(struct work_struct *work) monitor_work.work); spin_lock_irqsave(&krcp->lock, flags); - if (xchg(&krcp->monitor_todo, false)) + if (krcp->monitor_todo) kfree_rcu_drain_unlock(krcp, flags); else spin_unlock_irqrestore(&krcp->lock, flags); @@ -2837,8 +2838,10 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) // Set timer to drain after KFREE_DRAIN_JIFFIES. if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && - !xchg(&krcp->monitor_todo, true)) + !krcp->monitor_todo) { + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + } if (krcp->initialized) spin_unlock(&krcp->lock); @@ -2855,10 +2858,11 @@ void __init kfree_rcu_scheduler_running(void) struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_irqsave(&krcp->lock, flags); - if (!krcp->head || xchg(&krcp->monitor_todo, true)) { + if (!krcp->head || krcp->monitor_todo) { spin_unlock_irqrestore(&krcp->lock, flags); continue; } + krcp->monitor_todo = true; schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } -- cgit v1.2.3 From 0392bebebf26f09434e6c7ca4c09c014efeef76a Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Thu, 19 Sep 2019 14:58:26 -0700 Subject: rcu: Add multiple in-flight batches of kfree_rcu() work During testing, it was observed that amount of memory consumed due kfree_rcu() batching is 300-400MB. Previously we had only a single head_free pointer pointing to the list of rcu_head(s) that are to be freed after a grace period. Until this list is drained, we cannot queue any more objects on it since such objects may not be ready to be reclaimed when the worker thread eventually gets to drainin g the head_free list. We can do better by maintaining multiple lists as done by this patch. Testing shows that memory consumption came down by around 100-150MB with just adding another list. Adding more than 1 additional list did not show any improvement. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Code style and initialization handling. ] [ paulmck: Fix field name, reported by kbuild test robot . ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 51 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 12 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6106b9e0b5fb..a40fd58bd4b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2686,12 +2686,25 @@ EXPORT_SYMBOL_GPL(call_rcu); /* Maximum number of jiffies to wait before draining a batch. */ #define KFREE_DRAIN_JIFFIES (HZ / 50) +#define KFREE_N_BATCHES 2 /** - * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period + * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period + * @head_free: List of kfree_rcu() objects waiting for a grace period + * @krcp: Pointer to @kfree_rcu_cpu structure + */ + +struct kfree_rcu_cpu_work { + struct rcu_work rcu_work; + struct rcu_head *head_free; + struct kfree_rcu_cpu *krcp; +}; + +/** + * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period * @head: List of kfree_rcu() objects not yet waiting for a grace period - * @head_free: List of kfree_rcu() objects already waiting for a grace period + * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period * @lock: Synchronize access to this structure * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES * @monitor_todo: Tracks whether a @monitor_work delayed work is pending @@ -2703,9 +2716,8 @@ EXPORT_SYMBOL_GPL(call_rcu); * the interactions with the slab allocators. */ struct kfree_rcu_cpu { - struct rcu_work rcu_work; struct rcu_head *head; - struct rcu_head *head_free; + struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES]; spinlock_t lock; struct delayed_work monitor_work; bool monitor_todo; @@ -2723,11 +2735,14 @@ static void kfree_rcu_work(struct work_struct *work) unsigned long flags; struct rcu_head *head, *next; struct kfree_rcu_cpu *krcp; + struct kfree_rcu_cpu_work *krwp; - krcp = container_of(to_rcu_work(work), struct kfree_rcu_cpu, rcu_work); + krwp = container_of(to_rcu_work(work), + struct kfree_rcu_cpu_work, rcu_work); + krcp = krwp->krcp; spin_lock_irqsave(&krcp->lock, flags); - head = krcp->head_free; - krcp->head_free = NULL; + head = krwp->head_free; + krwp->head_free = NULL; spin_unlock_irqrestore(&krcp->lock, flags); // List "head" is now private, so traverse locklessly. @@ -2747,17 +2762,25 @@ static void kfree_rcu_work(struct work_struct *work) */ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp) { + int i; + struct kfree_rcu_cpu_work *krwp = NULL; + lockdep_assert_held(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + if (!krcp->krw_arr[i].head_free) { + krwp = &(krcp->krw_arr[i]); + break; + } // If a previous RCU batch is in progress, we cannot immediately // queue another one, so return false to tell caller to retry. - if (krcp->head_free) + if (!krwp) return false; - krcp->head_free = krcp->head; + krwp->head_free = krcp->head; krcp->head = NULL; - INIT_RCU_WORK(&krcp->rcu_work, kfree_rcu_work); - queue_rcu_work(system_wq, &krcp->rcu_work); + INIT_RCU_WORK(&krwp->rcu_work, kfree_rcu_work); + queue_rcu_work(system_wq, &krwp->rcu_work); return true; } @@ -2863,7 +2886,8 @@ void __init kfree_rcu_scheduler_running(void) continue; } krcp->monitor_todo = true; - schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); + schedule_delayed_work_on(cpu, &krcp->monitor_work, + KFREE_DRAIN_JIFFIES); spin_unlock_irqrestore(&krcp->lock, flags); } } @@ -3732,11 +3756,14 @@ struct workqueue_struct *rcu_par_gp_wq; static void __init kfree_rcu_batch_init(void) { int cpu; + int i; for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); spin_lock_init(&krcp->lock); + for (i = 0; i < KFREE_N_BATCHES; i++) + krcp->krw_arr[i].krcp = krcp; INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); krcp->initialized = true; } -- cgit v1.2.3 From e99637becb2e684bee2b9117f817f4d1346b8353 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 22 Sep 2019 13:03:17 -0700 Subject: rcu: Add support for debug_objects debugging for kfree_rcu() This commit applies RCU's debug_objects debugging to the new batched kfree_rcu() implementations. The object is queued at the kfree_rcu() call and dequeued during reclaim. Tested that enabling CONFIG_DEBUG_OBJECTS_RCU_HEAD successfully detects double kfree_rcu() calls. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix IRQ per kbuild test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a40fd58bd4b6..0512221cd84b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2749,6 +2749,7 @@ static void kfree_rcu_work(struct work_struct *work) for (; head; head = next) { next = head->next; // Potentially optimize with kfree_bulk in future. + debug_rcu_head_unqueue(head); __rcu_reclaim(rcu_state.name, head); cond_resched_tasks_rcu_qs(); } @@ -2855,6 +2856,12 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) spin_lock(&krcp->lock); // Queue the object but don't yet schedule the batch. + if (debug_rcu_head_queue(head)) { + // Probable double kfree_rcu(), just leak. + WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n", + __func__, head); + goto unlock_return; + } head->func = func; head->next = krcp->head; krcp->head = head; @@ -2866,6 +2873,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES); } +unlock_return: if (krcp->initialized) spin_unlock(&krcp->lock); local_irq_restore(flags); -- cgit v1.2.3 From 77a40f97030b27b3fc1640a3ed203870f0817f57 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:32 -0400 Subject: rcu: Remove kfree_rcu() special casing and lazy-callback handling This commit removes kfree_rcu() special-casing and the lazy-callback handling from Tree RCU. It moves some of this special casing to Tiny RCU, the removal of which will be the subject of later commits. This results in a nice negative delta. Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) [ paulmck: Add slab.h #include, thanks to kbuild test robot . ] Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 11 +++------- include/linux/rcu_segcblist.h | 2 -- include/trace/events/rcu.h | 32 +++++++++++---------------- kernel/rcu/rcu.h | 27 ----------------------- kernel/rcu/rcu_segcblist.c | 25 +++------------------ kernel/rcu/rcu_segcblist.h | 25 ++------------------- kernel/rcu/srcutree.c | 4 ++-- kernel/rcu/tiny.c | 28 +++++++++++++++++++++++- kernel/rcu/tree.c | 40 +++++++++++++++++++++++----------- kernel/rcu/tree.h | 1 - kernel/rcu/tree_plugin.h | 48 ++++++++++------------------------------- kernel/rcu/tree_stall.h | 6 ++---- 12 files changed, 90 insertions(+), 159 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index f48f4621ccbc..a360a8796710 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued across all CPUs In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed for each CPU: - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 Nonlazy posted: ..D + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1 The "last_accelerate:" prints the low-order 16 bits (in hex) of the jiffies counter when this CPU last invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from -rcu_prepare_for_idle(). The "Nonlazy posted:" indicates lazy-callback -status, so that an "l" indicates that all callbacks were lazy at the start -of the last idle period and an "L" indicates that there are currently -no non-lazy callbacks (in both cases, "." is printed otherwise, as -shown above) and "D" indicates that dyntick-idle processing is enabled -("." is printed otherwise, for example, if disabled via the "nohz=" -kernel boot parameter). +rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle +processing is enabled. If the grace period ends just as the stall warning starts printing, there will be a spurious stall-warning message, which will include diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 646759042333..b36afe7b22c9 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -22,7 +22,6 @@ struct rcu_cblist { struct rcu_head *head; struct rcu_head **tail; long len; - long len_lazy; }; #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } @@ -73,7 +72,6 @@ struct rcu_segcblist { #else long len; #endif - long len_lazy; u8 enabled; u8 offloaded; }; diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 66122602bd08..4ab16fcda895 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -481,16 +481,14 @@ TRACE_EVENT_RCU(rcu_dyntick, */ TRACE_EVENT_RCU(rcu_callback, - TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen_lazy, - long qlen), + TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen), - TP_ARGS(rcuname, rhp, qlen_lazy, qlen), + TP_ARGS(rcuname, rhp, qlen), TP_STRUCT__entry( __field(const char *, rcuname) __field(void *, rhp) __field(void *, func) - __field(long, qlen_lazy) __field(long, qlen) ), @@ -498,13 +496,12 @@ TRACE_EVENT_RCU(rcu_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->func = rhp->func; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ps %ld/%ld", + TP_printk("%s rhp=%p func=%ps %ld", __entry->rcuname, __entry->rhp, __entry->func, - __entry->qlen_lazy, __entry->qlen) + __entry->qlen) ); /* @@ -518,15 +515,14 @@ TRACE_EVENT_RCU(rcu_callback, TRACE_EVENT_RCU(rcu_kfree_callback, TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen_lazy, long qlen), + long qlen), - TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen), + TP_ARGS(rcuname, rhp, offset, qlen), TP_STRUCT__entry( __field(const char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) - __field(long, qlen_lazy) __field(long, qlen) ), @@ -534,13 +530,12 @@ TRACE_EVENT_RCU(rcu_kfree_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->offset = offset; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ld %ld/%ld", + TP_printk("%s rhp=%p func=%ld %ld", __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen_lazy, __entry->qlen) + __entry->qlen) ); /* @@ -552,27 +547,24 @@ TRACE_EVENT_RCU(rcu_kfree_callback, */ TRACE_EVENT_RCU(rcu_batch_start, - TP_PROTO(const char *rcuname, long qlen_lazy, long qlen, long blimit), + TP_PROTO(const char *rcuname, long qlen, long blimit), - TP_ARGS(rcuname, qlen_lazy, qlen, blimit), + TP_ARGS(rcuname, qlen, blimit), TP_STRUCT__entry( __field(const char *, rcuname) - __field(long, qlen_lazy) __field(long, qlen) __field(long, blimit) ), TP_fast_assign( __entry->rcuname = rcuname; - __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; __entry->blimit = blimit; ), - TP_printk("%s CBs=%ld/%ld bl=%ld", - __entry->rcuname, __entry->qlen_lazy, __entry->qlen, - __entry->blimit) + TP_printk("%s CBs=%ld bl=%ld", + __entry->rcuname, __entry->qlen, __entry->blimit) ); /* diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index ab504fbc76ca..c30a1f7dbd15 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -198,33 +198,6 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -void kfree(const void *); - -/* - * Reclaim the specified callback, either by invoking it (non-lazy case) - * or freeing it directly (lazy case). Return true if lazy, false otherwise. - */ -static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) -{ - rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; - - rcu_lock_acquire(&rcu_callback_map); - if (__is_kfree_rcu_offset(offset)) { - trace_rcu_invoke_kfree_callback(rn, head, offset); - kfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } else { - trace_rcu_invoke_callback(rn, head); - f = head->func; - WRITE_ONCE(head->func, (rcu_callback_t)0L); - f(head); - rcu_lock_release(&rcu_callback_map); - return false; - } -} - #ifdef CONFIG_RCU_STALL_COMMON extern int rcu_cpu_stall_ftrace_dump; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index cbc87b804db9..5f4fd3b8777c 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -20,14 +20,10 @@ void rcu_cblist_init(struct rcu_cblist *rclp) rclp->head = NULL; rclp->tail = &rclp->head; rclp->len = 0; - rclp->len_lazy = 0; } /* * Enqueue an rcu_head structure onto the specified callback list. - * This function assumes that the callback is non-lazy because it - * is intended for use by no-CBs CPUs, which do not distinguish - * between lazy and non-lazy RCU callbacks. */ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp) { @@ -54,7 +50,6 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, else drclp->tail = &drclp->head; drclp->len = srclp->len; - drclp->len_lazy = srclp->len_lazy; if (!rhp) { rcu_cblist_init(srclp); } else { @@ -62,16 +57,12 @@ void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, srclp->head = rhp; srclp->tail = &rhp->next; WRITE_ONCE(srclp->len, 1); - srclp->len_lazy = 0; } } /* * Dequeue the oldest rcu_head structure from the specified callback - * list. This function assumes that the callback is non-lazy, but - * the caller can later invoke rcu_cblist_dequeued_lazy() if it - * finds otherwise (and if it cares about laziness). This allows - * different users to have different ways of determining laziness. + * list. */ struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) { @@ -161,7 +152,6 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) for (i = 0; i < RCU_CBLIST_NSEGS; i++) rsclp->tails[i] = &rsclp->head; rcu_segcblist_set_len(rsclp, 0); - rsclp->len_lazy = 0; rsclp->enabled = 1; } @@ -173,7 +163,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); rsclp->enabled = 0; } @@ -253,11 +242,9 @@ bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp) * absolutely not OK for it to ever miss posting a callback. */ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is enqueued. */ rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); @@ -275,15 +262,13 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, * period. You have been warned. */ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy) + struct rcu_head *rhp) { int i; if (rcu_segcblist_n_cbs(rsclp) == 0) return false; rcu_segcblist_inc_len(rsclp); - if (lazy) - rsclp->len_lazy++; smp_mb(); /* Ensure counts are updated before callback is entrained. */ rhp->next = NULL; for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) @@ -307,8 +292,6 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rclp->len_lazy += rsclp->len_lazy; - rsclp->len_lazy = 0; rclp->len = rcu_segcblist_xchg_len(rsclp, 0); } @@ -361,9 +344,7 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { - rsclp->len_lazy += rclp->len_lazy; rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len_lazy = 0; rclp->len = 0; } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 815c2fdd3fcc..5c293afc07b8 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,15 +15,6 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } -/* - * Account for the fact that a previously dequeued callback turned out - * to be marked as lazy. - */ -static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) -{ - rclp->len_lazy--; -} - void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, @@ -59,18 +50,6 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) -{ - return rsclp->len_lazy; -} - -/* Return number of lazy callbacks in segmented callback list. */ -static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) -{ - return rcu_segcblist_n_cbs(rsclp) - rsclp->len_lazy; -} - /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? @@ -106,9 +85,9 @@ struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); bool rcu_segcblist_nextgp(struct rcu_segcblist *rsclp, unsigned long *lp); void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, - struct rcu_head *rhp, bool lazy); + struct rcu_head *rhp); void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 5dffade2d7cd..d0a9d5b69087 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -853,7 +853,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, local_irq_save(flags); sdp = this_cpu_ptr(ssp->sda); spin_lock_rcu_node(sdp); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); s = rcu_seq_snap(&ssp->srcu_gp_seq); @@ -1052,7 +1052,7 @@ void srcu_barrier(struct srcu_struct *ssp) sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); if (!rcu_segcblist_entrain(&sdp->srcu_cblist, - &sdp->srcu_barrier_head, 0)) { + &sdp->srcu_barrier_head)) { debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&ssp->srcu_barrier_cpu_cnt); } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 477b4eb44af5..dd572ce7c747 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "rcu.h" @@ -73,6 +74,31 @@ void rcu_sched_clock_irq(int user) } } +/* + * Reclaim the specified callback, either by invoking it for non-kfree cases or + * freeing it directly (for kfree). Return true if kfreeing, false otherwise. + */ +static inline bool rcu_reclaim_tiny(struct rcu_head *head) +{ + rcu_callback_t f; + unsigned long offset = (unsigned long)head->func; + + rcu_lock_acquire(&rcu_callback_map); + if (__is_kfree_rcu_offset(offset)) { + trace_rcu_invoke_kfree_callback("", head, offset); + kfree((void *)head - offset); + rcu_lock_release(&rcu_callback_map); + return true; + } + + trace_rcu_invoke_callback("", head); + f = head->func; + WRITE_ONCE(head->func, (rcu_callback_t)0L); + f(head); + rcu_lock_release(&rcu_callback_map); + return false; +} + /* Invoke the RCU callbacks whose grace period has elapsed. */ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) { @@ -100,7 +126,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim("", list); + rcu_reclaim_tiny(list); local_bh_enable(); list = next; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0512221cd84b..a8dd612098bf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include "../time/tick-internal.h" @@ -2146,7 +2147,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* If no callbacks are ready, just return. */ if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), 0); trace_rcu_batch_end(rcu_state.name, 0, !rcu_segcblist_empty(&rdp->cblist), @@ -2168,7 +2168,6 @@ static void rcu_do_batch(struct rcu_data *rdp) if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) @@ -2179,9 +2178,19 @@ static void rcu_do_batch(struct rcu_data *rdp) tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + rcu_callback_t f; + debug_rcu_head_unqueue(rhp); - if (__rcu_reclaim(rcu_state.name, rhp)) - rcu_cblist_dequeued_lazy(&rcl); + + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_callback(rcu_state.name, rhp); + + f = rhp->func; + WRITE_ONCE(rhp->func, (rcu_callback_t)0L); + f(rhp); + + rcu_lock_release(&rcu_callback_map); + /* * Stop only if limit reached and CPU has something to do. * Note: The rcl structure counts down from zero. @@ -2583,7 +2592,7 @@ static void rcu_leak_callback(struct rcu_head *rhp) * is expected to specify a CPU. */ static void -__call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) +__call_rcu(struct rcu_head *head, rcu_callback_t func) { unsigned long flags; struct rcu_data *rdp; @@ -2618,18 +2627,17 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); } + if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags)) return; // Enqueued onto ->nocb_bypass, so just leave. /* If we get here, rcu_nocb_try_bypass() acquired ->nocb_lock. */ - rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + rcu_segcblist_enqueue(&rdp->cblist, head); if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rcu_state.name, head, (unsigned long)func, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); else trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_lazy_cbs(&rdp->cblist), rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ @@ -2679,7 +2687,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, bool lazy) */ void call_rcu(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 0); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(call_rcu); @@ -2747,10 +2755,18 @@ static void kfree_rcu_work(struct work_struct *work) // List "head" is now private, so traverse locklessly. for (; head; head = next) { + unsigned long offset = (unsigned long)head->func; + next = head->next; // Potentially optimize with kfree_bulk in future. debug_rcu_head_unqueue(head); - __rcu_reclaim(rcu_state.name, head); + rcu_lock_acquire(&rcu_callback_map); + trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); + + /* Could be possible to optimize with kfree_bulk in future */ + kfree((void *)head - offset); + + rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); } } @@ -2825,7 +2841,7 @@ static void kfree_rcu_monitor(struct work_struct *work) */ void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) { - __call_rcu(head, func, 1); + __call_rcu(head, func); } EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); @@ -3100,7 +3116,7 @@ static void rcu_barrier_func(void *unused) debug_rcu_head_queue(&rdp->barrier_head); rcu_nocb_lock(rdp); WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); - if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { + if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) { atomic_inc(&rcu_state.barrier_cpu_count); } else { debug_rcu_head_unqueue(&rdp->barrier_head); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 055c31781d3a..15405420b40c 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -183,7 +183,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ #ifdef CONFIG_RCU_FAST_NO_HZ - bool all_lazy; /* All CPU's CBs lazy at idle start? */ unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fa08d55f7040..d5334e49ccca 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1262,10 +1262,9 @@ static void rcu_prepare_for_idle(void) /* * This code is invoked when a CPU goes idle, at which point we want * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. This is handled by a - * state machine implemented by rcu_prepare_for_idle() below. + * the energy-efficient dyntick-idle mode. * - * The following three proprocessor symbols control this state machine: + * The following preprocessor symbol controls this: * * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This @@ -1274,21 +1273,15 @@ static void rcu_prepare_for_idle(void) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! - * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is - * permitted to sleep in dyntick-idle mode with only lazy RCU - * callbacks pending. Setting this too high can OOM your system. * - * The values below work well in practice. If future workloads require + * The value below works well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ -#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; module_param(rcu_idle_gp_delay, int, 0644); -static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; -module_param(rcu_idle_lazy_gp_delay, int, 0644); /* * Try to advance callbacks on the current CPU, but only if it has been @@ -1327,8 +1320,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) /* * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller to set the timeout based on whether or not there are non-lazy - * callbacks. + * caller about what to set the timeout. * * The caller must have disabled interrupts. */ @@ -1354,25 +1346,18 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt) } rdp->last_accelerate = jiffies; - /* Request timer delay depending on laziness, and round. */ - rdp->all_lazy = !rcu_segcblist_n_nonlazy_cbs(&rdp->cblist); - if (rdp->all_lazy) { - dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; - } else { - dj = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; - } + /* Request timer and round. */ + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; + *nextevt = basemono + dj * TICK_NSEC; return 0; } /* - * Prepare a CPU for idle from an RCU perspective. The first major task - * is to sense whether nohz mode has been enabled or disabled via sysfs. - * The second major task is to check to see if a non-lazy callback has - * arrived at a CPU that previously had only lazy callbacks. The third - * major task is to accelerate (that is, assign grace-period numbers to) - * any recently arrived callbacks. + * Prepare a CPU for idle from an RCU perspective. The first major task is to + * sense whether nohz mode has been enabled or disabled via sysfs. The second + * major task is to accelerate (that is, assign grace-period numbers to) any + * recently arrived callbacks. * * The caller must have disabled interrupts. */ @@ -1398,17 +1383,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* - * If a non-lazy callback arrived at a CPU having only lazy - * callbacks, invoke RCU core for the side-effect of recalculating - * idle duration on re-entry to idle. - */ - if (rdp->all_lazy && rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)) { - rdp->all_lazy = false; - invoke_rcu_core(); - return; - } - /* * If we have not yet accelerated this jiffy, accelerate all * callbacks on this CPU. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..806f2ddc8f74 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -263,11 +263,9 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_data *rdp = &per_cpu(rcu_data, cpu); - sprintf(cp, "last_accelerate: %04lx/%04lx, Nonlazy posted: %c%c%c", + sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", rdp->last_accelerate & 0xffff, jiffies & 0xffff, - ".l"[rdp->all_lazy], - ".L"[!rcu_segcblist_n_nonlazy_cbs(&rdp->cblist)], - ".D"[!!rdp->tick_nohz_enabled_snap]); + !!rdp->tick_nohz_enabled_snap); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ -- cgit v1.2.3 From 189a6883dcf7fa70e17403ae4225c60ffc9e404b Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Fri, 30 Aug 2019 12:36:33 -0400 Subject: rcu: Remove kfree_call_rcu_nobatch() Now that the kfree_rcu() special-casing has been removed from tree RCU, this commit removes kfree_call_rcu_nobatch() since it is no longer needed. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ---- include/linux/rcutiny.h | 5 ----- include/linux/rcutree.h | 1 - kernel/rcu/rcuperf.c | 10 +--------- kernel/rcu/tree.c | 18 ++++-------------- 5 files changed, 5 insertions(+), 33 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3ce270b56f3a..ed83d6d90cc3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3991,10 +3991,6 @@ Number of loops doing rcuperf.kfree_alloc_num number of allocations and frees. - rcuperf.kfree_no_batch= [KNL] - Use the non-batching (less efficient) version of kfree_rcu(). - This is useful for comparing with the batched version. - rcuperf.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 1bd166aab6f3..b2b2dc990da9 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -39,11 +39,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) call_rcu(head, func); } -static inline void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) -{ - call_rcu(head, func); -} - void rcu_qs(void); static inline void rcu_softirq_qs(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 6a65d3a16dbd..2f787b9029d1 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,7 +34,6 @@ static inline void rcu_virt_note_context_switch(int cpu) void synchronize_rcu_expedited(void); void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func); -void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func); void rcu_barrier(void); bool rcu_eqs_special_set(int cpu); diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index c1e25fd10f2a..da94b89cd531 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -593,7 +593,6 @@ rcu_perf_shutdown(void *arg) torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); -torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu()."); static struct task_struct **kfree_reader_tasks; static int kfree_nrealthreads; @@ -632,14 +631,7 @@ kfree_perf_thread(void *arg) if (!alloc_ptr) return -ENOMEM; - if (!kfree_no_batch) { - kfree_rcu(alloc_ptr, rh); - } else { - rcu_callback_t cb; - - cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh); - kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb); - } + kfree_rcu(alloc_ptr, rh); } cond_resched(); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a8dd612098bf..31d2d9255d95 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2763,8 +2763,10 @@ static void kfree_rcu_work(struct work_struct *work) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset); - /* Could be possible to optimize with kfree_bulk in future */ - kfree((void *)head - offset); + if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) { + /* Could be optimized with kfree_bulk() in future. */ + kfree((void *)head - offset); + } rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); @@ -2835,16 +2837,6 @@ static void kfree_rcu_monitor(struct work_struct *work) spin_unlock_irqrestore(&krcp->lock, flags); } -/* - * This version of kfree_call_rcu does not do batching of kfree_rcu() requests. - * Used only by rcuperf torture test for comparison with kfree_rcu_batch(). - */ -void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func) -{ - __call_rcu(head, func); -} -EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch); - /* * Queue a request for lazy invocation of kfree() after a grace period. * @@ -2864,8 +2856,6 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func) unsigned long flags; struct kfree_rcu_cpu *krcp; - head->func = func; - local_irq_save(flags); // For safely calling this_cpu_ptr(). krcp = this_cpu_ptr(&krc); if (krcp->initialized) -- cgit v1.2.3 From c130d2dc93cd03323494d82dbe7b5fb0d101ab62 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:48 +0000 Subject: rcu: Rename some instance of CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU CONFIG_PREEMPTION and CONFIG_PREEMPT_RCU are always identical, but some code depends on CONFIG_PREEMPTION to access to rcu_preempt functionality. This patch changes CONFIG_PREEMPTION to CONFIG_PREEMPT_RCU in these cases. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- kernel/rcu/tree_stall.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c9dbb05e4c13..5445da2326a0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1934,7 +1934,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; raw_lockdep_assert_held_rcu_node(rnp); - if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) || + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_RCU)) || WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || rnp->qsmask != 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -2294,7 +2294,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) mask = 0; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPTION) || + if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..a6652efedd48 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp) // // Printing RCU CPU stall warnings -#ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_RCU /* * Dump detailed information for all tasks blocking the current RCU @@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) return ndetected; } -#else /* #ifdef CONFIG_PREEMPTION */ +#else /* #ifdef CONFIG_PREEMPT_RCU */ /* * Because preemptible RCU does not exist, we never have to check for @@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) { return 0; } -#endif /* #else #ifdef CONFIG_PREEMPTION */ +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* * Dump stacks of all tasks running on stalled CPUs. First try using -- cgit v1.2.3 From 5b14557b073c96a7cf79adc4d7b6c4a8c26b2a43 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 26 Nov 2019 18:05:45 -0800 Subject: rcu: Avoid tick_dep_set_cpu() misordering In the current code, rcu_nmi_enter_common() might decide to turn on the tick using tick_dep_set_cpu(), but be delayed just before doing so. Then the grace-period kthread might notice that the CPU in question had in fact gone through a quiescent state, thus turning off the tick using tick_dep_clear_cpu(). The later invocation of tick_dep_set_cpu() would then incorrectly leave the tick on. This commit therefore enlists the aid of the leaf rcu_node structure's ->lock to ensure that decisions to enable or disable the tick are carried out before they can be reversed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5445da2326a0..b0e0612392a9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -800,8 +800,8 @@ void rcu_user_exit(void) */ static __always_inline void rcu_nmi_enter_common(bool irq) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); long incby = 2; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); /* Complain about underflow. */ WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0); @@ -828,8 +828,13 @@ static __always_inline void rcu_nmi_enter_common(bool irq) } else if (tick_nohz_full_cpu(rdp->cpu) && rdp->dynticks_nmi_nesting == DYNTICK_IRQ_NONIDLE && READ_ONCE(rdp->rcu_urgent_qs) && !rdp->rcu_forced_tick) { - rdp->rcu_forced_tick = true; - tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + raw_spin_lock_rcu_node(rdp->mynode); + // Recheck under lock. + if (rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) { + rdp->rcu_forced_tick = true; + tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU); + } + raw_spin_unlock_rcu_node(rdp->mynode); } trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), rdp->dynticks_nmi_nesting, @@ -898,6 +903,7 @@ void rcu_irq_enter_irqson(void) */ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) { + raw_lockdep_assert_held_rcu_node(rdp->mynode); WRITE_ONCE(rdp->rcu_urgent_qs, false); WRITE_ONCE(rdp->rcu_need_heavy_qs, false); if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) { -- cgit v1.2.3 From 2488a5e695564897a0f8ea42cd3af71647cd26d2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:23:57 +0000 Subject: rcu: Fix tracepoint tracking RCU CPU kthread utilization In the call to trace_rcu_utilization() at the start of the loop in rcu_cpu_kthread(), "rcu_wait" is incorrect, plus this trace event needs to be hoisted above the loop to balance with either the "rcu_wait" or "rcu_yield", depending on how the loop exits. This commit therefore makes these changes. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index dd8cfc34f4da..ba154a3080fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2474,8 +2474,8 @@ static void rcu_cpu_kthread(unsigned int cpu) char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); int spincnt; + trace_rcu_utilization(TPS("Start CPU kthread@rcu_run")); for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; local_irq_disable(); -- cgit v1.2.3 From e2167b38c87a0c9e85c342a823dae1e6f67b11d9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 15 Oct 2019 10:28:47 +0000 Subject: rcu: Move gp_state_names[] and gp_state_getname() to tree_stall.h Only tree_stall.h needs to get name from GP state, so this commit moves the gp_state_names[] array and the gp_state_getname() from kernel/rcu/tree.h and kernel/rcu/tree.c, respectively, to kernel/rcu/tree_stall.h. While moving gp_state_names[], this commit uses the GCC syntax to ensure that the right string is associated with the right CPP macro. Signed-off-by: Lai Jiangshan Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 10 ---------- kernel/rcu/tree.h | 12 ------------ kernel/rcu/tree_stall.h | 22 ++++++++++++++++++++++ 3 files changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ba154a3080fb..bbb60ed310b1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -528,16 +528,6 @@ static struct rcu_node *rcu_get_root(void) return &rcu_state.node[0]; } -/* - * Convert a ->gp_state value to a character string. - */ -static const char *gp_state_getname(short gs) -{ - if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) - return "???"; - return gp_state_names[gs]; -} - /* * Send along grace-period-related data for rcutorture diagnostics. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 54ff9896ae31..9d5986abfc67 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -368,18 +368,6 @@ struct rcu_state { #define RCU_GP_CLEANUP 7 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 8 /* Grace-period cleanup complete. */ -static const char * const gp_state_names[] = { - "RCU_GP_IDLE", - "RCU_GP_WAIT_GPS", - "RCU_GP_DONE_GPS", - "RCU_GP_ONOFF", - "RCU_GP_INIT", - "RCU_GP_WAIT_FQS", - "RCU_GP_DOING_FQS", - "RCU_GP_CLEANUP", - "RCU_GP_CLEANED", -}; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index c0b8c458d8a6..f18adaf3bf39 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -279,6 +279,28 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ +static const char * const gp_state_names[] = { + [RCU_GP_IDLE] = "RCU_GP_IDLE", + [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", + [RCU_GP_DONE_GPS] = "RCU_GP_DONE_GPS", + [RCU_GP_ONOFF] = "RCU_GP_ONOFF", + [RCU_GP_INIT] = "RCU_GP_INIT", + [RCU_GP_WAIT_FQS] = "RCU_GP_WAIT_FQS", + [RCU_GP_DOING_FQS] = "RCU_GP_DOING_FQS", + [RCU_GP_CLEANUP] = "RCU_GP_CLEANUP", + [RCU_GP_CLEANED] = "RCU_GP_CLEANED", +}; + +/* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + /* * Print out diagnostic information for the specified stalled CPU. * -- cgit v1.2.3 From 7441e7661d6586ae36329b7956e4d713d81e9903 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 30 Oct 2019 09:37:11 -0700 Subject: rcu: Switch force_qs_rnp() to for_each_leaf_node_cpu_mask() Currently, force_qs_rnp() uses a for_each_leaf_node_possible_cpu() loop containing a check of the current CPU's bit in ->qsmask. This works, but this commit saves three lines by instead using for_each_leaf_node_cpu_mask(), which combines the functionality of for_each_leaf_node_possible_cpu() and leaf_node_cpu_bit(). This commit also replaces the use of the local variable "bit" with rdp->grpmask. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bbb60ed310b1..d95076498488 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2298,14 +2298,11 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; } - for_each_leaf_node_possible_cpu(rnp, cpu) { - unsigned long bit = leaf_node_cpu_bit(rnp, cpu); - if ((rnp->qsmask & bit) != 0) { - rdp = per_cpu_ptr(&rcu_data, cpu); - if (f(rdp)) { - mask |= bit; - rcu_disable_urgency_upon_qs(rdp); - } + for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) { + rdp = per_cpu_ptr(&rcu_data, cpu); + if (f(rdp)) { + mask |= rdp->grpmask; + rcu_disable_urgency_upon_qs(rdp); } } if (mask != 0) { -- cgit v1.2.3 From f6105fc2a9c0ea5be6b9e5c19b2551af0b8b4eac Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Nov 2019 11:36:07 -0800 Subject: rcu: Remove unused stop-machine #include Long ago, RCU used the stop-machine mechanism to implement expedited grace periods, but no longer does so. This commit therefore removes the no-longer-needed #includes of linux/stop_machine.h. Link: https://lwn.net/Articles/805317/ Reported-by: Viresh Kumar Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 - kernel/rcu/tree.h | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel/rcu/tree.c') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d95076498488..878f62f218e9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9d5986abfc67..ce90c68c184b 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "rcu_segcblist.h" -- cgit v1.2.3